tools

various tools
git clone git://deadbeef.fr/tools.git
Log | Files | Refs | README | LICENSE

commit 17c3065ecfa59bee483911aa7569852c57bda17b
parent cb277a9720d9e8ece11b3f66bc1e47bf07069574
Author: Morel Bérenger <berenger.morel@neutralite.org>
Date:   Wed, 26 Feb 2020 07:59:35 +0100

renamed the merge command to lmerge to avoid a conflict with the RCS merge binary (I wonder if anyone still uses that, though)

Diffstat:
M Makefile | 8 ++++----
M README | 6 +++---
A lmerge.1.md | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A lmerge.cpp | 289 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
D merge.1.md | 68 --------------------------------------------------------------------
D merge.cpp | 287 -------------------------------------------------------------------------------
6 files changed, 364 insertions(+), 362 deletions(-)

diff --git a/Makefile b/Makefile @@ -2,7 +2,7 @@ CC ?= cc CXX ?= c++ .PHONY: all -all: manpages merge.1 merge +all: manpages lmerge.1 lmerge %.1: %.1.md pandoc -s --to=man $< -o $@ @@ -10,11 +10,11 @@ all: manpages merge.1 merge %.o: %.cpp $(CXX) $(CXXFLAGS) -c $< -o $@ -merge: merge.o +lmerge: lmerge.o $(CXX) -o $@ $^ -manpages: merge.1 +manpages: lmerge.1 .PHONY: clean clean: - rm -f merge merge.1 *.o + rm -f lmerge lmerge.1 *.o diff --git a/README b/README @@ -2,7 +2,7 @@ This tool merges sequential entries if they have some fields with same values. USAGE: -See merge.1.md +See lmerge.1.md DEPENDENCIES: @@ -22,8 +22,8 @@ example it's more hackish un current stable (Buster) than it was in old-stable On Debian buster, I do this for example (beware, every single change can break the build): -clang++ -o merge \ - merge.cpp /usr/lib/x86_64-linux-musl/crt1.o \ +clang++ -o lmerge \ + lmerge.cpp /usr/lib/x86_64-linux-musl/crt1.o \ -Os -nostdlib -static -fno-exceptions -stdlib=libc++ -nobuiltininc -nostdinc++ \ -L /usr/lib/x86_64-linux-musl \ -lpthread -lc \ diff --git a/lmerge.1.md b/lmerge.1.md @@ -0,0 +1,68 @@ +% lmerge(1) lmerge manpage +% Bérenger Morel +% 2020-02-25 + +# NAME + +*lmerge* - merges entries with common fields + +# SYNOPSIS + +`cat foo | *lmerge*` + +# DESCRIPTION + +Merges consecutive entries when they share a common field. +Entries are read from stdin. + +*lmerge* does not remove duplicated fields. + +# OPTIONS + +For now, options are managed through environment variables: + +FIELDS + +: list of indexes (starting from 0, separated with **commas** (\',\', 0x2C) of +the fields that will be compared. + +FIELD_SEP + +: list of characters that will be considered as field separators. +Defaults to **space** (\' \', 0x20) and **horizontal tabulation** (\'\\t\', 0x09). + +ENTRY_SEP + +: list of characters that will be considered as entry separators. +Defaults to **newline** (\'\\n\', 0x0A). 
+ +# EXAMPLE + +This invocation: + +```sh +FIELD_SEP=": \t" FIELDS="1,3" ./lmerge <<EOF +0 foo:hello:1 +1 bar:hello:2 + 2:foo:world:3 + 2:bar:world:4 +EOF +``` + +will generate this result: + +``` +0 foo:hello:1 +1 bar:hello:2 + 2:foo:world:3: 2:bar:world:4 +``` + +# BUGS + +Multi-byte characters can not be used for FIELD_SEP and ENTRY_SEP. + +# TODO + +Allow the use of command-line parameters to configure behavior. + +# SEE ALSO diff --git a/lmerge.cpp b/lmerge.cpp @@ -0,0 +1,289 @@ +#ifdef LIBCPP_MUSL_STATIC +#define __GLIBC_PREREQ(x,y) 0 +#endif + +#include <stdlib.h> +#include <stdio.h> +#include <errno.h> +#include <string.h> +#include <assert.h> +#include <stdint.h> +#include <ctype.h> + +#include <algorithm> +#include <vector> + +/** + * This program reads stdin and when consecutive lines have specific fields all + * containing the same value, prints them replacing the newline character by + * the 1st character in FIELD_SEP. + * Fields are delimited by the FIELD_SEP environment variable. If not defined, + * " \\t" is used instead (see isblank(3)). + * Fields to use are defined by the environment variable FIELDS, which only + * use unsigned decimal integers separated by commas, other characters makes the + * value invalid. + * If FIELDS is not defined or invalid, exits with an error. + * Empty field indexes ("1,,3") are ignored (will resolve in "1,3"). + * Do not work if input is not in line mode. + * Line separator is defined by ENTRY_SEP, or "\\n" if not defined. 
+ * + * TODO: + * * check that ENTRY_SEP works as expected; + * * fix the fact input needs a "\\n" at end of last line for it to be merged; + * * UTF-8 support (field separators); + * * providing FIELDS variable as command-line option; + * * -v/--version option; + * * -h/--help option; + * * remove bloated STL containers; + * * allow to customize the memory allocation scheme at runtime; + * * allow to not print twice merged fields; + * * allow to set verbosity on stderr; + * * remove hard-coded limit of UINT16_MAX - 1 for fields start/stop positions; + * * print as many lines as there where duplicates? + * + * Coding rules: + * * const affect what is before it, so it must follow the type; + **/ + +#include "vector.hpp" + +class field_marker; +typedef vector<char> line_cache; +typedef vector<field_marker> field_marker_t; + +class field_marker +{ + uint16_t m_start = UINT16_MAX, m_end = UINT16_MAX; + +public: + bool ignore( void ) const + { + return m_start == m_end && m_start == UINT16_MAX; + } + + void define( uint16_t start, uint16_t end ) + { + assert( end >= start && start != UINT8_MAX && end != UINT8_MAX ); + m_start = start; m_end = end; + } + + uint16_t start( void ) const + { + assert( !ignore() ); + return m_start; + } + + uint16_t end( void ) const + { + assert( !ignore() ); + return m_end; + } +}; + +bool allocate_markers( + char const * const FIELDS, + field_marker_t& field_cache +); + +int main( void ) +{ + char const * const DEFAULT_FIELD_SEP = " \t"; + char const * const DEFAULT_ENTRY_SEP = "\n"; + + char const * SEP_START = getenv( "FIELD_SEP" ); + if( !SEP_START ) + { + SEP_START = DEFAULT_FIELD_SEP; + } + char const * SEP_ENTRY = getenv( "ENTRY_SEP" ); + if( !SEP_ENTRY ) + { + SEP_ENTRY = DEFAULT_ENTRY_SEP; + } + + char const * const FIELDS = getenv( "FIELDS" ); + if( !FIELDS ) + { + fputs( "ERROR: FIELDS is not defined\n", stderr ); + return EXIT_FAILURE; + } + + if( strlen( FIELDS ) == 0 ) + { + fputs( "ERROR: FIELDS is empty\n", stderr ); + 
return EXIT_FAILURE; + } + + field_marker_t field_cache; + if( allocate_markers( FIELDS, field_cache ) ) + { + return EXIT_FAILURE; + } + + size_t buf_sz = 2048; + char* buf = nullptr; + + // allocating a cache of at least 16 bytes. + // Note: I don't see how merging lines smaller than 16 bytes can be useful + // also, not even enough mem for that would indicate bigger problems... + while( !buf && buf_sz >= 32 ) + { + buf_sz /= 2; + buf = static_cast<char*>( malloc( buf_sz ) ); + } + + if( !buf ) + { + fprintf( stderr, "ERROR: malloc %s(%d)\n", strerror( errno ), errno ); + return EXIT_FAILURE; + } + + bool fetch = true; + line_cache last_line; + char const * const SEP_END = SEP_START + strlen( SEP_START ); + while( !feof( stdin ) ) + { + if( !fgets( buf, static_cast<int>( buf_sz ), stdin ) ) + { + free( buf ); + buf = nullptr; + buf_sz = 0; + if( !feof( stdin ) ) + { + fprintf( stderr, "ERROR: fgets %s(%d)\n", strerror( errno ), errno ); + return EXIT_FAILURE; + } + break; + } + + size_t str_sz = strlen( buf ); + if( str_sz == buf_sz - 1 && buf[str_sz] != '\n' && !feof( stdin ) ) + { + fprintf( stderr, "ERROR: buffer too small for some lines\n" ); + return EXIT_FAILURE; + } + + if( !fetch ) + { + char const* dst_ptr = buf; + for( size_t i = 0; i < field_cache.size(); ++i ) + { + field_marker const& src = field_cache[i]; + if( src.ignore() ) + { + char const *sep = SEP_END; + while( sep == SEP_END ) + { + ++dst_ptr; + sep = SEP_START; + for( ; sep != SEP_END && *dst_ptr && *dst_ptr != *sep; ++sep ){} + } + ++dst_ptr; + continue; + } + char const * src_ptr = last_line.data() + src.start(); + size_t len = src.end() - src.start(); + if( len > buf_sz - static_cast<size_t>( dst_ptr - buf ) ) + { + fetch = true; + break; + } + + if( 0 != memcmp( dst_ptr, src_ptr, len ) ) + { + fetch = true; + break; + } + + char last = dst_ptr[len]; + char const * sep_ = SEP_START; + assert( sep_ != nullptr ); + while( 0 != *sep_ && *sep_ != last && *SEP_ENTRY != last ) + { + ++sep_; + } 
+ if( 0 == *sep_ ) + { + fetch = true; + break; + } + dst_ptr += len; + assert( dst_ptr >= buf ); + } + + fputc( fetch ? *SEP_ENTRY : * SEP_START, stdout ); + } + + last_line.assign( buf, buf + str_sz ); + if( last_line.back() == *SEP_ENTRY ) + { + last_line.back() = 0; + } + + if( fetch ) + { + line_cache::iterator start = last_line.begin(); + line_cache::iterator cache_end = last_line.end(); + size_t field_index = 0; + while( start != cache_end && field_index < field_cache.size() ) + { + auto end = last_line.end(); + if( last_line.back() == 0 ) + { + --end; + } + line_cache::iterator it = std::find_first_of + ( + start, end, + SEP_START, SEP_END + ); + if( !field_cache[field_index].ignore() ) + { + field_cache[field_index].define( + static_cast<uint16_t>( start - last_line.begin() ), + static_cast<uint16_t>( it - last_line.begin() ) + ); + } + start = it + 1; + ++field_index; + } + fetch = false; + } + fputs( last_line.data(), stdout ); + } + fputc( *SEP_ENTRY, stdout ); + return EXIT_SUCCESS; +} + +bool allocate_markers( + char const * const FIELDS, + field_marker_t& field_cache +) +{ + field_cache.reserve( UINT8_MAX ); //255 fields should fit most cases + size_t last_field = 0; + + char const * fields = FIELDS - 1; + do + { + ++fields; + if( isdigit( *fields ) ) + { + last_field = last_field * 10 + static_cast<size_t>( *fields - '0' ); + } + else if( *fields == ',' || *fields == 0 ) + { + size_t max = std::max( field_cache.size(), last_field ); + field_cache.resize( max ); + field_cache[last_field - 1].define( 0, 0 ); + last_field = 0; + } + else + { + fputs( "ERROR: FIELDS contains illegal characters\n", stderr ); + return true; + } + }while( *fields ); + field_cache.shrink_to_fit(); + return false; +} diff --git a/merge.1.md b/merge.1.md @@ -1,68 +0,0 @@ -% merge(1) merge manpage -% Bérenger Morel -% 2020-02-25 - -# NAME - -*merge* - merges entries with common fields - -# SYNOPSIS - -`cat foo | *merge*` - -# DESCRIPTION - -Merges consecutive entries when they 
share a common field. -Entries are read from stdin. - -*merge* does not remove duplicated fields. - -# OPTIONS - -For now, options are managed through environment variables: - -FIELDS - -: list of indexes (starting from 0, separated with **commas** (\',\', 0x2C) of -the fields that will be compared. - -FIELD_SEP - -: list of characters that will be considered as field separators. -Defaults to **space** (\' \', 0x20) and **horizontal tabulation** (\'\\t\', 0x09). - -ENTRY_SEP - -: list of characters that will be considered as entry separators. -Defaults to **newline** (\'\\n\', 0x0A). - -# EXAMPLE - -This invocation: - -```sh -FIELD_SEP=": \t" FIELDS="1,3" ./merge <<EOF -0 foo:hello:1 -1 bar:hello:2 - 2:foo:world:3 - 2:bar:world:4 -EOF -``` - -will generate this result: - -``` -0 foo:hello:1 -1 bar:hello:2 - 2:foo:world:3: 2:bar:world:4 -``` - -# BUGS - -Multi-byte characters can not be used for FIELD_SEP and ENTRY_SEP. - -# TODO - -Allow the use of command-line parameters to configure behavior. - -# SEE ALSO diff --git a/merge.cpp b/merge.cpp @@ -1,287 +0,0 @@ -#ifdef LIBCPP_MUSL_STATIC -#define __GLIBC_PREREQ(x,y) 0 -#endif - -#include <stdlib.h> -#include <stdio.h> -#include <errno.h> -#include <string.h> -#include <assert.h> -#include <stdint.h> -#include <ctype.h> - -#include <algorithm> -#include <vector> - -/** - * This program reads stdin and when consecutive lines have specific fields all - * containing the same value, prints them replacing the newline character by - * the 1st character in FIELD_SEP. - * Fields are delimited by the FIELD_SEP environment variable. If not defined, - * " \\t" is used instead (see isblank(3)). - * Fields to use are defined by the environment variable FIELDS, which only - * use unsigned decimal integers separated by commas, other characters makes the - * value invalid. - * If FIELDS is not defined or invalid, exits with an error. - * Empty field indexes ("1,,3") are ignored (will resolve in "1,3"). 
- * Do not work if input is not in line mode. - * Line separator is defined by ENTRY_SEP, or "\\n" if not defined. - * - * TODO: - * * check that ENTRY_SEP works as expected; - * * fix the fact input needs a "\\n" at end of last line for it to be merged; - * * UTF-8 support (field separators); - * * providing FIELDS variable as command-line option; - * * -v/--version option; - * * -h/--help option; - * * remove bloated STL containers; - * * allow to customize the memory allocation scheme at runtime; - * * allow to not print twice merged fields; - * * allow to set verbosity on stderr; - * * remove hard-coded limit of UINT16_MAX - 1 for fields start/stop positions; - * * print as many lines as there where duplicates? - * - * Coding rules: - * * const affect what is before it, so it must follow the type; - **/ - -class field_marker; -typedef std::vector<char> line_cache; -typedef std::vector<field_marker> field_marker_t; - -class field_marker -{ - uint16_t m_start = UINT16_MAX, m_end = UINT16_MAX; - -public: - bool ignore( void ) const - { - return m_start == m_end && m_start == UINT16_MAX; - } - - void define( uint16_t start, uint16_t end ) - { - assert( end >= start && start != UINT8_MAX && end != UINT8_MAX ); - m_start = start; m_end = end; - } - - uint16_t start( void ) const - { - assert( !ignore() ); - return m_start; - } - - uint16_t end( void ) const - { - assert( !ignore() ); - return m_end; - } -}; - -bool allocate_markers( - char const * const FIELDS, - field_marker_t& field_cache -); - -int main( void ) -{ - char const * const DEFAULT_FIELD_SEP = " \t"; - char const * const DEFAULT_ENTRY_SEP = "\n"; - - char const * SEP_START = getenv( "FIELD_SEP" ); - if( !SEP_START ) - { - SEP_START = DEFAULT_FIELD_SEP; - } - char const * SEP_ENTRY = getenv( "ENTRY_SEP" ); - if( !SEP_ENTRY ) - { - SEP_ENTRY = DEFAULT_ENTRY_SEP; - } - - char const * const FIELDS = getenv( "FIELDS" ); - if( !FIELDS ) - { - fputs( "ERROR: FIELDS is not defined\n", stderr ); - return 
EXIT_FAILURE; - } - - if( strlen( FIELDS ) == 0 ) - { - fputs( "ERROR: FIELDS is empty\n", stderr ); - return EXIT_FAILURE; - } - - field_marker_t field_cache; - if( allocate_markers( FIELDS, field_cache ) ) - { - return EXIT_FAILURE; - } - - size_t buf_sz = 2048; - char* buf = nullptr; - - // allocating a cache of at least 16 bytes. - // Note: I don't see how merging lines smaller than 16 bytes can be useful - // also, not even enough mem for that would indicate bigger problems... - while( !buf && buf_sz >= 32 ) - { - buf_sz /= 2; - buf = static_cast<char*>( malloc( buf_sz ) ); - } - - if( !buf ) - { - fprintf( stderr, "ERROR: malloc %s(%d)\n", strerror( errno ), errno ); - return EXIT_FAILURE; - } - - bool fetch = true; - line_cache last_line; - char const * const SEP_END = SEP_START + strlen( SEP_START ); - while( !feof( stdin ) ) - { - if( !fgets( buf, static_cast<int>( buf_sz ), stdin ) ) - { - free( buf ); - buf = nullptr; - buf_sz = 0; - if( !feof( stdin ) ) - { - fprintf( stderr, "ERROR: fgets %s(%d)\n", strerror( errno ), errno ); - return EXIT_FAILURE; - } - break; - } - - size_t str_sz = strlen( buf ); - if( str_sz == buf_sz - 1 && buf[str_sz] != '\n' && !feof( stdin ) ) - { - fprintf( stderr, "ERROR: buffer too small for some lines\n" ); - return EXIT_FAILURE; - } - - if( !fetch ) - { - char const* dst_ptr = buf; - for( size_t i = 0; i < field_cache.size(); ++i ) - { - field_marker const& src = field_cache[i]; - if( src.ignore() ) - { - char const *sep = SEP_END; - while( sep == SEP_END ) - { - ++dst_ptr; - sep = SEP_START; - for( ; sep != SEP_END && *dst_ptr && *dst_ptr != *sep; ++sep ){} - } - ++dst_ptr; - continue; - } - char const * src_ptr = last_line.data() + src.start(); - size_t len = src.end() - src.start(); - if( len > buf_sz - static_cast<size_t>( dst_ptr - buf ) ) - { - fetch = true; - break; - } - - if( 0 != memcmp( dst_ptr, src_ptr, len ) ) - { - fetch = true; - break; - } - - char last = dst_ptr[len]; - char const * sep_ = SEP_START; - 
assert( sep_ != nullptr ); - while( 0 != *sep_ && *sep_ != last && *SEP_ENTRY != last ) - { - ++sep_; - } - if( 0 == *sep_ ) - { - fetch = true; - break; - } - dst_ptr += len; - assert( dst_ptr >= buf ); - } - - fputc( fetch ? *SEP_ENTRY : * SEP_START, stdout ); - } - - last_line.assign( buf, buf + str_sz ); - if( last_line.back() == *SEP_ENTRY ) - { - last_line.back() = 0; - } - - if( fetch ) - { - line_cache::iterator start = last_line.begin(); - line_cache::iterator cache_end = last_line.end(); - size_t field_index = 0; - while( start != cache_end && field_index < field_cache.size() ) - { - auto end = last_line.end(); - if( last_line.back() == 0 ) - { - --end; - } - line_cache::iterator it = std::find_first_of - ( - start, end, - SEP_START, SEP_END - ); - if( !field_cache[field_index].ignore() ) - { - field_cache[field_index].define( - static_cast<uint16_t>( start - last_line.begin() ), - static_cast<uint16_t>( it - last_line.begin() ) - ); - } - start = it + 1; - ++field_index; - } - fetch = false; - } - fputs( last_line.data(), stdout ); - } - fputc( *SEP_ENTRY, stdout ); - return EXIT_SUCCESS; -} - -bool allocate_markers( - char const * const FIELDS, - field_marker_t& field_cache -) -{ - field_cache.reserve( UINT8_MAX ); //255 fields should fit most cases - size_t last_field = 0; - - char const * fields = FIELDS - 1; - do - { - ++fields; - if( isdigit( *fields ) ) - { - last_field = last_field * 10 + static_cast<size_t>( *fields - '0' ); - } - else if( *fields == ',' || *fields == 0 ) - { - size_t max = std::max( field_cache.size(), last_field ); - field_cache.resize( max ); - field_cache[last_field - 1].define( 0, 0 ); - last_field = 0; - } - else - { - fputs( "ERROR: FIELDS contains illegal characters\n", stderr ); - return true; - } - }while( *fields ); - field_cache.shrink_to_fit(); - return false; -}