lmerge.cpp (9336B)
1 // Copyright (c) 2020 Morel Bérenger 2 // 3 // This software is provided 'as-is', without any express or implied 4 // warranty. In no event will the authors be held liable for any damages 5 // arising from the use of this software. 6 // 7 // Permission is granted to anyone to use this software for any purpose, 8 // including commercial applications, and to alter it and redistribute it 9 // freely, subject to the following restrictions: 10 // 11 // 1. The origin of this software must not be misrepresented; you must not 12 // claim that you wrote the original software. If you use this software 13 // in a product, an acknowledgment in the product documentation would be 14 // appreciated but is not required. 15 // 2. Altered source versions must be plainly marked as such, and must not be 16 // misrepresented as being the original software. 17 // 3. This notice may not be removed or altered from any source distribution. 18 19 #ifdef LIBCPP_MUSL_STATIC 20 #define __GLIBC_PREREQ(x,y) 0 21 #endif 22 23 #include <stdlib.h> 24 #include <stdio.h> 25 #include <errno.h> 26 #include <string.h> 27 #include <assert.h> 28 #include <stdint.h> 29 #include <ctype.h> 30 31 #include <unistd.h> 32 33 #include <algorithm> 34 #include <iterator> 35 36 /** 37 * TODO: 38 * * check that ENTRY_SEP works as expected; 39 * * fix the fact input needs a "\\n" at end of last line for it to be merged; 40 * * UTF-8 support (field separators); 41 * * -v/--version option; 42 * * allow to customize the memory allocation scheme at runtime; 43 * * allow to not print twice merged fields; 44 * * allow to set verbosity on stderr; 45 * * remove hard-coded limit of UINT16_MAX - 1 for fields start/stop positions; 46 * * print as many lines as there where duplicates? 47 * 48 * Coding rules: 49 * * const affect what is before it, so it must follow the type; 50 **/ 51 52 #include <vector.hpp> 53 #include <optparser.hpp> 54 55 class field_marker; 56 typedef vector<char> line_cache; 57 typedef vector<field_marker> field_marker_t; 58 59 class field_marker 60 { 61 uint16_t m_start = UINT16_MAX, m_end = UINT16_MAX; 62 63 public: 64 bool ignore( void ) const 65 { 66 return m_start == m_end && m_start == UINT16_MAX; 67 } 68 69 void define( uint16_t start, uint16_t end ) 70 { 71 assert( end >= start && start != UINT8_MAX && end != UINT8_MAX ); 72 m_start = start; m_end = end; 73 } 74 75 uint16_t start( void ) const 76 { 77 assert( !ignore() ); 78 return m_start; 79 } 80 81 uint16_t end( void ) const 82 { 83 assert( !ignore() ); 84 return m_end; 85 } 86 }; 87 88 bool allocate_markers( 89 char const * const FIELDS, 90 field_marker_t& field_cache 91 ); 92 93 void print_help( char const* pgm, FILE* target, opt_desc_t const* start, opt_desc_t const* end ); 94 95 int main( int argc, char **argv ) 96 { 97 char const * SEP_START = getenv( "FIELD_SEP" ); 98 char const * SEP_ENTRY = getenv( "ENTRY_SEP" ); 99 char const * FIELDS = getenv( "FIELDS" ); 100 if( !SEP_START ) 101 { 102 SEP_START = " \t"; 103 } 104 if( !SEP_ENTRY ) 105 { 106 SEP_ENTRY = "\n"; 107 } 108 109 opt_desc_t opts[] = 110 { 111 STD_HELP, 112 #ifndef NO_CMDLINE 113 { "field_sep", "field separator" , 't', 0, &SEP_START, set<char const**>, show<char*> }, 114 { "entry_sep", "entry separator" , 'l', 0, &SEP_ENTRY, set<char const**>, show<char*> }, 115 { "fields" , "fields to compare", 'f', 0, &FIELDS , set<char const**>, show<char*> }, 116 #endif 117 }; 118 auto b_opts = std::begin( opts ); 119 auto e_opts = std::end( opts ); 120 121 char **arg = &argv[1]; assert( argc > 0 ); 122 for( int iarg = 1; iarg != argc; ++iarg, ++arg ) 123 { 124 auto error = parse_cmd_opt( *arg, b_opts, e_opts ); 125 switch( error ) 126 { 127 case MAX_COUNT: 128 arg_warning( *arg, error ); 129 break; 130 case NONE: 131 case IGNORED: 132 break; 133 case SET_NO_VAL: 134 case SET_VAL_IGN: 135 case SET_FAIL: 136 case BAD_ARGS: 137 case BAD_SETTER: 138 print_help( argv[0], stderr, b_opts, e_opts ); 139 arg_error( *arg, error ); 140 return EXIT_FAILURE; 141 } 142 } 143 144 if( opts[0].count ) 145 { 146 print_help( argv[0], stdout, b_opts, e_opts ); 147 return EXIT_SUCCESS; 148 } 149 150 if( !FIELDS ) 151 { 152 fputs( "ERROR: FIELDS is not defined\n", stderr ); 153 return EXIT_FAILURE; 154 } 155 156 if( strlen( FIELDS ) == 0 ) 157 { 158 fputs( "ERROR: FIELDS is empty\n", stderr ); 159 return EXIT_FAILURE; 160 } 161 162 field_marker_t field_cache; 163 if( allocate_markers( FIELDS, field_cache ) ) 164 { 165 return EXIT_FAILURE; 166 } 167 168 size_t buf_sz = 2048; 169 char* buf = nullptr; 170 171 // allocating a cache of at least 16 bytes. 172 // Note: I don't see how merging lines smaller than 16 bytes can be useful 173 // also, not even enough mem for that would indicate bigger problems... 174 while( !buf && buf_sz >= 32 ) 175 { 176 buf_sz /= 2; 177 char* nbuf = static_cast<char*>( realloc( buf, buf_sz ) ); 178 if( !nbuf ) 179 { 180 free( buf ); 181 return EXIT_FAILURE; 182 } 183 buf = nbuf; 184 } 185 186 if( !buf ) 187 { 188 fprintf( stderr, "ERROR: malloc %s(%d)\n", strerror( errno ), errno ); 189 free( buf ); 190 return EXIT_FAILURE; 191 } 192 193 bool fetch = true; 194 line_cache last_line; 195 char const * const SEP_END = SEP_START + strlen( SEP_START ); 196 while( !feof( stdin ) ) 197 { 198 if( !fgets( buf, static_cast<int>( buf_sz ), stdin ) ) 199 { 200 free( buf ); 201 buf = nullptr; 202 if( !feof( stdin ) ) 203 { 204 fprintf( stderr, "ERROR: fgets %s(%d)\n", strerror( errno ), errno ); 205 free( buf ); 206 return EXIT_FAILURE; 207 } 208 break; 209 } 210 211 size_t str_sz = strlen( buf ); 212 if( str_sz == buf_sz - 1 && buf[str_sz] != '\n' && !feof( stdin ) ) 213 { 214 fprintf( stderr, "ERROR: buffer too small for some lines\n" ); 215 free( buf ); 216 return EXIT_FAILURE; 217 } 218 219 if( !fetch ) 220 { 221 char const* dst_ptr = buf; 222 for( size_t i = 0; i < field_cache.size(); ++i ) 223 { 224 field_marker const& src = field_cache[i]; 225 if( src.ignore() ) 226 { 227 char const *sep = SEP_END; 228 while( sep == SEP_END ) 229 { 230 ++dst_ptr; 231 sep = SEP_START; 232 for( ; sep != SEP_END && *dst_ptr && *dst_ptr != *sep; ++sep ){} 233 } 234 ++dst_ptr; 235 continue; 236 } 237 char const * src_ptr = last_line.data() + src.start(); 238 size_t len = src.end() - src.start(); 239 if( len > buf_sz - static_cast<size_t>( dst_ptr - buf ) ) 240 { 241 fetch = true; 242 break; 243 } 244 245 if( 0 != memcmp( dst_ptr, src_ptr, len ) ) 246 { 247 fetch = true; 248 break; 249 } 250 251 char last = dst_ptr[len]; 252 char const * sep_ = SEP_START; 253 assert( sep_ != nullptr ); 254 while( 0 != *sep_ && *sep_ != last && *SEP_ENTRY != last ) 255 { 256 ++sep_; 257 } 258 if( 0 == *sep_ ) 259 { 260 fetch = true; 261 break; 262 } 263 dst_ptr += len; 264 assert( dst_ptr >= buf ); 265 } 266 267 fputc( fetch ? *SEP_ENTRY : * SEP_START, stdout ); 268 } 269 270 last_line.assign( buf, buf + str_sz ); 271 if( last_line.back() == *SEP_ENTRY ) 272 { 273 last_line.back() = 0; 274 } 275 276 if( fetch ) 277 { 278 line_cache::iterator start = last_line.begin(); 279 line_cache::iterator cache_end = last_line.end(); 280 size_t field_index = 0; 281 while( start != cache_end && field_index < field_cache.size() ) 282 { 283 auto end = last_line.end(); 284 if( last_line.back() == 0 ) 285 { 286 --end; 287 } 288 line_cache::iterator it = std::find_first_of 289 ( 290 start, end, 291 SEP_START, SEP_END 292 ); 293 if( !field_cache[field_index].ignore() ) 294 { 295 field_cache[field_index].define( 296 static_cast<uint16_t>( start - last_line.begin() ), 297 static_cast<uint16_t>( it - last_line.begin() ) 298 ); 299 } 300 start = it + 1; 301 ++field_index; 302 } 303 fetch = false; 304 } 305 fputs( last_line.data(), stdout ); 306 } 307 fputc( *SEP_ENTRY, stdout ); 308 free( buf ); 309 return EXIT_SUCCESS; 310 } 311 312 bool allocate_markers( 313 char const * const FIELDS, 314 field_marker_t& field_cache 315 ) 316 { 317 field_cache.reserve( UINT8_MAX ); //255 fields should fit most cases 318 size_t last_field = 0; 319 320 char const * fields = FIELDS - 1; 321 do 322 { 323 ++fields; 324 if( isdigit( *fields ) ) 325 { 326 last_field = last_field * 10 + static_cast<size_t>( *fields - '0' ); 327 } 328 else if( *fields == ',' || *fields == 0 ) 329 { 330 size_t max = std::max( field_cache.size(), last_field ); 331 field_cache.resize( max ); 332 field_cache[last_field - 1].define( 0, 0 ); 333 last_field = 0; 334 } 335 else 336 { 337 fputs( "ERROR: FIELDS contains illegal characters\n", stderr ); 338 return true; 339 } 340 }while( *fields ); 341 field_cache.shrink_to_fit(); 342 return false; 343 } 344 345 void print_help( char const* pgm, FILE* target, opt_desc_t const* start, opt_desc_t const* end ) 346 { 347 uint16_t w, h; 348 if( term_ch_size( &w, &h, STDOUT_FILENO ) ) 349 { 350 w = 80; 351 fputs( "TODO: could not get terminal's size\n", stderr ); 352 } 353 fputs( "Usage: ", target ); 354 fputs( pgm, target ); 355 fputs( " [OPTIONS]\nDescription:\n", target ); 356 char desc[] = 357 "This program reads stdin and when consecutive lines have specific " 358 "fields all containing the same value, prints them replacing the " 359 "newline character by the 1st character in FIELD_SEP.\n" 360 "Fields are delimited by the FIELD_SEP environment variable. If not " 361 "defined, \" \\t\" is used instead (see isblank(3)).\n" 362 "Fields to use are defined by the environment variable FIELDS, which " 363 "only use unsigned decimal integers separated by commas, other " 364 "characters makes the value invalid.\n" 365 "If FIELDS is not defined or invalid, exits with an error.\n" 366 "Empty field indexes (\"1,,3\") are ignored (will resolve in \"1,3\").\n" 367 "Do not work if input is not in line mode.\n" 368 "Line separator is defined by ENTRY_SEP, or \"\\n\" if not defined.\n" 369 ; 370 if( indent_txt( desc, std::end( desc ), 1, w, 8, target ) ) 371 { 372 fputs( "TODO: handle indent_txt's errors\n", stderr ); 373 } 374 375 fputs( "Options:\n", target ); 376 print_opts( target, start, end ); 377 } 378
