meillo@14: /* cut - remove parts of lines of files meillo@14: Copyright (C) 1984 by David M. Ihnat meillo@14: meillo@14: This program is a total rewrite of the Bell Laboratories Unix(Tm) meillo@14: command of the same name, as of System V. It contains no proprietary meillo@14: code, and therefore may be used without violation of any proprietary meillo@14: agreements whatsoever. However, you will notice that the program is meillo@14: copyrighted by me. This is to assure the program does *not* fall meillo@14: into the public domain. Thus, I may specify just what I am now: meillo@14: This program may be freely copied and distributed, provided this notice meillo@14: remains; it may not be sold for profit without express written consent of meillo@14: the author. meillo@14: Please note that I recreated the behavior of the Unix(Tm) 'cut' command meillo@14: as faithfully as possible; however, I haven't run a full set of regression meillo@14: tests. Thus, the user of this program accepts full responsibility for any meillo@14: effects or loss; in particular, the author is not responsible for any losses, meillo@14: explicit or incidental, that may be incurred through use of this program. meillo@14: meillo@14: I ask that any bugs (and, if possible, fixes) be reported to me when meillo@14: possible. -David Ihnat (312) 784-4544 ignatz@homebru.chi.il.us meillo@14: meillo@14: POSIX changes, bug fixes, long-named options, and cleanup meillo@14: by David MacKenzie . meillo@14: meillo@14: Options: meillo@14: --bytes=byte-list meillo@14: -b byte-list Print only the bytes in positions listed meillo@14: in BYTE-LIST. meillo@14: Tabs and backspaces are treated like any meillo@14: other character; they take up 1 byte. meillo@14: meillo@14: --characters=character-list meillo@14: -c character-list Print only characters in positions listed meillo@14: in CHARACTER-LIST. meillo@14: The same as -b for now, but meillo@14: internationalization will change that. meillo@14: Tabs and backspaces are treated like any meillo@14: other character; they take up 1 character. meillo@14: meillo@14: --fields=field-list meillo@14: -f field-list Print only the fields listed in FIELD-LIST. meillo@14: Fields are separated by a TAB by default. meillo@14: meillo@14: --delimiter=delim meillo@14: -d delim For -f, fields are separated by the first meillo@14: character in DELIM instead of TAB. meillo@14: meillo@14: -n Do not split multibyte chars (no-op for now). meillo@14: meillo@14: --only-delimited meillo@14: -s For -f, do not print lines that do not contain meillo@14: the field separator character. meillo@14: meillo@14: The BYTE-LIST, CHARACTER-LIST, and FIELD-LIST are one or more numbers meillo@14: or ranges separated by commas. The first byte, character, and field meillo@14: are numbered 1. meillo@14: meillo@14: A FILE of `-' means standard input. */ meillo@14: meillo@14: #define _GNU_SOURCE meillo@14: #include meillo@14: #ifndef isblank meillo@14: #define isblank(c) ((c) == ' ' || (c) == '\t') meillo@14: #endif meillo@14: #include meillo@14: #include meillo@14: #include meillo@14: #include "system.h" meillo@14: meillo@14: #ifdef isascii meillo@14: #define ISDIGIT(c) (isascii ((c)) && isdigit ((c))) meillo@14: #else meillo@14: #define ISDIGIT(c) (isdigit ((c))) meillo@14: #endif meillo@14: meillo@14: char *xmalloc (); meillo@14: char *xrealloc (); meillo@14: int set_fields (); meillo@14: int cut_file (); meillo@14: void cut_stream (); meillo@14: void cut_bytes (); meillo@14: void cut_fields (); meillo@14: void enlarge_line (); meillo@14: void error (); meillo@14: void invalid_list (); meillo@14: void usage (); meillo@14: meillo@14: /* The number of elements allocated for the input line meillo@14: and the byte or field number. meillo@14: Enlarged as necessary. */ meillo@14: int line_size; meillo@14: meillo@14: /* Processed output buffer. */ meillo@14: char *outbuf; meillo@14: meillo@14: /* Where to save next char to output. */ meillo@14: char *outbufptr; meillo@14: meillo@14: /* Raw line buffer for field mode. */ meillo@14: char *inbuf; meillo@14: meillo@14: /* Where to save next input char. */ meillo@14: char *inbufptr; meillo@14: meillo@14: /* What can be done about a byte or field. */ meillo@14: enum field_action meillo@14: { meillo@14: field_omit, meillo@14: field_output meillo@14: }; meillo@14: meillo@14: /* In byte mode, which bytes to output. meillo@14: In field mode, which `delim'-separated fields to output. meillo@14: Both bytes and fields are numbered starting with 1, meillo@14: so the first element of `fields' is unused. */ meillo@14: enum field_action *fields; meillo@14: meillo@14: enum operating_mode meillo@14: { meillo@14: undefined_mode, meillo@14: meillo@14: /* Output characters that are in the given bytes. */ meillo@14: byte_mode, meillo@14: meillo@14: /* Output the given delimeter-separated fields. */ meillo@14: field_mode meillo@14: }; meillo@14: meillo@14: enum operating_mode operating_mode; meillo@14: meillo@14: /* If nonzero, meillo@14: for field mode, do not output lines containing no delimeter characters. */ meillo@14: int delimited_lines_only; meillo@14: meillo@14: /* The delimeter character for field mode. */ meillo@14: unsigned char delim; meillo@14: meillo@14: /* Nonzero if we have ever read standard input. */ meillo@14: int have_read_stdin; meillo@14: meillo@14: /* The name this program was run with. */ meillo@14: char *program_name; meillo@14: meillo@14: struct option longopts[] = meillo@14: { meillo@14: {"bytes", 1, 0, 'b'}, meillo@14: {"characters", 1, 0, 'c'}, meillo@14: {"fields", 1, 0, 'f'}, meillo@14: {"delimiter", 1, 0, 'd'}, meillo@14: {"only-delimited", 0, 0, 's'}, meillo@14: {0, 0, 0, 0} meillo@14: }; meillo@14: meillo@14: void meillo@14: main (argc, argv) meillo@14: int argc; meillo@14: char **argv; meillo@14: { meillo@14: int optc, exit_status = 0; meillo@14: meillo@14: program_name = argv[0]; meillo@14: meillo@14: line_size = 512; meillo@14: operating_mode = undefined_mode; meillo@14: delimited_lines_only = 0; meillo@14: delim = '\0'; meillo@14: have_read_stdin = 0; meillo@14: meillo@14: fields = (enum field_action *) meillo@14: xmalloc (line_size * sizeof (enum field_action)); meillo@14: outbuf = (char *) xmalloc (line_size); meillo@14: inbuf = (char *) xmalloc (line_size); meillo@14: meillo@14: for (optc = 0; optc < line_size; optc++) meillo@14: fields[optc] = field_omit; meillo@14: meillo@14: while ((optc = getopt_long (argc, argv, "b:c:d:f:ns", longopts, (int *) 0)) meillo@14: != EOF) meillo@14: { meillo@14: switch (optc) meillo@14: { meillo@14: case 'b': meillo@14: case 'c': meillo@14: /* Build the byte list. */ meillo@14: if (operating_mode != undefined_mode) meillo@14: usage (); meillo@14: operating_mode = byte_mode; meillo@14: if (set_fields (optarg) == 0) meillo@14: error (2, 0, "no fields given"); meillo@14: break; meillo@14: meillo@14: case 'f': meillo@14: /* Build the field list. */ meillo@14: if (operating_mode != undefined_mode) meillo@14: usage (); meillo@14: operating_mode = field_mode; meillo@14: if (set_fields (optarg) == 0) meillo@14: error (2, 0, "no fields given"); meillo@14: break; meillo@14: meillo@14: case 'd': meillo@14: /* New delimiter. */ meillo@14: if (optarg[0] == '\0') meillo@14: error (2, 0, "no delimiter given"); meillo@14: if (optarg[1] != '\0') meillo@14: error (2, 0, "delimiter must be a single character"); meillo@14: delim = optarg[0]; meillo@14: break; meillo@14: meillo@14: case 'n': meillo@14: break; meillo@14: meillo@14: case 's': meillo@14: delimited_lines_only++; meillo@14: break; meillo@14: meillo@14: default: meillo@14: usage (); meillo@14: } meillo@14: } meillo@14: meillo@14: if (operating_mode == undefined_mode) meillo@14: usage (); meillo@14: meillo@14: if ((delimited_lines_only || delim != '\0') && operating_mode != field_mode) meillo@14: usage (); meillo@14: meillo@14: if (delim == '\0') meillo@14: delim = '\t'; meillo@14: meillo@14: if (optind == argc) meillo@14: exit_status |= cut_file ("-"); meillo@14: else meillo@14: for (; optind < argc; optind++) meillo@14: exit_status |= cut_file (argv[optind]); meillo@14: meillo@14: if (have_read_stdin && fclose (stdin) == EOF) meillo@14: { meillo@14: error (0, errno, "-"); meillo@14: exit_status = 1; meillo@14: } meillo@14: if (ferror (stdout) || fclose (stdout) == EOF) meillo@14: error (1, 0, "write error"); meillo@14: meillo@14: exit (exit_status); meillo@14: } meillo@14: meillo@14: /* Select for printing the positions in `fields' that are listed in meillo@14: byte or field specification FIELDSTR. FIELDSTR should be meillo@14: composed of one or more numbers or ranges of numbers, separated by meillo@14: blanks or commas. Incomplete ranges may be given: `-m' means meillo@14: `1-m'; `n-' means `n' through end of line or last field. meillo@14: meillo@14: Return the number of fields selected. */ meillo@14: meillo@14: int meillo@14: set_fields (fieldstr) meillo@14: char *fieldstr; meillo@14: { meillo@14: int initial = 1; /* Value of first number in a range. */ meillo@14: int dash_found = 0; /* Nonzero if a '-' is found in this field. */ meillo@14: int value = 0; /* If nonzero, a number being accumulated. */ meillo@14: int fields_selected = 0; /* Number of fields selected so far. */ meillo@14: /* If nonzero, index of first field in a range that goes to end of line. */ meillo@14: int eol_range_start = 0; meillo@14: meillo@14: for (;;) meillo@14: { meillo@14: if (*fieldstr == '-') meillo@14: { meillo@14: /* Starting a range. */ meillo@14: if (dash_found) meillo@14: invalid_list (); meillo@14: dash_found++; meillo@14: fieldstr++; meillo@14: meillo@14: if (value) meillo@14: { meillo@14: if (value >= line_size) meillo@14: enlarge_line (value); meillo@14: initial = value; meillo@14: value = 0; meillo@14: } meillo@14: else meillo@14: initial = 1; meillo@14: } meillo@14: else if (*fieldstr == ',' || isblank (*fieldstr) || *fieldstr == '\0') meillo@14: { meillo@14: /* Ending the string, or this field/byte sublist. */ meillo@14: if (dash_found) meillo@14: { meillo@14: dash_found = 0; meillo@14: meillo@14: /* A range. Possibilites: -n, m-n, n-. meillo@14: In any case, `initial' contains the start of the range. */ meillo@14: if (value == 0) meillo@14: { meillo@14: /* `n-'. From `initial' to end of line. */ meillo@14: eol_range_start = initial; meillo@14: fields_selected++; meillo@14: } meillo@14: else meillo@14: { meillo@14: /* `m-n' or `-n' (1-n). */ meillo@14: if (value < initial) meillo@14: invalid_list (); meillo@14: meillo@14: if (value >= line_size) meillo@14: enlarge_line (value); meillo@14: meillo@14: /* Is there already a range going to end of line? */ meillo@14: if (eol_range_start != 0) meillo@14: { meillo@14: /* Yes. Is the new sequence already contained meillo@14: in the old one? If so, no processing is meillo@14: necessary. */ meillo@14: if (initial < eol_range_start) meillo@14: { meillo@14: /* No, the new sequence starts before the meillo@14: old. Does the old range going to end of line meillo@14: extend into the new range? */ meillo@14: if (eol_range_start < value) meillo@14: /* Yes. Simply move the end of line marker. */ meillo@14: eol_range_start = initial; meillo@14: else meillo@14: { meillo@14: /* No. A simple range, before and disjoint from meillo@14: the range going to end of line. Fill it. */ meillo@14: for (; initial <= value; initial++) meillo@14: fields[initial] = field_output; meillo@14: } meillo@14: meillo@14: /* In any case, some fields were selected. */ meillo@14: fields_selected++; meillo@14: } meillo@14: } meillo@14: else meillo@14: { meillo@14: /* There is no range going to end of line. */ meillo@14: for (; initial <= value; initial++) meillo@14: fields[initial] = field_output; meillo@14: fields_selected++; meillo@14: } meillo@14: value = 0; meillo@14: } meillo@14: } meillo@14: else if (value != 0) meillo@14: { meillo@14: /* A simple field number, not a range. */ meillo@14: if (value >= line_size) meillo@14: enlarge_line (value); meillo@14: meillo@14: fields[value] = field_output; meillo@14: value = 0; meillo@14: fields_selected++; meillo@14: } meillo@14: meillo@14: if (*fieldstr == '\0') meillo@14: { meillo@14: /* If there was a range going to end of line, fill the meillo@14: array from the end of line point. */ meillo@14: if (eol_range_start) meillo@14: for (initial = eol_range_start; initial < line_size; initial++) meillo@14: fields[initial] = field_output; meillo@14: meillo@14: return fields_selected; meillo@14: } meillo@14: meillo@14: fieldstr++; meillo@14: } meillo@14: else if (ISDIGIT (*fieldstr)) meillo@14: { meillo@14: value = 10 * value + *fieldstr - '0'; meillo@14: fieldstr++; meillo@14: } meillo@14: else meillo@14: invalid_list (); meillo@14: } meillo@14: } meillo@14: meillo@14: /* Process file FILE to standard output. meillo@14: Return 0 if successful, 1 if not. */ meillo@14: meillo@14: int meillo@14: cut_file (file) meillo@14: char *file; meillo@14: { meillo@14: FILE *stream; meillo@14: meillo@14: if (!strcmp (file, "-")) meillo@14: { meillo@14: have_read_stdin = 1; meillo@14: stream = stdin; meillo@14: } meillo@14: else meillo@14: { meillo@14: stream = fopen (file, "r"); meillo@14: if (stream == NULL) meillo@14: { meillo@14: error (0, errno, "%s", file); meillo@14: return 1; meillo@14: } meillo@14: } meillo@14: meillo@14: cut_stream (stream); meillo@14: meillo@14: if (ferror (stream)) meillo@14: { meillo@14: error (0, errno, "%s", file); meillo@14: return 1; meillo@14: } meillo@14: if (!strcmp (file, "-")) meillo@14: clearerr (stream); /* Also clear EOF. */ meillo@14: else if (fclose (stream) == EOF) meillo@14: { meillo@14: error (0, errno, "%s", file); meillo@14: return 1; meillo@14: } meillo@14: return 0; meillo@14: } meillo@14: meillo@14: void meillo@14: cut_stream (stream) meillo@14: FILE *stream; meillo@14: { meillo@14: if (operating_mode == byte_mode) meillo@14: cut_bytes (stream); meillo@14: else meillo@14: cut_fields (stream); meillo@14: } meillo@14: meillo@14: /* Print the file open for reading on stream STREAM meillo@14: with the bytes marked `field_omit' in `fields' removed from each line. */ meillo@14: meillo@14: void meillo@14: cut_bytes (stream) meillo@14: FILE *stream; meillo@14: { meillo@14: register int c; /* Each character from the file. */ meillo@14: int doneflag = 0; /* Nonzero if EOF reached. */ meillo@14: int char_count; /* Number of chars in the line so far. */ meillo@14: meillo@14: while (doneflag == 0) meillo@14: { meillo@14: /* Start processing a line. */ meillo@14: outbufptr = outbuf; meillo@14: char_count = 0; meillo@14: meillo@14: do meillo@14: { meillo@14: c = getc (stream); meillo@14: if (c == EOF) meillo@14: { meillo@14: doneflag++; meillo@14: break; meillo@14: } meillo@14: meillo@14: /* If this character is to be sent, stow it in the outbuffer. */ meillo@14: meillo@14: if (++char_count == line_size - 1) meillo@14: enlarge_line (char_count); meillo@14: meillo@14: if (fields[char_count] == field_output || c == '\n') meillo@14: *outbufptr++ = c; meillo@14: } meillo@14: while (c != '\n'); meillo@14: meillo@14: if (char_count) meillo@14: fwrite (outbuf, sizeof (char), outbufptr - outbuf, stdout); meillo@14: } meillo@14: } meillo@14: meillo@14: /* Print the file open for reading on stream STREAM meillo@14: with the fields marked `field_omit' in `fields' removed from each line. meillo@14: All characters are initially stowed in the raw input buffer, until meillo@14: at least one field has been found. */ meillo@14: meillo@14: void meillo@14: cut_fields (stream) meillo@14: FILE *stream; meillo@14: { meillo@14: register int c; /* Each character from the file. */ meillo@14: int doneflag = 0; /* Nonzero if EOF reached. */ meillo@14: int char_count; /* Number of chars in line before any delim. */ meillo@14: int fieldfound; /* Nonzero if any fields to print found. */ meillo@14: int curr_field; /* Current index in `fields'. */ meillo@14: meillo@14: while (doneflag == 0) meillo@14: { meillo@14: char_count = 0; meillo@14: fieldfound = 0; meillo@14: curr_field = 1; meillo@14: outbufptr = outbuf; meillo@14: inbufptr = inbuf; meillo@14: meillo@14: do meillo@14: { meillo@14: c = getc (stream); meillo@14: if (c == EOF) meillo@14: { meillo@14: doneflag++; meillo@14: break; meillo@14: } meillo@14: meillo@14: if (fields[curr_field] == field_output && c != '\n') meillo@14: { meillo@14: /* Working on a field. It, and its terminating meillo@14: delimiter, go only into the processed buffer. */ meillo@14: fieldfound = 1; meillo@14: if (outbufptr - outbuf == line_size - 2) meillo@14: enlarge_line (outbufptr - outbuf); meillo@14: *outbufptr++ = c; meillo@14: } meillo@14: else if (fieldfound == 0) meillo@14: { meillo@14: if (++char_count == line_size - 1) meillo@14: enlarge_line (char_count); meillo@14: *inbufptr++ = c; meillo@14: } meillo@14: meillo@14: if (c == delim && ++curr_field == line_size - 1) meillo@14: enlarge_line (curr_field); meillo@14: } meillo@14: while (c != '\n'); meillo@14: meillo@14: if (fieldfound) meillo@14: { meillo@14: /* Something was found. Print it. */ meillo@14: if (outbufptr[-1] == delim) meillo@14: --outbufptr; /* Suppress trailing delimiter. */ meillo@14: meillo@14: fwrite (outbuf, sizeof (char), outbufptr - outbuf, stdout); meillo@14: if (c == '\n') meillo@14: putc (c, stdout); meillo@14: } meillo@14: else if (!delimited_lines_only && char_count) meillo@14: /* A line with some characters, no delimiters, and no meillo@14: suppression. Print it. */ meillo@14: fwrite (inbuf, sizeof (char), inbufptr - inbuf, stdout); meillo@14: } meillo@14: } meillo@14: meillo@14: /* Extend the buffers to accomodate at least NEW_SIZE characters. */ meillo@14: meillo@14: void meillo@14: enlarge_line (new_size) meillo@14: int new_size; meillo@14: { meillo@14: char *newp; meillo@14: int i; meillo@14: meillo@14: new_size += 256; /* Leave some room to grow. */ meillo@14: meillo@14: fields = (enum field_action *) meillo@14: xrealloc (fields, new_size * sizeof (enum field_action)); meillo@14: meillo@14: newp = (char *) xrealloc (outbuf, new_size); meillo@14: outbufptr += newp - outbuf; meillo@14: outbuf = newp; meillo@14: meillo@14: newp = (char *) xrealloc (inbuf, new_size); meillo@14: inbufptr += newp - inbuf; meillo@14: inbuf = newp; meillo@14: meillo@14: for (i = line_size; i < new_size; i++) meillo@14: fields[i] = field_omit; meillo@14: line_size = new_size; meillo@14: } meillo@14: meillo@14: void meillo@14: invalid_list () meillo@14: { meillo@14: error (2, 0, "invalid byte or field list"); meillo@14: } meillo@14: meillo@14: void meillo@14: usage () meillo@14: { meillo@14: fprintf (stderr, "\ meillo@14: Usage: %s {-b byte-list,--bytes=byte-list} [-n] [file...]\n\ meillo@14: %s {-c character-list,--characters=character-list} [file...]\n\ meillo@14: %s {-f field-list,--fields=field-list} [-d delim] [-s]\n\ meillo@14: [--delimiter=delim] [--only-delimited] [file...]\n", meillo@14: program_name, program_name, program_name); meillo@14: exit (2); meillo@14: }