1 /* cut.c - print selected ranges from a file
2  *
3  * Copyright 2016 Rob Landley <rob@landley.net>
4  *
5  * http://pubs.opengroup.org/onlinepubs/9699919799/utilities/cut.html
6  *
7  * Deviations from posix: added -DF. We can only accept 512 selections, and
8  * "-" counts as start to end. Using spaces to separate a comma-separated list
9  * is silly and inconsistent with dd, ps, cp, and mount.
10  *
11  * todo: -n, -s with -c
12 
13 USE_CUT(NEWTOY(cut, "b*|c*|f*|F*|C*|O(output-delimiter):d:sDn[!cbf]", TOYFLAG_USR|TOYFLAG_BIN))
14 
15 config CUT
16   bool "cut"
17   default y
18   help
19     usage: cut [-Ds] [-bcfF LIST] [-dO DELIM] [FILE...]
20 
21     Print selected parts of lines from each FILE to standard output.
22 
23     Each selection LIST is comma separated, either numbers (counting from 1)
24     or dash separated ranges (inclusive, with X- meaning to end of line and -X
25     from start). By default selection ranges are sorted and collated, use -D
26     to prevent that.
27 
28     -b	Select bytes
29     -c	Select UTF-8 characters
30     -C	Select unicode columns
31     -d	Use DELIM (default is TAB for -f, run of whitespace for -F)
32     -D	Don't sort/collate selections or match -fF lines without delimiter
33     -f	Select fields (words) separated by single DELIM character
34     -F	Select fields separated by DELIM regex
35     -O	Output delimiter (default one space for -F, input delim for -f)
36     -s	Skip lines without delimiters
37 */
38 #define FOR_cut
39 #include "toys.h"
40 
GLOBALS(
  // -d (input delimiter) and -O (output delimiter) arguments. cut_main()
  // fills in defaults for whichever is still unset.
  char *d, *O;
  // One selection list per selection type, indexed in "CFfcb" order
  // (see cut_main()).
  struct arg_list *select[5]; // we treat them the same, so loop through

  // Number of (start, end) selections currently stored in toybuf
  int pairs;
  // -F delimiter regex, compiled by cut_main() from TT.d
  regex_t reg;
)
48 
49 
// Apply selections to an input line, producing output
//
// pline: pointer to the line buffer; a NULL pline is ignored.
// len: number of bytes in *pline. A trailing newline is overwritten with NUL
//   and dropped from len before any selection is applied.
//
// Walks the TT.pairs (start, end) selections stored in toybuf and writes the
// chosen bytes (-b), columns (-C), characters (-c) or fields (-f/-F) to
// stdout, then a newline. With -s, a -f/-F line that has no delimiter
// produces no output at all (not even the newline).
static void cut_line(char **pline, long len)
{
  unsigned *pairs = (void *)toybuf;
  char *line;
  int i, j;

  // NOTE(review): presumably loopfiles_lines() calls with NULL at end of
  // input -- confirm against the library.
  if (!pline) return;
  line = *pline;
  // Strip the newline so the code below can treat line as a NUL terminated
  // string of len bytes.
  if (len && line[len-1]=='\n') line[--len] = 0;

  // Loop through selections
  for (i=0; i<TT.pairs; i++) {
    unsigned start = pairs[2*i], end = pairs[(2*i)+1], count;
    char *s = line, *ss;

    // input: start/end position, count=difference between them
    // output: s = start of string, len = bytes to output

    // Selections are 1-based (0 means "from the start"); make start 0-based.
    if (start) start--;
    // This bound is checked against the byte length for every selection
    // type, before the type-specific code below runs.
    if (start>=len) continue;
    // An open ended or oversized end is clamped to the line length.
    if (!end || end>len) end = len;
    count = end-start;

    // Find start and end of output string for the relevant selection type
    if (toys.optflags&FLAG_b) s += start;
    else if (toys.optflags&FLAG_C) {
      // crunch_str() currently assumes that combining characters get
      // escaped, to provide an unambiguous visual representation.
      // This assumes the input string is null terminated.
      if (start) crunch_str(&s, start, 0, 0, 0);
      if (!*s) continue;
      start = s-line;
      ss = s;
      crunch_str(&ss, count, 0, 0, 0);
      // Convert the column count into a byte count for fwrite() below
      count = ss-s;

    } else if (toys.optflags&FLAG_c) {
      wchar_t wc;
      char *sss;

      // Find start
      ss = line+len;
      while (start && s<ss) {
        // A non-negative return counts as one character; on a negative
        // return we still advance a byte without counting it.
        // NOTE(review): len is the whole line length, not the bytes left
        // after s -- confirm utf8towc() stops at the NUL terminator.
        if (0<=(j = utf8towc(&wc, s, len))) start--;
        s += (j<1) ? 1 : j;
      }
      // Ran out of line before reaching the first selected character
      if (s == ss) continue;

      // Find end
      end = count;
      sss = s;
      while (end && sss<ss) {
        if (0<=(j = utf8towc(&wc, sss, len))) end--;
        sss += (j<1) ? 1 : j;
      }
      count = sss-s;
    } else {
      regmatch_t match;

      // Loop through skipping appropriate number of fields
      // Pass j=0 skips "start" delimiters to find where output begins (and
      // remembers the original start in "end"); pass j=1 skips "count"
      // delimiters from there to find where output stops.
      for (j = 0; j<2; j++) {
        ss = s;
        if (j) start = count;
        else end = start;
        while (*ss && start) {
          if (toys.optflags&FLAG_f) {
            // -f: any byte found in TT.d is a delimiter
            if (!strchr(TT.d, *ss++)) continue;
            // On the end pass, back up so the closing delimiter is not output
            if (!--start && j) ss--;
          } else {
            // -F: no further regex match means the field runs to end of line
            if (regexec(&TT.reg, ss, 1, &match, REG_NOTBOL|REG_NOTEOL)) {
              ss = line+len;
              continue;
            }
            if (!match.rm_eo) break; // zero length match == no delimiter
            // Stop before the closing delimiter on the end pass, otherwise
            // step past the delimiter.
            ss += (!--start && j) ? match.rm_so : match.rm_eo;
          }
        }
        // Hit end of line while still looking for the start: leave j at 0
        if (!j && !*(s = ss)) break;
      }

      // If we never encountered even one separator, print whole line (posix!)
      // (start still equal to the saved value means no delimiter was counted)
      if (!j && end == start) {
        // -D: output nothing for this line but the trailing newline
        if (toys.optflags&FLAG_D) break;
        // -s: skip the line entirely, including the newline
        if (toys.optflags&FLAG_s) return;
        fwrite(line, len, 1, stdout);
        break;
      } else if (!*s) continue;
      count = ss-s;
    }
    // Output delimiter goes before every selection except the first one in
    // the list (i counts selections, including ones skipped above).
    if (i && TT.O) fputs(TT.O, stdout);
    fwrite(s, count, 1, stdout);
  }
  xputc('\n');
}
145 
// qsort() comparison for (start, end) selection pairs: order by start, and
// break ties by end. Returns -1, 0 or 1.
static int compar(unsigned *a, unsigned *b)
{
  int k;

  // Compare slot by slot, stopping at the first difference
  for (k = 0; k<2; k++) if (a[k] != b[k]) return (a[k]<b[k]) ? -1 : 1;

  return 0;
}
155 
156 // parse A or A-B or A- or -B
get_range(void * data,char * str,int len)157 static char *get_range(void *data, char *str, int len)
158 {
159   char *end = str;
160   unsigned *pairs = (void *)toybuf, i;
161 
162   // Using toybuf[] to store ranges means we can have 512 selections max.
163   if (TT.pairs == sizeof(toybuf)/sizeof(int)) perror_exit("select limit");
164   pairs += 2*TT.pairs++;
165 
166   pairs[1] = UINT_MAX;
167   for (i = 0; ;i++) {
168     if (i==2) return end;
169     if (isdigit(*end)) {
170       long long ll = estrtol(end, &end, 10);
171 
172       if (ll<1 || ll>UINT_MAX || errno) return end;
173       pairs[i] = ll;
174     }
175     if (*end++ != '-') break;
176   }
177   if (!i) pairs[1] = pairs[0];
178   if ((end-str)<len) return end;
179   if (pairs[0]>pairs[1]) return str;
180 
181   // No error
182   return 0;
183 }
184 
cut_main(void)185 void cut_main(void)
186 {
187   int i;
188   char buf[8];
189 
190   // Parse command line arguments
191   if ((toys.optflags&(FLAG_s|FLAG_f|FLAG_F))==FLAG_s)
192     error_exit("-s needs -Ff");
193   if ((toys.optflags&(FLAG_d|FLAG_f|FLAG_F))==FLAG_d)
194     error_exit("-d needs -Ff");
195   if (!TT.d) TT.d = (toys.optflags&FLAG_F) ? "[[:space:]][[:space:]]*" : "\t";
196   if (toys.optflags&FLAG_F) xregcomp(&TT.reg, TT.d, REG_EXTENDED);
197   if (!TT.O) {
198     if (toys.optflags&FLAG_F) TT.O = " ";
199     else if (toys.optflags&FLAG_f) TT.O = TT.d;
200   }
201 
202   // Parse ranges, which are attached to a selection type (only one can be set)
203   for (i = 0; i<ARRAY_LEN(TT.select); i++) {
204     sprintf(buf, "bad -%c", "CFfcb"[i]); // reverse order from newtoy optstr
205     if (TT.select[i]) comma_args(TT.select[i], 0, buf, get_range);
206   }
207   if (!TT.pairs) error_exit("no selections");
208 
209   // Sort and collate selections
210   if (!(toys.optflags&FLAG_D)) {
211     int from, to;
212     unsigned *pairs = (void *)toybuf;
213 
214     qsort(toybuf, TT.pairs, 8, (void *)compar);
215     for (to = 0, from = 2; from/2 < TT.pairs; from += 2) {
216       if (pairs[from] > pairs[to+1]) {
217         to += 2;
218         memcpy(pairs+to, pairs+from, 2*sizeof(unsigned));
219       } else if (pairs[from+1] > pairs[to+1]) pairs[to+1] = pairs[from+1];
220     }
221     TT.pairs = (to/2)+1;
222   }
223 
224   // For each argument, loop through lines of file and call cut_line() on each
225   loopfiles_lines(toys.optargs, cut_line);
226 }
227