1 /* cut.c - print selected ranges from a file
2 *
3 * Copyright 2016 Rob Landley <rob@landley.net>
4 *
5 * http://pubs.opengroup.org/onlinepubs/9699919799/utilities/cut.html
6 *
7 * Deviations from posix: added -DF. We can only accept 512 selections, and
8 * "-" counts as start to end. Using spaces to separate a comma-separated list
9 * is silly and inconsistent with dd, ps, cp, and mount.
10 *
11 * todo: -n, -s with -c
12
13 USE_CUT(NEWTOY(cut, "b*|c*|f*|F*|C*|O(output-delimiter):d:sDn[!cbf]", TOYFLAG_USR|TOYFLAG_BIN))
14
15 config CUT
16 bool "cut"
17 default y
18 help
    usage: cut [-Ds] [-bcCfF LIST] [-dO DELIM] [FILE...]
20
21 Print selected parts of lines from each FILE to standard output.
22
23 Each selection LIST is comma separated, either numbers (counting from 1)
24 or dash separated ranges (inclusive, with X- meaning to end of line and -X
25 from start). By default selection ranges are sorted and collated, use -D
26 to prevent that.
27
28 -b Select bytes
29 -c Select UTF-8 characters
30 -C Select unicode columns
31 -d Use DELIM (default is TAB for -f, run of whitespace for -F)
32 -D Don't sort/collate selections or match -fF lines without delimiter
33 -f Select fields (words) separated by single DELIM character
34 -F Select fields separated by DELIM regex
35 -O Output delimiter (default one space for -F, input delim for -f)
36 -s Skip lines without delimiters
37 */
38 #define FOR_cut
39 #include "toys.h"
40
// Per-command state; cut_main() reads these as TT.d, TT.O, TT.select,
// TT.pairs and TT.reg.
GLOBALS(
  char *d, *O;                // input delimiter (-d) and output delimiter (-O)
  struct arg_list *select[5]; // we treat them the same, so loop through
                              // (one LIST chain per selection option; see
                              // the "CFfcb" indexing in cut_main())

  int pairs;                  // number of (start, end) pairs held in toybuf
  regex_t reg;                // delimiter regex compiled from TT.d for -F
)
48
// Measure how many leading bytes of a string fit within a given number of
// display columns.
//
// start:   string to measure (scanning stops at its terminator)
// columns: display width available
//
// Returns the byte offset just past the last character that fits entirely.
// Bytes that do not decode are stepped over one at a time, characters that
// wcwidth() reports as zero or negative width add no columns, and a
// character too wide for the remaining space is excluded (rounding down).
int unicolumns(char *start, unsigned columns)
{
  char *pos = start, *fit = start;
  unsigned used = 0;
  wchar_t wc;
  int bytes, width;

  while (used < columns) {
    bytes = utf8towc(&wc, pos, 4);
    if (!bytes) break;

    // Undecodable byte: skip it without consuming any columns
    if (bytes < 0) {
      pos++;
      continue;
    }
    pos += bytes;

    // Only characters with positive width move the "fits" marker
    width = wcwidth(wc);
    if (width <= 0) continue;
    used += width;
    if (used > columns) break;
    fit = pos;
  }

  return fit-start;
}
71
// Apply selections to an input line, producing output
//
// Passed to loopfiles_lines() by cut_main() as the per-line callback.
// pline: pointer to the line buffer; a NULL pline is ignored.
// len:   length of the line in bytes; a trailing newline is stripped here.
//
// For each (start, end) pair stored in toybuf the selected part of the line
// is written to stdout, preceded by TT.O (when set) for every selection
// except the first. A newline is written after the last selection.
static void cut_line(char **pline, long len)
{
  unsigned *pairs = (void *)toybuf;
  char *line;
  int i, j;

  if (!pline) return;
  line = *pline;
  // Drop the trailing newline so the line ends at line[len]
  if (len && line[len-1]=='\n') line[--len] = 0;

  // Loop through selections
  for (i=0; i<TT.pairs; i++) {
    unsigned start = pairs[2*i], end = pairs[(2*i)+1], count;
    char *s = line, *ss;

    // input: start/end position, count=difference between them
    // output: s = start of string, count = bytes to output

    // Stored positions count from 1; make start zero based, skip selections
    // starting past the byte length of the line, and clamp end to it.
    if (start) start--;
    if (start>=len) continue;
    if (!end || end>len) end = len;
    count = end-start;

    // Find start and end of output string for the relevant selection type
    if (toys.optflags&FLAG_b) s += start;
    else if (toys.optflags&FLAG_C) {
      // crunch_str() currently assumes that combining characters get
      // escaped, to provide an unambiguous visual representation.
      // This assumes the input string is null terminated.
      //if (start) crunch_str(&s, start, 0, 0, 0);
      //if (!*s) continue;
      //start = s-line;
      //ss = s;
      //crunch_str(&ss, count, 0, 0, 0);
      //count = ss-s;

      // Convert the column offsets into byte offsets
      s += unicolumns(s, start);
      count = unicolumns(s, end-start);
    } else if (toys.optflags&FLAG_c) {
      wchar_t wc;
      char *sss;

      // Find start
      // Each decoded character (utf8towc() result >= 0) uses up one unit of
      // start; anything else advances a single byte without counting.
      ss = line+len;
      while (start && s<ss) {
        if (0<=(j = utf8towc(&wc, s, len))) start--;
        s += (j<1) ? 1 : j;
      }
      if (s == ss) continue;

      // Find end
      // Same walk again from s, this time counting off "count" characters
      end = count;
      sss = s;
      while (end && sss<ss) {
        if (0<=(j = utf8towc(&wc, sss, len))) end--;
        sss += (j<1) ? 1 : j;
      }
      count = sss-s;
    } else {
      regmatch_t match;

      // Loop through skipping appropriate number of fields
      // Pass j=0 skips "start" delimiters to find the first selected field
      // (s); pass j=1 skips "count" delimiters to find where output ends (ss).
      for (j = 0; j<2; j++) {
        ss = s;
        if (j) start = count;
        // Remember the initial skip count so "no delimiter consumed" can be
        // detected after the loop by comparing end with start.
        else end = start;
        while (*ss && start) {
          if (toys.optflags&FLAG_f) {
            // Any character in TT.d is a delimiter
            if (!strchr(TT.d, *ss++)) continue;
            // On the final delimiter of pass 1, back up so that the
            // delimiter itself is not part of the output
            if (!--start && j) ss--;
          } else {
            // No further match: jump to end of line, which ends the loop
            if (regexec(&TT.reg, ss, 1, &match, REG_NOTBOL|REG_NOTEOL)) {
              ss = line+len;
              continue;
            }
            if (!match.rm_eo) break; // zero length match == no delimiter
            // Advance past the delimiter, or stop in front of it when it is
            // the final delimiter of pass 1
            ss += (!--start && j) ? match.rm_so : match.rm_eo;
          }
        }
        // Pass 0 ran off the end of the line: leave with j still 0
        if (!j && !*(s = ss)) break;
      }

      // If we never encountered even one separator, print whole line (posix!)
      // NOTE(review): this test requires pass 0 to have ended at end of line
      // with j==0. When the selection starts at field 1 pass 0 skips nothing
      // and j reaches 2, so -s and -D look like they are not applied to
      // delimiter-free lines in that case -- confirm.
      if (!j && end == start) {
        if (toys.optflags&FLAG_D) break;
        if (toys.optflags&FLAG_s) return;
        fwrite(line, len, 1, stdout);
        break;
      // Selection starts at end of line: nothing to output
      } else if (!*s) continue;
      count = ss-s;
    }
    // Output delimiter goes before every selection except index 0
    if (i && TT.O) fputs(TT.O, stdout);
    fwrite(s, count, 1, stdout);
  }
  xputc('\n');
}
169
// qsort() ordering for (start, end) pairs: compare the start values first,
// and fall back to the end values when the starts are equal.
// Returns -1, 0 or 1.
static int compar(unsigned *a, unsigned *b)
{
  int k;

  for (k = 0; k<2; k++)
    if (a[k] != b[k]) return (a[k]<b[k]) ? -1 : 1;

  return 0;
}
179
180 // parse A or A-B or A- or -B
get_range(void * data,char * str,int len)181 static char *get_range(void *data, char *str, int len)
182 {
183 char *end = str;
184 unsigned *pairs = (void *)toybuf, i;
185
186 // Using toybuf[] to store ranges means we can have 512 selections max.
187 if (TT.pairs == sizeof(toybuf)/sizeof(int)) perror_exit("select limit");
188 pairs += 2*TT.pairs++;
189
190 pairs[1] = UINT_MAX;
191 for (i = 0; ;i++) {
192 if (i==2) return end;
193 if (isdigit(*end)) {
194 long long ll = estrtol(end, &end, 10);
195
196 if (ll<1 || ll>UINT_MAX || errno) return end;
197 pairs[i] = ll;
198 }
199 if (*end++ != '-') break;
200 }
201 if (!i) pairs[1] = pairs[0];
202 if ((end-str)<len) return end;
203 if (pairs[0]>pairs[1]) return str;
204
205 // No error
206 return 0;
207 }
208
cut_main(void)209 void cut_main(void)
210 {
211 int i;
212 char buf[8];
213
214 // Parse command line arguments
215 if ((toys.optflags&(FLAG_s|FLAG_f|FLAG_F))==FLAG_s)
216 error_exit("-s needs -Ff");
217 if ((toys.optflags&(FLAG_d|FLAG_f|FLAG_F))==FLAG_d)
218 error_exit("-d needs -Ff");
219 if (!TT.d) TT.d = (toys.optflags&FLAG_F) ? "[[:space:]][[:space:]]*" : "\t";
220 if (toys.optflags&FLAG_F) xregcomp(&TT.reg, TT.d, REG_EXTENDED);
221 if (!TT.O) {
222 if (toys.optflags&FLAG_F) TT.O = " ";
223 else if (toys.optflags&FLAG_f) TT.O = TT.d;
224 }
225
226 // Parse ranges, which are attached to a selection type (only one can be set)
227 for (i = 0; i<ARRAY_LEN(TT.select); i++) {
228 sprintf(buf, "bad -%c", "CFfcb"[i]); // reverse order from newtoy optstr
229 if (TT.select[i]) comma_args(TT.select[i], 0, buf, get_range);
230 }
231 if (!TT.pairs) error_exit("no selections");
232
233 // Sort and collate selections
234 if (!(toys.optflags&FLAG_D)) {
235 int from, to;
236 unsigned *pairs = (void *)toybuf;
237
238 qsort(toybuf, TT.pairs, 8, (void *)compar);
239 for (to = 0, from = 2; from/2 < TT.pairs; from += 2) {
240 if (pairs[from] > pairs[to+1]) {
241 to += 2;
242 memcpy(pairs+to, pairs+from, 2*sizeof(unsigned));
243 } else if (pairs[from+1] > pairs[to+1]) pairs[to+1] = pairs[from+1];
244 }
245 TT.pairs = (to/2)+1;
246 }
247
248 // For each argument, loop through lines of file and call cut_line() on each
249 loopfiles_lines(toys.optargs, cut_line);
250 }
251