1 /* sed.c - stream editor. Thing that does s/// and other stuff.
2 *
3 * Copyright 2014 Rob Landley <rob@landley.net>
4 *
5 * See http://pubs.opengroup.org/onlinepubs/9699919799/utilities/sed.html
6 *
7 * TODO: lines > 2G could wrap signed int length counters. Not just getline()
8 * but N and s///
9 * TODO: make y// handle unicode, unicode delimiters
10 * TODO: handle error return from emit(), error_msg/exit consistently
11 * What's the right thing to do for -i when write fails? Skip to next?
12 * test '//q' with no previous regex, also repeat previous regex?
13 *
14 * Deviations from POSIX: allow extended regular expressions with -r,
15 * editing in place with -i, separate with -s, NUL-separated input with -z,
16 * printf escapes in text, line continuations, semicolons after all commands,
17 * 2-address anywhere an address is allowed, "T" command, multiline
18 * continuations for [abc], \; to end [abc] argument before end of line.
19
20 USE_SED(NEWTOY(sed, "(help)(version)e*f*i:;nErz(null-data)s[+Er]", TOYFLAG_BIN|TOYFLAG_LOCALE|TOYFLAG_NOHELP))
21
22 config SED
23 bool "sed"
24 default y
25 help
26 usage: sed [-inrszE] [-e SCRIPT]...|SCRIPT [-f SCRIPT_FILE]... [FILE...]
27
28 Stream editor. Apply editing SCRIPTs to lines of input.
29
30 -e Add SCRIPT to list
31 -f Add contents of SCRIPT_FILE to list
32 -i Edit each file in place (-iEXT keeps backup file with extension EXT)
33 -n No default output (use the p command to output matched lines)
34 -r Use extended regular expression syntax
35 -E POSIX alias for -r
36 -s Treat input files separately (implied by -i)
37 -z Use \0 rather than \n as input line separator
38
39 A SCRIPT is one or more COMMANDs separated by newlines or semicolons.
40 All -e SCRIPTs are combined as if separated by newlines, followed by all -f
41 SCRIPT_FILEs. If no -e or -f then first argument is the SCRIPT.
42
43 COMMANDs apply to every line unless prefixed with an ADDRESS of the form:
44
45 [ADDRESS[,ADDRESS]][!]COMMAND
46
47 ADDRESS is a line number (starting at 1), a /REGULAR EXPRESSION/, or $ for
48 last line (-s or -i makes it last line of each file). One address matches one
49 line, ADDRESS,ADDRESS matches from first to second inclusive. Two regexes can
50 match multiple ranges. ADDRESS,+N ends N lines later. ! inverts the match.
51
52 REGULAR EXPRESSIONS start and end with the same character (anything but
53 backslash or newline). To use the delimiter in the regex escape it with a
54 backslash, and printf escapes (\abcefnrtv and octal, hex, and unicode) work.
55 An empty regex repeats the previous one. ADDRESS regexes require any
56 first delimiter except / to be \escaped to distinguish it from COMMANDs.
57
58 Sed reads each line of input, processes it, and writes it out or discards it
59 before reading the next. Sed can remember one additional line in a separate
60 buffer (the h, H, g, G, and x commands), and can read the next line of input
61 early (the n and N commands), but otherwise operates on individual lines.
62
63 Each COMMAND starts with a single character. Commands with no arguments are:
64
65 ! Run this command when the ADDRESS _didn't_ match.
66 { Start new command block, continuing until a corresponding "}".
67 Command blocks nest and can have ADDRESSes applying to the whole block.
68 } End command block (this COMMAND cannot have an address)
69 d Delete this line and move on to the next one
70 (ignores remaining COMMANDs)
71 D Delete one line of input and restart command SCRIPT (same as "d"
72 unless you've glued lines together with "N" or similar)
73 g Get remembered line (overwriting current line)
74 G Get remembered line (appending to current line)
75 h Remember this line (overwriting remembered line)
76 H Remember this line (appending to remembered line, if any)
77 l Print line escaping \abfrtv (but not \n), octal escape other nonprinting
78 chars, wrap lines to terminal width with \, append $ to end of line.
79 n Print default output and read next line over current line (quit at EOF)
80 N Append \n and next line of input to this line. Quit at EOF without
81 default output. Advances line counter for ADDRESS and "=".
82 p Print this line
83 P Print this line up to first newline (from "N")
84 q Quit (print default output, no more commands processed or lines read)
85 x Exchange this line with remembered line (overwrite in both directions)
86 = Print the current line number (plus newline)
87 # Comment, ignores rest of this line of SCRIPT (until newline)
88
89 Commands that take an argument:
90
91 : LABEL Target for jump commands
92 a TEXT Append text to output before reading next line
93 b LABEL Branch, jumps to :LABEL (with no LABEL to end of SCRIPT)
94 c TEXT Delete matching ADDRESS range and output TEXT instead
95 i TEXT Insert text (output immediately)
96 r FILE Append contents of FILE to output before reading next line.
97 s/S/R/F Search for regex S replace match with R using flags F. Delimiter
98 is anything but \n or \, escape with \ to use in S or R. Printf
99 escapes work. Unescaped & in R becomes full matched text, \1
100 through \9 = parenthetical subexpression from S. \ at end of
101 line appends next line of SCRIPT. The flags in F are:
102 [0-9] A number N, substitute only Nth match
103 g Global, substitute all matches
104 i/I Ignore case when matching
105 p Print resulting line when match found and replaced
106 w [file] Write (append) line to file when match replaced
107 t LABEL Test, jump if s/// command matched this line since last test
108 T LABEL Test false, jump to :LABEL only if no s/// found a match
109 w FILE Write (append) line to file
110 y/old/new/ Change each character in 'old' to corresponding character
111 in 'new' (with standard backslash escapes, delimiter can be
112 any repeated character except \ or \n)
113
114 The TEXT arguments (to a c i) may end with an unescaped "\" to append
115 the next line (leading whitespace is not skipped), and treat ";" as a
116 literal character (use "\;" instead).
117 */
118
119 #define FOR_sed
120 #include "toys.h"
121
GLOBALS(
  // Option arguments. i is the -i argument (used as the backup file suffix
  // in do_sed_file()), f and e are the lists of -f and -e arguments walked
  // by sed_main().
  char *i;
  struct arg_list *f, *e;

  // processed pattern list (really a list of struct sedcmd)
  struct double_list *pattern;

  // nextline/nextlen: input line read ahead but not yet processed (sed_line()
  //   works one line behind so it can detect the last line). While the script
  //   is being parsed, nextlen instead counts unclosed '{' blocks.
  // remember/rememberlen: the buffer used by the g G h H x commands. While
  //   the script is being parsed, remember temporarily holds the regex text
  //   of an s/// command.
  char *nextline, *remember;
  // restart: command to resume at on the next sed_line() call, stored +1 so
  //   it is never NULL. lastregex: most recently used regex, for empty regex.
  void *restart, *lastregex;
  // count: current input line number
  long nextlen, rememberlen, count;
  // fdout: file descriptor emit() writes to. noeol: nonzero when the last
  //   emit() did not write a trailing newline.
  int fdout, noeol;
  // xx: wrap width used by the l command (0 until first use)
  unsigned xx;
  // delim: line separator handed to do_lines() ('\n' unless -z)
  char delim;
)
136
// Linked list of parsed sed commands. Offset fields indicate location where
// regex or string starts, ala offset+(char *)struct, because we realloc()
// these to expand them for multiline inputs, and pointers would have to be
// individually adjusted. An offset of 0 means "not present".

struct sedcmd {
  struct sedcmd *next, *prev;

  // Begin and end of each match
  // lmatch: 0 = unused, >0 = line number, -1 = "$" (last line),
  //         -2-N = "+N" (second address only)
  long lmatch[2]; // line number of match
  int rmatch[2];  // offset of regex struct for prefix matches (/abc/,/def/p)
  // w points at 4 bytes of file descriptor, one byte of saved noeol state,
  // then the NUL terminated filename.
  int arg1, arg2, w; // offset of two arguments per command, plus s//w filename
  // not: set by "!" (invert the address match).
  // hit: in sed_line() nonzero while the address range is active (it holds
  //      the line number where the range started). During parsing it marks a
  //      command awaiting a continuation line: the s/// delimiter character,
  //      or 256 for a text argument.
  unsigned not, hit;
  // The s///N occurrence number is stored above the flag bits (sflags>>4).
  unsigned sflags; // s///flag bits: i=1, g=2, p=4, x=8
  char c; // action
};
153
154 // Write out line with potential embedded NUL, handling eol/noeol
emit(char * line,long len,int eol)155 static int emit(char *line, long len, int eol)
156 {
157 int l, old = line[len];
158
159 if (TT.noeol && !writeall(TT.fdout, "\n", 1)) return 1;
160 TT.noeol = !eol;
161 if (eol) line[len++] = '\n';
162 if (!len) return 0;
163 l = writeall(TT.fdout, line, len);
164 if (eol) line[len-1] = old;
165 if (l != len) {
166 if (TT.fdout != 1) perror_msg("short write");
167
168 return 1;
169 }
170
171 return 0;
172 }
173
// Grow the allocation at *old (holding oldlen bytes of data) and append a
// copy of new to it. A negative newlen means "-newlen bytes, with a newline
// inserted between the old data and the new". The result is NUL terminated
// and *old is updated to the (possibly moved) allocation.
// Returns a pointer to the byte just past the NUL terminator.

static char *extend_string(char **old, char *new, int oldlen, int newlen)
{
  int sep = 0;
  char *buf, *end;

  if (newlen < 0) {
    sep = 1;
    newlen = -newlen;
  }

  buf = xrealloc(*old, oldlen+newlen+sep+1);
  *old = buf;

  end = buf+oldlen;
  if (sep) *end++ = '\n';
  memcpy(end, new, newlen);
  end += newlen;
  *end = 0;

  return end+1;
}
189
190 // An empty regex repeats the previous one
get_regex(void * command,int offset)191 static void *get_regex(void *command, int offset)
192 {
193 if (!offset) {
194 if (!TT.lastregex) error_exit("no previous regex");
195 return TT.lastregex;
196 }
197
198 return TT.lastregex = offset+(char *)command;
199 }
200
201 // Apply pattern to line from input file
sed_line(char ** pline,long plen)202 static void sed_line(char **pline, long plen)
203 {
204 struct append {
205 struct append *next, *prev;
206 int file;
207 char *str;
208 } *append = 0;
209 char *line = TT.nextline;
210 long len = TT.nextlen;
211 struct sedcmd *command;
212 int eol = 0, tea = 0;
213
214 // Ignore EOF for all files before last unless -i
215 if (!pline && !FLAG(i) && !FLAG(s)) return;
216
217 // Grab next line for deferred processing (EOF detection: we get a NULL
218 // pline at EOF to flush last line). Note that only end of _last_ input
219 // file matches $ (unless we're doing -i).
220 TT.nextline = 0;
221 TT.nextlen = 0;
222 if (pline) {
223 TT.nextline = *pline;
224 TT.nextlen = plen;
225 *pline = 0;
226 }
227
228 if (!line || !len) return;
229 if (line[len-1] == '\n') line[--len] = eol++;
230 TT.count++;
231
232 // The restart-1 is because we added one to make sure it wasn't NULL,
233 // otherwise N as last command would restart script
234 command = TT.restart ? ((struct sedcmd *)TT.restart)-1 : (void *)TT.pattern;
235 TT.restart = 0;
236
237 while (command) {
238 char *str, c = command->c;
239
240 // Have we got a line or regex matching range for this rule?
241 if (*command->lmatch || *command->rmatch) {
242 int miss = 0;
243 long lm;
244
245 // In a match that might end?
246 if (command->hit) {
247 if (!(lm = command->lmatch[1])) {
248 if (!command->rmatch[1]) command->hit = 0;
249 else {
250 void *rm = get_regex(command, command->rmatch[1]);
251
252 // regex match end includes matching line, so defer deactivation
253 if (line && !regexec0(rm, line, len, 0, 0, 0)) miss = 1;
254 }
255 } else if (lm > 0 && lm < TT.count) command->hit = 0;
256 else if (lm < -1 && TT.count == command->hit+(-lm-1)) command->hit = 0;
257
258 // Start a new match?
259 } else {
260 if (!(lm = *command->lmatch)) {
261 void *rm = get_regex(command, *command->rmatch);
262
263 if (line && !regexec0(rm, line, len, 0, 0, 0))
264 command->hit = TT.count;
265 } else if (lm == TT.count || (lm == -1 && !pline))
266 command->hit = TT.count;
267
268 if (!command->lmatch[1] && !command->rmatch[1]) miss = 1;
269 }
270
271 // Didn't match?
272 lm = !(command->not^!!command->hit);
273
274 // Deferred disable from regex end match
275 if (miss || command->lmatch[1] == TT.count) command->hit = 0;
276
277 if (lm) {
278 // Handle skipping curly bracket command group
279 if (c == '{') {
280 int curly = 1;
281
282 while (curly) {
283 command = command->next;
284 if (command->c == '{') curly++;
285 if (command->c == '}') curly--;
286 }
287 }
288 command = command->next;
289 continue;
290 }
291 }
292
293 // A deleted line can still update line match state for later commands
294 if (!line) {
295 command = command->next;
296 continue;
297 }
298
299 // Process command
300
301 if (c=='a' || c=='r') {
302 struct append *a = xzalloc(sizeof(struct append));
303 if (command->arg1) a->str = command->arg1+(char *)command;
304 a->file = c=='r';
305 dlist_add_nomalloc((void *)&append, (void *)a);
306 } else if (c=='b' || c=='t' || c=='T') {
307 int t = tea;
308
309 if (c != 'b') tea = 0;
310 if (c=='b' || t^(c=='T')) {
311 if (!command->arg1) break;
312 str = command->arg1+(char *)command;
313 for (command = (void *)TT.pattern; command; command = command->next)
314 if (command->c == ':' && !strcmp(command->arg1+(char *)command, str))
315 break;
316 if (!command) error_exit("no :%s", str);
317 }
318 } else if (c=='c') {
319 str = command->arg1+(char *)command;
320 if (!command->hit) emit(str, strlen(str), 1);
321 free(line);
322 line = 0;
323 continue;
324 } else if (c=='d') {
325 free(line);
326 line = 0;
327 continue;
328 } else if (c=='D') {
329 // Delete up to \n or end of buffer
330 str = line;
331 while ((str-line)<len) if (*(str++) == '\n') break;
332 len -= str - line;
333 memmove(line, str, len);
334
335 // if "delete" blanks line, disable further processing
336 // otherwise trim and restart script
337 if (!len) {
338 free(line);
339 line = 0;
340 } else {
341 line[len] = 0;
342 command = (void *)TT.pattern;
343 }
344 continue;
345 } else if (c=='g') {
346 free(line);
347 line = xstrdup(TT.remember);
348 len = TT.rememberlen;
349 } else if (c=='G') {
350 line = xrealloc(line, len+TT.rememberlen+2);
351 line[len++] = '\n';
352 memcpy(line+len, TT.remember, TT.rememberlen);
353 line[len += TT.rememberlen] = 0;
354 } else if (c=='h') {
355 free(TT.remember);
356 TT.remember = xstrdup(line);
357 TT.rememberlen = len;
358 } else if (c=='H') {
359 TT.remember = xrealloc(TT.remember, TT.rememberlen+len+2);
360 TT.remember[TT.rememberlen++] = '\n';
361 memcpy(TT.remember+TT.rememberlen, line, len);
362 TT.remember[TT.rememberlen += len] = 0;
363 } else if (c=='i') {
364 str = command->arg1+(char *)command;
365 emit(str, strlen(str), 1);
366 } else if (c=='l') {
367 int i, x, off;
368
369 if (!TT.xx) {
370 terminal_size(&TT.xx, 0);
371 if (!TT.xx) TT.xx = 80;
372 if (TT.xx > sizeof(toybuf)-10) TT.xx = sizeof(toybuf)-10;
373 if (TT.xx > 4) TT.xx -= 4;
374 }
375
376 for (i = off = 0; i<len; i++) {
377 if (off >= TT.xx) {
378 toybuf[off++] = '\\';
379 emit(toybuf, off, 1);
380 off = 0;
381 }
382 x = stridx("\\\a\b\f\r\t\v", line[i]);
383 if (x != -1) {
384 toybuf[off++] = '\\';
385 toybuf[off++] = "\\abfrtv"[x];
386 } else if (line[i] >= ' ') toybuf[off++] = line[i];
387 else off += sprintf(toybuf+off, "\\%03o", line[i]);
388 }
389 toybuf[off++] = '$';
390 emit(toybuf, off, 1);
391 } else if (c=='n') {
392 TT.restart = command->next+1;
393
394 break;
395 } else if (c=='N') {
396 // Can't just grab next line because we could have multiple N and
397 // we need to actually read ahead to get N;$p EOF detection right.
398 if (pline) {
399 TT.restart = command->next+1;
400 extend_string(&line, TT.nextline, len, -TT.nextlen);
401 free(TT.nextline);
402 TT.nextline = line;
403 TT.nextlen += len + 1;
404 line = 0;
405 }
406
407 // Pending append goes out right after N
408 goto done;
409 } else if (c=='p' || c=='P') {
410 char *l = (c=='P') ? strchr(line, '\n') : 0;
411
412 if (emit(line, l ? l-line : len, eol)) break;
413 } else if (c=='q' || c=='Q') {
414 if (pline) *pline = (void *)1;
415 free(TT.nextline);
416 if (!toys.exitval && command->arg1)
417 toys.exitval = atoi(command->arg1+(char *)command);
418 TT.nextline = 0;
419 TT.nextlen = 0;
420 if (c=='Q') line = 0;
421
422 break;
423 } else if (c=='s') {
424 char *rline = line, *new = command->arg2 + (char *)command, *l2 = 0;
425 regmatch_t *match = (void *)toybuf;
426 regex_t *reg = get_regex(command, command->arg1);
427 int mflags = 0, count = 0, l2used = 0, zmatch = 1, l2l = len, l2old = 0,
428 mlen, off, newlen;
429
430 // Loop finding match in remaining line (up to remaining len)
431 while (!regexec0(reg, rline, len-(rline-line), 10, match, mflags)) {
432 mflags = REG_NOTBOL;
433
434 // Zero length matches don't count immediately after a previous match
435 mlen = match[0].rm_eo-match[0].rm_so;
436 if (!mlen && !zmatch) {
437 if (rline-line == len) break;
438 l2[l2used++] = *rline++;
439 zmatch++;
440 continue;
441 } else zmatch = 0;
442
443 // If we're replacing only a specific match, skip if this isn't it
444 off = command->sflags>>4;
445 if (off && off != ++count) {
446 if (l2) memcpy(l2+l2used, rline, match[0].rm_eo);
447 l2used += match[0].rm_eo;
448 rline += match[0].rm_eo;
449
450 continue;
451 }
452 // The fact getline() can allocate unbounded amounts of memory is
453 // a bigger issue, but while we're here check for integer overflow
454 if (match[0].rm_eo > INT_MAX) perror_exit(0);
455
456 // newlen = strlen(new) but with \1 and & and printf escapes
457 for (off = newlen = 0; new[off]; off++) {
458 int cc = -1;
459
460 if (new[off] == '&') cc = 0;
461 else if (new[off] == '\\') cc = new[++off] - '0';
462 if (cc < 0 || cc > 9) {
463 newlen++;
464 continue;
465 }
466 newlen += match[cc].rm_eo-match[cc].rm_so;
467 }
468
469 // Copy changed data to new string
470
471 // Adjust allocation size of new string, copy data we know we'll keep
472 l2l += newlen-mlen;
473 if ((l2l|0xfff) > l2old) l2 = xrealloc(l2, l2old = (l2l|0xfff)+1);
474 if (match[0].rm_so) {
475 memcpy(l2+l2used, rline, match[0].rm_so);
476 l2used += match[0].rm_so;
477 }
478
479 // copy in new replacement text
480 for (off = mlen = 0; new[off]; off++) {
481 int cc = 0, ll;
482
483 if (new[off] == '\\') {
484 cc = new[++off] - '0';
485 if (cc<0 || cc>9) {
486 if (!(l2[l2used+mlen++] = unescape(new[off])))
487 l2[l2used+mlen-1] = new[off];
488
489 continue;
490 } else if (cc > reg->re_nsub) error_exit("no s//\\%d/", cc);
491 } else if (new[off] != '&') {
492 l2[l2used+mlen++] = new[off];
493
494 continue;
495 }
496
497 if (match[cc].rm_so != -1) {
498 ll = match[cc].rm_eo-match[cc].rm_so;
499 memcpy(l2+l2used+mlen, rline+match[cc].rm_so, ll);
500 mlen += ll;
501 }
502 }
503 l2used += newlen;
504 rline += match[0].rm_eo;
505
506 // Stop after first substitution unless we have flag g
507 if (!(command->sflags & 2)) break;
508 }
509
510 // If we made any changes, finish off l2 and swap it for line
511 if (l2) {
512 // grab trailing unmatched data and null terminator, swap with original
513 mlen = len-(rline-line);
514 memcpy(l2+l2used, rline, mlen+1);
515 len = l2used + mlen;
516 free(line);
517 line = l2;
518 }
519
520 if (mflags) {
521 // flag p
522 if (command->sflags & 4) emit(line, len, eol);
523
524 tea = 1;
525 if (command->w) goto writenow;
526 }
527 } else if (c=='w') {
528 int fd, noeol;
529 char *name;
530
531 writenow:
532 // Swap out emit() context
533 fd = TT.fdout;
534 noeol = TT.noeol;
535
536 // We save filehandle and newline status before filename
537 name = command->w + (char *)command;
538 memcpy(&TT.fdout, name, 4);
539 name += 4;
540 TT.noeol = *(name++);
541
542 // write, then save/restore context
543 if (emit(line, len, eol))
544 perror_exit("w '%s'", command->arg1+(char *)command);
545 *(--name) = TT.noeol;
546 TT.noeol = noeol;
547 TT.fdout = fd;
548 } else if (c=='x') {
549 long swap = TT.rememberlen;
550
551 str = TT.remember;
552 TT.remember = line;
553 line = str;
554 TT.rememberlen = len;
555 len = swap;
556 } else if (c=='y') {
557 char *from, *to = (char *)command;
558 int i, j;
559
560 from = to+command->arg1;
561 to += command->arg2;
562
563 for (i = 0; i < len; i++) {
564 j = stridx(from, line[i]);
565 if (j != -1) line[i] = to[j];
566 }
567 } else if (c=='=') {
568 sprintf(toybuf, "%ld", TT.count);
569 if (emit(toybuf, strlen(toybuf), 1)) break;
570 }
571
572 command = command->next;
573 }
574
575 if (line && !FLAG(n)) emit(line, len, eol);
576
577 done:
578 if (dlist_terminate(append)) while (append) {
579 struct append *a = append->next;
580
581 if (append->file) {
582 int fd = open(append->str, O_RDONLY);
583
584 // Force newline if noeol pending
585 if (fd != -1) {
586 if (TT.noeol) xwrite(TT.fdout, "\n", 1);
587 TT.noeol = 0;
588 xsendfile(fd, TT.fdout);
589 close(fd);
590 }
591 } else if (append->str) emit(append->str, strlen(append->str), 1);
592 else emit(line, 0, 0);
593 free(append);
594 append = a;
595 }
596 free(line);
597 }
598
599 // Callback called on each input file
do_sed_file(int fd,char * name)600 static void do_sed_file(int fd, char *name)
601 {
602 char *tmp, *s;
603
604 if (FLAG(i)) {
605 if (!fd) return error_msg("-i on stdin");
606 TT.fdout = copy_tempfile(fd, name, &tmp);
607 }
608 if (FLAG(i) || FLAG(s)) {
609 struct sedcmd *command;
610
611 TT.count = 0;
612 for (command = (void *)TT.pattern; command; command = command->next)
613 command->hit = 0;
614 }
615 do_lines(fd, TT.delim, sed_line);
616 if (FLAG(i)) {
617 if (TT.i && *TT.i) {
618 xrename(name, s = xmprintf("%s%s", name, TT.i));
619 free(s);
620 }
621 replace_tempfile(-1, TT.fdout, &tmp);
622 TT.fdout = 1;
623 }
624 if (FLAG(i) || FLAG(s)) {
625 TT.nextline = 0;
626 TT.nextlen = TT.noeol = 0;
627 }
628 }
629
// Copy chunk of string between two delimiters, converting printf escapes.
// returns processed copy of string (0 if error), *pstr advances to next
// unused char. if delim (or *delim) is 0 uses/saves starting char as delimiter
// (a leading backslash before the starting delimiter is skipped).
// Since the result is used as a regex, the delimiter is ignored inside
// [ranges]. Caller frees the returned string.
static char *unescape_delimited_string(char **pstr, char *delim)
{
  char *to, *from, *out, mode = 0, d;

  // Grab leading delimiter (if necessary), allocate space for new string
  from = *pstr;
  if (!delim || !*delim) {
    if (!(d = *(from++))) return 0;
    if (d == '\\') d = *(from++);
    if (!d || d == '\\') return 0;
    if (delim) *delim = d;
  } else d = *delim;
  // Output never grows: each input char yields at most one output char
  to = out = xmalloc(strlen(*pstr)+1);

  // mode is 0 outside a bracket expression, ']' inside one, or one of
  // . = : while inside a [. .] [= =] [: :] element of a bracket expression.
  while (mode || *from != d) {
    // Ran off the end without finding the closing delimiter
    if (!*from) goto fail;

    // delimiter in regex character range doesn't count
    if (*from == '[') {
      if (!mode) {
        mode = ']';
        if (from[1]=='-' || from[1]==']') *(to++) = *(from++);
      } else if (mode == ']' && strchr(".=:", from[1])) {
        *(to++) = *(from++);
        mode = *from;
      }
    } else if (*from == mode) {
      if (mode == ']') mode = 0;
      else {
        *(to++) = *(from++);
        mode = ']';
      }
    // Length 1 range (X-X with same X) is "undefined" and makes regcomp err,
    // but the perl build does it, so we need to filter it out.
    } else if (mode && *from == '-' && from[-1] == from[1]) {
      from+=2;
      continue;
    } else if (*from == '\\') {
      if (!from[1]) goto fail;

      // Check escaped end delimiter before printf style escapes.
      if (from[1] == d) from++;
      else if (from[1]=='\\') *(to++) = *(from++);
      else {
        char c = unescape(from[1]);

        if (c) {
          *(to++) = c;
          from+=2;
          continue;
        } else if (!mode) *(to++) = *(from++);
      }
    }
    *(to++) = *(from++);
  }
  *to = 0;
  *pstr = from+1;

  return out;

fail:
  // Don't leak the partially filled copy on a parse error
  free(out);

  return 0;
}
694
695 // Translate pattern strings into command structures. Each command structure
696 // is a single allocation (which requires some math and remalloc at times).
parse_pattern(char ** pline,long len)697 static void parse_pattern(char **pline, long len)
698 {
699 struct sedcmd *command = (void *)TT.pattern;
700 char *line, *reg, c, *errstart;
701 int i;
702
703 line = errstart = pline ? *pline : "";
704 if (len && line[len-1]=='\n') line[--len] = 0;
705
706 // Append this line to previous multiline command? (hit indicates type.)
707 // During parsing "hit" stores data about line continuations, but in
708 // sed_line() it means the match range attached to this command
709 // is active, so processing the continuation must zero it again.
710 if (command && command->prev->hit) {
711 // Remove half-finished entry from list so remalloc() doesn't confuse it
712 TT.pattern = TT.pattern->prev;
713 command = dlist_pop(&TT.pattern);
714 c = command->c;
715 reg = (char *)command;
716 reg += command->arg1 + strlen(reg + command->arg1);
717
718 // Resume parsing for 'a' or 's' command. (Only two that can do this.)
719 // TODO: using 256 to indicate 'a' means our s/// delimiter can't be
720 // a unicode character.
721 if (command->hit < 256) goto resume_s;
722 else goto resume_a;
723 }
724
725 // Loop through commands in this line.
726
727 command = 0;
728 for (;;) {
729 if (command) dlist_add_nomalloc(&TT.pattern, (void *)command);
730
731 // If there's no more data on this line, return.
732 for (;;) {
733 while (isspace(*line) || *line == ';') line++;
734 if (*line == '#') while (*line && *line != '\n') line++;
735 else break;
736 }
737 if (!*line) return;
738
739 // Start by writing data into toybuf.
740
741 errstart = line;
742 memset(toybuf, 0, sizeof(struct sedcmd));
743 command = (void *)toybuf;
744 reg = toybuf + sizeof(struct sedcmd);
745
746 // Parse address range (if any)
747 for (i = 0; i < 2; i++) {
748 if (*line == ',') line++;
749 else if (i) break;
750
751 if (i && *line == '+' && isdigit(line[1])) {
752 line++;
753 command->lmatch[i] = -2-strtol(line, &line, 0);
754 } else if (isdigit(*line)) command->lmatch[i] = strtol(line, &line, 0);
755 else if (*line == '$') {
756 command->lmatch[i] = -1;
757 line++;
758 } else if (*line == '/' || *line == '\\') {
759 char *s = line;
760
761 if (!(s = unescape_delimited_string(&line, 0))) goto error;
762 if (!*s) command->rmatch[i] = 0;
763 else {
764 xregcomp((void *)reg, s, REG_EXTENDED*!!FLAG(r));
765 command->rmatch[i] = reg-toybuf;
766 reg += sizeof(regex_t);
767 }
768 free(s);
769 } else break;
770 }
771
772 while (isspace(*line)) line++;
773 if (!*line) break;
774
775 if (*line == '!') {
776 command->not = 1;
777 line++;
778 }
779 while (isspace(*line)) line++;
780 if (!*line) break;
781
782 c = command->c = *(line++);
783 if (strchr("}:", c) && i) break;
784 if (strchr("aiqQr=", c) && i>1) break;
785
786 // Allocate memory and copy out of toybuf now that we know how big it is
787 command = xmemdup(toybuf, reg-toybuf);
788 reg = (reg-toybuf) + (char *)command;
789
790 // Parse arguments by command type
791 if (c == '{') TT.nextlen++;
792 else if (c == '}') {
793 if (!TT.nextlen--) break;
794 } else if (c == 's') {
795 char *end, delim = 0;
796 int flags;
797
798 // s/pattern/replacement/flags
799
800 // line continuations use arg1 (back at the start of the function),
801 // so let's fill out arg2 first (since the regex part can't be multiple
802 // lines) and swap them back later.
803
804 // get pattern (just record, we parse it later)
805 command->arg2 = reg - (char *)command;
806 if (!(TT.remember = unescape_delimited_string(&line, &delim)))
807 goto error;
808
809 reg += sizeof(regex_t);
810 command->arg1 = reg-(char *)command;
811 command->hit = delim;
812 resume_s:
813 // get replacement - don't replace escapes yet because \1 and \& need
814 // processing later, after we replace \\ with \ we can't tell \\1 from \1
815 end = line;
816 while (*end != command->hit) {
817 if (!*end) goto error;
818 if (*end++ == '\\') {
819 if (!*end || *end == '\n') {
820 end[-1] = '\n';
821 break;
822 }
823 end++;
824 }
825 }
826
827 reg = extend_string((void *)&command, line, reg-(char *)command,end-line);
828 line = end;
829 // line continuation? (note: '\n' can't be a valid delim).
830 if (*line == command->hit) command->hit = 0;
831 else {
832 if (!*line) continue;
833 reg--;
834 line++;
835 goto resume_s;
836 }
837
838 // swap arg1/arg2 so they're back in order arguments occur.
839 i = command->arg1;
840 command->arg1 = command->arg2;
841 command->arg2 = i;
842
843 // get flags
844 for (line++; *line; line++) {
845 long l;
846
847 if (isspace(*line) && *line != '\n') continue;
848
849 if (0 <= (l = stridx("igpx", *line))) command->sflags |= 1<<l;
850 else if (*line == 'I') command->sflags |= 1<<0;
851 else if (!(command->sflags>>4) && 0<(l = strtol(line, &line, 10))) {
852 command->sflags |= l << 4;
853 line--;
854 } else break;
855 }
856 flags = (FLAG(r) || (command->sflags&8)) ? REG_EXTENDED : 0;
857 if (command->sflags&1) flags |= REG_ICASE;
858
859 // We deferred actually parsing the regex until we had the s///i flag
860 // allocating the space was done by extend_string() above
861 if (!*TT.remember) command->arg1 = 0;
862 else xregcomp((void *)(command->arg1+(char *)command),TT.remember,flags);
863 free(TT.remember);
864 TT.remember = 0;
865 if (*line == 'w') {
866 line++;
867 goto writenow;
868 }
869 } else if (c == 'w') {
870 int fd, delim;
871 char *cc;
872
873 // Since s/// uses arg1 and arg2, and w needs a persistent filehandle and
874 // eol status, and to retain the filename for error messages, we'd need
875 // to go up to arg5 just for this. Compromise: dynamically allocate the
876 // filehandle and eol status.
877
878 writenow:
879 while (isspace(*line)) line++;
880 if (!*line) goto error;
881 for (cc = line; *cc; cc++) if (*cc == '\\' && cc[1] == ';') break;
882 delim = *cc;
883 *cc = 0;
884 fd = xcreate(line, O_WRONLY|O_CREAT|O_TRUNC, 0644);
885 *cc = delim;
886
887 command->w = reg - (char *)command;
888 command = xrealloc(command, command->w+(cc-line)+6);
889 reg = command->w + (char *)command;
890
891 memcpy(reg, &fd, 4);
892 reg += 4;
893 *(reg++) = 0;
894 memcpy(reg, line, delim);
895 reg += delim;
896 *(reg++) = 0;
897
898 line = cc;
899 if (delim) line += 2;
900 } else if (c == 'y') {
901 char *s, delim = 0;
902 int len;
903
904 if (!(s = unescape_delimited_string(&line, &delim))) goto error;
905 command->arg1 = reg-(char *)command;
906 len = strlen(s);
907 reg = extend_string((void *)&command, s, reg-(char *)command, len);
908 free(s);
909 command->arg2 = reg-(char *)command;
910 if (!(s = unescape_delimited_string(&line, &delim))) goto error;
911 if (len != strlen(s)) goto error;
912 reg = extend_string((void *)&command, s, reg-(char*)command, len);
913 free(s);
914 } else if (strchr("abcirtTqQw:", c)) {
915 int end;
916
917 // trim leading spaces
918 while (isspace(*line) && *line != '\n') line++;
919
920 // Resume logic differs from 's' case because we don't add a newline
921 // unless it's after something, so we add it on return instead.
922 resume_a:
923 command->hit = 0;
924
925 // btTqQ: end with space or semicolon, aicrw continue to newline.
926 if (!(end = strcspn(line, strchr(":btTqQ", c) ? "}; \t\r\n\v\f" : "\n"))){
927 // Argument's optional for btTqQ
928 if (strchr("btTqQ", c)) continue;
929 else if (!command->arg1) break;
930 }
931 // Error checking: qQ can only have digits after them
932 if (c=='q' || c=='Q') {
933 for (i = 0; i<end && isdigit(line[i]); i++);
934 if (i != end) {
935 line += i;
936 break;
937 }
938 }
939
940 // Extend allocation to include new string. We use offsets instead of
941 // pointers so realloc() moving stuff doesn't break things. Ok to write
942 // \n over NUL terminator because call to extend_string() adds it back.
943 if (!command->arg1) command->arg1 = reg - (char*)command;
944 else if (*(command->arg1+(char *)command)) *(reg++) = '\n';
945 else if (!pline) {
946 command->arg1 = 0;
947 continue;
948 }
949 reg = extend_string((void *)&command, line, reg - (char *)command, end);
950
951 // Recopy data to remove escape sequences and handle line continuation.
952 if (strchr("aci", c)) {
953 reg -= end+1;
954 for (i = end; i; i--) {
955 if ((*reg++ = *line++)=='\\') {
956
957 // escape at end of line: resume if -e escaped literal newline,
958 // else request callback and resume with next line
959 if (!--i) {
960 *--reg = 0;
961 if (*line) {
962 line++;
963 goto resume_a;
964 }
965 command->hit = 256;
966 break;
967 }
968 if (!(reg[-1] = unescape(*line))) reg[-1] = *line;
969 line++;
970 }
971 }
972 *reg = 0;
973 } else line += end;
974
975 // Commands that take no arguments
976 } else if (!strchr("{dDgGhHlnNpPx=", c)) break;
977 }
978
979 error:
980 error_exit("bad pattern '%s'@%ld (%c)", errstart, line-errstart+1L, *line);
981 }
982
sed_main(void)983 void sed_main(void)
984 {
985 struct arg_list *al;
986 char **args = toys.optargs;
987
988 if (!FLAG(z)) TT.delim = '\n';
989
990 // Lie to autoconf when it asks stupid questions, so configure regexes
991 // that look for "GNU sed version %f" greater than some old buggy number
992 // don't fail us for not matching their narrow expectations.
993 if (FLAG(version)) {
994 xprintf("This is not GNU sed version 9.0\n");
995 return;
996 }
997
998 // Handling our own --version means we handle our own --help too.
999 if (FLAG(help)) help_exit(0);
1000
1001 // Parse pattern into commands.
1002
1003 // If no -e or -f, first argument is the pattern.
1004 if (!TT.e && !TT.f) {
1005 if (!*toys.optargs) error_exit("no pattern");
1006 (TT.e = xzalloc(sizeof(struct arg_list)))->arg = *(args++);
1007 }
1008
1009 // Option parsing infrastructure can't interlace "-e blah -f blah -e blah"
1010 // so handle all -e, then all -f. (At least the behavior's consistent.)
1011
1012 for (al = TT.e; al; al = al->next) parse_pattern(&al->arg, strlen(al->arg));
1013 parse_pattern(0, 0);
1014 for (al = TT.f; al; al = al->next)
1015 do_lines(xopenro(al->arg), TT.delim, parse_pattern);
1016 dlist_terminate(TT.pattern);
1017 if (TT.nextlen) error_exit("no }");
1018
1019 TT.fdout = 1;
1020 TT.remember = xstrdup("");
1021
1022 // Inflict pattern upon input files. Long version because !O_CLOEXEC
1023 loopfiles_rw(args, O_RDONLY|WARN_ONLY, 0, do_sed_file);
1024
1025 // Provide EOF flush at end of cumulative input for non-i mode.
1026 if (!FLAG(i) && !FLAG(s)) {
1027 toys.optflags |= FLAG_s;
1028 sed_line(0, 0);
1029 }
1030
1031 // todo: need to close fd when done for TOYBOX_FREE?
1032 }
1033