1 /* sed.c - stream editor. Thing that does s/// and other stuff.
2  *
3  * Copyright 2014 Rob Landley <rob@landley.net>
4  *
5  * See http://pubs.opengroup.org/onlinepubs/9699919799/utilities/sed.html
6  *
7  * TODO: lines > 2G could wrap signed int length counters. Not just getline()
8  * but N and s///
9 
10 USE_SED(NEWTOY(sed, "(version)e*f*inEr[+Er]", TOYFLAG_USR|TOYFLAG_BIN|TOYFLAG_LOCALE))
11 
12 config SED
13   bool "sed"
14   default y
15   help
16     usage: sed [-inrE] [-e SCRIPT]...|SCRIPT [-f SCRIPT_FILE]... [FILE...]
17 
18     Stream editor. Apply one or more editing SCRIPTs to each line of input
19     (from FILE or stdin) producing output (by default to stdout).
20 
21     -e	add SCRIPT to list
22     -f	add contents of SCRIPT_FILE to list
23     -i	Edit each file in place.
24     -n	No default output. (Use the p command to output matched lines.)
25     -r	Use extended regular expression syntax.
26     -E	Alias for -r.
27     -s	Treat input files separately (implied by -i)
28 
29     A SCRIPT is a series of one or more COMMANDs separated by newlines or
30     semicolons. All -e SCRIPTs are concatenated together as if separated
31     by newlines, followed by all lines from -f SCRIPT_FILEs, in order.
32     If no -e or -f SCRIPTs are specified, the first argument is the SCRIPT.
33 
34     Each COMMAND may be preceded by an address which limits the command to
35     apply only to the specified line(s). Commands without an address apply to
36     every line. Addresses are of the form:
37 
38       [ADDRESS[,ADDRESS]]COMMAND
39 
40     The ADDRESS may be a decimal line number (starting at 1), a /regular
41     expression/ within a pair of forward slashes, or the character "$" which
42     matches the last line of input. (In -s or -i mode this matches the last
43     line of each file, otherwise just the last line of the last file.) A single
44     address matches one line, a pair of comma separated addresses match
45     everything from the first address to the second address (inclusive). If
46     both addresses are regular expressions, more than one range of lines in
47     each file can match.
48 
49     REGULAR EXPRESSIONS in sed are started and ended by the same character
50     (traditionally / but anything except a backslash or a newline works).
51     Backslashes may be used to escape the delimiter if it occurs in the
52     regex, and for the usual printf escapes (\abcefnrtv and octal, hex,
53     and unicode). An empty regex repeats the previous one. ADDRESS regexes
    (above) require the first delimiter to be escaped with a backslash when
55     it isn't a forward slash (to distinguish it from the COMMANDs below).
56 
57     Sed mostly operates on individual lines one at a time. It reads each line,
58     processes it, and either writes it to the output or discards it before
59     reading the next line. Sed can remember one additional line in a separate
60     buffer (using the h, H, g, G, and x commands), and can read the next line
61     of input early (using the n and N command), but other than that command
62     scripts operate on individual lines of text.
63 
64     Each COMMAND starts with a single character. The following commands take
65     no arguments:
66 
67       {  Start a new command block, continuing until a corresponding "}".
68          Command blocks may nest. If the block has an address, commands within
69          the block are only run for lines within the block's address range.
70 
71       }  End command block (this command cannot have an address)
72 
73       d  Delete this line and move on to the next one
74          (ignores remaining COMMANDs)
75 
76       D  Delete one line of input and restart command SCRIPT (same as "d"
77          unless you've glued lines together with "N" or similar)
78 
79       g  Get remembered line (overwriting current line)
80 
81       G  Get remembered line (appending to current line)
82 
83       h  Remember this line (overwriting remembered line)
84 
85       H  Remember this line (appending to remembered line, if any)
86 
87       l  Print line, escaping \abfrtv (but not newline), octal escaping other
88          nonprintable characters, wrapping lines to terminal width with a
89          backslash, and appending $ to actual end of line.
90 
91       n  Print default output and read next line, replacing current line
92          (If no next line available, quit processing script)
93 
94       N  Append next line of input to this line, separated by a newline
95          (This advances the line counter for address matching and "=", if no
96          next line available quit processing script without default output)
97 
98       p  Print this line
99 
100       P  Print this line up to first newline (from "N")
101 
102       q  Quit (print default output, no more commands processed or lines read)
103 
104       x  Exchange this line with remembered line (overwrite in both directions)
105 
106       =  Print the current line number (followed by a newline)
107 
108     The following commands (may) take an argument. The "text" arguments (to
    the "a", "c", and "i" commands) may end with an unescaped "\" to append
110     the next line (for which leading whitespace is not skipped), and also
111     treat ";" as a literal character (use "\;" instead).
112 
113       a [text]   Append text to output before attempting to read next line
114 
115       b [label]  Branch, jumps to :label (or with no label, to end of SCRIPT)
116 
117       c [text]   Delete line, output text at end of matching address range
118                  (ignores remaining COMMANDs)
119 
120       i [text]   Print text
121 
122       r [file]   Append contents of file to output before attempting to read
123                  next line.
124 
125       s/S/R/F    Search for regex S, replace matched text with R using flags F.
126                  The first character after the "s" (anything but newline or
127                  backslash) is the delimiter, escape with \ to use normally.
128 
129                  The replacement text may contain "&" to substitute the matched
130                  text (escape it with backslash for a literal &), or \1 through
131                  \9 to substitute a parenthetical subexpression in the regex.
132                  You can also use the normal backslash escapes such as \n and
133                  a backslash at the end of the line appends the next line.
134 
135                  The flags are:
136 
137                  [0-9]    A number, substitute only that occurrence of pattern
138                  g        Global, substitute all occurrences of pattern
139                  i        Ignore case when matching
140                  p        Print the line if match was found and replaced
141                  w [file] Write (append) line to file if match replaced
142 
143       t [label]  Test, jump to :label only if an "s" command found a match in
144                  this line since last test (replacing with same text counts)
145 
146       T [label]  Test false, jump only if "s" hasn't found a match.
147 
148       w [file]   Write (append) line to file
149 
150       y/old/new/ Change each character in 'old' to corresponding character
151                  in 'new' (with standard backslash escapes, delimiter can be
152                  any repeated character except \ or \n)
153 
154       : [label]  Labeled target for jump commands
155 
156       #  Comment, ignore rest of this line of SCRIPT
157 
158     Deviations from posix: allow extended regular expressions with -r,
159     editing in place with -i, separate with -s, printf escapes in text, line
160     continuations, semicolons after all commands, 2-address anywhere an
161     address is allowed, "T" command, multiline continuations for [abc],
162     \; to end [abc] argument before end of line.
163 */
164 
165 #define FOR_sed
166 #include "toys.h"
167 
GLOBALS(
  // Script sources. The names match the "f*" and "e*" entries of the option
  // string; presumably filled in by option parsing (TODO confirm in sed_main)
  struct arg_list *f;
  struct arg_list *e;

  // processed pattern list
  // (walked as a list of struct step by walk_pattern() and do_sed())
  struct double_list *pattern;

  // nextline: input line held back by walk_pattern() until the following
  // call, remember: line saved by the h/H/x commands. The script parser also
  // borrows remember as scratch space for the text of an s/// regex.
  char *nextline, *remember;
  // restart: step to resume at on the next line, stored plus one so it is
  // never NULL. lastregex: most recent regex handed out by get_regex().
  void *restart, *lastregex;
  // Byte lengths of nextline and remember, and the current input line number
  // (the script parser also borrows nextlen to count unclosed "{")
  long nextlen, rememberlen, count;
  // File descriptor emit() writes to, and whether the last thing written
  // there is still missing its trailing newline
  int fdout, noeol;
  // Cached wrap width for the "l" command (0 until first use)
  unsigned xx;
)
181 
// One parsed script command. Variable length data (compiled regexes, text
// arguments, file records) is stored after the struct in the same allocation
// and located through the offset fields below.
struct step {
  // List links, walk_pattern() follows next to reach the following command
  struct step *next, *prev;

  // Begin and end of each match
  // lmatch holds line number addresses (-1 stands for "$"), rmatch holds the
  // byte offset from the start of this step to a compiled regex. 0 = not set.
  long lmatch[2];
  int rmatch[2], arg1, arg2, w; // offsets because remalloc()
  // not: the address was followed by "!". hit: currently inside a matched
  // address range (the parser also borrows it to flag line continuations).
  // sflags: s/// flags, bit 0 is "i", bit 1 is "g", bit 2 is "p", and the
  // bits from 3 up hold the occurrence number.
  unsigned not, hit, sflags;
  char c; // action
};
191 
192 // Write out line with potential embedded NUL, handling eol/noeol
emit(char * line,long len,int eol)193 static int emit(char *line, long len, int eol)
194 {
195   int l, old = line[len];
196 
197   if (TT.noeol && !writeall(TT.fdout, "\n", 1)) return 1;
198   if (eol) line[len++] = '\n';
199   if (!len) return 0;
200   TT.noeol = len && !eol;
201   l = writeall(TT.fdout, line, len);
202   if (eol) line[len-1] = old;
203   if (l != len) {
204     perror_msg("short write");
205 
206     return 1;
207   }
208 
209   return 0;
210 }
211 
// Do regex matching handling embedded NUL bytes in string. Note that
// neither the pattern nor the match can currently include NUL bytes
// (even with wildcards) and string must be null terminated.
//
// The len bytes at string are split at NUL bytes and regexec() is tried on
// each segment in turn. On the first match every reported offset in pmatch
// is rebased to be relative to string and 0 is returned, otherwise the
// regexec() result for the last segment is returned.
static int ghostwheel(regex_t *preg, char *string, long len, int nmatch,
  regmatch_t pmatch[], int eflags)
{
  char *s = string;

  for (;;) {
    long ll = 0;
    int rc, i;

    // Skip leading NUL bytes, then measure this segment
    while (len && !*s) {
      s++;
      len--;
    }
    while (s[ll] && ll<len) ll++;

    rc = regexec(preg, s, nmatch, pmatch, eflags);
    if (!rc) {
      // Rebase every matched entry. An unmatched subexpression (-1) can sit
      // between matched ones, as in "(a)|(b)", so skip it instead of stopping.
      for (i = 0; i<nmatch; i++) {
        if (pmatch[i].rm_so == -1) continue;
        pmatch[i].rm_so += s-string;
        pmatch[i].rm_eo += s-string;
      }

      return 0;
    }
    if (ll==len) return rc;

    s += ll;
    len -= ll;
  }
}
245 
// Grow the allocation at *old (currently holding oldlen bytes) and append a
// copy of new to it, NUL terminating the result. A negative newlen asks for
// a newline between the old and new data, with -newlen bytes copied from new.
// *old is updated to the (possibly moved) allocation. Returns a pointer to
// the byte just past the new NUL terminator.
static char *extend_string(char **old, char *new, int oldlen, int newlen)
{
  int glue = 0;
  char *buf, *end;

  if (newlen < 0) {
    glue = 1;
    newlen = -newlen;
  }

  buf = xrealloc(*old, oldlen+newlen+glue+1);
  *old = buf;

  end = buf+oldlen;
  if (glue) *(end++) = '\n';
  memcpy(end, new, newlen);
  end += newlen;
  *end = 0;

  return end+1;
}
261 
262 // An empty regex repeats the previous one
get_regex(void * trump,int offset)263 void *get_regex(void *trump, int offset)
264 {
265   if (!offset) {
266     if (!TT.lastregex) error_exit("no previous regex");
267     return TT.lastregex;
268   }
269 
270   return TT.lastregex = offset+(char *)trump;
271 }
272 
273 // Apply pattern to line from input file
walk_pattern(char ** pline,long plen)274 static void walk_pattern(char **pline, long plen)
275 {
276   struct append {
277     struct append *next, *prev;
278     int file;
279     char *str;
280   } *append = 0;
281   char *line = TT.nextline;
282   long len = TT.nextlen;
283   struct step *logrus;
284   int eol = 0, tea = 0;
285 
286   // Grab next line for deferred processing (EOF detection: we get a NULL
287   // pline at EOF to flush last line). Note that only end of _last_ input
288   // file matches $ (unless we're doing -i).
289   TT.nextline = 0;
290   TT.nextlen = 0;
291   if (pline) {
292     TT.nextline = *pline;
293     TT.nextlen = plen;
294     *pline = 0;
295   }
296 
297   if (!line || !len) return;
298   if (line[len-1] == '\n') line[--len] = eol++;
299   TT.count++;
300 
301   // The restart-1 is because we added one to make sure it wasn't NULL,
302   // otherwise N as last command would restart script
303   logrus = TT.restart ? ((struct step *)TT.restart)-1 : (void *)TT.pattern;
304   TT.restart = 0;
305 
306   while (logrus) {
307     char *str, c = logrus->c;
308 
309     // Have we got a line or regex matching range for this rule?
310     if (*logrus->lmatch || *logrus->rmatch) {
311       int miss = 0;
312       long lm;
313 
314       // In a match that might end?
315       if (logrus->hit) {
316         if (!(lm = logrus->lmatch[1])) {
317           if (!logrus->rmatch[1]) logrus->hit = 0;
318           else {
319             void *rm = get_regex(logrus, logrus->rmatch[1]);
320 
321             // regex match end includes matching line, so defer deactivation
322             if (line && !ghostwheel(rm, line, len, 0, 0, 0)) miss = 1;
323           }
324         } else if (lm > 0 && lm < TT.count) logrus->hit = 0;
325 
326       // Start a new match?
327       } else {
328         if (!(lm = *logrus->lmatch)) {
329           void *rm = get_regex(logrus, *logrus->rmatch);
330 
331           if (line && !ghostwheel(rm, line, len, 0, 0, 0)) logrus->hit++;
332         } else if (lm == TT.count || (lm == -1 && !pline)) logrus->hit++;
333 
334         if (!logrus->lmatch[1] && !logrus->rmatch[1]) miss = 1;
335       }
336 
337       // Didn't match?
338       lm = !(logrus->hit ^ logrus->not);
339 
340       // Deferred disable from regex end match
341       if (miss || logrus->lmatch[1] == TT.count) logrus->hit = 0;
342 
343       if (lm) {
344         // Handle skipping curly bracket command group
345         if (c == '{') {
346           int curly = 1;
347 
348           while (curly) {
349             logrus = logrus->next;
350             if (logrus->c == '{') curly++;
351             if (logrus->c == '}') curly--;
352           }
353         }
354         logrus = logrus->next;
355         continue;
356       }
357     }
358 
359     // A deleted line can still update line match state for later commands
360     if (!line) {
361       logrus = logrus->next;
362       continue;
363     }
364 
365     // Process command
366 
367     if (c=='a' || c=='r') {
368       struct append *a = xzalloc(sizeof(struct append));
369       a->str = logrus->arg1+(char *)logrus;
370       a->file = c=='r';
371       dlist_add_nomalloc((void *)&append, (void *)a);
372     } else if (c=='b' || c=='t' || c=='T') {
373       int t = tea;
374 
375       if (c != 'b') tea = 0;
376       if (c=='b' || t^(c=='T')) {
377         if (!logrus->arg1) break;
378         str = logrus->arg1+(char *)logrus;
379         for (logrus = (void *)TT.pattern; logrus; logrus = logrus->next)
380           if (logrus->c == ':' && !strcmp(logrus->arg1+(char *)logrus, str))
381             break;
382         if (!logrus) error_exit("no :%s", str);
383       }
384     } else if (c=='c') {
385       str = logrus->arg1+(char *)logrus;
386       if (!logrus->hit) emit(str, strlen(str), 1);
387       free(line);
388       line = 0;
389       continue;
390     } else if (c=='d') {
391       free(line);
392       line = 0;
393       continue;
394     } else if (c=='D') {
395       // Delete up to \n or end of buffer
396       str = line;
397       while ((str-line)<len) if (*(str++) == '\n') break;
398       len -= str - line;
399       memmove(line, str, len);
400 
401       // if "delete" blanks line, disable further processing
402       // otherwise trim and restart script
403       if (!len) {
404         free(line);
405         line = 0;
406       } else {
407         line[len] = 0;
408         logrus = (void *)TT.pattern;
409       }
410       continue;
411     } else if (c=='g') {
412       free(line);
413       line = xstrdup(TT.remember);
414       len = TT.rememberlen;
415     } else if (c=='G') {
416       line = xrealloc(line, len+TT.rememberlen+2);
417       line[len++] = '\n';
418       memcpy(line+len, TT.remember, TT.rememberlen);
419       line[len += TT.rememberlen] = 0;
420     } else if (c=='h') {
421       free(TT.remember);
422       TT.remember = xstrdup(line);
423       TT.rememberlen = len;
424     } else if (c=='H') {
425       TT.remember = xrealloc(TT.remember, TT.rememberlen+len+2);
426       TT.remember[TT.rememberlen++] = '\n';
427       memcpy(TT.remember+TT.rememberlen, line, len);
428       TT.remember[TT.rememberlen += len] = 0;
429     } else if (c=='i') {
430       str = logrus->arg1+(char *)logrus;
431       emit(str, strlen(str), 1);
432     } else if (c=='l') {
433       int i, x, off;
434 
435       if (!TT.xx) {
436         terminal_size(&TT.xx, 0);
437         if (!TT.xx) TT.xx = 80;
438         if (TT.xx > sizeof(toybuf)-10) TT.xx = sizeof(toybuf)-10;
439         if (TT.xx > 4) TT.xx -= 4;
440       }
441 
442       for (i = off = 0; i<len; i++) {
443         if (off >= TT.xx) {
444           toybuf[off++] = '\\';
445           emit(toybuf, off, 1);
446           off = 0;
447         }
448         x = stridx("\\\a\b\f\r\t\v", line[i]);
449         if (x != -1) {
450           toybuf[off++] = '\\';
451           toybuf[off++] = "\\abfrtv"[x];
452         } else if (line[i] >= ' ') toybuf[off++] = line[i];
453         else off += sprintf(toybuf+off, "\\%03o", line[i]);
454       }
455       toybuf[off++] = '$';
456       emit(toybuf, off, 1);
457     } else if (c=='n') {
458       TT.restart = logrus->next+1;
459 
460       break;
461     } else if (c=='N') {
462       // Can't just grab next line because we could have multiple N and
463       // we need to actually read ahead to get N;$p EOF detection right.
464       if (pline) {
465         TT.restart = logrus->next+1;
466         extend_string(&line, TT.nextline, len, -TT.nextlen);
467         free(TT.nextline);
468         TT.nextline = line;
469         TT.nextlen += len + 1;
470         line = 0;
471       }
472 
473       // Pending append goes out right after N
474       goto done;
475     } else if (c=='p' || c=='P') {
476       char *l = (c=='P') ? strchr(line, '\n') : 0;
477 
478       if (emit(line, l ? l-line : len, eol)) break;
479     } else if (c=='q') {
480       if (pline) *pline = (void *)1;
481       free(TT.nextline);
482       TT.nextline = 0;
483       TT.nextlen = 0;
484 
485       break;
486     } else if (c=='s') {
487       char *rline = line, *new = logrus->arg2 + (char *)logrus, *swap, *rswap;
488       regmatch_t *match = (void *)toybuf;
489       regex_t *reg = get_regex(logrus, logrus->arg1);
490       int mflags = 0, count = 0, zmatch = 1, rlen = len, mlen, off, newlen;
491 
492       // Find match in remaining line (up to remaining len)
493       while (!ghostwheel(reg, rline, rlen, 10, match, mflags)) {
494         mflags = REG_NOTBOL;
495 
496         // Zero length matches don't count immediately after a previous match
497         mlen = match[0].rm_eo-match[0].rm_so;
498         if (!mlen && !zmatch) {
499           if (!rlen--) break;
500           rline++;
501           zmatch++;
502           continue;
503         } else zmatch = 0;
504 
505         // If we're replacing only a specific match, skip if this isn't it
506         off = logrus->sflags>>3;
507         if (off && off != ++count) {
508           rline += match[0].rm_eo;
509           rlen -= match[0].rm_eo;
510 
511           continue;
512         }
513         // The fact getline() can allocate unbounded amounts of memory is
514         // a bigger issue, but while we're here check for integer overflow
515         if (match[0].rm_eo > INT_MAX) perror_exit(0);
516 
517         // newlen = strlen(new) but with \1 and & and printf escapes
518         for (off = newlen = 0; new[off]; off++) {
519           int cc = -1;
520 
521           if (new[off] == '&') cc = 0;
522           else if (new[off] == '\\') cc = new[++off] - '0';
523           if (cc < 0 || cc > 9) {
524             newlen++;
525             continue;
526           }
527           newlen += match[cc].rm_eo-match[cc].rm_so;
528         }
529 
530         // Allocate new size, copy start/end around match. (Can't extend in
531         // place because backrefs may refer to text after it's overwritten.)
532         len += newlen-mlen;
533         swap = xmalloc(len+1);
534         rswap = swap+(rline-line)+match[0].rm_so;
535         memcpy(swap, line, (rline-line)+match[0].rm_so);
536         memcpy(rswap+newlen, rline+match[0].rm_eo, (rlen -= match[0].rm_eo)+1);
537 
538         // copy in new replacement text
539         for (off = mlen = 0; new[off]; off++) {
540           int cc = 0, ll;
541 
542           if (new[off] == '\\') {
543             cc = new[++off] - '0';
544             if (cc<0 || cc>9) {
545               if (!(rswap[mlen++] = unescape(new[off])))
546                 rswap[mlen-1] = new[off];
547 
548               continue;
549             } else if (match[cc].rm_so == -1) error_exit("no s//\\%d/", cc);
550           } else if (new[off] != '&') {
551             rswap[mlen++] = new[off];
552 
553             continue;
554           }
555 
556           ll = match[cc].rm_eo-match[cc].rm_so;
557           memcpy(rswap+mlen, rline+match[cc].rm_so, ll);
558           mlen += ll;
559         }
560 
561         rline = rswap+newlen;
562         free(line);
563         line = swap;
564 
565         // Stop after first substitution unless we have flag g
566         if (!(logrus->sflags & 2)) break;
567       }
568 
569       if (mflags) {
570         // flag p
571         if (logrus->sflags & 4) emit(line, len, eol);
572 
573         tea = 1;
574         if (logrus->w) goto writenow;
575       }
576     } else if (c=='w') {
577       int fd, noeol;
578       char *name;
579 
580 writenow:
581       // Swap out emit() context
582       fd = TT.fdout;
583       noeol = TT.noeol;
584 
585       // We save filehandle and newline status before filename
586       name = logrus->w + (char *)logrus;
587       memcpy(&TT.fdout, name, 4);
588       name += 4;
589       TT.noeol = *(name++);
590 
591       // write, then save/restore context
592       if (emit(line, len, eol))
593         perror_exit("w '%s'", logrus->arg1+(char *)logrus);
594       *(--name) = TT.noeol;
595       TT.noeol = noeol;
596       TT.fdout = fd;
597     } else if (c=='x') {
598       long swap = TT.rememberlen;
599 
600       str = TT.remember;
601       TT.remember = line;
602       line = str;
603       TT.rememberlen = len;
604       len = swap;
605     } else if (c=='y') {
606       char *from, *to = (char *)logrus;
607       int i, j;
608 
609       from = to+logrus->arg1;
610       to += logrus->arg2;
611 
612       for (i = 0; i < len; i++) {
613         j = stridx(from, line[i]);
614         if (j != -1) line[i] = to[j];
615       }
616     } else if (c=='=') {
617       sprintf(toybuf, "%ld", TT.count);
618       emit(toybuf, strlen(toybuf), 1);
619     }
620 
621     logrus = logrus->next;
622   }
623 
624   if (line && !(toys.optflags & FLAG_n)) emit(line, len, eol);
625 
626 done:
627   free(line);
628 
629   if (dlist_terminate(append)) while (append) {
630     struct append *a = append->next;
631 
632     if (append->file) {
633       int fd = open(append->str, O_RDONLY);
634 
635       // Force newline if noeol pending
636       if (fd != -1) {
637         if (TT.noeol) xwrite(TT.fdout, "\n", 1);
638         TT.noeol = 0;
639         xsendfile(fd, TT.fdout);
640         close(fd);
641       }
642     } else emit(append->str, strlen(append->str), 1);
643     free(append);
644     append = a;
645   }
646 }
647 
// Genericish function, can probably get moved to lib.c

// Iterate over lines in file, calling function. Function can write 0 to
// the line pointer if they want to keep it, or 1 to terminate processing,
// otherwise line is freed. Passed file descriptor is closed at the end
// (fd 0 reads from stdin, which is left open).
static void do_lines(int fd, char *name, void (*call)(char **pline, long len))
{
  FILE *fp = fd ? xfdopen(fd, "r") : stdin;

  for (;;) {
    char *line = 0;
    size_t size = 0;
    ssize_t len = getline(&line, &size, fp);

    if (len <= 0) {
      // getline() can allocate a buffer even when it returns failure
      free(line);
      break;
    }

    call(&line, len);
    if (line == (void *)1) break;
    free(line);
  }

  if (fd) fclose(fp);
}
671 
do_sed(int fd,char * name)672 static void do_sed(int fd, char *name)
673 {
674   int i = toys.optflags & FLAG_i;
675   char *tmp;
676 
677   if (i) {
678     struct step *primal;
679 
680     if (!fd && *name=='-') {
681       error_msg("-i on stdin");
682       return;
683     }
684     TT.fdout = copy_tempfile(fd, name, &tmp);
685     TT.count = 0;
686     for (primal = (void *)TT.pattern; primal; primal = primal->next)
687       primal->hit = 0;
688   }
689   do_lines(fd, name, walk_pattern);
690   if (i) {
691     walk_pattern(0, 0);
692     replace_tempfile(-1, TT.fdout, &tmp);
693     TT.fdout = 1;
694     TT.nextline = 0;
695     TT.nextlen = TT.noeol = 0;
696   }
697 }
698 
// Copy chunk of string between two delimiters, converting printf escapes.
// returns processed copy of string (0 if error), *pstr advances to next
// unused char. if delim (or *delim) is 0 uses/saves starting char as delimiter
// if regex, ignore delimiter in [ranges]
//
// The returned copy is allocated and belongs to the caller. On error nothing
// is allocated and *pstr is left unchanged.
static char *unescape_delimited_string(char **pstr, char *delim, int regex)
{
  char *to, *from, *out, mode = 0, d;

  // Grab leading delimiter (if necessary), allocate space for new string
  from = *pstr;
  if (!delim || !*delim) {
    if (!(d = *(from++))) return 0;
    if (d == '\\') d = *(from++);
    if (!d || d == '\\') return 0;
    if (delim) *delim = d;
  } else d = *delim;
  to = out = xmalloc(strlen(*pstr)+1);

  while (mode || *from != d) {
    if (!*from) goto fail;

    // delimiter in regex character range doesn't count (only for regexes:
    // in a plain string such as a y/// argument "[" is an ordinary character)
    if (regex && *from == '[') {
      mode = '[';
      if (from[1] == ']') *(to++) = *(from++);
    } else if (mode && *from == ']') mode = 0;
    else if (*from == '\\') {
      if (!from[1]) goto fail;

      // Check escaped end delimiter before printf style escapes.
      if (from[1] == d) from++;
      else if (from[1]=='\\') *(to++) = *(from++);
      else {
        char c = unescape(from[1]);

        if (c) {
          *(to++) = c;
          from+=2;
          continue;
        } else *(to++) = *(from++);
      }
    }
    *(to++) = *(from++);
  }
  *to = 0;
  *pstr = from+1;

  return out;

fail:
  // Don't leak the partial copy when the closing delimiter is missing
  free(out);

  return 0;
}
747 
748 // Translate primal pattern into walkable form.
jewel_of_judgement(char ** pline,long len)749 static void jewel_of_judgement(char **pline, long len)
750 {
751   struct step *corwin = (void *)TT.pattern;
752   char *line, *reg, c, *errstart;
753   int i;
754 
755   line = errstart = pline ? *pline : "";
756   if (len && line[len-1]=='\n') line[--len] = 0;
757 
758   // Append additional line to pattern argument string?
759   // We temporarily repurpose "hit" to indicate line continuations
760   if (corwin && corwin->prev->hit) {
761     if (!*pline) error_exit("unfinished %c", corwin->prev->c);;
762     // Remove half-finished entry from list so remalloc() doesn't confuse it
763     TT.pattern = TT.pattern->prev;
764     corwin = dlist_pop(&TT.pattern);
765     c = corwin->c;
766     reg = (char *)corwin;
767     reg += corwin->arg1 + strlen(reg + corwin->arg1);
768 
769     // Resume parsing for 'a' or 's' command
770     if (corwin->hit < 256) goto resume_s;
771     else goto resume_a;
772   }
773 
774   // Loop through commands in line
775 
776   corwin = 0;
777   for (;;) {
778     if (corwin) dlist_add_nomalloc(&TT.pattern, (void *)corwin);
779 
780     for (;;) {
781       while (isspace(*line) || *line == ';') line++;
782       if (*line == '#') while (*line && *line != '\n') line++;
783       else break;
784     }
785     if (!*line) return;
786 
787     errstart = line;
788     memset(toybuf, 0, sizeof(struct step));
789     corwin = (void *)toybuf;
790     reg = toybuf + sizeof(struct step);
791 
792     // Parse address range (if any)
793     for (i = 0; i < 2; i++) {
794       if (*line == ',') line++;
795       else if (i) break;
796 
797       if (isdigit(*line)) corwin->lmatch[i] = strtol(line, &line, 0);
798       else if (*line == '$') {
799         corwin->lmatch[i] = -1;
800         line++;
801       } else if (*line == '/' || *line == '\\') {
802         char *s = line;
803 
804         if (!(s = unescape_delimited_string(&line, 0, 1))) goto brand;
805         if (!*s) corwin->rmatch[i] = 0;
806         else {
807           xregcomp((void *)reg, s, (toys.optflags & FLAG_r)*REG_EXTENDED);
808           corwin->rmatch[i] = reg-toybuf;
809           reg += sizeof(regex_t);
810         }
811         free(s);
812       } else break;
813     }
814 
815     while (isspace(*line)) line++;
816     if (!*line) break;
817 
818     while (*line == '!') {
819       corwin->not = 1;
820       line++;
821     }
822     while (isspace(*line)) line++;
823 
824     c = corwin->c = *(line++);
825     if (strchr("}:", c) && i) break;
826     if (strchr("aiqr=", c) && i>1) break;
827 
828     // Add step to pattern
829     corwin = xmalloc(reg-toybuf);
830     memcpy(corwin, toybuf, reg-toybuf);
831     reg = (reg-toybuf) + (char *)corwin;
832 
833     // Parse arguments by command type
834     if (c == '{') TT.nextlen++;
835     else if (c == '}') {
836       if (!TT.nextlen--) break;
837     } else if (c == 's') {
838       char *fiona, delim = 0;
839 
840       // s/pattern/replacement/flags
841 
842       // line continuations use arg1, so we fill out arg2 first (since the
843       // regex part can't be multiple lines) and swap them back later.
844 
845       // get pattern (just record, we parse it later)
846       corwin->arg2 = reg - (char *)corwin;
847       if (!(TT.remember = unescape_delimited_string(&line, &delim, 1)))
848         goto brand;
849 
850       reg += sizeof(regex_t);
851       corwin->arg1 = reg-(char *)corwin;
852       corwin->hit = delim;
853 resume_s:
854       // get replacement - don't replace escapes because \1 and \& need
855       // processing later, after we replace \\ with \ we can't tell \\1 from \1
856       fiona = line;
857       while (*fiona != corwin->hit) {
858         if (!*fiona) goto brand;
859         if (*fiona++ == '\\') {
860           if (!*fiona || *fiona == '\n') {
861             fiona[-1] = '\n';
862             break;
863           }
864           fiona++;
865         }
866       }
867 
868       reg = extend_string((void *)&corwin, line, reg-(char *)corwin,fiona-line);
869       line = fiona;
870       // line continuation? (note: '\n' can't be a valid delim).
871       if (*line == corwin->hit) corwin->hit = 0;
872       else {
873         if (!*line) continue;
874         reg--;
875         line++;
876         goto resume_s;
877       }
878 
879       // swap arg1/arg2 so they're back in order arguments occur.
880       i = corwin->arg1;
881       corwin->arg1 = corwin->arg2;
882       corwin->arg2 = i;
883 
884       // get flags
885       for (line++; *line; line++) {
886         long l;
887 
888         if (isspace(*line) && *line != '\n') continue;
889 
890         if (0 <= (l = stridx("igp", *line))) corwin->sflags |= 1<<l;
891         else if (!(corwin->sflags>>3) && 0<(l = strtol(line, &line, 10))) {
892           corwin->sflags |= l << 3;
893           line--;
894         } else break;
895       }
896 
897       // We deferred actually parsing the regex until we had the s///i flag
898       // allocating the space was done by extend_string() above
899       if (!*TT.remember) corwin->arg1 = 0;
900       else xregcomp((void *)(corwin->arg1 + (char *)corwin), TT.remember,
901         ((toys.optflags & FLAG_r)*REG_EXTENDED)|((corwin->sflags&1)*REG_ICASE));
902       free(TT.remember);
903       TT.remember = 0;
904       if (*line == 'w') {
905         line++;
906         goto writenow;
907       }
908     } else if (c == 'w') {
909       int fd, delim;
910       char *cc;
911 
912       // Since s/// uses arg1 and arg2, and w needs a persistent filehandle and
913       // eol status, and to retain the filename for error messages, we'd need
914       // to go up to arg5 just for this. Compromise: dynamically allocate the
915       // filehandle and eol status.
916 
917 writenow:
918       while (isspace(*line)) line++;
919       if (!*line) goto brand;
920       for (cc = line; *cc; cc++) if (*cc == '\\' && cc[1] == ';') break;
921       delim = *cc;
922       *cc = 0;
923       fd = xcreate(line, O_WRONLY|O_CREAT|O_TRUNC, 0644);
924       *cc = delim;
925 
926       corwin->w = reg - (char *)corwin;
927       corwin = xrealloc(corwin, corwin->w+(cc-line)+6);
928       reg = corwin->w + (char *)corwin;
929 
930       memcpy(reg, &fd, 4);
931       reg += 4;
932       *(reg++) = 0;
933       memcpy(reg, line, delim);
934       reg += delim;
935       *(reg++) = 0;
936 
937       line = cc;
938       if (delim) line += 2;
939     } else if (c == 'y') {
940       char *s, delim = 0;
941       int len;
942 
943       if (!(s = unescape_delimited_string(&line, &delim, 0))) goto brand;
944       corwin->arg1 = reg-(char *)corwin;
945       len = strlen(s);
946       reg = extend_string((void *)&corwin, s, reg-(char *)corwin, len);
947       free(s);
948       corwin->arg2 = reg-(char *)corwin;
949       if (!(s = unescape_delimited_string(&line, &delim, 0))) goto brand;
950       if (len != strlen(s)) goto brand;
951       reg = extend_string((void *)&corwin, s, reg-(char*)corwin, len);
952       free(s);
953     } else if (strchr("abcirtTw:", c)) {
954       int end;
955 
956       while (isspace(*line) && *line != '\n') line++;
957 
958       // Resume logic differs from 's' case because we don't add a newline
959       // unless it's after something, so we add it on return instead.
960 resume_a:
961       corwin->hit = 0;
962 
963       // Trim whitespace from "b ;" and ": blah " but only first space in "w x "
964       if (!(end = strcspn(line, strchr("btT:", c) ? "; \t\r\n\v\f" : "\n"))) {
965         if (strchr("btT", c)) continue;
966         else if (!corwin->arg1) break;
967       }
968 
969       // Extend allocation to include new string. We use offsets instead of
970       // pointers so realloc() moving stuff doesn't break things. Ok to write
971       // \n over NUL terminator because call to extend_string() adds it back.
972       if (!corwin->arg1) corwin->arg1 = reg - (char*)corwin;
973       else if ((corwin+1) != (void *)reg) *(reg++) = '\n';
974       reg = extend_string((void *)&corwin, line, reg - (char *)corwin, end);
975 
976       // Recopy data to remove escape sequences and handle line continuation.
977       if (strchr("aci", c)) {
978         reg -= end+1;
979         for (i = end; i; i--) {
980           if ((*reg++ = *line++)=='\\') {
981 
982             // escape at end of line: resume if -e escaped literal newline,
983             // else request callback and resume with next line
984             if (!--i) {
985               *--reg = 0;
986               if (*line) {
987                 line++;
988                 goto resume_a;
989               }
990               corwin->hit = 256;
991               break;
992             }
993             if (!(reg[-1] = unescape(*line))) reg[-1] = *line;
994             line++;
995           }
996         }
997         *reg = 0;
998       } else line += end;
999 
1000     // Commands that take no arguments
1001     } else if (!strchr("{dDgGhHlnNpPqx=", c)) break;
1002   }
1003 
1004 brand:
1005   // Reminisce about chestnut trees.
1006   error_exit("bad pattern '%s'@%ld (%c)", errstart, line-errstart+1L, *line);
1007 }
1008 
sed_main(void)1009 void sed_main(void)
1010 {
1011   struct arg_list *dworkin;
1012   char **args = toys.optargs;
1013 
1014   // Lie to autoconf when it asks stupid questions, so configure regexes
1015   // that look for "GNU sed version %f" greater than some old buggy number
1016   // don't fail us for not matching their narrow expectations.
1017   if (toys.optflags & FLAG_version) {
1018     xprintf("This is not GNU sed version 9.0\n");
1019     return;
1020   }
1021 
1022   // Need a pattern. If no unicorns about, fight serpent and take its eye.
1023   if (!TT.e && !TT.f) {
1024     if (!*toys.optargs) error_exit("no pattern");
1025     (TT.e = xzalloc(sizeof(struct arg_list)))->arg = *(args++);
1026   }
1027 
1028   // Option parsing infrastructure can't interlace "-e blah -f blah -e blah"
1029   // so handle all -e, then all -f. (At least the behavior's consistent.)
1030 
1031   for (dworkin = TT.e; dworkin; dworkin = dworkin->next)
1032     jewel_of_judgement(&dworkin->arg, strlen(dworkin->arg));
1033   for (dworkin = TT.f; dworkin; dworkin = dworkin->next)
1034     do_lines(xopen(dworkin->arg, O_RDONLY), dworkin->arg, jewel_of_judgement);
1035   jewel_of_judgement(0, 0);
1036   dlist_terminate(TT.pattern);
1037   if (TT.nextlen) error_exit("no }");
1038 
1039   TT.fdout = 1;
1040   TT.remember = xstrdup("");
1041 
1042   // Inflict pattern upon input files
1043   loopfiles_rw(args, O_RDONLY, 0, 0, do_sed);
1044 
1045   if (!(toys.optflags & FLAG_i)) walk_pattern(0, 0);
1046 
1047   // todo: need to close fd when done for TOYBOX_FREE?
1048 }
1049