1 /* sed.c - stream editor. Thing that does s/// and other stuff.
2  *
3  * Copyright 2014 Rob Landley <rob@landley.net>
4  *
5  * See http://pubs.opengroup.org/onlinepubs/9699919799/utilities/sed.html
6  *
7  * TODO: lines > 2G could wrap signed int length counters. Not just getline()
8  * but N and s///
9 
10 USE_SED(NEWTOY(sed, "(version)e*f*inEr[+Er]", TOYFLAG_USR|TOYFLAG_BIN|TOYFLAG_LOCALE))
11 
12 config SED
13   bool "sed"
14   default y
15   help
16     usage: sed [-inrE] [-e SCRIPT]...|SCRIPT [-f SCRIPT_FILE]... [FILE...]
17 
18     Stream editor. Apply one or more editing SCRIPTs to each line of input
19     (from FILE or stdin) producing output (by default to stdout).
20 
21     -e	add SCRIPT to list
22     -f	add contents of SCRIPT_FILE to list
23     -i	Edit each file in place.
24     -n	No default output. (Use the p command to output matched lines.)
25     -r	Use extended regular expression syntax.
26     -E	Alias for -r.
27     -s	Treat input files separately (implied by -i)
28 
29     A SCRIPT is a series of one or more COMMANDs separated by newlines or
30     semicolons. All -e SCRIPTs are concatenated together as if separated
31     by newlines, followed by all lines from -f SCRIPT_FILEs, in order.
32     If no -e or -f SCRIPTs are specified, the first argument is the SCRIPT.
33 
34     Each COMMAND may be preceded by an address which limits the command to
35     apply only to the specified line(s). Commands without an address apply to
36     every line. Addresses are of the form:
37 
38       [ADDRESS[,ADDRESS]]COMMAND
39 
40     The ADDRESS may be a decimal line number (starting at 1), a /regular
41     expression/ within a pair of forward slashes, or the character "$" which
42     matches the last line of input. (In -s or -i mode this matches the last
43     line of each file, otherwise just the last line of the last file.) A single
44     address matches one line, a pair of comma separated addresses match
45     everything from the first address to the second address (inclusive). If
46     both addresses are regular expressions, more than one range of lines in
47     each file can match.
48 
49     REGULAR EXPRESSIONS in sed are started and ended by the same character
50     (traditionally / but anything except a backslash or a newline works).
51     Backslashes may be used to escape the delimiter if it occurs in the
52     regex, and for the usual printf escapes (\abcefnrtv and octal, hex,
53     and unicode). An empty regex repeats the previous one. ADDRESS regexes
    (above) require the first delimiter to be escaped with a backslash when
55     it isn't a forward slash (to distinguish it from the COMMANDs below).
56 
57     Sed mostly operates on individual lines one at a time. It reads each line,
58     processes it, and either writes it to the output or discards it before
59     reading the next line. Sed can remember one additional line in a separate
60     buffer (using the h, H, g, G, and x commands), and can read the next line
61     of input early (using the n and N command), but other than that command
62     scripts operate on individual lines of text.
63 
64     Each COMMAND starts with a single character. The following commands take
65     no arguments:
66 
67       {  Start a new command block, continuing until a corresponding "}".
68          Command blocks may nest. If the block has an address, commands within
69          the block are only run for lines within the block's address range.
70 
71       }  End command block (this command cannot have an address)
72 
73       d  Delete this line and move on to the next one
74          (ignores remaining COMMANDs)
75 
76       D  Delete one line of input and restart command SCRIPT (same as "d"
77          unless you've glued lines together with "N" or similar)
78 
79       g  Get remembered line (overwriting current line)
80 
81       G  Get remembered line (appending to current line)
82 
83       h  Remember this line (overwriting remembered line)
84 
85       H  Remember this line (appending to remembered line, if any)
86 
87       l  Print line, escaping \abfrtv (but not newline), octal escaping other
88          nonprintable characters, wrapping lines to terminal width with a
89          backslash, and appending $ to actual end of line.
90 
91       n  Print default output and read next line, replacing current line
92          (If no next line available, quit processing script)
93 
94       N  Append next line of input to this line, separated by a newline
95          (This advances the line counter for address matching and "=", if no
96          next line available quit processing script without default output)
97 
98       p  Print this line
99 
100       P  Print this line up to first newline (from "N")
101 
102       q  Quit (print default output, no more commands processed or lines read)
103 
104       x  Exchange this line with remembered line (overwrite in both directions)
105 
106       =  Print the current line number (followed by a newline)
107 
108     The following commands (may) take an argument. The "text" arguments (to
109     the "a", "b", and "c" commands) may end with an unescaped "\" to append
110     the next line (for which leading whitespace is not skipped), and also
111     treat ";" as a literal character (use "\;" instead).
112 
113       a [text]   Append text to output before attempting to read next line
114 
115       b [label]  Branch, jumps to :label (or with no label, to end of SCRIPT)
116 
117       c [text]   Delete line, output text at end of matching address range
118                  (ignores remaining COMMANDs)
119 
120       i [text]   Print text
121 
122       r [file]   Append contents of file to output before attempting to read
123                  next line.
124 
125       s/S/R/F    Search for regex S, replace matched text with R using flags F.
126                  The first character after the "s" (anything but newline or
127                  backslash) is the delimiter, escape with \ to use normally.
128 
129                  The replacement text may contain "&" to substitute the matched
130                  text (escape it with backslash for a literal &), or \1 through
131                  \9 to substitute a parenthetical subexpression in the regex.
132                  You can also use the normal backslash escapes such as \n and
133                  a backslash at the end of the line appends the next line.
134 
135                  The flags are:
136 
137                  [0-9]    A number, substitute only that occurrence of pattern
138                  g        Global, substitute all occurrences of pattern
139                  i        Ignore case when matching
140                  p        Print the line if match was found and replaced
141                  w [file] Write (append) line to file if match replaced
142 
143       t [label]  Test, jump to :label only if an "s" command found a match in
144                  this line since last test (replacing with same text counts)
145 
146       T [label]  Test false, jump only if "s" hasn't found a match.
147 
148       w [file]   Write (append) line to file
149 
150       y/old/new/ Change each character in 'old' to corresponding character
151                  in 'new' (with standard backslash escapes, delimiter can be
152                  any repeated character except \ or \n)
153 
154       : [label]  Labeled target for jump commands
155 
156       #  Comment, ignore rest of this line of SCRIPT
157 
158     Deviations from posix: allow extended regular expressions with -r,
159     editing in place with -i, separate with -s, printf escapes in text, line
160     continuations, semicolons after all commands, 2-address anywhere an
161     address is allowed, "T" command, multiline continuations for [abc],
162     \; to end [abc] argument before end of line.
163 */
164 
165 #define FOR_sed
166 #include "toys.h"
167 
168 GLOBALS(
169   struct arg_list *f;
170   struct arg_list *e;
171 
172   // processed pattern list
173   struct double_list *pattern;
174 
175   char *nextline, *remember;
176   void *restart, *lastregex;
177   long nextlen, rememberlen, count;
178   int fdout, noeol;
179   unsigned xx;
180 )
181 
// One parsed command of the script. Arguments are stored in the same
// allocation after the struct, and referenced by byte offset from its start.
struct step {
  // List links
  struct step *next;
  struct step *prev;

  // Address range as line numbers: [0] start, [1] end (0 unused, -1 is "$")
  long lmatch[2];
  // Address range as offsets to compiled regexes: [0] start, [1] end
  int rmatch[2];
  // Offsets to the command's arguments (offsets because remalloc())
  int arg1;
  int arg2;
  // Offset to the "w" data: saved filehandle, newline status, filename
  int w;
  // Set by "!" to invert the address match
  unsigned not;
  // Nonzero while inside a matching address range (the parser also
  // borrows this to mark line continuations)
  unsigned hit;
  // s/// flags: bit 0 "i", bit 1 "g", bit 2 "p", occurrence number in >>3
  unsigned sflags;
  // Command character (the action)
  char c;
};
191 
192 // Write out line with potential embedded NUL, handling eol/noeol
emit(char * line,long len,int eol)193 static int emit(char *line, long len, int eol)
194 {
195   int l, old = line[len];
196 
197   if (TT.noeol && !writeall(TT.fdout, "\n", 1)) return 1;
198   if (eol) line[len++] = '\n';
199   if (!len) return 0;
200   TT.noeol = len && !eol;
201   l = writeall(TT.fdout, line, len);
202   if (eol) line[len-1] = old;
203   if (l != len) {
204     perror_msg("short write");
205 
206     return 1;
207   }
208 
209   return 0;
210 }
211 
212 // Do regex matching handling embedded NUL bytes in string. Note that
213 // neither the pattern nor the match can currently include NUL bytes
214 // (even with wildcards) and string must be null terminated.
// The buffer is searched one NUL-free segment at a time, skipping any NUL
// bytes between segments.
//
// preg: compiled regex. string/len: buffer to search (string[len] must be
// a NUL terminator). nmatch/pmatch/eflags: passed through to regexec().
// Returns 0 on a match, with every matched pmatch[] entry converted to an
// offset from the start of string, otherwise regexec()'s nonzero result
// for the last segment tried.
static int ghostwheel(regex_t *preg, char *string, long len, int nmatch,
  regmatch_t pmatch[], int eflags)
{
  char *s = string;

  for (;;) {
    long ll = 0;
    int rc, i;

    // Skip NUL bytes in front of the next segment
    while (len && !*s) {
      s++;
      len--;
    }
    // Measure the segment, checking the bound before touching the byte
    while (ll<len && s[ll]) ll++;

    rc = regexec(preg, s, nmatch, pmatch, eflags);
    if (!rc) {
      // regexec() reported offsets relative to this segment. Unmatched
      // subexpressions (-1) stay as they are, but later subexpressions may
      // still have matched (e.g. "(a)|(b)"), so check every entry.
      for (i = 0; i<nmatch; i++) {
        if (pmatch[i].rm_so == -1) continue;
        pmatch[i].rm_so += s-string;
        pmatch[i].rm_eo += s-string;
      }

      return 0;
    }
    // That was the last segment: report the failure
    if (ll==len) return rc;

    s += ll;
    len -= ll;
  }
}
245 
246 // Extend allocation to include new string, with newline between if newlen<0
247 
// old: address of the allocation to grow (updated, since it may move).
// new: bytes to append. oldlen: offset in *old where appended data starts.
// newlen: number of bytes to append; negative means append -newlen bytes
//   with a newline inserted in front of them.
// Returns a pointer just past the NUL terminator written after the new data.
static char *extend_string(char **old, char *new, int oldlen, int newlen)
{
  int sep = 0;
  char *buf, *tail;

  if (newlen < 0) {
    sep = 1;
    newlen = -newlen;
  }

  buf = xrealloc(*old, oldlen+newlen+sep+1);
  *old = buf;

  tail = buf+oldlen;
  if (sep) *(tail++) = '\n';
  memcpy(tail, new, newlen);
  tail[newlen] = 0;

  return tail+newlen+1;
}
261 
262 // An empty regex repeats the previous one
get_regex(void * trump,int offset)263 static void *get_regex(void *trump, int offset)
264 {
265   if (!offset) {
266     if (!TT.lastregex) error_exit("no previous regex");
267     return TT.lastregex;
268   }
269 
270   return TT.lastregex = offset+(char *)trump;
271 }
272 
273 // Apply pattern to line from input file
walk_pattern(char ** pline,long plen)274 static void walk_pattern(char **pline, long plen)
275 {
276   struct append {
277     struct append *next, *prev;
278     int file;
279     char *str;
280   } *append = 0;
281   char *line = TT.nextline;
282   long len = TT.nextlen;
283   struct step *logrus;
284   int eol = 0, tea = 0;
285 
286   // Grab next line for deferred processing (EOF detection: we get a NULL
287   // pline at EOF to flush last line). Note that only end of _last_ input
288   // file matches $ (unless we're doing -i).
289   TT.nextline = 0;
290   TT.nextlen = 0;
291   if (pline) {
292     TT.nextline = *pline;
293     TT.nextlen = plen;
294     *pline = 0;
295   }
296 
297   if (!line || !len) return;
298   if (line[len-1] == '\n') line[--len] = eol++;
299   TT.count++;
300 
301   // The restart-1 is because we added one to make sure it wasn't NULL,
302   // otherwise N as last command would restart script
303   logrus = TT.restart ? ((struct step *)TT.restart)-1 : (void *)TT.pattern;
304   TT.restart = 0;
305 
306   while (logrus) {
307     char *str, c = logrus->c;
308 
309     // Have we got a line or regex matching range for this rule?
310     if (*logrus->lmatch || *logrus->rmatch) {
311       int miss = 0;
312       long lm;
313 
314       // In a match that might end?
315       if (logrus->hit) {
316         if (!(lm = logrus->lmatch[1])) {
317           if (!logrus->rmatch[1]) logrus->hit = 0;
318           else {
319             void *rm = get_regex(logrus, logrus->rmatch[1]);
320 
321             // regex match end includes matching line, so defer deactivation
322             if (line && !ghostwheel(rm, line, len, 0, 0, 0)) miss = 1;
323           }
324         } else if (lm > 0 && lm < TT.count) logrus->hit = 0;
325 
326       // Start a new match?
327       } else {
328         if (!(lm = *logrus->lmatch)) {
329           void *rm = get_regex(logrus, *logrus->rmatch);
330 
331           if (line && !ghostwheel(rm, line, len, 0, 0, 0)) logrus->hit++;
332         } else if (lm == TT.count || (lm == -1 && !pline)) logrus->hit++;
333 
334         if (!logrus->lmatch[1] && !logrus->rmatch[1]) miss = 1;
335       }
336 
337       // Didn't match?
338       lm = !(logrus->hit ^ logrus->not);
339 
340       // Deferred disable from regex end match
341       if (miss || logrus->lmatch[1] == TT.count) logrus->hit = 0;
342 
343       if (lm) {
344         // Handle skipping curly bracket command group
345         if (c == '{') {
346           int curly = 1;
347 
348           while (curly) {
349             logrus = logrus->next;
350             if (logrus->c == '{') curly++;
351             if (logrus->c == '}') curly--;
352           }
353         }
354         logrus = logrus->next;
355         continue;
356       }
357     }
358 
359     // A deleted line can still update line match state for later commands
360     if (!line) {
361       logrus = logrus->next;
362       continue;
363     }
364 
365     // Process command
366 
367     if (c=='a' || c=='r') {
368       struct append *a = xzalloc(sizeof(struct append));
369       a->str = logrus->arg1+(char *)logrus;
370       a->file = c=='r';
371       dlist_add_nomalloc((void *)&append, (void *)a);
372     } else if (c=='b' || c=='t' || c=='T') {
373       int t = tea;
374 
375       if (c != 'b') tea = 0;
376       if (c=='b' || t^(c=='T')) {
377         if (!logrus->arg1) break;
378         str = logrus->arg1+(char *)logrus;
379         for (logrus = (void *)TT.pattern; logrus; logrus = logrus->next)
380           if (logrus->c == ':' && !strcmp(logrus->arg1+(char *)logrus, str))
381             break;
382         if (!logrus) error_exit("no :%s", str);
383       }
384     } else if (c=='c') {
385       str = logrus->arg1+(char *)logrus;
386       if (!logrus->hit) emit(str, strlen(str), 1);
387       free(line);
388       line = 0;
389       continue;
390     } else if (c=='d') {
391       free(line);
392       line = 0;
393       continue;
394     } else if (c=='D') {
395       // Delete up to \n or end of buffer
396       str = line;
397       while ((str-line)<len) if (*(str++) == '\n') break;
398       len -= str - line;
399       memmove(line, str, len);
400 
401       // if "delete" blanks line, disable further processing
402       // otherwise trim and restart script
403       if (!len) {
404         free(line);
405         line = 0;
406       } else {
407         line[len] = 0;
408         logrus = (void *)TT.pattern;
409       }
410       continue;
411     } else if (c=='g') {
412       free(line);
413       line = xstrdup(TT.remember);
414       len = TT.rememberlen;
415     } else if (c=='G') {
416       line = xrealloc(line, len+TT.rememberlen+2);
417       line[len++] = '\n';
418       memcpy(line+len, TT.remember, TT.rememberlen);
419       line[len += TT.rememberlen] = 0;
420     } else if (c=='h') {
421       free(TT.remember);
422       TT.remember = xstrdup(line);
423       TT.rememberlen = len;
424     } else if (c=='H') {
425       TT.remember = xrealloc(TT.remember, TT.rememberlen+len+2);
426       TT.remember[TT.rememberlen++] = '\n';
427       memcpy(TT.remember+TT.rememberlen, line, len);
428       TT.remember[TT.rememberlen += len] = 0;
429     } else if (c=='i') {
430       str = logrus->arg1+(char *)logrus;
431       emit(str, strlen(str), 1);
432     } else if (c=='l') {
433       int i, x, off;
434 
435       if (!TT.xx) {
436         terminal_size(&TT.xx, 0);
437         if (!TT.xx) TT.xx = 80;
438         if (TT.xx > sizeof(toybuf)-10) TT.xx = sizeof(toybuf)-10;
439         if (TT.xx > 4) TT.xx -= 4;
440       }
441 
442       for (i = off = 0; i<len; i++) {
443         if (off >= TT.xx) {
444           toybuf[off++] = '\\';
445           emit(toybuf, off, 1);
446           off = 0;
447         }
448         x = stridx("\\\a\b\f\r\t\v", line[i]);
449         if (x != -1) {
450           toybuf[off++] = '\\';
451           toybuf[off++] = "\\abfrtv"[x];
452         } else if (line[i] >= ' ') toybuf[off++] = line[i];
453         else off += sprintf(toybuf+off, "\\%03o", line[i]);
454       }
455       toybuf[off++] = '$';
456       emit(toybuf, off, 1);
457     } else if (c=='n') {
458       TT.restart = logrus->next+1;
459 
460       break;
461     } else if (c=='N') {
462       // Can't just grab next line because we could have multiple N and
463       // we need to actually read ahead to get N;$p EOF detection right.
464       if (pline) {
465         TT.restart = logrus->next+1;
466         extend_string(&line, TT.nextline, len, -TT.nextlen);
467         free(TT.nextline);
468         TT.nextline = line;
469         TT.nextlen += len + 1;
470         line = 0;
471       }
472 
473       // Pending append goes out right after N
474       goto done;
475     } else if (c=='p' || c=='P') {
476       char *l = (c=='P') ? strchr(line, '\n') : 0;
477 
478       if (emit(line, l ? l-line : len, eol)) break;
479     } else if (c=='q') {
480       if (pline) *pline = (void *)1;
481       free(TT.nextline);
482       TT.nextline = 0;
483       TT.nextlen = 0;
484 
485       break;
486     } else if (c=='s') {
487       char *rline = line, *new = logrus->arg2 + (char *)logrus, *swap, *rswap;
488       regmatch_t *match = (void *)toybuf;
489       regex_t *reg = get_regex(logrus, logrus->arg1);
490       int mflags = 0, count = 0, zmatch = 1, rlen = len, mlen, off, newlen;
491 
492       // Find match in remaining line (up to remaining len)
493       while (!ghostwheel(reg, rline, rlen, 10, match, mflags)) {
494         mflags = REG_NOTBOL;
495 
496         // Zero length matches don't count immediately after a previous match
497         mlen = match[0].rm_eo-match[0].rm_so;
498         if (!mlen && !zmatch) {
499           if (!rlen--) break;
500           rline++;
501           zmatch++;
502           continue;
503         } else zmatch = 0;
504 
505         // If we're replacing only a specific match, skip if this isn't it
506         off = logrus->sflags>>3;
507         if (off && off != ++count) {
508           rline += match[0].rm_eo;
509           rlen -= match[0].rm_eo;
510 
511           continue;
512         }
513         // The fact getline() can allocate unbounded amounts of memory is
514         // a bigger issue, but while we're here check for integer overflow
515         if (match[0].rm_eo > INT_MAX) perror_exit(0);
516 
517         // newlen = strlen(new) but with \1 and & and printf escapes
518         for (off = newlen = 0; new[off]; off++) {
519           int cc = -1;
520 
521           if (new[off] == '&') cc = 0;
522           else if (new[off] == '\\') cc = new[++off] - '0';
523           if (cc < 0 || cc > 9) {
524             newlen++;
525             continue;
526           }
527           newlen += match[cc].rm_eo-match[cc].rm_so;
528         }
529 
530         // Allocate new size, copy start/end around match. (Can't extend in
531         // place because backrefs may refer to text after it's overwritten.)
532         len += newlen-mlen;
533         swap = xmalloc(len+1);
534         rswap = swap+(rline-line)+match[0].rm_so;
535         memcpy(swap, line, (rline-line)+match[0].rm_so);
536         memcpy(rswap+newlen, rline+match[0].rm_eo, (rlen -= match[0].rm_eo)+1);
537 
538         // copy in new replacement text
539         for (off = mlen = 0; new[off]; off++) {
540           int cc = 0, ll;
541 
542           if (new[off] == '\\') {
543             cc = new[++off] - '0';
544             if (cc<0 || cc>9) {
545               if (!(rswap[mlen++] = unescape(new[off])))
546                 rswap[mlen-1] = new[off];
547 
548               continue;
549             } else if (match[cc].rm_so == -1) error_exit("no s//\\%d/", cc);
550           } else if (new[off] != '&') {
551             rswap[mlen++] = new[off];
552 
553             continue;
554           }
555 
556           ll = match[cc].rm_eo-match[cc].rm_so;
557           memcpy(rswap+mlen, rline+match[cc].rm_so, ll);
558           mlen += ll;
559         }
560 
561         rline = rswap+newlen;
562         free(line);
563         line = swap;
564 
565         // Stop after first substitution unless we have flag g
566         if (!(logrus->sflags & 2)) break;
567       }
568 
569       if (mflags) {
570         // flag p
571         if (logrus->sflags & 4) emit(line, len, eol);
572 
573         tea = 1;
574         if (logrus->w) goto writenow;
575       }
576     } else if (c=='w') {
577       int fd, noeol;
578       char *name;
579 
580 writenow:
581       // Swap out emit() context
582       fd = TT.fdout;
583       noeol = TT.noeol;
584 
585       // We save filehandle and newline status before filename
586       name = logrus->w + (char *)logrus;
587       memcpy(&TT.fdout, name, 4);
588       name += 4;
589       TT.noeol = *(name++);
590 
591       // write, then save/restore context
592       if (emit(line, len, eol))
593         perror_exit("w '%s'", logrus->arg1+(char *)logrus);
594       *(--name) = TT.noeol;
595       TT.noeol = noeol;
596       TT.fdout = fd;
597     } else if (c=='x') {
598       long swap = TT.rememberlen;
599 
600       str = TT.remember;
601       TT.remember = line;
602       line = str;
603       TT.rememberlen = len;
604       len = swap;
605     } else if (c=='y') {
606       char *from, *to = (char *)logrus;
607       int i, j;
608 
609       from = to+logrus->arg1;
610       to += logrus->arg2;
611 
612       for (i = 0; i < len; i++) {
613         j = stridx(from, line[i]);
614         if (j != -1) line[i] = to[j];
615       }
616     } else if (c=='=') {
617       sprintf(toybuf, "%ld", TT.count);
618       emit(toybuf, strlen(toybuf), 1);
619     }
620 
621     logrus = logrus->next;
622   }
623 
624   if (line && !(toys.optflags & FLAG_n)) emit(line, len, eol);
625 
626 done:
627   free(line);
628 
629   if (dlist_terminate(append)) while (append) {
630     struct append *a = append->next;
631 
632     if (append->file) {
633       int fd = open(append->str, O_RDONLY);
634 
635       // Force newline if noeol pending
636       if (fd != -1) {
637         if (TT.noeol) xwrite(TT.fdout, "\n", 1);
638         TT.noeol = 0;
639         xsendfile(fd, TT.fdout);
640         close(fd);
641       }
642     } else emit(append->str, strlen(append->str), 1);
643     free(append);
644     append = a;
645   }
646 }
647 
648 // Genericish function, can probably get moved to lib.c
649 
650 // Iterate over lines in file, calling function. Function can write 0 to
651 // the line pointer if they want to keep it, or 1 to terminate processing,
652 // otherwise line is freed. Passed file descriptor is closed at the end.
// fd: file to read (0 means stdin, which is left open).
// name: not used here.
// call: invoked with the address of each line buffer and the line's length
//   in bytes as returned by getline().
static void do_lines(int fd, char *name, void (*call)(char **pline, long len))
{
  FILE *fp = fd ? xfdopen(fd, "r") : stdin;

  for (;;) {
    char *line = 0;
    size_t size = 0;
    ssize_t len = getline(&line, &size, fp);

    if (len <= 0) {
      // getline() can allocate a buffer even when it fails or hits EOF,
      // so release it before stopping.
      free(line);
      break;
    }

    call(&line, len);
    // (void *)1 is the callback's request to terminate, not a real buffer
    if (line == (void *)1) break;
    free(line);
  }

  if (fd) fclose(fp);
}
671 
672 // Callback called on each input file
do_sed(int fd,char * name)673 static void do_sed(int fd, char *name)
674 {
675   int i = toys.optflags & FLAG_i;
676   char *tmp;
677 
678   if (i) {
679     struct step *primal;
680 
681     if (!fd && !strcmp(name, "-")) {
682       error_msg("-i on stdin");
683       return;
684     }
685     TT.fdout = copy_tempfile(fd, name, &tmp);
686     TT.count = 0;
687     for (primal = (void *)TT.pattern; primal; primal = primal->next)
688       primal->hit = 0;
689   }
690   do_lines(fd, name, walk_pattern);
691   if (i) {
692     walk_pattern(0, 0);
693     replace_tempfile(-1, TT.fdout, &tmp);
694     TT.fdout = 1;
695     TT.nextline = 0;
696     TT.nextlen = TT.noeol = 0;
697   }
698 }
699 
700 // Copy chunk of string between two delimiters, converting printf escapes.
701 // returns processed copy of string (0 if error), *pstr advances to next
702 // unused char. if delim (or *delim) is 0 uses/saves starting char as delimiter
// if regex, ignore delimiter in [ranges]
// pstr: address of the parse position; on success it is advanced past the
//   closing delimiter, on failure it is left alone.
// delim: delimiter to use, or NULL / pointer to 0 to take it from the
//   string (optionally preceded by a backslash) and store it in *delim.
// Returns a malloced copy the caller must free, or 0 on error (missing or
// invalid delimiter, or string ends before the closing delimiter).
static char *unescape_delimited_string(char **pstr, char *delim)
{
  char *to, *from, *out, mode = 0, d;

  from = *pstr;
  if (!delim || !*delim) {
    if (!(d = *(from++))) return 0;
    if (d == '\\') d = *(from++);
    if (!d || d == '\\') return 0;
    if (delim) *delim = d;
  } else d = *delim;
  // The copy is never longer than the input
  to = out = xmalloc(strlen(*pstr)+1);

  while (mode || *from != d) {
    // Ran out of input before the closing delimiter
    if (!*from) goto fail;

    // delimiter in regex character range doesn't count
    if (!mode && *from == '[') {
      mode = '[';
      // A leading - or ] is copied as is instead of being interpreted
      if (from[1]=='-' || from[1]==']') *(to++) = *(from++);
    } else if (mode && *from == ']') mode = 0;
    // Length 1 range (X-X with same X) is "undefined" and makes regcomp err,
    // but the perl build does it, so we need to filter it out.
    else if (mode && *from == '-' && from[-1] == from[1]) {
      from+=2;
      continue;
    } else if (*from == '\\') {
      if (!from[1]) goto fail;

      // Check escaped end delimiter before printf style escapes.
      if (from[1] == d) from++;
      else if (from[1]=='\\') *(to++) = *(from++);
      else {
        char c = unescape(from[1]);

        if (c) {
          *(to++) = c;
          from+=2;
          continue;
        } else if (!mode) *(to++) = *(from++);
      }
    }
    *(to++) = *(from++);
  }
  *to = 0;
  *pstr = from+1;

  return out;

fail:
  // Don't leak the partial copy on a parse error
  free(out);

  return 0;
}
753 
754 // Translate primal pattern into walkable form.
jewel_of_judgement(char ** pline,long len)755 static void jewel_of_judgement(char **pline, long len)
756 {
757   struct step *corwin = (void *)TT.pattern;
758   char *line, *reg, c, *errstart;
759   int i;
760 
761   line = errstart = pline ? *pline : "";
762   if (len && line[len-1]=='\n') line[--len] = 0;
763 
764   // Append additional line to pattern argument string?
765   // We temporarily repurpose "hit" to indicate line continuations
766   if (corwin && corwin->prev->hit) {
767     if (!*pline) error_exit("unfinished %c", corwin->prev->c);;
768     // Remove half-finished entry from list so remalloc() doesn't confuse it
769     TT.pattern = TT.pattern->prev;
770     corwin = dlist_pop(&TT.pattern);
771     c = corwin->c;
772     reg = (char *)corwin;
773     reg += corwin->arg1 + strlen(reg + corwin->arg1);
774 
775     // Resume parsing for 'a' or 's' command
776     if (corwin->hit < 256) goto resume_s;
777     else goto resume_a;
778   }
779 
780   // Loop through commands in line
781 
782   corwin = 0;
783   for (;;) {
784     if (corwin) dlist_add_nomalloc(&TT.pattern, (void *)corwin);
785 
786     for (;;) {
787       while (isspace(*line) || *line == ';') line++;
788       if (*line == '#') while (*line && *line != '\n') line++;
789       else break;
790     }
791     if (!*line) return;
792 
793     errstart = line;
794     memset(toybuf, 0, sizeof(struct step));
795     corwin = (void *)toybuf;
796     reg = toybuf + sizeof(struct step);
797 
798     // Parse address range (if any)
799     for (i = 0; i < 2; i++) {
800       if (*line == ',') line++;
801       else if (i) break;
802 
803       if (isdigit(*line)) corwin->lmatch[i] = strtol(line, &line, 0);
804       else if (*line == '$') {
805         corwin->lmatch[i] = -1;
806         line++;
807       } else if (*line == '/' || *line == '\\') {
808         char *s = line;
809 
810         if (!(s = unescape_delimited_string(&line, 0))) goto brand;
811         if (!*s) corwin->rmatch[i] = 0;
812         else {
813           xregcomp((void *)reg, s, (toys.optflags & FLAG_r)*REG_EXTENDED);
814           corwin->rmatch[i] = reg-toybuf;
815           reg += sizeof(regex_t);
816         }
817         free(s);
818       } else break;
819     }
820 
821     while (isspace(*line)) line++;
822     if (!*line) break;
823 
824     while (*line == '!') {
825       corwin->not = 1;
826       line++;
827     }
828     while (isspace(*line)) line++;
829 
830     c = corwin->c = *(line++);
831     if (strchr("}:", c) && i) break;
832     if (strchr("aiqr=", c) && i>1) break;
833 
834     // Add step to pattern
835     corwin = xmemdup(toybuf, reg-toybuf);
836     reg = (reg-toybuf) + (char *)corwin;
837 
838     // Parse arguments by command type
839     if (c == '{') TT.nextlen++;
840     else if (c == '}') {
841       if (!TT.nextlen--) break;
842     } else if (c == 's') {
843       char *fiona, delim = 0;
844 
845       // s/pattern/replacement/flags
846 
847       // line continuations use arg1, so we fill out arg2 first (since the
848       // regex part can't be multiple lines) and swap them back later.
849 
850       // get pattern (just record, we parse it later)
851       corwin->arg2 = reg - (char *)corwin;
852       if (!(TT.remember = unescape_delimited_string(&line, &delim)))
853         goto brand;
854 
855       reg += sizeof(regex_t);
856       corwin->arg1 = reg-(char *)corwin;
857       corwin->hit = delim;
858 resume_s:
859       // get replacement - don't replace escapes because \1 and \& need
860       // processing later, after we replace \\ with \ we can't tell \\1 from \1
861       fiona = line;
862       while (*fiona != corwin->hit) {
863         if (!*fiona) goto brand;
864         if (*fiona++ == '\\') {
865           if (!*fiona || *fiona == '\n') {
866             fiona[-1] = '\n';
867             break;
868           }
869           fiona++;
870         }
871       }
872 
873       reg = extend_string((void *)&corwin, line, reg-(char *)corwin,fiona-line);
874       line = fiona;
875       // line continuation? (note: '\n' can't be a valid delim).
876       if (*line == corwin->hit) corwin->hit = 0;
877       else {
878         if (!*line) continue;
879         reg--;
880         line++;
881         goto resume_s;
882       }
883 
884       // swap arg1/arg2 so they're back in order arguments occur.
885       i = corwin->arg1;
886       corwin->arg1 = corwin->arg2;
887       corwin->arg2 = i;
888 
889       // get flags
890       for (line++; *line; line++) {
891         long l;
892 
893         if (isspace(*line) && *line != '\n') continue;
894 
895         if (0 <= (l = stridx("igp", *line))) corwin->sflags |= 1<<l;
896         else if (!(corwin->sflags>>3) && 0<(l = strtol(line, &line, 10))) {
897           corwin->sflags |= l << 3;
898           line--;
899         } else break;
900       }
901 
902       // We deferred actually parsing the regex until we had the s///i flag
903       // allocating the space was done by extend_string() above
904       if (!*TT.remember) corwin->arg1 = 0;
905       else xregcomp((void *)(corwin->arg1 + (char *)corwin), TT.remember,
906         ((toys.optflags & FLAG_r)*REG_EXTENDED)|((corwin->sflags&1)*REG_ICASE));
907       free(TT.remember);
908       TT.remember = 0;
909       if (*line == 'w') {
910         line++;
911         goto writenow;
912       }
913     } else if (c == 'w') {
914       int fd, delim;
915       char *cc;
916 
917       // Since s/// uses arg1 and arg2, and w needs a persistent filehandle and
918       // eol status, and to retain the filename for error messages, we'd need
919       // to go up to arg5 just for this. Compromise: dynamically allocate the
920       // filehandle and eol status.
921 
922 writenow:
923       while (isspace(*line)) line++;
924       if (!*line) goto brand;
925       for (cc = line; *cc; cc++) if (*cc == '\\' && cc[1] == ';') break;
926       delim = *cc;
927       *cc = 0;
928       fd = xcreate(line, O_WRONLY|O_CREAT|O_TRUNC, 0644);
929       *cc = delim;
930 
931       corwin->w = reg - (char *)corwin;
932       corwin = xrealloc(corwin, corwin->w+(cc-line)+6);
933       reg = corwin->w + (char *)corwin;
934 
935       memcpy(reg, &fd, 4);
936       reg += 4;
937       *(reg++) = 0;
938       memcpy(reg, line, delim);
939       reg += delim;
940       *(reg++) = 0;
941 
942       line = cc;
943       if (delim) line += 2;
944     } else if (c == 'y') {
945       char *s, delim = 0;
946       int len;
947 
948       if (!(s = unescape_delimited_string(&line, &delim))) goto brand;
949       corwin->arg1 = reg-(char *)corwin;
950       len = strlen(s);
951       reg = extend_string((void *)&corwin, s, reg-(char *)corwin, len);
952       free(s);
953       corwin->arg2 = reg-(char *)corwin;
954       if (!(s = unescape_delimited_string(&line, &delim))) goto brand;
955       if (len != strlen(s)) goto brand;
956       reg = extend_string((void *)&corwin, s, reg-(char*)corwin, len);
957       free(s);
958     } else if (strchr("abcirtTw:", c)) {
959       int end;
960 
961       while (isspace(*line) && *line != '\n') line++;
962 
963       // Resume logic differs from 's' case because we don't add a newline
964       // unless it's after something, so we add it on return instead.
965 resume_a:
966       corwin->hit = 0;
967 
968       // Trim whitespace from "b ;" and ": blah " but only first space in "w x "
969       if (!(end = strcspn(line, strchr("btT:", c) ? "; \t\r\n\v\f" : "\n"))) {
970         if (strchr("btT", c)) continue;
971         else if (!corwin->arg1) break;
972       }
973 
974       // Extend allocation to include new string. We use offsets instead of
975       // pointers so realloc() moving stuff doesn't break things. Ok to write
976       // \n over NUL terminator because call to extend_string() adds it back.
977       if (!corwin->arg1) corwin->arg1 = reg - (char*)corwin;
978       else if (*(corwin->arg1+(char *)corwin)) *(reg++) = '\n';
979       reg = extend_string((void *)&corwin, line, reg - (char *)corwin, end);
980 
981       // Recopy data to remove escape sequences and handle line continuation.
982       if (strchr("aci", c)) {
983         reg -= end+1;
984         for (i = end; i; i--) {
985           if ((*reg++ = *line++)=='\\') {
986 
987             // escape at end of line: resume if -e escaped literal newline,
988             // else request callback and resume with next line
989             if (!--i) {
990               *--reg = 0;
991               if (*line) {
992                 line++;
993                 goto resume_a;
994               }
995               corwin->hit = 256;
996               break;
997             }
998             if (!(reg[-1] = unescape(*line))) reg[-1] = *line;
999             line++;
1000           }
1001         }
1002         *reg = 0;
1003       } else line += end;
1004 
1005     // Commands that take no arguments
1006     } else if (!strchr("{dDgGhHlnNpPqx=", c)) break;
1007   }
1008 
1009 brand:
1010   // Reminisce about chestnut trees.
1011   error_exit("bad pattern '%s'@%ld (%c)", errstart, line-errstart+1L, *line);
1012 }
1013 
sed_main(void)1014 void sed_main(void)
1015 {
1016   struct arg_list *dworkin;
1017   char **args = toys.optargs;
1018 
1019   // Lie to autoconf when it asks stupid questions, so configure regexes
1020   // that look for "GNU sed version %f" greater than some old buggy number
1021   // don't fail us for not matching their narrow expectations.
1022   if (toys.optflags & FLAG_version) {
1023     xprintf("This is not GNU sed version 9.0\n");
1024     return;
1025   }
1026 
1027   // Need a pattern. If no unicorns about, fight serpent and take its eye.
1028   if (!TT.e && !TT.f) {
1029     if (!*toys.optargs) error_exit("no pattern");
1030     (TT.e = xzalloc(sizeof(struct arg_list)))->arg = *(args++);
1031   }
1032 
1033   // Option parsing infrastructure can't interlace "-e blah -f blah -e blah"
1034   // so handle all -e, then all -f. (At least the behavior's consistent.)
1035 
1036   for (dworkin = TT.e; dworkin; dworkin = dworkin->next)
1037     jewel_of_judgement(&dworkin->arg, strlen(dworkin->arg));
1038   for (dworkin = TT.f; dworkin; dworkin = dworkin->next)
1039     do_lines(xopen(dworkin->arg, O_RDONLY), dworkin->arg, jewel_of_judgement);
1040   jewel_of_judgement(0, 0);
1041   dlist_terminate(TT.pattern);
1042   if (TT.nextlen) error_exit("no }");
1043 
1044   TT.fdout = 1;
1045   TT.remember = xstrdup("");
1046 
1047   // Inflict pattern upon input files
1048   loopfiles_rw(args, O_RDONLY, 0, 0, do_sed);
1049 
1050   if (!(toys.optflags & FLAG_i)) walk_pattern(0, 0);
1051 
1052   // todo: need to close fd when done for TOYBOX_FREE?
1053 }
1054