1 /* sed.c - stream editor. Thing that does s/// and other stuff.
2  *
3  * Copyright 2014 Rob Landley <rob@landley.net>
4  *
5  * See http://pubs.opengroup.org/onlinepubs/9699919799/utilities/sed.html
6  *
7  * TODO: lines > 2G could wrap signed int length counters. Not just getline()
8  * but N and s///
9  * TODO: make y// handle unicode, unicode delimiters
10  * TODO: handle error return from emit(), error_msg/exit consistently
11  *       What's the right thing to do for -i when write fails? Skip to next?
12  * test '//q' with no previous regex, also repeat previous regex?
13 
14 USE_SED(NEWTOY(sed, "(help)(version)e*f*i:;nErz(null-data)[+Er]", TOYFLAG_BIN|TOYFLAG_LOCALE|TOYFLAG_NOHELP))
15 
16 config SED
17   bool "sed"
18   default y
19   help
20     usage: sed [-inrzE] [-e SCRIPT]...|SCRIPT [-f SCRIPT_FILE]... [FILE...]
21 
22     Stream editor. Apply one or more editing SCRIPTs to each line of input
23     (from FILE or stdin) producing output (by default to stdout).
24 
25     -e	Add SCRIPT to list
26     -f	Add contents of SCRIPT_FILE to list
27     -i	Edit each file in place (-iEXT keeps backup file with extension EXT)
28     -n	No default output (use the p command to output matched lines)
29     -r	Use extended regular expression syntax
30     -E	POSIX alias for -r
31     -s	Treat input files separately (implied by -i)
32     -z	Use \0 rather than \n as the input line separator
33 
34     A SCRIPT is a series of one or more COMMANDs separated by newlines or
35     semicolons. All -e SCRIPTs are concatenated together as if separated
36     by newlines, followed by all lines from -f SCRIPT_FILEs, in order.
37     If no -e or -f SCRIPTs are specified, the first argument is the SCRIPT.
38 
39     Each COMMAND may be preceded by an address which limits the command to
40     apply only to the specified line(s). Commands without an address apply to
41     every line. Addresses are of the form:
42 
43       [ADDRESS[,ADDRESS]]COMMAND
44 
45     The ADDRESS may be a decimal line number (starting at 1), a /regular
46     expression/ within a pair of forward slashes, or the character "$" which
47     matches the last line of input. (In -s or -i mode this matches the last
48     line of each file, otherwise just the last line of the last file.) A single
49     address matches one line, a pair of comma separated addresses match
50     everything from the first address to the second address (inclusive). If
51     both addresses are regular expressions, more than one range of lines in
52     each file can match.
53 
54     REGULAR EXPRESSIONS in sed are started and ended by the same character
55     (traditionally / but anything except a backslash or a newline works).
56     Backslashes may be used to escape the delimiter if it occurs in the
57     regex, and for the usual printf escapes (\abcefnrtv and octal, hex,
58     and unicode). An empty regex repeats the previous one. ADDRESS regexes
59     (above) require the first delimiter to be escaped with a backslash when
60     it isn't a forward slash (to distinguish it from the COMMANDs below).
61 
62     Sed mostly operates on individual lines one at a time. It reads each line,
63     processes it, and either writes it to the output or discards it before
64     reading the next line. Sed can remember one additional line in a separate
65     buffer (using the h, H, g, G, and x commands), and can read the next line
66     of input early (using the n and N command), but other than that command
67     scripts operate on individual lines of text.
68 
69     Each COMMAND starts with a single character. The following commands take
70     no arguments:
71 
72       {  Start a new command block, continuing until a corresponding "}".
73          Command blocks may nest. If the block has an address, commands within
74          the block are only run for lines within the block's address range.
75 
76       }  End command block (this command cannot have an address)
77 
78       d  Delete this line and move on to the next one
79          (ignores remaining COMMANDs)
80 
81       D  Delete one line of input and restart command SCRIPT (same as "d"
82          unless you've glued lines together with "N" or similar)
83 
84       g  Get remembered line (overwriting current line)
85 
86       G  Get remembered line (appending to current line)
87 
88       h  Remember this line (overwriting remembered line)
89 
90       H  Remember this line (appending to remembered line, if any)
91 
92       l  Print line, escaping \abfrtv (but not newline), octal escaping other
93          nonprintable characters, wrapping lines to terminal width with a
94          backslash, and appending $ to actual end of line.
95 
96       n  Print default output and read next line, replacing current line
97          (If no next line available, quit processing script)
98 
99       N  Append next line of input to this line, separated by a newline
100          (This advances the line counter for address matching and "=", if no
101          next line available quit processing script without default output)
102 
103       p  Print this line
104 
105       P  Print this line up to first newline (from "N")
106 
107       q  Quit (print default output, no more commands processed or lines read)
108 
109       x  Exchange this line with remembered line (overwrite in both directions)
110 
111       =  Print the current line number (followed by a newline)
112 
113     The following commands (may) take an argument. The "text" arguments (to
114     the "a", "b", and "c" commands) may end with an unescaped "\" to append
115     the next line (for which leading whitespace is not skipped), and also
116     treat ";" as a literal character (use "\;" instead).
117 
118       a [text]   Append text to output before attempting to read next line
119 
120       b [label]  Branch, jumps to :label (or with no label, to end of SCRIPT)
121 
122       c [text]   Delete line, output text at end of matching address range
123                  (ignores remaining COMMANDs)
124 
125       i [text]   Print text
126 
127       r [file]   Append contents of file to output before attempting to read
128                  next line.
129 
130       s/S/R/F    Search for regex S, replace matched text with R using flags F.
131                  The first character after the "s" (anything but newline or
132                  backslash) is the delimiter, escape with \ to use normally.
133 
134                  The replacement text may contain "&" to substitute the matched
135                  text (escape it with backslash for a literal &), or \1 through
136                  \9 to substitute a parenthetical subexpression in the regex.
137                  You can also use the normal backslash escapes such as \n and
138                  a backslash at the end of the line appends the next line.
139 
140                  The flags are:
141 
142                  [0-9]    A number, substitute only that occurrence of pattern
143                  g        Global, substitute all occurrences of pattern
144                  i        Ignore case when matching
145                  p        Print the line if match was found and replaced
146                  w [file] Write (append) line to file if match replaced
147 
148       t [label]  Test, jump to :label only if an "s" command found a match in
149                  this line since last test (replacing with same text counts)
150 
151       T [label]  Test false, jump only if "s" hasn't found a match.
152 
153       w [file]   Write (append) line to file
154 
155       y/old/new/ Change each character in 'old' to corresponding character
156                  in 'new' (with standard backslash escapes, delimiter can be
157                  any repeated character except \ or \n)
158 
159       : [label]  Labeled target for jump commands
160 
161       #  Comment, ignore rest of this line of SCRIPT
162 
163     Deviations from POSIX: allow extended regular expressions with -r,
164     editing in place with -i, separate with -s, NUL-separated input with -z,
165     printf escapes in text, line continuations, semicolons after all commands,
166     2-address anywhere an address is allowed, "T" command, multiline
167     continuations for [abc], \; to end [abc] argument before end of line.
168 */
169 
170 #define FOR_sed
171 #include "toys.h"
172 
GLOBALS(
  char *i;                // -i argument; when non-empty it is appended to the
                          // file name to form the backup name
  struct arg_list *f, *e; // arguments collected from -f and -e

  // processed pattern list (nodes are used as struct sedcmd)
  struct double_list *pattern;

  // nextline/nextlen: line read ahead and held until the following call to
  // sed_line(). During parsing nextlen doubles as the { } nesting depth.
  // remember/rememberlen: the "remembered line" used by g, G, h, H and x.
  // During parsing remember temporarily holds the s/// regex text.
  char *nextline, *remember;
  // restart: command to resume at after n/N, stored +1 so it is never NULL.
  // lastregex: regex most recently fetched through get_regex(), reused when
  // an empty regex is given.
  void *restart, *lastregex;
  // count: number of the line currently being processed (printed by "=").
  long nextlen, rememberlen, count;
  // fdout: descriptor emit() writes to. noeol: set when the last emit() did
  // not end with a newline, so the next emit() writes one first.
  int fdout, noeol;
  unsigned xx;            // wrap width for the "l" command (0 = not set yet)
  char delim;             // input line delimiter handed to do_lines()
)
187 
// Linked list of parsed sed commands. Offset fields indicate location where
// regex or string starts, ala offset+(char *)struct, because we realloc()
// these to expand them for multiline inputs, and pointers would have to be
// individually adjusted. An offset of 0 means "not present" (for a regex
// offset, get_regex() then repeats the previous regex).

struct sedcmd {
  struct sedcmd *next, *prev;

  // Begin and end of each match
  long lmatch[2]; // line number of match (-1 is stored for "$")
  int rmatch[2];  // offset of regex struct for prefix matches (/abc/,/def/p)
  int arg1, arg2, w; // offset of two arguments per command, plus s//w filename
                     // (w points at: 4 byte fd, 1 byte noeol state, filename)
  // not: set when the command was prefixed with "!" (inverts address match).
  // hit: in sed_line() nonzero while this command's address range is active;
  // during parsing it instead holds line continuation state.
  unsigned not, hit;
  unsigned sflags; // s///flag bits: i=1, g=2, p=4, occurrence number in >>3
  char c; // action
};
204 
205 // Write out line with potential embedded NUL, handling eol/noeol
emit(char * line,long len,int eol)206 static int emit(char *line, long len, int eol)
207 {
208   int l, old = line[len];
209 
210   if (TT.noeol && !writeall(TT.fdout, "\n", 1)) return 1;
211   TT.noeol = !eol;
212   if (eol) line[len++] = '\n';
213   if (!len) return 0;
214   l = writeall(TT.fdout, line, len);
215   if (eol) line[len-1] = old;
216   if (l != len) {
217     perror_msg("short write");
218 
219     return 1;
220   }
221 
222   return 0;
223 }
224 
// Grow the allocation at *old, which currently holds oldlen bytes, and append
// newlen bytes copied from new, followed by a NUL terminator. A negative
// newlen means "append -newlen bytes, but put a newline in front of them".
// *old is updated to the (possibly moved) allocation. Returns a pointer to
// the first byte after the NUL terminator that was just written.

static char *extend_string(char **old, char *new, int oldlen, int newlen)
{
  char *buf, *end;
  int separator = 0;

  // Negative length requests a '\n' between the old and new data
  if (newlen < 0) {
    separator = 1;
    newlen = -newlen;
  }

  buf = xrealloc(*old, oldlen+newlen+separator+1);
  *old = buf;

  end = buf+oldlen;
  if (separator) *(end++) = '\n';
  memcpy(end, new, newlen);
  end += newlen;
  *(end++) = 0;

  return end;
}
240 
241 // An empty regex repeats the previous one
get_regex(void * trump,int offset)242 static void *get_regex(void *trump, int offset)
243 {
244   if (!offset) {
245     if (!TT.lastregex) error_exit("no previous regex");
246     return TT.lastregex;
247   }
248 
249   return TT.lastregex = offset+(char *)trump;
250 }
251 
252 // Apply pattern to line from input file
sed_line(char ** pline,long plen)253 static void sed_line(char **pline, long plen)
254 {
255   struct append {
256     struct append *next, *prev;
257     int file;
258     char *str;
259   } *append = 0;
260   char *line = TT.nextline;
261   long len = TT.nextlen;
262   struct sedcmd *command;
263   int eol = 0, tea = 0;
264 
265   // Ignore EOF for all files before last unless -i
266   if (!pline && !FLAG(i)) return;
267 
268   // Grab next line for deferred processing (EOF detection: we get a NULL
269   // pline at EOF to flush last line). Note that only end of _last_ input
270   // file matches $ (unless we're doing -i).
271   TT.nextline = 0;
272   TT.nextlen = 0;
273   if (pline) {
274     TT.nextline = *pline;
275     TT.nextlen = plen;
276     *pline = 0;
277   }
278 
279   if (!line || !len) return;
280   if (line[len-1] == '\n') line[--len] = eol++;
281   TT.count++;
282 
283   // The restart-1 is because we added one to make sure it wasn't NULL,
284   // otherwise N as last command would restart script
285   command = TT.restart ? ((struct sedcmd *)TT.restart)-1 : (void *)TT.pattern;
286   TT.restart = 0;
287 
288   while (command) {
289     char *str, c = command->c;
290 
291     // Have we got a line or regex matching range for this rule?
292     if (*command->lmatch || *command->rmatch) {
293       int miss = 0;
294       long lm;
295 
296       // In a match that might end?
297       if (command->hit) {
298         if (!(lm = command->lmatch[1])) {
299           if (!command->rmatch[1]) command->hit = 0;
300           else {
301             void *rm = get_regex(command, command->rmatch[1]);
302 
303             // regex match end includes matching line, so defer deactivation
304             if (line && !regexec0(rm, line, len, 0, 0, 0)) miss = 1;
305           }
306         } else if (lm > 0 && lm < TT.count) command->hit = 0;
307 
308       // Start a new match?
309       } else {
310         if (!(lm = *command->lmatch)) {
311           void *rm = get_regex(command, *command->rmatch);
312 
313           if (line && !regexec0(rm, line, len, 0, 0, 0)) command->hit++;
314         } else if (lm == TT.count || (lm == -1 && !pline)) command->hit++;
315 
316         if (!command->lmatch[1] && !command->rmatch[1]) miss = 1;
317       }
318 
319       // Didn't match?
320       lm = !(command->hit ^ command->not);
321 
322       // Deferred disable from regex end match
323       if (miss || command->lmatch[1] == TT.count) command->hit = 0;
324 
325       if (lm) {
326         // Handle skipping curly bracket command group
327         if (c == '{') {
328           int curly = 1;
329 
330           while (curly) {
331             command = command->next;
332             if (command->c == '{') curly++;
333             if (command->c == '}') curly--;
334           }
335         }
336         command = command->next;
337         continue;
338       }
339     }
340 
341     // A deleted line can still update line match state for later commands
342     if (!line) {
343       command = command->next;
344       continue;
345     }
346 
347     // Process command
348 
349     if (c=='a' || c=='r') {
350       struct append *a = xzalloc(sizeof(struct append));
351       if (command->arg1) a->str = command->arg1+(char *)command;
352       a->file = c=='r';
353       dlist_add_nomalloc((void *)&append, (void *)a);
354     } else if (c=='b' || c=='t' || c=='T') {
355       int t = tea;
356 
357       if (c != 'b') tea = 0;
358       if (c=='b' || t^(c=='T')) {
359         if (!command->arg1) break;
360         str = command->arg1+(char *)command;
361         for (command = (void *)TT.pattern; command; command = command->next)
362           if (command->c == ':' && !strcmp(command->arg1+(char *)command, str))
363             break;
364         if (!command) error_exit("no :%s", str);
365       }
366     } else if (c=='c') {
367       str = command->arg1+(char *)command;
368       if (!command->hit) emit(str, strlen(str), 1);
369       free(line);
370       line = 0;
371       continue;
372     } else if (c=='d') {
373       free(line);
374       line = 0;
375       continue;
376     } else if (c=='D') {
377       // Delete up to \n or end of buffer
378       str = line;
379       while ((str-line)<len) if (*(str++) == '\n') break;
380       len -= str - line;
381       memmove(line, str, len);
382 
383       // if "delete" blanks line, disable further processing
384       // otherwise trim and restart script
385       if (!len) {
386         free(line);
387         line = 0;
388       } else {
389         line[len] = 0;
390         command = (void *)TT.pattern;
391       }
392       continue;
393     } else if (c=='g') {
394       free(line);
395       line = xstrdup(TT.remember);
396       len = TT.rememberlen;
397     } else if (c=='G') {
398       line = xrealloc(line, len+TT.rememberlen+2);
399       line[len++] = '\n';
400       memcpy(line+len, TT.remember, TT.rememberlen);
401       line[len += TT.rememberlen] = 0;
402     } else if (c=='h') {
403       free(TT.remember);
404       TT.remember = xstrdup(line);
405       TT.rememberlen = len;
406     } else if (c=='H') {
407       TT.remember = xrealloc(TT.remember, TT.rememberlen+len+2);
408       TT.remember[TT.rememberlen++] = '\n';
409       memcpy(TT.remember+TT.rememberlen, line, len);
410       TT.remember[TT.rememberlen += len] = 0;
411     } else if (c=='i') {
412       str = command->arg1+(char *)command;
413       emit(str, strlen(str), 1);
414     } else if (c=='l') {
415       int i, x, off;
416 
417       if (!TT.xx) {
418         terminal_size(&TT.xx, 0);
419         if (!TT.xx) TT.xx = 80;
420         if (TT.xx > sizeof(toybuf)-10) TT.xx = sizeof(toybuf)-10;
421         if (TT.xx > 4) TT.xx -= 4;
422       }
423 
424       for (i = off = 0; i<len; i++) {
425         if (off >= TT.xx) {
426           toybuf[off++] = '\\';
427           emit(toybuf, off, 1);
428           off = 0;
429         }
430         x = stridx("\\\a\b\f\r\t\v", line[i]);
431         if (x != -1) {
432           toybuf[off++] = '\\';
433           toybuf[off++] = "\\abfrtv"[x];
434         } else if (line[i] >= ' ') toybuf[off++] = line[i];
435         else off += sprintf(toybuf+off, "\\%03o", line[i]);
436       }
437       toybuf[off++] = '$';
438       emit(toybuf, off, 1);
439     } else if (c=='n') {
440       TT.restart = command->next+1;
441 
442       break;
443     } else if (c=='N') {
444       // Can't just grab next line because we could have multiple N and
445       // we need to actually read ahead to get N;$p EOF detection right.
446       if (pline) {
447         TT.restart = command->next+1;
448         extend_string(&line, TT.nextline, len, -TT.nextlen);
449         free(TT.nextline);
450         TT.nextline = line;
451         TT.nextlen += len + 1;
452         line = 0;
453       }
454 
455       // Pending append goes out right after N
456       goto done;
457     } else if (c=='p' || c=='P') {
458       char *l = (c=='P') ? strchr(line, '\n') : 0;
459 
460       if (emit(line, l ? l-line : len, eol)) break;
461     } else if (c=='q') {
462       if (pline) *pline = (void *)1;
463       free(TT.nextline);
464       TT.nextline = 0;
465       TT.nextlen = 0;
466 
467       break;
468     } else if (c=='s') {
469       char *rline = line, *new = command->arg2 + (char *)command, *swap, *rswap;
470       regmatch_t *match = (void *)toybuf;
471       regex_t *reg = get_regex(command, command->arg1);
472       int mflags = 0, count = 0, zmatch = 1, rlen = len, mlen, off, newlen;
473 
474       // Find match in remaining line (up to remaining len)
475       while (!regexec0(reg, rline, rlen, 10, match, mflags)) {
476         mflags = REG_NOTBOL;
477 
478         // Zero length matches don't count immediately after a previous match
479         mlen = match[0].rm_eo-match[0].rm_so;
480         if (!mlen && !zmatch) {
481           if (!rlen--) break;
482           rline++;
483           zmatch++;
484           continue;
485         } else zmatch = 0;
486 
487         // If we're replacing only a specific match, skip if this isn't it
488         off = command->sflags>>3;
489         if (off && off != ++count) {
490           rline += match[0].rm_eo;
491           rlen -= match[0].rm_eo;
492 
493           continue;
494         }
495         // The fact getline() can allocate unbounded amounts of memory is
496         // a bigger issue, but while we're here check for integer overflow
497         if (match[0].rm_eo > INT_MAX) perror_exit(0);
498 
499         // newlen = strlen(new) but with \1 and & and printf escapes
500         for (off = newlen = 0; new[off]; off++) {
501           int cc = -1;
502 
503           if (new[off] == '&') cc = 0;
504           else if (new[off] == '\\') cc = new[++off] - '0';
505           if (cc < 0 || cc > 9) {
506             newlen++;
507             continue;
508           }
509           newlen += match[cc].rm_eo-match[cc].rm_so;
510         }
511 
512         // Allocate new size, copy start/end around match. (Can't extend in
513         // place because backrefs may refer to text after it's overwritten.)
514         len += newlen-mlen;
515         swap = xmalloc(len+1);
516         rswap = swap+(rline-line)+match[0].rm_so;
517         memcpy(swap, line, (rline-line)+match[0].rm_so);
518         memcpy(rswap+newlen, rline+match[0].rm_eo, (rlen -= match[0].rm_eo)+1);
519 
520         // copy in new replacement text
521         for (off = mlen = 0; new[off]; off++) {
522           int cc = 0, ll;
523 
524           if (new[off] == '\\') {
525             cc = new[++off] - '0';
526             if (cc<0 || cc>9) {
527               if (!(rswap[mlen++] = unescape(new[off])))
528                 rswap[mlen-1] = new[off];
529 
530               continue;
531             } else if (cc > reg->re_nsub) error_exit("no s//\\%d/", cc);
532           } else if (new[off] != '&') {
533             rswap[mlen++] = new[off];
534 
535             continue;
536           }
537 
538           if (match[cc].rm_so == -1) ll = 0; // Empty match.
539           else {
540             ll = match[cc].rm_eo-match[cc].rm_so;
541             memcpy(rswap+mlen, rline+match[cc].rm_so, ll);
542           }
543           mlen += ll;
544         }
545 
546         rline = rswap+newlen;
547         free(line);
548         line = swap;
549 
550         // Stop after first substitution unless we have flag g
551         if (!(command->sflags & 2)) break;
552       }
553 
554       if (mflags) {
555         // flag p
556         if (command->sflags & 4) emit(line, len, eol);
557 
558         tea = 1;
559         if (command->w) goto writenow;
560       }
561     } else if (c=='w') {
562       int fd, noeol;
563       char *name;
564 
565 writenow:
566       // Swap out emit() context
567       fd = TT.fdout;
568       noeol = TT.noeol;
569 
570       // We save filehandle and newline status before filename
571       name = command->w + (char *)command;
572       memcpy(&TT.fdout, name, 4);
573       name += 4;
574       TT.noeol = *(name++);
575 
576       // write, then save/restore context
577       if (emit(line, len, eol))
578         perror_exit("w '%s'", command->arg1+(char *)command);
579       *(--name) = TT.noeol;
580       TT.noeol = noeol;
581       TT.fdout = fd;
582     } else if (c=='x') {
583       long swap = TT.rememberlen;
584 
585       str = TT.remember;
586       TT.remember = line;
587       line = str;
588       TT.rememberlen = len;
589       len = swap;
590     } else if (c=='y') {
591       char *from, *to = (char *)command;
592       int i, j;
593 
594       from = to+command->arg1;
595       to += command->arg2;
596 
597       for (i = 0; i < len; i++) {
598         j = stridx(from, line[i]);
599         if (j != -1) line[i] = to[j];
600       }
601     } else if (c=='=') {
602       sprintf(toybuf, "%ld", TT.count);
603       if (emit(toybuf, strlen(toybuf), 1)) break;
604     }
605 
606     command = command->next;
607   }
608 
609   if (line && !FLAG(n)) emit(line, len, eol);
610 
611 done:
612   if (dlist_terminate(append)) while (append) {
613     struct append *a = append->next;
614 
615     if (append->file) {
616       int fd = open(append->str, O_RDONLY);
617 
618       // Force newline if noeol pending
619       if (fd != -1) {
620         if (TT.noeol) xwrite(TT.fdout, "\n", 1);
621         TT.noeol = 0;
622         xsendfile(fd, TT.fdout);
623         close(fd);
624       }
625     } else if (append->str) emit(append->str, strlen(append->str), 1);
626     else emit(line, 0, 0);
627     free(append);
628     append = a;
629   }
630   free(line);
631 }
632 
633 // Callback called on each input file
do_sed_file(int fd,char * name)634 static void do_sed_file(int fd, char *name)
635 {
636   char *tmp;
637 
638   if (FLAG(i)) {
639     struct sedcmd *command;
640 
641     if (!fd) return error_msg("-i on stdin");
642     TT.fdout = copy_tempfile(fd, name, &tmp);
643     TT.count = 0;
644     for (command = (void *)TT.pattern; command; command = command->next)
645       command->hit = 0;
646   }
647   do_lines(fd, TT.delim, sed_line);
648   if (FLAG(i)) {
649     if (TT.i && *TT.i) {
650       char *s = xmprintf("%s%s", name, TT.i);
651 
652       xrename(name, s);
653       free(s);
654     }
655     replace_tempfile(-1, TT.fdout, &tmp);
656     TT.fdout = 1;
657     TT.nextline = 0;
658     TT.nextlen = TT.noeol = 0;
659   }
660 }
661 
// Copy chunk of string between two delimiters, converting printf escapes.
//
// pstr:  in: start of the text to parse. out (success only): advanced to the
//        first character after the closing delimiter.
// delim: if NULL or pointing at 0, the first character of *pstr is the
//        delimiter (a leading backslash is skipped first) and it is saved to
//        *delim when delim isn't NULL. Otherwise *delim is the delimiter and
//        *pstr starts directly at the text.
//
// A delimiter inside a [bracket expression] (including [.x.] [=x=] [:x:])
// does not end the string. A backslash before the delimiter is dropped.
//
// Returns a newly allocated processed copy of the string (caller frees), or
// 0 on error (no delimiter, or the closing delimiter is missing).
static char *unescape_delimited_string(char **pstr, char *delim)
{
  char *to, *from, *out, mode = 0, d;

  // Grab leading delimiter (if necessary), allocate space for new string
  from = *pstr;
  if (!delim || !*delim) {
    if (!(d = *(from++))) return 0;
    if (d == '\\') d = *(from++);
    if (!d || d == '\\') return 0;
    if (delim) *delim = d;
  } else d = *delim;

  // Output never gets longer than the input
  to = out = xmalloc(strlen(*pstr)+1);

  // mode is 0 outside brackets, ']' inside a bracket expression, and one of
  // . = : inside the corresponding [.x.] [=x=] [:x:] element.
  while (mode || *from != d) {
    if (!*from) goto fail;

    // delimiter in regex character range doesn't count
    if (*from == '[') {
      if (!mode) {
        mode = ']';
        if (from[1]=='-' || from[1]==']') *(to++) = *(from++);
      } else if (mode == ']' && strchr(".=:", from[1])) {
        *(to++) = *(from++);
        mode = *from;
      }
    } else if (*from == mode) {
      if (mode == ']') mode = 0;
      else {
        *(to++) = *(from++);
        mode = ']';
      }
    // Length 1 range (X-X with same X) is "undefined" and makes regcomp err,
    // but the perl build does it, so we need to filter it out.
    } else if (mode && *from == '-' && from[-1] == from[1]) {
      from+=2;
      continue;
    } else if (*from == '\\') {
      if (!from[1]) goto fail;

      // Check escaped end delimiter before printf style escapes.
      if (from[1] == d) from++;
      else if (from[1]=='\\') *(to++) = *(from++);
      else {
        char c = unescape(from[1]);

        if (c) {
          *(to++) = c;
          from+=2;
          continue;
        } else if (!mode) *(to++) = *(from++);
      }
    }

    // The bracket handling above may have advanced onto the terminating NUL
    // (unterminated "[[" or "[[:alpha:"): stop rather than walk off the end.
    if (!*from) goto fail;
    *(to++) = *(from++);
  }
  *to = 0;
  *pstr = from+1;

  return out;

fail:
  free(out);

  return 0;
}
726 
727 // Translate pattern strings into command structures. Each command structure
728 // is a single allocation (which requires some math and remalloc at times).
parse_pattern(char ** pline,long len)729 static void parse_pattern(char **pline, long len)
730 {
731   struct sedcmd *command = (void *)TT.pattern;
732   char *line, *reg, c, *errstart;
733   int i;
734 
735   line = errstart = pline ? *pline : "";
736   if (len && line[len-1]=='\n') line[--len] = 0;
737 
738   // Append this line to previous multiline command? (hit indicates type.)
739   // During parsing "hit" stores data about line continuations, but in
740   // sed_line() it means the match range attached to this command
741   // is active, so processing the continuation must zero it again.
742   if (command && command->prev->hit) {
743     // Remove half-finished entry from list so remalloc() doesn't confuse it
744     TT.pattern = TT.pattern->prev;
745     command = dlist_pop(&TT.pattern);
746     c = command->c;
747     reg = (char *)command;
748     reg += command->arg1 + strlen(reg + command->arg1);
749 
750     // Resume parsing for 'a' or 's' command. (Only two that can do this.)
751     // TODO: using 256 to indicate 'a' means our s/// delimiter can't be
752     // a unicode character.
753     if (command->hit < 256) goto resume_s;
754     else goto resume_a;
755   }
756 
757   // Loop through commands in this line.
758 
759   command = 0;
760   for (;;) {
761     if (command) dlist_add_nomalloc(&TT.pattern, (void *)command);
762 
763     // If there's no more data on this line, return.
764     for (;;) {
765       while (isspace(*line) || *line == ';') line++;
766       if (*line == '#') while (*line && *line != '\n') line++;
767       else break;
768     }
769     if (!*line) return;
770 
    // We start by writing data into toybuf. Later we'll copy it into an
    // exact-size allocation (the xmemdup() below) once the address is parsed.
773 
774     errstart = line;
775     memset(toybuf, 0, sizeof(struct sedcmd));
776     command = (void *)toybuf;
777     reg = toybuf + sizeof(struct sedcmd);
778 
779     // Parse address range (if any)
780     for (i = 0; i < 2; i++) {
781       if (*line == ',') line++;
782       else if (i) break;
783 
784       if (isdigit(*line)) command->lmatch[i] = strtol(line, &line, 0);
785       else if (*line == '$') {
786         command->lmatch[i] = -1;
787         line++;
788       } else if (*line == '/' || *line == '\\') {
789         char *s = line;
790 
791         if (!(s = unescape_delimited_string(&line, 0))) goto error;
792         if (!*s) command->rmatch[i] = 0;
793         else {
794           xregcomp((void *)reg, s, REG_EXTENDED*!!FLAG(r));
795           command->rmatch[i] = reg-toybuf;
796           reg += sizeof(regex_t);
797         }
798         free(s);
799       } else break;
800     }
801 
802     while (isspace(*line)) line++;
803     if (!*line) break;
804 
805     while (*line == '!') {
806       command->not = 1;
807       line++;
808     }
809     while (isspace(*line)) line++;
810 
811     c = command->c = *(line++);
812     if (strchr("}:", c) && i) break;
813     if (strchr("aiqr=", c) && i>1) break;
814 
815     // Add step to pattern
816     command = xmemdup(toybuf, reg-toybuf);
817     reg = (reg-toybuf) + (char *)command;
818 
819     // Parse arguments by command type
820     if (c == '{') TT.nextlen++;
821     else if (c == '}') {
822       if (!TT.nextlen--) break;
823     } else if (c == 's') {
824       char *end, delim = 0;
825 
826       // s/pattern/replacement/flags
827 
828       // line continuations use arg1 (back at the start of the function),
829       // so let's fill out arg2 first (since the regex part can't be multiple
830       // lines) and swap them back later.
831 
832       // get pattern (just record, we parse it later)
833       command->arg2 = reg - (char *)command;
834       if (!(TT.remember = unescape_delimited_string(&line, &delim)))
835         goto error;
836 
837       reg += sizeof(regex_t);
838       command->arg1 = reg-(char *)command;
839       command->hit = delim;
840 resume_s:
841       // get replacement - don't replace escapes yet because \1 and \& need
842       // processing later, after we replace \\ with \ we can't tell \\1 from \1
843       end = line;
844       while (*end != command->hit) {
845         if (!*end) goto error;
846         if (*end++ == '\\') {
847           if (!*end || *end == '\n') {
848             end[-1] = '\n';
849             break;
850           }
851           end++;
852         }
853       }
854 
855       reg = extend_string((void *)&command, line, reg-(char *)command,end-line);
856       line = end;
857       // line continuation? (note: '\n' can't be a valid delim).
858       if (*line == command->hit) command->hit = 0;
859       else {
860         if (!*line) continue;
861         reg--;
862         line++;
863         goto resume_s;
864       }
865 
866       // swap arg1/arg2 so they're back in order arguments occur.
867       i = command->arg1;
868       command->arg1 = command->arg2;
869       command->arg2 = i;
870 
871       // get flags
872       for (line++; *line; line++) {
873         long l;
874 
875         if (isspace(*line) && *line != '\n') continue;
876 
877         if (0 <= (l = stridx("igp", *line))) command->sflags |= 1<<l;
878         else if (!(command->sflags>>3) && 0<(l = strtol(line, &line, 10))) {
879           command->sflags |= l << 3;
880           line--;
881         } else break;
882       }
883 
884       // We deferred actually parsing the regex until we had the s///i flag
885       // allocating the space was done by extend_string() above
886       if (!*TT.remember) command->arg1 = 0;
887       else xregcomp((void *)(command->arg1 + (char *)command), TT.remember,
888         (REG_EXTENDED*!!FLAG(r))|((command->sflags&1)*REG_ICASE));
889       free(TT.remember);
890       TT.remember = 0;
891       if (*line == 'w') {
892         line++;
893         goto writenow;
894       }
895     } else if (c == 'w') {
896       int fd, delim;
897       char *cc;
898 
899       // Since s/// uses arg1 and arg2, and w needs a persistent filehandle and
900       // eol status, and to retain the filename for error messages, we'd need
901       // to go up to arg5 just for this. Compromise: dynamically allocate the
902       // filehandle and eol status.
903 
904 writenow:
905       while (isspace(*line)) line++;
906       if (!*line) goto error;
907       for (cc = line; *cc; cc++) if (*cc == '\\' && cc[1] == ';') break;
908       delim = *cc;
909       *cc = 0;
910       fd = xcreate(line, O_WRONLY|O_CREAT|O_TRUNC, 0644);
911       *cc = delim;
912 
913       command->w = reg - (char *)command;
914       command = xrealloc(command, command->w+(cc-line)+6);
915       reg = command->w + (char *)command;
916 
917       memcpy(reg, &fd, 4);
918       reg += 4;
919       *(reg++) = 0;
920       memcpy(reg, line, delim);
921       reg += delim;
922       *(reg++) = 0;
923 
924       line = cc;
925       if (delim) line += 2;
926     } else if (c == 'y') {
927       char *s, delim = 0;
928       int len;
929 
930       if (!(s = unescape_delimited_string(&line, &delim))) goto error;
931       command->arg1 = reg-(char *)command;
932       len = strlen(s);
933       reg = extend_string((void *)&command, s, reg-(char *)command, len);
934       free(s);
935       command->arg2 = reg-(char *)command;
936       if (!(s = unescape_delimited_string(&line, &delim))) goto error;
937       if (len != strlen(s)) goto error;
938       reg = extend_string((void *)&command, s, reg-(char*)command, len);
939       free(s);
940     } else if (strchr("abcirtTw:", c)) {
941       int end;
942 
943       // trim leading spaces
944       while (isspace(*line) && *line != '\n') line++;
945 
946       // Resume logic differs from 's' case because we don't add a newline
947       // unless it's after something, so we add it on return instead.
948 resume_a:
949       command->hit = 0;
950 
951       // btT: end with space or semicolon, aicrw continue to newline.
952       if (!(end = strcspn(line, strchr(":btT", c) ? "}; \t\r\n\v\f" : "\n"))) {
953         // Argument's optional for btT
954         if (strchr("btT", c)) continue;
955         else if (!command->arg1) break;
956       }
957 
958       // Extend allocation to include new string. We use offsets instead of
959       // pointers so realloc() moving stuff doesn't break things. Ok to write
960       // \n over NUL terminator because call to extend_string() adds it back.
961       if (!command->arg1) command->arg1 = reg - (char*)command;
962       else if (*(command->arg1+(char *)command)) *(reg++) = '\n';
963       else if (!pline) {
964         command->arg1 = 0;
965         continue;
966       }
967       reg = extend_string((void *)&command, line, reg - (char *)command, end);
968 
969       // Recopy data to remove escape sequences and handle line continuation.
970       if (strchr("aci", c)) {
971         reg -= end+1;
972         for (i = end; i; i--) {
973           if ((*reg++ = *line++)=='\\') {
974 
975             // escape at end of line: resume if -e escaped literal newline,
976             // else request callback and resume with next line
977             if (!--i) {
978               *--reg = 0;
979               if (*line) {
980                 line++;
981                 goto resume_a;
982               }
983               command->hit = 256;
984               break;
985             }
986             if (!(reg[-1] = unescape(*line))) reg[-1] = *line;
987             line++;
988           }
989         }
990         *reg = 0;
991       } else line += end;
992 
993     // Commands that take no arguments
994     } else if (!strchr("{dDgGhHlnNpPqx=", c)) break;
995   }
996 
997 error:
998   error_exit("bad pattern '%s'@%ld (%c)", errstart, line-errstart+1L, *line);
999 }
1000 
sed_main(void)1001 void sed_main(void)
1002 {
1003   struct arg_list *al;
1004   char **args = toys.optargs;
1005 
1006   if (!FLAG(z)) TT.delim = '\n';
1007 
1008   // Lie to autoconf when it asks stupid questions, so configure regexes
1009   // that look for "GNU sed version %f" greater than some old buggy number
1010   // don't fail us for not matching their narrow expectations.
1011   if (FLAG(version)) {
1012     xprintf("This is not GNU sed version 9.0\n");
1013     return;
1014   }
1015 
1016   // Handling our own --version means we handle our own --help too.
1017   if (FLAG(help)) help_exit(0);
1018 
1019   // Parse pattern into commands.
1020 
1021   // If no -e or -f, first argument is the pattern.
1022   if (!TT.e && !TT.f) {
1023     if (!*toys.optargs) error_exit("no pattern");
1024     (TT.e = xzalloc(sizeof(struct arg_list)))->arg = *(args++);
1025   }
1026 
1027   // Option parsing infrastructure can't interlace "-e blah -f blah -e blah"
1028   // so handle all -e, then all -f. (At least the behavior's consistent.)
1029 
1030   for (al = TT.e; al; al = al->next) parse_pattern(&al->arg, strlen(al->arg));
1031   parse_pattern(0, 0);
1032   for (al = TT.f; al; al = al->next)
1033     do_lines(xopenro(al->arg), TT.delim, parse_pattern);
1034   dlist_terminate(TT.pattern);
1035   if (TT.nextlen) error_exit("no }");
1036 
1037   TT.fdout = 1;
1038   TT.remember = xstrdup("");
1039 
1040   // Inflict pattern upon input files. Long version because !O_CLOEXEC
1041   loopfiles_rw(args, O_RDONLY|WARN_ONLY, 0, do_sed_file);
1042 
1043   // Provide EOF flush at end of cumulative input for non-i mode.
1044   if (!FLAG(i)) {
1045     toys.optflags |= FLAG_i;
1046     sed_line(0, 0);
1047   }
1048 
1049   // todo: need to close fd when done for TOYBOX_FREE?
1050 }
1051