1 /* sed.c - stream editor. Thing that does s/// and other stuff.
2  *
3  * Copyright 2014 Rob Landley <rob@landley.net>
4  *
5  * See http://pubs.opengroup.org/onlinepubs/9699919799/utilities/sed.html
6  *
7  * TODO: lines > 2G could wrap signed int length counters. Not just getline()
8  * but N and s///
9  * TODO: make y// handle unicode, unicode delimiters
10  * TODO: handle error return from emit(), error_msg/exit consistently
11  *       What's the right thing to do for -i when write fails? Skip to next?
12  * test '//q' with no previous regex, also repeat previous regex?
13  *
14  * Deviations from POSIX: allow extended regular expressions with -r,
15  * editing in place with -i, separate with -s, NUL-separated input with -z,
16  * printf escapes in text, line continuations, semicolons after all commands,
17  * 2-address anywhere an address is allowed, "T" command, multiline
18  * continuations for [abc], \; to end [abc] argument before end of line.
19 
20 USE_SED(NEWTOY(sed, "(help)(version)e*f*i:;nErz(null-data)s[+Er]", TOYFLAG_BIN|TOYFLAG_LOCALE|TOYFLAG_NOHELP))
21 
22 config SED
23   bool "sed"
24   default y
25   help
26     usage: sed [-inrszE] [-e SCRIPT]...|SCRIPT [-f SCRIPT_FILE]... [FILE...]
27 
28     Stream editor. Apply editing SCRIPTs to lines of input.
29 
30     -e	Add SCRIPT to list
31     -f	Add contents of SCRIPT_FILE to list
32     -i	Edit each file in place (-iEXT keeps backup file with extension EXT)
33     -n	No default output (use the p command to output matched lines)
34     -r	Use extended regular expression syntax
35     -E	POSIX alias for -r
36     -s	Treat input files separately (implied by -i)
37     -z	Use \0 rather than \n as input line separator
38 
39     A SCRIPT is one or more COMMANDs separated by newlines or semicolons.
40     All -e SCRIPTs are combined as if separated by newlines, followed by all -f
41     SCRIPT_FILEs. If no -e or -f then first argument is the SCRIPT.
42 
43     COMMANDs apply to every line unless prefixed with an ADDRESS of the form:
44 
45       [ADDRESS[,ADDRESS]][!]COMMAND
46 
47     ADDRESS is a line number (starting at 1), a /REGULAR EXPRESSION/, or $ for
48     last line (-s or -i makes it last line of each file). One address matches one
49     line, ADDRESS,ADDRESS matches from first to second inclusive. Two regexes can
50     match multiple ranges. ADDRESS,+N ends N lines later. ! inverts the match.
51 
52     REGULAR EXPRESSIONS start and end with the same character (anything but
53     backslash or newline). To use the delimiter in the regex escape it with a
54     backslash, and printf escapes (\abcefnrtv and octal, hex, and unicode) work.
55     An empty regex repeats the previous one. ADDRESS regexes require any
56     first delimiter except / to be \escaped to distinguish it from COMMANDs.
57 
58     Sed reads each line of input, processes it, and writes it out or discards it
59     before reading the next. Sed can remember one additional line in a separate
60     buffer (the h, H, g, G, and x commands), and can read the next line of input
61     early (the n and N commands), but otherwise operates on individual lines.
62 
63     Each COMMAND starts with a single character. Commands with no arguments are:
64 
65       !  Run this command when the ADDRESS _didn't_ match.
66       {  Start new command block, continuing until a corresponding "}".
67          Command blocks nest and can have ADDRESSes applying to the whole block.
68       }  End command block (this COMMAND cannot have an address)
69       d  Delete this line and move on to the next one
70          (ignores remaining COMMANDs)
71       D  Delete one line of input and restart command SCRIPT (same as "d"
72          unless you've glued lines together with "N" or similar)
73       g  Get remembered line (overwriting current line)
74       G  Get remembered line (appending to current line)
75       h  Remember this line (overwriting remembered line)
76       H  Remember this line (appending to remembered line, if any)
      l  Print line escaping \abfrtv (but not \n), octal escape other
         nonprinting chars, wrap lines to terminal width with \, append $ to
         end of line.
79       n  Print default output and read next line over current line (quit at EOF)
80       N  Append \n and next line of input to this line. Quit at EOF without
81          default output. Advances line counter for ADDRESS and "=".
82       p  Print this line
83       P  Print this line up to first newline (from "N")
84       q  Quit (print default output, no more commands processed or lines read)
85       x  Exchange this line with remembered line (overwrite in both directions)
86       =  Print the current line number (plus newline)
87       #  Comment, ignores rest of this line of SCRIPT (until newline)
88 
89     Commands that take an argument:
90 
91       : LABEL    Target for jump commands
92       a TEXT     Append text to output before reading next line
93       b LABEL    Branch, jumps to :LABEL (with no LABEL to end of SCRIPT)
94       c TEXT     Delete matching ADDRESS range and output TEXT instead
95       i TEXT     Insert text (output immediately)
96       r FILE     Append contents of FILE to output before reading next line.
97       s/S/R/F    Search for regex S replace match with R using flags F. Delimiter
98                  is anything but \n or \, escape with \ to use in S or R. Printf
99                  escapes work. Unescaped & in R becomes full matched text, \1
100                  through \9 = parenthetical subexpression from S. \ at end of
101                  line appends next line of SCRIPT. The flags in F are:
102                  [0-9]    A number N, substitute only Nth match
103                  g        Global, substitute all matches
104                  i/I      Ignore case when matching
105                  p        Print resulting line when match found and replaced
106                  w [file] Write (append) line to file when match replaced
107       t LABEL    Test, jump if s/// command matched this line since last test
108       T LABEL    Test false, jump to :LABEL only if no s/// found a match
109       w FILE     Write (append) line to file
110       y/old/new/ Change each character in 'old' to corresponding character
111                  in 'new' (with standard backslash escapes, delimiter can be
112                  any repeated character except \ or \n)
113 
114     The TEXT arguments (to a c i) may end with an unescaped "\" to append
115     the next line (leading whitespace is not skipped), and treat ";" as a
116     literal character (use "\;" instead).
117 */
118 
119 #define FOR_sed
120 #include "toys.h"
121 
GLOBALS(
  // Option arguments (see the option string in NEWTOY above): -i takes an
  // optional argument used below as the backup file suffix, -f and -e collect
  // lists of arguments.
  char *i;
  struct arg_list *f, *e;

  // processed pattern list
  struct double_list *pattern;

  // nextline/nextlen: input line held back one call by sed_line() (nextlen is
  //   also used as the open "{" counter while parse_pattern() runs)
  // remember/rememberlen: the remembered line for h, H, g, G, and x
  //   (parse_pattern() also borrows remember as scratch space for s///)
  char *nextline, *remember;
  // restart: command to resume at on the next line (stored +1 so it's never
  //   NULL), lastregex: most recently used regex, for empty regex repeats
  void *restart, *lastregex;
  // count: current line number
  long nextlen, rememberlen, count;
  // fdout: where emit() writes, noeol: last emit() left off its newline
  int fdout, noeol;
  // xx: cached wrap width for the "l" command (0 = not looked up yet)
  unsigned xx;
  // delim: line separator handed to do_lines()
  char delim;
)
136 
// Linked list of parsed sed commands. Offset fields indicate location where
// regex or string starts, ala offset+(char *)struct, because we realloc()
// these to expand them for multiline inputs, and pointers would have to be
// individually adjusted.

struct sedcmd {
  struct sedcmd *next, *prev;

  // Begin and end of each match
  // lmatch values: positive is a line number, -1 is "$", -2-N is "+N"
  long lmatch[2]; // line number of match
  int rmatch[2];  // offset of regex struct for prefix matches (/abc/,/def/p)
  // w points at a saved file descriptor (4 bytes), then a noeol byte, then
  // the filename.
  int arg1, arg2, w; // offset of two arguments per command, plus s//w filename
  // not: "!" was given. hit: while parsing this holds line continuation
  // state, while running it is nonzero when this command's address range is
  // active (set to the line number where the range started).
  unsigned not, hit;
  // The "substitute only Nth match" number is stored in the bits above these.
  unsigned sflags; // s///flag bits: i=1, g=2, p=4, x=8
  char c; // action
};
153 
154 // Write out line with potential embedded NUL, handling eol/noeol
emit(char * line,long len,int eol)155 static int emit(char *line, long len, int eol)
156 {
157   int l, old = line[len];
158 
159   if (TT.noeol && !writeall(TT.fdout, "\n", 1)) return 1;
160   TT.noeol = !eol;
161   if (eol) line[len++] = '\n';
162   if (!len) return 0;
163   l = writeall(TT.fdout, line, len);
164   if (eol) line[len-1] = old;
165   if (l != len) {
166     if (TT.fdout != 1) perror_msg("short write");
167 
168     return 1;
169   }
170 
171   return 0;
172 }
173 
// Grow the allocation at *old (currently oldlen bytes of payload) and append
// newlen bytes from new, followed by a NUL terminator. A negative newlen
// means "insert a newline before the -newlen appended bytes".
// Updates *old (the block may move) and returns a pointer to the byte just
// past the NUL terminator.

static char *extend_string(char **old, char *new, int oldlen, int newlen)
{
  int sep = 0;
  char *buf, *tail;

  if (newlen < 0) {
    sep = 1;
    newlen = -newlen;
  }

  buf = xrealloc(*old, oldlen+newlen+sep+1);
  *old = buf;

  tail = buf+oldlen;
  if (sep) *tail++ = '\n';
  memcpy(tail, new, newlen);
  tail += newlen;
  *tail = 0;

  return tail+1;
}
189 
190 // An empty regex repeats the previous one
get_regex(void * command,int offset)191 static void *get_regex(void *command, int offset)
192 {
193   if (!offset) {
194     if (!TT.lastregex) error_exit("no previous regex");
195     return TT.lastregex;
196   }
197 
198   return TT.lastregex = offset+(char *)command;
199 }
200 
201 // Apply pattern to line from input file
sed_line(char ** pline,long plen)202 static void sed_line(char **pline, long plen)
203 {
204   struct append {
205     struct append *next, *prev;
206     int file;
207     char *str;
208   } *append = 0;
209   char *line = TT.nextline;
210   long len = TT.nextlen;
211   struct sedcmd *command;
212   int eol = 0, tea = 0;
213 
214   // Ignore EOF for all files before last unless -i
215   if (!pline && !FLAG(i) && !FLAG(s)) return;
216 
217   // Grab next line for deferred processing (EOF detection: we get a NULL
218   // pline at EOF to flush last line). Note that only end of _last_ input
219   // file matches $ (unless we're doing -i).
220   TT.nextline = 0;
221   TT.nextlen = 0;
222   if (pline) {
223     TT.nextline = *pline;
224     TT.nextlen = plen;
225     *pline = 0;
226   }
227 
228   if (!line || !len) return;
229   if (line[len-1] == '\n') line[--len] = eol++;
230   TT.count++;
231 
232   // The restart-1 is because we added one to make sure it wasn't NULL,
233   // otherwise N as last command would restart script
234   command = TT.restart ? ((struct sedcmd *)TT.restart)-1 : (void *)TT.pattern;
235   TT.restart = 0;
236 
237   while (command) {
238     char *str, c = command->c;
239 
240     // Have we got a line or regex matching range for this rule?
241     if (*command->lmatch || *command->rmatch) {
242       int miss = 0;
243       long lm;
244 
245       // In a match that might end?
246       if (command->hit) {
247         if (!(lm = command->lmatch[1])) {
248           if (!command->rmatch[1]) command->hit = 0;
249           else {
250             void *rm = get_regex(command, command->rmatch[1]);
251 
252             // regex match end includes matching line, so defer deactivation
253             if (line && !regexec0(rm, line, len, 0, 0, 0)) miss = 1;
254           }
255         } else if (lm > 0 && lm < TT.count) command->hit = 0;
256         else if (lm < -1 && TT.count == command->hit+(-lm-1)) command->hit = 0;
257 
258       // Start a new match?
259       } else {
260         if (!(lm = *command->lmatch)) {
261           void *rm = get_regex(command, *command->rmatch);
262 
263           if (line && !regexec0(rm, line, len, 0, 0, 0))
264             command->hit = TT.count;
265         } else if (lm == TT.count || (lm == -1 && !pline))
266           command->hit = TT.count;
267 
268         if (!command->lmatch[1] && !command->rmatch[1]) miss = 1;
269       }
270 
271       // Didn't match?
272       lm = !(command->not^!!command->hit);
273 
274       // Deferred disable from regex end match
275       if (miss || command->lmatch[1] == TT.count) command->hit = 0;
276 
277       if (lm) {
278         // Handle skipping curly bracket command group
279         if (c == '{') {
280           int curly = 1;
281 
282           while (curly) {
283             command = command->next;
284             if (command->c == '{') curly++;
285             if (command->c == '}') curly--;
286           }
287         }
288         command = command->next;
289         continue;
290       }
291     }
292 
293     // A deleted line can still update line match state for later commands
294     if (!line) {
295       command = command->next;
296       continue;
297     }
298 
299     // Process command
300 
301     if (c=='a' || c=='r') {
302       struct append *a = xzalloc(sizeof(struct append));
303       if (command->arg1) a->str = command->arg1+(char *)command;
304       a->file = c=='r';
305       dlist_add_nomalloc((void *)&append, (void *)a);
306     } else if (c=='b' || c=='t' || c=='T') {
307       int t = tea;
308 
309       if (c != 'b') tea = 0;
310       if (c=='b' || t^(c=='T')) {
311         if (!command->arg1) break;
312         str = command->arg1+(char *)command;
313         for (command = (void *)TT.pattern; command; command = command->next)
314           if (command->c == ':' && !strcmp(command->arg1+(char *)command, str))
315             break;
316         if (!command) error_exit("no :%s", str);
317       }
318     } else if (c=='c') {
319       str = command->arg1+(char *)command;
320       if (!command->hit) emit(str, strlen(str), 1);
321       free(line);
322       line = 0;
323       continue;
324     } else if (c=='d') {
325       free(line);
326       line = 0;
327       continue;
328     } else if (c=='D') {
329       // Delete up to \n or end of buffer
330       str = line;
331       while ((str-line)<len) if (*(str++) == '\n') break;
332       len -= str - line;
333       memmove(line, str, len);
334 
335       // if "delete" blanks line, disable further processing
336       // otherwise trim and restart script
337       if (!len) {
338         free(line);
339         line = 0;
340       } else {
341         line[len] = 0;
342         command = (void *)TT.pattern;
343       }
344       continue;
345     } else if (c=='g') {
346       free(line);
347       line = xstrdup(TT.remember);
348       len = TT.rememberlen;
349     } else if (c=='G') {
350       line = xrealloc(line, len+TT.rememberlen+2);
351       line[len++] = '\n';
352       memcpy(line+len, TT.remember, TT.rememberlen);
353       line[len += TT.rememberlen] = 0;
354     } else if (c=='h') {
355       free(TT.remember);
356       TT.remember = xstrdup(line);
357       TT.rememberlen = len;
358     } else if (c=='H') {
359       TT.remember = xrealloc(TT.remember, TT.rememberlen+len+2);
360       TT.remember[TT.rememberlen++] = '\n';
361       memcpy(TT.remember+TT.rememberlen, line, len);
362       TT.remember[TT.rememberlen += len] = 0;
363     } else if (c=='i') {
364       str = command->arg1+(char *)command;
365       emit(str, strlen(str), 1);
366     } else if (c=='l') {
367       int i, x, off;
368 
369       if (!TT.xx) {
370         terminal_size(&TT.xx, 0);
371         if (!TT.xx) TT.xx = 80;
372         if (TT.xx > sizeof(toybuf)-10) TT.xx = sizeof(toybuf)-10;
373         if (TT.xx > 4) TT.xx -= 4;
374       }
375 
376       for (i = off = 0; i<len; i++) {
377         if (off >= TT.xx) {
378           toybuf[off++] = '\\';
379           emit(toybuf, off, 1);
380           off = 0;
381         }
382         x = stridx("\\\a\b\f\r\t\v", line[i]);
383         if (x != -1) {
384           toybuf[off++] = '\\';
385           toybuf[off++] = "\\abfrtv"[x];
386         } else if (line[i] >= ' ') toybuf[off++] = line[i];
387         else off += sprintf(toybuf+off, "\\%03o", line[i]);
388       }
389       toybuf[off++] = '$';
390       emit(toybuf, off, 1);
391     } else if (c=='n') {
392       TT.restart = command->next+1;
393 
394       break;
395     } else if (c=='N') {
396       // Can't just grab next line because we could have multiple N and
397       // we need to actually read ahead to get N;$p EOF detection right.
398       if (pline) {
399         TT.restart = command->next+1;
400         extend_string(&line, TT.nextline, len, -TT.nextlen);
401         free(TT.nextline);
402         TT.nextline = line;
403         TT.nextlen += len + 1;
404         line = 0;
405       }
406 
407       // Pending append goes out right after N
408       goto done;
409     } else if (c=='p' || c=='P') {
410       char *l = (c=='P') ? strchr(line, '\n') : 0;
411 
412       if (emit(line, l ? l-line : len, eol)) break;
413     } else if (c=='q' || c=='Q') {
414       if (pline) *pline = (void *)1;
415       free(TT.nextline);
416       if (!toys.exitval && command->arg1)
417         toys.exitval = atoi(command->arg1+(char *)command);
418       TT.nextline = 0;
419       TT.nextlen = 0;
420       if (c=='Q') line = 0;
421 
422       break;
423     } else if (c=='s') {
424       char *rline = line, *new = command->arg2 + (char *)command, *l2 = 0;
425       regmatch_t *match = (void *)toybuf;
426       regex_t *reg = get_regex(command, command->arg1);
427       int mflags = 0, count = 0, l2used = 0, zmatch = 1, l2l = len, l2old = 0,
428         mlen, off, newlen;
429 
430       // Loop finding match in remaining line (up to remaining len)
431       while (!regexec0(reg, rline, len-(rline-line), 10, match, mflags)) {
432         mflags = REG_NOTBOL;
433 
434         // Zero length matches don't count immediately after a previous match
435         mlen = match[0].rm_eo-match[0].rm_so;
436         if (!mlen && !zmatch) {
437           if (rline-line == len) break;
438           l2[l2used++] = *rline++;
439           zmatch++;
440           continue;
441         } else zmatch = 0;
442 
443         // If we're replacing only a specific match, skip if this isn't it
444         off = command->sflags>>4;
445         if (off && off != ++count) {
446           if (l2) memcpy(l2+l2used, rline, match[0].rm_eo);
447           l2used += match[0].rm_eo;
448           rline += match[0].rm_eo;
449 
450           continue;
451         }
452         // The fact getline() can allocate unbounded amounts of memory is
453         // a bigger issue, but while we're here check for integer overflow
454         if (match[0].rm_eo > INT_MAX) perror_exit(0);
455 
456         // newlen = strlen(new) but with \1 and & and printf escapes
457         for (off = newlen = 0; new[off]; off++) {
458           int cc = -1;
459 
460           if (new[off] == '&') cc = 0;
461           else if (new[off] == '\\') cc = new[++off] - '0';
462           if (cc < 0 || cc > 9) {
463             newlen++;
464             continue;
465           }
466           newlen += match[cc].rm_eo-match[cc].rm_so;
467         }
468 
469         // Copy changed data to new string
470 
471         // Adjust allocation size of new string, copy data we know we'll keep
472         l2l += newlen-mlen;
473         if ((l2l|0xfff) > l2old) l2 = xrealloc(l2, l2old = (l2l|0xfff)+1);
474         if (match[0].rm_so) {
475           memcpy(l2+l2used, rline, match[0].rm_so);
476           l2used += match[0].rm_so;
477         }
478 
479         // copy in new replacement text
480         for (off = mlen = 0; new[off]; off++) {
481           int cc = 0, ll;
482 
483           if (new[off] == '\\') {
484             cc = new[++off] - '0';
485             if (cc<0 || cc>9) {
486               if (!(l2[l2used+mlen++] = unescape(new[off])))
487                 l2[l2used+mlen-1] = new[off];
488 
489               continue;
490             } else if (cc > reg->re_nsub) error_exit("no s//\\%d/", cc);
491           } else if (new[off] != '&') {
492             l2[l2used+mlen++] = new[off];
493 
494             continue;
495           }
496 
497           if (match[cc].rm_so != -1) {
498             ll = match[cc].rm_eo-match[cc].rm_so;
499             memcpy(l2+l2used+mlen, rline+match[cc].rm_so, ll);
500             mlen += ll;
501           }
502         }
503         l2used += newlen;
504         rline += match[0].rm_eo;
505 
506         // Stop after first substitution unless we have flag g
507         if (!(command->sflags & 2)) break;
508       }
509 
510       // If we made any changes, finish off l2 and swap it for line
511       if (l2) {
512         // grab trailing unmatched data and null terminator, swap with original
513         mlen = len-(rline-line);
514         memcpy(l2+l2used, rline, mlen+1);
515         len = l2used + mlen;
516         free(line);
517         line = l2;
518       }
519 
520       if (mflags) {
521         // flag p
522         if (command->sflags & 4) emit(line, len, eol);
523 
524         tea = 1;
525         if (command->w) goto writenow;
526       }
527     } else if (c=='w') {
528       int fd, noeol;
529       char *name;
530 
531 writenow:
532       // Swap out emit() context
533       fd = TT.fdout;
534       noeol = TT.noeol;
535 
536       // We save filehandle and newline status before filename
537       name = command->w + (char *)command;
538       memcpy(&TT.fdout, name, 4);
539       name += 4;
540       TT.noeol = *(name++);
541 
542       // write, then save/restore context
543       if (emit(line, len, eol))
544         perror_exit("w '%s'", command->arg1+(char *)command);
545       *(--name) = TT.noeol;
546       TT.noeol = noeol;
547       TT.fdout = fd;
548     } else if (c=='x') {
549       long swap = TT.rememberlen;
550 
551       str = TT.remember;
552       TT.remember = line;
553       line = str;
554       TT.rememberlen = len;
555       len = swap;
556     } else if (c=='y') {
557       char *from, *to = (char *)command;
558       int i, j;
559 
560       from = to+command->arg1;
561       to += command->arg2;
562 
563       for (i = 0; i < len; i++) {
564         j = stridx(from, line[i]);
565         if (j != -1) line[i] = to[j];
566       }
567     } else if (c=='=') {
568       sprintf(toybuf, "%ld", TT.count);
569       if (emit(toybuf, strlen(toybuf), 1)) break;
570     }
571 
572     command = command->next;
573   }
574 
575   if (line && !FLAG(n)) emit(line, len, eol);
576 
577 done:
578   if (dlist_terminate(append)) while (append) {
579     struct append *a = append->next;
580 
581     if (append->file) {
582       int fd = open(append->str, O_RDONLY);
583 
584       // Force newline if noeol pending
585       if (fd != -1) {
586         if (TT.noeol) xwrite(TT.fdout, "\n", 1);
587         TT.noeol = 0;
588         xsendfile(fd, TT.fdout);
589         close(fd);
590       }
591     } else if (append->str) emit(append->str, strlen(append->str), 1);
592     else emit(line, 0, 0);
593     free(append);
594     append = a;
595   }
596   free(line);
597 }
598 
599 // Callback called on each input file
do_sed_file(int fd,char * name)600 static void do_sed_file(int fd, char *name)
601 {
602   char *tmp, *s;
603 
604   if (FLAG(i)) {
605     if (!fd) return error_msg("-i on stdin");
606     TT.fdout = copy_tempfile(fd, name, &tmp);
607   }
608   if (FLAG(i) || FLAG(s)) {
609     struct sedcmd *command;
610 
611     TT.count = 0;
612     for (command = (void *)TT.pattern; command; command = command->next)
613       command->hit = 0;
614   }
615   do_lines(fd, TT.delim, sed_line);
616   if (FLAG(i)) {
617     if (TT.i && *TT.i) {
618       xrename(name, s = xmprintf("%s%s", name, TT.i));
619       free(s);
620     }
621     replace_tempfile(-1, TT.fdout, &tmp);
622     TT.fdout = 1;
623   }
624   if (FLAG(i) || FLAG(s)) {
625     TT.nextline = 0;
626     TT.nextlen = TT.noeol = 0;
627   }
628 }
629 
// Copy chunk of string between two delimiters, converting printf escapes.
// Returns processed copy of string (0 if error, otherwise the caller frees
// it), *pstr advances to next unused char. If delim (or *delim) is 0
// uses/saves starting char as delimiter, otherwise *pstr starts just after
// an opening delimiter the caller already consumed.
// Delimiters inside regex [ranges] are ignored.
static char *unescape_delimited_string(char **pstr, char *delim)
{
  // mode is 0 outside brackets, ']' inside [...], and '.', '=' or ':'
  // inside a [. .] [= =] or [: :] within the brackets.
  char *out, *to, *from, mode = 0, d;

  // Grab leading delimiter (if necessary), allocate space for new string
  from = *pstr;
  if (!delim || !*delim) {
    if (!(d = *(from++))) return 0;
    if (d == '\\') d = *(from++);
    if (!d || d == '\\') return 0;
    if (delim) *delim = d;
  } else d = *delim;

  // Output is never longer than the input, so this is always enough
  to = out = xmalloc(strlen(*pstr)+1);

  while (mode || *from != d) {
    if (!*from) goto fail;

    // delimiter in regex character range doesn't count
    if (*from == '[') {
      if (!mode) {
        mode = ']';
        if (from[1]=='-' || from[1]==']') *(to++) = *(from++);
      // strchr() also matches the terminating NUL, so rule that out first
      } else if (mode == ']' && from[1] && strchr(".=:", from[1])) {
        *(to++) = *(from++);
        mode = *from;
      }
    } else if (*from == mode) {
      if (mode == ']') mode = 0;
      else {
        *(to++) = *(from++);
        mode = ']';

        // Input ended right after the closing . = or : so don't let the
        // copy at the bottom of the loop step past the end of the string.
        if (!*from) goto fail;
      }
    // Length 1 range (X-X with same X) is "undefined" and makes regcomp err,
    // but the perl build does it, so we need to filter it out.
    } else if (mode && *from == '-' && from[-1] == from[1]) {
      from+=2;
      continue;
    } else if (*from == '\\') {
      if (!from[1]) goto fail;

      // Check escaped end delimiter before printf style escapes.
      if (from[1] == d) from++;
      else if (from[1]=='\\') *(to++) = *(from++);
      else {
        char c = unescape(from[1]);

        if (c) {
          *(to++) = c;
          from+=2;
          continue;
        } else if (!mode) *(to++) = *(from++);
      }
    }
    *(to++) = *(from++);
  }
  *to = 0;
  *pstr = from+1;

  return out;

fail:
  // Don't leak the partial copy when there's no closing delimiter
  free(out);

  return 0;
}
694 
695 // Translate pattern strings into command structures. Each command structure
696 // is a single allocation (which requires some math and remalloc at times).
parse_pattern(char ** pline,long len)697 static void parse_pattern(char **pline, long len)
698 {
699   struct sedcmd *command = (void *)TT.pattern;
700   char *line, *reg, c, *errstart;
701   int i;
702 
703   line = errstart = pline ? *pline : "";
704   if (len && line[len-1]=='\n') line[--len] = 0;
705 
706   // Append this line to previous multiline command? (hit indicates type.)
707   // During parsing "hit" stores data about line continuations, but in
708   // sed_line() it means the match range attached to this command
709   // is active, so processing the continuation must zero it again.
710   if (command && command->prev->hit) {
711     // Remove half-finished entry from list so remalloc() doesn't confuse it
712     TT.pattern = TT.pattern->prev;
713     command = dlist_pop(&TT.pattern);
714     c = command->c;
715     reg = (char *)command;
716     reg += command->arg1 + strlen(reg + command->arg1);
717 
718     // Resume parsing for 'a' or 's' command. (Only two that can do this.)
719     // TODO: using 256 to indicate 'a' means our s/// delimiter can't be
720     // a unicode character.
721     if (command->hit < 256) goto resume_s;
722     else goto resume_a;
723   }
724 
725   // Loop through commands in this line.
726 
727   command = 0;
728   for (;;) {
729     if (command) dlist_add_nomalloc(&TT.pattern, (void *)command);
730 
731     // If there's no more data on this line, return.
732     for (;;) {
733       while (isspace(*line) || *line == ';') line++;
734       if (*line == '#') while (*line && *line != '\n') line++;
735       else break;
736     }
737     if (!*line) return;
738 
739     // Start by writing data into toybuf.
740 
741     errstart = line;
742     memset(toybuf, 0, sizeof(struct sedcmd));
743     command = (void *)toybuf;
744     reg = toybuf + sizeof(struct sedcmd);
745 
746     // Parse address range (if any)
747     for (i = 0; i < 2; i++) {
748       if (*line == ',') line++;
749       else if (i) break;
750 
751       if (i && *line == '+' && isdigit(line[1])) {
752         line++;
753         command->lmatch[i] = -2-strtol(line, &line, 0);
754       } else if (isdigit(*line)) command->lmatch[i] = strtol(line, &line, 0);
755       else if (*line == '$') {
756         command->lmatch[i] = -1;
757         line++;
758       } else if (*line == '/' || *line == '\\') {
759         char *s = line;
760 
761         if (!(s = unescape_delimited_string(&line, 0))) goto error;
762         if (!*s) command->rmatch[i] = 0;
763         else {
764           xregcomp((void *)reg, s, REG_EXTENDED*!!FLAG(r));
765           command->rmatch[i] = reg-toybuf;
766           reg += sizeof(regex_t);
767         }
768         free(s);
769       } else break;
770     }
771 
772     while (isspace(*line)) line++;
773     if (!*line) break;
774 
775     if (*line == '!') {
776       command->not = 1;
777       line++;
778     }
779     while (isspace(*line)) line++;
780     if (!*line) break;
781 
782     c = command->c = *(line++);
783     if (strchr("}:", c) && i) break;
784     if (strchr("aiqQr=", c) && i>1) break;
785 
786     // Allocate memory and copy out of toybuf now that we know how big it is
787     command = xmemdup(toybuf, reg-toybuf);
788     reg = (reg-toybuf) + (char *)command;
789 
790     // Parse arguments by command type
791     if (c == '{') TT.nextlen++;
792     else if (c == '}') {
793       if (!TT.nextlen--) break;
794     } else if (c == 's') {
795       char *end, delim = 0;
796       int flags;
797 
798       // s/pattern/replacement/flags
799 
800       // line continuations use arg1 (back at the start of the function),
801       // so let's fill out arg2 first (since the regex part can't be multiple
802       // lines) and swap them back later.
803 
804       // get pattern (just record, we parse it later)
805       command->arg2 = reg - (char *)command;
806       if (!(TT.remember = unescape_delimited_string(&line, &delim)))
807         goto error;
808 
809       reg += sizeof(regex_t);
810       command->arg1 = reg-(char *)command;
811       command->hit = delim;
812 resume_s:
813       // get replacement - don't replace escapes yet because \1 and \& need
814       // processing later, after we replace \\ with \ we can't tell \\1 from \1
815       end = line;
816       while (*end != command->hit) {
817         if (!*end) goto error;
818         if (*end++ == '\\') {
819           if (!*end || *end == '\n') {
820             end[-1] = '\n';
821             break;
822           }
823           end++;
824         }
825       }
826 
827       reg = extend_string((void *)&command, line, reg-(char *)command,end-line);
828       line = end;
829       // line continuation? (note: '\n' can't be a valid delim).
830       if (*line == command->hit) command->hit = 0;
831       else {
832         if (!*line) continue;
833         reg--;
834         line++;
835         goto resume_s;
836       }
837 
838       // swap arg1/arg2 so they're back in order arguments occur.
839       i = command->arg1;
840       command->arg1 = command->arg2;
841       command->arg2 = i;
842 
843       // get flags
844       for (line++; *line; line++) {
845         long l;
846 
847         if (isspace(*line) && *line != '\n') continue;
848 
849         if (0 <= (l = stridx("igpx", *line))) command->sflags |= 1<<l;
850         else if (*line == 'I') command->sflags |= 1<<0;
851         else if (!(command->sflags>>4) && 0<(l = strtol(line, &line, 10))) {
852           command->sflags |= l << 4;
853           line--;
854         } else break;
855       }
856       flags = (FLAG(r) || (command->sflags&8)) ? REG_EXTENDED : 0;
857       if (command->sflags&1) flags |= REG_ICASE;
858 
859       // We deferred actually parsing the regex until we had the s///i flag
860       // allocating the space was done by extend_string() above
861       if (!*TT.remember) command->arg1 = 0;
862       else xregcomp((void *)(command->arg1+(char *)command),TT.remember,flags);
863       free(TT.remember);
864       TT.remember = 0;
865       if (*line == 'w') {
866         line++;
867         goto writenow;
868       }
869     } else if (c == 'w') {
870       int fd, delim;
871       char *cc;
872 
873       // Since s/// uses arg1 and arg2, and w needs a persistent filehandle and
874       // eol status, and to retain the filename for error messages, we'd need
875       // to go up to arg5 just for this. Compromise: dynamically allocate the
876       // filehandle and eol status.
877 
878 writenow:
879       while (isspace(*line)) line++;
880       if (!*line) goto error;
881       for (cc = line; *cc; cc++) if (*cc == '\\' && cc[1] == ';') break;
882       delim = *cc;
883       *cc = 0;
884       fd = xcreate(line, O_WRONLY|O_CREAT|O_TRUNC, 0644);
885       *cc = delim;
886 
887       command->w = reg - (char *)command;
888       command = xrealloc(command, command->w+(cc-line)+6);
889       reg = command->w + (char *)command;
890 
891       memcpy(reg, &fd, 4);
892       reg += 4;
893       *(reg++) = 0;
894       memcpy(reg, line, delim);
895       reg += delim;
896       *(reg++) = 0;
897 
898       line = cc;
899       if (delim) line += 2;
900     } else if (c == 'y') {
901       char *s, delim = 0;
902       int len;
903 
904       if (!(s = unescape_delimited_string(&line, &delim))) goto error;
905       command->arg1 = reg-(char *)command;
906       len = strlen(s);
907       reg = extend_string((void *)&command, s, reg-(char *)command, len);
908       free(s);
909       command->arg2 = reg-(char *)command;
910       if (!(s = unescape_delimited_string(&line, &delim))) goto error;
911       if (len != strlen(s)) goto error;
912       reg = extend_string((void *)&command, s, reg-(char*)command, len);
913       free(s);
914     } else if (strchr("abcirtTqQw:", c)) {
915       int end;
916 
917       // trim leading spaces
918       while (isspace(*line) && *line != '\n') line++;
919 
920       // Resume logic differs from 's' case because we don't add a newline
921       // unless it's after something, so we add it on return instead.
922 resume_a:
923       command->hit = 0;
924 
925       // btTqQ: end with space or semicolon, aicrw continue to newline.
926       if (!(end = strcspn(line, strchr(":btTqQ", c) ? "}; \t\r\n\v\f" : "\n"))){
927         // Argument's optional for btTqQ
928         if (strchr("btTqQ", c)) continue;
929         else if (!command->arg1) break;
930       }
931       // Error checking: qQ can only have digits after them
932       if (c=='q' || c=='Q') {
933         for (i = 0; i<end && isdigit(line[i]); i++);
934         if (i != end) {
935           line += i;
936           break;
937         }
938       }
939 
940       // Extend allocation to include new string. We use offsets instead of
941       // pointers so realloc() moving stuff doesn't break things. Ok to write
942       // \n over NUL terminator because call to extend_string() adds it back.
943       if (!command->arg1) command->arg1 = reg - (char*)command;
944       else if (*(command->arg1+(char *)command)) *(reg++) = '\n';
945       else if (!pline) {
946         command->arg1 = 0;
947         continue;
948       }
949       reg = extend_string((void *)&command, line, reg - (char *)command, end);
950 
951       // Recopy data to remove escape sequences and handle line continuation.
952       if (strchr("aci", c)) {
953         reg -= end+1;
954         for (i = end; i; i--) {
955           if ((*reg++ = *line++)=='\\') {
956 
957             // escape at end of line: resume if -e escaped literal newline,
958             // else request callback and resume with next line
959             if (!--i) {
960               *--reg = 0;
961               if (*line) {
962                 line++;
963                 goto resume_a;
964               }
965               command->hit = 256;
966               break;
967             }
968             if (!(reg[-1] = unescape(*line))) reg[-1] = *line;
969             line++;
970           }
971         }
972         *reg = 0;
973       } else line += end;
974 
975     // Commands that take no arguments
976     } else if (!strchr("{dDgGhHlnNpPx=", c)) break;
977   }
978 
979 error:
980   error_exit("bad pattern '%s'@%ld (%c)", errstart, line-errstart+1L, *line);
981 }
982 
sed_main(void)983 void sed_main(void)
984 {
985   struct arg_list *al;
986   char **args = toys.optargs;
987 
988   if (!FLAG(z)) TT.delim = '\n';
989 
990   // Lie to autoconf when it asks stupid questions, so configure regexes
991   // that look for "GNU sed version %f" greater than some old buggy number
992   // don't fail us for not matching their narrow expectations.
993   if (FLAG(version)) {
994     xprintf("This is not GNU sed version 9.0\n");
995     return;
996   }
997 
998   // Handling our own --version means we handle our own --help too.
999   if (FLAG(help)) help_exit(0);
1000 
1001   // Parse pattern into commands.
1002 
1003   // If no -e or -f, first argument is the pattern.
1004   if (!TT.e && !TT.f) {
1005     if (!*toys.optargs) error_exit("no pattern");
1006     (TT.e = xzalloc(sizeof(struct arg_list)))->arg = *(args++);
1007   }
1008 
1009   // Option parsing infrastructure can't interlace "-e blah -f blah -e blah"
1010   // so handle all -e, then all -f. (At least the behavior's consistent.)
1011 
1012   for (al = TT.e; al; al = al->next) parse_pattern(&al->arg, strlen(al->arg));
1013   parse_pattern(0, 0);
1014   for (al = TT.f; al; al = al->next)
1015     do_lines(xopenro(al->arg), TT.delim, parse_pattern);
1016   dlist_terminate(TT.pattern);
1017   if (TT.nextlen) error_exit("no }");
1018 
1019   TT.fdout = 1;
1020   TT.remember = xstrdup("");
1021 
1022   // Inflict pattern upon input files. Long version because !O_CLOEXEC
1023   loopfiles_rw(args, O_RDONLY|WARN_ONLY, 0, do_sed_file);
1024 
1025   // Provide EOF flush at end of cumulative input for non-i mode.
1026   if (!FLAG(i) && !FLAG(s)) {
1027     toys.optflags |= FLAG_s;
1028     sed_line(0, 0);
1029   }
1030 
1031   // todo: need to close fd when done for TOYBOX_FREE?
1032 }
1033