1 /* sed.c - stream editor. Thing that does s/// and other stuff.
2 *
3 * Copyright 2014 Rob Landley <rob@landley.net>
4 *
5 * See http://pubs.opengroup.org/onlinepubs/9699919799/utilities/sed.html
6 *
7 * TODO: lines > 2G could wrap signed int length counters. Not just getline()
8 * but N and s///
9
10 USE_SED(NEWTOY(sed, "(version)e*f*inEr[+Er]", TOYFLAG_USR|TOYFLAG_BIN|TOYFLAG_LOCALE))
11
12 config SED
13 bool "sed"
14 default y
15 help
16 usage: sed [-inrE] [-e SCRIPT]...|SCRIPT [-f SCRIPT_FILE]... [FILE...]
17
18 Stream editor. Apply one or more editing SCRIPTs to each line of input
19 (from FILE or stdin) producing output (by default to stdout).
20
21 -e add SCRIPT to list
22 -f add contents of SCRIPT_FILE to list
23 -i Edit each file in place.
24 -n No default output. (Use the p command to output matched lines.)
25 -r Use extended regular expression syntax.
26 -E Alias for -r.
27 -s Treat input files separately (implied by -i)
28
29 A SCRIPT is a series of one or more COMMANDs separated by newlines or
30 semicolons. All -e SCRIPTs are concatenated together as if separated
31 by newlines, followed by all lines from -f SCRIPT_FILEs, in order.
32 If no -e or -f SCRIPTs are specified, the first argument is the SCRIPT.
33
34 Each COMMAND may be preceded by an address which limits the command to
35 apply only to the specified line(s). Commands without an address apply to
36 every line. Addresses are of the form:
37
38 [ADDRESS[,ADDRESS]]COMMAND
39
40 The ADDRESS may be a decimal line number (starting at 1), a /regular
41 expression/ within a pair of forward slashes, or the character "$" which
42 matches the last line of input. (In -s or -i mode this matches the last
43 line of each file, otherwise just the last line of the last file.) A single
44 address matches one line, a pair of comma separated addresses match
45 everything from the first address to the second address (inclusive). If
46 both addresses are regular expressions, more than one range of lines in
47 each file can match.
48
49 REGULAR EXPRESSIONS in sed are started and ended by the same character
50 (traditionally / but anything except a backslash or a newline works).
51 Backslashes may be used to escape the delimiter if it occurs in the
52 regex, and for the usual printf escapes (\abcefnrtv and octal, hex,
53 and unicode). An empty regex repeats the previous one. ADDRESS regexes
54 (above) require the first delimiter to be escaped with a backslash when
55 it isn't a forward slash (to distinguish it from the COMMANDs below).
56
57 Sed mostly operates on individual lines one at a time. It reads each line,
58 processes it, and either writes it to the output or discards it before
59 reading the next line. Sed can remember one additional line in a separate
60 buffer (using the h, H, g, G, and x commands), and can read the next line
61 of input early (using the n and N command), but other than that command
62 scripts operate on individual lines of text.
63
64 Each COMMAND starts with a single character. The following commands take
65 no arguments:
66
67 { Start a new command block, continuing until a corresponding "}".
68 Command blocks may nest. If the block has an address, commands within
69 the block are only run for lines within the block's address range.
70
71 } End command block (this command cannot have an address)
72
73 d Delete this line and move on to the next one
74 (ignores remaining COMMANDs)
75
76 D Delete one line of input and restart command SCRIPT (same as "d"
77 unless you've glued lines together with "N" or similar)
78
79 g Get remembered line (overwriting current line)
80
81 G Get remembered line (appending to current line)
82
83 h Remember this line (overwriting remembered line)
84
85 H Remember this line (appending to remembered line, if any)
86
87 l Print line, escaping \abfrtv (but not newline), octal escaping other
88 nonprintable characters, wrapping lines to terminal width with a
89 backslash, and appending $ to actual end of line.
90
91 n Print default output and read next line, replacing current line
92 (If no next line available, quit processing script)
93
94 N Append next line of input to this line, separated by a newline
95 (This advances the line counter for address matching and "=", if no
96 next line available quit processing script without default output)
97
98 p Print this line
99
100 P Print this line up to first newline (from "N")
101
102 q Quit (print default output, no more commands processed or lines read)
103
104 x Exchange this line with remembered line (overwrite in both directions)
105
106 = Print the current line number (followed by a newline)
107
108 The following commands (may) take an argument. The "text" arguments (to
109 the "a", "c", and "i" commands) may end with an unescaped "\" to append
110 the next line (for which leading whitespace is not skipped), and also
111 treat ";" as a literal character (use "\;" instead).
112
113 a [text] Append text to output before attempting to read next line
114
115 b [label] Branch, jumps to :label (or with no label, to end of SCRIPT)
116
117 c [text] Delete line, output text at end of matching address range
118 (ignores remaining COMMANDs)
119
120 i [text] Print text
121
122 r [file] Append contents of file to output before attempting to read
123 next line.
124
125 s/S/R/F Search for regex S, replace matched text with R using flags F.
126 The first character after the "s" (anything but newline or
127 backslash) is the delimiter, escape with \ to use normally.
128
129 The replacement text may contain "&" to substitute the matched
130 text (escape it with backslash for a literal &), or \1 through
131 \9 to substitute a parenthetical subexpression in the regex.
132 You can also use the normal backslash escapes such as \n and
133 a backslash at the end of the line appends the next line.
134
135 The flags are:
136
137 [0-9] A number, substitute only that occurrence of pattern
138 g Global, substitute all occurrences of pattern
139 i Ignore case when matching
140 p Print the line if match was found and replaced
141 w [file] Write (append) line to file if match replaced
142
143 t [label] Test, jump to :label only if an "s" command found a match in
144 this line since last test (replacing with same text counts)
145
146 T [label] Test false, jump only if "s" hasn't found a match.
147
148 w [file] Write (append) line to file
149
150 y/old/new/ Change each character in 'old' to corresponding character
151 in 'new' (with standard backslash escapes, delimiter can be
152 any repeated character except \ or \n)
153
154 : [label] Labeled target for jump commands
155
156 # Comment, ignore rest of this line of SCRIPT
157
158 Deviations from posix: allow extended regular expressions with -r,
159 editing in place with -i, separate with -s, printf escapes in text, line
160 continuations, semicolons after all commands, 2-address anywhere an
161 address is allowed, "T" command, multiline continuations for [abc],
162 \; to end [abc] argument before end of line.
163 */
164
165 #define FOR_sed
166 #include "toys.h"
167
GLOBALS(
  // Script sources: sed_main() parses each TT.e argument as script text and
  // opens each TT.f argument as a file of script lines.
  struct arg_list *f;
  struct arg_list *e;

  // processed pattern list (walked as a list of struct step)
  struct double_list *pattern;

  // nextline: input line held back one step by walk_pattern() (lookahead).
  // remember: buffer used by the g/G/h/H/x commands; while the script is
  // being parsed it temporarily holds the regex text of an s command.
  char *nextline, *remember;
  // restart: (step to resume at)+1 after n/N, 0 to start at the top.
  // lastregex: most recently used regex, reused when a regex is empty.
  void *restart, *lastregex;
  // nextlen: length of nextline; during parsing it counts unclosed '{'.
  // rememberlen: length of remember. count: current input line number.
  long nextlen, rememberlen, count;
  // fdout: file descriptor emit() writes to.
  // noeol: nonzero when the last emit() to fdout did not end in a newline.
  int fdout, noeol;
  // Output width used by the 'l' command (computed on first use).
  unsigned xx;
)
181
// One parsed script command. Variable length data (compiled regexes,
// argument strings) is stored in the same allocation after the struct and
// is referenced by byte offset from the start of the struct.
struct step {
  struct step *next, *prev;

  // Begin and end of each match
  // lmatch[]: line number addresses (-1 means "$", 0 means unused)
  long lmatch[2];
  // rmatch[]: offsets of compiled address regexes (0 means unused)
  // arg1, arg2: offsets of the command's arguments
  // w: offset of the "write to file" record (fd, eol state, filename)
  int rmatch[2], arg1, arg2, w; // offsets because remalloc()
  // not: address was negated with '!'
  // hit: currently inside the address range (while parsing it is reused to
  //      flag an unfinished command awaiting a continuation line)
  // sflags: s command flags: bit 0 'i', bit 1 'g', bit 2 'p', and the
  //      occurrence number in the bits above those
  unsigned not, hit, sflags;
  char c; // action
};
191
192 // Write out line with potential embedded NUL, handling eol/noeol
emit(char * line,long len,int eol)193 static int emit(char *line, long len, int eol)
194 {
195 int l, old = line[len];
196
197 if (TT.noeol && !writeall(TT.fdout, "\n", 1)) return 1;
198 if (eol) line[len++] = '\n';
199 if (!len) return 0;
200 TT.noeol = len && !eol;
201 l = writeall(TT.fdout, line, len);
202 if (eol) line[len-1] = old;
203 if (l != len) {
204 perror_msg("short write");
205
206 return 1;
207 }
208
209 return 0;
210 }
211
// Do regex matching handling embedded NUL bytes in string. Note that
// neither the pattern nor the match can currently include NUL bytes
// (even with wildcards) and string must be null terminated.
//
// The len bytes at string are treated as a series of NUL separated
// segments; regexec() is tried on each segment in turn until one matches.
//
// preg:   compiled regex
// string: data to search (string[len] must be a NUL)
// len:    number of bytes to search
// nmatch, pmatch, eflags: as for regexec(). On a match, every pmatch[]
//         entry that took part in the match is converted to an offset from
//         string; entries regexec() left at -1 stay -1.
//
// Returns 0 on match, otherwise the regexec() error for the last segment.
static int ghostwheel(regex_t *preg, char *string, long len, int nmatch,
  regmatch_t pmatch[], int eflags)
{
  char *s = string;

  for (;;) {
    long ll = 0;
    int rc, i;

    // Skip NUL bytes, then measure the NUL free segment that follows
    while (len && !*s) {
      s++;
      len--;
    }
    while (ll<len && s[ll]) ll++;

    rc = regexec(preg, s, nmatch, pmatch, eflags);
    if (!rc) {
      // An unused subexpression (-1) can be followed by a used one, for
      // example "(a)|(b)" matching "b", so skip it rather than stopping.
      for (i = 0; i<nmatch; i++) {
        if (pmatch[i].rm_so == -1) continue;
        pmatch[i].rm_so += s-string;
        pmatch[i].rm_eo += s-string;
      }

      return 0;
    }

    // That was the last segment: report the failure
    if (ll==len) return rc;

    s += ll;
    len -= ll;
  }
}
245
// Grow the allocation at *old, which holds oldlen bytes of data, and append
// the string new to it. A negative newlen means "insert a newline, then
// append -newlen bytes". The appended data is NUL terminated.
//
// *old is updated to the (possibly moved) allocation. Returns a pointer to
// the byte just past the new NUL terminator.
static char *extend_string(char **old, char *new, int oldlen, int newlen)
{
  int sep = 0;
  char *buf, *tail;

  if (newlen < 0) {
    sep = 1;
    newlen = -newlen;
  }

  buf = xrealloc(*old, oldlen+newlen+sep+1);
  *old = buf;

  tail = buf+oldlen;
  if (sep) *tail++ = '\n';
  memcpy(tail, new, newlen);
  tail[newlen] = 0;

  return tail+newlen+1;
}
261
262 // An empty regex repeats the previous one
get_regex(void * trump,int offset)263 void *get_regex(void *trump, int offset)
264 {
265 if (!offset) {
266 if (!TT.lastregex) error_exit("no previous regex");
267 return TT.lastregex;
268 }
269
270 return TT.lastregex = offset+(char *)trump;
271 }
272
273 // Apply pattern to line from input file
walk_pattern(char ** pline,long plen)274 static void walk_pattern(char **pline, long plen)
275 {
276 struct append {
277 struct append *next, *prev;
278 int file;
279 char *str;
280 } *append = 0;
281 char *line = TT.nextline;
282 long len = TT.nextlen;
283 struct step *logrus;
284 int eol = 0, tea = 0;
285
286 // Grab next line for deferred processing (EOF detection: we get a NULL
287 // pline at EOF to flush last line). Note that only end of _last_ input
288 // file matches $ (unless we're doing -i).
289 TT.nextline = 0;
290 TT.nextlen = 0;
291 if (pline) {
292 TT.nextline = *pline;
293 TT.nextlen = plen;
294 *pline = 0;
295 }
296
297 if (!line || !len) return;
298 if (line[len-1] == '\n') line[--len] = eol++;
299 TT.count++;
300
301 // The restart-1 is because we added one to make sure it wasn't NULL,
302 // otherwise N as last command would restart script
303 logrus = TT.restart ? ((struct step *)TT.restart)-1 : (void *)TT.pattern;
304 TT.restart = 0;
305
306 while (logrus) {
307 char *str, c = logrus->c;
308
309 // Have we got a line or regex matching range for this rule?
310 if (*logrus->lmatch || *logrus->rmatch) {
311 int miss = 0;
312 long lm;
313
314 // In a match that might end?
315 if (logrus->hit) {
316 if (!(lm = logrus->lmatch[1])) {
317 if (!logrus->rmatch[1]) logrus->hit = 0;
318 else {
319 void *rm = get_regex(logrus, logrus->rmatch[1]);
320
321 // regex match end includes matching line, so defer deactivation
322 if (line && !ghostwheel(rm, line, len, 0, 0, 0)) miss = 1;
323 }
324 } else if (lm > 0 && lm < TT.count) logrus->hit = 0;
325
326 // Start a new match?
327 } else {
328 if (!(lm = *logrus->lmatch)) {
329 void *rm = get_regex(logrus, *logrus->rmatch);
330
331 if (line && !ghostwheel(rm, line, len, 0, 0, 0)) logrus->hit++;
332 } else if (lm == TT.count || (lm == -1 && !pline)) logrus->hit++;
333
334 if (!logrus->lmatch[1] && !logrus->rmatch[1]) miss = 1;
335 }
336
337 // Didn't match?
338 lm = !(logrus->hit ^ logrus->not);
339
340 // Deferred disable from regex end match
341 if (miss || logrus->lmatch[1] == TT.count) logrus->hit = 0;
342
343 if (lm) {
344 // Handle skipping curly bracket command group
345 if (c == '{') {
346 int curly = 1;
347
348 while (curly) {
349 logrus = logrus->next;
350 if (logrus->c == '{') curly++;
351 if (logrus->c == '}') curly--;
352 }
353 }
354 logrus = logrus->next;
355 continue;
356 }
357 }
358
359 // A deleted line can still update line match state for later commands
360 if (!line) {
361 logrus = logrus->next;
362 continue;
363 }
364
365 // Process command
366
367 if (c=='a' || c=='r') {
368 struct append *a = xzalloc(sizeof(struct append));
369 a->str = logrus->arg1+(char *)logrus;
370 a->file = c=='r';
371 dlist_add_nomalloc((void *)&append, (void *)a);
372 } else if (c=='b' || c=='t' || c=='T') {
373 int t = tea;
374
375 if (c != 'b') tea = 0;
376 if (c=='b' || t^(c=='T')) {
377 if (!logrus->arg1) break;
378 str = logrus->arg1+(char *)logrus;
379 for (logrus = (void *)TT.pattern; logrus; logrus = logrus->next)
380 if (logrus->c == ':' && !strcmp(logrus->arg1+(char *)logrus, str))
381 break;
382 if (!logrus) error_exit("no :%s", str);
383 }
384 } else if (c=='c') {
385 str = logrus->arg1+(char *)logrus;
386 if (!logrus->hit) emit(str, strlen(str), 1);
387 free(line);
388 line = 0;
389 continue;
390 } else if (c=='d') {
391 free(line);
392 line = 0;
393 continue;
394 } else if (c=='D') {
395 // Delete up to \n or end of buffer
396 str = line;
397 while ((str-line)<len) if (*(str++) == '\n') break;
398 len -= str - line;
399 memmove(line, str, len);
400
401 // if "delete" blanks line, disable further processing
402 // otherwise trim and restart script
403 if (!len) {
404 free(line);
405 line = 0;
406 } else {
407 line[len] = 0;
408 logrus = (void *)TT.pattern;
409 }
410 continue;
411 } else if (c=='g') {
412 free(line);
413 line = xstrdup(TT.remember);
414 len = TT.rememberlen;
415 } else if (c=='G') {
416 line = xrealloc(line, len+TT.rememberlen+2);
417 line[len++] = '\n';
418 memcpy(line+len, TT.remember, TT.rememberlen);
419 line[len += TT.rememberlen] = 0;
420 } else if (c=='h') {
421 free(TT.remember);
422 TT.remember = xstrdup(line);
423 TT.rememberlen = len;
424 } else if (c=='H') {
425 TT.remember = xrealloc(TT.remember, TT.rememberlen+len+2);
426 TT.remember[TT.rememberlen++] = '\n';
427 memcpy(TT.remember+TT.rememberlen, line, len);
428 TT.remember[TT.rememberlen += len] = 0;
429 } else if (c=='i') {
430 str = logrus->arg1+(char *)logrus;
431 emit(str, strlen(str), 1);
432 } else if (c=='l') {
433 int i, x, off;
434
435 if (!TT.xx) {
436 terminal_size(&TT.xx, 0);
437 if (!TT.xx) TT.xx = 80;
438 if (TT.xx > sizeof(toybuf)-10) TT.xx = sizeof(toybuf)-10;
439 if (TT.xx > 4) TT.xx -= 4;
440 }
441
442 for (i = off = 0; i<len; i++) {
443 if (off >= TT.xx) {
444 toybuf[off++] = '\\';
445 emit(toybuf, off, 1);
446 off = 0;
447 }
448 x = stridx("\\\a\b\f\r\t\v", line[i]);
449 if (x != -1) {
450 toybuf[off++] = '\\';
451 toybuf[off++] = "\\abfrtv"[x];
452 } else if (line[i] >= ' ') toybuf[off++] = line[i];
453 else off += sprintf(toybuf+off, "\\%03o", line[i]);
454 }
455 toybuf[off++] = '$';
456 emit(toybuf, off, 1);
457 } else if (c=='n') {
458 TT.restart = logrus->next+1;
459
460 break;
461 } else if (c=='N') {
462 // Can't just grab next line because we could have multiple N and
463 // we need to actually read ahead to get N;$p EOF detection right.
464 if (pline) {
465 TT.restart = logrus->next+1;
466 extend_string(&line, TT.nextline, len, -TT.nextlen);
467 free(TT.nextline);
468 TT.nextline = line;
469 TT.nextlen += len + 1;
470 line = 0;
471 }
472
473 // Pending append goes out right after N
474 goto done;
475 } else if (c=='p' || c=='P') {
476 char *l = (c=='P') ? strchr(line, '\n') : 0;
477
478 if (emit(line, l ? l-line : len, eol)) break;
479 } else if (c=='q') {
480 if (pline) *pline = (void *)1;
481 free(TT.nextline);
482 TT.nextline = 0;
483 TT.nextlen = 0;
484
485 break;
486 } else if (c=='s') {
487 char *rline = line, *new = logrus->arg2 + (char *)logrus, *swap, *rswap;
488 regmatch_t *match = (void *)toybuf;
489 regex_t *reg = get_regex(logrus, logrus->arg1);
490 int mflags = 0, count = 0, zmatch = 1, rlen = len, mlen, off, newlen;
491
492 // Find match in remaining line (up to remaining len)
493 while (!ghostwheel(reg, rline, rlen, 10, match, mflags)) {
494 mflags = REG_NOTBOL;
495
496 // Zero length matches don't count immediately after a previous match
497 mlen = match[0].rm_eo-match[0].rm_so;
498 if (!mlen && !zmatch) {
499 if (!rlen--) break;
500 rline++;
501 zmatch++;
502 continue;
503 } else zmatch = 0;
504
505 // If we're replacing only a specific match, skip if this isn't it
506 off = logrus->sflags>>3;
507 if (off && off != ++count) {
508 rline += match[0].rm_eo;
509 rlen -= match[0].rm_eo;
510
511 continue;
512 }
513 // The fact getline() can allocate unbounded amounts of memory is
514 // a bigger issue, but while we're here check for integer overflow
515 if (match[0].rm_eo > INT_MAX) perror_exit(0);
516
517 // newlen = strlen(new) but with \1 and & and printf escapes
518 for (off = newlen = 0; new[off]; off++) {
519 int cc = -1;
520
521 if (new[off] == '&') cc = 0;
522 else if (new[off] == '\\') cc = new[++off] - '0';
523 if (cc < 0 || cc > 9) {
524 newlen++;
525 continue;
526 }
527 newlen += match[cc].rm_eo-match[cc].rm_so;
528 }
529
530 // Allocate new size, copy start/end around match. (Can't extend in
531 // place because backrefs may refer to text after it's overwritten.)
532 len += newlen-mlen;
533 swap = xmalloc(len+1);
534 rswap = swap+(rline-line)+match[0].rm_so;
535 memcpy(swap, line, (rline-line)+match[0].rm_so);
536 memcpy(rswap+newlen, rline+match[0].rm_eo, (rlen -= match[0].rm_eo)+1);
537
538 // copy in new replacement text
539 for (off = mlen = 0; new[off]; off++) {
540 int cc = 0, ll;
541
542 if (new[off] == '\\') {
543 cc = new[++off] - '0';
544 if (cc<0 || cc>9) {
545 if (!(rswap[mlen++] = unescape(new[off])))
546 rswap[mlen-1] = new[off];
547
548 continue;
549 } else if (match[cc].rm_so == -1) error_exit("no s//\\%d/", cc);
550 } else if (new[off] != '&') {
551 rswap[mlen++] = new[off];
552
553 continue;
554 }
555
556 ll = match[cc].rm_eo-match[cc].rm_so;
557 memcpy(rswap+mlen, rline+match[cc].rm_so, ll);
558 mlen += ll;
559 }
560
561 rline = rswap+newlen;
562 free(line);
563 line = swap;
564
565 // Stop after first substitution unless we have flag g
566 if (!(logrus->sflags & 2)) break;
567 }
568
569 if (mflags) {
570 // flag p
571 if (logrus->sflags & 4) emit(line, len, eol);
572
573 tea = 1;
574 if (logrus->w) goto writenow;
575 }
576 } else if (c=='w') {
577 int fd, noeol;
578 char *name;
579
580 writenow:
581 // Swap out emit() context
582 fd = TT.fdout;
583 noeol = TT.noeol;
584
585 // We save filehandle and newline status before filename
586 name = logrus->w + (char *)logrus;
587 memcpy(&TT.fdout, name, 4);
588 name += 4;
589 TT.noeol = *(name++);
590
591 // write, then save/restore context
592 if (emit(line, len, eol))
593 perror_exit("w '%s'", logrus->arg1+(char *)logrus);
594 *(--name) = TT.noeol;
595 TT.noeol = noeol;
596 TT.fdout = fd;
597 } else if (c=='x') {
598 long swap = TT.rememberlen;
599
600 str = TT.remember;
601 TT.remember = line;
602 line = str;
603 TT.rememberlen = len;
604 len = swap;
605 } else if (c=='y') {
606 char *from, *to = (char *)logrus;
607 int i, j;
608
609 from = to+logrus->arg1;
610 to += logrus->arg2;
611
612 for (i = 0; i < len; i++) {
613 j = stridx(from, line[i]);
614 if (j != -1) line[i] = to[j];
615 }
616 } else if (c=='=') {
617 sprintf(toybuf, "%ld", TT.count);
618 emit(toybuf, strlen(toybuf), 1);
619 }
620
621 logrus = logrus->next;
622 }
623
624 if (line && !(toys.optflags & FLAG_n)) emit(line, len, eol);
625
626 done:
627 free(line);
628
629 if (dlist_terminate(append)) while (append) {
630 struct append *a = append->next;
631
632 if (append->file) {
633 int fd = open(append->str, O_RDONLY);
634
635 // Force newline if noeol pending
636 if (fd != -1) {
637 if (TT.noeol) xwrite(TT.fdout, "\n", 1);
638 TT.noeol = 0;
639 xsendfile(fd, TT.fdout);
640 close(fd);
641 }
642 } else emit(append->str, strlen(append->str), 1);
643 free(append);
644 append = a;
645 }
646 }
647
// Genericish function, can probably get moved to lib.c

// Iterate over lines in file, calling function. Function can write 0 to
// the line pointer if they want to keep it, or 1 to terminate processing,
// otherwise line is freed. Passed file descriptor is closed at the end.
//
// fd:   file descriptor to read (0 reads stdin, which is left open)
// name: not used here
// call: called with the address of each line and its length in bytes
static void do_lines(int fd, char *name, void (*call)(char **pline, long len))
{
  FILE *fp = fd ? xfdopen(fd, "r") : stdin;

  for (;;) {
    char *line = 0;
    size_t size = 0;
    ssize_t len = getline(&line, &size, fp);

    if (len <= 0) {
      // getline() can allocate a buffer even when it returns no data
      free(line);
      break;
    }

    call(&line, len);
    if (line == (void *)1) break;
    free(line);
  }

  if (fd) fclose(fp);
}
671
do_sed(int fd,char * name)672 static void do_sed(int fd, char *name)
673 {
674 int i = toys.optflags & FLAG_i;
675 char *tmp;
676
677 if (i) {
678 struct step *primal;
679
680 if (!fd && *name=='-') {
681 error_msg("-i on stdin");
682 return;
683 }
684 TT.fdout = copy_tempfile(fd, name, &tmp);
685 TT.count = 0;
686 for (primal = (void *)TT.pattern; primal; primal = primal->next)
687 primal->hit = 0;
688 }
689 do_lines(fd, name, walk_pattern);
690 if (i) {
691 walk_pattern(0, 0);
692 replace_tempfile(-1, TT.fdout, &tmp);
693 TT.fdout = 1;
694 TT.nextline = 0;
695 TT.nextlen = TT.noeol = 0;
696 }
697 }
698
// Copy chunk of string between two delimiters, converting printf escapes.
//
// pstr:  address of the current parse position. On success *pstr is
//        advanced to the first character after the closing delimiter; on
//        failure it is left alone.
// delim: if NULL or pointing to a 0 byte, the first character at *pstr is
//        the delimiter (a leading backslash is skipped first) and is saved
//        to *delim when delim isn't NULL. Otherwise *delim is the delimiter
//        and the chunk starts right at *pstr.
// regex: if nonzero the chunk is a regular expression, so a delimiter
//        inside a [range] doesn't end it.
//
// Returns a newly allocated processed copy of the chunk, or 0 on error
// (no delimiter, backslash as delimiter, or unterminated chunk).
static char *unescape_delimited_string(char **pstr, char *delim, int regex)
{
  char *to, *from, *out, mode = 0, d;

  from = *pstr;
  if (!delim || !*delim) {
    if (!(d = *(from++))) return 0;
    if (d == '\\') d = *(from++);
    if (!d || d == '\\') return 0;
    if (delim) *delim = d;
  } else d = *delim;

  // The copy can only shrink, so the input length is always enough
  to = out = xmalloc(strlen(*pstr)+1);

  while (mode || *from != d) {
    if (!*from) goto fail;

    // delimiter in regex character range doesn't count. (Only for a regex:
    // in a plain string such as a y argument '[' is an ordinary character.)
    if (regex && *from == '[') {
      mode = '[';
      if (from[1] == ']') *(to++) = *(from++);
    } else if (mode && *from == ']') mode = 0;
    else if (*from == '\\') {
      if (!from[1]) goto fail;

      // Check escaped end delimiter before printf style escapes.
      if (from[1] == d) from++;
      else if (from[1]=='\\') *(to++) = *(from++);
      else {
        char c = unescape(from[1]);

        if (c) {
          *(to++) = c;
          from+=2;
          continue;
        } else *(to++) = *(from++);
      }
    }
    *(to++) = *(from++);
  }
  *to = 0;
  *pstr = from+1;

  return out;

fail:
  free(out);

  return 0;
}
747
748 // Translate primal pattern into walkable form.
jewel_of_judgement(char ** pline,long len)749 static void jewel_of_judgement(char **pline, long len)
750 {
751 struct step *corwin = (void *)TT.pattern;
752 char *line, *reg, c, *errstart;
753 int i;
754
755 line = errstart = pline ? *pline : "";
756 if (len && line[len-1]=='\n') line[--len] = 0;
757
758 // Append additional line to pattern argument string?
759 // We temporarily repurpose "hit" to indicate line continuations
760 if (corwin && corwin->prev->hit) {
761 if (!*pline) error_exit("unfinished %c", corwin->prev->c);;
762 // Remove half-finished entry from list so remalloc() doesn't confuse it
763 TT.pattern = TT.pattern->prev;
764 corwin = dlist_pop(&TT.pattern);
765 c = corwin->c;
766 reg = (char *)corwin;
767 reg += corwin->arg1 + strlen(reg + corwin->arg1);
768
769 // Resume parsing for 'a' or 's' command
770 if (corwin->hit < 256) goto resume_s;
771 else goto resume_a;
772 }
773
774 // Loop through commands in line
775
776 corwin = 0;
777 for (;;) {
778 if (corwin) dlist_add_nomalloc(&TT.pattern, (void *)corwin);
779
780 for (;;) {
781 while (isspace(*line) || *line == ';') line++;
782 if (*line == '#') while (*line && *line != '\n') line++;
783 else break;
784 }
785 if (!*line) return;
786
787 errstart = line;
788 memset(toybuf, 0, sizeof(struct step));
789 corwin = (void *)toybuf;
790 reg = toybuf + sizeof(struct step);
791
792 // Parse address range (if any)
793 for (i = 0; i < 2; i++) {
794 if (*line == ',') line++;
795 else if (i) break;
796
797 if (isdigit(*line)) corwin->lmatch[i] = strtol(line, &line, 0);
798 else if (*line == '$') {
799 corwin->lmatch[i] = -1;
800 line++;
801 } else if (*line == '/' || *line == '\\') {
802 char *s = line;
803
804 if (!(s = unescape_delimited_string(&line, 0, 1))) goto brand;
805 if (!*s) corwin->rmatch[i] = 0;
806 else {
807 xregcomp((void *)reg, s, (toys.optflags & FLAG_r)*REG_EXTENDED);
808 corwin->rmatch[i] = reg-toybuf;
809 reg += sizeof(regex_t);
810 }
811 free(s);
812 } else break;
813 }
814
815 while (isspace(*line)) line++;
816 if (!*line) break;
817
818 while (*line == '!') {
819 corwin->not = 1;
820 line++;
821 }
822 while (isspace(*line)) line++;
823
824 c = corwin->c = *(line++);
825 if (strchr("}:", c) && i) break;
826 if (strchr("aiqr=", c) && i>1) break;
827
828 // Add step to pattern
829 corwin = xmalloc(reg-toybuf);
830 memcpy(corwin, toybuf, reg-toybuf);
831 reg = (reg-toybuf) + (char *)corwin;
832
833 // Parse arguments by command type
834 if (c == '{') TT.nextlen++;
835 else if (c == '}') {
836 if (!TT.nextlen--) break;
837 } else if (c == 's') {
838 char *fiona, delim = 0;
839
840 // s/pattern/replacement/flags
841
842 // line continuations use arg1, so we fill out arg2 first (since the
843 // regex part can't be multiple lines) and swap them back later.
844
845 // get pattern (just record, we parse it later)
846 corwin->arg2 = reg - (char *)corwin;
847 if (!(TT.remember = unescape_delimited_string(&line, &delim, 1)))
848 goto brand;
849
850 reg += sizeof(regex_t);
851 corwin->arg1 = reg-(char *)corwin;
852 corwin->hit = delim;
853 resume_s:
854 // get replacement - don't replace escapes because \1 and \& need
855 // processing later, after we replace \\ with \ we can't tell \\1 from \1
856 fiona = line;
857 while (*fiona != corwin->hit) {
858 if (!*fiona) goto brand;
859 if (*fiona++ == '\\') {
860 if (!*fiona || *fiona == '\n') {
861 fiona[-1] = '\n';
862 break;
863 }
864 fiona++;
865 }
866 }
867
868 reg = extend_string((void *)&corwin, line, reg-(char *)corwin,fiona-line);
869 line = fiona;
870 // line continuation? (note: '\n' can't be a valid delim).
871 if (*line == corwin->hit) corwin->hit = 0;
872 else {
873 if (!*line) continue;
874 reg--;
875 line++;
876 goto resume_s;
877 }
878
879 // swap arg1/arg2 so they're back in order arguments occur.
880 i = corwin->arg1;
881 corwin->arg1 = corwin->arg2;
882 corwin->arg2 = i;
883
884 // get flags
885 for (line++; *line; line++) {
886 long l;
887
888 if (isspace(*line) && *line != '\n') continue;
889
890 if (0 <= (l = stridx("igp", *line))) corwin->sflags |= 1<<l;
891 else if (!(corwin->sflags>>3) && 0<(l = strtol(line, &line, 10))) {
892 corwin->sflags |= l << 3;
893 line--;
894 } else break;
895 }
896
897 // We deferred actually parsing the regex until we had the s///i flag
898 // allocating the space was done by extend_string() above
899 if (!*TT.remember) corwin->arg1 = 0;
900 else xregcomp((void *)(corwin->arg1 + (char *)corwin), TT.remember,
901 ((toys.optflags & FLAG_r)*REG_EXTENDED)|((corwin->sflags&1)*REG_ICASE));
902 free(TT.remember);
903 TT.remember = 0;
904 if (*line == 'w') {
905 line++;
906 goto writenow;
907 }
908 } else if (c == 'w') {
909 int fd, delim;
910 char *cc;
911
912 // Since s/// uses arg1 and arg2, and w needs a persistent filehandle and
913 // eol status, and to retain the filename for error messages, we'd need
914 // to go up to arg5 just for this. Compromise: dynamically allocate the
915 // filehandle and eol status.
916
917 writenow:
918 while (isspace(*line)) line++;
919 if (!*line) goto brand;
920 for (cc = line; *cc; cc++) if (*cc == '\\' && cc[1] == ';') break;
921 delim = *cc;
922 *cc = 0;
923 fd = xcreate(line, O_WRONLY|O_CREAT|O_TRUNC, 0644);
924 *cc = delim;
925
926 corwin->w = reg - (char *)corwin;
927 corwin = xrealloc(corwin, corwin->w+(cc-line)+6);
928 reg = corwin->w + (char *)corwin;
929
930 memcpy(reg, &fd, 4);
931 reg += 4;
932 *(reg++) = 0;
933 memcpy(reg, line, delim);
934 reg += delim;
935 *(reg++) = 0;
936
937 line = cc;
938 if (delim) line += 2;
939 } else if (c == 'y') {
940 char *s, delim = 0;
941 int len;
942
943 if (!(s = unescape_delimited_string(&line, &delim, 0))) goto brand;
944 corwin->arg1 = reg-(char *)corwin;
945 len = strlen(s);
946 reg = extend_string((void *)&corwin, s, reg-(char *)corwin, len);
947 free(s);
948 corwin->arg2 = reg-(char *)corwin;
949 if (!(s = unescape_delimited_string(&line, &delim, 0))) goto brand;
950 if (len != strlen(s)) goto brand;
951 reg = extend_string((void *)&corwin, s, reg-(char*)corwin, len);
952 free(s);
953 } else if (strchr("abcirtTw:", c)) {
954 int end;
955
956 while (isspace(*line) && *line != '\n') line++;
957
958 // Resume logic differs from 's' case because we don't add a newline
959 // unless it's after something, so we add it on return instead.
960 resume_a:
961 corwin->hit = 0;
962
963 // Trim whitespace from "b ;" and ": blah " but only first space in "w x "
964 if (!(end = strcspn(line, strchr("btT:", c) ? "; \t\r\n\v\f" : "\n"))) {
965 if (strchr("btT", c)) continue;
966 else if (!corwin->arg1) break;
967 }
968
969 // Extend allocation to include new string. We use offsets instead of
970 // pointers so realloc() moving stuff doesn't break things. Ok to write
971 // \n over NUL terminator because call to extend_string() adds it back.
972 if (!corwin->arg1) corwin->arg1 = reg - (char*)corwin;
973 else if ((corwin+1) != (void *)reg) *(reg++) = '\n';
974 reg = extend_string((void *)&corwin, line, reg - (char *)corwin, end);
975
976 // Recopy data to remove escape sequences and handle line continuation.
977 if (strchr("aci", c)) {
978 reg -= end+1;
979 for (i = end; i; i--) {
980 if ((*reg++ = *line++)=='\\') {
981
982 // escape at end of line: resume if -e escaped literal newline,
983 // else request callback and resume with next line
984 if (!--i) {
985 *--reg = 0;
986 if (*line) {
987 line++;
988 goto resume_a;
989 }
990 corwin->hit = 256;
991 break;
992 }
993 if (!(reg[-1] = unescape(*line))) reg[-1] = *line;
994 line++;
995 }
996 }
997 *reg = 0;
998 } else line += end;
999
1000 // Commands that take no arguments
1001 } else if (!strchr("{dDgGhHlnNpPqx=", c)) break;
1002 }
1003
1004 brand:
1005 // Reminisce about chestnut trees.
1006 error_exit("bad pattern '%s'@%ld (%c)", errstart, line-errstart+1L, *line);
1007 }
1008
sed_main(void)1009 void sed_main(void)
1010 {
1011 struct arg_list *dworkin;
1012 char **args = toys.optargs;
1013
1014 // Lie to autoconf when it asks stupid questions, so configure regexes
1015 // that look for "GNU sed version %f" greater than some old buggy number
1016 // don't fail us for not matching their narrow expectations.
1017 if (toys.optflags & FLAG_version) {
1018 xprintf("This is not GNU sed version 9.0\n");
1019 return;
1020 }
1021
1022 // Need a pattern. If no unicorns about, fight serpent and take its eye.
1023 if (!TT.e && !TT.f) {
1024 if (!*toys.optargs) error_exit("no pattern");
1025 (TT.e = xzalloc(sizeof(struct arg_list)))->arg = *(args++);
1026 }
1027
1028 // Option parsing infrastructure can't interlace "-e blah -f blah -e blah"
1029 // so handle all -e, then all -f. (At least the behavior's consistent.)
1030
1031 for (dworkin = TT.e; dworkin; dworkin = dworkin->next)
1032 jewel_of_judgement(&dworkin->arg, strlen(dworkin->arg));
1033 for (dworkin = TT.f; dworkin; dworkin = dworkin->next)
1034 do_lines(xopen(dworkin->arg, O_RDONLY), dworkin->arg, jewel_of_judgement);
1035 jewel_of_judgement(0, 0);
1036 dlist_terminate(TT.pattern);
1037 if (TT.nextlen) error_exit("no }");
1038
1039 TT.fdout = 1;
1040 TT.remember = xstrdup("");
1041
1042 // Inflict pattern upon input files
1043 loopfiles_rw(args, O_RDONLY, 0, 0, do_sed);
1044
1045 if (!(toys.optflags & FLAG_i)) walk_pattern(0, 0);
1046
1047 // todo: need to close fd when done for TOYBOX_FREE?
1048 }
1049