1 /* sed.c - stream editor. Thing that does s/// and other stuff.
2 *
3 * Copyright 2014 Rob Landley <rob@landley.net>
4 *
5 * See http://pubs.opengroup.org/onlinepubs/9699919799/utilities/sed.html
6 *
7 * TODO: lines > 2G could wrap signed int length counters. Not just getline()
8 * but N and s///
9
10 USE_SED(NEWTOY(sed, "(version)e*f*inEr[+Er]", TOYFLAG_USR|TOYFLAG_BIN|TOYFLAG_LOCALE))
11
12 config SED
13 bool "sed"
14 default y
15 help
16 usage: sed [-inrE] [-e SCRIPT]...|SCRIPT [-f SCRIPT_FILE]... [FILE...]
17
18 Stream editor. Apply one or more editing SCRIPTs to each line of input
19 (from FILE or stdin) producing output (by default to stdout).
20
21 -e add SCRIPT to list
22 -f add contents of SCRIPT_FILE to list
23 -i Edit each file in place.
24 -n No default output. (Use the p command to output matched lines.)
25 -r Use extended regular expression syntax.
26 -E Alias for -r.
27 -s Treat input files separately (implied by -i)
28
29 A SCRIPT is a series of one or more COMMANDs separated by newlines or
30 semicolons. All -e SCRIPTs are concatenated together as if separated
31 by newlines, followed by all lines from -f SCRIPT_FILEs, in order.
32 If no -e or -f SCRIPTs are specified, the first argument is the SCRIPT.
33
34 Each COMMAND may be preceded by an address which limits the command to
35 apply only to the specified line(s). Commands without an address apply to
36 every line. Addresses are of the form:
37
38 [ADDRESS[,ADDRESS]]COMMAND
39
40 The ADDRESS may be a decimal line number (starting at 1), a /regular
41 expression/ within a pair of forward slashes, or the character "$" which
42 matches the last line of input. (In -s or -i mode this matches the last
43 line of each file, otherwise just the last line of the last file.) A single
44 address matches one line, a pair of comma separated addresses match
45 everything from the first address to the second address (inclusive). If
46 both addresses are regular expressions, more than one range of lines in
47 each file can match.
48
49 REGULAR EXPRESSIONS in sed are started and ended by the same character
50 (traditionally / but anything except a backslash or a newline works).
51 Backslashes may be used to escape the delimiter if it occurs in the
52 regex, and for the usual printf escapes (\abcefnrtv and octal, hex,
53 and unicode). An empty regex repeats the previous one. ADDRESS regexes
54 (above) require the first delimiter to be escaped with a backslash when
55 it isn't a forward slash (to distinguish it from the COMMANDs below).
56
57 Sed mostly operates on individual lines one at a time. It reads each line,
58 processes it, and either writes it to the output or discards it before
59 reading the next line. Sed can remember one additional line in a separate
60 buffer (using the h, H, g, G, and x commands), and can read the next line
61 of input early (using the n and N command), but other than that command
62 scripts operate on individual lines of text.
63
64 Each COMMAND starts with a single character. The following commands take
65 no arguments:
66
67 { Start a new command block, continuing until a corresponding "}".
68 Command blocks may nest. If the block has an address, commands within
69 the block are only run for lines within the block's address range.
70
71 } End command block (this command cannot have an address)
72
73 d Delete this line and move on to the next one
74 (ignores remaining COMMANDs)
75
76 D Delete one line of input and restart command SCRIPT (same as "d"
77 unless you've glued lines together with "N" or similar)
78
79 g Get remembered line (overwriting current line)
80
81 G Get remembered line (appending to current line)
82
83 h Remember this line (overwriting remembered line)
84
85 H Remember this line (appending to remembered line, if any)
86
87 l Print line, escaping \abfrtv (but not newline), octal escaping other
88 nonprintable characters, wrapping lines to terminal width with a
89 backslash, and appending $ to actual end of line.
90
91 n Print default output and read next line, replacing current line
92 (If no next line available, quit processing script)
93
94 N Append next line of input to this line, separated by a newline
95 (This advances the line counter for address matching and "=", if no
96 next line available quit processing script without default output)
97
98 p Print this line
99
100 P Print this line up to first newline (from "N")
101
102 q Quit (print default output, no more commands processed or lines read)
103
104 x Exchange this line with remembered line (overwrite in both directions)
105
106 = Print the current line number (followed by a newline)
107
108 The following commands (may) take an argument. The "text" arguments (to
109 the "a", "c", and "i" commands) may end with an unescaped "\" to append
110 the next line (for which leading whitespace is not skipped), and also
111 treat ";" as a literal character (use "\;" instead).
112
113 a [text] Append text to output before attempting to read next line
114
115 b [label] Branch, jumps to :label (or with no label, to end of SCRIPT)
116
117 c [text] Delete line, output text at end of matching address range
118 (ignores remaining COMMANDs)
119
120 i [text] Print text
121
122 r [file] Append contents of file to output before attempting to read
123 next line.
124
125 s/S/R/F Search for regex S, replace matched text with R using flags F.
126 The first character after the "s" (anything but newline or
127 backslash) is the delimiter, escape with \ to use normally.
128
129 The replacement text may contain "&" to substitute the matched
130 text (escape it with backslash for a literal &), or \1 through
131 \9 to substitute a parenthetical subexpression in the regex.
132 You can also use the normal backslash escapes such as \n and
133 a backslash at the end of the line appends the next line.
134
135 The flags are:
136
137 [0-9] A number, substitute only that occurrence of pattern
138 g Global, substitute all occurrences of pattern
139 i Ignore case when matching
140 p Print the line if match was found and replaced
141 w [file] Write (append) line to file if match replaced
142
143 t [label] Test, jump to :label only if an "s" command found a match in
144 this line since last test (replacing with same text counts)
145
146 T [label] Test false, jump only if "s" hasn't found a match.
147
148 w [file] Write (append) line to file
149
150 y/old/new/ Change each character in 'old' to corresponding character
151 in 'new' (with standard backslash escapes, delimiter can be
152 any repeated character except \ or \n)
153
154 : [label] Labeled target for jump commands
155
156 # Comment, ignore rest of this line of SCRIPT
157
158 Deviations from posix: allow extended regular expressions with -r,
159 editing in place with -i, separate with -s, printf escapes in text, line
160 continuations, semicolons after all commands, 2-address anywhere an
161 address is allowed, "T" command, multiline continuations for [abc],
162 \; to end [abc] argument before end of line.
163 */
164
165 #define FOR_sed
166 #include "toys.h"
167
168 GLOBALS(
169 struct arg_list *f;
170 struct arg_list *e;
171
172 // processed pattern list
173 struct double_list *pattern;
174
175 char *nextline, *remember;
176 void *restart, *lastregex;
177 long nextlen, rememberlen, count;
178 int fdout, noeol;
179 unsigned xx;
180 )
181
// One parsed command. Regexes and string arguments are stored in the same
// allocation after this header and referenced by byte offset from its start.
struct step {
  // List links (must stay first: the steps live on TT.pattern)
  struct step *next;
  struct step *prev;

  // Begin and end of each match: line numbers (-1 means "$"), or...
  long lmatch[2];
  // ...offsets of compiled address regexes (offsets because remalloc())
  int rmatch[2];
  // Offsets of the command's arguments, and of the "w" file record
  int arg1;
  int arg2;
  int w;
  // Address negated by "!"
  unsigned not;
  // Currently inside the address range (reused as a continuation marker
  // while the script is being parsed)
  unsigned hit;
  // s/// flags: 1 = i, 2 = g, 4 = p, occurrence number in bits 3 and up
  unsigned sflags;
  // action
  char c;
};
191
192 // Write out line with potential embedded NUL, handling eol/noeol
emit(char * line,long len,int eol)193 static int emit(char *line, long len, int eol)
194 {
195 int l, old = line[len];
196
197 if (TT.noeol && !writeall(TT.fdout, "\n", 1)) return 1;
198 if (eol) line[len++] = '\n';
199 if (!len) return 0;
200 TT.noeol = len && !eol;
201 l = writeall(TT.fdout, line, len);
202 if (eol) line[len-1] = old;
203 if (l != len) {
204 perror_msg("short write");
205
206 return 1;
207 }
208
209 return 0;
210 }
211
212 // Do regex matching handling embedded NUL bytes in string. Note that
213 // neither the pattern nor the match can currently include NUL bytes
214 // (even with wildcards) and string must be null terminated.
ghostwheel(regex_t * preg,char * string,long len,int nmatch,regmatch_t pmatch[],int eflags)215 static int ghostwheel(regex_t *preg, char *string, long len, int nmatch,
216 regmatch_t pmatch[], int eflags)
217 {
218 char *s = string;
219
220 for (;;) {
221 long ll = 0;
222 int rc;
223
224 while (len && !*s) {
225 s++;
226 len--;
227 }
228 while (s[ll] && ll<len) ll++;
229
230 rc = regexec(preg, s, nmatch, pmatch, eflags);
231 if (!rc) {
232 for (rc = 0; rc<nmatch && pmatch[rc].rm_so!=-1; rc++) {
233 pmatch[rc].rm_so += s-string;
234 pmatch[rc].rm_eo += s-string;
235 }
236
237 return 0;
238 }
239 if (ll==len) return rc;
240
241 s += ll;
242 len -= ll;
243 }
244 }
245
// Grow the allocation at *old (currently oldlen bytes in use) and append
// newlen bytes of new plus a NUL terminator. A negative newlen means insert a
// newline before the appended text, which is then -newlen bytes long.
// *old is updated because the block may move; returns a pointer just past
// the new NUL terminator.
static char *extend_string(char **old, char *new, int oldlen, int newlen)
{
  int sep = 0, at = oldlen;
  char *buf;

  if (newlen < 0) {
    sep = 1;
    newlen = -newlen;
  }

  *old = buf = xrealloc(*old, oldlen+newlen+sep+1);
  if (sep) buf[at++] = '\n';
  memcpy(buf+at, new, newlen);
  buf[at+newlen] = 0;

  return buf+at+newlen+1;
}
261
262 // An empty regex repeats the previous one
get_regex(void * trump,int offset)263 static void *get_regex(void *trump, int offset)
264 {
265 if (!offset) {
266 if (!TT.lastregex) error_exit("no previous regex");
267 return TT.lastregex;
268 }
269
270 return TT.lastregex = offset+(char *)trump;
271 }
272
273 // Apply pattern to line from input file
walk_pattern(char ** pline,long plen)274 static void walk_pattern(char **pline, long plen)
275 {
276 struct append {
277 struct append *next, *prev;
278 int file;
279 char *str;
280 } *append = 0;
281 char *line = TT.nextline;
282 long len = TT.nextlen;
283 struct step *logrus;
284 int eol = 0, tea = 0;
285
286 // Grab next line for deferred processing (EOF detection: we get a NULL
287 // pline at EOF to flush last line). Note that only end of _last_ input
288 // file matches $ (unless we're doing -i).
289 TT.nextline = 0;
290 TT.nextlen = 0;
291 if (pline) {
292 TT.nextline = *pline;
293 TT.nextlen = plen;
294 *pline = 0;
295 }
296
297 if (!line || !len) return;
298 if (line[len-1] == '\n') line[--len] = eol++;
299 TT.count++;
300
301 // The restart-1 is because we added one to make sure it wasn't NULL,
302 // otherwise N as last command would restart script
303 logrus = TT.restart ? ((struct step *)TT.restart)-1 : (void *)TT.pattern;
304 TT.restart = 0;
305
306 while (logrus) {
307 char *str, c = logrus->c;
308
309 // Have we got a line or regex matching range for this rule?
310 if (*logrus->lmatch || *logrus->rmatch) {
311 int miss = 0;
312 long lm;
313
314 // In a match that might end?
315 if (logrus->hit) {
316 if (!(lm = logrus->lmatch[1])) {
317 if (!logrus->rmatch[1]) logrus->hit = 0;
318 else {
319 void *rm = get_regex(logrus, logrus->rmatch[1]);
320
321 // regex match end includes matching line, so defer deactivation
322 if (line && !ghostwheel(rm, line, len, 0, 0, 0)) miss = 1;
323 }
324 } else if (lm > 0 && lm < TT.count) logrus->hit = 0;
325
326 // Start a new match?
327 } else {
328 if (!(lm = *logrus->lmatch)) {
329 void *rm = get_regex(logrus, *logrus->rmatch);
330
331 if (line && !ghostwheel(rm, line, len, 0, 0, 0)) logrus->hit++;
332 } else if (lm == TT.count || (lm == -1 && !pline)) logrus->hit++;
333
334 if (!logrus->lmatch[1] && !logrus->rmatch[1]) miss = 1;
335 }
336
337 // Didn't match?
338 lm = !(logrus->hit ^ logrus->not);
339
340 // Deferred disable from regex end match
341 if (miss || logrus->lmatch[1] == TT.count) logrus->hit = 0;
342
343 if (lm) {
344 // Handle skipping curly bracket command group
345 if (c == '{') {
346 int curly = 1;
347
348 while (curly) {
349 logrus = logrus->next;
350 if (logrus->c == '{') curly++;
351 if (logrus->c == '}') curly--;
352 }
353 }
354 logrus = logrus->next;
355 continue;
356 }
357 }
358
359 // A deleted line can still update line match state for later commands
360 if (!line) {
361 logrus = logrus->next;
362 continue;
363 }
364
365 // Process command
366
367 if (c=='a' || c=='r') {
368 struct append *a = xzalloc(sizeof(struct append));
369 a->str = logrus->arg1+(char *)logrus;
370 a->file = c=='r';
371 dlist_add_nomalloc((void *)&append, (void *)a);
372 } else if (c=='b' || c=='t' || c=='T') {
373 int t = tea;
374
375 if (c != 'b') tea = 0;
376 if (c=='b' || t^(c=='T')) {
377 if (!logrus->arg1) break;
378 str = logrus->arg1+(char *)logrus;
379 for (logrus = (void *)TT.pattern; logrus; logrus = logrus->next)
380 if (logrus->c == ':' && !strcmp(logrus->arg1+(char *)logrus, str))
381 break;
382 if (!logrus) error_exit("no :%s", str);
383 }
384 } else if (c=='c') {
385 str = logrus->arg1+(char *)logrus;
386 if (!logrus->hit) emit(str, strlen(str), 1);
387 free(line);
388 line = 0;
389 continue;
390 } else if (c=='d') {
391 free(line);
392 line = 0;
393 continue;
394 } else if (c=='D') {
395 // Delete up to \n or end of buffer
396 str = line;
397 while ((str-line)<len) if (*(str++) == '\n') break;
398 len -= str - line;
399 memmove(line, str, len);
400
401 // if "delete" blanks line, disable further processing
402 // otherwise trim and restart script
403 if (!len) {
404 free(line);
405 line = 0;
406 } else {
407 line[len] = 0;
408 logrus = (void *)TT.pattern;
409 }
410 continue;
411 } else if (c=='g') {
412 free(line);
413 line = xstrdup(TT.remember);
414 len = TT.rememberlen;
415 } else if (c=='G') {
416 line = xrealloc(line, len+TT.rememberlen+2);
417 line[len++] = '\n';
418 memcpy(line+len, TT.remember, TT.rememberlen);
419 line[len += TT.rememberlen] = 0;
420 } else if (c=='h') {
421 free(TT.remember);
422 TT.remember = xstrdup(line);
423 TT.rememberlen = len;
424 } else if (c=='H') {
425 TT.remember = xrealloc(TT.remember, TT.rememberlen+len+2);
426 TT.remember[TT.rememberlen++] = '\n';
427 memcpy(TT.remember+TT.rememberlen, line, len);
428 TT.remember[TT.rememberlen += len] = 0;
429 } else if (c=='i') {
430 str = logrus->arg1+(char *)logrus;
431 emit(str, strlen(str), 1);
432 } else if (c=='l') {
433 int i, x, off;
434
435 if (!TT.xx) {
436 terminal_size(&TT.xx, 0);
437 if (!TT.xx) TT.xx = 80;
438 if (TT.xx > sizeof(toybuf)-10) TT.xx = sizeof(toybuf)-10;
439 if (TT.xx > 4) TT.xx -= 4;
440 }
441
442 for (i = off = 0; i<len; i++) {
443 if (off >= TT.xx) {
444 toybuf[off++] = '\\';
445 emit(toybuf, off, 1);
446 off = 0;
447 }
448 x = stridx("\\\a\b\f\r\t\v", line[i]);
449 if (x != -1) {
450 toybuf[off++] = '\\';
451 toybuf[off++] = "\\abfrtv"[x];
452 } else if (line[i] >= ' ') toybuf[off++] = line[i];
453 else off += sprintf(toybuf+off, "\\%03o", line[i]);
454 }
455 toybuf[off++] = '$';
456 emit(toybuf, off, 1);
457 } else if (c=='n') {
458 TT.restart = logrus->next+1;
459
460 break;
461 } else if (c=='N') {
462 // Can't just grab next line because we could have multiple N and
463 // we need to actually read ahead to get N;$p EOF detection right.
464 if (pline) {
465 TT.restart = logrus->next+1;
466 extend_string(&line, TT.nextline, len, -TT.nextlen);
467 free(TT.nextline);
468 TT.nextline = line;
469 TT.nextlen += len + 1;
470 line = 0;
471 }
472
473 // Pending append goes out right after N
474 goto done;
475 } else if (c=='p' || c=='P') {
476 char *l = (c=='P') ? strchr(line, '\n') : 0;
477
478 if (emit(line, l ? l-line : len, eol)) break;
479 } else if (c=='q') {
480 if (pline) *pline = (void *)1;
481 free(TT.nextline);
482 TT.nextline = 0;
483 TT.nextlen = 0;
484
485 break;
486 } else if (c=='s') {
487 char *rline = line, *new = logrus->arg2 + (char *)logrus, *swap, *rswap;
488 regmatch_t *match = (void *)toybuf;
489 regex_t *reg = get_regex(logrus, logrus->arg1);
490 int mflags = 0, count = 0, zmatch = 1, rlen = len, mlen, off, newlen;
491
492 // Find match in remaining line (up to remaining len)
493 while (!ghostwheel(reg, rline, rlen, 10, match, mflags)) {
494 mflags = REG_NOTBOL;
495
496 // Zero length matches don't count immediately after a previous match
497 mlen = match[0].rm_eo-match[0].rm_so;
498 if (!mlen && !zmatch) {
499 if (!rlen--) break;
500 rline++;
501 zmatch++;
502 continue;
503 } else zmatch = 0;
504
505 // If we're replacing only a specific match, skip if this isn't it
506 off = logrus->sflags>>3;
507 if (off && off != ++count) {
508 rline += match[0].rm_eo;
509 rlen -= match[0].rm_eo;
510
511 continue;
512 }
513 // The fact getline() can allocate unbounded amounts of memory is
514 // a bigger issue, but while we're here check for integer overflow
515 if (match[0].rm_eo > INT_MAX) perror_exit(0);
516
517 // newlen = strlen(new) but with \1 and & and printf escapes
518 for (off = newlen = 0; new[off]; off++) {
519 int cc = -1;
520
521 if (new[off] == '&') cc = 0;
522 else if (new[off] == '\\') cc = new[++off] - '0';
523 if (cc < 0 || cc > 9) {
524 newlen++;
525 continue;
526 }
527 newlen += match[cc].rm_eo-match[cc].rm_so;
528 }
529
530 // Allocate new size, copy start/end around match. (Can't extend in
531 // place because backrefs may refer to text after it's overwritten.)
532 len += newlen-mlen;
533 swap = xmalloc(len+1);
534 rswap = swap+(rline-line)+match[0].rm_so;
535 memcpy(swap, line, (rline-line)+match[0].rm_so);
536 memcpy(rswap+newlen, rline+match[0].rm_eo, (rlen -= match[0].rm_eo)+1);
537
538 // copy in new replacement text
539 for (off = mlen = 0; new[off]; off++) {
540 int cc = 0, ll;
541
542 if (new[off] == '\\') {
543 cc = new[++off] - '0';
544 if (cc<0 || cc>9) {
545 if (!(rswap[mlen++] = unescape(new[off])))
546 rswap[mlen-1] = new[off];
547
548 continue;
549 } else if (match[cc].rm_so == -1) error_exit("no s//\\%d/", cc);
550 } else if (new[off] != '&') {
551 rswap[mlen++] = new[off];
552
553 continue;
554 }
555
556 ll = match[cc].rm_eo-match[cc].rm_so;
557 memcpy(rswap+mlen, rline+match[cc].rm_so, ll);
558 mlen += ll;
559 }
560
561 rline = rswap+newlen;
562 free(line);
563 line = swap;
564
565 // Stop after first substitution unless we have flag g
566 if (!(logrus->sflags & 2)) break;
567 }
568
569 if (mflags) {
570 // flag p
571 if (logrus->sflags & 4) emit(line, len, eol);
572
573 tea = 1;
574 if (logrus->w) goto writenow;
575 }
576 } else if (c=='w') {
577 int fd, noeol;
578 char *name;
579
580 writenow:
581 // Swap out emit() context
582 fd = TT.fdout;
583 noeol = TT.noeol;
584
585 // We save filehandle and newline status before filename
586 name = logrus->w + (char *)logrus;
587 memcpy(&TT.fdout, name, 4);
588 name += 4;
589 TT.noeol = *(name++);
590
591 // write, then save/restore context
592 if (emit(line, len, eol))
593 perror_exit("w '%s'", logrus->arg1+(char *)logrus);
594 *(--name) = TT.noeol;
595 TT.noeol = noeol;
596 TT.fdout = fd;
597 } else if (c=='x') {
598 long swap = TT.rememberlen;
599
600 str = TT.remember;
601 TT.remember = line;
602 line = str;
603 TT.rememberlen = len;
604 len = swap;
605 } else if (c=='y') {
606 char *from, *to = (char *)logrus;
607 int i, j;
608
609 from = to+logrus->arg1;
610 to += logrus->arg2;
611
612 for (i = 0; i < len; i++) {
613 j = stridx(from, line[i]);
614 if (j != -1) line[i] = to[j];
615 }
616 } else if (c=='=') {
617 sprintf(toybuf, "%ld", TT.count);
618 emit(toybuf, strlen(toybuf), 1);
619 }
620
621 logrus = logrus->next;
622 }
623
624 if (line && !(toys.optflags & FLAG_n)) emit(line, len, eol);
625
626 done:
627 free(line);
628
629 if (dlist_terminate(append)) while (append) {
630 struct append *a = append->next;
631
632 if (append->file) {
633 int fd = open(append->str, O_RDONLY);
634
635 // Force newline if noeol pending
636 if (fd != -1) {
637 if (TT.noeol) xwrite(TT.fdout, "\n", 1);
638 TT.noeol = 0;
639 xsendfile(fd, TT.fdout);
640 close(fd);
641 }
642 } else emit(append->str, strlen(append->str), 1);
643 free(append);
644 append = a;
645 }
646 }
647
// Genericish function, can probably get moved to lib.c

// Iterate over lines in file, calling function. Function can write 0 to
// the line pointer if they want to keep it, or 1 to terminate processing,
// otherwise line is freed. Passed file descriptor is closed at the end.
// (fd 0 reads from stdin, which is left open. name is not used here.)
static void do_lines(int fd, char *name, void (*call)(char **pline, long len))
{
  FILE *fp = fd ? xfdopen(fd, "r") : stdin;

  for (;;) {
    char *line = 0;
    size_t size = 0;
    ssize_t len = getline(&line, &size, fp);

    if (len <= 0) {
      // getline() can leave an allocated buffer behind even when it fails
      free(line);
      break;
    }

    call(&line, len);
    // (void *)1 is the callback's "stop reading" marker, not a pointer
    if (line == (void *)1) break;
    free(line);
  }

  if (fd) fclose(fp);
}
671
672 // Callback called on each input file
do_sed(int fd,char * name)673 static void do_sed(int fd, char *name)
674 {
675 int i = toys.optflags & FLAG_i;
676 char *tmp;
677
678 if (i) {
679 struct step *primal;
680
681 if (!fd && !strcmp(name, "-")) {
682 error_msg("-i on stdin");
683 return;
684 }
685 TT.fdout = copy_tempfile(fd, name, &tmp);
686 TT.count = 0;
687 for (primal = (void *)TT.pattern; primal; primal = primal->next)
688 primal->hit = 0;
689 }
690 do_lines(fd, name, walk_pattern);
691 if (i) {
692 walk_pattern(0, 0);
693 replace_tempfile(-1, TT.fdout, &tmp);
694 TT.fdout = 1;
695 TT.nextline = 0;
696 TT.nextlen = TT.noeol = 0;
697 }
698 }
699
// Copy chunk of string between two delimiters, converting printf escapes.
// returns processed copy of string (0 if error), *pstr advances to next
// unused char. if delim (or *delim) is 0 uses/saves starting char as delimiter
// (skipping one leading backslash). The delimiter can't be NUL, backslash,
// or newline. Delimiters inside regex [ranges] are ignored.
// Caller frees the returned string.
static char *unescape_delimited_string(char **pstr, char *delim)
{
  char *to, *from, *out, mode = 0, d;

  from = *pstr;
  if (!delim || !*delim) {
    if (!(d = *(from++))) return 0;
    if (d == '\\') d = *(from++);
    // Newline can't delimit: it marks line continuation in s///
    if (!d || d == '\\' || d == '\n') return 0;
    if (delim) *delim = d;
  } else d = *delim;
  // Output never gets longer than the input
  to = out = xmalloc(strlen(*pstr)+1);

  while (mode || *from != d) {
    if (!*from) goto fail;

    // delimiter in regex character range doesn't count
    if (!mode && *from == '[') {
      mode = '[';
      if (from[1]=='-' || from[1]==']') *(to++) = *(from++);
    } else if (mode && *from == ']') mode = 0;
    // Length 1 range (X-X with same X) is "undefined" and makes regcomp err,
    // but the perl build does it, so we need to filter it out.
    else if (mode && *from == '-' && from[-1] == from[1]) {
      from+=2;
      continue;
    } else if (*from == '\\') {
      if (!from[1]) goto fail;

      // Check escaped end delimiter before printf style escapes.
      if (from[1] == d) from++;
      else if (from[1]=='\\') *(to++) = *(from++);
      else {
        char c = unescape(from[1]);

        if (c) {
          *(to++) = c;
          from+=2;
          continue;
        } else if (!mode) *(to++) = *(from++);
      }
    }
    *(to++) = *(from++);
  }
  *to = 0;
  *pstr = from+1;

  return out;

fail:
  // Don't leak the partial copy on error
  free(out);

  return 0;
}
753
754 // Translate primal pattern into walkable form.
jewel_of_judgement(char ** pline,long len)755 static void jewel_of_judgement(char **pline, long len)
756 {
757 struct step *corwin = (void *)TT.pattern;
758 char *line, *reg, c, *errstart;
759 int i;
760
761 line = errstart = pline ? *pline : "";
762 if (len && line[len-1]=='\n') line[--len] = 0;
763
764 // Append additional line to pattern argument string?
765 // We temporarily repurpose "hit" to indicate line continuations
766 if (corwin && corwin->prev->hit) {
767 if (!*pline) error_exit("unfinished %c", corwin->prev->c);;
768 // Remove half-finished entry from list so remalloc() doesn't confuse it
769 TT.pattern = TT.pattern->prev;
770 corwin = dlist_pop(&TT.pattern);
771 c = corwin->c;
772 reg = (char *)corwin;
773 reg += corwin->arg1 + strlen(reg + corwin->arg1);
774
775 // Resume parsing for 'a' or 's' command
776 if (corwin->hit < 256) goto resume_s;
777 else goto resume_a;
778 }
779
780 // Loop through commands in line
781
782 corwin = 0;
783 for (;;) {
784 if (corwin) dlist_add_nomalloc(&TT.pattern, (void *)corwin);
785
786 for (;;) {
787 while (isspace(*line) || *line == ';') line++;
788 if (*line == '#') while (*line && *line != '\n') line++;
789 else break;
790 }
791 if (!*line) return;
792
793 errstart = line;
794 memset(toybuf, 0, sizeof(struct step));
795 corwin = (void *)toybuf;
796 reg = toybuf + sizeof(struct step);
797
798 // Parse address range (if any)
799 for (i = 0; i < 2; i++) {
800 if (*line == ',') line++;
801 else if (i) break;
802
803 if (isdigit(*line)) corwin->lmatch[i] = strtol(line, &line, 0);
804 else if (*line == '$') {
805 corwin->lmatch[i] = -1;
806 line++;
807 } else if (*line == '/' || *line == '\\') {
808 char *s = line;
809
810 if (!(s = unescape_delimited_string(&line, 0))) goto brand;
811 if (!*s) corwin->rmatch[i] = 0;
812 else {
813 xregcomp((void *)reg, s, (toys.optflags & FLAG_r)*REG_EXTENDED);
814 corwin->rmatch[i] = reg-toybuf;
815 reg += sizeof(regex_t);
816 }
817 free(s);
818 } else break;
819 }
820
821 while (isspace(*line)) line++;
822 if (!*line) break;
823
824 while (*line == '!') {
825 corwin->not = 1;
826 line++;
827 }
828 while (isspace(*line)) line++;
829
830 c = corwin->c = *(line++);
831 if (strchr("}:", c) && i) break;
832 if (strchr("aiqr=", c) && i>1) break;
833
834 // Add step to pattern
835 corwin = xmemdup(toybuf, reg-toybuf);
836 reg = (reg-toybuf) + (char *)corwin;
837
838 // Parse arguments by command type
839 if (c == '{') TT.nextlen++;
840 else if (c == '}') {
841 if (!TT.nextlen--) break;
842 } else if (c == 's') {
843 char *fiona, delim = 0;
844
845 // s/pattern/replacement/flags
846
847 // line continuations use arg1, so we fill out arg2 first (since the
848 // regex part can't be multiple lines) and swap them back later.
849
850 // get pattern (just record, we parse it later)
851 corwin->arg2 = reg - (char *)corwin;
852 if (!(TT.remember = unescape_delimited_string(&line, &delim)))
853 goto brand;
854
855 reg += sizeof(regex_t);
856 corwin->arg1 = reg-(char *)corwin;
857 corwin->hit = delim;
858 resume_s:
859 // get replacement - don't replace escapes because \1 and \& need
860 // processing later, after we replace \\ with \ we can't tell \\1 from \1
861 fiona = line;
862 while (*fiona != corwin->hit) {
863 if (!*fiona) goto brand;
864 if (*fiona++ == '\\') {
865 if (!*fiona || *fiona == '\n') {
866 fiona[-1] = '\n';
867 break;
868 }
869 fiona++;
870 }
871 }
872
873 reg = extend_string((void *)&corwin, line, reg-(char *)corwin,fiona-line);
874 line = fiona;
875 // line continuation? (note: '\n' can't be a valid delim).
876 if (*line == corwin->hit) corwin->hit = 0;
877 else {
878 if (!*line) continue;
879 reg--;
880 line++;
881 goto resume_s;
882 }
883
884 // swap arg1/arg2 so they're back in order arguments occur.
885 i = corwin->arg1;
886 corwin->arg1 = corwin->arg2;
887 corwin->arg2 = i;
888
889 // get flags
890 for (line++; *line; line++) {
891 long l;
892
893 if (isspace(*line) && *line != '\n') continue;
894
895 if (0 <= (l = stridx("igp", *line))) corwin->sflags |= 1<<l;
896 else if (!(corwin->sflags>>3) && 0<(l = strtol(line, &line, 10))) {
897 corwin->sflags |= l << 3;
898 line--;
899 } else break;
900 }
901
902 // We deferred actually parsing the regex until we had the s///i flag
903 // allocating the space was done by extend_string() above
904 if (!*TT.remember) corwin->arg1 = 0;
905 else xregcomp((void *)(corwin->arg1 + (char *)corwin), TT.remember,
906 ((toys.optflags & FLAG_r)*REG_EXTENDED)|((corwin->sflags&1)*REG_ICASE));
907 free(TT.remember);
908 TT.remember = 0;
909 if (*line == 'w') {
910 line++;
911 goto writenow;
912 }
913 } else if (c == 'w') {
914 int fd, delim;
915 char *cc;
916
917 // Since s/// uses arg1 and arg2, and w needs a persistent filehandle and
918 // eol status, and to retain the filename for error messages, we'd need
919 // to go up to arg5 just for this. Compromise: dynamically allocate the
920 // filehandle and eol status.
921
922 writenow:
923 while (isspace(*line)) line++;
924 if (!*line) goto brand;
925 for (cc = line; *cc; cc++) if (*cc == '\\' && cc[1] == ';') break;
926 delim = *cc;
927 *cc = 0;
928 fd = xcreate(line, O_WRONLY|O_CREAT|O_TRUNC, 0644);
929 *cc = delim;
930
931 corwin->w = reg - (char *)corwin;
932 corwin = xrealloc(corwin, corwin->w+(cc-line)+6);
933 reg = corwin->w + (char *)corwin;
934
935 memcpy(reg, &fd, 4);
936 reg += 4;
937 *(reg++) = 0;
938 memcpy(reg, line, delim);
939 reg += delim;
940 *(reg++) = 0;
941
942 line = cc;
943 if (delim) line += 2;
944 } else if (c == 'y') {
945 char *s, delim = 0;
946 int len;
947
948 if (!(s = unescape_delimited_string(&line, &delim))) goto brand;
949 corwin->arg1 = reg-(char *)corwin;
950 len = strlen(s);
951 reg = extend_string((void *)&corwin, s, reg-(char *)corwin, len);
952 free(s);
953 corwin->arg2 = reg-(char *)corwin;
954 if (!(s = unescape_delimited_string(&line, &delim))) goto brand;
955 if (len != strlen(s)) goto brand;
956 reg = extend_string((void *)&corwin, s, reg-(char*)corwin, len);
957 free(s);
958 } else if (strchr("abcirtTw:", c)) {
959 int end;
960
961 while (isspace(*line) && *line != '\n') line++;
962
963 // Resume logic differs from 's' case because we don't add a newline
964 // unless it's after something, so we add it on return instead.
965 resume_a:
966 corwin->hit = 0;
967
968 // Trim whitespace from "b ;" and ": blah " but only first space in "w x "
969 if (!(end = strcspn(line, strchr("btT:", c) ? "; \t\r\n\v\f" : "\n"))) {
970 if (strchr("btT", c)) continue;
971 else if (!corwin->arg1) break;
972 }
973
974 // Extend allocation to include new string. We use offsets instead of
975 // pointers so realloc() moving stuff doesn't break things. Ok to write
976 // \n over NUL terminator because call to extend_string() adds it back.
977 if (!corwin->arg1) corwin->arg1 = reg - (char*)corwin;
978 else if (*(corwin->arg1+(char *)corwin)) *(reg++) = '\n';
979 reg = extend_string((void *)&corwin, line, reg - (char *)corwin, end);
980
981 // Recopy data to remove escape sequences and handle line continuation.
982 if (strchr("aci", c)) {
983 reg -= end+1;
984 for (i = end; i; i--) {
985 if ((*reg++ = *line++)=='\\') {
986
987 // escape at end of line: resume if -e escaped literal newline,
988 // else request callback and resume with next line
989 if (!--i) {
990 *--reg = 0;
991 if (*line) {
992 line++;
993 goto resume_a;
994 }
995 corwin->hit = 256;
996 break;
997 }
998 if (!(reg[-1] = unescape(*line))) reg[-1] = *line;
999 line++;
1000 }
1001 }
1002 *reg = 0;
1003 } else line += end;
1004
1005 // Commands that take no arguments
1006 } else if (!strchr("{dDgGhHlnNpPqx=", c)) break;
1007 }
1008
1009 brand:
1010 // Reminisce about chestnut trees.
1011 error_exit("bad pattern '%s'@%ld (%c)", errstart, line-errstart+1L, *line);
1012 }
1013
sed_main(void)1014 void sed_main(void)
1015 {
1016 struct arg_list *dworkin;
1017 char **args = toys.optargs;
1018
1019 // Lie to autoconf when it asks stupid questions, so configure regexes
1020 // that look for "GNU sed version %f" greater than some old buggy number
1021 // don't fail us for not matching their narrow expectations.
1022 if (toys.optflags & FLAG_version) {
1023 xprintf("This is not GNU sed version 9.0\n");
1024 return;
1025 }
1026
1027 // Need a pattern. If no unicorns about, fight serpent and take its eye.
1028 if (!TT.e && !TT.f) {
1029 if (!*toys.optargs) error_exit("no pattern");
1030 (TT.e = xzalloc(sizeof(struct arg_list)))->arg = *(args++);
1031 }
1032
1033 // Option parsing infrastructure can't interlace "-e blah -f blah -e blah"
1034 // so handle all -e, then all -f. (At least the behavior's consistent.)
1035
1036 for (dworkin = TT.e; dworkin; dworkin = dworkin->next)
1037 jewel_of_judgement(&dworkin->arg, strlen(dworkin->arg));
1038 for (dworkin = TT.f; dworkin; dworkin = dworkin->next)
1039 do_lines(xopen(dworkin->arg, O_RDONLY), dworkin->arg, jewel_of_judgement);
1040 jewel_of_judgement(0, 0);
1041 dlist_terminate(TT.pattern);
1042 if (TT.nextlen) error_exit("no }");
1043
1044 TT.fdout = 1;
1045 TT.remember = xstrdup("");
1046
1047 // Inflict pattern upon input files
1048 loopfiles_rw(args, O_RDONLY, 0, 0, do_sed);
1049
1050 if (!(toys.optflags & FLAG_i)) walk_pattern(0, 0);
1051
1052 // todo: need to close fd when done for TOYBOX_FREE?
1053 }
1054