1 /* sed.c - stream editor. Thing that does s/// and other stuff.
2 *
3 * Copyright 2014 Rob Landley <rob@landley.net>
4 *
5 * See http://pubs.opengroup.org/onlinepubs/9699919799/utilities/sed.html
6 *
7 * TODO: lines > 2G could wrap signed int length counters. Not just getline()
8 * but N and s///
9 * TODO: make y// handle unicode
10 * TODO: handle error return from emit(), error_msg/exit consistently
11 * What's the right thing to do for -i when write fails? Skip to next?
12
13 USE_SED(NEWTOY(sed, "(help)(version)e*f*inEr[+Er]", TOYFLAG_USR|TOYFLAG_BIN|TOYFLAG_LOCALE|TOYFLAG_NOHELP))
14
15 config SED
16 bool "sed"
17 default y
18 help
19 usage: sed [-inrE] [-e SCRIPT]...|SCRIPT [-f SCRIPT_FILE]... [FILE...]
20
21 Stream editor. Apply one or more editing SCRIPTs to each line of input
22 (from FILE or stdin) producing output (by default to stdout).
23
24 -e add SCRIPT to list
25 -f add contents of SCRIPT_FILE to list
26 -i Edit each file in place.
27 -n No default output. (Use the p command to output matched lines.)
28 -r Use extended regular expression syntax.
29 -E Alias for -r.
30 -s Treat input files separately (implied by -i)
31
32 A SCRIPT is a series of one or more COMMANDs separated by newlines or
33 semicolons. All -e SCRIPTs are concatenated together as if separated
34 by newlines, followed by all lines from -f SCRIPT_FILEs, in order.
35 If no -e or -f SCRIPTs are specified, the first argument is the SCRIPT.
36
37 Each COMMAND may be preceded by an address which limits the command to
38 apply only to the specified line(s). Commands without an address apply to
39 every line. Addresses are of the form:
40
41 [ADDRESS[,ADDRESS]]COMMAND
42
43 The ADDRESS may be a decimal line number (starting at 1), a /regular
44 expression/ within a pair of forward slashes, or the character "$" which
45 matches the last line of input. (In -s or -i mode this matches the last
46 line of each file, otherwise just the last line of the last file.) A single
47 address matches one line, a pair of comma separated addresses match
48 everything from the first address to the second address (inclusive). If
49 both addresses are regular expressions, more than one range of lines in
50 each file can match.
51
52 REGULAR EXPRESSIONS in sed are started and ended by the same character
53 (traditionally / but anything except a backslash or a newline works).
54 Backslashes may be used to escape the delimiter if it occurs in the
55 regex, and for the usual printf escapes (\abcefnrtv and octal, hex,
56 and unicode). An empty regex repeats the previous one. ADDRESS regexes
57 (above) require the first delimiter to be escaped with a backslash when
58 it isn't a forward slash (to distinguish it from the COMMANDs below).
59
60 Sed mostly operates on individual lines one at a time. It reads each line,
61 processes it, and either writes it to the output or discards it before
62 reading the next line. Sed can remember one additional line in a separate
63 buffer (using the h, H, g, G, and x commands), and can read the next line
64 of input early (using the n and N command), but other than that command
65 scripts operate on individual lines of text.
66
67 Each COMMAND starts with a single character. The following commands take
68 no arguments:
69
70 { Start a new command block, continuing until a corresponding "}".
71 Command blocks may nest. If the block has an address, commands within
72 the block are only run for lines within the block's address range.
73
74 } End command block (this command cannot have an address)
75
76 d Delete this line and move on to the next one
77 (ignores remaining COMMANDs)
78
79 D Delete one line of input and restart command SCRIPT (same as "d"
80 unless you've glued lines together with "N" or similar)
81
82 g Get remembered line (overwriting current line)
83
84 G Get remembered line (appending to current line)
85
86 h Remember this line (overwriting remembered line)
87
88 H Remember this line (appending to remembered line, if any)
89
90 l Print line, escaping \abfrtv (but not newline), octal escaping other
91 nonprintable characters, wrapping lines to terminal width with a
92 backslash, and appending $ to actual end of line.
93
94 n Print default output and read next line, replacing current line
95 (If no next line available, quit processing script)
96
97 N Append next line of input to this line, separated by a newline
98 (This advances the line counter for address matching and "=", if no
99 next line available quit processing script without default output)
100
101 p Print this line
102
103 P Print this line up to first newline (from "N")
104
105 q Quit (print default output, no more commands processed or lines read)
106
107 x Exchange this line with remembered line (overwrite in both directions)
108
109 = Print the current line number (followed by a newline)
110
111 The following commands (may) take an argument. The "text" arguments (to
112 the "a", "c", and "i" commands) may end with an unescaped "\" to append
113 the next line (for which leading whitespace is not skipped), and also
114 treat ";" as a literal character (use "\;" instead).
115
116 a [text] Append text to output before attempting to read next line
117
118 b [label] Branch, jumps to :label (or with no label, to end of SCRIPT)
119
120 c [text] Delete line, output text at end of matching address range
121 (ignores remaining COMMANDs)
122
123 i [text] Print text
124
125 r [file] Append contents of file to output before attempting to read
126 next line.
127
128 s/S/R/F Search for regex S, replace matched text with R using flags F.
129 The first character after the "s" (anything but newline or
130 backslash) is the delimiter, escape with \ to use normally.
131
132 The replacement text may contain "&" to substitute the matched
133 text (escape it with backslash for a literal &), or \1 through
134 \9 to substitute a parenthetical subexpression in the regex.
135 You can also use the normal backslash escapes such as \n and
136 a backslash at the end of the line appends the next line.
137
138 The flags are:
139
140 [0-9] A number, substitute only that occurrence of pattern
141 g Global, substitute all occurrences of pattern
142 i Ignore case when matching
143 p Print the line if match was found and replaced
144 w [file] Write (append) line to file if match replaced
145
146 t [label] Test, jump to :label only if an "s" command found a match in
147 this line since last test (replacing with same text counts)
148
149 T [label] Test false, jump only if "s" hasn't found a match.
150
151 w [file] Write (append) line to file
152
153 y/old/new/ Change each character in 'old' to corresponding character
154 in 'new' (with standard backslash escapes, delimiter can be
155 any repeated character except \ or \n)
156
157 : [label] Labeled target for jump commands
158
159 # Comment, ignore rest of this line of SCRIPT
160
161 Deviations from posix: allow extended regular expressions with -r,
162 editing in place with -i, separate with -s, printf escapes in text, line
163 continuations, semicolons after all commands, 2-address anywhere an
164 address is allowed, "T" command, multiline continuations for [abc],
165 \; to end [abc] argument before end of line.
166 */
167
168 #define FOR_sed
169 #include "toys.h"
170
171 GLOBALS(
172 struct arg_list *f;
173 struct arg_list *e;
174
175 // processed pattern list
176 struct double_list *pattern;
177
178 char *nextline, *remember;
179 void *restart, *lastregex;
180 long nextlen, rememberlen, count;
181 int fdout, noeol;
182 unsigned xx;
183 )
184
// One parsed sed command, kept in a linked list. Arguments live in the same
// allocation as the struct and are located by byte offset from the start of
// the struct (offset+(char *)struct) rather than by pointer, because the
// allocation gets resized while multiline input is appended to it and
// pointers into it would each need fixing up afterwards.

struct sedcmd {
  struct sedcmd *next;
  struct sedcmd *prev;

  // Address range: index 0 is the start of the range, index 1 the end
  long lmatch[2];   // line number to match (-1 for "$", 0 for none)
  int rmatch[2];    // offset of compiled regex to match (/abc/,/def/p)

  int arg1;         // offset of first argument
  int arg2;         // offset of second argument
  int w;            // offset of saved output file state for w and s///w
  unsigned not;     // address was followed by "!"
  unsigned hit;     // range currently active (continuation state when parsing)
  unsigned sflags;  // s///flag bits: i=1, g=2, p=4, occurrence number above
  char c;           // command character
};
201
202 // Write out line with potential embedded NUL, handling eol/noeol
emit(char * line,long len,int eol)203 static int emit(char *line, long len, int eol)
204 {
205 int l, old = line[len];
206
207 if (TT.noeol && !writeall(TT.fdout, "\n", 1)) return 1;
208 TT.noeol = !eol;
209 if (eol) line[len++] = '\n';
210 if (!len) return 0;
211 l = writeall(TT.fdout, line, len);
212 if (eol) line[len-1] = old;
213 if (l != len) {
214 perror_msg("short write");
215
216 return 1;
217 }
218
219 return 0;
220 }
221
// Resize the allocation at *old, which currently holds oldlen bytes of data,
// and append a copy of new to it followed by a NUL terminator. Passing a
// negative newlen inserts a newline before the appended text, which is then
// -newlen bytes long. *old is updated to the resized allocation.
//
// Returns a pointer to the byte just after the NUL terminator.
static char *extend_string(char **old, char *new, int oldlen, int newlen)
{
  int separator = 0;
  char *buf, *tail;

  if (newlen < 0) {
    separator = 1;
    newlen = -newlen;
  }

  buf = xrealloc(*old, oldlen+newlen+separator+1);
  *old = buf;

  tail = buf+oldlen;
  if (separator) *tail++ = '\n';
  memcpy(tail, new, newlen);
  tail += newlen;
  *tail = 0;

  return tail+1;
}
237
238 // An empty regex repeats the previous one
get_regex(void * trump,int offset)239 static void *get_regex(void *trump, int offset)
240 {
241 if (!offset) {
242 if (!TT.lastregex) error_exit("no previous regex");
243 return TT.lastregex;
244 }
245
246 return TT.lastregex = offset+(char *)trump;
247 }
248
249 // Apply pattern to line from input file
process_line(char ** pline,long plen)250 static void process_line(char **pline, long plen)
251 {
252 struct append {
253 struct append *next, *prev;
254 int file;
255 char *str;
256 } *append = 0;
257 char *line = TT.nextline;
258 long len = TT.nextlen;
259 struct sedcmd *command;
260 int eol = 0, tea = 0;
261
262 // Grab next line for deferred processing (EOF detection: we get a NULL
263 // pline at EOF to flush last line). Note that only end of _last_ input
264 // file matches $ (unless we're doing -i).
265 TT.nextline = 0;
266 TT.nextlen = 0;
267 if (pline) {
268 TT.nextline = *pline;
269 TT.nextlen = plen;
270 *pline = 0;
271 }
272
273 if (!line || !len) return;
274 if (line[len-1] == '\n') line[--len] = eol++;
275 TT.count++;
276
277 // The restart-1 is because we added one to make sure it wasn't NULL,
278 // otherwise N as last command would restart script
279 command = TT.restart ? ((struct sedcmd *)TT.restart)-1 : (void *)TT.pattern;
280 TT.restart = 0;
281
282 while (command) {
283 char *str, c = command->c;
284
285 // Have we got a line or regex matching range for this rule?
286 if (*command->lmatch || *command->rmatch) {
287 int miss = 0;
288 long lm;
289
290 // In a match that might end?
291 if (command->hit) {
292 if (!(lm = command->lmatch[1])) {
293 if (!command->rmatch[1]) command->hit = 0;
294 else {
295 void *rm = get_regex(command, command->rmatch[1]);
296
297 // regex match end includes matching line, so defer deactivation
298 if (line && !regexec0(rm, line, len, 0, 0, 0)) miss = 1;
299 }
300 } else if (lm > 0 && lm < TT.count) command->hit = 0;
301
302 // Start a new match?
303 } else {
304 if (!(lm = *command->lmatch)) {
305 void *rm = get_regex(command, *command->rmatch);
306
307 if (line && !regexec0(rm, line, len, 0, 0, 0)) command->hit++;
308 } else if (lm == TT.count || (lm == -1 && !pline)) command->hit++;
309
310 if (!command->lmatch[1] && !command->rmatch[1]) miss = 1;
311 }
312
313 // Didn't match?
314 lm = !(command->hit ^ command->not);
315
316 // Deferred disable from regex end match
317 if (miss || command->lmatch[1] == TT.count) command->hit = 0;
318
319 if (lm) {
320 // Handle skipping curly bracket command group
321 if (c == '{') {
322 int curly = 1;
323
324 while (curly) {
325 command = command->next;
326 if (command->c == '{') curly++;
327 if (command->c == '}') curly--;
328 }
329 }
330 command = command->next;
331 continue;
332 }
333 }
334
335 // A deleted line can still update line match state for later commands
336 if (!line) {
337 command = command->next;
338 continue;
339 }
340
341 // Process command
342
343 if (c=='a' || c=='r') {
344 struct append *a = xzalloc(sizeof(struct append));
345 if (command->arg1) a->str = command->arg1+(char *)command;
346 a->file = c=='r';
347 dlist_add_nomalloc((void *)&append, (void *)a);
348 } else if (c=='b' || c=='t' || c=='T') {
349 int t = tea;
350
351 if (c != 'b') tea = 0;
352 if (c=='b' || t^(c=='T')) {
353 if (!command->arg1) break;
354 str = command->arg1+(char *)command;
355 for (command = (void *)TT.pattern; command; command = command->next)
356 if (command->c == ':' && !strcmp(command->arg1+(char *)command, str))
357 break;
358 if (!command) error_exit("no :%s", str);
359 }
360 } else if (c=='c') {
361 str = command->arg1+(char *)command;
362 if (!command->hit) emit(str, strlen(str), 1);
363 free(line);
364 line = 0;
365 continue;
366 } else if (c=='d') {
367 free(line);
368 line = 0;
369 continue;
370 } else if (c=='D') {
371 // Delete up to \n or end of buffer
372 str = line;
373 while ((str-line)<len) if (*(str++) == '\n') break;
374 len -= str - line;
375 memmove(line, str, len);
376
377 // if "delete" blanks line, disable further processing
378 // otherwise trim and restart script
379 if (!len) {
380 free(line);
381 line = 0;
382 } else {
383 line[len] = 0;
384 command = (void *)TT.pattern;
385 }
386 continue;
387 } else if (c=='g') {
388 free(line);
389 line = xstrdup(TT.remember);
390 len = TT.rememberlen;
391 } else if (c=='G') {
392 line = xrealloc(line, len+TT.rememberlen+2);
393 line[len++] = '\n';
394 memcpy(line+len, TT.remember, TT.rememberlen);
395 line[len += TT.rememberlen] = 0;
396 } else if (c=='h') {
397 free(TT.remember);
398 TT.remember = xstrdup(line);
399 TT.rememberlen = len;
400 } else if (c=='H') {
401 TT.remember = xrealloc(TT.remember, TT.rememberlen+len+2);
402 TT.remember[TT.rememberlen++] = '\n';
403 memcpy(TT.remember+TT.rememberlen, line, len);
404 TT.remember[TT.rememberlen += len] = 0;
405 } else if (c=='i') {
406 str = command->arg1+(char *)command;
407 emit(str, strlen(str), 1);
408 } else if (c=='l') {
409 int i, x, off;
410
411 if (!TT.xx) {
412 terminal_size(&TT.xx, 0);
413 if (!TT.xx) TT.xx = 80;
414 if (TT.xx > sizeof(toybuf)-10) TT.xx = sizeof(toybuf)-10;
415 if (TT.xx > 4) TT.xx -= 4;
416 }
417
418 for (i = off = 0; i<len; i++) {
419 if (off >= TT.xx) {
420 toybuf[off++] = '\\';
421 emit(toybuf, off, 1);
422 off = 0;
423 }
424 x = stridx("\\\a\b\f\r\t\v", line[i]);
425 if (x != -1) {
426 toybuf[off++] = '\\';
427 toybuf[off++] = "\\abfrtv"[x];
428 } else if (line[i] >= ' ') toybuf[off++] = line[i];
429 else off += sprintf(toybuf+off, "\\%03o", line[i]);
430 }
431 toybuf[off++] = '$';
432 emit(toybuf, off, 1);
433 } else if (c=='n') {
434 TT.restart = command->next+1;
435
436 break;
437 } else if (c=='N') {
438 // Can't just grab next line because we could have multiple N and
439 // we need to actually read ahead to get N;$p EOF detection right.
440 if (pline) {
441 TT.restart = command->next+1;
442 extend_string(&line, TT.nextline, len, -TT.nextlen);
443 free(TT.nextline);
444 TT.nextline = line;
445 TT.nextlen += len + 1;
446 line = 0;
447 }
448
449 // Pending append goes out right after N
450 goto done;
451 } else if (c=='p' || c=='P') {
452 char *l = (c=='P') ? strchr(line, '\n') : 0;
453
454 if (emit(line, l ? l-line : len, eol)) break;
455 } else if (c=='q') {
456 if (pline) *pline = (void *)1;
457 free(TT.nextline);
458 TT.nextline = 0;
459 TT.nextlen = 0;
460
461 break;
462 } else if (c=='s') {
463 char *rline = line, *new = command->arg2 + (char *)command, *swap, *rswap;
464 regmatch_t *match = (void *)toybuf;
465 regex_t *reg = get_regex(command, command->arg1);
466 int mflags = 0, count = 0, zmatch = 1, rlen = len, mlen, off, newlen;
467
468 // Find match in remaining line (up to remaining len)
469 while (!regexec0(reg, rline, rlen, 10, match, mflags)) {
470 mflags = REG_NOTBOL;
471
472 // Zero length matches don't count immediately after a previous match
473 mlen = match[0].rm_eo-match[0].rm_so;
474 if (!mlen && !zmatch) {
475 if (!rlen--) break;
476 rline++;
477 zmatch++;
478 continue;
479 } else zmatch = 0;
480
481 // If we're replacing only a specific match, skip if this isn't it
482 off = command->sflags>>3;
483 if (off && off != ++count) {
484 rline += match[0].rm_eo;
485 rlen -= match[0].rm_eo;
486
487 continue;
488 }
489 // The fact getline() can allocate unbounded amounts of memory is
490 // a bigger issue, but while we're here check for integer overflow
491 if (match[0].rm_eo > INT_MAX) perror_exit(0);
492
493 // newlen = strlen(new) but with \1 and & and printf escapes
494 for (off = newlen = 0; new[off]; off++) {
495 int cc = -1;
496
497 if (new[off] == '&') cc = 0;
498 else if (new[off] == '\\') cc = new[++off] - '0';
499 if (cc < 0 || cc > 9) {
500 newlen++;
501 continue;
502 }
503 newlen += match[cc].rm_eo-match[cc].rm_so;
504 }
505
506 // Allocate new size, copy start/end around match. (Can't extend in
507 // place because backrefs may refer to text after it's overwritten.)
508 len += newlen-mlen;
509 swap = xmalloc(len+1);
510 rswap = swap+(rline-line)+match[0].rm_so;
511 memcpy(swap, line, (rline-line)+match[0].rm_so);
512 memcpy(rswap+newlen, rline+match[0].rm_eo, (rlen -= match[0].rm_eo)+1);
513
514 // copy in new replacement text
515 for (off = mlen = 0; new[off]; off++) {
516 int cc = 0, ll;
517
518 if (new[off] == '\\') {
519 cc = new[++off] - '0';
520 if (cc<0 || cc>9) {
521 if (!(rswap[mlen++] = unescape(new[off])))
522 rswap[mlen-1] = new[off];
523
524 continue;
525 } else if (match[cc].rm_so == -1) error_exit("no s//\\%d/", cc);
526 } else if (new[off] != '&') {
527 rswap[mlen++] = new[off];
528
529 continue;
530 }
531
532 ll = match[cc].rm_eo-match[cc].rm_so;
533 memcpy(rswap+mlen, rline+match[cc].rm_so, ll);
534 mlen += ll;
535 }
536
537 rline = rswap+newlen;
538 free(line);
539 line = swap;
540
541 // Stop after first substitution unless we have flag g
542 if (!(command->sflags & 2)) break;
543 }
544
545 if (mflags) {
546 // flag p
547 if (command->sflags & 4) emit(line, len, eol);
548
549 tea = 1;
550 if (command->w) goto writenow;
551 }
552 } else if (c=='w') {
553 int fd, noeol;
554 char *name;
555
556 writenow:
557 // Swap out emit() context
558 fd = TT.fdout;
559 noeol = TT.noeol;
560
561 // We save filehandle and newline status before filename
562 name = command->w + (char *)command;
563 memcpy(&TT.fdout, name, 4);
564 name += 4;
565 TT.noeol = *(name++);
566
567 // write, then save/restore context
568 if (emit(line, len, eol))
569 perror_exit("w '%s'", command->arg1+(char *)command);
570 *(--name) = TT.noeol;
571 TT.noeol = noeol;
572 TT.fdout = fd;
573 } else if (c=='x') {
574 long swap = TT.rememberlen;
575
576 str = TT.remember;
577 TT.remember = line;
578 line = str;
579 TT.rememberlen = len;
580 len = swap;
581 } else if (c=='y') {
582 char *from, *to = (char *)command;
583 int i, j;
584
585 from = to+command->arg1;
586 to += command->arg2;
587
588 for (i = 0; i < len; i++) {
589 j = stridx(from, line[i]);
590 if (j != -1) line[i] = to[j];
591 }
592 } else if (c=='=') {
593 sprintf(toybuf, "%ld", TT.count);
594 emit(toybuf, strlen(toybuf), 1);
595 }
596
597 command = command->next;
598 }
599
600 if (line && !(toys.optflags & FLAG_n)) emit(line, len, eol);
601
602 done:
603 if (dlist_terminate(append)) while (append) {
604 struct append *a = append->next;
605
606 if (append->file) {
607 int fd = open(append->str, O_RDONLY);
608
609 // Force newline if noeol pending
610 if (fd != -1) {
611 if (TT.noeol) xwrite(TT.fdout, "\n", 1);
612 TT.noeol = 0;
613 xsendfile(fd, TT.fdout);
614 close(fd);
615 }
616 } else if (append->str) emit(append->str, strlen(append->str), 1);
617 else emit(line, 0, 0);
618 free(append);
619 append = a;
620 }
621 free(line);
622 }
623
624 // Callback called on each input file
do_sed(int fd,char * name)625 static void do_sed(int fd, char *name)
626 {
627 int i = toys.optflags & FLAG_i;
628 char *tmp;
629
630 if (i) {
631 struct sedcmd *command;
632
633 if (!fd && !strcmp(name, "-")) {
634 error_msg("-i on stdin");
635 return;
636 }
637 TT.fdout = copy_tempfile(fd, name, &tmp);
638 TT.count = 0;
639 for (command = (void *)TT.pattern; command; command = command->next)
640 command->hit = 0;
641 }
642 do_lines(fd, process_line);
643 if (i) {
644 process_line(0, 0);
645 replace_tempfile(-1, TT.fdout, &tmp);
646 TT.fdout = 1;
647 TT.nextline = 0;
648 TT.nextlen = TT.noeol = 0;
649 }
650 }
651
// Copy chunk of string between two delimiters, converting printf escapes.
//
// pstr:  in: start of the text to scan. out: advanced to the first character
//        after the closing delimiter (left untouched on error).
// delim: if NULL or pointing at a 0 byte, the first character of the input
//        (or the character after a leading backslash) is taken as the
//        delimiter and, when delim isn't NULL, saved through it. Otherwise
//        *delim is the closing delimiter to scan for and the input is assumed
//        to start right after the opening one.
//
// A delimiter inside a regex [bracket expression] doesn't end the string.
//
// Returns a newly allocated processed copy of the string that the caller
// must free, or 0 on error (no delimiter, backslash or NUL as delimiter,
// missing closing delimiter, trailing backslash).
static char *unescape_delimited_string(char **pstr, char *delim)
{
  char *to, *from, mode = 0, d;

  // Grab leading delimiter (if necessary), allocate space for new string
  from = *pstr;
  if (!delim || !*delim) {
    if (!(d = *(from++))) return 0;
    if (d == '\\') d = *(from++);
    if (!d || d == '\\') return 0;
    if (delim) *delim = d;
  } else d = *delim;

  // The output is never longer than the input. From here on delim is reused
  // to remember the start of the output buffer.
  to = delim = xmalloc(strlen(*pstr)+1);

  // mode is 0 outside brackets, ']' inside [...], and one of ".=:" inside a
  // [.x.] [=x=] or [:x:] element nested within brackets
  while (mode || *from != d) {
    if (!*from) goto fail;

    // delimiter in regex character range doesn't count
    if (*from == '[') {
      if (!mode) {
        mode = ']';
        if (from[1]=='-' || from[1]==']') *(to++) = *(from++);
      } else if (mode == ']' && strchr(".=:", from[1])) {
        *(to++) = *(from++);
        mode = *from;
      }
    } else if (*from == mode) {
      if (mode == ']') mode = 0;
      else {
        *(to++) = *(from++);
        mode = ']';
      }
    // Length 1 range (X-X with same X) is "undefined" and makes regcomp err,
    // but the perl build does it, so we need to filter it out.
    } else if (mode && *from == '-' && from[-1] == from[1]) {
      from+=2;
      continue;
    } else if (*from == '\\') {
      if (!from[1]) goto fail;

      // Check escaped end delimiter before printf style escapes.
      if (from[1] == d) from++;
      else if (from[1]=='\\') *(to++) = *(from++);
      else {
        char c = unescape(from[1]);

        if (c) {
          *(to++) = c;
          from+=2;
          continue;
        } else if (!mode) *(to++) = *(from++);
      }
    }
    *(to++) = *(from++);
  }
  *to = 0;
  *pstr = from+1;

  return delim;

fail:
  // Don't leak the partially filled output buffer
  free(delim);

  return 0;
}
716
717 // Translate pattern strings into command structures. Each command structure
718 // is a single allocation (which requires some math and remalloc at times).
parse_pattern(char ** pline,long len)719 static void parse_pattern(char **pline, long len)
720 {
721 struct sedcmd *command = (void *)TT.pattern;
722 char *line, *reg, c, *errstart;
723 int i;
724
725 line = errstart = pline ? *pline : "";
726 if (len && line[len-1]=='\n') line[--len] = 0;
727
728 // Append this line to previous multiline command? (hit indicates type.)
729 // During parsing "hit" stores data about line continuations, but in
730 // process_line() it means the match range attached to this command
731 // is active, so processing the continuation must zero it again.
732 if (command && command->prev->hit) {
733 // Remove half-finished entry from list so remalloc() doesn't confuse it
734 TT.pattern = TT.pattern->prev;
735 command = dlist_pop(&TT.pattern);
736 c = command->c;
737 reg = (char *)command;
738 reg += command->arg1 + strlen(reg + command->arg1);
739
740 // Resume parsing for 'a' or 's' command. (Only two that can do this.)
741 // TODO: using 256 to indicate 'a' means our s/// delimiter can't be
742 // a unicode character.
743 if (command->hit < 256) goto resume_s;
744 else goto resume_a;
745 }
746
747 // Loop through commands in this line.
748
749 command = 0;
750 for (;;) {
751 if (command) dlist_add_nomalloc(&TT.pattern, (void *)command);
752
753 // If there's no more data on this line, return.
754 for (;;) {
755 while (isspace(*line) || *line == ';') line++;
756 if (*line == '#') while (*line && *line != '\n') line++;
757 else break;
758 }
759 if (!*line) return;
760
761 // We start by writing data into toybuf. Later we'll allocate the
762 // ex
763
764 errstart = line;
765 memset(toybuf, 0, sizeof(struct sedcmd));
766 command = (void *)toybuf;
767 reg = toybuf + sizeof(struct sedcmd);
768
769 // Parse address range (if any)
770 for (i = 0; i < 2; i++) {
771 if (*line == ',') line++;
772 else if (i) break;
773
774 if (isdigit(*line)) command->lmatch[i] = strtol(line, &line, 0);
775 else if (*line == '$') {
776 command->lmatch[i] = -1;
777 line++;
778 } else if (*line == '/' || *line == '\\') {
779 char *s = line;
780
781 if (!(s = unescape_delimited_string(&line, 0))) goto error;
782 if (!*s) command->rmatch[i] = 0;
783 else {
784 xregcomp((void *)reg, s, (toys.optflags & FLAG_r)*REG_EXTENDED);
785 command->rmatch[i] = reg-toybuf;
786 reg += sizeof(regex_t);
787 }
788 free(s);
789 } else break;
790 }
791
792 while (isspace(*line)) line++;
793 if (!*line) break;
794
795 while (*line == '!') {
796 command->not = 1;
797 line++;
798 }
799 while (isspace(*line)) line++;
800
801 c = command->c = *(line++);
802 if (strchr("}:", c) && i) break;
803 if (strchr("aiqr=", c) && i>1) break;
804
805 // Add step to pattern
806 command = xmemdup(toybuf, reg-toybuf);
807 reg = (reg-toybuf) + (char *)command;
808
809 // Parse arguments by command type
810 if (c == '{') TT.nextlen++;
811 else if (c == '}') {
812 if (!TT.nextlen--) break;
813 } else if (c == 's') {
814 char *end, delim = 0;
815
816 // s/pattern/replacement/flags
817
818 // line continuations use arg1 (back at the start of the function),
819 // so let's fill out arg2 first (since the regex part can't be multiple
820 // lines) and swap them back later.
821
822 // get pattern (just record, we parse it later)
823 command->arg2 = reg - (char *)command;
824 if (!(TT.remember = unescape_delimited_string(&line, &delim)))
825 goto error;
826
827 reg += sizeof(regex_t);
828 command->arg1 = reg-(char *)command;
829 command->hit = delim;
830 resume_s:
831 // get replacement - don't replace escapes yet because \1 and \& need
832 // processing later, after we replace \\ with \ we can't tell \\1 from \1
833 end = line;
834 while (*end != command->hit) {
835 if (!*end) goto error;
836 if (*end++ == '\\') {
837 if (!*end || *end == '\n') {
838 end[-1] = '\n';
839 break;
840 }
841 end++;
842 }
843 }
844
845 reg = extend_string((void *)&command, line, reg-(char *)command,end-line);
846 line = end;
847 // line continuation? (note: '\n' can't be a valid delim).
848 if (*line == command->hit) command->hit = 0;
849 else {
850 if (!*line) continue;
851 reg--;
852 line++;
853 goto resume_s;
854 }
855
856 // swap arg1/arg2 so they're back in order arguments occur.
857 i = command->arg1;
858 command->arg1 = command->arg2;
859 command->arg2 = i;
860
861 // get flags
862 for (line++; *line; line++) {
863 long l;
864
865 if (isspace(*line) && *line != '\n') continue;
866
867 if (0 <= (l = stridx("igp", *line))) command->sflags |= 1<<l;
868 else if (!(command->sflags>>3) && 0<(l = strtol(line, &line, 10))) {
869 command->sflags |= l << 3;
870 line--;
871 } else break;
872 }
873
874 // We deferred actually parsing the regex until we had the s///i flag
875 // allocating the space was done by extend_string() above
876 if (!*TT.remember) command->arg1 = 0;
877 else xregcomp((void *)(command->arg1 + (char *)command), TT.remember,
878 ((toys.optflags & FLAG_r)*REG_EXTENDED)|((command->sflags&1)*REG_ICASE));
879 free(TT.remember);
880 TT.remember = 0;
881 if (*line == 'w') {
882 line++;
883 goto writenow;
884 }
885 } else if (c == 'w') {
886 int fd, delim;
887 char *cc;
888
889 // Since s/// uses arg1 and arg2, and w needs a persistent filehandle and
890 // eol status, and to retain the filename for error messages, we'd need
891 // to go up to arg5 just for this. Compromise: dynamically allocate the
892 // filehandle and eol status.
893
894 writenow:
895 while (isspace(*line)) line++;
896 if (!*line) goto error;
897 for (cc = line; *cc; cc++) if (*cc == '\\' && cc[1] == ';') break;
898 delim = *cc;
899 *cc = 0;
900 fd = xcreate(line, O_WRONLY|O_CREAT|O_TRUNC, 0644);
901 *cc = delim;
902
903 command->w = reg - (char *)command;
904 command = xrealloc(command, command->w+(cc-line)+6);
905 reg = command->w + (char *)command;
906
907 memcpy(reg, &fd, 4);
908 reg += 4;
909 *(reg++) = 0;
910 memcpy(reg, line, delim);
911 reg += delim;
912 *(reg++) = 0;
913
914 line = cc;
915 if (delim) line += 2;
916 } else if (c == 'y') {
917 char *s, delim = 0;
918 int len;
919
920 if (!(s = unescape_delimited_string(&line, &delim))) goto error;
921 command->arg1 = reg-(char *)command;
922 len = strlen(s);
923 reg = extend_string((void *)&command, s, reg-(char *)command, len);
924 free(s);
925 command->arg2 = reg-(char *)command;
926 if (!(s = unescape_delimited_string(&line, &delim))) goto error;
927 if (len != strlen(s)) goto error;
928 reg = extend_string((void *)&command, s, reg-(char*)command, len);
929 free(s);
930 } else if (strchr("abcirtTw:", c)) {
931 int end;
932
933 // trim leading spaces
934 while (isspace(*line) && *line != '\n') line++;
935
936 // Resume logic differs from 's' case because we don't add a newline
937 // unless it's after something, so we add it on return instead.
938 resume_a:
939 command->hit = 0;
940
941 // btT: end with space or semicolon, aicrw continue to newline.
942 if (!(end = strcspn(line, strchr(":btT", c) ? "; \t\r\n\v\f" : "\n"))) {
943 // Argument's optional for btT
944 if (strchr("btT", c)) continue;
945 else if (!command->arg1) break;
946 }
947
948 // Extend allocation to include new string. We use offsets instead of
949 // pointers so realloc() moving stuff doesn't break things. Ok to write
950 // \n over NUL terminator because call to extend_string() adds it back.
951 if (!command->arg1) command->arg1 = reg - (char*)command;
952 else if (*(command->arg1+(char *)command)) *(reg++) = '\n';
953 else if (!pline) {
954 command->arg1 = 0;
955 continue;
956 }
957 reg = extend_string((void *)&command, line, reg - (char *)command, end);
958
959 // Recopy data to remove escape sequences and handle line continuation.
960 if (strchr("aci", c)) {
961 reg -= end+1;
962 for (i = end; i; i--) {
963 if ((*reg++ = *line++)=='\\') {
964
965 // escape at end of line: resume if -e escaped literal newline,
966 // else request callback and resume with next line
967 if (!--i) {
968 *--reg = 0;
969 if (*line) {
970 line++;
971 goto resume_a;
972 }
973 command->hit = 256;
974 break;
975 }
976 if (!(reg[-1] = unescape(*line))) reg[-1] = *line;
977 line++;
978 }
979 }
980 *reg = 0;
981 } else line += end;
982
983 // Commands that take no arguments
984 } else if (!strchr("{dDgGhHlnNpPqx=", c)) break;
985 }
986
987 error:
988 error_exit("bad pattern '%s'@%ld (%c)", errstart, line-errstart+1L, *line);
989 }
990
sed_main(void)991 void sed_main(void)
992 {
993 struct arg_list *al;
994 char **args = toys.optargs;
995
996 // Lie to autoconf when it asks stupid questions, so configure regexes
997 // that look for "GNU sed version %f" greater than some old buggy number
998 // don't fail us for not matching their narrow expectations.
999 if (toys.optflags & FLAG_version) {
1000 xprintf("This is not GNU sed version 9.0\n");
1001 return;
1002 }
1003
1004 // Handling our own --version means we handle our own --help too.
1005 if (toys.optflags&FLAG_help) help_exit(0);
1006
1007 // Parse pattern into commands.
1008
1009 // If no -e or -f, first argument is the pattern.
1010 if (!TT.e && !TT.f) {
1011 if (!*toys.optargs) error_exit("no pattern");
1012 (TT.e = xzalloc(sizeof(struct arg_list)))->arg = *(args++);
1013 }
1014
1015 // Option parsing infrastructure can't interlace "-e blah -f blah -e blah"
1016 // so handle all -e, then all -f. (At least the behavior's consistent.)
1017
1018 for (al = TT.e; al; al = al->next) parse_pattern(&al->arg, strlen(al->arg));
1019 for (al = TT.f; al; al = al->next) do_lines(xopenro(al->arg), parse_pattern);
1020 parse_pattern(0, 0);
1021 dlist_terminate(TT.pattern);
1022 if (TT.nextlen) error_exit("no }");
1023
1024 TT.fdout = 1;
1025 TT.remember = xstrdup("");
1026
1027 // Inflict pattern upon input files. Long version because !O_CLOEXEC
1028 loopfiles_rw(args, O_RDONLY|WARN_ONLY, 0, do_sed);
1029
1030 if (!(toys.optflags & FLAG_i)) process_line(0, 0);
1031
1032 // todo: need to close fd when done for TOYBOX_FREE?
1033 }
1034