1 /* sed.c - stream editor. Thing that does s/// and other stuff.
2 *
3 * Copyright 2014 Rob Landley <rob@landley.net>
4 *
5 * See http://pubs.opengroup.org/onlinepubs/9699919799/utilities/sed.html
6 *
7 * TODO: lines > 2G could wrap signed int length counters. Not just getline()
8 * but N and s///
9 * TODO: make y// handle unicode, unicode delimiters
10 * TODO: handle error return from emit(), error_msg/exit consistently
11 * What's the right thing to do for -i when write fails? Skip to next?
12 * test '//q' with no previous regex, also repeat previous regex?
13
14 USE_SED(NEWTOY(sed, "(help)(version)e*f*i:;nErz(null-data)[+Er]", TOYFLAG_BIN|TOYFLAG_LOCALE|TOYFLAG_NOHELP))
15
16 config SED
17 bool "sed"
18 default y
19 help
20 usage: sed [-inrzE] [-e SCRIPT]...|SCRIPT [-f SCRIPT_FILE]... [FILE...]
21
22 Stream editor. Apply one or more editing SCRIPTs to each line of input
23 (from FILE or stdin) producing output (by default to stdout).
24
25 -e Add SCRIPT to list
26 -f Add contents of SCRIPT_FILE to list
27 -i Edit each file in place (-iEXT keeps backup file with extension EXT)
28 -n No default output (use the p command to output matched lines)
29 -r Use extended regular expression syntax
30 -E POSIX alias for -r
31 -s Treat input files separately (implied by -i)
32 -z Use \0 rather than \n as the input line separator
33
34 A SCRIPT is a series of one or more COMMANDs separated by newlines or
35 semicolons. All -e SCRIPTs are concatenated together as if separated
36 by newlines, followed by all lines from -f SCRIPT_FILEs, in order.
37 If no -e or -f SCRIPTs are specified, the first argument is the SCRIPT.
38
39 Each COMMAND may be preceded by an address which limits the command to
40 apply only to the specified line(s). Commands without an address apply to
41 every line. Addresses are of the form:
42
43 [ADDRESS[,ADDRESS]]COMMAND
44
45 The ADDRESS may be a decimal line number (starting at 1), a /regular
46 expression/ within a pair of forward slashes, or the character "$" which
47 matches the last line of input. (In -s or -i mode this matches the last
48 line of each file, otherwise just the last line of the last file.) A single
49 address matches one line, a pair of comma separated addresses match
50 everything from the first address to the second address (inclusive). If
51 both addresses are regular expressions, more than one range of lines in
52 each file can match.
53
54 REGULAR EXPRESSIONS in sed are started and ended by the same character
55 (traditionally / but anything except a backslash or a newline works).
56 Backslashes may be used to escape the delimiter if it occurs in the
57 regex, and for the usual printf escapes (\abcefnrtv and octal, hex,
58 and unicode). An empty regex repeats the previous one. ADDRESS regexes
59 (above) require the first delimiter to be escaped with a backslash when
60 it isn't a forward slash (to distinguish it from the COMMANDs below).
61
62 Sed mostly operates on individual lines one at a time. It reads each line,
63 processes it, and either writes it to the output or discards it before
64 reading the next line. Sed can remember one additional line in a separate
65 buffer (using the h, H, g, G, and x commands), and can read the next line
66 of input early (using the n and N command), but other than that command
67 scripts operate on individual lines of text.
68
69 Each COMMAND starts with a single character. The following commands take
70 no arguments:
71
72 { Start a new command block, continuing until a corresponding "}".
73 Command blocks may nest. If the block has an address, commands within
74 the block are only run for lines within the block's address range.
75
76 } End command block (this command cannot have an address)
77
78 d Delete this line and move on to the next one
79 (ignores remaining COMMANDs)
80
81 D Delete one line of input and restart command SCRIPT (same as "d"
82 unless you've glued lines together with "N" or similar)
83
84 g Get remembered line (overwriting current line)
85
86 G Get remembered line (appending to current line)
87
88 h Remember this line (overwriting remembered line)
89
90 H Remember this line (appending to remembered line, if any)
91
92 l Print line, escaping \abfrtv (but not newline), octal escaping other
93 nonprintable characters, wrapping lines to terminal width with a
94 backslash, and appending $ to actual end of line.
95
96 n Print default output and read next line, replacing current line
97 (If no next line available, quit processing script)
98
99 N Append next line of input to this line, separated by a newline
100 (This advances the line counter for address matching and "=", if no
101 next line available quit processing script without default output)
102
103 p Print this line
104
105 P Print this line up to first newline (from "N")
106
107 q Quit (print default output, no more commands processed or lines read)
108
109 x Exchange this line with remembered line (overwrite in both directions)
110
111 = Print the current line number (followed by a newline)
112
    The following commands (may) take an argument. The "text" arguments (to
    the "a", "i", and "c" commands) may end with an unescaped "\" to append
    the next line (for which leading whitespace is not skipped), and also
    treat ";" as a literal character (use "\;" instead).
117
118 a [text] Append text to output before attempting to read next line
119
120 b [label] Branch, jumps to :label (or with no label, to end of SCRIPT)
121
122 c [text] Delete line, output text at end of matching address range
123 (ignores remaining COMMANDs)
124
125 i [text] Print text
126
127 r [file] Append contents of file to output before attempting to read
128 next line.
129
130 s/S/R/F Search for regex S, replace matched text with R using flags F.
131 The first character after the "s" (anything but newline or
132 backslash) is the delimiter, escape with \ to use normally.
133
134 The replacement text may contain "&" to substitute the matched
135 text (escape it with backslash for a literal &), or \1 through
136 \9 to substitute a parenthetical subexpression in the regex.
137 You can also use the normal backslash escapes such as \n and
138 a backslash at the end of the line appends the next line.
139
140 The flags are:
141
142 [0-9] A number, substitute only that occurrence of pattern
143 g Global, substitute all occurrences of pattern
144 i Ignore case when matching
145 p Print the line if match was found and replaced
146 w [file] Write (append) line to file if match replaced
147
148 t [label] Test, jump to :label only if an "s" command found a match in
149 this line since last test (replacing with same text counts)
150
151 T [label] Test false, jump only if "s" hasn't found a match.
152
153 w [file] Write (append) line to file
154
155 y/old/new/ Change each character in 'old' to corresponding character
156 in 'new' (with standard backslash escapes, delimiter can be
157 any repeated character except \ or \n)
158
159 : [label] Labeled target for jump commands
160
161 # Comment, ignore rest of this line of SCRIPT
162
163 Deviations from POSIX: allow extended regular expressions with -r,
164 editing in place with -i, separate with -s, NUL-separated input with -z,
165 printf escapes in text, line continuations, semicolons after all commands,
166 2-address anywhere an address is allowed, "T" command, multiline
167 continuations for [abc], \; to end [abc] argument before end of line.
168 */
169
170 #define FOR_sed
171 #include "toys.h"
172
173 GLOBALS(
174 char *i;
175 struct arg_list *f, *e;
176
177 // processed pattern list
178 struct double_list *pattern;
179
180 char *nextline, *remember;
181 void *restart, *lastregex;
182 long nextlen, rememberlen, count;
183 int fdout, noeol;
184 unsigned xx;
185 char delim;
186 )
187
// Doubly linked list of parsed sed commands, one allocation per command.
// Argument data lives in the same allocation after the struct and is found
// through byte offsets (offset+(char *)struct) rather than pointers: the
// allocation gets realloc()ed to grow for multiline input, and offsets stay
// valid across a move where pointers would each need adjusting.

struct sedcmd {
  struct sedcmd *next;
  struct sedcmd *prev;

  // Begin [0] and end [1] of each match
  long lmatch[2];  // line number of match (-1 for "$")
  int rmatch[2];   // offset of regex struct for prefix matches (/abc/,/def/p)

  int arg1;        // offset of first argument
  int arg2;        // offset of second argument
  int w;           // offset of output file record (w command, s///w)
  unsigned not;    // set when the address is followed by '!'
  unsigned hit;    // address range active (continuation marker when parsing)
  unsigned sflags; // s///flag bits: i=1, g=2, p=4, occurrence number <<3
  char c;          // action
};
204
205 // Write out line with potential embedded NUL, handling eol/noeol
emit(char * line,long len,int eol)206 static int emit(char *line, long len, int eol)
207 {
208 int l, old = line[len];
209
210 if (TT.noeol && !writeall(TT.fdout, "\n", 1)) return 1;
211 TT.noeol = !eol;
212 if (eol) line[len++] = '\n';
213 if (!len) return 0;
214 l = writeall(TT.fdout, line, len);
215 if (eol) line[len-1] = old;
216 if (l != len) {
217 perror_msg("short write");
218
219 return 1;
220 }
221
222 return 0;
223 }
224
// Grow the allocation at *old (currently holding oldlen bytes) and append
// the first |newlen| bytes of new, followed by a NUL terminator. A negative
// newlen means "insert a newline before the appended data". *old is updated
// to the (possibly moved) allocation. Returns a pointer to the byte just
// past the NUL terminator.

static char *extend_string(char **old, char *new, int oldlen, int newlen)
{
  char *buf, *tail;
  int sep = 0;

  if (newlen < 0) {
    sep = 1;
    newlen = -newlen;
  }

  buf = xrealloc(*old, oldlen+newlen+sep+1);
  *old = buf;

  tail = buf+oldlen;
  if (sep) *tail++ = '\n';
  memcpy(tail, new, newlen);
  tail += newlen;
  *tail = 0;

  return tail+1;
}
240
241 // An empty regex repeats the previous one
get_regex(void * trump,int offset)242 static void *get_regex(void *trump, int offset)
243 {
244 if (!offset) {
245 if (!TT.lastregex) error_exit("no previous regex");
246 return TT.lastregex;
247 }
248
249 return TT.lastregex = offset+(char *)trump;
250 }
251
252 // Apply pattern to line from input file
sed_line(char ** pline,long plen)253 static void sed_line(char **pline, long plen)
254 {
255 struct append {
256 struct append *next, *prev;
257 int file;
258 char *str;
259 } *append = 0;
260 char *line = TT.nextline;
261 long len = TT.nextlen;
262 struct sedcmd *command;
263 int eol = 0, tea = 0;
264
265 // Ignore EOF for all files before last unless -i
266 if (!pline && !FLAG(i)) return;
267
268 // Grab next line for deferred processing (EOF detection: we get a NULL
269 // pline at EOF to flush last line). Note that only end of _last_ input
270 // file matches $ (unless we're doing -i).
271 TT.nextline = 0;
272 TT.nextlen = 0;
273 if (pline) {
274 TT.nextline = *pline;
275 TT.nextlen = plen;
276 *pline = 0;
277 }
278
279 if (!line || !len) return;
280 if (line[len-1] == '\n') line[--len] = eol++;
281 TT.count++;
282
283 // The restart-1 is because we added one to make sure it wasn't NULL,
284 // otherwise N as last command would restart script
285 command = TT.restart ? ((struct sedcmd *)TT.restart)-1 : (void *)TT.pattern;
286 TT.restart = 0;
287
288 while (command) {
289 char *str, c = command->c;
290
291 // Have we got a line or regex matching range for this rule?
292 if (*command->lmatch || *command->rmatch) {
293 int miss = 0;
294 long lm;
295
296 // In a match that might end?
297 if (command->hit) {
298 if (!(lm = command->lmatch[1])) {
299 if (!command->rmatch[1]) command->hit = 0;
300 else {
301 void *rm = get_regex(command, command->rmatch[1]);
302
303 // regex match end includes matching line, so defer deactivation
304 if (line && !regexec0(rm, line, len, 0, 0, 0)) miss = 1;
305 }
306 } else if (lm > 0 && lm < TT.count) command->hit = 0;
307
308 // Start a new match?
309 } else {
310 if (!(lm = *command->lmatch)) {
311 void *rm = get_regex(command, *command->rmatch);
312
313 if (line && !regexec0(rm, line, len, 0, 0, 0)) command->hit++;
314 } else if (lm == TT.count || (lm == -1 && !pline)) command->hit++;
315
316 if (!command->lmatch[1] && !command->rmatch[1]) miss = 1;
317 }
318
319 // Didn't match?
320 lm = !(command->hit ^ command->not);
321
322 // Deferred disable from regex end match
323 if (miss || command->lmatch[1] == TT.count) command->hit = 0;
324
325 if (lm) {
326 // Handle skipping curly bracket command group
327 if (c == '{') {
328 int curly = 1;
329
330 while (curly) {
331 command = command->next;
332 if (command->c == '{') curly++;
333 if (command->c == '}') curly--;
334 }
335 }
336 command = command->next;
337 continue;
338 }
339 }
340
341 // A deleted line can still update line match state for later commands
342 if (!line) {
343 command = command->next;
344 continue;
345 }
346
347 // Process command
348
349 if (c=='a' || c=='r') {
350 struct append *a = xzalloc(sizeof(struct append));
351 if (command->arg1) a->str = command->arg1+(char *)command;
352 a->file = c=='r';
353 dlist_add_nomalloc((void *)&append, (void *)a);
354 } else if (c=='b' || c=='t' || c=='T') {
355 int t = tea;
356
357 if (c != 'b') tea = 0;
358 if (c=='b' || t^(c=='T')) {
359 if (!command->arg1) break;
360 str = command->arg1+(char *)command;
361 for (command = (void *)TT.pattern; command; command = command->next)
362 if (command->c == ':' && !strcmp(command->arg1+(char *)command, str))
363 break;
364 if (!command) error_exit("no :%s", str);
365 }
366 } else if (c=='c') {
367 str = command->arg1+(char *)command;
368 if (!command->hit) emit(str, strlen(str), 1);
369 free(line);
370 line = 0;
371 continue;
372 } else if (c=='d') {
373 free(line);
374 line = 0;
375 continue;
376 } else if (c=='D') {
377 // Delete up to \n or end of buffer
378 str = line;
379 while ((str-line)<len) if (*(str++) == '\n') break;
380 len -= str - line;
381 memmove(line, str, len);
382
383 // if "delete" blanks line, disable further processing
384 // otherwise trim and restart script
385 if (!len) {
386 free(line);
387 line = 0;
388 } else {
389 line[len] = 0;
390 command = (void *)TT.pattern;
391 }
392 continue;
393 } else if (c=='g') {
394 free(line);
395 line = xstrdup(TT.remember);
396 len = TT.rememberlen;
397 } else if (c=='G') {
398 line = xrealloc(line, len+TT.rememberlen+2);
399 line[len++] = '\n';
400 memcpy(line+len, TT.remember, TT.rememberlen);
401 line[len += TT.rememberlen] = 0;
402 } else if (c=='h') {
403 free(TT.remember);
404 TT.remember = xstrdup(line);
405 TT.rememberlen = len;
406 } else if (c=='H') {
407 TT.remember = xrealloc(TT.remember, TT.rememberlen+len+2);
408 TT.remember[TT.rememberlen++] = '\n';
409 memcpy(TT.remember+TT.rememberlen, line, len);
410 TT.remember[TT.rememberlen += len] = 0;
411 } else if (c=='i') {
412 str = command->arg1+(char *)command;
413 emit(str, strlen(str), 1);
414 } else if (c=='l') {
415 int i, x, off;
416
417 if (!TT.xx) {
418 terminal_size(&TT.xx, 0);
419 if (!TT.xx) TT.xx = 80;
420 if (TT.xx > sizeof(toybuf)-10) TT.xx = sizeof(toybuf)-10;
421 if (TT.xx > 4) TT.xx -= 4;
422 }
423
424 for (i = off = 0; i<len; i++) {
425 if (off >= TT.xx) {
426 toybuf[off++] = '\\';
427 emit(toybuf, off, 1);
428 off = 0;
429 }
430 x = stridx("\\\a\b\f\r\t\v", line[i]);
431 if (x != -1) {
432 toybuf[off++] = '\\';
433 toybuf[off++] = "\\abfrtv"[x];
434 } else if (line[i] >= ' ') toybuf[off++] = line[i];
435 else off += sprintf(toybuf+off, "\\%03o", line[i]);
436 }
437 toybuf[off++] = '$';
438 emit(toybuf, off, 1);
439 } else if (c=='n') {
440 TT.restart = command->next+1;
441
442 break;
443 } else if (c=='N') {
444 // Can't just grab next line because we could have multiple N and
445 // we need to actually read ahead to get N;$p EOF detection right.
446 if (pline) {
447 TT.restart = command->next+1;
448 extend_string(&line, TT.nextline, len, -TT.nextlen);
449 free(TT.nextline);
450 TT.nextline = line;
451 TT.nextlen += len + 1;
452 line = 0;
453 }
454
455 // Pending append goes out right after N
456 goto done;
457 } else if (c=='p' || c=='P') {
458 char *l = (c=='P') ? strchr(line, '\n') : 0;
459
460 if (emit(line, l ? l-line : len, eol)) break;
461 } else if (c=='q') {
462 if (pline) *pline = (void *)1;
463 free(TT.nextline);
464 TT.nextline = 0;
465 TT.nextlen = 0;
466
467 break;
468 } else if (c=='s') {
469 char *rline = line, *new = command->arg2 + (char *)command, *swap, *rswap;
470 regmatch_t *match = (void *)toybuf;
471 regex_t *reg = get_regex(command, command->arg1);
472 int mflags = 0, count = 0, zmatch = 1, rlen = len, mlen, off, newlen;
473
474 // Find match in remaining line (up to remaining len)
475 while (!regexec0(reg, rline, rlen, 10, match, mflags)) {
476 mflags = REG_NOTBOL;
477
478 // Zero length matches don't count immediately after a previous match
479 mlen = match[0].rm_eo-match[0].rm_so;
480 if (!mlen && !zmatch) {
481 if (!rlen--) break;
482 rline++;
483 zmatch++;
484 continue;
485 } else zmatch = 0;
486
487 // If we're replacing only a specific match, skip if this isn't it
488 off = command->sflags>>3;
489 if (off && off != ++count) {
490 rline += match[0].rm_eo;
491 rlen -= match[0].rm_eo;
492
493 continue;
494 }
495 // The fact getline() can allocate unbounded amounts of memory is
496 // a bigger issue, but while we're here check for integer overflow
497 if (match[0].rm_eo > INT_MAX) perror_exit(0);
498
499 // newlen = strlen(new) but with \1 and & and printf escapes
500 for (off = newlen = 0; new[off]; off++) {
501 int cc = -1;
502
503 if (new[off] == '&') cc = 0;
504 else if (new[off] == '\\') cc = new[++off] - '0';
505 if (cc < 0 || cc > 9) {
506 newlen++;
507 continue;
508 }
509 newlen += match[cc].rm_eo-match[cc].rm_so;
510 }
511
512 // Allocate new size, copy start/end around match. (Can't extend in
513 // place because backrefs may refer to text after it's overwritten.)
514 len += newlen-mlen;
515 swap = xmalloc(len+1);
516 rswap = swap+(rline-line)+match[0].rm_so;
517 memcpy(swap, line, (rline-line)+match[0].rm_so);
518 memcpy(rswap+newlen, rline+match[0].rm_eo, (rlen -= match[0].rm_eo)+1);
519
520 // copy in new replacement text
521 for (off = mlen = 0; new[off]; off++) {
522 int cc = 0, ll;
523
524 if (new[off] == '\\') {
525 cc = new[++off] - '0';
526 if (cc<0 || cc>9) {
527 if (!(rswap[mlen++] = unescape(new[off])))
528 rswap[mlen-1] = new[off];
529
530 continue;
531 } else if (cc > reg->re_nsub) error_exit("no s//\\%d/", cc);
532 } else if (new[off] != '&') {
533 rswap[mlen++] = new[off];
534
535 continue;
536 }
537
538 if (match[cc].rm_so == -1) ll = 0; // Empty match.
539 else {
540 ll = match[cc].rm_eo-match[cc].rm_so;
541 memcpy(rswap+mlen, rline+match[cc].rm_so, ll);
542 }
543 mlen += ll;
544 }
545
546 rline = rswap+newlen;
547 free(line);
548 line = swap;
549
550 // Stop after first substitution unless we have flag g
551 if (!(command->sflags & 2)) break;
552 }
553
554 if (mflags) {
555 // flag p
556 if (command->sflags & 4) emit(line, len, eol);
557
558 tea = 1;
559 if (command->w) goto writenow;
560 }
561 } else if (c=='w') {
562 int fd, noeol;
563 char *name;
564
565 writenow:
566 // Swap out emit() context
567 fd = TT.fdout;
568 noeol = TT.noeol;
569
570 // We save filehandle and newline status before filename
571 name = command->w + (char *)command;
572 memcpy(&TT.fdout, name, 4);
573 name += 4;
574 TT.noeol = *(name++);
575
576 // write, then save/restore context
577 if (emit(line, len, eol))
578 perror_exit("w '%s'", command->arg1+(char *)command);
579 *(--name) = TT.noeol;
580 TT.noeol = noeol;
581 TT.fdout = fd;
582 } else if (c=='x') {
583 long swap = TT.rememberlen;
584
585 str = TT.remember;
586 TT.remember = line;
587 line = str;
588 TT.rememberlen = len;
589 len = swap;
590 } else if (c=='y') {
591 char *from, *to = (char *)command;
592 int i, j;
593
594 from = to+command->arg1;
595 to += command->arg2;
596
597 for (i = 0; i < len; i++) {
598 j = stridx(from, line[i]);
599 if (j != -1) line[i] = to[j];
600 }
601 } else if (c=='=') {
602 sprintf(toybuf, "%ld", TT.count);
603 if (emit(toybuf, strlen(toybuf), 1)) break;
604 }
605
606 command = command->next;
607 }
608
609 if (line && !FLAG(n)) emit(line, len, eol);
610
611 done:
612 if (dlist_terminate(append)) while (append) {
613 struct append *a = append->next;
614
615 if (append->file) {
616 int fd = open(append->str, O_RDONLY);
617
618 // Force newline if noeol pending
619 if (fd != -1) {
620 if (TT.noeol) xwrite(TT.fdout, "\n", 1);
621 TT.noeol = 0;
622 xsendfile(fd, TT.fdout);
623 close(fd);
624 }
625 } else if (append->str) emit(append->str, strlen(append->str), 1);
626 else emit(line, 0, 0);
627 free(append);
628 append = a;
629 }
630 free(line);
631 }
632
633 // Callback called on each input file
do_sed_file(int fd,char * name)634 static void do_sed_file(int fd, char *name)
635 {
636 char *tmp;
637
638 if (FLAG(i)) {
639 struct sedcmd *command;
640
641 if (!fd) return error_msg("-i on stdin");
642 TT.fdout = copy_tempfile(fd, name, &tmp);
643 TT.count = 0;
644 for (command = (void *)TT.pattern; command; command = command->next)
645 command->hit = 0;
646 }
647 do_lines(fd, TT.delim, sed_line);
648 if (FLAG(i)) {
649 if (TT.i && *TT.i) {
650 char *s = xmprintf("%s%s", name, TT.i);
651
652 xrename(name, s);
653 free(s);
654 }
655 replace_tempfile(-1, TT.fdout, &tmp);
656 TT.fdout = 1;
657 TT.nextline = 0;
658 TT.nextlen = TT.noeol = 0;
659 }
660 }
661
// Copy chunk of string between two delimiters, converting printf escapes.
//
// pstr: *pstr is where parsing starts; on success it advances to the first
//   unused char (just past the closing delimiter). Unchanged on error.
// delim: if delim (or *delim) is 0 the starting char is used as delimiter
//   (a leading backslash is skipped first) and saved in *delim when delim
//   isn't NULL. Otherwise *delim is the closing delimiter to look for and
//   the string starts at *pstr.
//
// A delimiter inside a regex [range] doesn't end the string.
//
// Returns a newly allocated processed copy of the string (caller frees),
// or 0 on error.
static char *unescape_delimited_string(char **pstr, char *delim)
{
  char *to, *from, *copy, mode = 0, d;

  // Grab leading delimiter (if necessary), allocate space for new string
  from = *pstr;
  if (!delim || !*delim) {
    if (!(d = *(from++))) return 0;
    if (d == '\\') d = *(from++);
    if (!d || d == '\\') return 0;
    if (delim) *delim = d;
  } else d = *delim;
  to = copy = xmalloc(strlen(*pstr)+1);

  // mode is 0 outside brackets, ']' inside a [range], or the '.', '=' or ':'
  // that closes a [. .], [= =] or [: :] element inside a range.
  while (mode || *from != d) {
    if (!*from) goto fail;

    // delimiter in regex character range doesn't count
    if (*from == '[') {
      if (!mode) {
        mode = ']';
        if (from[1]=='-' || from[1]==']') *(to++) = *(from++);
      } else if (mode == ']' && strchr(".=:", from[1])) {
        *(to++) = *(from++);
        mode = *from;
      }
    } else if (*from == mode) {
      if (mode == ']') mode = 0;
      else {
        *(to++) = *(from++);
        mode = ']';
      }
    // Length 1 range (X-X with same X) is "undefined" and makes regcomp err,
    // but the perl build does it, so we need to filter it out.
    } else if (mode && *from == '-' && from[-1] == from[1]) {
      from+=2;
      continue;
    } else if (*from == '\\') {
      if (!from[1]) goto fail;

      // Check escaped end delimiter before printf style escapes.
      if (from[1] == d) from++;
      else if (from[1]=='\\') *(to++) = *(from++);
      else {
        char c = unescape(from[1]);

        if (c) {
          *(to++) = c;
          from+=2;
          continue;
        } else if (!mode) *(to++) = *(from++);
      }
    }
    *(to++) = *(from++);
  }
  *to = 0;
  *pstr = from+1;

  return copy;

fail:
  // Unterminated string: don't leak the partial copy
  free(copy);

  return 0;
}
726
727 // Translate pattern strings into command structures. Each command structure
728 // is a single allocation (which requires some math and remalloc at times).
parse_pattern(char ** pline,long len)729 static void parse_pattern(char **pline, long len)
730 {
731 struct sedcmd *command = (void *)TT.pattern;
732 char *line, *reg, c, *errstart;
733 int i;
734
735 line = errstart = pline ? *pline : "";
736 if (len && line[len-1]=='\n') line[--len] = 0;
737
738 // Append this line to previous multiline command? (hit indicates type.)
739 // During parsing "hit" stores data about line continuations, but in
740 // sed_line() it means the match range attached to this command
741 // is active, so processing the continuation must zero it again.
742 if (command && command->prev->hit) {
743 // Remove half-finished entry from list so remalloc() doesn't confuse it
744 TT.pattern = TT.pattern->prev;
745 command = dlist_pop(&TT.pattern);
746 c = command->c;
747 reg = (char *)command;
748 reg += command->arg1 + strlen(reg + command->arg1);
749
750 // Resume parsing for 'a' or 's' command. (Only two that can do this.)
751 // TODO: using 256 to indicate 'a' means our s/// delimiter can't be
752 // a unicode character.
753 if (command->hit < 256) goto resume_s;
754 else goto resume_a;
755 }
756
757 // Loop through commands in this line.
758
759 command = 0;
760 for (;;) {
761 if (command) dlist_add_nomalloc(&TT.pattern, (void *)command);
762
763 // If there's no more data on this line, return.
764 for (;;) {
765 while (isspace(*line) || *line == ';') line++;
766 if (*line == '#') while (*line && *line != '\n') line++;
767 else break;
768 }
769 if (!*line) return;
770
771 // We start by writing data into toybuf. Later we'll allocate the
772 // ex
773
774 errstart = line;
775 memset(toybuf, 0, sizeof(struct sedcmd));
776 command = (void *)toybuf;
777 reg = toybuf + sizeof(struct sedcmd);
778
779 // Parse address range (if any)
780 for (i = 0; i < 2; i++) {
781 if (*line == ',') line++;
782 else if (i) break;
783
784 if (isdigit(*line)) command->lmatch[i] = strtol(line, &line, 0);
785 else if (*line == '$') {
786 command->lmatch[i] = -1;
787 line++;
788 } else if (*line == '/' || *line == '\\') {
789 char *s = line;
790
791 if (!(s = unescape_delimited_string(&line, 0))) goto error;
792 if (!*s) command->rmatch[i] = 0;
793 else {
794 xregcomp((void *)reg, s, REG_EXTENDED*!!FLAG(r));
795 command->rmatch[i] = reg-toybuf;
796 reg += sizeof(regex_t);
797 }
798 free(s);
799 } else break;
800 }
801
802 while (isspace(*line)) line++;
803 if (!*line) break;
804
805 while (*line == '!') {
806 command->not = 1;
807 line++;
808 }
809 while (isspace(*line)) line++;
810
811 c = command->c = *(line++);
812 if (strchr("}:", c) && i) break;
813 if (strchr("aiqr=", c) && i>1) break;
814
815 // Add step to pattern
816 command = xmemdup(toybuf, reg-toybuf);
817 reg = (reg-toybuf) + (char *)command;
818
819 // Parse arguments by command type
820 if (c == '{') TT.nextlen++;
821 else if (c == '}') {
822 if (!TT.nextlen--) break;
823 } else if (c == 's') {
824 char *end, delim = 0;
825
826 // s/pattern/replacement/flags
827
828 // line continuations use arg1 (back at the start of the function),
829 // so let's fill out arg2 first (since the regex part can't be multiple
830 // lines) and swap them back later.
831
832 // get pattern (just record, we parse it later)
833 command->arg2 = reg - (char *)command;
834 if (!(TT.remember = unescape_delimited_string(&line, &delim)))
835 goto error;
836
837 reg += sizeof(regex_t);
838 command->arg1 = reg-(char *)command;
839 command->hit = delim;
840 resume_s:
841 // get replacement - don't replace escapes yet because \1 and \& need
842 // processing later, after we replace \\ with \ we can't tell \\1 from \1
843 end = line;
844 while (*end != command->hit) {
845 if (!*end) goto error;
846 if (*end++ == '\\') {
847 if (!*end || *end == '\n') {
848 end[-1] = '\n';
849 break;
850 }
851 end++;
852 }
853 }
854
855 reg = extend_string((void *)&command, line, reg-(char *)command,end-line);
856 line = end;
857 // line continuation? (note: '\n' can't be a valid delim).
858 if (*line == command->hit) command->hit = 0;
859 else {
860 if (!*line) continue;
861 reg--;
862 line++;
863 goto resume_s;
864 }
865
866 // swap arg1/arg2 so they're back in order arguments occur.
867 i = command->arg1;
868 command->arg1 = command->arg2;
869 command->arg2 = i;
870
871 // get flags
872 for (line++; *line; line++) {
873 long l;
874
875 if (isspace(*line) && *line != '\n') continue;
876
877 if (0 <= (l = stridx("igp", *line))) command->sflags |= 1<<l;
878 else if (!(command->sflags>>3) && 0<(l = strtol(line, &line, 10))) {
879 command->sflags |= l << 3;
880 line--;
881 } else break;
882 }
883
884 // We deferred actually parsing the regex until we had the s///i flag
885 // allocating the space was done by extend_string() above
886 if (!*TT.remember) command->arg1 = 0;
887 else xregcomp((void *)(command->arg1 + (char *)command), TT.remember,
888 (REG_EXTENDED*!!FLAG(r))|((command->sflags&1)*REG_ICASE));
889 free(TT.remember);
890 TT.remember = 0;
891 if (*line == 'w') {
892 line++;
893 goto writenow;
894 }
895 } else if (c == 'w') {
896 int fd, delim;
897 char *cc;
898
899 // Since s/// uses arg1 and arg2, and w needs a persistent filehandle and
900 // eol status, and to retain the filename for error messages, we'd need
901 // to go up to arg5 just for this. Compromise: dynamically allocate the
902 // filehandle and eol status.
903
904 writenow:
905 while (isspace(*line)) line++;
906 if (!*line) goto error;
907 for (cc = line; *cc; cc++) if (*cc == '\\' && cc[1] == ';') break;
908 delim = *cc;
909 *cc = 0;
910 fd = xcreate(line, O_WRONLY|O_CREAT|O_TRUNC, 0644);
911 *cc = delim;
912
913 command->w = reg - (char *)command;
914 command = xrealloc(command, command->w+(cc-line)+6);
915 reg = command->w + (char *)command;
916
917 memcpy(reg, &fd, 4);
918 reg += 4;
919 *(reg++) = 0;
920 memcpy(reg, line, delim);
921 reg += delim;
922 *(reg++) = 0;
923
924 line = cc;
925 if (delim) line += 2;
926 } else if (c == 'y') {
927 char *s, delim = 0;
928 int len;
929
930 if (!(s = unescape_delimited_string(&line, &delim))) goto error;
931 command->arg1 = reg-(char *)command;
932 len = strlen(s);
933 reg = extend_string((void *)&command, s, reg-(char *)command, len);
934 free(s);
935 command->arg2 = reg-(char *)command;
936 if (!(s = unescape_delimited_string(&line, &delim))) goto error;
937 if (len != strlen(s)) goto error;
938 reg = extend_string((void *)&command, s, reg-(char*)command, len);
939 free(s);
940 } else if (strchr("abcirtTw:", c)) {
941 int end;
942
943 // trim leading spaces
944 while (isspace(*line) && *line != '\n') line++;
945
946 // Resume logic differs from 's' case because we don't add a newline
947 // unless it's after something, so we add it on return instead.
948 resume_a:
949 command->hit = 0;
950
951 // btT: end with space or semicolon, aicrw continue to newline.
952 if (!(end = strcspn(line, strchr(":btT", c) ? "}; \t\r\n\v\f" : "\n"))) {
953 // Argument's optional for btT
954 if (strchr("btT", c)) continue;
955 else if (!command->arg1) break;
956 }
957
958 // Extend allocation to include new string. We use offsets instead of
959 // pointers so realloc() moving stuff doesn't break things. Ok to write
960 // \n over NUL terminator because call to extend_string() adds it back.
961 if (!command->arg1) command->arg1 = reg - (char*)command;
962 else if (*(command->arg1+(char *)command)) *(reg++) = '\n';
963 else if (!pline) {
964 command->arg1 = 0;
965 continue;
966 }
967 reg = extend_string((void *)&command, line, reg - (char *)command, end);
968
969 // Recopy data to remove escape sequences and handle line continuation.
970 if (strchr("aci", c)) {
971 reg -= end+1;
972 for (i = end; i; i--) {
973 if ((*reg++ = *line++)=='\\') {
974
975 // escape at end of line: resume if -e escaped literal newline,
976 // else request callback and resume with next line
977 if (!--i) {
978 *--reg = 0;
979 if (*line) {
980 line++;
981 goto resume_a;
982 }
983 command->hit = 256;
984 break;
985 }
986 if (!(reg[-1] = unescape(*line))) reg[-1] = *line;
987 line++;
988 }
989 }
990 *reg = 0;
991 } else line += end;
992
993 // Commands that take no arguments
994 } else if (!strchr("{dDgGhHlnNpPqx=", c)) break;
995 }
996
997 error:
998 error_exit("bad pattern '%s'@%ld (%c)", errstart, line-errstart+1L, *line);
999 }
1000
sed_main(void)1001 void sed_main(void)
1002 {
1003 struct arg_list *al;
1004 char **args = toys.optargs;
1005
1006 if (!FLAG(z)) TT.delim = '\n';
1007
1008 // Lie to autoconf when it asks stupid questions, so configure regexes
1009 // that look for "GNU sed version %f" greater than some old buggy number
1010 // don't fail us for not matching their narrow expectations.
1011 if (FLAG(version)) {
1012 xprintf("This is not GNU sed version 9.0\n");
1013 return;
1014 }
1015
1016 // Handling our own --version means we handle our own --help too.
1017 if (FLAG(help)) help_exit(0);
1018
1019 // Parse pattern into commands.
1020
1021 // If no -e or -f, first argument is the pattern.
1022 if (!TT.e && !TT.f) {
1023 if (!*toys.optargs) error_exit("no pattern");
1024 (TT.e = xzalloc(sizeof(struct arg_list)))->arg = *(args++);
1025 }
1026
1027 // Option parsing infrastructure can't interlace "-e blah -f blah -e blah"
1028 // so handle all -e, then all -f. (At least the behavior's consistent.)
1029
1030 for (al = TT.e; al; al = al->next) parse_pattern(&al->arg, strlen(al->arg));
1031 parse_pattern(0, 0);
1032 for (al = TT.f; al; al = al->next)
1033 do_lines(xopenro(al->arg), TT.delim, parse_pattern);
1034 dlist_terminate(TT.pattern);
1035 if (TT.nextlen) error_exit("no }");
1036
1037 TT.fdout = 1;
1038 TT.remember = xstrdup("");
1039
1040 // Inflict pattern upon input files. Long version because !O_CLOEXEC
1041 loopfiles_rw(args, O_RDONLY|WARN_ONLY, 0, do_sed_file);
1042
1043 // Provide EOF flush at end of cumulative input for non-i mode.
1044 if (!FLAG(i)) {
1045 toys.optflags |= FLAG_i;
1046 sed_line(0, 0);
1047 }
1048
1049 // todo: need to close fd when done for TOYBOX_FREE?
1050 }
1051