1/* sed.c - stream editor. Thing that does s/// and other stuff.
2 *
3 * Copyright 2014 Rob Landley <rob@landley.net>
4 *
5 * See http://pubs.opengroup.org/onlinepubs/9699919799/utilities/sed.html
6 *
7 * TODO: lines > 2G could wrap signed int length counters. Not just getline()
8 * but N and s///
9 * TODO: make y// handle unicode
10 * TODO: handle error return from emit(), error_msg/exit consistently
11 *       What's the right thing to do for -i when write fails? Skip to next?
12
13USE_SED(NEWTOY(sed, "(help)(version)e*f*inEr[+Er]", TOYFLAG_USR|TOYFLAG_BIN|TOYFLAG_LOCALE|TOYFLAG_NOHELP))
14
15config SED
16  bool "sed"
17  default y
18  help
19    usage: sed [-inrE] [-e SCRIPT]...|SCRIPT [-f SCRIPT_FILE]... [FILE...]
20
21    Stream editor. Apply one or more editing SCRIPTs to each line of input
22    (from FILE or stdin) producing output (by default to stdout).
23
24    -e	add SCRIPT to list
25    -f	add contents of SCRIPT_FILE to list
26    -i	Edit each file in place
27    -n	No default output (use the p command to output matched lines)
28    -r	Use extended regular expression syntax
29    -E	Alias for -r
30    -s	Treat input files separately (implied by -i)
31
32    A SCRIPT is a series of one or more COMMANDs separated by newlines or
33    semicolons. All -e SCRIPTs are concatenated together as if separated
34    by newlines, followed by all lines from -f SCRIPT_FILEs, in order.
35    If no -e or -f SCRIPTs are specified, the first argument is the SCRIPT.
36
37    Each COMMAND may be preceded by an address which limits the command to
38    apply only to the specified line(s). Commands without an address apply to
39    every line. Addresses are of the form:
40
41      [ADDRESS[,ADDRESS]]COMMAND
42
43    The ADDRESS may be a decimal line number (starting at 1), a /regular
44    expression/ within a pair of forward slashes, or the character "$" which
45    matches the last line of input. (In -s or -i mode this matches the last
46    line of each file, otherwise just the last line of the last file.) A single
47    address matches one line, a pair of comma separated addresses match
48    everything from the first address to the second address (inclusive). If
49    both addresses are regular expressions, more than one range of lines in
50    each file can match.
51
52    REGULAR EXPRESSIONS in sed are started and ended by the same character
53    (traditionally / but anything except a backslash or a newline works).
54    Backslashes may be used to escape the delimiter if it occurs in the
55    regex, and for the usual printf escapes (\abcefnrtv and octal, hex,
56    and unicode). An empty regex repeats the previous one. ADDRESS regexes
    (above) require the first delimiter to be escaped with a backslash when
58    it isn't a forward slash (to distinguish it from the COMMANDs below).
59
60    Sed mostly operates on individual lines one at a time. It reads each line,
61    processes it, and either writes it to the output or discards it before
62    reading the next line. Sed can remember one additional line in a separate
63    buffer (using the h, H, g, G, and x commands), and can read the next line
64    of input early (using the n and N command), but other than that command
65    scripts operate on individual lines of text.
66
67    Each COMMAND starts with a single character. The following commands take
68    no arguments:
69
70      {  Start a new command block, continuing until a corresponding "}".
71         Command blocks may nest. If the block has an address, commands within
72         the block are only run for lines within the block's address range.
73
74      }  End command block (this command cannot have an address)
75
76      d  Delete this line and move on to the next one
77         (ignores remaining COMMANDs)
78
79      D  Delete one line of input and restart command SCRIPT (same as "d"
80         unless you've glued lines together with "N" or similar)
81
82      g  Get remembered line (overwriting current line)
83
84      G  Get remembered line (appending to current line)
85
86      h  Remember this line (overwriting remembered line)
87
88      H  Remember this line (appending to remembered line, if any)
89
90      l  Print line, escaping \abfrtv (but not newline), octal escaping other
91         nonprintable characters, wrapping lines to terminal width with a
92         backslash, and appending $ to actual end of line.
93
94      n  Print default output and read next line, replacing current line
95         (If no next line available, quit processing script)
96
97      N  Append next line of input to this line, separated by a newline
98         (This advances the line counter for address matching and "=", if no
99         next line available quit processing script without default output)
100
101      p  Print this line
102
103      P  Print this line up to first newline (from "N")
104
105      q  Quit (print default output, no more commands processed or lines read)
106
107      x  Exchange this line with remembered line (overwrite in both directions)
108
109      =  Print the current line number (followed by a newline)
110
111    The following commands (may) take an argument. The "text" arguments (to
    the "a", "c", and "i" commands) may end with an unescaped "\" to append
113    the next line (for which leading whitespace is not skipped), and also
114    treat ";" as a literal character (use "\;" instead).
115
116      a [text]   Append text to output before attempting to read next line
117
118      b [label]  Branch, jumps to :label (or with no label, to end of SCRIPT)
119
120      c [text]   Delete line, output text at end of matching address range
121                 (ignores remaining COMMANDs)
122
123      i [text]   Print text
124
125      r [file]   Append contents of file to output before attempting to read
126                 next line.
127
128      s/S/R/F    Search for regex S, replace matched text with R using flags F.
129                 The first character after the "s" (anything but newline or
130                 backslash) is the delimiter, escape with \ to use normally.
131
132                 The replacement text may contain "&" to substitute the matched
133                 text (escape it with backslash for a literal &), or \1 through
134                 \9 to substitute a parenthetical subexpression in the regex.
135                 You can also use the normal backslash escapes such as \n and
136                 a backslash at the end of the line appends the next line.
137
138                 The flags are:
139
140                 [0-9]    A number, substitute only that occurrence of pattern
141                 g        Global, substitute all occurrences of pattern
142                 i        Ignore case when matching
143                 p        Print the line if match was found and replaced
144                 w [file] Write (append) line to file if match replaced
145
146      t [label]  Test, jump to :label only if an "s" command found a match in
147                 this line since last test (replacing with same text counts)
148
149      T [label]  Test false, jump only if "s" hasn't found a match.
150
151      w [file]   Write (append) line to file
152
153      y/old/new/ Change each character in 'old' to corresponding character
154                 in 'new' (with standard backslash escapes, delimiter can be
155                 any repeated character except \ or \n)
156
157      : [label]  Labeled target for jump commands
158
159      #  Comment, ignore rest of this line of SCRIPT
160
161    Deviations from posix: allow extended regular expressions with -r,
162    editing in place with -i, separate with -s, printf escapes in text, line
163    continuations, semicolons after all commands, 2-address anywhere an
164    address is allowed, "T" command, multiline continuations for [abc],
165    \; to end [abc] argument before end of line.
166*/
167
168#define FOR_sed
169#include "toys.h"
170
171GLOBALS(
172  struct arg_list *f;
173  struct arg_list *e;
174
175  // processed pattern list
176  struct double_list *pattern;
177
178  char *nextline, *remember;
179  void *restart, *lastregex;
180  long nextlen, rememberlen, count;
181  int fdout, noeol;
182  unsigned xx;
183)
184
// One parsed sed command, kept in a doubly linked list. Regexes and string
// arguments live in the same allocation as the struct and are located by
// offset (offset+(char *)struct) rather than by pointer, because the
// allocation gets realloc()ed to grow it for multiline input and pointers
// into it would each need adjusting afterwards.

struct sedcmd {
  struct sedcmd *next;
  struct sedcmd *prev;

  // Start [0] and end [1] of the address range this command applies to
  long lmatch[2];  // line number to match (-1 means "$", 0 means unused)
  int rmatch[2];   // offset of regex struct for /abc/,/def/ style addresses
  int arg1;        // offset of first argument
  int arg2;        // offset of second argument
  int w;           // offset of s///w (or w) file data
  unsigned not;    // set by "!" to invert the address match
  unsigned hit;    // address range currently active (parser: continuation)
  unsigned sflags; // s///flag bits: i=1, g=2, p=4, occurrence number <<3
  char c;          // action
};
201
202// Write out line with potential embedded NUL, handling eol/noeol
203static int emit(char *line, long len, int eol)
204{
205  int l, old = line[len];
206
207  if (TT.noeol && !writeall(TT.fdout, "\n", 1)) return 1;
208  TT.noeol = !eol;
209  if (eol) line[len++] = '\n';
210  if (!len) return 0;
211  l = writeall(TT.fdout, line, len);
212  if (eol) line[len-1] = old;
213  if (l != len) {
214    perror_msg("short write");
215
216    return 1;
217  }
218
219  return 0;
220}
221
// Grow the allocation at *old (currently oldlen bytes of data) and append
// newlen bytes copied from new, followed by a NUL terminator. A negative
// newlen means "append -newlen bytes, with a newline inserted before them".
// *old is updated to the (possibly moved) allocation. Returns a pointer to
// the byte just past the NUL terminator that was written.

static char *extend_string(char **old, char *new, int oldlen, int newlen)
{
  int sep = 0;
  char *buf, *tail;

  if (newlen < 0) {
    sep = 1;
    newlen = -newlen;
  }

  buf = xrealloc(*old, oldlen+newlen+sep+1);
  *old = buf;

  tail = buf+oldlen;
  if (sep) *tail++ = '\n';
  memcpy(tail, new, newlen);
  tail += newlen;
  *tail = 0;

  return tail+1;
}
237
238// An empty regex repeats the previous one
239static void *get_regex(void *trump, int offset)
240{
241  if (!offset) {
242    if (!TT.lastregex) error_exit("no previous regex");
243    return TT.lastregex;
244  }
245
246  return TT.lastregex = offset+(char *)trump;
247}
248
249// Apply pattern to line from input file
250static void process_line(char **pline, long plen)
251{
252  struct append {
253    struct append *next, *prev;
254    int file;
255    char *str;
256  } *append = 0;
257  char *line = TT.nextline;
258  long len = TT.nextlen;
259  struct sedcmd *command;
260  int eol = 0, tea = 0;
261
262  // Grab next line for deferred processing (EOF detection: we get a NULL
263  // pline at EOF to flush last line). Note that only end of _last_ input
264  // file matches $ (unless we're doing -i).
265  TT.nextline = 0;
266  TT.nextlen = 0;
267  if (pline) {
268    TT.nextline = *pline;
269    TT.nextlen = plen;
270    *pline = 0;
271  }
272
273  if (!line || !len) return;
274  if (line[len-1] == '\n') line[--len] = eol++;
275  TT.count++;
276
277  // The restart-1 is because we added one to make sure it wasn't NULL,
278  // otherwise N as last command would restart script
279  command = TT.restart ? ((struct sedcmd *)TT.restart)-1 : (void *)TT.pattern;
280  TT.restart = 0;
281
282  while (command) {
283    char *str, c = command->c;
284
285    // Have we got a line or regex matching range for this rule?
286    if (*command->lmatch || *command->rmatch) {
287      int miss = 0;
288      long lm;
289
290      // In a match that might end?
291      if (command->hit) {
292        if (!(lm = command->lmatch[1])) {
293          if (!command->rmatch[1]) command->hit = 0;
294          else {
295            void *rm = get_regex(command, command->rmatch[1]);
296
297            // regex match end includes matching line, so defer deactivation
298            if (line && !regexec0(rm, line, len, 0, 0, 0)) miss = 1;
299          }
300        } else if (lm > 0 && lm < TT.count) command->hit = 0;
301
302      // Start a new match?
303      } else {
304        if (!(lm = *command->lmatch)) {
305          void *rm = get_regex(command, *command->rmatch);
306
307          if (line && !regexec0(rm, line, len, 0, 0, 0)) command->hit++;
308        } else if (lm == TT.count || (lm == -1 && !pline)) command->hit++;
309
310        if (!command->lmatch[1] && !command->rmatch[1]) miss = 1;
311      }
312
313      // Didn't match?
314      lm = !(command->hit ^ command->not);
315
316      // Deferred disable from regex end match
317      if (miss || command->lmatch[1] == TT.count) command->hit = 0;
318
319      if (lm) {
320        // Handle skipping curly bracket command group
321        if (c == '{') {
322          int curly = 1;
323
324          while (curly) {
325            command = command->next;
326            if (command->c == '{') curly++;
327            if (command->c == '}') curly--;
328          }
329        }
330        command = command->next;
331        continue;
332      }
333    }
334
335    // A deleted line can still update line match state for later commands
336    if (!line) {
337      command = command->next;
338      continue;
339    }
340
341    // Process command
342
343    if (c=='a' || c=='r') {
344      struct append *a = xzalloc(sizeof(struct append));
345      if (command->arg1) a->str = command->arg1+(char *)command;
346      a->file = c=='r';
347      dlist_add_nomalloc((void *)&append, (void *)a);
348    } else if (c=='b' || c=='t' || c=='T') {
349      int t = tea;
350
351      if (c != 'b') tea = 0;
352      if (c=='b' || t^(c=='T')) {
353        if (!command->arg1) break;
354        str = command->arg1+(char *)command;
355        for (command = (void *)TT.pattern; command; command = command->next)
356          if (command->c == ':' && !strcmp(command->arg1+(char *)command, str))
357            break;
358        if (!command) error_exit("no :%s", str);
359      }
360    } else if (c=='c') {
361      str = command->arg1+(char *)command;
362      if (!command->hit) emit(str, strlen(str), 1);
363      free(line);
364      line = 0;
365      continue;
366    } else if (c=='d') {
367      free(line);
368      line = 0;
369      continue;
370    } else if (c=='D') {
371      // Delete up to \n or end of buffer
372      str = line;
373      while ((str-line)<len) if (*(str++) == '\n') break;
374      len -= str - line;
375      memmove(line, str, len);
376
377      // if "delete" blanks line, disable further processing
378      // otherwise trim and restart script
379      if (!len) {
380        free(line);
381        line = 0;
382      } else {
383        line[len] = 0;
384        command = (void *)TT.pattern;
385      }
386      continue;
387    } else if (c=='g') {
388      free(line);
389      line = xstrdup(TT.remember);
390      len = TT.rememberlen;
391    } else if (c=='G') {
392      line = xrealloc(line, len+TT.rememberlen+2);
393      line[len++] = '\n';
394      memcpy(line+len, TT.remember, TT.rememberlen);
395      line[len += TT.rememberlen] = 0;
396    } else if (c=='h') {
397      free(TT.remember);
398      TT.remember = xstrdup(line);
399      TT.rememberlen = len;
400    } else if (c=='H') {
401      TT.remember = xrealloc(TT.remember, TT.rememberlen+len+2);
402      TT.remember[TT.rememberlen++] = '\n';
403      memcpy(TT.remember+TT.rememberlen, line, len);
404      TT.remember[TT.rememberlen += len] = 0;
405    } else if (c=='i') {
406      str = command->arg1+(char *)command;
407      emit(str, strlen(str), 1);
408    } else if (c=='l') {
409      int i, x, off;
410
411      if (!TT.xx) {
412        terminal_size(&TT.xx, 0);
413        if (!TT.xx) TT.xx = 80;
414        if (TT.xx > sizeof(toybuf)-10) TT.xx = sizeof(toybuf)-10;
415        if (TT.xx > 4) TT.xx -= 4;
416      }
417
418      for (i = off = 0; i<len; i++) {
419        if (off >= TT.xx) {
420          toybuf[off++] = '\\';
421          emit(toybuf, off, 1);
422          off = 0;
423        }
424        x = stridx("\\\a\b\f\r\t\v", line[i]);
425        if (x != -1) {
426          toybuf[off++] = '\\';
427          toybuf[off++] = "\\abfrtv"[x];
428        } else if (line[i] >= ' ') toybuf[off++] = line[i];
429        else off += sprintf(toybuf+off, "\\%03o", line[i]);
430      }
431      toybuf[off++] = '$';
432      emit(toybuf, off, 1);
433    } else if (c=='n') {
434      TT.restart = command->next+1;
435
436      break;
437    } else if (c=='N') {
438      // Can't just grab next line because we could have multiple N and
439      // we need to actually read ahead to get N;$p EOF detection right.
440      if (pline) {
441        TT.restart = command->next+1;
442        extend_string(&line, TT.nextline, len, -TT.nextlen);
443        free(TT.nextline);
444        TT.nextline = line;
445        TT.nextlen += len + 1;
446        line = 0;
447      }
448
449      // Pending append goes out right after N
450      goto done;
451    } else if (c=='p' || c=='P') {
452      char *l = (c=='P') ? strchr(line, '\n') : 0;
453
454      if (emit(line, l ? l-line : len, eol)) break;
455    } else if (c=='q') {
456      if (pline) *pline = (void *)1;
457      free(TT.nextline);
458      TT.nextline = 0;
459      TT.nextlen = 0;
460
461      break;
462    } else if (c=='s') {
463      char *rline = line, *new = command->arg2 + (char *)command, *swap, *rswap;
464      regmatch_t *match = (void *)toybuf;
465      regex_t *reg = get_regex(command, command->arg1);
466      int mflags = 0, count = 0, zmatch = 1, rlen = len, mlen, off, newlen;
467
468      // Find match in remaining line (up to remaining len)
469      while (!regexec0(reg, rline, rlen, 10, match, mflags)) {
470        mflags = REG_NOTBOL;
471
472        // Zero length matches don't count immediately after a previous match
473        mlen = match[0].rm_eo-match[0].rm_so;
474        if (!mlen && !zmatch) {
475          if (!rlen--) break;
476          rline++;
477          zmatch++;
478          continue;
479        } else zmatch = 0;
480
481        // If we're replacing only a specific match, skip if this isn't it
482        off = command->sflags>>3;
483        if (off && off != ++count) {
484          rline += match[0].rm_eo;
485          rlen -= match[0].rm_eo;
486
487          continue;
488        }
489        // The fact getline() can allocate unbounded amounts of memory is
490        // a bigger issue, but while we're here check for integer overflow
491        if (match[0].rm_eo > INT_MAX) perror_exit(0);
492
493        // newlen = strlen(new) but with \1 and & and printf escapes
494        for (off = newlen = 0; new[off]; off++) {
495          int cc = -1;
496
497          if (new[off] == '&') cc = 0;
498          else if (new[off] == '\\') cc = new[++off] - '0';
499          if (cc < 0 || cc > 9) {
500            newlen++;
501            continue;
502          }
503          newlen += match[cc].rm_eo-match[cc].rm_so;
504        }
505
506        // Allocate new size, copy start/end around match. (Can't extend in
507        // place because backrefs may refer to text after it's overwritten.)
508        len += newlen-mlen;
509        swap = xmalloc(len+1);
510        rswap = swap+(rline-line)+match[0].rm_so;
511        memcpy(swap, line, (rline-line)+match[0].rm_so);
512        memcpy(rswap+newlen, rline+match[0].rm_eo, (rlen -= match[0].rm_eo)+1);
513
514        // copy in new replacement text
515        for (off = mlen = 0; new[off]; off++) {
516          int cc = 0, ll;
517
518          if (new[off] == '\\') {
519            cc = new[++off] - '0';
520            if (cc<0 || cc>9) {
521              if (!(rswap[mlen++] = unescape(new[off])))
522                rswap[mlen-1] = new[off];
523
524              continue;
525            } else if (match[cc].rm_so == -1) error_exit("no s//\\%d/", cc);
526          } else if (new[off] != '&') {
527            rswap[mlen++] = new[off];
528
529            continue;
530          }
531
532          ll = match[cc].rm_eo-match[cc].rm_so;
533          memcpy(rswap+mlen, rline+match[cc].rm_so, ll);
534          mlen += ll;
535        }
536
537        rline = rswap+newlen;
538        free(line);
539        line = swap;
540
541        // Stop after first substitution unless we have flag g
542        if (!(command->sflags & 2)) break;
543      }
544
545      if (mflags) {
546        // flag p
547        if (command->sflags & 4) emit(line, len, eol);
548
549        tea = 1;
550        if (command->w) goto writenow;
551      }
552    } else if (c=='w') {
553      int fd, noeol;
554      char *name;
555
556writenow:
557      // Swap out emit() context
558      fd = TT.fdout;
559      noeol = TT.noeol;
560
561      // We save filehandle and newline status before filename
562      name = command->w + (char *)command;
563      memcpy(&TT.fdout, name, 4);
564      name += 4;
565      TT.noeol = *(name++);
566
567      // write, then save/restore context
568      if (emit(line, len, eol))
569        perror_exit("w '%s'", command->arg1+(char *)command);
570      *(--name) = TT.noeol;
571      TT.noeol = noeol;
572      TT.fdout = fd;
573    } else if (c=='x') {
574      long swap = TT.rememberlen;
575
576      str = TT.remember;
577      TT.remember = line;
578      line = str;
579      TT.rememberlen = len;
580      len = swap;
581    } else if (c=='y') {
582      char *from, *to = (char *)command;
583      int i, j;
584
585      from = to+command->arg1;
586      to += command->arg2;
587
588      for (i = 0; i < len; i++) {
589        j = stridx(from, line[i]);
590        if (j != -1) line[i] = to[j];
591      }
592    } else if (c=='=') {
593      sprintf(toybuf, "%ld", TT.count);
594      emit(toybuf, strlen(toybuf), 1);
595    }
596
597    command = command->next;
598  }
599
600  if (line && !(toys.optflags & FLAG_n)) emit(line, len, eol);
601
602done:
603  if (dlist_terminate(append)) while (append) {
604    struct append *a = append->next;
605
606    if (append->file) {
607      int fd = open(append->str, O_RDONLY);
608
609      // Force newline if noeol pending
610      if (fd != -1) {
611        if (TT.noeol) xwrite(TT.fdout, "\n", 1);
612        TT.noeol = 0;
613        xsendfile(fd, TT.fdout);
614        close(fd);
615      }
616    } else if (append->str) emit(append->str, strlen(append->str), 1);
617    else emit(line, 0, 0);
618    free(append);
619    append = a;
620  }
621  free(line);
622}
623
624// Callback called on each input file
625static void do_sed(int fd, char *name)
626{
627  int i = toys.optflags & FLAG_i;
628  char *tmp;
629
630  if (i) {
631    struct sedcmd *command;
632
633    if (!fd) {
634      error_msg("-i on stdin");
635      return;
636    }
637    TT.fdout = copy_tempfile(fd, name, &tmp);
638    TT.count = 0;
639    for (command = (void *)TT.pattern; command; command = command->next)
640      command->hit = 0;
641  }
642  do_lines(fd, process_line);
643  if (i) {
644    process_line(0, 0);
645    replace_tempfile(-1, TT.fdout, &tmp);
646    TT.fdout = 1;
647    TT.nextline = 0;
648    TT.nextlen = TT.noeol = 0;
649  }
650}
651
// Copy chunk of string between two delimiters, converting printf escapes.
// Returns a newly allocated processed copy of the string (0 if error, in
// which case nothing is left allocated), *pstr advances to next unused char.
// If delim (or *delim) is 0 the starting char is used as the delimiter (a
// leading backslash before it is skipped) and saved to *delim when delim
// isn't NULL; otherwise *delim is the closing delimiter to look for.
// A delimiter inside a regex [range] doesn't end the string.
static char *unescape_delimited_string(char **pstr, char *delim)
{
  char *to, *from, *out, mode = 0, d;

  // Grab leading delimiter (if necessary), allocate space for new string
  from = *pstr;
  if (!delim || !*delim) {
    if (!(d = *(from++))) return 0;
    if (d == '\\') d = *(from++);
    if (!d || d == '\\') return 0;
    if (delim) *delim = d;
  } else d = *delim;

  // The output never grows beyond the input, so this is always enough
  to = out = xmalloc(strlen(*pstr)+1);

  // mode is 0 outside a bracket expression, ']' inside one, and '.', '='
  // or ':' inside a [. .] [= =] or [: :] element of a bracket expression
  while (mode || *from != d) {
    if (!*from) goto fail;

    // delimiter in regex character range doesn't count
    if (*from == '[') {
      if (!mode) {
        mode = ']';
        if (from[1]=='-' || from[1]==']') *(to++) = *(from++);

      // strchr() also matches the terminating NUL, so check for end of
      // string first or "[[" at the end of input would run off the buffer
      } else if (mode == ']' && from[1] && strchr(".=:", from[1])) {
        *(to++) = *(from++);
        mode = *from;
      }
    } else if (*from == mode) {
      if (mode == ']') mode = 0;
      else {
        *(to++) = *(from++);
        mode = ']';
      }
    // Length 1 range (X-X with same X) is "undefined" and makes regcomp err,
    // but the perl build does it, so we need to filter it out.
    } else if (mode && *from == '-' && from[-1] == from[1]) {
      from+=2;
      continue;
    } else if (*from == '\\') {
      if (!from[1]) goto fail;

      // Check escaped end delimiter before printf style escapes.
      if (from[1] == d) from++;
      else if (from[1]=='\\') *(to++) = *(from++);
      else {
        char c = unescape(from[1]);

        if (c) {
          *(to++) = c;
          from+=2;
          continue;
        } else if (!mode) *(to++) = *(from++);
      }
    }

    // A branch above may have stepped onto the end of the string (for
    // example "[[:a:" with nothing after it): unterminated, so give up
    // instead of copying the NUL and reading past it.
    if (!*from) goto fail;
    *(to++) = *(from++);
  }
  *to = 0;
  *pstr = from+1;

  return out;

fail:
  free(out);

  return 0;
}
716
717// Translate pattern strings into command structures. Each command structure
718// is a single allocation (which requires some math and remalloc at times).
719static void parse_pattern(char **pline, long len)
720{
721  struct sedcmd *command = (void *)TT.pattern;
722  char *line, *reg, c, *errstart;
723  int i;
724
725  line = errstart = pline ? *pline : "";
726  if (len && line[len-1]=='\n') line[--len] = 0;
727
728  // Append this line to previous multiline command? (hit indicates type.)
729  // During parsing "hit" stores data about line continuations, but in
730  // process_line() it means the match range attached to this command
731  // is active, so processing the continuation must zero it again.
732  if (command && command->prev->hit) {
733    // Remove half-finished entry from list so remalloc() doesn't confuse it
734    TT.pattern = TT.pattern->prev;
735    command = dlist_pop(&TT.pattern);
736    c = command->c;
737    reg = (char *)command;
738    reg += command->arg1 + strlen(reg + command->arg1);
739
740    // Resume parsing for 'a' or 's' command. (Only two that can do this.)
741    // TODO: using 256 to indicate 'a' means our s/// delimiter can't be
742    // a unicode character.
743    if (command->hit < 256) goto resume_s;
744    else goto resume_a;
745  }
746
747  // Loop through commands in this line.
748
749  command = 0;
750  for (;;) {
751    if (command) dlist_add_nomalloc(&TT.pattern, (void *)command);
752
753    // If there's no more data on this line, return.
754    for (;;) {
755      while (isspace(*line) || *line == ';') line++;
756      if (*line == '#') while (*line && *line != '\n') line++;
757      else break;
758    }
759    if (!*line) return;
760
    // We start by writing data into toybuf. Later we'll allocate the
    // exact size needed and copy the command there (see xmemdup() below).
763
764    errstart = line;
765    memset(toybuf, 0, sizeof(struct sedcmd));
766    command = (void *)toybuf;
767    reg = toybuf + sizeof(struct sedcmd);
768
769    // Parse address range (if any)
770    for (i = 0; i < 2; i++) {
771      if (*line == ',') line++;
772      else if (i) break;
773
774      if (isdigit(*line)) command->lmatch[i] = strtol(line, &line, 0);
775      else if (*line == '$') {
776        command->lmatch[i] = -1;
777        line++;
778      } else if (*line == '/' || *line == '\\') {
779        char *s = line;
780
781        if (!(s = unescape_delimited_string(&line, 0))) goto error;
782        if (!*s) command->rmatch[i] = 0;
783        else {
784          xregcomp((void *)reg, s, (toys.optflags & FLAG_r)*REG_EXTENDED);
785          command->rmatch[i] = reg-toybuf;
786          reg += sizeof(regex_t);
787        }
788        free(s);
789      } else break;
790    }
791
792    while (isspace(*line)) line++;
793    if (!*line) break;
794
795    while (*line == '!') {
796      command->not = 1;
797      line++;
798    }
799    while (isspace(*line)) line++;
800
801    c = command->c = *(line++);
802    if (strchr("}:", c) && i) break;
803    if (strchr("aiqr=", c) && i>1) break;
804
805    // Add step to pattern
806    command = xmemdup(toybuf, reg-toybuf);
807    reg = (reg-toybuf) + (char *)command;
808
809    // Parse arguments by command type
810    if (c == '{') TT.nextlen++;
811    else if (c == '}') {
812      if (!TT.nextlen--) break;
813    } else if (c == 's') {
814      char *end, delim = 0;
815
816      // s/pattern/replacement/flags
817
818      // line continuations use arg1 (back at the start of the function),
819      // so let's fill out arg2 first (since the regex part can't be multiple
820      // lines) and swap them back later.
821
822      // get pattern (just record, we parse it later)
823      command->arg2 = reg - (char *)command;
824      if (!(TT.remember = unescape_delimited_string(&line, &delim)))
825        goto error;
826
827      reg += sizeof(regex_t);
828      command->arg1 = reg-(char *)command;
829      command->hit = delim;
830resume_s:
831      // get replacement - don't replace escapes yet because \1 and \& need
832      // processing later, after we replace \\ with \ we can't tell \\1 from \1
833      end = line;
834      while (*end != command->hit) {
835        if (!*end) goto error;
836        if (*end++ == '\\') {
837          if (!*end || *end == '\n') {
838            end[-1] = '\n';
839            break;
840          }
841          end++;
842        }
843      }
844
845      reg = extend_string((void *)&command, line, reg-(char *)command,end-line);
846      line = end;
847      // line continuation? (note: '\n' can't be a valid delim).
848      if (*line == command->hit) command->hit = 0;
849      else {
850        if (!*line) continue;
851        reg--;
852        line++;
853        goto resume_s;
854      }
855
856      // swap arg1/arg2 so they're back in order arguments occur.
857      i = command->arg1;
858      command->arg1 = command->arg2;
859      command->arg2 = i;
860
861      // get flags
862      for (line++; *line; line++) {
863        long l;
864
865        if (isspace(*line) && *line != '\n') continue;
866
867        if (0 <= (l = stridx("igp", *line))) command->sflags |= 1<<l;
868        else if (!(command->sflags>>3) && 0<(l = strtol(line, &line, 10))) {
869          command->sflags |= l << 3;
870          line--;
871        } else break;
872      }
873
874      // We deferred actually parsing the regex until we had the s///i flag
875      // allocating the space was done by extend_string() above
876      if (!*TT.remember) command->arg1 = 0;
877      else xregcomp((void *)(command->arg1 + (char *)command), TT.remember,
878        ((toys.optflags & FLAG_r)*REG_EXTENDED)|((command->sflags&1)*REG_ICASE));
879      free(TT.remember);
880      TT.remember = 0;
881      if (*line == 'w') {
882        line++;
883        goto writenow;
884      }
885    } else if (c == 'w') {
886      int fd, delim;
887      char *cc;
888
889      // Since s/// uses arg1 and arg2, and w needs a persistent filehandle and
890      // eol status, and to retain the filename for error messages, we'd need
891      // to go up to arg5 just for this. Compromise: dynamically allocate the
892      // filehandle and eol status.
893
894writenow:
895      while (isspace(*line)) line++;
896      if (!*line) goto error;
897      for (cc = line; *cc; cc++) if (*cc == '\\' && cc[1] == ';') break;
898      delim = *cc;
899      *cc = 0;
900      fd = xcreate(line, O_WRONLY|O_CREAT|O_TRUNC, 0644);
901      *cc = delim;
902
903      command->w = reg - (char *)command;
904      command = xrealloc(command, command->w+(cc-line)+6);
905      reg = command->w + (char *)command;
906
907      memcpy(reg, &fd, 4);
908      reg += 4;
909      *(reg++) = 0;
910      memcpy(reg, line, delim);
911      reg += delim;
912      *(reg++) = 0;
913
914      line = cc;
915      if (delim) line += 2;
916    } else if (c == 'y') {
917      char *s, delim = 0;
918      int len;
919
920      if (!(s = unescape_delimited_string(&line, &delim))) goto error;
921      command->arg1 = reg-(char *)command;
922      len = strlen(s);
923      reg = extend_string((void *)&command, s, reg-(char *)command, len);
924      free(s);
925      command->arg2 = reg-(char *)command;
926      if (!(s = unescape_delimited_string(&line, &delim))) goto error;
927      if (len != strlen(s)) goto error;
928      reg = extend_string((void *)&command, s, reg-(char*)command, len);
929      free(s);
930    } else if (strchr("abcirtTw:", c)) {
931      int end;
932
933      // trim leading spaces
934      while (isspace(*line) && *line != '\n') line++;
935
936      // Resume logic differs from 's' case because we don't add a newline
937      // unless it's after something, so we add it on return instead.
938resume_a:
939      command->hit = 0;
940
941      // btT: end with space or semicolon, aicrw continue to newline.
942      if (!(end = strcspn(line, strchr(":btT", c) ? "; \t\r\n\v\f" : "\n"))) {
943        // Argument's optional for btT
944        if (strchr("btT", c)) continue;
945        else if (!command->arg1) break;
946      }
947
948      // Extend allocation to include new string. We use offsets instead of
949      // pointers so realloc() moving stuff doesn't break things. Ok to write
950      // \n over NUL terminator because call to extend_string() adds it back.
951      if (!command->arg1) command->arg1 = reg - (char*)command;
952      else if (*(command->arg1+(char *)command)) *(reg++) = '\n';
953      else if (!pline) {
954        command->arg1 = 0;
955        continue;
956      }
957      reg = extend_string((void *)&command, line, reg - (char *)command, end);
958
959      // Recopy data to remove escape sequences and handle line continuation.
960      if (strchr("aci", c)) {
961        reg -= end+1;
962        for (i = end; i; i--) {
963          if ((*reg++ = *line++)=='\\') {
964
965            // escape at end of line: resume if -e escaped literal newline,
966            // else request callback and resume with next line
967            if (!--i) {
968              *--reg = 0;
969              if (*line) {
970                line++;
971                goto resume_a;
972              }
973              command->hit = 256;
974              break;
975            }
976            if (!(reg[-1] = unescape(*line))) reg[-1] = *line;
977            line++;
978          }
979        }
980        *reg = 0;
981      } else line += end;
982
983    // Commands that take no arguments
984    } else if (!strchr("{dDgGhHlnNpPqx=", c)) break;
985  }
986
987error:
988  error_exit("bad pattern '%s'@%ld (%c)", errstart, line-errstart+1L, *line);
989}
990
991void sed_main(void)
992{
993  struct arg_list *al;
994  char **args = toys.optargs;
995
996  // Lie to autoconf when it asks stupid questions, so configure regexes
997  // that look for "GNU sed version %f" greater than some old buggy number
998  // don't fail us for not matching their narrow expectations.
999  if (toys.optflags & FLAG_version) {
1000    xprintf("This is not GNU sed version 9.0\n");
1001    return;
1002  }
1003
1004  // Handling our own --version means we handle our own --help too.
1005  if (toys.optflags&FLAG_help) help_exit(0);
1006
1007  // Parse pattern into commands.
1008
1009  // If no -e or -f, first argument is the pattern.
1010  if (!TT.e && !TT.f) {
1011    if (!*toys.optargs) error_exit("no pattern");
1012    (TT.e = xzalloc(sizeof(struct arg_list)))->arg = *(args++);
1013  }
1014
1015  // Option parsing infrastructure can't interlace "-e blah -f blah -e blah"
1016  // so handle all -e, then all -f. (At least the behavior's consistent.)
1017
1018  for (al = TT.e; al; al = al->next) parse_pattern(&al->arg, strlen(al->arg));
1019  for (al = TT.f; al; al = al->next) do_lines(xopenro(al->arg), parse_pattern);
1020  parse_pattern(0, 0);
1021  dlist_terminate(TT.pattern);
1022  if (TT.nextlen) error_exit("no }");
1023
1024  TT.fdout = 1;
1025  TT.remember = xstrdup("");
1026
1027  // Inflict pattern upon input files. Long version because !O_CLOEXEC
1028  loopfiles_rw(args, O_RDONLY|WARN_ONLY, 0, do_sed);
1029
1030  if (!(toys.optflags & FLAG_i)) process_line(0, 0);
1031
1032  // todo: need to close fd when done for TOYBOX_FREE?
1033}
1034