changeset 1530:3eafa445c1a6 draft

Random in-progress snapshot of sed, not finished yet.
author Rob Landley <rob@landley.net>
date Mon, 20 Oct 2014 21:07:16 -0500
parents e127aa575ff2
children 3ff823086c99
files toys/pending/sed.c
diffstat 1 files changed, 150 insertions(+), 8 deletions(-) [+]
line wrap: on
line diff
--- a/toys/pending/sed.c	Mon Oct 20 19:56:05 2014 -0500
+++ b/toys/pending/sed.c	Mon Oct 20 21:07:16 2014 -0500
@@ -4,7 +4,7 @@
  *
  * See http://pubs.opengroup.org/onlinepubs/9699919799/utilities/sed.html
  *
- * todo "-e blah -f blah -e blah" what order?
+ * todo "-e blah -f blah -e blah" what order? (All -e, then all -f.)
  * What happens when first address matched, then EOF? How about ",42" or "1,"
  * Does $ match last line of file or last line of input
  * If file doesn't end with newline
@@ -12,8 +12,13 @@
  * space before address
  * numerical addresses that cross, select one line
  * test backslash escapes in regex; share code with printf?
+ * address counts lines cumulatively across files
+ * Why can't I start an address with \\ (posix says no, but _why_?)
+ * Fun with \nblah\nn vs \tblah\tt
+ *
+ * echo -e "one\ntwo\nthree" | sed -n '$,$p'
 
-USE_SED(NEWTOY(sed, "e*f*inr", TOYFLAG_USR|TOYFLAG_BIN))
+USE_SED(NEWTOY(sed, "(version)e*f*inr", TOYFLAG_USR|TOYFLAG_BIN))
 
 config SED
   bool "sed"
@@ -75,27 +80,68 @@
   struct arg_list *f;
   struct arg_list *e;
 
-  void *pattern;
+  // processed pattern list
+  struct double_list *pattern;
 )
 
+struct step {
+  struct step *next, *prev;
+
+  // Begin and end of each match
+  long lmatch[2];
+  regex_t *rmatch[2];
+
+  // Action
+  char c;
+};
+
+// Apply pattern to line from input file
 static void do_line(char **pline, long len)
 {
   printf("len=%ld line=%s\n", len, *pline);
 }
 
+// Genericish function, can probably get moved to lib.c
+
+// Iterate over lines in file, calling function. The function can write NULL
+// to the line pointer if it wants to keep the line, otherwise it is freed.
+// Passed file descriptor is closed at the end of processing.
 static void do_lines(int fd, char *name, void (*call)(char **pline, long len))
 {
-  FILE *fp = fdopen(fd, "r");
+  FILE *fp = xfdopen(fd, "r");
 
   for (;;) {
     char *line = 0;
     ssize_t len;
 
     len = getline(&line, (void *)&len, fp);
-    do_line(&line, len);
+    call(&line, len);
     free(line);
     if (len < 1) break;
   }
+  fclose(fp);
+}
+
+// Iterate over newline delimited data blob (potentially with embedded NUL),
+// call function on each line.
+static void chop_lines(char *data, long len,
+  void (*call)(char **pline, long len))
+{
+  long ll;
+
+  for (ll = 0; ll < len; ll++) {
+    if (data[ll] == '\n') {
+      char *c = data;
+
+      data[ll] = 0;
+      call(&c, len);
+      data[ll++] = '\n';
+      data += ll;
+      len -= ll;
+      ll = -1;
+    }
+  }
+  if (len) call(&data, len);
 }
 
 static void do_sed(int fd, char *name)
@@ -103,16 +149,112 @@
   do_lines(fd, name, do_line);
 }
 
+// Parse one line of sed script into a struct step appended to TT.pattern.
+static void jewel_of_judgement(char **pline, long len)
+{
+  struct step *corwin;
+  char *line = *pline, *reg;
+  int i;
+
+  while (isspace(*line)) line++;
+  if (*line == '#') return;
+
+  memset(toybuf, 0, sizeof(struct step));
+  corwin = (void *)toybuf;
+  reg = toybuf + sizeof(struct step);
+
+  // Parse address range (if any)
+  for (i = 0; i < 2; i++) {
+    if (*line == ',') line++;
+    else if (i) break;
+
+    if (isdigit(*line)) corwin->lmatch[i] = strtol(line, &line, 0);
+    else if (*line == '$') {
+      corwin->lmatch[i] = -1;
+      line++;
+    } else if (*line == '/' || *line == '\\') {
+      char delim = *(line++), slash = 0, *to, *from;
+
+      if (delim == '\\') {
+        if (!*line) goto brand;
+        slash = delim = *(line++);
+      }
+
+      // Removing backslash escapes edits the source string, which could
+      // be from the environment space via -e, which could screw up what
+      // "ps" sees, and I'm ok with that.
+      for (to = from = line; *from != delim; *(to++) = *(from++)) {
+        if (!*from) goto brand;
+        if (*from == '\\') {
+          if (!from[1]) goto brand;
+
+          // Check escaped end delimiter before printf style escapes.
+          if (from[1] == slash) from++;
+          else {
+            char c = unescape(from[1]);
+
+            if (c) {
+              *to = c;
+              from++;
+            }
+          }
+        }
+      }
+      slash = *to;
+      *to = 0;
+      xregcomp(corwin->rmatch[i] = (void *)reg, line,
+        ((toys.optflags & FLAG_r)*REG_EXTENDED)|REG_NOSUB);
+      *to = slash;
+      reg += sizeof(regex_t);
+    } else break;
+  }
+
+  while (isspace(*line)) line++;
+
+  if (!*line || !strchr("p", *line)) goto brand;
+
+  // Add step to pattern
+  corwin = xmalloc(reg-toybuf);
+  memcpy(corwin, toybuf, reg-toybuf);
+  dlist_add_nomalloc(&TT.pattern, (void *)corwin);
+
+  return;
+
+brand:
+
+  // Report the bad pattern and the offset where parsing stopped.
+  error_exit("bad pattern '%s'@%ld (%c)", *pline, line-*pline, *line);
+}
+
 void sed_main(void)
 {
+  struct arg_list *dworkin;
   char **args = toys.optargs;
 
-  // Need a pattern
-  if (!TT.e) {
+  // Lie to autoconf when it asks stupid questions, so configure regexes
+  // that look for "GNU sed version %f" greater than some old buggy number
+  // don't fail us for not matching their narrow expectations.
+  if (FLAG_version) {
+    xprintf("This is not GNU sed version 9.0\n");
+    return;
+  }
+
+  // Need a pattern. With no -e or -f, take it from the first argument.
+  if (!TT.e && !TT.f) {
     if (!*toys.optargs) error_exit("no pattern");
     (TT.e = xzalloc(sizeof(struct arg_list)))->arg = *(args++);
   }
 
+  for (dworkin = TT.e; dworkin; dworkin = dworkin->next) {
+    chop_lines(dworkin->arg, strlen(dworkin->arg), jewel_of_judgement);
+  }
+
+  for (dworkin = TT.f; dworkin; dworkin = dworkin->next) {
+    int fd = xopen(dworkin->arg, O_RDONLY);
+
+    do_lines(fd, dworkin->arg, jewel_of_judgement);
+  }
+
   // Inflict pattern upon input files
-  loopfiles(args, do_sed);
+  loopfiles_rw(args, O_RDONLY, 0, 0, do_sed);
 }