From a7e49c3c78605f782c2611fbd2900ce756517d9f Mon Sep 17 00:00:00 2001 From: Rob Landley Date: Sat, 24 Sep 2022 03:18:03 -0500 Subject: [PATCH] Speed up grep with large numbers of patterns. Build on autodetecting -F: separate fixed patterns by first letter and traverse string checking only the patterns starting with the current character. Fixed patterns are also sorted by length, so first match is longest match and we can stop there. The "fixed" logic can handle ^ and $, single character . wildcard (not in first position), escaped known special characters, and non-unicode case insensitivity. Falls through to check any other patterns in the previous way: regexec traverses string from start to finish for each pattern and says where it found it. --- tests/grep.test | 4 ++ toys/posix/grep.c | 126 ++++++++++++++++++++++++++++++---------------- 2 files changed, 87 insertions(+), 43 deletions(-) diff --git a/tests/grep.test b/tests/grep.test index cbaf7696..726fe9f0 100755 --- a/tests/grep.test +++ b/tests/grep.test @@ -211,3 +211,7 @@ testing "-o skip zero length match" "grep -o '[0-9]*'" "1234\n" "" "a1234b" testing "--color highlights all matches" \ "grep --color=always def | grep -o '[[][0-9;]*[Km]def.[[]m' | wc -l" \ "2\n" "" "abcdefghidefjkl\n" +seq 1 100002 | base64 > testfile +testing "speed" "timeout 5 grep -f testfile testfile 2>/dev/null | wc -l" \ + "10332\n" "" "" +rm -f testfile diff --git a/toys/posix/grep.c b/toys/posix/grep.c index c75e85b1..c7e0064d 100644 --- a/toys/posix/grep.c +++ b/toys/posix/grep.c @@ -4,11 +4,8 @@ * * See http://pubs.opengroup.org/onlinepubs/9699919799/utilities/grep.html * - * Posix doesn't even specify -r, documenting deviations from it is silly. 
-* echo hello | grep -w '' -* echo '' | grep -w '' -* echo hello | grep -f next) - shoe->rc = 0; + for (shoe = (void *)TT.reg; shoe; shoe = shoe->next) shoe->rc = 0; // Loop to handle multiple matches in same line do { regmatch_t *mm = (void *)toybuf; - struct arg_list *seek, fseek; - int baseline = mm->rm_eo; - char *s = 0; + struct arg_list *seek; - mm->rm_so = mm->rm_eo = INT_MAX; + mm->rm_so = mm->rm_eo = 0; rc = 1; // Handle "fixed" (literal) matches (if any) - for (seek = TT.e; seek; seek = seek->next) { - if (FLAG(x)) { - if (!(FLAG(i) ? strcasecmp : strcmp)(seek->arg, line)) s = line; - } else if (!*seek->arg) { - if (FLAG(o)) continue; - // No need to set fseek.next because this will match every line. - seek = &fseek; - fseek.arg = s = line; - mm->rm_so = mm->rm_eo = rc = 0; - break; - } else if (FLAG(i)) s = strcasestr(start, seek->arg); - else s = memmem(start, ulen-(start-line), seek->arg, strlen(seek->arg)); - - // TODO: literal matches don't have "best match" logic, just first hit. - if (s) { - if (!rc && ((s-start) > mm->rm_so || ((s-start)==mm->rm_so && (mm->rm_so+strlen(seek->arg) < mm->rm_eo)))) continue; + if (TT.e && *start) for (ss = start; ss-linenext) { + if (*(pp = seek->arg)=='^' && !FLAG(F)) { + if (ss!=start) continue; + pp++; + } + for (ii = 1; pp[ii] && ss[ii]; ii++) { + if (!FLAG(F)) { + if (pp[ii]=='.') continue; + if (pp[ii]=='\\' && pp[ii+1]) pp++; + else if (pp[ii]=='$' && !pp[ii+1]) break; + } + if (FLAG(i)) { + if (toupper(pp[ii])!=toupper(ss[ii])) break; + } else if (pp[ii]!=ss[ii]) break; + } + if (pp[ii] && (pp[ii]!='$' || pp[ii+1] || ss[ii])) continue; + mm->rm_eo = (mm->rm_so = ss-start)+ii; rc = 0; - mm->rm_so = mm->rm_eo = (s-start); - mm->rm_eo += strlen(seek->arg); + + goto got; } + if (FLAG(x)) break; } + // Empty pattern always matches + if (rc && *TT.fixed && !FLAG(o)) rc = 0; +got: // Handle regex matches (if any) for (shoe = (void *)TT.reg; shoe; shoe = shoe->next) { // Do we need to re-check this regex? 
if (!shoe->rc) { - shoe->m.rm_so -= baseline; - shoe->m.rm_eo -= baseline; + shoe->m.rm_so -= move; + shoe->m.rm_eo -= move; if (!matched || shoe->m.rm_so<0) shoe->rc = regexec0(&shoe->r, start, ulen-(start-line), 1, &shoe->m, start==line ? 0 : REG_NOTBOL); } // If we got a match, is it a _better_ match? - if (!shoe->rc && (shoe->m.rm_so < mm->rm_so || + if (!shoe->rc && (rc || shoe->m.rm_so < mm->rm_so || (shoe->m.rm_so == mm->rm_so && shoe->m.rm_eo >= mm->rm_eo))) { mm = &shoe->m; @@ -218,7 +220,7 @@ static void do_grep(int fd, char *name) } if (!rc && FLAG(o) && !mm->rm_eo && ulen>start-line) { - start++; + move = 1; continue; } @@ -236,7 +238,7 @@ static void do_grep(int fd, char *name) if (!isalnum(c) && c != '_') c = 0; } if (c) { - start += mm->rm_so+1; + move = mm->rm_so+1; continue; } } @@ -245,7 +247,7 @@ static void do_grep(int fd, char *name) if (FLAG(o)) { if (rc) mm->rm_eo = ulen-(start-line); else if (!mm->rm_so) { - start += mm->rm_eo; + move = mm->rm_eo; continue; } else mm->rm_eo = mm->rm_so; } else { @@ -306,9 +308,8 @@ static void do_grep(int fd, char *name) } } - start += mm->rm_eo; - if (mm->rm_so == mm->rm_eo) break; - } while (*start); + if (mm->rm_so == (move = mm->rm_eo)) break; + } while (*(start += move)); offset += len; if (matched) { @@ -366,10 +367,21 @@ static void do_grep(int fd, char *name) } } +static int lensort(struct arg_list **a, struct arg_list **b) +{ + long la = strlen((*a)->arg), lb = strlen((*b)->arg); + + if (la<lb) return -1; + if (la>lb) return 1; + + return 0; +} + static void parse_regex(void) { struct arg_list *al, *new, *list = NULL, **last; - char *s, *ss; + char *s, *ss, *special = "\\.^$[()|*+?{"; + int len, ii, key; // Add all -f lines to -e list. (Yes, this is leaking allocation context for // exit to free. Not supporting nofork for this command any time soon.) 
@@ -406,8 +418,13 @@ static void parse_regex(void) // Convert to regex where appropriate for (last = &TT.e; *last;) { - for (s = (*last)->arg; *s; s++) - if (*s>127 || strchr("^.[$()|*+?{\\", *s)) break; + if ('.'!=*(s = (*last)->arg) && !FLAG(F)) for (; *s; s++) { + if (*s=='\\') { + if (!s[1] || !strchr(special, *++s)) break; + if (!FLAG(E) && *s=='(') break; + } else if (*s>127 || strchr(special+4, *s)) break; + } + if (!*s || FLAG(F)) last = &((*last)->next); else { struct reg *shoe; @@ -421,6 +438,29 @@ static void parse_regex(void) } } dlist_terminate(TT.reg); + + // Sort fixed patterns into buckets by first character + for (al = TT.e; al; al = new) { + new = al->next; + key = '^'==*al->arg; + if ('$'==al->arg[key] && !al->arg[key+1]) key = 0; + else key = al->arg[key]; + if (FLAG(i)) key = toupper(key); + al->next = TT.fixed[key]; + TT.fixed[key] = al; + } + + // Sort each fixed pattern set by length so first hit is longest match + if (TT.e) for (key = 0; key<256; key++) { + if (!TT.fixed[key]) continue; + for (len = 0, al = TT.fixed[key]; al; al = al->next) len++; + last = xmalloc(len*sizeof(void *)); + for (len = 0, al = TT.fixed[key]; al; al = al->next) last[len++] = al; + qsort(last, len, sizeof(void *), (void *)lensort); + for (ii = 0; ii<len; ii++) last[ii]->next = ii ? last[ii-1] : 0; + TT.fixed[key] = last[len-1]; + free(last); + } } static int do_grep_r(struct dirtree *new) -- 2.39.2