From b6881f66dedda832217fc7619058cf1a229de3a0 Mon Sep 17 00:00:00 2001 From: Ray Gardner Date: Thu, 22 Feb 2024 13:22:10 -0700 Subject: [PATCH] Allow nul bytes in data (WIP) Allow nul bytes in data files (not in source code yet). Still cannot have nul in multiline record (RS="") data. Work in progress. --- toys/pending/awk.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/toys/pending/awk.c b/toys/pending/awk.c index cf226c92..62518914 100644 --- a/toys/pending/awk.c +++ b/toys/pending/awk.c @@ -304,7 +304,7 @@ ssize_t getdelim(char ** restrict lineptr, size_t * restrict n, int delimiter, F // Forward ref declarations static struct zvalue *val_to_str(struct zvalue *v); -static int rx_compile(regex_t *rx, char *pat); +static void rx_compile(regex_t *rx, char *pat); //////////////////// @@ -1102,7 +1102,7 @@ static int make_literal_regex_val(char *s) { regex_t *rx; rx = xmalloc(sizeof(*rx)); - if (rx_compile(rx, s)) XERR("regex seen as '%s'\n", s); + rx_compile(rx, s); struct zvalue v = ZVINIT(ZF_RX, 0, 0); v.rx = rx; // Flag empty rx to make it easy to identify for split() special case @@ -2521,20 +2521,9 @@ static char *rx_escape_str(char *s) return s0; } -static int rx_compile(regex_t *rx, char *pat) +static void rx_compile(regex_t *rx, char *pat) { - int r; - if ((r = regcomp(rx, pat, REG_EXTENDED)) != 0) { - char errbuf[256]; - regerror(r, rx, errbuf, sizeof(errbuf)); - error_exit("regex error %d: %s on '%s' -- ", r, errbuf, pat); - } - return r; -} - -static void rx_compile_or_die(regex_t *rx, char *pat) -{ - if (rx_compile(rx, pat)) FATAL("bad regex\n"); + xregcomp(rx, pat, REG_EXTENDED); } static void rx_zvalue_compile(regex_t **rx, struct zvalue *pat) @@ -2544,7 +2533,7 @@ static void rx_zvalue_compile(regex_t **rx, struct zvalue *pat) val_to_str(pat); zvalue_dup_zstring(pat); rx_escape_str(pat->vst->str); - rx_compile_or_die(*rx, pat->vst->str); + rx_compile(*rx, pat->vst->str); } } @@ -2635,7 +2624,7 @@ static regex_t *rx_fs_prep(char *fs) if (strlen(fs) >= FS_MAX) FATAL("FS too long"); strcpy(TT.fs_last, fs); regfree(&TT.rx_last); - rx_compile_or_die(&TT.rx_last, fmt_one_char_fs(fs)); + rx_compile(&TT.rx_last, fmt_one_char_fs(fs)); return &TT.rx_last; } @@ -3352,6 +3341,17 @@ static ssize_t getrec_multiline(struct zfile *zfp) return k; } +static int rx_findx(regex_t *rx, char *s, long len, regoff_t *start, regoff_t *end, int eflags) +{ + regmatch_t matches[1]; + int r = regexec0(rx, s, len, 1, matches, eflags); + if (r == REG_NOMATCH) return r; + if (r) FATAL("regexec error"); // TODO ? use regerr() to meaningful msg + *start = matches[0].rm_so; + *end = matches[0].rm_eo; + return 0; +} + static ssize_t getrec_f(struct zfile *zfp) { int r = 0, rs = ENSURE_STR(&STACK[RS])->vst->str[0] & 0xff; @@ -3373,7 +3373,7 @@ static ssize_t getrec_f(struct zfile *zfp) if (!zfp->endoffs) break; } TT.rgl.recptr = zfp->recbuf + zfp->recoffs; - r = rx_find(rsrxp, TT.rgl.recptr, &so, &eo, REG_NOTBOL | REG_NOTEOL); + r = rx_findx(rsrxp, TT.rgl.recptr, zfp->endoffs - zfp->recoffs, &so, &eo, 0); // if not found, or found "near" end of buffer... if (r || zfp->recoffs + eo > (int)zfp->recbufsize - RS_LENGTH_MARGIN) { // if at end of data, and (not found or found at end of data) @@ -4483,9 +4483,9 @@ static void run(int optind, int argc, char **argv, char *sepstring, char *printf_fmt_rx = "%[-+ #0']*([*]|[0-9]*)([.]([*]|[0-9]*))?[aAdiouxXfFeEgGcs%]"; init_globals(optind, argc, argv, sepstring, assign_args, envp); TT.cfile = xzalloc(sizeof(struct zfile)); - rx_compile_or_die(&TT.rx_default, "[ \t\n]+"); - rx_compile_or_die(&TT.rx_last, "[ \t\n]+"); - rx_compile_or_die(&TT.rx_printf_fmt, printf_fmt_rx); + rx_compile(&TT.rx_default, "[ \t\n]+"); + rx_compile(&TT.rx_last, "[ \t\n]+"); + rx_compile(&TT.rx_printf_fmt, printf_fmt_rx); new_file("-", stdin, 'r', 'f')->is_std_file = 1; new_file("/dev/stdin", stdin, 'r', 'f')->is_std_file = 1; new_file("/dev/stdout", stdout, 'w', 'f')->is_std_file = 1; -- 2.39.2