From baca3a4ac6f7aab27dcf6a3c0fab8de6449c3700 Mon Sep 17 00:00:00 2001 From: Ray Gardner Date: Thu, 1 Feb 2024 15:54:16 -0700 Subject: [PATCH] Update Makefile Mod Makefile to keep mono.c and toys/pending/awk.c after 'make clean'; add those sources to repo --- toys/pending/awk.c | 4551 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 4551 insertions(+) create mode 100644 toys/pending/awk.c diff --git a/toys/pending/awk.c b/toys/pending/awk.c new file mode 100644 index 00000000..db2dec37 --- /dev/null +++ b/toys/pending/awk.c @@ -0,0 +1,4551 @@
/* awk.c - An awk implementation.
 * vi: tabstop=2 softtabstop=2 shiftwidth=2
 *
 * Copyright 2024 Ray Gardner
 *
 * See https://pubs.opengroup.org/onlinepubs/9699919799/utilities/awk.html

USE_AWK(NEWTOY(awk, "F:v*f*c", TOYFLAG_USR|TOYFLAG_BIN))

config AWK
  bool "awk"
  default n
  help
    usage: awk [-F sepstring] [-v assignment]... program [argument...]
      or:
        awk [-F sepstring] -f progfile [-f progfile]... [-v assignment]...
          [argument...]
      also:
      -c : compile only, do not run
*/

#define FOR_awk
#include "toys.h"

// Per-command global state, reached through TT.<member> in the code below.
GLOBALS(
  // Option arguments, named after the option letters in the NEWTOY() string.
  struct arg_list *f;
  struct arg_list *v;
  char *F;

  // Lexical scanner state: where program text comes from plus the latest token.
  struct scanner_state {
    char *p;                      // next program character to read
    char *progstring;             // program text given as a string, if any
    struct arg_list *prog_args;   // remaining program files to open
    char *filename;               // name of the program file being read
    char *line;                   // getline() buffer for the current line
    size_t line_size;
    ssize_t line_len;
    int line_num;
    int ch;                       // current character (or EOF)
    FILE *fp;
    // state includes latest token seen
    int tok;
    int tokbuiltin;
    int toktype;
    char *tokstr;
    size_t maxtok;
    size_t toklen;
    double numval;
    int error;  // Set if lexical error.
  } *scs;
  char *tokstr;
  int prevtok;

  struct compiler_globals {
    int in_print_stmt;
    int paren_level;
    int in_function_body;
    int funcnum;
    int nparms;
    int compile_error_count;
    int first_begin;
    int last_begin;
    int first_end;
    int last_end;
    int first_recrule;
    int last_recrule;
    int break_dest;
    int continue_dest;
    int stack_offset_to_fix;  // fixup stack if return in for(e in a)
    int range_pattern_num;
    int rule_type;  // tkbegin, tkend, or 0
  } cgl;

  // zvalue: the main awk value type
  // Can be number or string or both, or else map (array) or regex
  struct zvalue {
    unsigned flags;
    double num;
    union { // anonymous union not in C99; not going to fix it now.
      struct zstring *vst;
      struct zmap *map;
      regex_t *rx;
    };
  } nozvalue;   // to shut up compiler warning TODO FIXME

  struct runtime_globals {
    struct zvalue cur_arg;
    //char *filename;     // UNUSED
    FILE *fp;           // current data file
    int narg;           // cmdline arg index
    int nfiles;         // num of cmdline data file args processed
    int eof;            // all cmdline files (incl. stdin) read
    char *recptr;
    char *recbuf;
    size_t recbufsize;
    char *recbuf_multx;
    size_t recbufsize_multx;
    struct zstring *zspr;      // Global to receive sprintf() string value
  } rgl;

  // Expanding sequential list
  struct zlist {
    char *base, *limit, *avail;
    size_t size;
  } globals_table,  // global symbol table
    locals_table,     // local symbol table
    func_def_table;  // function symbol table
  // runtime lists
  struct zlist literals, fields, zcode, stack;

  char *progname;

  int spec_var_limit;
  int zcode_last;
  int stkptr;

  char *pbuf;   // Used for number formatting in num_to_zstring()
#define RS_MAX  64
  char rs_last[RS_MAX];
  regex_t rx_rs_default, rx_rs_last;
  regex_t rx_default, rx_last, rx_printf_fmt;
#define FS_MAX  64
  char fs_last[FS_MAX];
  char one_char_fs[4];
  int nf_internal;  // should match NF
  char range_sw[64];  // FIXME TODO quick and dirty set of range switches
  int file_cnt, std_file_cnt;

  // One entry per open file or pipe, kept in a singly linked list.
  struct zfile {
    struct zfile *next;
    char *fn;
    FILE *fp;
    char mode;  // w, a, or r
    char file_or_pipe;  // f or p
    char is_std_file;
    char *recbuf;
    size_t recbufsize;
    char *recbuf_multi;
    size_t recbufsize_multi;
    char *recbuf_multx;
    size_t recbufsize_multx;
    int recoffs, endoffs;
  } *zfiles, *cfile, *zstdout;
)

// Marks an intentional switch fallthrough; expands to nothing without GNU C.
#ifdef __GNUC__
#define ATTR_FALLTHROUGH_INTENDED __attribute__ ((fallthrough))
#else
#define ATTR_FALLTHROUGH_INTENDED
#endif

////////////////////
////   declarations
////////////////////

#define PBUFSIZE  512 // For num_to_zstring()

// Coarse token classes stored in scanner_state.toktype.
enum toktypes {
    // EOF (use -1 from stdio.h)
    ERROR = 2, NEWLINE, VAR, NUMBER, STRING, REGEX, USERFUNC, BUILTIN, TOKEN,
    KEYWORD
    };

// Must align with lbp_table[]
enum tokens {
    tkunusedtoken, tkeof, tkerr, tknl,
    tkvar, tknumber, tkstring, tkregex, tkfunc, tkbuiltin,

// static char *ops = " ; , [ ] ( ) { } $ ++ -- ^ ! * / % + - "
//        "< <= != == > >= ~ !~ && || ? : ^= %= *= /= += -= = >> | ";
    tksemi, tkcomma, tklbracket, tkrbracket, tklparen, tkrparen, tklbrace,
    tkrbrace, tkfield, tkincr, tkdecr, tkpow, tknot, tkmul, tkdiv, tkmod,
    tkplus, tkminus,
    tkcat, // !!! Fake operator for concatenation (just adjacent string exprs)
    tklt, tkle, tkne, tkeq, tkgt, tkge, tkmatchop, tknotmatch, tkand, tkor,
    tkternif, tkternelse, tkpowasgn, tkmodasgn, tkmulasgn, tkdivasgn,
    tkaddasgn, tksubasgn, tkasgn, tkappend, tkpipe,

// static char *keywords = " in BEGIN END if else "
//    "while for do break continue exit function "
//    "return next nextfile delete print printf getline ";
    tkin, tkbegin, tkend, tkif, tkelse,
    tkwhile, tkfor, tkdo, tkbreak, tkcontinue, tkexit, tkfunction,
    tkreturn, tknext, tknextfile, tkdelete, tkprint, tkprintf, tkgetline,

// static char *builtins = " atan2 cos sin exp "
//    "log sqrt int rand srand length "
//    "tolower toupper system fflush ";
    tkatan2, tkcos, tksin, tkexp, tklog, tksqrt, tkint, tkrand, tksrand,
    tklength, tktolower, tktoupper, tksystem, tkfflush,

// static char *specialfuncs = " close index match split "
//    "sub gsub sprintf substr ";
    tkclose, tkindex, tkmatch, tksplit,
    tksub, tkgsub, tksprintf, tksubstr, tklasttk
    };

// Opcode numbers start at tklasttk, so they never collide with token numbers.
enum opcodes {
    opunusedop = tklasttk,
    opvarref, opmapref, opfldref, oppush, opdrop, opdrop_n, opnotnot,
    oppreincr, oppredecr, oppostincr, oppostdecr, opnegate, opjump, opjumptrue,
    opjumpfalse, opprepcall, opmap, opmapiternext, opmapdelete, opmatchrec,
    opquit, opprintrec, oprange1, oprange2, oprange3, oplastop
};

// Special variables (POSIX). Must align with char *spec_vars[]
enum spec_var_names { ARGC=1, ARGV, CONVFMT, ENVIRON, FILENAME, FNR, FS, NF,
    NR, OFMT, OFS, ORS, RLENGTH, RS, RSTART, SUBSEP };

struct symtab_slot {    // global symbol table entry
  unsigned flags;
  int slotnum;
  char *name;
};

// zstring: flexible string type.
// Capacity must be > size because we insert a NUL byte.
+struct zstring { + int refcnt; + unsigned size; + unsigned capacity; + char str[]; // C99 flexible array member +}; + +// Flag bits for zvalue and symbol tables +#define ZF_MAYBEMAP (1u << 1) +#define ZF_MAP (1u << 2) +#define ZF_SCALAR (1u << 3) +#define ZF_NUM (1u << 4) +#define ZF_RX (1u << 5) +#define ZF_STR (1u << 6) +#define ZF_NUMSTR (1u << 7) // "numeric string" per posix +#define ZF_REF (1u << 9) // for lvalues +#define ZF_MAPREF (1u << 10) // for lvalues +#define ZF_FIELDREF (1u << 11) // for lvalues +#define ZF_EMPTY_RX (1u << 12) +#define ZF_ANYMAP (ZF_MAP | ZF_MAYBEMAP) + +// Macro to help facilitate possible future change in zvalue layout. +#define ZVINIT(flags, num, ptr) {(flags), (double)(num), {(ptr)}} + +#define IS_STR(zvalp) ((zvalp)->flags & ZF_STR) +#define IS_RX(zvalp) ((zvalp)->flags & ZF_RX) +#define IS_NUM(zvalp) ((zvalp)->flags & ZF_NUM) +#define IS_MAP(zvalp) ((zvalp)->flags & ZF_MAP) +#define IS_EMPTY_RX(zvalp) ((zvalp)->flags & ZF_EMPTY_RX) + +#define GLOBAL ((struct symtab_slot *)TT.globals_table.base) +#define LOCAL ((struct symtab_slot *)TT.locals_table.base) +#define FUNC_DEF ((struct functab_slot *)TT.func_def_table.base) + +#define LITERAL ((struct zvalue *)TT.literals.base) +#define STACK ((struct zvalue *)TT.stack.base) +#define FIELD ((struct zvalue *)TT.fields.base) + +#define ZCODE ((int *)TT.zcode.base) + +#define FUNC_DEFINED (1u) +#define FUNC_CALLED (2u) +struct functab_slot { // function symbol table entry + unsigned flags; + int slotnum; + char *name; + struct zlist function_locals; + int zcode_addr; +}; + +// Elements of the hash table (key/value pairs) +struct zmap_slot { + int hash; // store hash key to speed hash table expansion + struct zstring *key; + struct zvalue val; +}; +#define ZMSLOTINIT(hash, key, val) {hash, key, val} + +// zmap: Mapping data type for arrays; a hash table. 
Values in hash are either +// 0 (unused), -1 (marked deleted), or one plus the number of the zmap slot +// containing a key/value pair. The zlist slot entries are numbered from 0 to +// count-1, so need to add one to distinguish from unused. The probe sequence +// is borrowed from Python dict, using the "perturb" idea to mix in upper bits +// of the original hash value. +struct zmap { + unsigned mask; // tablesize - 1; tablesize is 2 ** n + int *hash; // (mask + 1) elements + int limit; // 80% of table size ((mask+1)*8/10) + int count; // number of occupied slots in hash + int deleted; // number of deleted slots + struct zlist slot; // expanding list of zmap_slot elements +}; + +#define MAPSLOT ((struct zmap_slot *)(m->slot).base) +#define FFATAL(format, ...) zzerr("$" format, __VA_ARGS__) +#define FATAL(...) zzerr("$%s\n", __VA_ARGS__) +#define XERR(format, ...) zzerr(format, __VA_ARGS__) + +#define NO_EXIT_STATUS (9999987) // value unlikely to appear in exit stmt + +ssize_t getline(char **lineptr, size_t *n, FILE *stream); +ssize_t getdelim(char ** restrict lineptr, size_t * restrict n, int delimiter, FILE *stream); + + +// Forward ref declarations +static struct zvalue *val_to_str(struct zvalue *v); +static int rx_compile(regex_t *rx, char *pat); + + +//////////////////// +//// lib +//////////////////// + +static void xfree(void *p) +{ + free(p); +} + +static double str_to_num(char *s) +{ + setlocale(LC_NUMERIC, ""); + return atof(s); +} + +static int hexval(int c) +{ + // Assumes c is valid hex digit + return isdigit(c) ? c - '0' : (c | 040) - 'a' + 10; +} + +//////////////////// +//// common defs +//////////////////// + +// These (ops, keywords, builtins) must align with enum tokens +static char *ops = " ; , [ ] ( ) { } $ ++ -- ^ ! * / % + - .. " + "< <= != == > >= ~ !~ && || ? 
: ^= %= *= /= += -= = >> | "; + +static char *keywords = " in BEGIN END if else " + "while for do break continue exit function " + "return next nextfile delete print printf getline "; + +static char *builtins = " atan2 cos sin exp log " + "sqrt int rand srand length " + "tolower toupper system fflush " + "close index match split " + "sub gsub sprintf substr "; + +static void zzerr(char *format, ...) +{ + va_list args; + int fatal_sw = 0; + fprintf(stderr, "%s: ", TT.progname); + if (format[0] == '$') { + fprintf(stderr, "FATAL: "); + format++; + fatal_sw = 1; + } + fprintf(stderr, "file %s line %d: ", TT.scs->filename, TT.scs->line_num); + va_start(args, format); + vfprintf(stderr, format, args); + va_end(args); + if (format[strlen(format)-1] != '\n') fputc('\n', stderr); // TEMP FIXME !!! + fflush(stderr); + if (fatal_sw) xexit(); + // Don't bump error count for warnings + else if (!strstr(format, "arning")) TT.cgl.compile_error_count++; +} + +static void get_token_text(char *op, int tk) +{ + // This MUST ? be changed if ops string or tk... assignments change! + memmove(op, ops + 3 * (tk - tksemi) + 1, 2); + op[ op[1] == ' ' ? 
1 : 2 ] = 0; +} + +//////////////////// +//// zlist +//////////////////// + +static struct zlist *zlist_initx(struct zlist *p, size_t size, size_t count) +{ + p->base = p->avail = xzalloc(count * size); + p->limit = p->base + size * count; + p->size = size; + return p; +} + +static struct zlist *zlist_init(struct zlist *p, size_t size) +{ +#define SLIST_MAX_INIT_BYTES 128 + return zlist_initx(p, size, SLIST_MAX_INIT_BYTES / size); +} + +static void zlist_expand(struct zlist *p) +{ + size_t offset = p->avail - p->base; + size_t cap = p->limit - p->base; + size_t newcap = maxof(cap + p->size, ((cap / p->size) * 3 / 2) * p->size); + if (newcap <= cap) error_exit("bad memory request.\n"); + char *base = xrealloc(p->base, newcap); + p->base = base; + p->limit = base + newcap; + p->avail = base + offset; +} + +static size_t zlist_append(struct zlist *p, void *obj) +{ + // Insert obj (p->size bytes) at end of list, expand as needed. + // Return scaled offset to newly inserted obj; i.e. the + // "slot number" 0, 1, 2,... + void *objtemp = 0; + if (p->avail > p->limit - p->size) { + objtemp = xmalloc(p->size); // Copy obj in case it is in + memmove(objtemp, obj, p->size); // the area realloc might free! + obj = objtemp; + zlist_expand(p); + } + memmove(p->avail, obj, p->size); + if (objtemp) xfree(objtemp); + p->avail += p->size; + return (p->avail - p->base - p->size) / p->size; // offset of updated slot +} + +static int zlist_len(struct zlist *p) +{ + return (p->avail - p->base) / p->size; +} + +//////////////////// +//// zstring +//////////////////// + +static void zstring_release(struct zstring **s) +{ + if (*s && (**s).refcnt-- == 0) xfree(*s); //free_zstring(s); + *s = 0; +} + +static void zstring_incr_refcnt(struct zstring *s) +{ + if (s) s->refcnt++; +} + +static struct zstring *new_zstring_cap(int capacity) +{ + struct zstring *z = xzalloc(sizeof(*z) + capacity); + z->capacity = capacity; + return z; +} + +// !! Use only if 'to' is NULL or its refcnt is 0. 
+static struct zstring *zstring_modify(struct zstring *to, size_t at, char *s, size_t n) +{ + size_t cap = at + n + 1; + if (!to || to->capacity < cap) { + to = xrealloc(to, sizeof(*to) + cap); + to->capacity = cap; + to->refcnt = 0; + } + memcpy(to->str + at, s, n); + to->size = at + n; + to->str[to->size] = '\0'; + return to; +} + +// The 'to' pointer may move by realloc, so return (maybe updated) pointer. +// If refcnt is nonzero then there is another pointer to this zstring, +// so copy this one and release it. If refcnt is zero we can mutate this. +static struct zstring *zstring_update(struct zstring *to, size_t at, char *s, size_t n) +{ + if (to && to->refcnt) { + struct zstring *to_before = to; + to = zstring_modify(0, 0, to->str, to->size); + zstring_release(&to_before); + } + return zstring_modify(to, at, s, n); +} + +static struct zstring *zstring_copy(struct zstring *to, struct zstring *from) +{ + return zstring_update(to, 0, from->str, from->size); +} + +static struct zstring *zstring_extend(struct zstring *to, struct zstring *from) +{ + return zstring_update(to, to->size, from->str, from->size); +} + +static struct zstring *new_zstring(char *s, size_t size) +{ + return zstring_modify(0, 0, s, size); +} + +//////////////////// +//// zvalue +//////////////////// + +static struct zvalue uninit_zvalue = ZVINIT(0, 0.0, 0); + +// This will be reassigned in init_globals() with an empty string. +// It's a special value used for "uninitialized" field vars +// referenced past $NF. See push_field(). +static struct zvalue uninit_string_zvalue = ZVINIT(0, 0.0, 0); + +static struct zvalue new_str_val(char *s) +{ + // Only if no nul inside string! + struct zvalue v = ZVINIT(ZF_STR, 0.0, new_zstring(s, strlen(s))); + return v; +} + +static void zvalue_release_zstring(struct zvalue *v) +{ + if (v && ! 
(v->flags & (ZF_ANYMAP | ZF_RX))) zstring_release(&v->vst); +} + +static size_t zlist_append_zvalue(struct zlist *p, struct zvalue *v) +{ + struct zvalue vtemp; + if (p->avail > p->limit - sizeof(*v)) { + vtemp = *v; + v = &vtemp; + zlist_expand(p); + } + *(struct zvalue *)p->avail = *v; + p->avail += p->size; + return (p->avail - p->base - p->size) / p->size; // offset of updated slot +} + +// push_val() is used for initializing globals (see init_compiler()) +// but mostly used in runtime +// WARNING: push_val may change location of v, so do NOT depend on it after! +// Note the incr refcnt used to be after the zlist_append, but that caused a +// heap-use-after-free error when the zlist_append relocated the zvalue being +// pushed, invalidating the v pointer. +static void push_val(struct zvalue *v) +{ + if (IS_STR(v)) zstring_incr_refcnt(v->vst); + TT.stkptr = zlist_append_zvalue(&TT.stack, v); +} + +static void zvalue_copy(struct zvalue *to, struct zvalue *from) +{ + if (IS_RX(from)) *to = *from; + else { + zvalue_release_zstring(to); + *to = *from; + zstring_incr_refcnt(to->vst); + } +} + +static void zvalue_dup_zstring(struct zvalue *v) +{ + struct zstring *z = new_zstring(v->vst->str, v->vst->size); + zstring_release(&v->vst); + v->vst = z; +} + +//////////////////// +//// zmap (array) implementation +//////////////////// + +static int zstring_match(struct zstring *a, struct zstring *b) +{ + return a->size == b->size && memcmp(a->str, b->str, a->size) == 0; +} + +static int zstring_hash(struct zstring *s) +{ // djb2 -- small, fast, good enough for this + unsigned h = 5381; + for (size_t k = 0; k < s->size; k++) + h = ((h << 5) + h) + s->str[k]; + return h; +} + +enum { PSHIFT = 5 }; // "perturb" shift -- see find_mapslot() below + +static struct zmap_slot *find_mapslot(struct zmap *m, struct zstring *key, int *hash, int *probe) +{ + struct zmap_slot *x = 0; + *hash = zstring_hash(key); + unsigned perturb = *hash; + *probe = *hash & m->mask; + int n; + while ((n 
= m->hash[*probe])) { + if (n > 0) { + x = &MAPSLOT[n-1]; + if (*hash == x->hash && zstring_match(key, x->key)) { + return x; + } + } + // Based on technique in Python dict implementation. Comment there + // (https://github.com/python/cpython/blob/3.10/Objects/dictobject.c) + // says + // + // j = ((5*j) + 1) mod 2**i + // For any initial j in range(2**i), repeating that 2**i times generates + // each int in range(2**i) exactly once (see any text on random-number + // generation for proof). + // + // The addition of 'perturb' greatly improves the probe sequence. See + // the Python dict implementation for more details. + *probe = (*probe * 5 + 1 + (perturb >>= PSHIFT)) & m->mask; + } + return 0; +} + +static struct zvalue *zmap_find(struct zmap *m, struct zstring *key) +{ + int hash, probe; + struct zmap_slot *x = find_mapslot(m, key, &hash, &probe); + return x ? &x->val : 0; +} + +static void zmap_init(struct zmap *m) +{ + enum {INIT_SIZE = 8}; + m->mask = INIT_SIZE - 1; + m->hash = xzalloc(INIT_SIZE * sizeof(*m->hash)); + m->limit = INIT_SIZE * 8 / 10; + m->count = 0; + m->deleted = 0; + zlist_init(&m->slot, sizeof(struct zmap_slot)); +} + +static void zvalue_map_init(struct zvalue *v) +{ + struct zmap *m = xmalloc(sizeof(*m)); + zmap_init(m); + v->map = m; + v->flags |= ZF_MAP; +} + +static void zmap_delete_map_incl_slotdata(struct zmap *m) +{ + for (struct zmap_slot *p = &MAPSLOT[0]; p < &MAPSLOT[zlist_len(&m->slot)]; p++) { + if (p->key) zstring_release(&p->key); + if (p->val.vst) zstring_release(&p->val.vst); + } + xfree(m->slot.base); + xfree(m->hash); +} + +static void zmap_delete_map(struct zmap *m) +{ + zmap_delete_map_incl_slotdata(m); + zmap_init(m); +} + +static void zmap_rehash(struct zmap *m) +{ + // New table is twice the size of old. + int size = m->mask + 1; + unsigned mask = 2 * size - 1; + int *h = xzalloc(2 * size * sizeof(*m->hash)); + // Step through the old hash table, set up location in new table. 
+ for (int i = 0; i < size; i++) { + int n = m->hash[i]; + if (n > 0) { + int hash = MAPSLOT[n-1].hash; + unsigned perturb = hash; + int p = hash & mask; + while (h[p]) { + p = (p * 5 + 1 + (perturb >>= PSHIFT)) & mask; + } + h[p] = n; + } + } + m->mask = mask; + xfree(m->hash); + m->hash = h; + m->limit = 2 * size * 8 / 10; +} + +static struct zmap_slot *zmap_find_or_insert_key(struct zmap *m, struct zstring *key) +{ + int hash, probe; + struct zmap_slot *x = find_mapslot(m, key, &hash, &probe); + if (x) return x; + // not found; insert it. + if (m->count == m->limit) { + zmap_rehash(m); // rehash if getting too full. + // rerun find_mapslot to get new probe index + x = find_mapslot(m, key, &hash, &probe); + } + // Assign key to new slot entry and bump refcnt. + struct zmap_slot zs = ZMSLOTINIT(hash, key, (struct zvalue)ZVINIT(0, 0.0, 0)); + zstring_incr_refcnt(key); + int n = zlist_append(&m->slot, &zs); + m->count++; + m->hash[probe] = n + 1; + return &MAPSLOT[n]; +} + +static void zmap_delete(struct zmap *m, struct zstring *key) +{ + int hash, probe; + struct zmap_slot *x = find_mapslot(m, key, &hash, &probe); + if (!x) return; + zstring_release(&MAPSLOT[m->hash[probe] - 1].key); + m->hash[probe] = -1; + m->deleted++; +} + +//////////////////// +//// scan (lexical analyzer) +//////////////////// + +// TODO: +// IS line_num getting incr correctly? Newline counts as start of line!? +// Handle nuls in file better. +// Open files "rb" and handle CRs in program. +// Roll gch() into get_char() ? +// Deal with signed char (at EOF? elsewhere?) +// +// 2023-01-11: Allow nul bytes inside strings? regexes? 
+ +static void progfile_open(void) +{ + TT.scs->filename = TT.scs->prog_args->arg; + TT.scs->prog_args = TT.scs->prog_args->next; + TT.scs->fp = stdin; + if (strcmp(TT.scs->filename, "-")) TT.scs->fp = fopen(TT.scs->filename, "r"); + if (!TT.scs->fp) error_exit("Can't open %s.\n", TT.scs->filename); + TT.scs->line_num = 0; +} + +static int get_char(void) +{ + static char *nl = "\n"; + // On first entry, TT.scs->p points to progstring if any, or null string. + for (;;) { + int c = *(TT.scs->p)++; + if (c) { + return c; + } + if (TT.scs->progstring) { // Fake newline at end of progstring. + if (TT.scs->progstring == nl) return EOF; + TT.scs->p = TT.scs->progstring = nl; + continue; + } + // Here if getting from progfile(s). + if (TT.scs->line == nl) return EOF; + if (!TT.scs->fp) { + progfile_open(); + // The " " + 1 is to set p to null string but allow ref to prev char for + // "lastchar" test below. + } + // Save last char to allow faking final newline. + int lastchar = (TT.scs->p)[-2]; + TT.scs->line_len = getline(&TT.scs->line, &TT.scs->line_size, TT.scs->fp); + if (TT.scs->line_len > 0) { + TT.scs->line_num++; + TT.scs->p = TT.scs->line; + continue; + } + // EOF + // FIXME TODO or check for error? feof() vs. ferror() + fclose(TT.scs->fp); + TT.scs->fp = 0; + TT.scs->p = " " + 2; + if (!TT.scs->prog_args) { + xfree(TT.scs->line); + if (lastchar == '\n') return EOF; + // Fake final newline + TT.scs->line = TT.scs->p = nl; + } + } +} + +static void append_this_char(int c) +{ + if (TT.scs->toklen == TT.scs->maxtok - 1) { + TT.scs->maxtok *= 2; + TT.scs->tokstr = xrealloc(TT.scs->tokstr, TT.scs->maxtok); + } + TT.scs->tokstr[TT.scs->toklen++] = c; + TT.scs->tokstr[TT.scs->toklen] = 0; +} + +static void gch(void) +{ + // FIXME probably not right place to skip CRs. 
+ do { + TT.scs->ch = get_char(); + } while (TT.scs->ch == '\r'); +} + +static void append_char(void) +{ + append_this_char(TT.scs->ch); + gch(); +} + +static int find_keyword_or_builtin(char *table, + int first_tok_in_table) +{ + char s[16] = " ", *p; + // keywords and builtin functions are spaced 10 apart for strstr() lookup, + // so must be less than that long. + if (TT.scs->toklen >= 10) return 0; + strcat(s, TT.scs->tokstr); + strcat(s, " "); + p = strstr(table, s); + if (!p) return 0; + return first_tok_in_table + (p - table) / 10; +} + +static int find_token(void) +{ + char s[6] = " ", *p; + // tokens are spaced 3 apart for strstr() lookup, so must be less than + // that long. + strcat(s, TT.scs->tokstr); + strcat(s, " "); + p = strstr(ops, s); + if (!p) return 0; + return tksemi + (p - ops) / 3; +} + +static int find_keyword(void) +{ + return find_keyword_or_builtin(keywords, tkin); +} + +static int find_builtin(void) +{ + return find_keyword_or_builtin(builtins, tkatan2); +} + +static void get_number(void) +{ + // Assumes TT.scs->ch is digit or dot on entry. + // TT.scs->p points to the following character. + // OK formats: 1 1. 1.2 1.2E3 1.2E+3 1.2E-3 1.E2 1.E+2 1.E-2 1E2 .1 .1E2 + // .1E+2 .1E-2 + // NOT OK: . .E .E1 .E+ .E+1 ; 1E .1E 1.E 1.E+ 1.E- parse as number + // followed by variable E. + // gawk accepts 12.E+ and 12.E- as 12; nawk & mawk say syntax error. + char *leftover; + int len; + TT.scs->numval = strtod(TT.scs->p - 1, &leftover); + len = leftover - TT.scs->p + 1; + if (len == 0) { + append_char(); + TT.scs->toktype = ERROR; + TT.scs->tok = tkerr; + TT.scs->error = 1; + FFATAL("Unexpected token '%s'\n", TT.scs->tokstr); + return; + } + while (len--) + append_char(); +} + +static void get_string_or_regex(int endchar) +{ + gch(); + while (TT.scs->ch != endchar) { + if (TT.scs->ch == '\n') { + // FIXME Handle unterminated string or regex. Is this OK? + // FIXME TODO better diagnostic here? 
+ XERR("%s\n", "unterminated string or regex"); + break; + } else if (TT.scs->ch == '\\') { + // \\ \a \b \f \n \r \t \v \" \/ \ddd + char *p, *escapes = "\\abfnrtv\"/"; + gch(); + if (TT.scs->ch == '\n') { // backslash newline is continuation + gch(); + continue; + } else if ((p = strchr(escapes, TT.scs->ch))) { + // posix regex does not use these escapes, + // but awk does, so do them. + int c = "\\\a\b\f\n\r\t\v\"/"[p-escapes]; + append_this_char(c); + // Need to double up \ inside literal regex + if (endchar == '/' && c == '\\') append_this_char('\\'); + gch(); + } else if (TT.scs->ch == 'x') { + gch(); + if (isxdigit(TT.scs->ch)) { + int c = hexval(TT.scs->ch); + gch(); + if (isxdigit(TT.scs->ch)) { + c = c * 16 + hexval(TT.scs->ch); + gch(); + } + append_this_char(c); + } else append_this_char('x'); + } else if (isdigit(TT.scs->ch)) { + if (TT.scs->ch < '8') { + int k, c = 0; + for (k = 0; k < 3; k++) { + if (isdigit(TT.scs->ch) && TT.scs->ch < '8') { + c = c * 8 + TT.scs->ch - '0'; + gch(); + } else + break; + } + append_this_char(c); + } else { + append_char(); + } + } else { + if (endchar == '/') { + // pass \ unmolested if not awk escape, + // so that regex routines can see it. + if (!strchr(".[]()*+?{}|^$-", TT.scs->ch)) { + XERR("warning: '\\%c' -- unknown regex escape\n", TT.scs->ch); + } + append_this_char('\\'); + } else { + XERR("warning: '\\%c' treated as plain '%c'\n", TT.scs->ch, TT.scs->ch); + } + } + } else if (TT.scs->ch == EOF) { + FATAL("EOF in string or regex\n"); + } else { + append_char(); + } + } + gch(); +} + +static void ascan_opt_div(int div_op_allowed_here) +{ + int n; + for (;;) { + TT.scs->tokbuiltin = 0; + TT.scs->toklen = 0; + TT.scs->tokstr[0] = 0; + while (TT.scs->ch == ' ' || TT.scs->ch == '\t') + gch(); + if (TT.scs->ch == '\\') { + append_char(); + if (TT.scs->ch == '\n') { + gch(); + continue; + } + TT.scs->toktype = ERROR; // \ not last char in line. 
+ TT.scs->tok = tkerr; + TT.scs->error = 3; + FATAL("backslash not last char in line\n"); + return; + } + break; + } + // Note \ in comment does not continue it. + if (TT.scs->ch == '#') { + gch(); + while (TT.scs->ch != '\n') + gch(); + // Need to fall through here to pick up newline. + } + if (TT.scs->ch == '\n') { + TT.scs->toktype = NEWLINE; + TT.scs->tok = tknl; + append_char(); + } else if (isalpha(TT.scs->ch) || TT.scs->ch == '_') { + append_char(); + while (isalnum(TT.scs->ch) || TT.scs->ch == '_') { + append_char(); + } + if ((n = find_keyword()) != 0) { + TT.scs->toktype = KEYWORD; + TT.scs->tok = n; + } else if ((n = find_builtin()) != 0) { + TT.scs->toktype = BUILTIN; + TT.scs->tok = tkbuiltin; + TT.scs->tokbuiltin = n; + } else if ((TT.scs->ch == '(')) { + TT.scs->toktype = USERFUNC; + TT.scs->tok = tkfunc; + } else { + TT.scs->toktype = VAR; + TT.scs->tok = tkvar; + // skip whitespace to be able to check for , or ) + while (TT.scs->ch == ' ' || TT.scs->ch == '\t') + gch(); + } + return; + } else if (TT.scs->ch == '"') { + TT.scs->toktype = STRING; + TT.scs->tok = tkstring; + get_string_or_regex('"'); + } else if (isdigit(TT.scs->ch) || TT.scs->ch == '.') { + TT.scs->toktype = NUMBER; + TT.scs->tok = tknumber; + get_number(); + } else if (TT.scs->ch == '/' && ! div_op_allowed_here) { + TT.scs->toktype = REGEX; + TT.scs->tok = tkregex; + get_string_or_regex('/'); + } else if (TT.scs->ch == EOF) { + TT.scs->toktype = EOF; + TT.scs->tok = tkeof; + } else if (TT.scs->ch == '\0') { + append_char(); + TT.scs->toktype = ERROR; + TT.scs->tok = tkerr; + TT.scs->error = 5; + FATAL("null char\n"); + } else { + // All other tokens. 
+ TT.scs->toktype = TT.scs->ch; + append_char(); + // Special case for **= and ** tokens + if (TT.scs->toktype == '*' && TT.scs->ch == '*') { + append_char(); + if (TT.scs->ch == '=') { + append_char(); + TT.scs->tok = tkpowasgn; + } else TT.scs->tok = tkpow; + TT.scs->toktype = TT.scs->tok + 200; + return; + } + // Is it a 2-character token? + if (TT.scs->ch != ' ' && TT.scs->ch != '\n') { + append_this_char(TT.scs->ch); + if (find_token()) { + TT.scs->tok = find_token(); + TT.scs->toktype = TT.scs->tok + 200; + gch(); // Eat second char of token. + return; + } + TT.scs->toklen--; // Not 2-character token; back off. + TT.scs->tokstr[TT.scs->toklen] = 0; + } + TT.scs->tok = find_token(); + if (TT.scs->tok) return; + TT.scs->toktype = ERROR; + TT.scs->tok = tkerr; + TT.scs->error = 4; + FFATAL("Unexpected token '%s'\n", TT.scs->tokstr); + } +} + +static void scan_opt_div(int div_op_allowed_here) +{ + // TODO FIXME need better diags for bad tokens! + // TODO Also set global syntax error flag. + do ascan_opt_div(div_op_allowed_here); while (TT.scs->tok == tkerr); +} + +static void init_scanner(void) +{ + TT.prevtok = tkeof; + gch(); +} + +// POSIX says '/' does not begin a regex wherever '/' or '/=' can mean divide. 
+// Pretty sure if / or /= comes after these, it means divide: +static char div_preceders[] = {tknumber, tkstring, tkvar, tkgetline, tkrparen, tkrbracket, tkincr, tkdecr, 0}; + +// For checking end of prev statement for termination and if '/' can come next + +static void scan(void) +{ + TT.prevtok = TT.scs->tok; + if (TT.prevtok && strchr(div_preceders, TT.prevtok)) scan_opt_div(1); + else scan_opt_div(0); + TT.tokstr = TT.scs->tokstr; +} + +//////////////////// +//// compile +//////////////////// + +// NOTES: +// NL ok after , { && || do else OR after right paren after if/while/for +// TODO: +// see case tkgetline -- test more +// case tkmatchop, tknotmatch -- fix ~ (/re/) + +// Forward declarations -- for mutually recursive parsing functions +static int exprn(int rbp); +static void lvalue(void); +static int primary(void); +static void stmt(void); +static void action(int action_type); + +#define CURTOK() (TT.scs->tok) +#define ISTOK(toknum) (TT.scs->tok == (toknum)) + +static int havetok(int tk) +{ + if (!ISTOK(tk)) return 0; + scan(); + return 1; +} + +//// code and "literal" emitters +static void gen2cd(int op, int n) +{ + zlist_append(&TT.zcode, &op); + TT.zcode_last = zlist_append(&TT.zcode, &n); +} + +static void gencd(int op) +{ + TT.zcode_last = zlist_append(&TT.zcode, &op); +} + +static int make_literal_str_val(char *s) +{ + // Only if no nul inside string! 
+ struct zvalue v = new_str_val(s); + return zlist_append(&TT.literals, &v); +} + +static int make_literal_regex_val(char *s) +{ + regex_t *rx; + rx = xmalloc(sizeof(*rx)); + if (rx_compile(rx, s)) XERR("regex seen as '%s'\n", s); + struct zvalue v = ZVINIT(ZF_RX, 0, 0); + v.rx = rx; + // Flag empty rx to make it easy to identify for split() special case + if (!*s) v.flags |= ZF_EMPTY_RX; + return zlist_append(&TT.literals, &v); +} + +static int make_literal_num_val(double num) +{ + struct zvalue v = ZVINIT(ZF_NUM, num, 0); + return zlist_append(&TT.literals, &v); +} + +static int make_uninit_val(void) +{ + struct zvalue v = uninit_zvalue; + return zlist_append(&TT.literals, &v); +} +//// END code and "literal" emitters + +//// Symbol tables functions +static int find_func_def_entry(char *s) +{ + for (int k = 1; k < zlist_len(&TT.func_def_table); k++) + if (!strcmp(s, FUNC_DEF[k].name)) return k; + return 0; +} + +static int add_func_def_entry(char *s) +{ + struct functab_slot ent = {0, 0, 0, {0, 0, 0, 0}, 0}; + ent.name = xstrdup(s); + int slotnum = zlist_append(&TT.func_def_table, &ent); + FUNC_DEF[slotnum].slotnum = slotnum; + return slotnum; +} + +static int find_global(char *s) +{ + for (int k = 1; k < zlist_len(&TT.globals_table); k++) + if (!strcmp(s, GLOBAL[k].name)) return k; + return 0; +} + +static int add_global(char *s) +{ + struct symtab_slot ent = {0, 0, 0}; + ent.name = xstrdup(s); + int slotnum = zlist_append(&TT.globals_table, &ent); + GLOBAL[slotnum].slotnum = slotnum; + return slotnum; +} + +static int find_local_entry(char *s) +{ + for (int k = 1; k < zlist_len(&TT.locals_table); k++) + if (!strcmp(s, LOCAL[k].name)) return k; + return 0; +} + +static int add_local_entry(char *s) +{ + struct symtab_slot ent = {0, 0, 0}; + ent.name = xstrdup(s); + int slotnum = zlist_append(&TT.locals_table, &ent); + LOCAL[slotnum].slotnum = slotnum; + return slotnum; +} + +static int find_or_add_var_name(void) +{ + int slotnum = 0; // + means global; - means 
  // local to function
  // (Remainder of find_or_add_var_name(): look TT.tokstr up in the locals
  // table first, then in the globals table, adding a global if needed.)
  int globals_ent = 0;
  int locals_ent = find_local_entry(TT.tokstr); // in local symbol table?
  if (locals_ent) {
    slotnum = -LOCAL[locals_ent].slotnum;
  } else {
    globals_ent = find_global(TT.tokstr);
    if (!globals_ent) globals_ent = add_global(TT.tokstr);
    slotnum = GLOBAL[globals_ent].slotnum;
    if (find_func_def_entry(TT.tokstr))
      // POSIX: The same name shall not be used both as a variable name
      // with global scope and as the name of a function.
      XERR("var '%s' used as function name\n", TT.tokstr);
  }
  return slotnum;
}

//// END Symbol tables functions

//// Initialization
// (Re)initialize TT.locals_table and append one zeroed dummy entry, so the
// find_local_entry() scan (which starts at slot 1) never sees slot 0.
static void init_locals_table(void)
{
  static struct symtab_slot locals_ent;
  zlist_init(&TT.locals_table, sizeof(struct symtab_slot));
  zlist_append(&TT.locals_table, &locals_ent);
}

// Initialize the compiler/runtime lists: globals, function definitions,
// locals, zcode, literals, stack and fields. Most get a leading dummy
// element; FIELD[0] is given an empty string.
static void init_tables(void)
{
  static struct symtab_slot global_ent;
  static struct functab_slot func_ent;

  // Append dummy elements in lists to force valid offsets nonzero.
  zlist_init(&TT.globals_table, sizeof(struct symtab_slot));
  zlist_append(&TT.globals_table, &global_ent);
  zlist_init(&TT.func_def_table, sizeof(struct functab_slot));
  zlist_append(&TT.func_def_table, &func_ent);
  init_locals_table();
  zlist_init(&TT.zcode, sizeof(int));
  gencd(tkeof); // to ensure zcode offsets are non-zero
  zlist_init(&TT.literals, sizeof(struct zvalue));
  zlist_init(&TT.stack, sizeof(struct zvalue));
  zlist_init(&TT.fields, sizeof(struct zvalue));
  zlist_append(&TT.literals, &uninit_zvalue);
  zlist_append(&TT.stack, &uninit_zvalue);
  zlist_append(&TT.fields, &uninit_zvalue);
  FIELD[0].vst = new_zstring("", 0);
}

static void init_compiler(void)
{
  // Special variables (POSIX).
  // Must align with enum spec_var_names
  // (Remainder of init_compiler(): set up the tables, then add every special
  // variable as a global and push an uninitialized value for each.)
  static char *spec_vars[] = { "ARGC", "ARGV", "CONVFMT", "ENVIRON", "FILENAME",
    "FNR", "FS", "NF", "NR", "OFMT", "OFS", "ORS", "RLENGTH", "RS", "RSTART",
    "SUBSEP", 0};

  init_tables();
  for (int k = 0; spec_vars[k]; k++) {
    TT.spec_var_limit = add_global(spec_vars[k]);
    // k == 1 is "ARGV" and k == 3 is "ENVIRON": those two are flagged as
    // maps, all the others as scalars. spec_var_limit ends up one past the
    // last special variable's slot.
    GLOBAL[TT.spec_var_limit++].flags |= (k == 1 || k == 3) ? ZF_MAP : ZF_SCALAR;
    push_val(&uninit_zvalue);
  }
}
//// END Initialization

//// Parsing and compiling to TT.zcode
// Left binding powers
static int lbp_table[] = { // Must align with enum Toks
  0, 0, 0, 0,     // tkunusedtoken, tkeof, tkerr, tknl,
  250, 250, 250,  // tkvar, tknumber, tkstring,
  250, 250, 250,  // tkregex, tkfunc, tkbuiltin,
  0, 0, 210, 0,   // tksemi, tkcomma, tklbracket, tkrbracket,
  200, 0, 0, 0,   // tklparen, tkrparen, tklbrace, tkrbrace,
  190, 180, 180, 170, 160, // tkfield, tkincr, tkdecr, tkpow, tknot,
  150, 150, 150, 140, 140, // tkmul, tkdiv, tkmod, tkplus, tkminus,
  130, // tkcat, // FAKE (?) optor for concatenation (adjacent string exprs)
  110, 110, 110, 110, 110, 110, // tklt, tkle, tkne, tkeq, tkgt, tkge,
  100, 100, // tkmatchop, tknotmatch,
  80, 70,   // tkand, tkor,
  60, 0,    // tkternif, tkternelse,
  50, 50, 50, 50, // tkpowasgn, tkmodasgn, tkmulasgn, tkdivasgn,
  50, 50, 50, // tkaddasgn, tksubasgn, tkasgn,
  0, 120,   // tkappend, tkpipe,
  90        // tkin
};

// Left binding power of tok: lbp_table[tok] for 0..tkin, 240 for tokens in
// tkgetline..tksubstr, 0 for anything else. Inside a print statement and
// outside parentheses, '>' and '|' get 0 so they end the expression.
static int getlbp(int tok)
{
  // FIXME: should tkappend be here too? is tkpipe needed?
  // In print statement outside parens: make '>' end an expression
  if (TT.cgl.in_print_stmt && ! TT.cgl.paren_level && (tok == tkgt || tok == tkpipe))
    return 0;
  return (0 <= tok && tok <= tkin) ? lbp_table[tok] :
    // getline is special, not a normal builtin.
    // close, index, match, split, sub, gsub, sprintf, substr
    // are really builtin functions though bwk treats them as keywords.
    (tkgetline <= tok && tok <= tksubstr) ? 240 : 0; // FIXME 240 is temp?
}

// Get right binding power.
Same as left except for right associative optors +static int getrbp(int tok) +{ + int lbp = getlbp(tok); + // ternary (?:), assignment, power ops are right associative + return (lbp <= 60 || lbp == 170) ? lbp - 1 : lbp; +} + +static void unexpected_eof(void) +{ + error_exit("terminated with error(s)"); +} + +//// syntax error diagnostic and recovery (Turner's method) +// D.A. Turner, Error diagnosis and recovery in one pass compilers, +// Information Processing Letters, Volume 6, Issue 4, 1977, Pages 113-115 +static int recovering = 0; + +static void complain(int tk) +{ + char op[3], tkstr[10]; + if (recovering) return; + recovering = 1; + if (!strcmp(TT.tokstr, "\n")) TT.tokstr = ""; + if (tksemi <= tk && tk <= tkpipe) { + get_token_text(op, tk); + XERR("syntax near '%s' -- '%s' expected\n", TT.tokstr, op); + } else if (tk >= tkin && tk <= tksubstr) { + if (tk < tkatan2) memmove(tkstr, keywords + 1 + 10 * (tk - tkin), 10); + else memmove(tkstr, builtins + 1 + 10 * (tk - tkatan2), 10); + *strchr(tkstr, ' ') = 0; + XERR("syntax near '%s' -- '%s' expected\n", TT.tokstr, tkstr); + } else XERR("syntax near '%s'\n", TT.tokstr); +} + +static void expect(int tk) +{ + if (recovering) { + while (!ISTOK(tkeof) && !ISTOK(tk)) + scan(); + if (ISTOK(tkeof)) unexpected_eof(); + scan(); // consume expected token + recovering = 0; + } else if (!havetok(tk)) complain(tk); +} + +static void skip_to(char *tklist) +{ + do scan(); while (!ISTOK(tkeof) && !strchr(tklist, CURTOK())); + if (ISTOK(tkeof)) unexpected_eof(); +} + +//// END syntax error diagnostic and recovery (Turner's method) + +static void optional_nl_or_semi(void) +{ + while (havetok(tknl) || havetok(tksemi)) + ; +} + +static void optional_nl(void) +{ + while (havetok(tknl)) + ; +} + +static void rparen(void) +{ + expect(tkrparen); + optional_nl(); +} + +static int have_comma(void) +{ + if (!havetok(tkcomma)) return 0; + optional_nl(); + return 1; +} + +static void check_set_map(int slotnum) +{ + // POSIX: The same name 
  // shall not be used within the same scope both as
  // a scalar variable and as an array.
  // (Remainder of check_set_map(). slotnum < 0 indexes LOCAL[-slotnum],
  // slotnum > 0 indexes GLOBAL[slotnum]; 0 is ignored.)
  if (slotnum < 0 && LOCAL[-slotnum].flags & ZF_SCALAR)
    XERR("scalar param '%s' used as array\n", LOCAL[-slotnum].name);
  if (slotnum > 0 && GLOBAL[slotnum].flags & ZF_SCALAR)
    XERR("scalar var '%s' used as array\n", GLOBAL[slotnum].name);
  if (slotnum < 0) LOCAL[-slotnum].flags |= ZF_MAP;
  if (slotnum > 0) GLOBAL[slotnum].flags |= ZF_MAP;
}

// Counterpart of check_set_map(): report a symbol already flagged as a map
// being used as a scalar, then flag it ZF_SCALAR.
static void check_set_scalar(int slotnum)
{
  if (slotnum < 0 && LOCAL[-slotnum].flags & ZF_MAP)
    XERR("array param '%s' used as scalar\n", LOCAL[-slotnum].name);
  if (slotnum > 0 && GLOBAL[slotnum].flags & ZF_MAP)
    XERR("array var '%s' used as scalar\n", GLOBAL[slotnum].name);
  if (slotnum < 0) LOCAL[-slotnum].flags |= ZF_SCALAR;
  if (slotnum > 0) GLOBAL[slotnum].flags |= ZF_SCALAR;
}

// Treat the current token (TT.tokstr) as an array name: find or add it, mark
// it as a map, and emit (tkvar, slotnum). Does not consume the token.
static void map_name(void)
{
  int slotnum;
  check_set_map(slotnum = find_or_add_var_name());
  gen2cd(tkvar, slotnum);
}

// Parse a full expression (right binding power 0).
static void expr(void)
{
  exprn(0);
}

// Report an error when builtin tk was called with a number of arguments it
// does not accept. fname is used only in the message. Builtins not listed
// in any of the tables below are not checked.
static void check_builtin_arg_counts(int tk, int num_args, char *fname)
{
  static char builtin_1_arg[] = { tkcos, tksin, tkexp, tklog, tksqrt, tkint,
                                  tktolower, tktoupper, tkclose, tksystem, 0};
  static char builtin_2_arg[] = { tkatan2, tkmatch, tkindex, 0};
  static char builtin_2_3_arg[] = { tksub, tkgsub, tksplit, tksubstr, 0};
  static char builtin_0_1_arg[] = { tksrand, tklength, tkfflush, 0};

  if (tk == tkrand && num_args)
    XERR("function '%s' expected no args, got %d\n", fname, num_args);
  else if (strchr(builtin_1_arg, tk) && num_args != 1)
    XERR("function '%s' expected 1 arg, got %d\n", fname, num_args);
  else if (strchr(builtin_2_arg, tk) && num_args != 2)
    XERR("function '%s' expected 2 args, got %d\n", fname, num_args);
  else if (strchr(builtin_2_3_arg, tk) && num_args != 2 && num_args != 3)
    XERR("function '%s' expected 2 or 3 args, got %d\n", fname, num_args);
  else if (strchr(builtin_0_1_arg, tk) && num_args != 0 && num_args != 1)
    XERR("function '%s' expected no arg or 1 arg, got %d\n", fname, num_args);
}

// Parse the parenthesized argument list of builtin tk (the builtin's own
// token has already been consumed), check the argument count, and emit
// (tk, num_args). sub/gsub, match, split and length have dedicated argument
// handling; every other builtin takes a plain comma-separated list.
static void builtin_call(int tk, char *builtin_name)
{
  int num_args = 0;
  expect(tklparen);
  TT.cgl.paren_level++;
  switch (tk) {
    case tksub:
    case tkgsub:
      if (ISTOK(tkregex)) {
        gen2cd(tkregex, make_literal_regex_val(TT.tokstr));
        scan();
      } else expr();
      expect(tkcomma);
      optional_nl();
      expr();
      if (have_comma()) {
        lvalue();
      } else {
        // No third argument given: emit a field reference to field number 0.
        gen2cd(tknumber, make_literal_num_val(0));
        gen2cd(opfldref, tkeof);
      }
      num_args = 3;
      break;

    case tkmatch:
      expr();
      expect(tkcomma);
      optional_nl();
      if (ISTOK(tkregex)) {
        gen2cd(tkregex, make_literal_regex_val(TT.tokstr));
        scan();
      } else expr();
      num_args = 2;
      break;

    case tksplit:
      expr();
      expect(tkcomma);
      optional_nl();
      // The 2nd argument must be a bare name: a tkvar whose next input
      // character (TT.scs->ch) is ',' or ')'.
      if (ISTOK(tkvar) && (TT.scs->ch == ',' || TT.scs->ch == ')')) {
        map_name();
        scan();
      } else {
        XERR("%s\n", "expected array name as split() 2nd arg");
        expr();
      }
      // FIXME some recovery needed here!?
      num_args = 2;
      if (have_comma()) {
        if (ISTOK(tkregex)) {
          gen2cd(tkregex, make_literal_regex_val(TT.tokstr));
          scan();
        } else expr();
        num_args++;
      }
      break;

    case tklength:
      // A lone variable argument is emitted as tkvar without being marked
      // scalar or map.
      if (ISTOK(tkvar) && (TT.scs->ch == ',' || TT.scs->ch == ')')) {
        gen2cd(tkvar, find_or_add_var_name());
        scan();
        num_args++;
      }
      ATTR_FALLTHROUGH_INTENDED;

    default:
      if (ISTOK(tkrparen)) break;
      do {
        expr();
        num_args++;
      } while (have_comma());
      break;
  }
  expect(tkrparen);
  TT.cgl.paren_level--;

  check_builtin_arg_counts(tk, num_args, builtin_name);

  gen2cd(tk, num_args);
}

static void function_call(void)
{
  // Function call: generate TT.zcode to:
  // push placeholder for return value, push placeholder for return addr,
  // push args, then push number of args, then:
  // for builtins: gen opcode (e.g.
  // tkgsub)
  // for user func: gen (tkfunc, function location)
  // if function not yet defined, location will be filled in when defined
  // the location slots will be chained from the symbol table
  int functk = 0, funcnum = 0;
  char builtin_name[16]; // be sure it's long enough for all builtins
  if (ISTOK(tkbuiltin)) {
    functk = TT.scs->tokbuiltin;
    strcpy(builtin_name, TT.tokstr);
  } else if (ISTOK(tkfunc)) { // user function
    funcnum = find_func_def_entry(TT.tokstr);
    if (!funcnum) funcnum = add_func_def_entry(TT.tokstr);
    FUNC_DEF[funcnum].flags |= FUNC_CALLED;
    gen2cd(opprepcall, funcnum);
  } else error_exit("bad function %s!", TT.tokstr);
  scan();
  // length() can appear without parens
  int num_args = 0;
  if (functk == tklength && !ISTOK(tklparen)) {
    gen2cd(functk, 0);
    return;
  }
  if (functk) { // builtin
    builtin_call(functk, builtin_name);
    return;
  }
  // User-defined function: parse the argument list here.
  expect(tklparen);
  TT.cgl.paren_level++;
  if (ISTOK(tkrparen)) {
    scan();
  } else {
    do {
      if (ISTOK(tkvar) && (TT.scs->ch == ',' || TT.scs->ch == ')')) {
        // Function call arg that is a lone variable. Cannot tell in this
        // context if it is a scalar or map. Just add it to symbol table.
        gen2cd(tkvar, find_or_add_var_name());
        scan();
      } else expr();
      num_args++;
    } while (have_comma());
    expect(tkrparen);
  }
  TT.cgl.paren_level--;
  gen2cd(tkfunc, num_args);
}

// Parse a variable reference, with optional [subscripts]. Emits
// (opmap, slotnum) for a subscripted reference -- preceded by
// (tkrbracket, count) when there is more than one subscript -- or
// (tkvar, slotnum) for a plain one.
static void var(void)
{
  // var name is in TT.tokstr
  // slotnum: + means global; - means local to function
  int slotnum = find_or_add_var_name();
  scan();
  if (havetok(tklbracket)) {
    check_set_map(slotnum);
    int num_subscripts = 0;
    do {
      expr();
      num_subscripts++;
    } while (have_comma());
    expect(tkrbracket);
    if (num_subscripts > 1) gen2cd(tkrbracket, num_subscripts);
    gen2cd(opmap, slotnum);
  } else {
    check_set_scalar(slotnum);
    gen2cd(tkvar, slotnum);
  }
}

// Dollar $ tkfield can be followed by "any" expression, but
// the way it binds varies.
// The following are valid lvalues:
// $ ( expr )
// $ tkvar $ tknumber $ tkstring $ tkregex
// $ tkfunc(...)
// $ tkbuiltin(...)
// $ length # with no parens after
// $ tkclose(), ... $ tksubstr
// $ tkgetline FIXME TODO TEST THIS
// $ ++ lvalue
// $ -- lvalue
// $ + expression_up_to_exponentiation (also -, ! prefix ops)
// $ $ whatever_can_follow_and_bind_to_dollar
//
// tkvar, tknumber, tkstring, tkregex, tkfunc, tkbuiltin, tkfield, tkminus,
// tkplus, tknot, tkincr, tkdecr, tklparen, tkgetline,
// tkclose, tkindex, tkmatch, tksplit, tksub, tkgsub, tksprintf, tksubstr
//
// ray@radon:~$ awk 'BEGIN { $0 = "7 9 5 8"; k=2; print $k*k }'
// 18
// ray@radon:~$ awk 'BEGIN { $0 = "7 9 5 8"; k=2; print $+k*k }'
// 18
// ray@radon:~$ awk 'BEGIN { $0 = "7 9 5 8"; k=2; print $k^k }'
// 81
// ray@radon:~$ awk 'BEGIN { $0 = "7 9 5 8"; k=2; print $+k^k }'
// 8

// Parse '$' followed by its operand and emit (tkfield, tkeof). The operand
// is another '$' expression (recursively), a variable, or whatever
// primary() accepts.
static void field_op(void)
{
  // CURTOK() must be $ here.
  expect(tkfield);
  // tkvar, tknumber, tkstring, tkregex, tkfunc, tkbuiltin, tkfield, tkminus,
  // tkplus, tknot, tkincr, tkdecr, tklparen, tkgetline, tkclose, tkindex,
  // tkmatch, tksplit, tksub, tkgsub, tksprintf, tksubstr
  if (ISTOK(tkfield)) field_op();
  else if (ISTOK(tkvar)) var();
  else primary();
  // tkfield op has "dummy" 2nd word so that convert_push_to_reference(void)
  // can find either tkfield or tkvar at same place (ZCODE[TT.zcode_last-1]).
  gen2cd(tkfield, tkeof);
}

// The three tables below are 0-terminated token lists searched with strchr().

// Tokens that can start expression
static char exprstartsy[] = {tkvar, tknumber, tkstring, tkregex, tkfunc,
  tkbuiltin, tkfield, tkminus, tkplus, tknot, tkincr, tkdecr, tklparen,
  tkgetline, tkclose, tkindex, tkmatch, tksplit, tksub, tkgsub, tksprintf,
  tksubstr, 0};

// Tokens that can end statement
static char stmtendsy[] = {tknl, tksemi, tkrbrace, 0};

// Tokens that can follow expressions of a print statement
static char printexprendsy[] = {tkgt, tkappend, tkpipe, tknl, tksemi, tkrbrace, 0};

// !!
// Ensure this:
// ternary op is right associative, so
// a ? b : c ? d : e evaluates as
// a ? b : (c ? d : e) not as
// (a ? b : c) ? d : e

// Rewrite the opcode of the most recently emitted two-word instruction from
// a "push value" form into its reference form: tkvar -> opvarref,
// opmap -> opmapref, tkfield -> opfldref. Any other opcode is fatal.
static void convert_push_to_reference(void)
{
  if (ZCODE[TT.zcode_last - 1] == tkvar) ZCODE[TT.zcode_last-1] = opvarref;
  else if (ZCODE[TT.zcode_last - 1] == opmap) ZCODE[TT.zcode_last - 1] = opmapref;
  else if (ZCODE[TT.zcode_last - 1] == tkfield) ZCODE[TT.zcode_last - 1] = opfldref;
  else error_exit("bad lvalue?");
}

// Parse an lvalue: a '$' field expression or a variable (with optional
// subscripts), emitted in reference form. Anything else is reported as a
// syntax error and nothing is consumed.
static void lvalue(void)
{
  if (ISTOK(tkfield)) {
    field_op();
    convert_push_to_reference();
  } else if (ISTOK(tkvar)) {
    var();
    convert_push_to_reference();
  } else {
    XERR("syntax near '%s' (bad lvalue)\n", TT.tokstr);
  }
}

static int primary(void)
{
  // On entry: CURTOK() is first token of expression
  // On exit: CURTOK() is infix operator (for binary_op() to handle) or next
  // token after end of expression.
  // return -1 for field or var (potential lvalue);
  // 2 or more for comma-separated expr list
  // as in "multiple subscript expression in array"
  // e.g.
  // (1, 2) in array_name, or a print/printf list;
  // otherwise return 0
  //
  // expr can start with:
  // tkvar, tknumber, tkstring, tkregex, tkfunc, tkbuiltin, tkfield, tkminus,
  // tkplus, tknot, tkincr, tkdecr, tklparen, tkgetline, tkclose, tkindex,
  // tkmatch, tksplit, tksub, tkgsub, tksprintf, tksubstr
  //
  // bwk treats these as keywords, not builtins: close index match split sub gsub
  // sprintf substr
  //
  // bwk builtins are: atan2 cos sin exp log sqrt int rand srand length tolower
  // toupper system fflush
  // NOTE: fflush() is NOT in POSIX awk
  //
  // primary() must consume prefix and postfix operators as well as
  // num, string, regex, var, var with subscripts, and function calls

  int num_exprs = 0;
  int nargs, modifier;
  int tok = CURTOK();
  switch (tok) {
    case tkvar:
    case tkfield:
      if (ISTOK(tkvar)) var();
      else field_op();
      // A trailing ++ or -- turns the push into a reference and emits the
      // operator token itself; otherwise report a potential lvalue (-1).
      if (ISTOK(tkincr) || ISTOK(tkdecr)) {
        convert_push_to_reference();
        gencd(CURTOK());
        scan();
      } else return -1;
      break;

    case tknumber:
      gen2cd(tknumber, make_literal_num_val(TT.scs->numval));
      scan();
      break;

    case tkstring:
      gen2cd(tkstring, make_literal_str_val(TT.tokstr));
      scan();
      break;

    case tkregex:
      // When an ERE token appears as an expression in any context other
      // than as the right-hand of the '~' or "!~" operator or as one of
      // the built-in function arguments described below, the value of
      // the resulting expression shall be the equivalent of: $0 ~ /ere/
      // FIXME TODO
      gen2cd(opmatchrec, make_literal_regex_val(TT.tokstr));
      scan();
      break;

    case tkbuiltin: // various builtins
    case tkfunc:    // user-defined function
      function_call();
      break;

    // Unary prefix ! + -
    case tknot:
    case tkminus:
    case tkplus:
      scan();
      exprn(getlbp(tknot)); // unary +/- same precedence as !
      // (Continuation of primary()'s unary-prefix case.) Unary minus emits
      // one opnegate; unary plus emits two.
      if (tok == tknot) gencd(tknot);
      else gencd(opnegate); // forces to number
      if (tok == tkplus) gencd(opnegate); // forces to number
      break;

    // Unary prefix ++ -- MUST take lvalue
    case tkincr:
    case tkdecr:
      scan();
      lvalue();
      if (tok == tkincr) gencd(oppreincr);
      else gencd(oppredecr);
      break;

    case tklparen:
      scan();
      TT.cgl.paren_level++;
      num_exprs = 0;
      do {
        expr();
        num_exprs++;
      } while (have_comma());
      expect(tkrparen);
      TT.cgl.paren_level--;
      // A parenthesized list of 2+ expressions is reported to the caller.
      if (num_exprs > 1) return num_exprs;
      break;

    case tkgetline:
      // getline may be (according to awk book):
      // NOTE(review): source text is missing from this point. The rest of
      // primary(), all of binary_op(), and the start of
      // cat_start_concated_expr() appear to have been lost; the surviving
      // fragment is kept exactly as found and must be restored from the
      // original file.
      // getline [var [= tkgetline
  return !! strchr(exprstarttermsy, tok) || tok >= tkgetline;
}

#define CALLED_BY_PRINT 99987 // Arbitrary, different from any real rbp value

// Parse an expression whose operators bind tighter than rbp. Returns the
// number of expressions in a parenthesized list only when called with
// CALLED_BY_PRINT and that list is followed by a print-statement terminator;
// returns 0 otherwise.
static int exprn(int rbp)
{
  // On entry: TT.scs has first symbol of expression, e.g. var, number, string,
  // regex, func, getline, left paren, prefix op ($ ++ -- ! unary + or -) etc.
  static char asgnops[] = {tkpowasgn, tkmodasgn, tkmulasgn, tkdivasgn,
    tkaddasgn, tksubasgn, tkasgn, 0};
  int prim_st = primary();
  // If called directly by print_stmt(), and found a parenthesized expression list
  // followed by an end of print statement: any of > >> | ; }
  // Then: return the count of expressions in list
  // Else: continue parsing an expression
  if (rbp == CALLED_BY_PRINT) {
    if (prim_st > 0 && strchr(printexprendsy, CURTOK())) return prim_st;
    else rbp = 0;
  }

  // mult_expr_list in parens must be followed by 'in' unless it
  // immediately follows print or printf, where it may still be followed
  // by 'in' ... unless at end of statement
  if (prim_st > 0 && ! ISTOK(tkin))
    XERR("syntax near '%s'; expected 'in'\n", TT.tokstr);
  if (prim_st > 0) gen2cd(tkrbracket, prim_st);
  // primary() has eaten subscripts, function args, postfix ops.
  // CURTOK() should be a binary op.
  int optor = CURTOK();
  if (strchr(asgnops, optor)) {

    // TODO FIXME ? NOT SURE IF THIS WORKS RIGHT!
    // awk does not parse according to POSIX spec in some odd cases.
    // When an assignment (lvalue =) is on the right of certain operators,
    // it is not treated as a bad lvalue (as it is in C).
    // Example: (1 && a=2) # no error; the assignment is performed.
    // This happens for ?: || && ~ !~ < <= ~= == > >=
    //
    static char odd_assignment_rbp[] = {59, 60, 70, 80, 100, 110, 0};
    // Accept the assignment only when the left side was a potential lvalue
    // (primary() returned -1) and the caller's rbp permits it, either
    // normally or via the "odd" list above.
    if (prim_st < 0 && (rbp <= getrbp(optor) || strchr(odd_assignment_rbp, rbp))) {
      convert_push_to_reference();
      scan();
      exprn(getrbp(optor));
      gencd(optor);
      return 0;
    }
    XERR("syntax near '%s'\n", TT.tokstr[0] == '\n' ? "\\n" : TT.tokstr);
    skip_to(stmtendsy);
  }
  // A token for which cat_start_concated_expr() is true is treated as the
  // tkcat operator.
  if (cat_start_concated_expr(optor)) optor = tkcat;
  while (rbp < getlbp(optor)) {
    binary_op(optor);
    // HERE tok s/b an operator or expression terminator ( ; etc.).
    optor = CURTOK();
    if (cat_start_concated_expr(optor)) optor = tkcat;
  }
  return 0;
}

// Parse a print or printf statement (tk says which). Emits (tk, num_exprs)
// followed by one word holding the output mode: tkgt, tkappend, tkpipe, or 0
// for none. The redirection target expression is counted in num_exprs.
static void print_stmt(int tk)
{
  static char outmodes[] = {tkgt, tkappend, tkpipe, 0};
  int num_exprs = 0, outmode;
  TT.cgl.in_print_stmt = 1;
  expect(tk); // tkprint or tkprintf
  if ((tk == tkprintf) || !strchr(printexprendsy, CURTOK())) {
    // printf always needs expression
    // print non-empty statement needs expression
    num_exprs = exprn(CALLED_BY_PRINT);
    if (num_exprs > 0 && !strchr(printexprendsy, CURTOK())) FATAL("print stmt bug");
    if (!num_exprs) {
      // exprn() parsed one expression; count it plus any that follow.
      for (num_exprs++; have_comma(); num_exprs++)
        expr();
    }
  }
  outmode = CURTOK();
  if (strchr(outmodes, outmode)) {
    scan();
    expr(); // FIXME s/b only bwk term?
    // check POSIX
    num_exprs++;
  } else outmode = 0;
  gen2cd(tk, num_exprs);
  gencd(outmode);
  TT.cgl.in_print_stmt = 0;
}

// Parse "delete name" or "delete name[subscripts]". Emits (opmapref, slot)
// followed by tkdelete for an element or opmapdelete for the whole array.
static void delete_stmt(void)
{
  expect(tkdelete);
  if (ISTOK(tkvar)) {
    int slotnum = find_or_add_var_name();
    check_set_map(slotnum);
    scan();
    if (havetok(tklbracket)) {
      int num_subscripts = 0;
      do {
        expr();
        num_subscripts++;
      } while (have_comma());
      expect(tkrbracket);
      if (num_subscripts > 1) gen2cd(tkrbracket, num_subscripts);
      gen2cd(opmapref, slotnum);
      gencd(tkdelete);
    } else {
      // delete entire map (elements only; var is still a map)
      gen2cd(opmapref, slotnum);
      gencd(opmapdelete);
    }
  } else expect(tkvar);
}

// Parse an expression statement (its value is dropped), print, printf, or
// delete. Anything else is a syntax error; skip to a statement end.
static void simple_stmt(void)
{
  if (strchr(exprstartsy, CURTOK())) {
    expr();
    gencd(opdrop);
    return;
  }
  switch (CURTOK()) {
    case tkprint:
    case tkprintf:
      print_stmt(CURTOK());
      break;

    case tkdelete:
      delete_stmt();
      break;

    default:
      XERR("syntax near '%s'\n", TT.tokstr[0] == '\n' ? "\\n" : TT.tokstr);
      skip_to(stmtendsy);
  }
}

// True when the previously scanned token is one of stmtendsy.
static int prev_was_terminated(void)
{
  return !!strchr(stmtendsy, TT.prevtok);
}

// True when the current token is a newline or a semicolon.
static int is_nl_semi(void)
{
  return ISTOK(tknl) || ISTOK(tksemi);
}

// Parse if (expr) stmt [else stmt]. Emits (tkif, offset); the offset word is
// patched once the end of the "then" part is known. With an else, a
// (tkelse, offset) pair is emitted and patched after the else statement.
static void if_stmt(void)
{
  expect(tkif);
  expect(tklparen);
  expr();
  rparen();
  gen2cd(tkif, -1);
  int cdx = TT.zcode_last;
  stmt();
  // The statement's terminator has to be consumed to look for 'else'.
  if (!prev_was_terminated() && is_nl_semi()) {
    scan();
    optional_nl();
  }
  if (prev_was_terminated()) {
    optional_nl();
    if (havetok(tkelse)) {
      gen2cd(tkelse, -1);
      ZCODE[cdx] = TT.zcode_last - cdx;
      cdx = TT.zcode_last;
      optional_nl();
      stmt();
    }
  }
  ZCODE[cdx] = TT.zcode_last - cdx;
}

// Save / restore the enclosing loop's break and continue destinations
// around a nested loop.
static void save_break_continue(int *brk, int *cont)
{
  *brk = TT.cgl.break_dest;
  *cont = TT.cgl.continue_dest;
}

static void restore_break_continue(int *brk, int *cont)
{
  TT.cgl.break_dest = *brk;
  TT.cgl.continue_dest = *cont;
}

static void while_stmt(void)
{
  int brk, cont;
  save_break_continue(&brk, &cont);
  // (Remainder of while_stmt().) continue_dest is the start of the
  // condition; break_dest is an opjump whose offset is patched after the
  // body to point past the loop.
  expect(tkwhile);
  expect(tklparen);
  TT.cgl.continue_dest = TT.zcode_last + 1;
  expr();
  rparen();
  gen2cd(tkwhile, 2); // drop, jump if true
  TT.cgl.break_dest = TT.zcode_last + 1;
  gen2cd(opjump, -1); // jump here to break
  stmt();
  gen2cd(opjump, -1); // jump to continue
  ZCODE[TT.zcode_last] = TT.cgl.continue_dest - TT.zcode_last - 1;
  ZCODE[TT.cgl.break_dest + 1] = TT.zcode_last - TT.cgl.break_dest - 1;
  restore_break_continue(&brk, &cont);
}

// Parse do stmt while (expr). Three jumps are emitted ahead of the body:
// one over the next two, the continue target (patched to the condition),
// and the break target (patched to the end of the loop).
static void do_stmt(void)
{
  int brk, cont;
  save_break_continue(&brk, &cont);
  expect(tkdo);
  optional_nl();
  gen2cd(opjump, 4); // jump over jumps, to statement
  TT.cgl.continue_dest = TT.zcode_last + 1;
  gen2cd(opjump, -1); // here on continue
  TT.cgl.break_dest = TT.zcode_last + 1;
  gen2cd(opjump, -1); // here on break
  stmt();
  if (!prev_was_terminated()) {
    if (is_nl_semi()) {
      scan();
      optional_nl();
    } else {
      XERR("syntax near '%s' -- ';' or newline expected\n", TT.tokstr);
      // FIXME
    }
  }
  ZCODE[TT.cgl.continue_dest + 1] = TT.zcode_last - TT.cgl.continue_dest - 1;
  optional_nl();
  expect(tkwhile);
  expect(tklparen);
  expr();
  rparen();
  gen2cd(tkwhile, TT.cgl.break_dest - TT.zcode_last - 1);
  ZCODE[TT.cgl.break_dest + 1] = TT.zcode_last - TT.cgl.break_dest - 1;
  restore_break_continue(&brk, &cont);
}

// Parse the "condition; increment) statement" part of a C-style for loop;
// the caller has already handled the initialization part and its ';'.
static void for_not_map_iter(void)
{
  // Here after loop initialization, if any; loop condition
  int condition_loc = TT.zcode_last + 1;
  if (havetok(tksemi)) {
    // "endless" loop variant; no condition
    // no NL allowed here in OTA
    gen2cd(opjump, -1); // jump to statement
  } else {
    optional_nl(); // NOT posix or awk book; in OTA
    expr(); // loop while true
    expect(tksemi);
    gen2cd(tkwhile, -1); // drop, jump to statement if true
  }
  optional_nl(); // NOT posix or awk book; in OTA
  TT.cgl.break_dest = TT.zcode_last + 1;
  gen2cd(opjump, -1);
  TT.cgl.continue_dest = TT.zcode_last + 1;
  if (!ISTOK(tkrparen)) simple_stmt(); // "increment"
  // (Remainder of for_not_map_iter().) After the increment jump back to the
  // condition; after the body jump to the increment. The two placeholder
  // offsets emitted earlier are patched here.
  gen2cd(opjump, condition_loc - TT.zcode_last - 3);
  rparen();
  ZCODE[TT.cgl.break_dest - 1] = TT.zcode_last - TT.cgl.break_dest + 1;
  stmt();
  gen2cd(opjump, TT.cgl.continue_dest - TT.zcode_last - 3);
  ZCODE[TT.cgl.break_dest + 1] = TT.zcode_last - TT.cgl.break_dest - 1;
}

// True when the code in ZCODE[first..last] is exactly
// (tkvar, x) (tkvar, y) tkin opdrop, i.e. what simple_stmt() emits for the
// expression statement "var in array".
static int valid_for_array_iteration(int first, int last)
{
  return ZCODE[first] == tkvar && ZCODE[first + 2] == tkvar
      && ZCODE[first + 4] == tkin && ZCODE[first + 5] == opdrop
      && first + 5 == last;
}

// Parse a for statement: either the C-style three-part form or
// for (var in array).
static void for_stmt(void)
{
  int brk, cont;
  save_break_continue(&brk, &cont);
  expect(tkfor);
  expect(tklparen);
  if (havetok(tksemi)) {
    // No "initialization" part
    for_not_map_iter();
  } else {
    int loop_start_loc = TT.zcode_last + 1;
    simple_stmt(); // initialization part, OR varname in arrayname form
    if (!havetok(tkrparen)) {
      expect(tksemi);
      for_not_map_iter();
    } else {
      // Must be map iteration
      // Check here for varname in varname!
      // FIXME TODO must examine generated TT.zcode for var in array?
      if (!valid_for_array_iteration(loop_start_loc, TT.zcode_last))
        XERR("%s", "bad 'for (var in array)' loop\n");
      else {
        // Rewrite the already-emitted "var in array" code in place: the
        // loop variable becomes a reference, and the trailing
        // (tkin, opdrop) pair becomes (tknumber, literal -1).
        ZCODE[TT.zcode_last-5] = opvarref;
        ZCODE[TT.zcode_last-1] = tknumber;
        ZCODE[TT.zcode_last] = make_literal_num_val(-1);
        TT.cgl.continue_dest = TT.zcode_last + 1;
        gen2cd(opmapiternext, 2);
        TT.cgl.break_dest = TT.zcode_last + 1;
        gen2cd(opjump, -1); // fill in with loc after stmt
      }
      optional_nl();
      // fixup TT.stack if return or exit inside for (var in array)
      TT.cgl.stack_offset_to_fix += 3;
      stmt();
      TT.cgl.stack_offset_to_fix -= 3;
      gen2cd(opjump, TT.cgl.continue_dest - TT.zcode_last - 3);
      ZCODE[TT.cgl.break_dest + 1] = TT.zcode_last - TT.cgl.break_dest - 1;
      gencd(opdrop);
      gencd(opdrop);
      gencd(opdrop);
    }
  }
  restore_break_continue(&brk, &cont);
}

static void stmt(void)
{
  switch (CURTOK()) {
    case tkeof:
      break; // FIXME ERROR?

    // (Continuation of stmt(): dispatch on the statement's first token.)
    case tkbreak:
      scan();
      if (TT.cgl.break_dest) gen2cd(tkbreak, TT.cgl.break_dest - TT.zcode_last - 3);
      else XERR("%s", "break not in a loop\n");
      break;

    case tkcontinue:
      scan();
      if (TT.cgl.continue_dest)
        gen2cd(tkcontinue, TT.cgl.continue_dest - TT.zcode_last - 3);
      else XERR("%s", "continue not in a loop\n");
      break;

    case tknext:
      scan();
      gencd(tknext);
      if (TT.cgl.rule_type) XERR("%s", "next inside BEGIN or END\n");
      if (TT.cgl.in_function_body) XERR("%s", "next inside function def\n");
      break;

    case tknextfile:
      scan();
      gencd(tknextfile);
      if (TT.cgl.rule_type) XERR("%s", "nextfile inside BEGIN or END\n");
      if (TT.cgl.in_function_body) XERR("%s", "nextfile inside function def\n");
      break;

    case tkexit:
      scan();
      // Without an expression, the literal NO_EXIT_STATUS is pushed instead.
      if (strchr(exprstartsy, CURTOK())) {
        expr();
      } else gen2cd(tknumber, make_literal_num_val(NO_EXIT_STATUS));
      gencd(tkexit);
      break;

    case tkreturn:
      scan();
      // Inside for (var in array) loops, first drop the loop's stack slots.
      if (TT.cgl.stack_offset_to_fix) gen2cd(opdrop_n, TT.cgl.stack_offset_to_fix);
      if (strchr(exprstartsy, CURTOK())) {
        expr();
      } else gen2cd(tknumber, make_literal_num_val(0.0));
      gen2cd(tkreturn, TT.cgl.nparms);
      if (!TT.cgl.in_function_body) XERR("%s", "return outside function def\n");
      break;

    case tklbrace:
      action(tklbrace);
      break;

    case tkif:
      if_stmt();
      break;

    case tkwhile:
      while_stmt();
      break;

    case tkdo:
      do_stmt();
      break;

    case tkfor:
      for_stmt();
      break;

    case tksemi:
      scan();
      break;
    default:
      simple_stmt(); // expression print printf delete
  }
}

// Add s as the next parameter of function funcnum in the locals table and
// count it in TT.cgl.nparms. Reports a duplicate parameter, a parameter
// named like the function, or one named like a special variable.
static void add_param(int funcnum, char *s)
{
  if (!find_local_entry(s)) add_local_entry(s);
  else XERR("function '%s' dup param '%s'\n", FUNC_DEF[funcnum].name, s);
  TT.cgl.nparms++;

  // POSIX: The same name shall not be used as both a function parameter name
  // and as the name of a function or a special awk variable.
  // !!! NOTE seems implementations exc.
mawk only compare param names with + // builtin funcs; use same name as userfunc is OK! + if (!strcmp(s, FUNC_DEF[funcnum].name)) + XERR("function '%s' param '%s' matches func name\n", + FUNC_DEF[funcnum].name, s); + if (find_global(s) && find_global(s) < TT.spec_var_limit) + XERR("function '%s' param '%s' matches special var\n", + FUNC_DEF[funcnum].name, s); +} + +static void function_def(void) +{ + expect(tkfunction); + int funcnum = find_func_def_entry(TT.tokstr); + if (!funcnum) { + funcnum = add_func_def_entry(TT.tokstr); + } else if (FUNC_DEF[funcnum].flags & FUNC_DEFINED) { + XERR("dup defined function '%s'\n", TT.tokstr); + } + FUNC_DEF[funcnum].flags |= FUNC_DEFINED; + if (find_global(TT.tokstr)) { + // POSIX: The same name shall not be used both as a variable name with + // global scope and as the name of a function. + XERR("function name '%s' previously defined\n", TT.tokstr); + } + + gen2cd(tkfunction, funcnum); + FUNC_DEF[funcnum].zcode_addr = TT.zcode_last - 1; + TT.cgl.funcnum = funcnum; + TT.cgl.nparms = 0; + if (ISTOK(tkfunc)) expect(tkfunc); // func name with no space before ( + else expect(tkvar); // func name with space before ( + expect(tklparen); + if (ISTOK(tkvar)) { + add_param(funcnum, TT.tokstr); + scan(); + // FIXME is the the best way? what if TT.tokstr not a tkvar? + while (have_comma()) { + add_param(funcnum, TT.tokstr); + expect(tkvar); + } + } + rparen(); + if (ISTOK(tklbrace)) { + TT.cgl.in_function_body = 1; + action(tkfunc); + TT.cgl.in_function_body = 0; + // Need to return uninit value if falling off end of function. + gen2cd(tknumber, make_uninit_val()); + gen2cd(tkreturn, TT.cgl.nparms); + } else { + XERR("syntax near '%s'\n", TT.tokstr); + // FIXME some recovery needed here!? + } + // Do not re-init locals table for dup function. + // Avoids memory leak detected by LeakSanitizer. 
+  if (!FUNC_DEF[funcnum].function_locals.base) {
+    FUNC_DEF[funcnum].function_locals = TT.locals_table;
+    init_locals_table();
+  }
+}
+
+// Compile a brace-delimited statement list: '{' stmt ... '}'.
+// action_type is currently unused (kept for documentation of the call sites).
+static void action(int action_type)
+{
+(void)action_type;
+  // action_type is tkbegin, tkend, tkdo (every line), tkif (if pattern),
+  // tkfunc (function body), tklbrace (compound statement)
+  // Should have lbrace on entry.
+  expect(tklbrace);
+  for (;;) {
+    if (ISTOK(tkeof)) unexpected_eof();
+    optional_nl_or_semi();
+    if (havetok(tkrbrace)) {
+      break;
+    }
+    stmt();
+    // stmt() is normally unterminated here, but may be terminated if we
+    // have if with no else (had to consume terminator looking for else)
+    //   !!!   if (ISTOK(tkrbrace) || prev_was_terminated())
+    if (prev_was_terminated()) continue;
+    if (!is_nl_semi() && !ISTOK(tkrbrace)) {
+      XERR("syntax near '%s' -- newline, ';', or '}' expected\n", TT.tokstr);
+      // Error recovery: skip ahead to the next terminator or closing brace.
+      while (!is_nl_semi() && !ISTOK(tkrbrace) && !ISTOK(tkeof)) scan();
+      if (ISTOK(tkeof)) unexpected_eof();
+    }
+    if (havetok(tkrbrace)) break;
+    // Must be semicolon or newline
+    scan();
+  }
+}
+
+// Compile one top-level item: BEGIN rule, END rule, record rule (with or
+// without pattern / range pattern), or function definition.
+// Rules of the same kind are chained: each rule ends with an opjump whose
+// operand is a -1 placeholder, patched to a relative offset when the next
+// rule of that kind is compiled (compile() turns the last one into opquit).
+static void rule(void)
+{
+  //       pa_pat
+  //     | pa_pat lbrace stmtlist '}'
+  //     | pa_pat ',' opt_nl pa_pat
+  //     | pa_pat ',' opt_nl pa_pat lbrace stmtlist '}'
+  //     | lbrace stmtlist '}'
+  //     | XBEGIN lbrace stmtlist '}'
+  //     | XEND lbrace stmtlist '}'
+  //     | FUNC funcname '(' varlist rparen lbrace stmtlist '}'
+
+  switch (CURTOK()) {
+    case tkbegin:
+      scan();
+      if (TT.cgl.last_begin) ZCODE[TT.cgl.last_begin] = TT.zcode_last - TT.cgl.last_begin;
+      else TT.cgl.first_begin = TT.zcode_last + 1;
+
+      TT.cgl.rule_type = tkbegin;
+      action(tkbegin);
+      TT.cgl.rule_type = 0;
+      gen2cd(opjump, -1);
+      TT.cgl.last_begin = TT.zcode_last;
+      break;
+
+    case tkend:
+      scan();
+      if (TT.cgl.last_end) ZCODE[TT.cgl.last_end] = TT.zcode_last - TT.cgl.last_end;
+      else TT.cgl.first_end = TT.zcode_last + 1;
+
+      // rule_type is documented as tkbegin, tkend, or 0; record tkend here
+      // (was tkbegin, apparently copied from the BEGIN case).
+      TT.cgl.rule_type = tkend;
+      action(tkend);
+      TT.cgl.rule_type = 0;
+      gen2cd(opjump, -1);
+      TT.cgl.last_end = TT.zcode_last;
+      break;
+
+    case tklbrace:
+      if (TT.cgl.last_recrule)
+        ZCODE[TT.cgl.last_recrule] = TT.zcode_last - TT.cgl.last_recrule;
+      else TT.cgl.first_recrule = TT.zcode_last + 1;
+      action(tkdo);
+      gen2cd(opjump, -1);
+      TT.cgl.last_recrule = TT.zcode_last;
+      break;
+
+    case tkfunction:
+      function_def();
+      break;
+    default:
+      if (TT.cgl.last_recrule)
+        ZCODE[TT.cgl.last_recrule] = TT.zcode_last - TT.cgl.last_recrule;
+      else TT.cgl.first_recrule = TT.zcode_last + 1;
+      // Emit three code words that are overwritten with the oprange1
+      // sequence below if this turns out to be a range pattern.
+      gen2cd(opjump, 1);
+      gencd(tkeof);
+      int cdx = 0, saveloc = TT.zcode_last;
+      expr();
+      if (!have_comma()) {
+        gen2cd(tkif, -1);
+        cdx = TT.zcode_last;
+      } else {
+        gen2cd(oprange2, ++TT.cgl.range_pattern_num);
+        gencd(-1);
+        cdx = TT.zcode_last;
+        ZCODE[saveloc-2] = oprange1;
+        ZCODE[saveloc-1] = TT.cgl.range_pattern_num;
+        ZCODE[saveloc] = TT.zcode_last - saveloc;
+        expr();
+        gen2cd(oprange3, TT.cgl.range_pattern_num);
+      }
+      if (ISTOK(tklbrace)) {
+        action(tkif);
+        ZCODE[cdx] = TT.zcode_last - cdx;
+      } else {
+        gencd(opprintrec);   // print $0 ?
+        ZCODE[cdx] = TT.zcode_last - cdx;
+      }
+      gen2cd(opjump, -1);
+      TT.cgl.last_recrule = TT.zcode_last;
+  }
+}
+
+// Report every function that was called somewhere but never defined.
+static void diag_func_def_ref(void)
+{
+  int n = zlist_len(&TT.func_def_table);
+  for (int k = 1; k < n; k++) {
+    if ((FUNC_DEF[k].flags & FUNC_CALLED) &&
+            !(FUNC_DEF[k].flags & FUNC_DEFINED)) {
+      // Sorry, we can't tell where this was called from, for now at least.
+      XERR("Undefined function '%s'", FUNC_DEF[k].name);
+    }
+  }
+}
+
+// Compile the whole program: scan and compile rules until EOF, terminate
+// each rule chain with opquit, then append the trailing exit/quit code.
+static void compile(void)
+{
+  init_compiler();
+  init_scanner();
+  scan();
+  optional_nl_or_semi();  // Does posix allow NL or ; before first rule?
+  while (! ISTOK(tkeof)) {
+    rule();
+    optional_nl_or_semi();  // NOT POSIX
+  }
+
+
+  if (TT.cgl.last_begin) ZCODE[TT.cgl.last_begin-1] = opquit;
+  if (TT.cgl.last_end) ZCODE[TT.cgl.last_end-1] = opquit;
+  if (TT.cgl.last_recrule) ZCODE[TT.cgl.last_recrule-1] = opquit;
+
+  gen2cd(tknumber, make_literal_num_val(0.0));
+  gencd(tkexit);
+  gencd(opquit);
+  // If there are only BEGIN and END or only END actions, generate actions to
+  // read all input before END.
+  if (TT.cgl.first_end && !TT.cgl.first_recrule) {
+    gencd(opquit);
+    TT.cgl.first_recrule = TT.zcode_last;
+  }
+  gencd(opquit);  // One more opcode to keep ip in bounds in run code.
+  diag_func_def_ref();
+}
+
+////////////////////
+//// runtime
+////////////////////
+
+// If v's string looks like a number (optionally followed by spaces only),
+// store the numeric value and flag v as a numeric string.
+static void check_numeric_string(struct zvalue *v)
+{
+  if (v->vst) {
+    char *end, *s = v->vst->str;
+    // Significant speed gain with this test:
+    // num string must begin space, +, -, ., or digit.
+    if (strchr("+-.1234567890 ", *s)) {
+      setlocale(LC_NUMERIC, "");
+      double num = strtod(s, &end);
+      if (s == end || end[strspn(end, " ")]) return;
+      v->num = num;
+      v->flags |= ZF_NUM | ZF_STR | ZF_NUMSTR;
+    }
+  }
+}
+
+// Convert n to a new zstring: integral values print as "%lld", anything
+// else via fmt.
+static struct zstring *num_to_zstring(double n, char *fmt)
+{
+  int k;
+  // Range test first: converting an out-of-range (or NaN) double to
+  // long long is undefined behavior. Bounds are -2^63 and 2^63.
+  if (n >= -9223372036854775808.0 && n < 9223372036854775808.0 &&
+      n == (long long)n)
+    k = snprintf(TT.pbuf, PBUFSIZE, "%lld", (long long)n);
+  else k = snprintf(TT.pbuf, PBUFSIZE, fmt, n);
+  if (k < 0 || k >= PBUFSIZE) FFATAL("error encoding %f via '%s'", n, fmt);
+  return new_zstring(TT.pbuf, k);
+}
+
+////////////////////
+//// regex routines
+////////////////////
+
+// Process backslash escapes in s in place (result is never longer than the
+// input) for use as a regex. Unlike escape_str(), an unrecognized escape
+// keeps its backslash so the regex compiler sees it. Returns s.
+static char *rx_escape_str(char *s)
+{
+  char *p, *escapes = "abfnrtv\"/"; // FIXME TODO should / be in there?
+  char *s0 = s, *to = s;
+  while ((*to = *s)) {
+    if (*s != '\\') { to++, s++;
+    } else if ((p = strchr(escapes, *++s))) {
+      // checking char after \ for known escapes
+      int c = "\a\b\f\n\r\t\v\"/"[p-escapes];
+      if (c) *to = c, s++;  // else final backslash
+      to++;
+    } else if ('0' <= *s && *s <= '9') {
+      int k, c = *s++ - '0';
+      for (k = 0; k < 2 && '0' <= *s && *s <= '9'; k++)
+        c = c * 8 + *s++ - '0';
+      *to++ = c;
+    } else if (*s == 'x') {
+      if (isxdigit(s[1])) {
+        int c = hexval(*++s);
+        if (isxdigit(s[1])) c = c * 16 + hexval(*++s);
+        *to++ = c, s++;
+      }
+    } else *to++ = '\\', *to++ = *s++;
+  }
+  return s0;
+}
+
+// Compile pat as an extended regex into rx; reports via error_exit() if
+// regcomp() fails. Returns regcomp()'s result.
+static int rx_compile(regex_t *rx, char *pat)
+{
+  int r;
+  if ((r = regcomp(rx, pat, REG_EXTENDED)) != 0) {
+    char errbuf[256];
+    regerror(r, rx, errbuf, sizeof(errbuf));
+    error_exit("regex error %d: %s on '%s' -- ", r, errbuf, pat);
+  }
+  return r;
+}
+
+static void rx_compile_or_die(regex_t *rx, char *pat)
+{
+  if (rx_compile(rx, pat)) FATAL("bad regex\n");
+}
+
+// Get a regex for pat. If pat already holds a compiled regex, *rx is pointed
+// at it; otherwise pat is converted to a string, escapes are processed, and
+// the result is compiled into the regex_t that *rx points to on entry.
+static void rx_zvalue_compile(regex_t **rx, struct zvalue *pat)
+{
+  if (IS_RX(pat)) *rx = pat->rx;
+  else {
+    val_to_str(pat);
+    zvalue_dup_zstring(pat);
+    rx_escape_str(pat->vst->str);
+    rx_compile_or_die(*rx, pat->vst->str);
+  }
+}
+
+// Free rx unless it is the regex owned by pat (counterpart of
+// rx_zvalue_compile()).
+static void rx_zvalue_free(regex_t *rx, struct zvalue *pat)
+{
+  if (!IS_RX(pat) || rx != pat->rx) regfree(rx);
+}
+
+// Used by the match/not match ops (~ !~) and implicit $0 match (/regex/)
+// Returns 0 if zvsubject matches zvpat, 1 if it does not.
+static int match(struct zvalue *zvsubject, struct zvalue *zvpat)
+{
+  int r;
+  regex_t rx, *rxp = &rx;
+  val_to_str(zvsubject);
+  rx_zvalue_compile(&rxp, zvpat);
+  if ((r = regexec(rxp, zvsubject->vst->str, 0, 0, 0)) != 0) {
+    if (r != REG_NOMATCH) {
+      char errbuf[256];
+      // Use rxp: the local rx is uninitialized when zvpat is already a regex.
+      regerror(r, rxp, errbuf, sizeof(errbuf));
+      // FIXME TODO better diagnostic here
+      error_exit("regex match error %d: %s\n", r, errbuf);
+    }
+    rx_zvalue_free(rxp, zvpat);
+    return 1;
+  }
+  rx_zvalue_free(rxp, zvpat);
+  return 0;
+}
+
+// Find first match of rx in s. Returns 0 and sets *start / *end (offsets
+// into s) if found, REG_NOMATCH if not; any other regexec error is fatal.
+static int rx_find(regex_t *rx, char *s, regoff_t *start, regoff_t *end, int eflags)
+{
+  regmatch_t matches[1];
+  int r = regexec(rx, s, 1, matches, eflags);
+  if (r == REG_NOMATCH) return r;
+  if (r) FATAL("regexec error");  // TODO ? use regerr() to meaningful msg
+  *start = matches[0].rm_so;
+  *end = matches[0].rm_eo;
+  return 0;
+}
+
+// Differs from rx_find() in that FS cannot match null (empty) string.
+// See https://www.austingroupbugs.net/view.php?id=1468.
+static int rx_find_FS(regex_t *rx, char *s, regoff_t *start, regoff_t *end, int eflags)
+{
+  int r = rx_find(rx, s, start, end, eflags);
+  if (r || *start != *end) return r;  // not found, or found non-empty match
+  // Found empty match, retry starting past the match
+  char *p = s + *end;
+  if (!*p) return REG_NOMATCH;  // End of string, no non-empty match found
+  // Empty match not at EOS, move ahead and try again
+  while (!r && *start == *end && *++p)
+    r = rx_find(rx, p, start, end, eflags);
+  if (r || !*p) return REG_NOMATCH;  // no non-empty match found
+  *start += p - s;  // offsets from original string
+  *end += p - s;
+  return 0;
+}
+
+////////////////////
+////   fields
+////////////////////
+
+#define FIELDS_MAX  102400 // Was 1024; need more for toybox awk test
+#define THIS_MEANS_SET_NF 999999999
+
+// Integer value of v: truncated number, or string converted via
+// str_to_num(); 0 for anything else.
+static int get_int_val(struct zvalue *v)
+{
+  if (IS_NUM(v)) return (int)v->num;
+  if (IS_STR(v) && v->vst) return (int)str_to_num(v->vst->str);
+  return 0;
+}
+
+// A single-char FS is never a regex, so make it a [] regex to
+// match only that one char in case FS is a regex metachar.
+// If regex FS is needed, must use > 1 char. If a '.' regex
+// is needed, use e.g. '.|.' (unlikely case).
+static char *fmt_one_char_fs(char *fs)
+{
+  if (strlen(fs) != 1) return fs;
+  // '^' cannot go first in a bracket expression: "[^]" is an unterminated
+  // negated set and fails to compile. Escape it instead.
+  if (fs[0] == '^')
+    snprintf(TT.one_char_fs, sizeof(TT.one_char_fs), "\\%c", fs[0]);
+  else snprintf(TT.one_char_fs, sizeof(TT.one_char_fs), "[%c]", fs[0]);
+  return TT.one_char_fs;
+}
+
+// Return the compiled regex to split with for separator string fs:
+// the default regex for " ", the cached one if fs is unchanged since the
+// last call, else recompile (and cache) it.
+static regex_t *rx_fs_prep(char *fs)
+{
+  if (!strcmp(fs, " ")) return &TT.rx_default;
+  if (!strcmp(fs, TT.fs_last)) return &TT.rx_last;
+  if (strlen(fs) >= FS_MAX) FATAL("FS too long");
+  strcpy(TT.fs_last, fs);
+  regfree(&TT.rx_last);
+  rx_compile_or_die(&TT.rx_last, fmt_one_char_fs(fs));
+  return &TT.rx_last;
+}
+
+// Only for use by split() builtin
+// Store val[0..len-1] as element k of map m (a setter for splitter()).
+static void set_map_element(struct zmap *m, int k, char *val, size_t len)
+{
+  // Do not need format here b/c k is integer, uses "%lld" format.
+  struct zstring *key = num_to_zstring(k, "");// "" vs 0 format avoids warning
+  struct zmap_slot *zs = zmap_find_or_insert_key(m, key);
+  zstring_release(&key);
+  zs->val.vst = zstring_update(zs->val.vst, 0, val, len);
+  zs->val.flags = ZF_STR;
+  check_numeric_string(&zs->val);
+}
+
+// Make v a plain string value holding s[0..size-1].
+static void set_zvalue_str(struct zvalue *v, char *s, size_t size)
+{
+  v->vst = zstring_update(v->vst, 0, s, size);
+  v->flags = ZF_STR;
+}
+
+// All changes to NF go through here!
+static void set_nf(int nf)
+{
+  STACK[NF].num = TT.nf_internal = nf;
+  STACK[NF].flags = ZF_NUM;
+}
+
+// Set field fnum to s[0..size-1], growing the field list as needed, and set
+// NF to fnum (a setter for splitter(); the map argument is ignored).
+static void set_field(struct zmap *unused, int fnum, char *s, size_t size)
+{ (void)unused;
+  if (fnum < 0 || fnum > FIELDS_MAX) FFATAL("bad field num %d\n", fnum);
+  int nfields = zlist_len(&TT.fields);
+  // Need nfields to be > fnum b/c e.g. fnum==1 implies 2 TT.fields
+  while (nfields <= fnum)
+    nfields = zlist_append(&TT.fields, &uninit_zvalue) + 1;
+  set_zvalue_str(&FIELD[fnum], s, size);
+  set_nf(fnum);
+  check_numeric_string(&FIELD[fnum]);
+}
+
+// Split s via fs, using setter; return number of TT.fields.
+// This is used to split TT.fields and also for split() builtin.
+static int splitter(void (*setter)(struct zmap *, int, char *, size_t), struct zmap *m, char *s, struct zvalue *zvfs)
+{
+  regex_t *rx;
+  regoff_t offs, end;
+  if (!IS_RX(zvfs)) val_to_str(zvfs);
+  char *fs = IS_STR(zvfs) ? zvfs->vst->str : "";
+  int nf = 0, r = 0, eflag = 0;
+  // Empty string or empty fs (regex).
+  // Need to include !*s b/c empty string, otherwise
+  // split("", a, "x") splits to a 1-element (empty element) array
+  if (!*s || (IS_STR(zvfs) && !*fs) || IS_EMPTY_RX(zvfs)) {
+    for ( ; *s; s++) setter(m, ++nf, s, 1);
+    return nf;
+  }
+  if (IS_RX(zvfs)) rx = zvfs->rx;
+  else rx = rx_fs_prep(fs);
+  while (*s) {
+    // Find the next occurrence of FS.
+    // rx_find_FS() returns 0 if found. If nonzero, the field will
+    // be the rest of the record (all of it if first time through).
+    if ((r = rx_find_FS(rx, s, &offs, &end, eflag))) offs = end = strlen(s);
+    else {
+      // A newline before the FS match ends the field at the newline.
+      int k = strcspn(s, "\n");
+      if (k < offs) offs = k, end = k + 1;
+    }
+    eflag |= REG_NOTBOL;
+
+    // Field will be s up to (not including) the offset. If offset
+    // is zero and FS is found and FS is ' ' (TT.rx_default "[ \t]+"),
+    // then the find is the leading or trailing spaces and/or tabs.
+    // If so, skip this (empty) field, otherwise set field, length is offs.
+    if (offs || r || rx != &TT.rx_default) setter(m, ++nf, s, offs);
+    s += end;
+  }
+  if (!r && rx != &TT.rx_default) setter(m, ++nf, "", 0);
+  return nf;
+}
+
+// Split $0 into fields using FS and set NF (NF is 0 for an empty record).
+static void build_fields(void)
+{
+  val_to_str(&STACK[FS]);
+  char *rec = FIELD[0].vst->str;
+  // TODO test this -- why did I not want to split empty $0?
+  // Maybe don't split empty $0 b/c non-default FS gets NF==1 with splitter()?
+  set_nf(*rec ? splitter(set_field, 0, rec, &STACK[FS]) : 0);
+}
+
+// Rebuild $0 by joining fields 1..NF with OFS between them.
+static void rebuild_field0(void)
+{
+  struct zstring *s = FIELD[0].vst;
+  int nf = TT.nf_internal;
+  // uninit value needed for eventual reference to .vst in zstring_release()
+  struct zvalue tempv = uninit_zvalue;
+  zvalue_copy(&tempv, &STACK[OFS]);
+  val_to_str(&tempv);
+  for (int i = 1; i <= nf; i++) {
+    if (i > 1) {
+      s = s ? zstring_extend(s, tempv.vst) : zstring_copy(s, tempv.vst);
+    }
+    if (FIELD[i].flags) val_to_str(&FIELD[i]);
+    if (FIELD[i].vst) {
+      if (i > 1) s = zstring_extend(s, FIELD[i].vst);
+      else s = zstring_copy(s, FIELD[i].vst);
+    }
+  }
+  FIELD[0].vst = s;
+  FIELD[0].flags |= ZF_STR;
+  zvalue_release_zstring(&tempv);
+}
+
+// get field ref (lvalue ref) in prep for assignment to field.
+// [... assigning to a nonexistent field (for example, $(NF+2)=5) shall
+// increase the value of NF; create any intervening TT.fields with the
+// uninitialized value; and cause the value of $0 to be recomputed, with the
+// TT.fields being separated by the value of OFS.]
+// Called by setup_lvalue()
+static struct zvalue *get_field_ref(int fnum)
+{
+  if (fnum < 0 || fnum > FIELDS_MAX) error_exit("bad field num %d\n", fnum);
+  if (fnum > TT.nf_internal) {
+    // Ensure TT.fields list is large enough for fnum
+    // Need len of TT.fields to be > fnum b/c e.g. fnum==1 implies 2 TT.fields
+    for (int i = TT.nf_internal + 1; i <= fnum; i++) {
+      if (i == zlist_len(&TT.fields)) zlist_append(&TT.fields, &uninit_zvalue);
+      zvalue_copy(&FIELD[i], &uninit_string_zvalue);
+    }
+    set_nf(fnum);
+  }
+  return &FIELD[fnum];
+}
+
+// Called by tksplit op
+static int split(struct zstring *s, struct zvalue *a, struct zvalue *fs)
+{
+  return splitter(set_map_element, a->map, s->str, fs);
+}
+
+// Called by getrec_f0_f() and getrec_f0()
+static void copy_to_field0(char *buf, size_t k)
+{
+  set_zvalue_str(&FIELD[0], buf, k);
+  check_numeric_string(&FIELD[0]);
+  build_fields();
+}
+
+// After changing $0, must rebuild TT.fields & reset NF
+// Changing other field must rebuild $0
+// Called by gsub() and assignment ops.
+static void fixup_fields(int fnum)
+{
+  if (fnum == THIS_MEANS_SET_NF) {  // NF was assigned to
+    // get_int_val() also handles NF having been assigned a string value.
+    int new_nf = get_int_val(&STACK[NF]);
+    // Same limits as for field numbers in set_field()/get_field_ref().
+    if (new_nf < 0 || new_nf > FIELDS_MAX) FFATAL("bad NF value %d\n", new_nf);
+    // Ensure TT.fields list is large enough for fnum
+    // Need len of TT.fields to be > fnum b/c e.g. fnum==1 implies 2 TT.fields
+    for (int i = TT.nf_internal + 1; i <= new_nf; i++) {
+      if (i == zlist_len(&TT.fields)) zlist_append(&TT.fields, &uninit_zvalue);
+      zvalue_copy(&FIELD[i], &uninit_string_zvalue);
+    }
+    // Use the converted value, not the raw .num (which is only meaningful
+    // when NF currently holds a number).
+    set_nf(new_nf);
+    rebuild_field0();
+    return;
+  }
+  // fnum is # of field that was just updated.
+  // If it's 0, need to rebuild the TT.fields 1... n.
+  // If it's non-0, need to rebuild field 0.
+  val_to_str(&FIELD[fnum]);
+  if (fnum) check_numeric_string(&FIELD[fnum]);
+  if (fnum) rebuild_field0();
+  else build_fields();
+}
+
+// Fetching non-existent field gets uninit string value; no change to NF!
+// Called by tkfield op       // TODO inline it?
+static void push_field(int fnum)
+{
+  if (fnum < 0 || fnum > FIELDS_MAX) error_exit("bad field num %d\n", fnum);
+  // Contrary to posix, awk evaluates TT.fields beyond $NF as empty strings.
+  if (fnum > TT.nf_internal) push_val(&uninit_string_zvalue);
+  else push_val(&FIELD[fnum]);
+}
+
+////////////////////
+////   END fields
+////////////////////
+
+#define STKP (&STACK[TT.stkptr])   // pointer to top of stack
+
+// Random number generator
+// Extracted from http://www.cs.ucl.ac.uk/staff/d.jones/GoodPracticeRNG.pdf
+// modified to encapsulate state and add seed function.
+static struct jkiss32_state {
+  unsigned x, y, z, w, c, seed;
+} jkst = {123456789, 234567891, 345678912, 456789123, 0, 1};
+
+// Advance the generator and return the next 32-bit value.
+static unsigned jkiss32(void)
+{
+  int t;
+  jkst.y ^= (jkst.y<<5); jkst.y ^= (jkst.y>>7); jkst.y ^= (jkst.y<<22);
+  t = jkst.z+jkst.w+jkst.c; jkst.z = jkst.w; jkst.c = t<0; jkst.w = t&2147483647;
+  jkst.x += 1411392427;
+  return jkst.x + jkst.y + jkst.w;
+}
+
+// Reseed the generator with n (0 is treated as 1); returns the previous seed.
+static unsigned seed_jkiss32(unsigned n)
+{
+  unsigned r = jkst.seed;
+  if (!n) n = 1;
+  jkst = (struct jkiss32_state){n*123456789, n*234567891, n*345678912, n*456789123, 0, n};
+  // Run the generator for a while to mix the state after seeding.
+  if (n > 1) for (n = 10000; n--;) jkiss32();
+  return r;
+}
+// END Random number generator
+
+// Pop the top of stack and return its numeric member as an int.
+// Does not release any string the value may hold.
+static int popnumval(void)
+{
+  TT.stack.avail -= sizeof(struct zvalue);
+  return STACK[TT.stkptr--].num;
+}
+
+// Pop the top of stack, releasing its string if any.
+static void drop(void)
+{
+  TT.stack.avail -= sizeof(struct zvalue);
+  struct zvalue *v = &STACK[TT.stkptr--];
+  zvalue_release_zstring(v);
+}
+
+static void drop_n(int n)
+{
+  while (n--) drop();
+}
+
+// Exchange the top two stack values.
+static void swap(void)
+{
+  struct zvalue tmp = STKP[-1];
+  STKP[-1] = STKP[0];
+  STKP[0] = tmp;
+}
+
+// A value flagged as (maybe) map is fatal in scalar context if it is a
+// definite map or has elements; otherwise it becomes an uninitialized scalar.
+static void force_maybemap_to_scalar(struct zvalue *v)
+{
+  if (!(v->flags & ZF_ANYMAP)) return;
+  if (v->flags & ZF_MAP || v->map->count)
+    FATAL("array in scalar context");
+  v->flags = 0; v->map = 0; // v->flags = v->map = 0 gets warning
+}
+
+static void force_maybemap_to_map(struct zvalue *v)
+{
+  if (v->flags & ZF_MAYBEMAP) v->flags = ZF_MAP;
+}
+
+// Set and return logical (0/1) val of top TT.stack value; flag value as NUM.
+static int get_set_logical(void)
+{
+  struct zvalue *v = STKP;
+  force_maybemap_to_scalar(v);
+  int r = 0;
+  if (IS_NUM(v)) r = !! v->num;
+  else if (IS_STR(v)) r = (v->vst && v->vst->str[0]);
+  zvalue_release_zstring(v);
+  v->num = r;
+  v->flags = ZF_NUM;
+  return r;
+}
+
+// Convert v in place to a string value, formatting a number via fmt
+// (integral numbers are formatted as integers by num_to_zstring()).
+static struct zvalue *val_to_str_fmt(struct zvalue *v, char *fmt)
+{
+  force_maybemap_to_scalar(v);
+  // TODO: consider handling numstring differently
+  // if string and ONLY string (not numstring)
+  if (v->flags & ZF_NUMSTR) v->flags = ZF_STR;
+  if (IS_STR(v)) return v;
+  else if (!v->flags) { // uninitialized
+    v->vst = new_zstring("", 0);
+  } else if (IS_NUM(v)) {
+    zvalue_release_zstring(v);
+    v->vst = num_to_zstring(v->num, fmt);
+  } else {
+    FATAL("Wrong or unknown type in val_to_str_fmt\n");
+  }
+  v->flags = ZF_STR;
+  return v;
+}
+
+// Convert v in place to a string value using CONVFMT.
+static struct zvalue *val_to_str(struct zvalue *v)
+{
+  force_maybemap_to_scalar(v);
+  // chicken-egg problem here. Need to convert CONVFMT to string
+  // but need it to be a string. So use default format.
+  // Should only happen when user sets CONVFMT to not-a-string.
+  if (!IS_STR(&STACK[CONVFMT])) {
+    zstring_release(&STACK[CONVFMT].vst);
+    STACK[CONVFMT].vst = num_to_zstring(STACK[CONVFMT].num, "%.6g");
+    STACK[CONVFMT].flags = ZF_STR;
+  }
+  return val_to_str_fmt(v, STACK[CONVFMT].vst->str);
+}
+#define ENSURE_STR(v) (IS_STR(v) ? (v) : val_to_str(v))
+
+// Convert v in place to a number value and return the number.
+static double val_to_num(struct zvalue *v)
+{
+  force_maybemap_to_scalar(v);
+  if (v->flags & ZF_NUMSTR) zvalue_release_zstring(v);
+  else if (!(IS_NUM(v))) {
+    v->num = 0.0;
+    if (IS_STR(v) && v->vst) v->num = str_to_num(v->vst->str);
+    zvalue_release_zstring(v);
+  }
+  v->flags = ZF_NUM;
+  return v->num;
+}
+
+// Make v a string value holding zs, releasing v's previous string.
+static void set_string(struct zvalue *v, struct zstring *zs)
+{
+  zstring_release(&v->vst);
+  v->vst = zs;
+  v->flags = ZF_STR;
+}
+
+// Make v a number value n, releasing v's previous string.
+static void set_num(struct zvalue *v, double n)
+{
+  zstring_release(&v->vst);
+  v->num = n;
+  v->flags = ZF_NUM;
+}
+
+// Set v to its truncated numeric value plus one.
+static void incr_zvalue(struct zvalue *v)
+{
+  v->num = trunc(val_to_num(v)) + 1;
+}
+
+static void push_int_val(ptrdiff_t n)
+{
+  struct zvalue v = ZVINIT(ZF_NUM, n, 0);
+  push_val(&v);
+}
+
+// Return the value slot for key in map v, creating it if absent.
+// key is converted to a string in place.
+static struct zvalue *get_map_val(struct zvalue *v, struct zvalue *key)
+{
+  val_to_str(key); // FIXME does this work always?
+  struct zmap_slot *x = zmap_find_or_insert_key(v->map, key->vst);
+  return &x->val;
+}
+
+// Resolve the reference at STACK[ref_stack_ptr] to the zvalue to be
+// assigned. *field_num is set to the field number for a field ref, to
+// THIS_MEANS_SET_NF when the target is NF, else -1. For a map element ref
+// the subscript below the top of stack is consumed (swap + drop).
+static struct zvalue *setup_lvalue(int ref_stack_ptr, int parmbase, int *field_num)
+{
+  // for +=, *=, etc
+  // Stack is: ... scalar_ref value_to_op_by
+  // or ... subscript_val map_ref value_to_op_by
+  // or ... fieldref value_to_op_by
+  // for =, ++, --
+  // Stack is: ... scalar_ref
+  // or ... subscript_val map_ref
+  // or ... fieldnum fieldref
+  int k;
+  struct zvalue *ref, *v = 0; // init v to mute "may be uninit" warning
+  *field_num = -1;
+  ref = &STACK[ref_stack_ptr];
+  if (ref->flags & ZF_FIELDREF) return get_field_ref(*field_num = ref->num);
+  // Non-negative ref is an absolute stack slot; negative is relative to
+  // parmbase.
+  k = ref->num >= 0 ? ref->num : parmbase - ref->num;
+  if (k == NF) *field_num = THIS_MEANS_SET_NF;
+  v = &STACK[k];
+  if (ref->flags & ZF_REF) {
+    force_maybemap_to_scalar(v);
+  } else if (ref->flags & ZF_MAPREF) {
+    force_maybemap_to_map(v);
+    if (!IS_MAP(v)) FATAL("scalar in array context");
+    v = get_map_val(v, &STACK[ref_stack_ptr - 1]);
+    swap();
+    drop();
+  } else FATAL("assignment to bad lvalue");
+  return v; // order FATAL() and return to mute warning
+}
+
+
+// Allocate a zfile entry for fn/fp and link it at the head of TT.zfiles.
+static struct zfile *new_file(char *fn, FILE *fp, char mode, char f_or_p)
+{
+  struct zfile *f = xzalloc(sizeof(struct zfile));
+  *f = (struct zfile){TT.zfiles, xstrdup(fn), fp, mode, f_or_p, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  return TT.zfiles = f;
+}
+
+// Flush every file in the table; returns -1 if any flush failed, else 0.
+static int fflush_all(void)
+{
+  int ret = 0;
+  for (struct zfile *p = TT.zfiles; p; p = p->next)
+    if (fflush(p->fp)) ret = -1;
+  return ret;
+}
+
+// fflush() builtin: flush everything if there is no argument or the
+// argument is the null string, else flush the named file.
+static int fflush_file(int nargs)
+{
+  if (!nargs) return fflush_all();
+
+  val_to_str(STKP);   // filename at top of TT.stack
+  // Null string means flush all
+  if (!STKP[0].vst->str[0]) return fflush_all();
+
+  // is it open in file table?
+  for (struct zfile *p = TT.zfiles; p; p = p->next)
+    if (!strcmp(STKP[0].vst->str, p->fn))
+      if (!fflush(p->fp)) return 0;
+  return -1;    // error, or file not found in table
+}
+static int close_file(char *fn)
+{
+  // !fn (null ptr) means close all (exc. stdin/stdout/stderr)
+  int r = 0;
+  struct zfile *np, **pp = &TT.zfiles;
+  for (struct zfile *p = TT.zfiles; p; p = np) {
+    np = p->next;   // save in case unlinking file (invalidates p->next)
+    // Don't close std files -- wrecks print/printf (can be fixed though TODO)
+    if ((!p->is_std_file) && (!fn || !strcmp(fn, p->fn))) {
+      xfree(p->recbuf);
+      xfree(p->recbuf_multi);
+      xfree(p->recbuf_multx);
+      xfree(p->fn);
+      r = (p->fp) ? (p->file_or_pipe == 'f' ? fclose : pclose)(p->fp) : -1;
+      *pp = p->next;
+      xfree(p);
+      if (fn) return r;
+    } else pp = &p->next; // only if not unlinking zfile
+  }
+  return -1;  // file not in table, or closed all files
+}
+
+static struct zfile badfile_obj, *badfile = &badfile_obj;
+
+// FIXME TODO check if file/pipe/mode matches what's in the table already.
+// Apparently gawk/mawk/nawk are OK with different mode, but just use the file
+// in whatever mode it's already in; i.e. > after >> still appends.
+// Pops the filename from the stack. Returns badfile if a file/pipe cannot
+// be opened for reading; failure to open in any other mode is fatal.
+static struct zfile *setup_file(char *file_or_pipe, char *mode)
+{
+  val_to_str(STKP);   // filename at top of TT.stack
+  char *fn = STKP[0].vst->str;
+  // is it already open in file table?
+  for (struct zfile *p = TT.zfiles; p; p = p->next)
+    if (!strcmp(fn, p->fn)) {
+      drop();
+      return p;   // open; return it
+    }
+  FILE *fp = (*file_or_pipe == 'f' ? fopen : popen)(fn, mode);
+  if (fp) {
+    struct zfile *p = new_file(fn, fp, *mode, *file_or_pipe);
+    drop();
+    return p;
+  }
+  if (*mode != 'r') FFATAL("cannot open '%s'\n", fn);
+  drop();
+  return badfile;
+}
+
+// Fetch stack slot k as an int ('*' width/precision argument for printf).
+static int getcnt(int k)
+{
+  if (k >= TT.stkptr) FATAL("too few args for printf\n");
+  return (int)val_to_num(&STACK[k]);
+}
+
+// fprintf()-compatible function that appends the formatted text to the
+// TT.rgl.zspr string instead of writing to a file (first arg is ignored).
+static int fsprintf(FILE *ignored, const char *fmt, ...)
+{
+  (void)ignored;
+  va_list args, args2;
+  va_start(args, fmt);
+  va_copy(args2, args);
+  int len = vsnprintf(0, 0, fmt, args);  // size needed
+  va_end(args);
+  // vsnprintf() returns a negative value on error (e.g. result too long);
+  // adding that to size below would corrupt the string.
+  if (len < 0) {
+    va_end(args2);
+    FATAL("error formatting sprintf output\n");
+  }
+
+  // Unfortunately we have to mess with zstring internals here.
+  if (len > (int)(TT.rgl.zspr->capacity - TT.rgl.zspr->size) - 1) {
+    size_t cap = 2 * TT.rgl.zspr->capacity + len;
+    TT.rgl.zspr = xrealloc(TT.rgl.zspr, sizeof(*TT.rgl.zspr) + cap);
+    TT.rgl.zspr->capacity = cap;
+  }
+  vsnprintf(TT.rgl.zspr->str + TT.rgl.zspr->size, len+1, fmt, args2);
+  TT.rgl.zspr->size += len;
+  TT.rgl.zspr->str[TT.rgl.zspr->size] = 0;
+
+  va_end(args2);
+  return 0;
+}
+
+// Format the top nargs stack values printf-style through fpvar: the first
+// is the format, the rest its arguments. The format is processed one
+// conversion at a time; it is temporarily NUL-terminated in place and
+// restored afterward.
+static void varprint(int(*fpvar)(FILE *, const char *, ...), FILE *outfp, int nargs)
+{
+  int k, nn, nnc, fmtc, holdc, cnt1 = 0, cnt2 = 0;
+  double n = 0;
+  char *s;
+  regoff_t offs = -1, e = -1;
+  val_to_str(&STACK[TT.stkptr-nargs+1]);
+  char *fmt = STACK[TT.stkptr-nargs+1].vst->str;
+  k = TT.stkptr - nargs + 2;
+  while (*fmt) {
+    // Emit literal text up to the next '%'.
+    nn = strcspn(fmt, "%");
+    if (nn) {
+      holdc = fmt[nn];
+      fmt[nn] = 0;
+      fpvar(outfp, "%s", fmt);
+      fmt[nn] = holdc;
+    }
+    fmt += nn;
+    if (!*fmt) break;
+    // Isolate one conversion spec: '%' through its conversion char.
+    nnc = strcspn(fmt+1, "aAdiouxXfFeEgGcs%");
+    fmtc = fmt[nnc+1];
+    if (!fmtc) FFATAL("bad printf format '%s'", fmt);
+    holdc = fmt[nnc+2];
+    fmt[nnc+2] = 0;
+    if (rx_find(&TT.rx_printf_fmt, fmt, &offs, &e, 0))
+      FFATAL("bad printf format <%s>\n", fmt);
+    // One arg for the value plus one per '*'; "%%" takes no value.
+    int nargsneeded = 1;
+    for (char *p = strchr(fmt, '*'); p; p = strchr(p+1, '*'))
+      nargsneeded++;
+    nargsneeded -= fmtc == '%';
+
+    switch (nargsneeded) {
+      case 0:
+        fpvar(outfp, fmt);
+        break;
+      case 3:
+        cnt1 = getcnt(k++);
+        ATTR_FALLTHROUGH_INTENDED;
+      case 2:
+        cnt2 = getcnt(k++);
+        ATTR_FALLTHROUGH_INTENDED;
+      case 1:
+        if (k > TT.stkptr) FATAL("not enough args for printf format\n");
+        if (fmtc == 's') {
+          val_to_str(&STACK[k]);
+          s = STACK[k++].vst->str;
+        } else if (fmtc == 'c' && !IS_NUM(&STACK[k])) {
+          // %c with a non-number: first char of its string.
+          n = STACK[k++].vst ? STACK[k-1].vst->str[0] : '!';
+        } else {
+          val_to_num(&STACK[k]);
+          n = STACK[k++].num;
+        }
+        switch (nargsneeded) {
+          case 1:
+            if (fmtc == 's') fpvar(outfp, fmt, s);
+            else if (strchr("cdi", fmtc)) fpvar(outfp, fmt, (int)n);
+            else if (strchr("ouxX", fmtc)) fpvar(outfp, fmt, (unsigned)n);
+            else fpvar(outfp, fmt, n);
+            break;
+          case 2:
+            if (fmtc == 's') fpvar(outfp, fmt, cnt2, s);
+            else if (strchr("cdi", fmtc)) fpvar(outfp, fmt, cnt2, (int)n);
+            else if (strchr("ouxX", fmtc)) fpvar(outfp, fmt, cnt2, (unsigned)n);
+            else fpvar(outfp, fmt, cnt2, n);
+            break;
+          case 3:
+            if (fmtc == 's') fpvar(outfp, fmt, cnt1, cnt2, s);
+            else if (strchr("cdi", fmtc)) fpvar(outfp, fmt, cnt1, cnt2, (int)n);
+            else if (strchr("ouxX", fmtc)) fpvar(outfp, fmt, cnt1, cnt2, (unsigned)n);
+            else fpvar(outfp, fmt, cnt1, cnt2, n);
+            break;
+        }
+        break;
+      default:
+        FATAL("bad printf format\n");
+    }
+    fmt += nnc + 2;
+    *fmt = holdc;
+  }
+}
+
+// Process backslash escapes in s in place (result is never longer than the
+// input). Unlike rx_escape_str(), "\\" is a known escape and the backslash
+// of an unrecognized escape is dropped. Returns s.
+static char *escape_str(char *s)
+{
+  char *p, *escapes = "\\abfnrtv\"/"; // FIXME TODO should / be in there?
+  char *s0 = s, *to = s;
+  while ((*to = *s)) {
+    if (*s != '\\') to++, s++;
+    else if ((p = strchr(escapes, *++s))) {
+      // checking char after \ for known escapes
+      int c = "\\\a\b\f\n\r\t\v\"/"[p-escapes];
+      if (c) *to = c, s++;  // else final backslash
+      to++;
+    } else if ('0' <= *s && *s <= '9') {
+      int k, c = *s++ - '0';
+      for (k = 0; k < 2 && '0' <= *s && *s <= '9'; k++)
+        c = c * 8 + *s++ - '0';
+      *to++ = c;
+    } else if (*s == 'x') {
+      if (isxdigit(s[1])) {
+        int c = hexval(*++s);
+        if (isxdigit(s[1])) c = c * 16 + hexval(*++s);
+        *to++ = c, s++;
+      }
+    } else *to++ = *s++;
+  }
+  return s0;
+}
+
+// Return 1 if v is a valid variable name: a letter or underscore followed
+// by letters, digits, or underscores; else 0.
+static int is_ok_varname(char *v)
+{
+  char *ok = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_";
+  if (!*v) return 0;
+  for (int i = 0; v[i]; i++)
+    // ok + 10 skips the digits: not allowed as the first character.
+    if (i ? !strchr(ok, v[i]) : !strchr(ok + 10, v[i])) return 0;
+  return 1;
+}
+
+// FIXME TODO return value never used. What if assign to var not in globals?
+static int assign_global(char *var, char *value)
+{
+  if (!is_ok_varname(var)) FFATAL("Invalid variable name '%s'\n", var);
+  int globals_ent = find_global(var);
+  if (globals_ent) {
+    struct zvalue *v = &STACK[globals_ent];
+    if (IS_MAP(v)) error_exit("-v assignment to array\n");  // Maybe not needed?
+    zvalue_release_zstring(v);
+    // escape_str() edits in place, so work on a copy of value.
+    value = xstrdup(value);
+    *v = new_str_val(escape_str(value));
+    xfree(value);
+    check_numeric_string(v);
+    return 1;
+  }
+  return 0;
+}
+
+// If valid assignment arg, assign the global and return 1;
+// otherwise return 0.
+// TODO FIXME This does not check the format of the variable per posix.
+// Needs to start w/ _A-Za-z then _A-Za-z0-9
+// If not valid assignment form, then nextfilearg needs to treat as filename.
+static int assignment_arg(char *arg)
+{
+  char *val = strchr(arg, '=');
+  if (val) {
+    // Temporarily split arg at '='; the '=' is restored before returning.
+    *val++ = 0;
+    if (!is_ok_varname(arg)) {
+      *--val = '=';
+      return 0;
+    }
+    assign_global(arg, val);
+    *--val = '=';
+    return 1;
+  } else return 0;
+}
+
+// Advance through ARGV to the next file argument, performing var=value
+// assignment args and skipping empty/missing entries along the way.
+// Returns the file name, or 0 when ARGC is reached.
+static char *nextfilearg(void)
+{
+  char *arg;
+  do {
+    if (++TT.rgl.narg >= (int)val_to_num(&STACK[ARGC])) return 0;
+    struct zvalue *v = &STACK[ARGV];
+    struct zvalue zkey = ZVINIT(ZF_STR, 0,
+        num_to_zstring(TT.rgl.narg, val_to_str(&STACK[CONVFMT])->vst->str));
+    arg = "";
+    if (zmap_find(v->map, zkey.vst)) {
+      zvalue_copy(&TT.rgl.cur_arg, val_to_str(get_map_val(v, &zkey)));
+      arg = TT.rgl.cur_arg.vst->str;
+    }
+    zvalue_release_zstring(&zkey);
+  } while (!*arg || assignment_arg(arg));
+  TT.rgl.nfiles++;
+  return arg;
+}
+
+// Switch TT.cfile to the next input: the next file argument, or stdin if
+// there were no file arguments or the argument is "-".
+// Returns 1 if an input is ready, 0 (and sets the eof flag) if none remain.
+static int next_fp(void)
+{
+  char *fn = nextfilearg();
+  if (TT.cfile->fp && TT.cfile->fp != stdin) fclose(TT.cfile->fp);
+  if ((!fn && !TT.rgl.nfiles && TT.cfile->fp != stdin) || (fn && !strcmp(fn, "-"))) {
+    TT.cfile->fp = stdin;
+    zvalue_release_zstring(&STACK[FILENAME]);
+    // Length 7 must match the literal.
+    STACK[FILENAME].vst = new_zstring("<stdin>", 7);
+  } else if (fn) {
+    if (!(TT.cfile->fp = fopen(fn, "r"))) FFATAL("can't open %s\n", fn);
+    zvalue_copy(&STACK[FILENAME], &TT.rgl.cur_arg);
+    set_num(&STACK[FNR], 0);
+  } else {
+    TT.rgl.eof = 1;
+    return 0;
+  }
+  return 1;
+}
+
+// Read one record in the mode used when RS is null (see getrec_f()):
+// skip leading empty lines, then gather lines up to the next empty line
+// or EOF. Sets TT.rgl.recptr; returns record length, or < 0 at EOF.
+static ssize_t getrec_multiline(struct zfile *zfp)
+{
+  ssize_t k, kk;
+  do {
+    k = getdelim(&zfp->recbuf_multi, &zfp->recbufsize_multi, '\n', zfp->fp);
+  } while (k > 0 && zfp->recbuf_multi[0] == '\n');
+  TT.rgl.recptr = zfp->recbuf_multi;
+  if (k < 0) return k;
+  // k > 0 and recbuf_multi is not only a \n. Prob. ends w/ \n
+  // but may not at EOF (last line w/o newline)
+  for (;;) {
+    kk = getdelim(&zfp->recbuf_multx, &zfp->recbufsize_multx, '\n', zfp->fp);
+    if (kk < 0 || zfp->recbuf_multx[0] == '\n') break;
+    // data is in zfp->recbuf_multi[0..k-1]; append to it
+    if ((size_t)(k + kk + 1) > zfp->recbufsize_multi)
+      zfp->recbuf_multi =
+          xrealloc(zfp->recbuf_multi, zfp->recbufsize_multi = k + kk + 1);
+    memmove(zfp->recbuf_multi + k, zfp->recbuf_multx, kk+1);
+    k += kk;
+  }
+  if (k > 1 && zfp->recbuf_multi[k-1] == '\n') zfp->recbuf_multi[--k] = 0;
+  TT.rgl.recptr = zfp->recbuf_multi;
+  return k;
+}
+
+// Read the next record from zfp, using RS as the record separator regex
+// (multiline mode if RS is null). Sets TT.rgl.recptr to the NUL-terminated
+// record; returns its length, or -1 if no more data.
+static ssize_t getrec_f(struct zfile *zfp)
+{
+  int r = 0, rs = ENSURE_STR(&STACK[RS])->vst->str[0] & 0xff;
+  if (!rs) return getrec_multiline(zfp);
+  regex_t rsrx, *rsrxp = &rsrx;
+  // TEMP!! FIXME Need to cache and avoid too-frequent rx compiles
+  rx_zvalue_compile(&rsrxp, &STACK[RS]);
+  regoff_t so = 0, eo = 0;
+  long ret = -1;
+  for ( ;; ) {
+    if (zfp->recoffs == zfp->endoffs) {
+#define INIT_RECBUF_LEN     8192
+#define RS_LENGTH_MARGIN    (INIT_RECBUF_LEN / 8)
+      if (!zfp->recbuf)
+        zfp->recbuf = xmalloc((zfp->recbufsize = INIT_RECBUF_LEN) + 1);
+      zfp->endoffs = fread(zfp->recbuf, 1, zfp->recbufsize, zfp->fp);
+      zfp->recoffs = 0;
+      zfp->recbuf[zfp->endoffs] = 0;
+      if (!zfp->endoffs) break;
+    }
+    TT.rgl.recptr = zfp->recbuf + zfp->recoffs;
+    r = rx_find(rsrxp, TT.rgl.recptr, &so, &eo, REG_NOTBOL | REG_NOTEOL);
+    // if not found, or found "near" end of buffer...
+    if (r || zfp->recoffs + eo > (int)zfp->recbufsize - RS_LENGTH_MARGIN) {
+      // if at end of data, and (not found or found at end of data)
+      if (zfp->endoffs < (int)zfp->recbufsize &&
+          (r || zfp->recoffs + eo == zfp->endoffs)) {
+        ret = zfp->endoffs - zfp->recoffs;
+        zfp->recoffs = zfp->endoffs;
+        break;
+      }
+      // Need more data: slide the unread part to the buffer start, or grow
+      // the buffer if it is already at the start; then read more and retry.
+      if (zfp->recoffs) {
+        memmove(zfp->recbuf, TT.rgl.recptr, zfp->endoffs - zfp->recoffs);
+        zfp->endoffs -= zfp->recoffs;
+        zfp->recoffs = 0;
+      } else zfp->recbuf =
+        xrealloc(zfp->recbuf, (zfp->recbufsize = zfp->recbufsize * 3 / 2) + 1);
+      zfp->endoffs += fread(zfp->recbuf + zfp->endoffs,
+                      1, zfp->recbufsize - zfp->endoffs, zfp->fp);
+      zfp->recbuf[zfp->endoffs] = 0;
+    } else {
+      // found and not too near end of data
+      ret = so;
+      TT.rgl.recptr[so] = 0;
+      zfp->recoffs += eo;
+      break;
+    }
+  }
+  regfree(rsrxp);
+  return ret;
+}
+
+// Read the next record from the main input, moving on to the next input
+// file as each is exhausted. Returns record length, or -1 at end of input.
+static ssize_t getrec(void)
+{
+  ssize_t k;
+  if (TT.rgl.eof) return -1;
+  if (!TT.cfile->fp) next_fp();
+  do {
+    if ((k = getrec_f(TT.cfile)) >= 0) return k;
+  } while (next_fp());
+  return -1;
+}
+
+// Read a record from zfp into $0 (and split it into fields).
+static ssize_t getrec_f0_f(struct zfile *zfp)
+{
+  ssize_t k = getrec_f(zfp);
+  if (k >= 0) {
+    copy_to_field0(TT.rgl.recptr, k);
+  }
+  return k;
+}
+
+// Read a record from the main input into $0 and bump NR and FNR.
+static ssize_t getrec_f0(void)
+{
+  ssize_t k = getrec();
+  if (k >= 0) {
+    copy_to_field0(TT.rgl.recptr, k);
+    incr_zvalue(&STACK[NR]);
+    incr_zvalue(&STACK[FNR]);
+  }
+  return k;
+}
+
+// source is tkeof (no pipe/file), tklt (file), or tkpipe (pipe)
+// fp is file or pipe (is NULL if file/pipe could not be opened)
+// FIXME TODO should -1 return be replaced by test at caller?
+// v is NULL or an lvalue ref
+// Returns 1 if a record was read, 0 if not, -1 if the file/pipe is not open.
+static int awk_getline(int source, struct zfile *zfp, struct zvalue *v)
+{
+  ssize_t k;
+  int is_stream = source != tkeof;
+  if (is_stream && !zfp->fp) return -1;
+  if (v) {
+    if ((k = is_stream ? getrec_f(zfp) : getrec()) < 0) return 0;
+    set_string(v, new_zstring(TT.rgl.recptr, k));
+    if (!is_stream) {
+      incr_zvalue(&STACK[NR]);
+      incr_zvalue(&STACK[FNR]);
+    }
+  } else k = is_stream ? getrec_f0_f(zfp) : getrec_f0();
+  return k < 0 ? 0 : 1;
+}
+
+// Define GAWK_SUB to get the same behavior with sub()/gsub() replacement text
+// as with gawk, goawk, and recent bwk awk (nawk) versions. Undefine GAWK_SUB
+// to get the simpler POSIX behavior, but I think most users will prefer the
+// gawk behavior. See the gawk (GNU Awk) manual,
+// sec. 9.1.4.1 // More about '\' and '&' with sub(), gsub(), and gensub()
+// for details on the differences.
+//
+#undef GAWK_SUB
+#define GAWK_SUB
+
+// sub(ere, repl[, in]) Substitute the string repl in place of the
+// first instance of the extended regular expression ERE in string 'in'
+// and return the number of substitutions. An <ampersand> ( '&' )
+// appearing in the string repl shall be replaced by the string from in
+// that matches the ERE. (partial spec... there's more)
+// Implements both sub() and gsub(), selected by opcode. Stack on entry is
+// ere, repl, target ref; all three are dropped and the number of
+// substitutions is pushed.
+static void gsub(int opcode, int nargs, int parmbase)
+{ (void)nargs;
+  int field_num = -1;
+  // compile ensures 3 args
+  struct zvalue *v = setup_lvalue(TT.stkptr, parmbase, &field_num);
+  struct zvalue *ere = STKP-2;
+  struct zvalue *repl = STKP-1;
+  regex_t rx, *rxp = &rx;
+  rx_zvalue_compile(&rxp, ere);
+  val_to_str(repl);
+  val_to_str(v);
+
+#define MKINT(x) ((int)(x))    // coerce to integer
+#define SLEN(zvalp) ((zvalp)->vst->size)
+  char *p, *rp0 = repl->vst->str, *rp = rp0, *s = v->vst->str;
+  int namps = 0, nhits = 0, is_sub = (opcode == tksub), eflags = 0;
+  regoff_t so = -1, eo;
+  // Count ampersands in repl string; may be overcount due to \& escapes.
+  for (rp = rp0; *rp; rp++) namps += *rp == '&';
+  p = s;
+  regoff_t need = SLEN(v) + 1;  // capacity needed for result string
+  // A pass just to determine needed destination (result) string size.
+  while(!rx_find(rxp, p, &so, &eo, eflags)) {
+    need += SLEN(repl) + (eo - so) * (namps - 1);
+    if (!*p) break;
+    p += eo ? eo : 1; // ensure progress if empty hit at start
+    if (is_sub) break;
+    eflags |= REG_NOTBOL;
+  }
+
+  if (so >= 0) {  // at least one hit
+    struct zstring *z = new_zstring_cap(need);
+    char *e = z->str; // result destination pointer
+    p = s;
+    eflags = 0;
+    char *ep0 = p, *sp, *ep;
+    while(!rx_find(rxp, p, &so, &eo, eflags)) {
+      sp = p + so;
+      ep = p + eo;
+      memmove(e, ep0, sp - ep0);  // copy unchanged part
+      e += sp - ep0;
+      // Skip match if not at start and just after prev match and this is empty
+      if (p == s || sp - ep0 || eo - so) {
+        nhits++;
+        for (rp = rp0; *rp; rp++) { // copy replacement
+          if (*rp == '&') {
+            memmove(e, sp, eo - so);  //copy match
+            e += eo - so;
+          } else if (*rp == '\\') {
+            if (rp[1] == '&') *e++ = *++rp;
+            else if (rp[1] != '\\') *e++ = *rp;
+            else {
+#ifdef GAWK_SUB
+              if (rp[2] == '\\' && rp[3] == '&') {
+                rp += 2;
+                *e++ = *rp;
+              } else if (rp[2] != '&') *e++ = '\\';
+#endif
+              *e++ = *++rp;
+            }
+          } else *e++ = *rp;
+        }
+      }
+      ep0 = ep;
+      if (!*p) break;
+      p += eo ? eo : 1; // ensure progress if empty hit at start
+      if (is_sub) break;
+      eflags |= REG_NOTBOL;
+    }
+    // copy remaining subject string
+    memmove(e, ep0, s + SLEN(v) - ep0);
+    e += s + SLEN(v) - ep0;
+    *e = 0;
+    z->size = e - z->str;
+    zstring_release(&v->vst);
+    v->vst = z;
+  }
+  rx_zvalue_free(rxp, ere);
+  if (!IS_RX(STKP-2)) zstring_release(&STKP[-2].vst);
+  drop_n(3);
+  push_int_val(nhits);
+  if (field_num >= 0) fixup_fields(field_num);
+}
+
+// Current wall-clock time in milliseconds.
+static long time_ms(void)
+{
+  struct timespec ts;
+  clock_gettime(CLOCK_REALTIME, &ts);
+  return ts.tv_sec*1000+ts.tv_nsec/1000000;
+}
+
+// Indexed by opcode - tkcos; order must match the tkcos..tksqrt opcodes.
+static double (*mathfunc[])(double) = {cos, sin, exp, log, sqrt};
+// Execute a math builtin (int, atan2, rand, srand, cos..sqrt) on the stack.
+static void math_builtin(int opcode, int nargs)
+{
+  double d;
+  switch (opcode) {
+    case tkint:
+      STKP->num = trunc(val_to_num(STKP));
+      break;
+    case tkatan2:
+      d = atan2(val_to_num(STKP-1), val_to_num(STKP));
+      drop();
+      STKP->num = d;
+      break;
+    case tkrand:
+      push_int_val(0);
+      // STKP->num = rand(); // Not good in most libc implementations
+      // STKP->num = (double)jkiss32() / 4294967296.0;
+      // The above doesn't get all 53 mantissa bits in play. This does:
+      // (upper 26 bits * 2^27 + upper 27 bits) / 2^53
+      double a = (jkiss32() >> 6) * 134217728.0;
+      STKP->num = (a + (jkiss32() >> 5)) / 9007199254740992.0;
+      break;
+    case tksrand:
+      if (nargs == 1) {
+        STKP->num = seed_jkiss32((unsigned)trunc(val_to_num(STKP)));
+      } else push_int_val(seed_jkiss32((unsigned)time_ms()));
+      break;
+    default:
+      if (tkcos <= opcode && opcode <= tksqrt) {
+        STKP->num = mathfunc[opcode-tkcos](val_to_num(STKP));
+      }
+  }
+}
+
+#define CLAMP(x, lo, hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x))
+
+// Main loop of interpreter. Run this once for all BEGIN rules (which
+// have had their instructions chained in compile), all END rules (also
+// chained in compile), and once for each record of the data file(s).
+static int interpx(int start, int *status) +{ + int *ip = &ZCODE[start]; + int opcode, op2, k, r, nargs, nsubscrs, range_num, parmbase = 0; + int field_num; + double nleft, nright; + struct zvalue *v, vv; +// looptop + while ((opcode = *ip++)) { + switch (opcode) { + case opquit: + return opquit; + + case tknot: + (STKP)->num = ! get_set_logical(); + break; + + case opnotnot: + get_set_logical(); + break; + + case opnegate: + val_to_num(STKP); + STKP->num = -STKP->num; + break; + + case tkpow: // FALLTHROUGH intentional here + case tkmul: // FALLTHROUGH intentional here + case tkdiv: // FALLTHROUGH intentional here + case tkmod: // FALLTHROUGH intentional here + case tkplus: // FALLTHROUGH intentional here + case tkminus: + nleft = val_to_num(STKP-1); + nright = val_to_num(STKP); + switch (opcode) { + case tkpow: nleft = pow(nleft, nright); break; + case tkmul: nleft *= nright; break; + case tkdiv: nleft /= nright; break; + case tkmod: nleft = fmod(nleft, nright); break; + case tkplus: nleft += nright; break; + case tkminus: nleft -= nright; break; + } + drop(); + STKP->num = nleft; + break; + + // FIXME REDO REDO ? + case tkcat: + val_to_str(STKP-1); + val_to_str(STKP); + STKP[-1].vst = zstring_extend(STKP[-1].vst, STKP[0].vst); + drop(); + break; + + // Comparisons (with the '<', "<=", "!=", "==", '>', and ">=" + // operators) shall be made numerically if both operands are numeric, + // if one is numeric and the other has a string value that is a numeric + // string, or if one is numeric and the other has the uninitialized + // value. Otherwise, operands shall be converted to strings as required + // and a string comparison shall be made as follows: + // + // For the "!=" and "==" operators, the strings should be compared to + // check if they are identical but may be compared using the + // locale-specific collation sequence to check if they collate equally. 
+ // + // For the other operators, the strings shall be compared using the + // locale-specific collation sequence. + // + // The value of the comparison expression shall be 1 if the relation is + // true, or 0 if the relation is false. +#define CMPSTR(a, b) (strcmp(a.vst->str, b.vst->str)) + case tklt: // FALLTHROUGH intentional here + case tkle: // FALLTHROUGH intentional here + case tkne: // FALLTHROUGH intentional here + case tkeq: // FALLTHROUGH intentional here + case tkgt: // FALLTHROUGH intentional here + case tkge: + ; int cmp = 31416; + + if ( (IS_NUM(&STKP[-1]) && + (STKP[0].flags & (ZF_NUM | ZF_NUMSTR) || !STKP[0].flags)) || + (IS_NUM(&STKP[0]) && + (STKP[-1].flags & (ZF_NUM | ZF_NUMSTR) || !STKP[-1].flags))) { + switch (opcode) { + case tklt: + cmp = STKP[-1].num < STKP[0].num; + break; + case tkle: + cmp = STKP[-1].num <= STKP[0].num; + break; + case tkne: + cmp = STKP[-1].num != STKP[0].num; + break; + case tkeq: + cmp = STKP[-1].num == STKP[0].num; + break; + case tkgt: + cmp = STKP[-1].num > STKP[0].num; + break; + case tkge: + cmp = STKP[-1].num >= STKP[0].num; + break; + } + } else { + val_to_str(STKP-1); + val_to_str(STKP); + cmp = CMPSTR(STKP[-1], STKP[0]); + switch (opcode) { + case tklt: + cmp = cmp < 0; + break; + case tkle: + cmp = cmp <= 0; + break; + case tkne: + cmp = cmp != 0; + break; + case tkeq: + cmp = cmp == 0; + break; + case tkgt: + cmp = cmp > 0; + break; + case tkge: + cmp = cmp >= 0; + break; + } + } + drop(); + drop(); + push_int_val(cmp); + break; + + case opmatchrec: + op2 = *ip++; + int mret = match(&FIELD[0], &LITERAL[op2]); + push_int_val(!mret); + break; + + case tkmatchop: + case tknotmatch: + mret = match(STKP-1, STKP); // mret == 0 if match + drop(); + drop(); + push_int_val(!mret == (opcode == tkmatchop)); + break; + + case tkpowasgn: // FALLTHROUGH intentional here + case tkmodasgn: // FALLTHROUGH intentional here + case tkmulasgn: // FALLTHROUGH intentional here + case tkdivasgn: // FALLTHROUGH intentional here + 
case tkaddasgn: // FALLTHROUGH intentional here + case tksubasgn: + // Stack is: ... scalar_ref value_to_op_by + // or ... subscript_val map_ref value_to_op_by + // or ... fieldref value_to_op_by + v = setup_lvalue(TT.stkptr-1, parmbase, &field_num); + val_to_num(v); + val_to_num(STKP); + switch (opcode) { + case tkpowasgn: + // TODO + v->num = pow(v->num, STKP->num); + break; + case tkmodasgn: + // TODO + v->num = fmod(v->num, STKP->num); + break; + case tkmulasgn: + v->num *= STKP->num; + break; + case tkdivasgn: + v->num /= STKP->num; + break; + case tkaddasgn: + v->num += STKP->num; + break; + case tksubasgn: + v->num -= STKP->num; + break; + } + + drop_n(2); + v->flags = ZF_NUM; + push_val(v); + if (field_num >= 0) fixup_fields(field_num); + break; + + case tkasgn: + // Stack is: ... scalar_ref value_to_assign + // or ... subscript_val map_ref value_to_assign + // or ... fieldref value_to_assign + v = setup_lvalue(TT.stkptr-1, parmbase, &field_num); + force_maybemap_to_scalar(STKP); + zvalue_copy(v, STKP); + swap(); + drop(); + if (field_num >= 0) fixup_fields(field_num); + break; + + case tkincr: // FALLTHROUGH intentional here + case tkdecr: // FALLTHROUGH intentional here + case oppreincr: // FALLTHROUGH intentional here + case oppredecr: + // Stack is: ... scalar_ref + // or ... subscript_val map_ref + // or ... fieldnum fieldref + v = setup_lvalue(TT.stkptr, parmbase, &field_num); + val_to_num(v); + v->flags = ZF_NUM; + switch (opcode) { + case tkincr: case tkdecr: + // Must be done in this order because push_val(v) may move v, + // invalidating the pointer. + v->num += (opcode == tkincr) ? 1 : -1; + push_val(v); + // Now reverse the incr/decr on the top TT.stack val. + STKP->num -= (opcode == tkincr) ? 1 : -1; + break; + case oppreincr: case oppredecr: + v->num += (opcode == oppreincr) ? 
1 : -1; + push_val(v); + break; + } + swap(); + drop(); + if (field_num >= 0) fixup_fields(field_num); + break; + + case tknumber: // FALLTHROUGH intentional here + case tkstring: // FALLTHROUGH intentional here + case tkregex: + push_val(&LITERAL[*ip++]); + break; + + case tkprint: + case tkprintf: + nargs = *ip++; + int outmode = *ip++; + struct zfile *outfp = TT.zstdout; + switch (outmode) { + case tkgt: outfp = setup_file("f", "w"); break; + case tkappend: outfp = setup_file("f", "a"); break; + case tkpipe: outfp = setup_file("p", "w"); break; + default: nargs++; break; + } + nargs--; + if (opcode == tkprintf) { + varprint(fprintf, outfp->fp, nargs); + drop_n(nargs); + break; + } + if (!nargs) { + val_to_str(&FIELD[0]); + fprintf(outfp->fp, "%s", FIELD[0].vst->str); + } else { + struct zvalue tempv = uninit_zvalue; + zvalue_copy(&tempv, &STACK[OFS]); + val_to_str(&tempv); + for (int k = 0; k < nargs; k++) { + if (k) fprintf(outfp->fp, "%s", tempv.vst->str); + int sp = TT.stkptr - nargs + 1 + k; + ////// FIXME refcnt -- prob. don't need to copy from TT.stack? + v = &STACK[sp]; + val_to_str_fmt(v, val_to_str(&STACK[OFMT])->vst->str); + struct zstring *zs = v->vst; + fprintf(outfp->fp, "%s", zs ? 
zs->str : ""); + } + zvalue_release_zstring(&tempv); + drop_n(nargs); + } + fputs(ENSURE_STR(&STACK[ORS])->vst->str, outfp->fp); + break; + + case opdrop: + drop(); + break; + + case opdrop_n: + drop_n(*ip++); + break; + + // Stack frame layout relative to parmbase: +#define RETURN_VALUE -4 +#define RETURN_ADDR -3 +#define PREV_PARMBASE -2 +#define ARG_CNT -1 +#define FUNCTION_NUM 0 + // Actual args follow, starting at parmbase + 1 + case tkfunction: // function definition + op2 = *ip++; // func table num + struct functab_slot *pfdef = &FUNC_DEF[op2]; + struct zlist *loctab = &pfdef->function_locals; + int nparms = zlist_len(loctab)-1; + + nargs = popnumval(); + int newparmbase = TT.stkptr - nargs; + STACK[newparmbase + PREV_PARMBASE].num = parmbase; + parmbase = newparmbase; + for ( ;nargs > nparms; nargs--) + drop(); + for ( ;nargs < nparms; nargs++) { + // Push additional "args" that were not passed by the caller, to + // match the formal parameters (parms) defined in the function + // definition. In the local var table we may have the type as scalar + // or map if it is used as such within the function. In that case we + // init the pushed arg from the type of the locals table. + // But if a var appears only as a bare arg in a function call it will + // not be typed in the locals table. In that case we can only say it + // "may be" a map, but we have to assume the possibility and attach a + // map to the var. When/if the var is used as a map or scalar in the + // called function it will be converted to a map or scalar as + // required. + // See force_maybemap_to_scalar(). 
+ struct symtab_slot *q = &((struct symtab_slot *)loctab->base)[nargs+1]; + vv = (struct zvalue)ZVINIT(q->flags, 0, 0); + if (vv.flags == 0) { + zvalue_map_init(&vv); + vv.flags = ZF_MAYBEMAP; + } else if (IS_MAP(&vv)) { + zvalue_map_init(&vv); + } else { + vv.flags = 0; + } + push_val(&vv); + } + break; + + case tkreturn: + nparms = *ip++; + nargs = STACK[parmbase+ARG_CNT].num; + force_maybemap_to_scalar(STKP); // Unneeded? + zvalue_copy(&STACK[parmbase+RETURN_VALUE], STKP); + drop(); + // Remove the local args (not supplied by caller) from TT.stack, check to + // release any map data created. + while (TT.stkptr > parmbase + nargs) { + if ((STKP)->flags & ZF_ANYMAP) { + zmap_delete_map_incl_slotdata((STKP)->map); + xfree((STKP)->map); + } + drop(); + } + while (TT.stkptr > parmbase + RETURN_VALUE) + drop(); + ip = &ZCODE[(int)STACK[parmbase+RETURN_ADDR].num]; + parmbase = STACK[parmbase+PREV_PARMBASE].num; + break; + + case opprepcall: // function call prep + push_int_val(0); // return value placeholder + push_int_val(0); // return addr + push_int_val(0); // parmbase + push_int_val(0); // arg count + push_int_val(*ip++); // function tbl ref + break; + + case tkfunc: // function call + nargs = *ip++; + newparmbase = TT.stkptr - nargs; + STACK[newparmbase+RETURN_ADDR].num = ip - &ZCODE[0]; + STACK[newparmbase+ARG_CNT].num = nargs; + push_int_val(nargs); // FIXME TODO pass this in a zregister? 
+ ip = &ZCODE[FUNC_DEF[(int)STACK[newparmbase+FUNCTION_NUM].num].zcode_addr]; + break; + + case tkrbracket: // concat multiple map subscripts + nsubscrs = *ip++; + while (--nsubscrs) { + swap(); + val_to_str(STKP); + push_val(&STACK[SUBSEP]); + val_to_str(STKP); + STKP[-1].vst = zstring_extend(STKP[-1].vst, STKP->vst); + drop(); + swap(); + val_to_str(STKP); + STKP[-1].vst = zstring_extend(STKP[-1].vst, STKP->vst); + drop(); + } + break; + + case opmapdelete: + case tkdelete: + k = STKP->num; + if (k < 0) k = parmbase - k; // loc of var on TT.stack + v = &STACK[k]; + force_maybemap_to_map(v); + if (opcode == opmapdelete) { + zmap_delete_map(v->map); + } else { + drop(); + val_to_str(STKP); + zmap_delete(v->map, STKP->vst); + } + drop(); + break; + + case opmap: + op2 = *ip++; + k = op2 < 0 ? parmbase - op2 : op2; + v = &STACK[k]; + force_maybemap_to_map(v); + if (!IS_MAP(v)) FATAL("scalar in array context"); + v = get_map_val(v, STKP); + drop(); // drop subscript + push_val(v); + break; + + case tkin: + val_to_str(STKP-1); + if (!(STKP->flags & ZF_ANYMAP)) FATAL("scalar in array context"); + v = zmap_find(STKP->map, STKP[-1].vst); + drop(); + drop(); + push_int_val(v ? 1 : 0); + break; + + case opmapiternext: + op2 = *ip++; + v = STKP-1; + force_maybemap_to_map(v); + if (!IS_MAP(v)) FATAL("scalar in array context"); + struct zmap *m = v->map; // Need for MAPSLOT macro + int zlen = zlist_len(&m->slot); + int kk = STKP->num + 1; + while (kk < zlen && !(MAPSLOT[kk].key)) // skip deleted slots + kk++; + STKP->num = kk; // save index for next iteration + if (kk < zlen) { + struct zvalue *var = setup_lvalue(TT.stkptr-2, parmbase, &field_num); + var->flags = ZF_STR; + zstring_release(&var->vst); + var->vst = MAPSLOT[kk].key; + zstring_incr_refcnt(var->vst); + ip += op2; + } + break; + + case tkvar: + op2 = *ip++; + k = op2 < 0 ? 
parmbase - op2 : op2; + v = &STACK[k]; + push_val(v); + break; + + case tkfield: + // tkfield op has "dummy" 2nd word so that convert_push_to_reference(void) + // can find either tkfield or tkvar at same place (ZCODE[TT.zcode_last-1]). + ip++; // skip dummy "operand" instruction field + val_to_num(STKP); + push_field((int)((STKP)->num)); + swap(); + drop(); + break; + + case oppush: + push_int_val(*ip++); + break; + + case tkand: + op2 = *ip++; + if (get_set_logical()) drop(); + else ip += op2; + break; + + case tkor: + op2 = *ip++; + if (!get_set_logical()) drop(); + else ip += op2; + break; + + case tkwhile: + (STKP)->num = ! get_set_logical(); + ATTR_FALLTHROUGH_INTENDED; + // FALLTHROUGH to tkternif + case tkif: + // FALLTHROUGH to tkternif + case tkternif: + op2 = *ip++; + int t = get_set_logical(); // FIXME only need to get, not set + drop(); + if (!t) ip += op2; + break; + + case tkelse: // FALLTHROUGH intentional here + case tkternelse: // FALLTHROUGH intentional here + case tkbreak: // FALLTHROUGH intentional here + case tkcontinue: // FALLTHROUGH intentional here + case opjump: + op2 = *ip++; + ip += op2; + break; + + case opvarref: + op2 = *ip++; + vv = (struct zvalue)ZVINIT(ZF_REF, op2, 0); + push_val(&vv); + break; + + case opmapref: + op2 = *ip++; + vv = (struct zvalue)ZVINIT(ZF_MAPREF, op2, 0); + push_val(&vv); + break; + + case opfldref: + val_to_num(STKP); + (STKP)->flags |= ZF_FIELDREF; + ip++; // skip dummy "operand" instruction field + break; + + case opprintrec: + val_to_str(&FIELD[0]); + puts(FIELD[0].vst->str); + break; + + case oprange1: + range_num = *ip++; + op2 = *ip++; + if (TT.range_sw[range_num]) ip += op2; + break; + + case oprange2: + range_num = *ip++; + op2 = *ip++; + t = get_set_logical(); // FIXME only need to get, not set + drop(); + if (t) TT.range_sw[range_num] = 1; + else ip += op2; + break; + + case oprange3: + range_num = *ip++; + t = get_set_logical(); // FIXME only need to get, not set + drop(); + if (t) 
TT.range_sw[range_num] = 0; + break; + + case tkexit: + r = popnumval(); + if (r != NO_EXIT_STATUS) *status = (int)r & 255; + // TODO FIXME do we need NO_EXIT_STATUS at all? Just use 0? + return tkexit; + + case tknext: + return tknext; + + case tknextfile: + return tknextfile; + + case tkgetline: + nargs = *ip++; + int source = *ip++; + // TT.stack is: + // if tkgetline 0 tkeof: (nothing stacked; plain getline) + // if tkgetline 1 tkeof: (lvalue) + // if tkgetline 1 tklt: (filename_string) + // if tkgetline 2 tklt: (lvalue) (filename_string) + // if tkgetline 1 tkpipe: (pipe_command_string) + // if tkgetline 2 tkpipe: (pipe_command_string) (lvalue) + // effect is to set: + // if tkgetline 0 tkeof: $0 NF NR FNR + // if tkgetline 1 tkeof: var NR FNR + // if tkgetline 1 tklt: $0 NF + // if tkgetline 2 tklt: var + // if tkgetline 1 tkpipe: $0 NF + // if tkgetline 2 tkpipe: var + // Ensure pipe cmd on top + if (nargs == 2 && source == tkpipe) swap(); + struct zfile *zfp = 0; + if (source == tklt || source == tkpipe) { + zfp = setup_file(source == tklt ? "f" : "p", "r"); + nargs--; + } + // now cases are: + // nargs source TT.stack + // 0 tkeof: (nothing; plain getline) from current data file + // 1 tkeof: (lvalue) from current data file + // 0 tklt: (nothing) from named file in 'stream' + // 1 tklt: (lvalue) from named file in 'stream' + // 0 tkpipe: (nothing) from piped command in 'stream' + // 1 tkpipe: (lvalue) from piped command in 'stream' + v = nargs ? 
setup_lvalue(TT.stkptr, parmbase, &field_num) : 0; + if (v) drop(); + // source is tkeof (no pipe/file), tklt (file), or tkpipe (pipe) + // stream is name of file or pipe + // v is NULL or an lvalue ref + if (zfp != badfile) push_int_val(awk_getline(source, zfp, v)); + else push_int_val(-1); + + // fake return value for now + break; + + ////// builtin functions /////// + + case tksplit: + nargs = *ip++; + if (nargs == 2) push_val(&STACK[FS]); + struct zstring *s = val_to_str(STKP-2)->vst; + force_maybemap_to_map(STKP-1); + struct zvalue *a = STKP-1; + struct zvalue *fs = STKP; + zmap_delete_map(a->map); + k = split(s, a, fs); + drop_n(3); + push_int_val(k); + break; + + case tkmatch: + nargs = *ip++; + val_to_str(STKP-1); + if (!(IS_RX(STKP))) val_to_str(STKP); + regex_t rx_pat, *rxp = &rx_pat; + rx_zvalue_compile(&rxp, STKP); + regoff_t rso, reo; + k = rx_find(rxp, STKP[-1].vst->str, &rso, &reo, 0); + rx_zvalue_free(rxp, STKP); + // Force these to num before setting. + val_to_num(&STACK[RSTART]); + val_to_num(&STACK[RLENGTH]); + if (k) STACK[RSTART].num = 0, STACK[RLENGTH].num = -1; + else STACK[RSTART].num = rso + 1, STACK[RLENGTH].num = reo - rso; + drop(); + drop(); + push_int_val(k ? 
0 : rso + 1); + break; + + case tksub: + case tkgsub: + gsub(opcode, *ip++, parmbase); // tksub/tkgsub, args + break; + + case tksubstr: + nargs = *ip++; + struct zstring *zz = val_to_str(&STACK[TT.stkptr-nargs+1])->vst; + // Offset of start of string; convert 1-based to 0-based + ssize_t mm = CLAMP(trunc(val_to_num(&STACK[TT.stkptr-nargs+2]))-1, 0, zz->size); + ssize_t nn = zz->size - mm; // max possible substring length + if (nargs == 3) nn = CLAMP(trunc(val_to_num(STKP)), 0, nn); + struct zstring *zzz = new_zstring(zz->str + mm, nn); + zstring_release(&STACK[TT.stkptr-nargs+1].vst); + STACK[TT.stkptr-nargs+1].vst = zzz; + drop_n(nargs - 1); + break; + + case tkindex: + nargs = *ip++; + char *s1 = val_to_str(STKP-1)->vst->str; + char *s3 = strstr(s1, val_to_str(STKP)->vst->str); + ptrdiff_t offs = s3 ? s3 - s1 + 1 : 0; + drop(); + drop(); + push_int_val(offs); + break; + + case tktolower: + case tktoupper: + nargs = *ip++; + int (*f)(int) = opcode == tktolower ? (tolower) : (toupper); + val_to_str(STKP); + // Need to dup the string to not modify original. + zvalue_dup_zstring(STKP); + struct zstring *z = STKP->vst; + char *p = z->str, *e = z->str + z->size; + for (; p < e; p++) *p = f(*p); + break; + + case tklength: + nargs = *ip++; + v = nargs ? STKP : &FIELD[0]; + force_maybemap_to_map(v); + if (IS_MAP(v)) k = v->map->count - v->map->deleted; + else k = val_to_str(v)->vst->size; + if (nargs) drop(); + push_int_val(k); + break; + + case tksystem: + nargs = *ip++; + fflush(stdout); + fflush(stderr); + r = system(val_to_str(STKP)->vst->str); +#ifdef WEXITSTATUS + // WEXITSTATUS is in sys/wait.h, but I'm not including that. + // It seems to also be in stdlib.h in gcc and musl-gcc. + // No idea how portable this is! + r = r >= 256 ? 
WEXITSTATUS(r) : r + 256; +#endif + drop(); + push_int_val(r); + break; + + case tkfflush: + nargs = *ip++; + r = fflush_file(nargs); + if (nargs) drop(); + push_int_val(r); + break; + + case tkclose: + nargs = *ip++; + val_to_str(STKP); // filename at top of TT.stack + r = close_file(STKP->vst->str); + drop(); + push_int_val(r); + break; + + case tksprintf: + nargs = *ip++; + zstring_release(&TT.rgl.zspr); + TT.rgl.zspr = new_zstring("", 0); + varprint(fsprintf, 0, nargs); + drop_n(nargs); + vv = (struct zvalue)ZVINIT(ZF_STR, 0, TT.rgl.zspr); + push_val(&vv); + break; + + default: + if (tkatan2 <= opcode && opcode <= tksrand) { + math_builtin(opcode, *ip++); // 2nd arg is number of args in call + break; + } + // This should never happen: + error_exit("!!! Unimplemented opcode %d\n", opcode); + } + } + return opquit; +} + +// interp() wraps the main interpreter loop interpx(). The main purpose +// is to allow the TT.stack to be readjusted after an 'exit' from a function. +// Also catches errors, as the normal operation should leave the TT.stack +// depth unchanged after each run through the rules. +static int interp(int start, int *status) +{ + int stkptrbefore = TT.stkptr; + int r = interpx(start, status); + // If exit from function, TT.stack will be loaded with args etc. Clean it. 
+ if (r == tkexit) { + TT.stack.avail -= (TT.stkptr - stkptrbefore) * TT.stack.size; + TT.stkptr = stkptrbefore; + } + if (TT.stkptr - stkptrbefore) + error_exit("!!AWK BUG stack pointer offset: %d\n", TT.stkptr - stkptrbefore); + return r; +} + +static void insert_argv_map(struct zvalue *map, int key, char *value) +{ + struct zvalue zkey = ZVINIT(ZF_STR, 0, num_to_zstring(key, ENSURE_STR(&STACK[CONVFMT])->vst->str)); + struct zvalue *v = get_map_val(map, &zkey); + zvalue_release_zstring(&zkey); + zvalue_release_zstring(v); + *v = new_str_val(value); + check_numeric_string(v); +} + +static void init_globals(int optind, int argc, char **argv, char *sepstring, + struct arg_list *assign_args, char **envp) +{ + // Global variables reside at the bottom of the TT.stack. Start with the awk + // "special variables": ARGC, ARGV, CONVFMT, ENVIRON, FILENAME, FNR, FS, NF, + // NR, OFMT, OFS, ORS, RLENGTH, RS, RSTART, SUBSEP + + STACK[CONVFMT] = new_str_val("%.6g"); + // Init ENVIRON map. + struct zvalue m = ZVINIT(ZF_MAP, 0, 0); + zvalue_map_init(&m); + STACK[ENVIRON] = m; + for (char **pkey = envp; *pkey; pkey++) { + char *pval = strchr(*pkey, '='); + if (!pval) continue; + *pval++ = 0; + struct zvalue zkey = ZVINIT(ZF_STR, 0, new_zstring(*pkey, strlen(*pkey))); + struct zvalue *v = get_map_val(&m, &zkey); + zstring_release(&zkey.vst); + if (v->vst) FFATAL("env var dup? (%s)", pkey); + *v = new_str_val(pval); // FIXME refcnt + check_numeric_string(v); + } + + // Init ARGV map. + m = (struct zvalue)ZVINIT(ZF_MAP, 0, 0); + zvalue_map_init(&m); + STACK[ARGV] = m; + insert_argv_map(&m, 0, TT.progname); + int nargc = 1; + for (int k = optind; k < argc; k++) { + insert_argv_map(&m, nargc, argv[k]); + nargc++; + } + + // Init rest of the awk special variables. 
+ STACK[ARGC] = (struct zvalue)ZVINIT(ZF_NUM, nargc, 0); + STACK[FILENAME] = new_str_val(""); + STACK[FNR] = (struct zvalue)ZVINIT(ZF_NUM, 0, 0); + STACK[FS] = new_str_val(sepstring); + STACK[NF] = (struct zvalue)ZVINIT(ZF_NUM, 0, 0); + STACK[NR] = (struct zvalue)ZVINIT(ZF_NUM, 0, 0); + STACK[OFMT] = new_str_val("%.6g"); + STACK[OFS] = new_str_val(" "); + STACK[ORS] = new_str_val("\n"); + STACK[RLENGTH] = (struct zvalue)ZVINIT(ZF_NUM, 0, 0); + STACK[RS] = new_str_val("\n"); + STACK[RSTART] = (struct zvalue)ZVINIT(ZF_NUM, 0, 0); + STACK[SUBSEP] = new_str_val("\034"); + + // Init program globals. + // + // Push global variables on the TT.stack at offsets matching their index in the + // global var table. In the global var table we may have the type as scalar + // or map if it is used as such in the program. In that case we init the + // pushed arg from the type of the globals table. + // But if a global var appears only as a bare arg in a function call it will + // not be typed in the globals table. In that case we can only say it "may be" + // a map, but we have to assume the possibility and attach a map to the + // var. When/if the var is used as a map or scalar in the called function it + // will be converted to a map or scalar as required. + // See force_maybemap_to_scalar(), and the similar comment in + // 'case tkfunction:' above. + // + int gstx, len = zlist_len(&TT.globals_table); + for (gstx = TT.spec_var_limit; gstx < len; gstx++) { + struct symtab_slot gs = GLOBAL[gstx]; + struct zvalue v = ZVINIT(gs.flags, 0, 0); + if (v.flags == 0) { + zvalue_map_init(&v); + v.flags = ZF_MAYBEMAP; + } else if (IS_MAP(&v)) { + zvalue_map_init(&v); + } else { + // Set SCALAR flag 0 to create "uninitialized" scalar. + v.flags = 0; + } + push_val(&v); + } + + // Init -v assignment options. 
+ for (struct arg_list *p = assign_args; p; p = p->next) { + char *asgn = p->arg; + char *val = strchr(asgn, '='); + if (!val) error_exit("bad -v assignment format\n"); + *val++ = 0; + assign_global(asgn, val); + } + + TT.rgl.cur_arg = new_str_val(""); + uninit_string_zvalue = new_str_val(""); + zvalue_copy(&FIELD[0], &uninit_string_zvalue); +} + +static void run_files(int *status) +{ + int r = 0; + while (r != tkexit && *status < 0 && getrec_f0() >= 0) + if ((r = interp(TT.cgl.first_recrule, status)) == tknextfile) next_fp(); +} + +static void free_literal_regex(void) +{ + int len = zlist_len(&TT.literals); + for (int k = 1; k < len; k++) + if (IS_RX(&LITERAL[k])) regfree(LITERAL[k].rx); +} + +static void run(int optind, int argc, char **argv, char *sepstring, + struct arg_list *assign_args, char **envp) +{ + char *printf_fmt_rx = "%[-+ #0]*([*]|[0-9]*)([.]([*]|[0-9]*))?[aAdiouxXfFeEgGcs%]"; + init_globals(optind, argc, argv, sepstring, assign_args, envp); + TT.cfile = xzalloc(sizeof(struct zfile)); + rx_compile_or_die(&TT.rx_default, "[ \t\n]+"); + rx_compile_or_die(&TT.rx_last, "[ \t\n]+"); + rx_compile_or_die(&TT.rx_printf_fmt, printf_fmt_rx); + new_file("-", stdin, 'r', 'f')->is_std_file = 1; + new_file("/dev/stdin", stdin, 'r', 'f')->is_std_file = 1; + new_file("/dev/stdout", stdout, 'w', 'f')->is_std_file = 1; + TT.zstdout = TT.zfiles; + new_file("/dev/stderr", stderr, 'w', 'f')->is_std_file = 1; + seed_jkiss32(123); + int status = -1, r = 0; + if (TT.cgl.first_begin) r = interp(TT.cgl.first_begin, &status); + if (r != tkexit) + if (TT.cgl.first_recrule) run_files(&status); + if (TT.cgl.first_end) r = interp(TT.cgl.first_end, &status); + regfree(&TT.rx_printf_fmt); + regfree(&TT.rx_default); + regfree(&TT.rx_last); + free_literal_regex(); + close_file(0); // close all files + if (status >= 0) exit(status); +} + +//////////////////// +//// main +//////////////////// + +static void progfiles_init(char *progstring, struct arg_list *prog_args) +{ + + TT.scs->p = 
progstring ? progstring : " " + 2; + TT.scs->progstring = progstring; + TT.scs->prog_args = prog_args; + TT.scs->filename = "(cmdline)"; + TT.scs->line = 0; // for getline() + TT.scs->line_size = 0; // for getline() + TT.scs->line_num = 0; // Not needed... + TT.scs->fp = 0; // For get_char() initial state. + + TT.scs->tok = 0; + TT.scs->tokbuiltin = 0; + TT.scs->toktype = 0; + TT.scs->maxtok = 256; + TT.scs->tokstr = xzalloc(TT.scs->maxtok); + TT.scs->toklen = 0; // Needed? + TT.scs->ch = 0; // Needed? + + TT.scs->numval = 0; + TT.scs->error = 0; +} + +static int awk(char *sepstring, char *progstring, struct arg_list *prog_args, + struct arg_list *assign_args, int optind, int argc, char **argv, + int opt_run_prog, char **envp, int opt_test_scanner, int opt_dump_symbols) +{ +(void)opt_test_scanner, (void)opt_dump_symbols; + struct scanner_state ss = {0}; + TT.scs = &ss; + + progfiles_init(progstring, prog_args); + compile(); + + if (TT.cgl.compile_error_count) + error_exit("%d syntax error(s)", TT.cgl.compile_error_count); + else { + if (opt_run_prog) + run(optind, argc, argv, sepstring, assign_args, envp); + } + + return TT.cgl.compile_error_count; +} + +void awk_main(void) +{ + char *sepstring = TT.F ? escape_str(TT.F) : " "; + int optind = 0; + char *progstring = NULL; + + TT.pbuf = toybuf; + toys.exitval = 73; + if (!TT.f) { + if (*toys.optargs) progstring = toys.optargs[optind++]; + else error_exit("No program string\n"); + } + TT.progname = toys.which->name; + toys.exitval = awk(sepstring, progstring, TT.f, TT.v, + optind, toys.optc, toys.optargs, !FLAG(c), environ, 0, 0); +} -- 2.39.2