From 2e952a47820d3e03dd7b56d50954c92cc0caa20b Mon Sep 17 00:00:00 2001 From: Rob Landley Date: Thu, 26 Jan 2023 23:58:30 -0600 Subject: [PATCH] Implement tar --wildcards and friends (filter options). Also some pending portability.h comments, and wait for archiver exit. --- lib/portability.h | 50 ++++++++++++++++++------------ tests/tar.test | 51 +++++++++++++++++++++++------- toys/posix/tar.c | 79 ++++++++++++++++++++++++++++++++++++++--------- 3 files changed, 133 insertions(+), 47 deletions(-) diff --git a/lib/portability.h b/lib/portability.h index ce705dcd..f35ce77f 100644 --- a/lib/portability.h +++ b/lib/portability.h @@ -8,13 +8,6 @@ // This must come before we #include any system header file to take effect! #define _FILE_OFFSET_BITS 64 -// For musl -#define _ALL_SOURCE -#include -#ifndef REG_STARTEND -#define REG_STARTEND 0 -#endif - #ifdef __APPLE__ // macOS 10.13 doesn't have the POSIX 2008 direct access to timespec in // struct stat, but we can ask it to give us something equivalent... @@ -26,6 +19,27 @@ #define st_mtim st_mtimespec #endif +// For musl +#define _ALL_SOURCE +#include +#ifndef REG_STARTEND +#define REG_STARTEND 0 +#endif + +// for some reason gnu/libc only sets these if you #define ia_ia_stallman_ftaghn +// despite FreeBSD and MacOS having both with the same value, and bionic's +// "upstream-openbsd" directory documenting them as "BSD extensions". +// (The flexible extension would have been an fnmatch() that returns length +// matched at location so we could check trailing data ourselves, but no. +// And it's ANSI only case matching instead of UTF8...) +#include +#ifndef FNM_LEADING_DIR +#define FNM_LEADING_DIR 8 +#endif +#ifndef FNM_CASEFOLD +#define FNM_CASEFOLD 16 +#endif + // Test for gcc (using compiler builtin #define) #ifdef __GNUC__ @@ -39,17 +53,13 @@ #define printf_format #endif -// This isn't in the spec, but it's how we determine what libc we're using. - -// Types various replacement prototypes need. 
-// This also lets us determine what libc we're using. Systems that -// have will transitively include it, and ones that don't -- -// macOS -- won't break. +// This lets us determine what libc we're using: systems that have +// will transitively include it, and ones that don't (macOS) won't break. #include // Various constants old build environments might not have even if kernel does -#ifndef AT_FDCWD +#ifndef AT_FDCWD // Kernel commit 5590ff0d5528 2006 #define AT_FDCWD -100 #endif @@ -61,11 +71,11 @@ #define AT_REMOVEDIR 0x200 #endif -#ifndef RLIMIT_RTTIME +#ifndef RLIMIT_RTTIME // Commit 78f2c7db6068f 2008 #define RLIMIT_RTTIME 15 #endif -// Introduced in Linux 3.1 +// Introduced in Linux 3.1 (Commit 982d816581eee 2011) #ifndef SEEK_DATA #define SEEK_DATA 3 #endif @@ -84,7 +94,7 @@ // claim it's in the name of Gnu. #if defined(__GLIBC__) -// "Function prototypes shall be provided." but aren't. +// Glibc violates posix: "Function prototypes shall be provided." but aren't. // http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/unistd.h.html char *crypt(const char *key, const char *salt); @@ -97,14 +107,14 @@ int wcwidth(wchar_t wc); #include char *strptime(const char *buf, const char *format, struct tm *tm); -// They didn't like posix basename so they defined another function with the +// Gnu didn't like posix basename so they defined another function with the // same name and if you include libgen.h it #defines basename to something // else (where they implemented the real basename), and that define breaks // the table entry for the basename command. They didn't make a new function // with a different name for their new behavior because gnu. // -// Solution: don't use their broken header, provide an inline to redirect the -// correct name to the broken name. +// Solution: don't use their broken header and provide an inline to redirect +// the standard name to the renamed function with the standard behavior. 
char *dirname(char *path); char *__xpg_basename(char *path); diff --git a/tests/tar.test b/tests/tar.test index d7b738c4..5c7e0bbd 100755 --- a/tests/tar.test +++ b/tests/tar.test @@ -322,30 +322,30 @@ mkdir uno ln -s tres uno/dos touch uno/tres ln uno/tres uno/quatro -tt() { $TAR --no-recursion uno uno/{dos,tres,quatro} "$@" | \ - LST --show-transformed-names $XX | sed 's/^.* 23:31 //'; } +LL() { LST --show-transformed-names $XX | sed 's/^.* 23:31 //'; } +TT() { $TAR --no-recursion uno uno/{dos,tres,quatro} "$@" | LL; } testing 'xform S' \ - "tt --xform 's/uno/one/S;s/dos/two/S;s/tres/three/S;s/quatro/four/S'" \ + "TT --xform 's/uno/one/S;s/dos/two/S;s/tres/three/S;s/quatro/four/S'" \ "one/\none/two -> tres\none/three\none/four link to one/three\n" "" "" testing 'xform flags=rh starts with all disabled' \ - "tt --xform 's/uno/one/;flags=rh;s/dos/two/;s/tres/three/;s/quatro/four/'" \ + "TT --xform 's/uno/one/;flags=rh;s/dos/two/;s/tres/three/;s/quatro/four/'" \ "one/\none/two -> tres\none/three\none/four link to one/three\n" "" "" testing 'xform flags=rHhsS toggles' \ - "tt --xform 's/uno/one/;flags=rHhsS;s/dos/two/;s/tres/three/;s/quatro/four/'"\ + "TT --xform 's/uno/one/;flags=rHhsS;s/dos/two/;s/tres/three/;s/quatro/four/'"\ "one/\none/two -> tres\none/three\none/four link to one/three\n" "" "" testing 'xform flags= is not a delta from previous' \ - "tt --xform 'flags=s;flags=rh;s/uno/one/;s/dos/two/;s/tres/three/;s/quatro/four/'" \ + "TT --xform 'flags=s;flags=rh;s/uno/one/;s/dos/two/;s/tres/three/;s/quatro/four/'" \ "one/\none/two -> tres\none/three\none/four link to one/three\n" "" "" testing 'xform H' \ - "tt --xform 'flags=rsH;s/uno/one/;s/dos/two/;s/tres/three/;s/quatro/four/'" \ + "TT --xform 'flags=rsH;s/uno/one/;s/dos/two/;s/tres/three/;s/quatro/four/'" \ "one/\none/two -> three\none/three\none/four link to uno/tres\n" "" "" testing 'xform R' \ - "tt --xform 'flags=rshR;s/uno/one/;s/dos/two/;s/tres/three/;s/quatro/four/'" \ + "TT --xform 
'flags=rshR;s/uno/one/;s/dos/two/;s/tres/three/;s/quatro/four/'" \ "uno/\nuno/dos -> three\nuno/tres\nuno/quatro link to one/three\n" "" "" testing "xform path" "$TAR one --xform=s@three/four/@zero@ | tar t | grep six" \ @@ -356,19 +356,19 @@ testing "xform trailing slash special case" \ # The quoting works because default IFS splits on whitepace not ; testing "xform extract all" \ - "XX='--xform s/uno/one/;s/dos/two/;s/tres/three/;s/quatro/four/' tt" \ + "XX='--xform s/uno/one/;s/dos/two/;s/tres/three/;s/quatro/four/' TT" \ 'one/\none/two -> three\none/three\none/four link to one/three\n' '' '' testing 'xform extract S' \ - "XX='--xform s/uno/one/S;s/dos/two/S;s/tres/three/S;s/quatro/four/S' tt" \ + "XX='--xform s/uno/one/S;s/dos/two/S;s/tres/three/S;s/quatro/four/S' TT" \ "one/\none/two -> tres\none/three\none/four link to one/three\n" "" "" testing 'xform extract H' \ - "XX='--xform flags=rs;s/uno/one/;s/dos/two/;s/tres/three/;s/quatro/four/' tt"\ + "XX='--xform flags=rs;s/uno/one/;s/dos/two/;s/tres/three/;s/quatro/four/' TT"\ "one/\none/two -> three\none/three\none/four link to uno/tres\n" "" "" testing 'xform extract R' \ - "XX='--xform flags=sh;s/uno/one/;s/dos/two/;s/tres/three/;s/quatro/four/' tt"\ + "XX='--xform flags=sh;s/uno/one/;s/dos/two/;s/tres/three/;s/quatro/four/' TT"\ "uno/\nuno/dos -> three\nuno/tres\nuno/quatro link to one/three\n" "" "" rm -rf uno @@ -381,6 +381,33 @@ testing '-P' "$TAR -P --no-recursion -C / /// .. | SUM 3" \ testing 'without -P' "$TAR --no-recursion -C / /// .. 2>/dev/null | SUM 3" \ "077d03243e247b074806904885e6da272fd5857a\n" "" "" +# Wildcards: --exclude, include (create/extract * cmdline/recursive) +# --anchored, --wildcards, --wildcards-match-slash + +#pattern a.c +# abcd dabc a/c da/c +# top/* + +mkdir sub && cd sub && mkdir -p a da top/a top/da && +touch abcd dabc a/c da/c top/abcd top/dabc top/a/c top/da/c && +$TAR -f ../sub.tar abcd dabc a da top && cd .. 
|| exit 1 + +# TODO I have not made wildcard state changes positional. + +testing 'wildcards do not affect creation cmdline args' \ + '$TAR -C sub --wildcards a.cd abcd dabc a da top 2>/dev/null | cmp - sub.tar' \ + '' '' '' + +testing 'creation --exclude --no-wildcards'\ + '$TAR -C sub --no-wildcards --exclude=d?bc abcd dabc | LL' \ + 'abcd\ndabc\n' '' '' + + +testing 'creation --wildcards --exclude'\ + '$TAR -C sub --wildcards --exclude=d?bc abcd dabc | LL' \ + 'abcd\n' '' '' + + if false then # Sequencing issues that leak implementation details out the interface diff --git a/toys/posix/tar.c b/toys/posix/tar.c index 8a8601d1..a9eefa8c 100644 --- a/toys/posix/tar.c +++ b/toys/posix/tar.c @@ -14,12 +14,13 @@ * * Toybox will never implement the "pax" command as a matter of policy. * + * TODO: --wildcard state changes aren't positional. + * We always --verbatim-files-from * Why --exclude pattern but no --include? tar cvzf a.tgz dir --include '*.txt' - * * No --no-null because the args infrastructure isn't ready. 
- * + * Until args.c learns about no- toggles, --no-thingy always wins over --thingy -USE_TAR(NEWTOY(tar, "&(show-transformed-names)(selinux)(restrict)(full-time)(no-recursion)(null)(numeric-owner)(no-same-permissions)(overwrite)(exclude)*(mode):(mtime):(group):(owner):(to-command):~(strip-components)(strip)#~(transform)(xform)*o(no-same-owner)p(same-permissions)k(keep-old)c(create)|h(dereference)x(extract)|t(list)|v(verbose)J(xz)j(bzip2)z(gzip)S(sparse)O(to-stdout)P(absolute-names)m(touch)X(exclude-from)*T(files-from)*I(use-compress-program):C(directory):f(file):a[!txc][!jzJa]", TOYFLAG_USR|TOYFLAG_BIN)) +USE_TAR(NEWTOY(tar, "&(no-ignore-case)(ignore-case)(no-anchored)(anchored)(no-wildcards)(wildcards)(no-wildcards-match-slash)(wildcards-match-slash)(show-transformed-names)(selinux)(restrict)(full-time)(no-recursion)(null)(numeric-owner)(no-same-permissions)(overwrite)(exclude)*(mode):(mtime):(group):(owner):(to-command):~(strip-components)(strip)#~(transform)(xform)*o(no-same-owner)p(same-permissions)k(keep-old)c(create)|h(dereference)x(extract)|t(list)|v(verbose)J(xz)j(bzip2)z(gzip)S(sparse)O(to-stdout)P(absolute-names)m(touch)X(exclude-from)*T(files-from)*I(use-compress-program):C(directory):f(file):a[!txc][!jzJa]", TOYFLAG_USR|TOYFLAG_BIN)) config TAR bool "tar" @@ -46,6 +47,13 @@ config TAR --strip-components NUM Ignore first NUM directory components when extracting --xform=SED Modify filenames via SED expression (ala s/find/replace/g) -I PROG Filter through PROG to compress or PROG -d to decompress + + Filename filter types. 
Create command line args aren't filtered, extract + defaults to --anchored, --exclude defaults to --wildcards-match-slash, + use no- prefix to disable: + + --anchored Match name not path --ignore-case Case insensitive + --wildcards Expand *?[] like shell --wildcards-match-slash */ #define FOR_tar @@ -163,14 +171,53 @@ static void maybe_prefix_block(char *data, int check, int type) if (len>check) write_prefix_block(data, len+1, type); } +static int do_filter(char *pattern, char *name, long long flags) +{ + int ign = !!(flags&FLAG_ignore_case), wild = !!(flags&FLAG_wildcards), + slash = !!(flags&FLAG_wildcards_match_slash), len; + + if (wild || slash) { + // 1) match can end with / 2) maybe case insensitive 3) maybe * matches / + if (!fnmatch(pattern, name, FNM_LEADING_DIR+FNM_CASEFOLD*ign+FNM_PATHNAME*slash)) + return 1; + } else { + len = strlen(pattern); + if (!(ign ? strncasecmp : strncmp)(pattern, name, len)) + if (!name[len] || name[len]=='/') return 1; + } + + return 0; +} + static struct double_list *filter(struct double_list *lst, char *name) { struct double_list *end = lst; + long long flags = toys.optflags; + char *ss, *last; + + if (!lst || !*name) return 0; - if (lst) - // constant is FNM_LEADING_DIR - do if (!fnmatch(lst->data, name, 1<<3)) return lst; - while (end != (lst = lst->next)); + // --wildcards-match-slash implies --wildcards because I couldn't figure + // out a graceful way to explain why it DIDN'T in the help text. We don't + // do the positional enable/disable thing (would need to annotate at list + // creation, maybe a TODO item).
+ + // Set defaults for filter type, and apply --no-flags + if (lst == TT.excl) flags |= FLAG_wildcards_match_slash; + else flags |= FLAG_anchored; + flags &= (~(flags&(FLAG_no_ignore_case|FLAG_no_anchored|FLAG_no_wildcards|FLAG_no_wildcards_match_slash)))>>1; + if (flags&FLAG_no_wildcards) flags &= ~FLAG_wildcards_match_slash; + + // The +1 instead of ++ is in case of consecutive slashes + do { + for (ss = last = name; *ss; ss++) { + if (*ss!='/' || !ss[1]) continue; + if (!(flags & FLAG_anchored)) { + if (do_filter(lst->data, ss+1, flags)) return lst; + } else last = ss+1; + } + if (do_filter(lst->data, last, flags)) return lst; + } while (end != (lst = lst->next)); return 0; } @@ -227,16 +274,11 @@ static int add_to_tar(struct dirtree *node) i = 1; name = hname = dirtree_path(node, &i); - - // exclusion defaults to --no-anchored and --wildcards-match-slash - for (lnk = name; *lnk;) { - if (filter(TT.excl, lnk)) goto done; - while (*lnk && *lnk!='/') lnk++; - while (*lnk=='/') lnk++; - } + if (filter(TT.excl, name)) goto done; // Consume the 1 extra byte alocated in dirtree_path() - if (S_ISDIR(st->st_mode) && lnk[-1] != '/') strcpy(lnk, "/"); + if (S_ISDIR(st->st_mode) && (lnk = name+strlen(name))[-1] != '/') + strcpy(lnk, "/"); // remove leading / and any .. entries from saved name if (!FLAG(P)) { @@ -927,11 +969,14 @@ void tar_main(void) } if (TT.mtime) xparsedate(TT.mtime, &TT.mtt, (void *)&s, 1); + // TODO: collect filter types here and annotate saved include/exclude? + // Collect file list. for (; TT.exclude; TT.exclude = TT.exclude->next) trim2list(&TT.excl, TT.exclude->arg); for (;TT.X; TT.X = TT.X->next) do_lines(xopenro(TT.X->arg), '\n', do_XT); for (args = toys.optargs; *args; args++) trim2list(&TT.incl, *args); + // -T is always --verbatim-files-from: no quote removal or -arg handling for (;TT.T; TT.T = TT.T->next) do_lines(xopenro(TT.T->arg), FLAG(null) ?
'\0' : '\n', do_XT); @@ -1089,6 +1134,10 @@ void tar_main(void) writeall(TT.fd, toybuf, 1024); } + if (TT.pid) { + TT.pid = xpclose_both(TT.pid, 0); + if (TT.pid) toys.exitval = TT.pid; + } if (toys.exitval) error_msg("had errors"); if (CFG_TOYBOX_FREE) { -- 2.39.2