changeset 686:9414be56b1db

wc -m only cares about counting characters. Attached is an attempt at implementing it, along with some test cases. The test cases only run in UTF-8 locales.
author Felix Janda <felix.janda@posteo.de>
date Thu, 08 Nov 2012 11:19:07 -0600
parents 38e07dba9b20
children 598263aee2b9
files scripts/test/wc.test toys.h toys/posix/wc.c
diffstat 3 files changed, 56 insertions(+), 9 deletions(-) [+]
line wrap: on
line diff
--- a/scripts/test/wc.test	Sat Nov 03 19:21:59 2012 -0500
+++ b/scripts/test/wc.test	Thu Nov 08 11:19:07 2012 -0600
@@ -18,5 +18,29 @@
 testing "wc -l" "wc -l file1" "4 file1\n" "" ""
 testing "wc -w" "wc -w file1" "5 file1\n" "" ""
 testing "wc format" "wc file1" "4 5 26 file1\n" "" ""
-testing "wc multiple files" "wc input - file1" "1 2 3 input\n0 2 3 -\n4 5 26 file1\n5 9 32 total\n" "a\nb" "a b"
+testing "wc multiple files" "wc input - file1" \
+        "1 2 3 input\n0 2 3 -\n4 5 26 file1\n5 9 32 total\n" "a\nb" "a b"
+
+#Tests for wc -m
+if printf "%s" "$LANG" | grep -q UTF-8
+then
+
+printf " " > file1
+for i in $(seq 1 8192)
+do
+  printf "ü" >> file1
+done
+testing "wc -m" "wc -m file1" "8193 file1\n" "" ""
+printf " " > file1
+for i in $(seq 1 8192)
+do
+  printf "ü" >> file1
+done
+testing "wc -m (invalid chars)" "wc -m file1" "8193 file1\n" "" ""
+testing "wc -mlw" "wc -mlw input" "1 2 11 input\n" "hello, 世界!\n" ""
+
+else
+printf "skipping tests for wc -m"
+fi
+
 rm file1
--- a/toys.h	Sat Nov 03 19:21:59 2012 -0500
+++ b/toys.h	Thu Nov 08 11:19:07 2012 -0600
@@ -16,6 +16,7 @@
 #include <inttypes.h>
 #include <limits.h>
 #include <libgen.h>
+#include <locale.h>
 #include <math.h>
 #include <pty.h>
 #include <pwd.h>
@@ -46,6 +47,8 @@
 #include <unistd.h>
 #include <utime.h>
 #include <utmpx.h>
+#include <wchar.h>
+#include <wctype.h>
 
 #include "lib/lib.h"
 #include "toys/e2fs.h"
--- a/toys/posix/wc.c	Sat Nov 03 19:21:59 2012 -0500
+++ b/toys/posix/wc.c	Thu Nov 08 11:19:07 2012 -0600
@@ -6,22 +6,24 @@
  *
  * See http://opengroup.org/onlinepubs/9699919799/utilities/wc.html
 
-USE_WC(NEWTOY(wc, "cwl", TOYFLAG_USR|TOYFLAG_BIN))
+USE_WC(NEWTOY(wc, "mcwl", TOYFLAG_USR|TOYFLAG_BIN))
 
 config WC
 	bool "wc"
 	default y
 	help
-	  usage: wc -lwc [FILE...]
+	  usage: wc -lwcm [FILE...]
 
 	  Count lines, words, and characters in input.
 
 	  -l	show lines
 	  -w	show words
-	  -c	show characters
+	  -c	show bytes
+	  -m	show characters
 
-	  By default outputs lines, words, characters, and filename for each
-	  argument (or from stdin if none).
+	  By default outputs lines, words, bytes, and filename for each
+	  argument (or from stdin if none). Displays only either bytes
+	  or characters.
 */
 
 #define FOR_wc
@@ -47,7 +49,8 @@
 
 static void do_wc(int fd, char *name)
 {
-	int i, len;
+	int i, len, clen=1, space;
+	wchar_t wchar;
 	unsigned long word=0, lengths[]={0,0,0};
 
 	for (;;) {
@@ -57,9 +60,24 @@
 			toys.exitval = EXIT_FAILURE;
 		}
 		if (len<1) break;
-		for (i=0; i<len; i++) {
+		for (i=0; i<len; i+=clen) {
+			if(toys.optflags&8) {
+				clen = mbrtowc(&wchar, toybuf+i, len-i, 0);
+				if(clen==(size_t)(-1)) {
+					if(i!=len-1) {
+						clen = 1;
+						continue;
+					}
+					else break;
+				}
+				if(clen==(size_t)(-2)) break;
+				if(clen==0) clen=1;
+				space = iswspace(wchar);
+			}
+			else space = isspace(toybuf[i]);
+
 			if (toybuf[i]==10) lengths[0]++;
-			if (isspace(toybuf[i])) word=0;
+			if (space) word=0;
 			else {
 				if (!word) lengths[1]++;
 				word=1;
@@ -73,6 +91,8 @@
 
 void wc_main(void)
 {
+	setlocale(LC_ALL, "");
+	toys.optflags |= (toys.optflags&8)>>1;
 	loopfiles(toys.optargs, do_wc);
 	if (toys.optc>1) show_lengths(TT.totals, "total");
 }