From 994c6c34182606385140e5695e33c90507ce59ee Mon Sep 17 00:00:00 2001 From: Xavier Leroy Date: Mon, 19 Sep 2022 16:37:17 +0200 Subject: Support C11 Unicode string literals and character constants (#452) * Support C11 Unicode string literals and character constants * Add tests for C11 string literals and character constants * Better error message for ill-formed universal character names E.g. \u followed by fewer than 4 hex digits, or \U followed by fewer than 8 hex digits. * Add new warning `invalid-utf8` for byte sequences that are not valid UTF8. The warning is activated but not fatal by default. * Warn on uses of C11 Unicode character constants and string literals This uses the `c11-extensions` warning, which is off by default. * Support preprocessing option -finput-charset= for GNU toolchains --- test/regression/Makefile | 2 +- test/regression/Results/charlit | 27 +++++++++++++++++++++ test/regression/Results/stringlit | 15 ++++++++++++ test/regression/charlit.c | 50 +++++++++++++++++++++++++++++++++++++++ test/regression/stringlit.c | 41 ++++++++++++++++++++++++++++++++ 5 files changed, 134 insertions(+), 1 deletion(-) create mode 100644 test/regression/Results/charlit create mode 100644 test/regression/Results/stringlit create mode 100644 test/regression/charlit.c create mode 100644 test/regression/stringlit.c (limited to 'test/regression') diff --git a/test/regression/Makefile b/test/regression/Makefile index daee05bc..53719900 100644 --- a/test/regression/Makefile +++ b/test/regression/Makefile @@ -16,7 +16,7 @@ TESTS=int32 int64 floats floats-basics floats-lit \ funct3 expr5 struct7 struct8 struct11 struct12 casts1 casts2 char1 \ sizeof1 sizeof2 binops bool for1 for2 switch switch2 compound \ decl1 bitfields9 ptrs3 \ - parsing krfun ifconv generic + parsing krfun ifconv generic stringlit charlit # Can run, but only in compiled mode, and have reference output in Results diff --git a/test/regression/Results/charlit b/test/regression/Results/charlit new file 
mode 100644 index 00000000..3954a0c8 --- /dev/null +++ b/test/regression/Results/charlit @@ -0,0 +1,27 @@ +c1: 61 +c2: 61 +c3: 61 +c4: 61 +d1: fe +d2: fe +d3: fe +d4: fe +e1: 34 +e2: 1234 +e3: 1234 +e4: 1234 +f1: e9 +f2: e9 +f3: e9 +f4: e9 +g1: 2b +g2: 732b +g3: 732b +g4: 732b +h1: 4c +h2: f34c +h3: 1f34c +h4: 1f34c +m1: 6162 +m2: 1020304 +m3: e9e8 diff --git a/test/regression/Results/stringlit b/test/regression/Results/stringlit new file mode 100644 index 00000000..d6967ccc --- /dev/null +++ b/test/regression/Results/stringlit @@ -0,0 +1,15 @@ +s1: size 11, contents 61 c3 a9 e7 8c ab f0 9f 8d 8c +s2: size 11, contents 61 c3 a9 e7 8c ab f0 9f 8d 8c +s3: size 12, contents 61 e9 732b d83c df4c +s4: size 20, contents 61 e9 732b 1f34c +s5: size 20, contents 61 e9 732b 1f34c +t1: size 11, contents 61 c3 a9 e7 8c ab f0 9f 8d 8c +t2: size 11, contents 61 c3 a9 e7 8c ab f0 9f 8d 8c +t3: size 12, contents 61 e9 732b d83c df4c +t4: size 20, contents 61 e9 732b 1f34c +t5: size 20, contents 61 e9 732b 1f34c +e1: size 4, contents 61 e9 e8 +e2: size 4, contents 61 e9 e8 +e3: size 10, contents 61 e9 e8 732b +e4: size 24, contents 61 e9 e8 732b 1f34c +e5: size 24, contents 61 e9 e8 732b 1f34c diff --git a/test/regression/charlit.c b/test/regression/charlit.c new file mode 100644 index 00000000..5a7e0916 --- /dev/null +++ b/test/regression/charlit.c @@ -0,0 +1,50 @@ +#include <stdio.h> +#include <wchar.h> +#include <uchar.h> + +unsigned char c1 = 'a'; +char16_t c2 = u'a';; +char32_t c3 = U'a';; +wchar_t c4 = L'a';; + +unsigned char d1 = '\xFE'; +char16_t d2 = u'\xFE';; +char32_t d3 = U'\xFE';; +wchar_t d4 = L'\xFE';; + +unsigned char e1 = '\x1234'; // warning but no error +char16_t e2 = u'\x1234'; +char32_t e3 = U'\x1234'; +wchar_t e4 = L'\x1234'; + +unsigned char f1 = 'é'; // CompCert tolerance +char16_t f2 = u'é'; +char32_t f3 = U'é'; +wchar_t f4 = L'é'; + +unsigned char g1 = '猫'; // CompCert tolerance + warning +char16_t g2 = u'猫'; +char32_t g3 = U'猫'; +wchar_t g4 = L'猫'; + +unsigned char h1 = '🍌'; // 
CompCert tolerance + warning +char16_t h2 = u'🍌'; // CompCert tolerance + warning +char32_t h3 = U'🍌'; +wchar_t h4 = L'🍌'; + +int m1 = 'ab'; +int m2 = '\x01\x02\x03\x04'; +int m3 = 'éè'; // CompCert tolerance + +#define PRINT(x) printf("%s: %x\n", #x, x) + +int main() +{ + PRINT(c1); PRINT(c2); PRINT(c3); PRINT(c4); + PRINT(d1); PRINT(d2); PRINT(d3); PRINT(d4); + PRINT(e1); PRINT(e2); PRINT(e3); PRINT(e4); + PRINT(f1); PRINT(f2); PRINT(f3); PRINT(f4); + PRINT(g1); PRINT(g2); PRINT(g3); PRINT(g4); + PRINT(h1); PRINT(h2); PRINT(h3); PRINT(h4); + PRINT(m1); PRINT(m2); PRINT(m3); +} diff --git a/test/regression/stringlit.c b/test/regression/stringlit.c new file mode 100644 index 00000000..155f8ebb --- /dev/null +++ b/test/regression/stringlit.c @@ -0,0 +1,41 @@ +#include <stdio.h> +#include <wchar.h> +#include <uchar.h> + +/* Without escapes nor universal character names */ + +unsigned char s1[] = "aé猫🍌"; +unsigned char s2[] = u8"aé猫🍌"; +char16_t s3[] = u"aé猫🍌"; +char32_t s4[] = U"aé猫🍌"; +wchar_t s5[] = L"aé猫🍌"; + +/* With universal character names */ + +unsigned char t1[] = "a\u00e9\u732B\U0001F34C"; +unsigned char t2[] = u8"a\u00e9\u732B\U0001F34C"; +char16_t t3[] = u"a\u00e9\u732B\U0001F34C"; +char32_t t4[] = U"a\u00e9\u732B\U0001F34C"; +wchar_t t5[] = L"a\u00e9\u732B\U0001F34C"; + +/* With numerical escapes */ + +unsigned char e1[] = "a\xe9\350"; +unsigned char e2[] = u8"a\xe9\350"; +char16_t e3[] = u"a\xe9\350\x732B"; +char32_t e4[] = U"a\xe9\350\x732B\x0001F34C"; +wchar_t e5[] = L"a\xe9\350\x732B\x0001F34C"; + +#define PRINT(x) \ + printf("%s: size %u, contents", #x, (int) sizeof(x)); \ + for (int i = 0; x[i] != 0; i++) printf(" %x", x[i]); \ + printf("\n") + +int main() +{ + PRINT(s1); PRINT(s2); PRINT(s3); PRINT(s4); PRINT(s5); + PRINT(t1); PRINT(t2); PRINT(t3); PRINT(t4); PRINT(t5); + PRINT(e1); PRINT(e2); PRINT(e3); PRINT(e4); PRINT(e5); + return 0; +} + -- cgit