Rewritten printable_utf8() because the original one throws too many warnings with GCC 8

2025-08-05 13:25:39 +00:00 · 2018-06-13 16:42:33 +02:00 · 2018-06-13 16:42:33 +02:00 · ce4858f7a3
commit ce4858f7a3
parent 469fece141
1 changed files with 75 additions and 43 deletions
--- a/src/convert.c
+++ b/src/convert.c
@ -7,64 +7,96 @@
 #include "types.h"
 #include "convert.h"

-#if (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) >= 70101
-#define FALLTHROUGH __attribute__ ((fallthrough))
-#else
-#define FALLTHROUGH
-#endif
+/*
+  Source:
+   http://www.unicode.org/versions/Unicode7.0.0/UnicodeStandard-7.0.pdf
+   page 124, 3.9 "Unicode Encoding Forms", "UTF-8"
+
+  Table 3-7. Well-Formed UTF-8 Byte Sequences
+  -----------------------------------------------------------------------------
+  |  Code Points        | First Byte | Second Byte | Third Byte | Fourth Byte |
+  |  U+0000..U+007F     |     00..7F |             |            |             |
+  |  U+0080..U+07FF     |     C2..DF |      80..BF |            |             |
+  |  U+0800..U+0FFF     |         E0 |      A0..BF |     80..BF |             |
+  |  U+1000..U+CFFF     |     E1..EC |      80..BF |     80..BF |             |
+  |  U+D000..U+D7FF     |         ED |      80..9F |     80..BF |             |
+  |  U+E000..U+FFFF     |     EE..EF |      80..BF |     80..BF |             |
+  |  U+10000..U+3FFFF   |         F0 |      90..BF |     80..BF |      80..BF |
+  |  U+40000..U+FFFFF   |     F1..F3 |      80..BF |     80..BF |      80..BF |
+  |  U+100000..U+10FFFF |         F4 |      80..8F |     80..BF |      80..BF |
+  -----------------------------------------------------------------------------
+*/

 static bool printable_utf8 (const u8 *buf, const size_t len)
 {
-  u8 a;
-  int length;
-  const u8 *buf_end = buf + len;
-  const u8 *srcptr;
-  const char trailingBytesUTF8[64] = {
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
+  // there's 9 different code point types for utf8 and ...
+
+  const int cp_types[256] =
+  {
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+     2,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  4,  5,  5,
+     6,  7,  7,  7,  8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
  };

-  while (buf < buf_end) {
+  // ... they can be directly translated into a fixed length sequence of bytes

-    // This line rejects unprintables. The rest of the function
-    // reliably rejects invalid UTF-8 sequences.
-    if (*buf < 0x20 || *buf == 0x7f) return false;
+  const size_t cp_lens[9] = { 1, 2, 3, 3, 3, 3, 4, 4, 4 };

-    if (*buf < 0x80) {
-      buf++;
-      continue;
-    }
+  for (size_t pos = 0; pos < len; pos++)
+  {
+    const u8 c0 = buf[pos];

-    length = trailingBytesUTF8[*buf & 0x3f] + 1;
-    srcptr = buf + length;
+    const int cp_type = cp_types[c0];

-    if (srcptr > buf_end) return false;
+    if (cp_type == -1) return false;

-    switch (length) {
-    default:
-      return false;
-    case 4:
-      if ((a = (*--srcptr)) < 0x80 || a > 0xbf) return false; FALLTHROUGH;
-    case 3:
-      if ((a = (*--srcptr)) < 0x80 || a > 0xbf) return false; FALLTHROUGH;
-    case 2:
-      if ((a = (*--srcptr)) < 0x80 || a > 0xbf) return false;
+    // make sure to not read outside the buffer

-      switch (*buf) {
-      case 0xE0: if (a < 0xa0) return false; break;
-      case 0xED: if (a > 0x9f) return false; break;
-      case 0xF0: if (a < 0x90) return false; break;
-      case 0xF4: if (a > 0x8f) return false; FALLTHROUGH;
+    const size_t cp_len = cp_lens[cp_type];
+
+    if ((pos + cp_len) > len) return false;
+
+    // multibyte from here
+
+    if (cp_len >= 2)
+    {
+      pos++;
+
+      const u8 c1 = buf[pos];
+
+      switch (cp_type)
+      {
+        case 2:   if ((c1 < 0xa0) || (c1 > 0xbf)) return false; break;
+        case 4:   if ((c1 < 0x80) || (c1 > 0x9f)) return false; break;
+        case 6:   if ((c1 < 0x90) || (c1 > 0xbf)) return false; break;
+        case 8:   if ((c1 < 0x80) || (c1 > 0x8f)) return false; break;
+        default:  if ((c1 < 0x80) || (c1 > 0xbf)) return false; break;
      }

-    case 1:
-      if (*buf >= 0x80 && *buf < 0xc2) return false;
-    }
-    if (*buf > 0xf4)
-      return false;
+      for (size_t j = 2; j < cp_len; j++)
+      {
+        pos++;

-    buf += length;
+        const u8 cx = buf[pos];
+
+        if ((cx < 0x80) || (cx > 0xbf)) return false;
+      }
+    }
  }
+
  return true;
 }