diff --git a/src/convert.c b/src/convert.c index 457463b4a..628a9b402 100644 --- a/src/convert.c +++ b/src/convert.c @@ -7,64 +7,96 @@ #include "types.h" #include "convert.h" -#if (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) >= 70101 -#define FALLTHROUGH __attribute__ ((fallthrough)) -#else -#define FALLTHROUGH -#endif +/* + Source: + http://www.unicode.org/versions/Unicode7.0.0/UnicodeStandard-7.0.pdf + page 124, 3.9 "Unicode Encoding Forms", "UTF-8" + + Table 3-7. Well-Formed UTF-8 Byte Sequences + ----------------------------------------------------------------------------- + | Code Points | First Byte | Second Byte | Third Byte | Fourth Byte | + | U+0000..U+007F | 00..7F | | | | + | U+0080..U+07FF | C2..DF | 80..BF | | | + | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | + | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | | + | U+D000..U+D7FF | ED | 80..9F | 80..BF | | + | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | | + | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | + | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF | + | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | + ----------------------------------------------------------------------------- +*/ static bool printable_utf8 (const u8 *buf, const size_t len) { - u8 a; - int length; - const u8 *buf_end = buf + len; - const u8 *srcptr; - const char trailingBytesUTF8[64] = { - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 + // there's 9 different code point types for utf8 and ... + + const int cp_types[256] = + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 5, 5, + 6, 7, 7, 7, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }; - while (buf < buf_end) { + // ... they can be directly translated into a fixed length sequence of bytes - // This line rejects unprintables. The rest of the function - // reliably rejects invalid UTF-8 sequences. - if (*buf < 0x20 || *buf == 0x7f) return false; + const size_t cp_lens[9] = { 1, 2, 3, 3, 3, 3, 4, 4, 4 }; - if (*buf < 0x80) { - buf++; - continue; - } + for (size_t pos = 0; pos < len; pos++) + { + const u8 c0 = buf[pos]; - length = trailingBytesUTF8[*buf & 0x3f] + 1; - srcptr = buf + length; + const int cp_type = cp_types[c0]; - if (srcptr > buf_end) return false; + if (cp_type == -1) return false; - switch (length) { - default: - return false; - case 4: - if ((a = (*--srcptr)) < 0x80 || a > 0xbf) return false; FALLTHROUGH; - case 3: - if ((a = (*--srcptr)) < 0x80 || a > 0xbf) return false; FALLTHROUGH; - case 2: - if ((a = (*--srcptr)) < 0x80 || a > 0xbf) return false; + // make sure to not read outside the buffer - switch (*buf) { - case 0xE0: if (a < 0xa0) return false; break; - case 0xED: if (a > 0x9f) return false; break; - case 0xF0: if (a < 0x90) return false; break; - case 0xF4: if (a > 0x8f) return false; FALLTHROUGH; + const size_t cp_len = cp_lens[cp_type]; + + if ((pos + cp_len) > len) return false; + + // multibyte from here + + if (cp_len >= 2) + { + pos++; + + const u8 c1 = buf[pos]; + + switch (cp_type) + { + case 2: if ((c1 < 0xa0) || (c1 > 0xbf)) return false; break; + case 4: if ((c1 < 0x80) || (c1 > 0x9f)) return false; break; + case 6: if ((c1 < 0x90) || (c1 > 0xbf)) return false; break; + case 8: if ((c1 < 0x80) || (c1 > 0x8f)) return false; break; + default: if ((c1 < 0x80) || (c1 > 0xbf)) return false; break; } - case 1: - if (*buf >= 0x80 && *buf < 0xc2) return false; - } - if (*buf > 0xf4) - return false; + for (size_t j = 2; j < cp_len; j++) + { + pos++; - buf += length; + const u8 cx = buf[pos]; + + if ((cx < 0x80) || (cx > 0xbf)) return false; + } + } } + return true; }