Merge pull request #27 from klondi/utf-8_fixes
Add support for 4 byte UTF-8 characters and stricter character checking
This commit is contained in:
commit
e32bb3ff7a
@ -706,6 +706,34 @@ int main(int argc, char** argv)
|
||||
exotic_add_test(&handle, &exotic_test_utf8_valid_10, "utf8_valid_10");
|
||||
exotic_add_test(&handle, &exotic_test_utf8_valid_11, "utf8_valid_11");
|
||||
exotic_add_test(&handle, &exotic_test_utf8_valid_12, "utf8_valid_12");
|
||||
exotic_add_test(&handle, &exotic_test_utf8_valid_13, "utf8_valid_13");
|
||||
exotic_add_test(&handle, &exotic_test_utf8_valid_14, "utf8_valid_14");
|
||||
exotic_add_test(&handle, &exotic_test_utf8_valid_15, "utf8_valid_15");
|
||||
exotic_add_test(&handle, &exotic_test_utf8_valid_16, "utf8_valid_16");
|
||||
exotic_add_test(&handle, &exotic_test_utf8_valid_17, "utf8_valid_17");
|
||||
exotic_add_test(&handle, &exotic_test_utf8_valid_18, "utf8_valid_18");
|
||||
exotic_add_test(&handle, &exotic_test_utf8_valid_19, "utf8_valid_19");
|
||||
exotic_add_test(&handle, &exotic_test_utf8_valid_20, "utf8_valid_20");
|
||||
exotic_add_test(&handle, &exotic_test_utf8_valid_21, "utf8_valid_21");
|
||||
exotic_add_test(&handle, &exotic_test_utf8_valid_22, "utf8_valid_22");
|
||||
exotic_add_test(&handle, &exotic_test_utf8_valid_23, "utf8_valid_23");
|
||||
exotic_add_test(&handle, &exotic_test_utf8_valid_24, "utf8_valid_24");
|
||||
exotic_add_test(&handle, &exotic_test_utf8_valid_25, "utf8_valid_25");
|
||||
exotic_add_test(&handle, &exotic_test_utf8_valid_26, "utf8_valid_26");
|
||||
exotic_add_test(&handle, &exotic_test_utf8_valid_27, "utf8_valid_27");
|
||||
exotic_add_test(&handle, &exotic_test_utf8_valid_28, "utf8_valid_28");
|
||||
exotic_add_test(&handle, &exotic_test_utf8_valid_29, "utf8_valid_29");
|
||||
exotic_add_test(&handle, &exotic_test_utf8_valid_30, "utf8_valid_30");
|
||||
exotic_add_test(&handle, &exotic_test_utf8_valid_31, "utf8_valid_31");
|
||||
exotic_add_test(&handle, &exotic_test_utf8_valid_32, "utf8_valid_32");
|
||||
exotic_add_test(&handle, &exotic_test_utf8_valid_33, "utf8_valid_33");
|
||||
exotic_add_test(&handle, &exotic_test_utf8_valid_34, "utf8_valid_34");
|
||||
exotic_add_test(&handle, &exotic_test_utf8_valid_35, "utf8_valid_35");
|
||||
exotic_add_test(&handle, &exotic_test_utf8_valid_36, "utf8_valid_36");
|
||||
exotic_add_test(&handle, &exotic_test_utf8_valid_37, "utf8_valid_37");
|
||||
exotic_add_test(&handle, &exotic_test_utf8_valid_38, "utf8_valid_38");
|
||||
exotic_add_test(&handle, &exotic_test_utf8_valid_39, "utf8_valid_39");
|
||||
exotic_add_test(&handle, &exotic_test_utf8_valid_40, "utf8_valid_40");
|
||||
exotic_add_test(&handle, &exotic_test_rbtree_create_destroy, "rbtree_create_destroy");
|
||||
exotic_add_test(&handle, &exotic_test_rbtree_create_1, "rbtree_create_1");
|
||||
exotic_add_test(&handle, &exotic_test_rbtree_size_0, "rbtree_size_0");
|
||||
|
@ -107,6 +107,7 @@ static const char test_utf_seq_6[] = { 0xE2, 0x82, 0xAC, 0x00}; // valid
|
||||
static const char test_utf_seq_7[] = { 0xC2, 0x32, 0x00}; // invalid
|
||||
static const char test_utf_seq_8[] = { 0xE2, 0x82, 0x32, 0x00}; // invalid
|
||||
static const char test_utf_seq_9[] = { 0xE2, 0x32, 0x82, 0x00}; // invalid
|
||||
static const char test_utf_seq_10[] = { 0xF0, 0x9F, 0x98, 0x81, 0x00}; // valid
|
||||
|
||||
EXO_TEST(utf8_valid_4, { return is_valid_utf8(test_utf_seq_1); });
|
||||
EXO_TEST(utf8_valid_5, { return !is_valid_utf8(test_utf_seq_2); });
|
||||
@ -117,5 +118,61 @@ EXO_TEST(utf8_valid_9, { return is_valid_utf8(test_utf_seq_6); });
|
||||
EXO_TEST(utf8_valid_10, { return !is_valid_utf8(test_utf_seq_7); });
|
||||
EXO_TEST(utf8_valid_11, { return !is_valid_utf8(test_utf_seq_8); });
|
||||
EXO_TEST(utf8_valid_12, { return !is_valid_utf8(test_utf_seq_9); });
|
||||
EXO_TEST(utf8_valid_13, { return is_valid_utf8(test_utf_seq_10); });
|
||||
|
||||
// Boundary cases for each UTF-8 encoding length (1 to 4 bytes)
|
||||
static const char test_utf_seq_11[] = { 0x7F, 0x00 }; // valid last 7-bit character
|
||||
static const char test_utf_seq_12[] = { 0x80, 0x00 }; // invalid: lone continuation byte (lowest, 0x80) with no lead byte
|
||||
static const char test_utf_seq_13[] = { 0xBF, 0x00 }; // invalid: lone continuation byte (highest, 0xBF) with no lead byte
|
||||
static const char test_utf_seq_14[] = { 0xC0, 0x80, 0x00 }; // invalid out of 2 bytes range
|
||||
// Highest overlong 2-byte form: lead byte 0xC1 followed by a well-formed
// continuation byte, so only the overlong-lead-byte check can reject it.
static const char test_utf_seq_15[] = { 0xC1, 0xBF, 0x00 }; // invalid out of 2 bytes range
|
||||
static const char test_utf_seq_16[] = { 0xC2, 0x00 }; // invalid truncated string
|
||||
static const char test_utf_seq_17[] = { 0xC2, 0x80, 0x00 }; // valid
|
||||
static const char test_utf_seq_18[] = { 0xDF, 0xBF, 0x00 }; // valid
|
||||
static const char test_utf_seq_19[] = { 0xE0, 0x80, 0x80, 0x00 }; // invalid out of 3 bytes range
|
||||
static const char test_utf_seq_20[] = { 0xE0, 0x9F, 0xBF, 0x00 }; // invalid out of 3 bytes range
|
||||
static const char test_utf_seq_21[] = { 0xE0, 0x00 }; // invalid truncated string
|
||||
static const char test_utf_seq_22[] = { 0xE0, 0xA0, 0x00 }; // invalid truncated string
|
||||
static const char test_utf_seq_23[] = { 0xE0, 0xA0, 0x80, 0x00 }; // valid
|
||||
// NOTE(review): the last code point before the surrogate range (U+D7FF) encodes
// as ED 9F BF; this vector uses lead byte 0xEC -- confirm the intended boundary.
static const char test_utf_seq_24[] = { 0xEC, 0x9F, 0xBF, 0x00 }; // valid
|
||||
static const char test_utf_seq_25[] = { 0xED, 0xA0, 0x80, 0x00 }; // invalid surrogate
|
||||
static const char test_utf_seq_26[] = { 0xED, 0xBF, 0xBF, 0x00 }; // invalid surrogate
|
||||
// NOTE(review): the first code point after the surrogate range (U+E000) encodes
// as EE 80 80; this vector uses lead byte 0xEF -- confirm the intended boundary.
static const char test_utf_seq_27[] = { 0xEF, 0x80, 0x80, 0x00 }; // valid
|
||||
static const char test_utf_seq_28[] = { 0xEF, 0xBF, 0xBF, 0x00 }; // valid
|
||||
static const char test_utf_seq_29[] = { 0xF0, 0x80, 0x80, 0x80, 0x00 }; // invalid out of 4 bytes range
|
||||
static const char test_utf_seq_30[] = { 0xF0, 0x8F, 0xBF, 0xBF, 0x00 }; // invalid out of 4 bytes range
|
||||
static const char test_utf_seq_31[] = { 0xF0, 0x00 }; // invalid truncated string
|
||||
static const char test_utf_seq_32[] = { 0xF0, 0x90, 0x00 }; // invalid truncated string
|
||||
static const char test_utf_seq_33[] = { 0xF0, 0x90, 0x80, 0x00 }; // invalid truncated string
|
||||
static const char test_utf_seq_34[] = { 0xF0, 0x90, 0x80, 0x80, 0x00 }; // valid
|
||||
static const char test_utf_seq_35[] = { 0xF4, 0x8F, 0xBF, 0xBF, 0x00 }; // valid
|
||||
static const char test_utf_seq_36[] = { 0xF4, 0x90, 0x80, 0x80, 0x00 }; // invalid out of 4 bytes range
|
||||
// NOTE(review): 0xFF has all of its top five bits set, so it is not a 4-byte lead
// byte at all; lead bytes 0xF5..0xF7 (4-byte form above U+10FFFF) do not appear in
// any vector visible here -- confirm coverage.
static const char test_utf_seq_37[] = { 0xFF, 0xBF, 0xBF, 0xBF, 0x00 }; // invalid lead byte
|
||||
|
||||
EXO_TEST(utf8_valid_14, { return is_valid_utf8(test_utf_seq_11); });
|
||||
EXO_TEST(utf8_valid_15, { return !is_valid_utf8(test_utf_seq_12); });
|
||||
EXO_TEST(utf8_valid_16, { return !is_valid_utf8(test_utf_seq_13); });
|
||||
EXO_TEST(utf8_valid_17, { return !is_valid_utf8(test_utf_seq_14); });
|
||||
EXO_TEST(utf8_valid_18, { return !is_valid_utf8(test_utf_seq_15); });
|
||||
EXO_TEST(utf8_valid_19, { return !is_valid_utf8(test_utf_seq_16); });
|
||||
EXO_TEST(utf8_valid_20, { return is_valid_utf8(test_utf_seq_17); });
|
||||
EXO_TEST(utf8_valid_21, { return is_valid_utf8(test_utf_seq_18); });
|
||||
EXO_TEST(utf8_valid_22, { return !is_valid_utf8(test_utf_seq_19); });
|
||||
EXO_TEST(utf8_valid_23, { return !is_valid_utf8(test_utf_seq_20); });
|
||||
EXO_TEST(utf8_valid_24, { return !is_valid_utf8(test_utf_seq_21); });
|
||||
EXO_TEST(utf8_valid_25, { return !is_valid_utf8(test_utf_seq_22); });
|
||||
EXO_TEST(utf8_valid_26, { return is_valid_utf8(test_utf_seq_23); });
|
||||
EXO_TEST(utf8_valid_27, { return is_valid_utf8(test_utf_seq_24); });
|
||||
EXO_TEST(utf8_valid_28, { return !is_valid_utf8(test_utf_seq_25); });
|
||||
EXO_TEST(utf8_valid_29, { return !is_valid_utf8(test_utf_seq_26); });
|
||||
EXO_TEST(utf8_valid_30, { return is_valid_utf8(test_utf_seq_27); });
|
||||
EXO_TEST(utf8_valid_31, { return is_valid_utf8(test_utf_seq_28); });
|
||||
EXO_TEST(utf8_valid_32, { return !is_valid_utf8(test_utf_seq_29); });
|
||||
EXO_TEST(utf8_valid_33, { return !is_valid_utf8(test_utf_seq_30); });
|
||||
EXO_TEST(utf8_valid_34, { return !is_valid_utf8(test_utf_seq_31); });
|
||||
EXO_TEST(utf8_valid_35, { return !is_valid_utf8(test_utf_seq_32); });
|
||||
EXO_TEST(utf8_valid_36, { return !is_valid_utf8(test_utf_seq_33); });
|
||||
EXO_TEST(utf8_valid_37, { return is_valid_utf8(test_utf_seq_34); });
|
||||
EXO_TEST(utf8_valid_38, { return is_valid_utf8(test_utf_seq_35); });
|
||||
EXO_TEST(utf8_valid_39, { return !is_valid_utf8(test_utf_seq_36); });
|
||||
EXO_TEST(utf8_valid_40, { return !is_valid_utf8(test_utf_seq_37); });
|
||||
|
@ -63,7 +63,7 @@ char* strip_white_space(char* string)
|
||||
return string;
|
||||
}
|
||||
|
||||
static int is_valid_utf8_str(const char* string, size_t length)
|
||||
static int is_valid_utf8_str(const unsigned char* string, size_t length)
|
||||
{
|
||||
int expect = 0;
|
||||
char div = 0;
|
||||
@ -82,12 +82,32 @@ static int is_valid_utf8_str(const char* string, size_t length)
|
||||
{
|
||||
if (string[pos] & 0x80)
|
||||
{
|
||||
for (div = 0x40; div > 0x10; div /= 2)
|
||||
for (div = 0x40; div > 0x08; div /= 2)
|
||||
{
|
||||
if (string[pos] & div) expect++;
|
||||
else break;
|
||||
}
|
||||
if ((string[pos] & div) || (pos+expect >= length)) return 0;
|
||||
switch (expect) {
|
||||
case 0:
|
||||
return 0;
|
||||
case 1:
|
||||
/* Out of range */
|
||||
if (string[pos] < 0xC2) return 0;
|
||||
break;
|
||||
case 2:
|
||||
/* Out of range */
|
||||
if ((string[pos] == 0xE0) && (string[pos+1] < 0xA0 )) return 0;
|
||||
/* Surrogates */
|
||||
if ((string[pos] == 0xED) && (string[pos+1] > 0x9F )) return 0;
|
||||
break;
|
||||
case 3:
|
||||
/* Out of range */
|
||||
if ((string[pos] == 0xF0) && (string[pos+1] < 0x90 )) return 0;
|
||||
if (string[pos] > 0xF4) return 0;
|
||||
if ((string[pos] == 0xF4) && (string[pos+1] > 0x8F )) return 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user