Add support for 4-byte UTF-8 characters and stricter character checking

This commit is contained in:
Francisco Blas (klondike) Izquierdo Riera 2014-11-24 10:28:50 +01:00
parent 05edfa48b2
commit 8a7e892aeb

View File

@@ -63,7 +63,7 @@ char* strip_white_space(char* string)
return string; return string;
} }
static int is_valid_utf8_str(const char* string, size_t length) static int is_valid_utf8_str(const unsigned char* string, size_t length)
{ {
int expect = 0; int expect = 0;
char div = 0; char div = 0;
@@ -82,12 +82,32 @@ static int is_valid_utf8_str(const char* string, size_t length)
{ {
if (string[pos] & 0x80) if (string[pos] & 0x80)
{ {
for (div = 0x40; div > 0x10; div /= 2) for (div = 0x40; div > 0x08; div /= 2)
{ {
if (string[pos] & div) expect++; if (string[pos] & div) expect++;
else break; else break;
} }
if ((string[pos] & div) || (pos+expect >= length)) return 0; if ((string[pos] & div) || (pos+expect >= length)) return 0;
switch (expect) {
case 0:
return 0;
case 1:
/* Out of range */
if (string[pos] < 0xC2) return 0;
break;
case 2:
/* Out of range */
if ((string[pos] == 0xE0) && (string[pos+1] < 0xA0 )) return 0;
/* Surrogates */
if ((string[pos] == 0xED) && (string[pos+1] > 0x9F )) return 0;
break;
case 3:
/* Out of range */
if ((string[pos] == 0xF0) && (string[pos+1] < 0x90 )) return 0;
if (string[pos] > 0xF4) return 0;
if ((string[pos] == 0xF4) && (string[pos+1] > 0x8F )) return 0;
break;
}
} }
} }
} }