Add support for 4-byte UTF-8 characters and stricter character checking
This commit is contained in:
parent
05edfa48b2
commit
8a7e892aeb
|
@ -63,7 +63,7 @@ char* strip_white_space(char* string)
|
||||||
return string;
|
return string;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int is_valid_utf8_str(const char* string, size_t length)
|
static int is_valid_utf8_str(const unsigned char* string, size_t length)
|
||||||
{
|
{
|
||||||
int expect = 0;
|
int expect = 0;
|
||||||
char div = 0;
|
char div = 0;
|
||||||
|
@ -82,12 +82,32 @@ static int is_valid_utf8_str(const char* string, size_t length)
|
||||||
{
|
{
|
||||||
if (string[pos] & 0x80)
|
if (string[pos] & 0x80)
|
||||||
{
|
{
|
||||||
for (div = 0x40; div > 0x10; div /= 2)
|
for (div = 0x40; div > 0x08; div /= 2)
|
||||||
{
|
{
|
||||||
if (string[pos] & div) expect++;
|
if (string[pos] & div) expect++;
|
||||||
else break;
|
else break;
|
||||||
}
|
}
|
||||||
if ((string[pos] & div) || (pos+expect >= length)) return 0;
|
if ((string[pos] & div) || (pos+expect >= length)) return 0;
|
||||||
|
switch (expect) {
|
||||||
|
case 0:
|
||||||
|
return 0;
|
||||||
|
case 1:
|
||||||
|
/* Out of range */
|
||||||
|
if (string[pos] < 0xC2) return 0;
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
/* Out of range */
|
||||||
|
if ((string[pos] == 0xE0) && (string[pos+1] < 0xA0 )) return 0;
|
||||||
|
/* Surrogates */
|
||||||
|
if ((string[pos] == 0xED) && (string[pos+1] > 0x9F )) return 0;
|
||||||
|
break;
|
||||||
|
case 3:
|
||||||
|
/* Out of range */
|
||||||
|
if ((string[pos] == 0xF0) && (string[pos+1] < 0x90 )) return 0;
|
||||||
|
if (string[pos] > 0xF4) return 0;
|
||||||
|
if ((string[pos] == 0xF4) && (string[pos+1] > 0x8F )) return 0;
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue