// adchpp-docker/src/adchpp/Text.cpp
//
// 247 lines
// 5.9 KiB
// C++

/*
* Copyright (C) 2001-2014 Jacek Sieka, arnetheduck on gmail point com
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#include "adchpp.h"
#include "Text.h"
#include "Util.h"
using namespace std;
namespace adchpp {
/**
 * Decode the single UTF-8 sequence that starts at str.
 *
 * The caller must guarantee that the buffer is terminated by a byte below 0x80
 * (for example the NUL terminator of a C string): a malformed sequence stops
 * at the first byte that is not a continuation byte, so the decoder never reads
 * past such a terminator.
 *
 * @param str First byte of the sequence to decode.
 * @param c   Receives the decoded code point on success. Set to U+FFFD when the
 *            sequence is structurally complete but encodes a forbidden value
 *            (overlong form, surrogate, above U+10FFFF), or when the code point
 *            does not fit in this platform's wchar_t. Left unchanged on the other
 *            failure paths.
 * @return The number of bytes consumed (1-4) on success. On failure, a negative
 *         number whose magnitude is the number of bytes the caller should skip.
 */
int Text::utf8ToWc(const char* str, wchar_t& c) {
    const auto c0 = static_cast<uint8_t>(str[0]);

    if((c0 & 0x80) == 0) { // 0xxx xxxx: single-byte (ASCII) character
        c = static_cast<wchar_t>(c0);
        return 1;
    }

    if((c0 & 0xc0) != 0xc0) { // 10xx xxxx: continuation byte without a lead byte
        return -1;
    }

    // 11xx xxxx: lead byte. The number of leading 1 bits gives the total length
    // (2-4); the bit right after them must be 0, which also rejects 0xf8-0xff.
    const int bytes = 2 + !!(c0 & 0x20) + ((c0 & 0x30) == 0x30);
    const int check_bit = 1 << (7 - bytes);
    if(c0 & check_bit) {
        return -1;
    }

    // Accumulate in a 32-bit value so the validity checks below see the complete
    // code point even when wchar_t is only 16 bits wide.
    uint32_t cp = static_cast<uint32_t>((check_bit - 1) & c0);
    for(int i = 1; i < bytes; ++i) {
        const auto ci = static_cast<uint8_t>(str[i]);
        if((ci & 0xc0) != 0x80) {
            // Not a continuation byte: skip only what was consumed so far.
            return -i;
        }
        cp = (cp << 6) | (ci & 0x3f);
    }

    // Smallest code point that legitimately needs a sequence of the given length;
    // anything below it is an overlong encoding, which RFC 3629 forbids.
    static const uint32_t min_code_point[] = { 0, 0, 0x80, 0x800, 0x10000 };

    const bool overlong = cp < min_code_point[bytes];
    const bool out_of_range = cp > 0x10ffff;
    const bool surrogate = cp >= 0xd800 && cp <= 0xdfff;
    if(overlong || out_of_range || surrogate) {
        // "REPLACEMENT CHARACTER": stands in for a value that is unknown or
        // unrepresentable in Unicode.
        c = 0xfffd;
        return -bytes;
    }

    if(sizeof(wchar_t) < 4 && cp > 0xffff) {
        // Well-formed, but wchar_t cannot hold it on this platform: report the
        // sequence as valid and substitute U+FFFD instead of a truncated value.
        c = 0xfffd;
    } else {
        c = static_cast<wchar_t>(cp);
    }
    return bytes;
}
/**
 * Append the UTF-8 encoding of one code point to str.
 *
 * Values that are not valid Unicode scalar values (surrogates, anything above
 * U+10FFFF, and negative values where wchar_t is signed) are encoded as
 * U+FFFD REPLACEMENT CHARACTER instead.
 *
 * @param c   Code point to encode.
 * @param str String the encoded bytes are appended to.
 */
void Text::wcToUtf8(wchar_t c, string& str) {
    // https://tools.ietf.org/html/rfc3629#section-3
    // Classify an unsigned copy: where wchar_t is a signed 32-bit type a negative
    // value would otherwise pass every range check and be emitted as one raw byte.
    const auto cp = static_cast<uint32_t>(c);

    if(cp > 0x10ffff || (cp >= 0xd800 && cp <= 0xdfff)) {
        // Invalid code point
        // REPLACEMENT CHARACTER: http://www.fileformat.info/info/unicode/char/0fffd/index.htm
        wcToUtf8(static_cast<wchar_t>(0xfffd), str);
    } else if(cp >= 0x10000) {
        // 1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
        str += static_cast<char>(0xf0 | (cp >> 18));
        str += static_cast<char>(0x80 | ((cp >> 12) & 0x3f));
        str += static_cast<char>(0x80 | ((cp >> 6) & 0x3f));
        str += static_cast<char>(0x80 | (cp & 0x3f));
    } else if(cp >= 0x0800) {
        // 1110 xxxx  10xx xxxx  10xx xxxx
        str += static_cast<char>(0xe0 | (cp >> 12));
        str += static_cast<char>(0x80 | ((cp >> 6) & 0x3f));
        str += static_cast<char>(0x80 | (cp & 0x3f));
    } else if(cp >= 0x0080) {
        // 110x xxxx  10xx xxxx
        str += static_cast<char>(0xc0 | (cp >> 6));
        str += static_cast<char>(0x80 | (cp & 0x3f));
    } else {
        // 0xxx xxxx
        str += static_cast<char>(cp);
    }
}
/**
 * Convert a string from the system multibyte encoding to UTF-8 by going
 * through an intermediate wide string.
 *
 * @param str Source string.
 * @param tmp Caller-supplied buffer handed to wideToUtf8 as its output buffer.
 * @return Whatever wideToUtf8 returns for the intermediate wide string.
 */
const string& Text::acpToUtf8(const string& str, string& tmp) throw() {
    wstring wideBuffer;
    const wstring& wide = acpToWide(str, wideBuffer);
    return wideToUtf8(wide, tmp);
}
/**
 * Convert a string from the system multibyte encoding (CP_ACP on Windows, the
 * encoding mbrtowc uses elsewhere) to a wide string.
 *
 * @param str Source string.
 * @param tmp Caller-supplied buffer that receives the converted text.
 * @return Util::emptyStringW when str is empty or, on Windows, when the
 *         conversion fails; otherwise a reference to tmp.
 */
const wstring& Text::acpToWide(const string& str, wstring& tmp) throw() {
    if(str.empty())
        return Util::emptyStringW;
#ifdef _WIN32
    // First call only measures the required length, second call converts.
    int n = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, str.c_str(), (int)str.length(), NULL, 0);
    if(n == 0) {
        return Util::emptyStringW;
    }
    tmp.resize(n);
    n = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, str.c_str(), (int)str.length(), &tmp[0], n);
    if(n == 0) {
        return Util::emptyStringW;
    }
    return tmp;
#else
    // Use a private conversion state. With a NULL state mbrtowc falls back to an
    // internal static object shared by every caller, which need not be
    // thread-safe and is left undefined after an encoding error.
    mbstate_t state = mbstate_t();
    const char* src = str.c_str();
    // Include the terminating NUL so mbrtowc reports 0 at the end of the input.
    size_t n = str.length() + 1;
    tmp.clear();
    tmp.reserve(n);
    while(n > 0) {
        wchar_t wc;
        const size_t rv = mbrtowc(&wc, src, n, &state);
        if(rv == 0 || rv == (size_t)-2) {
            // NUL reached, or the input ends in an incomplete sequence.
            break;
        } else if(rv == (size_t)-1) {
            // Invalid byte: emit a placeholder, skip one byte and start over from
            // the initial state, since the state is undefined after an error.
            tmp.push_back(L'_');
            ++src;
            --n;
            state = mbstate_t();
        } else {
            tmp.push_back(wc);
            src += rv;
            n -= rv;
        }
    }
    return tmp;
#endif
}
/**
 * Encode a wide string as UTF-8.
 *
 * @param str Wide string to encode.
 * @param tgt Caller-supplied buffer; cleared and then filled with the encoded
 *            bytes. It is not touched when str is empty.
 * @return Util::emptyString when str is empty, otherwise a reference to tgt.
 */
const string& Text::wideToUtf8(const wstring& str, string& tgt) throw() {
    if(str.empty()) {
        return Util::emptyString;
    }

    tgt.clear();
    // Each wide character is encoded independently and appended in order.
    for(const wchar_t wc : str) {
        wcToUtf8(wc, tgt);
    }
    return tgt;
}
/**
 * Convert a wide string to the system multibyte encoding (CP_ACP on Windows,
 * the encoding wcsrtombs uses elsewhere).
 *
 * @param str Wide string to convert.
 * @param tmp Caller-supplied buffer that receives the converted text.
 * @return Util::emptyString when str is empty or the conversion fails or yields
 *         nothing; otherwise a reference to tmp.
 */
const string& Text::wideToAcp(const wstring& str, string& tmp) throw() {
    if(str.empty())
        return Util::emptyString;
#ifdef _WIN32
    // First call only measures the required length, second call converts.
    int n = WideCharToMultiByte(CP_ACP, 0, str.c_str(), (int)str.length(), NULL, 0, NULL, NULL);
    if(n == 0) {
        return Util::emptyString;
    }
    tmp.resize(n);
    n = WideCharToMultiByte(CP_ACP, 0, str.c_str(), (int)str.length(), &tmp[0], n, NULL, NULL);
    if(n == 0) {
        return Util::emptyString;
    }
    return tmp;
#else
    const size_t failed = static_cast<size_t>(-1);

    // Each call gets its own freshly initialised conversion state. With a NULL
    // state wcsrtombs uses an internal static object shared by every caller,
    // which need not be thread-safe.
    const wchar_t* src = str.c_str();
    mbstate_t state = mbstate_t();

    // Measuring pass: a NULL destination only computes the required byte count.
    // The result stays a size_t so it is not truncated to int.
    const size_t needed = wcsrtombs(NULL, &src, 0, &state);
    if(needed == failed || needed == 0) {
        return Util::emptyString;
    }

    // Converting pass, restarted from the beginning of the input.
    src = str.c_str();
    state = mbstate_t();
    tmp.resize(needed);
    const size_t written = wcsrtombs(&tmp[0], &src, needed, &state);
    if(written == failed || written == 0) {
        return Util::emptyString;
    }
    return tmp;
#endif
}
bool Text::validateUtf8(const string& str) throw() {
string::size_type i = 0;
while(i < str.length()) {
wchar_t dummy = 0;
int j = utf8ToWc(&str[i], dummy);
if(j < 0)
return false;
i += j;
}
return true;
}
/**
 * Convert a UTF-8 string to the system multibyte encoding by going through an
 * intermediate wide string.
 *
 * @param str UTF-8 source string.
 * @param tmp Caller-supplied buffer handed to wideToAcp as its output buffer.
 * @return Whatever wideToAcp returns for the intermediate wide string.
 */
const string& Text::utf8ToAcp(const string& str, string& tmp) throw() {
    wstring wideBuffer;
    const wstring& wide = utf8ToWide(str, wideBuffer);
    return wideToAcp(wide, tmp);
}
/**
 * Decode a UTF-8 string into wide characters.
 *
 * Every sequence that utf8ToWc rejects is replaced by a single '_' and the
 * number of bytes it reported is skipped.
 *
 * NOTE: tgt is not cleared first; the decoded characters are appended to
 * whatever it already holds.
 *
 * @param str UTF-8 source string.
 * @param tgt Caller-supplied buffer the decoded characters are appended to.
 * @return A reference to tgt.
 */
const wstring& Text::utf8ToWide(const string& str, wstring& tgt) throw() {
    const string::size_type total = str.length();
    tgt.reserve(total);

    const char* const bytes = str.c_str();
    string::size_type pos = 0;
    while(pos < total) {
        wchar_t decoded = 0;
        const int consumed = utf8ToWc(bytes + pos, decoded);
        if(consumed >= 0) {
            pos += consumed;
            tgt += decoded;
        } else {
            // Rejected sequence: emit a placeholder and skip the reported bytes.
            tgt += '_';
            pos += -consumed;
        }
    }
    return tgt;
}
/**
 * Convenience overload of acpToUtf8 that supplies its own scratch buffer and
 * returns the result by value.
 */
string Text::acpToUtf8(const string& str) throw() {
    string buffer;
    const string& converted = acpToUtf8(str, buffer);
    return converted;
}
/**
 * Convenience overload of acpToWide that supplies its own scratch buffer and
 * returns the result by value.
 */
wstring Text::acpToWide(const string& str) throw() {
    wstring buffer;
    const wstring& converted = acpToWide(str, buffer);
    return converted;
}
/**
 * Convenience overload of utf8ToAcp that supplies its own scratch buffer and
 * returns the result by value.
 */
string Text::utf8ToAcp(const string& str) throw() {
    string buffer;
    const string& converted = utf8ToAcp(str, buffer);
    return converted;
}
/**
 * Convenience overload of utf8ToWide that decodes into a fresh, empty buffer
 * and returns the result by value.
 */
wstring Text::utf8ToWide(const string& str) throw() {
    wstring buffer;
    const wstring& converted = utf8ToWide(str, buffer);
    return converted;
}
/**
 * Convenience overload of wideToAcp that supplies its own scratch buffer and
 * returns the result by value.
 */
string Text::wideToAcp(const wstring& str) throw() {
    string buffer;
    const string& converted = wideToAcp(str, buffer);
    return converted;
}
/**
 * Convenience overload of wideToUtf8 that supplies its own scratch buffer and
 * returns the result by value.
 */
string Text::wideToUtf8(const wstring& str) throw() {
    string buffer;
    const string& converted = wideToUtf8(str, buffer);
    return converted;
}
} // namespace adchpp