|
|
@ -37,6 +37,7 @@ distribution.
|
|
|
|
#include <stdarg.h>
|
|
|
|
#include <stdarg.h>
|
|
|
|
|
|
|
|
|
|
|
|
#include <sstream>
|
|
|
|
#include <sstream>
|
|
|
|
|
|
|
|
#include <map>
|
|
|
|
|
|
|
|
|
|
|
|
std::string stl_sprintf(const char *fmt, ...) {
|
|
|
|
std::string stl_sprintf(const char *fmt, ...) {
|
|
|
|
va_list lst;
|
|
|
|
va_list lst;
|
|
|
@ -150,3 +151,161 @@ uint64_t GetTimeMs64()
|
|
|
|
return ret;
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/* Character decoding */
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
|
|
|
|
|
|
|
|
/* DFA states reported by decode(): a complete code point was read / the input is invalid. */
#define UTF8_ACCEPT 0
#define UTF8_REJECT 12

/*
 * Lookup data for the UTF-8 decoding automaton.
 *
 * Entries [0, 256) classify an input byte; entries from 256 onwards hold the
 * state transitions, indexed by (256 + state + class).
 */
static const uint8_t utf8d[] = {
    // The first part of the table maps bytes to character classes, to
    // reduce the size of the transition table and create bitmasks.
    /* 0x00 */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x80 */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x90 */  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
    /* 0xA0 */  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
    /* 0xB0 */  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
    /* 0xC0 */  8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xD0 */  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 */ 10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
    /* 0xF0 */ 11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,

    // The second part is a transition table that maps a combination
    // of a state of the automaton and a character class to a state.
    // Each row below holds the twelve class entries of one state.
    /* state  0 */  0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72,
    /* state 12 */ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
    /* state 24 */ 12,  0, 12, 12, 12, 12, 12,  0, 12,  0, 12, 12,
    /* state 36 */ 12, 24, 12, 12, 12, 12, 12, 24, 12, 24, 12, 12,
    /* state 48 */ 12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12,
    /* state 60 */ 12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12,
    /* state 72 */ 12, 12, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12,
    /* state 84 */ 12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12,
    /* state 96 */ 12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static inline uint32_t
|
|
|
|
|
|
|
|
decode(uint32_t* state, uint32_t* codep, uint8_t byte) {
|
|
|
|
|
|
|
|
uint32_t type = utf8d[byte];
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*codep = (*state != UTF8_ACCEPT) ?
|
|
|
|
|
|
|
|
(byte & 0x3fu) | (*codep << 6) :
|
|
|
|
|
|
|
|
(0xff >> type) & (byte);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*state = utf8d[256 + *state + type];
|
|
|
|
|
|
|
|
return *state;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/* Character encoding */
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Write the UTF-8 encoding of the 16-bit code point `c` into `out`.
 *
 * out - destination buffer; up to three bytes are written.
 * c   - code point to encode.
 *
 * Returns the number of bytes written (1, 2 or 3).
 */
static inline int encode(uint8_t *out, uint16_t c) {
    // One byte: 0xxxxxxx
    if (c < 0x80) {
        out[0] = c;
        return 1;
    }

    // Every multi-byte form ends with a continuation byte holding the low six bits.
    const uint8_t last = (0x80 | (c & 0x3F));

    // Two bytes: 110xxxxx 10xxxxxx
    if (c < 0x800) {
        out[0] = (0xC0 | (c >> 6));
        out[1] = last;
        return 2;
    }

    // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
    out[0] = (0xE0 | (c >> 12));
    out[1] = (0x80 | ((c >> 6) & 0x3F));
    out[2] = last;
    return 3;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/* CP437 */
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Maps each 8-bit character value (the array index) to a 16-bit Unicode
 * code point. Indices 0x20-0x7E map to themselves; every other index maps
 * to the code point listed below.
 *
 * The table is read-only lookup data, so it is declared const like utf8d.
 */
static const uint16_t character_table[256] = {
    /* 0x00 */ 0,      0x263A, 0x263B, 0x2665, 0x2666, 0x2663, 0x2660, 0x2022,
    /* 0x08 */ 0x25D8, 0x25CB, 0x25D9, 0x2642, 0x2640, 0x266A, 0x266B, 0x263C,
    /* 0x10 */ 0x25BA, 0x25C4, 0x2195, 0x203C, 0xB6,   0xA7,   0x25AC, 0x21A8,
    /* 0x18 */ 0x2191, 0x2193, 0x2192, 0x2190, 0x221F, 0x2194, 0x25B2, 0x25BC,
    /* 0x20 */ 0x20,   0x21,   0x22,   0x23,   0x24,   0x25,   0x26,   0x27,
    /* 0x28 */ 0x28,   0x29,   0x2A,   0x2B,   0x2C,   0x2D,   0x2E,   0x2F,
    /* 0x30 */ 0x30,   0x31,   0x32,   0x33,   0x34,   0x35,   0x36,   0x37,
    /* 0x38 */ 0x38,   0x39,   0x3A,   0x3B,   0x3C,   0x3D,   0x3E,   0x3F,
    /* 0x40 */ 0x40,   0x41,   0x42,   0x43,   0x44,   0x45,   0x46,   0x47,
    /* 0x48 */ 0x48,   0x49,   0x4A,   0x4B,   0x4C,   0x4D,   0x4E,   0x4F,
    /* 0x50 */ 0x50,   0x51,   0x52,   0x53,   0x54,   0x55,   0x56,   0x57,
    /* 0x58 */ 0x58,   0x59,   0x5A,   0x5B,   0x5C,   0x5D,   0x5E,   0x5F,
    /* 0x60 */ 0x60,   0x61,   0x62,   0x63,   0x64,   0x65,   0x66,   0x67,
    /* 0x68 */ 0x68,   0x69,   0x6A,   0x6B,   0x6C,   0x6D,   0x6E,   0x6F,
    /* 0x70 */ 0x70,   0x71,   0x72,   0x73,   0x74,   0x75,   0x76,   0x77,
    /* 0x78 */ 0x78,   0x79,   0x7A,   0x7B,   0x7C,   0x7D,   0x7E,   0x2302,
    /* 0x80 */ 0xC7,   0xFC,   0xE9,   0xE2,   0xE4,   0xE0,   0xE5,   0xE7,
    /* 0x88 */ 0xEA,   0xEB,   0xE8,   0xEF,   0xEE,   0xEC,   0xC4,   0xC5,
    /* 0x90 */ 0xC9,   0xE6,   0xC6,   0xF4,   0xF6,   0xF2,   0xFB,   0xF9,
    /* 0x98 */ 0xFF,   0xD6,   0xDC,   0xA2,   0xA3,   0xA5,   0x20A7, 0x192,
    /* 0xA0 */ 0xE1,   0xED,   0xF3,   0xFA,   0xF1,   0xD1,   0xAA,   0xBA,
    /* 0xA8 */ 0xBF,   0x2310, 0xAC,   0xBD,   0xBC,   0xA1,   0xAB,   0xBB,
    /* 0xB0 */ 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556,
    /* 0xB8 */ 0x2555, 0x2563, 0x2551, 0x2557, 0x255D, 0x255C, 0x255B, 0x2510,
    /* 0xC0 */ 0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x255E, 0x255F,
    /* 0xC8 */ 0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256C, 0x2567,
    /* 0xD0 */ 0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256B,
    /* 0xD8 */ 0x256A, 0x2518, 0x250C, 0x2588, 0x2584, 0x258C, 0x2590, 0x2580,
    /* 0xE0 */ 0x3B1,  0xDF,   0x393,  0x3C0,  0x3A3,  0x3C3,  0xB5,   0x3C4,
    /* 0xE8 */ 0x3A6,  0x398,  0x3A9,  0x3B4,  0x221E, 0x3C6,  0x3B5,  0x2229,
    /* 0xF0 */ 0x2261, 0xB1,   0x2265, 0x2264, 0x2320, 0x2321, 0xF7,   0x2248,
    /* 0xF8 */ 0xB0,   0x2219, 0xB7,   0x221A, 0x207F, 0xB2,   0x25A0, 0xA0
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
std::string DF2UTF(const std::string &in)
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
std::string out;
|
|
|
|
|
|
|
|
out.reserve(in.size());
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
uint8_t buf[4];
|
|
|
|
|
|
|
|
for (size_t i = 0; i < in.size(); i++)
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
int cnt = encode(buf, character_table[(uint8_t)in[i]]);
|
|
|
|
|
|
|
|
out.append(&buf[0], &buf[cnt]);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return out;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
std::string UTF2DF(const std::string &in)
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
// Unicode to normal lookup table
|
|
|
|
|
|
|
|
static std::map<uint32_t, char> ctable;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (ctable.empty())
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
for (uint16_t i = 0; i < 256; i++)
|
|
|
|
|
|
|
|
if (character_table[i] != i)
|
|
|
|
|
|
|
|
ctable[character_table[i]] = char(i);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Actual conversion loop
|
|
|
|
|
|
|
|
size_t size = in.size();
|
|
|
|
|
|
|
|
std::string out(size, char(0));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
uint32_t codepoint = 0;
|
|
|
|
|
|
|
|
uint32_t state = UTF8_ACCEPT, prev = UTF8_ACCEPT;
|
|
|
|
|
|
|
|
uint32_t pos = 0;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < size; prev = state, i++) {
|
|
|
|
|
|
|
|
switch (decode(&state, &codepoint, uint8_t(in[i]))) {
|
|
|
|
|
|
|
|
case UTF8_ACCEPT:
|
|
|
|
|
|
|
|
if (codepoint < 256 && character_table[codepoint] == codepoint) {
|
|
|
|
|
|
|
|
out[pos++] = char(codepoint);
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
char v = ctable[codepoint];
|
|
|
|
|
|
|
|
out[pos++] = v ? v : '?';
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
case UTF8_REJECT:
|
|
|
|
|
|
|
|
out[pos++] = '?';
|
|
|
|
|
|
|
|
if (prev != UTF8_ACCEPT) --i;
|
|
|
|
|
|
|
|
state = UTF8_ACCEPT;
|
|
|
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (pos != size)
|
|
|
|
|
|
|
|
out.resize(pos);
|
|
|
|
|
|
|
|
return out;
|
|
|
|
|
|
|
|
}
|
|
|
|