diff --git a/LICENSE b/LICENSE
index 5ea59addf..96ab022d9 100644
--- a/LICENSE
+++ b/LICENSE
@@ -113,3 +113,27 @@ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+--------------------------------------------------------------------------
+
+See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
+
+Copyright (c) 2008-2010 Bjoern Hoehrmann
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
diff --git a/library/MiscUtils.cpp b/library/MiscUtils.cpp
index 8247cd002..9b26e2a61 100644
--- a/library/MiscUtils.cpp
+++ b/library/MiscUtils.cpp
@@ -37,6 +37,7 @@ distribution.
 #include
 
 #include
+#include <map>
 
 std::string stl_sprintf(const char *fmt, ...) {
     va_list lst;
@@ -149,4 +150,181 @@ uint64_t GetTimeMs64()
     return ret;
 }
 
-#endif
\ No newline at end of file
+#endif
+
+/* Character decoding */
+
+// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
+#define UTF8_ACCEPT 0
+#define UTF8_REJECT 12
+
+static const uint8_t utf8d[] = {
+  // The first part of the table maps bytes to character classes
+  // to reduce the size of the transition table and create bitmasks.
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+  8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
+
+  // The second part is a transition table that maps a combination
+  // of a state of the automaton and a character class to a state.
+  0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
+  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
+  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
+  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
+  12,36,12,12,12,12,12,12,12,12,12,12,
+};
+
+// Feeds one input byte to the UTF-8 decoding automaton and returns the new
+// state: UTF8_ACCEPT once *codep holds a complete code point, UTF8_REJECT
+// for a malformed sequence, any other value while more bytes are expected.
+static inline uint32_t
+decode(uint32_t* state, uint32_t* codep, uint8_t byte) {
+  uint32_t type = utf8d[byte];
+
+  *codep = (*state != UTF8_ACCEPT) ?
+    (byte & 0x3fu) | (*codep << 6) :
+    (0xff >> type) & (byte);
+
+  *state = utf8d[256 + *state + type];
+  return *state;
+}
+
+/* Character encoding */
+
+// Writes the UTF-8 encoding of the 16-bit code point c to out and returns
+// the number of bytes written (1 to 3).
+static inline int encode(uint8_t *out, uint16_t c) {
+    if (c <= 0x7F)
+    {
+        out[0] = c;
+        return 1;
+    }
+    else if (c <= 0x7FF)
+    {
+        out[0] = (0xC0 | (c >> 6));
+        out[1] = (0x80 | (c & 0x3F));
+        return 2;
+    }
+    else /*if (c <= 0xFFFF)*/
+    {
+        out[0] = (0xE0 | (c >> 12));
+        out[1] = (0x80 | ((c >> 6) & 0x3F));
+        out[2] = (0x80 | (c & 0x3F));
+        return 3;
+    }
+}
+
+/* CP437 */
+
+// Unicode code point for each CP437 byte value.
+static const uint16_t character_table[256] = {
+    0, 0x263A, 0x263B, 0x2665, 0x2666, 0x2663, 0x2660, 0x2022, //
+    0x25D8, 0x25CB, 0x25D9, 0x2642, 0x2640, 0x266A, 0x266B, 0x263C,
+    0x25BA, 0x25C4, 0x2195, 0x203C, 0xB6, 0xA7, 0x25AC, 0x21A8, //
+    0x2191, 0x2193, 0x2192, 0x2190, 0x221F, 0x2194, 0x25B2, 0x25BC,
+    0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, //
+    0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
+    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, //
+    0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F,
+    0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, //
+    0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
+    0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, //
+    0x58, 0x59, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
+    0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, //
+    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
+    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, //
+    0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x2302,
+    0xC7, 0xFC, 0xE9, 0xE2, 0xE4, 0xE0, 0xE5, 0xE7, //
+    0xEA, 0xEB, 0xE8, 0xEF, 0xEE, 0xEC, 0xC4, 0xC5,
+    0xC9, 0xE6, 0xC6, 0xF4, 0xF6, 0xF2, 0xFB, 0xF9, //
+    0xFF, 0xD6, 0xDC, 0xA2, 0xA3, 0xA5, 0x20A7, 0x192,
+    0xE1, 0xED, 0xF3, 0xFA, 0xF1, 0xD1, 0xAA, 0xBA, //
+    0xBF, 0x2310, 0xAC, 0xBD, 0xBC, 0xA1, 0xAB, 0xBB,
+    0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556, //
+    0x2555, 0x2563, 0x2551, 0x2557, 0x255D, 0x255C, 0x255B, 0x2510,
+    0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x255E, 0x255F, //
+    0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256C, 0x2567,
+    0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256B, //
+    0x256A, 0x2518, 0x250C, 0x2588, 0x2584, 0x258C, 0x2590, 0x2580,
+    0x3B1, 0xDF, 0x393, 0x3C0, 0x3A3, 0x3C3, 0xB5, 0x3C4, //
+    0x3A6, 0x398, 0x3A9, 0x3B4, 0x221E, 0x3C6, 0x3B5, 0x2229,
+    0x2261, 0xB1, 0x2265, 0x2264, 0x2320, 0x2321, 0xF7, 0x2248, //
+    0xB0, 0x2219, 0xB7, 0x221A, 0x207F, 0xB2, 0x25A0, 0xA0
+};
+
+// Converts a CP437 string to UTF-8: every input byte is mapped through
+// character_table and the resulting code point is UTF-8 encoded.
+std::string DF2UTF(const std::string &in)
+{
+    std::string out;
+    out.reserve(in.size());
+
+    uint8_t buf[4];
+    for (size_t i = 0; i < in.size(); i++)
+    {
+        int cnt = encode(buf, character_table[(uint8_t)in[i]]);
+        out.append(&buf[0], &buf[cnt]);
+    }
+
+    return out;
+}
+
+// Converts a UTF-8 string to CP437. Code points without a CP437 equivalent,
+// and malformed or truncated sequences, are replaced with '?'.
+std::string UTF2DF(const std::string &in)
+{
+    // Unicode to normal lookup table
+    static std::map<uint32_t, char> ctable;
+
+    if (ctable.empty())
+    {
+        for (uint16_t i = 0; i < 256; i++)
+            if (character_table[i] != i)
+                ctable[character_table[i]] = char(i);
+    }
+
+    // Actual conversion loop
+    size_t size = in.size();
+    std::string out(size, char(0));
+
+    uint32_t codepoint = 0;
+    uint32_t state = UTF8_ACCEPT, prev = UTF8_ACCEPT;
+    size_t pos = 0;
+
+    for (size_t i = 0; i < size; prev = state, i++) {
+        switch (decode(&state, &codepoint, uint8_t(in[i]))) {
+        case UTF8_ACCEPT:
+            if (codepoint < 256 && character_table[codepoint] == codepoint) {
+                out[pos++] = char(codepoint);
+            } else {
+                // Use find(), not operator[]: a failed lookup must not
+                // insert a new entry into the shared static table.
+                std::map<uint32_t, char>::const_iterator it = ctable.find(codepoint);
+                out[pos++] = (it != ctable.end()) ? it->second : '?';
+            }
+            break;
+
+        case UTF8_REJECT:
+            out[pos++] = '?';
+            // The byte broke a pending sequence; step back to decode it afresh.
+            if (prev != UTF8_ACCEPT) --i;
+            state = UTF8_ACCEPT;
+            break;
+        }
+    }
+
+    // A sequence cut off by the end of the input produced no output above;
+    // mark it like any other malformed sequence. At least one input byte is
+    // still unaccounted for here, so pos < size.
+    if (state != UTF8_ACCEPT)
+        out[pos++] = '?';
+
+    if (pos != size)
+        out.resize(pos);
+    return out;
+}
diff --git a/library/RemoteTools.cpp b/library/RemoteTools.cpp
index 229e5a70c..01c110643 100644
--- a/library/RemoteTools.cpp
+++ b/library/RemoteTools.cpp
@@ -221,30 +221,30 @@ void DFHack::describeMaterial(BasicMaterialInfo *info, const MaterialInfo &mat,
 void DFHack::describeName(NameInfo *info, df::language_name *name)
 {
     if (!name->first_name.empty())
-        info->set_first_name(name->first_name);
+        info->set_first_name(DF2UTF(name->first_name));
     if (!name->nickname.empty())
-        info->set_nickname(name->nickname);
+        info->set_nickname(DF2UTF(name->nickname));
 
     if (name->language >= 0)
         info->set_language_id(name->language);
 
     std::string lname = Translation::TranslateName(name, false, true);
     if (!lname.empty())
-        info->set_last_name(lname);
+        info->set_last_name(DF2UTF(lname));
 
     lname = Translation::TranslateName(name, true, true);
     if (!lname.empty())
-        info->set_english_name(lname);
+        info->set_english_name(DF2UTF(lname));
 }
 
 void DFHack::describeNameTriple(NameTriple *info, const std::string &name,
                                 const std::string &plural, const std::string &adj)
 {
-    info->set_normal(name);
+    info->set_normal(DF2UTF(name));
     if (!plural.empty() && plural != name)
-        info->set_plural(plural);
+        info->set_plural(DF2UTF(plural));
     if (!adj.empty() && adj != name)
-        info->set_adjective(adj);
+        info->set_adjective(DF2UTF(adj));
 }
 
 void DFHack::describeUnit(BasicUnitInfo *info, df::unit *unit,
diff --git a/library/include/MiscUtils.h b/library/include/MiscUtils.h
index e5ecb25f4..c2a153eb1 100644
--- a/library/include/MiscUtils.h
+++ b/library/include/MiscUtils.h
@@ -279,3 +279,7 @@ DFHACK_EXPORT uint64_t GetTimeMs64();
 
 DFHACK_EXPORT std::string stl_sprintf(const char *fmt, ...);
 DFHACK_EXPORT std::string stl_vsprintf(const char *fmt, va_list args);
+
+// Conversion between CP437 and UTF-8
+DFHACK_EXPORT std::string UTF2DF(const std::string &in);
+DFHACK_EXPORT std::string DF2UTF(const std::string &in);
diff --git a/plugins/rename.cpp b/plugins/rename.cpp
index 8dacf62a9..d983d0962 100644
--- a/plugins/rename.cpp
+++ b/plugins/rename.cpp
@@ -20,6 +20,8 @@
 #include "RemoteServer.h"
 #include "rename.pb.h"
 
+#include "MiscUtils.h"
+
 #include
 
 using std::vector;
@@ -128,9 +130,9 @@ static command_result RenameSquad(color_ostream &stream, const RenameSquadIn *in
         return CR_NOT_FOUND;
 
     if (in->has_nickname())
-        set_nickname(&squad->name, in->nickname());
+        set_nickname(&squad->name, UTF2DF(in->nickname()));
     if (in->has_alias())
-        squad->alias = in->alias();
+        squad->alias = UTF2DF(in->alias());
 
     return CR_OK;
 }
@@ -142,9 +144,9 @@ static command_result RenameUnit(color_ostream &stream, const RenameUnitIn *in)
         return CR_NOT_FOUND;
 
     if (in->has_nickname())
-        setUnitNickname(unit, in->nickname());
+        setUnitNickname(unit, UTF2DF(in->nickname()));
     if (in->has_profession())
-        unit->custom_profession = in->profession();
+        unit->custom_profession = UTF2DF(in->profession());
 
     return CR_OK;
 }