Encode & decode names in utf-8 for transfer in remote messages.

That's the encoding required by the protobuf spec.
develop
Alexander Gavrilov 2012-04-05 18:10:16 +04:00
parent 59f411e401
commit 28a741082f
5 changed files with 201 additions and 12 deletions

@ -113,3 +113,27 @@ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------
See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

@ -37,6 +37,7 @@ distribution.
#include <stdarg.h>
#include <sstream>
#include <map>
std::string stl_sprintf(const char *fmt, ...) {
va_list lst;
@ -149,4 +150,162 @@ uint64_t GetTimeMs64()
return ret;
}
#endif
#endif
/* Character decoding */
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
#define UTF8_ACCEPT 0
#define UTF8_REJECT 12
static const uint8_t utf8d[] = {
// The first part of the table maps bytes to character classes that
// to reduce the size of the transition table and create bitmasks.
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
// The second part is a transition table that maps a combination
// of a state of the automaton and a character class to a state.
0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
12,36,12,12,12,12,12,12,12,12,12,12,
};
static inline uint32_t
decode(uint32_t* state, uint32_t* codep, uint8_t byte) {
uint32_t type = utf8d[byte];
*codep = (*state != UTF8_ACCEPT) ?
(byte & 0x3fu) | (*codep << 6) :
(0xff >> type) & (byte);
*state = utf8d[256 + *state + type];
return *state;
}
/* Character encoding */
static inline int encode(uint8_t *out, uint16_t c) {
if (c <= 0x7F)
{
out[0] = c;
return 1;
}
else if (c <= 0x7FF)
{
out[0] = (0xC0 | (c >> 6));
out[1] = (0x80 | (c & 0x3F));
return 2;
}
else /*if (c <= 0xFFFF)*/
{
out[0] = (0xE0 | (c >> 12));
out[1] = (0x80 | ((c >> 6) & 0x3F));
out[2] = (0x80 | (c & 0x3F));
return 3;
}
}
/* CP437 */
static uint16_t character_table[256] = {
0, 0x263A, 0x263B, 0x2665, 0x2666, 0x2663, 0x2660, 0x2022, //
0x25D8, 0x25CB, 0x25D9, 0x2642, 0x2640, 0x266A, 0x266B, 0x263C,
0x25BA, 0x25C4, 0x2195, 0x203C, 0xB6, 0xA7, 0x25AC, 0x21A8, //
0x2191, 0x2193, 0x2192, 0x2190, 0x221F, 0x2194, 0x25B2, 0x25BC,
0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, //
0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, //
0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F,
0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, //
0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, //
0x58, 0x59, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, //
0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, //
0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x2302,
0xC7, 0xFC, 0xE9, 0xE2, 0xE4, 0xE0, 0xE5, 0xE7, //
0xEA, 0xEB, 0xE8, 0xEF, 0xEE, 0xEC, 0xC4, 0xC5,
0xC9, 0xE6, 0xC6, 0xF4, 0xF6, 0xF2, 0xFB, 0xF9, //
0xFF, 0xD6, 0xDC, 0xA2, 0xA3, 0xA5, 0x20A7, 0x192,
0xE1, 0xED, 0xF3, 0xFA, 0xF1, 0xD1, 0xAA, 0xBA, //
0xBF, 0x2310, 0xAC, 0xBD, 0xBC, 0xA1, 0xAB, 0xBB,
0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556, //
0x2555, 0x2563, 0x2551, 0x2557, 0x255D, 0x255C, 0x255B, 0x2510,
0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x255E, 0x255F, //
0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256C, 0x2567,
0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256B, //
0x256A, 0x2518, 0x250C, 0x2588, 0x2584, 0x258C, 0x2590, 0x2580,
0x3B1, 0xDF, 0x393, 0x3C0, 0x3A3, 0x3C3, 0xB5, 0x3C4, //
0x3A6, 0x398, 0x3A9, 0x3B4, 0x221E, 0x3C6, 0x3B5, 0x2229,
0x2261, 0xB1, 0x2265, 0x2264, 0x2320, 0x2321, 0xF7, 0x2248, //
0xB0, 0x2219, 0xB7, 0x221A, 0x207F, 0xB2, 0x25A0, 0xA0
};
std::string DF2UTF(const std::string &in)
{
std::string out;
out.reserve(in.size());
uint8_t buf[4];
for (size_t i = 0; i < in.size(); i++)
{
int cnt = encode(buf, character_table[(uint8_t)in[i]]);
out.append(&buf[0], &buf[cnt]);
}
return out;
}
std::string UTF2DF(const std::string &in)
{
// Unicode to normal lookup table
static std::map<uint32_t, char> ctable;
if (ctable.empty())
{
for (uint16_t i = 0; i < 256; i++)
if (character_table[i] != i)
ctable[character_table[i]] = char(i);
}
// Actual conversion loop
size_t size = in.size();
std::string out(size, char(0));
uint32_t codepoint = 0;
uint32_t state = UTF8_ACCEPT, prev = UTF8_ACCEPT;
uint32_t pos = 0;
for (unsigned i = 0; i < size; prev = state, i++) {
switch (decode(&state, &codepoint, uint8_t(in[i]))) {
case UTF8_ACCEPT:
if (codepoint < 256 && character_table[codepoint] == codepoint) {
out[pos++] = char(codepoint);
} else {
char v = ctable[codepoint];
out[pos++] = v ? v : '?';
}
break;
case UTF8_REJECT:
out[pos++] = '?';
if (prev != UTF8_ACCEPT) --i;
state = UTF8_ACCEPT;
break;
}
}
if (pos != size)
out.resize(pos);
return out;
}

@ -221,30 +221,30 @@ void DFHack::describeMaterial(BasicMaterialInfo *info, const MaterialInfo &mat,
void DFHack::describeName(NameInfo *info, df::language_name *name)
{
if (!name->first_name.empty())
info->set_first_name(name->first_name);
info->set_first_name(DF2UTF(name->first_name));
if (!name->nickname.empty())
info->set_nickname(name->nickname);
info->set_nickname(DF2UTF(name->nickname));
if (name->language >= 0)
info->set_language_id(name->language);
std::string lname = Translation::TranslateName(name, false, true);
if (!lname.empty())
info->set_last_name(lname);
info->set_last_name(DF2UTF(lname));
lname = Translation::TranslateName(name, true, true);
if (!lname.empty())
info->set_english_name(lname);
info->set_english_name(DF2UTF(lname));
}
void DFHack::describeNameTriple(NameTriple *info, const std::string &name,
const std::string &plural, const std::string &adj)
{
info->set_normal(name);
info->set_normal(DF2UTF(name));
if (!plural.empty() && plural != name)
info->set_plural(plural);
info->set_plural(DF2UTF(plural));
if (!adj.empty() && adj != name)
info->set_adjective(adj);
info->set_adjective(DF2UTF(adj));
}
void DFHack::describeUnit(BasicUnitInfo *info, df::unit *unit,

@ -279,3 +279,7 @@ DFHACK_EXPORT uint64_t GetTimeMs64();
DFHACK_EXPORT std::string stl_sprintf(const char *fmt, ...);
DFHACK_EXPORT std::string stl_vsprintf(const char *fmt, va_list args);
// Conversion between CP437 and UTF-8
DFHACK_EXPORT std::string UTF2DF(const std::string &in);
DFHACK_EXPORT std::string DF2UTF(const std::string &in);

@ -20,6 +20,8 @@
#include "RemoteServer.h"
#include "rename.pb.h"
#include "MiscUtils.h"
#include <stdlib.h>
using std::vector;
@ -128,9 +130,9 @@ static command_result RenameSquad(color_ostream &stream, const RenameSquadIn *in
return CR_NOT_FOUND;
if (in->has_nickname())
set_nickname(&squad->name, in->nickname());
set_nickname(&squad->name, UTF2DF(in->nickname()));
if (in->has_alias())
squad->alias = in->alias();
squad->alias = UTF2DF(in->alias());
return CR_OK;
}
@ -142,9 +144,9 @@ static command_result RenameUnit(color_ostream &stream, const RenameUnitIn *in)
return CR_NOT_FOUND;
if (in->has_nickname())
setUnitNickname(unit, in->nickname());
setUnitNickname(unit, UTF2DF(in->nickname()));
if (in->has_profession())
unit->custom_profession = in->profession();
unit->custom_profession = UTF2DF(in->profession());
return CR_OK;
}