utf 8 - convertir cadena UTF-8 a ASCII en LUA puro
(1)
local char, byte, pairs, floor = string.char, string.byte, pairs, math.floor
local table_insert, table_concat = table.insert, table.concat
local unpack = table.unpack or unpack
local function unicode_to_utf8(code)
-- converts numeric UTF code (U+code) to UTF-8 string
local t, h = {}, 128
while code >= h do
t[#t+1] = 128 + code%64
code = floor(code/64)
h = h > 32 and 32 or h/2
end
t[#t+1] = 256 - 2*h + code
return char(unpack(t)):reverse()
end
local function utf8_to_unicode(utf8str, pos)
-- pos = starting byte position inside input string (default 1)
pos = pos or 1
local code, size = utf8str:byte(pos), 1
if code >= 0xC0 and code < 0xFE then
local mask = 64
code = code - 128
repeat
local next_byte = utf8str:byte(pos + size) or 0
if next_byte >= 0x80 and next_byte < 0xC0 then
code, size = (code - mask - 2) * 64 + next_byte, size + 1
else
code, size = utf8str:byte(pos), 1
end
mask = mask * 32
until code < mask
end
-- returns code, number of bytes in this utf8 char
return code, size
end
local map_1252_to_unicode = {
[0x80] = 0x20AC,
[0x81] = 0x81,
[0x82] = 0x201A,
[0x83] = 0x0192,
[0x84] = 0x201E,
[0x85] = 0x2026,
[0x86] = 0x2020,
[0x87] = 0x2021,
[0x88] = 0x02C6,
[0x89] = 0x2030,
[0x8A] = 0x0160,
[0x8B] = 0x2039,
[0x8C] = 0x0152,
[0x8D] = 0x8D,
[0x8E] = 0x017D,
[0x8F] = 0x8F,
[0x90] = 0x90,
[0x91] = 0x2018,
[0x92] = 0x2019,
[0x93] = 0x201C,
[0x94] = 0x201D,
[0x95] = 0x2022,
[0x96] = 0x2013,
[0x97] = 0x2014,
[0x98] = 0x02DC,
[0x99] = 0x2122,
[0x9A] = 0x0161,
[0x9B] = 0x203A,
[0x9C] = 0x0153,
[0x9D] = 0x9D,
[0x9E] = 0x017E,
[0x9F] = 0x0178,
[0xA0] = 0x00A0,
[0xA1] = 0x00A1,
[0xA2] = 0x00A2,
[0xA3] = 0x00A3,
[0xA4] = 0x00A4,
[0xA5] = 0x00A5,
[0xA6] = 0x00A6,
[0xA7] = 0x00A7,
[0xA8] = 0x00A8,
[0xA9] = 0x00A9,
[0xAA] = 0x00AA,
[0xAB] = 0x00AB,
[0xAC] = 0x00AC,
[0xAD] = 0x00AD,
[0xAE] = 0x00AE,
[0xAF] = 0x00AF,
[0xB0] = 0x00B0,
[0xB1] = 0x00B1,
[0xB2] = 0x00B2,
[0xB3] = 0x00B3,
[0xB4] = 0x00B4,
[0xB5] = 0x00B5,
[0xB6] = 0x00B6,
[0xB7] = 0x00B7,
[0xB8] = 0x00B8,
[0xB9] = 0x00B9,
[0xBA] = 0x00BA,
[0xBB] = 0x00BB,
[0xBC] = 0x00BC,
[0xBD] = 0x00BD,
[0xBE] = 0x00BE,
[0xBF] = 0x00BF,
[0xC0] = 0x00C0,
[0xC1] = 0x00C1,
[0xC2] = 0x00C2,
[0xC3] = 0x00C3,
[0xC4] = 0x00C4,
[0xC5] = 0x00C5,
[0xC6] = 0x00C6,
[0xC7] = 0x00C7,
[0xC8] = 0x00C8,
[0xC9] = 0x00C9,
[0xCA] = 0x00CA,
[0xCB] = 0x00CB,
[0xCC] = 0x00CC,
[0xCD] = 0x00CD,
[0xCE] = 0x00CE,
[0xCF] = 0x00CF,
[0xD0] = 0x00D0,
[0xD1] = 0x00D1,
[0xD2] = 0x00D2,
[0xD3] = 0x00D3,
[0xD4] = 0x00D4,
[0xD5] = 0x00D5,
[0xD6] = 0x00D6,
[0xD7] = 0x00D7,
[0xD8] = 0x00D8,
[0xD9] = 0x00D9,
[0xDA] = 0x00DA,
[0xDB] = 0x00DB,
[0xDC] = 0x00DC,
[0xDD] = 0x00DD,
[0xDE] = 0x00DE,
[0xDF] = 0x00DF,
[0xE0] = 0x00E0,
[0xE1] = 0x00E1,
[0xE2] = 0x00E2,
[0xE3] = 0x00E3,
[0xE4] = 0x00E4,
[0xE5] = 0x00E5,
[0xE6] = 0x00E6,
[0xE7] = 0x00E7,
[0xE8] = 0x00E8,
[0xE9] = 0x00E9,
[0xEA] = 0x00EA,
[0xEB] = 0x00EB,
[0xEC] = 0x00EC,
[0xED] = 0x00ED,
[0xEE] = 0x00EE,
[0xEF] = 0x00EF,
[0xF0] = 0x00F0,
[0xF1] = 0x00F1,
[0xF2] = 0x00F2,
[0xF3] = 0x00F3,
[0xF4] = 0x00F4,
[0xF5] = 0x00F5,
[0xF6] = 0x00F6,
[0xF7] = 0x00F7,
[0xF8] = 0x00F8,
[0xF9] = 0x00F9,
[0xFA] = 0x00FA,
[0xFB] = 0x00FB,
[0xFC] = 0x00FC,
[0xFD] = 0x00FD,
[0xFE] = 0x00FE,
[0xFF] = 0x00FF,
}
local map_unicode_to_1252 = {}
for code1252, code in pairs(map_1252_to_unicode) do
map_unicode_to_1252[code] = code1252
end
function string.fromutf8(utf8str)
local pos, result_1252 = 1, {}
while pos <= #utf8str do
local code, size = utf8_to_unicode(utf8str, pos)
pos = pos + size
code = code < 128 and code or map_unicode_to_1252[code] or (''?''):byte()
table_insert(result_1252, char(code))
end
return table_concat(result_1252)
end
function string.toutf8(str1252)
local result_utf8 = {}
for pos = 1, #str1252 do
local code = str1252:byte(pos)
table_insert(result_utf8, unicode_to_utf8(map_1252_to_unicode[code] or code))
end
return table_concat(result_utf8)
end
Uso:
local str1252 = "1/128" -- "one euro" in latin-1
local str_utf8 = str1252:toutf8() -- "1/226/130/172" -- one euro in utf-8
local str1252_2 = str_utf8:fromutf8()
Tengo una pregunta sobre el envío y recepción de datos con caracteres especiales. (Diéresis alemán)
Cuando envío la cadena "Café Zeezicht" con el siguiente código, en el lado del servidor, la cadena está activa.
Pero, ¿cómo puedo recibir y decodificar los datos de recepción que contienen los mismos caracteres? Ahora parece que le gusta "Caf? Zeezicht"
Estoy buscando una función LUA pura, porque no tengo la capacidad de cargar bibliotecas.
------------------------------------------------------------
-- Function voor converting ASCII naar UTF8
------------------------------------------------------------
-- return char as utf8 string
local function CodeToUTF8 (Unicode)
if (Unicode == nil) then
return ""
end
if (Unicode < 0x20) then return '' ''; end;
if (Unicode <= 0x7F) then return string.char(Unicode); end;
if (Unicode <= 0x7FF) then
local Byte0 = 0xC0 + math.floor(Unicode / 0x40);
local Byte1 = 0x80 + (Unicode % 0x40);
return string.char(Byte0, Byte1);
end;
if (Unicode <= 0xFFFF) then
local Byte0 = 0xE0 + math.floor(Unicode / 0x1000);
local Byte1 = 0x80 + (math.floor(Unicode / 0x40) % 0x40);
local Byte2 = 0x80 + (Unicode % 0x40);
return string.char(Byte0, Byte1, Byte2);
end;
return ""; -- ignore UTF-32 for the moment
end;
-- convert ascii string to utf8 string
function AsciiToUTF8(str)
result = ""
for i = 1, #str do
result = result .. CodeToUTF8(string.byte(str, i, i+1))
end
return result
end
------------------------------------------------------------
-- Einde Function voor converting ASCII naar UTF8
------------------------------------------------------------