首先:Windows 代码页是1255,而不是1225。
将CP1255转换为Unicode非常简单;毕竟,所有数据都可以在线找到(我使用了Unicode.org的表格),我们在谈论最多256个字符的列表。要将包含“原始”字节的字符串转换为正确的Unicode,只需将每个字节解释为CP1255字符查找正确的Unicode代码点即可。
下面这段快速且简单的JavaScript代码就可以实现此功能;作为额外的奖励,您可以提供一个可选的函数来处理'undefined'字符代码。默认情况下,这些字符将被转换为U+FFFD(Unicode的本地占位符代码点),但您可以使用回调将它们转换为其他任何内容——我的示例会以十六进制插入原始值。
该函数的输出是一个正确的Unicode编码字符串,可以进一步处理成UTF8或根据需要直接使用。例如,这是一个使用Win1255编码、翻译为Unicode并在InDesign中排版的网站的短片段:
![hebrew in indesign](https://istack.dev59.com/xf62Z.webp)
请注意,我强制将其读取为RTL,因此包括HTML标记内的文本。
下面片段中的示例是来自
http://www.shmuelfomberg.com/perlhebtut/chap9.html的一些适当的文本;我在开头附近插入了一个
\xFF
以演示回调函数。其余部分几乎一直到底部都是一个大型查找表和19行非常短的代码。
var in_text = 'çñø \xff ìðå îùäå áëì äñéôåø. äîî... òáøéú! ' +
'ìòáåã áàðâìéú æä ðçîã, àê àðçðå áéùøàì. àðçðå øåöéí ì÷øåà, ' +
'ìëúåá åìòáã òáøéú! ìøåò îæìðå, äîçùá áã"ë òåáã îùîàì ìéîéï. ' +
'ëãé ìùëðò àåúå ìòáåã îéîéï ìùîàì, ãåøù òáåãä. ëãé ìùëðò àåúå ìòáåã ' +
'ãå-ëéååðé, æä áëìì îñåëï. àáì ÷åãí ëì àúä öøéê ëîä îåùâé éñåã.';
out_text = win1255ToUnicode (in_text);
alert (out_text);
out_text = win1255ToUnicode (in_text, insertHexEscape);
alert (out_text);
function insertHexEscape (otherCode)
{
return '\\x'+otherCode.charCodeAt(0).toString(16);
}
function win1255ToUnicode (source, undefHandler )
{
var win1255Encoding = {
"\x00":"\u0000",
"\x01":"\u0001",
"\x02":"\u0002",
"\x03":"\u0003",
"\x04":"\u0004",
"\x05":"\u0005",
"\x06":"\u0006",
"\x07":"\u0007",
"\x08":"\u0008",
"\x09":"\u0009",
"\x0A":"\u000A",
"\x0B":"\u000B",
"\x0C":"\u000C",
"\x0D":"\u000D",
"\x0E":"\u000E",
"\x0F":"\u000F",
"\x10":"\u0010",
"\x11":"\u0011",
"\x12":"\u0012",
"\x13":"\u0013",
"\x14":"\u0014",
"\x15":"\u0015",
"\x16":"\u0016",
"\x17":"\u0017",
"\x18":"\u0018",
"\x19":"\u0019",
"\x1A":"\u001A",
"\x1B":"\u001B",
"\x1C":"\u001C",
"\x1D":"\u001D",
"\x1E":"\u001E",
"\x1F":"\u001F",
"\x20":"\u0020",
"\x21":"\u0021",
"\x22":"\u0022",
"\x23":"\u0023",
"\x24":"\u0024",
"\x25":"\u0025",
"\x26":"\u0026",
"\x27":"\u0027",
"\x28":"\u0028",
"\x29":"\u0029",
"\x2A":"\u002A",
"\x2B":"\u002B",
"\x2C":"\u002C",
"\x2D":"\u002D",
"\x2E":"\u002E",
"\x2F":"\u002F",
"\x30":"\u0030",
"\x31":"\u0031",
"\x32":"\u0032",
"\x33":"\u0033",
"\x34":"\u0034",
"\x35":"\u0035",
"\x36":"\u0036",
"\x37":"\u0037",
"\x38":"\u0038",
"\x39":"\u0039",
"\x3A":"\u003A",
"\x3B":"\u003B",
"\x3C":"\u003C",
"\x3D":"\u003D",
"\x3E":"\u003E",
"\x3F":"\u003F",
"\x40":"\u0040",
"\x41":"\u0041",
"\x42":"\u0042",
"\x43":"\u0043",
"\x44":"\u0044",
"\x45":"\u0045",
"\x46":"\u0046",
"\x47":"\u0047",
"\x48":"\u0048",
"\x49":"\u0049",
"\x4A":"\u004A",
"\x4B":"\u004B",
"\x4C":"\u004C",
"\x4D":"\u004D",
"\x4E":"\u004E",
"\x4F":"\u004F",
"\x50":"\u0050",
"\x51":"\u0051",
"\x52":"\u0052",
"\x53":"\u0053",
"\x54":"\u0054",
"\x55":"\u0055",
"\x56":"\u0056",
"\x57":"\u0057",
"\x58":"\u0058",
"\x59":"\u0059",
"\x5A":"\u005A",
"\x5B":"\u005B",
"\x5C":"\u005C",
"\x5D":"\u005D",
"\x5E":"\u005E",
"\x5F":"\u005F",
"\x60":"\u0060",
"\x61":"\u0061",
"\x62":"\u0062",
"\x63":"\u0063",
"\x64":"\u0064",
"\x65":"\u0065",
"\x66":"\u0066",
"\x67":"\u0067",
"\x68":"\u0068",
"\x69":"\u0069",
"\x6A":"\u006A",
"\x6B":"\u006B",
"\x6C":"\u006C",
"\x6D":"\u006D",
"\x6E":"\u006E",
"\x6F":"\u006F",
"\x70":"\u0070",
"\x71":"\u0071",
"\x72":"\u0072",
"\x73":"\u0073",
"\x74":"\u0074",
"\x75":"\u0075",
"\x76":"\u0076",
"\x77":"\u0077",
"\x78":"\u0078",
"\x79":"\u0079",
"\x7A":"\u007A",
"\x7B":"\u007B",
"\x7C":"\u007C",
"\x7D":"\u007D",
"\x7E":"\u007E",
"\x7F":"\u007F",
"\x80":"\u20AC",
"\x82":"\u201A",
"\x83":"\u0192",
"\x84":"\u201E",
"\x85":"\u2026",
"\x86":"\u2020",
"\x87":"\u2021",
"\x88":"\u02C6",
"\x89":"\u2030",
"\x8B":"\u2039",
"\x91":"\u2018",
"\x92":"\u2019",
"\x93":"\u201C",
"\x94":"\u201D",
"\x95":"\u2022",
"\x96":"\u2013",
"\x97":"\u2014",
"\x98":"\u02DC",
"\x99":"\u2122",
"\x9B":"\u203A",
"\xA0":"\u00A0",
"\xA1":"\u00A1",
"\xA2":"\u00A2",
"\xA3":"\u00A3",
"\xA4":"\u20AA",
"\xA5":"\u00A5",
"\xA6":"\u00A6",
"\xA7":"\u00A7",
"\xA8":"\u00A8",
"\xA9":"\u00A9",
"\xAA":"\u00D7",
"\xAB":"\u00AB",
"\xAC":"\u00AC",
"\xAD":"\u00AD",
"\xAE":"\u00AE",
"\xAF":"\u00AF",
"\xB0":"\u00B0",
"\xB1":"\u00B1",
"\xB2":"\u00B2",
"\xB3":"\u00B3",
"\xB4":"\u00B4",
"\xB5":"\u00B5",
"\xB6":"\u00B6",
"\xB7":"\u00B7",
"\xB8":"\u00B8",
"\xB9":"\u00B9",
"\xBA":"\u00F7",
"\xBB":"\u00BB",
"\xBC":"\u00BC",
"\xBD":"\u00BD",
"\xBE":"\u00BE",
"\xBF":"\u00BF",
"\xC0":"\u05B0",
"\xC1":"\u05B1",
"\xC2":"\u05B2",
"\xC3":"\u05B3",
"\xC4":"\u05B4",
"\xC5":"\u05B5",
"\xC6":"\u05B6",
"\xC7":"\u05B7",
"\xC8":"\u05B8",
"\xC9":"\u05B9",
"\xCB":"\u05BB",
"\xCC":"\u05BC",
"\xCD":"\u05BD",
"\xCE":"\u05BE",
"\xCF":"\u05BF",
"\xD0":"\u05C0",
"\xD1":"\u05C1",
"\xD2":"\u05C2",
"\xD3":"\u05C3",
"\xD4":"\u05F0",
"\xD5":"\u05F1",
"\xD6":"\u05F2",
"\xD7":"\u05F3",
"\xD8":"\u05F4",
"\xE0":"\u05D0",
"\xE1":"\u05D1",
"\xE2":"\u05D2",
"\xE3":"\u05D3",
"\xE4":"\u05D4",
"\xE5":"\u05D5",
"\xE6":"\u05D6",
"\xE7":"\u05D7",
"\xE8":"\u05D8",
"\xE9":"\u05D9",
"\xEA":"\u05DA",
"\xEB":"\u05DB",
"\xEC":"\u05DC",
"\xED":"\u05DD",
"\xEE":"\u05DE",
"\xEF":"\u05DF",
"\xF0":"\u05E0",
"\xF1":"\u05E1",
"\xF2":"\u05E2",
"\xF3":"\u05E3",
"\xF4":"\u05E4",
"\xF5":"\u05E5",
"\xF6":"\u05E6",
"\xF7":"\u05E7",
"\xF8":"\u05E8",
"\xF9":"\u05E9",
"\xFA":"\u05EA",
"\xFD":"\u200E",
"\xFE":"\u200F"
}, i=0,l,dest = '';
l = source.length;
while (l--)
{
if (win1255Encoding[source[i]])
dest += win1255Encoding[source[i]];
else
{
if (undefHandler)
{
dest += undefHandler (source[i]);
} else
{
dest += '\uFFFD';
}
}
i++;
}
return dest;
}