从UTF8获取字符串（无效的URI：字符串中有无效序列）

Question

从UTF8获取字符串（无效的URI：字符串中有无效序列）

3

请问能否帮我了解以下代码中为什么第二行会抛出异常？

// %2F is the percent-encoding of the single byte 0x2F, i.e. ASCII '/'.
string line1 = Uri.UnescapeDataString("Disk:%2FFolder");
// %C0%AF is the byte pair 0xC0 0xAF: a two-byte UTF-8 form whose payload bits
// also equal 0x2F ('/'), i.e. an overlong (non-shortest) encoding.
// NOTE(review): this is the call the question reports as throwing; confirm on the target framework.
string line2 = Uri.UnescapeDataString("Disk:%C0%AFFolder");

也许还有其他函数能够成功解码"Disk:%C0%AFFolder"吗？

- walruz

1

FYI：这与UTF-8完全无关。 - Kris Vandermotten

"%c0%af"是一个非法的Unicode表示"/." <- 可能与此有关来源 - jjjjjjjjjjjjjjjjjjjj

%2F 和 %C0%AF 在使用 UTF-8 编码时都代表着 '/'，是吗？ - walruz

2个回答

0

我花了很多时间试图找到我的问题的答案。最终，我放弃了并决定自己编写解码器。

代码如下。如果你想解码可能包含过长（overlong）序列的UTF-8字节，可以考虑使用下面的代码：

    /// <summary>
    /// Decodes UTF-8 bytes into a string while deliberately accepting overlong
    /// (non-shortest-form) sequences and the legacy 5- and 6-byte forms, which
    /// the framework decoders reject.
    /// </summary>
    /// <param name="bytes">The raw bytes to decode.</param>
    /// <returns>
    /// The decoded text. A well-formed sequence whose value is a surrogate or
    /// lies above U+10FFFF is emitted as U+FFFD.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null.</exception>
    /// <exception cref="FormatException">
    /// A lead byte is not a valid UTF-8 lead, a sequence is truncated, or a
    /// continuation byte does not have the form 10xxxxxx.
    /// </exception>
    private static string GetString(byte[] bytes)
    {
        if (bytes == null)
            throw new ArgumentNullException("bytes");

        StringBuilder builder = new StringBuilder(bytes.Length);
        int index = 0;

        while (index < bytes.Length)
        {
            byte lead = bytes[index];
            int length;
            int code;

            // Classify the lead byte by its full prefix (not a single bit), so a
            // stray continuation byte (10xxxxxx) or 0xFE/0xFF can never start a sequence.
            if ((lead & 0x80) == 0x00)      { length = 1; code = lead; }
            else if ((lead & 0xE0) == 0xC0) { length = 2; code = lead & 0x1F; }
            else if ((lead & 0xF0) == 0xE0) { length = 3; code = lead & 0x0F; }
            else if ((lead & 0xF8) == 0xF0) { length = 4; code = lead & 0x07; }
            else if ((lead & 0xFC) == 0xF8) { length = 5; code = lead & 0x03; }
            else if ((lead & 0xFE) == 0xFC) { length = 6; code = lead & 0x01; }
            else
                throw new FormatException("Wrong UTF-8 format");

            // The whole sequence must be present in the buffer.
            if (index + length > bytes.Length)
                throw new FormatException("Wrong UTF-8 format");

            for (int offset = 1; offset < length; offset++)
            {
                byte continuation = bytes[index + offset];

                // Both high bits matter: 10xxxxxx. Checking only bit 6 would let
                // plain ASCII bytes be swallowed as continuation bytes.
                if ((continuation & 0xC0) != 0x80)
                    throw new FormatException("Wrong UTF-8 format");

                code = (code << 6) | (continuation & 0x3F);
            }

            // Overlong forms are accepted on purpose; only values that are not
            // Unicode scalar values are replaced.
            if (code > 0x10FFFF || (code >= 0xD800 && code <= 0xDFFF))
                builder.Append('\uFFFD');
            else
                builder.Append(char.ConvertFromUtf32(code));

            index += length;
        }

        return builder.ToString();
    }

    /// <summary>
    /// Demo entry point: percent-decodes a sample path to raw bytes and passes
    /// those bytes to GetString.
    /// </summary>
    public static void Main()
    {
        // %FC%80%80%80%80%AF is a six-byte sequence; the remaining characters are plain ASCII.
        const string encoded = "Disk:%FC%80%80%80%80%AFFolder";

        byte[] raw = HttpUtility.UrlDecodeToBytes(encoded);
        string decoded = GetString(raw);
    }

- walruz

网页内容由stack overflow 提供, 点击上面的

可以查看英文原文，
原文链接

- Perfect28 · Accepted Answer

URL 使用的是 7 位 ASCII 字符。W3Schools 说：

只有使用ASCII字符集才能通过互联网发送URL。

%C0和%AF没有进行8位编码。这就是为什么Uri.UnescapeDataString无法对其进行解码的原因。