如何解码已编码的 HttpWebResponse?

4
我有一段代码,用于从 URL 获取页面的 HTML,但返回的响应内容似乎经过了编码(压缩)。

代码:

    // Build a GET request for a YouTube watch page and read the whole body into `html`.
    HttpWebRequest xhr = (HttpWebRequest) WebRequest.Create(new Uri("https://www.youtube.com/watch?v=_Ewh75YGIGQ"));
        // Only gzip and deflate are enabled for automatic decompression here.
        xhr.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
        //xhr.CookieContainer = request.Account.CookieContainer;
        xhr.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
        // NOTE(review): this hand-written Accept-Encoding also advertises "br" (Brotli),
        // which is not among the AutomaticDecompression flags set above. A body sent with
        // Content-Encoding: br is therefore not decoded before it reaches the StreamReader.
        xhr.Headers["Accept-Encoding"] = "gzip, deflate, br";
        xhr.Headers["Accept-Language"] = "en-US,en;q=0.5";
        xhr.Headers["Upgrade-Insecure-Requests"] = "1";
        xhr.KeepAlive = true;
        xhr.UserAgent = "Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)";
        xhr.Host = "www.youtube.com";
        xhr.Referer = "https://www.youtube.com/watch?v=6aCpYxzRkf4";
        // Send the request; `response` itself is not disposed in this snippet.
        var response = xhr.GetResponse();
        string html;
        // No encoding is passed to StreamReader, so its default text decoding is applied
        // to whatever bytes the response stream yields.
        using (StreamReader reader = new StreamReader(response.GetResponseStream()))
        {
            html = reader.ReadToEnd();
        }

这些是响应头:
    X-XSS-Protection: 1; mode=block; report=https://www.google.com/appserve/security-bugs/log/youtube
    X-Content-Type-Options: nosniff
    X-Frame-Options: SAMEORIGIN
    Strict-Transport-Security: max-age=31536000
    Content-Encoding: br
    Transfer-Encoding: chunked
    Alt-Svc: quic=":443"; ma=2592000; v="44,43,39,35"
    Cache-Control: no-cache
    Content-Type: text/html; charset=utf-8
    Date: Sat, 24 Nov 2018 11:30:38 GMT
    Expires: Tue, 27 Apr 1971 19:44:06 EST
    P3P: CP="This is not a P3P policy! See http://support.google.com/accounts/answer/151657?hl=it for more info."
    Set-Cookie: PREF=f1=50000000&al=it; path=/; domain=.youtube.com; expires=Thu, 25-Jul-2019 23:23:38 GMT
    Server: YouTube Frontend Proxy

使用 StreamReader.ReadToEnd() 将响应读取为字符串后,结果看起来是这样的(原帖此处为截图链接)。


可能与“.NET 的 HttpWebResponse 是否会自动解压 GZip 和 Deflate 响应?”这一问题重复。 - ProgrammingLlama
@John 这是结果:https://imgur.com/a/RGMOl4i,仍然无法工作。 - garry man
1
请问您能否编辑您的问题,提供足够的信息让我们尝试复制这个问题吗?谢谢 :-) - ProgrammingLlama
1
@John,我已经用文本替换了变量。如果您需要更多关于请求的信息,请指出。我还删除了Cookie,因为它们不是必需的。 - garry man
@John,问题可能是什么? - garry man
显示剩余3条评论
3个回答

7

是的,以上答案是正确的。服务器生成的响应采用的是br编码。您需要对其进行解码。默认系统压缩包不支持br编码,您需要安装Brotli.net nuget包。

将以下代码添加到您的代码中,以覆盖 3 种主要的编码类型:gzip、br 和 deflate

            // Fetch the response and wrap its body stream in the decompressor that
            // matches the Content-Encoding header (gzip, deflate or Brotli).
            HttpWebResponse response = (HttpWebResponse)webRequest.GetResponse();
            Stream responseStream = response.GetResponseStream();

            // Content-Encoding is a protocol token: compare it ordinally and ignoring case
            // rather than lower-casing with the current culture (under a Turkish culture
            // "GZIP".ToLower() yields a dotless i and would never contain "gzip").
            // A missing header is treated as "no encoding".
            string contentEncoding = response.ContentEncoding ?? string.Empty;

            if (contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                responseStream = new GZipStream(responseStream, CompressionMode.Decompress);
            }
            else if (contentEncoding.IndexOf("deflate", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                responseStream = new DeflateStream(responseStream, CompressionMode.Decompress);
            }
            else if (contentEncoding.IndexOf("br", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                responseStream = new BrotliStream(responseStream, CompressionMode.Decompress);
            }

4
答案在响应头中:Content-Encoding: br -> 这意味着 Brotli 压缩。
有一个 .NET 实现(NuGet 包):
将此添加到您的项目中,添加 "using Brotli; " 并用以下代码替换 "using (StreamReader....."。
       // Inflate the Brotli-compressed body into an in-memory buffer, then rewind
       // the buffer and read the decoded bytes as text into `html`.
       using (var decoded = new MemoryStream())
       {
           using (var brotli = new BrotliStream(response.GetResponseStream(), System.IO.Compression.CompressionMode.Decompress))
           {
               brotli.CopyTo(decoded);
           }

           decoded.Position = 0;

           using (var reader = new StreamReader(decoded))
           {
               html = reader.ReadToEnd();
           }
       }

哇...不错的发现。 - Stefan

0
/// <summary>
/// Helpers for recognising compressed payloads by their leading signature bytes,
/// buffering streams into memory, gunzipping byte arrays and building query strings.
/// </summary>
public class ZipFileUtilities
{
    // Leading signature ("magic") bytes that IsCompressedData looks for.
    private static readonly byte[] ZipBytes1 = { 0x50, 0x4b, 0x03, 0x04, 0x0a };
    private static readonly byte[] GzipBytes = { 0x1f, 0x8b };
    // NOTE(review): 0x1F 0x9D / 0x1F 0xA0 are commonly listed as the signatures of
    // Unix "compress" (.Z) output rather than tar/LZH archives - confirm the naming.
    private static readonly byte[] TarBytes = { 0x1f, 0x9d };
    private static readonly byte[] LzhBytes = { 0x1f, 0xa0 };
    private static readonly byte[] Bzip2Bytes = { 0x42, 0x5a, 0x68 };
    private static readonly byte[] LzipBytes = { 0x4c, 0x5a, 0x49, 0x50 };
    private static readonly byte[] ZipBytes2 = { 0x50, 0x4b, 0x05, 0x06 };
    private static readonly byte[] ZipBytes3 = { 0x50, 0x4b, 0x07, 0x08 };

    // All signatures, in the order they are tested. Declared after the individual
    // arrays so that those are already initialised when this table is built.
    private static readonly byte[][] KnownHeaders =
    {
        ZipBytes1, ZipBytes2, ZipBytes3, GzipBytes, TarBytes, LzhBytes, Bzip2Bytes, LzipBytes
    };

    /// <summary>
    /// Reads up to <paramref name="length"/> bytes from the start of a file.
    /// </summary>
    /// <param name="filepath">Path of the file to open for reading.</param>
    /// <param name="length">Number of leading bytes wanted.</param>
    /// <returns>
    /// An array of exactly <paramref name="length"/> bytes; positions beyond the end
    /// of a shorter file are left as zero.
    /// </returns>
    public static byte[] GetFirstBytes(string filepath, int length)
    {
        var bytes = new byte[length];

        // Raw bytes are needed, so read from a FileStream directly; no text reader is involved.
        using (var stream = File.OpenRead(filepath))
        {
            // Stream.Read may return fewer bytes than requested, so keep reading
            // until the buffer is full or the end of the file is reached.
            int total = 0;
            while (total < length)
            {
                int read = stream.Read(bytes, total, length - total);
                if (read == 0)
                {
                    break;
                }

                total += read;
            }
        }

        return bytes;
    }

    /// <summary>
    /// Checks whether the first five bytes of a file start with one of the known signatures.
    /// </summary>
    /// <param name="filepath">Path of the file to inspect.</param>
    /// <returns><c>true</c> when a known signature matches; otherwise <c>false</c>.</returns>
    public static bool IsZipFile(string filepath)
    {
        return IsCompressedData(GetFirstBytes(filepath, 5));
    }

    /// <summary>
    /// Checks whether <paramref name="data"/> starts with one of the known signatures.
    /// Data shorter than a given signature simply does not match that signature.
    /// </summary>
    /// <param name="data">Bytes to inspect; only the leading bytes are examined.</param>
    /// <returns><c>true</c> when a known signature matches; otherwise <c>false</c>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="data"/> is <c>null</c>.</exception>
    public static bool IsCompressedData(byte[] data)
    {
        if (data == null)
        {
            throw new ArgumentNullException(nameof(data));
        }

        foreach (var headerBytes in KnownHeaders)
        {
            if (HeaderBytesMatch(headerBytes, data))
            {
                return true;
            }
        }

        return false;
    }

    // Returns true when dataBytes begins with headerBytes. Data that is too short to
    // contain the header is a non-match rather than an error, so short or empty
    // payloads can still be tested against the remaining (shorter) signatures.
    private static bool HeaderBytesMatch(byte[] headerBytes, byte[] dataBytes)
    {
        if (dataBytes.Length < headerBytes.Length)
        {
            return false;
        }

        for (var i = 0; i < headerBytes.Length; i++)
        {
            if (headerBytes[i] != dataBytes[i])
            {
                return false;
            }
        }

        return true;
    }

    /// <summary>
    /// Reads <paramref name="input"/> until it reports no more data and returns everything read.
    /// </summary>
    /// <param name="input">Stream to drain from its current position.</param>
    /// <returns>All bytes read from the stream.</returns>
    public static byte[] ReadFully(Stream input)
    {
        byte[] buffer = new byte[16 * 1024];
        using (MemoryStream ms = new MemoryStream())
        {
            int read;
            while ((read = input.Read(buffer, 0, buffer.Length)) > 0)
            {
                ms.Write(buffer, 0, read);
            }

            return ms.ToArray();
        }
    }

    /// <summary>
    /// Decompresses gzip data. Only gzip is handled here: the other formats that
    /// <see cref="IsCompressedData"/> recognises are not decoded by this method.
    /// </summary>
    /// <param name="data">Gzip-compressed bytes.</param>
    /// <returns>The decompressed bytes.</returns>
    public static byte[] Decompress(byte[] data)
    {
        using (var compressedStream = new MemoryStream(data))
        using (var zipStream = new GZipStream(compressedStream, CompressionMode.Decompress))
        using (var resultStream = new MemoryStream())
        {
            zipStream.CopyTo(resultStream);
            return resultStream.ToArray();
        }
    }

    /// <summary>
    /// Builds a URL query string (without a leading '?') from a name/value collection.
    /// Keys and values are escaped with <see cref="Uri.EscapeDataString(string)"/>; a key
    /// with several values is emitted once per value.
    /// </summary>
    /// <param name="nvc">Collection to serialise; <c>null</c> yields an empty string.</param>
    /// <returns>The <c>key=value</c> pairs joined by '&amp;'.</returns>
    public static string ToQueryString(NameValueCollection nvc)
    {
        if (nvc == null)
        {
            return string.Empty;
        }

        StringBuilder sb = new StringBuilder();

        foreach (string key in nvc.Keys)
        {
            // Null, empty or whitespace-only keys are skipped.
            if (string.IsNullOrWhiteSpace(key))
            {
                continue;
            }

            string[] values = nvc.GetValues(key);
            if (values == null)
            {
                continue;
            }

            foreach (string value in values)
            {
                if (sb.Length > 0)
                {
                    sb.Append('&');
                }

                sb.AppendFormat("{0}={1}", Uri.EscapeDataString(key), Uri.EscapeDataString(value));
            }
        }

        return sb.ToString();
    }
}

使用

                // On HTTP 200, buffer the body, decompress it when it carries a known
                // compression signature, then deserialize the JSON into `modil`.
                if (response.StatusCode == HttpStatusCode.OK)
                {
                    using (var responseStream = response.GetResponseStream())
                    {
                        // Read the whole body first so its leading bytes can be inspected.
                        var t = ZipFileUtilities.ReadFully(responseStream);

                        // NOTE(review): IsCompressedData recognises several formats, but
                        // ZipFileUtilities.Decompress only decodes gzip.
                        if (t != null && ZipFileUtilities.IsCompressedData(t))
                        {
                            t = ZipFileUtilities.Decompress(t);
                        }

                        using (var ms = new MemoryStream(t))
                        using (var streamReader = new StreamReader(ms))
                        using (var jsonReader = new JsonTextReader(streamReader))
                        {
                            var serializer = new JsonSerializer();
                            modil = serializer.Deserialize<Model>(jsonReader);
                        }
                    }
                }

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接