我有一段代码,用来从 URL 获取页面的 HTML,但响应内容看起来是经过编码(压缩)的,无法直接阅读。
代码:
// Build a request for the watch page and dress it up with browser-like headers.
HttpWebRequest xhr = (HttpWebRequest) WebRequest.Create(new Uri("https://www.youtube.com/watch?v=_Ewh75YGIGQ"));
// Transparent decompression is switched on for gzip and deflate only.
xhr.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
//xhr.CookieContainer = request.Account.CookieContainer;
xhr.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
// NOTE(review): "br" is advertised to the server here, but the AutomaticDecompression flags above
// do not include a Brotli option, so a reply sent with "Content-Encoding: br" is handed to the
// reader below still compressed.
xhr.Headers["Accept-Encoding"] = "gzip, deflate, br";
xhr.Headers["Accept-Language"] = "en-US,en;q=0.5";
xhr.Headers["Upgrade-Insecure-Requests"] = "1";
xhr.KeepAlive = true;
xhr.UserAgent = "Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)";
xhr.Host = "www.youtube.com";
xhr.Referer = "https://www.youtube.com/watch?v=6aCpYxzRkf4";
// Send the request and wait for the reply.
// NOTE(review): `response` is not disposed anywhere in this snippet.
var response = xhr.GetResponse();
string html;
// Read the whole response body as text; no decoding beyond what the request set up is applied here.
using (StreamReader reader = new StreamReader(response.GetResponseStream()))
{
html = reader.ReadToEnd();
}
这些是响应头:
X-XSS-Protection: 1; mode=block; report=https://www.google.com/appserve/security-bugs/log/youtube
X-Content-Type-Options: nosniff
X-Frame-Options: SAMEORIGIN
Strict-Transport-Security: max-age=31536000
Content-Encoding: br
Transfer-Encoding: chunked
Alt-Svc: quic=":443"; ma=2592000; v="44,43,39,35"
Cache-Control: no-cache
Content-Type: text/html; charset=utf-8
Date: Sat, 24 Nov 2018 11:30:38 GMT
Expires: Tue, 27 Apr 1971 19:44:06 EST
P3P: CP="This is not a P3P policy! See http://support.google.com/accounts/answer/151657?hl=it for more info."
Set-Cookie: PREF=f1=50000000&al=it; path=/; domain=.youtube.com; expires=Thu, 25-Jul-2019 23:23:38 GMT
Server: YouTube Frontend Proxy
用 StreamReader.ReadToEnd() 读取到的响应字符串看起来是一堆乱码(无法阅读的二进制内容)。
答案 0 :(得分:1)
答案在响应头中:Content-Encoding:br->这意味着Brotli压缩。
有一个 .NET 实现(NuGet 程序包 Brotli.NET):
将该程序包安装到项目中,添加 “using Brotli;”,并用以下代码替换原来的 “using (StreamReader ...)” 代码块:
// Inflate the Brotli-encoded body into an in-memory buffer, then decode that buffer as text.
using (var brotli = new BrotliStream(response.GetResponseStream(), System.IO.Compression.CompressionMode.Decompress))
using (var decompressed = new System.IO.MemoryStream())
{
    brotli.CopyTo(decompressed);

    // Rewind so the reader starts at the first decompressed byte.
    decompressed.Position = 0;

    using (var reader = new StreamReader(decompressed))
    {
        html = reader.ReadToEnd();
    }
}
答案 1 :(得分:1)
是。上面的答案是正确的。服务器生成的响应采用br编码。您需要对其进行解码。默认系统压缩程序包中不包含对br编码的支持,您必须安装Brotli.net nuget程序包。
在您的代码中加入下面这段代码,即可覆盖 gzip、br 和 deflate 这三种主要的编码类型。
// Wrap the raw response stream in the decompressor that matches the Content-Encoding
// reported by the server; the stream is left untouched when none of the three matches.
HttpWebResponse response = (HttpWebResponse)webRequest.GetResponse();
Stream responseStream = response.GetResponseStream();

// Read the header once and compare ordinally, ignoring case: ToLower() follows the current
// culture, so e.g. under tr-TR "GZIP" would lower-case to a string that does not contain "gzip".
string contentEncoding = response.ContentEncoding ?? string.Empty;

if (contentEncoding.IndexOf("gzip", System.StringComparison.OrdinalIgnoreCase) >= 0)
{
    responseStream = new GZipStream(responseStream, CompressionMode.Decompress);
}
else if (contentEncoding.IndexOf("deflate", System.StringComparison.OrdinalIgnoreCase) >= 0)
{
    responseStream = new DeflateStream(responseStream, CompressionMode.Decompress);
}
else if (contentEncoding.IndexOf("br", System.StringComparison.OrdinalIgnoreCase) >= 0)
{
    // BrotliStream here is the one from the Brotli NuGet package mentioned in the answer text.
    responseStream = new BrotliStream(responseStream, CompressionMode.Decompress);
}
答案 2 :(得分:0)
# Cross-tabulate income by names (rows) and years (columns), taking the columns from `a`
# (presumably a data frame holding income, names and years -- confirm against the caller).
xtabs(formula = income ~ names + years, data = a)
# years
#names 2012 2013
# Daniel 45 105
# John 60 80
用途
/// <summary>
/// Assorted helpers: sniffing well-known compressed-file signatures, draining a stream into a
/// byte array, gzip decompression, and building a URL query string.
/// </summary>
public class ZipFileUtilities
{
    // ZIP local file header signature "PK\x03\x04". Only the four signature bytes are compared;
    // the byte that follows is the "version needed to extract" field and differs between archives
    // (0x0a for stored entries, 0x14 for deflate), so it must not be part of the match.
    private static readonly byte[] ZipBytes1 = { 0x50, 0x4b, 0x03, 0x04 };
    private static readonly byte[] GzipBytes = { 0x1f, 0x8b };
    // NOTE(review): 1F 9D and 1F A0 are commonly documented as the signatures of LZW- and
    // LZH-compressed ".Z"/".z" files (often tarballs); the field names are kept as they were.
    private static readonly byte[] TarBytes = { 0x1f, 0x9d };
    private static readonly byte[] LzhBytes = { 0x1f, 0xa0 };
    private static readonly byte[] Bzip2Bytes = { 0x42, 0x5a, 0x68 };
    private static readonly byte[] LzipBytes = { 0x4c, 0x5a, 0x49, 0x50 };
    // ZIP end-of-central-directory (empty archive) and spanned-archive signatures.
    private static readonly byte[] ZipBytes2 = { 0x50, 0x4b, 0x05, 0x06 };
    private static readonly byte[] ZipBytes3 = { 0x50, 0x4b, 0x07, 0x08 };

    // Every signature IsCompressedData tests, in the order they are tried.
    private static readonly byte[][] KnownHeaders =
    {
        ZipBytes1, ZipBytes2, ZipBytes3, GzipBytes, TarBytes, LzhBytes, Bzip2Bytes, LzipBytes
    };

    // Number of leading bytes that must be read from a file to test every known signature.
    private static readonly int LongestHeaderLength = GetLongestHeaderLength();

    /// <summary>
    /// Reads the first <paramref name="length"/> bytes of a file.
    /// </summary>
    /// <param name="filepath">Path of the file to open for reading.</param>
    /// <param name="length">Number of bytes to return.</param>
    /// <returns>
    /// An array of exactly <paramref name="length"/> bytes; when the file is shorter, the
    /// remaining positions are left as zero.
    /// </returns>
    public static byte[] GetFirstBytes(string filepath, int length)
    {
        var bytes = new byte[length];
        using (var stream = File.OpenRead(filepath))
        {
            // Stream.Read may hand back fewer bytes than requested, so keep reading until the
            // buffer is full or the end of the file is reached.
            int filled = 0;
            while (filled < length)
            {
                int read = stream.Read(bytes, filled, length - filled);
                if (read == 0)
                {
                    break;
                }
                filled += read;
            }
        }
        return bytes;
    }

    /// <summary>
    /// Reports whether the file starts with any of the known compressed-data signatures
    /// (not only ZIP: gzip, bzip2, lzip and the other listed signatures count as well).
    /// </summary>
    /// <param name="filepath">Path of the file to inspect.</param>
    /// <returns><c>true</c> when the leading bytes match a known signature.</returns>
    public static bool IsZipFile(string filepath)
    {
        return IsCompressedData(GetFirstBytes(filepath, LongestHeaderLength));
    }

    /// <summary>
    /// Reports whether <paramref name="data"/> begins with any of the known signatures.
    /// </summary>
    /// <param name="data">Leading bytes of the content to inspect; may be shorter than a signature.</param>
    /// <returns><c>true</c> on the first matching signature, otherwise <c>false</c>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="data"/> is <c>null</c>.</exception>
    public static bool IsCompressedData(byte[] data)
    {
        if (data == null)
        {
            throw new ArgumentNullException(nameof(data));
        }

        foreach (var headerBytes in KnownHeaders)
        {
            if (HeaderBytesMatch(headerBytes, data))
            {
                return true;
            }
        }
        return false;
    }

    // True when dataBytes starts with headerBytes. Data shorter than the signature simply cannot
    // match it; it must not abort the scan, because a shorter signature may still match.
    private static bool HeaderBytesMatch(byte[] headerBytes, byte[] dataBytes)
    {
        if (dataBytes.Length < headerBytes.Length)
        {
            return false;
        }

        for (var i = 0; i < headerBytes.Length; i++)
        {
            if (headerBytes[i] != dataBytes[i])
            {
                return false;
            }
        }
        return true;
    }

    // Length of the longest entry in KnownHeaders.
    private static int GetLongestHeaderLength()
    {
        int longest = 0;
        foreach (var header in KnownHeaders)
        {
            if (header.Length > longest)
            {
                longest = header.Length;
            }
        }
        return longest;
    }

    /// <summary>
    /// Reads <paramref name="input"/> from its current position to the end.
    /// </summary>
    /// <param name="input">Readable stream to drain; it is not closed by this method.</param>
    /// <returns>All bytes that were read.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="input"/> is <c>null</c>.</exception>
    public static byte[] ReadFully(Stream input)
    {
        if (input == null)
        {
            throw new ArgumentNullException(nameof(input));
        }

        using (MemoryStream ms = new MemoryStream())
        {
            input.CopyTo(ms, 16 * 1024);
            return ms.ToArray();
        }
    }

    /// <summary>
    /// Decompresses a gzip-compressed buffer.
    /// </summary>
    /// <param name="data">Gzip-compressed bytes.</param>
    /// <returns>The decompressed bytes.</returns>
    public static byte[] Decompress(byte[] data)
    {
        using (var compressedStream = new MemoryStream(data))
        using (var zipStream = new GZipStream(compressedStream, CompressionMode.Decompress))
        using (var resultStream = new MemoryStream())
        {
            zipStream.CopyTo(resultStream);
            return resultStream.ToArray();
        }
    }

    /// <summary>
    /// Renders a name/value collection as <c>key=value</c> pairs joined by <c>&amp;</c>, with keys
    /// and values escaped via <see cref="Uri.EscapeDataString(string)"/>.
    /// </summary>
    /// <param name="nvc">Pairs to render; <c>null</c> yields an empty string.</param>
    /// <returns>The query string without a leading <c>?</c>.</returns>
    public static string ToQueryString(NameValueCollection nvc)
    {
        if (nvc == null)
        {
            return string.Empty;
        }

        StringBuilder sb = new StringBuilder();
        foreach (string key in nvc.Keys)
        {
            // Entries with a null or blank key are skipped.
            if (string.IsNullOrWhiteSpace(key))
            {
                continue;
            }

            string[] values = nvc.GetValues(key);
            if (values == null)
            {
                continue;
            }

            // A key holding several values is emitted once per value.
            foreach (string value in values)
            {
                if (sb.Length > 0)
                {
                    sb.Append('&');
                }
                sb.Append(Uri.EscapeDataString(key)).Append('=').Append(Uri.EscapeDataString(value));
            }
        }
        return sb.ToString();
    }
}