我正在创建一个C# 4.0应用程序,使用WebClient下载网页内容。
WebClient函数:
/// <summary>
/// Downloads the document at <paramref name="url"/> and returns its body as text.
/// </summary>
/// <param name="url">
/// Address to download. When it does not already start with <c>http://</c> or
/// <c>https://</c> (case-insensitive), <c>http://</c> is prepended.
/// </param>
/// <returns>
/// The downloaded text decoded as UTF-8, or an empty string when the URL is
/// null/blank, cannot be parsed as an absolute URI, or the download fails.
/// </returns>
public static string GetDocText(string url)
{
    if (string.IsNullOrWhiteSpace(url))
    {
        return string.Empty;
    }

    url = url.Trim();

    // Prepend a scheme only when none is present; the previous check for
    // "http://" alone turned "https://host" into "http://https://host".
    bool hasHttpScheme =
        url.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
        url.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
    if (!hasHttpScheme)
    {
        url = "http://" + url;
    }

    // Reject unparsable input up front instead of handing a null Uri to DownloadString.
    Uri innUri;
    if (!Uri.TryCreate(url, UriKind.Absolute, out innUri))
    {
        return string.Empty;
    }

    try
    {
        // The using statement disposes the client; no explicit Dispose call is needed.
        using (ConfigurableWebClient client = new ConfigurableWebClient())
        {
            /* Set timeout for webclient */
            client.Timeout = 600000;

            client.Headers.Add("User-Agent", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; InfoPath.2; AskTbFXTV5/5.15.4.23821; BRI/2)");
            client.Headers.Add("Vary", "Accept-Encoding");
            client.Encoding = Encoding.UTF8;

            string html = client.DownloadString(innUri);
            return html ?? string.Empty;
        }
    }
    catch (Exception ex)
    {
        // Best-effort contract: callers receive "" on any failure, but the cause
        // is traced so that failed downloads can still be diagnosed.
        System.Diagnostics.Trace.TraceWarning("GetDocText failed for '{0}': {1}", url, ex);
        return string.Empty;
    }
}
/// <summary>
/// A <see cref="WebClient"/> whose HTTP requests can be given an optional
/// timeout and an optional service-point connection limit.
/// </summary>
public class ConfigurableWebClient : WebClient
{
    /// <summary>Value copied to <c>HttpWebRequest.Timeout</c> when set; ignored when null.</summary>
    public int? Timeout { get; set; }

    /// <summary>Value copied to <c>ServicePoint.ConnectionLimit</c> when set; ignored when null.</summary>
    public int? ConnectionLimit { get; set; }

    /// <summary>
    /// Builds the request through the base class and, if it is an
    /// <see cref="HttpWebRequest"/>, applies whichever optional settings have a value.
    /// </summary>
    /// <param name="address">The address the request is created for.</param>
    /// <returns>The request created by the base class, with settings applied for HTTP requests.</returns>
    protected override WebRequest GetWebRequest(Uri address)
    {
        WebRequest request = base.GetWebRequest(address);
        HttpWebRequest httpRequest = request as HttpWebRequest;

        // Non-HTTP requests (or a null request) are passed through untouched.
        if (httpRequest != null)
        {
            if (Timeout != null)
            {
                httpRequest.Timeout = Timeout.Value;
            }

            if (ConnectionLimit != null)
            {
                httpRequest.ServicePoint.ConnectionLimit = ConnectionLimit.Value;
            }
        }

        return request;
    }
}
我对比了C# WebClient下载到的内容,发现它与浏览器中显示的内容略有不同。
我在浏览器(Mozilla Firefox)和我的WebClient函数中使用的是同一个URL。
浏览器能正确显示网页内容,但WebClient的DownloadString返回的却是另一份
HTML。请参阅下面WebClient返回的响应。
WebClient下载到的HTML:
<!DOCTYPE html>
<head>
<META NAME="ROBOTS" CONTENT="NOINDEX, NOFOLLOW">
<meta http-equiv="cache-control" content="max-age=0" />
<meta http-equiv="cache-control" content="no-cache" />
<meta http-equiv="expires" content="0" />
<meta http-equiv="expires" content="Tue, 01 Jan 1980 1:00:00 GMT" />
<meta http-equiv="pragma" content="no-cache" />
<meta http-equiv="refresh" content="10; url=/distil_r_captcha.html?Ref=/pgol/4-abbigliamento/3-Roma%20%28RM%29/p-7&distil_RID=A8D2F8B6-B314-11E3-A5E9-E04C5DBA1712" />
<script type="text/javascript" src="/ga.280243267228712.js?PID=6D4E4D1D-7094-375D-A439-0568A6A70836" defer></script><style type="text/css">#d__fFH{position:absolute;top:-5000px;left:-5000px}#d__fF{font-family:serif;font-size:200px;visibility:hidden}#glance7ca96c1b,#hiredf795fe70,#target01a7c05a,#hiredf795fe70{display:none!important}</style></head>
<body>
<div id="distil_ident_block"> </div>
<div id="d__fFH"><OBJECT id="d_dlg" CLASSID="clsid:3050f819-98b5-11cf-bb82-00aa00bdce0b" width="0px" height="0px"></OBJECT><span id="d__fF"></span></div></body>
</html>
我的问题是:我的WebClient函数没有返回实际的网页内容。
请帮忙。
答案 0 :(得分:2)
某些Web服务器会根据HTTP请求标头返回不同的响应。
所以,如果你想得到与浏览器相同的HTML,
就需要发送与浏览器相同的HTTP请求!
怎么做?
使用Firefox开发者工具或Chrome开发者工具查看并复制浏览器发出的HTTP请求标头!
答案 1 :(得分:0)
在我的案例中,WebClient的DownloadData / DownloadFile / DownloadString方法得到的结果与通过浏览器(例如Chrome)下载文件时的结果不同。起初我以为这是编码问题,于是遍历了Encoding.GetEncodings()中的所有编码,但输出的数据仍然是无意义的字符。经过大量搜索,我最终找到了这里。
我按照@han058的建议,在Chrome浏览器的“网络”标签中查看了响应标头(Response headers),内容如下:
Cache-Control: public, max-age=900
content-disposition: attachment;filename=FILENAME.csv
Content-Encoding: gzip
Content-Length: 29310
Content-Type: text/plain; charset=utf-8
Date: Sat, 04 Jan 2020 20:20:13 GMT
Expires: Sat, 04 Jan 2020 20:35:14 GMT
Last-Modified: Sat, 04 Jan 2020 20:20:14 GMT
Server: Microsoft-IIS/10.0
Vary: *
X-Powered-By: ASP.NET
X-Powered-By: ARR/3.0
X-Powered-By: ASP.NET
因此,响应是以Content-Encoding: gzip方式压缩的。换句话说,我必须先解压缩响应内容,然后才能读取文件。
using System;
using System.IO;
using System.IO.Compression;
using System.Net;
/// <summary>
/// Sample program that downloads a file with <see cref="WebClient"/> and saves
/// it to the desktop, decompressing the body when the server sent it gzip-encoded.
/// </summary>
public class Program
{
    /// <summary>
    /// Downloads the hard-coded URL and writes the (decompressed) content to
    /// <c>File.csv</c> in the current user's Desktop folder.
    /// </summary>
    /// <param name="args">Command-line arguments; not used.</param>
    static void Main(string[] args)
    {
        var url = new Uri("http://www.url.com/FILENAME.csv");
        var path = Environment.GetFolderPath(Environment.SpecialFolder.Desktop);
        var fileName = "File.csv";

        using (WebClient wc = new WebClient())
        using (Stream response = wc.OpenRead(url))
        {
            // ResponseHeaders is populated once OpenRead has returned. Only wrap the
            // body in a GZipStream when the server says it is gzip-encoded; wrapping
            // an uncompressed body would throw InvalidDataException on first read.
            string contentEncoding = wc.ResponseHeaders == null
                ? null
                : wc.ResponseHeaders["Content-Encoding"];
            bool isGzip = contentEncoding != null &&
                          contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0;

            // The output file is created only after the request succeeded, so a failed
            // download no longer leaves an empty file behind.
            using (Stream body = isGzip ? new GZipStream(response, CompressionMode.Decompress) : response)
            using (Stream s = File.Create(Path.Combine(path, fileName)))
            {
                //Saves to C:\Users\[YourUser]\Desktop\File.csv
                body.CopyTo(s);
            }
        }
    }
}