我正在尝试获取网站的 title 标签。部分内容是希伯来语，大多数情况下希伯来语都能正确显示，但有时会得到奇怪的符号（乱码）。我的代码：
// Enable 100-Continue behavior for requests made through ServicePointManager.
ServicePointManager.Expect100Continue = true;
// Allow TLS 1.0, 1.1 and 1.2 for outgoing HTTPS connections.
// SSL 3.0 is intentionally not enabled: the protocol is deprecated as insecure
// (RFC 7568) and SecurityProtocolType.Ssl3 is marked obsolete.
ServicePointManager.SecurityProtocol = SecurityProtocolType.Tls
    | SecurityProtocolType.Tls11
    | SecurityProtocolType.Tls12;
// Cookie container handed to the WebClientEx constructor below.
var cookieContainer = new CookieContainer();
// WebClientEx is declared outside this snippet; presumably a WebClient subclass
// that applies the cookie container to its requests — TODO confirm.
using (var web = new WebClientEx(cookieContainer))
{
// Request headers configured on the client before the download.
web.Headers["Accept"] = "*/*";
// The User-Agent value comes from a project helper that is not shown here.
web.Headers["User-Agent"] = ProxiesServersExtension.GetRandomUserAgent();
// NOTE(review): this adds a request header literally named "Method"; it does not
// look like it selects the HTTP verb — confirm whether it is needed.
web.Headers["Method"] = "GET";
// NOTE(review): Content-Type normally describes a request body; confirm it is
// intended on a download request.
web.Headers["Content-Type"] = "text/html";
web.Headers.Add("Accept-Language", "en-US,en;q=0.8");
// `url` is defined outside this snippet; it is parsed here as an absolute URI.
Uri myUri = new Uri(url, UriKind.Absolute);
// Download the page and decode it via the DownloadStringAwareOfEncoding extension method.
var page = web.DownloadStringAwareOfEncoding(myUri);
/// <summary>
/// Downloads the resource at <paramref name="uri"/> as raw bytes and decodes it
/// with the encoding resolved from the response headers, using UTF-8 as the
/// fallback passed to <c>WebUtils.GetEncodingFrom</c>.
/// </summary>
/// <param name="webClient">Client used to perform the download.</param>
/// <param name="uri">Address of the resource to download.</param>
/// <returns>The response body decoded to a string.</returns>
public static string DownloadStringAwareOfEncoding(this WebClient webClient, Uri uri)
{
    // The download must happen first: ResponseHeaders is read only afterwards.
    byte[] payload = webClient.DownloadData(uri);

    Encoding responseEncoding = WebUtils.GetEncodingFrom(webClient.ResponseHeaders, Encoding.UTF8);

    return responseEncoding.GetString(payload);
}
这是获取编码的函数:
/// <summary>
/// Resolves the text encoding named by the <c>charset</c> parameter of the
/// <c>Content-Type</c> response header.
/// </summary>
/// <param name="responseHeaders">Response headers to inspect; must not be null.</param>
/// <param name="defaultEncoding">
/// Value returned when the header is missing or carries no usable charset
/// parameter. May be null, in which case null is returned in those situations.
/// </param>
/// <returns>
/// The encoding named by the charset parameter, or <paramref name="defaultEncoding"/>
/// when no charset can be determined.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="responseHeaders"/> is null.</exception>
/// <exception cref="InvalidOperationException">
/// The header names a charset that <see cref="Encoding.GetEncoding(string)"/> does not recognize.
/// </exception>
public static Encoding GetEncodingFrom(NameValueCollection responseHeaders, Encoding defaultEncoding = null)
{
    if (responseHeaders == null)
    {
        throw new ArgumentNullException("responseHeaders");
    }

    // Note that key lookup is case-insensitive
    var contentType = responseHeaders["Content-Type"];
    if (contentType == null)
    {
        return defaultEncoding;
    }

    // The first segment is the media type itself; parameters follow after ';'.
    // A header without parameters yields no match here and falls back to the default.
    var charsetPart = contentType
        .Split(';')
        .Skip(1)
        .FirstOrDefault(p => p.TrimStart().StartsWith("charset", StringComparison.OrdinalIgnoreCase));
    if (charsetPart == null)
    {
        return defaultEncoding;
    }

    var nameAndValue = charsetPart.Split('=');
    if (nameAndValue.Length != 2)
    {
        return defaultEncoding;
    }

    // Parameter values may be sent quoted (charset="utf-8"); strip surrounding
    // quotes so the bare name reaches Encoding.GetEncoding.
    var charsetName = nameAndValue[1].Trim().Trim('"', '\'').Trim();
    if (charsetName.Length == 0)
    {
        return defaultEncoding;
    }

    try
    {
        return Encoding.GetEncoding(charsetName);
    }
    catch (ArgumentException ex)
    {
        throw new InvalidOperationException("The server returned data in an unknown encoding: " + charsetName, ex);
    }
}