我有下面这段代码,使用 HtmlAgilityPack 加载并解析网页。它适用于大多数网页,但当我尝试加载日文网页时,解码所用的编码似乎是错误的(代码中固定使用了 UTF-8)。我该如何解决这个问题?具体来说,如何根据网页自身声明的编码来设置解码所用的编码?
/// <summary>
/// Downloads a single web page and parses it with HtmlAgilityPack, decoding the
/// body with the charset announced in the HTTP Content-Type header rather than
/// assuming UTF-8.
/// </summary>
class Program {
    private const string HttpMethod = "GET";
    private const string UserAgent =
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.517.41 Safari/534.7";

    /// <summary>
    /// Fetches http://infoseek.co.jp/ and loads it into an <c>HtmlDocument</c>.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args) {
        var request = WebRequest.Create("http://infoseek.co.jp/") as HttpWebRequest;
        if (request == null) {
            return;
        }

        request.Method = HttpMethod;
        request.UserAgent = UserAgent;

        // The response and its stream hold the underlying connection; dispose both
        // so the connection is released even if parsing throws.
        using (var response = request.GetResponse() as HttpWebResponse) {
            if (response == null) {
                return;
            }

            // Decode with the charset the server declared (e.g. EUC-JP) instead of
            // a hard-coded UTF-8, which garbles pages in other encodings.
            var encoding = ResolveEncoding(response.CharacterSet);

            using (var stream = response.GetResponseStream()) {
                if (stream == null) {
                    return;
                }

                // NOTE(review): OptionReadEncoding presumably makes HtmlAgilityPack
                // inspect the page's own charset declaration; confirm how it
                // handles a mismatch with the encoding passed to Load.
                var document = new HtmlDocument {
                    OptionCheckSyntax = true,
                    OptionFixNestedTags = true,
                    OptionAutoCloseOnEnd = true,
                    OptionDefaultStreamEncoding = encoding,
                    OptionReadEncoding = true
                };
                document.Load(stream, encoding);
                var d = document.DocumentNode;
            }
        }
    }

    /// <summary>
    /// Maps a charset name taken from an HTTP header to an <see cref="Encoding"/>.
    /// </summary>
    /// <param name="charset">Charset name; may be null, blank, or quoted.</param>
    /// <returns>
    /// The matching encoding, or <see cref="Encoding.UTF8"/> when the name is
    /// missing or not recognised by the runtime.
    /// </returns>
    private static Encoding ResolveEncoding(string charset) {
        if (string.IsNullOrWhiteSpace(charset)) {
            return Encoding.UTF8;
        }

        // Header values may be written as charset="EUC-JP"; strip optional quotes.
        var name = charset.Trim().Trim('"', '\'');
        try {
            return Encoding.GetEncoding(name);
        } catch (System.ArgumentException) {
            // Unknown or unsupported charset name: fall back to UTF-8.
            return Encoding.UTF8;
        }
    }
}
答案 0 :(得分:0)
infoseek.co.jp 的响应带有如下 HTTP 标头:
Content-Type: text/html; charset=EUC-JP
同样的声明也出现在 HTML 的 meta 标记中:
<meta http-equiv="Content-Type" content="text/html; charset=EUC-JP">
在 .NET 中,EUC-JP 对应代码页 51932,可以用 Encoding.GetEncoding(51932) 或 Encoding.GetEncoding("EUC-JP") 取得相应的编码来解码。
答案 1 :(得分:0)
我试图通过下面的代码从HttpWebResponse对象获取编码。你有没有看到任何问题或有任何其他想法?
/// <summary>
/// Downloads a single web page and parses it with HtmlAgilityPack, choosing the
/// text encoding from the charset declared in the HTTP response.
/// </summary>
class Program {
    private const string HttpMethod = "GET";
    private const string UserAgent =
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.517.41 Safari/534.7";

    /// <summary>
    /// Fetches http://infoseek.co.jp/ and loads it into an <c>HtmlDocument</c>
    /// using the encoding returned by <see cref="TryGetEncoding"/>.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args) {
        var request = WebRequest.Create("http://infoseek.co.jp/") as HttpWebRequest;
        if (request == null) {
            return;
        }

        request.Method = HttpMethod;
        request.UserAgent = UserAgent;

        // The response and its stream hold the underlying connection; dispose both
        // so the connection is released even if parsing throws.
        using (var response = request.GetResponse() as HttpWebResponse) {
            if (response == null) {
                return;
            }

            var encoding = TryGetEncoding(response);

            using (var stream = response.GetResponseStream()) {
                if (stream == null) {
                    return;
                }

                var document = new HtmlDocument {
                    OptionCheckSyntax = true,
                    OptionFixNestedTags = true,
                    OptionAutoCloseOnEnd = true,
                    OptionReadEncoding = true,
                    OptionDefaultStreamEncoding = encoding
                };
                document.Load(stream, encoding);
                var d = document.DocumentNode;
            }
        }
    }

    /// <summary>
    /// Determines the text encoding of a response from its declared charset.
    /// </summary>
    /// <param name="response">The HTTP response whose charset is inspected.</param>
    /// <returns>
    /// The encoding named by <c>response.CharacterSet</c>, or
    /// <see cref="Encoding.UTF8"/> when no charset is declared or the runtime
    /// does not recognise the name.
    /// </returns>
    private static Encoding TryGetEncoding(HttpWebResponse response) {
        // Only CharacterSet is consulted. ContentEncoding reports the
        // Content-Encoding header (compression such as gzip/deflate), which is
        // not a character set and must not be fed to Encoding.GetEncoding.
        var charset = response.CharacterSet;
        if (string.IsNullOrWhiteSpace(charset)) {
            return Encoding.UTF8;
        }

        // Header values may be written as charset="EUC-JP"; strip optional quotes.
        var name = charset.Trim().Trim('"', '\'');
        try {
            return Encoding.GetEncoding(name);
        } catch (System.ArgumentException) {
            // Unknown or unsupported charset name: fall back to UTF-8 rather than
            // swallowing every possible exception type.
            return Encoding.UTF8;
        }
    }
}