我正在尝试从 HTML 文件中提取文本,例如下面这个页面:
https://artkapakistan.wordpress.com/2013/01/08/debunking-the-myth-of-the-artist/
我使用 HtmlAgilityPack 从 entry-content 类中获取内部 HTML,然后删除 HTML 标记。编码似乎有问题,因为我得到了奇怪的字符,确切地说是 â€™ 和 Â。根据我的在线搜索,第一个对应单引号,第二个对应不间断空格。我尝试使用正则表达式替换单引号和双引号,但没有成功。
// Normalise typographic quotes to ASCII: first the single curly quotes
// (U+2019 / U+2018), then the double curly quotes (U+201C / U+201D).
s1 = Regex.Replace(
    Regex.Replace(s1, "’|‘", "'"),
    "“|”",
    "\"");
但我无法替换它们。编码似乎存在一些问题。我对正则表达式和字符串替换并不是很精通。你能帮我解决这个问题吗?我搜索过“修复 C# 中的 Unicode 问题”,但没有找到解决办法。非常感谢任何帮助。
编辑: 以下是我如何检索内部html和文本。
// SelectSingleNode returns null when the expression matches no node, so guard
// before reading InnerHtml instead of failing with a NullReferenceException.
// NOTE(review): postBodyClass is presumably an XPath selecting the post body — confirm.
var postBodyNode = document.DocumentNode.SelectSingleNode(postBodyClass);
text = postBodyNode == null ? string.Empty : postBodyNode.InnerHtml;
// Strip the markup first, then decode entities and normalise punctuation.
text = RemoveHTMLTags(text);
text = RemoveHTMLPunctuation(text);
/// <summary>
/// Decodes HTML entities in <paramref name="input"/> and normalises the
/// punctuation of the resulting text: repairs common UTF-8-read-as-Windows-1252
/// mojibake sequences, converts curly quotation marks to straight ASCII quotes,
/// and removes non-breaking spaces and left guillemets.
/// </summary>
/// <param name="input">Text that may still contain HTML entities.</param>
/// <returns>
/// The cleaned text, or an empty string when <paramref name="input"/> is null or empty.
/// </returns>
public static string RemoveHTMLPunctuation(string input)
{
    if (string.IsNullOrEmpty(input))
    {
        return string.Empty;
    }

    string text = System.Net.WebUtility.HtmlDecode(input);

    // Repair mojibake produced when UTF-8 bytes were decoded as Windows-1252.
    // All patterns are written with \u escapes so the source file's own
    // encoding can never corrupt them. Longer sequences go first; the bare
    // two-character prefix is the fallback for a right double quote whose
    // third byte (0x9D) was dropped.
    text = text.Replace("\u00E2\u20AC\u00A6", "\u2026"); // ellipsis
    text = text.Replace("\u00E2\u20AC\u2122", "'");      // right single quote
    text = text.Replace("\u00E2\u20AC\u02DC", "'");      // left single quote
    text = text.Replace("\u00E2\u20AC\u0153", "\"");     // left double quote
    text = text.Replace("\u00E2\u20AC\u009D", "\"");     // right double quote
    text = text.Replace("\u00E2\u20AC", "\"");           // right double quote, third byte lost

    // Entities that survive HtmlDecode were double-encoded in the page
    // (e.g. "&amp;quot;"); map the ones that stand for punctuation.
    text = text.Replace("&ldquo;", "\"");
    text = text.Replace("&rdquo;", "\"");
    text = text.Replace("&lsquo;", "'");
    text = text.Replace("&rsquo;", "'");
    text = text.Replace("&hellip;", "\u2026");
    text = text.Replace("&quot;", "\"");
    text = text.Replace("&amp;", "&");

    // Drop any other leftover named entity (this also covers &nbsp; and &laquo;).
    text = Regex.Replace(text, "&[a-z]+;", "");
    text = text.Replace("&#39;", "'");

    // Replace Unicode left/right quotation marks with straight quotation marks.
    text = text.Replace('\u2018', '\'')
               .Replace('\u2019', '\'')
               .Replace('\u201C', '"')
               .Replace('\u201D', '"');

    // Remove left guillemets and non-breaking spaces. U+00C2 is the stray
    // character that precedes a mis-decoded non-breaking space; it is removed
    // wherever it occurs, as the original code did.
    text = text.Replace("\u00AB", "");
    text = text.Replace("\u00A0", "");
    text = text.Replace("\u00C2", "");

    return text;
}
/// <summary>
/// Converts an HTML fragment to plain text. Script elements and anchor
/// elements are removed together with their content, line-break tags and
/// closing div/p/li tags become line breaks, every remaining tag is stripped,
/// and runs of consecutive line breaks are collapsed to one.
/// </summary>
/// <param name="input">The HTML fragment to clean.</param>
/// <returns>
/// The trimmed plain text, or an empty string when <paramref name="input"/> is null or empty.
/// </returns>
public static string RemoveHTMLTags(string input)
{
    if (string.IsNullOrEmpty(input))
    {
        return string.Empty;
    }

    string text = input;

    // Remove script elements and everything within. The lazy ".*?" with
    // Singleline lets the body span lines and contain '<' or '>' (normal in
    // JavaScript); the optional attribute group also matches a bare <script>.
    text = Regex.Replace(
        text,
        @"<script(?:\s[^<>]*)?>.*?</\s*script\s*>",
        "",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);

    // Line-break tags become real line breaks.
    text = Regex.Replace(text, @"<\s*br\s*/*\s*>", Environment.NewLine, RegexOptions.IgnoreCase);

    // Add a new line for each closing div, p or li tag.
    text = Regex.Replace(text, @"<\s*/(div|p|li)\s*>", Environment.NewLine, RegexOptions.IgnoreCase);

    // Kept from the original: drop every literal ">=" sequence.
    text = text.Replace(">=", "");

    // NOTE(review): the original statement replaced a left curly quote with
    // itself; its pattern appears to have been the entity below before it was
    // decoded in transit — confirm.
    text = text.Replace("&ldquo;", "\u201C");

    // Remove anchors together with their link text. The match is lazy so text
    // between two links on the same line survives, and the tag name must be
    // exactly "a" so tags such as abbr or article are not mistaken for links.
    text = Regex.Replace(text, @"<a(?:\s[^<>]*)?>.*?</\s*a\s*>", "", RegexOptions.IgnoreCase);

    // Strip every remaining tag, then the pipe characters.
    text = Regex.Replace(text, @"<[^<>]+>", "");
    text = text.Replace("|", "");

    // Replace multiple consecutive line breaks with a single one.
    text = Regex.Replace(text, @"(\r\n|\r|\n){2,}", Environment.NewLine);

    return text.Trim();
}