如何剥离此文本
<html>
<body>
<h1>My First Heading</h1>
<p>My first paragraph.</p>
<test@test.com>
</body>
</html>
看起来像
My First Heading
My first paragraph.
<test@test.com>
使用功能
public static string StripHTML(this string htmlText)
{
var reg = new Regex("<(.|\n)*?>", RegexOptions.IgnoreCase);
return reg.Replace(htmlText, "");
}
我得到了
我的第一个标题 我的第一段。
答案 0 :(得分:4)
使用Html Agility Pack进行此类操作。它比任何正则表达式都快,并且支持LINQ。
答案 1 :(得分:2)
static void Main(string[] args)
{
string modified_html = emas(input);
HtmlDocument doc = new HtmlDocument();
doc.LoadHtml(modified_html);
string test1 = doc.DocumentNode.InnerText;
Console.WriteLine();
var reg = new Regex("<(.|\n)*?>", RegexOptions.IgnoreCase);
Console.WriteLine(reg.Replace(modified_html , ""));
Console.Read();
}
public static string emas(string text)
{
string stripped = text;
const string MatchEmailPattern =
@"(([\w-]+\.)+[\w-]+|([a-zA-Z]{1}|[\w-]{2,}))@"
+ @"((([0-1]?[0-9]{1,2}|25[0-5]|2[0-4][0-9])\.([0-1]?[0-9]{1,2}|25[0-5]|2[0-4][0-9])\."
+ @"([0-1]?[0-9]{1,2}|25[0-5]|2[0-4][0-9])\.([0-1]?[0-9]{1,2}|25[0-5]|2[0-4][0-9])){1}|"
+ @"([a-zA-Z]+[\w-]+\.)+[a-zA-Z]{2,4})";
Regex rx = new Regex(MatchEmailPattern, RegexOptions.Compiled | RegexOptions.IgnoreCase);
// Find matches.
MatchCollection matches = rx.Matches(text);
// Report the number of matches found.
int noOfMatches = matches.Count;
// Report on each match.
foreach (Match match in matches)
{
stripped = stripped.Replace("<"+ match.Value + ">" , match.Value);
}
return stripped;
}
static string input = " Your html goes here ";