我编写了一个函数,在字符串中搜索给定的标签并删除所有这些标签及其内容,但第一个除外:
' Demo entry point: removes every occurrence of each forbidden tag (together
' with the tag's content) from fileAsString, keeping only the first occurrence
' of each tag. The result is left in the local variable fileAsString.
Sub Main()
    Dim fileAsString = "<div>myFirstDiv</div>" +
        "<Div></dIV>" +
        "<city>NY</city>" +
        "<city></city>" +
        "<div></div>" +
        "<span></span>"

    ' Tags whose appearances after the first one must be removed.
    Dim forbiddenNodeslist As New List(Of String)
    forbiddenNodeslist.Add("div")
    forbiddenNodeslist.Add("city")

    For Each node In forbiddenNodeslist
        ' Regex.Escape keeps a tag name that contains regex metacharacters from
        ' altering the pattern. Note: "." does not match a newline under these
        ' options, so an element whose content spans several lines is not matched.
        Dim escapedNode = Regex.Escape(node)
        Dim re = New Regex("<" + escapedNode + "[^>]*>(.*?)</" + escapedNode + ">", RegexOptions.IgnoreCase)

        ' Locate the first occurrence, then strip all later occurrences in a
        ' single pass that starts right after it. This avoids the manual index
        ' bookkeeping, which went wrong from the third removal onward because
        ' the removed-character offset was overwritten instead of accumulated.
        Dim firstMatch = re.Match(fileAsString)
        If firstMatch.Success Then
            ' count = -1 replaces every match found at or after the start position;
            ' the text before the start position is returned unchanged.
            fileAsString = re.Replace(fileAsString, String.Empty, -1, firstMatch.Index + firstMatch.Length)
        End If
    Next
    ' For the sample input, fileAsString is now
    ' "<div>myFirstDiv</div><city>NY</city><span></span>".
End Sub
但是这样做效率很低:我先要搜索所有匹配项(对字符串的第一次遍历),然后还要再逐个循环这些匹配项,把它们替换为空字符串。
如何提高效率?
任何帮助表示赞赏!
答案 0 :(得分:2)
我用 C# 回答了这个问题。你可以在这里(here)找到我使用的 fiddle(在线示例)。
// Demo entry point: keeps the element the input starts with and removes every
// forbidden element (open tag, content and matching close tag) that follows it.
// Note that only the leading element is preserved; later forbidden elements are
// removed even when they are the first occurrence of their own tag name.
public static void Main()
{
    var fileAsString = "<div>myFirstDiv</div><Div></dIV><city>NY</city><city></city><div></div><span></span>";

    // Pipe delimited so the names can be dropped straight into a regex alternation.
    var delimetedForbiddenList = "div|city";

    // Matches one simple element at the very start of the input; \1 forces the
    // close tag to carry the same name as the open tag. The content is "[^<]*"
    // (anything up to the next '<') so that a '/' inside the content is allowed.
    var leadingTagRegex = new Regex(@"^<([a-z]+)>[^<]*</\1>", RegexOptions.IgnoreCase);
    var leadingTag = leadingTagRegex.Match(fileAsString);

    // If the input does not start with a simple element there is nothing to
    // preserve, so the whole input is scanned instead of indexing into an
    // empty match collection (which would throw).
    var firstTag = leadingTag.Success ? leadingTag.Value : String.Empty;
    //firstTag == @"<div>myFirstDiv</div>"

    // Slice the remainder by position rather than capturing it with "(.*)":
    // '.' stops at the first newline, which would silently drop the rest of a
    // multi-line input.
    var allButFirstTag = fileAsString.Substring(firstTag.Length);
    //allButFirstTag == @"<Div></dIV><city>NY</city><city></city><div></div><span></span>"

    // The regex that removes our forbidden elements.
    var removeForbiddenPattern = String.Format(@"<({0})>[^<]*</\1>", delimetedForbiddenList);
    //removeForbiddenPattern == @"<(div|city)>[^<]*</\1>"

    var resultsWithForbiddenRemoved = Regex.Replace(allButFirstTag, removeForbiddenPattern, String.Empty, RegexOptions.IgnoreCase);
    //resultsWithForbiddenRemoved == @"<span></span>"

    var finalResults = firstTag + resultsWithForbiddenRemoved;
    //finalResults == @"<div>myFirstDiv</div><span></span>"
}