我尝试使用正则表达式进行解析,但并未解析所有XML文件!!
标签名称未预定义,这意味着,解决方案应该自己识别XML标签的开头和结尾
示例数据如下:
<XML data>
Random text
<XML data>
Random text
答案 0 :(得分:1)
正如我在评论中所说,这是一个难题,尤其是在不了解您的数据的情况下。可能会有很多边缘案例。
以下代码可能适用于您的需求。它尝试找到元素的开头(“<element ...”或“<element/>”),然后通过查找对应的结束标记来确定 XML 内容的范围。它通过尝试解析该片段来进行验证,如果解析出错,则不把该片段当作 XML。
此代码注重清晰易懂,而非性能或良好的结构(只是一个简单直接的演示实现)。它应该可以为您提供一个示例,作为您所描述的这类解析工作的良好起点。
using System;
using System.Linq;
using System.Text;
using System.Xml.Linq;
namespace messyXml {
    /// <summary>
    /// Demo program that pulls well-formed XML elements out of text that also
    /// contains arbitrary non-XML junk. Element names are not known in advance:
    /// every '&lt;' is treated as a possible element start and the candidate is
    /// accepted only if <see cref="XDocument.Parse(string)"/> can parse it.
    /// </summary>
    class Program {
        const String almostXml = @"
@#$%random junk
<Fruits>
<Apples>
Pies
</Apples>
<Pears>
Tarts
</Pears>
</Fruits>
This is some junk about Fruits like Apples and Pears
which can be made into Pies and Tarts. I think that
Pears<Apples. Edge case might be <Parts or /Apples>
<Parts No='123'>
Pie Plate
</Parts>";

        /// <summary>
        /// Prints the sample text, extracts the XML fragments from it and prints them.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args) {
            Console.WriteLine("Extracting XML from:");
            Console.WriteLine(almostXml);
            Console.WriteLine();

            string validXml = ExtractXml(almostXml);

            Console.WriteLine("-----------");
            Console.WriteLine("Valid XML found:");
            Console.WriteLine(validXml);

            // Console.ReadKey throws InvalidOperationException when stdin is
            // redirected (piped input, CI), so only pause on a real console.
            if (!Console.IsInputRedirected) {
                Console.ReadKey();
            }
        }

        /// <summary>
        /// Scans <paramref name="text"/> for top-level, well-formed XML elements.
        /// </summary>
        /// <param name="text">Text that may mix XML elements with other content; may be null.</param>
        /// <returns>
        /// Every extracted element, each followed by <see cref="Environment.NewLine"/>,
        /// in order of appearance. Empty string when nothing is found or the input
        /// is null or empty.
        /// </returns>
        internal static string ExtractXml(string text) {
            if (string.IsNullOrEmpty(text)) {
                return string.Empty;
            }

            var validXml = new StringBuilder();
            int i = 0;
            while (i < text.Length) {
                if (text[i] == '<') { // might be an xml start
                    int end = FindElementEnd(text, i);
                    if (end > i) {
                        validXml.AppendLine(text.Substring(i, end - i));
                        i = end; // resume right after the accepted element
                        continue;
                    }
                }
                i++;
            }
            return validXml.ToString();
        }

        /// <summary>
        /// Tries to find a well-formed element that starts at the '&lt;' located
        /// at <paramref name="start"/>.
        /// </summary>
        /// <param name="text">The text being scanned.</param>
        /// <param name="start">Index of the '&lt;' that may open an element.</param>
        /// <returns>The index just past the element, or -1 when no well-formed element starts here.</returns>
        static int FindElementEnd(string text, int start) {
            // The element name runs up to the first whitespace, '/', '>' or '<'.
            int nameEnd = start + 1;
            while (nameEnd < text.Length && !IsNameTerminator(text[nameEnd])) {
                nameEnd++;
            }
            if (nameEnd == start + 1 || nameEnd == text.Length || text[nameEnd] == '<') {
                return -1; // empty name, unterminated tag, or a new '<' before the tag closed
            }

            int tagClose = text.IndexOf('>', nameEnd);
            if (tagClose < 0) {
                return -1;
            }

            // Self-closing form: <element/> or <element attr='x' />.
            // '/' terminates the name, so a '/' right before '>' is never part of it.
            if (text[tagClose - 1] == '/') {
                int selfClosedEnd = tagClose + 1;
                if (IsWellFormedXml(text.Substring(start, selfClosedEnd - start))) {
                    return selfClosedEnd;
                }
                // Otherwise fall through: the "/>" may sit inside an attribute value.
            }

            // Build the matching end tag and try each occurrence in turn, so that
            // nested elements with the same name (<a><a/></a>) are still matched.
            // Note: end tags written with inner whitespace ("</a >") are not recognised.
            string endTag = "</" + text.Substring(start + 1, nameEnd - start - 1) + ">";
            int searchFrom = tagClose;
            while (true) {
                int endIx = text.IndexOf(endTag, searchFrom, StringComparison.Ordinal);
                if (endIx < 0) {
                    return -1;
                }
                int end = endIx + endTag.Length;
                if (IsWellFormedXml(text.Substring(start, end - start))) {
                    return end;
                }
                searchFrom = endIx + 1;
            }
        }

        /// <summary>Returns true when <paramref name="c"/> cannot be part of an element name in a start tag.</summary>
        static bool IsNameTerminator(char c) {
            return c == '>' || c == '/' || c == '<' || char.IsWhiteSpace(c);
        }

        /// <summary>Returns true when <paramref name="candidate"/> parses as a complete XML document.</summary>
        static bool IsWellFormedXml(string candidate) {
            try {
                XDocument.Parse(candidate);
                return true;
            }
            catch (System.Xml.XmlException) {
                // Not XML after all; the caller keeps scanning.
                return false;
            }
        }
    }
}