我有以下HTML结构,我试图从中提取第一个h2和第一段:
/**
 * Builds a plain-text snippet from the first <h2> and the first <p> of an HTML string.
 *
 * The full text of each element is used (including text nested inside child
 * elements such as links), heading first, then paragraph, concatenated without
 * any separator. An element that is absent from the markup contributes nothing.
 *
 * @param string $html HTML markup to scan.
 * @return string Heading text followed by paragraph text; '' when neither is found
 *                or when $html is empty.
 */
function getSnippet ($html) {
    // DOMDocument::loadHTML() does not accept an empty source string.
    if ($html === null || $html === '') {
        return '';
    }

    $doc = new DOMDocument();
    $doc->loadHTML($html);

    $snippet = '';
    foreach (array('h2', 'p') as $tagName) {
        $node = $doc->getElementsByTagName($tagName)->item(0);
        if ($node !== null) {
            // textContent covers the element and all of its descendants, whereas
            // firstChild->nodeValue would stop at the first child node (e.g. only
            // the text of a leading <a> inside the paragraph).
            $snippet .= $node->textContent;
        }
    }

    return $snippet;
}
这是我使用 DOMDocument 编写的代码。它可以正确提取上面 HTML 中的标题,但对段落不起作用——只取到了链接的文本。我想这也说得通,因为链接很可能被创建成了一个单独的子节点。我该如何修改上面的函数,才能获取第一段的全部文本?
/// <summary>
/// Checks that Pair.Process replaces only the template entries whose keys are
/// present in the supplied values and leaves every other entry at its default.
/// </summary>
[TestMethod]
public void Test()
{
    var template = "A:<<Default>>;B:<<Default>>;C:<<Default>>;D:<<Default>>;E:<<Default>>";

    // Assert.AreEqual takes (expected, actual); keeping that order makes a
    // failure message report the two values the right way round.
    Assert.AreEqual(
        "A:aaa;B:<<Default>>;C:<<Default>>;D:<<Default>>;E:eee",
        Pair.Process("A:aaa;E:eee", template));

    Assert.AreEqual(
        "A:<<Default>>;B:<<Default>>;C:<<Default>>;D:ddd;E:<<Default>>",
        Pair.Process("D:ddd", template));

    Assert.AreEqual(
        "A:<<Default>>;B:bbb;C:<<Default>>;D:<<Default>>;E:eee",
        Pair.Process("B:bbb;E:eee", template));
}
/// <summary>
/// A key/value entry that can be parsed from and rendered to the textual form
/// <c>Key:Value</c>, plus helpers for working with several entries joined
/// together as <c>Key:Value;Key:Value;...</c>.
/// </summary>
public class Pair
{
    /// <summary>Character placed between a key and its value within one entry.</summary>
    /// <remarks>The spelling of the identifier is preserved because it is a public member.</remarks>
    public static char InnerSeperator = ':';

    /// <summary>Character placed between consecutive entries in a joined string.</summary>
    /// <remarks>The spelling of the identifier is preserved because it is a public member.</remarks>
    public static char OuterSeperator = ';';

    /// <summary>Value assigned to an entry whose text contains no <see cref="InnerSeperator"/>.</summary>
    public static string DefaultValue = "<<Default>>";

    /// <summary>
    /// Parses a single entry. Everything before the first <see cref="InnerSeperator"/>
    /// becomes <see cref="Key"/>; everything after it becomes <see cref="Value"/>.
    /// When the separator is absent, <see cref="Value"/> is <see cref="DefaultValue"/>.
    /// </summary>
    /// <param name="asString">Entry text, e.g. <c>A:aaa</c>.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="asString"/> is null.</exception>
    public Pair(string asString)
    {
        if (asString == null)
        {
            throw new System.ArgumentNullException("asString");
        }

        // Split into at most two parts so a value that itself contains the
        // separator (e.g. "Url:http://host") is kept whole instead of truncated.
        var parts = asString.Split(new[] { InnerSeperator }, 2);
        Key = parts[0];
        Value = parts.Length > 1 ? parts[1] : DefaultValue;
    }

    /// <summary>Text before the first inner separator.</summary>
    public string Key { get; set; }

    /// <summary>Text after the first inner separator, or the default value.</summary>
    public string Value { get; set; }

    /// <summary>Renders the entry as key, <see cref="InnerSeperator"/>, value.</summary>
    /// <returns>The entry in <c>Key:Value</c> form.</returns>
    public override string ToString()
    {
        return Key + InnerSeperator + Value;
    }

    /// <summary>
    /// Renders each entry with <see cref="ToString"/> and joins the results with
    /// <see cref="OuterSeperator"/>.
    /// </summary>
    /// <param name="pairs">Entries to render, in output order.</param>
    /// <returns>The joined string.</returns>
    public static string ToStringJoined(IEnumerable<Pair> pairs)
    {
        return string.Join(OuterSeperator.ToString(), pairs.Select(pair => pair.ToString()));
    }

    /// <summary>
    /// Splits a joined string on <see cref="OuterSeperator"/>, trims each piece,
    /// drops blank pieces and parses the rest into entries.
    /// </summary>
    /// <param name="joined">Joined entries; null is treated as containing no entries.</param>
    /// <returns>A lazily evaluated sequence of the parsed entries, in input order.</returns>
    public static IEnumerable<Pair> FromJoinedString(string joined)
    {
        if (joined == null)
        {
            return Enumerable.Empty<Pair>();
        }

        return joined.Split(OuterSeperator)
            .Select(piece => piece.Trim())
            .Where(piece => !string.IsNullOrWhiteSpace(piece))
            .Select(piece => new Pair(piece));
    }

    /// <summary>
    /// Produces the template with each entry replaced by the first entry in
    /// <paramref name="values"/> that has the same key; template entries with no
    /// match are emitted unchanged. Entries in <paramref name="values"/> whose key
    /// does not occur in the template are ignored.
    /// </summary>
    /// <param name="values">Joined override entries, e.g. <c>A:aaa;E:eee</c>.</param>
    /// <param name="template">Joined template entries that define the output keys and order.</param>
    /// <returns>The resulting entries as a joined string.</returns>
    public static string Process(string values, string template)
    {
        // Materialise once: the parsed sequence is lazy, and looking it up per
        // template entry would otherwise re-split and re-parse the input each time.
        var valueItems = FromJoinedString(values).ToList();
        var resultItems = FromJoinedString(template)
            .Select(t => valueItems.FirstOrDefault(v => v.Key == t.Key) ?? t);
        return ToStringJoined(resultItems);
    }
}