直接复制粘贴
public static IEnumerable<LRThread> GetPageThreads(HtmlDocument doc)
{
var threadNodes =
doc.DocumentNode
.SelectNodes("//ul[@class='thread_list']/child::li[@class='row']");
foreach(var node in threadNodes)
{
HtmlNode titleLink = GetTitleLink(node);
int id;
try
{
string str =
ThreadIdUrlPart
.Matches(titleLink.GetAttributeValue("href", null))[0]
.Groups[1]
.Value;
id = Int32.Parse(str);
}
catch(NullReferenceException) { continue; }
catch(FormatException) { continue; }
var thread = new LRThread()
{
Id = id,
Title = titleLink.InnerText,
Creator = GetCreatorFromRow(node),
Created = GetDateTimeFromRow(node),
Deleted = false
};
yield return thread;
}
}
我发现的是,它只返回threadNodes
中的第一个项目,并返回它的多个副本而不是返回其余项目。我做错了吗?
[TestMethod]
[IntegrationTest]
public void FirstPageScanAndSaveTest()
{
HtmlDocument doc = BoardScanner.GetBoardPage(0);
Assert.IsNotNull(doc, "Couldn't get HTML document for first page.");
var threads = BoardScanner.GetPageThreads(doc);
Assert.IsTrue(threads.Any(), "Couldn't get any threads");
编辑:完整代码转储。发生的事情绝对是疯了。
private static LRThread ParseLRThread(HtmlNode node)
{
// We expect to at least get the id of the thread. It is in the href of a
// <span> element class name 'post title'. Skip to next thread if we
// don't find it.
HtmlNode titleLink = GetTitleLink(node);
int id;
try
{
string str =
ThreadIdUrlPart
.Matches(titleLink.GetAttributeValue("href", null))[0]
.Groups[1]
.Value;
id = Int32.Parse(str);
}
catch (NullReferenceException) { return null; }
catch (FormatException) { return null; }
// Now that we've found the id, try to get all the other properties
// of the thread besides Posts, but don't break if we can't find one of them.
var thread = new LRThread()
{
Id = id,
Title = titleLink.InnerText,
Creator = GetCreatorFromRow(node),
Created = GetDateTimeFromRow(node),
Deleted = false
};
return thread;
}
/// <summary>
/// Iterates through the threads on a give page. This will likely need to be updated.
/// IMPORTANT: The one field of each thread that is not set is Posts because we want
/// the consumer of this class to handle the way that posts are retrieved.
/// </summary>
/// <param name="doc">page html document</param>
public static IEnumerable<LRThread> GetPageThreads(HtmlDocument doc)
{
return
doc.DocumentNode
.SelectNodes("//ul[@class='thread_list']/child::li[@class='row']")
.Select(node => ParseLRThread(node));
}
我的测试是
[TestMethod]
[IntegrationTest]
public void FirstPageScanAndSaveTest()
{
HtmlDocument doc = BoardScanner.GetBoardPage(0);
Assert.IsNotNull(doc, "Couldn't get HTML document for first page.");
var threads = BoardScanner.GetPageThreads(doc);
Assert.IsTrue(threads.Any(), "Couldn't get any threads");
CollectionAssert.AllItemsAreNotNull(threads.Select(t => t.Title).ToList(), "Couldn't parse at least one title");
CollectionAssert.AllItemsAreNotNull(threads.Select(t => t.Creator).ToList(), "Couldn't parse at least one Creator");
CollectionAssert.AllItemsAreNotNull(threads.Select(t => t.Created).ToList(), "Couldn't parse at least one date/time");
CollectionAssert.AllItemsAreUnique(threads.Select(t => t.Id).ToList());
var thread = threads.First();
thread.Posts = BoardScanner.GetPostsFromThreadPage(thread, 0).ToList();
Assert.IsTrue(thread.Posts.Any(), "Couldn't any posts from first page of thread");
CollectionAssert.AllItemsAreNotNull(thread.Posts.Select(p => p.Poster).ToList(), "Couldn't get the poster for a post");
CollectionAssert.AllItemsAreNotNull(thread.Posts.Select(p => p.BodyHTML).ToList(), "Couldn't get the html for the body of a post.");
Repo.AddOrUpdateThreads(threads);
}
答案 0 :(得分:0)
{
var thread = new LRThread()
Id = id,
Title = titleLink.InnerText,
Creator = GetCreatorFromRow(node),
Created = GetDateTimeFromRow(node),
Deleted = false
}