Question

以下代码是直接复制粘贴过来的：

    /// <summary>
    /// Lazily yields one <see cref="LRThread"/> for every thread row found on a board page.
    /// Rows without a title link, without an href, or whose href does not contain a
    /// parsable integer thread id are skipped.
    /// </summary>
    /// <param name="doc">Parsed HTML document of the board page.</param>
    /// <returns>
    /// Threads with Id, Title, Creator, Created and Deleted populated; empty when the
    /// page contains no matching rows.
    /// </returns>
    public static IEnumerable<LRThread> GetPageThreads(HtmlDocument doc)
    {
        var threadNodes =
              doc.DocumentNode
              .SelectNodes("//ul[@class='thread_list']/child::li[@class='row']");

        // HtmlAgilityPack's SelectNodes returns null (not an empty collection)
        // when the XPath matches nothing.
        if (threadNodes == null)
        {
            yield break;
        }

        foreach (var node in threadNodes)
        {
            // NOTE(review): if every row produces the same title/creator/date, check that
            // GetTitleLink, GetCreatorFromRow and GetDateTimeFromRow use XPath relative to
            // the row (".//..."); an expression starting with "//" searches the whole
            // document even when invoked on a child node. Those helpers are not visible here.
            HtmlNode titleLink = GetTitleLink(node);
            if (titleLink == null)
            {
                continue;
            }

            string href = titleLink.GetAttributeValue("href", null);
            if (href == null)
            {
                continue;
            }

            // Explicit checks instead of catching exceptions: a non-matching href used to
            // raise ArgumentOutOfRangeException, which the old catch clauses did not cover.
            var matches = ThreadIdUrlPart.Matches(href);
            if (matches.Count == 0)
            {
                continue;
            }

            int id;
            if (!Int32.TryParse(matches[0].Groups[1].Value, out id))
            {
                continue;
            }

            var thread = new LRThread()
            {
                Id = id,
                Title = titleLink.InnerText,
                Creator = GetCreatorFromRow(node),
                Created = GetDateTimeFromRow(node),
                Deleted = false
            };
            yield return thread;
        }
    }

我发现它只返回 threadNodes 中的第一项，并且返回的是该项的多个副本，而不是其余各项。我哪里做错了吗？

编辑：我是像下面这样使用该方法的：

    [TestMethod]
    [IntegrationTest]
    public void FirstPageScanAndSaveTest()
    {
        HtmlDocument doc = BoardScanner.GetBoardPage(0);
        Assert.IsNotNull(doc, "Couldn't get HTML document for first page.");
        var threads = BoardScanner.GetPageThreads(doc);
        Assert.IsTrue(threads.Any(), "Couldn't get any threads");

编辑：下面是完整的代码。现在发生的情况简直让人抓狂。

    /// <summary>
    /// Builds an <see cref="LRThread"/> from a single thread-row node.
    /// </summary>
    /// <param name="node">The row node describing one thread.</param>
    /// <returns>
    /// The thread with Id, Title, Creator, Created and Deleted set (Posts is left
    /// untouched), or <c>null</c> when the thread id cannot be determined.
    /// </returns>
    private static LRThread ParseLRThread(HtmlNode node)
    {
        // We expect to at least get the id of the thread, which is taken from the
        // href of the title link. Give up on this row (return null) if it is missing.
        HtmlNode titleLink = GetTitleLink(node);
        if (titleLink == null)
        {
            return null;
        }

        string href = titleLink.GetAttributeValue("href", null);
        if (href == null)
        {
            return null;
        }

        // Explicit checks instead of catching exceptions: a non-matching href used to
        // raise ArgumentOutOfRangeException, which the old catch clauses did not cover.
        var matches = ThreadIdUrlPart.Matches(href);
        if (matches.Count == 0)
        {
            return null;
        }

        int id;
        if (!Int32.TryParse(matches[0].Groups[1].Value, out id))
        {
            return null;
        }

        // Now that we've found the id, try to get all the other properties
        // of the thread besides Posts, but don't break if we can't find one of them.
        return new LRThread()
        {
            Id = id,
            Title = titleLink.InnerText,
            Creator = GetCreatorFromRow(node),
            Created = GetDateTimeFromRow(node),
            Deleted = false
        };
    }

    /// <summary>
    /// Parses the threads on a given page. This will likely need to be updated.
    /// IMPORTANT: The one field of each thread that is not set is Posts because we want
    /// the consumer of this class to handle the way that posts are retrieved.
    /// Rows that cannot be parsed (ParseLRThread returns null) are left out, and the
    /// result is materialized once so that repeated enumeration returns the same
    /// LRThread instances instead of re-parsing the document each time.
    /// </summary>
    /// <param name="doc">page html document</param>
    /// <returns>The parsed threads; empty when the page has no thread rows.</returns>
    public static IEnumerable<LRThread> GetPageThreads(HtmlDocument doc)
    {
        var rows =
            doc.DocumentNode
            .SelectNodes("//ul[@class='thread_list']/child::li[@class='row']");

        // HtmlAgilityPack's SelectNodes returns null (not an empty collection)
        // when the XPath matches nothing.
        if (rows == null)
        {
            return Enumerable.Empty<LRThread>();
        }

        return rows
            .Select(node => ParseLRThread(node))
            .Where(thread => thread != null)
            .ToList();
    }

我的测试是

    /// <summary>
    /// Integration test: scans the first board page, checks the parsed threads,
    /// loads the first page of posts for the first thread and saves everything.
    /// </summary>
    [TestMethod]
    [IntegrationTest]
    public void FirstPageScanAndSaveTest()
    {
        HtmlDocument doc = BoardScanner.GetBoardPage(0);
        Assert.IsNotNull(doc, "Couldn't get HTML document for first page.");

        // Materialize once. GetPageThreads may return a deferred sequence; enumerating it
        // repeatedly would re-parse the page and hand out fresh LRThread objects each time,
        // so the Posts assigned below would not be on the instances passed to the repository.
        var threads = BoardScanner.GetPageThreads(doc).ToList();
        Assert.IsTrue(threads.Any(), "Couldn't get any threads");
        CollectionAssert.AllItemsAreNotNull(threads.Select(t => t.Title).ToList(), "Couldn't parse at least one title");
        CollectionAssert.AllItemsAreNotNull(threads.Select(t => t.Creator).ToList(), "Couldn't parse at least one Creator");
        CollectionAssert.AllItemsAreNotNull(threads.Select(t => t.Created).ToList(), "Couldn't parse at least one date/time");
        CollectionAssert.AllItemsAreUnique(threads.Select(t => t.Id).ToList());

        var thread = threads.First();
        thread.Posts = BoardScanner.GetPostsFromThreadPage(thread, 0).ToList();
        Assert.IsTrue(thread.Posts.Any(), "Couldn't any posts from first page of thread");
        CollectionAssert.AllItemsAreNotNull(thread.Posts.Select(p => p.Poster).ToList(), "Couldn't get the poster for a post");
        CollectionAssert.AllItemsAreNotNull(thread.Posts.Select(p => p.BodyHTML).ToList(), "Couldn't get the html for the body of a post.");
        Repo.AddOrUpdateThreads(threads);
    }

Answer 1

你可以这样试试。但我自己没有执行它。

// Object initializer: the braces follow the constructor call and the
// statement ends with a semicolon.
var thread = new LRThread()
{
    Id = id,
    Title = titleLink.InnerText,
    Creator = GetCreatorFromRow(node),
    Created = GetDateTimeFromRow(node),
    Deleted = false
};

为什么我的方法只返回IEnumerable的第一个元素？

1 个答案: