Scrapy:如何将一个主题帖(thread)中的所有帖子收集到一起?

时间:2014-03-14 15:05:51

标签: web-crawler scrapy

我是 Scrapy 的新手,这是我最近遇到的一个问题。我的目标是抓取论坛中的主题帖(thread),并收集每个主题帖中的所有帖子(post)。下面是代码,写得非常粗糙 :)

class forumSpider(CrawlSpider):
    """Crawl a forum board and build one ThreadItem per thread, containing
    the posts found on the thread page that was crawled.
    """
    name = 'forumSpider'
    # BUG FIX: Scrapy's off-site middleware reads `allowed_domains`; the
    # original `allow_domain` attribute was silently ignored.
    allowed_domains = ['letsebuy.com']
    start_urls = ['http://www.letsebuy.com/forum-26-1.html',]

    rules = [
        # Follow links that look like thread pages and parse each one.
        Rule(SgmlLinkExtractor(allow = (r'thread-\d*-\d-\d')), callback = 'parse_thread'),
    ]

    def parse_thread(self, response):
        """Return a ThreadItem for this thread page, with the posts on the
        page collected into thread['posts']."""
        sel = Selector(response)
        thread = ThreadItem()
        thread['title'] = sel.xpath('//span[@id = "thread_subject"]/text()').extract()
        thread['url'] = response.url
        # get_posts() is a generator; materialize it into the item's list.
        thread['posts'] = list(self.get_posts(response))
        return thread

    # Parse all posts in a thread page
    def get_posts(self, response):
        """Yield a PostItem for every post container found on the page."""
        sel = Selector(response)
        posts = sel.xpath('//div[re:test(@id, "post_\d+")]')

        for r in posts:
            post = PostItem()
            post['author'] = r.xpath('.//div[@class = "authi"]//a[1]//text()').extract()
            post['content'] = str().join(r.xpath('.//td[re:test(@id, "postmessage_\d+$")]/text()').extract())
            yield post

该代码对只有一页的主题帖有效。但是,一些包含大量帖子的主题帖有多个分页。我可以收集到某个主题帖所有分页的网址。有没有办法对所有这些分页发出 request,用 get_posts 解析它们,并把得到的 PostItem 填入 thread['posts']?

希望有人可以帮助我!提前谢谢。

1 个答案:

答案 0 :(得分:0)

您可以在 parse_thread 函数中创建新的请求,并通过 meta 把该 thread 项传递下去:
 def parse_thread(self, response):
    """Start a ThreadItem from the first page of a thread.

    Posts on this page go straight into thread['posts'].  When the thread
    has further pages, the partially-filled item travels to
    get_more_posts() inside the request meta; otherwise it is complete and
    emitted immediately.
    """
    selector = Selector(response)
    thread = ThreadItem()
    thread['title'] = selector.xpath('//span[@id = "thread_subject"]/text()').extract()
    thread['url'] = response.url
    thread['posts'] = list(self.get_posts(response))
    remaining = selector.xpath('//div[@class = "pgs mtm mbm cl"]/div[@class = "pg"]/a[re:test(text(),"\d+")]/@href').extract()
    if not remaining:
        # Single-page thread: nothing left to fetch.
        yield thread
        return
    # Follow one page now; the rest of the URLs ride along in meta.
    yield Request(remaining.pop(), meta={'thread': thread, 'next_pages': remaining}, callback=self.get_more_posts)

 def get_more_posts(self, response):
    """Continuation callback for the later pages of a thread.

    Pulls the partially-built ThreadItem out of response.meta, appends the
    posts found on this page, then either follows the next remaining page
    or emits the finished item.
    """
    thread = response.meta['thread']
    thread['posts'].extend(self.get_posts(response))
    # BUG FIX: the original read `item['next_pages']`, but no `item` exists
    # in this scope -- the remaining page URLs travel in the request meta.
    next_pages = response.meta['next_pages']
    if next_pages:
        url = next_pages.pop()
        yield Request(url, meta={'thread' : thread, 'next_pages' : next_pages}, callback=self.get_more_posts)
    else:
        yield thread

更干净的解决方案是在每个 PostItem 中存储所属主题帖的 ID。以某个主题帖为例,您在 thread 中存储 thread['id'] = 1419553,并在所有相应的帖子中存储 post['thread_id'] = 1419553。这样,每当看到主题帖的第一页时调用 parse_thread;对于后续分页,则调用 get_posts。代码如下所示:

class forumSpider(CrawlSpider):
    """Variant that tags every PostItem with its thread id so that posts and
    threads can be joined downstream instead of being nested in one item.
    """
    name = 'forumSpider'
    # BUG FIX: Scrapy reads `allowed_domains`; `allow_domain` is ignored.
    allowed_domains = ['letsebuy.com']
    start_urls = ['http://www.letsebuy.com/forum-26-1.html',]

    rules = [
        # First page of a thread: emit the ThreadItem plus its posts.
        Rule(SgmlLinkExtractor(allow = ('thread-\d+-1-\d')),callback = 'parse_thread'),
        # Any later page of a thread: only the posts are needed.
        Rule(SgmlLinkExtractor(allow = ('thread-\d+-([2-9]|\d\d+)-\d')),callback = 'get_posts'),
    ]

    def parse_thread(self, response):
        """Emit every post on the first page of a thread, then the thread."""
        sel = Selector(response)
        thread = ThreadItem()
        # BUG FIX: the original `yield self.get_posts(response)` handed
        # Scrapy a generator object rather than the PostItems inside it;
        # iterate and re-yield each item instead.
        for post in self.get_posts(response):
            yield post
        thread['title'] = sel.xpath('//span[@id = "thread_subject"]/text()').extract()
        thread['url'] = response.url
        # Thread id comes from the URL, e.g. "thread-1419553-1-1.html".
        thread['id'] = re.search(r'thread-(\d+)', response.url).group(1)
        yield thread

    # Parse all posts in a thread page
    def get_posts(self, response):
        """Yield a PostItem, tagged with its thread id, for every post."""
        sel = Selector(response)
        posts = sel.xpath('//div[re:test(@id, "post_\d+")]')
        thread_id = re.search(r'thread-(\d+)', response.url).group(1)
        for r in posts:
            post = PostItem()
            post['thread_id'] = thread_id
            post['author'] = r.xpath('.//div[@class = "authi"]//a[1]//text()').extract()
            post['content'] = str().join(r.xpath('.//td[re:test(@id, "postmessage_\d+$")]/text()').extract())
            yield post