我是Scrapy的新手,这是我最近遇到的一个问题。我的目标是抓取论坛中的主题帖,以及每个主题帖中的所有回复。这是代码,它还很粗糙:)
class forumSpider(CrawlSpider):
    """Crawl letsebuy.com forum listing pages and scrape each thread's posts."""

    name = 'forumSpider'
    # BUG FIX: Scrapy reads the class attribute `allowed_domains`;
    # `allow_domain` is silently ignored, so no off-site filtering happened.
    allowed_domains = ['letsebuy.com']
    start_urls = ['http://www.letsebuy.com/forum-26-1.html']
    rules = [
        Rule(SgmlLinkExtractor(allow=('thread-\d*-\d-\d',)),
             callback='parse_thread'),
    ]

    def parse_thread(self, response):
        """Build a ThreadItem for one thread page, holding every post on it."""
        sel = Selector(response)
        thread = ThreadItem()
        thread['title'] = sel.xpath('//span[@id = "thread_subject"]/text()').extract()
        thread['url'] = response.url
        # Materialize the generator directly instead of list() + extend().
        thread['posts'] = list(self.get_posts(response))
        return thread

    # Parse all posts in a thread page
    def get_posts(self, response):
        """Yield one PostItem per post <div> found on the page."""
        sel = Selector(response)
        posts = sel.xpath('//div[re:test(@id, "post_\d+")]')
        for r in posts:
            post = PostItem()
            post['author'] = r.xpath('.//div[@class = "authi"]//a[1]//text()').extract()
            post['content'] = str().join(r.xpath('.//td[re:test(@id, "postmessage_\d+$")]/text()').extract())
            yield post
该代码在单页的主题帖上工作正常。但是,一些回复很多的主题帖会分成多个页面。我可以收集某个主题帖所有分页的网址。有没有办法request所有这些分页,用get_posts解析它们,并把得到的PostItem填入thread['posts']?
希望有人可以帮助我!提前谢谢。
答案 0 :(得分:0)
您可以在parse_thread
函数中创建新请求,并使用meta:
def parse_thread(self, response):
    """Scrape the first page of a thread and chase pagination via meta.

    Yields either the finished ThreadItem (single-page thread) or a
    Request for the next page carrying the partially-filled item.
    """
    selector = Selector(response)
    thread = ThreadItem()
    thread['title'] = selector.xpath('//span[@id = "thread_subject"]/text()').extract()
    thread['url'] = response.url
    thread['posts'] = list()
    thread['posts'].extend(self.get_posts(response))
    remaining = selector.xpath('//div[@class = "pgs mtm mbm cl"]/div[@class = "pg"]/a[re:test(text(),"\d+")]/@href').extract()
    if not remaining:
        # No further pages: the item is complete.
        yield thread
    else:
        # Hand the accumulating item and the leftover page URLs to the
        # follow-up callback through the request's meta dict.
        next_url = remaining.pop()
        yield Request(next_url,
                      meta={'thread': thread, 'next_pages': remaining},
                      callback=self.get_more_posts)
def get_more_posts(self, response):
    """Append this page's posts to the ThreadItem carried in response.meta.

    Keeps requesting the remaining pagination URLs until none are left,
    then yields the completed ThreadItem.
    """
    thread = response.meta['thread']
    thread['posts'].extend(self.get_posts(response))
    # BUG FIX: the original read `item['next_pages']`, but `item` is never
    # defined in this scope (NameError); the list travels in response.meta.
    next_pages = response.meta['next_pages']
    if next_pages:
        url = next_pages.pop()
        yield Request(url,
                      meta={'thread': thread, 'next_pages': next_pages},
                      callback=self.get_more_posts)
    else:
        yield thread
更干净的解决方案是在每个PostItem
中存储所属主题帖的ID。以this thread为例,你在线程条目中存储thread['id'] = 1419553
,并在它的所有帖子中存储post['thread_id'] = 1419553
。这样,每当看到主题帖的第一页时,就调用parse_thread
;对于后续分页,则直接调用get_posts
。代码如下所示:
class forumSpider(CrawlSpider):
    """Scrape threads and posts, linking posts to threads via a thread id."""

    name = 'forumSpider'
    # BUG FIX: Scrapy reads `allowed_domains`; `allow_domain` is ignored.
    allowed_domains = ['letsebuy.com']
    start_urls = ['http://www.letsebuy.com/forum-26-1.html']
    rules = [
        # First page of a thread -> also build the ThreadItem.
        Rule(SgmlLinkExtractor(allow=('thread-\d+-1-\d',)),
             callback='parse_thread'),
        # Pages 2 and up -> posts only.
        Rule(SgmlLinkExtractor(allow=('thread-\d+-([2-9]|\d\d+)-\d',)),
             callback='get_posts'),
    ]

    def parse_thread(self, response):
        """First page of a thread: yield every PostItem, then the ThreadItem."""
        sel = Selector(response)
        thread = ThreadItem()
        # BUG FIX: `yield self.get_posts(response)` yielded the generator
        # object itself, which Scrapy rejects as an output; re-yield each
        # item from it instead.
        for post in self.get_posts(response):
            yield post
        thread['title'] = sel.xpath('//span[@id = "thread_subject"]/text()').extract()
        thread['url'] = response.url
        # Thread id extracted from URLs like .../thread-1419553-1-1.html
        thread['id'] = re.search(r'thread-(\d+)', response.url).group(1)
        yield thread

    # Parse all posts in a thread page
    def get_posts(self, response):
        """Yield a PostItem (tagged with its thread id) for each post <div>."""
        sel = Selector(response)
        posts = sel.xpath('//div[re:test(@id, "post_\d+")]')
        thread_id = re.search(r'thread-(\d+)', response.url).group(1)
        for r in posts:
            post = PostItem()
            post['thread_id'] = thread_id
            post['author'] = r.xpath('.//div[@class = "authi"]//a[1]//text()').extract()
            post['content'] = str().join(r.xpath('.//td[re:test(@id, "postmessage_\d+$")]/text()').extract())
            yield post