There is a website whose pages contain several elements I want to extract, and I don't know how to scrape them correctly with Scrapy.
Here is my code:
class IdSpider(CrawlSpider):
    name = 'id'
    allowed_domains = ['example.com']
    start_urls = ['example.com/page/1']
    deep_level = 1

    rules = (
        Rule(SgmlLinkExtractor(allow=r'.*'), callback='parse_item1', follow=True),
        Rule(SgmlLinkExtractor(allow=r'.*'), callback='parse_item2', follow=True),
    )

    def parse_item1(self, response):
        sel = Selector(response)
        i = IdeeItem()
        i['website'] = sel.xpath("/html/body/table/tr[1]/td[2]/div[4]/div/table/tr[4]/td/h4/text()").extract()
        i['title'] = sel.xpath("/html/body/table/tr[1]/td[2]/div[4]/div/table/tr[2]/td/h4/text()").extract()
        return i

    def parse_item2(self, response):
        sel = Selector(response)
        i = IdeeItem()
        i['website'] = sel.xpath("/html/body/table/tr[1]/td[2]/div[4]/div/table/tr[6]/td/h4/text()").extract()
        i['email'] = sel.xpath("//html/body/table/tr[1]/td[2]/div[4]/div/table/tr[8]/td/a[1]/text()").extract()
        return i
I expected pages to be parsed by both parse_item1 and parse_item2, but only parse_item1 ever runs. How can I get both of them to run?
Answer 0 (score: 0)
CrawlSpider only applies the first rule that matches a link. From the documentation:

    If multiple rules match the same link, the first one will be used, according to the order in which they are defined in this attribute.
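A side note before the options: the snippets below reuse your IdeeItem class, whose definition isn't shown in the question. A rough sketch of what it presumably looks like, based on the fields your callbacks fill in (adjust to your real item):

from scrapy.item import Item, Field

class IdeeItem(Item):
    # Fields assigned by the callbacks in the question
    website = Field()
    title = Field()
    email = Field()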
You have a couple of options.

Use a single callback that yields both items:
rules = (
    Rule(SgmlLinkExtractor(allow=r'.*'), callback='parse_items', follow=True),
)

def parse_items(self, response):
    sel = Selector(response)

    # First item: website + title
    i = IdeeItem()
    i['website'] = sel.xpath("/html/body/table/tr[1]/td[2]/div[4]/div/table/tr[4]/td/h4/text()").extract()
    i['title'] = sel.xpath("/html/body/table/tr[1]/td[2]/div[4]/div/table/tr[2]/td/h4/text()").extract()
    yield i

    # Second item: website + email
    i = IdeeItem()
    i['website'] = sel.xpath("/html/body/table/tr[1]/td[2]/div[4]/div/table/tr[6]/td/h4/text()").extract()
    i['email'] = sel.xpath("//html/body/table/tr[1]/td[2]/div[4]/div/table/tr[8]/td/a[1]/text()").extract()
    yield i
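Because parse_items is now a generator, Scrapy collects every item it yields for the same response, so both extraction blocks run on every page the rule matches.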
Or keep your two callbacks and call them one after the other from a single entry point:
rules = (
    Rule(SgmlLinkExtractor(allow=r'.*'), callback='parse_items', follow=True),
)

def parse_items(self, response):
    # Build the Selector once and share it with both callbacks
    sel = Selector(response)
    for r in self.parse_item1(response, sel):
        yield r
    for r in self.parse_item2(response, sel):
        yield r

def parse_item1(self, response, sel=None):
    if sel is None:
        sel = Selector(response)
    i = IdeeItem()
    i['website'] = sel.xpath("/html/body/table/tr[1]/td[2]/div[4]/div/table/tr[4]/td/h4/text()").extract()
    i['title'] = sel.xpath("/html/body/table/tr[1]/td[2]/div[4]/div/table/tr[2]/td/h4/text()").extract()
    # yield (rather than return) so parse_items can iterate over the items produced
    yield i

def parse_item2(self, response, sel=None):
    if sel is None:
        sel = Selector(response)
    i = IdeeItem()
    i['website'] = sel.xpath("/html/body/table/tr[1]/td[2]/div[4]/div/table/tr[6]/td/h4/text()").extract()
    i['email'] = sel.xpath("//html/body/table/tr[1]/td[2]/div[4]/div/table/tr[8]/td/a[1]/text()").extract()
    yield i
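With either version, running the spider as usual (for example scrapy crawl id -o items.json) should now produce two items per matched page: one with website/title and one with website/email.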