虽然有很多问题,但是大多数人都会因为“dont_filter”参数而遇到这个问题。我传递了这个参数“dont_filter = True”但是我的自定义解析生成器仍然没有用。这是我的代码(第三个解析器“parse_spec”从未被调用过,“parse_models_follow_next_page”在parse()调用时效果很好,但是当它需要转到下一页时它无法调用自身):
import scrapy
from gsmarena.items import PhoneItems
class VendorSpider(scrapy.Spider):
custom_settings = {
'DOWNLOAD_DELAY': 1.5,
'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A',
'COOKIES_ENABLED': False
}
name = "gsmarena_spec"
allowed_domains = ["https://www.gsmarena.com/"]
start_urls = [
"https://www.gsmarena.com/makers.php3"
]
def parse(self, response):
# print("Existing settings: %s" % self.settings.attributes.items())
length = len(response.xpath("//table//a").extract())
for i in range(1, length):
brand = response.xpath(
'(//table//a)[{}]/text()'.format(i)).extract()[0]
url = "https://www.gsmarena.com/" + \
response.xpath("(//table//a)[{}]/@href".format(i)).extract()[0]
yield scrapy.Request(url, callback=self.parse_models_follow_next_page, meta={'brand': brand}, dont_filter=True)
def parse_models_follow_next_page(self, response):
brand = response.meta.get('brand')
length = len(response.xpath(
"//div[@class='makers']/self::div//a").extract())
for i in range(1, length):
url = "https://www.gsmarena.com/" + \
response.xpath(
"(//div[@class='makers']/self::div//a)[{}]/@href".format(i)).extract()[0]
model = response.xpath(
"(//div[@class='makers']/self::div//a//span/text())[{}]".format(i)).extract()[0]
yield scrapy.Request(url, callback=self.parse_spec, meta={'brand': brand, 'model': model}, dont_filter=True)
is_next_page = response.xpath(
"//a[@class=\"pages-next\"]/@href").extract()
if is_next_page:
next_page = "https://www.gsmarena.com/" + is_next_page[0]
yield scrapy.Request(next_page, callback=self.parse_models_follow_next_page, meta={'brand': brand}, dont_filter=True)
def parse_spec(self, response):
item = PhoneItems()
item['model'] = response.meta.get('model')
item['brand'] = response.meta.get('brand')
for spec_name, spec in zip(response.xpath('//table//td[1]').extract(), response.xpath('//table//td[2]').extract()):
item[spec_name] = spec
yield item
抱歉我的英语不好。
答案 0 :(得分:0)
我在代码中做了一些更改,它会删除所有期望spec_name的结果,而这种结果并没有以理解的方式指定。
导入scrapy
来自lxml import html
来自tutorial.items的导入PhoneItems
类VendorSpider(scrapy.Spider):
custom_settings = {
'DOWNLOAD_DELAY': 1.5,
'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3)
AppleWebKit/537.75.14 (KHTML, '
'like Gecko) Version/7.0.3 Safari/7046A194A',
'COOKIES_ENABLED': False
}
name = "gsmarena_spec"
allowed_domains = ["https://www.gsmarena.com/"]
start_urls = [
"https://www.gsmarena.com/makers.php3"
]
def parse(self, response):
# print("Existing settings: %s" %
self.settings.attributes.items())
length = len(response.xpath("//table//a").extract())
for i in range(1, length):
brand = response.xpath(
'(//table//a)[{}]/text()'.format(i)).extract()[0]
url = "https://www.gsmarena.com/" + \
response.xpath("(//table//a)
[{}]/@href".format(i)).extract()[0]
yield scrapy.Request(url,
callback=self.parse_models_follow_next_page,
meta={'brand':brand},dont_filter=True)
def parse_models_follow_next_page(self, response):
brand = response.meta.get('brand')
meta = response.meta
doc = html.fromstring(response.body)
single_obj = doc.xpath('.//div[@class="makers"]/ul//li')
for obj in single_obj:
url = self.allowed_domains[0]+obj.xpath('.//a/@href')[0]
meta['brand'] = obj.xpath('.//a/@href')[0].split('_')[0]
meta['model'] = obj.xpath('.//a/@href')[0]
yield scrapy.Request(url=url, callback=self.parse_spec,
meta=meta, dont_filter=True)
is_next_page = response.xpath(
"//a[@class=\"pages-next\"]/@href").extract()
if is_next_page:
next_page = "https://www.gsmarena.com/" + is_next_page[0]
yield scrapy.Request(next_page,
callback=self.parse_models_follow_next_page,
meta={'brand': brand},dont_filter=True)
def parse_spec(self, response):
item = PhoneItems()
meta = response.meta
item['model'] = meta['model']
item['brand'] = meta['brand']
#Need to specify details about spec_name
# for spec_name, spec in
#zip(response.xpath('//table//td[1]').extract(),
# response.xpath('//table//td[2]').extract()):
# item[spec_name] = spec
yield item
答案 1 :(得分:0)
刮刀几乎没有问题。
allowed_domains = ["https://www.gsmarena.com/"]
应该是
allowed_domains = ["www.gsmarena.com"]
接下来,您没有在班级中定义errback_httpbin
方法
def errback_httpbin(self, response):
pass
代码
for spec_name, spec in zip(response.xpath('//table//td[1]').extract(), response.xpath('//table//td[2]').extract()):
应该是
for spec_name, spec in zip(response.xpath('//table//td[1]/text()').extract(), response.xpath('//table//td[2]/text()').extract()):
这虽然仍有一些问题。
此外,您的代码需要一些时间才能获得第一个收益,因为调度程序将根据进入的网址顺序选择网址