我想抓取township directory of China。该网站分为4个级别,分别为省页面,城市页面,县页面和乡镇页面。例如,在省页面上,列出了所有省份。如果我们点击一个省的链接,则会将我们带到城市页面,并显示该省的城市列表。
我希望我的每个项目都是一个乡镇。它包括town_name,town_id(gbcode)和相应的county_name,city_name,prov_name。所以当蜘蛛进入乡镇页面时,它应该沿途收集信息。但是,我目前使用for循环的方法似乎不起作用。 prov_name没有问题。但是城市和县名大多不正确,它们始终是相应页面列表中的最后一个城市/县。我认为问题是蜘蛛不够深入,只能在循环结束时转到parse_county请求。但是,在设置中更改深度优先级并不能解决问题。
---------- Sample Result --------
town_name, year, gbcode, city, province, county
建国门街道办事处,2016,110101008000,市辖区,北京市,延庆区
东直门街道办事处,2016,110101009000,市辖区,北京市,延庆区
和平里街道办事处,2016,110101010000,市辖区,北京市,延庆区
前门街道办事处,2016,110101011000,市辖区,北京市,延庆区
崇文门外街道办事处,2016,110101012000,市辖区,北京市,延庆区
import scrapy
import re
from scrapy.spiders import Spider
from admincode.items import AdmincodeItem
class StatsSpider(Spider):
name = 'stats'
allowed_domains = ['stats.gov.cn']
start_urls = [
'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/{}/index.html'.format(year) for year in range(2009, 2010)]
def parse(self, response):
for item in self.parse_provincetr(response, response.selector.css(".provincetr")):
yield item
def get_text_href(self, td):
if not td.xpath('a'):
return td.xpath('text()').extract()[0], None
else:
return td.xpath('a/text()').extract()[0], td.xpath('a/@href').extract()[0]
def parse_provincetr(self, response, trs):
year_pattern = re.compile('(tjyqhdmhcxhfdm/)([0-9][0-9][0-9][0-9])')
year = year_pattern.search(response.url).group(2)
for td in trs.xpath('td'):
scraped = {}
scraped['year'] = year
scraped['prov_name'], href = self.get_text_href(td)
url = response.urljoin(href)
yield scrapy.Request(url, callback=self.parse_citytr,
meta={'scraped': scraped})
def parse_2td(self, response, trs, var_name, nextparse):
for tr in trs:
scraped = response.meta['scraped']
scraped[var_name], href = self.get_text_href(tr.xpath('td')[1])
if nextparse:
url = response.urljoin(href)
yield scrapy.Request(url, callback=nextparse, meta={'scraped': scraped})
else:
item = AdmincodeItem()
item['year'] = scraped['year']
item['prov_name'] = scraped['prov_name']
item['city_name'] = scraped['city_name']
item['county_name'] = scraped['county_name']
item['town_name'] = scraped['town_name']
item['gbcode'], href = self.get_text_href(
tr.xpath('td')[0])
yield item
def parse_citytr(self, response):
for city in self.parse_2td(response, response.selector.css(".citytr"), 'city_name', self.parse_countytr):
yield city
def parse_countytr(self, response):
for county in self.parse_2td(response, response.selector.css(".countytr"), 'county_name', self.parse_towntr):
yield county
def parse_towntr(self, response):
for town in self.parse_2td(response, response.selector.css(".towntr"), 'town_name', None):
yield town
答案 0 :(得分:0)
我认为你让事情变得有点复杂。这是一个简单的刮刀,您需要做的是使用meta将信息从一个页面传递到另一个页面。由于meta是内存中的字典,我们需要确保为后续项目创建信息的副本。为此,我们使用copy.deepcopy
。这将确保在产生项目之前不会覆盖数据
下面是执行该操作的刮刀
class StatsSpider(Spider):
name = 'stats'
allowed_domains = ['stats.gov.cn']
start_urls = [
'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/{}/index.html'.format(year) for year in range(2009, 2010)]
def parse(self, response):
for item in response.css(".provincetr a"):
name = item.xpath("./text()").extract_first().strip()
link = item.xpath("./@href").extract_first().strip()
yield response.follow(link, callback=self.parse_province, meta={'item':{'province':name}})
def parse_province(self, response):
meta = response.meta['item']
for cityrow in response.css(".citytr"):
city_link = cityrow.xpath("./td[2]/a/@href").extract_first()
city_name = cityrow.xpath("./td[2]/a/text()").extract_first()
city_code = cityrow.xpath("./td[1]/a/text()").extract_first()
meta_new = deepcopy(meta)
meta_new['city_name'] = city_name
meta_new['city_code'] = city_code
yield response.follow(city_link, callback=self.parse_city, meta = {'item':meta_new})
def parse_city(self, response):
meta = response.meta['item']
for countyrow in response.css(".countytr"):
county_link = countyrow.xpath("./td[2]/a/@href").extract_first()
county_name = countyrow.xpath("./td[2]/a/text()").extract_first()
county_code = countyrow.xpath("./td[1]/a/text()").extract_first()
meta_new = deepcopy(meta)
meta_new['county_name'] = county_name
meta_new['county_code'] = county_code
yield response.follow(county_link, callback=self.parse_county, meta = {"item": meta_new})
def parse_county(self, response):
meta = response.meta['item']
for townrow in response.css(".towntr"):
town_link = townrow.xpath("./td[2]/a/@href").extract_first()
town_name = townrow.xpath("./td[2]/a/text()").extract_first()
town_code = townrow.xpath("./td[1]/a/text()").extract_first()
meta_new = deepcopy(meta)
meta_new['town_name'] = town_name
meta_new['town_code'] = town_code
yield meta_new