I am new to web scraping.
What I basically want to do is scrape the following website: https://www.admin.ch/opc/fr/classified-compilation/national.html, which lists all the Swiss federal laws. The site is structured as a tree (you start from the top-level categories: Etat - Peuple - Autorités, Défense nationale, and so on; each of these contains subcategories, and so on down to the leaves, which are the texts of the laws), and I would like to end up with the same structure in a JSON file.
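To make that concrete, this is roughly the shape of the nested JSON I am aiming for (the numbers and names below are only an illustration of the structure, not real output):

{
  "1 Etat - Peuple - Autorités": {
    "10 <some subcategory>": {
      "101 <some law>": "leaf: the text of the law"
    }
  }
}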
Here is the spider I have written so far.
The helper functions only check regular expressions, so they are not really important for this question (they do work as expected).
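In case it helps, they boil down to something like this (simplified sketches; the real implementations in admin_ch.helpers.helper_functions use slightly longer patterns, but the exact regexes are not the point):

import re

def is_int(s):
    # digits only, e.g. '1' or '10'
    return bool(re.match(r'^\d+$', s))

def is_article_number(s):
    # dotted classification numbers such as '101' or '0.142.112.681'
    return bool(re.match(r'^\d+(\.\d+)*$', s))

def is_final_article(url):
    # heuristic: does the URL point to an actual law text page rather than another category listing?
    return bool(re.search(r'/classified-compilation/\d+/index\.html$', url))

And here is the spider itself: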
import scrapy
from scrapy.spiders import XMLFeedSpider

from admin_ch.helpers.helper_functions import is_article_number, is_int, is_final_article


class Spider(XMLFeedSpider):
    name = "spider_droit_interne"
    allowed_domains = ["admin.ch"]
    start_urls = [
        'https://www.admin.ch/opc/fr/classified-compilation/national.html'
    ]
    itertag = 'item'

    def parse(self, response):
        for category in response.xpath('/html/body/div/div[2]/div/div[2]/table/tbody/tr'):
            cat_number = category.xpath('td/text()').extract_first()
            cat_name = cat_number + ' ' + category.xpath('td/a/text()').extract_first()
            url = category.xpath('td/a/@href').extract_first()
            url = response.urljoin(url)
            yield scrapy.Request(url, callback=self.parse_category, meta={'name': cat_name})
            return  # Just do the 1st category as an example

    def parse_category(self, response):
        name = response.meta['name']
        tree = {}
        node = {}
        for category in response.xpath('/html/body/div/div[2]/div/div[2]/table/tbody/tr'):
            cat_number = category.xpath('td/text()').extract_first()
            cat_name = cat_number + ' ' + category.xpath('td/a/text()').extract_first()
            if is_int(cat_number) and len(cat_number) < 3:
                url = category.xpath('td/a/@href').extract_first()
                url = response.urljoin(url)
                if is_final_article(url):
                    node[cat_name] = 'leaf reached!!!!!'
                    yield node
                else:
                    # node[cat_name] = 'more nodes'
                    yield scrapy.Request(url, callback=self.get_article, meta={'name': cat_name})
        tree[name] = node
        yield tree

    def get_article(self, response):
        name = response.meta['name']
        tree = {}
        node = {}
        for category in response.xpath('/html/body/div/div[2]/div/div[2]/table/tbody/tr'):
            cat_number = category.xpath('td/text()').extract()[1]
            for element in [' ', '\n', '\r']:
                cat_number = cat_number.replace(element, '')
            if is_article_number(cat_number):
                url = category.xpath('td/a/@href').extract_first()
                url = response.urljoin(url)
                if is_final_article(url):
                    node[cat_number] = 'leaf reached!!!!!'
                else:
                    node[cat_number] = cat_number
                    # yield scrapy.Request(url, callback=self.get_article, meta={'tree': item})
                    # The content is not important for the moment, only the structure
        tree[name] = node
        yield tree
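For what it is worth, I run the spider with the standard feed export, something like scrapy crawl spider_droit_interne -o output.json, and then inspect the generated JSON file.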
The result I get from this is completely wrong. Can you help me fix it?