我试图使用 Scrapy 解析一个网站,其页面层级如下:主页(项目列表)→ 各项目页 → 该项目的样本列表页 → 各样本页。
我希望提取的数据具有以下格式:
[{
"project":{"projectname":"project1"},
"samples":["sample1's_content","sample2's_content","sample3's_content"]
},
{
"project":{"projectname":"project2"},
"samples":["sample1's_content","samples2's_content","sample3's_content"]
}]
我试过了:
from item import Item
class Spider(scrapy.Spider):
    """Crawl every project and collect its samples into a single item.

    Callback chain: start_requests -> parseProjectList -> parseProject
    -> parseSampleListPage -> parseSample. The item created in
    parseProject travels along the chain through ``Request.meta``.

    NOTE: the callbacks below only mutate the shared item; none of them
    yields it, so as written no item is ever emitted by this spider.
    """

    # Scrapy identifies spiders by this string.
    name = "spider"

    def start_requests(self):
        """Request the main page and route the response to parseProjectList."""
        url = "the main page's url"
        yield scrapy.Request(url=url, callback=self.parseProjectList)

    def parseProjectList(self, response):
        """Schedule one request per project URL found on the main page."""
        # `Selector(Project_list)` is a placeholder for the real link extraction.
        for url in Selector(Project_list):
            yield scrapy.Request(url=url, callback=self.parseProject)

    def parseProject(self, response):
        """Create the item for one project and request its sample list page."""
        # scrape some data
        myItem = Item()
        myItem['samples'] = []
        # Placeholder: further project fields go into this dict as well.
        myItem['project'] = {'projectname': projectname}
        yield scrapy.Request(
            url=SampleListPage,
            callback=self.parseSampleListPage,
            meta={'myItem': myItem},
        )

    def parseSampleListPage(self, response):
        """Schedule one request per sample, forwarding the project's item."""
        # Forward the item object itself, not the literal string 'myItem'.
        myItem = response.meta['myItem']
        for url in Selector(Sample_list):
            yield scrapy.Request(
                url=url,
                callback=self.parseSample,
                meta={'myItem': myItem},
            )

    def parseSample(self, response):
        """Append one sample's data to the item's ``samples`` list."""
        # parse some sample data
        response.meta['myItem']['samples'].append(sample_data)
我尝试把 yield response.meta['myItem'] 放在 parseSampleListPage 的末尾:
def parseSampleListPage(self, response):
    """Schedule one request per sample, then yield the project's item.

    The item is yielded from this callback itself, right after the sample
    requests have been handed to Scrapy, not from the parseSample
    callbacks that fill ``samples``.
    """
    # Forward the item object itself, not the literal string 'myItem'.
    myItem = response.meta['myItem']
    for url in Selector(Sample_list):
        yield scrapy.Request(
            url=url,
            callback=self.parseSample,
            meta={'myItem': myItem},
        )
    yield myItem
我也尝试把 yield response.meta['myItem'] 放在 parseSample 的末尾:
def parseSample(self, response):
    """Append one sample's data to the shared item and yield the item.

    Because this runs once per sample page, the same item object is
    yielded once per sample.
    """
    # parse some sample data
    myItem = response.meta['myItem']
    # Append to the item's list field; the item itself has no append().
    myItem['samples'].append(sample_data)
    yield myItem
这两种做法都失败了。
第一种得到的 "samples" 字段为空;第二种会为同一个项目输出多条记录,每条记录包含的样本逐条增多,如下所示:
[ {
"project": {
"projectname": "project2"
},
"samples": [
"sample1's_content"
] }, {
"project": {
"projectname": "project2"
},
"samples": [
"sample1's_content",
"sample2's_content"
] }, {
"project": {
"projectname": "project2"
},
"samples": [
"sample1's_content",
"sample2's_content",
"sample3's_content"
] } ]
想知道有没有办法解决这个问题?