我如何让 Scrapy 将所有抓取到的项目(item)分成两个列表?例如,假设我有两种主要类型的项目——article 和 author,我希望将它们分别放在两个单独的列表中。现在我得到的输出 JSON 是:
[
{
"article_title":"foo",
"article_published":"1.1.1972",
"author": "John Doe"
},
{
"name": "John Doe",
"age": 42,
"email": "foo@example.com"
}
]
如何将其转换为类似下面的结构?
{
"articles": [
{
"article_title": "foo",
"article_published": "1.1.1972",
"author": "John Doe"
}
],
"authors": [
{
"name": "John Doe",
"age": 42,
"email": "foo@example.com"
}
]
}
我用来输出这些项目的函数很简单,类似于:
def parse_author(self, response):
    """Yield a single-key record carrying the author name found in *response*.

    The name is taken from the first result of the CSS query
    ``div.author-info a::text`` and is also echoed to stdout before the
    record is yielded.
    """
    matched_text = response.css('div.author-info a::text')
    author_name = matched_text.extract_first()
    progress_line = "Parsing author: {}".format(author_name)
    print(progress_line)
    record = {'author_name': author_name}
    yield record
答案 0 :(得分:3)
各个项目会分别到达管道,然后可以在管道中按类型对每个项目做相应的处理:
items.py
class Article(scrapy.Item):
    """Scrapy item describing one article; yielded separately from Author."""

    # Headline text of the article.
    title = scrapy.Field()
    # Publication date as scraped from the page.
    published = scrapy.Field()
    # Author reference for the article.
    author = scrapy.Field()
class Author(scrapy.Item):
    """Scrapy item describing one author.

    Declares every key that appears in the author records of the sample
    output (name, age, email). Scrapy items only accept declared fields, so
    leaving ``email`` out would make ``author['email'] = ...`` raise
    ``KeyError``.
    """

    # Display name of the author.
    name = scrapy.Field()
    # Age of the author.
    age = scrapy.Field()
    # Contact e-mail address of the author.
    email = scrapy.Field()
spider.py
def parse(self, response):
    """Yield an Author item followed by an Article item built from *response*.

    Each item is populated from the first result of its CSS query and
    announced on stdout just before it is yielded.
    """
    # Author part: scrape the name, wrap it in an item, report, emit.
    author_name = response.css('div.author-info a::text').extract_first()
    author = items.Author(name=author_name)
    print("Parsing author: {}".format(author['name']))
    yield author

    # Article part, same steps.
    # NOTE(review): 'article css' looks like a placeholder selector - confirm.
    article_title = response.css('article css').extract_first()
    article = items.Article(title=article_title)
    print("Parsing article: {}".format(article['title']))
    yield article
pipelines.py
def process_item(self, item, spider):
    """Route a scraped item to type-specific handling and pass it on.

    Args:
        item: The item yielded by the spider; expected to be an
            ``items.Author`` or an ``items.Article``.
        spider: The spider that produced the item (unused here).

    Returns:
        The same item, so that later pipeline stages and the feed exporter
        still receive it.
    """
    if isinstance(item, items.Author):
        # Do something to authors
        pass
    elif isinstance(item, items.Article):
        # Do something to articles
        pass
    # A pipeline stage has to hand the item back (or raise DropItem);
    # returning nothing would pass None downstream.
    return item
不过,我建议改用下面这种数据结构:
[{
"title": "foo",
"published": "1.1.1972",
"authors": [
{
"name": "John Doe",
"age": 42,
"email": "foo@example.com"
},
{
"name": "Jane Doe",
"age": 21,
"email": "bar@example.com"
}
]
}]
这样所有数据都集中在同一个项目(item)中。
items.py
class Article(scrapy.Item):
    """Scrapy item for one article with its authors embedded in it."""

    # Headline text of the article.
    title = scrapy.Field()
    # Publication date as scraped from the page.
    published = scrapy.Field()
    # Collection of author records attached to this article.
    authors = scrapy.Field()
spider.py
def parse(self, response):
    """Yield one Article item whose ``authors`` field holds the author records.

    The values are hard-coded sample data; *response* is not consulted.
    """
    # Build the author record in one literal instead of key-by-key.
    john = {
        'name': "John Doe",
        'age': 42,
        'email': "foo@example.com",
    }
    print("Parsing author: {}".format(john['name']))
    collected_authors = [john]

    article = items.Article(title="foo", published="1.1.1972")
    print("Parsing article: {}".format(article['title']))
    # Attach the authors last, after both progress lines were printed.
    article['authors'] = collected_authors
    yield article
答案 1 :(得分:1)
# Flat list of scraped records as currently exported: article records and
# author records are mixed together in a single list.
raw = [
    {
        "article_title": "foo",
        "article_published": "1.1.1972",
        "author": "John Doe",
    },
    {
        "name": "John Doe",
        "age": 42,
        "email": "foo@example.com",
    },
]

# Regroup the records into two lists keyed by record type.
data = {'articles': [], 'authors': []}
for record in raw:
    # Only article records carry an 'article_title' key; every other record
    # is treated as an author. (Previously both branches appended to
    # 'articles', leaving 'authors' empty.)
    bucket = 'articles' if 'article_title' in record else 'authors'
    data[bucket].append(record)