我正在抓取Science of Us网站上有关心理健康的文章,并尝试将其转储到我在本地运行的postgres数据库中。 scrapy输出存储在类似articles = {'title': [], 'teaser': [], 'link': [], 'date': [], 'author': [], 'source': []}
在运行我的代码时,它会将每个键的整个值列表转储到名为== key的列中。相反,我希望每篇文章都是数据库中的一行,例如第1条在每一栏中都有自己的行,标题,预告片,链接,日期,作者和来源。
以下是相关代码:
1)spider.py
from scrapy.spiders import Spider
from scrapy import Request
from mhnewsbot_app.items import SOUItem
import string
mh_search_terms = ["DEPRESS", "MENTAL HEALTH", "EMOTIONAL HEALTH", "MENTAL DISORDER", "DIGITAL MEDICINE", "ANXI", "PSYCH", "THERAPY", "THERAPIST"]
tbl = string.maketrans('-', ' ') #To protect against cases where the article has hyphens or other special characters
articles = {'title': [], 'teaser': [], 'link': [], 'date': [], 'author': [], 'source': []}
def url_lister():
url_list = []
article_count = 0
while article_count < 150:
url = 'http://nymag.com/scienceofus/?start=%s' %article_count
url_list.append(url)
article_count += 50
return url_list
class SOUSpider(Spider):
name = 'scienceofus'
start_urls = url_lister()
def parse(self, response):
for article in response.xpath('//ul[@class="newsfeed-article-list"]'):
title = article.xpath('.//li[contains(@class, "newsfeed-article")]/div[@class="headline-wrapper"]/a[@class="headline-link"]/h3[@class="headline"]').extract()
for i in title:
for search_term in mh_search_terms:
if search_term in i.upper().strip():
articles['title'].append(article.xpath('.//li[contains(@class, "newsfeed-article")]/div[@class="headline-wrapper"]/a[@class="headline-link"]/h3[@class="headline"]/text()').extract()[title.index(i)])
articles['teaser'].append(article.xpath('.//li[contains(@class, "newsfeed-article")]/p[@class = "teaser"]/text()').extract()[title.index(i)])
articles['link'].append(article.xpath('.//li[contains(@class, "newsfeed-article")]/a[@class = "read-more"]/@href').extract()[title.index(i)])
articles['date'].append(article.xpath('.//li[contains(@class, "newsfeed-article")]/div[@class="headline-wrapper"]/div[@class="headline-above"]/time/text()').extract()[title.index(i)])
articles['author'].append(article.xpath('.//li[contains(@class, "newsfeed-article")]/span[@class="by-authors"]/span/span[@class="author"]/text()').extract()[title.index(i)])
articles['source'].append('Science Of Us')
return articles
2)pipelines.py
from sqlalchemy.orm import sessionmaker
from models import Articles, db_connect, create_articles_table
class ArticlesPipeline(object):
def __init__(self):
engine = db_connect()
create_articles_table(engine)
self.Session = sessionmaker(bind=engine)
def process_item(self, item, spider):
session = self.Session()
article = Articles(**item)
try:
session.add(article)
session.commit()
except :
session.rollback()
raise
finally:
session.close()
return item
答案 0 :(得分:1)
您输出的是1个项目,其字段中包含多个值,最好每个值输出一个项目,因为这是您的数据库似乎接受它的方式:
def parse(self, response):
for article in response.xpath('//ul[@class="newsfeed-article-list"]'):
title = article.xpath('.//li[contains(@class, "newsfeed-article")]/div[@class="headline-wrapper"]/a[@class="headline-link"]/h3[@class="headline"]').extract()
for i in title:
for search_term in mh_search_terms:
if search_term in i.upper().strip():
article_item = {}
article_item['title'] = article.xpath('.//li[contains(@class, "newsfeed-article")]/div[@class="headline-wrapper"]/a[@class="headline-link"]/h3[@class="headline"]/text()').extract()[title.index(i)]
article_item['teaser'] = article.xpath('.//li[contains(@class, "newsfeed-article")]/p[@class = "teaser"]/text()').extract()[title.index(i)]
article_item['link'] = article.xpath('.//li[contains(@class, "newsfeed-article")]/a[@class = "read-more"]/@href').extract()[title.index(i)]
article_item['date'] = article.xpath('.//li[contains(@class, "newsfeed-article")]/div[@class="headline-wrapper"]/div[@class="headline-above"]/time/text()').extract()[title.index(i)]
article_item['author'] = article.xpath('.//li[contains(@class, "newsfeed-article")]/span[@class="by-authors"]/span/span[@class="author"]/text()').extract()[title.index(i)]
article_item['source'] = 'Science Of Us'
yield article_item