I want to export the JSON response from the NYT API to a CSV file based on a few columns. I recently had trouble connecting to the API with my Scrapy spider, but with this forum's help I was able to resolve that. Now I'm stuck on the for loop I believe I need to extract the data, because my current code throws a GET error (some items are commented out). Here is a snippet of the JSON response for the first article (the next article begins with {"web_url": ...):
{"response":
{"meta":{"hits":1,"time":24,"offset":0},"docs":
[{"web_url":"http:\/\/www.nytimes.com\/2013\/09\/17\/arts\/design\/art-dealer-admits-role-
in-selling-fake-works.html","snippet":"Glafira Rosales, a Long Island art dealer, pleaded
guilty to fraud on Monday in the sale of counterfeit works for more than $80
million.","lead_paragraph":"Glafira Rosales, a Long Island art dealer, pleaded guilty to
fraud on Monday in the sale of counterfeit works for more than $80
million.","abstract":null,"print_page":"1","blog":[],"source":"The New York
Times","headline":{"main":"Art Dealer Admits to Role in Fraud","print_headline":"Art
Dealer Admits To Role In Fraud "},"keywords":
[{"rank":"3","is_major":"N","name":"persons","value":"Rosales, Glafira"},
{"rank":"1","is_major":"N","name":"subject","value":"Frauds and Swindling"},
{"rank":"2","is_major":"N","name":"subject","value":"Art"}],"pub_date":"2013-09-
17T00:00:00Z","document_type":"article","news_desk":"Culture","section_name":"Arts","subsectio
n_name":"Art & Design","copyright":"Copyright (c) 2013 The New York Times Company. All Rights Reserved."}
A copy of my code so far:
from scrapy.spider import BaseSpider
from nytimesAPIjson.items import NytimesapijsonItem
import json
import urllib2

class MySpider(BaseSpider):
    name = "nytimesapijson"
    allowed_domains = ["api.nytimes.com"]
    start_urls = ['http://api.nytimes.com/svc/search/v2/articlesearch.json?q="financial crime"&facet_field=day_of_week&begin_date=20130101&end_date=20130916&page=2&rank=newest&api-key=xxx']

    def parse(self, response):
        jsonresponse = json.loads(response)
        ##NEED LOOP##
        #item = NytimesapijsonItem()
        #item["pubDate"] = jsonresponse["pub_date"]
        #item["description"] = jsonresponse["lead_paragraph"]
        #item["title"] = jsonresponse["print_headline"]
        #item["link"] = jsonresponse["web_url"]
        #items.append(item)
        #return items
        print jsonresponse  # would like to remove once the for loop builds the items above
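For reference, the NytimesapijsonItem imported above lives in nytimesAPIjson/items.py; a minimal sketch of it, assuming standard Scrapy Item/Field declarations and the four field names used in the commented-out loop, would be:

from scrapy.item import Item, Field

class NytimesapijsonItem(Item):
    # One field per column I want in the CSV
    pubDate = Field()
    description = Field()
    title = Field()
    link = Field()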
I'm new to Python and unsure about the for-loop syntax. Sorry if this is too much detail, but I'd love to get it running. I appreciate everyone's time.
If anyone has good ideas for extracting the metadata and aggregating the keywords (not necessarily to CSV), I'm open to suggestions. I'd like to start by looking at financial-crime trends by country and industry.
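For example, this rough sketch is the sort of keyword aggregation I have in mind (assuming jsonresponse is the parsed dict from parse() above; collections.Counter is in the standard library):

from collections import Counter

# Tally how often each keyword value appears across the returned docs
keyword_counts = Counter()
for doc in jsonresponse["response"]["docs"]:
    for kw in doc["keywords"]:
        keyword_counts[kw["value"]] += 1

print keyword_counts.most_common(10)  # e.g. [('Frauds and Swindling', 1), ('Art', 1), ...]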
UPDATE after suggestions:

I'm still getting an error in the loop. I printed the output for several articles; here is a snippet:
{u'copyright': u'Copyright (c) 2013 The New York Times Company. All Rights Reserved.',
 u'response': {u'docs': [{u'_id': ................
                          u'word_count'}
                         {u'_id': ................
                          u'word_count'},
               u'facets': {........},
               u'meta': {.........},
 u'status': u'OK'}
After the docs are displayed as a list, it looks as if a closing bracket is missing at the end. Here is the current code:
from scrapy.spider import BaseSpider
from nytimesAPIjson.items import NytimesapijsonItem
import json

class MySpider(BaseSpider):
    name = "nytimesapijson"
    allowed_domains = ["api.nytimes.com"]
    start_urls = ['http://api.nytimes.com/svc/search/v2/articlesearch.json?q=%22laundering%22&facet_field=day_of_week&begin_date=20130917&end_date=20130917&rank=newest&api-key=xxx']

    def parse(self, response):
        items = []
        jsonresponse = json.loads(response.body_as_unicode())
        for doc in jsonresponse['copyright']['response']['docs'][0]:
            item = NytimesapijsonItem()
            item["pubDate"] = doc['pub_date']
            item["description"] = doc['lead_paragraph']
            item["title"] = doc['headline']['print_headline']
            item["link"] = doc['web_url']
            items.append(item)

        return items
The spider runs but still throws an error in the GET HTTP part. If I just print jsonresponse, it runs fine.

I've tried it with and without the [0] (a suggestion from someone else's post). Any other ideas? Thanks again.
Answer 0 (score: 1)
Here is how Python's pprint.pprint() displays your JSON object (it helps to understand the nesting):
>>> import pprint
>>> pprint.pprint(json.loads(nyt))
{u'response': {u'docs': [{u'abstract': None,
                          u'blog': [],
                          u'copyright': u'Copyright (c) 2013 The New York Times Company. All Rights Reserved.',
                          u'document_type': u'article',
                          u'headline': {u'main': u'Art Dealer Admits to Role in Fraud',
                                        u'print_headline': u'Art Dealer Admits To Role In Fraud '},
                          u'keywords': [{u'is_major': u'N',
                                         u'name': u'persons',
                                         u'rank': u'3',
                                         u'value': u'Rosales, Glafira'},
                                        {u'is_major': u'N',
                                         u'name': u'subject',
                                         u'rank': u'1',
                                         u'value': u'Frauds and Swindling'},
                                        {u'is_major': u'N',
                                         u'name': u'subject',
                                         u'rank': u'2',
                                         u'value': u'Art'}],
                          u'lead_paragraph': u'Glafira Rosales, a Long Island art dealer, pleaded guilty to fraud on Monday in the sale of counterfeit works for more than $80 million.',
                          u'news_desk': u'Culture',
                          u'print_page': u'1',
                          u'pub_date': u'2013-09-17T00:00:00Z',
                          u'section_name': u'Arts',
                          u'snippet': u'Glafira Rosales, a Long Island art dealer, pleaded guilty to fraud on Monday in the sale of counterfeit works for more than $80 million.',
                          u'source': u'The New York Times',
                          u'subsection_name': u'Art & Design',
                          u'web_url': u'http://www.nytimes.com/2013/09/17/arts/design/art-dealer-admits-role-in-selling-fake-works.html'}],
               u'meta': {u'hits': 1, u'offset': 0, u'time': 24}}}
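With that structure in mind, you can reach any individual field by chaining keys (continuing the same interpreter session):

>>> data = json.loads(nyt)
>>> docs = data["response"]["docs"]
>>> docs[0]["headline"]["print_headline"]
u'Art Dealer Admits To Role In Fraud '
>>> docs[0]["keywords"][1]["value"]
u'Frauds and Swindling'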
Now you can write the loop like this:
class MySpider(BaseSpider):
    name = "nytimesapijson"
    allowed_domains = ["api.nytimes.com"]
    start_urls = ['http://api.nytimes.com/svc/search/v2/articlesearch.json?q="financial crime"&facet_field=day_of_week&begin_date=20130101&end_date=20130916&page=2&rank=newest&api-key=xxx']

    def parse(self, response):
        items = []
        jsonresponse = json.loads(response.body_as_unicode())
        # "docs" is a list of article dicts nested under "response"
        for doc in jsonresponse["response"]["docs"]:
            item = NytimesapijsonItem()
            item["pubDate"] = doc["pub_date"]
            item["description"] = doc["lead_paragraph"]
            item["title"] = doc["headline"]["print_headline"]
            item["link"] = doc["web_url"]
            items.append(item)
        return items
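Then, to get the CSV you originally asked about, you shouldn't need any extra code: Scrapy's built-in feed exports can write the returned items out for you, e.g. (articles.csv is just an example filename):

scrapy crawl nytimesapijson -o articles.csv -t csv

Each NytimesapijsonItem field becomes a column in the output file.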