我想用scrapy抓取json。这是我的代码:
class zhangjiaweiSpider(scrapy.Spider):
name = "zhangjiawei"
start_urls = [
"https://zhuanlan.zhihu.com/api/columns/zhangjiawei/posts?limit=20&offset="
]
def start_requests(self):
for i in range(1):
url = self.start_urls[0] + str(i * 20)
yield scrapy.Request(url,callback = self.parse)
def parse(self, response):
jsonbody = json.loads(response.body.decode('utf-8','ignore'))
print(jsonbody)
但是当我运行它时,我会收到错误:
Traceback (most recent call last):
File "d:\soft\python\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "D:\code\python\spider\zhihuzhuanlan\zhihuzhuanlan\spiders\zhangjiaweispider.py", line 24, in parse
jsonbody = json.loads(response.body.decode('utf-8','ignore'))
File "d:\soft\python\lib\json\__init__.py", line 319, in loads
return _default_decoder.decode(s)
File "d:\soft\python\lib\json\decoder.py", line 339, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "d:\soft\python\lib\json\decoder.py", line 357, in raw_decode
raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
我尝试打印response.body.decode()
,但收到了错误:
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa5 in position 0:
invalid start byte
如果我打印response.body.decode('utf-8','ignore')
他们是乱码。
我认为这个错误可能是由response.body
的解码引起的。但我不知道如何解决这个问题。
我的setting.py
:
BOT_NAME = 'zhihuzhuanlan'
SPIDER_MODULES = ['zhihuzhuanlan.spiders']
NEWSPIDER_MODULE = 'zhihuzhuanlan.spiders'
DEFAULT_REQUEST_HEADERS = {
'accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'accept-encoding':'gzip, deflate, br',
'accept-language':'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6',
'USER-AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
}
ITEM_PIPELINES = {
'zhihuzhuanlan.pipelines.ArticleDataBasePipeline': 5,
}
FEED_EXPORT_ENCODING = 'utf-8'
# linux pip install MySQL-python
DATABASE = {'drivername': 'mysql',
'host': '192.168.203.95',
'port': '3306',
'username': 'root',
'password': 'Password',
'database': 'spider',
'query': {'charset': 'utf8'}}
ROBOTSTXT_OBEY = True