我正在尝试抓取多个 JSON 页面,但文件中没有存储任何内容。另外,目前我必须手动指定页数,如何让程序自动抓取到最后一页?
import urllib
# Download the first result pages of an OpenCorporates company search and
# store the raw body of each page in its own local file.
import sys

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2

# The "&current_status" parameter had been mangled into "¤t_status" because
# "&curren" was rendered as an HTML entity; it is restored here.
PAGE_URL_TEMPLATE = ('http://api.opencorporates.com/v0.2/companies/search'
                     '?q=&jurisdiction_code=us_ca&current_status=Active'
                     '&page={0}')
BLOCK_SIZE = 8192


def page_file_name(page):
    """Return the local file name used to store result page *page*.

    The name is built from the page number rather than from the URL tail,
    which contains characters ('?', '&', '=') that are not valid in file
    names on every platform.
    """
    return 'search_page_{0}.json'.format(page)


def content_length(headers):
    """Return the Content-Length announced in *headers* as an int.

    *headers* is any mapping-like object with a ``get`` method (such as the
    object returned by ``response.info()``). Returns None when the header is
    absent or not a number, e.g. for chunked responses.
    """
    try:
        return int(headers.get('Content-Length'))
    except (TypeError, ValueError):
        return None


def progress_line(received, total):
    """Return the progress text for *received* bytes out of *total*.

    The percentage is omitted when *total* is unknown (None) or zero. The
    text is followed by one backspace per character so that the next
    progress line overwrites it on the terminal.
    """
    if total:
        status = "%10d [%3.2f%%]" % (received, received * 100. / total)
    else:
        status = "%10d" % received
    return status + chr(8) * len(status)


def download_page(page):
    """Download result page *page* to disk and return the bytes written."""
    file_name = page_file_name(page)
    response = urlopen(PAGE_URL_TEMPLATE.format(page))
    try:
        total = content_length(response.info())
        print("Downloading: %s Bytes: %s"
              % (file_name, 'unknown' if total is None else total))
        received = 0
        # Open the output file only once the request has succeeded, so a
        # failed request does not leave an empty file behind.
        with open(file_name, 'wb') as out:
            while True:
                chunk = response.read(BLOCK_SIZE)
                if not chunk:
                    break
                received += len(chunk)
                out.write(chunk)
                sys.stdout.write(progress_line(received, total))
                sys.stdout.flush()
    finally:
        response.close()
    sys.stdout.write('\n')
    return received


def download_pages(first_page=1, last_page=4):
    """Download result pages *first_page* through *last_page* inclusive."""
    for page in range(first_page, last_page + 1):
        download_page(page)


if __name__ == '__main__':
    download_pages()
答案 0 :(得分:0)
import urllib2
import json
# Fetch every result page of an OpenCorporates company search, saving page N
# to "N.json", and stop at the last page instead of looping forever.
try:
    from urllib.request import urlopen  # Python 3
    from urllib.error import HTTPError
except ImportError:
    from urllib2 import urlopen, HTTPError  # Python 2

# The "&current_status" parameter had been mangled into "¤t_status" because
# "&curren" was rendered as an HTML entity; it is restored here.
url = ('http://api.opencorporates.com/v0.2/companies/search'
       '?q=&jurisdiction_code=us_ca&current_status=Active&page=')


def is_last_page(content, page):
    """Return True when *content*, the raw body of *page*, is the final page.

    NOTE(review): assumes the body is a JSON object of the form
    {"results": {"companies": [...], "total_pages": N, ...}}; confirm
    against the OpenCorporates API reference. A body that cannot be
    interpreted that way is treated as final, so the caller cannot loop
    forever on unexpected responses.
    """
    try:
        results = json.loads(content).get('results')
    except (ValueError, AttributeError):
        return True
    if not isinstance(results, dict) or not results.get('companies'):
        return True
    total_pages = results.get('total_pages')
    return isinstance(total_pages, int) and page >= total_pages


def fetch_all_pages():
    """Download pages 1, 2, 3, ... and return the last page number requested.

    The loop ends when the server answers with an HTTP error (for example
    401 Unauthorized once the API refuses further pages) or when the page
    just saved is recognised as the last one.
    """
    i = 0
    while True:
        i += 1
        print(i)
        try:
            response = urlopen('%s%d' % (url, i))
        except HTTPError as err:
            print('Stopping at page %d: %s' % (i, err))
            break
        try:
            content = response.read()
        finally:
            response.close()
        # Binary mode stores the body exactly as received.
        with open(str(i) + '.json', 'wb') as f:
            f.write(content)
        if is_last_page(content, i):
            break
    return i


if __name__ == '__main__':
    fetch_all_pages()
运行到第 22 页时,我收到了 HTTP Error 401: Unauthorized 错误。