我正在用 Scrapy 做一个 OCR 项目，使用的是 https://ocr.space/ocrapi 提供的 OCR API。我有一段使用 requests 库可以成功运行的代码:
file_string = ctypes.string_at(image_data_pointer, length.value)
payload_filename = 'my_hires_image.jpg'
# Post payload as multipart encoded image file with filename.
# requests.post(THE_URL, files={'file': (payload_filename, payload)})
payload = {'isOverlayRequired': overlay,
'apikey': api_key,
'language': language,
r = requests.post('https://api.ocr.space/parse/image',
files={payload_filename: file_string},
data=payload,
)
return r.content.decode()
我现在想把它改写成 Scrapy 的 POST 请求。目前的代码如下:
payload_filename = 'my_hires_image.jpg'
# Post payload as multipart encoded image file with filename.
# requests.post(THE_URL, files={'file': (payload_filename, payload)})
body = {'file': file_string,
'isOverlayRequired': True,
'apikey': 'mykey',
'language': 'eng',
}
files = {payload_filename: file_string}
yield FormRequest(url='https://api.ocr.space/parse/image', headers=headers2, formdata=body, callback=self.ocr_space, meta={'row': row, 'cookiejar': i}, dont_filter=True)
请注意 file_string 是一个字节串(bytes)。你可以在上面的截图中看到它。运行这段代码时得到如下错误:
File "/\lib\site-packages\scrapy\utils\defer.py", line 102, in iter_errback
yield next(it)
File "/\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
for x in result:
File "/\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 339, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "/\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "Emy_PROject/spiders\ocr_spider.py", line 148, in get_PDF
yield FormRequest(url='https://api.ocr.space/parse/image', headers=headers2, body=body, callback=self.ocr_space, meta={'row': row, 'cookiejar': i}, dont_filter=True)
File "/\lib\site-packages\scrapy\http\request\form.py", line 27, in __init__
super(FormRequest, self).__init__(*args, **kwargs)
File "/\lib\site-packages\scrapy\http\request\__init__.py", line 26, in __init__
self._set_body(body)
File "/\lib\site-packages\scrapy\http\request\__init__.py", line 69, in _set_body
self._body = to_bytes(body, self.encoding)
File "/\lib\site-packages\scrapy\utils\python.py", line 117, in to_bytes
'object, got %s' % type(text).__name__)
TypeError: to_bytes must receive a unicode, str or bytes object, got dict
我怎样才能让它正常工作?
编辑:
body = {'files':file_string,
'isOverlayRequired': True,
'apikey': '*******',
'language': 'eng',
}
body = urllib.parse.urlencode(body)
x = FormRequest('https://api.ocr.space/parse/image', headers=headers2, formdata=body, callback=self.ocr_space, meta={'row': row, 'cookiejar': i}, dont_filter=True)
这会产生如下错误:
File "....\scrapy\utils\defer.py", line 102, in iter_errback
yield next(it)
File "....\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
for x in result:
File "....\scrapy\spidermiddlewares\referer.py", line 339, in <genexpr>
return (_set_referer(r) for r in result or ())
File "....\scrapy\spidermiddlewares\urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "....\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "myproject\spiders\ocr_spider.py", line 151, in get_PDF
x = FormRequest('https://api.ocr.space/parse/image', headers=headers2, formdata=body, callback=self.ocr_space, meta={'row': row, 'cookiejar': i}, dont_filter=True)
File "....\scrapy\http\request\form.py", line 31, in __init__
querystr = _urlencode(items, self.encoding)
File "....\scrapy\http\request\form.py", line 66, in _urlencode
for k, vs in seq
File "....\scrapy\http\request\form.py", line 65, in <listcomp>
values = [(to_bytes(k, enc), to_bytes(v, enc))
ValueError: not enough values to unpack (expected 2, got 1)