受到 this question 的已接受答案的启发,我试图用类似 requests 的接口来包装 PyCurl。一切本来都很顺利,但在按照 PyCURL 文档(docs)中关于如何从响应头读取正文编码的说明操作之后,我遇到了以下问题:每个响应头都会触发 header 回调,但这只发生在迭代器已经开始产生响应行之后,这使得编码/字符集检测变得毫无意义。
以下是代码:
import re
import io
import urllib
import urllib.error
import http
import pycurl
class CurlHTTPStream(object):
    """Streaming HTTP response backed by a pycurl easy handle on a CurlMulti.

    The transfer is driven lazily: nothing is sent until the iterator
    returned by :meth:`iter_lines` is advanced. Response headers are
    collected into ``self.headers`` (lower-cased names) by
    :meth:`header_function`; body bytes are accumulated in
    ``self.received_buffer`` and handed out chunk by chunk.
    """

    # Timeout value handed to ``CurlMulti.select`` between perform rounds.
    SELECT_TIMEOUT = 10
    # Used to decode header lines and as the fallback body charset.
    HTTP_STANDARD_ENCODING = 'iso-8859-1'
    # Extracts the charset parameter from a lower-cased Content-Type value,
    # tolerating optional quotes and a trailing ``;`` after the value.
    _CHARSET_RE = re.compile(r'charset\s*=\s*["\']?([^\s;"\']+)')

    def __init__(self, method, url, data=None, params=None, headers=None):
        """Configure the curl handles for a request without starting it.

        Args:
            method: HTTP method name, set via ``CUSTOMREQUEST``.
            url: Target URL; also stored as ``self.url`` for error reporting.
            data: Accepted for interface compatibility; this class does not
                currently send a request body.
            params: Optional mapping of query parameters appended to ``url``.
            headers: Optional mapping of request header names to values.
        """
        # Local import: the file only imports ``urllib`` and ``urllib.error``.
        from urllib.parse import urlencode

        self.url = url
        self.received_buffer = io.BytesIO()
        self.curl = pycurl.Curl()
        self.curl.setopt(pycurl.CUSTOMREQUEST, method)
        if headers:
            header_lines = [
                '{}: {}'.format(key, value)
                for key, value in headers.items()
            ]
            self.curl.setopt(pycurl.HTTPHEADER, header_lines)
        if params:
            # Percent-encode keys/values and respect an existing query part.
            separator = '&' if '?' in url else '?'
            url = '{}{}{}'.format(url, separator, urlencode(params))
        self.curl.setopt(pycurl.URL, url)
        self.curl.setopt(pycurl.ENCODING, 'gzip')
        self.curl.setopt(pycurl.CONNECTTIMEOUT, 5)
        self.curl.setopt(pycurl.HEADERFUNCTION, self.header_function)
        self.curl.setopt(pycurl.WRITEFUNCTION, self.received_buffer.write)
        self.curl_multi = pycurl.CurlMulti()
        self.curl_multi.add_handle(self.curl)
        # 0 means "status not known yet"; filled in by _check_status_code.
        self.status_code = 0
        self.headers = {}

    def _any_data_received(self):
        """Return True if the receive buffer holds unread body bytes."""
        return self.received_buffer.tell() != 0

    def _get_received_data(self):
        """Return the buffered body bytes and reset the buffer to empty."""
        result = self.received_buffer.getvalue()
        self.received_buffer.truncate(0)
        self.received_buffer.seek(0)
        return result

    def _check_status_code(self):
        """Fetch the status code once and raise on anything other than 200.

        Raises:
            urllib.error.HTTPError: if a status is known and it is not OK.
        """
        if self.status_code == 0:
            self.status_code = self.curl.getinfo(pycurl.HTTP_CODE)
        if self.status_code != 0 and self.status_code != http.HTTPStatus.OK:
            raise urllib.error.HTTPError(
                self.url, self.status_code, None, None, None
            )

    def _perform_on_curl(self):
        """Call ``perform`` until curl stops asking for another immediate call.

        Returns:
            The number of handles still active, as reported by ``perform``.
        """
        while True:
            ret, num_handles = self.curl_multi.perform()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break
        return num_handles

    def _iter_chunks(self):
        """Drive the transfer and yield body bytes as they arrive.

        Raises:
            urllib.error.HTTPError: on a non-200 status.
            pycurl.error: if curl reports a failed transfer at the end.
        """
        while True:
            remaining = self._perform_on_curl()
            if self._any_data_received():
                self._check_status_code()
                yield self._get_received_data()
            if remaining == 0:
                break
            self.curl_multi.select(self.SELECT_TIMEOUT)
        self._check_status_code()
        self._check_curl_errors()

    def _check_curl_errors(self):
        """Raise ``pycurl.error`` for the first failure in ``info_read``."""
        for f in self.curl_multi.info_read()[2]:
            # Each entry is (handle, ...); the handle itself is dropped.
            raise pycurl.error(*f[1:])

    def iter_lines(self):
        """Return an iterator over the decoded lines of the response body."""
        chunks = self._iter_chunks()
        return self._split_lines_from_chunks(chunks)

    def _split_lines_from_chunks(self, chunks):
        """Yield decoded text lines assembled from an iterable of byte chunks.

        NOTE: the charset is chosen here, before ``chunks`` is advanced for
        the first time. When ``chunks`` comes from ``_iter_chunks`` no
        transfer has been performed yet at that point, so ``self.headers``
        is still empty and the fallback encoding is always selected.
        """
        # Debug tracing.
        print('foo')
        print(self.headers)
        charset = None
        if 'content-type' in self.headers:
            content_type = self.headers['content-type'].lower()
            match = self._CHARSET_RE.search(content_type)
            if match:
                charset = match.group(1)
                print('Decoding using %s' % charset)
        if charset is None:
            charset = self.HTTP_STANDARD_ENCODING
            print('Assuming encoding is %s' % charset)
        pending = None
        for chunk in chunks:
            if pending is not None:
                chunk = pending + chunk
            lines = chunk.splitlines()
            # If the chunk does not end in a line terminator, its last line
            # is incomplete: hold it back and prepend it to the next chunk.
            if lines and lines[-1] and chunk and lines[-1][-1] == chunk[-1]:
                pending = lines.pop()
            else:
                pending = None
            for line in lines:
                yield line.decode(charset)
        if pending is not None:
            yield pending.decode(charset)

    def header_function(self, header_line):
        """Header callback: record one ``name: value`` line in ``self.headers``.

        Lines without a colon (such as the status line or the blank
        separator) are ignored. Names are stored lower-cased.
        """
        # Debug tracing.
        print('hello')
        header_line = header_line.decode(self.HTTP_STANDARD_ENCODING)
        if ':' not in header_line:
            return
        name, value = header_line.split(':', 1)
        self.headers[name.strip().lower()] = value.strip()
def request(method, url, data=None, params=None, headers=None,
            stream=False):
    """Create a streaming request object.

    Args:
        method: HTTP method name.
        url: Target URL.
        data: Forwarded to ``CurlHTTPStream``.
        params: Optional mapping of query parameters.
        headers: Optional mapping of request headers.
        stream: When true, return a ``CurlHTTPStream`` for the request.

    Returns:
        A ``CurlHTTPStream`` if ``stream`` is true; otherwise ``None``,
        since only the streaming mode is implemented here.
    """
    if stream:
        return CurlHTTPStream(method, url, data=data, params=params,
                              headers=headers)
    return None
以下是我尝试测试它时终端中的输出:
Python 3.5.1 (default, Dec 09 2015, 07:29:36) [GCC] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> from pycurl_requests.requests import request
>>> r = request('GET', 'http://my-couchdb-instance:5984/user-30323561366530622d336135622d343637372d386464392d613038653536663865636566/_changes', params={'feed': 'continuous'}, stream=True)
>>> for l in r.iter_lines():
... print(l)
...
foo
{}
Assuming encoding is iso-8859-1
hello
hello
hello
hello
hello
hello
hello
{"seq":1,"id":"account","changes":[{"rev":"1-806053b347406e04d1872e13199fd3cf"}]}
{"seq":4,"id":"identity-bd2c5007-9df3-4ece-9751-843bf5523edd","changes":[{"rev":"1-e3a98ec37776f2cb479b2dcae0266700"}]}
{"seq":5,"id":"section_phone-0342667c-ecbd-401f-acfe-7bb2a1aa3159","changes":[{"rev":"1-457342bc895c7cb6924ceabd07e1ffcf"}]}
后面还有更多来自 CouchDB changes feed 的行,但它们与问题无关,所以我截断了输出。
输出中的 foo 表示执行已进入预期响应头已经就绪的代码块,但下一行显示 self.headers 为空。多个 hello 分别对应对 header_function() 的每一次调用。为什么把正文写入 BytesIO 的写回调会在 header 回调触发之前被调用?
答案 0 :(得分:0)
我找到了解决方案。问题在于 _split_lines_from_chunks(self, chunks) 中的字符集检测是在收到任何响应之前执行的,因此那时响应头也还不存在。
下面是可以正常工作的代码。字符集在正文的第一行可用时才进行检测,此时所有响应头都已处理完毕。
import re
import io
import urllib
import urllib.error
import http
import pycurl
class CurlHTTPStream(object):
    """Streaming HTTP response backed by a pycurl easy handle on a CurlMulti.

    The transfer is driven lazily: nothing is sent until the iterator
    returned by :meth:`iter_lines` is advanced. Response headers are
    collected into ``self.headers`` (lower-cased names) by
    :meth:`header_function`. The body charset is resolved lazily through
    the :attr:`charset` property, i.e. only once a line is about to be
    decoded.
    """

    # Timeout value handed to ``CurlMulti.select`` between perform rounds.
    SELECT_TIMEOUT = 10
    # Used to decode header lines and as the fallback body charset.
    HTTP_STANDARD_ENCODING = 'iso-8859-1'
    # Extracts the charset parameter from a lower-cased Content-Type value,
    # tolerating optional quotes and a trailing ``;`` after the value.
    _CHARSET_RE = re.compile(r'charset\s*=\s*["\']?([^\s;"\']+)')

    def __init__(self, method, url, data=None, params=None, headers=None):
        """Configure the curl handles for a request without starting it.

        Args:
            method: HTTP method name, set via ``CUSTOMREQUEST``.
            url: Target URL; also stored as ``self.url`` for error reporting.
            data: Accepted for interface compatibility; this class does not
                currently send a request body.
            params: Optional mapping of query parameters appended to ``url``.
            headers: Optional mapping of request header names to values.
        """
        # Local import: the file only imports ``urllib`` and ``urllib.error``.
        from urllib.parse import urlencode

        self.url = url
        self.received_buffer = io.BytesIO()
        self.curl = pycurl.Curl()
        self.curl.setopt(pycurl.CUSTOMREQUEST, method)
        if headers:
            header_lines = [
                '{}: {}'.format(key, value)
                for key, value in headers.items()
            ]
            self.curl.setopt(pycurl.HTTPHEADER, header_lines)
        if params:
            # Percent-encode keys/values and respect an existing query part.
            separator = '&' if '?' in url else '?'
            url = '{}{}{}'.format(url, separator, urlencode(params))
        self.curl.setopt(pycurl.URL, url)
        self.curl.setopt(pycurl.ENCODING, 'gzip')
        self.curl.setopt(pycurl.CONNECTTIMEOUT, 5)
        self.curl.setopt(pycurl.HEADERFUNCTION, self.header_function)
        self.curl.setopt(pycurl.WRITEFUNCTION, self.received_buffer.write)
        self.curl_multi = pycurl.CurlMulti()
        self.curl_multi.add_handle(self.curl)
        # 0 means "status not known yet"; filled in by _check_status_code.
        self.status_code = 0
        self.headers = {}
        # Cached result of the ``charset`` property; None until resolved.
        self._charset = None

    def _any_data_received(self):
        """Return True if the receive buffer holds unread body bytes."""
        return self.received_buffer.tell() != 0

    def _get_received_data(self):
        """Return the buffered body bytes and reset the buffer to empty."""
        result = self.received_buffer.getvalue()
        self.received_buffer.truncate(0)
        self.received_buffer.seek(0)
        return result

    def _check_status_code(self):
        """Fetch the status code once and raise on anything other than 200.

        Raises:
            urllib.error.HTTPError: if a status is known and it is not OK.
        """
        if self.status_code == 0:
            self.status_code = self.curl.getinfo(pycurl.HTTP_CODE)
        if self.status_code != 0 and self.status_code != http.HTTPStatus.OK:
            raise urllib.error.HTTPError(
                self.url, self.status_code, None, None, None
            )

    def _perform_on_curl(self):
        """Call ``perform`` until curl stops asking for another immediate call.

        Returns:
            The number of handles still active, as reported by ``perform``.
        """
        while True:
            ret, num_handles = self.curl_multi.perform()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break
        return num_handles

    def _iter_chunks(self):
        """Drive the transfer and yield body bytes as they arrive.

        Raises:
            urllib.error.HTTPError: on a non-200 status.
            pycurl.error: if curl reports a failed transfer at the end.
        """
        while True:
            remaining = self._perform_on_curl()
            if self._any_data_received():
                self._check_status_code()
                yield self._get_received_data()
            if remaining == 0:
                break
            self.curl_multi.select(self.SELECT_TIMEOUT)
        self._check_status_code()
        self._check_curl_errors()

    def _check_curl_errors(self):
        """Raise ``pycurl.error`` for the first failure in ``info_read``."""
        for f in self.curl_multi.info_read()[2]:
            # Each entry is (handle, ...); the handle itself is dropped.
            raise pycurl.error(*f[1:])

    def iter_lines(self):
        """Return an iterator over the decoded lines of the response body."""
        chunks = self._iter_chunks()
        return self._split_lines_from_chunks(chunks)

    def _split_lines_from_chunks(self, chunks):
        """Yield decoded text lines assembled from an iterable of byte chunks.

        ``self.charset`` is only read when a line is decoded, which happens
        after at least one chunk has been pulled from ``chunks``; whatever
        headers were recorded up to that moment are used for detection.
        """
        # Debug tracing.
        print('foo')
        print(self.headers)
        pending = None
        for chunk in chunks:
            if pending is not None:
                chunk = pending + chunk
            lines = chunk.splitlines()
            # If the chunk does not end in a line terminator, its last line
            # is incomplete: hold it back and prepend it to the next chunk.
            if lines and lines[-1] and chunk and lines[-1][-1] == chunk[-1]:
                pending = lines.pop()
            else:
                pending = None
            for line in lines:
                yield line.decode(self.charset)
        if pending is not None:
            yield pending.decode(self.charset)

    @property
    def charset(self):
        """Charset for decoding the body, resolved once and then cached.

        Taken from the ``charset`` parameter of the recorded Content-Type
        header; falls back to ``HTTP_STANDARD_ENCODING`` when the header or
        the parameter is missing.
        """
        if self._charset is None:
            content_type = self.headers.get('content-type', '').lower()
            match = self._CHARSET_RE.search(content_type)
            if match:
                self._charset = match.group(1)
                print('Decoding using %s' % self._charset)
            else:
                self._charset = self.HTTP_STANDARD_ENCODING
                print('Assuming encoding is %s' % self._charset)
        return self._charset

    def header_function(self, header_line):
        """Header callback: record one ``name: value`` line in ``self.headers``.

        Lines without a colon (such as the status line or the blank
        separator) are ignored. Names are stored lower-cased.
        """
        # Debug tracing.
        print('hello')
        header_line = header_line.decode(self.HTTP_STANDARD_ENCODING)
        if ':' not in header_line:
            return
        name, value = header_line.split(':', 1)
        self.headers[name.strip().lower()] = value.strip()
def request(method, url, data=None, params=None, headers=None,
            stream=False):
    """Create a streaming request object.

    Args:
        method: HTTP method name.
        url: Target URL.
        data: Forwarded to ``CurlHTTPStream``.
        params: Optional mapping of query parameters.
        headers: Optional mapping of request headers.
        stream: When true, return a ``CurlHTTPStream`` for the request.

    Returns:
        A ``CurlHTTPStream`` if ``stream`` is true; otherwise ``None``,
        since only the streaming mode is implemented here.
    """
    if stream:
        return CurlHTTPStream(method, url, data=data, params=params,
                              headers=headers)
    return None