PyCURL 在处理响应头之前先处理了响应正文

时间:2016-07-14 10:27:14

标签: python python-3.x pycurl response-headers

受到 this question 的已接受答案的启发,我试图用类似 requests 的接口来包装 PyCurl。一切本来都很顺利,但在按照 PyCURL 文档中关于如何从响应头读取正文编码的说明去做之后,我遇到了以下问题:每个响应头都会触发一次 header 回调,但这些回调只在迭代器已经开始产生响应行之后才被调用,这使得编码/字符集检测变得毫无意义。

以下是代码:

import re
import io
import urllib
import urllib.error
import http

import pycurl


class CurlHTTPStream(object):
    """Streaming HTTP response built on a ``pycurl.Curl`` handle driven by ``pycurl.CurlMulti``.

    The body is collected by the write callback into an in-memory buffer and
    handed out by ``iter_lines()`` as decoded text lines.  Response headers
    are stored (lower-cased names) in ``self.headers`` by ``header_function``.

    NOTE(review): in this version the charset is chosen at the top of
    ``_split_lines_from_chunks``, i.e. before the first chunk has been pulled
    from the transfer, so ``self.headers`` is still empty at that point and
    the fallback encoding is always used.
    """

    # Timeout handed to CurlMulti.select() between perform() rounds.
    SELECT_TIMEOUT = 10
    # Used to decode header lines and as the body fallback charset.
    HTTP_STANDARD_ENCODING = 'iso-8859-1'

    def __init__(self, method, url, data=None, params=None, headers=None):
        """Configure the curl handle; no network I/O is performed here.

        :param method: HTTP verb, passed to ``CUSTOMREQUEST``.
        :param url: target URL; ``params`` (if any) are appended as a query
            string.
        :param data: accepted but not used anywhere in this method.
        :param params: mapping of query parameters; joined as ``key=value``
            pairs without URL-encoding.
        :param headers: mapping of request headers.
        """
        # Keep the URL as given by the caller (without the query string);
        # it is only used when building HTTPError.
        self.url = url
        # Body bytes land here via the WRITEFUNCTION callback.
        self.received_buffer = io.BytesIO()

        self.curl = pycurl.Curl()
        self.curl.setopt(pycurl.CUSTOMREQUEST, method)
        if headers:
            self.curl.setopt(
                pycurl.HTTPHEADER,
                [
                    '{}: {}'.format(key, value)
                    for key, value in headers.items()
                ]
            )
        if params:
            # Keys and values are formatted verbatim (no percent-encoding).
            query_string = '&'.join((
                '{}={}'.format(key, value)
                for key, value in params.items()
            ))
            url = '{}?{}'.format(url, query_string)
        self.curl.setopt(pycurl.URL, url)
        self.curl.setopt(pycurl.ENCODING, 'gzip')
        self.curl.setopt(pycurl.CONNECTTIMEOUT, 5)
        self.curl.setopt(pycurl.HEADERFUNCTION, self.header_function)
        self.curl.setopt(pycurl.WRITEFUNCTION, self.received_buffer.write)

        # The easy handle is driven through a multi handle so the transfer
        # can be advanced step by step from _iter_chunks().
        self.curl_multi = pycurl.CurlMulti()
        self.curl_multi.add_handle(self.curl)

        # 0 means "status not read from curl yet" (see _check_status_code).
        self.status_code = 0
        self.headers = {}

    def _any_data_received(self):
        """Return True when the buffer's write position is past the start."""
        return self.received_buffer.tell() != 0

    def _get_received_data(self):
        """Return all buffered body bytes and reset the buffer to empty."""
        result = self.received_buffer.getvalue()
        # truncate() does not move the stream position, hence the seek.
        self.received_buffer.truncate(0)
        self.received_buffer.seek(0)
        return result

    def _check_status_code(self):
        """Fetch the HTTP status once it is known and reject anything but 200.

        :raises urllib.error.HTTPError: when a non-zero status other than
            ``http.HTTPStatus.OK`` has been reported by curl.
        """
        if self.status_code == 0:
            self.status_code = self.curl.getinfo(pycurl.HTTP_CODE)
        if self.status_code != 0 and self.status_code != http.HTTPStatus.OK:
            raise urllib.error.HTTPError(
                self.url, self.status_code, None, None, None
            )

    def _perform_on_curl(self):
        """Call ``perform()`` until it stops asking to be called again.

        :returns: the handle count reported by the last ``perform()`` call.
        """
        while True:
            ret, num_handles = self.curl_multi.perform()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break
        return num_handles

    def _iter_chunks(self):
        """Generator yielding raw body bytes as they are received.

        After the transfer finishes, the status code and curl-level errors
        are checked one final time.
        """
        while True:
            remaining = self._perform_on_curl()
            if self._any_data_received():
                self._check_status_code()
                yield self._get_received_data()
            # No handles left running: the transfer is over.
            if remaining == 0:
                break
            self.curl_multi.select(self.SELECT_TIMEOUT)

        self._check_status_code()
        self._check_curl_errors()

    def _check_curl_errors(self):
        """Raise ``pycurl.error`` for the first failed entry of ``info_read()``."""
        # Index 2 of info_read()'s result is iterated as the failure list;
        # everything after the first element of an entry becomes the
        # exception arguments.
        for f in self.curl_multi.info_read()[2]:
            raise pycurl.error(*f[1:])

    def iter_lines(self):
        """Return a generator over the decoded lines of the response body."""
        chunks = self._iter_chunks()
        return self._split_lines_from_chunks(chunks)

    def _split_lines_from_chunks(self, chunks):
        """Generator that splits byte chunks into lines and decodes them.

        :param chunks: iterable of ``bytes`` objects.
        """
        # Debug output.
        print('foo')
        print(self.headers)
        # NOTE(review): nothing has been pulled from ``chunks`` yet when this
        # runs, so self.headers holds only what was stored before iteration
        # began -- the charset lookup below cannot see the response headers.
        charset = None
        if 'content-type' in self.headers:
            content_type = self.headers['content-type'].lower()
            match = re.search('charset=(\S+)', content_type)
            if match:
                charset = match.group(1)
                print('Decoding using %s' % charset)
        if charset is None:
            charset = self.HTTP_STANDARD_ENCODING
            print('Assuming encoding is %s' % charset)
        # Trailing partial line carried over to the next chunk.
        pending = None
        for chunk in chunks:
            if pending is not None:
                chunk = pending + chunk
            lines = chunk.splitlines()
            # splitlines() drops line terminators, so if the last line's last
            # byte equals the chunk's last byte the chunk did not end with a
            # terminator and that line may still be incomplete.
            if lines and lines[-1] and chunk and lines[-1][-1] == chunk[-1]:
                pending = lines.pop()
            else:
                pending = None
            for line in lines:
                yield line.decode(charset)
        # Flush whatever is left once the chunks are exhausted.
        if pending is not None:
            yield pending.decode(charset)

    def header_function(self, header_line):
        """Header callback: store one ``name: value`` line in ``self.headers``.

        :param header_line: raw header line as ``bytes``.
        """
        # Debug output: one line per invocation.
        print('hello')
        header_line = header_line.decode(self.HTTP_STANDARD_ENCODING)
        # Lines without a colon are skipped.
        if ':' not in header_line:
            return
        name, value = header_line.split(':', 1)
        name = name.strip()
        value = value.strip()
        # Names are stored lower-cased.
        name = name.lower()
        self.headers[name] = value


def request(method, url, data=None, params=None, headers=None,
            stream=False):
    """Start an HTTP request.

    Only streaming mode is handled: when ``stream`` is truthy a
    ``CurlHTTPStream`` built from the arguments is returned; otherwise the
    function returns ``None``.
    """
    if not stream:
        return None
    stream_kwargs = {'data': data, 'params': params, 'headers': headers}
    return CurlHTTPStream(method, url, **stream_kwargs)

当我尝试测试它时,终端中的输出如下:

Python 3.5.1 (default, Dec 09 2015, 07:29:36) [GCC] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> from pycurl_requests.requests import request
>>> r = request('GET', 'http://my-couchdb-instance:5984/user-30323561366530622d336135622d343637372d386464392d613038653536663865636566/_changes', params={'feed': 'continuous'}, stream=True)
>>> for l in r.iter_lines():
...     print(l)
... 
foo
{}
Assuming encoding is iso-8859-1
hello
hello
hello
hello
hello
hello
hello
{"seq":1,"id":"account","changes":[{"rev":"1-806053b347406e04d1872e13199fd3cf"}]}
{"seq":4,"id":"identity-bd2c5007-9df3-4ece-9751-843bf5523edd","changes":[{"rev":"1-e3a98ec37776f2cb479b2dcae0266700"}]}
{"seq":5,"id":"section_phone-0342667c-ecbd-401f-acfe-7bb2a1aa3159","changes":[{"rev":"1-457342bc895c7cb6924ceabd07e1ffcf"}]}

CouchDB 的 changes feed 还输出了更多行,但它们与问题无关,所以我截断了输出。

输出中的 foo 表示代码进入了预期响应头已经就位的代码块,但下一行显示 self.headers 为空。每个 hello 对应一次 header_function() 调用。把正文写入 BytesIO 的写回调,怎么可能在响应头回调触发之前就被调用?

1 个答案:

答案 0 :(得分:0)

我找到了解决方案。问题在于 _split_lines_from_chunks(self, chunks) 开头的代码在响应还没有任何数据到达之前就执行了,因此当时响应头也还不存在。

下面是可以正常工作的代码。字符集检测被推迟到第一行正文可用时才进行,此时所有响应头都已经处理完毕。

import re
import io
import urllib
import urllib.error
import http

import pycurl


class CurlHTTPStream(object):
    """Streaming HTTP response built on a ``pycurl.Curl`` handle driven by ``pycurl.CurlMulti``.

    The body is collected by the write callback into an in-memory buffer and
    handed out by ``iter_lines()`` as decoded text lines.  Response headers
    are stored (lower-cased names) in ``self.headers`` by ``header_function``.
    The body charset is resolved lazily through the ``charset`` property, so
    it is looked up only after body data (and therefore the headers) arrived.
    """

    # Timeout handed to CurlMulti.select() between perform() rounds.
    SELECT_TIMEOUT = 10
    # Used to decode header lines and as the body fallback charset.
    HTTP_STANDARD_ENCODING = 'iso-8859-1'
    # Pulls the charset parameter out of a (lower-cased) Content-Type value.
    # Optional quotes are skipped and the value stops at whitespace, a quote
    # or ';' so that following parameters are not swallowed.
    _CHARSET_RE = re.compile(r'charset\s*=\s*["\']?([^\s;"\']+)')

    def __init__(self, method, url, data=None, params=None, headers=None):
        """Configure the curl handle; no network I/O is performed here.

        :param method: HTTP verb, passed to ``CUSTOMREQUEST``.
        :param url: target URL; ``params`` (if any) are appended as a
            URL-encoded query string.
        :param data: accepted for interface compatibility.
            NOTE(review): it is not sent with the request -- confirm whether
            a request body (``POSTFIELDS``) should be wired up here.
        :param params: mapping of query parameters.
        :param headers: mapping of request headers.
        """
        # Keep the URL as given by the caller (without the added query
        # string); it is only used when building HTTPError.
        self.url = url
        # Body bytes land here via the WRITEFUNCTION callback.
        self.received_buffer = io.BytesIO()

        self.curl = pycurl.Curl()
        self.curl.setopt(pycurl.CUSTOMREQUEST, method)
        if headers:
            self.curl.setopt(
                pycurl.HTTPHEADER,
                [
                    '{}: {}'.format(key, value)
                    for key, value in headers.items()
                ],
            )
        self.curl.setopt(pycurl.URL, self._build_url(url, params))
        self.curl.setopt(pycurl.ENCODING, 'gzip')
        self.curl.setopt(pycurl.CONNECTTIMEOUT, 5)
        self.curl.setopt(pycurl.HEADERFUNCTION, self.header_function)
        self.curl.setopt(pycurl.WRITEFUNCTION, self.received_buffer.write)

        # The easy handle is driven through a multi handle so the transfer
        # can be advanced step by step from _iter_chunks().
        self.curl_multi = pycurl.CurlMulti()
        self.curl_multi.add_handle(self.curl)

        # 0 means "status not read from curl yet" (see _check_status_code).
        self.status_code = 0
        self.headers = {}
        # Cache for the ``charset`` property; None until resolved.
        self._charset = None

    @staticmethod
    def _build_url(url, params):
        """Return ``url`` with ``params`` appended as an encoded query string.

        Keys and values are percent-encoded so that characters such as
        ``&``, ``=`` or spaces cannot corrupt the query.  If ``url`` already
        contains a query, the parameters are appended with ``&``.
        """
        if not params:
            return url
        # Imported locally so the block does not depend on a module-level
        # ``import urllib.parse``.
        from urllib.parse import urlencode
        separator = '&' if '?' in url else '?'
        return '{}{}{}'.format(url, separator, urlencode(params))

    def close(self):
        """Release the curl handles held by this stream.

        Intended to be called once, after iteration is finished or abandoned.
        """
        self.curl_multi.remove_handle(self.curl)
        self.curl_multi.close()
        self.curl.close()

    def _any_data_received(self):
        """Return True when the buffer's write position is past the start."""
        return self.received_buffer.tell() != 0

    def _get_received_data(self):
        """Return all buffered body bytes and reset the buffer to empty."""
        result = self.received_buffer.getvalue()
        # truncate() does not move the stream position, hence the seek.
        self.received_buffer.truncate(0)
        self.received_buffer.seek(0)
        return result

    def _check_status_code(self):
        """Fetch the HTTP status once it is known and reject anything but 200.

        :raises urllib.error.HTTPError: when a non-zero status other than
            ``http.HTTPStatus.OK`` has been reported by curl.
        """
        if self.status_code == 0:
            self.status_code = self.curl.getinfo(pycurl.HTTP_CODE)
        if self.status_code != 0 and self.status_code != http.HTTPStatus.OK:
            raise urllib.error.HTTPError(
                self.url, self.status_code, None, None, None
            )

    def _perform_on_curl(self):
        """Call ``perform()`` until it stops asking to be called again.

        :returns: the handle count reported by the last ``perform()`` call.
        """
        while True:
            ret, num_handles = self.curl_multi.perform()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break
        return num_handles

    def _iter_chunks(self):
        """Generator yielding raw body bytes as they are received.

        After the transfer finishes, the status code and curl-level errors
        are checked one final time.
        """
        while True:
            remaining = self._perform_on_curl()
            if self._any_data_received():
                self._check_status_code()
                yield self._get_received_data()
            # No handles left running: the transfer is over.
            if remaining == 0:
                break
            # Wait for activity before the next perform() round.
            self.curl_multi.select(self.SELECT_TIMEOUT)

        self._check_status_code()
        self._check_curl_errors()

    def _check_curl_errors(self):
        """Raise ``pycurl.error`` for the first failed entry of ``info_read()``."""
        # Index 2 of info_read()'s result is iterated as the failure list;
        # everything after the first element of an entry becomes the
        # exception arguments.
        for f in self.curl_multi.info_read()[2]:
            raise pycurl.error(*f[1:])

    def iter_lines(self):
        """Return a generator over the decoded lines of the response body."""
        chunks = self._iter_chunks()
        return self._split_lines_from_chunks(chunks)

    def _split_lines_from_chunks(self, chunks):
        """Generator that splits byte chunks into lines and decodes them.

        Lines are yielded without their terminator.  A trailing fragment that
        does not end in ``\\n`` is held back until the next chunk, which also
        covers a ``\\r\\n`` pair that is split across two chunks (the ``\\r``
        alone is not treated as a finished line until more data shows what
        follows it).

        :param chunks: iterable of ``bytes`` objects.
        """
        pending = b''
        for chunk in chunks:
            lines = (pending + chunk).splitlines(keepends=True)
            pending = b''
            if lines and not lines[-1].endswith(b'\n'):
                # Incomplete line, or a bare '\r' that may be the first half
                # of '\r\n': wait for more data.
                pending = lines.pop()
            # ``self.charset`` is read only now that body data exists, i.e.
            # after the header callback has filled ``self.headers``.
            for line in lines:
                yield line.rstrip(b'\r\n').decode(self.charset)
        # Flush whatever is left once the chunks are exhausted.
        if pending:
            yield pending.rstrip(b'\r\n').decode(self.charset)

    @property
    def charset(self):
        """Charset used to decode the body.

        Taken from the ``charset`` parameter of the ``content-type`` response
        header; falls back to ``HTTP_STANDARD_ENCODING`` when the header or
        the parameter is missing.  The result is cached, except that the
        fallback is not cached while no header has been received at all, so
        reading this property before the transfer starts cannot pin the
        wrong value.
        """
        if self._charset is not None:
            return self._charset
        content_type = self.headers.get('content-type', '')
        match = self._CHARSET_RE.search(content_type.lower())
        if match:
            self._charset = match.group(1)
        elif self.headers:
            # Headers are in and none of them names a charset: the fallback
            # is final.
            self._charset = self.HTTP_STANDARD_ENCODING
        else:
            # Nothing received yet -- answer with the fallback but keep the
            # cache empty so a later lookup can still see the real header.
            return self.HTTP_STANDARD_ENCODING
        return self._charset

    def header_function(self, header_line):
        """Header callback: store one ``name: value`` line in ``self.headers``.

        Lines without a colon (such as the status line or the blank line that
        ends the header section) are ignored.  Names are stored lower-cased;
        a repeated header overwrites the earlier value.

        :param header_line: raw header line as ``bytes``.
        """
        header_line = header_line.decode(self.HTTP_STANDARD_ENCODING)
        name, colon, value = header_line.partition(':')
        if not colon:
            return
        self.headers[name.strip().lower()] = value.strip()


def request(method, url, data=None, params=None, headers=None,
            stream=False):
    """Start an HTTP request.

    Only streaming mode is handled: when ``stream`` is truthy a
    ``CurlHTTPStream`` built from the arguments is returned; otherwise the
    function returns ``None``.
    """
    if not stream:
        return None
    stream_kwargs = {'data': data, 'params': params, 'headers': headers}
    return CurlHTTPStream(method, url, **stream_kwargs)