Question

我在抓取Wikipedia并转储为json文件时遇到了UnicodeEncodeError。下面是我的代码片段和错误消息。看起来是字符'é'导致了这个问题，但我不知道该如何解决。

import urllib2
import json

# List of philosopher's name: mergel list
# print mergel
# For every name in `mergel` (defined outside this snippet), build a MediaWiki
# API query URL, fetch it, parse the JSON response and write it back out to
# ./json/<title>.json.
i = 0
for name in mergel:
# Use the API to get the page content in a format that we like.
# https://en.wikipedia.org/w/api.php?action=query&titles=Spider-Man&prop=revisions&rvprop=content&format=json
# set the parameters (https://www.mediawiki.org/wiki/API:Tutorial)
    i = i+1  # running counter; not read anywhere in this snippet
    baseurl = "https://en.wikipedia.org/w/api.php?"
    action = "action=query"
    titlename = name.replace(" ", "_")
    print titlename
    title = "titles="+titlename
    content = "prop=revisions&rvprop=content"
    dataformat = "format=json"

# construct the query
    query = "%s%s&%s&%s&%s" % (baseurl, action, title, content, dataformat)
    print query
    # NOTE(review): this is the call that fails in the traceback below. The
    # title is placed into the URL as-is (no percent-encoding), so a
    # non-ASCII character such as u'\xe9' ends up in the data sent to the
    # socket. Presumably the names in `mergel` are unicode objects -- confirm.
    wikiresponse = urllib2.urlopen(query)
    wikisource = wikiresponse.read()
#     print wikisource
    wikijson = json.loads(wikisource)
    # The output file is named after the title with spaces replaced by "_".
    jsonfilename = './json/'+titlename+'.json'
    with open(jsonfilename, 'w') as outfile:
        json.dump(wikijson, outfile)

错误讯息：

Tenzin_Gyatso
https://en.wikipedia.org/w/api.php?action=query&titles=Tenzin_Gyatso&prop=revisions&rvprop=content&format=json
Claude_Lévi-Strauss
https://en.wikipedia.org/w/api.php?action=query&titles=Claude_Lévi-Strauss&prop=revisions&rvprop=content&format=json
---------------------------------------------------------------------------
UnicodeEncodeError                        Traceback (most recent call last)
<ipython-input-203-8430fc805550> in <module>()
     21     query = "%s%s&%s&%s&%s" % (baseurl, action, title, content, dataformat)
     22     print query
---> 23     wikiresponse = urllib2.urlopen(query)
     24     wikisource = wikiresponse.read()
     25 #     print wikisource

/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    152     else:
    153         opener = _opener
--> 154     return opener.open(url, data, timeout)
    155 
    156 def install_opener(opener):

/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in open(self, fullurl, data, timeout)
    429             req = meth(req)
    430 
--> 431         response = self._open(req, data)
    432 
    433         # post-process response

/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in _open(self, req, data)
    447         protocol = req.get_type()
    448         result = self._call_chain(self.handle_open, protocol, protocol +
--> 449                                   '_open', req)
    450         if result:
    451             return result

/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in _call_chain(self, chain, kind, meth_name, *args)
    407             func = getattr(handler, meth_name)
    408 
--> 409             result = func(*args)
    410             if result is not None:
    411                 return result

/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in https_open(self, req)
   1238         def https_open(self, req):
   1239             return self.do_open(httplib.HTTPSConnection, req,
-> 1240                 context=self._context)
   1241 
   1242         https_request = AbstractHTTPHandler.do_request_

/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in do_open(self, http_class, req, **http_conn_args)
   1192 
   1193         try:
-> 1194             h.request(req.get_method(), req.get_selector(), req.data, headers)
   1195         except socket.error, err: # XXX what error?
   1196             h.close()

/Users/sundong/anaconda/lib/python2.7/httplib.pyc in request(self, method, url, body, headers)
   1051     def request(self, method, url, body=None, headers={}):
   1052         """Send a complete request to the server."""
-> 1053         self._send_request(method, url, body, headers)
   1054 
   1055     def _set_content_length(self, body, method):

/Users/sundong/anaconda/lib/python2.7/httplib.pyc in _send_request(self, method, url, body, headers)
   1091         for hdr, value in headers.iteritems():
   1092             self.putheader(hdr, value)
-> 1093         self.endheaders(body)
   1094 
   1095     def getresponse(self, buffering=False):

/Users/sundong/anaconda/lib/python2.7/httplib.pyc in endheaders(self, message_body)
   1047         else:
   1048             raise CannotSendHeader()
-> 1049         self._send_output(message_body)
   1050 
   1051     def request(self, method, url, body=None, headers={}):

/Users/sundong/anaconda/lib/python2.7/httplib.pyc in _send_output(self, message_body)
    891             msg += message_body
    892             message_body = None
--> 893         self.send(msg)
    894         if message_body is not None:
    895             #message_body was not a string (i.e. it is a file) and

/Users/sundong/anaconda/lib/python2.7/httplib.pyc in send(self, data)
    867                 datablock = data.read(blocksize)
    868         else:
--> 869             self.sock.sendall(data)
    870 
    871     def _output(self, s):

/Users/sundong/anaconda/lib/python2.7/ssl.pyc in sendall(self, data, flags)
    719             count = 0
    720             while (count < amount):
--> 721                 v = self.send(data[count:])
    722                 count += v
    723             return amount

/Users/sundong/anaconda/lib/python2.7/ssl.pyc in send(self, data, flags)
    685                     self.__class__)
    686             try:
--> 687                 v = self._sslobj.write(data)
    688             except SSLError as x:
    689                 if x.args[0] == SSL_ERROR_WANT_READ:

UnicodeEncodeError: 'ascii' codec can't encode character u'\xe9' in position 43: ordinal not in range(128)

然而，下面这段简单、直接的代码（不从列表中获取标题）却可以正常运行，没有任何问题。

import urllib2
import json
# Same request as in the loop above, but with the page title hard-coded in
# the URL literal instead of being taken from the list.
# NOTE(review): the literal has no u prefix, so under Python 2 it is
# presumably a byte string in the source file's encoding -- confirm.
query = 'https://en.wikipedia.org/w/api.php?action=query&titles=Claude_Lévi-Strauss&prop=revisions&rvprop=content&format=json'
wikiresponse = urllib2.urlopen(query)
wikisource = wikiresponse.read()
# Parse the response and immediately write it back out as JSON.
wikijson = json.loads(wikisource)
jsonfilename = './json/'+'Claude_Lévi-Strauss'+'.json'
with open(jsonfilename, 'w') as outfile:
    json.dump(wikijson, outfile)

Answer 1

不要混合Unicode和字节串：使用Unicode字符串来处理Python中的文本。

不要手动拼接网址，请使用urllib中的函数，例如quote()、urlencode()。另外，也可以考虑urlparse模块中的函数，例如urljoin()、urlunsplit()。

您请求的已经是json格式，没有必要先解析它、再立即以相同的格式转储回去；您可以使用shutil.copyfileobj()直接复制类文件对象。之后可以再检查结果文件，以确认它已被正确下载。

总结一下，这里是如何将具有给定标题的维基页面保存为JSON格式的文件：

#!/usr/bin/env python2
# -*- coding: utf-8 -*-
import os
from contextlib import closing
from urllib import quote
from urllib2 import urlopen
from shutil import copyfileobj

def urlretrieve(url, filename, chunksize=8096):
    """Download *url* and store the raw response body in *filename*.

    The response is copied to the destination file in pieces of
    *chunksize* bytes. The file is opened in binary mode, so the bytes
    are written exactly as they were received.
    """
    response = urlopen(url)
    try:
        with open(filename, 'wb') as sink:
            copyfileobj(response, sink, chunksize)
    finally:
        # Always close the response, whether or not the copy succeeded.
        response.close()

#XXX for name in mergel:
name = u"Claude Lévi-Strauss" #NOTE: Unicode string
# The title is encoded to UTF-8 bytes and percent-quoted before it is
# appended to the query string; the output file goes under the 'json'
# directory and is named after the title.
urlretrieve(
    "".join((
        "https://en.wikipedia.org/w/api.php?",
        "action=query&prop=revisions&rvprop=content&format=json&",
        "titles=",
        quote(name.encode('utf-8')),
    )),
    os.path.join('json', name + '.json'),
)

注意：

在这种情况下你不需要.replace(' ', '_')
os.path.join('json', name + '.json')这一行混合了字节串（'json'、'.json'）和Unicode（type(name) == unicode）。在这里没有问题，因为'json'和'.json'都是源代码中的纯ASCII字面量
# -*- coding: utf-8 -*-编码声明仅影响Python源代码中字面上出现的字符，例如，在这种特定情况下，查询字符串也使用相同的编码是偶然的。源代码的编码与可能用于文件名的字符编码无关，或者通过http传输数据，或者将Unicode文本写入终端等（所有这些编码可能彼此不同）。
原则上，您可以在此处使用urllib.urlretrieve(url, filename)来代替urlopen + copyfileobj，但在Python 2上urllib.urlretrieve()的行为与urllib2.urlopen()不同

以下是使用requests的相同代码：

requests

然而，下面这段简单、直接的代码（不从列表中获取标题）却可以正常运行，没有任何问题。

您的代码使用了非ASCII的字节串字面量（这在Python 3中是非法的）。之所以没有编码错误，是因为所有数据本来就已经是字节。使用字节串的问题在于：不同的环境可能使用不同的字符编码，一旦编码不一致代码就会出错（您不能指望所有地方都使用utf-8，尽管那样也许更理想）。另外，查询部分应当被正确编码，例如é应当以'%C3%A9'的形式发送。

无关：要一次下载多个网页，您可以使用线程池：

#!/usr/bin/env python2
# -*- coding: utf-8 -*-    
import os
from urllib import quote
import requests # $ pip install requests

def urlretrieve(url, filename, chunksize=8096):
    """Download *url* with requests and save the response body to *filename*.

    The body is streamed (``stream=True``) and written to the file in
    binary mode, *chunksize* bytes at a time.

    Raises:
        requests.HTTPError: if the server answers with an HTTP error status.
    """
    r = requests.get(url, stream=True)
    try:
        r.raise_for_status() # raise on http error
        with open(filename, 'wb') as f:
            for chunk in r.iter_content(chunksize):
                f.write(chunk)
    finally:
        # A streamed response keeps its connection checked out until the
        # body is fully consumed or the response is closed. Close it
        # explicitly so an HTTP error or a failed write does not leak it.
        r.close()

#XXX for name in mergel:
name = u"Claude Lévi-Strauss" #NOTE: Unicode string
# Build the API URL from its fixed prefix plus the percent-quoted UTF-8 form
# of the title, and save the response under the 'json' directory in a file
# named after the title.
urlretrieve(
    "".join((
        "https://en.wikipedia.org/w/api.php?",
        "action=query&prop=revisions&rvprop=content&format=json&",
        "titles=",
        quote(name.encode('utf-8')),
    )),
    os.path.join('json', name + '.json'),
)

设置maxlag查询参数并遵守Retry-After HTTP响应头是一种礼貌的做法。维基百科API有若干封装库，它们也许可以替您完成这些工作。

urllib2

1 个答案: