我有这段代码:
#!/usr/local/bin/python
# -*- coding: utf-8 -*-
import re
import urllib2
import BeautifulSoup
import csv
origin_site = 'http://typo3.nimes.fr/index.php?id=annuaire_assos&theme=0&rech=&num_page='
get_url = re.compile(r"""window.open\('(.*)','','toolbar=0,""", re.DOTALL).findall
pages = range(1,2)
for page_no in pages:
req = ('%s%s' % (origin_site, page_no))
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = { 'User-Agent' : user_agent }
try:
urllib2.urlopen(req)
except urllib2.URLError, e:
pass
else:
# do something with the page
doc = urllib2.urlopen(req)
soup = BeautifulSoup.BeautifulSoup(doc)
infoblock = soup.findAll('tr', { "class" : "menu2" })
for item in infoblock:
assoc_data = []
soup = BeautifulSoup.BeautifulSoup(str(item))
for tag in soup.recursiveChildGenerator():
if isinstance(tag,BeautifulSoup.Tag) and tag.name in ('td'):
if tag.string is not None:
assoc_name = (tag.string)
if isinstance(tag,BeautifulSoup.Tag) and tag.name in ('u'):
if tag.string is not None:
assoc_theme = (tag.string)
get_onclick = str(soup('a')[0]['onclick']) # get the 'onclick' attribute
url = get_url(get_onclick)[0]
try:
urllib2.urlopen(url)
except urllib2.URLError, e:
pass
else:
assoc_page = urllib2.urlopen(url)
#print assoc_page, url
soup_page = BeautifulSoup.BeautifulSoup(assoc_page)
assoc_desc = soup_page.find('table', { "bgcolor" : "#FFFFFF" })
#print assoc_desc
get_address = str(soup_page('td', { "class" : "menu2" }))
soup_address = BeautifulSoup.BeautifulSoup(get_address)
for tag in soup_address.recursiveChildGenerator():
if isinstance(tag,BeautifulSoup.Tag) and tag.name in ('a'):
if tag.string is not None:
assoc_email = (tag.string)
assoc_data.append(assoc_theme)
assoc_data.append(assoc_name)
assoc_data.append(assoc_email)
for tag in soup_address.recursiveChildGenerator():
if isinstance(tag,BeautifulSoup.Tag) and tag.name in ('td'):
if tag.string is not None:
if tag.string != ' ':
get_string = BeautifulSoup.BeautifulSoup(tag.string)
assoc_data.append(get_string)
#data.append(get_string)
c = csv.writer(open("MYFILE.csv", "wb"))
for item in assoc_data:
c.writerow(item)
但得到此错误:
UnicodeEncodeError: 'ascii' codec can't encode character u'\xc7' in position 0: ordinal not in range(128)
如何将法语字符传递到MYFILE.csv文件中?我可以进一步改进代码吗?
答案 0 :(得分:3)
看起来 urllib2 返回的结果是 unicode 字符串,但 Python 2 的 CSV 模块不支持 Unicode,只支持 8 位字节字符串。
相反,您必须在编写之前将每个字符串转换为UTF-8。例如:
# Python 2's csv module handles byte strings only, hence binary mode.
c = csv.writer(open("MYFILE.csv", "wb"))
for item in assoc_data:
    # Skip None/empty fields (the extra `item != u''` test was redundant:
    # an empty unicode string is already falsy). Encoding to UTF-8 bytes
    # lets accented characters pass through the byte-oriented writer.
    if item:
        c.writerow([item.encode("UTF-8")])
答案 1 :(得分:3)
滚动到底部:http://docs.python.org/library/csv.html
具体来说,请使用此作者:
class UnicodeWriter:
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Rows are first serialised into an in-memory scratch buffer,
        # then transcoded and flushed to the real target stream.
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        """Write one row of unicode strings to the target stream."""
        # csv only understands bytes, so encode every cell to UTF-8 first.
        encoded_cells = [cell.encode("utf-8") for cell in row]
        self.writer.writerow(encoded_cells)
        # Pull the UTF-8 bytes back out of the scratch buffer, re-encode
        # them into the caller's target encoding, and flush downstream.
        payload = self.queue.getvalue().decode("utf-8")
        self.stream.write(self.encoder.encode(payload))
        # Reset the scratch buffer for the next row.
        self.queue.truncate(0)

    def writerows(self, rows):
        """Write every row in *rows* via writerow()."""
        for single_row in rows:
            self.writerow(single_row)
然后,把原来的
c = csv.writer(open("MYFILE.csv", "wb"))
替换为
c = UnicodeWriter(open("MYFILE.csv", "wb"))
答案 2 :(得分:3)
问题在于我没有正确处理 unicode,以下是最新的代码:
#!/usr/local/bin/python
# -*- coding: utf-8 -*-
import urllib2
import BeautifulSoup
import csv
origin_site = 'http://typo3.nimes.fr/index.php?id=annuaire_assos&theme=0&rech=&num_page='
pages = range(1,21)
assoc_table = []
for page_no in pages:
print page_no
req = ('%s%s' % (origin_site, page_no))
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = { 'User-Agent' : user_agent }
try:
doc = urllib2.urlopen(req)
except urllib2.URLError, e:
pass
else:
# do something with the page
soup = BeautifulSoup.BeautifulSoup(doc)
for row in soup.findAll('tr', { "class" : "menu2" }):
assoc_data = []
item = row.renderContents()
soup = BeautifulSoup.BeautifulSoup(item)
# we get the Thème
for assoc_theme in soup.findAll('u'):
assoc_data.append(assoc_theme.renderContents())
# we get the Nom de l'association
for assoc_name in soup.findAll('td', { "width": "70%"}):
assoc_data.append(assoc_name.renderContents())
# we list all the links to the indivudual pages
for i in soup.findAll('a', {'href':'#'}):
if 'associations' in i.attrMap['onclick']:
req = i.attrMap['onclick'].split('\'')[1]
try:
doc = urllib2.urlopen(req)
except urllib2.URLError, e:
pass
else:
soup = BeautifulSoup.BeautifulSoup(doc)
emails = []
web_sites = []
for tag in soup.recursiveChildGenerator():
if isinstance(tag,BeautifulSoup.Tag) and tag.name in ('a'):
assoc_link = (tag.string)
if '@' in str(assoc_link):
print assoc_link
emails.append(assoc_link)
if emails != []:
assoc_data.append(emails[0])
else:
assoc_data.append('pas du email')
for tag in soup.recursiveChildGenerator():
if isinstance(tag,BeautifulSoup.Tag) and tag.name in ('a'):
assoc_link = (tag.string)
if 'http' in str(assoc_link):
web_sites.append(assoc_link)
#
if web_sites != []:
assoc_data.append(web_sites[0])
else:
assoc_data.append('pas du site web')
assoc_addr = []
assoc_cont = soup.findAll('td', { "width" : "49%", "class": "menu2" })
for i in assoc_cont:
assoc_addr.append(i.renderContents())
assoc_tels = []
for addr in assoc_addr:
assoc_data.append(addr)
assoc_tel = soup.findAll('td', { "width" : "45%", "class": "menu2" })
for i in assoc_tel:
assoc_tels.append(i.renderContents())
assoc_data.append(assoc_tels[0])
print assoc_tels[0]
assoc_table.append(assoc_data)
print assoc_data
print assoc_table
c = csv.writer(open("nimes_assoc.csv", "wb"))
for item in assoc_table:
#print item
c.writerow(item)
感谢各位的帮助,也感谢 tutor@python.org 邮件列表。