知道如何解决这个问题吗?
import csv
import re
import time
import urllib2
from urlparse import urljoin
from bs4 import BeautifulSoup
# Scrape a Craigslist listing page and append one ';'-delimited row per
# listing link (timestamp, link text, absolute item URL) to FILENAME.

# Base used to resolve each link's href into an absolute URL.
BASE_URL = 'http://omaha.craigslist.org/sys/'
# Listing page that is actually fetched and parsed.
URL = 'http://omaha.craigslist.org/sya/'
# Output file; opened in append mode below, so rows accumulate across runs.
FILENAME = '/Users/mona/python/craigstvs.txt'

# Every request made through this opener carries a 'Mozilla/5.0' User-agent.
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
soup = BeautifulSoup(opener.open(URL))

with open(FILENAME, 'a') as f:
    writer = csv.writer(f, delimiter=';')
    # One row per <a> element whose class matches the "hdrlnk" pattern.
    for link in soup.find_all('a', class_=re.compile("hdrlnk")):
        timeset = time.strftime("%m-%d %H:%M")
        item_url = urljoin(BASE_URL, link['href'])
        # The item page is fetched and parsed, but the result is not used yet.
        item_soup = BeautifulSoup(opener.open(item_url))
        # do smth with the item_soup? or why did you need to follow this link?
        # NOTE(review): link.text is written un-encoded here; this is the
        # value the answers below suggest encoding as UTF-8 before writing.
        writer.writerow([timeset, link.text, item_url])
答案 0 :(得分:0)
根据我的经验,我不得不说 csv 模块对 Unicode 的支持并不完善,不过你可能会发现用下面这种方式打开文件很有帮助:
import codecs
...
# Open 'file.csv' through codecs in read mode ('r') with the 'UTF-8' encoding.
codecs.open('file.csv', 'r', 'UTF-8')
或者,你也可以不使用 csv 模块,而是自己处理文件的写入。
答案 1 :(得分:0)
您只需要对文本进行编码(encode):
link.text.encode("utf-8")
您也可以使用 requests 代替 urllib2:
import requests
# Scrape a Craigslist listing page with requests and append one ';'-delimited
# row per listing link (timestamp, UTF-8 encoded link text, absolute item URL)
# to FILENAME.

# Base used to resolve each link's href into an absolute URL.
BASE_URL = 'http://omaha.craigslist.org/sys/'
# Listing page that is actually fetched and parsed.
URL = 'http://omaha.craigslist.org/sya/'
# Output file; opened in append mode below, so rows accumulate across runs.
FILENAME = 'craigstvs.txt'

soup = BeautifulSoup(requests.get(URL).content)

# Python 2's csv module expects a binary-mode file object; text mode can
# produce doubled line endings on platforms that translate newlines.
with open(FILENAME, 'ab') as f:
    writer = csv.writer(f, delimiter=';')
    # One row per <a> element whose class matches the "hdrlnk" pattern.
    for link in soup.find_all('a', class_=re.compile("hdrlnk")):
        timeset = time.strftime("%m-%d %H:%M")
        item_url = urljoin(BASE_URL, link['href'])
        # The item page is fetched and parsed, but the result is not used yet.
        item_soup = BeautifulSoup(requests.get(item_url).content)
        # do smth with the item_soup? or why did you need to follow this link?
        # Encode the link text to UTF-8 bytes before handing it to csv.writer.
        writer.writerow([timeset, link.text.encode("utf-8"), item_url])