当我使用python2.7和MySQLdb在Ubuntu中编程时,我在python中使用其他语言时出错了。只有英文不会出现此错误。
Traceback (most recent call last):
File "crawl.py", line 242, in <module>
parseArticle( u )
File "crawl.py", line 146, in parseArticle
gatherNeighborInfo( soup )
File "crawl.py", line 69, in gatherNeighborInfo
db.updateURL( url , '자신의 글 주소들을 db에 저장합니다' )
File "crawl.py", line 211, in updateURL self.cursor.execute("UPDATE urls SET state=%d,content='%s' WHERE url='%s'"%(state,content,url))
UnicodeDecodeError: 'ascii' codec can't decode byte 0xec in position 33: ordinal not in range(128)
所以我试图将ascii更改为utf-8。我在/usr/local/lib/python2.7/site-packages上创建了一个名为sitecustomize.py的文件。 和sitecustomize.py源代码如下。
import sys
sys.setdefaultencoding("utf-8")
但没有任何改变。请帮我。 这是完整的源代码。
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import robotparser
import urllib2
import time, traceback, re
import MySQLdb
crawler_name = 'daum_blog_crawler'
mainpage = 'http://blog.daum.net/'
# robot parser setting.
rp = robotparser.RobotFileParser( mainpage + 'robots.txt' )
rp.read()
def canFetch( url ):
return rp.can_fetch( crawler_name, url )
def getContent( url, delay=1):
time.sleep( delay )
if not canFetch( url ):
#print 'This url can NOT be fetched by our crawler :', url
return None
try:
opener = urllib2.build_opener()
opener.addheaders = [('User-agent',crawler_name)]
contents = opener.open(url).read()
except:
traceback.print_exc()
return None
return contents
def getArticleInfo( soup ):
rBlog = re.compile('.+blog.daum.net/\w+/\d+.*?')
URLs = soup('a',{'href':rBlog})
return [ u.get('href').split('?')[0] for u in URLs ]
def getOwnArticles( contents ):
ret = []
soup = BeautifulSoup( contents )
rBlog = re.compile('.+/BlogTypeView.+')
for u in soup('a',{'href':rBlog}):
href = u.get('href')
article = href.split('articleno=')[1].split('&')[0]
if ret.count(article)<1:
ret.append( article )
return ret
def gatherNeighborInfo( soup ):
rBlog = re.compile('http://blog.daum.net/\w+')
Neighbors = soup('a',{'href':rBlog})
cnt = 0
for n in Neighbors:
url = n.get('href')
blogname = url.split('/')[-1]
if url and url.startswith('http://') and db.isCrawledURL(url)<1:
db.insertURL( url, 1 )
db.updateURL( url , '자신의 글 주소들을 db에 저장합니다' )
url2 = getRedirectedURL( url )
if not url2: continue
re_url = 'http://blog.daum.net' + url2
body = getContent( re_url, 0 )
if body:
for u in getOwnArticles( body ):
fullpath = 'http://blog.daum.net/'+blogname+'/'+u
cnt+=db.insertURL( fullpath )
if cnt>0: print '%d neighbor articles inserted'%cnt
def getRedirectedURL( url ):
contents = getContent( url )
if not contents: return None
#redirect
try:
soup = BeautifulSoup( contents )
frame = soup('frame')
src = frame[0].get('src')
except:
src = None
return src
def getBody( soup, parent ):
rSrc = re.compile('.+/ArticleContentsView.+')
iframe = soup('iframe',{'src':rSrc})
if len(iframe)>0:
src = iframe[0].get('src')
iframe_src = 'http://blog.daum.net'+src
req = urllib2.Request( iframe_src )
req.add_header('Referer', parent )
body = urllib2.urlopen(req).read()
soup = BeautifulSoup( body )
return str(soup.body)
else:
print 'NULL contents'
return ''
def parseArticle( url ):
article_id = url.split('/')[-1]
blog_id = url.split('/')[-2]
if blog_id.isdigit():
print 'digit:', url.split('/')
newURL = getRedirectedURL( url )
if newURL:
newURL = 'http://blog.daum.net'+newURL
print 'redirecting', newURL
contents = getContent( newURL, 0 )
if not contents:
print 'Null Contents...'
db.updateURL( url, -1 )
return
soup = BeautifulSoup( contents )
gatherNeighborInfo( soup )
n=0
for u in getArticleInfo( soup ):
n+=db.insertURL( u )
if n>0: print 'inserted %d urls from %s'%(n,url)
sp = contents.find('<title>')
if sp>-1:
ep = contents[sp+7:].find('</title>')
title = contents[sp+7:sp+ep+7]
else:
title = ''
contents = getBody( soup, newURL )
pStyle = re.compile('<style(.*?)>(.*?)</style>', re.IGNORECASE | re.MULTILINE | re.DOTALL )
contents = pStyle.sub('', contents)
pStyle = re.compile('<script(.*?)>(.*?)</script>', re.IGNORECASE | re.MULTILINE | re.DOTALL )
contents = pStyle.sub('', contents)
pStyle = re.compile("<(.*?)>", re.IGNORECASE | re.MULTILINE | re.DOTALL )
contents = pStyle.sub("", contents)
db.updateURL( url , '처리했다고 db에 표시합니다.' )
else:
print 'Invalid blog article...'
db.updateURL( url, 'None', -1 )
class DB:
"MySQL wrapper class"
def __init__(self):
self.conn = MySQLdb.connect(db='crawlDB', user='root', passwd='......')
self.cursor = self.conn.cursor()
self.cursor.execute('CREATE TABLE IF NOT EXISTS urls(url CHAR(150), state INT, content TEXT)')
def commit(self):
self.conn.commit()
def __del__(self):
self.conn.commit()
self.cursor.close()
def insertURL(self, url, state=0, content=None):
if url[-1]=='/': url=url[:-1]
try:
self.cursor.execute("INSERT INTO urls VALUES ('%s',%d,'%s')"%(url,state,content))
except:
return 0
else:
return 1
def selectUncrawledURL(self):
self.cursor.execute('SELECT * FROM urls where state=0')
return [ row[0] for row in self.cursor.fetchall() ]
def updateURL(self, url, content, state=1):
if url[-1]=='/': url=url[:-1]
self.cursor.execute("UPDATE urls SET state=%d,content='%s' WHERE url='%s'"%(state,content,url))
def isCrawledURL(self, url):
if url[-1]=='/': url=url[:-1]
self.cursor.execute("SELECT COUNT(*) FROM urls WHERE url='%s' AND state=1"%url)
ret = self.cursor.fetchone()
return ret[0]
db = DB()
if __name__=='__main__':
print 'starting crawl.py...'
contents = getContent( mainpage )
URLs = getArticleInfo( BeautifulSoup( contents ) )
nSuccess = 0
for u in URLs:
nSuccess += db.insertURL( u )
print 'inserted %d new pages.'%nSuccess
while 1:
uncrawled_urls = db.selectUncrawledURL()
if not uncrawled_urls: break
for u in uncrawled_urls:
print 'downloading %s'%u
try:
parseArticle( u )
except:
traceback.print_exc()
db.updateURL( u, -1 )
db.commit()
#bs.UpdateIndex()
答案 0 :(得分:1)
连接时指定charset
self.conn = MySQLdb.connect(db='crawlDB', user='root', passwd='......', charset='utf8')
替换以下行:
self.cursor.execute("UPDATE urls SET state=%d,content='%s' WHERE url='%s'"%(state,content,url))
with(将sql与参数分开):
self.cursor.execute("UPDATE urls SET state=%s, content=%s WHERE url=%s", (state,content,url))
示例会话:
>>> import MySQLdb
>>> db = MySQLdb.connect('localhost', db='test', charset='utf8')
>>> cursor = db.cursor()
>>> cursor.execute('DROP TABLE IF EXISTS urls')
0L
>>> cursor.execute('CREATE TABLE urls(url char(200), state int, content text)')
0L
>>> cursor.execute('INSERT INTO urls(url, state, content) VALUES(%s, %s, %s)', ('http://daum.net/', 1, u'\uc548\ub155'))
1L
>>> cursor.execute('SELECT * FROM urls')
1L
>>> for row in cursor.fetchall():
... print row
...
(u'http://daum.net/', 1L, u'\uc548\ub155')
答案 1 :(得分:0)
由于您要将MySql命令生成为字符串,因此需要将这些字符串设置为unicode字符串,尝试将所有cursor.execute("
行更改为cursor.execute(u"
答案 2 :(得分:0)
尝试将envirement变量“PYTHONIOENCODING”更改为“utf_8”。如果你不想导出它,你可以做这样的事情
PYTHONIOENCODING=utf-8 python myproject.py
此外,您必须使用 u“”字符串。