我目前正在做一个新闻聚合器的个人项目。
为此,我使用 Python 的 feedparser 库从若干站点的订阅源中收集文章信息。
以下代码是我的模块 digitaltrends.py 中的函数。
import feedparser
import requests
import re
import html
# Matches one HTML tag.  DOTALL lets "." cross line breaks, so a tag whose
# attributes are wrapped over several lines is still removed as a whole.
_TAG_PATTERN = re.compile(r'<.*?>', re.DOTALL)


def cleanHtml(raw_html):
    """Return *raw_html* with every ``<...>`` tag removed.

    Only the markup is stripped; character references such as ``&amp;`` are
    left untouched.

    Args:
        raw_html: Text that may contain HTML tags.

    Returns:
        The text with each shortest ``<`` ... ``>`` span deleted.
    """
    return _TAG_PATTERN.sub('', raw_html)
def to_utf(string):
    """Round-trip *string* through UTF-8 and return the resulting text.

    Characters that cannot be encoded as UTF-8 are dropped, because the
    encode step runs with ``errors='ignore'``.

    Args:
        string: The text to normalise.

    Returns:
        The text decoded back from its UTF-8 encoding.
    """
    utf8_bytes = string.encode('utf-8', errors='ignore')
    return utf8_bytes.decode('utf-8')
def get_feed():
    """Fetch the Digital Trends feed and return its entries as plain dicts.

    Returns:
        A list with one dict per feed entry.  Each dict has the keys
        ``press``, ``link``, ``pubdate``, ``author``, ``title``, ``summary``
        and ``thumbnail``.  ``author`` is ``None`` when the entry names no
        author, and ``thumbnail`` is ``None`` when the entry has no
        ``thumbnail`` key.
    """
    feed = 'https://www.digitaltrends.com/feed/'
    parsed = feedparser.parse(feed)
    press = to_utf(parsed['feed']['title'])

    articles = []
    for entry in parsed['entries']:
        # An entry may carry no author details at all; fall back to None
        # instead of raising on the missing attribute / empty list.
        authors = entry.get('authors') or [{}]
        author_name = authors[0].get('name')

        # The summary is stripped of tags first, then HTML entities are decoded.
        summary = html.unescape(cleanHtml(entry.summary))

        articles.append({
            'press': press,
            'link': to_utf(entry.link),
            'pubdate': to_utf(entry.published),
            'author': to_utf(author_name) if author_name is not None else None,
            'title': to_utf(entry.title),
            'summary': to_utf(summary),
            'thumbnail': (to_utf(entry['thumbnail'])
                          if 'thumbnail' in entry else None),
        })
    return articles
之后,我执行下面的 SQL 语句,在自己的数据库中创建了一张表:
# Schema for the table that stores the collected articles (a rough version
# written for a first test).  ID is the auto-incrementing primary key;
# Author, Summary and Thumbnail are the only columns that accept NULL.
# Pubdate is kept as TEXT rather than a date/time type.
sql = '''CREATE TABLE articles (
ID INT NOT NULL auto_increment,
Press VARCHAR(255) NOT NULL,
Title VARCHAR(255) NOT NULL,
Author VARCHAR(255),
Pubdate TEXT NOT NULL,
Summary TEXT,
Link VARCHAR(3000) NOT NULL,
Thumbnail VARCHAR(3000),
PRIMARY KEY (ID)
)'''
最后,我试图将数据插入到我的表格中。
import digitaltrends
import MySQLdb as mysql
article_list = digitaltrends.get_feed()

# Connection setup is omitted here: `conn` must be an open MySQLdb connection
# and `cursor` a cursor created from it.

# The %s placeholders are intentionally NOT wrapped in quotes.  The driver
# escapes and quotes every parameter itself; with '"%s"' each stored value
# would carry an extra pair of quotes, and a None thumbnail would be saved as
# the text NULL instead of SQL NULL.
sql = """INSERT INTO articles (Press,Title,Author,Pubdate,Summary,Link,Thumbnail) VALUES (%s,%s,%s,%s,%s,%s,%s)"""

for each_article in article_list:
    data = (each_article['press'],
            each_article['title'],
            each_article['author'],
            each_article['pubdate'],
            each_article['summary'],
            each_article['link'],
            each_article['thumbnail'])
    cursor.execute(sql, data)

# Commit once, after every row has been sent, so the batch is stored as a whole.
conn.commit()
执行此代码时出现错误。
UnicodeEncodeError Traceback (most recent call last)
<ipython-input-24-4d8567a1f7ad> in <module>()
11 each_article['thumbnail'])
12
---> 13 cursor.execute(sql,data)
14 conn.commit()
15 #conn.rollback()
~\Anaconda3\lib\site-packages\MySQLdb\cursors.py in execute(self, query, args)
232 args = dict((key, db.literal(item)) for key, item in args.items())
233 else:
--> 234 args = tuple(map(db.literal, args))
235 if not PY2 and isinstance(query, (bytes, bytearray)):
236 query = query.decode(db.encoding)
~\Anaconda3\lib\site-packages\MySQLdb\connections.py in literal(self, o)
316 s = self._tuple_literal(o)
317 else:
--> 318 s = self.escape(o, self.encoders)
319 # Python 3(~3.4) doesn't support % operation for bytes object.
320 # We should decode it before using %.
~\Anaconda3\lib\site-packages\MySQLdb\connections.py in unicode_literal(u, dummy)
223 # unicode_literal() is called for arbitrary object.
224 def unicode_literal(u, dummy=None):
--> 225 return db.string_literal(str(u).encode(db.encoding))
226
227 def bytes_literal(obj, dummy=None):
UnicodeEncodeError: 'latin-1' codec can't encode character '\u2018' in position 18: ordinal not in range(256)
我没有将编码设置为latin-1。我的数据库和表也设置为UTF-8编码。我试着自己解决这个问题,但我不知道为什么会出现与latin-1相关的错误。
解决方法:
我最初的连接代码如下:
# Connection as first written: only the host, the credentials and the schema
# name are given; no charset is passed.
connection_options = {
    'host': 'localhost',
    'user': me,
    'password': pw,
    'db': bd,
}
conn = mysql.connect(**connection_options)
给连接函数加上 charset='utf8' 参数之后,插入就能正常运行了。原因是:未指定 charset 时,驱动按 latin-1 对字符串进行编码(见上面的回溯),因此无法编码 '\u2018' 这样的字符;这与数据库和表本身的编码设置无关。