我正在尝试解析俄语网站(http://games4you.ucoz.ua/news/)中的所有帖子。我使用的是Python 2.7.9和BeautifulSoup 4.我在PyCharm工作。我已经尝试了许多工作来使它工作,但我仍然得到这个而不是俄语文本:'\u0421\u0442\u0440\u0430\u0442\u0435\u0433\u0456\u0457'
这是我的代码:
# Parsing information from games4you.ucoz.ua
# -*- coding: utf-8 -*-
import re
import csv
import urllib
from bs4 import BeautifulSoup
BASE_URL = "http://games4you.ucoz.ua/news/"
def get_html(url):
response = urllib.urlopen(url)
return response.read()
def get_page_count(html):
soup = BeautifulSoup(html)
paggination = soup.find('div', class_='catPages1')
return int(paggination.find_all('a')[-2].text)
def save(games, path):
# with open(path, 'w') as csvfile:
# writer = csv.writer(csvfile)
#
# writer.writerow(('Title', 'Category', 'Date', 'Time'))
#
# writer.writerows(
# (game['title'], ', '.join(game['category']), game['date'], game['time']) for game in games
# )
with open(path,'w+') as f:
f.write(str(games).encode("UTF-8"))
def parse(html):
soup = BeautifulSoup(html)
# Getting the <div> that contains all posts on page
all_entries = soup.find('div',id='allEntries')
# Getting all of the posts (every table represents one post)
tables = all_entries.find_all('table',class_='eBlock')
# Creating a list o dictionaries for games information
games = []
for table in tables:
try:
# Getting the game title
game_title = table.tr.td.a.text
game_post_body = table.find('div',class_='eMessage')
# Getting the game description
game_description = game_post_body.p.text.split('....')[0] + '.'
game_details = table.find('div',class_='eDetails')
# Getting the game category
game_category = game_details.a.text
game_post_details = game_details.text
except:
print 'Some error'
continue
# Getting the post views count
post_views = game_post_details[:game_post_details.find('function')].split()[-2]
# Getting the post date
post_date = game_details.span.text
# Getting the post time
post_time = game_details.span['title']
# print 'Game title: ',game_title,'\n'
# print 'Views: ',post_views,'\n'
# print 'Game category: ',game_category,'\n'
# print 'Game description: ','\n',game_description,'\n'
# print 'Post date: ',post_date,'\n'
# print 'Post time: ',post_time,'\n'
games.append({
'title': game_title,
'category' : game_category,
'description' : game_description,
'date' : post_date,
'time' : post_time
})
return games
def main():
total_pages = get_page_count(get_html(BASE_URL))
print('Total found %d pages...' % total_pages)
games = []
for page in range(1, total_pages + 1):
print('Parsing %d%% (%d/%d)' % (page / total_pages * 100, page, total_pages))
games.extend(parse(get_html(BASE_URL + "?page%d" % page)))
print('Saving...')
save(games, 'games.txt')
main()
答案 0 :(得分:0)
>>> import HTMLParser
>>> s = 'Ell és la víctima que expia els nostres pecats, i no tan sols els nostres, sinó els del món sencer.'
>>> print HTMLParser.HTMLParser().unescape(s)
Ell és la víctima que expia els nostres pecats, i no tan sols els nostres, sinó els del món sencer.
在Python3中
>>> import html
>>> html.unescape(s)
你的例子
'Стратегії'
对于“普通”utf-8文件写入(阅读)使用
import codecs
f = codecs.open(filename, 'w', 'utf-8')
希望这会有所帮助
答案 1 :(得分:0)
是的,我做到了!猜猜我搞砸了解码/编码文本并使用了不同的字符集。我所要做的就是将我从BeautifulSoup获得的数据从Unicode转换为Utf-8,如下所示:
game_title = game_title.encode("utf-8")
game_category = game_category.encode("utf-8")
game_description = game_description.encode("utf-8")
post_date = post_date.encode("utf-8")
post_time = post_time.encode("utf-8")
不需要其他任何东西。这是对我有用的结果代码:
# Parsing information from games4you.ucoz.ua
import csv
import urllib
from bs4 import BeautifulSoup
BASE_URL = "http://games4you.ucoz.ua/news/"
def get_html(url):
response = urllib.urlopen(url)
return response.read()
def get_page_count(html):
soup = BeautifulSoup(html)
paggination = soup.find('div', class_='catPages1')
return int(paggination.find_all('a')[-2].text)
def save(games, path):
with open(path, 'w+') as csvfile:
writer = csv.writer(csvfile)
writer.writerow(('Title', 'Category', 'Date', 'Time'))
writer.writerows(
(game['title'],game['category'], game['date'], game['time']) for game in games
)
def parse(html):
soup = BeautifulSoup(html)
# Getting the <div> that contains all posts on page
all_entries = soup.find('div',id='allEntries')
# Getting all of the posts (every table represents one post)
tables = all_entries.find_all('table',class_='eBlock')
# Creating a list o dictionaries for games information
games = []
for table in tables:
try:
# Getting the game title
game_title = table.tr.td.a.text
game_post_body = table.find('div',class_='eMessage')
# Getting the game description
game_description = game_post_body.p.text.split('....')[0] + '.'
game_details = table.find('div',class_='eDetails')
# Getting the game category
game_category = game_details.a.text
game_post_details = game_details.text
except:
print 'Some error'
continue
# Getting the post views count
post_views = game_post_details[:game_post_details.find('function')].split()[-2]
# Getting the post date
post_date = game_details.span.text
# Getting the post time
post_time = game_details.span['title']
# Converting all data from Unicode to Utf-8
game_title = game_title.encode("utf-8")
game_category = game_category.encode("utf-8")
game_description = game_description.encode("utf-8")
post_date = post_date.encode("utf-8")
post_time = post_time.encode("utf-8")
# Writing data to the list
games.append({
'title': game_title,
'category' : game_category,
'description' : game_description,
'date' : post_date,
'time' : post_time
})
return games
def main():
total_pages = get_page_count(get_html(BASE_URL))
print('Total found %d pages...' % total_pages)
games = []
for page in range(1, total_pages + 1):
print('Parsing %d%% (%d/%d)' % (page / total_pages * 100, page, total_pages))
games.extend(parse(get_html(BASE_URL + "?page%d" % page)))
print('Saving...')
save(games, 'games.csv')
main()