如何使用Python 2.7和BeautifulSoup正确解析俄语文本?

时间:2016-02-19 17:45:11

标签: html python-2.7 parsing beautifulsoup html-parsing

我正在尝试解析一个俄语网站(http://games4you.ucoz.ua/news/)中的所有帖子。我使用的是Python 2.7.9和BeautifulSoup 4,在PyCharm中开发。我已经尝试了许多方法,但得到的仍然是下面这样的内容,而不是俄语文本:'\u0421\u0442\u0440\u0430\u0442\u0435\u0433\u0456\u0457'

这是我的代码:

# Parsing information from games4you.ucoz.ua
# -*- coding: utf-8 -*-

import re
import csv
import urllib
from bs4 import BeautifulSoup

# Address of the news listing; main() appends "?page<N>" to it for each page.
BASE_URL = "http://games4you.ucoz.ua/news/"

def get_html(url):
    """Fetch *url* with urllib and return the response body exactly as read."""
    return urllib.urlopen(url).read()

def get_page_count(html):
    """Return the page count shown by the pager of the listing page *html*.

    The number is the text of the second-to-last <a> found inside
    <div class="catPages1">, converted to int.
    """
    pager = BeautifulSoup(html).find('div', class_='catPages1')
    pager_links = pager.find_all('a')
    last_page_label = pager_links[-2].text
    return int(last_page_label)

def save(games, path):
    """Write the string form of *games* to the file at *path*.

    The output is a plain dump of ``str(games)``, not CSV.
    """
    with open(path, 'w+') as out:
        # str() of a list uses repr() for every element, so under Python 2
        # non-ASCII unicode values land in the file as \uXXXX escapes.
        dumped = str(games)
        out.write(dumped.encode("UTF-8"))


def parse(html):
    """Collect the posts of one listing page into a list of dictionaries.

    Each <table class="eBlock"> inside <div id="allEntries"> counts as one
    post.  The dictionaries carry the keys 'title', 'category',
    'description', 'date' and 'time'; the values are whatever BeautifulSoup
    returned, with no re-encoding.  A post whose title, body or details
    lookup raises is reported with 'Some error' and skipped.
    """
    container = BeautifulSoup(html).find('div', id='allEntries')

    games = []
    for post in container.find_all('table', class_='eBlock'):
        try:
            title = post.tr.td.a.text
            body = post.find('div', class_='eMessage')
            # Description: text before the first '....', closed with a period.
            description = body.p.text.split('....')[0] + '.'
            details = post.find('div', class_='eDetails')
            category = details.a.text
            details_text = details.text
        except:
            print('Some error')
            continue

        # Views count: second-to-last whitespace-separated token that precedes
        # the first 'function' in the details text.  It is computed but not
        # stored in the result.
        cutoff = details_text.find('function')
        post_views = details_text[:cutoff].split()[-2]

        # The span's text is the post date; its 'title' attribute is the time.
        date_span = details.span
        entry = {
            'title': title,
            'category': category,
            'description': description,
            'date': date_span.text,
            'time': date_span['title'],
        }
        games.append(entry)

    return games

def main():
    """Scrape every listing page and dump the collected posts to games.txt."""
    first_page_html = get_html(BASE_URL)
    total_pages = get_page_count(first_page_html)
    print('Total found %d pages...' % total_pages)

    games = []

    for page in range(1, total_pages + 1):
        progress = page / total_pages * 100
        print('Parsing %d%% (%d/%d)' % (progress, page, total_pages))
        page_url = BASE_URL + ("?page%d" % page)
        page_html = get_html(page_url)
        games.extend(parse(page_html))

    print('Saving...')
    save(games, 'games.txt')

# Script entry point: the scraper starts as soon as this file is executed.
main()

2 个答案:

答案 0 :(得分:0)

在Python 2中:

>>> import HTMLParser
>>> s = 'Ell &#233;s la v&#237;ctima que expia els nostres pecats, i no tan sols els nostres, sin&#243; els del m&#243;n sencer.'
>>> print HTMLParser.HTMLParser().unescape(s)
Ell és la víctima que expia els nostres pecats, i no tan sols els nostres, sinó els del món sencer.

在Python 3中:

>>> import html
>>> html.unescape(s)  

你的例子

'Стратегії'

对于“普通”的UTF-8文件写入(或读取),请使用:

 import codecs
 f = codecs.open(filename, 'w', 'utf-8')

希望这会有所帮助

答案 1 :(得分:0)

是的,我解决了!看来是我把文本的解码/编码搞混了,用了不同的字符集。我所要做的只是把从BeautifulSoup获得的数据从Unicode编码为UTF-8,如下所示:

    game_title = game_title.encode("utf-8")
    game_category = game_category.encode("utf-8")
    game_description = game_description.encode("utf-8")
    post_date = post_date.encode("utf-8")
    post_time = post_time.encode("utf-8")

不需要其他任何改动。这是最终对我有效的代码:

# Parsing information from games4you.ucoz.ua

import csv
import urllib
from bs4 import BeautifulSoup

# Address of the news listing; main() appends "?page<N>" to it for each page.
BASE_URL = "http://games4you.ucoz.ua/news/"

def get_html(url):
    """Download *url* and return the raw response body.

    The response object is closed as soon as the body has been read, so
    fetching many pages in a row does not leave connections open.
    """
    # Python 2's urllib.urlopen() result has close() but is not a context
    # manager, hence contextlib.closing.
    from contextlib import closing

    with closing(urllib.urlopen(url)) as response:
        return response.read()

def get_page_count(html):
    """Return the number of listing pages advertised by the pager in *html*.

    The count is the text of the second-to-last <a> inside
    <div class="catPages1">, converted to int.  When that <div> is absent
    the function returns 1 instead of failing on None (presumably the
    listing then fits on a single page -- confirm against the site).
    """
    soup = BeautifulSoup(html)
    pagination = soup.find('div', class_='catPages1')
    if pagination is None:
        return 1
    return int(pagination.find_all('a')[-2].text)

def save(games, path):
    """Write *games* to *path* as CSV with a Title/Category/Date/Time header.

    Each item of *games* must provide the keys 'title', 'category', 'date'
    and 'time'; other keys are not written.
    """
    # Python 2's csv module emits its own '\r\n' row terminators and requires
    # a file opened in binary mode; text mode doubles the '\r' on Windows.
    with open(path, 'wb') as csvfile:
        writer = csv.writer(csvfile)

        writer.writerow(('Title', 'Category', 'Date', 'Time'))

        writer.writerows(
            (game['title'], game['category'], game['date'], game['time'])
            for game in games
        )

def parse(html):
    """Extract the posts of one listing page as a list of dictionaries.

    Every <table class="eBlock"> inside <div id="allEntries"> is treated as
    one post.  Each returned dictionary has the keys 'title', 'category',
    'description', 'date' and 'time', with every value encoded to a UTF-8
    byte string.  A post that lacks one of the expected title, body or
    details elements is reported with 'Some error' and skipped.
    """
    soup = BeautifulSoup(html)
    # The <div> that contains all posts on the page.
    all_entries = soup.find('div', id='allEntries')
    # Every table represents one post.
    tables = all_entries.find_all('table', class_='eBlock')

    games = []
    for table in tables:
        try:
            game_title = table.tr.td.a.text
            game_post_body = table.find('div', class_='eMessage')
            # Description: text before the first '....', closed with a period.
            game_description = game_post_body.p.text.split('....')[0] + '.'
            game_details = table.find('div', class_='eDetails')
            game_category = game_details.a.text
        except AttributeError:
            # A missing element comes back as None, and attribute access on
            # None raises AttributeError.  Anything else (including Ctrl-C)
            # is not a malformed post and must not be swallowed.
            print('Some error')
            continue

        # The span's text is the post date; its 'title' attribute is the time.
        date_span = game_details.span
        post = {
            'title': game_title,
            'category': game_category,
            'description': game_description,
            'date': date_span.text,
            'time': date_span['title'],
        }

        # Convert all data from Unicode to UTF-8 byte strings before storing.
        games.append(
            {key: value.encode("utf-8") for key, value in post.items()}
        )

    return games

def main():
    """Scrape every listing page and save the collected posts to games.csv."""
    total_pages = get_page_count(get_html(BASE_URL))
    print('Total found %d pages...' % total_pages)

    games = []

    for page in range(1, total_pages + 1):
        # Multiply before dividing: with Python 2 integer division,
        # page / total_pages is 0 for every page except the last one.
        percent = page * 100 // total_pages
        print('Parsing %d%% (%d/%d)' % (percent, page, total_pages))
        games.extend(parse(get_html(BASE_URL + "?page%d" % page)))

    print('Saving...')
    save(games, 'games.csv')

# Script entry point: the scraper starts as soon as this file is executed.
main()