Question

我对这个剧本的问题是我必须继续打电话.encode('utf-8')，这对我来说似乎并不是很好。我一定是做错了。

# -*- coding: utf-8 -*-

""" Simple rss to html converter """

__version__ = "0.0.1"
__author__ = "Ricky L Wilson"

import StringIO
from feedparser import parse as parsefeed
from bs4 import BeautifulSoup as bs


def entry2html(**kwargs):
    """ Format feedparser entry """
    title = kwargs['title'].encode('utf-8')
    link = kwargs['link'].encode('utf-8')
    description = kwargs['description'].encode('utf-8')
    template = """
    <h2 class='title'>{title}</h2>
    <a class='link' href='{link}'>{title}</a>
    <span class='description'>{description}</span>
    """
    return template.format(title=title, link=link, description=description)


def convert_feed(**kwargs):
    """ Main loop """
    out = StringIO.StringIO("")
    for entry in parsefeed(kwargs['url']).entries:
        title = entry['title']
        link = entry['link']
        description = entry['description']
        print >> out, entry2html(title=title, link=link, description=description)
    return bs(out.getvalue(), 'lxml').prettify()

def save_file(url, fname):
    ''' Save data to disc'''
    with open(fname, 'w') as file_object:
        file_object.write(convert_feed(url=url).encode('utf-8'))

print save_file('http://stackoverflow.com/feeds', 'index.html')

PS。如果我删除它们中的任何一个，脚本将会中断。

我尝试将# -*- coding: utf-8 -*-放在文件的顶部但是没有修复它。我收到了这个错误。

C:\Python27>python rss2html.py
  File "rss2html.py", line 40
    save_file('http://stackoverflow.com/feeds', 'index.html')
            ^
SyntaxError: invalid syntax

C:\Python27>python rss2html.py
Traceback (most recent call last):
  File "rss2html.py", line 40, in <module>
    save_file('http://stackoverflow.com/feeds', 'index.html')
  File "rss2html.py", line 38, in save_file
    file_object.write(convert_feed(url=url))
  File "rss2html.py", line 32, in convert_feed
    print >> out, entry2html(title=title, link=link, description=description)
  File "rss2html.py", line 22, in entry2html
    return template.format(title=title, link=link, description=description)
UnicodeEncodeError: 'ascii' codec can't encode character u'\xa3' in position 172: ordinal not in range(128)

Answer 1

你的kwargs是unicode字符串。您应该使用u前缀将模板定义为unicode字符串，然后在结尾处编码一次。

template = u"""
<h2 class='title'>{title}</h2>
<a class='link' href='{link}'>{title}</a>
<span class='description'>{description}</span>
"""
return template.format(title=title, link=link, description=description).encode('utf-8')

Answer 2

我想通了，让我知道你们的想法。这种方法比所有那些.encode（＆＃39; utf-8＆＃39;）调用更好，或者这是一个黑客攻击，它们仍然是消除.encode('utf-8')调用的更加pythonic的方式。

以下是新脚本

# -*- coding: utf-8 -*-
""" Simple rss to html converter """

__version__ = "0.0.1"
__author__ = "Ricky L Wilson"

import StringIO
from feedparser import parse as parsefeed
from bs4 import BeautifulSoup as bs

def flatten_unicode_keys(d):
    '''pass unicode keywords to **kwargs '''
    for k in d:
        if isinstance(k, unicode):
            v = d[k]
            del d[k]
            d[str(k)] = v

def entry2html(**kwargs):
    """ Format feedparser entry """
    flatten_unicode_keys(kwargs)
    title = kwargs['title']
    link = kwargs['link']
    description = kwargs['description']
    template = """
    <h2 class='title'>{title}</h2>
    <a class='link' href='{link}'>{title}</a>
    <span class='description'>{description}</span>
    """
    return template.format(title=title, link=link, description=description)


def convert_feed(**kwargs):
    """ Main loop """
    out = StringIO.StringIO("")
    for entry in parsefeed(kwargs['url']).entries:
        title = entry['title']
        link = entry['link']
        description = entry['description']
        print >> out, entry2html(title=title, link=link, description=description)
    return bs(out.getvalue(), 'lxml').prettify()

def save_file(url, fname):
    ''' Save data to disc'''
    with open(fname, 'w') as file_object:
        file_object.write(convert_feed(url=url))

save_file('http://stackoverflow.com/feeds', 'index.html')

Answer 3

所以这里是最新版本的脚本flatten_unicode_keys(d) 不再StringIO而且不再for loops我认为这比过去的版本更加pythonic。它也更快。

# -*- coding: utf-8 -*-
"""Simple RSS to HTML converter."""

__version__ = "0.0.2"
__author__ = "Ricky L Wilson"

from bs4 import BeautifulSoup
from feedparser import parse as parse_feed


TEMPLATE = u"""
<h2 class='title'>{title}</h2>
<a class='link' href='{link}'>{title}</a>
<span class='description'>{summary}</span>
"""

def entry_to_html(**kwargs):
    """Formats feedparser entry."""
    return TEMPLATE.format(**kwargs).encode('utf-8')


def convert_feed(url):
    """Main loop."""
    html_fragments = [entry_to_html(**entry) for entry in parse_feed(url).entries]
    return BeautifulSoup("\n".join(html_fragments), 'lxml').prettify()


def save_file(url, filename):
    """Saves data to disc."""
    with open(filename, 'w') as file_object:
        file_object.write(convert_feed(url).encode('utf-8'))


if __name__ == '__main__':
    save_file('http://stackoverflow.com/feeds', 'index.html')
    with open('index.html') as fobj:
        print fobj.read()

我的脚本经常使用.encode（＆＃39; utf-8＆＃39;）。我觉得我做错了什么

3 个答案: