Creating a blog summary in Python?

Date: 2011-03-08 16:03:37

Tags: python html blogs

Is there a good library (or some regex magic) that can turn a blog entry into a blog summary? I'd like the summary to show the first four sentences, the first paragraph, or the first X characters... not sure which is best. Ideally it would keep HTML formatting tags such as <a>, <b>, <u>, and <i>, but strip all other HTML tags, JavaScript, and CSS.

More concretely: as input I'd supply an HTML string representing an entire blog post. As output I want an HTML string containing the first few sentences, paragraphs, or X characters, with all potentially unsafe HTML tags removed. In Python, please.

3 Answers:

Answer 0 (score: 1)

If you're looking at the HTML, you'll need to parse it. Beyond the BeautifulSoup already mentioned, lxml.html also has some nice HTML-handling tools.
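
For example, a minimal whitelist sketch along those lines (assuming the bs4 package and the "html.parser" backend; the tag whitelist mirrors the question) might look like this. It's a rough sketch, not a hardened sanitizer: attributes on the kept tags pass through untouched.

from bs4 import BeautifulSoup

ALLOWED_TAGS = ["a", "b", "u", "i"]  # the tags the question wants to keep

def strip_unsafe_tags(html):
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup(["script", "style"]):
        tag.decompose()  # remove the tag and its contents
    for tag in soup.find_all(True):
        if tag.name not in ALLOWED_TAGS:
            tag.unwrap()  # drop the tag but keep its children
    return unicode(soup)  # str(soup) on Python 3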

However, since it's a blog, you may find it much easier to work with the RSS/Atom feed. Feedparser is fantastic and would make this simple. You gain compatibility and durability (because RSS is well defined, things will change less), but if the feed doesn't contain what you need, it can't help you.
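
For instance, a short sketch with feedparser (the feed URL is a placeholder, and whether summary is populated depends on the feed itself):

import feedparser

feed = feedparser.parse("http://example.com/blog/atom.xml")  # placeholder URL
for entry in feed.entries[:5]:
    print entry.title
    print entry.get("summary", "")[:300]  # many feeds ship a ready-made excerpt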

Answer 1 (score: 1)

I ended up using the gdata library and rolling my own blog summarizer, which uses gdata to fetch a Blogspot blog on Google App Engine (porting it to other platforms wouldn't be hard). The code is below. To use it, first set the constant blog_id_constant, then call get_blog_info to get back a dictionary containing the blog summaries.

I wouldn't trust this code to build summaries for any random blog on the internet, because it may not strip all unsafe HTML from the blog feed. For a simple blog that you write yourself, however, the code below should work.

Feel free to copy it, but if you find any bugs or would like to make improvements, please add them in the comments. (Sorry about the semicolons.)

import sys
import logging
import time
import urllib
from HTMLParser import HTMLParser
from django.core.cache import cache
# DownloadError is raised by App Engine's urlfetch when a request fails
from google.appengine.api.urlfetch import DownloadError
# Import the Blogger API (bundled as a zip for App Engine's zipimport)
sys.path.insert(0, 'gdata.zip')
from gdata import service

Months = ["Jan.", "Feb.", "Mar.", "Apr.", "May", "June", "July", "Aug.", "Sept.", "Oct.", "Nov.", "Dec."]
blog_id_constant = -1 # YOUR BLOG ID HERE
blog_pages_at_once = 5

# -----------------------------------------------------------------------------
#   Blogger 
class BlogHTMLSummarizer(HTMLParser):
    '''
    An HTML parser which only grabs X number of words and removes
    all tags except for certain safe ones.
    '''

    def __init__(self, max_words = 80):
        HTMLParser.__init__(self)  # set up the underlying parser state
        self.max_words = max_words
        self.allowed_tags = ["a", "b", "u", "i", "br", "div", "p", "img", "li", "ul", "ol"]
        if self.max_words < 80:
            # If it's really short, don't include layout tags
            self.allowed_tags = ["a", "b", "u", "i"]
        self.out_html = ""
        self.num_words = 0
        self.no_more_data = False
        self.no_more_tags = False
        self.tag_stack = []

    def handle_starttag(self, tag, attrs):
        if not self.no_more_data and tag in self.allowed_tags:
            # NB: attribute values are re-emitted unescaped, which is one
            # reason not to trust this on arbitrary feeds
            val = "<%s%s>" % (tag,
                "".join(" %s='%s'" % (a, b) for (a, b) in attrs))
            self.tag_stack.append(tag)
            self.out_html += val

    def handle_data(self, data):
        if self.no_more_data:
            return
        data = data.split(" ")
        if self.num_words + len(data) >= self.max_words:
            # Truncate at the word limit and stop accepting further text
            data = data[:self.max_words - self.num_words]
            data.append("...")
            self.no_more_data = True
        self.out_html += " ".join(data)
        self.num_words += len(data)

    def handle_endtag(self, tag):
        if self.no_more_data and not self.tag_stack:
            self.no_more_tags = True
        if not self.no_more_tags and self.tag_stack and tag == self.tag_stack[-1]:
            # Only close the tag at the top of the stack; end tags that don't
            # match (disallowed or mis-nested markup) are silently dropped
            self.out_html += "</%s>" % tag
            self.tag_stack.pop()

def get_blog_info(short_summary = False, page = 1, year = "", month = "", day = "", post = None):
    '''
    Returns summaries of several recent blog posts to be displayed on the front page
        page: which page of blog posts to get. Starts at 1.
    '''
    blogger_service = service.GDataService()
    blogger_service.source = 'exampleCo-exampleApp-1.0'
    blogger_service.service = 'blogger'
    blogger_service.account_type = 'GOOGLE'
    blogger_service.server = 'www.blogger.com'
    blog_dict = {}

    # Do the common stuff first
    query = service.Query()
    query.feed = '/feeds/' + str(blog_id_constant) + '/posts/default'
    query.order_by = "published"
    blog_dict['entries'] = []

    def get_common_entry_data(entry, summarize_len = None):
        '''
        Convert an entry to a dictionary object.
        '''
        content = entry.content.text
        if summarize_len is not None:
            parser = BlogHTMLSummarizer(summarize_len)
            parser.feed(entry.content.text)
            content = parser.out_html
        # Drop fractional seconds and the timezone offset before parsing
        pubstr = time.strptime(entry.published.text[:-10], '%Y-%m-%dT%H:%M:%S')
        safe_title = entry.title.text.replace(" ","_")
        for c in ":,.<>!@#$%^&*()+-=?/'[]{}\\\"":
            # remove nasty characters
            safe_title = safe_title.replace(c, "")
        link = "%d/%d/%d/%s/"%(pubstr.tm_year, pubstr.tm_mon, pubstr.tm_mday, 
            urllib.quote_plus(safe_title))
        return {
                'title':entry.title.text,
                'alllinks':[x.href for x in entry.link] + [link], #including blogger links
                'link':link,
                'content':content,
                'day':pubstr.tm_mday,
                'month':Months[pubstr.tm_mon-1],
                'summary': summarize_len is not None,
            }

    def get_blogger_feed(query):
        feed = cache.get(query.ToUri())
        if not feed:
            logging.info("GET Blogger Page: " + query.ToUri())
            try:
                feed = blogger_service.Get(query.ToUri())
            except DownloadError:
                logging.error("Can't download blog, rate limited? %s" % str(query.ToUri()))
                return None
            except Exception, e:
                # web_exception is the author's own error-reporting helper
                # (not defined in this snippet)
                web_exception('get_blogger_feed', e)
                return None
            cache.set(query.ToUri(), feed, 3600)
        return feed

    def _in_one(a, allBs):
        # Return true if a is in one of allBs
        for b in allBs:
            if a in b:
                return True
        return False

    def _get_int(i):
        try:
            return int(i)
        except ValueError:
            return None
    (year, month, day) = (_get_int(year), _get_int(month), _get_int(day))

    if not short_summary and year and month and day:
        # Query for posts published on the given day
        query.published_min = "%d-%02d-%02dT00:00:00-08:00"%(year, month, day)
        query.published_max = "%d-%02d-%02dT23:59:59-08:00"%(year, month, day)
        feed = get_blogger_feed(query)
        if not feed:
            return {}
        blog_dict['detail_view'] = True
        blog_dict['entries'] = map(lambda e: get_common_entry_data(e, None), feed.entry)
    elif not short_summary and year and month and not day:
        # Query for posts published in the given month
        query.published_min = "%d-%02d-%02dT00:00:00-08:00"%(year, month, 1)
        query.published_max = "%d-%02d-%02dT23:59:59-08:00"%(year, month, 31)
        feed = get_blogger_feed(query)
        if not feed:
            return {}
        blog_dict['detail_view'] = True
        blog_dict['entries'] = map(lambda e: get_common_entry_data(e, None), feed.entry)
        if post:
            blog_dict['entries'] = filter(lambda f: _in_one(post, f['alllinks']), blog_dict['entries'])
    elif short_summary:
        # Get a summary of all posts
        query.max_results = str(3) 
        query.start_index = str(1)
        feed = get_blogger_feed(query)
        if not feed:
            return {}
        feed.entry = feed.entry[:3]
        blog_dict['entries'] = map(lambda e: get_common_entry_data(e, 18), feed.entry)
    else:
        # Get a summary of all posts
        try:
            page = int(page)
        except ValueError:
            page = 1

        # Get one more than we need so we can see if we have more
        query.max_results = str(blog_pages_at_once + 1)
        query.start_index = str((page - 1) * blog_pages_at_once + 1)
        feed = get_blogger_feed(query)
        if not feed:
            return {}

        has_older = len(feed.entry) > blog_pages_at_once
        feed.entry = feed.entry[:blog_pages_at_once]
        if page > 1:
            blog_dict['newer_page'] = str(page-1)
        if has_older:
            blog_dict['older_page'] = str(page+1)       
        blog_dict['entries'] = map(lambda e: get_common_entry_data(e, 80), feed.entry)

    return blog_dict
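
For reference, here is roughly how it would be called (a sketch; the HTML string fed to the standalone parser is made up):

# Summarize the three most recent posts (assumes blog_id_constant is set):
blog_info = get_blog_info(short_summary=True)
for entry in blog_info.get('entries', []):
    print entry['title'], entry['link']
    print entry['content']  # truncated, whitelist-filtered HTML

# Or run the parser on its own:
summarizer = BlogHTMLSummarizer(max_words=40)
summarizer.feed("<p>Some <b>post</b> body that goes on and on and on...</p>")
print summarizer.out_html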

Answer 2 (score: 0)

You'll have to parse the HTML. A good library for this is BeautifulSoup. It lets you remove specific tags and extract values (the text between tags). The text can then be cut down to four sentences fairly easily, although I'd go with a fixed number of characters, since sentence length can vary a lot.
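
As a rough sketch of that fixed-character approach (assuming BeautifulSoup 4, which this answer doesn't name a version of, and an arbitrary max_chars):

from bs4 import BeautifulSoup

def text_excerpt(html, max_chars=300):
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup(["script", "style"]):
        tag.decompose()  # drop scripts and styles with their contents
    text = soup.get_text(" ", strip=True)
    if len(text) <= max_chars:
        return text
    # Cut at the last whole word before the limit
    return text[:max_chars].rsplit(" ", 1)[0] + "..."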