是否有好用的库(或正则表达式技巧)可以把博客文章转换为博客摘要?我希望摘要显示前四个句子、第一段或前X个字符……不确定哪种方式最好。理想情况下,我希望保留<a>、<b>、<u>和<i>等HTML格式标签,而删除所有其他HTML标签、JavaScript和CSS。
更具体地说,作为输入,我会给出一个代表整个博客文章的html字符串。作为输出,我想要一个包含前几个句子,段落或X个字符的html字符串。删除所有可能不安全的html标签。请用Python。
答案 0 :(得分:1)
如果您正在查看HTML,则需要解析它。除了前面提到的BeautifulSoup之外,lxml.html还有一些不错的HTML处理工具。
但是,如果它是一个博客,您可能会发现使用RSS / Atom提要更加容易。 Feedparser太棒了,会让事情变得简单。您获得了兼容性和持久性(因为RSS更明确,事物的变化会更少)但如果Feed没有包含您需要的内容,那么它将无法为您提供帮助。
答案 1 :(得分:1)
我最终使用gdata库自己编写了一个博客摘要生成器,它在Google App Engine上通过gdata库获取Blogspot博客(移植到其他平台并不困难)。代码如下。使用时,先设置常量blog_id_constant,然后调用get_blog_info,它会返回一个包含博客摘要的字典。
我不相信代码可以在互联网上创建任何随机博客的摘要,因为它可能无法从博客Feed中删除所有不安全的html。但是,对于您自己编写的简单博客,下面的代码应该有效。
欢迎随意复制,但如果您发现任何错误或想要改进,请在评论中说明。(抱歉代码里有多余的分号。)
import calendar
import logging
import os
import sys
import time
import urllib
from HTMLParser import HTMLParser

from django.core.cache import cache
# Import the Blogger API
sys.path.insert(0, 'gdata.zip')
from gdata import service
# Abbreviated month names for display, indexed by (month number - 1).
Months = [
    "Jan.", "Feb.", "Mar.", "Apr.", "May", "June",
    "July", "Aug.", "Sept.", "Oct.", "Nov.", "Dec.",
]

# Id of the blog whose feed is read; -1 is only a placeholder.
blog_id_constant = -1  # YOUR BLOG ID HERE

# Number of posts shown on one page of the paged listing.
blog_pages_at_once = 5
# -----------------------------------------------------------------------------
# Blogger
class BlogHTMLSummarizer(HTMLParser):
    '''
    An HTML parser which only grabs X number of words and removes
    all tags except for certain safe ones.

    Usage: create an instance, ``feed()`` it the post HTML, call ``close()``
    and read the result from ``out_html``.

    Behaviour:
      * Only tags listed in ``allowed_tags`` are copied to the output. When
        ``max_words`` is below 80 the layout tags are left out and only
        ``a``, ``b``, ``u`` and ``i`` are kept.
      * Text inside ``<script>`` and ``<style>`` is discarded.
      * Attribute values and text are HTML-escaped on output; event handler
        attributes (``on*``), ``style`` attributes and ``href``/``src``
        values using a script-like URL scheme are dropped.
      * Once the word budget is used up and more text follows, `` ...`` is
        appended, every still-open tag is closed and the rest of the input
        is ignored.

    This is a best-effort filter for reasonably well-formed posts, not a
    complete sanitizer for arbitrary hostile HTML.
    '''

    # Elements that never get a closing tag; they must not be tracked on the
    # open-tag stack or they would block every later closing tag.
    VOID_TAGS = ("br", "img")
    # Elements whose text content is code rather than prose.
    SKIPPED_CONTENT_TAGS = ("script", "style")
    # Attributes whose value is a URL and therefore needs a scheme check.
    URL_ATTRS = ("href", "src")
    # URL schemes that are never copied to the output (must stay a tuple,
    # it is passed to str.startswith).
    UNSAFE_URL_SCHEMES = ("javascript:", "vbscript:", "data:")

    def __init__(self, max_words = 80):
        # The base initializer resets the parser state; calling it (instead
        # of only reset()) also sets up whatever else the base class needs.
        HTMLParser.__init__(self)
        self.max_words = max_words
        if self.max_words < 80:
            # If it's really short, don't include layout tags
            self.allowed_tags = ["a", "b", "u", "i"]
        else:
            self.allowed_tags = ["a", "b", "u", "i", "br", "div", "p", "img", "li", "ul", "ol"]
        self.out_html = ""          # summary built so far
        self.num_words = 0          # words emitted so far
        self.no_more_data = False   # True once the text has been truncated
        self.no_more_tags = False   # True once all open tags have been closed after truncation
        self.tag_stack = []         # emitted tags that still need a closing tag
        self._in_skipped_content = False  # True while inside <script>/<style>

    @staticmethod
    def _escape(text, quote = False):
        '''Return text with HTML special characters replaced by references.'''
        # "&" must be replaced first so the references added below survive.
        text = text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
        if quote:
            text = text.replace('"', "&quot;").replace("'", "&#39;")
        return text

    def _is_safe_attr(self, name, value):
        '''Return True if the attribute may be copied to the output.'''
        name = name.lower()
        # Refuse names containing anything that could break out of the tag.
        if not name.replace("-", "").replace("_", "").replace(":", "").isalnum():
            return False
        if name.startswith("on") or name == "style":
            return False
        if name in self.URL_ATTRS and value:
            # Drop whitespace/control characters before looking at the
            # scheme so values like " java\tscript:..." are still caught.
            compact = "".join(ch for ch in value if ch > " ").lower()
            if compact.startswith(self.UNSAFE_URL_SCHEMES):
                return False
        return True

    def _close_open_tags(self):
        '''Emit closing tags for everything still on the stack, innermost first.'''
        while self.tag_stack:
            self.out_html += "</%s>" % self.tag_stack.pop()

    def _emit_reference(self, reference):
        '''Copy a character/entity reference to the output unless text is being dropped.'''
        if self.no_more_data or self._in_skipped_content:
            return
        self.out_html += reference

    def handle_starttag(self, tag, attrs):
        if tag in self.SKIPPED_CONTENT_TAGS:
            self._in_skipped_content = True
            return
        if self.no_more_data or tag not in self.allowed_tags:
            return
        pieces = [tag]
        for (name, value) in attrs:
            if not self._is_safe_attr(name, value):
                continue
            if value is None:
                # Attribute given without a value, e.g. <a download>
                pieces.append(name)
            else:
                pieces.append("%s='%s'" % (name, self._escape(value, quote = True)))
        self.out_html += "<%s>" % " ".join(pieces)
        if tag not in self.VOID_TAGS:
            self.tag_stack.append(tag)

    def handle_data(self, data):
        if self.no_more_data or self._in_skipped_content:
            return
        remaining = max(self.max_words - self.num_words, 0)
        # Splitting at most `remaining` times leaves any surplus text
        # untouched in the last piece, so its length tells where to cut.
        pieces = data.split(None, remaining)
        if len(pieces) <= remaining:
            # Everything fits; keep the original whitespace so words on
            # either side of a tag are not glued together.
            self.out_html += self._escape(data)
            self.num_words += len(pieces)
            return
        kept = data[:len(data) - len(pieces[-1])].rstrip()
        if kept:
            self.out_html += self._escape(kept) + " ..."
        else:
            self.out_html += "..."
        self.num_words += remaining
        self.no_more_data = True
        # Nothing else will be emitted, so balance the output right away
        # instead of relying on the rest of the input being well-formed.
        self._close_open_tags()
        self.no_more_tags = True

    def handle_entityref(self, name):
        # Only reached when the base parser reports references instead of
        # converting them to text itself; pass them through unchanged.
        self._emit_reference("&%s;" % name)

    def handle_charref(self, name):
        # See handle_entityref.
        self._emit_reference("&#%s;" % name)

    def handle_endtag(self, tag):
        if tag in self.SKIPPED_CONTENT_TAGS:
            self._in_skipped_content = False
            return
        if self.no_more_tags or tag not in self.tag_stack:
            # Either the summary is finished or this tag was never emitted.
            return
        if tag != self.tag_stack[-1]:
            logging.warning("mixed up blogger tags")
        # Close any elements left open inside the one being closed so the
        # output stays properly nested.
        while True:
            open_tag = self.tag_stack.pop()
            self.out_html += "</%s>" % open_tag
            if open_tag == tag:
                break

    def close(self):
        '''Finish parsing and close any tags the input left open.'''
        HTMLParser.close(self)
        self._close_open_tags()
def get_blog_info(short_summary = False, page = 1, year = "", month = "", day = "", post = None):
    '''
    Returns summaries of several recent blog posts to be displayed on the front page

    short_summary: if true, return the first three posts, each summarized
        to 18 words.
    page: which page of blog posts to get. Starts at 1. Only used for the
        paged listing (no short summary and no usable year/month).
    year, month, day: when year and month (and optionally day) convert to
        non-zero integers, full posts published in that month/day are
        returned instead of summaries.
    post: in the month view, keep only entries one of whose links contains
        this string.

    Returns a dictionary with an 'entries' list (one dictionary per post,
    see get_common_entry_data) and, depending on the view, 'detail_view',
    'newer_page' and 'older_page'. Returns an empty dictionary when the
    feed could not be fetched.
    '''
    blogger_service = service.GDataService()
    blogger_service.source = 'exampleCo-exampleApp-1.0'
    blogger_service.service = 'blogger'
    blogger_service.account_type = 'GOOGLE'
    blogger_service.server = 'www.blogger.com'

    # Do the common stuff first
    query = service.Query()
    # str() so the path can be built whether the id is an int or a string.
    query.feed = '/feeds/' + str(blog_id_constant) + '/posts/default'
    query.order_by = "published"
    blog_dict = {'entries': []}

    def get_common_entry_data(entry, summarize_len = None):
        '''
        Convert an entry to a dictionary object.

        summarize_len: word budget for the summary, or None to keep the
            full post content.
        '''
        content = entry.content.text
        if summarize_len is not None:
            parser = BlogHTMLSummarizer(summarize_len)
            parser.feed(content or "")
            # Flush anything the parser is still buffering.
            parser.close()
            content = parser.out_html
        # The first 19 characters are 'YYYY-MM-DDTHH:MM:SS'; slicing from
        # the front does not depend on the length of the fraction/timezone
        # suffix that follows.
        pubstr = time.strptime(entry.published.text[:19], '%Y-%m-%dT%H:%M:%S')
        title = entry.title.text or ""
        safe_title = title.replace(" ", "_")
        for c in ":,.<>!@#$%^&*()+-=?/'[]{}\\\"":
            # remove nasty characters
            safe_title = safe_title.replace(c, "")
        link = "%d/%d/%d/%s/" % (pubstr.tm_year, pubstr.tm_mon, pubstr.tm_mday,
                                 urllib.quote_plus(safe_title))
        return {
            'title': entry.title.text,
            'alllinks': [x.href for x in entry.link] + [link],  # including blogger links
            'link': link,
            'content': content,
            'day': pubstr.tm_mday,
            'month': Months[pubstr.tm_mon - 1],
            'summary': summarize_len is not None,
        }

    def get_blogger_feed(query):
        '''Return the feed for query, from the cache if possible; None on failure.'''
        uri = query.ToUri()
        feed = cache.get(uri)
        if feed:
            return feed
        logging.info("GET Blogger Page: %s", uri)
        try:
            feed = blogger_service.Get(uri)
        except Exception:
            # The original handlers referenced DownloadError and
            # web_exception, neither of which is defined or imported in
            # this file; log the failure (with traceback) instead.
            logging.exception("Cant download blog, rate limited? %s", uri)
            return None
        cache.set(uri, feed, 3600)
        return feed

    def _in_one(a, allBs):
        # Return true if a is in one of allBs
        return any(a in b for b in allBs)

    def _get_int(i):
        # None and non-numeric strings both mean "not given".
        try:
            return int(i)
        except (TypeError, ValueError):
            return None

    (year, month, day) = (_get_int(year), _get_int(month), _get_int(day))

    if not short_summary and year and month:
        # Detail view: full posts for one day, or for the whole month.
        if day:
            first_day = last_day = day
        else:
            first_day = 1
            try:
                # Real last day of the month instead of a fixed 31.
                last_day = calendar.monthrange(year, month)[1]
            except ValueError:
                # Out-of-range year/month: keep the permissive upper bound.
                last_day = 31
        query.published_min = "%d-%02d-%02dT00:00:00-08:00" % (year, month, first_day)
        query.published_max = "%d-%02d-%02dT23:59:59-08:00" % (year, month, last_day)
        feed = get_blogger_feed(query)
        if not feed:
            return {}
        blog_dict['detail_view'] = True
        entries = [get_common_entry_data(e, None) for e in feed.entry]
        # NOTE(review): the post filter was only applied in the month view
        # originally; kept that way - confirm whether the day view should
        # filter as well.
        if post and not day:
            entries = [e for e in entries if _in_one(post, e['alllinks'])]
        blog_dict['entries'] = entries
    elif short_summary:
        # Short summaries of the first three posts
        query.max_results = str(3)
        query.start_index = str(1)
        feed = get_blogger_feed(query)
        if not feed:
            return {}
        blog_dict['entries'] = [get_common_entry_data(e, 18) for e in feed.entry[:3]]
    else:
        # Paged listing with longer summaries
        try:
            page = int(page)
        except (TypeError, ValueError):
            page = 1
        # Pages start at 1; anything lower would give an invalid start index.
        page = max(page, 1)
        # Get one more than we need so we can see if we have more
        query.max_results = str(blog_pages_at_once + 1)
        query.start_index = str((page - 1) * blog_pages_at_once + 1)
        # Same cached, error-handled fetch as the other views.
        feed = get_blogger_feed(query)
        if not feed:
            return {}
        has_older = len(feed.entry) > blog_pages_at_once
        if page > 1:
            blog_dict['newer_page'] = str(page - 1)
        if has_older:
            blog_dict['older_page'] = str(page + 1)
        blog_dict['entries'] = [get_common_entry_data(e, 80)
                                for e in feed.entry[:blog_pages_at_once]]
    return blog_dict
答案 2 :(得分:0)
你必须解析HTML。BeautifulSoup是一个很好用的库,它可以删除特定标签并提取值(标签之间的文本)。之后可以比较容易地把文本截取为四个句子,不过我会选择固定的字符数,因为句子长度差异可能很大。