这个脚本抓取多个新闻网站的头条标题,并统计标题中各个单词出现的次数。
我不希望这个脚本抓取"to"、"and"以及类似的常用单词。
我尝试用 str.translate(None, "to") 来删除这个单词,但它删除得过于"贪婪"——我只想删除单词"to",它却把 Washington 等单词中的这些字母也一并删掉了。
import pprint
import feedparser
from collections import Counter
def feedGrabber(feed, limit=10):
    """Return the lower-cased words from the leading headlines of a feed.

    Args:
        feed: Feed URL (or anything else ``feedparser.parse`` accepts).
        limit: Maximum number of entries whose titles are read; a feed with
            fewer entries contributes whatever it has.

    Returns:
        A list of the whitespace-separated, lower-cased words found in the
        selected titles.
    """
    parsed = feedparser.parse(feed)
    # Slicing avoids an IndexError on short feeds and no longer skips the
    # third entry (index 2), which the hand-written append list omitted.
    titles = [entry.title for entry in parsed.entries[:limit]]
    # Join the titles directly instead of going through str(list): deleting
    # the repr's quotes, brackets and "u" prefix character by character also
    # removed every "u" and apostrophe inside the headlines, left the list's
    # commas glued to words, and relied on the Python 2-only
    # str.translate(None, chars) signature.
    return " ".join(titles).lower().split()
# Feeds to sample, in the order they are fetched.
feed_urls = (
    "https://www.reddit.com/r/news/.rss",
    'http://rss.cnn.com/rss/cnn_topstories.rss',
    'http://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml',
)
reddit, cnn, nyt = (feedGrabber(url) for url in feed_urls)

# Tally each feed's words separately, then print the merged tally.
one, two, three = (Counter(words) for words in (reddit, cnn, nyt))
pprint.pprint(sum((two, three), one))
答案 0 :(得分:2)
这是一个常用单词列表,您可以使用列表推导式将这些单词从文本中删除:
# Keep only the words that isCommon() does not flag as common.
text = list(filter(lambda word: not isCommon(word), text))
# Frequent English words to leave out of the headline word count. Built once
# as a frozenset so each lookup is a hash probe rather than a scan of a list
# that is rebuilt on every call.
_COMMON_WORDS = frozenset((
    "the", "be", "and", "of", "a", "in", "to", "have", "it",
    "i", "that", "for", "you", "he", "with", "on", "do", "say", "this",
    "they", "is", "an", "at", "but", "we", "his", "from", "not",
    "by", "she", "or", "as", "what", "go", "their", "can", "who", "get",
    "if", "would", "her", "all", "my", "make", "about", "know", "will",
    "up", "one", "time", "has", "been", "there", "year", "so",
    "think", "when", "which", "them", "some", "me", "people", "take",
    "out", "into", "just", "see", "him", "your", "come", "could", "now",
    "than", "like", "other", "how", "then", "its", "our", "two", "more",
    "these", "want", "way", "look", "first", "also", "new", "because",
    "day", "use", "no", "man", "find", "here", "thing", "give",
    "many", "well",
))


def isCommon(word):
    """Return True if ``word`` is one of the listed common English words.

    The match is exact and case-sensitive, and every listed word is lower
    case, so callers should lower-case their words before checking them.
    """
    return word in _COMMON_WORDS