I am parsing words out of XML files, and I only want to keep the words that appear in an English dictionary. The parsed words can be plural, past tense, and so on, but after being validated against the NLTK English words corpus (the English dictionary), each word should keep its original inflected form.

I tried using the vocabulary from nltk.corpus.words to find the words that intersect with the parsed content. I don't see any errors, but I get no result data, and I'm not sure what the problem is. How can I filter these words into a new list?
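To make the goal concrete, this is the kind of filtering I am after; a minimal sketch, assuming NLTK's WordNetLemmatizer and its data are available. The set-based dictionary, the keep_dictionary_words helper, and the sample tokens are my own illustration, not part of the script below:

import nltk
from nltk.corpus import words
from nltk.stem import WordNetLemmatizer

# Build the dictionary once as a set; membership tests against the plain
# list returned by words.words() are O(n) per token and extremely slow.
english = set(w.lower() for w in words.words())
lemmatizer = WordNetLemmatizer()

def keep_dictionary_words(tokens):
    # Keep a token if its surface form, noun lemma, or verb lemma is in
    # the dictionary, but return the token exactly as it appeared.
    kept = []
    for token in tokens:
        lower = token.lower()
        if (lower in english
                or lemmatizer.lemmatize(lower, 'n') in english
                or lemmatizer.lemmatize(lower, 'v') in english):
            kept.append(token)
    return kept

print(keep_dictionary_words(['paintings', 'exhibited', 'qwzx']))
# expected: ['paintings', 'exhibited'] -- the inflected forms survive

Here is the full script: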
#!/usr/bin/python
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import re
import requests
import nltk  # provides nltk.corpus.words, used below
import xml.etree.ElementTree as ET
from collections import Counter
from operator import itemgetter
def process(url, xpath):
    """
    Downloads a feed url and extracts the results with a variable path
    :param url: string
    :param xpath: string
    :return: list
    """
    contents = requests.get(url)
    root = ET.fromstring(contents.content)
    # Keep the text as str: encoding to bytes here makes the later
    # ",".join(map(str, ...)) step produce literal "b'...'" fragments on
    # Python 3, polluting the tokens with prefix and escape artifacts.
    return [element.text if element.text is not None else ''
            for element in root.findall(xpath)]
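# Usage sketch (hypothetical feed URL and path, for illustration only):
#   texts = process('http://example.com/feed.xml', './/item/title')
# returns the text of every matched element, with '' for empty nodes.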
## XML PARSING
def main(n=10):
    # A list of feeds to process and their xpath
    feeds = [
        {'url': 'http://www.nyartbeat.com/list/event_type_print_painting.en.xml', 'xpath': './/'},
        {'url': 'http://feeds.feedburner.com/FriezeMagazineUniversal?format=xml', 'xpath': './/'},
        {'url': 'http://www.artandeducation.net/category/announcement/feed/', 'xpath': './/'},
        {'url': 'http://www.blouinartinfo.com/rss/visual-arts.xml', 'xpath': './/'},
        {'url': 'http://feeds.feedburner.com/ContemporaryArtDaily?format=xml', 'xpath': './/'},
        {'url': 'http://www.e-flux.com/category/announcements/feed/', 'xpath': './/'},
        {'url': 'http://www.e-flux.com/category/journal/feed/', 'xpath': './/'}
    ]
    # A place to hold all feed results
    results = []
    # Loop over all the feeds and append their results together
    for feed in feeds:
        results = results + process(feed['url'], feed['xpath'])
    # Join all results into one big string (they are already str)
    contents = ",".join(results)
    # Collapse runs of whitespace into single spaces
    contents = re.sub(r'\s+', ' ', contents)
    # Remove everything that is not a letter or a space
    contents = re.sub('[^A-Za-z ]+', '', contents)
    words_split = [w.lower() for w in contents.split() if len(w) <= 13]
    # Build the dictionary as a set: testing membership against the plain
    # list returned by words() rescans the whole corpus for every token
    # and is slow enough to look like it produces no output.
    wordlist = set(w.lower() for w in nltk.corpus.words.words())
    words_sort = [val for val in words_split if val in wordlist]
    # Count the words
    word_count = Counter(words_sort)
    # Drop a few overly common domain words
    filter_words = ['art', 'artist', 'artists']
    for word in filter_words:
        if word in word_count:
            del word_count[word]
    # Keep the n most common words, sorted by ascending count
    words = word_count.most_common(n)
    words = sorted(words, key=itemgetter(1))
    # Repeat each word by its count and write the result to a file
    out = []
    for word, count in words:
        out += [word] * count
    final = " ".join(out)
    with open("filename.txt", "w+") as f:
        f.write(final)
if __name__ == "__main__":
    main()
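One more thing I'm unsure about: as I understand the standard NLTK setup, the corpora have to be downloaded once before nltk.corpus.words works at all (this is my assumption about the environment, not part of the script):

import nltk
nltk.download('words')    # the word list used as the English dictionary
nltk.download('wordnet')  # only needed for the lemmatizer sketch above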