过滤英语词典中的单词

时间:2015-12-07 03:32:45

标签: python dictionary filter nltk word

我正在解析 XML 文件中的单词,只想保留(筛选出)英语词典中存在的单词。解析出的单词可能是复数、过去式等形式;经过 NLTK 英语单词语料库(英语词典)验证之后,保留下来的单词应保持其原始形式(时态不变)。

我尝试使用 nltk.corpus.words 中的词汇表来查找与内容相交的单词。运行时没有看到任何错误,但也没有得到结果数据。我不确定问题出在哪里。

如何将这些单词过滤到新列表中?

nltk.corpus.words
#!/usr/bin/python

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import urllib.request
# Download the python.org front page when the module is loaded and keep the
# raw response body in `html`.
# NOTE(review): the name `html` is rebound by `from lxml import html` further
# down in the import section, so this downloaded value looks unused in the
# visible code -- confirm before relying on it.
with urllib.request.urlopen('http://python.org/') as response:
     html = response.read()

import math
import random
import requests
import collections
import string
import re
import MySQLdb as mdb
import xml.etree.ElementTree as ET
from xml.dom import minidom
from textblob import TextBlob
from textblob import Word
from textblob.wordnet import VERB
from textblob.classifiers import NaiveBayesClassifier
from string import punctuation
from oauthlib import *
from operator import itemgetter
from collections import defaultdict
from functools import reduce
import requests, re

from collections import Counter
from lxml import html
from operator import itemgetter

def process(url, xpath):
    """
    Fetch an XML feed and collect the text of every element matching a path.

    :param url: string, address of the feed to download
    :param xpath: string, path expression handed to ``Element.findall``
    :return: list holding, for each matching element, its text encoded as
        UTF-8, or an empty string when the element has no text
    """
    response = requests.get(url)
    tree_root = ET.fromstring(response.content)

    texts = []
    for node in tree_root.findall(xpath):
        if node.text is None:
            # Elements without text still contribute a placeholder entry.
            texts.append('')
        else:
            texts.append(node.text.encode('utf8'))
    return texts



    ## XML PARSING
def main(n=10):
    """
    Write the most frequent English dictionary words found in several feeds.

    Every feed listed below is downloaded through ``process``. The collected
    text is reduced to lowercase ASCII-letter words of at most 13 characters,
    only words present in the NLTK ``words`` corpus are kept, a few domain
    words are discarded, and the ``n`` most common remaining words are written
    to ``filename.txt``. Each word is repeated as many times as it occurred,
    least frequent first.

    :param n: int, how many of the most common words to write out
    :return: None
    """
    # The import section at the top of this file does not import nltk, so the
    # original ``nltk.corpus.words`` lookup raised NameError. Import the corpus
    # reader here to keep this function self-contained.
    from nltk.corpus import words as nltk_words

    # A list of feeds to process and their xpath
    feeds = [
        {'url': 'http://www.nyartbeat.com/list/event_type_print_painting.en.xml', 'xpath': './/'},
        {'url': 'http://feeds.feedburner.com/FriezeMagazineUniversal?format=xml', 'xpath': './/'},
        {'url': 'http://www.artandeducation.net/category/announcement/feed/', 'xpath': './/'},
        {'url': 'http://www.blouinartinfo.com/rss/visual-arts.xml', 'xpath': './/'},
        {'url': 'http://feeds.feedburner.com/ContemporaryArtDaily?format=xml', 'xpath': './/'},
        {'url': 'http://www.e-flux.com/category/announcements/feed/', 'xpath': './/'},
        {'url': 'http://www.e-flux.com/category/journal/feed/', 'xpath': './/'},
    ]

    # Gather the text of every feed into one flat list.
    texts = []
    for feed in feeds:
        texts.extend(process(feed['url'], feed['xpath']))

    # Join with a space rather than a comma: the comma is deleted by the
    # letters-only substitution below, which used to fuse the last word of one
    # element with the first word of the next.
    contents = " ".join(_as_text(text) for text in texts)

    # Collapse runs of whitespace into a single space
    contents = re.sub(r'\s+', ' ', contents)

    # Remove everything that is not an ASCII letter or a space
    contents = re.sub(r'[^A-Za-z ]+', '', contents)

    candidates = [w.lower() for w in contents.split() if len(w) <= 13]

    # A set gives constant-time membership tests; the corpus is large, so
    # scanning a list once per candidate word is needlessly slow.
    vocabulary = {w.lower() for w in nltk_words.words()}
    dictionary_words = [w for w in candidates if w in vocabulary]

    # Count the words
    word_count = Counter(dictionary_words)

    # Drop words that are too generic for these feeds
    for word in ('art', 'artist', 'artists'):
        word_count.pop(word, None)

    # Take the n most common words, then order them by ascending count
    top_words = sorted(word_count.most_common(n), key=itemgetter(1))

    out = []
    for word, count in top_words:
        out.extend([word] * count)

    with open("filename.txt", "w+") as f:
        f.write(" ".join(out))


def _as_text(value):
    """Return *value* as text, decoding UTF-8 bytes instead of taking their repr."""
    # On Python 3, str(b'abc') yields "b'abc'", which leaked a stray leading
    # "b" into the first word of every element returned as bytes.
    if isinstance(value, bytes):
        return value.decode('utf8')
    return str(value)





def process(url, xpath):
    """
    Downloads a feed url and extracts the results with a variable path

    NOTE: this repeats the ``process`` definition that appears earlier in the
    file. Because it comes later, this is the definition the name ``process``
    refers to once the module has finished loading.

    :param url: string, address of the feed to download
    :param xpath: string, path expression handed to ``Element.findall``
    :return: list holding, for each matching element, its text encoded as
        UTF-8, or an empty string when the element has no text
    """
    contents = requests.get(url)
    root = ET.fromstring(contents.content)
    return [element.text.encode('utf8') if element.text is not None else '' for element in root.findall(xpath)]


# Script entry point. It has to sit at module level: the original placed it at
# a 4-space indent after the 8-space function body, which matches no enclosing
# indentation level (IndentationError) and would in any case have been
# unreachable after the ``return`` above, so ``main`` never ran.
if __name__ == "__main__":
    main()

0 个答案:

没有答案