
# Time: 2015-12-07 03:32:45

# Tags: python dictionary filter nltk word





from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import urllib.request
# Fetch the python.org front page when the module is loaded and keep the
# response body in the module-level name ``html``.
# NOTE(review): ``html`` is rebound further down by ``from lxml import html``,
# so the downloaded body is no longer reachable under that name afterwards.
with urllib.request.urlopen('http://python.org/') as response:
     html = response.read()

import math
import random
import requests
import collections
import string
import re
import MySQLdb as mdb
import xml.etree.ElementTree as ET
from xml.dom import minidom
from textblob import TextBlob
from textblob import Word
from textblob.wordnet import VERB
from textblob.classifiers import NaiveBayesClassifier
from string import punctuation
from oauthlib import *
from operator import itemgetter
from collections import defaultdict
from functools import reduce
import requests, re

from collections import Counter
from lxml import html
from operator import itemgetter

def process(url, xpath):
    """Download a feed url and extract the results with a variable path.

    :param url: string, address of the XML document to download
    :param xpath: string, ElementTree path expression handed to ``findall``
    :return: list with one entry per matching element -- the element's text
        encoded as UTF-8, or ``''`` when the element has no text
    """
    contents = requests.get(url)
    # Parse the raw response body so the XML declaration decides the encoding.
    root = ET.fromstring(contents.content)
    return [
        element.text.encode('utf8') if element.text is not None else ''
        for element in root.findall(xpath)
    ]

def main(n=10):
    """Write the most common dictionary words found in a set of art feeds.

    Downloads every feed with ``process``, keeps the lower-cased words of at
    most 13 letters that occur in the NLTK ``words`` corpus, drops a few
    domain-specific words, and writes the ``n`` most common remaining words
    to ``filename.txt``, each repeated as many times as it was counted.

    :param n: int, how many of the most common words to keep
    :return: None
    """
    # ``nltk`` is used below but is not imported at the top of the file.
    import nltk

    # A list of feeds to process and their xpath
    feeds = [
        {'url': 'http://www.nyartbeat.com/list/event_type_print_painting.en.xml', 'xpath': './/'},
        {'url': 'http://feeds.feedburner.com/FriezeMagazineUniversal?format=xml', 'xpath': './/'},
        {'url': 'http://www.artandeducation.net/category/announcement/feed/', 'xpath': './/'},
        {'url': 'http://www.blouinartinfo.com/rss/visual-arts.xml', 'xpath': './/'},
        {'url': 'http://feeds.feedburner.com/ContemporaryArtDaily?format=xml', 'xpath': './/'},
        {'url': 'http://www.e-flux.com/category/announcements/feed/', 'xpath': './/'},
        {'url': 'http://www.e-flux.com/category/journal/feed/', 'xpath': './/'},
    ]

    # Collect the results of every feed in one list
    results = []
    for feed in feeds:
        results.extend(process(feed['url'], feed['xpath']))

    # ``process`` may hand back UTF-8 bytes; decode them so that ``str()``
    # does not turn them into "b'...'" reprs on Python 3.
    texts = [
        item.decode('utf8') if isinstance(item, bytes) else str(item)
        for item in results
    ]

    # Join with a space rather than a comma: the letters-only filter below
    # strips commas, which would glue the last word of one entry to the
    # first word of the next.
    contents = " ".join(texts)

    # Collapse runs of whitespace into a single space
    contents = re.sub(r'\s+', ' ', contents)

    # Remove everything that is not a letter or a space
    contents = re.sub(r'[^A-Za-z ]+', '', contents)

    words_split = [w.lower() for w in contents.split() if len(w) <= 13]

    # A set gives constant-time membership tests against the large corpus.
    wordlist = {w.lower() for w in nltk.corpus.words.words()}
    words_sort = [val for val in words_split if val in wordlist]

    # Count the words
    word_count = Counter(words_sort)

    # Clean the content a little
    for word in ('art', 'artist', 'artists'):
        word_count.pop(word, None)

    # Keep the n most common words, ordered from least to most frequent
    words = sorted(word_count.most_common(n), key=itemgetter(1))

    out = []
    for word, count in words:
        out += [word] * count

    final = " ".join(out)

    # The original ``with`` block had no body; writing ``final`` is the
    # evident intent, since the string is not used anywhere else.
    with open("filename.txt", "w+") as f:
        f.write(final)

# NOTE: this repeats the ``process`` defined earlier in the file; being the
# later definition, it is the one the name refers to once the module has
# finished loading.
def process(url, xpath):
    """Download a feed url and extract the results with a variable path.

    :param url: string, address of the XML document to download
    :param xpath: string, ElementTree path expression handed to ``findall``
    :return: list with one entry per matching element -- the element's text
        encoded as UTF-8, or ``''`` when the element has no text
    """
    contents = requests.get(url)
    # Parse the raw response body so the XML declaration decides the encoding.
    root = ET.fromstring(contents.content)
    return [
        element.text.encode('utf8') if element.text is not None else ''
        for element in root.findall(xpath)
    ]


# Run the word count only when the file is executed as a script. The guard
# originally had no body; calling ``main()`` is the conventional entry point.
if __name__ == "__main__":
    main()

# 0 answers:
