好。我想添加一个word_tag, 但我不确定这是不是正确的方法。 (对不起,我是新手)
from nltk.corpus import wordnet as wn
# Count the words
word_count = Counter(words)
# Clean the content a little
filter_words = ['artists']
for word in filter_words:
if word in word_count:
del word_count[word]
# POS_TAG the words
word_tag = nltk.corpus.wn.synsets(word_count)
# And the survey says...
print("The Top {0} words".format(n))
for word, count, word_tag in word_count.most_common(n) and nltk.corpus.wordnet.synsets(n):
print("{0}: {1, 2}".format(word, count, word_tag))
我想制作一个带有词干/词形化词汇的数据库表,并标记词性标注(VERB,NOUN,ADV,..),就像这样。
http://www.nltk.org/book/ch05.html#tab-universal-tagset
如何解决错误? 在mySQL db上,#|字| POS标签。 |频率 我也在寻找一种方法来删除不在字典上的单词(artistessex,asifyou),因为我使用len解析单词...
##
import re
import MySQLdb as mdb
import xml.etree.ElementTree as ET
import requests, re
from xml.etree import ElementTree
from collections import Counter
from lxml import html
import nltk
from nltk.corpus import wordnet
from nltk import word_tokenize, sent_tokenize, pos_tag
from nltk.corpus import wordnet as wn
from nltk.stem import PorterStemmer, WordNetLemmatizer
##
def is_noun(tag):
return tag in ['NN', 'NNS', 'NNP', 'NNPS']
def is_verb(tag):
return tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
def is_adverb(tag):
return tag in ['RB', 'RBR', 'RBS']
def is_adjective(tag):
return tag in ['JJ', 'JJR', 'JJS']
def penn_to_wn(tag):
if is_adjective(tag):
return wn.ADJ
elif is_noun(tag):
return wn.NOUN
elif is_adverb(tag):
return wn.ADV
elif is_verb(tag):
return wn.VERB
return None
stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()
## XML PARSING
def main(n=10):
# A list of feeds to process and their xpath
feeds = [
{'url': 'http://www.nyartbeat.com/list/event_type_print_painting.en.xml', 'xpath': './/Description'},
{'url': 'http://feeds.feedburner.com/FriezeMagazineUniversal?format=xml', 'xpath': './/description'}
]
# A place to hold all feed results
results = []
# Loop all the feeds
for feed in feeds:
# Append feed results together
results = results + process(feed['url'], feed['xpath'])
# Join all results into a big string
contents=",".join(map(str, results))
# Remove double+ spaces
contents = re.sub('\s+', ' ', contents)
# Remove everything that is not a character or whitespace
contents = re.sub('[^A-Za-z ]+', '', contents)
# Create a list of lower case words that are at least 8 characters
words=[w.lower() for w in contents.split() if len(w) >=8 ]
# Count the words
word_count = Counter(words)
# POS_TAG the words
word_stem = stemmer.stem(words)
word_refine = lemmatiser.lemmatize(word_stem)
# tokens = word_tokenize(words) # Generate list of tokens
# tokens_pos = pos_tag(tokens)
# Clean the content a little
filter_words = ['artists']
for word in filter_words:
if word in word_refine:
del word_refine[word]
# And the survey says...
print("The Top {0} words".format(n))
for word, pos in word_refine.stemmer.stem(n):
for word, count in word_count.most_common(n):
print("{0}: {1, 2}".format(word, pos, count))
def process(url, xpath):
"""
Downloads a feed url and extracts the results with a variable path
:param url: string
:param xpath: string
:return: list
"""
contents = requests.get(url)
root = ElementTree.fromstring(contents.content)
return [element.text.encode('utf8') if element.text is not None else '' for element in root.findall(xpath)]
# Add to DB
for word, count in word_count.most_common(n):
sql = """INSERT INTO Table1 (keyword, pos, freq) VALUES(%s, %s, %s)"""
cursor.execute(sql, (word, pos, count))
db.commit()
if __name__ == "__main__":
main()