I can't seem to figure out what is wrong with my tf-idf vectorizer. I'm trying to apply it to 100,000 files, but all of my n-grams end up with the same weight. My files are stored in "/Users/lucy/Desktop/newdic". I think the error has to do with how I'm importing my files. Any help would be greatly appreciated, thank you so much!
# -*- coding: utf-8 -*-
#Use "explode" in Excel to generate different columns of words, then
#VLOOKUP() to look up individual words in your dictionary.
#Note: the reload(sys); sys.setdefaultencoding('utf8') hack (e.g. dropped into
#site-packages/sitecustomize.py) corrupts some characters in the original NLP
#corpus. Don't use it!!!
#https://stackoverflow.com/questions/34449127/sklearn-tfidf-transformer-how-to-get-tf-idf-values-of-given-words-in-document
#http://wordlist.aspell.net/12dicts-readme/#classic
import os
import csv

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
os.chdir("/Users/lucy/Desktop/newdic")

per_file_vectors = []
for fname in os.listdir("/Users/lucy/Desktop/newdic"):
    with open(fname) as f:
        lineList = f.readlines()
    lines = " ".join(lineList)
    lines = [lines]  # one-element list: the whole file as a single document
    # NOTE: this creates and fits a brand-new vectorizer for every file, so
    # each fit only ever sees a corpus of exactly one document.
    vectorizer = TfidfVectorizer(input='content', stop_words='english',
                                 ngram_range=(1, 1), analyzer="word", min_df=1)
    word_count_vector = vectorizer.fit_transform(lines)
    per_file_vectors.append(word_count_vector)
    # print(vectorizer.vocabulary_)

# Calling fit_transform() on either vectorizer with a list of documents,
# [a, b], returns the same type of object: a 2x6 sparse matrix with 8 stored
# elements in Compressed Sparse Row format. The only difference is that
# TfidfVectorizer() returns floats while CountVectorizer() returns ints. That
# is expected: TfidfVectorizer() assigns a score while CountVectorizer() counts.
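# A self-contained toy illustration of the comment above (the strings a and b
# are my own made-up examples, not from my corpus): both vectorizers take the
# same list of documents; only the dtype of the stored values differs.
a = "one two three two"
b = "one four five"
print(CountVectorizer().fit_transform([a, b]).dtype)  # int64: raw counts
print(TfidfVectorizer().fit_transform([a, b]).dtype)  # float64: tf-idf scores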
# TfidfTransformer is normally fit on raw counts; here it is re-weighting
# word_count_vector, which is already tf-idf output, and only for the last
# file processed by the loop above.
tf_transformer = TfidfTransformer(use_idf=True).fit(word_count_vector)
X_train_tf = tf_transformer.transform(word_count_vector)
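# For contrast, the usual pairing (a sketch with made-up toy documents, not
# part of my pipeline): CountVectorizer produces raw counts, and
# TfidfTransformer converts those counts into tf-idf weights.
docs = ["one two three two", "one four five"]
counts = CountVectorizer().fit_transform(docs)  # raw term counts (ints)
demo_tfidf = TfidfTransformer(use_idf=True).fit_transform(counts)  # tf-idf floats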
#print(vectorizer.get_feature_names())
idf = vectorizer.idf_  # public attribute; avoids the private _tfidf
#print(idf)
p = list(zip(vectorizer.get_feature_names(), idf))  # listed so it can be iterated twice below
# p.sort(key=lambda t: t[1])
with open('ngram=1(0).csv', 'w') as csvfile:
    fwriter = csv.writer(csvfile)
    for row in p:
        fwriter.writerow(row)
## The line below is what I tried first, but it did not work:
# writer = csv.writerow(p, delimiter=';', lineterminator='\n')
# The block below writes the words out to a .txt file instead; I don't know
# whether that is of any use:
with open('daemons2.txt', 'w') as fp:
    fp.write('\n'.join('%s %s' % x for x in p))
#with open("tfidf.txt","w") as t:
#for x in p:
#t.print(x)
#t.close()
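
Update: my current suspicion, if I understand tf-idf correctly, is that fitting a fresh vectorizer inside the loop means every fit sees a corpus of exactly one document, so every term gets the identical idf (hence identical-looking weights). Below is a minimal sketch of how I think the whole corpus should be vectorized in one pass; the variable names are mine and I haven't verified this on all 100,000 files. Is this the right way to structure it?

import os
from sklearn.feature_extraction.text import TfidfVectorizer

corpus_dir = "/Users/lucy/Desktop/newdic"
# Collect full paths so the vectorizer can see the whole corpus at once.
filepaths = [os.path.join(corpus_dir, name) for name in os.listdir(corpus_dir)]

# input='filename' makes TfidfVectorizer open and read each path itself.
vectorizer = TfidfVectorizer(input='filename', stop_words='english',
                             ngram_range=(1, 1), analyzer='word', min_df=1)
tfidf_matrix = vectorizer.fit_transform(filepaths)  # one row per file

# With more than one document, idf now varies across terms.
idf = vectorizer.idf_
p = list(zip(vectorizer.get_feature_names(), idf))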