我正在计算内容的tf-idf向量。我正在使用向量之间的余弦相似性来找出内容的相似程度。我正在使用带有sklearn和Snowball stemmer的nltk库来创建我的tf-idf矢量化器,如下所示。
import nltk, string, os
from tfidf_functions import tokenize, stem_tokens,token_dict,stemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
import cPickle as pickle
token_dict = {}
stemmer = SnowballStemmer("english")
stopwords = ["the","and","I"]
def stem_tokens(tokens, stemmer):
stemmed = []
for item in tokens:
stemmed.append(stemmer.stem(item))
return stemmed
def tokenize(text):
tokens = nltk.word_tokenize(text)
stems = stem_tokens(tokens, stemmer)
return stems
for subdir, dirs, files in os.walk("path/to/trainingset"):
for file in files:
file_path = subdir + os.path.sep + file
shakes = open(file_path, 'r')
text = shakes.read()
lowers = text.lower()
no_punctuation = lowers.translate(None, string.punctuation)
token_dict[file] = no_punctuation
tfidf = TfidfVectorizer(strip_accents='unicode',tokenizer=tokenize, stop_words=stopwords, max_df=1500)
tfs = tfidf.fit_transform(token_dict.values())
with open('vectorizer.pkl', 'wb') as fin:
pickle.dump(tfidf,fin)
然后我在另一个文件中打开vectorizer.pkl并使用它来获取新文本的特征向量并在它们上运行余弦相似性。我发现了两种不同的方法,但结果却截然不同。 为什么我的两种方法会返回不同的相似度值?
def cosine_sim(text1, text2, vectorizer):
tfidf = vectorizer.fit_transform([text1, text2])
return ((tfidf * tfidf.T).A)[0,1]
def cosine_sim2(u,v):
return np.dot(u,v) / (np.sqrt(np.dot(u,u)) * np.sqrt(np.dot(v,v)))
raw1 = "the quick brown fox jumped"
raw2 = "the lazy and quick dog jumped"
tfidf = pickle.load(open("vectorizer.pkl", "rb" ) )
response1 = tfidf.transform([raw1])
response2 = tfidf.transform([raw2])
#first function, pass the raw text and get vectors inside the function
print("similarity between above two items:"+str(cosine_sim(raw1,raw2, tfidf)))
#second fuction, pass the vectors calculated above
print("similarity between above two items:"+str(cosine_sim2(response1.toarray()[0],response2.toarray()[0])))
在我看来,两种方法都应该达到基本相同的响应。我看到的唯一区别是第一种方法是将文本添加到模型中,因为使用了fit_transform而不仅仅是transform。这并没有解释输出的巨大差异 - 我的训练集是5000个文件