我已将此link which is nicely colour coded中的一些代码放在一起,并进行了4次小修改以修复某些错误。我还使用了之前2个论坛的一些代码。
代码应该做的是计算整个文本中连续句子之间的语义相似度,然后显示像这样获得的所有相似度值;
'黄色的门。','红色的锤子' 0.65
'粉红色的狐狸在树林里。','指挥官狐狸是蓝色的。' 0.32
这是代码;
ALPHA = 0.2
BETA = 0.45
ETA = 0.4
PHI = 0.2
DELTA = 0.85
brown_freqs = dict()
N = 0
######################### word similarity ##########################
def get_best_synset_pair(word_1, word_2):
"""
Choose the pair with highest path similarity among all pairs.
Mimics pattern-seeking behavior of humans.
"""
max_sim = -1.0
synsets_1 = wn.synsets(word_1)
synsets_2 = wn.synsets(word_2)
if len(synsets_1) == 0 or len(synsets_2) == 0:
return None, None
else:
max_sim = -1.0
best_pair = None, None
for synset_1 in synsets_1:
for synset_2 in synsets_2:
sim = wn.path_similarity(synset_1, synset_2)
if sim > max_sim:
max_sim = sim
best_pair = synset_1, synset_2
return best_pair
def length_dist(synset_1, synset_2):
l_dist = sys.maxint
if synset_1 is None or synset_2 is None:
return 0.0
if synset_1 == synset_2:
# if synset_1 and synset_2 are the same synset return 0
l_dist = 0.0
else:
wset_1 = set([str(x.name()) for x in synset_1.lemmas()])
wset_2 = set([str(x.name()) for x in synset_2.lemmas()])
if len(wset_1.intersection(wset_2)) > 0:
# if synset_1 != synset_2 but there is word overlap, return 1.0
l_dist = 1.0
else:
# just compute the shortest path between the two
l_dist = synset_1.shortest_path_distance(synset_2)
if l_dist is None:
l_dist = 0.0
# normalize path length to the range [0,1]
return math.exp(-ALPHA * l_dist)
def hierarchy_dist(synset_1, synset_2):
h_dist = sys.maxint
if synset_1 is None or synset_2 is None:
return h_dist
if synset_1 == synset_2:
# return the depth of one of synset_1 or synset_2
h_dist = max([x[1] for x in synset_1.hypernym_distances()])
else:
# find the max depth of least common subsumer
hypernyms_1 = {x[0]:x[1] for x in synset_1.hypernym_distances()}
hypernyms_2 = {x[0]:x[1] for x in synset_2.hypernym_distances()}
lcs_candidates = set(hypernyms_1.keys()).intersection(
set(hypernyms_2.keys()))
if len(lcs_candidates) > 0:
lcs_dists = []
for lcs_candidate in lcs_candidates:
lcs_d1 = 0
if lcs_candidate in hypernyms_1:
lcs_d1 = hypernyms_1[lcs_candidate]
lcs_d2 = 0
if lcs_candidate in hypernyms_2:
lcs_d2 = hypernyms_2[lcs_candidate]
lcs_dists.append(max([lcs_d1, lcs_d2]))
h_dist = max(lcs_dists)
else:
h_dist = 0
return ((math.exp(BETA * h_dist) - math.exp(-BETA * h_dist)) /
(math.exp(BETA * h_dist) + math.exp(-BETA * h_dist)))
def word_similarity(word_1, word_2):
synset_pair = get_best_synset_pair(word_1, word_2)
return (length_dist(synset_pair[0], synset_pair[1]) *
hierarchy_dist(synset_pair[0], synset_pair[1]))
######################### sentence similarity ##########################
def most_similar_word(word, word_set):
max_sim = -1.0
sim_word = ""
for ref_word in word_set:
sim = word_similarity(word, ref_word)
if sim > max_sim:
max_sim = sim
sim_word = ref_word
return sim_word, max_sim
def info_content(lookup_word):
global N
if N == 0:
# poor man's lazy evaluation
for sent in brown.sents():
for word in sent:
word = word.lower()
if not word in brown_freqs:
brown_freqs[word] = 0
brown_freqs[word] = brown_freqs[word] + 1
N = N + 1
lookup_word = lookup_word.lower()
n = 0 if not lookup_word in brown_freqs else brown_freqs[lookup_word]
return 1.0 - (math.log(n + 1) / math.log(N + 1))
def semantic_vector(words, joint_words, info_content_norm):
sent_set = set(words)
semvec = np.zeros(len(joint_words))
i = 0
for joint_word in joint_words:
if joint_word in sent_set:
# if word in union exists in the sentence, s(i) = 1 (unnormalized)
semvec[i] = 1.0
if info_content_norm:
semvec[i] = semvec[i] * math.pow(info_content(joint_word), 2)
else:
# find the most similar word in the joint set and set the sim value
sim_word, max_sim = most_similar_word(joint_word, sent_set)
semvec[i] = PHI if max_sim > PHI else 0.0
if info_content_norm:
semvec[i] = semvec[i] * info_content(joint_word) * info_content(sim_word)
i = i + 1
return semvec
def semantic_similarity(sentence_1, sentence_2, info_content_norm):
words_1 = nltk.word_tokenize(sentence_1)
words_2 = nltk.word_tokenize(sentence_2)
joint_words = set(words_1).union(set(words_2))
vec_1 = semantic_vector(words_1, joint_words, info_content_norm)
vec_2 = semantic_vector(words_2, joint_words, info_content_norm)
return np.dot(vec_1, vec_2.T) / (np.linalg.norm(vec_1) * np.linalg.norm(vec_2))
######################### word order similarity ##########################
def word_order_vector(words, joint_words, windex):
wovec = np.zeros(len(joint_words))
i = 0
wordset = set(words)
for joint_word in joint_words:
if joint_word in wordset:
# word in joint_words found in sentence, just populate the index
wovec[i] = windex[joint_word]
else:
# word not in joint_words, find most similar word and populate
# word_vector with the thresholded similarity
sim_word, max_sim = most_similar_word(joint_word, wordset)
if max_sim > ETA:
wovec[i] = windex[sim_word]
else:
wovec[i] = 0
i = i + 1
return wovec
def word_order_similarity(sentence_1, sentence_2):
"""
Computes the word-order similarity between two sentences as the normalized
difference of word order between the two sentences.
"""
words_1 = nltk.word_tokenize(sentence_1)
words_2 = nltk.word_tokenize(sentence_2)
joint_words = list(set(words_1).union(set(words_2)))
windex = {x[1]: x[0] for x in enumerate(joint_words)}
r1 = word_order_vector(words_1, joint_words, windex)
r2 = word_order_vector(words_2, joint_words, windex)
return 1.0 - (np.linalg.norm(r1 - r2) / np.linalg.norm(r1 + r2))
######################### overall similarity ##########################
def similarity(sentence_1, sentence_2, info_content_norm):
"""
Calculate the semantic similarity between two sentences. The last
parameter is True or False depending on whether information content
normalization is desired or not.
"""
return DELTA * semantic_similarity(sentence_1, sentence_2, info_content_norm) + \
(1.0 - DELTA) * word_order_similarity(sentence_1, sentence_2)
with open ("C:\\Users\\Lenovo2\\Desktop\\Test123.txt", "r") as sentence_file:
# Initialize a list to hold the results
results = []
# Loop until we hit the end of the file
while True:
# Read two lines
x = sentence_file.readline()
y = sentence_file.readline()
# Check if we've reached the end of the file, if so, we're done
if not y:
# Break out of the infinite loop
break
else:
# The .rstrip('\n') removes the newline character from each line
x = x.rstrip('\n')
y = y.rstrip('\n')
# Calculate your similarity value
similarity_value = similarity(x, y, True)
# Add the two lines and similarity value to the results list
results.append([x, y, similarity_value])
# Loop through the pairs in the results list and print them
for pair in results:
print(pair)
当我在文本文件上运行代码时,我得到一个错误代码,而不是获得句子之间相似性值的数字,我得到了nan;
Warning (from warnings module):
File "C:\Users\Lenovo2\Desktop\Semantic Analysis (1).py", line 191
return np.dot(vec_1, vec_2.T) / (np.linalg.norm(vec_1) * np.linalg.norm(vec_2))
RuntimeWarning: invalid value encountered in double_scalars
在上一个论坛中,我明白这个错误可能意味着我除以零,所以我们有一个零向量。我几乎被困在那里并且python经验有限,我不知道如何轻松地修改程序而不会改变太多。
答案 0 :(得分:0)
我的猜测是你传递一个空字符串。你的文字中有空白行吗?在检查空字符串之前,您不会剥离换行符,因此不会捕获仅包含换行符的字符串。
由于您似乎是在Windows上,因此也可能有' \ r \ n'样式换行符,因此rstrip
可能无法正常工作。
我建议添加以下修改(也打印用于调试):
# Loop until we hit the end of the file
while True:
# Read two lines, removing trailing whitespace
x = sentence_file.readline().rstrip()
y = sentence_file.readline().rstrip()
# Check if we've reached the end of the file, if so, we're done
if not x or not y:
# Break out of the infinite loop
break
else:
print(x, y)
# Calculate your similarity value
similarity_value = similarity(x, y, True)
# Add the two lines and similarity value to the results list
results.append([x, y, similarity_value])
请注意,代码似乎有错误,因为您不是成对地比较句子。也就是说,如果你有句子[a,b,c,d],你只是比较(a,b)和(c,d),但你真的想比较(a,b),(b,c) ,(c,d)。
您可以使用itertools库进行清理:
from itertools import pairwise
lines = open ("C:\\Users\\Lenovo2\\Desktop\\Test123.txt", "r")
for a, b in pairwise(lines):
x = a.rstrip()
y = b.rstrip()
# ... rest unchanged