我有一段类似下面这样的 Python 代码:
# Load the (ID, Tag) pairs; no header row is expected, so the column names
# are supplied explicitly.
df = pd.read_csv('genelist.csv', names=['ID', 'Tag'])

# Extract the tags once instead of calling df['Tag'].iloc[...] twice per pair
# inside the quadratic loop below.
tags = df['Tag'].tolist()
n_tags = len(tags)

# Pairwise distance matrix. Only the upper triangle (diagonal included) is
# filled; every entry below the diagonal keeps its initial value of 0.
matrix = np.zeros((n_tags, n_tags))
for row, row_tag in enumerate(tags):
    for col in range(row, n_tags):
        matrix[row, col] = distance(row_tag, tags[col])
其中 `distance()` 是一个比较复杂的函数,用于计算两个标签之间的距离。
我的 DataFrame 有 8000 行,我需要加快处理速度。
我昨天晚上开始运行这个脚本,12 个小时后仍然没有结束。
我看过 `multiprocessing` 库,但由于从未使用过它,不知道用它来加速是否可行。
`distance` 函数及其依赖的辅助函数如下所示:
# First character of a Penn Treebank tag -> simplified WordNet tag.
_PENN_INITIAL_TO_WN = {'N': 'n', 'V': 'v', 'J': 'a', 'R': 'r'}


def penn_to_wn(tag):
    """Convert a Penn Treebank tag to a simplified WordNet tag.

    Args:
        tag: Penn Treebank tag string, e.g. ``'NN'`` or ``'VBD'``.

    Returns:
        ``'n'``, ``'v'``, ``'a'`` or ``'r'`` when the tag starts with
        ``N``, ``V``, ``J`` or ``R`` respectively; ``None`` for any other
        tag, including the empty string.
    """
    # Slice instead of indexing so an empty tag yields None rather than
    # raising IndexError.
    return _PENN_INITIAL_TO_WN.get(tag[:1])
def tagged_to_synset(word, tag):
    """Return the first WordNet synset for a (word, Penn Treebank tag) pair.

    Args:
        word: The token to look up.
        tag: Penn Treebank tag of the token.

    Returns:
        The first synset returned by ``wn.synsets`` for the word under the
        converted tag, or ``None`` when the tag has no WordNet equivalent or
        the lookup returns no synsets.
    """
    wn_tag = penn_to_wn(tag)
    if wn_tag is None:
        return None

    # Check for an empty result explicitly instead of using a bare
    # ``except``: real failures and Ctrl-C are no longer silently
    # converted into ``None``.
    candidates = wn.synsets(word, wn_tag)
    return candidates[0] if candidates else None
def NS_sentence_similarity(sentence1, sentence2):
    """Compute a non-symmetric sentence similarity using WordNet.

    For every word of ``sentence1`` that has a synset, the best path
    similarity against the synsets of ``sentence2`` is taken, and those best
    scores are averaged. Because the average runs over the words of
    ``sentence1`` only, ``f(A, B)`` generally differs from ``f(B, A)``.

    Args:
        sentence1: Sentence whose words are scored.
        sentence2: Sentence supplying the comparison synsets.

    Returns:
        The mean of the per-word best similarities, or ``0.0`` when no word
        of ``sentence1`` could be scored.
    """
    # Tokenize and tag.
    tagged1 = pos_tag(word_tokenize(sentence1))
    # NOTE(review): the second sentence is split on whitespace instead of
    # being passed through word_tokenize like the first one; kept as-is to
    # avoid changing results -- confirm whether this is intentional.
    tagged2 = pos_tag(sentence2.split())

    # Map tagged words to synsets, dropping words without one.
    synsets1 = [
        ss for ss in (tagged_to_synset(*tagged) for tagged in tagged1) if ss
    ]
    synsets2 = [
        ss for ss in (tagged_to_synset(*tagged) for tagged in tagged2) if ss
    ]

    score, count = 0.0, 0
    for synset in synsets1:
        # path_similarity may return None; drop those values *before* max()
        # so one None no longer raises TypeError and wipes out the valid
        # scores for this word.
        similarities = [
            sim
            for sim in (synset.path_similarity(other) for other in synsets2)
            if sim is not None
        ]
        # Skip words for which no similarity could be computed at all.
        if not similarities:
            continue
        score += max(similarities)
        count += 1

    # Average the per-word best scores.
    if count:
        score /= count
    return score
def distance(sentence1, sentence2):
    """Return the symmetric WordNet-based similarity of two sentences.

    The one-directional score is evaluated in both directions and the two
    results are averaged, so ``distance(A, B) == distance(B, A)``.
    """
    forward = NS_sentence_similarity(sentence1, sentence2)
    backward = NS_sentence_similarity(sentence2, sentence1)
    return (forward + backward) / 2
您知道有什么可行的方法可以加快这个计算吗?