我在Python中创建一个单词共生矩阵,并使用嵌套的默认值来创建矩阵。我已经成功创建了矩阵并存储了字数,但是在尝试从嵌套的defaultdict中获取向量(矩阵行)时遇到了麻烦。
以下是我用来初始化矩阵的代码行:
matrix = collections.defaultdict(lambda: collections.defaultdict(int))
以下是我用来将单词计数放入矩阵的行:
matrix[target_word_id][collocated_word_id] += 1
matrix[collocated_word_id][target_word_id] += 1
这就是我试图访问对应于给定单词id的行的矩阵行:
vector1 = matrix[word1_id]
当我打印vector1来测试我的工作时,这就是我得到的输出:
defaultdict(<class 'int'>, {})
该类的完整代码在这里。我从一个单独的主类调用函数:
class Create_vector():
def build_vocab(self, corpus):
vocab = collections.defaultdict(int)
i = 1
for line in corpus:
token = line.strip()
if token not in vocab:
vocab[token] = i
i += 1
return vocab
def build_cooccurrence(self, corpus, vocab, window):
matrix = collections.defaultdict(lambda: collections.defaultdict(int))
for x, line in enumerate(corpus):
if x % 100000 == 0:
print('Building cooccurrence matrix: on line %i', x)
tokens = line.strip()
token_ids = [vocab[token] for token in tokens]
for i, target_word_id in enumerate(token_ids):
collocated_word_ids = token_ids[min(0, target_word_id - window): target_word_id]
for j, collocated_word_id in enumerate(collocated_word_ids):
matrix[target_word_id][collocated_word_id] += 1
matrix[collocated_word_id][target_word_id] += 1
return matrix
def get_vector(self, matrix, vocab, weight, word1, word2):
if weight == 'FREQ':
if word1 in vocab:
word1_id = vocab[word1]
vector1 = matrix[word1_id]
pprint.pprint(vector1)
主要课程在这里:
import nltk
import sys
from nltk.corpus import stopwords
import create_vector
import pprint
import string
def main():
brown_words = list(nltk.corpus.brown.words())
window = int(sys.argv[1])
weight = sys.argv[2]
brown_words_lower = [word.lower() for word in brown_words]
brown_words_only = [w for w in brown_words_lower if w not in string.punctuation]
stops = set(stopwords.words('english'))
brown_words_filtered = [w for w in brown_words_only if w not in stops]
vector = create_vector.Create_vector()
vocab = vector.build_vocab(brown_words_filtered)
cooccurrence = vector.build_cooccurrence(brown_words_filtered, vocab, window)
for line in text:
words = line.split(',')
word1 = words[0]
word2 = words[1]
vector1, vector2 = vector.get_vector(cooccurrence, vocab, weight, word1, word2)
运行它的命令是: python3.4 main.py 2 FREQ
答案 0 :(得分:1)
无法重现:
>>> import collections
>>> matrix = collections.defaultdict(lambda: collections.defaultdict(int))
>>> matrix[2][3] += 1
>>> matrix[3][2] += 1
>>> vector1 = matrix[2]
>>> vector1
defaultdict(<type 'int'>, {3: 1})
您确定word1_id
中的值是否等于已插入矩阵的值?
你可以发布完整的代码,而不仅仅是它的碎片吗?