I am working on an information retrieval project where I have to process ~1.5 GB of text data and build a dictionary (word, document frequency) and postings lists (document ID, term frequency). According to the professor, this should take about 10 to 15 minutes, but my code has now been running for more than 8 hours! I tried a smaller dataset (~35 MB) and even that took 5 hours.
I am new to Python, and I think it is taking so long because my code creates a lot of intermediate Python dictionaries and lists. I tried to use a generator instead, but I am not sure how to apply it here.
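For example, I imagined streaming the documents one at a time with a generator instead of reading the whole 1.5 GB file into memory, something like the sketch below (iter_docs and the chunk size are my own guesses; I have not tested this on the full data):

import re

DOC_RE = re.compile(r'<P ID=(\d+)>(.*?)</P>', re.DOTALL)

def iter_docs(filename, chunk_size=1 << 20):
    # Yield (docID, text) pairs one at a time instead of holding
    # the whole collection in memory.
    buffer = ''
    with open(filename, 'rt') as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            buffer += chunk
            last_end = 0
            for match in DOC_RE.finditer(buffer):
                yield match.group(1), match.group(2)
                last_end = match.end()
            # Keep only the unfinished tail for the next chunk.
            buffer = buffer[last_end:]

Anyway, here is my current code: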
import re
import string
import json
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

# Read the whole collection into memory and split it into <P ID=...>...</P>
# passages ('filename' is defined earlier in my script).
with open(filename, 'rt') as file:
    text = file.read()

p = r'<P ID=\d+>.*?</P>'
tag = RegexpTokenizer(p)
passage = tag.tokenize(text)

doc_re = re.compile(r"<P ID=(\d+)>")
# Build the tokenizer, punctuation table, and stopword set once; rebuilding
# them inside process_data for every document is expensive, and membership
# tests against a set are O(1) instead of O(n) for a list.
word_tokenizer = RegexpTokenizer(r'\w+')
punct_table = str.maketrans('', '', string.punctuation)
stopword_set = set(stopwords.words('english'))

def process_data(docu):
    # Lowercase, strip punctuation, then drop non-alphabetic tokens and stopwords.
    lower_tokens = [word.lower() for word in word_tokenizer.tokenize(docu)]
    stripped = [w.translate(punct_table) for w in lower_tokens]
    alpha = [word for word in stripped if word.isalpha()]
    return [w for w in alpha if w not in stopword_set]
# Map each document ID to its full token list. Keeping the full list (not a
# set) matters: deduplicating here would make every term frequency computed
# below equal to 1.
data = {}
for doc in passage:
    group_docID = doc_re.match(doc)
    docID = group_docID.group(1)
    data[docID] = process_data(doc)
# Collection statistics and the sorted vocabulary.
vocab = [token for tokens in data.values() for token in tokens]
total_vocab = sorted(set(vocab))

print('Document Size = ', len(data))
print('Collection Size = ', len(vocab))
print('Vocabulary Size= ', len(total_vocab))
# Inverted index: word -> list of (docID, term frequency) pairs.
# This nested loop rescans every document for every vocabulary word,
# so it is O(|vocab| * |docs| * document length) -- I suspect this is
# the main bottleneck (see the single-pass sketch after the code).
inv_index = {}
for x in total_vocab:
    for y, z in data.items():
        if x in z:
            wordfreq = z.count(x)
            inv_index.setdefault(x, []).append((int(y), wordfreq))
# Flatten the postings into one list of integers: docID, tf, docID, tf, ...
flattened = [pair for postings in inv_index.values() for pair in postings]
posting = [value for pair in flattened for value in pair]

# The document frequency of a word is just the length of its postings list.
doc_freq = [len(v) for v in inv_index.values()]
# Offset of each word's postings: every posting entry is two integers, so
# each word starts (doc_freq[i-1] * 2) integers after the previous one.
offset = []
offset1 = 0
for i in range(len(doc_freq)):
    if i > 0:
        offset1 = offset1 + (doc_freq[i - 1] * 2)
    offset.append(offset1)
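# (An equivalent cumulative sum with itertools.accumulate, if that is clearer:
#    from itertools import accumulate
#    offset = [0] + list(accumulate(df * 2 for df in doc_freq))[:-1]
#  I believe this computes the same offsets, but I have not double-checked.)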
# Dictionary: word -> (document frequency, offset into the postings file).
dictionary = {}
for i in range(len(total_vocab)):
    dictionary[total_vocab[i]] = (doc_freq[i], offset[i])
# Inverse document frequency: idf(word) = log2(N / df), N = number of documents.
idf = {}
for word, df in zip(total_vocab, doc_freq):
    idf[word] = np.log2(len(data) / df)
with open('dictionary.json', 'w') as f:
    json.dump(dictionary, f)
with open('idf.json', 'w') as f:
    json.dump(idf, f)
# Write the flattened postings as 4-byte big-endian unsigned integers.
with open('binary_file.txt', 'wb') as binary_file:
    for value in posting:
        binary_file.write(value.to_bytes(4, byteorder='big'))
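I suspect the nested loop that builds inv_index is where the hours go, since it rescans every document for every vocabulary word. I tried to sketch a single-pass alternative with collections.Counter and defaultdict, but I am not sure it is correct or whether it keeps everything I need (the variable names are my own, and I have not run it on the full collection):

from collections import Counter, defaultdict

inv_index = defaultdict(list)   # word -> [(docID, term frequency), ...]

# One pass over the documents; Counter gives every term frequency for a
# document in a single scan instead of one z.count(x) call per word.
for docID, tokens in data.items():
    for word, tf in Counter(tokens).items():
        inv_index[word].append((int(docID), tf))

# Document frequency is the postings-list length; iterate the vocabulary in
# sorted order so the offsets line up the same way as before.
total_vocab = sorted(inv_index)
doc_freq = [len(inv_index[word]) for word in total_vocab]

If I went this way, I think I would also have to flatten the postings in sorted-vocabulary order (for word in total_vocab: ...) so the byte offsets stay consistent.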
Can someone help me rewrite this code so that it runs in a reasonable amount of time?
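P.S. I also wondered whether the final loop that calls to_bytes once per integer is slow when there are millions of postings; would packing them all at once with the array module be faster? Something like this (my own guess; it assumes every value fits in a signed 32-bit int):

import sys
from array import array

ints = array('i', posting)        # 'i' is 4 bytes on typical platforms
if sys.byteorder == 'little':
    ints.byteswap()               # keep the big-endian layout used above
with open('binary_file.txt', 'wb') as f:
    ints.tofile(f)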