I am working on an information retrieval project where I have to process ~1.5 GB of text data and build a dictionary (word, document frequency) and postings lists (document ID, term frequency). According to the professor, this should take about 10 to 15 minutes, but my code has now been running for more than 8 hours! I tried a smaller dataset (~35 MB) and even that took 5 hours.
I am new to Python, and I think it is taking so long because my code creates a lot of intermediate Python dictionaries and lists. I tried to use a generator instead, but I am not sure how to apply it here.
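For example, I imagined streaming the documents one at a time with a generator instead of reading the whole 1.5 GB file into memory, something like the sketch below (iter_docs and the chunk size are my own guesses; I have not tested this on the full data):

import re

DOC_RE = re.compile(r'<P ID=(\d+)>(.*?)</P>', re.DOTALL)

def iter_docs(filename, chunk_size=1 << 20):
    # Yield (docID, text) pairs one at a time instead of holding
    # the whole collection in memory.
    buffer = ''
    with open(filename, 'rt') as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            buffer += chunk
            last_end = 0
            for match in DOC_RE.finditer(buffer):
                yield match.group(1), match.group(2)
                last_end = match.end()
            # Keep only the unfinished tail for the next chunk.
            buffer = buffer[last_end:]

Anyway, here is my current code: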
import re
import string
import json
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

# Read the whole collection into memory and split it into <P ID=...>...</P>
# passages ('filename' is defined earlier in my script).
with open(filename, 'rt') as file:
    text = file.read()

p = r'<P ID=\d+>.*?</P>'
tag = RegexpTokenizer(p)
passage = tag.tokenize(text)

doc_re = re.compile(r"<P ID=(\d+)>")
# Build the tokenizer, punctuation table, and stopword set once; rebuilding
# them inside process_data for every document is expensive, and membership
# tests against a set are O(1) instead of O(n) for a list.
word_tokenizer = RegexpTokenizer(r'\w+')
punct_table = str.maketrans('', '', string.punctuation)
stopword_set = set(stopwords.words('english'))

def process_data(docu):
    # Lowercase, strip punctuation, then drop non-alphabetic tokens and stopwords.
    lower_tokens = [word.lower() for word in word_tokenizer.tokenize(docu)]
    stripped = [w.translate(punct_table) for w in lower_tokens]
    alpha = [word for word in stripped if word.isalpha()]
    return [w for w in alpha if w not in stopword_set]
# Map each document ID to its full token list. Keeping the full list (not a
# set) matters: deduplicating here would make every term frequency computed
# below equal to 1.
data = {}
for doc in passage:
    group_docID = doc_re.match(doc)
    docID = group_docID.group(1)
    data[docID] = process_data(doc)
# Collection statistics and the sorted vocabulary.
vocab = [token for tokens in data.values() for token in tokens]
total_vocab = sorted(set(vocab))

print('Document Size = ', len(data))
print('Collection Size = ', len(vocab))
print('Vocabulary Size= ', len(total_vocab))
# Inverted index: word -> list of (docID, term frequency) pairs.
# This nested loop rescans every document for every vocabulary word,
# so it is O(|vocab| * |docs| * document length) -- I suspect this is
# the main bottleneck (see the single-pass sketch after the code).
inv_index = {}
for x in total_vocab:
    for y, z in data.items():
        if x in z:
            wordfreq = z.count(x)
            inv_index.setdefault(x, []).append((int(y), wordfreq))
# Flatten the postings into one list of integers: docID, tf, docID, tf, ...
flattened = [pair for postings in inv_index.values() for pair in postings]
posting = [value for pair in flattened for value in pair]

# The document frequency of a word is just the length of its postings list.
doc_freq = [len(v) for v in inv_index.values()]
# Offset of each word's postings: every posting entry is two integers, so
# each word starts (doc_freq[i-1] * 2) integers after the previous one.
offset = []
offset1 = 0
for i in range(len(doc_freq)):
    if i > 0:
        offset1 = offset1 + (doc_freq[i - 1] * 2)
    offset.append(offset1)
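# (An equivalent cumulative sum with itertools.accumulate, if that is clearer:
#    from itertools import accumulate
#    offset = [0] + list(accumulate(df * 2 for df in doc_freq))[:-1]
#  I believe this computes the same offsets, but I have not double-checked.)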
# Dictionary: word -> (document frequency, offset into the postings file).
dictionary = {}
for i in range(len(total_vocab)):
    dictionary[total_vocab[i]] = (doc_freq[i], offset[i])
# Inverse document frequency: idf(word) = log2(N / df), N = number of documents.
idf = {}
for word, df in zip(total_vocab, doc_freq):
    idf[word] = np.log2(len(data) / df)
with open('dictionary.json', 'w') as f:
    json.dump(dictionary, f)
with open('idf.json', 'w') as f:
    json.dump(idf, f)
# Write the flattened postings as 4-byte big-endian unsigned integers.
with open('binary_file.txt', 'wb') as binary_file:
    for value in posting:
        binary_file.write(value.to_bytes(4, byteorder='big'))
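I suspect the nested loop that builds inv_index is where the hours go, since it rescans every document for every vocabulary word. I tried to sketch a single-pass alternative with collections.Counter and defaultdict, but I am not sure it is correct or whether it keeps everything I need (the variable names are my own, and I have not run it on the full collection):

from collections import Counter, defaultdict

inv_index = defaultdict(list)   # word -> [(docID, term frequency), ...]

# One pass over the documents; Counter gives every term frequency for a
# document in a single scan instead of one z.count(x) call per word.
for docID, tokens in data.items():
    for word, tf in Counter(tokens).items():
        inv_index[word].append((int(docID), tf))

# Document frequency is the postings-list length; iterate the vocabulary in
# sorted order so the offsets line up the same way as before.
total_vocab = sorted(inv_index)
doc_freq = [len(inv_index[word]) for word in total_vocab]

If I went this way, I think I would also have to flatten the postings in sorted-vocabulary order (for word in total_vocab: ...) so the byte offsets stay consistent.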
Can someone help me rewrite this code so that it runs in a reasonable amount of time?
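P.S. I also wondered whether the final loop that calls to_bytes once per integer is slow when there are millions of postings; would packing them all at once with the array module be faster? Something like this (my own guess; it assumes every value fits in a signed 32-bit int):

import sys
from array import array

ints = array('i', posting)        # 'i' is 4 bytes on typical platforms
if sys.byteorder == 'little':
    ints.byteswap()               # keep the big-endian layout used above
with open('binary_file.txt', 'wb') as f:
    ints.tofile(f)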