This is my first post on Stack Overflow and I am quite new to coding, so please bear with me.
I am running an experiment that uses two data files. Doc1 is as follows:
TOPIC:topic_0 5892.0
site 0.0371690427699
Internet 0.0261371350984
online 0.0229124236253
web 0.0218940936864
say 0.0159538357094
image 0.015105227427
TOPIC:topic_1 12366.0
Mr 0.150331554262
s 0.0517548115801
say 0.0451237263464
TOPIC:topic_2 ....
.....
.....
TOPIC:topic_3 1066.0
say 0.062
word 0.182
and so on for 100 topics.
In this document some words appear in every topic, while others appear in only a few topics. I want a process that, whenever a word is missing from a topic, adds that word to the topic with a value of 0. For example, the word BBC occurs in topic_1 but is not present in topic_0, so I want my list to become:
TOPIC:topic_0 5892.0
site 0.0371690427699
Internet 0.0261371350984
online 0.0229124236253
web 0.0218940936864
say 0.0159538357094
image 0.015105227427
Mr 0
s 0
president 0
tell 0
BBC 0
TOPIC:topic_1 12366.0
Mr 0.150331554262
s 0.0517548115801
say 0.0451237263464
president 0.0153647096879
tell 0.0135856380398
BBC 0.0135856380398
site 0
Internet 0
online 0
web 0
say 0
image 0
I have to multiply these values with another set of values from a different document. To do that, I have:
from collections import defaultdict
from itertools import groupby, imap

d = defaultdict(list)
with open("doc1") as f, open("doc2") as f2:
    values = map(float, f2.read().split())
    for line in f:
        if line.strip() and not line.startswith("TOPIC"):
            name, val = line.split()
            d[name].append(float(val))
    for k, v in d.items():
        print("Prob for {} is {}".format(k, sum(i * j for i, j in zip(v, values))))
My doc2 is in the format:
0.566667 0.0333333 0.133333 0 0 0 2.43333 0 0.13333......... till 100 values.
The above code handles the word "say": it finds the topics that contain the word (3 of them here) and collects its values into a list such as [0.015, 0.045, 0.062]. This list is then multiplied element-wise with the values in doc2, so that 0.015 is multiplied by the 0th value in doc2, 0.045 by the 1st value, and 0.062 by the 2nd value. But that is not what I want. As we can see, the word "say" is not present in topic_2, so the list must be [0.015, 0.045, 0, 0.062]. Then, when these values are multiplied by the values at the corresponding positions in doc2, they give:
P(SAY) = (0.566667*0.015) + (0.0333333*0.045) + (0.133333 *0) + (0*0.062)
So the code works well; it only needs this modification.
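To make the target computation concrete, here is a minimal sketch of the dot product I am after (the values are copied from the example above):

# Minimal sketch: the per-topic values for "say", with 0 filled in
# for the topics where the word is missing (topic_2 here).
say_values = [0.015, 0.045, 0, 0.062]
doc2_values = [0.566667, 0.0333333, 0.133333, 0]

p_say = sum(v * w for v, w in zip(say_values, doc2_values))
print(p_say)  # (0.566667*0.015) + (0.0333333*0.045) + (0.133333*0) + (0*0.062)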
Answer 0 (score: 4)
The problem is that you are treating all the TOPICS as one. If you want the sections handled separately, use the groupby code from the original answer: first get a set of all the names, then compare that set to the defaultdict keys to find what is missing from each section:
from collections import defaultdict
from itertools import groupby, imap

d = defaultdict(float)
with open("doc1") as f, open("doc2") as f2:
    values = imap(float, f2.read().split())
    # find every word in every TOPIC
    all_words = {line.split()[0] for line in f if line.strip() and not line.startswith("TOPIC")}
    f.seek(0)  # reset the file pointer
    # lambda x: not(x.strip()) will split into groups on the empty lines
    for ind, (k, v) in enumerate(groupby(f, key=lambda x: not(x.strip()))):
        if not k:
            topic = next(v)
            # get the matching float from values (flt, to avoid shadowing the file handle f)
            flt = next(values)
            # iterate over the group
            for s in v:
                name, val = s.split()
                d[name] += (float(val) * flt)
            # get the difference of all_words vs the words in the current TOPIC,
            # giving 0 as the default for missing values
            for word in all_words - d.viewkeys():
                d[word] = 0
            for k, v in d.iteritems():
                print("Prob for {} is {}".format(k, v))
            d = defaultdict(float)
To store all the output, you can append the dicts to a list:
from collections import defaultdict
from itertools import groupby, imap

d = defaultdict(float)
with open("doc1") as f, open("doc2") as f2:
    values = imap(float, f2.read().split())
    all_words = {line.split()[0] for line in f if line.strip() and not line.startswith("TOPIC")}
    f.seek(0)
    out = []
    # lambda x: not(x.strip()) will split into groups on the empty lines
    for ind, (k, v) in enumerate(groupby(f, key=lambda x: not(x.strip()))):
        if not k:
            topic = next(v)
            # get the matching float from values
            flt = next(values)
            # iterate over the group
            for s in v:
                name, val = s.split()
                d[name] += (float(val) * flt)
            for word in all_words - d.viewkeys():
                d[word] = 0
            out.append(d)
            d = defaultdict(float)
Then iterate over the list:
for top in out:
    for k, v in top.iteritems():
        print("Prob for {} is {}".format(k, v))
Or forget the defaultdict and use dict.fromkeys:
from itertools import groupby, imap

with open("doc1") as f, open("doc2") as f2:
    values = imap(float, f2.read().split())
    all_words = [line.split()[0] for line in f if line.strip() and not line.startswith("TOPIC")]
    f.seek(0)
    out, d = [], dict.fromkeys(all_words, 0.0)
    # lambda x: not(x.strip()) will split into groups on the empty lines
    for ind, (k, v) in enumerate(groupby(f, key=lambda x: not(x.strip()))):
        if not k:
            topic = next(v)
            # get the matching float from values
            flt = next(values)
            # iterate over the group
            for s in v:
                name, val = s.split()
                d[name] += (float(val) * flt)
            out.append(d)
            d = dict.fromkeys(all_words, 0)
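A quick aside on dict.fromkeys, since this variant relies on it: every key is initialised to the same object. That is safe here because 0.0 is immutable, but it is a trap with mutable defaults:

# dict.fromkeys gives every key the same initial value.
d = dict.fromkeys(["site", "web"], 0.0)
print(d)  # {'site': 0.0, 'web': 0.0}

# With a mutable default, all keys share one object.
shared = dict.fromkeys(["a", "b"], [])
shared["a"].append(1)
print(shared)  # {'a': [1], 'b': [1]}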
If you always want the missing words to come last, use a collections.OrderedDict with the first approach, adding the missing words at the end of each dict:
from collections import OrderedDict
from itertools import groupby, imap

d = OrderedDict()  # must be initialised before the loop
with open("doc1") as f, open("doc2") as f2:
    values = imap(float, f2.read().split())
    all_words = {line.split()[0] for line in f if line.strip() and not line.startswith("TOPIC")}
    f.seek(0)
    out = []
    # lambda x: not(x.strip()) will split into groups on the empty lines
    for (k, v) in groupby(f, key=lambda x: not(x.strip())):
        if not k:
            topic = next(v)
            # get the matching float from values
            flt = next(values)
            # iterate over the group
            for s in v:
                name, val = s.split()
                d.setdefault(name, (float(val) * flt))
            for word in all_words.difference(d):
                d[word] = 0
            out.append(d)
            d = OrderedDict()

for top in out:
    for k, v in top.iteritems():
        print("Prob for {} is {}".format(k, v))
Finally, storing in order and keyed by topic:
from collections import OrderedDict
from itertools import groupby, imap

with open("doc1") as f, open("doc2") as f2:
    values = imap(float, f2.read().split())
    all_words = {line.split()[0] for line in f if line.strip() and not line.startswith("TOPIC")}
    f.seek(0)
    out = OrderedDict()
    # lambda x: not(x.strip()) will split into groups on the empty lines
    for (k, v) in groupby(f, key=lambda x: not(x.strip())):
        if not k:
            topic = next(v).rstrip()
            # create an OrderedDict for each topic
            out[topic] = OrderedDict()
            # get the matching float from values
            flt = next(values)
            # iterate over the group
            for s in v:
                name, val = s.split()
                out[topic].setdefault(name, (float(val) * flt))
            # find the words missing from the TOPIC and set them to 0
            for word in all_words.difference(out[topic]):
                out[topic][word] = 0

for k, v in out.items():
    print(k)  # each TOPIC
    for k, v in v.iteritems():
        print("Prob for {} is {}".format(k, v))  # the OrderedDict items
    print("\n")
DOC1:
TOPIC:topic_0 5892.0
site 0.0371690427699
Internet 0.0261371350984
online 0.0229124236253
web 0.0218940936864
say 0.0159538357094
image 0.015105227427
TOPIC:topic_1 12366.0
Mr 0.150331554262
s 0.0517548115801
say 0.0451237263464
president 0.0153647096879
tell 0.0135856380398
BBC 0.0135856380398
DOC2:
0.345 0.566667
Output:
TOPIC:topic_0 5892.0
Prob for site is 0.0128233197556
Prob for Internet is 0.00901731160895
Prob for online is 0.00790478615073
Prob for web is 0.00755346232181
Prob for say is 0.00550407331974
Prob for image is 0.00521130346231
Prob for BBC is 0
Prob for Mr is 0
Prob for s is 0
Prob for president is 0
Prob for tell is 0
TOPIC:topic_1 12366.0
Prob for Mr is 0.085187930859
Prob for s is 0.0293277438137
Prob for say is 0.0255701266375
Prob for president is 0.00870667394471
Prob for tell is 0.0076985327511
Prob for BBC is 0.0076985327511
Prob for web is 0
Prob for image is 0
Prob for online is 0
Prob for site is 0
Prob for Internet is 0
You can apply exactly the same logic with a regular for loop; groupby just does all the grouping work for you.
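For illustration, a rough sketch of that same logic written with a plain for loop (untested, and it assumes the doc1/doc2 layouts shown above):

with open("doc1") as f, open("doc2") as f2:
    values = iter(map(float, f2.read().split()))
    all_words = {line.split()[0] for line in f
                 if line.strip() and not line.startswith("TOPIC")}
    f.seek(0)
    topics = []  # one dict per TOPIC block
    for line in f:
        stripped = line.strip()
        if stripped.startswith("TOPIC"):
            flt = next(values)  # the doc2 value matching this topic
            d = {}
            topics.append(d)
        elif stripped:
            name, val = stripped.split()
            d[name] = float(val) * flt
    for d in topics:
        for word in all_words.difference(d):
            d[word] = 0  # zero-fill the missing words
        for k, v in d.items():
            print("Prob for {} is {}".format(k, v))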
If you actually just want to write the result to a file, the code is simpler again:
from itertools import groupby, imap

with open("doc1") as f, open("doc2") as f2, open("prob.txt", "w") as f3:
    values = imap(float, f2.read().split())
    all_words = {line.split()[0] for line in f if line.strip() and not line.startswith("TOPIC")}
    f.seek(0)
    for (k, v) in groupby(f, key=lambda x: not(x.strip())):
        if not k:
            topic, words = next(v), []
            flt = next(values)
            f3.write(topic)
            for s in v:
                name, val = s.split()
                words.append(name)
                f3.write("{} {}\n".format(name, (float(val) * flt)))
            for word in all_words.difference(words):
                f3.write("{} {}\n".format(word, 0))
            f3.write("\n")
prob.txt:
TOPIC:topic_0 5892.0
site 0.0128233197556
Internet 0.00901731160895
online 0.00790478615073
web 0.00755346232181
say 0.00550407331974
image 0.00521130346231
BBC 0
Mr 0
s 0
president 0
tell 0
TOPIC:topic_1 12366.0
Mr 0.085187930859
s 0.0293277438137
say 0.0255701266375
president 0.00870667394471
tell 0.0076985327511
BBC 0.0076985327511
web 0
image 0
online 0
site 0
Internet 0
Answer 1 (score: 2)
As another concise way to rewrite the blocks, you can store all the names in one set, then build an OrderedDict mapping each block header to its lines, then use set.difference against the full set of names (words) to get the names missing from each block, and write them at the end of that block:
from itertools import tee
from collections import OrderedDict

d = OrderedDict()
with open('input.txt') as f, open('new', 'w') as new:
    f2, f3, f = tee(f, 3)
    next(f3)  # keep f3 one line ahead of f2
    words = {line.split()[0] for line in f if not line.startswith('TOPIC') and line.strip()}
    for line in f2:
        if line.startswith('TOPIC'):
            key = line
            next_line = next(f3)
            try:
                while not next_line.startswith('TOPIC'):
                    d.setdefault(key, []).append(next_line)
                    next_line = next(f3)
            except StopIteration:  # end of the file
                pass
    for k, v in d.items():
        block_words = {line.split()[0] for line in v if line.strip()}
        missed = words.difference(block_words)
        new.writelines([k] + v + ['{} {}\n'.format(i, 0) for i in missed])
Result:
TOPIC:topic_0 5892.0
site 0.0371690427699
Internet 0.0261371350984
online 0.0229124236253
web 0.0218940936864
say 0.0159538357094
image 0.015105227427
president 0
s 0
BBC 0
tell 0
Mr 0
TOPIC:topic_1 12366.0
Mr 0.150331554262
s 0.0517548115801
say 0.0451237263464
president 0.0153647096879
tell 0.0135856380398
BBC 0.0135856380398web 0
image 0
online 0
site 0
Internet 0
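As a side note, a tiny self-contained demo of what itertools.tee does above: it produces independent iterators over one stream, and advancing one with next(f3) keeps it a line ahead of the others:

from itertools import tee

lines = iter(["TOPIC:topic_0\n", "site 0.03\n", "Internet 0.02\n"])
f2, f3 = tee(lines, 2)
next(f3)  # f3 is now one line ahead of f2
print(next(f2))  # "TOPIC:topic_0\n"
print(next(f3))  # "site 0.03\n" -- the line after f2's current position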
Answer 2 (score: 2)
I would first read file1 into a list of {word: value} mappings, building one list element per topic:
with open('Doc1') as f:
    maps = []
    for line in f:
        line = line.strip()
        if line.startswith('TOPIC'):
            mapping = {}
            maps.append(mapping)
        elif len(line) == 0:
            pass
        else:
            k, v = line.split()
            mapping[k] = v
Then I would build the set of all words by taking the union of the keys of all the mappings:
words = set()
for mapping in maps:
    words = words.union(mapping.keys())
Then I would iterate over each mapping and add a value of 0 for every key from the word set that is not already present in that dict:
for mapping in maps:
    for k in words.difference(mapping.keys()):
        mapping[k] = 0
That way, every word is present in every mapping, and building a nice d dict is trivial:
d = {k: list() for k in words}
for mapping in maps:
    for k in mapping:
        d[k].append(float(mapping[k]))
Every word that appears in at least one topic now has a list of 100 values, one per topic: the true value where the word exists, and 0 where it does not. zip will now work as expected.
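To complete the picture, here is a hedged sketch of the final multiplication the question asks for; this step is implied rather than shown in this answer, and it assumes 'Doc2' holds the 100 whitespace-separated values:

# Sketch only: d is the {word: [100 values]} dict built above.
with open('Doc2') as f2:
    doc2_values = [float(x) for x in f2.read().split()]

for word, topic_values in d.items():
    # dot product of the word's per-topic values with doc2
    prob = sum(v * w for v, w in zip(topic_values, doc2_values))
    print("Prob for {} is {}".format(word, prob))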