我是python的新手,我有两个csv文件,一个包含细菌名称
import csv
import pandas as pd
from collections import Counter
import re
import operator
#Bacteria File Open
Bac = []
with open ("/home/shayez/Desktop/Bacteria.csv", "r") as csv_file1:
csv_reader1 = csv.reader(csv_file1,delimiter = ',')
for lines1 in csv_reader1:
Bac.append(lines1)
# print(lines1[0])
#Abstract File Open
Abs = []
with open ("/home/shayez/Desktop/Anti.csv", "r") as csv_file:
csv_reader = csv.reader(csv_file,delimiter = ',')
for lines in csv_reader:
Abs.append(lines[2])
abswordlist = []
for ab in Abs:
abswordlist.append(Counter(ab.split()))
#print (abswordlist)
cntword = Counter(Abs)
for Bac in Bac:
print (f"{Bac}:{abswordlist[Bac]}")
像这样:-
这是细菌文件,其中包含大约2200个细菌名称
我必须将第一个细菌文件名的单词与第二个Abstract列进行比较,并将细菌的频率计数为Abstract并保存到第三个csv
答案 0 :(得分:0)
我建议您使用pandas库执行此任务,因为看来您将要进行很多聚合。
由于您没有提供[mcve],所以我必须自己做一个。 因此,您必须阅读您的第一个csv并将值保留为列表。它们将稍后成为您保留的专栏。
然后...使用此数组。我建议您将.apply()
与split()
和Counter()
(来自python集合)结合使用。
然后,使用join()
json_normalize()
进行所有操作。
import pandas as pd
from collections import Counter
from pandas.io.json import json_normalize
to_keep = ['LONER', 'I', 'AM']
df = pd.DataFrame({
'date' : ['some date', 'some_other_date', 'lol date'],
'garbage' : ['I AM A LONER', 'AND SO AM I LOL', 'some other weird sentence']
})
print(df.head())
# date garbage
# 0 some date I AM A LONER
# 1 some_other_date AND SO AM I LOL
# 2 lol date some other weird sentence
# Here I am showing you the inside of what I insert into json_normalize.
# It basically counts the word occurrences per line. You split the words,
# and count the list items using `Counter()`
print(df['garbage'].apply(lambda x:Counter(x.split())))
# 0 {'I': 1, 'AM': 1, 'A': 1, 'LONER': 1}
# 1 {'AND': 1, 'SO': 1, 'AM': 1, 'I': 1, 'LOL': 1}
# 2 {'some': 1, 'other': 1, 'weird': 1, 'sentence'...
# Then, you use the json_normalize() function to turn all your jsons into a big DataFrame. And join the result to the previously created DataFrame.
df = df.join( json_normalize(df['garbage'].apply(lambda x:Counter(x.split()))) )
print(df)
# date garbage A ... sentence some weird
# 0 some date I AM A LONER 1.0 ... NaN NaN NaN
# 1 some_other_date AND SO AM I LOL NaN ... NaN NaN NaN
# 2 lol date some other weird sentence NaN ... 1.0 1.0 1.0
# And keep the first indices, here, only date, in addition of the columns you wished to keep earlier.
final_df = df[ ['date'] + [*to_keep] ]
print(final_df)
# date LONER I AM
# 0 some date 1.0 1.0 1.0
# 1 some_other_date NaN 1.0 1.0
# 2 lol date NaN NaN NaN