# Sample frame: one row per person, with up to three free-text task columns.
df = pd.DataFrame(
    dict(
        Name=['Ann', 'Juh', 'Jeo', 'Sam'],
        Age=[43, 29, 42, 59],
        Task1=['drafting a letter', 'Sending', 'Pasting', 'Sending'],
        Task2=['Sending', 'Packing', 'Sending', 'Pasting'],
        Task3=['Packing', 'Letter Drafting', 'Packing', 'Letter Drafting'],
    )
)
在上面这个由字符串组成的数据框中,我需要检查给定条件(condition)中各项的出现情况。
# Task phrases to look for. They must be plain ASCII-quoted strings spelled
# exactly like the task names in the frame, otherwise nothing can match.
condition = ["reading", "drafting a letter", "Packing", "pasting", "Sending", "counting"]
为此,我创建了一个新列,将 Task1、Task2、Task3 合并在一起:
# Combine every column from the third one onward into a single
# comma-separated string per row; missing values are dropped before joining.
task_values = df[df.columns[2:]]
df['NewTask'] = task_values.apply(
    lambda row: ','.join(row.dropna().astype(str)),
    axis=1,
)
然后我应用了从以下链接获得的逻辑:
https://www.geeksforgeeks.org/sentence-that-contains-all-the-given-phrases/
我得到
Phrase1:count=0, plus the corresponding index values.
Phrase2:count=1 etc..
现在,我需要在 df 中找出条件里出现次数最多的“句子”,以及最常一起出现的“句子对”。上面给出的数据框只是一个示例。
我用来分别获取每项计数的逻辑是:
def getRes(sent, ph):
    """Print, for each phrase, the sentences that contain all of its words.

    Sentences and phrases are split on whitespace. A sentence matches a
    phrase when every word of the phrase occurs somewhere in the sentence;
    word order and adjacency are not checked.

    Args:
        sent: Iterable of sentence strings (for example a list or a pandas
            Series). Sentences are numbered from 1 in iteration order.
        ph: Iterable of phrase strings.

    Returns:
        None. For every phrase a ``PhraseN:`` header is printed, followed by
        the space-separated 1-based numbers of the matching sentences, or
        ``NONE`` when no sentence matches.
    """
    # Tokenise each sentence once. Iterating (rather than indexing with
    # ``sent[s-1]``) keeps this positional even for a Series whose index is
    # not the default 0..n-1 range.
    sentHash = [set(sentence.split()) for sentence in sent]

    for p, phrase in enumerate(ph, start=1):
        print("Phrase" + str(p) + ":")
        wordList = phrase.split()
        # ``s`` is already the 1-based sentence number, so it is reported
        # as-is; adding 1 again would point at the following sentence.
        res = [
            s
            for s, words in enumerate(sentHash, start=1)
            if all(w in words for w in wordList)
        ]
        if not res:
            print("NONE")
        else:
            print('% s' % ' '.join(map(str, res)))
def main():
    """Run the phrase search over the combined ``NewTask`` column of ``df``."""
    # ``df`` is the module-level frame that carries the ``NewTask`` column.
    sent = df['NewTask']
    condition = ["reading", "drafting a letter", "Packing", "pasting", "Sending", "counting"]
    getRes(sent, condition)


main()
答案 0 :(得分:1)
要按条件统计行数,您可以先筛选数据框,只保留其中任意一项任务满足条件的行,然后统计这些行的数量。
# For each condition, count the rows where at least one of the three task
# columns is exactly equal to it.
task_columns = ['Task1', 'Task2', 'Task3']
condition2 = {}
for criteria in condition:
    row_matches = df[task_columns].eq(criteria).any(axis=1)
    condition2[criteria] = len(df.loc[row_matches])
如果您希望为此使用新列,则可以在该列中查看任务名称,尽管它的功能较差。
# For each condition, count the rows whose combined task string mentions it.
# regex=False makes this a literal substring test, so a task name containing
# regex metacharacters (".", "(", "+", ...) is not misread as a pattern.
condition2 = {}
for criteria in condition:
    mentions = df['NewTask'].str.contains(criteria, regex=False)
    condition2[criteria] = int(mentions.sum())
为了识别常见的任务对,一种方法是使用itertools模块创建任务的每种可能组合,然后计算包含这两个任务的行数。
import itertools
# Materialise the pairs: itertools.combinations returns a one-shot iterator,
# which would be empty the second time it is looped over.
combinations = list(itertools.combinations(condition, 2))
然后您可以用与之前相同的方式,找出同时包含这两项任务的行。
# For every pair of conditions, count the rows whose combined task string
# mentions both. The masks are combined with the boolean AND operator, and
# matching is literal (regex=False) so task names are never read as patterns.
pairs = {}
for first, second in combinations:
    has_first = df['NewTask'].str.contains(first, regex=False)
    has_second = df['NewTask'].str.contains(second, regex=False)
    pairs[(first, second)] = int((has_first & has_second).sum())
要返回出现次数最多的任务对,可以使用以下代码:
# Report the pair with the highest row count together with that count.
best_pair = max(pairs, key=pairs.get)
print(best_pair, pairs[best_pair])