考虑下面的 pandas DataFrame df1。我想把这个 DataFrame 转换为:每个 date 与 concept 的组合占一行,并带有对应的 count(参见 df2)。
import pandas as pd

# Input: one row per document, each carrying a list of concept dicts.
inp_data = [
    {"date": "2020-02-01", "concepts": [{"surfaceForm": "ABC"}, {"surfaceForm": "DEF"}]},
    {"date": "2020-02-01", "concepts": [{"surfaceForm": "ABC"}, {"surfaceForm": "XYZ"}]},
    {"date": "2020-02-02", "concepts": [{"surfaceForm": "XYZ"}]},
]
df1 = pd.DataFrame(inp_data, columns=["date", "concepts"])
# transform df1 into df2...
# Goal: one row per (day, concept) pair, with its occurrence count.
out_data = [
    {"day": "2020-02-01", "concept": "ABC", "count": 2},
    {"day": "2020-02-01", "concept": "DEF", "count": 1},
    {"day": "2020-02-01", "concept": "XYZ", "count": 1},
    {"day": "2020-02-02", "concept": "XYZ", "count": 1},
]
df2 = pd.DataFrame(out_data, columns=["day", "concept", "count"])
请注意:df1 中的 date 在 df2 中变成了 day;df1 的 concepts 列表中的每个对象,在 df2 中都作为单独的 concept 行出现。
我可以通过遍历 df1 的行来硬凑出结果,但这显然有很多性能问题,也不是 pandas 的惯用做法。而且我之后要在大得多的 DataFrame 上运行它,届时这种写法的耗时将无法接受。
作为参考,下面是那种硬凑的方法:
import pandas as pd

# Target column order; 'concept' holds one surfaceForm string per row.
columns = ['concept', 'day']

def concept_occurence(row, columns):
    """Expand one df1 row into a frame with one row per concept occurrence.

    row: a pandas Series with a 'date' entry and a 'concepts' entry
         (list of dicts each carrying a 'surfaceForm' key).
    columns: column order for the returned DataFrame.
    """
    insert_list = [
        {'concept': c['surfaceForm'], 'day': row['date']}
        for c in row['concepts']
    ]
    return pd.DataFrame(insert_list, columns=columns)

# Collect the per-row frames and concatenate ONCE at the end:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0,
# and appending inside the loop is quadratic in the number of rows.
frames = [concept_occurence(row, columns) for _, row in df1.iterrows()]
df2 = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=columns)
答案 0 :(得分:4)
IIUC(如果我理解正确的话),我们可以结合使用 explode 和 pd.Series:
# Explode the lists so each concept dict gets its own row, expand the
# dicts into columns with pd.Series, then count occurrences per
# (date, surfaceForm) pair.
exploded = df1.explode("concepts").set_index("date")["concepts"]
expanded = exploded.apply(pd.Series).reset_index(0)
expanded.groupby(["date", "surfaceForm"]).agg(count=("surfaceForm", "count"))
count
date surfaceForm
2020-02-01 ABC 2
DEF 1
XYZ 1
2020-02-02 XYZ 1
答案 1 :(得分:3)
下面是先用 explode、再用 str.get 的一种做法:
和a = df1.explode('concepts')
out = (a.assign(concepts = a['concepts'].str.get('surfaceForm'))
.groupby(['date','concepts'])['concepts'].count().reset_index(name='Count'))
print(out)
输出如下:
date concepts Count
0 2020-02-01 ABC 2
1 2020-02-01 DEF 1
2 2020-02-01 XYZ 1
3 2020-02-02 XYZ 1
# Text-classification pipeline: bag-of-words -> tf-idf -> AdaBoost.
clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', AdaBoostClassifier()),
])

# Grid over the AdaBoost hyper-parameters only; the vectorizer and
# tf-idf steps keep their defaults.
parameters = {
    'clf__n_estimators': [20, 50, 70, 100],
    'clf__learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3],
}

# Stratified folds preserve class proportions; fixed seed for
# reproducibility. n_jobs=-1 uses all available cores.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)
gs_clf = GridSearchCV(clf, parameters, cv=kfold, n_jobs=-1)
gs_clf = gs_clf.fit(twenty_train.data, twenty_train.target)

# Fixed typo in the printed message: "accurracy" -> "accuracy".
print("Best score accuracy = %.3f%%" % ((gs_clf.best_score_) * 100.0))
print("Best parameters are : ")
print(gs_clf.best_params_)