I have a dataframe with one column of text and one column of keywords.
>>> main_df.head(3)
+-------+------------------------------------------+-------------------------------------+
| Index | Text                                     | Keywords                            |
+-------+------------------------------------------+-------------------------------------+
| 1     | "Here is some text"                      | ["here","text"]                     |
| 2     | "Some red birds and blue elephants"      | ["red", "bird", "blue", "elephant"] |
| 3     | "Please help me with my pandas problem"  | ["help", "pandas", "problem"]       |
+-------+------------------------------------------+-------------------------------------+
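For reference, a minimal construction of the sample frame above (the answer's harness below builds the same frame; note that pandas' default index starts at 0 rather than the 1 shown in the table):

import pandas as pd

main_df = pd.DataFrame({
    "Text": ["Here is some text",
             "Some red birds and blue elephants",
             "Please help me with my pandas problem"],
    "Keywords": [["here", "text"],
                 ["red", "bird", "blue", "elephant"],
                 ["help", "pandas", "problem"]],
})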
I use itertools.combinations to build a dataframe with all possible keyword pairs.
>>> edge_df.head(3)
+-------+--------+--------+
| Index | Src    | Dst    |
+-------+--------+--------+
| 1     | "here" | "text" |
| 2     | "here" | "red"  |
| 3     | "here" | "bird" |
+-------+--------+--------+
I then apply a function that walks over every keyword pair and stores a value in edge_df['weight']: the number of texts whose keyword lists contain both members of the pair.
>>> edge_df.head(3)
+-------+--------+--------+--------+
| Index | Src    | Dst    | Weight |
+-------+--------+--------+--------+
| 1     | "here" | "text" | 1      |
| 2     | "here" | "red"  | 3      |
| 3     | "here" | "bird" | 8      |
+-------+--------+--------+--------+
My problem is that the code is currently very slow (about an hour for 300 rows of short text). Below is the code I use to build edge_df and apply the function. What can I do to speed it up?
from itertools import combinations

import pandas as pd
from tqdm import tqdm

tqdm.pandas()  # registers DataFrame.progress_apply, used below

def indexes_by_word(word1, word2):
    """
    Find the matching texts between two words.
    """
    indx1 = set(df[df['Keywords'].apply(lambda lst: word1 in lst)].index)
    indx2 = set(df[df['Keywords'].apply(lambda lst: word2 in lst)].index)
    return len(indx1.intersection(indx2))

# Make list of all unique words
unique_words = df['Keywords'].apply(pd.Series).stack().reset_index(drop=True).unique()

# Make an empty edgelist dataframe of our words
edges = pd.DataFrame(data=list(combinations(unique_words, 2)),
                     columns=['src', 'dst'])

edges['weight'] = edges.progress_apply(lambda x: indexes_by_word(x['src'], x['dst']), axis=1)
edges.head()
Answer 0 (score: 1)
Pulling the apply out of indexes_by_word only bought about 10%. In any case, here is a harness to A/B your code. Hoping to see other optimizations.
import pandas as pd
import numpy as np
from itertools import combinations
import timeit

df = pd.DataFrame([{"Text": "Here is some text", "Keywords": ["here", "text"]},
                   {"Text": "Some red birds and blue elephants", "Keywords": ["red", "bird", "blue", "elephant"]},
                   {"Text": "Please help me with my pandas problem", "Keywords": ["help", "pandas", "problem"]}])

# https://stackoverflow.com/questions/12680754/split-explode-pandas-dataframe-string-entry-to-separate-rows/40449726#40449726
def explode(df, lst_cols, fill_value='', preserve_index=False):
    # make sure `lst_cols` is list-alike
    if (lst_cols is not None
            and len(lst_cols) > 0
            and not isinstance(lst_cols, (list, tuple, np.ndarray, pd.Series))):
        lst_cols = [lst_cols]
    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)
    # calculate lengths of lists
    lens = df[lst_cols[0]].str.len()
    # preserve original index values
    idx = np.repeat(df.index.values, lens)
    # create "exploded" DF
    res = (pd.DataFrame({
               col: np.repeat(df[col].values, lens)
               for col in idx_cols},
               index=idx)
           .assign(**{col: np.concatenate(df.loc[lens > 0, col].values)
                      for col in lst_cols}))
    # append those rows that have empty lists
    if (lens == 0).any():
        # at least one list in cells is empty
        res = (res.append(df.loc[lens == 0, idx_cols], sort=False)
               .fillna(fill_value))
    # revert the original index order
    res = res.sort_index()
    # reset index if requested
    if not preserve_index:
        res = res.reset_index(drop=True)
    return res

keyword_index = explode(df, ['Keywords'], preserve_index=True)['Keywords']
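For the three sample rows, keyword_index comes out as one entry per keyword, indexed by the row it came from:

>>> keyword_index
0        here
0        text
1         red
1        bird
1        blue
1    elephant
2        help
2      pandas
2     problem
Name: Keywords, dtype: object

This is what lets indexes_by_word_second below replace the per-pair scan of the whole frame with two cheap comparisons against a flat Series.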
def first(df):
    def indexes_by_word_first(word1, word2):
        """
        Find the matching texts between two words.
        """
        indx1 = set(df[df['Keywords'].apply(lambda lst: word1 in lst)].index)
        indx2 = set(df[df['Keywords'].apply(lambda lst: word2 in lst)].index)
        return len(indx1.intersection(indx2))

    # Make list of all unique words
    unique_words = df['Keywords'].apply(pd.Series).stack().reset_index(drop=True).unique()

    # Make an empty edgelist dataframe of our words
    edges = pd.DataFrame(data=list(combinations(unique_words, 2)),
                         columns=['src', 'dst'])

    edges['weight'] = edges.apply(lambda x: indexes_by_word_first(x['src'], x['dst']), axis=1)
    return edges

def second(df):
    def indexes_by_word_second(word1, word2):
        """
        Find the matching texts between two words.
        """
        indx1 = set(keyword_index[keyword_index == word1].index.values)
        indx2 = set(keyword_index[keyword_index == word2].index.values)
        return len(indx1.intersection(indx2))

    # Make list of all unique words
    unique_words = df['Keywords'].apply(pd.Series).stack().reset_index(drop=True).unique()

    # Make an empty edgelist dataframe of our words
    edges = pd.DataFrame(data=list(combinations(unique_words, 2)),
                         columns=['src', 'dst'])

    edges['weight'] = edges.apply(lambda x: indexes_by_word_second(x['src'], x['dst']), axis=1)
    return edges

if __name__ == '__main__':
    assert first(df).equals(second(df))
    print("first ", timeit.timeit("first(df)", setup="from __main__ import first, df", number=50))
    print("second ", timeit.timeit("second(df)", setup="from __main__ import second, df", number=50))
Producing:
first 1.8623420829999997
second 1.7135651139999997
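One further optimization the harness could A/B, sketched here rather than taken from the original answer: tally the pairs in a single pass over the rows with collections.Counter, instead of rescanning the data for every one of the O(n²) keyword combinations. Note it is not a drop-in for the assert above, since it emits only pairs with nonzero weight, in first-seen order.

from collections import Counter
from itertools import combinations

import pandas as pd

def third(df):
    # Count every co-occurring keyword pair in one pass over the rows.
    weights = Counter()
    for keywords in df['Keywords']:
        # set() drops duplicate keywords within a row; sorted() gives each
        # pair a single canonical orientation, so ("bird", "red") and
        # ("red", "bird") share one counter bucket.
        for pair in combinations(sorted(set(keywords)), 2):
            weights[pair] += 1
    return pd.DataFrame([(src, dst, w) for (src, dst), w in weights.items()],
                        columns=['src', 'dst', 'weight'])

Because the work is proportional to the number of keywords that actually co-occur rather than to all possible pairs times all rows, this should scale far better on the 300-row case described in the question.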