我们需要根据时间顺序创建分组。
我们正在使用 dask,但由于 dask 中尚未实现 `transform`,这一步只能退回到 pandas 来完成。虽然该函数可以正常工作,但它的性能还能进一步改进吗?(我们的代码通常在本地 `Client` 上运行,有时也在 `yarn-client` 上运行。)
下面是我们的函数,以及一个最小、完整且可验证的示例:
import pandas as pd
import numpy as np
import random
import dask
import dask.dataframe as dd
from datetime import timedelta
def create_groups_from_time_sequence(df, col_id: str = None, col_time: str = None, time_threshold: str = '120s',
                                     min_weight: int = 2) -> 'Union[pd.DataFrame, dd.DataFrame]':
    """
    Split each id's rows into time-based clusters and keep only the large ones.

    Rows are sorted by ``col_id`` and ``col_time``. Within each id, a new
    cluster starts whenever the gap to the previous row exceeds
    ``time_threshold``. Clusters are numbered per id starting at 0 (``EdgeID``),
    and each row is tagged with the size of its cluster (``cluster_weight``).

    :param df: pandas or dask dataframe; a dask dataframe is fully computed
        into memory first
    :param col_id: name of the column identifying the unit to group by
    :param col_time: name of the column holding the datetime of each query;
        values that are not already datetime are parsed with ``pd.to_datetime``
    :param time_threshold: maximum gap between consecutive queries of one id
        for them to stay in the same cluster (anything ``pd.Timedelta`` accepts)
    :param min_weight: clusters are kept only when their row count is strictly
        greater than this value
    :return: the filtered rows with ``EdgeID`` and ``cluster_weight`` added;
        a dask dataframe indexed by ``EdgeID`` (same number of partitions as
        the input) when the input was dask, otherwise a pandas dataframe with
        a fresh RangeIndex
    """
    partitions = None
    if isinstance(df, dd.DataFrame):
        partitions = df.npartitions
        df = df.compute()

    # Parse only when the column is not already datetime. `assign` returns a
    # new frame, so a caller-supplied pandas frame is never modified in place.
    if not pd.api.types.is_datetime64_any_dtype(df[col_time]):
        df = df.assign(**{col_time: pd.to_datetime(df[col_time])})

    threshold = pd.Timedelta(time_threshold)
    df = df.sort_values([col_id, col_time])

    # Built-in groupby diff/cumsum run in compiled code; the result matches
    # transform(lambda ...) without one Python call per group.
    gap_exceeded = df.groupby(col_id)[col_time].diff() > threshold
    df['EdgeID'] = gap_exceeded.astype(int).groupby(df[col_id]).cumsum()
    df['cluster_weight'] = df.groupby([col_id, 'EdgeID'])['EdgeID'].transform('count')

    df = df[df['cluster_weight'] > min_weight].reset_index(drop=True)

    if partitions is not None:
        df = dd.from_pandas(df, npartitions=partitions)
        # NOTE(review): the original indentation was lost; set_index is assumed
        # to apply to the dask result only - confirm against callers.
        df = df.set_index('EdgeID')
    return df
在 dask 示例数据集上使用上述函数:
# Build a demo frame from dask's synthetic time series, keeping two columns.
df_raw = dask.datasets.timeseries()
df = df_raw[['id', 'name']]
# Copy the datetime index into a regular column so it can be modified.
df = df.assign(timegroup=df.index)
# Shift every timestamp by a random 0-60 seconds. `meta` declares the output
# name and dtype up front so dask does not have to infer them by running the
# lambda on dummy data.
df['timegroup'] = df['timegroup'].apply(
    lambda s: s + timedelta(seconds=random.randint(0, 60)),
    meta=('timegroup', 'datetime64[ns]'),
)
df.head()
| timestamp | id | name | timegroup |
|--------------------- |------ |-------- |--------------------- |
| 2000-01-01 00:00:00 | 968 | Alice | 2000-01-01 00:00:46 |
| 2000-01-01 00:00:01 | 1030 | Xavier | 2000-01-01 00:00:22 |
| 2000-01-01 00:00:02 | 991 | George | 2000-01-01 00:00:59 |
| 2000-01-01 00:00:03 | 975 | Zelda | 2000-01-01 00:00:26 |
| 2000-01-01 00:00:04 | 1028 | Zelda | 2000-01-01 00:00:18 |
# Cluster each id's rows by time gap on the demo frame and preview the result.
dfg = create_groups_from_time_sequence(
    df,
    col_id='id',
    col_time='timegroup',
    time_threshold='120s',
    min_weight=2,
)
dfg.head()
| EdgeID | id | name | timegroup | cluster_weight |
|-------- |------ |--------- |--------------------- |---------------- |
| 0 | 960 | Norbert | 2000-01-01 00:01:10 | 3 |
| 0 | 969 | Sarah | 2000-01-01 00:03:32 | 7 |
| 0 | 1013 | Michael | 2000-01-01 00:02:58 | 8 |
| 0 | 963 | Ray | 2000-01-01 00:05:58 | 5 |
| 0 | 996 | Ray | 2000-01-01 00:03:41 | 6 |