以下方法有效,但在我看来似乎过于复杂。有没有更简单的方法来计算时差和汇总统计信息?我特别希望替换for循环
import pandas as pd
import numpy as np
# Read in the csv file using the 'record_id' field as the index, keeping only the timestamp
df = pd.read_csv("my_data.csv", sep=',', index_col='record_id', usecols=["record_id", "timestamp"])
# Group them by record_id
record_id_grouping = df.groupby("record_id")
# Create a list of data frames, each with a different record_id
df_list = [x for _, x in record_id_grouping]
new_df_list = []
# Iterate over the list of data frames
for df in df_list:
# Add a time difference column
df['diff'] = df["timestamp"].diff()
# Drop the timestamp column and any data frame rows with NaN
df = df.loc[:,["diff"]].dropna()
# Append the new data frame to a new list
new_df_list.append(df)
# Remove any data frames from the list that are empty
new_df_list = [df for df in new_df_list if df.empty == False]
# Put all the data frames in the list back into a single data frame
new_df = pd.concat(new_df_list)
# Calculate mean, std, max, min and count for each record_id in the data frame
final_df = new_df.groupby("record_id").agg(['mean', 'std', 'max', 'min', 'count'])
# Drop the diff level
final_df.columns = final_df.columns.droplevel()
# Drop any rows that have Nan in them.
final_df = final_df.dropna()