I am currently working through the tutorial at https://www.tensorflow.org/tutorials/structured_data/time_series. However, I would like to know how to modify the code for a dataset that has multiple group_ids, where the time-series windows should only be generated from rows that share the same group_id.
Below is my current code based on the tutorial; my changes are in make_dataset.
I would also like to know whether there is a better way to do this, because make_dataset takes far too long on my real dataset.
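To make the requirement concrete, here is a minimal sketch of what I mean by "windows only within one group" (the toy frame, column names and window length below are placeholders I made up; only the idea matters): every call to tf.keras.preprocessing.timeseries_dataset_from_array should only ever see the rows of a single group_id, so no window can mix rows from two groups.

import numpy as np
import pandas as pd
import tensorflow as tf

# Two tiny groups; a window of length 3 must never span the A/B boundary.
toy = pd.DataFrame({
    'value': np.arange(10, dtype=np.float32),
    'group_id': ['A'] * 6 + ['B'] * 4,
})

# One windowed dataset per group, so windows are built from one group only.
per_group = [
    tf.keras.preprocessing.timeseries_dataset_from_array(
        data=g[['value']].to_numpy(),
        targets=None,
        sequence_length=3,
        batch_size=4,
        shuffle=False)
    for _, g in toy.groupby('group_id')
]

for ds in per_group:
    for batch in ds:
        print(batch.numpy().squeeze(-1))  # every window stays inside one group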
I only changed this specific part so that the windowing is done per group.
# Create a dummy df with four feature columns and a group_id
import pandas as pd
import numpy as np

df = pd.DataFrame(np.random.rand(100, 4), columns=list('ABCD'))
df["group_id"] = pd.Series(['A']*25 + ['B']*20 + ['C']*20 + ['D']*35)

# Give every group its own contiguous daily timestamp range
for gid in df.group_id.unique():
    df.loc[df.group_id == gid, 'timestamp'] = pd.date_range(
        start='1/1/2018', periods=df[df.group_id == gid].shape[0])
df = df.set_index(['group_id', 'timestamp'])

# Add a simple per-group trend (cumcount/10) so the series differ per group
df['cumcount'] = df.groupby('group_id').cumcount()
for col in df.columns:
    df[col] = df[col] / 100 + df['cumcount'] / 10
df = df.drop(['cumcount'], axis=1)
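Just as a sanity check on the dummy frame (illustrative only, nothing here is needed later):

# The frame has a (group_id, timestamp) MultiIndex, four float feature
# columns, and per-group lengths of 25/20/20/35.
print(df.index.names)               # ['group_id', 'timestamp']
print(df.columns.tolist())          # ['A', 'B', 'C', 'D']
print(df.groupby('group_id').size())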
import tensorflow as tf
import matplotlib.pyplot as plt
import IPython
import IPython.display
column_indices = {name: i for i, name in enumerate(df.columns)}
n = len(df)
train_df = df[0:int(n*0.7)]
val_df = df[int(n*0.7):int(n*0.9)]
test_df = df[int(n*0.9):]
num_features = df.shape[1]
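Note that this 70/20/10 split is purely positional, exactly as in the tutorial, so it cuts across the stacked groups of the dummy data; the small check below only illustrates which groups land in which split:

# With groups stacked as A, B, C, D, a positional split puts all of A/B/C plus
# the start of D into train, and the rest of D into val/test.
for name, part in [('train', train_df), ('val', val_df), ('test', test_df)]:
    print(name, part.index.get_level_values('group_id').value_counts().to_dict())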
class WindowGenerator():
    def __init__(self, input_width, label_width, shift,
                 train_df=train_df, val_df=val_df, test_df=test_df,
                 label_columns=None):
        # Store the raw data.
        self.train_df = train_df
        self.val_df = val_df
        self.test_df = test_df

        # Work out the label column indices.
        self.label_columns = label_columns
        if label_columns is not None:
            self.label_columns_indices = {name: i for i, name in
                                          enumerate(label_columns)}
        self.column_indices = {name: i for i, name in
                               enumerate(train_df.columns)}

        # Work out the window parameters.
        self.input_width = input_width
        self.label_width = label_width
        self.shift = shift

        self.total_window_size = input_width + shift

        self.input_slice = slice(0, input_width)
        self.input_indices = np.arange(self.total_window_size)[self.input_slice]

        self.label_start = self.total_window_size - self.label_width
        self.labels_slice = slice(self.label_start, None)
        self.label_indices = np.arange(self.total_window_size)[self.labels_slice]

    def __repr__(self):
        return '\n'.join([
            f'Total window size: {self.total_window_size}',
            f'Input indices: {self.input_indices}',
            f'Label indices: {self.label_indices}',
            f'Label column name(s): {self.label_columns}'])
def plot(self, model=None, plot_col='A', max_subplots=3):
    # Note: `self.example` (an example batch of inputs/labels) is defined
    # later in the tutorial as a property that pulls a batch from the
    # training dataset.
    inputs, labels = self.example
    plt.figure(figsize=(12, 8))
    plot_col_index = self.column_indices[plot_col]
    max_n = min(max_subplots, len(inputs))
    for n in range(max_n):
        plt.subplot(3, 1, n+1)
        plt.ylabel(f'{plot_col} [normed]')
        plt.plot(self.input_indices, inputs[n, :, plot_col_index],
                 label='Inputs', marker='.', zorder=-10)

        if self.label_columns:
            label_col_index = self.label_columns_indices.get(plot_col, None)
        else:
            label_col_index = plot_col_index

        if label_col_index is None:
            continue

        plt.scatter(self.label_indices, labels[n, :, label_col_index],
                    edgecolors='k', label='Labels', c='#2ca02c', s=64)
        if model is not None:
            predictions = model(inputs)
            plt.scatter(self.label_indices, predictions[n, :, label_col_index],
                        marker='X', edgecolors='k', label='Predictions',
                        c='#ff7f0e', s=64)

        if n == 0:
            plt.legend()

    plt.xlabel('Time [h]')

WindowGenerator.plot = plot
def split_window(self, features):
    inputs = features[:, self.input_slice, :]
    labels = features[:, self.labels_slice, :]
    if self.label_columns is not None:
        labels = tf.stack(
            [labels[:, :, self.column_indices[name]] for name in self.label_columns],
            axis=-1)

    # Slicing doesn't preserve static shape information, so set the shapes
    # manually. This way the `tf.data.Datasets` are easier to inspect.
    inputs.set_shape([None, self.input_width, None])
    labels.set_shape([None, self.label_width, None])

    return inputs, labels

WindowGenerator.split_window = split_window
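For context, this is how I instantiate the class while experimenting (the window sizes and label column are arbitrary values I picked for the dummy data):

# 6 input steps, predict 1 step, 1 step ahead; labels taken from column 'A'.
w = WindowGenerator(input_width=6, label_width=1, shift=1,
                    label_columns=['A'])
print(w)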
The following is based on the code provided in the tutorial: process_dataset is the tutorial's original make_dataset, and my per-group logic lives in the make_dataset below it.
def process_dataset(self, data):
    # This is the tutorial's original make_dataset: window a single,
    # contiguous block of rows.
    data = np.array(data, dtype=np.float32)
    ds = tf.keras.preprocessing.timeseries_dataset_from_array(
        data=data,
        targets=None,
        sequence_length=self.total_window_size,
        sequence_stride=1,
        shuffle=True,
        batch_size=32)

    ds = ds.map(self.split_window)

    return ds

WindowGenerator.process_dataset = process_dataset


def make_dataset(self, data):
    # Build one windowed dataset per group_id so that no window crosses a
    # group boundary, then concatenate them into a single tf.data.Dataset.
    datasets = data.groupby('group_id').apply(lambda x: self.process_dataset(x))
    ds = datasets.iloc[0]
    for i in range(1, data.index.get_level_values(0).nunique()):
        ds = ds.concatenate(datasets.iloc[i])
    return ds

WindowGenerator.make_dataset = make_dataset
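And this is roughly how I test it (again only a sketch; the WindowGenerator is re-created here so the snippet stands alone, and I pass the full dummy df because in the positional train_df above group D keeps only 5 rows, which is shorter than the 7-step window):

w = WindowGenerator(input_width=6, label_width=1, shift=1, label_columns=['A'])

# Build the concatenated per-group dataset and inspect one batch.
# Feature dim is 4 (columns A-D); label dim is 1 because label_columns=['A'].
ds = w.make_dataset(df)
for inputs, labels in ds.take(1):
    # On this dummy data the first batch holds all 19 windows from group A,
    # so the shapes come out as (19, 6, 4) and (19, 1, 1).
    print(inputs.shape, labels.shape)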