TensorFlow RNN 的数据预处理

时间:2020-11-07 13:32:59

标签: python pandas dataframe tensorflow

我目前正在学习这个教程:https://www.tensorflow.org/tutorials/structured_data/time_series 。但是,我想知道如何修改代码,使其适用于包含多个 group_id 的数据集,并且只在同一个 group_id 内生成时间序列窗口。

以下是我基于该教程编写的当前代码,我修改的部分在 make_dataset 中。

我想知道是否有更好的方法,因为 make_dataset 在我的实际数据集上运行时间太长了。

import tkinter as tk

# Divisor applied to the canvas width and height to obtain the size of one
# grid cell; used both when drawing the grid and when locating a click.
lineNumber = 10

def create_grid(event=None):
    """Redraw the grid lines so they match the canvas's current size.

    Every item tagged ``grid_line`` is removed and a fresh set of vertical
    and horizontal lines is drawn, spaced ``width // lineNumber`` and
    ``height // lineNumber`` pixels apart.

    Args:
        event: the Tk event that triggered the redraw; unused, so the
            function can also be called directly.
    """
    w = c.winfo_width()  # Current width of the canvas
    h = c.winfo_height()  # Current height of the canvas
    c.delete('grid_line')  # Removes only the previously drawn grid lines

    # A canvas smaller than lineNumber pixels would give a spacing of 0,
    # which range() rejects with ValueError; fall back to 1 pixel.
    step_x = max(1, w // lineNumber)
    step_y = max(1, h // lineNumber)

    # Vertical lines, one every step_x pixels
    for i in range(0, w, step_x):
        c.create_line([(i, 0), (i, h)], tag='grid_line')

    # Horizontal lines, one every step_y pixels
    for i in range(0, h, step_y):
        c.create_line([(0, i), (w, i)], tag='grid_line')

# Main window holding a single white canvas that grows and shrinks with it.
root = tk.Tk()

c = tk.Canvas(root, width=500, height=500, bg='white')
c.pack(expand=True, fill=tk.BOTH)

# Redraw the grid every time the canvas changes size.
c.bind('<Configure>', create_grid)

def onClick(event):
    """Fill the grid cell under a left mouse click with red.

    The cell size is the canvas size divided by ``lineNumber``; the clicked
    cell's column and row are printed and a filled red square covering that
    cell is drawn on the canvas.

    Args:
        event: Tk button event; ``event.num`` selects the button and
            ``event.x`` / ``event.y`` give the click position.
    """
    if event.num != 1:  # Only the left button paints a cell
        return

    cLenX = c.winfo_width() // lineNumber
    cLenY = c.winfo_height() // lineNumber
    if cLenX == 0 or cLenY == 0:
        # Canvas smaller than lineNumber pixels: there is no cell to fill,
        # and dividing by the zero cell size would raise ZeroDivisionError.
        return

    # Integer column/row of the clicked cell.
    cubeX = event.x // cLenX
    cubeY = event.y // cLenY

    print(cubeX, cubeY)

    left, top = cLenX * cubeX, cLenY * cubeY
    right, bottom = left + cLenX, top + cLenY

    # canvas.create_polygon(x0, y0, x1, y1,...xn, yn, options)
    c.create_polygon(left, top, right, top, right, bottom, left, bottom,
                     fill="red")

# Send every mouse-button press in the window to onClick (which filters for
# the left button itself), then hand control to Tk's event loop.
root.bind("<Button>", onClick)
root.mainloop()

我只针对这一部分做了修改,以便按 group_id 分组处理。

# Create a dummy DataFrame: four random feature columns for four groups of
# different lengths, indexed by (group_id, timestamp).
import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.rand(100, 4), columns=list('ABCD'))
df["group_id"] = pd.Series(['A']*25 + ['B']*20 + ['C']*20 + ['D']*35)

# Position of every row inside its own group (0, 1, 2, ...).
_row_in_group = df.groupby('group_id').cumcount()

# Each group gets its own daily series starting on 2018-01-01.  Building the
# column in one vectorised step avoids a per-group boolean mask and keeps the
# column a proper datetime dtype from the start.
df['timestamp'] = (pd.Timestamp('2018-01-01')
                   + pd.to_timedelta(_row_in_group, unit='D'))
df = df.set_index(['group_id', 'timestamp'])

# Scale the noise down and add a linear trend that restarts in every group.
# The positions are passed as a plain array because their index no longer
# matches the frame's new MultiIndex.
df = df.div(100).add(_row_in_group.to_numpy() / 10, axis=0)

import tensorflow as tf
import matplotlib.pyplot as plt
import IPython
import IPython.display
# Map every feature column name to its position in the frame.
column_indices = dict(zip(df.columns, range(len(df.columns))))

# Positional 70% / 20% / 10% split into training, validation and test frames.
# NOTE(review): the cut points are row positions over the whole frame, not
# per group_id -- confirm that is the intended split.
n = len(df)
_train_end, _val_end = int(n*0.7), int(n*0.9)
train_df = df.iloc[:_train_end]
val_df = df.iloc[_train_end:_val_end]
test_df = df.iloc[_val_end:]

num_features = len(df.columns)
class WindowGenerator():
  """Describe a sliding window over a time series and hold the data splits.

  A window spans ``input_width + shift`` consecutive positions.  The first
  ``input_width`` positions are the inputs and the last ``label_width``
  positions are the labels.
  """

  def __init__(self, input_width, label_width, shift,
               train_df=train_df, val_df=val_df, test_df=test_df,
               label_columns=None):
    """Keep the splits and derive the window layout.

    Args:
      input_width: number of leading window positions used as inputs.
      label_width: number of trailing window positions used as labels.
      shift: added to ``input_width`` to obtain the total window size.
      train_df, val_df, test_df: the data splits; the defaults are the
        module-level frames that exist when the class is defined.
      label_columns: optional names of the columns used as labels.
    """
    # Window parameters.
    self.input_width = input_width
    self.label_width = label_width
    self.shift = shift
    self.total_window_size = input_width + shift

    # Inputs occupy the head of the window.
    self.input_slice = slice(0, input_width)
    self.input_indices = np.arange(self.total_window_size)[self.input_slice]

    # Labels occupy the tail of the window.
    self.label_start = self.total_window_size - self.label_width
    self.labels_slice = slice(self.label_start, None)
    self.label_indices = np.arange(self.total_window_size)[self.labels_slice]

    # Raw data.
    self.train_df, self.val_df, self.test_df = train_df, val_df, test_df

    # Column lookups: label name -> position among the labels (only when
    # label columns were given) and column name -> position in the data.
    self.label_columns = label_columns
    if label_columns is not None:
      self.label_columns_indices = dict(
          (name, position) for position, name in enumerate(label_columns))
    self.column_indices = dict(
        (name, position) for position, name in enumerate(train_df.columns))

  def __repr__(self):
    """Return a multi-line summary of the window layout."""
    summary = (
        f'Total window size: {self.total_window_size}',
        f'Input indices: {self.input_indices}',
        f'Label indices: {self.label_indices}',
        f'Label column name(s): {self.label_columns}',
    )
    return '\n'.join(summary)

def plot(self, model=None, plot_col='A', max_subplots=3):
  """Plot inputs, labels and optional predictions for a few example windows.

  One subplot is drawn per example window, stacked vertically.

  Args:
    model: optional callable applied to the example inputs; when given, its
      output is drawn as prediction markers.
    plot_col: name of the column to plot; must be a key of
      ``self.column_indices``.
    max_subplots: upper bound on the number of windows plotted.

  Note:
    ``self.example`` is read as an ``(inputs, labels)`` pair; it is not
    defined in this file -- TODO confirm it is provided elsewhere.
  """
  inputs, labels = self.example
  plt.figure(figsize=(12, 8))
  plot_col_index = self.column_indices[plot_col]
  max_n = min(max_subplots, len(inputs))

  # The label column position does not depend on the window being drawn,
  # so resolve it once.  None means plot_col is not one of the labels.
  if self.label_columns:
    label_col_index = self.label_columns_indices.get(plot_col, None)
  else:
    label_col_index = plot_col_index

  # Run the model once for all windows instead of once per subplot, and only
  # when its output will actually be drawn.
  predictions = None
  if model is not None and label_col_index is not None and max_n > 0:
    predictions = model(inputs)

  for n in range(max_n):
    # Size the grid from max_n: a fixed 3-row grid rejects a 4th subplot
    # when max_subplots is larger than 3.
    plt.subplot(max_n, 1, n+1)
    plt.ylabel(f'{plot_col} [normed]')
    plt.plot(self.input_indices, inputs[n, :, plot_col_index],
             label='Inputs', marker='.', zorder=-10)

    if label_col_index is None:
      continue

    plt.scatter(self.label_indices, labels[n, :, label_col_index],
                edgecolors='k', label='Labels', c='#2ca02c', s=64)
    if predictions is not None:
      plt.scatter(self.label_indices, predictions[n, :, label_col_index],
                  marker='X', edgecolors='k', label='Predictions',
                  c='#ff7f0e', s=64)

    if n == 0:
      plt.legend()

  plt.xlabel('Time [h]')

WindowGenerator.plot=plot
def split_window(self, features):
  """Split a batch of windows into an (inputs, labels) pair.

  Args:
    features: rank-3 tensor of windows; axis 1 is the position inside the
      window and axis 2 the feature column.

  Returns:
    A tuple ``(inputs, labels)``.  ``inputs`` holds the positions selected by
    ``self.input_slice`` and ``labels`` those selected by
    ``self.labels_slice``; when ``self.label_columns`` is set, ``labels``
    keeps only those columns, in that order.
  """
  labels = features[:, self.labels_slice, :]
  if self.label_columns is not None:
    wanted = [self.column_indices[name] for name in self.label_columns]
    labels = tf.stack([labels[:, :, column] for column in wanted], axis=-1)

  inputs = features[:, self.input_slice, :]

  # Slicing loses the static shape information; restore the known window
  # lengths so the resulting datasets are easier to inspect.
  inputs.set_shape([None, self.input_width, None])
  labels.set_shape([None, self.label_width, None])

  return inputs, labels

WindowGenerator.split_window = split_window

以下是根据教程提供的代码编写的部分:

def process_dataset(self, data):
  """Turn the rows of one series into a dataset of (inputs, labels) batches.

  Args:
    data: array-like holding the rows of a single series; it is converted
      to a float32 NumPy array.

  Returns:
    The dataset built by ``timeseries_dataset_from_array`` (windows of
    ``self.total_window_size`` rows, stride 1, shuffled, batch size 32)
    with ``self.split_window`` mapped over it.
  """
  data = np.array(data, dtype=np.float32)
  ds = tf.keras.preprocessing.timeseries_dataset_from_array(
      data=data,
      targets=None,
      sequence_length=self.total_window_size,
      sequence_stride=1,
      shuffle=True,
      batch_size=32,)

  ds = ds.map(self.split_window)
  return ds


def make_dataset(self, data):
  """Build one dataset whose windows never cross a ``group_id`` boundary.

  Every group is windowed on its own with ``self.process_dataset`` and the
  per-group datasets are concatenated in the order ``groupby`` yields them,
  so batches from different groups are not interleaved.

  Args:
    data: DataFrame that can be grouped by ``'group_id'``.

  Returns:
    The concatenation of the per-group datasets.

  Raises:
    ValueError: if ``data`` contains no group at all.
  """
  # Iterating the groupby directly avoids routing non-pandas objects through
  # groupby.apply and then indexing a string-keyed Series by integer.
  group_datasets = [self.process_dataset(group)
                    for _, group in data.groupby('group_id')]
  if not group_datasets:
    raise ValueError('make_dataset() needs data with at least one group_id')

  ds = group_datasets[0]
  for group_ds in group_datasets[1:]:
    ds = ds.concatenate(group_ds)
  return ds


# make_dataset calls process_dataset through self, so both must be attached.
WindowGenerator.process_dataset = process_dataset
WindowGenerator.make_dataset = make_dataset

0 个答案:

没有答案