我使用Googles datalab查询BigQuery并将该数据用于RNN。我试图将大多数数据转换为数字格式,但我收到的是"日期"柱。如果我把它作为字符串保留,我的代码可以工作但我需要date字段来保留RNN的日期时间数据类型。我的代码和错误发布在下面。
import datalab.bigquery as bq
import seaborn as sns
import pandas as pd
import numpy as np
import tensorflow as tf
import shutil
%%javascript
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')
%%sql --m log_data
SELECT resource, place_id, CAST(end_time AS TIMESTAMP) AS Date, device, device_os, device_os_version, CAST(latency AS FLOAT) AS Latency,
CAST(megacycles AS FLOAT) AS Megacycles, method, CAST(response_size AS FLOAT) AS Response_Size, CAST(cost AS FLOAT) AS Cost, status,
device_brand, device_family, browser_version, app, ua_parse
FROM [project:app_logs_data]
WHERE DATE(CAST(end_time AS DATE)) >='2018-03-09' AND DATE(CAST(end_time AS DATE)) <='2018-03-10'
GROUP BY resource, place_id, Date, device, device_os, device_os_version, Latency, Megacycles, method,
Response_size, Cost, status, device_brand, device_family, browser_version, app, ua_parse
logs = bq.Query(log_data).to_dataframe()
logs['Date'] = pd.to_datetime(logs['Date'])
logs['resource'].apply(str)
logs.dtypes
resource object
place_id object
Date datetime64[ns]
device object
device_os object
device_os_version object
Latency float64
Megacycles float64
method object
Response_Size float64
Cost float64
status object
device_brand object
device_family object
browser_version object
app object
ua_parse object
dtype: object
def handle_non_numerical_data(logs):
columns = logs.columns.values
for column in columns:
text_digit_vals = {}
def convert_to_int(val):
return text_digit_vals[val]
if logs[column].dtype != np.int64 and \
logs[column].dtype != np.float64 and logs[column].dtype != np.datetime64:
column_contents = logs[column].values.tolist()
unique_elements = set(column_contents)
x = 0
for unique in unique_elements:
if unique not in text_digit_vals:
text_digit_vals[unique] = x
x+=1
logs[column] = list(map(convert_to_int, logs[column]))
return logs
logs = handle_non_numerical_data(logs)
KeyErrorTraceback (most recent call last)
<ipython-input-8-27437e47a873> in <module>()
17 return logs
18
---> 19 logs = handle_non_numerical_data(logs)
<ipython-input-8-27437e47a873> in handle_non_numerical_data(logs)
14 text_digit_vals[unique] = x
15 x+=1
---> 16 logs[column] = list(map(convert_to_int, logs[column]))
17 return logs
18
<ipython-input-8-27437e47a873> in convert_to_int(val)
5 text_digit_vals = {}
6 def convert_to_int(val):
----> 7 return text_digit_vals[val]
8 if logs[column].dtype != np.int64 and logs[column].dtype != np.float64 and logs[column].dtype != np.datetime64:
9 column_contents = logs[column].values.tolist()
KeyError: Timestamp('2018-03-09 00:00:00')
答案 0 :(得分:2)
datalab.*
是不推荐使用的库,您应该切换为使用google.datalab.*
,而使用%%bq
代替%%sql
。看看它是否已在新库中修复。