我试图在Kaggle上分叉脚本但是它的给定错误功能不可迭代。我试图将日期转换为字符串,但它不起作用。
https://www.kaggle.com/bitsofbits/predict-west-nile-virus/simple-lasagne-nn
错误消息:
net, mean, std = train()
File "../src/script.py", line 270, in train
X = assemble_X(training, weather)
File "../src/script.py", line 225, in assemble_X
for b in base:
TypeError: 'function' object is not iterable
此外,我甚至将数据帧转换为字典,但这没有帮助
请将以下代码粘贴到Fork Script中以查看结果:
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 27 00:55:09 2017
"""
from __future__ import print_function
from collections import defaultdict
import numpy as np
import datetime
import csv
from operator import itemgetter
import sys
import pandas as pd
from sklearn import ensemble, preprocessing
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier
import math
from lasagne.layers import InputLayer, DropoutLayer, DenseLayer
from lasagne.updates import nesterov_momentum
from lasagne.objectives import binary_crossentropy
from nolearn.lasagne import NeuralNet
import theano
from theano import tensor as T
from theano.tensor.nnet import sigmoid
from sklearn import metrics
from sklearn.utils import shuffle
from sklearn.cross_validation import train_test_split
import datetime
# Load dataset
train = pd.read_csv("../input/train.csv")
test = pd.read_csv('../input/test.csv')
def date(text):
return datetime.datetime.strptime(text, "%Y-%m-%d").date()
#Converting to Date Object
def getdate(datacol):
return pd.to_datetime(datacol, format="%Y-%m-%d")
#Getting the Year
def getyear(datacol):
return datacol['Date1'].dt.year
#Getting day of Year
def getdayofyear(datacol):
return datacol['Date1'].dt.dayofyear
#Getting the Duplicated Values
def Duplicatedfeature(datacol):
datacol['Freqcount'] = datacol.groupby(['Trap','year','day_of_year','Latitude','Longitude'])['Species'].transform(pd.Series.value_counts)
return datacol['Freqcount']
#Getting Freq by Traps
def Duplicatedtraps(datacol):
datacol['Freqcounttraps'] = datacol.groupby(['Trap','year'])['day_of_year'].transform(pd.Series.value_counts)
return datacol['Freqcounttraps']
#Get Frequency of the Count of traps
def trapsfrequency(datacol):
datacol['trapsfrequency'] = datacol.groupby(['Trap','year','day_of_year'])['Freqcounttraps'].transform(pd.Series.value_counts)
return datacol['trapsfrequency']
#Create New feature for count of traps
def trapsfrequencyequal(datacol,number):
datacol['trapsfrequencyequal_'+str(number)] = np.where(datacol['Freqcounttraps']==number,datacol['trapsfrequency'], 0)
return datacol['trapsfrequencyequal_'+str(number)]
#Create New feature for count of rows of traps atleast 2
def trapsfrequencygreater(datacol,number):
datacol['trapsfrequencygreater_'+str(number)] = np.where(datacol['Freqcounttraps']>number,datacol['trapsfrequency'], 0)
return datacol['trapsfrequencygreater_'+str(number)]
##Get count by currendate and check for atleast 2
def trapsfrequencybycurrentdate(datacol):
datacol['trapsfrequencycurrentdate'] = datacol.groupby(['Date'])['Trap'].transform(pd.Series.value_counts)
return datacol['trapsfrequencycurrentdate']
def trapsfrequencycurrentdategreater1(datacol):
datacol['trapsfrequencycurrentdategreater1'] = np.where(datacol['trapsfrequencycurrentdate']>1,datacol['trapsfrequencycurrentdate'], 0)
return datacol['trapsfrequencycurrentdategreater1']
def Preprocessing(train):
train['Date1']=getdate(train['Date'])
train['year']=getyear(train).astype('int64')
train['day_of_year']=getdayofyear(train).astype('int64')
train['Freqcount']=Duplicatedfeature(train).astype('int64')
train['Freqcounttraps']=Duplicatedtraps(train).astype('int64')
train['trapsfrequency']=trapsfrequency(train).astype('int64')
trapsfrequencyequal(train,2).astype('int64')
trapsfrequencyequal(train,3).astype('int64')
trapsfrequencyequal(train,4).astype('int64')
trapsfrequencyequal(train,5).astype('int64')
trapsfrequencyequal(train,6).astype('int64')
train['sumoftrapsfrequencyequal']=train.trapsfrequencyequal_2+train.trapsfrequencyequal_3+train.trapsfrequencyequal_4+train.trapsfrequencyequal_5+train.trapsfrequencyequal_6
trapsfrequencygreater(train,1).astype('int64')
trapsfrequencybycurrentdate(train).astype('int64')
trapsfrequencycurrentdategreater1(train).astype('int64')
train.drop(['Date1','Address','AddressNumberAndStreet'], inplace=True, axis=1)
#train['AddressNumberandStreet'].apply(int)
train['trapsfrequencycurrentdate']=train['trapsfrequencycurrentdate'].astype('int64')
train['trapsfrequencycurrentdategreater1']=train['trapsfrequencycurrentdategreater1'].astype('int64')
return train
train=Preprocessing(train)
test=Preprocessing(test)
# Convert categorical data to numbers
lbl = preprocessing.LabelEncoder()
lbl.fit(list(train['Species'].values) + list(test['Species'].values))
train['Species'] = lbl.transform(train['Species'].values)
test['Species'] = lbl.transform(test['Species'].values)
lbl.fit(list(train['Street'].values) + list(test['Street'].values))
train['Street'] = lbl.transform(train['Street'].values)
test['Street'] = lbl.transform(test['Street'].values)
lbl.fit(list(train['Trap'].values) + list(test['Trap'].values))
train['Trap'] = lbl.transform(train['Trap'].values)
test['Trap'] = lbl.transform(test['Trap'].values)
train.info()
train=train.to_dict()
test=test.to_dict()
def precip(text):
TRACE = 1e-3
text = text.strip()
if text == "M":
return None
if text == "-":
return None
if text == "T":
return TRACE
return float(text)
def impute_missing_weather_station_values(weather):
# Stupid simple
for k, v in weather.items():
if v[0] is None:
v[0] = v[1]
elif v[1] is None:
v[1] = v[0]
for k1 in v[0]:
if v[0][k1] is None:
v[0][k1] = v[1][k1]
for k1 in v[1]:
if v[1][k1] is None:
v[1][k1] = v[0][k1]
def load_weather():
weather = {}
for line in csv.DictReader(open("../input/weather.csv")):
for name, converter in {"Date" : date,
"Tmax" : float,"Tmin" : float,"Tavg" : float,
"DewPoint" : float, "WetBulb" : float,
"PrecipTotal" : precip,
"Depart" : float,
"ResultSpeed" : float,"ResultDir" : float,"AvgSpeed" : float,
"StnPressure" : float, "SeaLevel" : float}.items():
x = line[name].strip()
line[name] = converter(x) if (x != "M") else None
station = int(line["Station"]) - 1
assert station in [0,1]
dt = line["Date"]
if dt not in weather:
weather[dt] = [None, None]
assert weather[dt][station] is None, "duplicate weather reading {0}:{1}".format(dt, station)
weather[dt][station] = line
impute_missing_weather_station_values(weather)
return weather
def load_training():
training = train
#for index,r in train:
# training.append((r['Date'],r['Latitude'],r['Species'],r['Trap'],r['Latitude'],r['Longitude'],r['NumMosquitos'],r['year'],r['WnvPresent'],r['day_of_year'],r['Freqcount'],r['Freqcount'],r['Freqcounttraps'],r['trapsfrequency'],r['trapsfrequencyequal_2'],r['trapsfrequencyequal_3'],r['trapsfrequencyequal_4'],r['trapsfrequencyequal_5'],r['trapsfrequencyequal_6'],r['sumoftrapsfrequencyequal'],r['trapsfrequencygreater_1'],r['trapsfrequencycurrentdategreater1']))
return training
def load_testing():
training = test
#for line in csv.DictReader(open("../input/test.csv")):
# for name, converter in {"Date" : datetime.date,
# "Latitude" : float, "Longitude" : float}.items():
# line[name] = converter(line[name])
# training.append(line)
return training
def closest_station(lat, long):
# Chicago is small enough that we can treat coordinates as rectangular.
stations = np.array([[41.995, -87.933],
[41.786, -87.752]])
loc = np.array([lat, long])
deltas = stations - loc[None, :]
dist2 = (deltas**2).sum(1)
return np.argmin(dist2)
def normalize(X, mean=None, std=None):
count = X.shape[1]
if mean is None:
mean = np.nanmean(X, axis=0)
for i in range(count):
X[np.isnan(X[:,i]), i] = mean[i]
if std is None:
std = np.std(X, axis=0)
for i in range(count):
X[:,i] = (X[:,i] - mean[i]) / std[i]
return mean, std
def scaled_count(record):
SCALE = 10.0
if "NumMosquitos" not in record:
# This is test data
return 1
return int(np.ceil(record["NumMosquitos"] / SCALE))
def assemble_X(base, weather):
X = []
for b in base:
date = b["Date"]
lat, long = b["Latitude"], b["Longitude"]
case = [date.year, date.month, date.day, lat, long]
# Look at a selection of past weather values
for days_ago in [1,3,7,14]:
day = date - datetime.timedelta(days=days_ago)
for obs in ["Tmax","Tmin","Tavg","DewPoint","WetBulb","PrecipTotal","Depart"]:
station = closest_station(lat, long)
case.append(weather[day][station][obs])
# Specify which mosquitos are present
species_vector = [float(x) for x in species_map[b["Species"]]]
case.extend(species_vector)
# Weight each observation by the number of mosquitos seen. Test data
# Doesn't have this column, so in that case use 1. This accidentally
# Takes into account multiple entries that result from >50 mosquitos
# on one day.
for repeat in range(scaled_count(b)):
X.append(case)
X = np.asarray(X, dtype=np.float32)
return X
def assemble_y(base):
y = []
for b in base:
present = b["WnvPresent"]
for repeat in range(scaled_count(b)):
y.append(present)
return np.asarray(y, dtype=np.int32).reshape(-1,1)
class AdjustVariable(object):
def __init__(self, variable, target, half_life=20):
self.variable = variable
self.target = target
self.half_life = half_life
def __call__(self, nn, train_history):
delta = self.variable.get_value() - self.target
delta /= 2**(1.0/self.half_life)
self.variable.set_value(np.float32(self.target + delta))
def train():
weather = load_weather()
training = load_training()
X = assemble_X(training, weather)
mean, std = normalize(X)
y = assemble_y(training)
input_size = len(X[0])
learning_rate = theano.shared(np.float32(0.1))
net = NeuralNet(
layers=[
('input', InputLayer),
('hidden1', DenseLayer),
('dropout1', DropoutLayer),
('hidden2', DenseLayer),
('dropout2', DropoutLayer),
('output', DenseLayer),
],
# layer parameters:
input_shape=(None, input_size),
hidden1_num_units=256,
dropout1_p=0.4,
hidden2_num_units=256,
dropout2_p=0.4,
output_nonlinearity=sigmoid,
output_num_units=1,
# optimization method:
update=nesterov_momentum,
update_learning_rate=learning_rate,
update_momentum=0.9,
# Decay the learning rate
on_epoch_finished=[
AdjustVariable(learning_rate, target=0, half_life=4),
],
# This is silly, but we don't want a stratified K-Fold here
# To compensate we need to pass in the y_tensor_type and the loss.
regression=True,
y_tensor_type = T.imatrix,
objective_loss_function = binary_crossentropy,
max_epochs=32,
eval_size=0.1,
verbose=1,
)
X, y = shuffle(X, y, random_state=123)
net.fit(X, y)
_, X_valid, _, y_valid = train_test_split(X, y)
probas = net.predict_proba(X_valid)[:,0]
print("ROC score", metrics.roc_auc_score(y_valid, probas))
return net, mean, std
def submit(net, mean, std):
weather = load_weather()
testing = load_testing()
X = assemble_X(testing, weather)
normalize(X, mean, std)
predictions = net.predict_proba(X)[:,0]
#
out = csv.writer(open("west_nile.csv", "w"))
out.writerow(["Id","WnvPresent"])
for row, p in zip(testing, predictions):
out.writerow([row["Id"], p])
if __name__ == "__main__":
net, mean, std = train()
submit(net, mean, std)
答案 0 :(得分:0)
也许你在assemble_X中有NaN值,试着这样做:
for b in base:
if isinstance(b):