函数对象不可迭代('function' object is not iterable):Kaggle Fork

时间:2017-03-27 02:30:01

标签: python pandas numpy

我试图在Kaggle上分叉(Fork)一个脚本,但它报错 "'function' object is not iterable"(函数对象不可迭代)。我试过把日期转换为字符串,但没有用。

https://www.kaggle.com/bitsofbits/predict-west-nile-virus/simple-lasagne-nn

错误消息:

  net, mean, std = train()
  File "../src/script.py", line 270, in train
    X = assemble_X(training, weather)
  File "../src/script.py", line 225, in assemble_X
    for b in base:
TypeError: 'function' object is not iterable

此外,我甚至将数据帧转换为字典,但这没有帮助

请将以下代码粘贴到Fork Script中以查看结果:

# -*- coding: utf-8 -*-
"""
Created on Mon Mar 27 00:55:09 2017

"""
from __future__ import print_function
from collections import defaultdict
import numpy as np
import datetime
import csv
from operator import itemgetter
import sys
import pandas as pd
from sklearn import ensemble, preprocessing
import xgboost as xgb 
from sklearn.tree import DecisionTreeClassifier
import math
from lasagne.layers import InputLayer, DropoutLayer, DenseLayer
from lasagne.updates import nesterov_momentum
from lasagne.objectives import binary_crossentropy
from nolearn.lasagne import NeuralNet
import theano
from theano import tensor as T
from theano.tensor.nnet import sigmoid
from sklearn import metrics
from sklearn.utils import shuffle
from sklearn.cross_validation import train_test_split
import datetime

# Load dataset
# NOTE(review): the name ``train`` is rebound later — first to the processed
# dict (see the to_dict() call) and then to the train() *function* defined
# below. The last binding wins, so load_training() ends up returning a
# function, which is the cause of the reported
# "TypeError: 'function' object is not iterable". Rename one of them.
train = pd.read_csv("../input/train.csv")
test = pd.read_csv('../input/test.csv')

def date(text):
    """Parse an ISO ``YYYY-MM-DD`` string into a ``datetime.date``."""
    parsed = datetime.datetime.strptime(text, "%Y-%m-%d")
    return parsed.date()

# Convert a column of 'YYYY-MM-DD' strings into pandas datetimes.
def getdate(datacol):
    """Return ``datacol`` parsed as datetimes (format ``YYYY-MM-DD``)."""
    converted = pd.to_datetime(datacol, format="%Y-%m-%d")
    return converted

# Extract the year from the frame's 'Date1' datetime column.
def getyear(datacol):
    """Return the year component of ``datacol['Date1']`` (a datetime column)."""
    years = datacol['Date1'].dt.year
    return years

# Extract the ordinal day-of-year from the frame's 'Date1' datetime column.
def getdayofyear(datacol):
    """Return the day-of-year (1-366) of ``datacol['Date1']``."""
    days = datacol['Date1'].dt.dayofyear
    return days

# Count duplicated observations of the same species at the same trap/place/day.
def Duplicatedfeature(datacol):
    # Per row: how many rows in the same (Trap, year, day_of_year, Latitude,
    # Longitude) group carry this row's Species value. The column is also
    # written into ``datacol`` in place as 'Freqcount'.
    datacol['Freqcount'] = datacol.groupby(['Trap','year','day_of_year','Latitude','Longitude'])['Species'].transform(pd.Series.value_counts)
    return datacol['Freqcount']

# Count how often a trap was visited on a given day within a year.
def Duplicatedtraps(datacol):
    # Per row: how many rows in the same (Trap, year) group share this row's
    # day_of_year. Added to ``datacol`` in place as 'Freqcounttraps'.
    datacol['Freqcounttraps'] = datacol.groupby(['Trap','year'])['day_of_year'].transform(pd.Series.value_counts)
    return datacol['Freqcounttraps']

# Frequency of the per-day trap counts themselves (a second-order count).
def trapsfrequency(datacol):
    # Per row: how many rows in the same (Trap, year, day_of_year) group share
    # this row's 'Freqcounttraps' value (requires Duplicatedtraps to have run
    # first). Added to ``datacol`` in place as 'trapsfrequency'.
    datacol['trapsfrequency'] = datacol.groupby(['Trap','year','day_of_year'])['Freqcounttraps'].transform(pd.Series.value_counts)
    return datacol['trapsfrequency']

# New feature: trapsfrequency kept only where Freqcounttraps equals ``number``.
def trapsfrequencyequal(datacol,number):
    """Add column 'trapsfrequencyequal_<number>' (trapsfrequency where
    Freqcounttraps == number, else 0) to ``datacol`` in place and return it."""
    colname = 'trapsfrequencyequal_'+str(number)
    mask = datacol['Freqcounttraps'] == number
    datacol[colname] = np.where(mask, datacol['trapsfrequency'], 0)
    return datacol[colname]

# New feature: trapsfrequency kept only where Freqcounttraps exceeds ``number``.
def trapsfrequencygreater(datacol,number):
    """Add column 'trapsfrequencygreater_<number>' (trapsfrequency where
    Freqcounttraps > number, else 0) to ``datacol`` in place and return it."""
    colname = 'trapsfrequencygreater_'+str(number)
    mask = datacol['Freqcounttraps'] > number
    datacol[colname] = np.where(mask, datacol['trapsfrequency'], 0)
    return datacol[colname]

# Count of traps observed on each calendar date.
def trapsfrequencybycurrentdate(datacol):
    # Per row: how many rows share this row's Trap value within the same
    # Date group. Added to ``datacol`` in place as
    # 'trapsfrequencycurrentdate'.
    datacol['trapsfrequencycurrentdate'] = datacol.groupby(['Date'])['Trap'].transform(pd.Series.value_counts)
    return datacol['trapsfrequencycurrentdate']

def trapsfrequencycurrentdategreater1(datacol):
    """Keep 'trapsfrequencycurrentdate' only where it exceeds 1 (else 0);
    stored in place as 'trapsfrequencycurrentdategreater1' and returned."""
    keep = datacol['trapsfrequencycurrentdate'] > 1
    datacol['trapsfrequencycurrentdategreater1'] = np.where(keep, datacol['trapsfrequencycurrentdate'], 0)
    return datacol['trapsfrequencycurrentdategreater1']

def Preprocessing(train):
    """Add date and trap-frequency features to the frame in place, then drop
    the address columns and return the frame.

    NOTE(review): the parameter shadows the module-level ``train``. Each
    helper both writes its column into the frame *and* returns it, so the
    unassigned ``.astype('int64')`` calls below compute a converted copy
    that is immediately discarded — the stored columns keep whatever dtype
    the helper produced.
    """
    train['Date1']=getdate(train['Date'])
    train['year']=getyear(train).astype('int64')
    train['day_of_year']=getdayofyear(train).astype('int64')
    train['Freqcount']=Duplicatedfeature(train).astype('int64')
    train['Freqcounttraps']=Duplicatedtraps(train).astype('int64')
    train['trapsfrequency']=trapsfrequency(train).astype('int64')
    # Result of .astype() here is discarded; the columns were already added
    # inside trapsfrequencyequal() in their original dtype.
    trapsfrequencyequal(train,2).astype('int64')
    trapsfrequencyequal(train,3).astype('int64')
    trapsfrequencyequal(train,4).astype('int64')
    trapsfrequencyequal(train,5).astype('int64')
    trapsfrequencyequal(train,6).astype('int64')
    train['sumoftrapsfrequencyequal']=train.trapsfrequencyequal_2+train.trapsfrequencyequal_3+train.trapsfrequencyequal_4+train.trapsfrequencyequal_5+train.trapsfrequencyequal_6
    trapsfrequencygreater(train,1).astype('int64')
    trapsfrequencybycurrentdate(train).astype('int64')
    trapsfrequencycurrentdategreater1(train).astype('int64')
    # Date1 was only a datetime scratch column; addresses are free text.
    train.drop(['Date1','Address','AddressNumberAndStreet'], inplace=True, axis=1)
    #train['AddressNumberandStreet'].apply(int)
    train['trapsfrequencycurrentdate']=train['trapsfrequencycurrentdate'].astype('int64')
    train['trapsfrequencycurrentdategreater1']=train['trapsfrequencycurrentdategreater1'].astype('int64')
    return train


# Feature engineering on both sets (adds year/day/frequency columns in place).
train=Preprocessing(train)
test=Preprocessing(test)

# Convert categorical data to numbers.
# The LabelEncoder is fit on the union of train+test values so both sets
# share the same integer codes.
lbl = preprocessing.LabelEncoder()
lbl.fit(list(train['Species'].values) + list(test['Species'].values))
train['Species'] = lbl.transform(train['Species'].values)
test['Species'] = lbl.transform(test['Species'].values)

lbl.fit(list(train['Street'].values) + list(test['Street'].values))
train['Street'] = lbl.transform(train['Street'].values)
test['Street'] = lbl.transform(test['Street'].values)

lbl.fit(list(train['Trap'].values) + list(test['Trap'].values))
train['Trap'] = lbl.transform(train['Trap'].values)
test['Trap'] = lbl.transform(test['Trap'].values)

train.info()

# NOTE(review): DataFrame.to_dict() with the default orient='dict' yields
# {column_name: {index: value}}, so iterating the result produces column
# *names*, not row records as assemble_X expects. The intended call is
# probably to_dict('records') — verify.
train=train.to_dict()
test=test.to_dict()


def precip(text):
    """Parse one precipitation reading from weather.csv.

    'M' and '-' mark missing data (None); 'T' means a trace amount
    (returned as 1e-3); anything else is parsed as a float.
    """
    TRACE = 1e-3
    cleaned = text.strip()
    if cleaned in ("M", "-"):
        return None
    if cleaned == "T":
        return TRACE
    return float(cleaned)

def impute_missing_weather_station_values(weather):
    """Fill each station's missing readings from the other station.

    ``weather`` maps date -> [station0_row, station1_row]; rows are dicts
    whose None values (or a wholly-None row) are copied from the sibling
    station. Deliberately naive.
    """
    for _, stations in weather.items():
        if stations[0] is None:
            stations[0] = stations[1]
        elif stations[1] is None:
            stations[1] = stations[0]
        first, second = stations[0], stations[1]
        for field in first:
            if first[field] is None:
                first[field] = second[field]
        for field in second:
            if second[field] is None:
                second[field] = first[field]

def load_weather():
    """Parse ../input/weather.csv into {date: [station1_row, station2_row]}.

    Each row dict has its numeric fields converted ('M' -> None for missing,
    'T' -> trace precipitation); missing values are then imputed from the
    other station. Raises AssertionError on unknown station ids or duplicate
    (date, station) readings.
    """
    weather = {}
    for line in csv.DictReader(open("../input/weather.csv")):
        # Convert each field of interest with its dedicated parser; 'M'
        # (missing) becomes None before the converter is even called.
        for name, converter in {"Date" : date,
                                "Tmax" : float,"Tmin" : float,"Tavg" : float,
                                "DewPoint" : float, "WetBulb" : float,
                                "PrecipTotal" : precip,
                                "Depart" : float, 
                                "ResultSpeed" : float,"ResultDir" : float,"AvgSpeed" : float,
                                "StnPressure" : float, "SeaLevel" : float}.items():
            x = line[name].strip()
            line[name] = converter(x) if (x != "M") else None
        # CSV stations are 1-based; store them 0-based.
        station = int(line["Station"]) - 1
        assert station in [0,1]
        dt = line["Date"]
        if dt not in weather:
            weather[dt] = [None, None]
        assert weather[dt][station] is None, "duplicate weather reading {0}:{1}".format(dt, station)
        weather[dt][station] = line
    impute_missing_weather_station_values(weather)        
    return weather


def load_training():
    # NOTE(review): by the time this runs, the module-level name ``train``
    # has been rebound by ``def train():`` further down, so this returns
    # that *function*, not the preprocessed data. That is exactly the
    # reported "TypeError: 'function' object is not iterable" raised in
    # assemble_X. Fix by renaming either the data variable or the train()
    # function. (Even with that fixed, the data is a to_dict() column
    # mapping, not the list of row records assemble_X expects.)
    training = train
    #for index,r in train:
    #   training.append((r['Date'],r['Latitude'],r['Species'],r['Trap'],r['Latitude'],r['Longitude'],r['NumMosquitos'],r['year'],r['WnvPresent'],r['day_of_year'],r['Freqcount'],r['Freqcount'],r['Freqcounttraps'],r['trapsfrequency'],r['trapsfrequencyequal_2'],r['trapsfrequencyequal_3'],r['trapsfrequencyequal_4'],r['trapsfrequencyequal_5'],r['trapsfrequencyequal_6'],r['sumoftrapsfrequencyequal'],r['trapsfrequencygreater_1'],r['trapsfrequencycurrentdategreater1']))
    return training

def load_testing():
    """Return the module-level preprocessed ``test`` data as-is."""
    return test


def closest_station(lat, long):
    """Index (0 or 1) of the weather station nearest to (lat, long).

    Chicago is small enough that coordinates are treated as rectangular,
    so plain squared Euclidean distance on (lat, lon) suffices.
    """
    stations = np.array([[41.995, -87.933],
                         [41.786, -87.752]])
    offsets = stations - np.array([lat, long])
    squared_dist = (offsets * offsets).sum(axis=1)
    return np.argmin(squared_dist)

def normalize(X, mean=None, std=None):
    """Standardize ``X`` in place, column-wise, imputing NaNs with the mean.

    NaN entries are replaced by the (nan-aware) column mean, then each
    column is shifted and scaled to zero mean / unit variance. Pass the
    (mean, std) returned from the training pass to apply the identical
    transform to test data.

    Args:
        X: 2-D float array; modified in place.
        mean: optional per-column means; computed with np.nanmean if None.
        std: optional per-column stds; computed with np.std (after the NaN
            fill) if None. Zero stds are replaced by 1 so constant columns
            do not divide by zero.

    Returns:
        (mean, std) actually used, for reuse on the test set.
    """
    count = X.shape[1]
    if mean is None:
        mean = np.nanmean(X, axis=0)
    for i in range(count):
        X[np.isnan(X[:, i]), i] = mean[i]
    if std is None:
        std = np.std(X, axis=0)
        # Bug fix: a constant column has std 0; dividing by it fills the
        # column with NaN/inf and poisons downstream training. Map 0 -> 1
        # so such columns simply normalize to all zeros.
        std = np.where(std == 0, 1.0, std)
    for i in range(count):
        X[:, i] = (X[:, i] - mean[i]) / std[i]
    return mean, std

def scaled_count(record):
    """Replication weight for a row: ceil(NumMosquitos / 10).

    Test rows carry no 'NumMosquitos' key and get a weight of 1.
    """
    SCALE = 10.0
    try:
        mosquitos = record["NumMosquitos"]
    except KeyError:
        # This is test data
        return 1
    return int(np.ceil(mosquitos / SCALE))


def assemble_X(base, weather):
    """Build the feature matrix: date parts, location, lagged weather, species.

    ``base`` must be an iterable of per-row dicts with keys 'Date' (a
    datetime.date), 'Latitude', 'Longitude', 'Species', and optionally
    'NumMosquitos'; ``weather`` maps date -> [station0, station1] row dicts.

    NOTE(review): ``species_map`` is not defined anywhere in this file, so
    this will raise NameError once ``base`` iterates row dicts correctly.
    Also the local ``date`` shadows the module-level date() helper.
    """
    X = []
    for b in base:
        date = b["Date"]
        lat, long = b["Latitude"], b["Longitude"]
        case = [date.year, date.month, date.day, lat, long]
        # Look at a selection of past weather values
        for days_ago in [1,3,7,14]:
            day = date - datetime.timedelta(days=days_ago)
            for obs in ["Tmax","Tmin","Tavg","DewPoint","WetBulb","PrecipTotal","Depart"]:
                station = closest_station(lat, long)
                case.append(weather[day][station][obs])
        # Specify which mosquitos are present
        species_vector = [float(x) for x in species_map[b["Species"]]]
        case.extend(species_vector)
        # Weight each observation by the number of mosquitos seen. Test data
        # Doesn't have this column, so in that case use 1. This accidentally
        # Takes into account multiple entries that result from >50 mosquitos
        # on one day. 
        for repeat in range(scaled_count(b)):
            X.append(case)    
    X = np.asarray(X, dtype=np.float32)
    return X

def assemble_y(base):
    """Stack WnvPresent labels as an (n, 1) int32 array, repeating each one
    to match the row replication done by assemble_X/scaled_count."""
    labels = []
    for record in base:
        target = record["WnvPresent"]
        labels.extend([target] * scaled_count(record))
    return np.asarray(labels, dtype=np.int32).reshape(-1,1)


class AdjustVariable(object):
    """Per-epoch callback that decays a shared variable toward ``target``.

    Each call shrinks the gap between the variable and the target by a
    factor of 2**(1/half_life), i.e. the gap halves every ``half_life``
    epochs. Used here to decay the learning rate.
    """

    def __init__(self, variable, target, half_life=20):
        self.variable = variable
        self.target = target
        self.half_life = half_life

    def __call__(self, nn, train_history):
        gap = self.variable.get_value() - self.target
        gap = gap / 2 ** (1.0 / self.half_life)
        self.variable.set_value(np.float32(self.target + gap))

def train():
    """Train the lasagne/nolearn NN on the assembled features.

    Returns (net, mean, std): the fitted net plus the normalization
    statistics needed to transform the test set identically.

    NOTE(review): this ``def train`` rebinds the module-level name
    ``train`` that previously held the preprocessed data, so
    load_training() (called below) returns this very function — the
    direct cause of the reported
    "TypeError: 'function' object is not iterable". Rename one of them.
    """
    weather = load_weather()
    training = load_training()

    X = assemble_X(training, weather)
    mean, std = normalize(X)
    y = assemble_y(training)

    # Input layer width follows the assembled feature count.
    input_size = len(X[0])

    # Shared so AdjustVariable can decay it between epochs.
    learning_rate = theano.shared(np.float32(0.1))

    net = NeuralNet(
    layers=[  
        ('input', InputLayer),
         ('hidden1', DenseLayer),
        ('dropout1', DropoutLayer),
        ('hidden2', DenseLayer),
        ('dropout2', DropoutLayer),
        ('output', DenseLayer),
        ],
    # layer parameters:
    input_shape=(None, input_size), 
    hidden1_num_units=256, 
    dropout1_p=0.4,
    hidden2_num_units=256, 
    dropout2_p=0.4,
    output_nonlinearity=sigmoid, 
    output_num_units=1, 

    # optimization method:
    update=nesterov_momentum,
    update_learning_rate=learning_rate,
    update_momentum=0.9,

    # Decay the learning rate
    on_epoch_finished=[
            AdjustVariable(learning_rate, target=0, half_life=4),
            ],

    # This is silly, but we don't want a stratified K-Fold here
    # To compensate we need to pass in the y_tensor_type and the loss.
    regression=True,
    y_tensor_type = T.imatrix,
    objective_loss_function = binary_crossentropy,

    max_epochs=32, 
    eval_size=0.1,
    verbose=1,
    )

    X, y = shuffle(X, y, random_state=123)
    net.fit(X, y)

    # Quick sanity AUC on a random split (note: split of data the net has
    # already seen, so this is optimistic).
    _, X_valid, _, y_valid = train_test_split(X, y)
    probas = net.predict_proba(X_valid)[:,0]
    print("ROC score", metrics.roc_auc_score(y_valid, probas))

    return net, mean, std     


def submit(net, mean, std):
    """Score the test set with the trained net and write west_nile.csv
    (columns: Id, WnvPresent probability)."""
    weather = load_weather()
    testing = load_testing()
    X = assemble_X(testing, weather) 
    # Reuse the training mean/std so test features get the same scaling.
    normalize(X, mean, std)
    predictions = net.predict_proba(X)[:,0]    
    #
    out = csv.writer(open("west_nile.csv", "w"))
    out.writerow(["Id","WnvPresent"])
    for row, p in zip(testing, predictions):
        out.writerow([row["Id"], p])


if __name__ == "__main__":
    # Train the net, then write the west_nile.csv submission.
    net, mean, std = train()
    submit(net, mean, std)

1 个答案:

答案 0 :(得分:0)

也许你在assemble_X中有NaN值,可以先检查每个元素的类型,例如:

for b in base:
    if isinstance(b, dict):
        ...

不过更可能的原因是:模块级变量 train(数据)被后面的 def train(): 函数定义覆盖了,load_training() 因此返回了一个函数,导致 "'function' object is not iterable"。把数据变量或该函数改名即可。