train.csv
It is about 90 GB and looks like this:
label,col1,col2,col3,col4......col400
0,0.92,-0.69,-0.13,0.01.......,0.22
1,0.22,-0.39,0.11,0.92........,-0.43
My code is as follows:
import time
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split  # cross_validation was renamed to model_selection in newer scikit-learn
train=pd.read_csv('/home/data/train.csv')
test=pd.read_csv('/home/data/test.csv')
params = {
'booster': 'gbtree',
'objective': 'multi:softmax',
'num_class': 10,
'gamma': 0.1,
'max_depth': 12,
'lambda': 2,
'subsample': 0.7,
'colsample_bytree': 0.7,
'min_child_weight': 3,
'silent': 1,
'eta': 0.007,
'seed': 1000,
}
plst = list(params.items())
num_rounds = 100
train_xy, val = train_test_split(train, test_size=0.3, random_state=1)
y = train_xy.label
X = train_xy.drop(['label'], axis=1)
val_y = val.label
val_X = val.drop(['label'], axis=1)
xgb_val = xgb.DMatrix(val_X, label=val_y)
xgb_train = xgb.DMatrix(X, label=y)
xgb_test = xgb.DMatrix(test)
watchlist = [(xgb_train, 'train'), (xgb_val, 'val')]
# training model
start_time = time.time()
model = xgb.train(plst, xgb_train, num_rounds, watchlist, early_stopping_rounds=100)
print(time.time() - start_time)
model.save_model('./model/xgb.model')
Can I split this csv into several parts? Or should I increase virtual memory?
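By "split into several parts" I mean something like the sketch below, which reads the big file with pandas' chunksize and writes it back out as smaller part files (the 1000000 rows per part and the train_part_*.csv names are placeholders I made up, not values I have tested):

# Sketch: split train.csv into smaller part files by reading it in chunks.
# chunksize (rows per part) and the output file names are assumptions.
import pandas as pd

reader = pd.read_csv('/home/data/train.csv', chunksize=1000000)
for i, chunk in enumerate(reader):
    # each part file keeps its own header row, so it can be read independently later
    chunk.to_csv('/home/data/train_part_%d.csv' % i, index=False)

Would training on parts like this (or downcasting the 400 float columns) be the right direction, or is adding virtual memory the more practical fix?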