我有一个训练(train)数据集和一个测试(test)数据集,里面有一些缺失值。我想为缺失值分配 NA 标记。但是,我收到了错误。
这是我的代码:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import random
from sklearn.preprocessing import LabelEncoder
# Load the raw data.
# NOTE(review): both frames are read from 'Dataset/train.csv' -- confirm
# whether Test is meant to come from a separate file.
# NOTE(review): read_csv splits on commas by default; if the file uses a
# different separator the header is not split into the expected columns and
# the column lookups in the loop below raise KeyError.
Train = pd.read_csv('Dataset/train.csv')
Test = pd.read_csv('Dataset/train.csv')

# Create a flag for Train and Test data set.
Train['Type'] = 'Train'
Test['Type'] = 'Test'

# Combine both Train and Test data sets row-wise.
FullData = pd.concat([Train, Test], axis=0)

ID_Col = ['USER_ID']  # ID variables
Target_Col = ["ACTIVITY_DEC_16"]
Other_Col = ['Type']  # Test and Train data set identifier

# Categorical variables
Cat_Cols = [
    'ACT_DATE', 'STATUS', 'TP_CURRENT', 'TP_CHANGES_NUM', 'START_PACK',
    'OFFER_GROUP', 'BIRTHDAY', 'GENDER', 'MLLS_STATE', 'PORTED_IN',
    'PORTED_OUT', 'OBLIG_NUM', 'OBLIG_ON_START', 'ASSET_TYPE_LAST',
    'DEVICE_TYPE_BUS', 'USAGE_AREA', 'REFILL_OCT_16', 'REFILL_NOV_16',
    'OUTGOING_OCT_16', 'OUTGOING_NOV_16', 'GPRS_OCT_16', 'GPRS_NOV_16',
    'REVENUE_OCT_16', 'REVENUE_NOV_16',
]

# Numerical variables: every column that is not categorical, an ID, the
# target, or the Train/Test flag (the flag is bookkeeping, not a feature).
Num_Cols = list(
    set(FullData.columns)
    - set(Cat_Cols)
    - set(ID_Col)
    - set(Target_Col)
    - set(Other_Col)
)

# Combined numerical and categorical variables.
Num_Cat_Cols = Num_Cols + Cat_Cols

# For each variable that has at least one missing value, create a companion
# column VariableName_NA holding 1 where the value is missing and 0 elsewhere.
for var in Num_Cat_Cols:
    missing = FullData[var].isnull()
    if missing.any():
        # Multiplying the boolean mask by 1 converts True/False to 1/0.
        FullData[var + '_NA'] = missing * 1
以下是我的训练数据集中的示例(如果您需要,我可以全部上传 :)):
这是我可爱的错误
/Library/Frameworks/Python.framework/Versions/2.7/bin/python2.7 /Users/isozyesil/PycharmProjects/TaskNo2/TaskNo2.py
Traceback (most recent call last):
File "/Users/isozyesil/PycharmProjects/TaskNo2/TaskNo2.py", line 32, in <module>
if FullData[var].isnull().any()==True:
File "/Users/isozyesil/Library/Python/2.7/lib/python/site-packages/pandas/core/frame.py", line 2059, in __getitem__
return self._getitem_column(key)
File "/Users/isozyesil/Library/Python/2.7/lib/python/site-packages/pandas/core/frame.py", line 2066, in _getitem_column
return self._get_item_cache(key)
File "/Users/isozyesil/Library/Python/2.7/lib/python/site-packages/pandas/core/generic.py", line 1386, in _get_item_cache
values = self._data.get(item)
File "/Users/isozyesil/Library/Python/2.7/lib/python/site-packages/pandas/core/internals.py", line 3543, in get
loc = self.items.get_loc(item)
File "/Users/isozyesil/Library/Python/2.7/lib/python/site-packages/pandas/indexes/base.py", line 2136, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas/index.pyx", line 132, in pandas.index.IndexEngine.get_loc (pandas/index.c:4433)
File "pandas/index.pyx", line 154, in pandas.index.IndexEngine.get_loc (pandas/index.c:4279)
File "pandas/src/hashtable_class_helper.pxi", line 732, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13742)
File "pandas/src/hashtable_class_helper.pxi", line 740, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13696)
KeyError: 'ACT_DATE'
Process finished with exit code 1
答案 0 :(得分:0)
在 read_csv 中显式指定分隔符可以解决问题。read_csv 默认按逗号拆分;如果文件实际上是用分号分隔的,整行表头会被读成一个列名,于是找不到 'ACT_DATE' 列,从而抛出 KeyError。
# Pass the field separator explicitly: read_csv splits on commas by default,
# so the ';' separator has to be given for the columns to be parsed.
Train = pd.read_csv('Dataset/train.csv', sep=';')
Test = pd.read_csv('Dataset/train.csv', sep=';')