我正在尝试创建SVR和KRR模型,这些模型可以根据测试保留集(hold-out)中的大约32个特征来预测利率。我想用于训练的数据是一个非常大的CSV文件。以下是我目前读取训练数据的代码:
import pandas as pd
import numpy as np
from numpy import nan
from datetime import datetime as dt
import statsmodels.api as sm
def strip2float(x):
    """Convert a percent string such as ``'12.5%'`` to a fraction (``0.125``).

    Args:
        x: Raw cell text; expected to be a string, optionally padded with
            whitespace and/or carrying ``%`` characters at either end.

    Returns:
        The parsed number divided by 100, or ``nan`` when *x* is not a
        string or does not contain a parseable number.
    """
    try:
        # Trim whitespace first so that padded cells like ' 25% ' still parse.
        return float(x.strip().strip('%')) / 100
    except (AttributeError, TypeError, ValueError):
        # Non-string input or non-numeric text is treated as missing.
        return nan
def strip2int(x):
    """Convert a monetary string such as ``'$1,234'`` to an int (``1234``).

    Args:
        x: Raw cell text; expected to be a string, optionally padded with
            whitespace, wrapped in ``$`` and using ``,`` as a thousands
            separator.

    Returns:
        The parsed integer, or ``nan`` when *x* is not a string or the
        remaining text is not a valid integer literal.
    """
    try:
        # Trim whitespace first so that padded cells like ' $1,234 ' parse.
        return int(x.strip().strip('$').replace(',', ''))
    except (AttributeError, TypeError, ValueError):
        # Non-string input or non-integer text is treated as missing.
        return nan
def verify2bool(x):
    """Map a verification-status string to a boolean.

    Args:
        x: Raw cell text.

    Returns:
        ``False`` if the text contains "not verified" (case-insensitive),
        ``True`` if it otherwise contains "verified", and ``nan`` for any
        other text or for non-string input.
    """
    if not isinstance(x, str):
        # Mirror the other converters: unusable input becomes nan, not a crash.
        return nan
    text = x.lower()
    # "not verified" must be tested first because it also contains "verified".
    if "not verified" in text:
        return False
    if "verified" in text:
        return True
    return nan
def subgrade2int(x):
    """Encode a letter+digit sub-grade string (e.g. ``'B3'``) as an integer.

    The value is ``(letter offset from 'A') * 5 + digit``, where the letter
    is the first character (case-insensitive) and the digit is the second
    character; e.g. ``'A1'`` -> 1 and ``'B3'`` -> 8.

    Args:
        x: Raw cell text of at least two characters.

    Returns:
        The integer encoding, or ``nan`` when *x* is not a string, is too
        short, or its second character is not a digit.
    """
    try:
        letter_rank = ord(x[0].upper()) - ord('A')
        return letter_rank * 5 + int(x[1])
    except (AttributeError, IndexError, TypeError, ValueError):
        # Empty/short strings, non-digit suffixes and non-strings are missing.
        return nan
def payments2bool(x):
    """Map a payment-term string to a boolean flag.

    Args:
        x: Raw cell text such as ``'36 months'`` or ``'60 months'``.

    Returns:
        ``False`` for a 36-month term, ``True`` for a 60-month term, and
        ``nan`` for any other term, unparseable text, or non-string input.
    """
    try:
        # Parse once; int() tolerates the leading/trailing spaces left behind.
        months = int(x.replace(' months', ''))
    except (AttributeError, TypeError, ValueError):
        return nan
    if months == 36:
        return False
    if months == 60:
        return True
    return nan
def year2int(x):
    """Convert an employment-length style string to a number of years.

    Args:
        x: Raw cell text such as ``'< 1 year'``, ``'1 year'``, ``'5 years'``
            or ``'10+ years'``.

    Returns:
        ``0`` if the text contains ``'<'``, ``15`` if it contains ``'+'``,
        otherwise the integer in front of the ``year``/``years`` suffix.
        ``nan`` is returned for unparseable text or non-string input.
    """
    try:
        if "<" in x:
            return 0
        if "+" in x:
            return 15
        # Strip the plural suffix first, then the singular one, so that
        # '1 year' parses instead of silently becoming nan.
        return int(x.replace(' years', '').replace(' year', ''))
    except (AttributeError, TypeError, ValueError):
        return nan
def date2int(x):
    """Parse a month-abbreviation/day string into a ``datetime``.

    Both ``'%b-%d'`` (e.g. ``'Jan-15'``) and ``'%d-%b'`` (e.g. ``'15-Jan'``)
    layouts are accepted, tried in that order.

    Args:
        x: Raw cell text.

    Returns:
        The ``datetime`` produced by ``strptime`` for the first layout that
        matches (despite the function's name, this is not an int), or
        ``nan`` when neither layout matches or *x* is not a string.
    """
    for fmt in ('%b-%d', '%d-%b'):
        try:
            return dt.strptime(x, fmt)
        except (TypeError, ValueError):
            # Wrong layout or non-string input: fall through to the next one.
            continue
    return nan
def month2int(x):
    """Parse a month-abbreviation/two-digit-year string into a ``datetime``.

    Both ``'%b-%y'`` (e.g. ``'Dec-11'``) and ``'%y-%b'`` (e.g. ``'11-Dec'``)
    layouts are accepted, tried in that order.

    Args:
        x: Raw cell text.

    Returns:
        The ``datetime`` produced by ``strptime`` for the first layout that
        matches (despite the function's name, this is not an int), or
        ``nan`` when neither layout matches or *x* is not a string.
    """
    for fmt in ('%b-%y', '%y-%b'):
        try:
            return dt.strptime(x, fmt)
        except (TypeError, ValueError):
            # Wrong layout or non-string input: fall through to the next one.
            continue
    return nan
def status2bool(x):
    """Map a single-letter status code to a boolean.

    Args:
        x: Raw cell text.

    Returns:
        ``True`` for ``'f'``, ``False`` for ``'w'`` (both case-insensitive),
        and ``nan`` for any other text or for non-string input.
    """
    if not isinstance(x, str):
        # Mirror the other converters: unusable input becomes nan, not a crash.
        return nan
    code = x.lower()
    if code == "f":
        return True
    if code == "w":
        return False
    return nan
# Load the raw CSV, cleaning columns while parsing, then discard the columns
# that are not used further.
filepath = 'Data for Cleaning & Modeling.csv'

# Columns read with a fixed dtype (no custom parsing).
dtype = {'X2': float, 'X3': float, 'X8': str, 'X16': str}

# Columns whose raw text is cleaned by one of the converter functions above.
converters = {
    'X1': strip2float,
    'X4': strip2int,
    'X5': strip2int,
    'X6': strip2int,
    'X7': payments2bool,
    'X14': verify2bool,
    'X9': subgrade2int,
    'X11': year2int,
    'X15': date2int,
    'X23': month2int,
    'X32': status2bool,
}

# NOTE(review): `l` is not used anywhere in the visible code; kept in case
# later code relies on it.
l = list(range(32))

data = pd.read_csv(
    filepath,
    index_col=False,
    dtype=dtype,
    converters=converters,
)

# DataFrame.drop returns a new frame rather than modifying in place, so the
# result must be assigned back for the columns to actually be removed.
data = data.drop(['X2', 'X3', 'X8', 'X10', 'X16', 'X18', 'X19'], axis=1)
我是新手,想知道如何基于我现有的代码来创建SVR和KRR模型。