我有一个 CSV 文件，其中包含位置、纬度、经度、犯罪类型等详细信息。我想构建一个分类器，以便在给定其他属性的情况下预测犯罪类型。现在，我想使用 sklearn 来训练它，但 sklearn 需要浮点数值。
import os
import glob
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn import svm
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfTransformer
# Load every CSV file in the data directory into a single DataFrame, drop the
# rows whose 'Location' is a placeholder, and split the result into train and
# test sets with 'Crime type' as the prediction target.
fields = ['Reported by', 'Falls within', 'Longitude', 'Latitude', 'Location', 'LSOA code', 'LSOA name', 'Crime type']
to_drop = ['No Location']

# glob returns files in a filesystem-dependent order; sort them so the row
# order (and therefore the seeded train/test split below) is reproducible.
input_file_name = sorted(glob.glob(os.path.join('/home/prakhar/Programming/Minor/2017-01', '*.csv')))

# NOTE: error_bad_lines was deprecated in pandas 1.3 and removed in 2.0; on
# those versions pass on_bad_lines='skip' instead.
input_file = [
    pd.read_csv(ifn, error_bad_lines=False, skipinitialspace=True, usecols=fields)
    for ifn in input_file_name
]
frame = pd.concat(input_file, ignore_index=True)

# Keep only the rows whose 'Location' is not one of the placeholder values.
frame = frame[~frame['Location'].isin(to_drop)]

# Select the target by name rather than by position: read_csv keeps the
# column order of the file, not of `usecols`, so "last column" is fragile.
y = frame['Crime type']
x = frame.drop(['Crime type'], axis=1)

# NOTE: train_test_split is imported from sklearn.cross_validation at the top
# of this file; scikit-learn >= 0.20 only provides it in
# sklearn.model_selection.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)
frame.head()
Reported by Falls within Longitude Latitude \
0 Thames Valley Police Thames Valley Police -0.972323 51.991460
1 Thames Valley Police Thames Valley Police -0.981511 51.997312
2 Thames Valley Police Thames Valley Police -0.970516 51.992128
3 Thames Valley Police Thames Valley Police -0.973412 51.992225
4 Thames Valley Police Thames Valley Police -0.984120 51.997263
Location LSOA code LSOA name \
0 On or near Osprey Walk E01017648 Aylesbury Vale 001A
1 On or near Portfield Close E01017648 Aylesbury Vale 001A
2 On or near Lime Avenue E01017648 Aylesbury Vale 001A
3 On or near Martin Close E01017648 Aylesbury Vale 001A
4 On or near Mckenzie Close E01017649 Aylesbury Vale 001B
Crime type
0 Anti-social behaviour
1 Anti-social behaviour
2 Other theft
3 Vehicle crime
4 Criminal damage and arson
答案 0 :(得分:0)
尝试以下方法:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
# Turn the DataFrame `frame` into an integer label vector `y` and a numeric,
# one-hot encoded and scaled feature matrix `X` for scikit-learn estimators.
y = frame['Crime type'].values
# convert the labels from strings to numbers (0,1,2....)
y = LabelEncoder().fit_transform(y)
frame = frame.drop(['Crime type'], axis=1)

# Record which columns hold strings BEFORE they are overwritten with integer
# codes, so the one-hot step below does not depend on a hard-coded column
# order.
categorical_cols = [f for f in frame.columns if frame[f].dtype == 'object']
categorical_idx = [i for i, f in enumerate(frame.columns) if f in categorical_cols]

for f in categorical_cols:
    lbl_enc = LabelEncoder()
    # same as above encoding. it takes every object dtype from
    # pandas dataframe and converts to numerical labels
    frame[f] = lbl_enc.fit_transform(frame[f].values)

X = frame.values

# binarize the encoded columns. this is not needed if you are using a tree based algorithm
# NOTE: the categorical_features argument was deprecated in scikit-learn 0.20
# and removed in 0.22; on newer versions wrap OneHotEncoder in a
# ColumnTransformer instead.
ohe = OneHotEncoder(categorical_features=categorical_idx)
X = ohe.fit_transform(X)

# use the following for SVMs (with_mean=False for sparse data)
scl = StandardScaler(with_mean=False)
X = scl.fit_transform(X)

# fit model here: model.fit(X, y)