我是 Python 和深度学习的初学者。这个问题对你们大多数人来说可能很简单,但我应该怎么做呢?
如何将下面这一列中的每个城市名称(字符串对象)转换为唯一的数值?
df['city'].unique()
array(['LIMA', 'VACAVILLE', 'CINCINNATI', 'GLASGOW', 'BOWLING GREEN',
'LANCASTER', 'HOUSTON', 'SPRINGFIELD', 'RAPID CITY', 'FORT WORTH',
'LAREDO', 'NEW YORK', 'CHARLESTON', 'PITTSBURGH',
'WEST VALLEY CITY', 'CAYCE', 'HOT SPRINGS NATIO', 'CANTON',
'FORT WAYNE', 'DU BOIS', 'DAYTON', 'MASON CITY', 'WASHINGTON',
'LAKE OSWEGO', 'FAYETTEVILLE', 'SALT LAKE CITY', 'KNOXVILLE',
'TURLOCK', 'MCALLEN', 'CENTERVILLE', 'ROCHESTER', 'OKLAHOMA CITY',
'GAUTIER', 'DOYLESTOWN', 'ATLANTA', 'MEADVILLE', 'FORT MYERS',
'ERIE', 'BEAUMONT', 'JACKSON', 'CLARKSVILLE', 'BETHLEHEM',
'SAN ANTONIO', 'LAS VEGAS', 'ATHENS', 'SAN LUIS OBISPO', 'SEATTLE',
'BRADENTON', 'TINLEY PARK', 'HUNTLEY', 'SYRACUSE', 'WHEELWRIGHT',
'TOWSON', 'YONKERS', 'ARDEN HILLS', 'MARION', 'LIVONIA',
'COLORADO SPRINGS', 'CURWENSVILLE', 'SAINT CHARLES', 'PETERSBURG',
'SCOTTSDALE', 'SILVER SPRING', 'PORTLAND', 'BIRMINGHAM',
'CEDARVILLE', 'CLERMONT', 'ASHEVILLE', 'SHREVEPORT', 'DRAPER',
'WAVERLY', 'CANANDAIGUA', 'MOUNT PLEASANT', 'MARIETTA', 'MANKATO',
'HARLINGEN', 'HATCH', 'MOBILE', 'POULSBO', 'GARDEN GROVE',
'GIG HARBOR', 'OCONOMOWOC', 'MOUNT MORRIS', 'ORLANDO',
'DODGE CITY', 'DILLSBURG', 'HUNTSVILLE', 'KANSAS CITY',
'JACKSONVILLE', 'DULUTH', 'CITRUS HEIGHTS', 'ONEONTA', 'LOS LUNAS',
'GIBSONIA', 'ROBINSON', 'VERNON HILLS', 'PHOENIX', 'DESTIN',
'SHEPHERD', 'BROOKLYN', 'PLANO', 'WINTERS', 'JAMAICA', 'POWAY',
'LEXINGTON', 'UPLAND', 'NEW ALBANY', 'GREENVILLE',
'JEFFERSON CITY', 'ARLINGTON', 'BUFFALO', 'LOS ANGELES',
'CHARLOTTE', 'WEST LAFAYETTE', 'GARY', 'COOPERSTOWN', 'GREAT BEND',
'DAVISON', 'SMYRNA', 'MISSOURI CITY', 'MEMPHIS',
'FORT WALTON BEACH', 'KISSIMMEE', 'BATAVIA', 'OLDSMAR', 'WYNNE',
'ASHVILLE', 'FT BRAGG', 'TROY', 'SHAKER HTS', 'CLEVELAND HTS',
'HAMBURG'], dtype=object)
我正在尝试使用这些数据训练模型。
答案 0 :(得分:1)
如果我理解正确(IIUC),您需要使用 `pd.factorize`:
# Small demo frame whose 'city' column contains one repeated label ('d').
df = pd.DataFrame({'city': list('abcddf')})
# pd.factorize returns a (codes, uniques) pair; only the integer codes
# are stored, so equal city labels share the same number.
codes, _uniques = pd.factorize(df['city'])
df['city1'] = codes
或者将该列转换为 `Categorical` 类型,再通过 `.cat.codes` 获取编码(注意:`factorize` 按值首次出现的顺序编号,而 `Categorical` 的编码按类别排序后的顺序编号;在本例中两者结果恰好相同):
# Cast the column to the categorical dtype, then read the integer code
# pandas assigned to each row's category.
df['city'] = df['city'].astype('category')
city_codes = df['city'].cat.codes
df['city1'] = city_codes
print (df)
city city1
0 a 0
1 b 1
2 c 2
3 d 3
4 d 3
5 f 4
答案 1 :(得分:0)
您也可以尝试 `sklearn.preprocessing.LabelEncoder`。如文档所述,它将标签编码为 0 到 n_classes-1 之间的整数值。
from sklearn.preprocessing import LabelEncoder
# Fit the encoder on the city names, then store each row's integer label.
le = LabelEncoder()
le.fit(df['city'])
df['city_num'] = le.transform(df['city'])
print(df.head())
# city city_num
# 0 LIMA 72
# 1 VACAVILLE 122
# 2 CINCINNATI 21
# 3 GLASGOW 50
# 4 BOWLING GREEN 10
# Sanity check: the number of distinct cities and of distinct labels.
print(len(df['city'].unique()))
# 132
print(len(set(df['city_num'])))
# 132
然后,您可以使用 `OneHotEncoder` 将该数字列转换为指示(one-hot)列:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the integer labels. The 1-D label column is first
# reshaped into a single-column 2-D array before being encoded.
ohe = OneHotEncoder()
label_column = df['city_num'].values.reshape(-1, 1)
city_ind = ohe.fit_transform(label_column)
print(type(city_ind))
# <class 'scipy.sparse.csr.csr_matrix'>
print(city_ind.shape)
# (132, 132)
# Densify only the first two rows for display.
print(city_ind[0:2, :].toarray())
# [[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
# 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0.]
# [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
# 0. 0. 0. 0. 0. 0.]]