我试图从csv文件中提取一些列,但是打印出的结果仅显示0或1,而不显示列的实际值。
在stackoverflow用户的大力帮助下,现在我的代码中没有错误消息可提取列。
csv文件示例数据:
companyID,year,company_age,Debt_TA,gcp
654001,2000,49,0.14,0
654001,2001,50,0.17,0
654001,2002,51,0.23,1
112089,2013,38,0.11,0
112089,2014,39,0.13,0
342980,2007,54,0.15,0
342980,2008,55,0.22,1
113456,2009,12,0.11,0
113456,2010,13,0.13,0
代码:
import csv
import numpy as np
from sklearn import feature_extraction
#from sklearn.svm import SVC
#from sklearn.model_selection import GridSearchCV
#from sklearn import model_selection
def parseFile (filename):
companies = list ()
with open (filename) as csvfile:
reader = csv.reader (csvfile, delimiter = ',', quotechar = '"')
for index, line in enumerate (reader):
#print index, line
if (index > 0 and index < 150):
CompanyID, year, company_age, gcp = line[0], line[1], line[2], line[4]
#print company_name
company = {\
'CompanyID' : CompanyID,\
'year' : year,\
'company_age' : company_age,\
'gcp': int (gcp),\
}
companies.append (company)
return companies
def extract_year_features (companies):
year_list = list ()
for company in companies:
year_list.append (company['year'] * 10)
tweet_vectorizer = feature_extraction.text.CountVectorizer ()
X = tweet_vectorizer.fit_transform (year_list).toarray ()
return X
def extract_company_age_features (companies):
company_age_list = list ()
for company in companies:
company_age_list.append (company['company_age'] * 10)
tweet_vectorizer = feature_extraction.text.CountVectorizer ()
X = tweet_vectorizer.fit_transform (company_age_list).toarray ()
return X
def extract_all_features (companies):
return np.concatenate ( (extract_year_features (companies), \
extract_company_age_features (companies)), \
axis=1)
def generate_target (companies):
y = [company['gcp'] for company in companies]
return np.array (y)
companies = parseFile ("sample.csv")
X = extract_all_features (companies)
y = generate_target (companies)
print(X)
print(y)
#Credit to G.Li
下面的print(X)看起来不是列的真实值。
print(X)
[[1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
[0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
[0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
[0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0]
[0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
[0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1]
[0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0]]
印刷品(y)看起来还不错。
print(y)
[0 0 1 0 0 0 1 0 0]
有人知道我做错了什么吗?预先感谢您的帮助!
答案 0 :(得分:0)
我真的希望您开始使用pandas,因为您可以轻松地将大多数数据类型读入易于浏览的“数据框”。导入csv文件所需的所有代码都是
import pandas as pd
dataframe = pd.readcsv("sample.csv")
然后,您可以轻松访问所需的列。