在HIVE中只有定义为STRING的列能正常加载,定义为INT和DOUBLE的列全部为NULL

时间:2016-09-11 16:45:11

标签: hadoop hive analysis data-analysis

在HIVE中只有定义为STRING的列能正常加载,定义为INT和DOUBLE的列全部为NULL

创建表命令

-- Table for the LendingClub LoanStats3a export, read as comma-delimited text.
-- NOTE: the plain DELIMITED row format does not strip double quotes that surround
-- a field. Quoted values therefore keep their quotes in STRING columns and fail to
-- parse in INT/DOUBLE columns, which then read as NULL. Use a CSV SerDe if the
-- input file quotes its fields.
CREATE TABLE A (
    `id`                          STRING,
    `member_id`                   STRING,
    `loan_amnt`                   DOUBLE,
    `funded_amnt`                 DOUBLE,
    `funded_amnt_inv`             DOUBLE,
    `term`                        STRING,
    `int_rate`                    STRING,
    `installment`                 DOUBLE,
    `grade`                       STRING,
    `sub_grade`                   STRING,
    `emp_title`                   STRING,
    `emp_length`                  STRING,
    `home_ownership`              STRING,
    `annual_inc`                  INT,
    `verification_status`         STRING,
    `issue_d`                     STRING,
    `loan_status`                 STRING,
    `pymnt_plan`                  STRING,
    `url`                         STRING,
    `desc`                        STRING,  -- reserved word, must stay back-quoted
    `purpose`                     STRING,
    `title`                       STRING,
    `zip_code`                    STRING,
    `addr_state`                  STRING,
    `dti`                         DOUBLE,
    `delinq_2yrs`                 INT,
    `earliest_cr_line`            STRING,
    `inq_last_6mths`              STRING,
    `mths_since_last_delinq`      STRING,
    `mths_since_last_record`      STRING,
    `open_acc`                    INT,
    `pub_rec`                     INT,
    `revol_bal`                   INT,
    `revol_util`                  STRING,
    `total_acc`                   INT,
    `initial_list_status`         STRING,
    `out_prncp`                   DOUBLE,
    `out_prncp_inv`               DOUBLE,
    `total_pymnt`                 DOUBLE,
    `total_pymnt_inv`             DOUBLE,
    `total_rec_prncp`             DOUBLE,
    `total_rec_int`               DOUBLE,
    `total_rec_late_fee`          DOUBLE,
    `recoveries`                  DOUBLE,
    `collection_recovery_fee`     DOUBLE,
    `last_pymnt_d`                STRING,
    `last_pymnt_amnt`             DOUBLE,
    `next_pymnt_d`                STRING,
    `last_credit_pull_d`          STRING,
    `collections_12_mths_ex_med`  INT,
    `mths_since_last_major_derog` STRING,
    `policy_code`                 STRING,
    `application_type`            STRING,
    `annual_inc_joint`            STRING,
    `dti_joint`                   STRING,
    `verification_status_joint`   STRING,
    `acc_now_delinq`              STRING,
    `tot_coll_amt`                STRING,
    `tot_cur_bal`                 STRING,
    `open_acc_6m`                 STRING,
    `open_il_6m`                  STRING,
    `open_il_12m`                 STRING,
    `open_il_24m`                 STRING,
    `mths_since_rcnt_il`          STRING,
    `total_bal_il`                STRING,
    `il_util`                     STRING,
    `open_rv_12m`                 STRING,
    `open_rv_24m`                 STRING,
    `max_bal_bc`                  STRING,
    `all_util`                    STRING,
    `total_credit_rv`             STRING,
    `inq_fi`                      STRING,
    `total_fi_tl`                 STRING,
    `inq_last_12m`                STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

将数据加载到表A

-- Load the export from the local filesystem into table A.
-- No OVERWRITE keyword is given, so rows already in A are kept.
LOAD DATA LOCAL INPATH '/home/cloudera/Desktop/Project-3/1/LoanStats3a.txt'
INTO TABLE A;

选择数据

hive> SELECT * FROM A LIMIT 1;

输出

  

" 1077501" " 1296599" NULL NULL NULL" 36个月" "   10.65%" NULL" B" " B2" "" " 10年以上" "出租" NULL"已验证" " DEC-2011" "全   付费和#34; " N" " https://www.lendingclub.com/browse/loanDetail.action?loan_id=1077501" "   借款人于12/22/11添加>我需要升级我的业务   技术
&#34。 " CREDIT_CARD" "计算机" " 860xx" " AZ" NULL NULL" 1985年1月" " 1" "" "" NULL NULL NULL" 83.7%" NULL" f" NULL NULL NULL NULL NULL NULL NULL NULL NULL" Jan-2015" NULL"" " DEC-2015" NULL"" " 1" "个人"

     

"" "" "" " 0" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" ""

2 个答案:

答案 0 :(得分:0)

您的CSV文件中各个字段似乎都带有引号。Hive默认的分隔文本格式不会处理字段两侧的引号,因此这些引号会成为字段内容的一部分。对于字符串字段,引号会成为字符串的一部分;对于数值字段,引号会使该值成为无效数字,从而得到NULL。

如需能够处理CSV文件中带引号字段的SerDe,请参阅csv-serde。

答案 1 :(得分:0)

我找到了解决方案:

-- Same LendingClub export, parsed with OpenCSVSerde so that double quotes around
-- fields are removed instead of being stored as part of each value.
-- NOTE: per the Hive CSV SerDe documentation, OpenCSVSerde exposes every column as
-- STRING regardless of the type declared here; CAST in queries (or in a view) where
-- numeric or date types are needed.
CREATE TABLE stat2 (
    id                          STRING,
    member_id                   INT,
    loan_amnt                   FLOAT,
    funding_amnt                FLOAT,
    funding_amnt_inv            FLOAT,
    term                        STRING,
    int_rate                    STRING,
    installment                 FLOAT,
    grade                       STRING,
    sub_grade                   STRING,
    emp_title                   STRING,
    emp_length                  STRING,
    home_ownership              STRING,
    annual_inc                  FLOAT,
    verification_status         STRING,
    issue_d                     DATE,
    loan_status                 STRING,
    pymnt_plan                  STRING,
    url                         STRING,
    descp                       STRING,
    purpose                     STRING,
    title                       STRING,
    zip_code                    STRING,
    addr_state                  STRING,
    dti                         FLOAT,
    delinq_2yrs                 FLOAT,
    earliest_cr_line            STRING,
    inq_last_6mths              FLOAT,
    mths_since_last_delinq      FLOAT,
    mths_since_last_record      FLOAT,
    open_acc                    FLOAT,
    pub_rec                     FLOAT,
    revol_bal                   FLOAT,
    revol_util                  STRING,
    total_acc                   FLOAT,
    initial_list_status         STRING,
    out_prncp                   FLOAT,
    out_prncp_inv               FLOAT,
    total_pymnt                 FLOAT,
    total_pymnt_inv             FLOAT,
    total_rec_prncp             FLOAT,
    total_rec_int               FLOAT,
    total_rec_late_fee          FLOAT,
    recoveries                  FLOAT,
    collection_recovery_fee     FLOAT,
    last_pymnt_d                STRING,
    last_pymnt_amnt             FLOAT,
    next_pymnt_d                STRING,
    last_credit_pull_d          STRING,
    collections_12_mths_ex_med  FLOAT,
    mths_since_last_major_derog FLOAT,
    policy_code                 FLOAT,
    application_type            STRING,
    annual_inc_joint            FLOAT,
    dti_joint                   FLOAT,
    verification_status_joint   STRING,
    acc_now_delinq              FLOAT,
    tot_coll_amt                FLOAT,
    tot_cur_bal                 FLOAT,
    open_acc_6m                 FLOAT,
    open_il_6m                  FLOAT,
    open_il_12m                 FLOAT,
    open_il_24m                 FLOAT,
    mths_since_rcnt_il          FLOAT,
    total_bal_il                FLOAT,
    il_util                     FLOAT,
    open_rv_12m                 FLOAT,
    open_rv_24m                 FLOAT,
    max_bal_bc                  FLOAT,
    all_util                    FLOAT,
    total_rev_hi_lim            FLOAT,
    inq_fi                      FLOAT,
    total_cu_tl                 FLOAT,
    inq_last_12m                FLOAT
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
    "separatorChar" = ",",
    "quoteChar"     = "\""
)
STORED AS TEXTFILE
-- Skip the first 2 lines and the last 4 lines of each input file.
TBLPROPERTIES (
    "skip.header.line.count" = "2",
    "skip.footer.line.count" = "4"
);