DataFrame 显示不同的模式(schema)

时间:2016-11-01 11:39:34

标签: apache-spark spark-dataframe pyspark-sql

我正在从 Impala 表创建 DataFrame,代码如下所示。模式(schema)的输出在下文给出。

// Bring the Hive integration classes (including HiveContext) into scope.
import org.apache.spark.sql.hive._

// Build a HiveContext on top of the shell-provided SparkContext `sc`.
val hctx = new HiveContext(sc)

// Query the table through the context; a single row is enough to obtain the schema.
val dfSql = hctx.sql("select * from df2_rs.rs_trans_2 limit 1")

当我直接使用该表底层的 Parquet 文件创建 DataFrame 时,得到的模式却不同(前者中为 string 类型的列,在后者中同一列变成了 binary 类型)。模式如下所示。两者最初都来自同一批 Parquet 文件,为什么模式会存在差异?

// Read the Parquet files of partition month_id=201606 directly from the warehouse path.
// `parquetFile` is deprecated since Spark 1.4 and removed in 2.0; `read.parquet` replaces it.
// NOTE(review): the binary-vs-string difference is presumably because the files were written
// by Impala, which does not annotate string columns in the Parquet schema. Setting
// spark.sql.parquet.binaryAsString=true on the context before reading should make Spark
// treat those columns as strings — confirm against the Spark version in use.
val df201606 = hctx.read.parquet("/user/hive/warehouse/df2_rs.db/trans_2/month_id=201606")

// printSchema writes to stdout, so it is called with parentheses as a side-effecting method.
df201606.printSchema()

dfSql.printSchema 的模式输出如下:

 root
     |-- supplier_id: short (nullable = true)
     |-- msa_seq_nbr: decimal(20,0) (nullable = true)
     |-- prod_surro_id: long (nullable = true)
     |-- mp_surro_id: long (nullable = true)
     |-- otlt_surro_id: integer (nullable = true)
     |-- payer_pln_surro_id: long (nullable = true)
     |-- btch_nbr: long (nullable = true)
     |-- store_nbr: string (nullable = true)
     |-- dspnsd_dt: timestamp (nullable = true)
     |-- btch_supplier_seq_nbr: long (nullable = true)
     |-- store_chnl_typ_cd: string (nullable = true)
     |-- typ_cd: string (nullable = true)
     |-- typ_cd_actn_cd: byte (nullable = true)
     |-- cmf_prod_nbr: string (nullable = true)
     |-- cmf_pack_nbr: string (nullable = true)
     |-- otlt_ctry_cd: string (nullable = true)
     |-- otlt_pstl_cd: string (nullable = true)
     |-- otlt_seq_nbr: string (nullable = true)
     |-- otlt_actn_cd: byte (nullable = true)
     |-- qty: decimal(18,3) (nullable = true)
     |-- qty_actn_cd: byte (nullable = true)
     |-- authd_rfll_nbr: byte (nullable = true)
     |-- authd_rfll_cd_actn_cd: byte (nullable = true)
     |-- pay_typ_cd: short (nullable = true)
     |-- pay_typ_cd_actn_cd: byte (nullable = true)
     |-- ddsupp_cnt: decimal(18,3) (nullable = true)
     |-- ddsupp_cnt_actn_cd: byte (nullable = true)
     |-- dspnsd_ndc: string (nullable = true)
     |-- dspnsd_ndc_actn_cd: byte (nullable = true)
     |-- mp_id: string (nullable = true)
     |-- mtch_cd: long (nullable = true)
     |-- usc_cd: string (nullable = true)
     |-- std_payer_id: string (nullable = true)
     |-- std_pln_id: string (nullable = true)
     |-- msk_payer_id: string (nullable = true)
     |-- msk_pln_id: string (nullable = true)
     |-- corrn_id: long (nullable = true)
     |-- enc_id: string (nullable = true)
     |-- pat_nbr: long (nullable = true)
     |-- dacon_qty: decimal(18,3) (nullable = true)
     |-- dacon_qty_actn_cd: byte (nullable = true)
     |-- dosg_qty: decimal(18,3) (nullable = true)
     |-- dosg_qty_actn_cd: byte (nullable = true)
     |-- cre_dt: timestamp (nullable = true)
     |-- std_pln_cd_ind: short (nullable = true)
     |-- supplier_pln_cd: string (nullable = true)
     |-- supplier_pln_id_actn_cd: short (nullable = true)
     |-- pcn: string (nullable = true)
     |-- bin_id: string (nullable = true)
     |-- grp_nbr: string (nullable = true)
     |-- pln_id_brdg_cd: short (nullable = true)
     |-- cust_prc_amt: decimal(18,3) (nullable = true)
     |-- strg: string (nullable = true)
     |-- store_cost_amt: decimal(18,3) (nullable = true)
     |-- store_cost_amt_actn_cd: string (nullable = true)
     |-- co_pay_amt: decimal(18,3) (nullable = true)
     |-- co_pay_cd: string (nullable = true)
     |-- pat_pharmy_owe_amt: decimal(18,3) (nullable = true)
     |-- pat_pay_ind: string (nullable = true)
     |-- pharmy_npi_id: string (nullable = true)
     |-- prscr_npi: string (nullable = true)
     |-- diag_icd9_cd: string (nullable = true)
     |-- orig_cd: string (nullable = true)
     |-- pat_zip3: string (nullable = true)
     |-- claim_id: long (nullable = true)
     |-- fill_nbr: integer (nullable = true)
     |-- data_use_qlfr_cd: string (nullable = true)
     |-- patient_id: long (nullable = true)
     |-- otlt_ncpdp_id: string (nullable = true)
     |-- mp_dea: string (nullable = true)
     |-- provider_id: long (nullable = true)
     |-- revsl_updt_dt: timestamp (nullable = true)
     |-- dup_updt_dt: timestamp (nullable = true)
     |-- corrn_updt_dt: timestamp (nullable = true)
     |-- supplier_pat_id: string (nullable = true)
     |-- diag_cd: string (nullable = true)
     |-- diag_vers_typ_id: byte (nullable = true)
     |-- month_id: integer (nullable = true)

df201606.printSchema 的模式输出如下:

root
     |-- supplier_id: integer (nullable = true)
     |-- msa_seq_nbr: decimal(20,0) (nullable = true)
     |-- prod_surro_id: long (nullable = true)
     |-- mp_surro_id: long (nullable = true)
     |-- otlt_surro_id: integer (nullable = true)
     |-- payer_pln_surro_id: long (nullable = true)
     |-- lbtch_nbr: long (nullable = true)
     |-- store_nbr: binary (nullable = true)
     |-- dspnsd_dt: timestamp (nullable = true)
     |-- btch_supplier_seq_nbr: long (nullable = true)
     |-- store_chnl_typ_cd: binary (nullable = true)
     |-- typ_cd: binary (nullable = true)
     |-- typ_cd_actn_cd: integer (nullable = true)
     |-- cmf_prod_nbr: binary (nullable = true)
     |-- cmf_pack_nbr: binary (nullable = true)
     |-- otlt_ctry_cd: binary (nullable = true)
     |-- otlt_pstl_cd: binary (nullable = true)
     |-- otlt_seq_nbr: binary (nullable = true)
     |-- otlt_actn_cd: integer (nullable = true)
     |-- qty: decimal(18,3) (nullable = true)
     |-- qty_actn_cd: integer (nullable = true)
     |-- authd_rfll_nbr: integer (nullable = true)
     |-- authd_rfll_cd_actn_cd: integer (nullable = true)
     |-- pay_typ_cd: integer (nullable = true)
     |-- pay_typ_cd_actn_cd: integer (nullable = true)
     |-- ddsupp_cnt: decimal(18,3) (nullable = true)
     |-- ddsupp_cnt_actn_cd: integer (nullable = true)
     |-- dspnsd_ndc: binary (nullable = true)
     |-- dspnsd_ndc_actn_cd: integer (nullable = true)
     |-- mp_id: binary (nullable = true)
     |-- mtch_cd: long (nullable = true)
     |-- usc_cd: binary (nullable = true)
     |-- std_payer_id: binary (nullable = true)
     |-- std_pln_id: binary (nullable = true)
     |-- msk_payer_id: binary (nullable = true)
     |-- msk_pln_id: binary (nullable = true)
     |-- corrn_id: long (nullable = true)
     |-- enc_id: binary (nullable = true)
     |-- pat_nbr: long (nullable = true)
     |-- dacon_qty: decimal(18,3) (nullable = true)
     |-- dacon_qty_actn_cd: integer (nullable = true)
     |-- dosg_qty: decimal(18,3) (nullable = true)
     |-- dosg_qty_actn_cd: integer (nullable = true)
     |-- cre_dt: timestamp (nullable = true)
     |-- std_pln_cd_ind: integer (nullable = true)
     |-- supplier_pln_cd: binary (nullable = true)
     |-- supplier_pln_id_actn_cd: integer (nullable = true)
     |-- pcn: binary (nullable = true)
     |-- bin_id: binary (nullable = true)
     |-- grp_nbr: binary (nullable = true)
     |-- pln_id_brdg_cd: integer (nullable = true)
     |-- cust_prc_amt: decimal(18,3) (nullable = true)
     |-- strg: binary (nullable = true)
     |-- store_cost_amt: decimal(18,3) (nullable = true)
     |-- store_cost_amt_actn_cd: binary (nullable = true)
     |-- co_pay_amt: decimal(18,3) (nullable = true)
     |-- co_pay_cd: binary (nullable = true)
     |-- pat_pharmy_owe_amt: decimal(18,3) (nullable = true)
     |-- pat_pay_ind: binary (nullable = true)
     |-- lpharmy_npi_id: binary (nullable = true)
     |-- prscr_npi: binary (nullable = true)
     |-- diag_icd9_cd: binary (nullable = true)
     |-- orig_cd: binary (nullable = true)
     |-- pat_zip3: binary (nullable = true)
     |-- claim_id: long (nullable = true)
     |-- fill_nbr: integer (nullable = true)
     |-- data_use_qlfr_cd: binary (nullable = true)
     |-- patient_id: long (nullable = true)
     |-- otlt_ncpdp_id: binary (nullable = true)
     |-- mp_dea: binary (nullable = true)
     |-- provider_id: long (nullable = true)
     |-- revsl_updt_dt: timestamp (nullable = true)
     |-- dup_updt_dt: timestamp (nullable = true)
     |-- corrn_updt_dt: timestamp (nullable = true)
     |-- supplier_pat_id: binary (nullable = true)
     |-- diag_cd: binary (nullable = true)
     |-- diag_vers_typ_id: integer (nullable = true)

0 个答案:

没有答案