如何以正确的方式在spark中加载csv文件?

时间:2015-11-03 06:37:15

标签: apache-spark-sql

我在 Spark 中加载 CSV 文件,但文件中包含空值,因此抛出了数组索引越界异常(ArrayIndexOutOfBoundsException)。应该如何正确加载这个文件?

// Schema for the delimited order extract: 190 nullable columns, listed in the
// same order as the fields appear in each input line. Every column is declared
// as a string except the ones named in `timestampColumns`.
val schema = {
  // Columns declared as TimestampType; all others are StringType.
  val timestampColumns = Set(
    "CREATED", "LAST_UPD", "DB_LAST_UPD", "ORDER_DT", "ORDER_EXCH_DT",
    "PRICING_DT", "REQ_SHIP_DT", "STATUS_DT")

  // Column names in positional order (index 0 to 189).
  val columnNames = Array(
    "ROW_ID", "CREATED", "CREATED_BY", "LAST_UPD", "LAST_UPD_BY",
    "MODIFICATION_NUM", "CONFLICT_ID", "ACTIVE_FLG", "ALW_PART_SHIP_FLG", "APPROVED_FLG",
    "AUTO_RECV_FLG", "BILLABLE_FLG", "BU_ID", "CRDT_CHK_PASSD_FLG", "DISPLAY_LINE_FLG",
    "DOCNUM_GNRTD_FLG", "EXMPT_APP_FLG", "EXP_TO_ICTXN_FLG", "FREEZE_FLG", "FULFIL_LOCKED_FLG",
    "HOLD_FLG", "MANUAL_FLG", "NEED_NETCHANGE_FLG", "ORDER_CAT_CD", "ORDER_NUM",
    "ORDER_TYPE_ID", "PR_REP_DNRM_FLG", "PR_REP_MANL_FLG", "PR_REP_SYS_FLG", "REV_NUM",
    "STATUS_CHG_FLG", "TAX_EXEMPT_FLG", "TEST_ORDER_FLG", "TRACK_REV_FLG", "ASGN_TS",
    "CHRG_CREATED_FLG", "CMPNS_STATUS_DT", "CRCHK_ANN_INCOME", "CRCHK_CRDT_SCORE", "CRCHK_DATE",
    "CRCHK_PSTPAID_SVC", "CRDT_ASGN_TS", "CRDT_CRD_EXP_DT", "CRDT_CRD_TXN_AMT", "CRDT_CRD_TXN_DT",
    "DB_LAST_UPD", "DISCNT_AMT", "DISCNT_PERCENT", "DISCNT_RC_AMT", "DISCNT_RC_PCT",
    "EAI_SYNC_DT", "FRGHT_AMT", "FRGHT_AMT_DT", "INIT_APRV_AMT", "LAST_CANCEL_DT",
    "LOAD_NUM", "ORDER_DT", "ORDER_EXCH_DT", "PRICING_DT", "PURCH_ORD_EXP_TS",
    "REQ_SHIP_DT", "REVISION_DT", "STATUS_DT", "TAX_AMT", "TAX_AMT_EXCH_DT",
    "TAX_PERCENT", "TOTAL_AMT", "TOT_EXTND_PRICE", "TOT_EXTND_TAX", "TOT_QTY_BONUS",
    "TOT_QTY_SHIP", "ACCNT_ADDR_ID", "ACCNT_ID", "ACCNT_ORDER_NUM", "AGREE_BNFT_ID",
    "AGREE_ID", "APPR_BY_EMP_ID", "APPR_BY_POSTN_ID", "BILL_ACCNT_ID", "BILL_PROFILE_ID",
    "BLOCK_BL_CD", "BLOCK_DLVRY_CD", "BL_ADDR_ID", "BL_CON_ID", "BL_OU_ID",
    "BL_PER_ADDR_ID", "CAMP_CON_ID", "CARRIER_CD", "CARRIER_PRIO_CD", "CCNUM_ENCRPKEY_REF",
    "CCVNUM_ENCRPKY_REF", "CCV_NUMBER", "CC_NUMBER", "CC_TXNPROC_AC_NUM", "CC_TXNPROC_VNDR_ID",
    "CC_TXN_RTRN_MSG_CD", "CMPND_PROD_NUM", "CMPNS_STATUS_CD", "COMMIT_TYPE_CD", "COMPOUND_PROD_NUM",
    "CONTACT_ID", "CO_BUS_AREA_ID", "CRCHK_CRDT_AGENCY", "CRCHK_DECISION_CD", "CRCHK_IDENTIFIER",
    "CRCHK_ID_TYPE_CD", "CRCHK_UPD_BY", "CRDHOLDER_NAME", "CRDTCD_TXN_STAT_CD", "CRDT_CRD_APPR_CD",
    "CRDT_CRD_EXP_MO_CD", "CRDT_CRD_EXP_YR_CD", "CRDT_CRD_NAME", "CRDT_STATUS_CD", "CURCY_CD",
    "CUSTOMER_ID", "DB_LAST_UPD_SRC", "DCP_ID", "DEST_INVLOC_ID", "DLVRY_PERIOD_ID",
    "DLVRY_STATUS_CD", "EAI_EXPRT_STAT_CD", "EAI_ORDER_NUM", "ENTLMNT_ID", "EVT_SRC_ID",
    "FRGHT_AMT_CURCY_CD", "FRGHT_TERMS_CD", "FRGHT_TERMS_INFO", "INTEGRATION_ID", "LOY_MEMBER_ID",
    "LOY_PROMO_ID", "OPTY_ID", "PAR_ORDER_ID", "PAYMENT_TERM_ID", "PAYMENT_TYPE_CD",
    "PAYTO_ADDR_ID", "PAYTO_CON_ID", "PAYTO_OU_ID", "PAY_AUTH_NUM", "PAY_OU_ID",
    "PEC_WF_PROC_NAME", "PRIO_CD", "PRI_LST_ID", "PRI_WF_PROC_NAME", "PROJ_ID",
    "PROMO_DCP_ID", "PROMO_ID", "PRSP_CONTACT_ID", "PR_PAYMENT_ID", "PR_POSTN_ID",
    "PR_SHIPMENT_ID", "QUOTE_ID", "RTRN_ADDR_ID", "RTRN_CON_ID", "RTRN_OU_ID",
    "RTRN_REASON_CD", "SERV_ACCNT_ID", "SHIP_ADDR_ID", "SHIP_CON_ID", "SHIP_METH_CD",
    "SHIP_OU_ID", "SHIP_PER_ADDR_ID", "SLS_HIER_VER_ID", "SRC_INVLOC_ID", "SRV_PROV_OU_ID",
    "SR_ID", "STATUS_CD", "TAX_AMT_CURCY_CD", "TAX_EXEMPT_NUM", "TAX_EXEMPT_REASON",
    "TAX_LIST_ID", "VALIDATION_RULE_ID", "X_IPTV_FLG", "X_TRIAL_FLG", "X_ACC_CAT",
    "X_HSBA_BTU_FLG", "X_NT_ORDR", "X_COMMISSION_TO_ID", "X_COMMISSION_TO_NAME", "X_AE_ID",
    "X_CUST_ABBR", "X_URGENT_FLG", "DOC_NUM", "CRDT_COMMENTS", "COMMENTS",
    "DESC_TEXT", "DISCNT_REASON", "SHIP_INSTRUCTIONS", "HOLD_REASON", "EAI_ERROR_TEXT")

  StructType(columnNames.map { name =>
    val dataType = if (timestampColumns.contains(name)) TimestampType else StringType
    StructField(name, dataType, true)
  })
}


// Convert every raw input line into a Row holding exactly 190 values, one per
// schema column.
//
// The previous version indexed p(0) .. p(189) directly, which throws
// ArrayIndexOutOfBoundsException as soon as a line contains fewer than 190
// fields. Here short lines are padded with null (every schema column is
// nullable) and any fields beyond the 190th are ignored, as before.
// All fields are trimmed uniformly; previously index 58 alone was not.
//
// NOTE(review): the schema declares several columns as TimestampType, but the
// values produced here are still Strings. Those columns presumably need to be
// converted to java.sql.Timestamp before they can be queried; confirm the
// timestamp format of the input and convert accordingly.
val rowRDD = order.map { line =>
  // limit = -1 keeps trailing empty fields instead of silently dropping them.
  val fields = line.split("#CO1D#", -1).map(_.trim)
  Row(fields.padTo(190, null).take(190): _*)
}

// Build a DataFrame by applying the explicit schema to the RDD of Rows.
val s_order = sqlContext.createDataFrame(rowRDD,schema)

// Register the DataFrame as a temporary table named "s_order" so it can be
// referenced from SQL queries issued through sqlContext.
s_order.registerTempTable("s_order")

错误:数组索引越界异常(java.lang.ArrayIndexOutOfBoundsException)

1 个答案:

答案 0 :(得分:1)

我认为你可以大幅简化 “rowRDD” 的创建。索引越界异常很可能是因为某些行的末尾是空值,导致拆分后的数组长度不足;因此可以在拆分后把数组填充到固定长度。此外,既然你在拆分后用到了所有字段,可以像下面这样创建 rowRDD(请把 190 改成实际的字段数):

// Split each line on the "#CO1D#" delimiter, trim every field, pad the result
// with null up to 190 entries, and wrap the fields in a Row.
val rowRDD = order.map { record =>
    val cells = record.split("#CO1D#").map(_.trim).padTo(190, null)
    Row(cells: _*)
}

希望这有帮助。

修改

// Zero-based indexes of the columns to convert to Int / Long. These are
// example values; replace them with the real column positions.
// Index 15 appears in both sets: the Int check runs first, so it is always
// converted with toInt and never with toLong.
val intColumns = Set(1, 10, 15)
val tsColumns = Set(5, 15)

// Split each line, trim and pad it to 190 fields, then convert the selected
// columns to numeric types before building the Row.
val rowRDD = order
    .map { line =>
        line.split("#CO1D#").map(_.trim).padTo(190, null)
    }
    .map { fields =>
        // Converts one field according to its column index. A missing value
        // (null from padding, or an empty string) in a typed column becomes
        // null instead of throwing from toInt / toLong; a non-empty value that
        // is not numeric still fails with NumberFormatException.
        def convert(value: String, index: Int): Any = {
            val missing = value == null || value.isEmpty
            if (intColumns.contains(index)) {
                if (missing) null else value.toInt
            } else if (tsColumns.contains(index)) {
                // NOTE(review): a Long is produced here; if the schema column
                // is TimestampType it presumably has to be wrapped in
                // java.sql.Timestamp. Confirm against the schema in use.
                if (missing) null else value.toLong
            } else {
                value
            }
        }

        val fieldsWithTypes = fields.zipWithIndex.map { case (s, i) => convert(s, i) }
        Row(fieldsWithTypes: _*)
    }

zipWithIndex 并不是按索引遍历数组最高效的方式,但它的可读性更好,所以我觉得用起来更方便。