我创建了一个 EMR 集群,并将一个 Hive 脚本作为执行步骤之一添加进去。以下是我的 Hive 脚本内容:
-- Create a Hive external table over the existing CSV data on S3.
-- NOTE: OpenCSVSerde reads every column as text; the declared int/double/tinyint
-- types here are applied on read, so malformed values become NULL rather than errors.
CREATE EXTERNAL TABLE calls_csv (
    id int,
    campaign_id int,
    campaign_name string,
    offer_id int,
    offer_name string,
    is_offer_not_found int,
    ivr_key string,
    call_uuid string,
    a_leg_uuid string,
    a_leg_request_uuid string,
    to_number string,
    promo_id int,
    description string,
    call_type string,
    answer_type string,
    agent_id int,
    from_number string,
    from_caller_name string,
    from_line_type string,
    from_state string,
    from_city string,
    from_country string,
    from_zip string,
    from_latitude string,
    from_longitude string,
    b_leg_uuid string,
    b_leg_number string,
    b_leg_duration int,
    b_leg_bill_rate double,
    b_leg_bill_duration int,
    b_leg_total_cost double,
    b_leg_hangup_cause string,
    b_leg_start_time string,
    b_leg_answer_time string,
    b_leg_end_time string,
    b_leg_active tinyint,
    bill_rate double,
    bill_duration int,
    hangup_cause string,
    start_time string,
    answer_time string,
    end_time string,
    status string,
    selected_ivr_keys string,
    processed_ivr_keys string,
    filter_id int,
    filter_name string,
    ivr_action string,
    selected_zip_code string,
    processed_zip_code string,
    duration int,
    payout double,
    min_duration int,
    connected_duration int,
    provider_cost double,
    caller_id_cost double,
    total_revenue double,
    total_cost double,
    total_profit double,
    publisher_id int,
    publisher_name string,
    publisher_revenue double,
    publisher_cost double,
    publisher_profit double,
    advertiser_id int,
    advertiser_name string,
    advertiser_cost double,
    is_test tinyint,
    is_sale tinyint,
    is_repeat tinyint,
    is_machine_detection tinyint,
    no_of_call_transfer int,
    offer_ivr_status tinyint,
    file_url string,
    algo string,
    callback_service_status tinyint,
    hangup_service_status tinyint,
    sms_uuid string,
    number_name string,
    keyword string,
    keywordmatchtype string,
    created_at string,
    updated_at string,
    ymdhm bigint
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
    'separatorChar' = ',',
    'quoteChar'     = '\"'
)
-- S3 URIs must not contain spaces; 's3:// calls-csv /' is invalid.
LOCATION 's3://calls-csv/'
TBLPROPERTIES (
    'has_encrypted_data' = 'false',
    -- Treat empty strings in the CSV as SQL NULL.
    'serialization.null.format' = ''
);

-- Register any files/partitions already present under the LOCATION
-- with the metastore before querying.
MSCK REPAIR TABLE calls_csv;
-- Now create an external table in Parquet format with the same schema,
-- to be populated from calls_csv.
-- NOTE(review): the reported MoveTask failure happens while moving the
-- .hive-staging output into this LOCATION — make sure the bucket exists,
-- the EMR role has s3:PutObject/DeleteObject on it, and the URI has no
-- spaces or typos (the step args showed a mangled OUTPUT path).
CREATE EXTERNAL TABLE calls_parquet (
    id int,
    campaign_id int,
    campaign_name string,
    offer_id int,
    offer_name string,
    is_offer_not_found int,
    ivr_key string,
    call_uuid string,
    a_leg_uuid string,
    a_leg_request_uuid string,
    to_number string,
    promo_id int,
    description string,
    call_type string,
    answer_type string,
    agent_id int,
    from_number string,
    from_caller_name string,
    from_line_type string,
    from_state string,
    from_city string,
    from_country string,
    from_zip string,
    from_latitude string,
    from_longitude string,
    b_leg_uuid string,
    b_leg_number string,
    b_leg_duration int,
    b_leg_bill_rate double,
    b_leg_bill_duration int,
    b_leg_total_cost double,
    b_leg_hangup_cause string,
    b_leg_start_time string,
    b_leg_answer_time string,
    b_leg_end_time string,
    b_leg_active tinyint,
    bill_rate double,
    bill_duration int,
    hangup_cause string,
    start_time string,
    answer_time string,
    end_time string,
    status string,
    selected_ivr_keys string,
    processed_ivr_keys string,
    filter_id int,
    filter_name string,
    ivr_action string,
    selected_zip_code string,
    processed_zip_code string,
    duration int,
    payout double,
    min_duration int,
    connected_duration int,
    provider_cost double,
    caller_id_cost double,
    total_revenue double,
    total_cost double,
    total_profit double,
    publisher_id int,
    publisher_name string,
    publisher_revenue double,
    publisher_cost double,
    publisher_profit double,
    advertiser_id int,
    advertiser_name string,
    advertiser_cost double,
    is_test tinyint,
    is_sale tinyint,
    is_repeat tinyint,
    is_machine_detection tinyint,
    no_of_call_transfer int,
    offer_ivr_status tinyint,
    file_url string,
    algo string,
    callback_service_status tinyint,
    hangup_service_status tinyint,
    sms_uuid string,
    number_name string,
    keyword string,
    keywordmatchtype string,
    created_at string,
    updated_at string,
    ymdhm bigint
)
STORED AS PARQUET
LOCATION 's3://calls-parquet/';
-- 进行转换和导出。此步骤会运行很长时间,具体取决于您的数据量和集群规模。
INSERT OVERWRITE TABLE calls_parquet SELECT * FROM calls_csv;
以下是我在EMR群集上运行此步骤时出现的错误
状态:失败
详细信息:FAILED: 执行错误,从 org.apache.hadoop.hive.ql.exec.MoveTask 返回代码 1。移动时出错: s3://calls-parquet/.hive-staging_hive_2018-03-20_07-09-28_592_6773618098932115163-1/-ext-10000 到: s3://calls-parquet/
JAR位置:command-runner.jar 主要类别:无
参数: hive-script --run-hive-script --args -f s3://calls-scripts/converToParquetHive.sql -d INPUT=s3://calls-csv -d OUTPUT=s3://calls-parquet 失败时的操作: 继续