-- Rebuild the bucketed customer/order table from scratch.
-- IF EXISTS keeps the script re-runnable on a fresh database regardless of
-- how hive.exec.drop.ignorenonexistent is configured.
DROP TABLE IF EXISTS filtered_online_march_customers;

-- Bucketed table: rows are hashed on customer_id into 32 buckets.
CREATE TABLE IF NOT EXISTS filtered_online_march_customers (
    customer_id STRING,
    order_id    STRING
)
CLUSTERED BY (customer_id) INTO 32 BUCKETS;

-- Populate the table.
-- Bucketing must be enforced *before* the insert runs so that the written
-- files actually follow the CLUSTERED BY layout declared above.
SET hive.enforce.bucketing = true;

-- NOTE(review): SELECT * maps source columns to (customer_id, order_id) by
-- position; this assumes filtered_march_online_transactions exposes exactly
-- those two columns in that order -- confirm against its definition.
FROM filtered_march_online_transactions
INSERT OVERWRITE TABLE filtered_online_march_customers
SELECT
    *;
我创建了这张按 customer_id 分桶的表。但是，当我实际尝试使用这些桶进行抽样时，查询却无法运行。
-- Materialize bucket 1 of 32 of the bucketed table, sampling on customer_id
-- (the same column and bucket count the source table is clustered by).
-- NOTE(review): this is the statement reported to fail inside
-- CombineHiveInputFormat; a commonly suggested workaround is
--   SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
-- before running it -- unverified here, confirm on the target cluster.
CREATE TABLE randomized_filtered_march_customers AS
SELECT *
FROM filtered_online_march_customers
    TABLESAMPLE (BUCKET 1 OUT OF 32 ON customer_id)
我收到了以下错误：
java.io.IOException: cannot find dir = maprfs:///hive/v0k0020.db/filtered_online_march_customers/000000_0 in pathToPartitionInfo: [maprfs:/hive/v0k0020.db/filtered_online_march_customers/000000_0]
    at org.apache.hadoop.hive.ql.io.HiveFileFormatUtils.getPartitionDescFromPathRecursively(HiveFileFormatUtils.java:344)
    at org.apache.hadoop.hive.ql.io.HiveFileFormatUtils.getPartitionDescFromPathRecursively(HiveFileFormatUtils.java:306)
    at org.apache.hadoop.hive.ql.io.CombineHiveInputFormat$CombineHiveInputSplit.<init>(CombineHiveInputFormat.java:108)
    at org.apache.hadoop.hive.ql.io.CombineHiveInputFormat.getSplits(CombineHiveInputFormat.java:455)
    at org.apache.hadoop.mapred.JobClient.writeOldSplits(JobClient.java:1098)
    at org.apache.hadoop.mapred.JobClient.writeSplits(JobClient.java:1090)
    at org.apache.hadoop.mapred.JobClient.access$500(JobClient.java:176)
    at org.apache.hadoop.mapred.JobClient$2.run(JobClient.java:931)
    at org.apache.hadoop.mapred.JobClient$2.run(JobClient.java:882)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:415)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1595)
    at org.apache.hadoop.mapred.JobClient.submitJobInternal(JobClient.java:882)
    at org.apache.hadoop.mapred.JobClient.submitJob(JobClient.java:856)
    at org.apache.hadoop.hive.ql.exec.mr.ExecDriver.execute(ExecDriver.java:420)
    at org.apache.hadoop.hive.ql.exec.mr.MapRedTask.execute(MapRedTask.java:136)
    at org.apache.hadoop.hive.ql.exec.Task.executeTask(Task.java:153)
    at org.apache.hadoop.hive.ql.exec.TaskRunner.runSequential(TaskRunner.java:85)
    at org.apache.hadoop.hive.ql.Driver.launchTask(Driver.java:1503)
    at org.apache.hadoop.hive.ql.Driver.execute(Driver.java:1270)
    at org.apache.hadoop.hive.ql.Driver.runInternal(Driver.java:1088)
    at org.apache.hadoop.hive.ql.Driver.run(Driver.java:911)
    at org.apache.hadoop.hive.ql.Driver.run(Driver.java:901)
    at org.apache.hadoop.hive.cli.CliDriver.processLocalCmd(CliDriver.java:268)
    at org.apache.hadoop.hive.cli.CliDriver.processCmd(CliDriver.java:220)
    at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:423)
    at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:359)
    at org.apache.hadoop.hive.cli.CliDriver.processReader(CliDriver.java:456)
    at org.apache.hadoop.hive.cli.CliDriver.processFile(CliDriver.java:466)
    at org.apache.hadoop.hive.cli.CliDriver.executeDriver(CliDriver.java:748)
    at org.apache.hadoop.hive.cli.CliDriver.run(CliDriver.java:686)
    at org.apache.hadoop.hive.cli.CliDriver.main(CliDriver.java:625)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:606)
    at org.apache.hadoop.util.RunJar.run(RunJar.java:221)
    at org.apache.hadoop.util.RunJar.main(RunJar.java:136)
Job Submission failed with exception 'java.io.IOException(cannot find dir = maprfs:///hive/v0k0020.db/filtered_online_march_customers/000000_0 in pathToPartitionInfo: [maprfs:/hive/v0k0020.db/filtered_online_march_customers/000000_0])'
如果我将查询改为按 rand() 抽样：
-- Same CTAS as the customer_id variant, but the sample is taken ON rand()
-- instead of the clustering column. This variant is reported to run
-- successfully.
-- NOTE(review): per the Hive sampling docs, ON rand() samples over whole
-- rows rather than a bucketing column, so it presumably does not prune input
-- down to a single bucket file -- confirm if scan cost matters.
CREATE TABLE randomized_filtered_march_customers AS
SELECT *
FROM filtered_online_march_customers
    TABLESAMPLE (BUCKET 1 OUT OF 32 ON rand())
则可以正常运行。有人知道该如何解决按 customer_id 抽样时出现的这个错误吗？