I am trying to create a Spark DataFrame on top of a MySQL JDBC connection and save it to a Hive table, but I keep getting a GC overhead limit exceeded or a timeout exception.
I did check the memory settings and raised them to the maximum. The machine has 4 GB of virtual memory, and the limit is set to 3 GB.
The table is 22 GB in size. I create partitions while loading so that the data gets distributed across the executors, but that does not seem to help.
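To be concrete about which memory settings I mean, they are passed in through the SparkConf when the job is launched, roughly as below (a minimal sketch; the driver class name and the exact values are placeholders, not my real launcher code):

import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.sql.hive.HiveContext;

// Placeholder launcher: only meant to show where the memory settings live.
public class ImportGsodDriver {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("ImportGsod")
                // YARN on this VM caps containers at 2816 MB, so the executor heap
                // plus overhead has to stay below that (values here are placeholders).
                .set("spark.executor.memory", "2g")
                .set("spark.yarn.executor.memoryOverhead", "512");
        SparkContext sc = new SparkContext(conf);
        HiveContext hiveContext = new HiveContext(sc);
        // new ImportGsod().importTable(...) is invoked from here with this conf and hiveContext
    }
}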
Java code:
import com.atrato.utils.*;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.hive.HiveContext;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import java.sql.Connection;
import java.sql.SQLException;
import java.util.HashMap;
import java.util.Map;
/**
* Created by dinesh on 11/24/16.
*/
public class ImportGsod {
    public void importTable(String jdbc, String driver, String sourceDBName, String sourceTableName,
                            String destTableName, String user, String password,
                            HiveContext hiveContext, SparkConf conf) throws SQLException, ClassNotFoundException {
        System.out.println("*****************************************************");
        Connection connection = GetConnection.getConnection(jdbc, driver, sourceDBName, user, password);
        int executorMemory = GetExecutorMemory.getExecutorMemory(conf);
        System.out.println("Detected Executor Memory (In MB) :" + executorMemory);
        // int rowCount = GetRowCount.getRowCount(connection, sourceTableName);
        int rowCount = 114420316;
        System.out.println("Detected Row Count In Table :" + rowCount);
        int rowSize = GetJavaRowSize.getJavaRowSize(connection, sourceTableName);
        System.out.println("Detected Row Size In Table :" + rowSize);
        // Budget half of the executor memory for rows and derive the partition count from that.
        int memoryMaxRows = (int) (executorMemory / 2);
        memoryMaxRows = (memoryMaxRows * 1024 * 1024) / rowSize;
        System.out.println("Max Rows Per Executor :" + memoryMaxRows);
        int numOfPartitions = rowCount / memoryMaxRows + 1;
        System.out.println("Number Of Partitions :" + numOfPartitions);
        // String minDistinct = GetMinDistinct.getMinDistinct(connection, sourceTableName);
        String minDistinct = "month";
        System.out.println("Detected Min Distinct Columns :" + minDistinct);
        System.out.println("*****************************************************");

        // Partitioned JDBC read: split the source table across executors.
        Map<String, String> conn = new HashMap<>();
        conn.put("url", jdbc);
        conn.put("dbtable", sourceDBName + "." + sourceTableName);
        conn.put("driver", driver);
        conn.put("user", user);
        conn.put("password", password);
        conn.put("partitionColumn", minDistinct);
        conn.put("lowerBound", "0");
        conn.put("upperBound", String.valueOf(memoryMaxRows));
        conn.put("numPartitions", "12");
        DataFrame sourceTable = hiveContext.read().format("jdbc").options(conn).load();

        // Cast every DATE column to STRING before writing to Hive.
        Map<String, String> columnTypes = new HashMap<>();
        Map<String, Column> newColumns = new HashMap<>();
        StructType columns = sourceTable.schema();
        StructField[] columnMeta = columns.fields();
        for (StructField column : columnMeta) {
            String dataType = column.dataType().toString();
            columnTypes.put(column.name(), dataType.substring(0, dataType.length() - 4).toLowerCase());
        }
        columnTypes.entrySet().stream().filter(entry -> entry.getValue().equals("date")).forEach(entry -> {
            Column column = sourceTable.col(entry.getKey()).cast("string");
            newColumns.put(entry.getKey(), column);
        });
        DataFrame destTable = sourceTable;
        for (Map.Entry<String, Column> columnEntry : newColumns.entrySet()) {
            destTable = destTable.withColumn(columnEntry.getKey(), columnEntry.getValue());
        }
        destTable.write().mode("overwrite").saveAsTable(destTableName);
    }
}
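In case it matters: the way the JDBC data source documents these options, lowerBound and upperBound are supposed to be values of partitionColumn (typically its minimum and maximum) and are only used to work out the stride of the generated WHERE clauses; rows outside that range still land in the first and last partitions. Deriving the bounds that way would look roughly like this (a sketch only; the helper class name is made up, and user/password options are omitted for brevity):

import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.HashMap;
import java.util.Map;

// Sketch: build the JDBC options with lowerBound/upperBound taken from the
// actual range of the partition column instead of a row budget.
public class JdbcBoundsSketch {
    public static Map<String, String> buildOptions(Connection connection, String url, String driver,
                                                   String table, String partitionColumn,
                                                   int numPartitions) throws Exception {
        long min;
        long max;
        try (Statement stmt = connection.createStatement();
             ResultSet rs = stmt.executeQuery(
                     "SELECT MIN(" + partitionColumn + "), MAX(" + partitionColumn + ") FROM " + table)) {
            rs.next();
            min = rs.getLong(1);
            max = rs.getLong(2);
        }
        Map<String, String> options = new HashMap<>();
        options.put("url", url);
        options.put("driver", driver);
        options.put("dbtable", table);
        options.put("partitionColumn", partitionColumn);
        options.put("lowerBound", String.valueOf(min));
        options.put("upperBound", String.valueOf(max));
        options.put("numPartitions", String.valueOf(numPartitions));
        return options;
    }
}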
HDFS configuration: core-site.xml:
<!--Autogenerated by Cloudera Manager-->
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://quickstart.cloudera:8020</value>
</property>
<property>
<name>fs.trash.interval</name>
<value>1</value>
</property>
<property>
<name>io.compression.codecs</name>
<value>org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.DeflateCodec,org.apache.hadoop.io.compress.SnappyCodec,org.apache.hadoop.io.compress.Lz4Codec</value>
</property>
<property>
<name>hadoop.security.authentication</name>
<value>simple</value>
</property>
<property>
<name>hadoop.security.authorization</name>
<value>false</value>
</property>
<property>
<name>hadoop.rpc.protection</name>
<value>authentication</value>
</property>
<property>
<name>hadoop.ssl.require.client.cert</name>
<value>false</value>
<final>true</final>
</property>
<property>
<name>hadoop.ssl.keystores.factory.class</name>
<value>org.apache.hadoop.security.ssl.FileBasedKeyStoresFactory</value>
<final>true</final>
</property>
<property>
<name>hadoop.ssl.server.conf</name>
<value>ssl-server.xml</value>
<final>true</final>
</property>
<property>
<name>hadoop.ssl.client.conf</name>
<value>ssl-client.xml</value>
<final>true</final>
</property>
<property>
<name>hadoop.security.auth_to_local</name>
<value>DEFAULT</value>
</property>
<property>
<name>hadoop.proxyuser.oozie.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.oozie.groups</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.mapred.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.mapred.groups</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.flume.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.flume.groups</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.HTTP.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.HTTP.groups</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.hive.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.hive.groups</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.hue.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.hue.groups</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.httpfs.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.httpfs.groups</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.hdfs.groups</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.hdfs.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.yarn.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.yarn.groups</name>
<value>*</value>
</property>
<property>
<name>hadoop.security.group.mapping</name>
<value>org.apache.hadoop.security.ShellBasedUnixGroupsMapping</value>
</property>
<property>
<name>hadoop.security.instrumentation.requires.admin</name>
<value>false</value>
</property>
</configuration>
hdfs-site.xml:
<?xml version="1.0" encoding="UTF-8"?>
<!--Autogenerated by Cloudera Manager-->
<configuration>
<property>
<name>dfs.namenode.name.dir</name>
<value>file:///var/lib/hadoop-hdfs/cache/hdfs/dfs/name</value>
</property>
<property>
<name>dfs.namenode.servicerpc-address</name>
<value>quickstart.cloudera:8022</value>
</property>
<property>
<name>dfs.https.address</name>
<value>quickstart.cloudera:50470</value>
</property>
<property>
<name>dfs.https.port</name>
<value>50470</value>
</property>
<property>
<name>dfs.namenode.http-address</name>
<value>quickstart.cloudera:50070</value>
</property>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
<property>
<name>dfs.blocksize</name>
<value>134217728</value>
</property>
<property>
<name>dfs.client.use.datanode.hostname</name>
<value>false</value>
</property>
<property>
<name>fs.permissions.umask-mode</name>
<value>022</value>
</property>
<property>
<name>dfs.namenode.acls.enabled</name>
<value>false</value>
</property>
<property>
<name>dfs.client.use.legacy.blockreader</name>
<value>false</value>
</property>
<property>
<name>dfs.client.read.shortcircuit</name>
<value>false</value>
</property>
<property>
<name>dfs.domain.socket.path</name>
<value>/var/run/hdfs-sockets/dn</value>
</property>
<property>
<name>dfs.client.read.shortcircuit.skip.checksum</name>
<value>false</value>
</property>
<property>
<name>dfs.client.domain.socket.data.traffic</name>
<value>false</value>
</property>
<property>
<name>dfs.datanode.hdfs-blocks-metadata.enabled</name>
<value>true</value>
</property>
</configuration>
yarn-site.xml:
<!--Autogenerated by Cloudera Manager-->
<configuration>
<property>
<name>yarn.acl.enable</name>
<value>true</value>
</property>
<property>
<name>yarn.admin.acl</name>
<value>*</value>
</property>
<property>
<name>yarn.resourcemanager.address</name>
<value>quickstart.cloudera:8032</value>
</property>
<property>
<name>yarn.resourcemanager.admin.address</name>
<value>quickstart.cloudera:8033</value>
</property>
<property>
<name>yarn.resourcemanager.scheduler.address</name>
<value>quickstart.cloudera:8030</value>
</property>
<property>
<name>yarn.resourcemanager.resource-tracker.address</name>
<value>quickstart.cloudera:8031</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.address</name>
<value>quickstart.cloudera:8088</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.https.address</name>
<value>quickstart.cloudera:8090</value>
</property>
<property>
<name>yarn.resourcemanager.client.thread-count</name>
<value>50</value>
</property>
<property>
<name>yarn.resourcemanager.scheduler.client.thread-count</name>
<value>50</value>
</property>
<property>
<name>yarn.resourcemanager.admin.client.thread-count</name>
<value>1</value>
</property>
<property>
<name>yarn.scheduler.minimum-allocation-mb</name>
<value>1</value>
</property>
<property>
<name>yarn.scheduler.increment-allocation-mb</name>
<value>512</value>
</property>
<property>
<name>yarn.scheduler.maximum-allocation-mb</name>
<value>2816</value>
</property>
<property>
<name>yarn.scheduler.minimum-allocation-vcores</name>
<value>1</value>
</property>
<property>
<name>yarn.scheduler.increment-allocation-vcores</name>
<value>1</value>
</property>
<property>
<name>yarn.scheduler.maximum-allocation-vcores</name>
<value>2</value>
</property>
<property>
<name>yarn.resourcemanager.amliveliness-monitor.interval-ms</name>
<value>1000</value>
</property>
<property>
<name>yarn.am.liveness-monitor.expiry-interval-ms</name>
<value>600000</value>
</property>
<property>
<name>yarn.resourcemanager.am.max-attempts</name>
<value>2</value>
</property>
<property>
<name>yarn.resourcemanager.container.liveness-monitor.interval-ms</name>
<value>600000</value>
</property>
<property>
<name>yarn.resourcemanager.nm.liveness-monitor.interval-ms</name>
<value>1000</value>
</property>
<property>
<name>yarn.nm.liveness-monitor.expiry-interval-ms</name>
<value>600000</value>
</property>
<property>
<name>yarn.resourcemanager.resource-tracker.client.thread-count</name>
<value>50</value>
</property>
<property>
<name>yarn.application.classpath</name>
<value>$HADOOP_CLIENT_CONF_DIR,$HADOOP_CONF_DIR,$HADOOP_COMMON_HOME/*,$HADOOP_COMMON_HOME/lib/*,$HADOOP_HDFS_HOME/*,$HADOOP_HDFS_HOME/lib/*,$HADOOP_YARN_HOME/*,$HADOOP_YARN_HOME/lib/*</value>
</property>
<property>
<name>yarn.resourcemanager.scheduler.class</name>
<value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler</value>
</property>
<property>
<name>yarn.scheduler.fair.user-as-default-queue</name>
<value>true</value>
</property>
<property>
<name>yarn.scheduler.fair.preemption</name>
<value>false</value>
</property>
<property>
<name>yarn.scheduler.fair.sizebasedweight</name>
<value>false</value>
</property>
<property>
<name>yarn.scheduler.fair.assignmultiple</name>
<value>false</value>
</property>
<property>
<name>yarn.resourcemanager.max-completed-applications</name>
<value>10000</value>
</property>
<property>
<name>yarn.nodemanager.remote-app-log-dir</name>
<value>/tmp/logs</value>
</property>
<property>
<name>yarn.nodemanager.remote-app-log-dir-suffix</name>
<value>logs</value>
</property>
</configuration>
mapred-site.xml:
<!--Autogenerated by Cloudera Manager-->
<configuration>
<property>
<name>mapreduce.job.split.metainfo.maxsize</name>
<value>10000000</value>
</property>
<property>
<name>mapreduce.job.counters.max</name>
<value>120</value>
</property>
<property>
<name>mapreduce.job.counters.groups.max</name>
<value>50</value>
</property>
<property>
<name>mapreduce.output.fileoutputformat.compress</name>
<value>false</value>
</property>
<property>
<name>mapreduce.output.fileoutputformat.compress.type</name>
<value>BLOCK</value>
</property>
<property>
<name>mapreduce.output.fileoutputformat.compress.codec</name>
<value>org.apache.hadoop.io.compress.DefaultCodec</value>
</property>
<property>
<name>mapreduce.map.output.compress.codec</name>
<value>org.apache.hadoop.io.compress.SnappyCodec</value>
</property>
<property>
<name>mapreduce.map.output.compress</name>
<value>true</value>
</property>
<property>
<name>zlib.compress.level</name>
<value>DEFAULT_COMPRESSION</value>
</property>
<property>
<name>mapreduce.task.io.sort.factor</name>
<value>64</value>
</property>
<property>
<name>mapreduce.map.sort.spill.percent</name>
<value>0.8</value>
</property>
<property>
<name>mapreduce.reduce.shuffle.parallelcopies</name>
<value>10</value>
</property>
<property>
<name>mapreduce.task.timeout</name>
<value>600000</value>
</property>
<property>
<name>mapreduce.client.submit.file.replication</name>
<value>1</value>
</property>
<property>
<name>mapreduce.job.reduces</name>
<value>1</value>
</property>
<property>
<name>mapreduce.task.io.sort.mb</name>
<value>16</value>
</property>
<property>
<name>mapreduce.map.speculative</name>
<value>false</value>
</property>
<property>
<name>mapreduce.reduce.speculative</name>
<value>false</value>
</property>
<property>
<name>mapreduce.job.reduce.slowstart.completedmaps</name>
<value>0.8</value>
</property>
<property>
<name>mapreduce.jobhistory.address</name>
<value>quickstart.cloudera:10020</value>
</property>
<property>
<name>mapreduce.jobhistory.webapp.address</name>
<value>quickstart.cloudera:19888</value>
</property>
<property>
<name>mapreduce.jobhistory.webapp.https.address</name>
<value>quickstart.cloudera:19890</value>
</property>
<property>
<name>mapreduce.jobhistory.admin.address</name>
<value>quickstart.cloudera:10033</value>
</property>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<property>
<name>yarn.app.mapreduce.am.staging-dir</name>
<value>/user</value>
</property>
<property>
<name>mapreduce.am.max-attempts</name>
<value>2</value>
</property>
<property>
<name>yarn.app.mapreduce.am.resource.mb</name>
<value>128</value>
</property>
<property>
<name>yarn.app.mapreduce.am.resource.cpu-vcores</name>
<value>1</value>
</property>
<property>
<name>mapreduce.job.ubertask.enable</name>
<value>false</value>
</property>
<property>
<name>yarn.app.mapreduce.am.command-opts</name>
<value>-Djava.net.preferIPv4Stack=true -Xmx52428800</value>
</property>
<property>
<name>mapreduce.map.java.opts</name>
<value>-Djava.net.preferIPv4Stack=true -Xmx52428800</value>
</property>
<property>
<name>mapreduce.reduce.java.opts</name>
<value>-Djava.net.preferIPv4Stack=true -Xmx52428800</value>
</property>
<property>
<name>yarn.app.mapreduce.am.admin.user.env</name>
<value>LD_LIBRARY_PATH=$HADOOP_COMMON_HOME/lib/native:$JAVA_LIBRARY_PATH</value>
</property>
<property>
<name>mapreduce.map.memory.mb</name>
<value>1024</value>
</property>
<property>
<name>mapreduce.map.cpu.vcores</name>
<value>1</value>
</property>
<property>
<name>mapreduce.reduce.memory.mb</name>
<value>1024</value>
</property>
<property>
<name>mapreduce.reduce.cpu.vcores</name>
<value>1</value>
</property>
<property>
<name>mapreduce.job.heap.memory-mb.ratio</name>
<value>0.8</value>
</property>
<property>
<name>mapreduce.application.classpath</name>
<value>$HADOOP_MAPRED_HOME/*,$HADOOP_MAPRED_HOME/lib/*,$MR2_CLASSPATH</value>
</property>
<property>
<name>mapreduce.admin.user.env</name>
<value>LD_LIBRARY_PATH=$HADOOP_COMMON_HOME/lib/native:$JAVA_LIBRARY_PATH</value>
</property>
</configuration>
hadoop-env.sh:
# Prepend/Append plugin parcel classpaths
if [ "$HADOOP_USER_CLASSPATH_FIRST" = 'true' ]; then
# HADOOP_CLASSPATH={{HADOOP_CLASSPATH_APPEND}}
:
else
# HADOOP_CLASSPATH={{HADOOP_CLASSPATH}}
:
fi
# JAVA_LIBRARY_PATH={{JAVA_LIBRARY_PATH}}
export HADOOP_MAPRED_HOME=$( ([[ ! '{{CDH_MR2_HOME}}' =~ CDH_MR2_HOME ]] && echo {{CDH_MR2_HOME}} ) || echo ${CDH_MR2_HOME:-/usr/lib/hadoop-mapreduce/} )
export YARN_OPTS="-Xmx3221225472 -Djava.net.preferIPv4Stack=true $YARN_OPTS"
export HADOOP_CLIENT_OPTS="-Djava.net.preferIPv4Stack=true $HADOOP_CLIENT_OPTS"
I tried to partition the load so that Spark would not pull all of the data into memory at once, but that did not help.
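For what it is worth, a quick way to see how the rows actually spread over those JDBC partitions is to group by the partition column on the MySQL side (a plain-JDBC sketch; the table and column names are the ones from the code above):

import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.Statement;

// Rough diagnostic: count rows per value of the partition column, since the
// JDBC source splits on ranges of that column rather than on row counts.
public class PartitionSkewCheck {
    public static void printDistribution(Connection connection, String table, String partitionColumn)
            throws Exception {
        String sql = "SELECT " + partitionColumn + ", COUNT(*) FROM " + table
                + " GROUP BY " + partitionColumn + " ORDER BY " + partitionColumn;
        try (Statement stmt = connection.createStatement();
             ResultSet rs = stmt.executeQuery(sql)) {
            while (rs.next()) {
                System.out.println(rs.getString(1) + " -> " + rs.getLong(2) + " rows");
            }
        }
    }
}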
Any help would be greatly appreciated. Thanks, and please let me know if you need more information.
Link to the data I am trying to load: https://bigquery.cloud.google.com/table/bigquery-public-data:samples.gsod