Spark DataFrames on huge datasets

Time: 2016-11-28 21:11:57

Tags: apache-spark spark-dataframe

I am trying to create a Spark DataFrame on top of a MySQL JDBC connection and save it to a Hive table, but I keep getting a GC overhead limit exceeded or timeout exception.

I did check the memory settings and raised them to the maximum. The machine has 4 GB of virtual memory, with the limit set to 3 GB.
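
For reference, the memory settings are applied through SparkConf. Below is a minimal sketch of the kind of configuration involved; the exact values are assumptions, bounded by the 3 GB machine limit and by the yarn.scheduler.maximum-allocation-mb of 2816 MB in the YARN config further down:

import org.apache.spark.SparkConf;

public class MemoryConfSketch {
    public static void main(String[] args) {
        // Illustrative values only: they must fit under the 3 GB machine limit
        // and under YARN's maximum container allocation of 2816 MB.
        SparkConf conf = new SparkConf()
                .setAppName("ImportGsod")
                .set("spark.executor.memory", "2g")
                .set("spark.driver.memory", "2g");
        System.out.println(conf.toDebugString());
    }
}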

The table is 22 GB in size. I create partitions at load time so that the data is distributed across the executors, but that does not seem to help.
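
For context on what partitioning on load does: Spark's JDBC source splits the [lowerBound, upperBound] range of partitionColumn into numPartitions strides and issues one query per partition. The sketch below is a simplified illustration of that query generation, not Spark's exact generated SQL; "id" is a hypothetical numeric column:

public class JdbcStrideSketch {
    public static void main(String[] args) {
        // Hypothetical numeric partition column "id" with illustrative bounds.
        long lowerBound = 0L;
        long upperBound = 1200000L;
        int numPartitions = 12;
        long stride = (upperBound - lowerBound) / numPartitions;
        for (int i = 0; i < numPartitions; i++) {
            long start = lowerBound + i * stride;
            String where;
            if (i == 0) {
                // First partition also picks up everything below the lower bound.
                where = "id < " + (start + stride) + " OR id IS NULL";
            } else if (i == numPartitions - 1) {
                // Last partition picks up everything at or above its start.
                where = "id >= " + start;
            } else {
                where = "id >= " + start + " AND id < " + (start + stride);
            }
            System.out.println("SELECT * FROM gsod WHERE " + where);
        }
    }
}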

Java code:

import com.atrato.utils.*;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.hive.HiveContext;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import java.sql.Connection;
import java.sql.SQLException;
import java.util.HashMap;
import java.util.Map;

/**
 * Created by dinesh on 11/24/16.
 */
public class ImportGsod {

    public void importTable(String jdbc, String driver, String sourceDBName, String sourceTableName, String destTableName, String user, String password, HiveContext hiveContext, SparkConf conf) throws SQLException, ClassNotFoundException {

        System.out.println("*****************************************************");
        Connection connection = GetConnection.getConnection(jdbc,driver,sourceDBName,user,password);

        int executorMemory = GetExecutorMemory.getExecutorMemory(conf);
        System.out.println("Detected Executor Memory (In MB) :"+executorMemory);

//        int rowCount = GetRowCount.getRowCount(connection,sourceTableName);
        int rowCount = 114420316;
        System.out.println("Detected Row Count In Table      :"+rowCount);

        int rowSize = GetJavaRowSize.getJavaRowSize(connection,sourceTableName);
        System.out.println("Detected Row Size In Table       :"+rowSize);

        // Heuristic: budget half of the executor memory for row data, convert it
        // to a row budget per executor, and derive a partition count from it.
        int memoryMaxRows = executorMemory / 2;
        memoryMaxRows = (memoryMaxRows * 1024 * 1024) / rowSize;
        System.out.println("Max Rows Per Executor            :" + memoryMaxRows);

        int numOfPartitions = rowCount / memoryMaxRows + 1;
        System.out.println("Number Of Partitions             :" + numOfPartitions);

//        String minDistinct = GetMinDistinct.getMinDistinct(connection,sourceTableName);
        String minDistinct = "month";
        System.out.println("Detected Min Distinct Columns    :"+minDistinct);

        System.out.println("*****************************************************");


        // JDBC read options; note that numPartitions is hard-coded to 12 here
        // rather than using the computed numOfPartitions above.
        Map<String, String> conn = new HashMap<>();
        conn.put("url", jdbc);
        conn.put("dbtable", sourceDBName + "." + sourceTableName);
        conn.put("driver", driver);
        conn.put("user", user);
        conn.put("password", password);
        conn.put("partitionColumn", minDistinct);
        conn.put("lowerBound", "0");
        conn.put("upperBound", memoryMaxRows + "");
        conn.put("numPartitions", "12");

        DataFrame sourceTable = hiveContext.read().format("jdbc").options(conn).load();


        Map<String, String> columnTypes = new HashMap<>();

        Map<String, Column> newColumns = new HashMap<>();

        StructType columns = sourceTable.schema();

        StructField[] columnMeta = columns.fields();

        // Map each column name to a lower-cased type name, e.g. "DateType" -> "date"
        // (strips the trailing "Type" from Spark's type string).
        for (StructField column : columnMeta) {
            String dataType = column.dataType().toString();
            columnTypes.put(column.name(), dataType.substring(0, dataType.length() - 4).toLowerCase());
        }

        // Build a string-cast replacement for every DATE column.
        columnTypes.entrySet().stream().filter(entry -> entry.getValue().equals("date")).forEach(entry -> {
            Column column = sourceTable.col(entry.getKey()).cast("string");
            newColumns.put(entry.getKey(), column);
        });

        // Apply the casts, then overwrite the destination Hive table.
        DataFrame destTable = sourceTable;
        for (Map.Entry<String, Column> columnEntry : newColumns.entrySet()) {
            destTable = destTable.withColumn(columnEntry.getKey(), columnEntry.getValue());
        }
        destTable.write().mode("overwrite").saveAsTable(destTableName);
    }
}
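
For completeness, this is roughly how importTable is invoked; the JDBC URL, database names, and credentials below are placeholders, not the values from my environment:

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.hive.HiveContext;

public class ImportGsodDriver {
    public static void main(String[] args) throws Exception {
        SparkConf conf = new SparkConf().setAppName("ImportGsod");
        JavaSparkContext sc = new JavaSparkContext(conf);
        HiveContext hiveContext = new HiveContext(sc.sc());
        new ImportGsod().importTable(
                "jdbc:mysql://quickstart.cloudera:3306", // placeholder URL
                "com.mysql.jdbc.Driver",
                "gsod_db", "gsod",                       // placeholder source db/table
                "gsod_hive",                             // placeholder Hive table
                "user", "password",                      // placeholder credentials
                hiveContext, conf);
    }
}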

HDFS configuration: core-site.xml:

<!--Autogenerated by Cloudera Manager-->
<configuration>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://quickstart.cloudera:8020</value>
  </property>
  <property>
    <name>fs.trash.interval</name>
    <value>1</value>
  </property>
  <property>
    <name>io.compression.codecs</name>
    <value>org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.DeflateCodec,org.apache.hadoop.io.compress.SnappyCodec,org.apache.hadoop.io.compress.Lz4Codec</value>
  </property>
  <property>
    <name>hadoop.security.authentication</name>
    <value>simple</value>
  </property>
  <property>
    <name>hadoop.security.authorization</name>
    <value>false</value>
  </property>
  <property>
    <name>hadoop.rpc.protection</name>
    <value>authentication</value>
  </property>
  <property>
    <name>hadoop.ssl.require.client.cert</name>
    <value>false</value>
    <final>true</final>
  </property>
  <property>
    <name>hadoop.ssl.keystores.factory.class</name>
    <value>org.apache.hadoop.security.ssl.FileBasedKeyStoresFactory</value>
    <final>true</final>
  </property>
  <property>
    <name>hadoop.ssl.server.conf</name>
    <value>ssl-server.xml</value>
    <final>true</final>
  </property>
  <property>
    <name>hadoop.ssl.client.conf</name>
    <value>ssl-client.xml</value>
    <final>true</final>
  </property>
  <property>
    <name>hadoop.security.auth_to_local</name>
    <value>DEFAULT</value>
  </property>
  <property>
    <name>hadoop.proxyuser.oozie.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.oozie.groups</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.mapred.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.mapred.groups</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.flume.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.flume.groups</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.HTTP.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.HTTP.groups</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.hive.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.hive.groups</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.hue.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.hue.groups</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.httpfs.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.httpfs.groups</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.hdfs.groups</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.hdfs.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.yarn.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.yarn.groups</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.security.group.mapping</name>
    <value>org.apache.hadoop.security.ShellBasedUnixGroupsMapping</value>
  </property>
  <property>
    <name>hadoop.security.instrumentation.requires.admin</name>
    <value>false</value>
  </property>
</configuration>

hdfs-site.xml:

<?xml version="1.0" encoding="UTF-8"?>

<!--Autogenerated by Cloudera Manager-->
<configuration>
  <property>
    <name>dfs.namenode.name.dir</name>
    <value>file:///var/lib/hadoop-hdfs/cache/hdfs/dfs/name</value>
  </property>
  <property>
    <name>dfs.namenode.servicerpc-address</name>
    <value>quickstart.cloudera:8022</value>
  </property>
  <property>
    <name>dfs.https.address</name>
    <value>quickstart.cloudera:50470</value>
  </property>
  <property>
    <name>dfs.https.port</name>
    <value>50470</value>
  </property>
  <property>
    <name>dfs.namenode.http-address</name>
    <value>quickstart.cloudera:50070</value>
  </property>
  <property>
    <name>dfs.replication</name>
    <value>1</value>
  </property>
  <property>
    <name>dfs.blocksize</name>
    <value>134217728</value>
  </property>
  <property>
    <name>dfs.client.use.datanode.hostname</name>
    <value>false</value>
  </property>
  <property>
    <name>fs.permissions.umask-mode</name>
    <value>022</value>
  </property>
  <property>
    <name>dfs.namenode.acls.enabled</name>
    <value>false</value>
  </property>
  <property>
    <name>dfs.client.use.legacy.blockreader</name>
    <value>false</value>
  </property>
  <property>
    <name>dfs.client.read.shortcircuit</name>
    <value>false</value>
  </property>
  <property>
    <name>dfs.domain.socket.path</name>
    <value>/var/run/hdfs-sockets/dn</value>
  </property>
  <property>
    <name>dfs.client.read.shortcircuit.skip.checksum</name>
    <value>false</value>
  </property>
  <property>
    <name>dfs.client.domain.socket.data.traffic</name>
    <value>false</value>
  </property>
  <property>
    <name>dfs.datanode.hdfs-blocks-metadata.enabled</name>
    <value>true</value>
  </property>
</configuration>

yarn-site.xml:

<!--Autogenerated by Cloudera Manager-->
<configuration>
  <property>
    <name>yarn.acl.enable</name>
    <value>true</value>
  </property>
  <property>
    <name>yarn.admin.acl</name>
    <value>*</value>
  </property>
  <property>
    <name>yarn.resourcemanager.address</name>
    <value>quickstart.cloudera:8032</value>
  </property>
  <property>
    <name>yarn.resourcemanager.admin.address</name>
    <value>quickstart.cloudera:8033</value>
  </property>
  <property>
    <name>yarn.resourcemanager.scheduler.address</name>
    <value>quickstart.cloudera:8030</value>
  </property>
  <property>
    <name>yarn.resourcemanager.resource-tracker.address</name>
    <value>quickstart.cloudera:8031</value>
  </property>
  <property>
    <name>yarn.resourcemanager.webapp.address</name>
    <value>quickstart.cloudera:8088</value>
  </property>
  <property>
    <name>yarn.resourcemanager.webapp.https.address</name>
    <value>quickstart.cloudera:8090</value>
  </property>
  <property>
    <name>yarn.resourcemanager.client.thread-count</name>
    <value>50</value>
  </property>
  <property>
    <name>yarn.resourcemanager.scheduler.client.thread-count</name>
    <value>50</value>
  </property>
  <property>
    <name>yarn.resourcemanager.admin.client.thread-count</name>
    <value>1</value>
  </property>
  <property>
    <name>yarn.scheduler.minimum-allocation-mb</name>
    <value>1</value>
  </property>
  <property>
    <name>yarn.scheduler.increment-allocation-mb</name>
    <value>512</value>
  </property>
  <property>
    <name>yarn.scheduler.maximum-allocation-mb</name>
    <value>2816</value>
  </property>
  <property>
    <name>yarn.scheduler.minimum-allocation-vcores</name>
    <value>1</value>
  </property>
  <property>
    <name>yarn.scheduler.increment-allocation-vcores</name>
    <value>1</value>
  </property>
  <property>
    <name>yarn.scheduler.maximum-allocation-vcores</name>
    <value>2</value>
  </property>
  <property>
    <name>yarn.resourcemanager.amliveliness-monitor.interval-ms</name>
    <value>1000</value>
  </property>
  <property>
    <name>yarn.am.liveness-monitor.expiry-interval-ms</name>
    <value>600000</value>
  </property>
  <property>
    <name>yarn.resourcemanager.am.max-attempts</name>
    <value>2</value>
  </property>
  <property>
    <name>yarn.resourcemanager.container.liveness-monitor.interval-ms</name>
    <value>600000</value>
  </property>
  <property>
    <name>yarn.resourcemanager.nm.liveness-monitor.interval-ms</name>
    <value>1000</value>
  </property>
  <property>
    <name>yarn.nm.liveness-monitor.expiry-interval-ms</name>
    <value>600000</value>
  </property>
  <property>
    <name>yarn.resourcemanager.resource-tracker.client.thread-count</name>
    <value>50</value>
  </property>
  <property>
    <name>yarn.application.classpath</name>
    <value>$HADOOP_CLIENT_CONF_DIR,$HADOOP_CONF_DIR,$HADOOP_COMMON_HOME/*,$HADOOP_COMMON_HOME/lib/*,$HADOOP_HDFS_HOME/*,$HADOOP_HDFS_HOME/lib/*,$HADOOP_YARN_HOME/*,$HADOOP_YARN_HOME/lib/*</value>
  </property>
  <property>
    <name>yarn.resourcemanager.scheduler.class</name>
    <value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler</value>
  </property>
  <property>
    <name>yarn.scheduler.fair.user-as-default-queue</name>
    <value>true</value>
  </property>
  <property>
    <name>yarn.scheduler.fair.preemption</name>
    <value>false</value>
  </property>
  <property>
    <name>yarn.scheduler.fair.sizebasedweight</name>
    <value>false</value>
  </property>
  <property>
    <name>yarn.scheduler.fair.assignmultiple</name>
    <value>false</value>
  </property>
  <property>
    <name>yarn.resourcemanager.max-completed-applications</name>
    <value>10000</value>
  </property>
  <property>
    <name>yarn.nodemanager.remote-app-log-dir</name>
    <value>/tmp/logs</value>
  </property>
  <property>
    <name>yarn.nodemanager.remote-app-log-dir-suffix</name>
    <value>logs</value>
  </property>
</configuration>

mapred-site.xml:

<!--Autogenerated by Cloudera Manager-->
<configuration>
  <property>
    <name>mapreduce.job.split.metainfo.maxsize</name>
    <value>10000000</value>
  </property>
  <property>
    <name>mapreduce.job.counters.max</name>
    <value>120</value>
  </property>
  <property>
    <name>mapreduce.job.counters.groups.max</name>
    <value>50</value>
  </property>
  <property>
    <name>mapreduce.output.fileoutputformat.compress</name>
    <value>false</value>
  </property>
  <property>
    <name>mapreduce.output.fileoutputformat.compress.type</name>
    <value>BLOCK</value>
  </property>
  <property>
    <name>mapreduce.output.fileoutputformat.compress.codec</name>
    <value>org.apache.hadoop.io.compress.DefaultCodec</value>
  </property>
  <property>
    <name>mapreduce.map.output.compress.codec</name>
    <value>org.apache.hadoop.io.compress.SnappyCodec</value>
  </property>
  <property>
    <name>mapreduce.map.output.compress</name>
    <value>true</value>
  </property>
  <property>
    <name>zlib.compress.level</name>
    <value>DEFAULT_COMPRESSION</value>
  </property>
  <property>
    <name>mapreduce.task.io.sort.factor</name>
    <value>64</value>
  </property>
  <property>
    <name>mapreduce.map.sort.spill.percent</name>
    <value>0.8</value>
  </property>
  <property>
    <name>mapreduce.reduce.shuffle.parallelcopies</name>
    <value>10</value>
  </property>
  <property>
    <name>mapreduce.task.timeout</name>
    <value>600000</value>
  </property>
  <property>
    <name>mapreduce.client.submit.file.replication</name>
    <value>1</value>
  </property>
  <property>
    <name>mapreduce.job.reduces</name>
    <value>1</value>
  </property>
  <property>
    <name>mapreduce.task.io.sort.mb</name>
    <value>16</value>
  </property>
  <property>
    <name>mapreduce.map.speculative</name>
    <value>false</value>
  </property>
  <property>
    <name>mapreduce.reduce.speculative</name>
    <value>false</value>
  </property>
  <property>
    <name>mapreduce.job.reduce.slowstart.completedmaps</name>
    <value>0.8</value>
  </property>
  <property>
    <name>mapreduce.jobhistory.address</name>
    <value>quickstart.cloudera:10020</value>
  </property>
  <property>
    <name>mapreduce.jobhistory.webapp.address</name>
    <value>quickstart.cloudera:19888</value>
  </property>
  <property>
    <name>mapreduce.jobhistory.webapp.https.address</name>
    <value>quickstart.cloudera:19890</value>
  </property>
  <property>
    <name>mapreduce.jobhistory.admin.address</name>
    <value>quickstart.cloudera:10033</value>
  </property>
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
  <property>
    <name>yarn.app.mapreduce.am.staging-dir</name>
    <value>/user</value>
  </property>
  <property>
    <name>mapreduce.am.max-attempts</name>
    <value>2</value>
  </property>
  <property>
    <name>yarn.app.mapreduce.am.resource.mb</name>
    <value>128</value>
  </property>
  <property>
    <name>yarn.app.mapreduce.am.resource.cpu-vcores</name>
    <value>1</value>
  </property>
  <property>
    <name>mapreduce.job.ubertask.enable</name>
    <value>false</value>
  </property>
  <property>
    <name>yarn.app.mapreduce.am.command-opts</name>
    <value>-Djava.net.preferIPv4Stack=true -Xmx52428800</value>
  </property>
  <property>
    <name>mapreduce.map.java.opts</name>
    <value>-Djava.net.preferIPv4Stack=true -Xmx52428800</value>
  </property>
  <property>
    <name>mapreduce.reduce.java.opts</name>
    <value>-Djava.net.preferIPv4Stack=true -Xmx52428800</value>
  </property>
  <property>
    <name>yarn.app.mapreduce.am.admin.user.env</name>
    <value>LD_LIBRARY_PATH=$HADOOP_COMMON_HOME/lib/native:$JAVA_LIBRARY_PATH</value>
  </property>
  <property>
    <name>mapreduce.map.memory.mb</name>
    <value>1024</value>
  </property>
  <property>
    <name>mapreduce.map.cpu.vcores</name>
    <value>1</value>
  </property>
  <property>
    <name>mapreduce.reduce.memory.mb</name>
    <value>1024</value>
  </property>
  <property>
    <name>mapreduce.reduce.cpu.vcores</name>
    <value>1</value>
  </property>
  <property>
    <name>mapreduce.job.heap.memory-mb.ratio</name>
    <value>0.8</value>
  </property>
  <property>
    <name>mapreduce.application.classpath</name>
    <value>$HADOOP_MAPRED_HOME/*,$HADOOP_MAPRED_HOME/lib/*,$MR2_CLASSPATH</value>
  </property>
  <property>
    <name>mapreduce.admin.user.env</name>
    <value>LD_LIBRARY_PATH=$HADOOP_COMMON_HOME/lib/native:$JAVA_LIBRARY_PATH</value>
  </property>
</configuration>

hadoop-env.sh:

# Prepend/Append plugin parcel classpaths

if [ "$HADOOP_USER_CLASSPATH_FIRST" = 'true' ]; then
  # HADOOP_CLASSPATH={{HADOOP_CLASSPATH_APPEND}}
  :
else
  # HADOOP_CLASSPATH={{HADOOP_CLASSPATH}}
  :
fi
# JAVA_LIBRARY_PATH={{JAVA_LIBRARY_PATH}}

export HADOOP_MAPRED_HOME=$( ([[ ! '{{CDH_MR2_HOME}}' =~ CDH_MR2_HOME ]] && echo {{CDH_MR2_HOME}} ) || echo ${CDH_MR2_HOME:-/usr/lib/hadoop-mapreduce/}  )
export YARN_OPTS="-Xmx3221225472 -Djava.net.preferIPv4Stack=true $YARN_OPTS"
export HADOOP_CLIENT_OPTS="-Djava.net.preferIPv4Stack=true $HADOOP_CLIENT_OPTS"
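
(For reference, the -Xmx3221225472 above is 3 × 1024³ bytes, i.e. the 3 GB limit mentioned at the top.)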

I tried partitioning so that Spark would not load all of the data into memory at once, but it did not help.
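
To make that attempt concrete: for the partitioning to spread rows evenly, lowerBound and upperBound need to span the partition column's actual value range. Below is a sketch of what the options map looks like in that case; the column name station_number and its bounds are assumptions, not values from my schema:

import java.util.HashMap;
import java.util.Map;

public class PartitionOptionsSketch {
    public static void main(String[] args) {
        // "station_number" and its bounds are illustrative assumptions;
        // lowerBound/upperBound should span the column's actual MIN/MAX,
        // otherwise most rows land in the first or last partition.
        Map<String, String> options = new HashMap<>();
        options.put("url", "jdbc:mysql://quickstart.cloudera:3306"); // placeholder
        options.put("dbtable", "gsod_db.gsod");                      // placeholder
        options.put("driver", "com.mysql.jdbc.Driver");
        options.put("user", "user");                                 // placeholder
        options.put("password", "password");                         // placeholder
        options.put("partitionColumn", "station_number"); // assumed numeric column
        options.put("lowerBound", "10000");               // assumed MIN(station_number)
        options.put("upperBound", "999999");              // assumed MAX(station_number)
        options.put("numPartitions", "12");
        options.forEach((k, v) -> System.out.println(k + " = " + v));
    }
}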

Any help would be greatly appreciated. Thanks, and please let me know if you need more information.

Link to the data I am trying to load: https://bigquery.cloud.google.com/table/bigquery-public-data:samples.gsod

0 Answers:

No answers yet.