如何将hadoop临时工作目录/tmp更改为其他文件夹

时间:2019-06-12 14:37:46

标签: hadoop hive

我正在使用Hive，想将MapReduce的临时工作目录从/tmp更改为其他目录。我尝试了在互联网上能找到的所有方法，但都没有效果。通过du -h命令可以看到，/tmp在MapReduce任务运行期间不断被填满。请帮我更改这个目录。

core-site.xml

<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
   <!-- Default filesystem URI. fs.defaultFS is the current property name;
        fs.default.name is its deprecated Hadoop-1.x alias. -->
   <property>
      <name>fs.defaultFS</name>
      <value>hdfs://localhost:9000</value>
   </property>
   <!-- Base for Hadoop's other temporary directories; many *.dir defaults
        derive from this. ${user.name} expands per OS user. -->
   <property>
      <name>hadoop.tmp.dir</name>
      <value>/data/bd/tmp/hadoop-${user.name}</value>
   </property>
   <!-- Local path where the JournalNode stores its edit logs. -->
   <property>
      <name>dfs.journalnode.edits.dir</name>
      <value>/data/bd/tmp/hadoop/dfs/journalnode/</value>
   </property>
</configuration>

mapred-site.xml

    <?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->
<configuration>
   <property>
      <name>mapreduce.framework.name</name>
      <value>yarn</value>
   </property>
   <property>
      <name>mapreduce.cluster.local.dir</name>
      <value>/data/bd/tmp/mapred/local</value>
   </property>
   <property>
      <name>mapreduce.task.tmp.dir</name>
      <value>/data/bd/tmp</value>
   </property>
   <property>
      <name>mapreduce.cluster.temp.dir</name>
      <value>/data/bd/tmp/mapred/temp</value>
   </property>
   <property>
      <name>yarn.app.mapreduce.am.env</name>
      <value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
   </property>
   <property>
      <name>mapreduce.map.env</name>
      <value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
   </property>
   <property>
      <name>mapreduce.reduce.env</name>
      <value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
   </property>
   <property>
      <name>mapreduce.map.memory.mb</name>
      <value>2048</value>
   </property>
   <property>
      <name>mapreduce.reduce.memory.mb</name>
      <value>4096</value>
   </property>
   <property>
      <name>yarn.app.mapreduce.am.staging-dir</name>
      <value>/data/bd/tmp/hadoop-yarn/staging</value>
   </property>
   <property>
      <name>mapreduce.jobtracker.system.dir</name>
      <value>/data/bd/tmp/mapred/system</value>
   </property>
   <property>
      <name>mapreduce.jobtracker.staging.root.dir</name>
      <value>/data/bd/tmp/mapred/staging</value>
   </property>
   <property>
      <name>mapreduce.map.output.compress</name>
      <value>true</value>
   </property>
   <property>
      <name>mapreduce.map.output.compress.codec</name>
      <value>org.apache.hadoop.io.compress.GzipCodec</value>
   </property>
</configuration>

yarn-site.xml

<?xml version="1.0" encoding="UTF-8"?>
<configuration>
   <!-- Enable the MapReduce shuffle auxiliary service on the NodeManager. -->
   <property>
      <name>yarn.nodemanager.aux-services</name>
      <value>mapreduce_shuffle</value>
   </property>
   <property>
      <name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
      <value>org.apache.hadoop.mapred.ShuffleHandler</value>
   </property>
   <!-- Classpath entries handed to YARN applications. The value is a
        comma-separated list; Hadoop trims surrounding whitespace, so the
        line break inside the value is tolerated. -->
   <property>
      <name>yarn.application.classpath</name>
      <value>$HADOOP_CONF_DIR,$HADOOP_COMMON_HOME/share/hadoop/common/*,$HADOOP_COMMON_HOME/share/hadoop/common/lib/*,$HADOOP_HDFS_HOME/share/hadoop/hdfs/*,$HADOOP_HDFS_HOME/share/hadoop/hdfs/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*,$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*,
    $HADOOP_YARN_HOME/share/hadoop/yarn/*,$HADOOP_YARN_HOME/share/hadoop/yarn/lib/*</value>
   </property>
   <property>
      <name>yarn.nodemanager.vmem-check-enabled</name>
      <value>false</value>
      <description>Whether virtual memory limits will be enforced for containers</description>
   </property>
   <property>
      <name>yarn.nodemanager.vmem-pmem-ratio</name>
      <value>4</value>
      <description>Ratio between virtual memory to physical memory when setting memory limits for containers</description>
   </property>
   <!-- The three properties below are HDFS paths for aggregated logs and
        timeline data; relocated under /data/bd/tmp like everything else. -->
   <property>
      <name>yarn.nodemanager.remote-app-log-dir</name>
      <value>/data/bd/tmp/logs</value>
      <description>The staging dir used while submitting jobs</description>
   </property>
   <property>
      <name>yarn.timeline-service.entity-group-fs-store.active-dir</name>
      <value>/data/bd/tmp/entity-file-history/active</value>
      <description>HDFS path to store active application’s timeline data</description>
   </property>
   <property>
      <name>yarn.timeline-service.entity-group-fs-store.done-dir</name>
      <value>/data/bd/tmp/entity-file-history/done/</value>
      <description>HDFS path to store done application’s timeline data</description>
   </property>
   <!-- Local (not HDFS) directory for container-localized files; this is the
        main NodeManager scratch area that would otherwise default under /tmp. -->
   <property>
      <name>yarn.nodemanager.local-dirs</name>
      <value>/data/bd/tmp/hadoop-ubuntu/nm-local-dir</value>
      <description>List of directories to store localized files</description>
   </property>
</configuration>

hive-site.xml

<?xml version="1.0" encoding="UTF-8"?>
<!-- Added the XML declaration for consistency with the other config files. -->
<configuration>
    <!-- Metastore connection settings (MySQL). -->
    <property>
        <name>javax.jdo.option.ConnectionURL</name>
        <value>jdbc:mysql://localhost/metastore?createDatabaseIfNotExist=true</value>
        <description>metadata is stored in a MySQL server</description>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionDriverName</name>
        <value>com.mysql.jdbc.Driver</value>
        <description>MySQL JDBC driver class</description>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionUserName</name>
        <value>hive</value>
        <description>user name for connecting to mysql server</description>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionPassword</name>
        <value>hive</value>
        <description>password for connecting to mysql server</description>
    </property>
    <!-- Query execution tuning. -->
    <property>
        <name>hive.exec.parallel</name>
        <value>true</value>
        <description>Whether to execute jobs in parallel</description>
    </property>
    <property>
        <name>hive.exec.parallel.thread.number</name>
        <value>8</value>
        <description>How many jobs at most can be executed in parallel</description>
    </property>
    <property>
        <name>hive.cbo.enable</name>
        <value>true</value>
        <description>Flag to control enabling Cost Based Optimizations using Calcite framework.</description>
    </property>
    <property>
        <name>hive.compute.query.using.stats</name>
        <value>true</value>
        <description>
      When set to true Hive will answer a few queries like count(1) purely using stats
      stored in metastore. For basic stats collection turn on the config hive.stats.autogather to true.
      For more advanced stats collection need to run analyze table queries.
    </description>
    </property>
    <property>
        <name>hive.stats.fetch.partition.stats</name>
        <value>true</value>
        <description>
      Annotation of operator tree with statistics information requires partition level basic
      statistics like number of rows, data size and file size. Partition statistics are fetched from
      metastore. Fetching partition statistics for each needed partition can be expensive when the
      number of partitions is high. This flag can be used to disable fetching of partition statistics
      from metastore. When this flag is disabled, Hive will make calls to filesystem to get file sizes
      and will estimate the number of rows from row schema.
    </description>
    </property>
    <property>
        <name>hive.stats.fetch.column.stats</name>
        <value>true</value>
        <description>
      Annotation of operator tree with statistics information requires column statistics.
      Column statistics are fetched from metastore. Fetching column statistics for each needed column
      can be expensive when the number of columns is high. This flag can be used to disable fetching
      of column statistics from metastore.
    </description>
    </property>
    <property>
        <name>hive.stats.autogather</name>
        <value>true</value>
        <description>A flag to gather statistics automatically during the INSERT OVERWRITE command.</description>
    </property>
    <property>
        <name>hive.stats.dbclass</name>
        <value>fs</value>
        <description>
      Expects one of the pattern in [jdbc(:.*), hbase, counter, custom, fs].
      The storage that stores temporary Hive statistics. In filesystem based statistics collection ('fs'), 
      each task writes statistics it has collected in a file on the filesystem, which will be aggregated 
      after the job has finished. Supported values are fs (filesystem), jdbc:database (where database 
      can be derby, mysql, etc.), hbase, counter, and custom as defined in StatsSetupConst.java.
    </description>
    </property>
    <!-- Temporary/scratch locations relocated from /tmp to /data/bd/tmp. -->
    <property>
        <name>hive.exec.scratchdir</name>
        <value>/data/bd/tmp</value>
        <description>Scratch space for Hive jobs</description>
    </property>
    <property>
        <name>hive.service.metrics.file.location</name>
        <value>/data/bd/tmp/report.json</value>
        <description>For metric class org.apache.hadoop.hive.common.metrics.metrics2.CodahaleMetrics JSON_FILE reporter, the location of local JSON metrics file.  This file will get overwritten at every interval.</description>
    </property>
    <property>
        <name>hive.query.results.cache.directory</name>
        <value>/data/bd/tmp/hive/_resultscache_</value>
        <description>unknown</description>
    </property>
    <property>
        <name>hive.llap.io.allocator.mmap.path</name>
        <value>/data/bd/tmp</value>
        <description>unknown</description>
    </property>
    <property>
        <name>hive.hbase.snapshot.restoredir</name>
        <value>/data/bd/tmp</value>
        <description>unknown</description>
    </property>
    <!-- Fixed accidental double slash in the path (was /data/bd/tmp//workingDirectory). -->
    <property>
        <name>hive.druid.working.directory</name>
        <value>/data/bd/tmp/workingDirectory</value>
        <description>unknown</description>
    </property>
    <property>
        <name>hive.querylog.location</name>
        <value>/data/bd/tmp</value>
        <description>logs hive</description>
    </property>
</configuration>

1 个答案:

答案 0 :(得分:0)

对于hadoop 2.7.1

在$HADOOP_HOME/etc/hadoop/mapred-site.xml中配置mapreduce.cluster.local.dir，它还支持以逗号分隔的、位于不同设备上的目录列表。

https://hadoop.apache.org/docs/r2.7.1/hadoop-mapreduce-client/hadoop-mapreduce-client-core/mapred-default.xml