Apache Hbase MapReduce作业在读取数据存储时花费太多时间

时间:2019-02-27 04:19:25

标签: java hadoop mapreduce hbase nutch

我已经搭建了Apache HBase、Nutch和Hadoop集群,并且已经抓取了大量文档,约3000万个。集群中有3个工作节点(worker)和1个主节点(master)。我编写了自己的HBase MapReduce作业,用于读取已抓取的数据,并根据某些逻辑对分数进行修改。

为此,我把相同域的文档合并在一起,统计它们的有效字节数,并据此计算出一个分数。随后,在reducer中,我把该分数分配给该域的每个URL(这些URL保存在缓存中)。这部分工作非常耗时,大约需要16个小时。以下是代码片段:

 // Walk every cached row key of the current domain and push the domain
 // score back onto the stored page.
 for (String origKey : Cache) {
     float domainScore = log10;

     // Re-read the stored row for this key; keys with no stored page are skipped.
     WebPage stored = datastore.get(origKey);
     if (stored != null) {
         stored.setScore(domainScore);

         // Only domains flagged by the caller receive the Q1 queue marker.
         if (mark) {
             stored.getMarkers().put(Queue, Q1);
         }
         context.write(origKey, stored);
     }
 }

如果我去掉从数据存储读取文档的那条语句,作业只需2到3个小时就能完成。因此,我认为是语句WebPage page = datastore.get(Orig_key);导致了这个问题。是这样吗?如果是这种情况,那么最好的解决方法是什么?Cache对象只是一个包含同一域下各URL的列表。

DomainAnalysisJob.java ...     ...

public class DomainAnalysisJob implements Tool {

  public static final Logger LOG = LoggerFactory
      .getLogger(DomainAnalysisJob.class);
  private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();

  private Configuration conf;
  protected static final Utf8 URL_ORIG_KEY = new Utf8("doc_orig_id");
  protected static final Utf8 DOC_DUMMY_MARKER = new Utf8("doc_marker");
  protected static final Utf8 DUMMY_KEY = new Utf8("doc_id");
  protected static final Utf8 DOMAIN_DUMMY_MARKER = new Utf8("domain_marker");
  protected static final Utf8 LINK_MARKER = new Utf8("link");
  protected static final Utf8 Queue = new Utf8("q");

  private static URLNormalizers urlNormalizers;
  private static URLFilters filters;
  private static int maxURL_Length;

  static {
    FIELDS.add(WebPage.Field.STATUS);
    FIELDS.add(WebPage.Field.LANG_INFO);
    FIELDS.add(WebPage.Field.URDU_SCORE);
    FIELDS.add(WebPage.Field.MARKERS);
    FIELDS.add(WebPage.Field.INLINKS);
  }

  /**
   * Maps each WebPage to a host key.
   */
  public static class Mapper extends GoraMapper<String, WebPage, Text, WebPage> {

      @Override
        protected void setup(Context context) throws IOException ,InterruptedException {
          Configuration conf = context.getConfiguration();
          urlNormalizers = new URLNormalizers(context.getConfiguration(), URLNormalizers.SCOPE_DEFAULT);
          filters = new URLFilters(context.getConfiguration());
          maxURL_Length = conf.getInt("url.characters.max.length", 2000);
        }

    @Override
    protected void map(String key, WebPage page, Context context)
        throws IOException, InterruptedException {

     String reversedHost = null;
     if (page == null) {
         return;
     }
    if ( key.length() > maxURL_Length ) {
        return;
    }
     String url = null;
     try {
         url = TableUtil.unreverseUrl(key);
         url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
         url = filters.filter(url); // filter the url
       } catch (Exception e) {
         LOG.warn("Skipping " + key + ":" + e);
         return;
       }
     if ( url == null) {
         context.getCounter("DomainAnalysis", "FilteredURL").increment(1);
         return;
     }
     try {
         reversedHost = TableUtil.getReversedHost(key.toString());
     } 
     catch (Exception e) {
        return;
    }
     page.getMarkers().put( URL_ORIG_KEY, new Utf8(key) );

     context.write( new Text(reversedHost), page );

    }
  }

  public DomainAnalysisJob() {
  }

  public DomainAnalysisJob(Configuration conf) {
    setConf(conf);
  }

  @Override
  public Configuration getConf() {
    return conf;
  }

  @Override
  public void setConf(Configuration conf) {
    this.conf = conf;
   }

  public void updateDomains(boolean buildLinkDb, int numTasks) throws Exception {


    NutchJob job = NutchJob.getInstance(getConf(), "rankDomain-update");

    job.getConfiguration().setInt("mapreduce.task.timeout", 1800000);

    if ( numTasks < 1) {
        job.setNumReduceTasks(job.getConfiguration().getInt(
            "mapred.map.tasks", job.getNumReduceTasks()));
      } else {
        job.setNumReduceTasks(numTasks);
      }
    ScoringFilters scoringFilters = new ScoringFilters(getConf());
    HashSet<WebPage.Field> fields = new HashSet<WebPage.Field>(FIELDS);
    fields.addAll(scoringFilters.getFields());

    StorageUtils.initMapperJob(job, fields, Text.class, WebPage.class,
            Mapper.class);
    StorageUtils.initReducerJob(job, DomainAnalysisReducer.class);


    job.waitForCompletion(true);
  }

  @Override
  public int run(String[] args) throws Exception {
    boolean linkDb = false;
    int numTasks = -1;
    for (int i = 0; i < args.length; i++) {
      if ("-rankDomain".equals(args[i])) {
        linkDb = true;
      } else if ("-crawlId".equals(args[i])) {
        getConf().set(Nutch.CRAWL_ID_KEY, args[++i]);
      } else if ("-numTasks".equals(args[i]) ) {
          numTasks = Integer.parseInt(args[++i]);
      }
      else {
        throw new IllegalArgumentException("unrecognized arg " + args[i]
            + " usage: updatedomain -crawlId <crawlId> [-numTasks N]" );
      }
    }
    LOG.info("Updating DomainRank:");
    updateDomains(linkDb, numTasks);
    return 0;
  }

  public static void main(String[] args) throws Exception {
    final int res = ToolRunner.run(NutchConfiguration.create(),
        new DomainAnalysisJob(), args);
    System.exit(res);
  }
}

DomainAnalysisReducer.java

...
...
public class DomainAnalysisReducer extends
    GoraReducer<Text, WebPage, String, WebPage> {

    public static final Logger LOG = DomainAnalysisJob.LOG;
    public DataStore<String, WebPage> datastore;

    protected static float q1_ur_threshold = 500.0f;
    protected static float q1_ur_docCount = 50;
    public static final Utf8 Queue = new Utf8("q");     // Markers for Q1 and Q2
    public static final Utf8 Q1 = new Utf8("q1");           
    public static final Utf8 Q2 = new Utf8("q2");

      @Override
      protected void setup(Context context) throws IOException,
      InterruptedException {
        Configuration conf = context.getConfiguration();
        try {
          datastore = StorageUtils.createWebStore(conf, String.class, WebPage.class);
        }
        catch (ClassNotFoundException e) {
          throw new IOException(e);
        }
        q1_ur_threshold = conf.getFloat("domain.queue.threshold.bytes", 500.0f);
        q1_ur_docCount = conf.getInt("domain.queue.doc.count", 50);
        LOG.info("Conf updated: Queue-bytes-threshold = " + q1_ur_threshold + " Queue-doc-threshold: " + q1_ur_docCount);
      }

      @Override
      protected void cleanup(Context context) throws IOException, InterruptedException {
        datastore.close();
      }

  @Override
  protected void reduce(Text key, Iterable<WebPage> values, Context context)
      throws IOException, InterruptedException {

      ArrayList<String> Cache = new ArrayList<String>();

      int doc_counter = 0;
      int total_ur_bytes = 0;

    for ( WebPage page : values ) {

        // cache
        String orig_key = page.getMarkers().get( DomainAnalysisJob.URL_ORIG_KEY ).toString();
        Cache.add(orig_key);

        // do not consider those doc's that are not fetched or link URLs
        if ( page.getStatus() == CrawlStatus.STATUS_UNFETCHED ) {
         continue;
        }

        doc_counter++;
        int ur_score_int = 0;
        int doc_ur_bytes = 0;
        int doc_total_bytes = 0;
        String ur_score_str = "0";
        String langInfo_str = null;

        // read page and find its Urdu score
        langInfo_str = TableUtil.toString(page.getLangInfo());      
        if (langInfo_str == null) {
            continue;
        }
        ur_score_str = TableUtil.toString(page.getUrduScore());
        ur_score_int = Integer.parseInt(ur_score_str);
        doc_total_bytes = Integer.parseInt( langInfo_str.split("&")[0] );
        doc_ur_bytes = ( doc_total_bytes * ur_score_int) / 100;             //Formula to find ur percentage

        total_ur_bytes += doc_ur_bytes;     

    }
    float avg_bytes = 0;
    float log10 = 0;
    if ( doc_counter > 0 && total_ur_bytes > 0) {
        avg_bytes = (float) total_ur_bytes/doc_counter;
         log10 = (float) Math.log10(avg_bytes);
         log10 = (Math.round(log10 * 100000f)/100000f);
    }

    context.getCounter("DomainAnalysis", "DomainCount").increment(1);
    // if average bytes and doc count, are more than threshold then mark as q1
    boolean mark = false;
    if ( avg_bytes >= q1_ur_threshold && doc_counter >= q1_ur_docCount ) {
        mark = true; 

    for ( int index = 0; index < Cache.size(); index++) {

        String Orig_key = Cache.get(index);
        float doc_score = log10;

        WebPage page = datastore.get(Orig_key);
        if ( page == null ) {
            continue;
        }
        page.setScore(doc_score);

        if (mark) {
            page.getMarkers().put( Queue, Q1);
        }
        context.write(Orig_key, page);
    }
  }
}

在测试和调试中,我发现语句WebPage page = datastore.get(Orig_key);是造成时间过多的主要原因。完成这项工作大约需要16个小时,但是当我用WebPage page = WebPage.newBuilder().build();替换此语句时,时间减少到了6个小时。这是由于IO吗?

0 个答案:

没有答案