How to read the content data Nutch generates in the segment folder using Java

Date: 2011-09-21 21:28:36

Tags: nutch

I am trying to read the content data in the segment folder. I believe the content data files are written in a custom format.

I tried using Nutch's Content class, but it does not recognize the format.

3 Answers:

Answer 0 (score: 5):
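Nutch stores the content data as Hadoop map files (a data file plus an index), and the data file is an ordinary sequence file of Text keys (the URLs) and org.apache.nutch.protocol.Content values, so a part can be read directly with SequenceFile.Reader: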

import java.io.IOException;

import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NutchConfiguration;

public class ContentReader {
    public static void main(String[] args) throws IOException {
        // Set up the Nutch configuration and parse generic Hadoop options
        Configuration conf = NutchConfiguration.create();
        Options opts = new Options();
        GenericOptionsParser parser = new GenericOptionsParser(conf, opts, args);
        String[] remainingArgs = parser.getRemainingArgs();
        FileSystem fs = FileSystem.get(conf);
        String segment = remainingArgs[0];
        // The fetched content lives in <segment>/content/part-00000/data
        Path file = new Path(segment, Content.DIR_NAME + "/part-00000/data");
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, conf);
        try {
            Text key = new Text();           // the record key is the URL
            Content content = new Content(); // the record value is the fetched content
            // Loop through the records in the sequence file
            while (reader.next(key, content)) {
                byte[] raw = content.getContent();
                System.out.write(raw, 0, raw.length);
            }
            System.out.flush();
        } finally {
            reader.close();
        }
    }
}
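Note that a segment written with more than one reducer has several part-NNNNN directories, and the reader above only sees part-00000. Below is a minimal sketch of a helper that globs over every part; the name dumpAllParts is hypothetical, and it uses the same imports as the class above plus org.apache.hadoop.fs.FileStatus.

import org.apache.hadoop.fs.FileStatus;

// Hypothetical helper for the class above: print the key (URL) of every
// record across all part-NNNNN directories of a segment.
static void dumpAllParts(FileSystem fs, Configuration conf, String segment) throws IOException {
    Path glob = new Path(segment, Content.DIR_NAME + "/part-*/data");
    FileStatus[] parts = fs.globStatus(glob);
    if (parts == null) return; // no matching part files
    for (FileStatus part : parts) {
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, part.getPath(), conf);
        try {
            Text key = new Text();
            Content content = new Content();
            while (reader.next(key, content)) {
                System.out.println(key); // each key is the fetched URL
            }
        } finally {
            reader.close();
        }
    }
}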

Answer 1 (score: 0):

org.apache.nutch.segment.SegmentReader has a MapReduce implementation that reads content data from a segment directory.
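The same class also backs Nutch's readseg command-line tool, so a segment can be dumped without writing any code; for example (the segment path is illustrative):

bin/nutch readseg -dump crawl/segments/20110921212836 dump_output

Flags such as -nocontent, -nofetch or -noparsetext can be passed with -dump to limit which parts of the segment are included.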

Answer 2 (score: 0):

Spark/Scala code to read data from the segment content folder.

This is how I read the content from the content folder in my project.

I created a case class Page that holds the data read from the content folder:

case class Page(var url: String,
                var title: String = null,
                var contentType: String = null,
                var rawHtml: String = null,
                var language: String = null,
                var metadata: Map[String, String] = Map.empty)

The code to read from the content folder:

import java.util.regex.Pattern
import org.apache.commons.lang3.StringUtils
import org.apache.hadoop.io.{Text, Writable}
import org.apache.nutch.protocol.Content
import org.slf4j.LoggerFactory
import scala.util.Try

val LOG = LoggerFactory.getLogger(getClass)
// Assumed definition: charsetPattern was not defined in the original post
val charsetPattern = Pattern.compile("charset=\\s*([^;\\s]+)", Pattern.CASE_INSENSITIVE)

// path.contentLocation points at the segment's content data, e.g. <segment>/content/part-*/data
val contentRDD = spark.sparkContext.sequenceFile(path.contentLocation, classOf[Text], classOf[Writable])
                 .map { case (url, value) => (url.toString, extract(value.asInstanceOf[Content])) }

/** Converts a Content record to a Page. */
def extract(content: Content): Page = {
  try {
    val parsed = Page(content.getUrl)
    var charset = getCharsetFromContentType(content.getContentType)
    if (StringUtils.isBlank(charset)) charset = "UTF-8"
    parsed.rawHtml = Try(new String(content.getContent, charset))
      .getOrElse(new String(content.getContent, "UTF-8"))
    // getMetadata.get returns null rather than throwing when a header is absent,
    // so Option (not Try) is the right guard here
    parsed.contentType = Option(content.getMetadata.get("Content-Type")).getOrElse("text/html")
    // parsed.isHomePage = Boolean.valueOf(content.getMetadata.get("isHomePage"))
    parsed.metadata = content.getMetadata.names()
      .map(name => (name, content.getMetadata.get(name))).toMap
    if (StringUtils.isNotBlank(content.getMetadata.get("Content-Language")))
      parsed.language = content.getMetadata.get("Content-Language")
    else if (StringUtils.isNotBlank(content.getMetadata.get("language")))
      parsed.language = content.getMetadata.get("language")
    else
      parsed.language = content.getMetadata.get("lang")
    parsed
  } catch {
    case e: Exception =>
      LOG.error("ERROR while extracting data from Content", e)
      null
  }
}
/** Extracts the charset parameter from a Content-Type header value, defaulting to UTF-8. */
def getCharsetFromContentType(contentType: String): String = {
  var result = "UTF-8"
  Try {
    if (StringUtils.isNotBlank(contentType)) {
      val m = charsetPattern.matcher(contentType)
      if (m.find) result = m.group(1).trim.toUpperCase
    }
  }
  result
}
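A quick usage sketch, assuming the spark session and the contentRDD defined above:

// Print the URL, content type and language of the first few records
contentRDD.take(5).foreach { case (url, page) =>
  if (page != null) println(s"$url -> ${page.contentType} (${page.language})")
}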