tika PackageParser不适用于目录

时间:2015-02-02 23:24:44

标签: apache-tika

我正在编写一个类,递归地从zip文件中提取文件,并将它们发送(produce)到Kafka队列以进行进一步处理。我的目的是能够从多层嵌套的zip中提取文件。下面的代码是我为此实现的tika ContainerExtractor。

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.Stack;

import org.apache.commons.lang.StringUtils;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.ContainerExtractor;
import org.apache.tika.extractor.EmbeddedResourceHandler;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.pkg.PackageParser;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * A {@link ContainerExtractor} that recursively walks into nested archives
 * (zip-in-zip, etc.), reporting every leaf file together with the full path of
 * the archives it came from, with the intent of producing each file to a Kafka
 * queue for further processing.
 *
 * <p>Not thread-safe: {@link #parentFileNames} and {@link #extracted} are
 * mutated without synchronization during a single extraction run.
 */
public class UberContainerExtractor implements ContainerExtractor {

  private static final long serialVersionUID = -6636138154366178135L;

  /**
   * The container media types this extractor can recurse into. Populated once
   * from {@link PackageParser} so the set always matches the Tika version on
   * the classpath.
   */
  public final static Set<MediaType> SUPPORTED_TYPES;

  static {
    Set<MediaType> supportedTypes = new HashSet<MediaType>();
    ParseContext context = new ParseContext();
    supportedTypes.addAll(new PackageParser().getSupportedTypes(context));
    SUPPORTED_TYPES = Collections.unmodifiableSet(supportedTypes);
  }

  /**
   * Chain of enclosing archive names for the recursion currently in progress,
   * used to reconstruct each leaf file's original path (outermost first).
   */
  Stack<String> parentFileNames = new Stack<String>();

  /** Auto-detecting parser that delegates archive entries to PackageParser. */
  private final Parser parser;

  /** Detector used to decide whether a stream is a supported container. */
  private final Detector detector;

  /**
   * Number of leaf documents extracted so far from the container and any
   * nested containers.
   */
  int extracted;

  public UberContainerExtractor() {
    this(TikaConfig.getDefaultConfig());
  }

  public UberContainerExtractor(TikaConfig config) {
    this(new DefaultDetector(config.getMimeRepository()));
  }

  public UberContainerExtractor(Detector detector) {
    this.parser = new AutoDetectParser(new PackageParser());
    this.detector = detector;
  }

  @Override
  public boolean isSupported(TikaInputStream input) throws IOException {
    MediaType type = detector.detect(input, new Metadata());
    return SUPPORTED_TYPES.contains(type);
  }

  @Override
  public void extract(TikaInputStream stream, ContainerExtractor recurseExtractor, EmbeddedResourceHandler handler)
      throws IOException, TikaException {
    ParseContext context = new ParseContext();
    // Install RecursiveParser so every embedded document PackageParser finds
    // is routed back through this class for recursion or leaf handling.
    context.set(Parser.class, new RecursiveParser(recurseExtractor, handler));
    try {
      Metadata metadata = new Metadata();
      parser.parse(stream, new DefaultHandler(), metadata, context);
    } catch (SAXException e) {
      throw new TikaException("Unexpected SAX exception", e);
    }
  }

  /**
   * Parser installed into the ParseContext so that each embedded entry is
   * either recursed into (if it is itself a supported container) or treated
   * as a leaf file to be produced.
   */
  private class RecursiveParser extends AbstractParser {

    private static final long serialVersionUID = -7260171956667273262L;

    private final ContainerExtractor extractor;

    private final EmbeddedResourceHandler handler;

    private RecursiveParser(ContainerExtractor extractor, EmbeddedResourceHandler handler) {
      this.extractor = extractor;
      this.handler = handler;
    }

    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
      return parser.getSupportedTypes(context);
    }

    @Override
    public void parse(InputStream stream, ContentHandler ignored, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
      TemporaryResources tmp = new TemporaryResources();
      try {
        TikaInputStream tis = TikaInputStream.get(stream, tmp);

        // Figure out what we have to process
        String filename = metadata.get(Metadata.RESOURCE_NAME_KEY);
        MediaType type = detector.detect(tis, metadata);

        if (extractor != null) {
          if (SUPPORTED_TYPES.contains(type)) {
            // A nested container: record its name and recurse into it.
            System.out.println("encountered a supported file:" + filename);
            parentFileNames.push(filename);
            try {
              extractor.extract(tis, extractor, handler);
            } finally {
              // Always pop, even if extraction throws, so the path stack
              // stays consistent for sibling entries.
              parentFileNames.pop();
            }
          } else {
            // A leaf file: report it with its full path inside the nested
            // archives, e.g. "outer.zip/inner.zip/file.txt".
            List<String> parentFilenamesList = new ArrayList<String>(parentFileNames);
            parentFilenamesList.add(filename);
            String originalFilepath = StringUtils.join(parentFilenamesList, "/");
            System.out.println("producing " + filename + " with originalFilepath:" + originalFilepath
                + " to kafka queue");
            ++extracted;
          }
        }
      } finally {
        tmp.dispose();
      }
    }
  }

  /** @return the number of leaf documents recursively extracted so far */
  public int getExtracted() {
    return extracted;
  }

  public static void main(String[] args) throws IOException, TikaException {
    String filename = "/Users/rohit/Data/cd.zip";
    File file = new File(filename);
    TikaInputStream stream = TikaInputStream.get(file);

    ContainerExtractor recursiveExtractor = new UberContainerExtractor();

    EmbeddedResourceHandler resourceHandler = new EmbeddedResourceHandler() {
      @Override
      public void handle(String filename, MediaType mediaType, InputStream stream) {
        // Leaf files are reported inside RecursiveParser.parse; nothing to
        // do with the raw stream here.
      }
    };

    try {
      recursiveExtractor.extract(stream, recursiveExtractor, resourceHandler);
    } finally {
      // Close the stream even when extraction throws.
      stream.close();
    }

    System.out.println("extracted " + ((UberContainerExtractor) recursiveExtractor).getExtracted() + " files");
  }
}

只要zip内的文件是扁平结构,它就可以在多级zip上工作。例如 cd.zip    - c.txt    - d.txt

如果zip中的文件位于子目录中,代码就不起作用。例如 ab.zip    - ab/      - a.txt      - b.txt

在调试时,我遇到了PackageParser中的以下代码片段

try {
  ArchiveEntry entry = ais.getNextEntry();
  while (entry != null) {
    if (!entry.isDirectory()) {
        parseEntry(ais, entry, extractor, xhtml);
    }
    entry = ais.getNextEntry();
  }
} finally {
  ais.close();
}

我试着把这个if条件注释掉,但是没有起作用。加这个判断有什么原因吗?有没有办法解决这个问题?

我正在使用tika版本1.6

1 个答案:

答案 0 :(得分:1)

以相反的顺序处理你的问题:

  

加这个(跳过目录的)判断有什么原因吗?

zip文件中的条目是目录或文件。如果是文件,则它们包含它们来自的目录的名称。因此,Tika不需要对目录做任何事情,它需要做的就是在它们出现时处理嵌入的文件

  

如果zip中的文件存在于目录中,则代码不起作用。对于前ab.zip - ab / - a.txt - b.txt

你的做法似乎有问题。Tika的递归和包解析器可以正常处理带有文件夹的zip文件!

为了证明这一点,请从这样的zip文件开始:

$ unzip -l ../tt.zip 
Archive:  ../tt.zip
  Length      Date    Time    Name
---------  ---------- -----   ----
        0  2015-02-03 16:42   t/
        0  2015-02-03 16:42   t/t2/
        0  2015-02-03 16:42   t/t2/t3/
   164404  2015-02-03 16:42   t/t2/t3/test.jpg
---------                     -------
   164404                     4 files

现在,让我们来看看Tika App的-z提取标志,这会导致Tika提取出文件的所有嵌入内容。像这样跑,我们得到

$ java -jar tika-app-1.7.jar -z ../tt.zip 
Extracting 't/t2/t3/test.jpg' (image/jpeg) to ./t/t2/t3/test.jpg

然后列出生成的目录,我们看到

$ find . -type f
./t/t2/t3/test.jpg

我一眼看不出你的代码具体哪里出了问题,但遗憾的是,正如上面所演示的,问题出在你的代码而不是Tika……你最好回顾一下Tika自带的各种递归示例,例如Tika App工具和RecursiveParserWrapper,然后以这些代码为基础,把你的代码重写得更简单一些。