TSV文件转换为RDF

时间:2019-06-29 10:38:32

标签: java jena

我想在Java代码中使用Apache Jena库将TSV文件转换为RDF文件。我找到了一个将CSV转换为RDF的示例,但它对我帮助不大。链接为:http://www.essi.upc.edu/dtim/blog/post/enter-the-world-of-semantics-using-jena-to-convert-your-data-to-rdf

能给我个主意吗?非常感谢!我应该如何更改给定的代码?

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.util.Scanner;

import org.apache.commons.io.FileUtils;
import org.apache.jena.graph.Node;
import org.apache.jena.graph.NodeFactory;
import org.apache.jena.graph.Triple;
import org.apache.jena.propertytable.graph.GraphCSV;
import org.apache.jena.propertytable.lang.CSV2RDF;
import org.apache.jena.query.Query;
import org.apache.jena.query.QueryExecution;
import org.apache.jena.query.QueryExecutionFactory;
import org.apache.jena.query.QueryFactory;
import org.apache.jena.query.QuerySolution;
import org.apache.jena.query.ResultSet;
import org.apache.jena.rdf.model.Model;
import org.apache.jena.rdf.model.ModelFactory;
import org.apache.jena.rdf.model.RDFNode;
import org.apache.jena.rdf.model.ResourceFactory;
import org.apache.jena.rdf.model.Statement;
import org.apache.jena.util.FileManager;
import org.apache.jena.vocabulary.RDF;

public static void convertCSVToRDF (String file, 
String inputFilename, String outputFilename,String outputType) {

  //Just a few lines below to convert the data from CSV to an RDF graph, 
  see how easy?!

          CSV2RDF.init();//Initialise the CSV conversion engine in Jena

          GraphCSV newGraph = new GraphCSV(inputFilename);

          Model model = ModelFactory.createModelForGraph(newGraph);



          //Manually insert class triples for each instance in the CSV file

          String sparqlQueryString = "select distinct ?s where  {?s ?p ?o}";

          Query query = QueryFactory.create(sparqlQueryString);

          QueryExecution qexec = QueryExecutionFactory.create(sparqlQueryString, model);

          ResultSet s = qexec.execSelect();

          Model m2 = ModelFactory.createDefaultModel();

          while(s.hasNext()) {

                 QuerySolution so = s.nextSolution();

                 Triple t = new Triple(so.getResource("s").asNode(),RDF.type.asNode(),

                              NodeFactory.createBlankNode(file));

                 Statement stmt = ResourceFactory.createStatement(so.getResource("s"), RDF.type, 

                                     ResourceFactory.createResource(file));

                 m2.add(stmt);

          }

          Model m3 = ModelFactory.createUnion(model, m2); //create a new RDF graph which "unions"

                                                          //the old graph with the new graph containing

                                                          //the new rows



  //Now serialize the RDF graph to an output file using the outputType input variable  

 you specify. It should be “N-Triple” in our case.

          try {

                 FileWriter out = new FileWriter(outputFilename);

                 m3.write(out,outputType);

          } catch (Exception e) {

                 System.out.println("Error in the file output process!");

                 e.printStackTrace();

          }



          //Delete specific triples of a specific predicate called ¨row¨                 

          File output = new File(outputFilename);

          File tempFile = new File("C:/Users/user1/SampleFile/temp.nt");

          BufferedReader reader = null;

          BufferedWriter writer = null;

          try {

                 reader = new BufferedReader(new FileReader(output));

                 writer = new BufferedWriter(new FileWriter(tempFile));

                 String currentLine;

                 //Delete triples from the old file by skipping it while reading the input N-Triple

                 file from the last step, otherwise write the triple to a new temp file!

                 while ((currentLine = reader.readLine()) != null) {

                       if (currentLine.contains("http://w3c/future-csv-vocab/row")) {

                              continue;

                       } else {

                              writer.write(currentLine);

                              writer.newLine();

                       }

                 }

                 writer.close();

                 reader.close();



                 PrintWriter printer = new PrintWriter(output);

                 printer.print("");

                 printer.close();



                 //copy content from temp file to final output file, overwriting it.

                 FileUtils.copyFile(tempFile, output);

          } catch (FileNotFoundException e1) {

                 // TODO Auto-generated catch block

                 e1.printStackTrace();

          } catch (IOException e) {

                 // TODO Auto-generated catch block

                 e.printStackTrace();

          } 

   }

1 个答案:

答案 0 :(得分:0)

Jena中没有将TSV转换为RDF的内置方法,因为TSV(和CSV一样)并不是一种RDF格式,而只是以表格形式表示的任意数据。或许正因如此,CSV支持已从Jena中移除(jena-csv的最后一个版本是3.9.0)。

不过TSV(tab-separated values,制表符分隔值)是一种非常简单的格式,将TSV数据转换为RDF是一项很简单的任务(大约10分钟的编码)。您可以按任何想要的方式处理TSV,例如,可以把整个TSV作为一个巨大的字面量(literal)添加到模型中。

下面的代码展示了一种比较合理的做法:把每一行表示为一个OWL个体(individual),并为其各列添加数据属性断言。

    String tsv = "Sepal length\tSepal width\tPetal length\tPetal width\tSpecies\n" +
            "5.1\t3.5\t1.4\t0.2\tI. setosa\n" +
            "4.9\t3.0\t1.4\t0.2\tI. setosa";

    Charset ch = StandardCharsets.UTF_8;
    String separator = "\t";
    String ns = "http://ex#";
    UnaryOperator<String> nameToURI = s -> ns + s.toLowerCase().replace(" ", "_");

    Model m = ModelFactory.createDefaultModel()
            .setNsPrefixes(PrefixMapping.Standard)
            .setNsPrefix("ex", ns);
    Resource clazz = m.createResource(ns + "MyClass", OWL.Class);

    try (InputStream is = new ByteArrayInputStream(tsv.getBytes(ch));
         Reader r = new InputStreamReader(is, ch);
         BufferedReader br = new BufferedReader(r)) {
        String first = br.lines().findFirst().orElseThrow(IllegalArgumentException::new);
        List<Property> props = Arrays.stream(first.split(separator))
                .map(s -> m.createResource(nameToURI.apply(s), OWL.DatatypeProperty)
                        .addProperty(RDFS.label, s).as(Property.class))
                .collect(Collectors.toList());
        br.lines().forEach(line -> {
            String[] data = line.split(separator);
            if (data.length != props.size()) throw new IllegalArgumentException();
            Resource individual = m.createResource(clazz);
            for (int i = 0; i < data.length; i++) {
                individual.addProperty(props.get(i), data[i]);
            }
        });
    }
    m.write(System.out, "ttl");

输出:

@prefix ex:    <http://ex#> .
@prefix owl:   <http://www.w3.org/2002/07/owl#> .
@prefix rdf:   <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix xsd:   <http://www.w3.org/2001/XMLSchema#> .
@prefix rdfs:  <http://www.w3.org/2000/01/rdf-schema#> .
@prefix dc:    <http://purl.org/dc/elements/1.1/> .

ex:MyClass  a   owl:Class .

ex:sepal_width  a   owl:DatatypeProperty ;
        rdfs:label  "Sepal width" .

ex:species  a       owl:DatatypeProperty ;
        rdfs:label  "Species" .

ex:sepal_length  a  owl:DatatypeProperty ;
        rdfs:label  "Sepal length" .

ex:petal_length  a  owl:DatatypeProperty ;
        rdfs:label  "Petal length" .

ex:petal_width  a   owl:DatatypeProperty ;
        rdfs:label  "Petal width" .

[ a                ex:MyClass ;
  ex:petal_length  "1.4" ;
  ex:petal_width   "0.2" ;
  ex:sepal_length  "5.1" ;
  ex:sepal_width   "3.5" ;
  ex:species       "I. setosa"
] .

[ a                ex:MyClass ;
  ex:petal_length  "1.4" ;
  ex:petal_width   "0.2" ;
  ex:sepal_length  "4.9" ;
  ex:sepal_width   "3.0" ;
  ex:species       "I. setosa"
] .