将JSOUP转换为CSV格式

时间:2017-11-29 11:34:05

标签: java csv jsoup export-to-csv

我有一系列用Jsoup抓取到的结果,我想将它们转换为csv格式,以便可以将它们与我的rdf数据关联起来。有什么方法可以将Jsoup的抓取结果导出为csv吗?

这是我的Jsoup代码:

package jena;

import java.util.regex.*; 
import java.io.FileWriter;
import java.io.IOException;
import java.net.URL;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.jena.query.*;
import org.apache.jena.rdf.model.Resource;
import org.apache.jena.rdfconnection.RDFConnection;
import org.apache.jena.system.Txn;
import org.apache.jena.query.ResultSet;
import org.apache.jena.rdfconnection.RDFConnectionFactory;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.ArrayList;
import java.util.Scanner;  
import java.util.StringTokenizer;

import javax.xml.soap.Text;

import com.hp.hpl.jena.ontology.OntModel;
import com.hp.hpl.jena.ontology.OntModelSpec;
import com.hp.hpl.jena.query.ResultSetFactory;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.sparql.util.Base64.InputStream;
import com.hp.hpl.jena.util.FileManager;

public class crawlertest {
    public static final String SOURCE_URL="http://www.semanticweb.org/shahrukh/ontologies/2017/4/university.owl";
    protected static final String SOURCE_FILE ="e:/university work/fyp/sdd folder/my sdd/university.owl";
    public static final String NS = SOURCE_URL + "#";
    public static void main(String[] args) throws IOException {

        //final String str = "<td>Jawad</tag><b>Mirza</b><tag>orange</tag><tag>pear</tag>";
        new crawlertest().getCrawlerData("");

        }

    public String getCrawlerData(String t) throws IOException
    {
        String t2 = "";
        try{
            OntModel m = ModelFactory.createOntologyModel( OntModelSpec.OWL_MEM );
            loadModel(m);
        //java.io.InputStream is = new URL("http://ww3.comsats.edu.pk/Research/LatestResearchPublications.aspx").openStream();
        //String encoding = "UTF-8";
        //Document document = Jsoup.parse(is , encoding ,"http://ww3.comsats.edu.pk/Research/LatestResearchPublications.aspx" );
        org.jsoup.nodes.Document doc = Jsoup.connect("http://ww3.comsats.edu.pk/Research/LatestResearchPublications.aspx").get();
        org.jsoup.nodes.Document doc2 = Jsoup.connect("http://ww3.comsats.edu.pk/faculty/ResearchPaperDetail.aspx?pid=13134").get();
        org.jsoup.nodes.Document doc3 = Jsoup.connect("https://scholar.google.com.pk/scholar?hl=en&as_sdt=0,5&qsp=3&q=final+year+project+computer+science").get();
        org.jsoup.nodes.Document doc4 = Jsoup.connect("https://www.um.edu.mt/ict/cs/undergraduate/?a=294793").get();
        org.jsoup.nodes.Document doc5 = Jsoup.connect("https://minerva.leeds.ac.uk/bbcswebdav/orgs/SCH_Computing/FYProj/previous-titles/bsc2012.html").get();
        //System.out.println(document.toString());
        String title = doc.title();
        String title2 = doc2.title();
        String title3 = doc3.title();
        String title4 = doc4.title();
        String title5 = doc5.title();

        System.out.println("title : "+title);
        System.out.println("title : "+title2);
        System.out.println("title : "+title3);
        System.out.println("title : "+title4);
        System.out.println("title : "+title5);



        {

        Elements text = doc.select("table");
        for (Element texts: text)
        {

            final String html=texts.text().toString();
             t2=texts.text();
             System.out.println( "The result is : "+t);
             //for (String s : html.split("Publisher :"))
              //System.out.println(s);
            //String html="My name is Shahrukh Nasir and I'm not a Publisher. You have to understand this";
            //final String REGEX="^.\\s[^Publisher]+.[^Publication]";
            final String REGEX="(?!Publisher)([a-zA-Z0-9\\s] +)(.+?)[a-zA-Z0-9\\s]+(.+?)[a-zA-Z0-9//s]+";
            //final String REGEX="<td>(.+?)</td>";

            //final String REGEX = "(:)(\\w+)\\b(?![.])";
            //final String REGEX="\b (?! \bPublisher \b)\w+ \b";
             //final String REGEX="^/[a-z0-9]+$";

            //final String REGEX2="[Publisher:].[^Publication]";
            /*final Elements elements = Jsoup.parse(html).getElementsMatchingOwnText("^//w{5,}$");
            for (final Element element : elements) {
                System.out.println("element = [" + element + "]");
                System.out.println("zip = [" + element.text() + "]");
            }*/
            //System.out.println(Pattern.matches("[A-Za-z0-9]*",texts.text()));
              Pattern p = Pattern.compile(REGEX);
              Matcher ma = p.matcher(html);
              Matcher m2=p.matcher(html);
              int count = 0;
              if(ma.find()) {
                  count++;
                  System.out.println("Match number "
                                     + count);
                  System.out.println("TEXT1: " +html);
                  //System.out.println("TEXT1: " +m.group());
                  //System.out.println("TEXT1: " +p);
                  System.out.println("Found value: " + ma.group() );
                  //System.out.println("Found value: " + m.group(0) );
                //  System.out.println("Found value: " + m.group(1) );
                  //System.out.println("Found value: " + m.group(2) );
             }
              /*while(m2.find()) {
                  count++;
                  System.out.println("Match number "
                                     + count);
                  System.out.println("TEXT1: " +texts.text().toString());
                  //System.out.println("TEXT1: " +m.group());
                  //System.out.println("TEXT1: " +p);
                  System.out.println("Found value: " + m2.group() );
                  //System.out.println("Found value: " + m.group(0) );s
                  //System.out.println("Found value: " + m.group(1) );
                  //System.out.println("Found value: " + m.group(2) );
             }*/
             //System.out.println("TEXT1: " +texts.text());
            //ResultSetFormatter.outputAsCSV(System.out,texts.text());
            //ResultSetFormatter.out(System.out,a);
        }
        }

        /*Elements links = doc.select("a[href]");
        for (Element link : links)
            //for (Element texts: text)
            {
                System.out.println("Link: " + link.attr("href"));
                //System.out.println("TEXT1: " +texts.text());
                System.out.println("TEXT1: " +link.text());
            }*/


        //Elements links2 = doc2.select("a[href]");
        {

        Elements text2 = doc2.select("td");
        Element bodyStart = text2.first();
        Element bodyEnd = text2.last(); 
        Element p = bodyStart;
        int divCount = 0; 
        while(p != bodyEnd)
        {
            p = text2.get(divCount);
            System.out.println(p.text());        
            divCount++;
        }
        for (Element texts2: text2)
        {
            System.out.println("TEXT2: " +texts2.text());



            for (Element e : text2) {
                elements.add(e);
                System.out.println("The values stored in variable are : " +e);

            }

            Element[] elementArr = elements.toArray(new Element[]{});*/
            //System.out.println("The values stored in variable are : " +elementArr);

        }
        final Pattern pattern = Pattern.compile("<td>(.+?)</td>");
        final Matcher matcher = pattern.matcher("<td>Gan </td>");
        matcher.find();
        String first_name= matcher.group(1);
        //System.out.println("First Name :" +first_name);
        System.out.println("First Name :" +matcher.group(1));

        final Matcher matcher2 = pattern.matcher("<td>Zheng </td>");
        matcher2.find();
        System.out.println("Last Name :" +matcher2.group(1));


        Elements text3 = doc3.select("div");
        for (Element texts3: text3)
        {
            System.out.println("TEXT3: " +texts3.text());
        }
        Elements text4 = doc4.select("td");

        for (Element texts4: text4)
        {
            System.out.println("TEXT4: " +texts4.text());
        }
        Elements text5 = doc5.select("td");
        for (Element texts5: text5)
        {
            System.out.println("TEXT5: " +texts5.text());

        } 
        String ti=doc.text();
          ArrayList<String> myarray = new ArrayList<>();
          StringTokenizer str1 = new StringTokenizer(ti);
          while(str1.hasMoreTokens())
          {
              String token1 = str1.nextToken();
              System.out.println(token1);
             // String out=token1;
              myarray.add(token1);
          }
          System.out.println(myarray);
          //m.write(new FileWriter("rdf80.rdf"), "RDF/XML");
          return t2;

        }

        } catch (IOException ex)
        {
            Logger.getLogger(crawlertest.class.getName()).log(Level.SEVERE,null,ex);
        }
        return t2;


}

    private void loadModel(OntModel m) {
        FileManager.get().getLocationMapper().addAltEntry( SOURCE_URL, SOURCE_FILE );
        Model baseOntology = FileManager.get().loadModel( SOURCE_URL );
        m.addSubModel( baseOntology );

        // for compactness, add a prefix declaration st: (for Sam Thomas)
        m.setNsPrefix( "st", NS );
        // TODO Auto-generated method stub

    }

    }

这段代码是rdf和Jsoup的混合,前几行都是关于rdf的。

0 个答案:

没有答案