我有一系列用Jsoup抓取得到的结果,我想将它们转换为CSV格式,以便与我的RDF数据连接起来。有什么方法可以把Jsoup提取的内容导出为CSV吗?
这是我的Jsoup代码:
package jena;
import java.util.regex.*;
import java.io.FileWriter;
import java.io.IOException;
import java.net.URL;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.jena.query.*;
import org.apache.jena.rdf.model.Resource;
import org.apache.jena.rdfconnection.RDFConnection;
import org.apache.jena.system.Txn;
import org.apache.jena.query.ResultSet;
import org.apache.jena.rdfconnection.RDFConnectionFactory;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.Scanner;
import java.util.StringTokenizer;
import javax.xml.soap.Text;
import com.hp.hpl.jena.ontology.OntModel;
import com.hp.hpl.jena.ontology.OntModelSpec;
import com.hp.hpl.jena.query.ResultSetFactory;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.sparql.util.Base64.InputStream;
import com.hp.hpl.jena.util.FileManager;
public class crawlertest {
public static final String SOURCE_URL="http://www.semanticweb.org/shahrukh/ontologies/2017/4/university.owl";
protected static final String SOURCE_FILE ="e:/university work/fyp/sdd folder/my sdd/university.owl";
public static final String NS = SOURCE_URL + "#";
public static void main(String[] args) throws IOException {
//final String str = "<td>Jawad</tag><b>Mirza</b><tag>orange</tag><tag>pear</tag>";
new crawlertest().getCrawlerData("");
}
public String getCrawlerData(String t) throws IOException
{
String t2 = "";
try{
OntModel m = ModelFactory.createOntologyModel( OntModelSpec.OWL_MEM );
loadModel(m);
//java.io.InputStream is = new URL("http://ww3.comsats.edu.pk/Research/LatestResearchPublications.aspx").openStream();
//String encoding = "UTF-8";
//Document document = Jsoup.parse(is , encoding ,"http://ww3.comsats.edu.pk/Research/LatestResearchPublications.aspx" );
org.jsoup.nodes.Document doc = Jsoup.connect("http://ww3.comsats.edu.pk/Research/LatestResearchPublications.aspx").get();
org.jsoup.nodes.Document doc2 = Jsoup.connect("http://ww3.comsats.edu.pk/faculty/ResearchPaperDetail.aspx?pid=13134").get();
org.jsoup.nodes.Document doc3 = Jsoup.connect("https://scholar.google.com.pk/scholar?hl=en&as_sdt=0,5&qsp=3&q=final+year+project+computer+science").get();
org.jsoup.nodes.Document doc4 = Jsoup.connect("https://www.um.edu.mt/ict/cs/undergraduate/?a=294793").get();
org.jsoup.nodes.Document doc5 = Jsoup.connect("https://minerva.leeds.ac.uk/bbcswebdav/orgs/SCH_Computing/FYProj/previous-titles/bsc2012.html").get();
//System.out.println(document.toString());
String title = doc.title();
String title2 = doc2.title();
String title3 = doc3.title();
String title4 = doc4.title();
String title5 = doc5.title();
System.out.println("title : "+title);
System.out.println("title : "+title2);
System.out.println("title : "+title3);
System.out.println("title : "+title4);
System.out.println("title : "+title5);
{
Elements text = doc.select("table");
for (Element texts: text)
{
final String html=texts.text().toString();
t2=texts.text();
System.out.println( "The result is : "+t);
//for (String s : html.split("Publisher :"))
//System.out.println(s);
//String html="My name is Shahrukh Nasir and I'm not a Publisher. You have to understand this";
//final String REGEX="^.\\s[^Publisher]+.[^Publication]";
final String REGEX="(?!Publisher)([a-zA-Z0-9\\s] +)(.+?)[a-zA-Z0-9\\s]+(.+?)[a-zA-Z0-9//s]+";
//final String REGEX="<td>(.+?)</td>";
//final String REGEX = "(:)(\\w+)\\b(?![.])";
//final String REGEX="\b (?! \bPublisher \b)\w+ \b";
//final String REGEX="^/[a-z0-9]+$";
//final String REGEX2="[Publisher:].[^Publication]";
/*final Elements elements = Jsoup.parse(html).getElementsMatchingOwnText("^//w{5,}$");
for (final Element element : elements) {
System.out.println("element = [" + element + "]");
System.out.println("zip = [" + element.text() + "]");
}*/
//System.out.println(Pattern.matches("[A-Za-z0-9]*",texts.text()));
Pattern p = Pattern.compile(REGEX);
Matcher ma = p.matcher(html);
Matcher m2=p.matcher(html);
int count = 0;
if(ma.find()) {
count++;
System.out.println("Match number "
+ count);
System.out.println("TEXT1: " +html);
//System.out.println("TEXT1: " +m.group());
//System.out.println("TEXT1: " +p);
System.out.println("Found value: " + ma.group() );
//System.out.println("Found value: " + m.group(0) );
// System.out.println("Found value: " + m.group(1) );
//System.out.println("Found value: " + m.group(2) );
}
/*while(m2.find()) {
count++;
System.out.println("Match number "
+ count);
System.out.println("TEXT1: " +texts.text().toString());
//System.out.println("TEXT1: " +m.group());
//System.out.println("TEXT1: " +p);
System.out.println("Found value: " + m2.group() );
//System.out.println("Found value: " + m.group(0) );s
//System.out.println("Found value: " + m.group(1) );
//System.out.println("Found value: " + m.group(2) );
}*/
//System.out.println("TEXT1: " +texts.text());
//ResultSetFormatter.outputAsCSV(System.out,texts.text());
//ResultSetFormatter.out(System.out,a);
}
}
/*Elements links = doc.select("a[href]");
for (Element link : links)
//for (Element texts: text)
{
System.out.println("Link: " + link.attr("href"));
//System.out.println("TEXT1: " +texts.text());
System.out.println("TEXT1: " +link.text());
}*/
//Elements links2 = doc2.select("a[href]");
{
Elements text2 = doc2.select("td");
Element bodyStart = text2.first();
Element bodyEnd = text2.last();
Element p = bodyStart;
int divCount = 0;
while(p != bodyEnd)
{
p = text2.get(divCount);
System.out.println(p.text());
divCount++;
}
for (Element texts2: text2)
{
System.out.println("TEXT2: " +texts2.text());
for (Element e : text2) {
elements.add(e);
System.out.println("The values stored in variable are : " +e);
}
Element[] elementArr = elements.toArray(new Element[]{});*/
//System.out.println("The values stored in variable are : " +elementArr);
}
final Pattern pattern = Pattern.compile("<td>(.+?)</td>");
final Matcher matcher = pattern.matcher("<td>Gan </td>");
matcher.find();
String first_name= matcher.group(1);
//System.out.println("First Name :" +first_name);
System.out.println("First Name :" +matcher.group(1));
final Matcher matcher2 = pattern.matcher("<td>Zheng </td>");
matcher2.find();
System.out.println("Last Name :" +matcher2.group(1));
Elements text3 = doc3.select("div");
for (Element texts3: text3)
{
System.out.println("TEXT3: " +texts3.text());
}
Elements text4 = doc4.select("td");
for (Element texts4: text4)
{
System.out.println("TEXT4: " +texts4.text());
}
Elements text5 = doc5.select("td");
for (Element texts5: text5)
{
System.out.println("TEXT5: " +texts5.text());
}
String ti=doc.text();
ArrayList<String> myarray = new ArrayList<>();
StringTokenizer str1 = new StringTokenizer(ti);
while(str1.hasMoreTokens())
{
String token1 = str1.nextToken();
System.out.println(token1);
// String out=token1;
myarray.add(token1);
}
System.out.println(myarray);
//m.write(new FileWriter("rdf80.rdf"), "RDF/XML");
return t2;
}
} catch (IOException ex)
{
Logger.getLogger(crawlertest.class.getName()).log(Level.SEVERE,null,ex);
}
return t2;
}
private void loadModel(OntModel m) {
FileManager.get().getLocationMapper().addAltEntry( SOURCE_URL, SOURCE_FILE );
Model baseOntology = FileManager.get().loadModel( SOURCE_URL );
m.addSubModel( baseOntology );
// for compactness, add a prefix declaration st: (for Sam Thomas)
m.setNsPrefix( "st", NS );
// TODO Auto-generated method stub
}
}
这段代码是RDF和Jsoup的混合:开头的import语句和常量(SOURCE_URL、SOURCE_FILE等)都与RDF(Jena)有关,后面的部分才是Jsoup抓取逻辑。