有没有办法使用Pentaho从HTML页面中解析表格的td?假设我有以下HTML内容:
<html>
<body>
<table>
<tr>
<td>info1</td>
<td>info2</td>
</tr>
<tr>
<td>info3</td>
<td>info4</td>
</tr>
</table>
</body>
</html>
我在Pentaho中使用“从XML获取数据”步骤,配置如下:Content::
Loop XPath: /html/body/table/tr
Fields::
Name: tableData
XPath: td
我想要得到的数据是
info1 info2 info3 info4
以任何形式都可以。
任何帮助都会非常感激!
答案 0 :(得分:0)
我是通过把文件的每一行读入为数据行来解决的。然后我添加了Pentaho步骤“用户定义的Java类”,并使用XSLT将表格内容转换为一个新的XML文件。借助该XML,我就能获得完成任务所需的数据。
这是我在“用户定义的Java类”中写的:
import java.util.*;
import java.io.FileOutputStream;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
// Position in the input row of the field that holds the source document's file name; assigned in processRow.
private int infilenameIndex;
// Position in the input row of the field that holds the XSLT stylesheet's file name; assigned in processRow.
private int xsltfilenameIndex;
// Position in the input row of the field that holds the output file name; assigned in processRow.
private int outfilenameIndex;
/**
 * Handles one input row: reads the input, stylesheet and output file names
 * from the row, runs the XSLT transformation for them, and passes the row on
 * to the next step.
 *
 * <p>On the first row the positions of the three file-name fields are resolved
 * and validated once, then reused for every following row.
 *
 * @param smi step metadata supplied by the engine (not used here)
 * @param sdi step data supplied by the engine (not used here)
 * @return {@code true} while rows are being processed, {@code false} once the
 *         input is exhausted
 * @throws KettleException if a required field is missing from the input row
 *         or the transformation fails
 */
public boolean processRow(StepMetaInterface smi, StepDataInterface sdi) throws KettleException {
    Object[] r = getRow();
    if (r == null) {
        // No more input rows: tell downstream steps we are finished.
        setOutputDone();
        return false;
    }

    // The original tested "first == false", so this one-time setup never ran
    // because "first" is only cleared inside this block.
    if (first) {
        first = false;
        infilenameIndex = lookupFieldIndex("infilename");
        xsltfilenameIndex = lookupFieldIndex("xsltfilename");
        outfilenameIndex = lookupFieldIndex("outfilename");
    }

    // Read through the validated indices so the lookup above and the values
    // actually used always refer to the same fields.
    String infilename = getInputRowMeta().getString(r, infilenameIndex);
    String xsltfilename = getInputRowMeta().getString(r, xsltfilenameIndex);
    String outfilename = getInputRowMeta().getString(r, outfilenameIndex);

    // Grow the row to the output layout before handing it downstream.
    Object[] outputRowData = RowDataUtil.resizeArray(r, data.outputRowMeta.size());

    transform(infilename, xsltfilename, outfilename);

    putRow(data.outputRowMeta, outputRowData);
    return true;
}

/**
 * Finds the input-row position of the field configured by a step parameter.
 * When the parameter has no value, the parameter name itself is used as the
 * field name, which matches the field names this step originally hard-coded.
 *
 * @param parameterName name of the step parameter that holds the field name
 * @return zero-based index of the field in the input row
 * @throws KettleException if the input row has no such field
 */
private int lookupFieldIndex(String parameterName) throws KettleException {
    String fieldName = getParameter(parameterName);
    // NOTE(review): assumes getParameter yields null or "" for an unset parameter - confirm.
    if (fieldName == null || fieldName.length() == 0) {
        fieldName = parameterName;
    }
    int index = getInputRowMeta().indexOfValue(fieldName);
    if (index < 0) {
        throw new KettleException("Field not found in the input row, check parameter '" + parameterName + "'!");
    }
    return index;
}
/**
 * Applies an XSLT stylesheet to an input document and writes the result to
 * an output location.
 *
 * <p>All three arguments are handed to the JAXP stream classes as system
 * identifiers, exactly as before, so anything that worked previously
 * (plain paths or URLs) keeps working.
 *
 * @param infilename   location of the document to transform
 * @param xsltfilename location of the XSLT stylesheet
 * @param outfilename  location the transformation result is written to
 * @throws KettleException if any location is null or blank, or if building
 *         or running the transformer fails
 */
public void transform(String infilename, String xsltfilename, String outfilename) throws KettleException {
    logBasic("");
    logBasic("Transformerar " + infilename + " med " + xsltfilename + " till " + outfilename );
    logBasic("");

    // The original wrapped the stream constructors in try/catch to report
    // missing files, but those constructors only record the name, so the
    // messages could never appear. Check the names explicitly instead.
    requireFileName(infilename, "Infil saknas ");
    requireFileName(xsltfilename, "Xsltfil saknas ");
    requireFileName(outfilename, "Outfil saknas ");

    javax.xml.transform.stream.StreamSource inss =
        new javax.xml.transform.stream.StreamSource(infilename);
    javax.xml.transform.stream.StreamSource xsltss =
        new javax.xml.transform.stream.StreamSource(xsltfilename);
    javax.xml.transform.stream.StreamResult outss =
        new javax.xml.transform.stream.StreamResult(outfilename);

    try {
        // Uses the default JAXP factory. The original carried a commented-out
        // alternative that instantiated net.sf.saxon.TransformerFactoryImpl here.
        TransformerFactory tFactory = TransformerFactory.newInstance();
        Transformer transformer = tFactory.newTransformer(xsltss);
        // Run the transformation; unreadable inputs surface here.
        transformer.transform(inss, outss);
    }
    catch (Exception e) {
        // Wrap everything so the step reports a single Kettle failure type.
        throw new KettleException(e);
    }
}

/**
 * Rejects a null or blank file name, logging the given message prefix
 * followed by the offending value before failing.
 *
 * @param filename      the name to check
 * @param messagePrefix text logged and thrown in front of the name
 * @throws KettleException if {@code filename} is null or blank
 */
private void requireFileName(String filename, String messagePrefix) throws KettleException {
    if (filename == null || filename.trim().length() == 0) {
        logError(messagePrefix + filename);
        throw new KettleException(messagePrefix + filename);
    }
}
答案 1 :(得分:-1)