I am using the Flying Saucer library to convert HTML to PDF. It works fine with most HTML files.
But for some HTML files that contain other tags inside a pre tag, the generated PDF displays those tags as literal text.
If I remove the pre tags, the formatting of the data is lost.
My code is:
// Resulting W3C DOM; remains null if any step below throws.
org.w3c.dom.Document document = null;
try {
    // Parse the HTML file with jsoup and drop everything the whitelist rejects.
    Document doc = Jsoup.parse(new File(htmlFile), "UTF-8", "");
    Whitelist wl = new RelaxedPlusDataBase64Images();
    Cleaner cleaner = new Cleaner(wl);
    doc = cleaner.clean(doc);

    // Configure JTidy: UTF-8 in and out, XHTML output, warnings suppressed.
    Tidy tidy = new Tidy();
    tidy.setShowWarnings(false);
    tidy.setXmlTags(false);
    tidy.setInputEncoding("UTF-8");
    tidy.setOutputEncoding("UTF-8");
    tidy.setPrintBodyOnly(true);
    tidy.setXHTML(true);
    tidy.setMakeClean(true);
    tidy.setAsciiChars(true);

    // When the <pre> content contains a closing tag, remove the <pre> wrappers
    // but keep their children.
    if (doc.select("pre").html().contains("</")) {
        doc.select("pre").unwrap();
    }

    Reader reader = new StringReader(doc.html());
    document = tidy.parseDOM(reader, null);

    // Remove the <head> element if one is present; item(0) returns null when
    // there is no match, so guard instead of dereferencing blindly.
    org.w3c.dom.Node head = document.getElementsByTagName("head").item(0);
    if (head != null && head.getParentNode() != null) {
        head.getParentNode().removeChild(head);
    }

    // Rewrite "cid:<id>@<host>" image sources to just "<id>".
    NodeList images = document.getElementsByTagName("img");
    for (int i = 0; i < images.getLength(); i++) {
        org.w3c.dom.Node src = images.item(i).getAttributes().getNamedItem("src");
        if (src == null) {
            // <img> without a src attribute: nothing to rewrite.
            continue;
        }
        String value = src.getNodeValue();
        if (value != null && value.startsWith("cid:") && value.contains("@")) {
            // startsWith("cid:") guarantees the prefix is at index 0, so the id starts at 4.
            value = value.substring(4, value.indexOf('@'));
            src.setNodeValue(value);
            System.out.println(value);
        }
    }

    document.normalize();
    System.out.println(getNiceLyFormattedXMLDocument(document));
} catch (Exception e) {
    // Print the full stack trace (not just the message) so the failing step is visible.
    e.printStackTrace();
}
The method that creates the PDF is:
// Render the cleaned DOM of "b.html" to "test.pdf" with Flying Saucer's ITextRenderer.
try {
    org.w3c.dom.Document doc = CleanHtml.cleanNTidyHTML("b.html");
    ITextRenderer renderer = new ITextRenderer();
    renderer.setDocument(doc, null);
    // Character literal instead of the deprecated new Character(char) constructor.
    renderer.setPDFVersion('7');
    String outputFile = "test.pdf";
    // try-with-resources closes the file even when layout() or createPDF() throws.
    try (OutputStream os = new FileOutputStream(outputFile)) {
        renderer.layout();
        renderer.createPDF(os);
        os.flush();
    }
} catch (Exception e) {
    e.printStackTrace();
}
The same conversion using iText XMLWorker:
// Render the cleaned DOM of "a.html" to "test.pdf" with iText's XMLWorker.
try {
    org.w3c.dom.Document doc = CleanHtml.cleanNTidyHTML("a.html");
    String k = CleanHtml.getNiceLyFormattedXMLDocument(doc);
    // try-with-resources closes the file even when PDF generation throws.
    try (OutputStream file = new FileOutputStream(new File("test.pdf"))) {
        Document document = new Document();
        PdfWriter writer = PdfWriter.getInstance(document, file);
        document.open();
        // Hand the markup over as characters rather than k.getBytes(): the no-arg
        // getBytes() encodes with the platform default charset, which can corrupt
        // non-ASCII text.
        XMLWorkerHelper.getInstance().parseXHtml(writer, document, new java.io.StringReader(k));
        document.close();
    }
} catch (Exception e) {
    e.printStackTrace();
}
/**
 * Serializes a W3C DOM document to an indented string.
 *
 * <p>The XML declaration is omitted, the encoding property is set to UTF-8 and
 * indentation is enabled. The indent width of 4 is requested through a key in the
 * Apache XSLT namespace. The output method is left at the transformer's default.
 *
 * @param doc the document to serialize
 * @return the serialized markup
 * @throws IOException declared for callers; nothing in this body throws it directly
 * @throws TransformerException if the transformer cannot be created or the
 *         transformation fails
 */
public static String getNiceLyFormattedXMLDocument(org.w3c.dom.Document doc) throws IOException, TransformerException {
    Transformer serializer = TransformerFactory.newInstance().newTransformer();

    // Output settings; none of these depends on another.
    serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
    serializer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
    serializer.setOutputProperty(OutputKeys.INDENT, "yes");
    serializer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "4");

    // Serialize into an in-memory buffer and return its contents.
    Writer buffer = new StringWriter();
    serializer.transform(new DOMSource(doc), new StreamResult(buffer));
    return buffer.toString();
}