我正在尝试制作一个网络爬虫,它可以从网站上抓取小说的文本,并使用该网站上的 HTML 创建 PDF。为此,我先获取正文的 HTML 并将其解析为 DOM 对象,然后再把所需的节点转换回 HTML 代码字符串,用来生成 PDF。我的问题是,一旦转换为 PDF,HTML 的编码问题就会导致引号和其他字符变成乱码。如何生成没有这些奇怪字符的 PDF 文件?提前感谢您的帮助!
const request = require('request');
const jsdom = require("jsdom");
const { JSDOM } = jsdom;
var htmlToPdf = require('html-to-pdf');
var BaseURL = 'https://www.wuxiaworld.com/novel/overgeared/og-chapter-';

/**
 * Wraps an HTML fragment in a minimal standalone document that declares UTF-8.
 *
 * A bare fragment such as `element.outerHTML` carries no charset information,
 * so whatever renders it has to guess the encoding. Declaring UTF-8 explicitly
 * is intended to stop multi-byte characters (curly quotes, dashes, ...) from
 * being decoded as a single-byte encoding and showing up as garbage.
 *
 * @param {string} fragmentHtml - Serialized HTML of the element to render.
 * @returns {string} A complete HTML document containing the fragment.
 */
function wrapInUtf8Document(fragmentHtml) {
  return '<!DOCTYPE html><html><head><meta charset="utf-8"></head><body>' +
    fragmentHtml +
    '</body></html>';
}

// Fetch each chapter page and write its content element out as a PDF.
// Raise the upper bound to process more than one chapter.
for (let chapNum = 1; chapNum < 2; chapNum++) {
  // Block-scoped per iteration: no implicit global and nothing to reset.
  const url = BaseURL + chapNum;

  request(url, (error, response, html) => {
    // Check the error first: `response` is not usable when the request failed.
    if (error) {
      console.error('Request for ' + url + ' failed');
      console.error(error);
      return;
    }
    if (response.statusCode !== 200) {
      console.error('Request for ' + url + ' returned status ' + response.statusCode);
      return;
    }

    const dom = new JSDOM(html);
    // The second element with class "fr-view" is the one holding the chapter text.
    const chapterContent = dom.window.document.getElementsByClassName('fr-view')[1];
    if (!chapterContent) {
      console.error('No chapter content element found at ' + url);
      return;
    }

    const filename = 'Overgeared_Chapter_' + chapNum + '.pdf';
    // Convert a full UTF-8 document rather than the bare fragment.
    htmlToPdf.convertHTMLString(wrapInUtf8Document(chapterContent.outerHTML), filename,
      function (error, success) {
        if (error) {
          console.log('Worked...Not!');
          console.log(error);
        } else {
          console.log('Actually Worked!');
          console.log(success);
        }
      }
    );
  });
}