我正在使用节点可读性模块来清理文章。但在大多数情况下,文章的主要图像不是抓取内容的一部分。因此,我在helpers.js和readability.js文件中进行了一些更改,以获取文章的主要图像,将其从DOM对象中删除并最终将图像添加到articleContent。代码:
helpers.js
var grabImage = module.exports.grabImage = function(document) {
var images = document.getElementsByTagName('IMG');
var MINIMUM_SURFACE = 100*100;
if (images.length > 0) {
for (var i = 0; i < images.length; ++i) {
var image = images[i];
if ( image.getAttribute('data-src') ) {
image.getAttribute('src') = image.getAttribute('data-src');
}
if ( image.getAttribute('data-lazy-src') ) {
image.getAttribute('src') = image.getAttribute('data-lazy- src');
}
if ( !image.getAttribute('src') ) {
continue;
}
// //Compute surface
// var w = image.getAttribute('width') || 1;
// var h = image.getAttribute('height') || 1;
// image.surface = w * h;
var image = new Image();
image.onload = function() {
var image.surface = this.width*this.height;
}
//Filter by size
if ( image.surface > MINIMUM_SURFACE ) {
var mainImageSrc = image.getAttribute('src');
//Resolve relative url
if (!mainImageSrc.match(/^http/)) {
if (!image.ownerDocument.originalURL) {
} else{
mainImageSrc = url.resolve(image.ownerDocument.originalURL, mainImageSrc);
}
}
image.parentNode.removeChild(image);
break;
}
}
}
return mainImageSrc;
};
readability.js
var mainImgUrl = helpers.grabImage(this._document);
var img = this._document.createElement("IMG");
img.setAttribute('src', mainImgUrl);
articleContent.insertBefore(img, articleContent.childNodes[0] );
我在此函数
中添加了以上部分代码Readability.prototype.getContent = function(notDeprecated) {
但是,它没有用。正在抓取整个内容,但我收到此错误
> Cleaning Conditionally [object HTMLDivElement] (image width-494:)
Cleaning Conditionally [object HTMLDivElement] (:)
fixed link
C:\Users\SAI\reader-rest\routes\api.js:19
var content = '<html><head><meta charset="utf-8"><title>'+articl
e.title+'</title></head><body>' +article.content+'</body></html>';
TypeError: Cannot read property 'title' of undefined
at C:\Users\SAI\reader-rest\routes\api.js:19:78
at Object.jsdom.env.done (C:\Users\SAI\reader-rest\node_modules\node-readabi
lity\src\readability.js:292:18)
at C:\Users\SAI\reader-rest\node_modules\node-readability\node_modules\jsdom
\lib\jsdom.js:259:18
at nextTickCallbackWith0Args (node.js:420:9)
at process._tickCallback (node.js:349:13)
typeerror: Cannot read property 'title' of undefined at C:\Users\SAI\reader-rest\routes\api.js:19:78 at Object.jsdom.env.done (C:\Users\SAI\reader-rest\node_modules\node-readability\src\readability.js:292:18) at C:\Users\SAI\reader-rest\node_modules\node-readability\node_modules\jsdom\lib\jsdom.js:259:18 at nextTickCallbackWith0Args (node.js:420:9) at process._tickCallback (node.js:349:13)
有人可以帮我解决这个问题。
我在github上发布了相同的内容。但没有回应。 https://github.com/luin/readability/issues/52