我使用phantomjs为我经常需要获取信息的网站制作了一个抓取程序,问题是我会在整个数据中得到这些“”字符。现在,如果我将此数据写入文件,并在单独的节点脚本中,我使用iconv编码为ascii,然后使用正则表达式模式来摆脱生成的“?”,这很好。但是当我尝试在我的抓取程序中包含相同的功能时,我得到了这个错误:
ReferenceError: Can't find variable: Buffer
C:/javascript/Phantom/node_modules/iconv-lite/encodings/internal.js:4
C:/javascript/Phantom/node_modules/iconv-lite/encodings/internal.js:82
ReferenceError: Can't find variable: Buffer
C:/javascript/Phantom/node_modules/iconv-lite/encodings/sbcs-codec.js:20
C:/javascript/Phantom/node_modules/iconv-lite/lib/index.js:98 in getCodec
C:/javascript/Phantom/node_modules/iconv-lite/lib/index.js:16 in encode
siteScraper.js:72 in replaceDataAndWrite
siteScraper.js:58
这是抓取程序(替换了url和隐私凭据):
var page = require('webpage').create();
var fs = require('fs');
var iconv = require('iconv-lite');
console.log('before page.open'),
url = "xxxxxxxxxxxxxxxxxxxxxxxxx",
url2 = "xxxxxxxxxxxxxxxxxxxxxxxx";
var credentials = {username: 'xxxxxxxxx', password: 'xxxxxxxx'}
page.open(url, function (status) {
if (status !== 'success') {
console.log('Unable to access network');
} else {
console.log('inside page.open callback');
page.evaluate(function (credentials) {
document.querySelector('input[id=username]').value = credentials.username;
document.querySelector('input[id=password]').value = credentials.password;
document.querySelector('input[id=button_submit]').click();
console.log('finished querying selectors');
}, credentials);
window.setTimeout(function () {
page.render('postLogin.png');
console.log('rendered post-login');
page.open(url2, function (status) {
if (status !== 'success') {
console.log('Unable to access network');
} else {
console.log('INSIDE frame');
window.setTimeout(function () {
page.render('framePic.png');
console.log('rendered framePic.png');
var output = page.evaluate(function () {
var output;
// get all table data
tables = document.getElementsByTagName('table');
// go through table data
for(i=0; i < tables.length; i++) {
cells = tables[i].getElementsByTagName('td');
// assign data cells to output var
for (j=0; j < cells.length; j++) {
output += cells[j].innerText + '\n';
}
}
return output;
});
// get rid of /'s
while (output.indexOf("/") != -1)
{
replacedOutput = output.replace("/", "")
output = replacedOutput
}
// function call to encode output to ascii and replace a few things
replaceDataAndWrite(output);
// var path = 'scrapedData.txt';
// fs.write(path, output, 'w');
console.log('function complete');
phantom.exit();
}, 10000); // inner setTimeout
} // second else
});
}, 10000); // outer setTimeout
} // first else
}); // first page.open
function replaceDataAndWrite (data) {
encodedData = iconv.encode(data, 'ascii');
newData = encodedData.toString('ascii')
replacedData = newData.replace(/\?/gi,"")
// replacedData = replacedData.replace(/undefined/gi,"")
fs.writeFile('scrapedData.txt', replacedData, function (err) {
if (err) throw err;
});
}
我实际上只是将以下工作程序iconvTest.js中的代码复制并粘贴到我的抓取脚本中的函数“replaceDataAndWrite()”中(但明显地将文件读取出来):
var iconv = require('iconv-lite');
var fs = require('fs');
fs.readFile('scrapedData.txt', function(err, data) {
if (err) throw err;
encodedData = iconv.encode(data, 'ascii');
newData = encodedData.toString('ascii')
replacedData = newData.replace(/\?/gi,"")
// replacedData = replacedData.replace(/undefined/gi,"")
fs.writeFile('message.txt', replacedData, function (err) {
if (err) throw err;
});
});
Iconv在最后一个例子中完美运行,但在我的抓取程序中没有。这是iconv中的错误还是有其他解释?