我刚刚为T恤网站完成了这个基本的webscraper项目。
它通过一个硬编码的网址进入主页,搜索所有产品页面并记录它们的网址。如果它找到另一个不是产品页面的链接(remainder
),它会再次抓取该链接以找到更多产品页面。它将产品页面添加到urlSet
,然后逐一抓取这些页面,提取T恤数据(价格、图片、标题),转换后写入CSV文件。
出于某种原因,第二次使用remainder
进行抓取时不起作用。如果我去掉对该网址的第二次抓取,一切正常,文件也能正确写入。但是,如果我想获取其余的产品页面,它似乎在某个地方失败了。
这是我的代码。很抱歉贴了这么多内容,但如果没有完整的上下文,我不知道该如何把问题说清楚;希望代码中的注释足够清楚:
//TASK: Create a command line application that goes to an ecommerce site to get the latest prices.
//Save the scraped data in a spreadsheet (CSV format).
'use strict';
//Modules being used:
var cheerio = require('cheerio');
var json2csv = require('json2csv');
var request = require('request');
var moment = require('moment');
var fs = require('fs');
// Hard-coded entry point for the crawl.
var url = 'http://shirts4mike.com/';
// Product-page URLs collected by lastStep (a Set, so repeated URLs are stored once).
var urlSet = new Set();
// First non-product URL found by lastStep; the promise chain feeds it back into scrape.
var remainder;
// Scraped t-shirt records, filled by lastScraperPt2 and written out by convertJson2Csv.
var tshirtArray = [];
/**
 * Wraps request() in a Promise.
 *
 * @param {string} url - Address to fetch.
 * @returns {Promise<string>} Resolves with the response body when the status
 *   code is 200; rejects with the error reported by request().
 */
const requestPromise = function(url) {
return new Promise(function(resolve, reject) {
request(url, function(error, response, html) {
if(error)return reject(error);
// NOTE(review): when there is no error but the status code is not 200,
// neither resolve nor reject is called, so the promise stays pending.
if(!error && response.statusCode == 200){
return resolve(html);
}
});
});
}
// Fetch the page at `url`, load its html and collect every link whose href contains "shirt".
// Returns a promise for the array of collected link strings.
function scrape (url) {
console.log("Currently scraping " + url)
return requestPromise(url)
.then(function(html) {
var $ = cheerio.load(html);
var links = [];
//get all the links
$('a[href*=shirt]').each(function(){
var a = $(this).attr('href');
//add into link array
// NOTE(review): the href is appended to the full page url, so for a page such as
// 'http://shirts4mike.com/shirts.php' the result is '...shirts.phpshirt.php?id=...'.
links.push(url + a);
});
// return array of links
return links;
});
}
/**
 * Downloads every link in parallel.
 *
 * @param {string[]} arrayOfLinks - URLs to request.
 * @returns {Promise<{arrayOfHtml: string[], arrayOfUrls: string[]}>} The html
 *   of each page together with the urls that were requested, index-aligned.
 */
function nextStep (arrayOfLinks) {
  console.log(arrayOfLinks);
  var pendingPages = arrayOfLinks.map(function (link) {
    return requestPromise(link);
  });
  return Promise.all(pendingPages).then(function (pages) {
    // hand back the html together with the urls it came from
    return { arrayOfHtml: pages, arrayOfUrls: arrayOfLinks };
  });
}
//Go through the html of each url and add the url to urlSet when the page has a submit control.
//The first url without one is stored in the module-level `remainder` so it can be scraped again.
//Returns `remainder`.
function lastStep (obj){
for(var i = 0; i < obj.arrayOfHtml.length; i++){
var $ = cheerio.load(obj.arrayOfHtml[i]);
//if page has a submit it must be a product page
if($('[type=submit]').length !== 0){
//add page to set
urlSet.add(obj.arrayOfUrls[i]);
console.log(obj.arrayOfUrls[i]);
} else if(remainder == undefined) {
//if not a product page, add it to remainder so another scrape can be performed.
// NOTE(review): `remainder` is module-level and never reset, so once it holds a url
// every later non-product page (in this call and in later calls) is ignored.
remainder = obj.arrayOfUrls[i];
console.log("The remainder is " + remainder)
}
}
//return remainder for second run-through of scrape
return remainder;
}
//Request the html of every url stored in urlSet (the product pages).
//Returns a promise for the array of html strings, in urlSet iteration order.
function lastScraperPt1(){
//one request per product page; all of them run in parallel
var promiseArray = [];
for(var item of urlSet){
var url = item;
promiseArray.push(requestPromise(url));
}
return Promise.all(promiseArray)
.then(function(arrayOfHtml){
// the html array is passed on unchanged
return arrayOfHtml;
});
}
/**
 * Extracts the t-shirt details from every product page, appends one record per
 * page to tshirtArray and then triggers the CSV export.
 *
 * Each record's URL is the product page it was scraped from. The pages were
 * requested in urlSet iteration order, so html[i] belongs to the i-th url of
 * urlSet (previously every record carried the hard-coded home-page url).
 *
 * @param {string[]} html - Html of the product pages, in urlSet order.
 * @returns {undefined}
 */
function lastScraperPt2(html){
  // snapshot of the Set so the urls can be addressed by index
  var productUrls = Array.from(urlSet);
  for (var i = 0; i < html.length; i++) {
    var $ = cheerio.load(html[i]);
    tshirtArray.push({
      // slice(4) drops the first four characters of the heading text
      // (presumably the price prefix - verify against the page markup)
      Title: $('body').find('.shirt-details > h1').text().slice(4),
      Price: $('.price').text(),
      ImageURL: $('.shirt-picture').find('img').attr('src'),
      URL: productUrls[i],
      Date: moment().format('MMMM Do YYYY, h:mm:ss a')
    });
  }
  convertJson2Csv();
}
/**
 * Converts the records in tshirtArray to CSV and writes them to
 * ./data/<MM-DD-YY>.csv, creating the data folder when it does not exist.
 *
 * 'file saved' is logged only after the write succeeded; a write error is
 * thrown from the callback as before.
 *
 * @returns {undefined}
 */
function convertJson2Csv(){
  //The scraper should generate a folder called `data` if it doesn't exist.
  var dir = './data';
  if (!fs.existsSync(dir)) {
    fs.mkdirSync(dir);
  }
  var fields = ['Title', 'Price', 'ImageURL', 'URL', 'Date'];
  //convert tshirt data into CSV and pass in fields
  var csv = json2csv({ data: tshirtArray, fields: fields });
  //Name of file will be the date
  var fileName = dir + '/' + moment().format('MM-DD-YY') + '.csv';
  // fs.writeFile replaces an existing file by default, so no extra option is
  // needed (`overwrite` is not an option fs.writeFile knows about)
  fs.writeFile(fileName, csv, function(err) {
    if (err) throw err;
    console.log('file saved');
  });
}
// Pipeline: crawl the entry page, crawl the remainder page returned by lastStep,
// then download the product pages and extract/save their data.
scrape(url) //scrape from original entry point
.then(nextStep)
.then(lastStep)
.then(scrape) //scrape again but with remainder url
.then(nextStep)
.then(lastStep)
.then(lastScraperPt1)
.then(lastScraperPt2)
.catch(function(err) {
// handle any error from any request here
console.log(err);
});
我在nextStep
中用console.log输出了arrayOfLinks
,因此可以看到这些链接确实被抓取到了,但我想不通为什么它们没有被正确地传递给lastStep。
Currently scraping http://shirts4mike.com/
[ 'http://shirts4mike.com/shirts.php',
'http://shirts4mike.com/shirts.php',
'http://shirts4mike.com/shirt.php?id=108',
'http://shirts4mike.com/shirt.php?id=107',
'http://shirts4mike.com/shirt.php?id=106',
'http://shirts4mike.com/shirt.php?id=105' ]
The remainder is http://shirts4mike.com/shirts.php
http://shirts4mike.com/shirt.php?id=108
http://shirts4mike.com/shirt.php?id=107
http://shirts4mike.com/shirt.php?id=106
http://shirts4mike.com/shirt.php?id=105
Currently scraping http://shirts4mike.com/shirts.php
[ 'http://shirts4mike.com/shirts.phpshirts.php',
'http://shirts4mike.com/shirts.phpshirt.php?id=101',
'http://shirts4mike.com/shirts.phpshirt.php?id=102',
'http://shirts4mike.com/shirts.phpshirt.php?id=103',
'http://shirts4mike.com/shirts.phpshirt.php?id=104',
'http://shirts4mike.com/shirts.phpshirt.php?id=105',
'http://shirts4mike.com/shirts.phpshirt.php?id=106',
'http://shirts4mike.com/shirts.phpshirt.php?id=107',
'http://shirts4mike.com/shirts.phpshirt.php?id=108' ]
但是,如果我只执行第一次抓取调用而不执行第二次,就像这样:
// Shortened pipeline: only the entry page is crawled before the product pages are scraped.
scrape(url) //scrape from original entry point
.then(nextStep)
.then(lastStep)
.then(lastScraperPt1)
.then(lastScraperPt2)
.catch(function(err) {
// handle any error from any request here
console.log(err);
});
......然后一切正常。我只是没有找到所有的网址。
这里发生了什么,我该如何解决?谢谢你们
答案 0 :(得分:1)
问题在于tshirtArray
未在convertJson2Csv()
中定义。请在lastScraperPt2
中把tshirtArray
传给convertJson2Csv():
convertJson2Csv(tshirtArray)
并在convertJson2Csv的定义中接收该参数:
// Takes the array of t-shirt records as a parameter; the body is left as a placeholder.
function convertJson2Csv(tshirtArray) {
// do stuff
}
答案 1 :(得分:1)
您的lastStep
似乎存在一个问题。看起来你的意思是remainder
是另一个网址数组。如果我在那里错了,请纠正我。但是,在if($('[type=submit]').length !== 0)
条件第一次不成立时,由于remainder
一开始是未定义的,代码会直接进入下一个分支。无论当前网址是什么,它都会被赋值给remainder
。在for循环余下的迭代中,就再也不会出现remainder == undefined
的情况了。因此最终只会有一个网址被赋值给remainder
,而您希望获得的其他网址都会被直接跳过。
您可能希望将remainder
定义为remainder = [];
。而不是说else if (remainder == undefined)
,而只是说
} else {
remainder.push(obj.arrayOfUrls[i]);
}
不过,这样一来您会把一个网址数组传给scrape
,而scrape
只接受单个网址。如果您确实希望remainder
是一个网址数组(我认为这正是您的本意),可以定义一个如下的新函数:
/**
 * Scrapes every leftover (non-product) page and merges the links found on them.
 *
 * @param {string[]} remainders - URLs of the pages that still need scraping.
 * @returns {Promise<string[]>} One flat array holding the links from all pages,
 *   ready to be passed to nextStep.
 */
function scrapeRemainders(remainders) {
  // scrape() yields an array of links per page, so the combined result
  // is an array of arrays
  var promises = remainders.map(function (url) {
    return scrape(url);
  });
  return Promise.all(promises).then(function (results) {
    // the flattened list has to be returned, otherwise the next .then()
    // in the chain receives undefined
    return _.flattenDeep(results);
  });
}
然后,把Promise链中的第二个scrape
替换为scrapeRemainders
。另外,上面的函数中用到了_
,您需要先npm install lodash
,然后var _ = require('lodash')
。顺便一提,lodash与Promise无关,但它是一个很好的数据处理工具。如果有机会,您应该了解一下。
此外,在lastScraperPt1
中,您可以更改
return Promise.all(promiseArray)
.then(function(arrayOfHtml){
return arrayOfHtml;
});
到
return Promise.all(promiseArray);
它做同样的事情。
希望这会有所帮助。如果这不能回答您的问题,请向我发表评论,我可以相应地更改我的答案。
答案 2 :(得分:0)
全部修复了,问题在于scrape()
中拼接出了错误的网址。不过我是在把statusCode输出到控制台之后才发现这一点的:
//TASK: Create a command line application that goes to an ecommerce site to get the latest prices.
//Save the scraped data in a spreadsheet (CSV format).
'use strict';
//Modules being used:
var cheerio = require('cheerio');
var json2csv = require('json2csv');
var request = require('request');
var moment = require('moment');
var fs = require('fs');
//hard-coded entry point of the crawl; also prepended to image paths in lastScraperPt2
var urlHome = 'http://shirts4mike.com/';
//product-page URLs, in the order lastStep added them
var urlSet = [];
//scraped t-shirt records
var tshirtArray = [];
/**
 * Promise wrapper around request().
 *
 * @param {string} url - Address to download.
 * @returns {Promise<string>} The response body when the status code is 200,
 *   an empty string for any other status. Rejects (after reporting the error
 *   through errorHandler) when the request itself fails.
 */
const requestPromise = function(url) {
  return new Promise(function(resolve, reject) {
    request(url, function(error, response, html) {
      if (error) {
        errorHandler(error);
        reject(error);
        return;
      }
      if (response.statusCode == 200) {
        resolve(html);
        return;
      }
      // any other status: report it and carry on with an empty page
      console.log("response code is " + response.statusCode);
      resolve("");
    });
  });
}
/**
 * Downloads a page and collects the links that point at shirt pages.
 *
 * Every href is resolved against the address of the page it was found on, so
 * relative links come out right for any page of the site and links that are
 * already absolute are kept as they are. (A hard-coded site root used to be
 * prepended instead, in a local variable that shadowed the global URL class.)
 *
 * @param {string} url - Absolute address of the page to scrape.
 * @returns {Promise<string[]>} Absolute URLs of all `a[href*=shirt]` links,
 *   in document order.
 */
function scrape (url) {
  console.log("Currently scraping " + url);
  return requestPromise(url)
    .then(function(html) {
      const $ = cheerio.load(html);
      const links = [];
      //get all the links
      $('a[href*=shirt]').each(function(){
        const href = $(this).attr('href');
        // new URL(relative, base) applies standard URL resolution rules
        links.push(new URL(href, url).href);
      });
      return links;
    });
}
/**
 * Requests all of the given links at once.
 *
 * @param {string[]} arrayOfLinks - URLs to download.
 * @returns {Promise<{arrayOfHtml: string[], arrayOfUrls: string[]}>} Html of
 *   every page plus the list of urls, so later steps can pair them by index.
 */
function nextStep (arrayOfLinks) {
  console.log(arrayOfLinks);
  const requests = arrayOfLinks.map((link) => requestPromise(link));
  return Promise.all(requests).then((htmlPages) => ({
    arrayOfHtml: htmlPages,
    arrayOfUrls: arrayOfLinks,
  }));
}
/**
 * Sorts downloaded pages into product pages and a page to crawl again.
 *
 * A page that contains a submit control is treated as a product page and its
 * url is appended to urlSet, unless it is already there: the same product can
 * be linked from several pages, and a repeated url would produce a duplicate
 * CSV row. The first page without a submit control becomes the remainder.
 *
 * @param {{arrayOfHtml: string[], arrayOfUrls: string[]}} obj - Html pages and
 *   their urls, index-aligned.
 * @returns {string|undefined} Url of the first non-product page of this call,
 *   or undefined when every page was a product page.
 */
function lastStep (obj){
  // declared up front instead of inside the branch that first assigns it
  var remainder;
  for (var i = 0; i < obj.arrayOfHtml.length; i++) {
    var $ = cheerio.load(obj.arrayOfHtml[i]);
    var pageUrl = obj.arrayOfUrls[i];
    //if page has a submit it must be a product page
    if ($('[type=submit]').length !== 0) {
      if (!urlSet.includes(pageUrl)) {
        urlSet.push(pageUrl);
        console.log(pageUrl);
      }
    } else if (remainder === undefined) {
      //not a product page: keep it so another scrape can be performed
      remainder = pageUrl;
      console.log("The remainder is " + remainder);
    }
  }
  //return remainder for second run-through of scrape
  return remainder;
}
/**
 * Downloads every product page listed in urlSet.
 *
 * @returns {Promise<string[]>} Html of the product pages, in urlSet order.
 */
function lastScraperPt1(){
  // one request per product url, all running in parallel
  var pendingPages = Array.from(urlSet, function (productUrl) {
    return requestPromise(productUrl);
  });
  return Promise.all(pendingPages);
}
/**
 * Builds one t-shirt record per product page and appends it to tshirtArray.
 *
 * @param {string[]} html - Html of the product pages; html[i] is paired with
 *   urlSet[i].
 * @returns {Object[]} tshirtArray, with the new records appended.
 */
function lastScraperPt2(html){
  html.forEach(function (page, index) {
    var $ = cheerio.load(page);
    tshirtArray.push({
      // slice(4) drops the first four characters of the heading text
      // (presumably the price prefix - verify against the page markup)
      Title: $('body').find('.shirt-details > h1').text().slice(4),
      Price: $('.price').text(),
      // image paths are prefixed with the site root
      ImageURL: urlHome + $('.shirt-picture').find('img').attr('src'),
      URL: urlSet[index],
      Date: moment().format('MMMM Do YYYY, h:mm:ss a')
    });
  });
  return tshirtArray;
}
/**
 * Converts t-shirt records to CSV and writes them to ./data/<MM-DD-YY>.csv,
 * creating the data folder when it does not exist.
 *
 * 'file saved' is logged only after the write succeeded; a write error is
 * passed to errorHandler instead.
 *
 * @param {Object[]} tshirtArray - Records with Title, Price, ImageURL, URL and
 *   Date properties.
 * @returns {undefined}
 */
function convertJson2Csv(tshirtArray){
  //The scraper should generate a folder called `data` if it doesn't exist.
  var dir = './data';
  if (!fs.existsSync(dir)) {
    fs.mkdirSync(dir);
  }
  var fields = ['Title', 'Price', 'ImageURL', 'URL', 'Date'];
  //convert tshirt data into CSV and pass in fields
  var csv = json2csv({ data: tshirtArray, fields: fields });
  //Name of file will be the date
  var fileName = dir + '/' + moment().format('MM-DD-YY') + '.csv';
  // fs.writeFile replaces an existing file by default, so no extra option is
  // needed (`overwrite` is not an option fs.writeFile knows about)
  fs.writeFile(fileName, csv, function(err) {
    if (err) {
      errorHandler(err);
      return;
    }
    console.log('file saved');
  });
}
// Pipeline: crawl the home page, crawl the remainder page returned by lastStep,
// download the product pages, build the records and write them to CSV.
scrape(urlHome) //scrape from original entry point
.then(nextStep)
.then(lastStep)
.then(scrape) //second pass, fed with the url returned by lastStep
.then(nextStep)
.then(lastStep)
.then(lastScraperPt1)
.then(lastScraperPt2)
.then(convertJson2Csv)
.catch(function(err) {
// handle any error from any request here
console.log(err);
});
//If the site is down, an error message describing the issue should appear in the console.
//This is to be tested by disabling wifi on your device.
//When an error occurs log it to a file scraper-error.log . It should append to the bottom of the file with a time stamp and error
/**
 * Reports a scraping error on the console and appends it, with a time stamp,
 * to scraper-error.log in the current working directory.
 *
 * Kept as a `var` assignment: the functions above refer to errorHandler and
 * only call it from asynchronous callbacks, after this assignment has run.
 *
 * @param {Error} error - The error to report; its message is logged.
 * @returns {undefined}
 */
var errorHandler = function (error) {
  console.log(error.message);
  // urlHome is the site being scraped (the old `url` variable no longer exists,
  // so referring to it threw a ReferenceError here)
  console.log('The scraper could not scrape data from ' + urlHome + ' there is either a problem with your internet connection or the site may be down');
  // log line: [time stamp] message
  var errLog = '[' + new Date() + '] ' + error.message + '\n';
  // append so earlier entries are preserved
  fs.appendFile('scraper-error.log', errLog, function (err) {
    if (err) throw err;
    console.log('There was an error. The error was logged to scraper-error.log');
  });
};