Question

该脚本从finance.yahoo.com下载历史股票价格。脚本循环遍历一个股票代码数组，为每个股票代码创建链接，并下载与之关联的数据。然而，其中一些股票代码已不再有效，此时雅虎返回的是404页面，而不是包含价格信息的csv；这个错误页面随后会被当作csv保存到我的计算机上。为了不下载这些文件，我在页面内容中查找字符串'Sorry, the page you requested was not found.'（“抱歉，找不到您请求的页面。”）——雅虎的每个错误页面都包含这段文字，可以用作404页面的标志。

代码的行为（输出，见下面的代码）：

代码会遍历所有股票代码，并下载每个股票价格的.csv文件。所有股票代码都会被处理，但其中一些代码雅虎已不再使用。对于这些不再使用的股票代码，程序下载到的是内容为雅虎404页面的.csv。所有文件（包括含有实际数据的文件）都会下载到目录 C:\Users\W7ADM\stock-price-leecher\data2。

问题：

我希望代码不要把404页面下载成csv文件，而是在这种情况下什么也不做，直接转到循环中的下一个股票代码。我尝试用if条件查找字符串'Sorry, the page you requested was not found.'（“抱歉，找不到您请求的页面。”），这段文字显示在雅虎的404页面上。最终，我想下载所有实际存在的股票代码的csv，并把它们保存到我的硬盘上。

// Question script: builds one Yahoo CSV URL per ticker and tries to skip
// tickers whose URL yields the 404 page. Kept exactly as posted; the
// NOTE(review) comments mark where it does not behave as intended.

// URL prefix and suffix; the ticker symbol is concatenated between them.
var url_begin = 'http://real-chart.finance.yahoo.com/table.csv?s=';
var url_end = '&a=00&b=1&c=1950&d=11&e=31&f=2050&g=d&ignore=.csv';
// Script-level state, reassigned inside the start() callback below.
var tickers = [];
var link_created = '';

// NOTE(review): webSecurityEnabled: false is presumably there so the
// cross-domain download from the start page is allowed -- confirm against the
// CasperJS/PhantomJS page settings documentation.
var casper = require('casper').create({
    pageSettings: {
        webSecurityEnabled: false
    }
});                   

// All work happens in the callback that runs once the start page has loaded.
casper.start('http://www.google.de', function() {              
        tickers = ['ADS.DE', '0AM.DE']; //ADS.DE is retrievable, 0AM.DE is not
        //loop through all ticker symbols
        // for...in walks the array's index keys, so tickers[i] is the symbol.
        for (var i in tickers){
                //create a link with the current ticker
                link_created=url_begin + tickers[i] + url_end;
                //check to see, if the created link returns a 404 page
                // NOTE(review): open() is reported to be non-blocking, so the
                // getHTML() call right after it most likely still reads the
                // start page rather than link_created. The posted output shows
                // both URLs being logged and never 'No Page found.', which
                // matches that. A thenOpen() step with a callback is the
                // suggested way to wait for the page.
                this.open(link_created);
                var content = this.getHTML();
                //If it is a 404 page, jump to the next iteration of the for loop
                if (content.indexOf('Sorry, the page you requested was not found.')>-1){
                        console.log('No Page found.');
                        continue; //At this point I want to jump to the next iteration of the loop.
                }
                //Otherwise download file to local hdd
                else {
                        console.log(link_created);
                        // Target path is relative; one file per ticker symbol.
                        this.download(link_created, 'stock-price-leecher\\data2\\'+tickers[i]+'.csv');
                }
        }
});

// Start the queued steps; print a final message and exit when they are done.
casper.run(function() {
        this.echo('Ende...').exit();
});

输出：

C:\Users\Win7ADM>casperjs spl_old.js
ADS.DE,0AM.DE
http://real-chart.finance.yahoo.com/table.csv?s=ADS.DE&a=00&b=1&c=1950&d=11&e=31
&f=2050&g=d&ignore=.csv
http://real-chart.finance.yahoo.com/table.csv?s=0AM.DE&a=00&b=1&c=1950&d=11&e=31
&f=2050&g=d&ignore=.csv
Ende...

C:\Users\Win7ADM>

Answer 1

casper.open是异步（非阻塞），但您以阻塞方式使用它。你应该使用casper.thenOpen，它有一个在页面加载时调用的回调，你可以用它做任何事情。

// Queue one navigation step per ticker. Each step's callback runs only after
// its page has loaded, so the 404 check inspects the right document.
// Relies on url_begin, url_end and casper from the surrounding script.
casper.start("http://example.com");

tickers = ['ADS.DE', '0AM.DE']; //ADS.DE is still retrievable, 0AM.DE is not

/**
 * Schedules the open-check-download step for a single ticker symbol.
 * @param {string} symbol - ticker symbol inserted into the CSV URL and file name
 */
function queueTickerDownload(symbol) {
    var csvUrl = url_begin + symbol + url_end;

    casper.thenOpen(csvUrl, function onCsvPageLoaded() {
        console.log("open", csvUrl);

        var notFoundMarker = 'Sorry, the page you requested was not found.';
        var isNotFoundPage = this.getHTML().indexOf(notFoundMarker) !== -1;

        // Guard clause: nothing to save when the 404 page came back.
        if (isNotFoundPage) {
            console.log('No Page found.');
            return;
        }

        console.log("downloading...");
        this.download(csvUrl, 'test14_' + symbol + '.csv');
    });
}

tickers.forEach(queueTickerDownload);

casper.run();

除了使用thenOpen的回调之外，您还可以注册page.resource.received事件，并通过检查状态码来有针对性地下载。但这样一来您就无法访问ticker，因此必须把它存储在全局变量中，或者从resource.url中解析出来。

// Alternative to a thenOpen callback: download from the resource-received
// event and decide by HTTP status instead of by the 404 page text.
// Relies on url_begin, url_end and casper from the surrounding script.
var i = 0; // running index used to number the saved files
casper.on("page.resource.received", function(resource){
    // The event is emitted for every resource the page receives, including the
    // start page passed to casper.start(). Without this filter that page would
    // also be saved as a .csv, so only react to the CSV URLs built below.
    // NOTE(review): if the server redirects to a different host, the final
    // response URL will not start with url_begin -- confirm and widen the
    // filter if that turns out to be the case.
    var isCsvRequest = typeof resource.url === "string" &&
        resource.url.indexOf(url_begin) === 0;

    // stage "end" is the completed response; status 200 excludes the 404 page.
    if (isCsvRequest && resource.stage === "end" && resource.status === 200) {
        this.download(resource.url, 'test14_'+(i++)+'.csv');
    }
});

casper.start("http://example.com");

tickers = ['ADS.DE', '0AM.DE']; //ADS.DE is still retrievable, 0AM.DE is not
tickers.forEach(function(ticker){
    var link_created = url_begin + ticker + url_end;
    // No callback: the event handler above reacts to the response.
    casper.thenOpen(link_created);
});

casper.run();

我认为您不应该使用open或thenOpen执行此操作。它可能适用于PhantomJS，但可能不适用于SlimerJS。

我实际尝试过，该页面有些奇怪：用download下载无法成功。您可以先加载某个占位页面（例如example.com），然后通过__utils__.sendAJAX自己获取csv内容（它只能在页面上下文中访问），再用fs模块的write把内容写入文件。只有当内容中不包含您已确认的那段404错误页面文本时，才应写入文件：

// Fetch each CSV with a request issued from the page context and write the
// response to disk, skipping tickers that return the 404 page.
// Relies on url_begin, url_end and casper from the surrounding script.
var fs = require('fs'); // fs.write() below needs the module in scope

casper.start("http://example.com");

casper.then(function(){
    tickers = ['ADS.DE', '0AM.DE']; //ADS.DE is still retrievable, 0AM.DE is not
    tickers.forEach(function(ticker){
        var link_created = url_begin + ticker + url_end;
        // __utils__ exists only inside the page context, hence evaluate().
        var content = casper.evaluate(function(url){
            return __utils__.sendAJAX(url, "GET");
        }, link_created);

        // A failed request can leave content null or empty. Skip it instead
        // of throwing on content.length or writing an empty file.
        if (!content) {
            console.log('No content received for ' + ticker);
            return;
        }

        console.log("len: ", content.length);
        if (content.indexOf('Sorry, the page you requested was not found.') > -1) {
            console.log('No Page found.');
        } else {
            console.log("writing...");
            // Explicit 'w' mode: create or overwrite the target file.
            fs.write('test14_'+ticker+'.csv', content, 'w');
        }
    });
});

casper.run();

如何在没有错误的情况下下载股票价格数据（404）？

1 个答案: