I'm trying to return all of the URLs provided in a website's sitemap, for example Argos. Once I have those URLs, I then need to repeat the process for any URLs that the returned URLs may themselves contain. For example:
http://www.argos.co.uk/sitemap.xml returns:
http://www.argos.co.uk/product.xml
http://www.argos.co.uk/product2.xml
http://www.argos.co.uk/catalogue.xml
http://www.argos.co.uk/buyers_guides.xml
http://www.argos.co.uk/features_and_articles.xml
http://www.argos.co.uk/static_pages.xml
http://www.argos.co.uk/store_pages.xml
http://www.argos.co.uk/product.xml then contains the links I actually need (and the process needs to repeat until it reaches pages that contain no further xml URLs).
What I have so far:
var urls = require('sitemap-urls'); // package to extract xml links from a sitemap
var cheerio = require('cheerio');
var request = require('request');

// Returns all xml urls located within the page source
request('http://www.argos.co.uk/sitemap.xml', function (error, response, html) {
    var sitemap = html;
    var results = urls.extractUrls(sitemap);
    // If results are returned, loop so sitemap equals each url until the array ends
    if (results) {
        for (var i = 0; i < results.length; i++) {
            sitemap = results[i];
            console.log(sitemap);
            // Need to repeat the url return process for each url returned
        }
    }
});
I may be overlooking a simple solution; any help is greatly appreciated, thanks.
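The missing piece in the snippet above is a function that calls itself for every result that looks like another sitemap. Below is a minimal recursive sketch, assuming (as the code above does) that extractUrls() returns a plain array of URL strings; the names crawlSitemap and visited, and the .xml test used to decide whether to recurse, are my own:

var urls = require('sitemap-urls');
var request = require('request');

var visited = {}; // guard so the same sitemap is never fetched twice

function crawlSitemap(sitemapUrl) {
    if (visited[sitemapUrl]) {
        return;
    }
    visited[sitemapUrl] = true;

    request(sitemapUrl, function (error, response, body) {
        if (error || response.statusCode !== 200) {
            return;
        }
        var results = urls.extractUrls(body) || [];
        results.forEach(function (url) {
            console.log(url);
            // Assumption: only URLs ending in .xml are nested sitemaps
            if (/\.xml$/.test(url)) {
                crawlSitemap(url);
            }
        });
    });
}

crawlSitemap('http://www.argos.co.uk/sitemap.xml');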
Answer 0 (score: 2)
I think what you're looking for is a spider:
<?php
function crawl_page($url, $depth = 5)
{
    // Remember visited URLs so each page is crawled at most once
    static $seen = array();
    if (isset($seen[$url]) || $depth === 0) {
        return;
    }
    $seen[$url] = true;

    $dom = new DOMDocument('1.0');
    @$dom->loadHTMLFile($url);

    $anchors = $dom->getElementsByTagName('a');
    foreach ($anchors as $element) {
        $href = $element->getAttribute('href');
        // Resolve relative links against the current URL
        if (0 !== strpos($href, 'http')) {
            $path = '/' . ltrim($href, '/');
            if (extension_loaded('http')) {
                $href = http_build_url($url, array('path' => $path));
            } else {
                $parts = parse_url($url);
                $href = $parts['scheme'] . '://';
                if (isset($parts['user']) && isset($parts['pass'])) {
                    $href .= $parts['user'] . ':' . $parts['pass'] . '@';
                }
                $href .= $parts['host'];
                if (isset($parts['port'])) {
                    $href .= ':' . $parts['port'];
                }
                $href .= $path;
            }
        }
        crawl_page($href, $depth - 1);
    }
    echo "URL:", $url, PHP_EOL, "CONTENT:", PHP_EOL, $dom->saveHTML(), PHP_EOL, PHP_EOL;
}

crawl_page("http://hobodave.com", 2);
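The two guards at the top of crawl_page are what keep the recursion under control: the static $seen array ensures each URL is fetched at most once, and $depth bounds how many levels of links are followed. The same two ideas, a visited set and a depth limit (or the natural base case of pages containing no further xml URLs), carry over directly to the Node.js sketch above.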