PHP更快的替代品?

时间:2018-01-06 08:55:45

标签: php

我一直在编写一段本质上是网络爬虫的代码:它先访问某个网站的首页,抓取各个分区(section)的链接,然后逐一访问这些链接,抓取其中需要的条目并解析成数组。问题在于一共有54个分区,需要抓取的条目总计约8,500个。

最大的问题是:要处理这么多数据,而我又无法修改目标网站的代码,用循环逐个抓取时页面加载非常慢。想请教大家,代码中有没有可以提速的地方,哪怕只快一点点也有帮助。把for循环换成XPath查询之后,平均加载时间已经从大约21秒降到了大约16秒,但在我看来仍然很慢。

无论如何,这是代码。我概括了它。

(基准测试显示,newPart函数每次调用平均耗时约0.00004秒。我不认为它是瓶颈所在,但它要运行约8,500次,所以如果你能找到让它更快的办法,那也很好。)

// Accumulates one entry per crawled section; getParts() appends to it.
$sections = [];
// Placeholder site, just as an example. Variable names are case-sensitive in
// PHP, so this must be spelled $homePage to match the `global $homePage`
// declarations in getSections() and getParts().
// NOTE(review): a real value needs a scheme (e.g. "http://..."), otherwise
// file_get_contents() treats it as a local file path.
$homePage = "google.com";
/**
 * Fetches the page at the global $homePage, finds every section link under
 * the "col-md-8 uw-content" container and passes each link, together with a
 * name derived from the link text, to getParts().
 *
 * The name is built from the words of the link text (skipping the first
 * word): the text after an opening "(" and the text before a closing ")"
 * are kept, everything else is ignored.
 * NOTE(review): words strictly between the opening and closing parenthesis
 * are dropped, so "(Foo Bar Baz)" yields "Foo Baz" - confirm this is intended.
 *
 * If the page cannot be downloaded, the function returns without crawling.
 *
 * @return void
 */
function getSections() {
    global $homePage;

    // "header" is the HTTP stream-context option that carries raw request
    // headers; each header line is terminated with CRLF.
    $options = ['http' => ['method' => "GET", 'header' => "User-Agent: ACrawler/1.0\r\n"]];
    $context = stream_context_create($options);

    // The warning is suppressed on purpose: failure is detected through the
    // return value instead.
    $html = @file_get_contents($homePage, false, $context);
    if ($html === false || $html === '') {
        return;
    }

    // Collect markup warnings internally instead of silencing them with "@".
    $previousSetting = libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $doc->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousSetting);

    $xpath = new DOMXPath($doc);
    $anchors = $xpath->query("//div[contains(@class, 'col-md-8 uw-content')]/ul/li/a");

    foreach ($anchors as $anchor) {
        $link = $anchor->getAttribute("href");
        $words = explode(" ", $anchor->textContent);
        $name = "";

        // Index 0 is skipped: only the words after the first one are scanned.
        for ($i = 1, $wordCount = count($words); $i < $wordCount; $i++) {
            $word = $words[$i];
            $opens = substr($word, 0, 1) === "(";
            $closes = substr($word, -1) === ")";

            if ($opens) {
                // Strip the "(" and, when present, the ")" of the same word.
                $name .= ($closes ? substr($word, 1, -1) : substr($word, 1)) . " ";
            } elseif ($closes) {
                $name .= substr($word, 0, -1);
            }
        }

        // Remove the single trailing separator left by an opening word.
        if (substr($name, -1) === " ") {
            $name = substr($name, 0, -1);
        }

        getParts($link, $name);
    }
}

/**
 * Downloads one section page, converts every named anchor on it into a part
 * via newPart() and appends the result to the global $sections list.
 *
 * The page URL is the global $homePage with $newlink appended verbatim.
 * A section entry is always recorded; when the page cannot be downloaded
 * its "parts" list is empty.
 *
 * @param string $newlink Link of the section page, appended to $homePage.
 * @param string $name    Section name stored under "sectionName" and passed
 *                        on to newPart().
 * @return void
 */
function getParts($newlink, $name) {
    global $homePage, $sections;

    // "header" is the HTTP stream-context option that carries raw request
    // headers; each header line is terminated with CRLF.
    $options = ['http' => ['method' => "GET", 'header' => "User-Agent: ACrawler/1.0\r\n"]];
    $context = stream_context_create($options);

    $parts = [];

    // The warning is suppressed on purpose: failure is detected through the
    // return value instead.
    $html = @file_get_contents($homePage . $newlink, false, $context);
    if ($html !== false && $html !== '') {
        // Collect markup warnings internally instead of silencing them with "@".
        $previousSetting = libxml_use_internal_errors(true);
        $doc = new DOMDocument();
        $doc->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);

        $xpath = new DOMXPath($doc);
        foreach ($xpath->query("//a[@name]") as $anchor) {
            $parts[] = newPart($anchor, $name);
        }
    }

    $sections[] = [
        "sectionName" => $name,
        "parts" => $parts,
    ];
}

/**
 * Parses one entry anchor into an associative array.
 *
 * The anchor is expected to contain a <p> whose <b> holds the title line
 * and which also contains an <a href> link. The paragraph text is assumed to
 * be the title immediately followed by the description and then the link
 * text ("View course details in ...").
 * NOTE(review): a missing <p>, <b> or <a> is not handled and results in a
 * call on null, exactly as before.
 *
 * Title layout: one or two leading words, a number, the name words, a
 * parenthesised token such as "(4)", then category words. Only the first
 * character after "(" is kept in "ints".
 *
 * Description: the text is cut into sentences at words ending in ".".
 * Trailing sentences that start with "Prerequisite" or "Co-requisite" are
 * moved out of the description into "splitdesc".
 *
 * @param DOMElement $a    Anchor element wrapping the entry.
 * @param string     $name Section name; accepted for interface compatibility
 *                         and not used by the parser.
 * @return array Keys: "number", "title", "ints", "categories" (list),
 *               "description" (string), "splitdesc" (list of requisite
 *               sentences in document order), "link".
 */
function newPart($a, $name){
    $p = $a->getElementsByTagName("p")->item(0);

    $title = $p->getElementsByTagName("b")->item(0)->textContent;
    $splitTitle = explode(" ", $title);
    $titleCount = count($splitTitle);

    // The number is the second word, or the third when the second is not
    // numeric; $cleared is the index of the first word after the number.
    if (isset($splitTitle[1]) && is_numeric($splitTitle[1])) {
        $number = $splitTitle[1];
        $cleared = 2;
    } else {
        $number = isset($splitTitle[2]) ? $splitTitle[2] : null;
        $cleared = 3;
    }

    $pname = "";
    $ints = null;
    $categories = [];

    for ($i = $cleared; $i < $titleCount; $i++) {
        $word = $splitTitle[$i];
        if (substr($word, 0, 1) === "(") {
            // Keep the single character that follows the "(".
            $ints = (string) substr($word, 1, 1);
        } elseif ($ints !== null && $ints !== "") {
            // Words after the parenthesised token are categories; drop a
            // trailing list comma.
            $categories[] = substr($word, -1) === "," ? substr($word, 0, -1) : $word;
        } else {
            // Words before it belong to the name (trailing space is kept).
            $pname .= $word . " ";
        }
    }

    $splitText = explode(" ", $p->textContent);
    // The last title word and the first description word share one token
    // because the text nodes are concatenated without a separator.
    $firstIndex = $titleCount - 1;
    $lastTitleWord = $splitTitle[$firstIndex];
    // Stop before the words of the trailing link text: four fixed words plus
    // the $cleared identifier words that follow the fused ".View" token.
    $endIndex = count($splitText) - (5 + $cleared - 1);

    $sentences = [];
    $sentence = "";
    for ($i = $firstIndex; $i < $endIndex; $i++) {
        $word = $splitText[$i];

        if ($i === $firstIndex) {
            // Remove the fused title word. Its real length is used instead of
            // a fixed 3; the fixed width is kept only as a fallback when the
            // token does not start with that word.
            $prefixLength = strlen($lastTitleWord);
            if (strncmp($word, $lastTitleWord, $prefixLength) !== 0) {
                $prefixLength = 3;
            }
            $word = (string) substr($word, $prefixLength);
        }

        if (substr($word, -1) === ".") {
            $sentences[] = $sentence . $word;
            $sentence = "";
        } elseif (substr($word, -5) === ".View") {
            // The last sentence is fused with the link text; cut off "View".
            $sentences[] = substr($sentence . $word, 0, -4);
            $sentence = "";
        } else {
            $sentence .= $word . " ";
        }
    }

    // Peel requisite sentences off the end. Both prefixes are 12 characters
    // long, so 12 characters must be compared for a match to be possible.
    $requisites = [];
    while (count($sentences) > 0) {
        $last = $sentences[count($sentences) - 1];
        $isRequisite = strncmp($last, "Prerequisite", 12) === 0
            || strncmp($last, "Co-requisite", 12) === 0;
        if (!$isRequisite) {
            break;
        }
        $requisites[] = array_pop($sentences);
    }
    // They were collected back to front; restore document order.
    $splitdesc = array_reverse($requisites);

    $description = "";
    foreach ($sentences as $kept) {
        $description .= $kept . " ";
    }

    $link = $p->getElementsByTagName("a")->item(0)->getAttribute("href");

    return [
        "number" => $number,
        "title" => $pname,
        "ints" => $ints,
        "categories" => $categories,
        "description" => $description,
        "splitdesc" => $splitdesc,
        "link" => $link,
    ];
}

// Start the crawl: getSections() hands each section link it finds to
// getParts(), which appends the parsed sections to the global $sections.
getSections();

提前感谢您的帮助。

0 个答案:

没有答案