Question

我正在解析网页并关注链接，以便将链接从一个页面映射到另一个页面。我只是提取链接所在页面的标题，用于链接页面的URL以及URL导致的页面标题。

我的代码可以顺利发现我感兴趣的链接，并下载子页面以查找其他产品链接。在至少一百页中有几百个，所以它是几个解析的HTML文件。我正在构建一个$products[index]格式包含一个数组array(['url'] => URLToPage, ['title'] => TitleOfPage, ['link_title'] => TitleOfLinkedPage)，我希望这能够证明这一点。

该脚本正常工作，直到我添加此片段，之后脚本将停止执行，没有错误，警告，通知或任何内容;它根本不会到达脚本的末尾。我已经包含set_time_limit(0)以防止执行时间到期，因为此脚本需要一些时间才能完成。此代码在填充$ products数组后执行，如果找到任何链接，$ products始终是数组，并且我在测试用例中输出$ link_html_strings以验证是否按预期检索了页面。这是违法的代码：

// Populate the destination link titles
if ( isset( $products ) && count( $products ) > 0 )
    {
        foreach( $products as $id => $product )
        {
             $from_this_page = $product['url'];
             if ( $DEBUG ) echo 'Parsing ' . $from_this_page . '.<br />';
             $link_html_string = file_get_contents( $from_this_page, NULL, NULL, NULL, 500 );
             $string_parts = explode( '<title>', $link_html_string );
             $string_parts = explode( '</title>', $string_parts[1] );
             $products[$id]['link_title'] = $string_parts[0];
             if ( $DEBUG ) echo 'Found title: ' . $products[$id]['link_title'] . '<br />';
             ob_flush();
             flush();
        }
    }

不应该真的需要500个字符，但是，在阅读整个文件时我对内存使用有一些顾虑，所以我通过限制读取来减少负载（我认为）。我想也许这个脚本耗尽了PHP的所有已分配内存。当包含它时，它将遍历此循环多次，但在某些时候停止执行，这一点也不完全相同。对于正在解析的文件，我会得到几个echo。

这是脚本的完整代码，用于回答有关评论中$ product产品内容的问题。

<?php
// PHP HTML DOM Parser from http://simplehtmldom.sourceforge.net/
require_once( 'includes/simple_html_dom.php' );

//error_reporting( E_ALL );
set_time_limit( 0 );

// Debugging flag
$DEBUG = false;

function reportProducts( $category, $products )
{
    echo '<table width="90%" align="center"><tr><th colspan="3">';
    echo $category . ' has ' . count( $products ) . ' products listed, or in subpages.';
    echo '</th></tr>';
    echo '<tr><td bgcolor="#777777" width="30%">This page</td>
        <td bgcolor="#bbbbbb" width="30%">links with</td>
        <td bgcolor="#777777" width="30%">to this page</td></tr>';
    foreach( $products as $product )
    {
        echo '<tr><td bgcolor="#777777">' . $product['title'] . '</td>
            <td bgcolor="#bbbbbb"><a href="' . $product['url'] . '">' . $product['url'] . 
            '</a></td><td bgcolor="#777777">' . $product['link_title'] . '</td></tr>';
    }
    echo '</table><br />';
    ob_flush(); // Server may buffer again, preventing incremental display
    flush();
}

function parseProductsForPage( $page_to_parse )
{
    global $DEBUG;
    $failed = false;
    $product_id = 0;
    $page_dom = new simple_html_dom();
    $page_html_string = @file_get_contents( $page_to_parse->href );
    $load_state = @$page_dom->load( $page_html_string );
    if ( $load_state === NULL )
    {
        // Find any direct product pages for this page
        if ( $DEBUG ) echo $page_to_parse->href . ' being checked for products... ';
        $possible = $page_dom->find( 'a[onclick]' );
            foreach( $possible as $link )
            {
                if ( $link->innertext == "[ Add to cart ]" )
                {
                    $products[$product_id]['url'] = $link->href;
                    $titles = $page_dom->find( 'title' );
                    $products[$product_id]['title'] = $titles[0]->innertext;
                    $product_id++;
                }
            }
        if ( $DEBUG ) 
        {
            if ( isset( $products ) )
            {
                echo count( $products ) . ' found on page.<br />';
            } else
            {
                echo '0 found on page.<br />';
            }
        }

        // Find subpages...
        if ( $DEBUG ) echo $page_to_parse->href . ' being checked for links... ';
        $subpages = $page_dom->find( 'a[class=buy]' );
        if ( $DEBUG ) echo count( $subpages ) . ' found.<br />';
        // ... and parse
        foreach( $subpages as $subpage )
        {
            $subpage_dom = new simple_html_dom();
            $subpage_html_string = @file_get_contents( $subpage->href );
            $load_state = @$subpage_dom->load( $subpage_html_string );
            if ( $load_state === NULL )
            {
                // Find any direct product pages for this page
                if ( $DEBUG ) echo $subpage->href . ' being checked for products... ';
                $possible = $subpage_dom->find( 'a[onclick]' );
                    foreach( $possible as $link )
                    {
                        if ( $link->innertext == "[ Add to cart ]" )
                        {
                            $products[$product_id]['url'] = $link->href;
                            $titles = $page_dom->find( 'title' );
                            $products[$product_id]['title'] = $titles[0]->innertext;
                            $product_id++;
                        }
                    }
                if ( $DEBUG ) 
                {
                    if ( isset( $products ) )
                    {
                        echo count( $products ) . ' found on page.<br />';
                    } else
                    {
                        echo '0 found on page.<br />';
                    }
                }
                $subpage_dom->clear();
            } else
            {
                $failed[] = $subpage->href;
            }
            $subpage_dom->clear();
            unset( $subpage_dom );
        }
        // Populate the destination link titles
        if ( isset( $products ) && count( $products ) > 0 )
        {
            foreach( $products as $id => $product )
            {
                // $from_this_page = $product['url'];
                // if ( $DEBUG ) echo 'Parsing ' . $from_this_page . '.<br />';
                // $link_html_string = file_get_contents( $from_this_page, NULL, NULL, NULL, 500 );
                // $string_parts = explode( '<title>', $link_html_string );
                // $string_parts = explode( '</title>', $string_parts[1] );
                // $products[$id]['link_title'] = $string_parts[0];
                // if ( $DEBUG ) echo 'Found title: ' . $products[$id]['link_title'] . '<br />';
                // ob_flush();
                // flush();
            }
        }
    } else
    {
        $failed[] = $page_to_parse->href;
    }
    $titles = $page_dom->find( 'title' );
    if ( isset( $products ) ) reportProducts( $titles[0]->innertext, $products );
    $page_dom->clear();
    unset( $page_dom );
    return $failed;
}

// Initialize the object
$html = new simple_html_dom();
$html->load_file( 'index.html' );

// Start output buffer
ob_start();

// Find all product categories listed on the website
if ( $DEBUG ) echo '<h1>Collecting links from LHN...</h1>';
$sidelinks = $html->find( 'a[class=sidelink_main]' );
$html->clear();
unset( $html );

echo '<h1>Found ' . count( $sidelinks ) . ' categories.</h1><br />';
ob_flush(); // Server may buffer output, preventing incremental display
flush();

// Find links and products for each category
foreach( $sidelinks as $sidelink )
{
    if ( $DEBUG ) echo 'Sending ' . $sidelink->href . ' to parser.<br />';
    $parse_failed = parseProductsForPage( $sidelink );
    if ( $parse_failed )
    {
        foreach( $parse_failed as $failure )
        {
            $failures[] = $failure;
        }
    }
}

echo count( $failures ) . ' pages failed to parse.<br />';

echo '<br />FIN!<br />'; // Easily searched to verfiy end of script was reached, also
                         // celebratory.
ob_end_flush(); // Clear output buffer
flush();
?>

Answer 1

你确定set_time_limit有效吗（当运行带有safe_mode的php时会产生任何影响）？

还要确保$string_parts = explode( '<title>', $link_html_string );给出结果（可能没有title-element或tagName可能使用大写）

此代码块中是否存在可以停止执行的内容？

1 个答案: