格式包含一个数组array(['url'] => URLToPage, ['title'] => TitleOfPage, ['link_title'] => TitleOfLinkedPage)
以防止执行时间到期,因为此脚本需要一些时间才能完成。此代码在填充$ products数组后执行,如果找到任何链接,$ products始终是数组,并且我在测试用例中输出$ link_html_strings以验证是否按预期检索了页面。这是违法的代码:
// Populate the destination link titles
if ( isset( $products ) && count( $products ) > 0 )
foreach( $products as $id => $product )
$from_this_page = $product['url'];
if ( $DEBUG ) echo 'Parsing ' . $from_this_page . '.<br />';
$link_html_string = file_get_contents( $from_this_page, NULL, NULL, NULL, 500 );
$string_parts = explode( '<title>', $link_html_string );
$string_parts = explode( '</title>', $string_parts[1] );
$products[$id]['link_title'] = $string_parts[0];
if ( $DEBUG ) echo 'Found title: ' . $products[$id]['link_title'] . '<br />';
这是脚本的完整代码,用于回答有关评论中$ product产品内容的问题。
// PHP HTML DOM Parser from http://simplehtmldom.sourceforge.net/
require_once( 'includes/simple_html_dom.php' );
//error_reporting( E_ALL );
set_time_limit( 0 );
// Debugging flag
$DEBUG = false;
function reportProducts( $category, $products )
echo '<table width="90%" align="center"><tr><th colspan="3">';
echo $category . ' has ' . count( $products ) . ' products listed, or in subpages.';
echo '</th></tr>';
echo '<tr><td bgcolor="#777777" width="30%">This page</td>
<td bgcolor="#bbbbbb" width="30%">links with</td>
<td bgcolor="#777777" width="30%">to this page</td></tr>';
foreach( $products as $product )
echo '<tr><td bgcolor="#777777">' . $product['title'] . '</td>
<td bgcolor="#bbbbbb"><a href="' . $product['url'] . '">' . $product['url'] .
'</a></td><td bgcolor="#777777">' . $product['link_title'] . '</td></tr>';
echo '</table><br />';
ob_flush(); // Server may buffer again, preventing incremental display
function parseProductsForPage( $page_to_parse )
global $DEBUG;
$failed = false;
$product_id = 0;
$page_dom = new simple_html_dom();
$page_html_string = @file_get_contents( $page_to_parse->href );
$load_state = @$page_dom->load( $page_html_string );
if ( $load_state === NULL )
// Find any direct product pages for this page
if ( $DEBUG ) echo $page_to_parse->href . ' being checked for products... ';
$possible = $page_dom->find( 'a[onclick]' );
foreach( $possible as $link )
if ( $link->innertext == "[ Add to cart ]" )
$products[$product_id]['url'] = $link->href;
$titles = $page_dom->find( 'title' );
$products[$product_id]['title'] = $titles[0]->innertext;
if ( $DEBUG )
if ( isset( $products ) )
echo count( $products ) . ' found on page.<br />';
} else
echo '0 found on page.<br />';
// Find subpages...
if ( $DEBUG ) echo $page_to_parse->href . ' being checked for links... ';
$subpages = $page_dom->find( 'a[class=buy]' );
if ( $DEBUG ) echo count( $subpages ) . ' found.<br />';
// ... and parse
foreach( $subpages as $subpage )
$subpage_dom = new simple_html_dom();
$subpage_html_string = @file_get_contents( $subpage->href );
$load_state = @$subpage_dom->load( $subpage_html_string );
if ( $load_state === NULL )
// Find any direct product pages for this page
if ( $DEBUG ) echo $subpage->href . ' being checked for products... ';
$possible = $subpage_dom->find( 'a[onclick]' );
foreach( $possible as $link )
if ( $link->innertext == "[ Add to cart ]" )
$products[$product_id]['url'] = $link->href;
$titles = $page_dom->find( 'title' );
$products[$product_id]['title'] = $titles[0]->innertext;
if ( $DEBUG )
if ( isset( $products ) )
echo count( $products ) . ' found on page.<br />';
} else
echo '0 found on page.<br />';
} else
$failed[] = $subpage->href;
unset( $subpage_dom );
// Populate the destination link titles
if ( isset( $products ) && count( $products ) > 0 )
foreach( $products as $id => $product )
// $from_this_page = $product['url'];
// if ( $DEBUG ) echo 'Parsing ' . $from_this_page . '.<br />';
// $link_html_string = file_get_contents( $from_this_page, NULL, NULL, NULL, 500 );
// $string_parts = explode( '<title>', $link_html_string );
// $string_parts = explode( '</title>', $string_parts[1] );
// $products[$id]['link_title'] = $string_parts[0];
// if ( $DEBUG ) echo 'Found title: ' . $products[$id]['link_title'] . '<br />';
// ob_flush();
// flush();
} else
$failed[] = $page_to_parse->href;
$titles = $page_dom->find( 'title' );
if ( isset( $products ) ) reportProducts( $titles[0]->innertext, $products );
unset( $page_dom );
return $failed;
// Initialize the object
$html = new simple_html_dom();
$html->load_file( 'index.html' );
// Start output buffer
// Find all product categories listed on the website
if ( $DEBUG ) echo '<h1>Collecting links from LHN...</h1>';
$sidelinks = $html->find( 'a[class=sidelink_main]' );
unset( $html );
echo '<h1>Found ' . count( $sidelinks ) . ' categories.</h1><br />';
ob_flush(); // Server may buffer output, preventing incremental display
// Find links and products for each category
foreach( $sidelinks as $sidelink )
if ( $DEBUG ) echo 'Sending ' . $sidelink->href . ' to parser.<br />';
$parse_failed = parseProductsForPage( $sidelink );
if ( $parse_failed )
foreach( $parse_failed as $failure )
$failures[] = $failure;
echo count( $failures ) . ' pages failed to parse.<br />';
echo '<br />FIN!<br />'; // Easily searched to verfiy end of script was reached, also
// celebratory.
ob_end_flush(); // Clear output buffer
答案 0 :(得分:0)
还要确保$string_parts = explode( '<title>', $link_html_string );