我正在做一个图片搜索项目,想获取某个页面上最大的那张图片。我写了一些代码,把图片地址修正为绝对地址,并剔除可能是广告的图片,然后比较各图片的“宽度 × 高度”,输出(echo)最大的一张。但我的代码有一些问题。下面是我的完整代码。谁能帮我修正错误的地方,并告诉我如何优化代码?我觉得整个处理过程很慢。谢谢大家。
<?php
// Fetches one page, resolves every <img> src to an absolute URL, skips URLs
// that contain an ad keyword, and prints the image with the largest
// width * height.
require_once 'simple_html_dom.php';
require 'url_to_absolute.php'; //get image absolute url
$v = 'http://www.yomiuri.co.jp/stream/';
$html = file_get_html($v);
$maxsize = -1;
$the_biggest_image = false;
$arr = array('ad', 'ads','gif');// substrings that mark an image URL as a probable ad; checked against each image URL
foreach($html->find('img') as $element) {
// NOTE(review): this match depends only on $v, so the same result is recomputed on every iteration.
preg_match_all('#https?://(.*?)($|/)#m', urldecode(stripcslashes($v)), $r); //get site base url
$pic = $element->src;
$comm = url_to_absolute( $r[0][0], $pic);//get image absolute url
$check_flag = true;
foreach($arr as $item) {
if (substr_count(strtolower($comm),$item) > 0) $check_flag = false;
}// remove ad images
// NOTE(review): this assignment overwrites $arr, which is also the ad-keyword list
// iterated above, so later iterations loop over getimagesize() output instead of
// the keywords. When $check_flag is false, $arr keeps its previous value and the
// size comparison below still runs.
if ($check_flag) $arr = @getimagesize($comm);// get the width and height of the remaining images
// NOTE(review): $comm is the return value of url_to_absolute() (a string or FALSE),
// not an array, so reset() has nothing valid to operate on here.
reset($comm);
if (($arr[0] * $arr[1]) > $maxsize) {
$maxsize = $arr[0] * $arr[1]; //compare image sizes
$the_biggest_image = $comm;
// NOTE(review): this runs every time a new maximum is found, so several <img>
// tags can be printed, not only the final biggest one.
echo '<img src="'.$the_biggest_image.'" />'; //echo the biggest one
}
}
?>
url_to_absolute.php
<?php
/**
* Edited by Nitin Kr. Gupta, publicmind.in
*/
/**
* Copyright (c) 2008, David R. Nadeau, NadeauSoftware.com.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided
* with the distribution.
*
* * Neither the names of David R. Nadeau or NadeauSoftware.com, nor
* the names of its contributors may be used to endorse or promote
* products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
* WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
* OF SUCH DAMAGE.
*/
/*
* This is a BSD License approved by the Open Source Initiative (OSI).
* See: http://www.opensource.org/licenses/bsd-license.php
*/
/**
* Combine a base URL and a relative URL to produce a new
* absolute URL. The base URL is often the URL of a page,
* and the relative URL is a URL embedded on that page.
*
* This function implements the "absolutize" algorithm from
* the RFC3986 specification for URLs.
*
* This function supports multi-byte characters with the UTF-8 encoding,
* per the URL specification.
*
* Parameters:
* baseUrl the absolute base URL.
*
* url the relative URL to convert.
*
* Return values:
* An absolute URL that combines parts of the base and relative
* URLs, or FALSE if the base URL is not absolute or if either
* URL cannot be parsed.
*/
function url_to_absolute( $baseUrl, $relativeUrl )
{
    // Parse the reference first: a reference that carries its own scheme is
    // already absolute and only needs its path normalised.
    $r = split_url( $relativeUrl );
    if ( $r === FALSE )
        return FALSE;
    // An empty reference may come back as a non-array; treat it as "no parts"
    // so the base URL's components are used below.
    if ( !is_array( $r ) )
        $r = array( );
    if ( !empty( $r['scheme'] ) )
    {
        if ( !empty( $r['path'] ) && $r['path'][0] == '/' )
            $r['path'] = url_remove_dot_segments( $r['path'] );
        return join_url( $r );
    }

    // Make sure the base URL is absolute.
    $b = split_url( $baseUrl );
    if ( $b === FALSE || empty( $b['scheme'] ) || empty( $b['host'] ) )
        return FALSE;
    $r['scheme'] = $b['scheme'];

    // If relative URL has an authority ("//host/..."), clean path and return.
    if ( isset( $r['host'] ) )
    {
        if ( !empty( $r['path'] ) )
            $r['path'] = url_remove_dot_segments( $r['path'] );
        return join_url( $r );
    }
    unset( $r['port'] );
    unset( $r['user'] );
    unset( $r['pass'] );

    // Copy base authority.
    $r['host'] = $b['host'];
    if ( isset( $b['port'] ) ) $r['port'] = $b['port'];
    if ( isset( $b['user'] ) ) $r['user'] = $b['user'];
    if ( isset( $b['pass'] ) ) $r['pass'] = $b['pass'];

    // If relative URL has no path, use base path (and base query, unless the
    // reference supplies its own).
    if ( empty( $r['path'] ) )
    {
        if ( !empty( $b['path'] ) )
            $r['path'] = $b['path'];
        if ( !isset( $r['query'] ) && isset( $b['query'] ) )
            $r['query'] = $b['query'];
        return join_url( $r );
    }

    // If relative URL path doesn't start with /, merge with base path:
    // keep everything in the base path before its last '/'.
    if ( $r['path'][0] != '/' )
    {
        // The base may have no path at all (e.g. "http://example.com");
        // reading $b['path'] unguarded raised a warning in that case.
        $basePath = isset( $b['path'] ) ? $b['path'] : '';
        // '/' is a single ASCII byte, so a byte-wise search is UTF-8 safe.
        $cut  = strrpos( $basePath, '/' );
        $base = ( $cut === FALSE ) ? '' : substr( $basePath, 0, $cut );
        $r['path'] = $base . '/' . $r['path'];
    }
    $r['path'] = url_remove_dot_segments( $r['path'] );
    return join_url( $r );
}
/**
* Filter out "." and ".." segments from a URL's path and return
* the result.
*
* This function implements the "remove_dot_segments" algorithm from
* the RFC3986 specification for URLs.
*
* This function supports multi-byte characters with the UTF-8 encoding,
* per the URL specification.
*
* Parameters:
* path the path to filter
*
* Return values:
* The filtered path with "." and ".." removed.
*/
function url_remove_dot_segments( $path )
{
    $path = (string) $path;
    // Nothing to normalise; also avoids indexing into an empty string below.
    if ( $path === '' )
        return '';

    // '/' is a single ASCII byte and can never occur inside a UTF-8
    // multi-byte sequence, so a byte-wise split is safe for UTF-8 paths
    // (and, unlike a /u regex split, does not fail on malformed UTF-8).
    $inSegs  = explode( '/', $path );
    $outSegs = array( );
    foreach ( $inSegs as $seg )
    {
        if ( $seg === '' || $seg === '.' )
            continue;
        if ( $seg === '..' )
            array_pop( $outSegs );      // ".." above the root is ignored
        else
            $outSegs[] = $seg;
    }

    $outPath = implode( '/', $outSegs );
    if ( $path[0] === '/' )
        $outPath = '/' . $outPath;

    // The result names a directory (and keeps a trailing '/') when the input
    // ended in '/', or when its last segment was "." or ".." -- RFC3986
    // resolves "/b/c/." to "/b/c/" and "/b/c/.." to "/b/".
    $last = $inSegs[count( $inSegs ) - 1];
    $endsInDir = ( $last === '' )
        || ( ( $last === '.' || $last === '..' ) && $outPath !== '' );
    if ( $endsInDir && $outPath !== '/' )
        $outPath .= '/';
    return $outPath;
}
/**
* This function parses an absolute or relative URL and splits it
* into individual components.
*
* RFC3986 specifies the components of a Uniform Resource Identifier (URI).
* A portion of the ABNFs are repeated here:
*
* URI-reference = URI
* / relative-ref
*
* URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
*
* relative-ref = relative-part [ "?" query ] [ "#" fragment ]
*
* hier-part = "//" authority path-abempty
* / path-absolute
* / path-rootless
* / path-empty
*
* relative-part = "//" authority path-abempty
* / path-absolute
* / path-noscheme
* / path-empty
*
* authority = [ userinfo "@" ] host [ ":" port ]
*
* So, a URL has the following major components:
*
* scheme
* The name of a method used to interpret the rest of
* the URL. Examples: "http", "https", "mailto", "file".
*
* authority
* The name of the authority governing the URL's name
* space. Examples: "example.com", "user@example.com",
* "example.com:80", "user:password@example.com:80".
*
* The authority may include a host name, port number,
* user name, and password.
*
* The host may be a name, an IPv4 numeric address, or
* an IPv6 numeric address.
*
* path
* The hierarchical path to the URL's resource.
* Examples: "/index.htm", "/scripts/page.php".
*
* query
* The data for a query. Examples: "?search=google.com".
*
* fragment
* The name of a secondary resource relative to that named
* by the path. Examples: "#section1", "#header".
*
* An "absolute" URL must include a scheme and path. The authority, query,
* and fragment components are optional.
*
* A "relative" URL does not include a scheme and must include a path. The
* authority, query, and fragment components are optional.
*
* This function splits the $url argument into the following components
* and returns them in an associative array. Keys to that array include:
*
* "scheme" The scheme, such as "http".
* "host" The host name, IPv4, or IPv6 address.
* "port" The port number.
* "user" The user name.
* "pass" The user password.
* "path" The path, such as a file path for "http".
* "query" The query.
* "fragment" The fragment.
*
* One or more of these may not be present, depending upon the URL.
*
* Optionally, the "user", "pass", "host" (if a name, not an IP address),
* "path", "query", and "fragment" may have percent-encoded characters
* decoded. The "scheme" and "port" cannot include percent-encoded
* characters and are never decoded. Decoding occurs after the URL has
* been parsed.
*
* Parameters:
* url the URL to parse.
*
* decode an optional boolean flag selecting whether
* to decode percent encoding or not. Default = FALSE.
*
* Return values:
* the associative array of URL parts, or FALSE if the URL is
* too malformed to recognize any parts.
*/
function split_url( $url, $decode=FALSE)
{
    // Character sets from RFC3986.
    $xunressub = 'a-zA-Z\d\-._~\!$&\'()*+,;=';
    $xpchar = $xunressub . ':@% ';
    // Scheme from RFC3986. The hyphen is escaped so it is a literal and not
    // the range "+" .. "." (which would also admit ",").
    $xscheme = '([a-zA-Z][a-zA-Z\d+.\-]*)';
    // User info (user + password) from RFC3986.
    $xuserinfo = '(([' . $xunressub . '%]*)' .
        '(:([' . $xunressub . ':%]*))?)';
    // IPv4 from RFC3986 (without digit constraints).
    $xipv4 = '(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})';
    // IPv6 from RFC2732 (without digit and grouping constraints).
    $xipv6 = '(\[([a-fA-F\d.:]+)\])';
    // Host name from RFC1035. Technically, must start with a letter.
    // Relax that restriction to better parse URL structure, then
    // leave host name validation to application.
    // The hyphen must be escaped: an unescaped "-" right after \d is rejected
    // by PCRE2 ("invalid range in character class"), which made every call
    // return FALSE.
    $xhost_name = '([a-zA-Z\d\-.%]+)';
    // Authority from RFC3986. Skip IP future.
    $xhost = '(' . $xhost_name . '|' . $xipv4 . '|' . $xipv6 . ')';
    $xport = '(\d*)';
    $xauthority = '((' . $xuserinfo . '@)?' . $xhost .
        '?(:' . $xport . ')?)';
    // Path from RFC3986. Blend absolute & relative for efficiency.
    $xslash_seg = '(/[' . $xpchar . ']*)';
    $xpath_authabs = '((//' . $xauthority . ')((/[' . $xpchar . ']*)*))';
    $xpath_rel = '([' . $xpchar . ']+' . $xslash_seg . '*)';
    $xpath_abs = '(/(' . $xpath_rel . ')?)';
    $xapath = '(' . $xpath_authabs . '|' . $xpath_abs .
        '|' . $xpath_rel . ')';
    // Query and fragment from RFC3986.
    $xqueryfrag = '([' . $xpchar . '/?' . ']*)';
    // URL.
    $xurl = '^(' . $xscheme . ':)?' . $xapath . '?' .
        '(\?' . $xqueryfrag . ')?(#' . $xqueryfrag . ')?$';

    // Split the URL into components.
    if ( !preg_match( '!' . $xurl . '!', $url, $m ) )
        return FALSE;

    // Trailing unmatched groups are absent from $m; pad to all 31 slots so
    // every group can be compared against '' directly. Comparing with ''
    // (rather than empty()) keeps components whose value is "0".
    $m += array_fill( 0, 31, '' );

    // Always return an array, even when no component matched (e.g. '').
    $parts = array( );
    // TRUE only when the host is a name (not IPv4/IPv6); only names may be
    // percent-decoded.
    $hostIsName = FALSE;

    if ( $m[2] !== '' ) $parts['scheme'] = strtolower( $m[2] );
    if ( $m[7] !== '' ) $parts['user'] = $m[9];     // "user[:pass]@" present
    if ( $m[10] !== '' ) $parts['pass'] = $m[11];
    if ( $m[13] !== '' )
    {
        $parts['host'] = $m[13];                    // host name
        $hostIsName = TRUE;
    }
    else if ( $m[14] !== '' ) $parts['host'] = $m[14];  // IPv4
    else if ( $m[16] !== '' ) $parts['host'] = $m[16];  // IPv6, brackets stripped
    else if ( $m[5] !== '' ) $parts['host'] = '';       // "//" with empty authority
    if ( $m[17] !== '' ) $parts['port'] = $m[18];
    if ( $m[19] !== '' ) $parts['path'] = $m[19];       // path after an authority
    else if ( $m[21] !== '' ) $parts['path'] = $m[21];  // absolute path
    else if ( $m[25] !== '' ) $parts['path'] = $m[25];  // relative path
    if ( $m[27] !== '' ) $parts['query'] = $m[28];
    if ( $m[29] !== '' ) $parts['fragment'] = $m[30];

    if ( !$decode )
        return $parts;

    // Percent-decode after parsing; scheme and port are never decoded.
    foreach ( array( 'user', 'pass', 'path', 'query', 'fragment' ) as $key )
    {
        if ( isset( $parts[$key] ) )
            $parts[$key] = rawurldecode( $parts[$key] );
    }
    if ( $hostIsName )
        $parts['host'] = rawurldecode( $parts['host'] );
    return $parts;
}
/**
* This function joins together URL components to form a complete URL.
*
* RFC3986 specifies the components of a Uniform Resource Identifier (URI).
* This function implements the specification's "component recomposition"
* algorithm for combining URI components into a full URI string.
*
* The $parts argument is an associative array containing zero or
* more of the following:
*
* "scheme" The scheme, such as "http".
* "host" The host name, IPv4, or IPv6 address.
* "port" The port number.
* "user" The user name.
* "pass" The user password.
* "path" The path, such as a file path for "http".
* "query" The query.
* "fragment" The fragment.
*
* The "port", "user", and "pass" values are only used when a "host"
* is present.
*
* The optional $encode argument indicates if appropriate URL components
* should be percent-encoded as they are assembled into the URL. Encoding
* is only applied to the "user", "pass", "host" (if a host name, not an
* IP address), "path", "query", and "fragment" components. The "scheme"
* and "port" are never encoded. When a "scheme" and "host" are both
* present, the "path" is presumed to be hierarchical and encoding
* processes each segment of the hierarchy separately (i.e., the slashes
* are left alone).
*
* The assembled URL string is returned.
*
* Parameters:
* parts an associative array of strings containing the
* individual parts of a URL.
*
* encode an optional boolean flag selecting whether
* to do percent encoding or not. Default = false.
*
* Return values:
* Returns the assembled URL string. The string is an absolute
* URL if a scheme is supplied, and a relative URL if not. An
* empty string is returned if the $parts array does not contain
* any of the needed values.
*/
function join_url( $parts, $encode=FALSE)
{
    if ( $encode )
    {
        if ( isset( $parts['user'] ) )
            $parts['user'] = rawurlencode( $parts['user'] );
        if ( isset( $parts['pass'] ) )
            $parts['pass'] = rawurlencode( $parts['pass'] );
        // Encode the host only when it is a name. The whole host must look
        // like a (possibly bracketed) numeric address to be left alone; the
        // anchors apply to both alternatives, so a name that merely ends in
        // hex letters (e.g. "...de") is still encoded.
        if ( isset( $parts['host'] ) &&
            !preg_match( '!^(\[[\da-f.:]+\]|[\da-f.:]+)$!ui', $parts['host'] ) )
            $parts['host'] = rawurlencode( $parts['host'] );
        // Encode the path segment by segment: slashes are restored afterwards.
        if ( !empty( $parts['path'] ) )
            $parts['path'] = preg_replace( '!%2F!ui', '/',
                rawurlencode( $parts['path'] ) );
        if ( isset( $parts['query'] ) )
            $parts['query'] = rawurlencode( $parts['query'] );
        if ( isset( $parts['fragment'] ) )
            $parts['fragment'] = rawurlencode( $parts['fragment'] );
    }

    $url = '';
    if ( !empty( $parts['scheme'] ) )
        $url .= $parts['scheme'] . ':';
    if ( isset( $parts['host'] ) )
    {
        // Authority: "//" [ user [ ":" pass ] "@" ] host [ ":" port ]
        $url .= '//';
        if ( isset( $parts['user'] ) )
        {
            $url .= $parts['user'];
            if ( isset( $parts['pass'] ) )
                $url .= ':' . $parts['pass'];
            $url .= '@';
        }
        if ( preg_match( '!^[\da-f]*:[\da-f.:]+$!ui', $parts['host'] ) )
            $url .= '[' . $parts['host'] . ']'; // IPv6
        else
            $url .= $parts['host'];             // IPv4 or name
        if ( isset( $parts['port'] ) )
            $url .= ':' . $parts['port'];
        // A path that follows an authority must begin with '/'.
        if ( !empty( $parts['path'] ) && $parts['path'][0] != '/' )
            $url .= '/';
    }
    if ( !empty( $parts['path'] ) )
        $url .= $parts['path'];
    if ( isset( $parts['query'] ) )
        $url .= '?' . $parts['query'];
    if ( isset( $parts['fragment'] ) )
        $url .= '#' . $parts['fragment'];
    return $url;
}
/**
* This function encodes URL to form a URL which is properly
* percent encoded to replace disallowed characters.
*
* RFC3986 specifies the allowed characters in the URL as well as
* reserved characters in the URL. This function replaces all the
* disallowed characters in the URL with their repective percent
* encodings. Already encoded characters are not encoded again,
* such as '%20' is not encoded to '%2520'.
*
* Parameters:
* url the url to encode.
*
* Return values:
* Returns the encoded URL string.
*/
function encode_url($url) {
    // Percent-encode everything first, then turn the encoded forms of the
    // RFC3986 reserved characters back into their literal characters.
    // The two lists are parallel and are applied in order; "%25" => "%"
    // comes last so that escapes already present in the input (e.g. "%20",
    // which rawurlencode() turns into "%2520") end up unchanged rather than
    // double-encoded.
    $encodedForms = array(
        '%3A', '%2F', '%3F', '%23', '%5B', '%5D', '%40',
        '%21', '%24', '%26', '%27', '%28', '%29', '%2A',
        '%2B', '%2C', '%3B', '%3D', '%25',
    );
    $literalChars = array(
        ':', '/', '?', '#', '[', ']', '@',
        '!', '$', '&', "'", '(', ')', '*',
        '+', ',', ';', '=', '%',
    );
    $fullyEncoded = rawurlencode($url);
    return str_ireplace($encodedForms, $literalChars, $fullyEncoded);
}
?>
答案 0 :(得分:1)
你并没有具体说明遇到了什么错误,不过好在从你的代码里确实能看出几处问题。最可能导致出错的是下面这一段:
// Excerpt of the question's loop body, quoted to show where the problems are.
if ($check_flag) $arr = @getimagesize($comm);// get the width and height of the remaining images
reset($comm);
if (($arr[0] * $arr[1]) > $maxsize) {
$maxsize = $arr[0] * $arr[1]; //compare image sizes
$the_biggest_image = $comm;
echo '<img src="'.$the_biggest_image.'" />'; //echo the biggest one
}
- `$arr = @getimagesize($comm);`:这里的 `$arr` 是您的“广告”过滤关键字变量,这一行会把它覆盖掉。
- 即使 `$check_flag` 为 false,您仍然会执行后面的计算语句。
- `reset()` 不适用于字符串。
- 每次找到更大的图片时都会输出一次 `$the_biggest_image`。这是有意为之吗?

**更新**
尝试使代码正常工作并希望稍微好一些:
<?php
// Prints an <img> tag for the largest (width * height) non-ad image found on
// one page, or for a placeholder when no usable image is found.
require_once 'simple_html_dom.php';
require 'url_to_absolute.php'; //get image absolute url

// options
$url = 'http://www.yomiuri.co.jp/stream/';
// case-insensitive substrings that mark an image URL as a probable ad
$ignore = array('ad', 'ads','gif');
$biggestImage = 'path to "no image found" image';

// process
$maxSize = -1;
$visited = array(); // absolute URL => true, for constant-time "already seen" checks
$html = file_get_html($url);

// loop images (skipped entirely when the page could not be loaded)
if ($html) {
    foreach ($html->find('img') as $element) {
        $pic = $element->src;
        if ($pic == '') continue; // it happens on your test url

        // Resolve against the page URL itself, not just scheme + host:
        // url_to_absolute() implements the RFC3986 merge, so page-relative
        // paths such as "img/a.png" land under the page's directory and a
        // non-default port in the page URL is kept.
        $absUrl = url_to_absolute($url, $pic);
        if ($absUrl === false) continue; // src could not be parsed

        // ignore already seen images, remember new ones
        if (isset($visited[$absUrl])) continue;
        $visited[$absUrl] = true;

        // remove ad images
        $ignoring = false;
        foreach ($ignore as $item) {
            if (stripos($absUrl, $item) !== false) {
                $ignoring = true;
                break;
            }
        }
        if ($ignoring) continue;

        // Fetch the image dimensions. This is best effort: a URL that cannot
        // be fetched or is not an image is skipped, otherwise its "area" of 0
        // would beat the initial -1 and be reported as the biggest image.
        $image = @getimagesize($absUrl);
        if ($image === false) continue;

        $area = $image[0] * $image[1];
        if ($area > $maxSize) {
            $maxSize = $area;
            $biggestImage = $absUrl;
        }
    }
}

// The URL comes from a remote page, so escape it for the attribute context.
echo '<img src="' . htmlspecialchars($biggestImage, ENT_QUOTES, 'UTF-8') . '" />'; //echo the biggest one
?>
答案 1 :(得分:0)
根据您的代码我创建了以下解决方案 - 它使用相同的逻辑,它允许您设置图像的最小宽度和高度,以确保它返回正确的图像
/**
 * Find the largest image on a page that is not an ad and is larger than
 * the minimum width and height.
 *
 * Parameters:
 * pageUrl the absolute URL of the page to inspect; also used as the
 * base for resolving relative image URLs.
 *
 * Return values:
 * The absolute URL of the chosen image, or '' when no image qualifies.
 */
private function getMainImageFromUrl($pageUrl) {
    $biggestImage = '';
    $minImgWidth = 300;
    $minImgHeight = 300;
    $images = $this->getImagesFromDom($pageUrl);
    $visited = array(); // absolute URL => true
    $maxSize = -1;
    $ignore = array('ad', 'ads','gif'); // get rid of ads (URLs containing any of these)
    foreach ($images as $image) {
        $pic = $image->getAttribute('src');
        # if source is empty, skip to another image
        if ( empty( $pic ) )
            continue;
        # get image absolute url; url_to_absolute() needs the base URL as
        # its first argument (calling it with the src alone is an error)
        $absUrl = url_to_absolute($pageUrl, $pic);
        if ( $absUrl === false )
            continue;
        # ignore already seen images (skip to another), add new images
        if ( isset( $visited[$absUrl] ) )
            continue;
        $visited[$absUrl] = true;
        # remove ads
        $ignoring = false;
        foreach ($ignore as $item) {
            if ( stripos( $absUrl, $item ) !== false ) {
                $ignoring = true;
                break;
            }
        }
        if ( $ignoring )
            continue;
        # best effort: skip anything that cannot be fetched or is not an image
        $imageSize = @getimagesize($absUrl);
        if ( $imageSize === false )
            continue;
        $width = $imageSize[0];
        $height = $imageSize[1];
        # apply the minimum size BEFORE comparing areas, so an image that is
        # too narrow or too short cannot raise $maxSize and hide a smaller
        # image that does qualify
        if ( $width <= $minImgWidth || $height <= $minImgHeight )
            continue;
        if ( ( $width * $height ) > $maxSize ) {
            $maxSize = $width * $height;
            $biggestImage = $absUrl;
        }
    }
    return $biggestImage;
}
/**
 * Load the HTML document at $url and return its <img> elements.
 *
 * Parameters:
 * url the location passed to DOMDocument::loadHTMLFile().
 *
 * Return values:
 * The result of getElementsByTagName('img') on the loaded document.
 */
private function getImagesFromDom( $url ) {
    // Sets the default_socket_timeout ini value to 4 for the rest of the request.
    ini_set('default_socket_timeout', 4);

    $document = new DOMDocument();
    // Warnings raised while loading are suppressed.
    @$document->loadHTMLFile( $url );
    $document->preserveWhiteSpace = false;

    # Get images from DOM
    $imgNodes = $document->getElementsByTagName('img');
    return $imgNodes;
}