我正在尝试使用此类通过URL获取IMDB,该类打算从URL EX获取IMDB信息:http://www.imdb.com/title/tt0371746/
相反,它会返回错误:{ $param['error'] = "No Title found in Search Results!"; return $param; }
我在网址上没有电影时会产生错误,为什么每次都会返回错误?即使我在每个网址上都有一部电影我添加了吗?!
这是班级:
<?php
class IMDBGrabber {
function __construct() {
$this->time = "NOW()";
}
function getMovieInfo($input) {
$param = array();
$imdbUrl = $this->scruburl($input);
if ($imdbUrl === null) {
$param['error'] = "No Title found in Search Results!";
return $param;
} $content = $this->geturl($imdbUrl);
if (stripos($content, "<meta name=\"application-name\" content=\"IMDb\" />") !== false) {
$param = $this->GrabInfo($content);
$param['imdb_url'] = $imdbUrl;
} else {
$param['error'] = "No Media found on IMDb!";
} return $param;
}
function scruburl($input) {
$url = "http://www.google.com/search?q=imdb+" . stripslashes(rawurlencode($input));
$content = $this->geturl($url);
$urls = $this->match_all('/<a href="(http:\/\/www.imdb.com\/title\/tt.*?)".*?>.*?<\/a>/ms', $content, 1);
if (!isset($urls[0]))
return null;
else
return $urls[0];
}
function geturl($url) {
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5);
curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows NT 5.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1");
$content = curl_exec($ch);
curl_close($ch);
return $content;
}
function getimage($image) {
header("Content-type: image/jpeg");
$imdb_poster = rawurldecode($image);
$image = curl_init();
curl_setopt($image, CURLOPT_URL, $imdb_poster);
curl_setopt($image, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($image, CURLOPT_CONNECTTIMEOUT, 5);
$data = curl_exec($image);
curl_close($image);
return $data;
}
function match_all($regex, $str, $i = 0) {
if (preg_match_all($regex, $str, $matches) === false)
return false;
else
return $matches[$i];
}
function match($regex, $str, $i = 0) {
if (preg_match($regex, $str, $match) == 1)
return $match[$i];
else
return false;
}
function GrabInfo($content) {
$param = array();
$param['title_id'] = $this->match('/<link rel="canonical" href="http:\/\/www.imdb.com\/title\/(tt[0-9]+)\/" \/>/ms', $content, 1);
$param['title'] = trim($this->match('/<title>(.*?) \(.*?<\/title>/ms', $content, 1));
$param['type'] = $this->match('/<meta.*?property=.og:type.*?content=.(.*?)(\'|")/ms', $content, 1);
$param['year'] = trim($this->match('/<title>.*?\(.*?([0-9][0-9][0-9][0-9]).*?\).*?<\/title>/ms', $content, 1));
$param['rating'] = $this->match('/<span itemprop="ratingValue">([0-9].[0-9])<\/span>/m', $content, 1);
$param['ratingcount'] = $this->match('/<span itemprop="ratingCount">(.*?)<\/span>/m', $content, 1);
$param['reviewcount'] = $this->match('/<span itemprop="reviewCount">(.*?)<\/span>/m', $content, 1);
$param['trailer'] = $this->match('|<a href="{0,1}(/video/imdb/vi\d*/)|ims', $content, 1);
$param['genres'] = array();
foreach ($this->match_all('/<a.*?>(.*?)<\/a>/ms', $this->match('/Genre.?:(.*?)(<\/div>|See more)/ms', $content, 1), 1) as $m) {
array_push($param['genres'], $m);
} $param[genres] = is_array(($param[genres])) ? implode(", ", ($param[genres])) : ($param[genres]);
$param['directors'] = array();
foreach ($this->match_all('/<a.*?>(.*?)<\/a>/ms', $this->match('/Director.?:(.*?)(<\/div>|>.?and )/ms', $content, 1), 1) as $m) {
array_push($param['directors'], $m);
} $param[directors] = is_array(($param[directors])) ? implode(", ", ($param[directors])) : ($param[directors]);
$param['writers'] = array();
foreach ($this->match_all('/<a.*?>(.*?)<\/a>/ms', $this->match('/Writer.?:(.*?)(<\/div>|>.?and )/ms', $content, 1), 1) as $m) {
array_push($param['writers'], $m);
} $param[writers] = is_array(($param[writers])) ? implode(", ", ($param[writers])) : ($param[writers]);
$param['stars'] = array();
foreach ($this->match_all('/<a.*?>(.*?)<\/a>/ms', $this->match('/Stars:(.*?)<\/div>/ms', $content, 1), 1) as $m) {
array_push($param['stars'], $m);
} $param[stars] = is_array(($param[stars])) ? implode(", ", ($param[stars])) : ($param[stars]);
$param['cast'] = array();
foreach ($this->match_all('/<td class="name">(.*?)<\/td>/ms', $content, 1) as $m) {
array_push($param['cast'], trim(strip_tags($m)));
} $param[cast] = is_array(($param[cast])) ? implode(", ", ($param[cast])) : ($param[cast]);
$param['mpaa_rating'] = $this->match('/infobar">.<img.*?alt="(.*?)".*?>/ms', $content, 1);
if ($param['title_id'] != "") {
$releaseinfoHtml = $this->geturl("http://www.imdb.com/title/" . $param['title_id'] . "/releaseinfo");
$param['also_known_as'] = $this->getAkaTitles($releaseinfoHtml, $usa_title);
$param[also_known_as] = is_array(($param[also_known_as])) ? implode("<br />", ($param[also_known_as])) : ($param[also_known_as]);
$param['usa_title'] = $usa_title;
$param['release_date'] = $this->match('/Release Date:<\/h4>.*?([0-9][0-9]? (January|February|March|April|May|June|July|August|September|October|November|December) (19|20)[0-9][0-9]).*?(\(|<span)/ms', $content, 1);
$param['release_dates'] = $this->getReleaseDates($releaseinfoHtml);
$param[release_dates] = is_array(($param[release_dates])) ? implode("<br />", ($param[release_dates])) : ($param[release_dates]);
} $param['plot'] = trim(strip_tags($this->match('/Users:.*?<p>(.*?)(<\/p>|<a)/ms', $content, 1)));
$param['poster'] = $this->match('/img_primary">.*?<img src="(.*?)".*?<\/td>/ms', $content, 1);
$param['poster_large'] = "";
$param['poster_small'] = "";
if ($param['poster'] != '' && strrpos($param['poster'], "nopicture") === false && strrpos($param['poster'], "ad.doubleclick") === false) {
$param['poster_large'] = substr($param['poster'], 0, strrpos($param['poster'], "_V1.")) . "_V1._SY500.jpg";
$param['poster_small'] = substr($param['poster'], 0, strrpos($param['poster'], "_V1.")) . "_V1._SY150.jpg";
} else {
$param['poster'] = "";
} $param['runtime'] = trim($this->match('/Runtime:<\/h4>.*?([0-9]+) min.*?<\/div>/ms', $content, 1));
if ($param['runtime'] == '')
$param['runtime'] = trim($this->match('/infobar.*?([0-9]+) min.*?<\/div>/ms', $content, 1));
$param['oscars'] = trim($this->match('/Won ([0-9]+) Oscars./ms', $content, 1));
$param['awards'] = trim($this->match('/([0-9]+) wins/ms', $content, 1));
$param['nominations'] = trim($this->match('/([0-9]+) nominations/ms', $content, 1));
$param['storyline'] = trim(strip_tags($this->match('/Storyline<\/h2>(.*?)(<em|<\/p>|<span)/ms', $content, 1)));
$param['release_date'] = trim(strip_tags($this->match('/Release Date.?:(.*?)(<\/div>|See more)/ms', $content, 1)));
$param['keywords'] = array();
foreach ($this->match_all('/<a.*?>(.*?)<\/a>/ms', $this->match('/Plot Keywords.?:(.*?)(<\/div>|See more)/ms', $content, 1), 1) as $m) {
array_push($param['keywords'], $m);
} $param[keywords] = is_array(($param[keywords])) ? implode(", ", ($param[keywords])) : ($param[keywords]);
$param['tagline'] = trim(strip_tags($this->match('/Tagline.?:<\/h4>(.*?)(<span|<\/div)/ms', $content, 1)));
$param['votes'] = $this->match('/href="ratings".*?>([0-9]+,?[0-9]*) votes<\/a>\)/ms', $content, 1);
$param[votes] = is_array(($param[votes])) ? implode(", ", ($param[votes])) : ($param[votes]);
$param['languages'] = $this->match_all('/a href="\/language\/.*?">(.*?)<\/a>/ms', $content, 1);
$param['languages'] = array_unique($param['languages']);
$param[languages] = is_array(($param[languages])) ? implode(", ", ($param[languages])) : ($param[languages]);
$param['countries'] = array();
foreach ($this->match_all('/<a.*?>(.*?)<\/a>/ms', $this->match('/Country.?:(.*?)(<\/div>|See more)/ms', $content, 1), 1) as $m) {
array_push($param['countries'], $m);
} $param[countries] = is_array(($param[countries])) ? implode(", ", ($param[countries])) : ($param[countries]);
$param['companies'] = $this->match_all('/a.*?href="\/company\/.*?">(.*?)<\/a>/ms', $content, 1);
$param['companies'] = array_unique($param['companies']);
$param[companies] = is_array(($param[companies])) ? implode(", ", ($param[companies])) : ($param[companies]);
return $param;
}
function getReleaseDates($content) {
$releaseDates = array();
foreach ($this->match_all('/<tr>(.*?)<\/tr>/ms', $this->match('/Date<\/th><\/tr>(.*?)<\/table>/ms', $content, 1), 1) as $r) {
$country = trim(strip_tags($this->match('/<td><b>(.*?)<\/b><\/td>/ms', $r, 1)));
$date = trim(strip_tags($this->match('/<td align="right">(.*?)<\/td>/ms', $r, 1)));
array_push($releaseDates, $country . " = " . $date);
} return $releaseDates;
}
function getAkaTitles($content, &$usa_title) {
$akaTitles = array();
foreach ($this->match_all('/<tr>(.*?)<\/tr>/msi', $this->match('/Also Known As(.*?)<\/table>/ms', $content, 1), 1) as $m) {
$akaTitleMatch = $this->match_all('/<td>(.*?)<\/td>/ms', $m, 1);
$akaTitle = trim($akaTitleMatch[0]);
$akaCountry = trim($akaTitleMatch[1]);
array_push($akaTitles, $akaTitle . " = " . $akaCountry);
if ($akaCountry != '' && strrpos(strtolower($akaCountry), "usa") !== false)
$usa_title = $akaTitle;
} return $akaTitles;
}
}
答案 0 :(得分:1)
这背后有一些错误:
IMDBGrabber::scruburl($input)
方法中有正确的正则表达式,双引号之后和http之前可能有字符。如果我
你,我宁愿用Google custom search engine API来搜索
它。使用当前的方法,您将在几次之后被禁止
数十万次尝试。所以固定的正则表达式是: $urls = $this->match_all('/<a[\s\S]*?href="[\s\S]*?(http[s]{0,1}:\/\/www.imdb.com\/title\/[\s\S]*?)\//', $content, 1);
stripos($content, "<meta
name=\"application-name\"...
似乎是
错误。我下载了title/tt0371746/
的html,但没有
这样的字串。我使用if (stripos($content, "Your rating:") !== false) {
在这两次更改后,您的脚本会输出如下内容:
array(34) {
["title_id"]=>
string(9) "tt0371746"
["title"]=>
string(8) "Iron Man"
["type"]=>
string(11) "video.movie"
["year"]=>
string(4) "2008"
["rating"]=>
string(3) "7.9"
["ratingcount"]=>
string(7) "578,477"
["reviewcount"]=>
string(10) "1,017 user"
["trailer"]=>
string(24) "/video/imdb/vi447873305/"
["genres"]=>
string(28) " Action, Adventure, Sci-Fi"
["directors"]=>
string(57) "<span class="itemprop" itemprop="name">Jon Favreau</span>"
["writers"]=>
string(131) "<span class="itemprop" itemprop="name">Mark Fergus</span>, <span class="itemprop" itemprop="name">Hawk Ostby</span>, 6 more credits"
["stars"]=>
string(214) "<span class="itemprop" itemprop="name">Robert Downey Jr.</span>, <span class="itemprop" itemprop="name">Gwyneth Paltrow</span>, <span class="itemprop" itemprop="name">Terrence Howard</span>, See full cast and crew"
["cast"]=>
string(0) ""
["mpaa_rating"]=>
bool(false)
["also_known_as"]=>
string(0) ""
["usa_title"]=>
NULL
["release_date"]=>
string(24) "1 May 2008 (Netherlands)"
["release_dates"]=>
string(0) ""
["plot"]=>
string(0) ""
["poster"]=>
string(0) ""
["poster_large"]=>
string(0) ""
["poster_small"]=>
string(0) ""
["runtime"]=>
string(3) "126"
["oscars"]=>
string(0) ""
["awards"]=>
string(2) "18"
["nominations"]=>
string(2) "51"
["storyline"]=>
string(856) "Tony Stark. Genius, billionaire, playboy, philanthropist. Son of legendary inventor and weapons contractor Howard Stark. When Tony Stark is assigned to give a weapons presentation to an Iraqi unit led by Lt. Col. James Rhodes, he's given a ride on enemy lines. That ride ends badly when Stark's Humvee that he's riding in is attacked by enemy combatants. He survives - barely - with a chest full of shrapnel and a car battery attached to his heart. In order to survive he comes up with a way to miniaturize the battery and figures out that the battery can power something else. Thus Iron Man is born. He uses the primitive device to escape from the cave in Iraq. Once back home, he then begins work on perfecting the Iron Man suit. But the man who was put in charge of Stark Industries has plans of his own to take over Tony's technology for other matters."
["keywords"]=>
string(304) " <span class="itemprop" itemprop="keywords">armor</span>, <span class="itemprop" itemprop="keywords">cave</span>, <span class="itemprop" itemprop="keywords">iron</span>, <span class="itemprop" itemprop="keywords">genius</span>, <span class="itemprop" itemprop="keywords">missile</span>, See All (198)"
["tagline"]=>
string(52) "Get ready for a different breed of heavy metal hero."
["votes"]=>
bool(false)
["languages"]=>
string(153) "|</span>
<a href="/language/fa?ref_=tt_dt_dt"
itemprop='url'>Persian, |</span>
<a href="/language/ar?ref_=tt_dt_dt"
itemprop='url'>Arabic"
["countries"]=>
string(3) "USA"
["companies"]=>
string(75) "Paramount Pictures</span>, Marvel Enterprises</span>, Marvel Studios</span>"
["imdb_url"]=>
string(36) "http://www.imdb.com/title/tt0371746/"
}
带有一堆PHP通知。
但这一切都是维持的噩梦,我不得不说,代码并不是最好的。考虑使用this方法或基于XML / xpath寻址的解析器。另见this。