我有一段PHP代码,它输出HTML。我需要修改这段代码,使其输出XML。有什么办法可以做到这一点吗?是否有XML库可以直接完成这项工作,还是必须手动创建每个节点?
我的PHP代码是:
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<style>
a {text-decoration:none; color:black;}
</style>
</head>
<body>
<?php
// Scrapes the IMDB title search page for the submitted title/type and prints
// up to five matches as an HTML table.

// Read the search form fields; default to empty strings so a request without
// these POST fields does not raise undefined-index notices.
$a = isset($_POST["title"]) ? $_POST["title"] : "";
$b = isset($_POST["name"]) ? $_POST["name"] : "";

// Build the search URL and download the result page.
$c = "http://www.imdb.com/search/title?title=".urlencode($a)."&title_type=".urlencode($b);
$d = file_get_contents($c);
if ($d === false) {
    // Download failed: continue with an empty page so every pattern below
    // simply finds nothing instead of operating on a boolean.
    $d = "";
}

// Extract the individual fields from the raw HTML.
preg_match_all('/<div id="main">\n(No results.)/', $d, $nore);
preg_match_all('#<img src="(.*)"#Us', $d, $img); //image
preg_match_all('/<a\s*href="\/title\/tt[0-9]*\/">((?:[a-z]*(?:&*[.]*)?\s*-*[a-z]*[0-9]*[^<])+)/i', $d, $tit); //title
preg_match_all('/<span\sclass="year_type">\s*\(([\d]*)/', $d, $ye); //movie year
preg_match_all('#<span class="credit">\n Dir: (.*)\n(?: With:)?#Us', $d, $dir); //director
preg_match_all('/<span class="rating-rating"><span class="value">([\w]*.[\w]*)/i', $d, $rat); //rating
preg_match_all('/<a\shref="(\/title\/tt[0-9]*\/)"\s*[title]+/i', $d, $lin); //link

// Normalise the first five entries: a "-" rating and a missing/empty director
// are both shown as "N/A".
for ($i = 0; $i < 5; $i++) {
    if (isset($rat[1][$i]) && $rat[1][$i] === "-") {
        $rat[1][$i] = "N/A";
    }
    if (!isset($dir[1][$i]) || $dir[1][$i] === "") {
        $dir[1][$i] = "N/A";
    }
}

// Never show more than five rows.
$cnt = min(5, count($tit[1]));

// The search terms come straight from the request, so escape them before
// echoing them back into the page.
$safeTitle = htmlspecialchars($a, ENT_QUOTES, 'UTF-8');
$safeType = htmlspecialchars($b, ENT_QUOTES, 'UTF-8');

echo"<center><b>Search Result</b></center>";
echo "<br/>";
echo "<center><b>\"$safeTitle\"of type\"$safeType\":</b></center>";
echo"<br/>";

if (isset($nore[1][0]) && $nore[1][0] === "No results.") {
    echo "<center><b>No movies found!</b></center>";
} else {
    echo "<center><table border=1><tr><td><center>Image</center></td><td><center>Title</center></td><td><center>Year</center></td><td><center>Director</center></td><td><center>Rating(10)</center></td><td><center>Link to Movie</center></td></tr>";
    for ($j = 0; $j < $cnt; $j++) {
        // The image pattern's full match is `<img src="URL"` with no closing
        // bracket, so rebuild a well-formed tag from the captured URL instead.
        // The +2 offset is kept from the original code (it skips the first two
        // images matched on the page).
        // NOTE(review): every value below is markup scraped from the remote
        // page and is emitted as-is, exactly as before.
        $cover = isset($img[1][$j + 2]) ? '<img src="'.$img[1][$j + 2].'" />' : "";
        $title = isset($tit[1][$j]) ? $tit[1][$j] : "";
        $year = isset($ye[1][$j]) ? $ye[1][$j] : "";
        $director = isset($dir[1][$j]) ? $dir[1][$j] : "";
        $rating = isset($rat[1][$j]) ? $rat[1][$j] : "";
        $link = isset($lin[1][$j]) ? $lin[1][$j] : "";

        echo "<tr>";
        echo "<td>".$cover."</td>";
        echo "<td><center>".$title."</center></td>";
        echo "<td><center>".$year."</center></td>";
        echo "<td><center>".$director."</center></td>";
        echo "<td><center>".$rating."</center></td>";
        echo '<td><center><a style="text-decoration:underline; color:blue;" href="http://www.imdb.com'.$link.'">Details</a></center></td>';
        echo "</tr>";
    }
    echo "</table></center>";
}
?>
</body>
</html>
预期的XML输出:
<result cover="http://ia.mediaimdb.com/images
/M/MV5BMjMyOTM4MDMxNV5BMl5BanBnXkFtZTcwNjIyNzExOA@@._V1._SX54_
CR0,0,54,74_.jpg" title="The Amazing Spider-Man(2012)"year="2012"
director="Marc Webb" rating="7.5"
details="http://www.imdb.com/title/tt0948470"/>
<result cover="http://ia.mediaimdb.
com/images/M/MV5BMzk3MTE5MDU5NV5BMl5BanBnXkFtZTYwMjY3NTY3._V1._SX54_CR0,
0,54,74_.jpg" title="Spider-Man(2002)" year="2002"director="Sam Raimi"
rating="7.3" details="http://www.imdb.com/title/tt0145487"/>
<result cover="http://ia.mediaimdb.
com/images/M/MV5BODUwMDc5Mzc5M15BMl5BanBnXkFtZTcwNDgzOTY0MQ@@._V1._SX54_
CR0,0,54,74_.jpg" title="Spider-Man 3 (2007)" year="2007" director="Sam
Raimi" rating="6.3" details="http://www.imdb.com/title/tt0413300"/>
<result cover="http://i.mediaimdb.
com/images/SF1f0a42ee1aa08d477a576fbbf7562eed/realm/feature.gif" title="
The Amazing Spider-Man 2 (2014)" year="2014" director="Sam Raimi"
rating="6.3" details="http://www.imdb.com/title/tt1872181"/>
<result cover="http://ia.mediaimdb.
com/images/M/MV5BMjE1ODcyODYxMl5BMl5BanBnXkFtZTcwNjA1NDE3MQ@@._V1._SX54_
CR0,0,54,74_.jpg" title="Spider-Man 2 (2004)" year="2004" director="Sam
Raimi" rating="7.5" details="http://www.imdb.com/title/tt0316654"/>
</results>
答案 0 :(得分:2)
首先,你是在用正则表达式解析HTML结果,这样做效率很低,也没有必要,而且……好吧,你这是在回应克苏鲁的召唤(you're answering to the Cthulhu call)!
其次,解析IMDB HTML以检索结果虽然有效,但可能是不必要的。有一些巧妙的第三方API可以帮助您完成工作,例如http://imdbapi.org
如果您不想使用任何第三方API,恕我直言,您应该使用DOM解析器/操纵器(例如DOMDocument)来解析HTML,这样更安全、更好,同时还可以解决您的HTML到XML的问题。
下面是针对您所提问题的实现(根据结果构建XML和HTML):
/**
 * Builds an HTML table from a list of scraped results.
 *
 * Each result is an array that may contain the keys 'title', 'year',
 * 'rating', 'img_src' (image URL) and 'img' (serialized <img> markup).
 * Missing keys produce empty cells rather than notices.
 *
 * Every row gets five cells: title, year, rating, an <img> built from
 * 'img_src', and an <img> imported from the 'img' markup. The last two
 * demonstrate two alternative techniques; keep only one in real use.
 *
 * @param array|Traversable $results list of result arrays
 * @return DOMDocument document whose root element is the <table>
 */
function resultsToHTML($results)
{
    $doc = new DOMDocument();
    $table = $doc->createElement('table');
    // The table must be attached to the document, otherwise saveHTML() is empty.
    $doc->appendChild($table);

    foreach ($results as $r) {
        $row = $doc->createElement('tr');
        $table->appendChild($row);

        // Text cells: use text nodes so characters such as "&" are escaped
        // correctly (createElement's value argument does not escape them).
        foreach (array('title', 'year', 'rating') as $key) {
            $cell = $doc->createElement('td');
            $text = isset($r[$key]) ? (string) $r[$key] : '';
            $cell->appendChild($doc->createTextNode($text));
            $row->appendChild($cell);
        }

        // Technique 1: create the img tag from the image URL.
        $imgTD = $doc->createElement('td');
        $img = $doc->createElement('img');
        $img->setAttribute('src', isset($r['img_src']) ? (string) $r['img_src'] : '');
        $imgTD->appendChild($img);
        $row->appendChild($imgTD);

        // Technique 2: import the node from the serialized markup.
        $imgTD = $doc->createElement('td');
        if (!empty($r['img'])) {
            $fauxDoc = new DOMDocument();
            // The markup is an HTML fragment (e.g. an unclosed <img>), so it is
            // parsed as HTML with libxml warnings silenced for this call only.
            $previous = libxml_use_internal_errors(true);
            $parsed = $fauxDoc->loadHTML($r['img']);
            libxml_clear_errors();
            libxml_use_internal_errors($previous);

            $img = $parsed ? $fauxDoc->getElementsByTagName('img')->item(0) : null;
            if (!is_null($img)) {
                $imgTD->appendChild($doc->importNode($img, true));
            }
        }
        // The cell is appended even when empty so all rows have the same width.
        $row->appendChild($imgTD);
    }

    return $doc;
}
/**
 * Builds an XML document from a list of scraped results.
 *
 * The root element is <results>; each result becomes an empty <result>
 * element carrying the attributes cover, title, year and rating (taken from
 * the keys 'img_src', 'title', 'year' and 'rating'). A missing key yields an
 * empty attribute value.
 *
 * @param array|Traversable $results list of result arrays
 * @return DOMDocument the XML document; call saveXML() to serialize it
 */
function resultsToXML($results)
{
    $doc = new DOMDocument();
    $root = $doc->createElement('results');
    $doc->appendChild($root);

    // XML attribute name => key in the result array.
    $attributes = array(
        'cover' => 'img_src',
        'title' => 'title',
        'year' => 'year',
        'rating' => 'rating',
    );

    foreach ($results as $r) {
        // Elements are created by the document, not by another element.
        $element = $doc->createElement('result');
        foreach ($attributes as $attribute => $key) {
            $element->setAttribute($attribute, isset($r[$key]) ? (string) $r[$key] : '');
        }
        $root->appendChild($element);
    }

    return $doc;
}
打印它们只需要
// Build the XML document from the collected results, then serialize it to
// the output.
$xml = resultsToXML($results);
$serialized = $xml->saveXML();
echo $serialized;
HTML 的输出方式与此相同。
以下是基于帖子中代码、改用DOMDocument的重构:
<?php
//Mock IMDB Link
$a = 'The Amazing Spider-Man';
$b = 'title';
$c = "http://www.imdb.com/search/title?title=".urlencode($a)."&title_type=".urlencode($b);

// HTML might be malformed so we want DOMDocument to be quiet
libxml_use_internal_errors(true);

//Initialize DOMDocument parser
$doc = new DOMDocument();

// Load and parse the search page located at $c
$loaded = $doc->loadHTMLFile($c);
// Drop the parse errors collected while loading; they are not used
libxml_clear_errors();

//initialize array to store results
$results = array();

// get table of results and extract a list of rows; $rows stays null when the
// page could not be loaded or contains no results table
$rows = null;
if ($loaded) {
    $listOfTables = $doc->getElementsByTagName('table');
    $rows = getResultRows($listOfTables);
}

if (!is_null($rows)) {
    $i = 0;
    //loop through all rows to retrieve information
    foreach ($rows as $row) {
        if ($title = getTitle($row)) {
            $results[$i]['title'] = $title;
        }
        if ($year = getYear($row)) {
            $results[$i]['year'] = $year;
        }
        if ($rating = getRating($row)) {
            $results[$i]['rating'] = $rating;
        }
        // getImage() serializes the node through the owning document, so the
        // document has to be passed along with the row
        if ($img = getImage($row, $doc)) {
            $results[$i]['img'] = $img;
        }
        if ($src = getImageSrc($row)) {
            $results[$i]['img_src'] = $src;
        }
        ++$i;
    }
}

//the first result can be a false positive due to the
// results' table header, so we remove it
if (isset($results[0])) {
    array_shift($results);
}
<strong>函数</strong>
/**
 * Finds the first table whose class attribute is exactly "results" and
 * returns its <tr> descendants.
 *
 * @param DOMNodeList $listOfTables candidate <table> elements
 * @return DOMNodeList|null the rows of the matching table, or null when no
 *                          table matches
 */
function getResultRows($listOfTables)
{
    $rows = null;
    foreach ($listOfTables as $candidate) {
        if ($candidate->getAttribute('class') !== 'results') {
            continue;
        }
        $rows = $candidate->getElementsByTagName('tr');
        break;
    }
    return $rows;
}
/**
 * Returns the src attribute of the first <img> inside the given row.
 *
 * @param DOMElement $row element to search
 * @return string|false the src value, or false when the row has no <img>
 */
function getImageSrc($row)
{
    $firstImage = $row->getElementsByTagName('img')->item(0);
    if (is_null($firstImage)) {
        return false;
    }
    return $firstImage->getAttribute('src');
}
/**
 * Returns the HTML markup of the first <img> inside the given row.
 *
 * @param DOMElement       $row element to search
 * @param DOMDocument|null $doc document used to serialize the node; when
 *                              omitted, the row's owner document is used, so
 *                              callers that only have the row can call
 *                              getImage($row)
 * @return string|false the serialized <img> markup, or false when the row
 *                      has no <img>
 */
function getImage($row, $doc = null)
{
    $img = $row->getElementsByTagName('img')->item(0);
    if (is_null($img)) {
        return false;
    }
    if (is_null($doc)) {
        $doc = $row->ownerDocument;
    }
    return $doc->saveHTML($img);
}
/**
 * Returns the text of the first link inside the row's title cell.
 *
 * @param DOMElement $row table row to inspect
 * @return string|false the link text, or false when the row has no title
 *                      cell or the title cell contains no <a> element
 */
function getTitle($row)
{
    $tdInfo = getTDInfo($row->getElementsByTagName('td'));
    if (is_null($tdInfo)) {
        return false;
    }

    // getElementsByTagName() always returns a list (possibly empty), so the
    // presence check has to be made on the first item, not on the list.
    $link = $tdInfo->getElementsByTagName('a')->item(0);
    if (is_null($link)) {
        return false;
    }

    return $link->nodeValue;
}
/**
 * Returns the text of the first span with class "year_type" inside the
 * row's title cell, with all parentheses removed.
 *
 * @param DOMElement $row table row to inspect
 * @return string|null the cleaned text, or null when the row has no title
 *                     cell or no such span
 */
function getYear($row)
{
    $cell = getTDInfo($row->getElementsByTagName('td'));
    if (is_null($cell)) {
        return null;
    }

    foreach ($cell->getElementsByTagName('span') as $candidate) {
        if ($candidate->getAttribute('class') !== 'year_type') {
            continue;
        }
        // Strip "(" first, then ")".
        return str_replace(array('(', ')'), '', $candidate->nodeValue);
    }

    return null;
}
/**
 * Returns the text of the first span with class "rating-rating" inside the
 * row's title cell.
 *
 * @param DOMElement $row table row to inspect
 * @return string|null the span text, or null when the row has no title cell
 *                     or no such span
 */
function getRating($row)
{
    $cell = getTDInfo($row->getElementsByTagName('td'));
    if (is_null($cell)) {
        return null;
    }

    foreach ($cell->getElementsByTagName('span') as $candidate) {
        if ($candidate->getAttribute('class') !== 'rating-rating') {
            continue;
        }
        return $candidate->nodeValue;
    }

    return null;
}
/**
 * Returns the first cell whose class attribute equals "title".
 *
 * @param DOMNodeList $tds cells to search
 * @return DOMElement|null the matching cell, or null when none matches
 */
function getTDInfo($tds)
{
    $match = null;
    foreach ($tds as $cell) {
        if ($cell->getAttribute('class') == 'title') {
            $match = $cell;
            break;
        }
    }
    return $match;
}