为了练习 PHP 和 HTML 表单,我决定制作一个小型网络应用程序:从另一个网站收集数据,并以适合移动设备的方式显示出来。
在本次练习中,我选择了我所在地区的公交公司网站:http://delijn.be/en/index.htm。我分析了网站并找到了名为“form1”的表单,它通过POST方法将数据发送到网站:http://reisinfo.delijn.be/reisinfo/RouteplannerHomeBeperktServlet?taal=en。
我开始编写 PHP 代码,并在网上了解到可以用 cURL 发送 POST 字段,于是我照做了。不幸的是它不起作用:我得到的是该网站的错误页面。我猜一定是漏掉了某些字段,但我检查了所有内容,也没能找到其他字段。因此,我再次来到这里寻求帮助。
该网络应用程序托管在我的家用服务器上,也可以从那里下载。
如果有人可以帮助我解决这个问题,我将非常感激 ief2
<hr/> PS:代码的某些部分是用荷兰语写的,所以这里有一些翻译:
<hr/> PPS:下载链接显然不起作用,但下载它没有问题,所以这里有一些代码片段:
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>De Lijn Mobile</title>
<meta name="viewport" content="width = device-width" />
</head>
<body>
<!-- Route request form, posted to calculateRoute.php. The control names
     below are the keys of the POST request, so they must not be renamed.
     Every void element is self-closed and the form content is wrapped in a
     block element, as the XHTML Strict DOCTYPE above requires. -->
<form name="main" action="calculateRoute.php" method="post">
<div>
<b>Vertrek:</b><br />
<label>Gemeente: <input type="text" name="vertrekGemeente" /></label><br />
<label>Straat: <input type="text" name="vertrekStraat" /></label><br />
<label>Nummer: <input type="text" name="vertrekNummer" /></label><br />
<hr />
<b>Aankomst:</b><br />
<label>Gemeente: <input type="text" name="aankomstGemeente" /></label><br />
<label>Straat: <input type="text" name="aankomstStraat" /></label><br />
<label>Nummer: <input type="text" name="aankomstNummer" /></label><br />
<hr />
<b>Datum:</b><br />
<?php
// $now supplies the default values of the date and time fields below.
require("./Date.php");
$now = new Date();
?>
<label><input type="radio" name="datumType" value="aankomst" checked="checked" /> Aankomst</label><br />
<label><input type="radio" name="datumType" value="vertrek" /> Vertrek</label><br />
<label>Dag: <input type="text" size="2" name="datumDag" value="<?php echo $now->day; ?>" /></label><br />
<label>Maand: <input type="text" size="2" name="datumMaand" value="<?php echo $now->month; ?>" /></label><br />
<label>Jaar: <input type="text" size="4" name="datumJaar" value="<?php echo $now->year; ?>" /></label><br />
<label>Tijdstip: <input type="text" size="2" name="datumUur" value="<?php echo $now->hour; ?>" /></label> :
<input type="text" size="2" name="datumMinuten" value="<?php echo $now->minutes; ?>" /><br />
<hr />
<input type="submit" value="Bereken" /><br />
</div>
</form>
</body>
</html>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>De Lijn Mobile - Berekeningen</title>
</head>
<body>
<div>
<?php
// Handles the route form submission: builds the departure/arrival places
// and the requested date from the POST data, then runs the search.
require_once("./Plaats.php");
require_once("./Date.php");
require_once("./DeLijn.php");

echo "Gathering data...<br />";

// Read every expected field once. A field that was not posted becomes null
// instead of raising an "undefined index" notice.
$fieldNames = array(
    "vertrekGemeente", "vertrekStraat", "vertrekNummer",
    "aankomstGemeente", "aankomstStraat", "aankomstNummer",
    "datumType", "datumDag", "datumMaand", "datumJaar",
    "datumUur", "datumMinuten");
$in = array();
foreach ($fieldNames as $fieldName) {
    $in[$fieldName] = isset($_POST[$fieldName]) ? $_POST[$fieldName] : null;
}

$vertrekPlaats = new Plaats($in["vertrekGemeente"],
                            $in["vertrekStraat"],
                            $in["vertrekNummer"]);
$aankomstPlaats = new Plaats($in["aankomstGemeente"],
                             $in["aankomstStraat"],
                             $in["aankomstNummer"]);

$datumType = $in["datumType"];
$datum = Date::withDate($in["datumJaar"], $in["datumMaand"], $in["datumDag"],
                        $in["datumUur"], $in["datumMinuten"]);
// NOTE(review): the month is assigned again after construction, as in the
// original code; presumably a workaround for Date::withDate. Confirm
// against Date.php before removing.
$datum->month = $in["datumMaand"];

echo "Searching...<br />";
// searchDeLijn() echoes the response page itself; its return value is not
// used here.
searchDeLijn($vertrekPlaats,
             $aankomstPlaats,
             $datumType,
             $datum);
?>
</div>
</body>
</html>
<?php
require_once("Route.php");
require_once("Date.php");
require_once("Plaats.php");
// ==== Returns array of Route objects or null
define('DATE_ARRIVAL', "aankomst");
define('DATE_DEPARTURE', "vertrek");
/**
 * Submits a route request to the De Lijn route planner and parses the reply.
 *
 * Side effects: prints the POST fields (print_r) and echoes the raw response
 * page, exactly as the original implementation did.
 *
 * @param object $dep      departure place; reads ->gemeente, ->straat, ->nummer
 * @param object $ar       arrival place; reads the same three properties
 * @param string $dateType DATE_DEPARTURE to search by departure time; any
 *                         other value searches by arrival time
 * @param object $date     reads ->day, ->month, ->year, ->hour, ->minutes
 * @return mixed the result of extractRoutesFromXMLData(), or null when the
 *               request could not be made or returned nothing
 */
function searchDeLijn($dep, $ar, $dateType, $date) {
    $vertrekkenOfAankomen =
        (DATE_DEPARTURE === $dateType) ? "vertrekken" : "aankomen";

    // Round the minutes down to a multiple of five.
    // NOTE(review): presumably the remote minute selector only offers
    // 5-minute steps; confirm against the remote form.
    $myMins = (int)$date->minutes;
    $myMins -= ($myMins % 5);

    $postFields = array(
        "form1:vertrekGemeenteInput" => $dep->gemeente,
        "form1:vertrekStraatInput" => $dep->straat,
        "form1:vertrekNrInput" => $dep->nummer,
        "form1:aankomstGemeenteInput" => $ar->gemeente,
        "form1:aankomstStraatInput" => $ar->straat,
        "form1:aankomstNrInput" => $ar->nummer,
        "form1:vertrekkenOfAankomenRadio" => $vertrekkenOfAankomen,
        "form1:dagCombo" => (string)(int)$date->day,
        "form1:maandCombo" => (string)(int)$date->month,
        "form1:jaarCombo" => $date->year,
        "form1:uurCombo" => (string)(int)$date->hour,
        "form1:minutenCombo" => (string)$myMins);
    print_r($postFields);

    $ch = curl_init();
    if ($ch === false) {
        return null;
    }
    curl_setopt($ch, CURLOPT_URL,
        'http://reisinfo.delijn.be/reisinfo/RouteplannerHomeBeperktServlet?taal=nl');
    curl_setopt($ch, CURLOPT_POST, 1);
    // Passing an array here makes cURL send multipart/form-data. An HTML
    // form posts application/x-www-form-urlencoded, so encode the fields
    // into a query string first.
    curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($postFields, '', '&'));
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    $contents = curl_exec($ch);
    curl_close($ch);

    // A failed transfer and an empty body both mean there is nothing to parse.
    if ($contents === false || $contents === '') {
        return null;
    }

    echo $contents;
    return extractRoutesFromXMLData($contents);
}
/**
 * Extracts the result rows from a route planner response page.
 *
 * The rows are returned as DOM nodes (one per table row of the results
 * table); converting them into Route objects is not done here.
 *
 * @param string $dataString the response page
 * @return array|null the row nodes, or null when the results table or its
 *                    rows could not be located
 */
function extractRoutesFromXMLData($dataString) {
    // Give up only when a lookup FAILED. The original tests were inverted
    // and returned null on success.
    $tableBody = getResultsTableBody($dataString);
    if ($tableBody === null) {
        return null;
    }
    $tableRows = getTableRowsOfTableBody($tableBody);
    if ($tableRows === null) {
        return null;
    }

    // Copy every row into a plain array. The original always read item(0),
    // so it returned the first row repeated.
    $myArray = array();
    $count = $tableRows->length;
    for ($i = 0; $i < $count; $i++) {
        $myArray[] = $tableRows->item($i);
    }
    return $myArray;
}
/**
 * Cuts the body of the "routeplanner_overzicht" table out of a response
 * page and parses it as XML.
 *
 * @param string $dataString the response page
 * @return DOMDocument|null a document whose root element is the table body,
 *                          or null when the table or its body is missing or
 *                          is not well-formed XML
 */
function getResultsTableBody($dataString) {
    // Locate the results table. The original passed the undefined
    // $docString here, so nothing could ever match.
    $status = preg_match('/<TABLE id="routeplanner_overzicht".*?>.*?<\/TABLE>/is',
                         $dataString, $matches);
    if ($status !== 1) { // 0: no match, false: regex error
        return null;
    }
    $tableElement = $matches[0];

    // Extract the table body
    $status = preg_match('/<TBODY>.*?<\/TBODY>/is',
                         $tableElement, $matches);
    if ($status !== 1) {
        return null;
    }

    // Parse quietly: scraped HTML is often not well-formed XML, and that
    // should yield null rather than a stream of warnings.
    $doc = new DOMDocument();
    $previous = libxml_use_internal_errors(true);
    $loaded = $doc->loadXML($matches[0]);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    return $loaded ? $doc : null;
}
/**
 * Returns the row elements of the first table body in a document.
 *
 * Element names are compared case-insensitively because XML is
 * case-sensitive while the markup may use either <TBODY>/<TR> or
 * <tbody>/<tr>.
 *
 * @param DOMDocument $xmlDoc document containing the table body
 * @return DOMNodeList|null the row nodes (possibly an empty list), or null
 *                          when $xmlDoc is not a document or the query fails
 */
function getTableRowsOfTableBody($xmlDoc) {
    if (!($xmlDoc instanceof DOMDocument)) {
        return null;
    }
    // The original built the XPath object from the undefined $domDoc.
    $xpath = new DOMXPath($xmlDoc);
    // XPath positions start at 1, so the original "[0]" could never match.
    $query = "(//*[translate(name(), 'TBODY', 'tbody') = 'tbody'])[1]"
           . "/*[translate(name(), 'TR', 'tr') = 'tr']";
    $xpathres = $xpath->evaluate($query);
    if ($xpathres === false) {
        return null;
    }
    return $xpathres;
}
?>
Date.php、Plaats.php 和 Route.php 分别包含封装日期、位置和可能路线的类。
答案 0 :(得分:0)
字段丢失,服务器真的响应发布数据很奇怪。我只能自动化一页。要点击其他链接,cookie显然是不够的。
我编写了一些代码,这些代码可能对其他需要了解表单布局的人有用:
<strong>HTMLFormExtractor.py</strong>
#!/usr/bin/python
import sys
import getopt
import urllib
import re
# ############################
# This code may be used by anyone. It may be used in both free
# and commercial software. It may be copied, modified and even
# be sold. The creator of this code takes no responsibility for
# any damage this script could do.
# ############################
# ############################
# ############################
# Usage: ./exec [-x] [URL]
#
# This application logs all forms of an HTML document and its
# objects which have the HTML 'name'-attribute set. The program
# currently only works when the attributes of the objects are
# styled like the XML format (eg: name="myname").
#
# Options:
# -x: Create an XML document of the following form:
# ==== BEGIN XML ====
# formlist
# form (variable)
# attribute (variable)
# name
# value
#
# object (variable)
# type (eg: input)
# name (eg: username)
# ==== END XML ====
#
# URL: a URL pointing to an available HTML file. If it's not
# specified, the program will read the HTML document
# from the standard input.
#
# ############################
# ===== DATA =====
# Tag names of the elements that getFormObjects() looks for inside a form.
# A module-level assignment is already global, so no ``global`` statement
# is needed here.
FORM_OBJECTS_TAG_NAME = (
    "input", "textarea", "label",
    "fieldset", "legend", "select",
    "optgroup", "option", "button",
)
# ===== CLASSES =====
class HTMLAttribute:
    """A single ``name="value"`` attribute of an HTML tag.

    Attributes:
        name: the attribute name as written in the tag.
        value: the attribute value without its surrounding quotes.
        originalString: the text the attribute was parsed from, or None
            when the object was built directly from a name and a value.
    """

    def __init__(self, name, value, orString=None):
        self.name = name
        self.value = value
        # Keep the source text; it used to be discarded (always None).
        self.originalString = orString

    @classmethod
    def withAttributeString(cls, string):
        """Build an attribute from text of the form attrName="value".

        The value may be wrapped in single or double quotes. It ends at the
        matching quote, so the other kind of quote may appear inside it.

        Raises:
            ValueError: if the text holds no quoted attribute.
        """
        # \2 refers back to the opening quote, so "it's" is read in full.
        match = re.search(r"""(\w+)=(["'])(.*?)\2""", string)
        if match is None:
            raise ValueError("not an attribute string: %r" % (string,))
        return cls(match.group(1), match.group(3), string)
class HTMLObject:
    """An HTML tag: its tag name plus the attributes found in it.

    Attributes:
        name: the tag name (e.g. "input").
        attributes: list of HTMLAttribute, in the order they were added.
    """

    def __init__(self, aName):
        self.name = aName
        self.attributes = []  # contains HTMLAttribute

    def addAttribute(self, anAttribute):
        """Append an HTMLAttribute to this tag."""
        self.attributes.append(anAttribute)

    def getAttributeWithName(self, aName):
        """Return the first attribute named aName (case-insensitive), or None."""
        aName = aName.lower()
        for anAttribute in self.attributes:
            if anAttribute.name.lower() == aName:
                return anAttribute
        return None

    @classmethod
    def withTagString(cls, string):
        """Build an object from text of the form <aTagName attrName="value" ... >.

        Only the first tag in the text is parsed; anything after its closing
        ``>`` is ignored.

        Raises:
            ValueError: if the text holds no opening tag.
        """
        tagMatch = re.search(r"<.*?>", string, re.S)
        if tagMatch is None:
            raise ValueError("no tag found in: %r" % (string,))
        tagText = tagMatch.group(0)

        # Capture the name directly after "<". The old pattern required
        # whitespace or ">" after the name and so failed on "<input/>".
        nameMatch = re.match(r"<(\w+)", tagText)
        if nameMatch is None:
            raise ValueError("not an opening tag: %r" % (tagText,))
        myObj = cls(nameMatch.group(1))

        # \1 makes the value end at the quote that opened it, so a value
        # such as "it's" no longer cuts the attribute short.
        for attrMatch in re.finditer(r"""\w+=(["']).*?\1""", tagText):
            attrObj = HTMLAttribute.withAttributeString(attrMatch.group(0))
            myObj.addAttribute(attrObj)
        return myObj
class HTMLForm:
    """Pairs a form name with the HTML objects that belong to the form."""

    def __init__(self, name, htmlObjects):
        # htmlObjects is expected to be a list of HTMLObject.
        self.HTMLObjects = htmlObjects
        self.name = name
# ===== FUNCTIONS =====
def getFormsFromHTML(htmlData):
    """Return the text of every <form>...</form> element in htmlData.

    Matching is case-insensitive and spans line breaks. Each returned string
    runs from the opening tag through its closing tag.
    """
    # The lookahead stops "<form" from matching longer tag names such as
    # "<formula>"; "\s*" accepts a closing tag written as "</form >".
    formRegex = re.compile(r"<form(?=[\s/>]).*?>.*?</form\s*>",
                           re.IGNORECASE | re.S)
    return formRegex.findall(htmlData)
def getFormObjects(aForm):
    """Return a list of HTMLObjects, one per form-related tag in aForm.

    The tags searched for are those named in FORM_OBJECTS_TAG_NAME; matching
    is case-insensitive and spans line breaks.
    """
    if not FORM_OBJECTS_TAG_NAME:
        return []
    tagNames = "|".join(re.escape(aTagName)
                        for aTagName in FORM_OBJECTS_TAG_NAME)
    # The lookahead requires the tag name to end here, so a listed name is
    # not matched as the prefix of a longer tag name.
    tagRegex = re.compile(r"<(?:" + tagNames + r")(?=[\s/>]).*?>",
                          re.S | re.I)
    return [HTMLObject.withTagString(aTag)
            for aTag in tagRegex.findall(aForm)]
def printForms(foundForms, foundObjects):
    """Print a readable summary of each form and its named objects.

    Pass on a list of HTMLObject and a list of lists of HTMLObjects.
    The first list are the forms; the second are the objects contained by
    the form at the corresponding index of the first list. Only objects
    that have a 'name' attribute are listed.
    """
    # Every print() call takes one string, which reads the same under the
    # Python 2 print statement and the Python 3 print function.
    for formIndex, aForm in enumerate(foundForms):
        formObjects = foundObjects[formIndex]
        print("===== FORM " + str(formIndex + 1) + " =====")
        print("\tATTRIBUTES:")
        for anAttribute in aForm.attributes:
            print("\t\t" + anAttribute.name + ": '" + anAttribute.value + "'")
        # Count this form's own objects. len(foundObjects) is the number
        # of forms, which is what used to be printed here.
        print("\n\t" + str(len(formObjects)) + " OBJECTS:")
        for anObject in formObjects:
            nameAttribute = anObject.getAttributeWithName("name")
            if nameAttribute is not None:
                print("\t\t" + anObject.name
                      + " (name=\"" + nameAttribute.value + "\")")
        print("\n")
def createXMLString(foundForms, foundObjects):
    """Return an XML document describing the forms and their named objects.

    Pass on a list of HTMLObject and a list of lists of HTMLObjects.
    The first list are the forms; the second are the objects contained by
    the form at the corresponding index of the first list. Objects without
    a 'name' attribute are left out. All text is XML-escaped.

    XML:
    formlist
        form (mult)
            attribute (mult)
                name
                value
            object (mult)
                type (eg: input)
                name (eg: username)
    """
    # Imported here so the function does not rely on the file's import list.
    from xml.sax.saxutils import escape

    parts = ["<formlist>\n"]
    for formIndex, aForm in enumerate(foundForms):
        parts.append("\t<form>\n")
        # add all attributes
        for anAttr in aForm.attributes:
            parts.append("\t\t<attribute>\n")
            parts.append("\t\t\t<name>" + escape(anAttr.name) + "</name>\n")
            parts.append("\t\t\t<value>" + escape(anAttr.value) + "</value>\n")
            parts.append("\t\t</attribute>\n")
        # add all input objects if they have a name
        for anObject in foundObjects[formIndex]:
            nameAttr = anObject.getAttributeWithName("name")
            if nameAttr is not None:
                parts.append("\t\t<object>\n")
                parts.append("\t\t\t<type>" + escape(anObject.name) + "</type>\n")
                parts.append("\t\t\t<name>" + escape(nameAttr.value) + "</name>\n")
                parts.append("\t\t</object>\n")
        # Close the element. The original emitted a second opening "<form>",
        # which made the document malformed.
        parts.append("\t</form>\n\n")

    # Drop the final newline (the blank line after the last form, or the
    # one after "<formlist>" when there are no forms) before closing.
    xmlString = "".join(parts)
    return xmlString[:-1] + "</formlist>\n"
# ===== MAIN =====
# Parse the command line options
userArgv = sys.argv[1:]
try:
    flags, arguments = getopt.getopt(userArgv, "x")
except getopt.GetoptError as optionError:
    # Report an unknown option with the usage line, not a traceback.
    sys.stderr.write(str(optionError) + "\n")
    sys.stderr.write("Usage: " + sys.argv[0] + " [-x] [URL]\n")
    sys.exit(2)
wantsXMLFormat = ('-x', '') in flags
hasURL = len(arguments) > 0

# Get the HTML data: from the URL when one was given, else from stdin
myHTML = None
if hasURL:
    myURL = arguments[0]
    # urlopen reports failure by raising IOError rather than returning
    # None, so the old "== None" check could never fire.
    try:
        urlHandle = urllib.urlopen(myURL)
    except IOError:
        print("Failed to open the URL")
        sys.exit(1)
    try:
        myHTML = urlHandle.read()
    finally:
        urlHandle.close()
else:
    myHTML = sys.stdin.read()

# Get all forms
htmlForms = getFormsFromHTML(myHTML)

# Build two parallel lists: foundForms[i] is the <form> tag itself and
# foundObjects[i] holds the objects found inside that form.
foundForms = []
foundObjects = []  # list of list
for aFormTag in htmlForms:
    foundForms.append(HTMLObject.withTagString(aFormTag))
    # The form text is scanned once; it used to be scanned twice with the
    # first result thrown away.
    foundObjects.append(getFormObjects(aFormTag))

# Print or create xml
if not wantsXMLFormat:
    printForms(foundForms, foundObjects)
else:
    myXMLString = createXMLString(foundForms, foundObjects)
    print(myXMLString)