需要帮助分析HTTP表单

时间:2011-03-27 14:37:24

标签: php html post curl


为了训练我自己的PHP和HTML表单,我决定制作一个小的网络应用程序,从另一个网站收集数据,但显示它为移动设备。

在本次练习中,我选择了我所在地区的公交公司网站:http://delijn.be/en/index.htm。我分析了网站并找到了名为“form1”的表单,它通过POST方法将数据发送到网站:http://reisinfo.delijn.be/reisinfo/RouteplannerHomeBeperktServlet?taal=en

我开始编写php代码并在互联网上找到你可以用cURL发送POST字段。所以我做了。不幸的是它不起作用。我得到了网站的错误页面。所以我想一些字段必须丢失,但我检查了一切,我找不到另一个字段。通过这种方式,我再次来到这里,寻求帮助。

该网络应用程序托管在我的家庭服务器上,也可以从那里下载。

如果有人可以帮助我解决这个问题,我将非常感激。——ief2

<hr/> PS:代码的某些部分是用荷兰语写的,所以这里有一些翻译:

  • Gemeente =城镇/城市
  • Plaats =位置
  • Nummer = Number
  • Datum =日期
  • Dag = Day
  • Maand =月
  • Jaar =年
  • Uur =小时
  • Aankomst =抵达
  • Vertrek =离境
  • Berekenen =计算

<hr/> PPS:下载链接显然不起作用,但下载它没有问题,所以这里有一些代码片段:

index.php

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
    <head>
        <title>De Lijn Mobile</title>
        <!-- Viewport hint asking the browser to lay the page out at the device width. -->
        <meta name="viewport" content="width = device-width">
    </head>

    <body>
        <!-- Route query form: every field below is POSTed to calculateRoute.php. -->
        <form name="main" action="calculateRoute.php" method="post">
            <!-- Departure address (Vertrek): town, street and house number. -->
            <b>Vertrek:</b><br>
            Gemeente: <input type="text" name="vertrekGemeente"><br>
            Straat: <input type="text" name="vertrekStraat"><br>
            Nummer: <input type="text" name="vertrekNummer"><br>
            <hr>
            <!-- Arrival address (Aankomst): same three fields as the departure. -->
            <b>Aankomst:</b><br>
            Gemeente: <input type="text" name="aankomstGemeente"><br>
                Straat: <input type="text" name="aankomstStraat"><br>
            Nummer: <input type="text" name="aankomstNummer"><br>
            <hr>
            <!-- Date and time (Datum), plus whether it is an arrival or a departure time. -->
            <b>Datum:</b><br>
            <?php
                // Load the Date class and create an instance; its day, month, year,
                // hour and minutes fields pre-fill the inputs below.
                // NOTE(review): presumably `new Date()` represents the current time -
                // confirm against Date.php, which is not shown here.
                require("./Date.php");
                $now = new Date();
            ?>
            <!-- datumType is sent as "aankomst" (the default) or "vertrek". -->
            <input type="radio" name="datumType" value="aankomst" checked> Aankomst<br>
            <input type="radio" name="datumType" value="vertrek"> Vertrek<br>
            Dag: <input type="text" size="2" name="datumDag" value="<?php echo $now->day; ?>"><br>
            Maand: <input type="text" size="2" name="datumMaand" value="<?php echo $now->month; ?>"><br>
            Jaar: <input type="text" size="4" name="datumJaar" value="<?php echo $now->year; ?>"><br>
            <!-- Time of day (Tijdstip) is split into an hour and a minutes field. -->
            Tijdstip: <input type="text" size="2" name="datumUur" value="<?php echo $now->hour; ?>"> : 
            <input type="text" size="2" name="datumMinuten" value="<?php echo $now->minutes; ?>"><br>
            <hr>
            <input type="submit" value="Bereken"><br>
        </form>
    </body>
</html>

calculateRoute.php

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
    <head>
        <title>De Lijn Mobile - Berekeningen</title>
    </head>

    <body>
        <?php
            // Form handler for index.php: collects the posted departure place,
            // arrival place and date, then passes them to searchDeLijn().
            require_once("./Plaats.php");
            require_once("./Date.php");
            require_once("./DeLijn.php");

            echo "Gathering data...<br>";

            // Departure place (town, street, house number).
            $gemeente = $_POST["vertrekGemeente"];
            $straat = $_POST["vertrekStraat"];
            $nummer = $_POST["vertrekNummer"];
            $vertrekPlaats = new Plaats($gemeente, $straat, $nummer);

            // Arrival place; the three temporaries above are reused.
            $gemeente = $_POST["aankomstGemeente"];
            $straat = $_POST["aankomstStraat"];
            $nummer = $_POST["aankomstNummer"];
            $aankomstPlaats = new Plaats($gemeente, $straat, $nummer);

            // Date type ("aankomst" or "vertrek") and the date/time parts.
            $datumType = $_POST["datumType"];
            $dag = $_POST["datumDag"];
            $maand = $_POST["datumMaand"];
            $jaar = $_POST["datumJaar"];
            $uur = $_POST["datumUur"];
            $min = $_POST["datumMinuten"];
            $datum = Date::withDate($jaar, $maand, $dag, $uur, $min);
            // NOTE(review): the month is assigned again after construction;
            // presumably this works around Date::withDate() - confirm in Date.php.
            $datum->month = $maand;

            echo "Searching...<br>";
            // searchDeLijn() echoes the remote response itself; its return
            // value is not used on this page.
            searchDeLijn($vertrekPlaats,
                $aankomstPlaats,
                $datumType,
                $datum);

        ?>
    </body>
</html>

DeLijn.php

<?php

require_once("Route.php");
require_once("Date.php");
require_once("Plaats.php");

// ==== Values of the "datumType" form field accepted by searchDeLijn()
define('DATE_ARRIVAL', "aankomst");
define('DATE_DEPARTURE', "vertrek");

/**
 * Posts a route query to the De Lijn route planner and parses the reply.
 *
 * The request fields and the raw response are echoed to the page as
 * diagnostic output.
 *
 * @param object $dep      Departure place; its gemeente, straat and nummer
 *                         properties are sent.
 * @param object $ar       Arrival place; same properties as $dep.
 * @param string $dateType DATE_DEPARTURE selects "vertrekken"; any other
 *                         value selects "aankomen".
 * @param object $date     Date whose day, month, year, hour and minutes
 *                         properties are sent.
 * @return mixed The result of extractRoutesFromXMLData(), or null when the
 *               HTTP request fails.
 */
function searchDeLijn($dep, $ar, $dateType, $date) {
    $vertrekkenOfAankomen = (DATE_DEPARTURE === $dateType)
        ? "vertrekken"
        : "aankomen";

    // Round the minutes down to a multiple of five before sending them.
    $myMins = (int)$date->minutes;
    $myMins -= ($myMins % 5);

    $postFields = array(
        "form1:vertrekGemeenteInput" => $dep->gemeente,
        "form1:vertrekStraatInput" => $dep->straat,
        "form1:vertrekNrInput" => $dep->nummer,

        "form1:aankomstGemeenteInput" => $ar->gemeente,
        "form1:aankomstStraatInput" => $ar->straat,
        "form1:aankomstNrInput" => $ar->nummer,

        "form1:vertrekkenOfAankomenRadio" => $vertrekkenOfAankomen,
        "form1:dagCombo" => (string)(int)$date->day,
        "form1:maandCombo" => (string)(int)$date->month,
        "form1:jaarCombo" => $date->year,
        "form1:uurCombo" => (string)(int)$date->hour,
        "form1:minutenCombo" => (string)$myMins);

    print_r($postFields);

    // do the curl
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL,
        'http://reisinfo.delijn.be/reisinfo/RouteplannerHomeBeperktServlet?taal=nl');
    curl_setopt($ch, CURLOPT_POST, 1);
    // Send a URL-encoded string, like a browser submitting the form. Passing
    // the array itself makes cURL send multipart/form-data and, on older PHP
    // versions, treats a value starting with "@" as a file to upload.
    curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($postFields, '', '&'));
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);

    $contents = curl_exec($ch);
    curl_close($ch);
    // Strict comparison: only a real transfer failure yields false; an empty
    // body is still a response.
    if($contents === false) {
        return null;
    }

    echo $contents;

    return extractRoutesFromXMLData($contents);
}

// ==== Returns an array with the row nodes of the results table, or null
// ==== when the table body or its rows cannot be found in $dataString
function extractRoutesFromXMLData($dataString) {
    // Bail out when a step FAILS; the checks used to be inverted and
    // returned null exactly when the data had been found.
    $tableBody = getResultsTableBody($dataString);
    if($tableBody === null) { return null; }

    $tableRows = getTableRowsOfTableBody($tableBody);
    if($tableRows === null) { return null; }

    // put them in an array
    $myArray = array();
    $count = $tableRows->length;
    for($i = 0; $i < $count; $i++) {
        // Take the i-th row; item(0) repeated the first row $count times.
        $myArray[] = $tableRows->item($i);
    }

    return $myArray;
}

// ==== Returns a DOMDocument holding the <TBODY> of the table with
// ==== id "routeplanner_overzicht", or null when it cannot be found/parsed
function getResultsTableBody($dataString) {
    // Get table element (search the parameter; the original referenced an
    // undefined variable, so nothing was ever matched).
    $status = preg_match('/<TABLE id="routeplanner_overzicht".*?>.*?<\/TABLE>/is',
            $dataString, $matches);
    // preg_match yields 0 for "no match" and false on error; accept only 1.
    if($status !== 1) {
        return null;
    }

    $tableElement = $matches[0];

    // Extract body
    $status = preg_match('/<TBODY>.*?<\/TBODY>/is',
            $tableElement, $matches);
    if($status !== 1) {
        return null;
    }

    // The fragment comes from an HTML page and may not be well-formed XML:
    // collect parser errors quietly and report failure as null.
    $doc = new DOMDocument();
    $previousSetting = libxml_use_internal_errors(true);
    $loaded = $doc->loadXML($matches[0]);
    libxml_clear_errors();
    libxml_use_internal_errors($previousSetting);
    if(!$loaded) {
        return null;
    }

    return $doc;
}


// ==== Returns a DOMNodeList with the rows of the first table body in
// ==== $xmlDoc, or null when the XPath query fails
function getTableRowsOfTableBody($xmlDoc) {
    // Use the parameter; the original passed an undefined variable.
    $xpath = new DOMXPath($xmlDoc);
    // XPath positions start at 1, so "[0]" never matched anything. Element
    // names are compared case-insensitively because XML is case-sensitive
    // and the document may contain <TBODY>/<TR> as well as <tbody>/<tr>.
    $xpathres = $xpath->evaluate(
        "(//*[translate(local-name(), 'TBODY', 'tbody') = 'tbody'])[1]"
        . "/*[translate(local-name(), 'TR', 'tr') = 'tr']");
    if($xpathres === false) {
        return null;
    }

    return $xpathres;
}
?>

Date.php、Plaats.php 和 Route.php 分别包含封装日期、位置和可能路线的类。

1 个答案:

答案 0 :(得分:0)

确实缺少了字段,而且服务器对 POST 数据的响应非常奇怪。我只能自动化一个页面;要点击其他链接,仅靠 cookie 显然是不够的。

我编写了一些代码,这些代码可能对其他需要了解表单布局的人有用:

<strong>HTMLFormExtractor.py</strong>

#!/usr/bin/python
import sys
import getopt
import urllib
import re

# ############################
# This code may be used by anyone. It may be used in both free
# and commercial software. It may be copied, modified and even
# be sold. The creator of this code takes no responsibility for
# any damage this script could do.
# ############################

# ############################
# ############################
# Usage: ./exec [-x] [URL]
# 
# This application logs all forms of an HTML document and its
# objects which have the HTML 'name'-attribute set. The program
# currently only works when the attributes of the objects are
# styled like the XML format (eg: name="myname").
# 
# Options:
#   -x: Create an XML document of the following form:
#           ==== BEGIN XML ====
#           formlist
#               form (variable)
#                   attribute (variable)
#                       name
#                       value
#
#                   object (variable)
#                       type (eg: input)
#                       name (eg: username)
#           ==== END XML ====
#
#   URL: a URL pointing to an available HTML file. If it's not
#       specified, the program will read the HTML document
#       from the standard input.
#
# ############################

# ===== DATA =====
# Names of the HTML tags that are reported as objects of a form.
FORM_OBJECTS_TAG_NAME = (
    "input", "textarea", "label",
    "fieldset", "legend", "select",
    "optgroup", "option", "button",
)



# ===== CLASSES =====
class HTMLAttribute:
    """A single name/value attribute of an HTML tag.

    Attributes:
        name: the attribute name.
        value: the attribute value without its surrounding quotes.
        originalString: the text the attribute was created from, or None
            when no source text was supplied.
    """

    def __init__(self, name, value, orString = None):
        self.name = name
        self.value = value
        # Keep the source text; it used to be discarded (always None).
        self.originalString = orString

    @classmethod
    def withAttributeString(cls, string):
        """Create an attribute from a string of the form attrName="value".

        The value may be wrapped in single or double quotes and must be
        closed by the same quote character that opened it, so a value such
        as "it's" is kept whole. Names may contain hyphens, colons and dots
        (e.g. accept-charset).

        Raises IndexError when the string contains no such attribute.
        """
        # Groups: (name, opening quote, value); \2 demands the same quote.
        attrRegex = r"([\w:.-]+)=([\"'])(.*?)\2"
        attrName, _quote, value = re.findall(attrRegex, string)[0]
        return cls(attrName, value, string)

class HTMLObject:
    """An HTML tag: its tag name plus the attributes found on it.

    Attributes:
        name: the tag name (e.g. "input").
        attributes: list of HTMLAttribute objects, in source order.
    """

    def __init__(self, aName):
        self.name = aName
        self.attributes = [] # contains HTMLAttribute

    def addAttribute(self, anAttribute):
        """Append an HTMLAttribute to this object's attribute list."""
        self.attributes.append(anAttribute)

    def getAttributeWithName(self, aName):
        """Return the first attribute whose name equals aName, ignoring
        case, or None when there is no such attribute."""
        aName = aName.lower()
        for anAttribute in self.attributes:
            if anAttribute.name.lower() == aName:
                return anAttribute
        return None

    @staticmethod
    def _splitTagString(string):
        """Return (tagName, [(attrName, attrValue, attrSource), ...]) for
        the first <...> tag found in string.

        Raises IndexError when string contains no tag.
        """
        tagString = re.findall(r"<.*?>", string, re.S)[0]

        # Only the name itself is matched, so tags such as <input/> or <br/>
        # (name directly followed by "/") no longer raise IndexError.
        tagName = re.findall(r"(?<=<)[\w:-]+", tagString)[0]

        # \2 requires the closing quote to equal the opening one, and the
        # name class accepts hyphenated names such as accept-charset.
        attrRegex = r"([\w:.-]+)=([\"'])(.*?)\2"
        attributes = [(match.group(1), match.group(3), match.group(0))
                      for match in re.finditer(attrRegex, tagString)]
        return tagName, attributes

    @classmethod
    def withTagString(cls, string):
        """Create an object from a string that starts with a tag of the
        form <aTagName attrName="value" ... >; text after the first tag is
        ignored.

        Raises IndexError when string contains no tag.
        """
        tagName, attributes = cls._splitTagString(string)

        myObj = cls(tagName)
        for attrName, attrValue, attrSource in attributes:
            myObj.addAttribute(HTMLAttribute(attrName, attrValue, attrSource))

        return myObj

class HTMLForm:
    """Pairs a form's name with the HTML objects that belong to it."""

    def __init__(self, name, htmlObjects):
        # HTMLObjects holds a list of HTMLObject instances.
        self.name, self.HTMLObjects = name, htmlObjects

# ===== FUNCTIONS =====
def getFormsFromHTML(htmlData):
    """Return the source text of every <form>...</form> element found in
    htmlData, as a list of strings (case-insensitive, may span lines)."""
    formPattern = re.compile("<form.*?>.*?</form>", re.S | re.IGNORECASE)
    return formPattern.findall(htmlData)

def getFormObjects(aForm):
    """Return a list of HTMLObjects, one for every tag in aForm whose name
    starts with one of the names in FORM_OBJECTS_TAG_NAME."""
    # Without any tag names there is nothing to search for.
    if len(FORM_OBJECTS_TAG_NAME) == 0:
        return []

    # Build "<(?:input|textarea|...).*?>" from the configured tag names.
    tagPattern = re.compile(
        "<(?:" + "|".join(FORM_OBJECTS_TAG_NAME) + ").*?>",
        re.S | re.I)

    return [HTMLObject.withTagString(tagText)
            for tagText in tagPattern.findall(aForm)]

def printForms(foundForms, foundObjects):
    """Print a readable listing of the forms to standard output.

    foundForms is a list of HTMLObjects describing the form tags and
    foundObjects is a list of lists of HTMLObjects: the objects contained
    by the form at the corresponding index of foundForms.

    For every form its attributes are printed, followed by the objects that
    have a 'name' attribute. The number in the "OBJECTS" header is the
    number of objects listed for that form.
    """
    # print() is called with a single argument throughout, which behaves the
    # same as a print statement and as the print function.
    for index, aForm in enumerate(foundForms):
        print("===== FORM " + str(index + 1) + " =====")

        print("\tATTRIBUTES:")
        for anAttribute in aForm.attributes:
            print("\t\t" + anAttribute.name + ": '" + anAttribute.value + "'")

        # Collect the named objects first so the header can report how many
        # are listed; it used to report the number of forms instead.
        namedObjects = []
        for anObject in foundObjects[index]:
            nameAttribute = anObject.getAttributeWithName("name")
            if nameAttribute is not None:
                namedObjects.append((anObject, nameAttribute))

        print("\n\t" + str(len(namedObjects)) + " OBJECTS:")
        for anObject, nameAttribute in namedObjects:
            print("\t\t" + anObject.name + " (name=\"" + nameAttribute.value + "\")")

        print("\n")


def createXMLString(foundForms, foundObjects):
    """Return an XML document describing the forms, as a string.

    foundForms is a list of HTMLObjects describing the form tags and
    foundObjects is a list of lists of HTMLObjects: the objects contained
    by the form at the corresponding index of foundForms. Only objects with
    a 'name' attribute are included. All text is XML-escaped.

    XML:
        formlist
            form (mult)
                attribute (mult)
                    name
                    value

                object (mult)
                    type (eg: input)
                    name (eg: username)
    """
    # Imported here so the function does not depend on the file's import
    # block; attribute values such as URLs routinely contain "&".
    from xml.sax.saxutils import escape

    parts = ["<formlist>\n"]
    for index, aForm in enumerate(foundForms):
        # make form child
        parts.append("\t<form>\n")

        # add all attributes
        for anAttr in aForm.attributes:
            parts.append("\t\t<attribute>\n")
            parts.append("\t\t\t<name>" + escape(anAttr.name) + "</name>\n")
            parts.append("\t\t\t<value>" + escape(anAttr.value) + "</value>\n")
            parts.append("\t\t</attribute>\n")

        # add all input objects if they have a name
        for anObject in foundObjects[index]:
            nameAttr = anObject.getAttributeWithName("name")
            if nameAttr is not None:
                parts.append("\t\t<object>\n")
                parts.append("\t\t\t<type>" + escape(anObject.name) + "</type>\n")
                parts.append("\t\t\t<name>" + escape(nameAttr.value) + "</name>\n")
                parts.append("\t\t</object>\n")

        # Close the element; this used to emit a second opening "<form>"
        # tag, which made the document malformed.
        parts.append("\t</form>\n\n")

    # Drop the final newline (the blank line after the last form, or the
    # one after "<formlist>" when there are no forms) and close the root.
    xmlString = "".join(parts)
    return xmlString[:-1] + "</formlist>\n"


# ===== MAIN =====
# Parse the command line options: "-x" selects XML output and the first
# positional argument, if any, is the URL to read.
userArgv = sys.argv[1:]
flags, arguments = getopt.getopt(userArgv, "x")
wantsXMLFormat = ('-x', '') in flags
hasURL = len(arguments) > 0

# Get the HTML data, from the URL when one was given and otherwise from the
# standard input.
myHTML = None
if hasURL:
    myURL = arguments[0]
    # urlopen signals failure by raising, not by returning None.
    try:
        urlHandle = urllib.urlopen(myURL)
    except IOError:
        print("Failed to open the URL")
        sys.exit(1)
    # Close the handle even when reading fails.
    try:
        myHTML = urlHandle.read()
    finally:
        urlHandle.close()
else:
    myHTML = sys.stdin.read()

# Get all forms
htmlForms = getFormsFromHTML(myHTML)

# For every form keep the form tag itself and, at the same index, the list
# of objects it contains. The objects are extracted once per form.
foundForms = []
foundObjects = [] # list of list
for aFormTag in htmlForms:
    # append the form
    formHTMLObject = HTMLObject.withTagString(aFormTag)
    foundForms.append(formHTMLObject)

    # append the form's input objects
    allObjects = getFormObjects(aFormTag)
    foundObjects.append(allObjects)


# Print or create xml
if not wantsXMLFormat:
    printForms(foundForms, foundObjects)
else:
    myXMLString = createXMLString(foundForms, foundObjects)
    print(myXMLString)