使用HttpWebRequest和HtmlAgilityPack抓取HTML页面中的AJAX分页表

时间:2016-05-17 17:36:50

标签: c# httpwebrequest html-agility-pack

我需要创建一个HTML抓取工具,用于从HTML页面上的AJAX分页表中抓取数据。我已经能够在PHP中使用curl发出正确的Web请求,但我需要使用C#和.NET完成同样的操作。我需要抓取的页面就是下面代码中的URL。截至目前,该页面只有一页数据,但在PHP中我能够让数据按每页5条记录分页返回。我一直在尝试把这段代码转换成C#,但似乎无法从请求中得到正确的响应。

这是我到目前为止所做的:

/// <summary>
/// Posts the job-board paging form to the UltiPro <c>ListJobs.aspx</c> page and
/// returns the response body as text.
/// </summary>
/// <param name="page">Value sent in the <c>__PageNumber</c> form field.</param>
/// <returns>The response body read to the end as a string.</returns>
/// <exception cref="WebException">
/// Thrown by <see cref="HttpWebRequest.GetResponse"/> when the request fails.
/// </exception>
public static string GetUltiProPage(int page)
{
    var url = "https://rew31.ultipro.com/PAC1016/JobBoard/ListJobs.aspx?Page=Browse";
    var request = (HttpWebRequest)WebRequest.Create(url);

    // NOTE(review): the view-state, PXDO and cookie values below are hard-coded
    // literals, presumably captured from a browser session; they may expire - confirm.
    var postData = "__VIEWSTATE=%2FwEPDwUENTM4MWRkp7zhXpqU%2FH4WkDJ4KHitD4Wf%2Bu0%3D&__VIEWSTATEGENERATOR=161526B4&__EVENTVALIDATION=%2FwEWDQKRpdUcAtmYs8gHAtGa0qYHAsm9hqkDAp244ZYGAoL2oZIFAv31w5QIArbGqZ8NAqP3quANAsu8sZMPAsCstNoNAoP5qokBAtSmv8sPu3VhvhoU5as%2BywvA67Bi%2F1lQdjA%3D&__PXPOSTBACK=1&__PXLBN=&__PXLBV=&__PXDO=*5B193482696D86A46BE27A09BF9D44EF1FEF5044CF8196FBF1B81AABDB2580DA35D2E0E945D769E69249FCDCF130A4B1D3104011C807F147712D6EA5E4CC1ABC99A8DA4AA272D02FEAC1C3B02E5FF2966724009B422B7DC9C150EE4EB8BBDEE66D889DF3F2ED31300B672A6BD6778DCBA44779F751744AF48012C9EAFA64A09ABD6FBF990399A95BF122EB7806A3917135BE313A12FBCE63&__Keywords=&__RecordsPerPage=5&__PreparedPXDO=*501A15E038216814AFF20DE0ED6A92EF6202B33F038B51D97EA415797B8CA4E5F003978EDCEBB459EC1C6E0D8270762723E59844D1984D81300F50C6AB73C565843CDEE40789E31922A4317BE7E2FFFE60F6E468A882428F14510542C9B45CFEA0590D7B1DDE8BB16DEDFA49C6E13B8383EE1F010F7077267F2252B4C292D62AC532DBCE9A2FB51589343F83E842A9EB30DD800E4C2349156C7B206FB7CBE62B04EAC8196AAC51B873BBEABC1B6B866E12A309EEE428C94673859D0B369920F351F42E891D6770AFF60BF9EF0FEC9923BB59D7E020D48883FD9408640F8B66AD770C68EFD1110832496D391A556BDFE3AAC1B237CB869C1280726F76C682600C1DA1DF67DE1B48F1EFF865CC776CE83F8F9F7A0E512B5EE57636BEE01B78297C236EDA93BC028096FFDD18D728D7F46057B78653C997C10797BD5AC895065120FAE3E928110394CD88E805A1835AB3BCD402041499201659D272F84C43E242590D2CBAA3E0FDC3BD22479E9EFDFA425D9CF6D8286C8275A6C9C910E5B3EA808CF2C47F928007837E90FF5FDC169D98D03DF4CF0F04D2D4B6D870078B65CF5DF1A66EDDD14575EEB9AADDA7D7B93BB528D3CE4489F6D6787105D81054A37C7FDD217250A592FD9995&__PageNumber=" + page + "&__Previous=+%3C+";
    var data = Encoding.ASCII.GetBytes(postData);

    request.Method = "POST";
    request.ContentType = "application/x-www-form-urlencoded";
    request.ContentLength = data.Length;
    request.Host = "rew31.ultipro.com";
    request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36";
    request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
    request.Referer = "https://rew31.ultipro.com/PAC1016/JobBoard/listjobs.aspx?Page=Browse";
    request.Headers.Add("Accept-Language: en-US,en;q=0.5");

    // Let the framework negotiate and transparently decode gzip/deflate.
    // The hand-written "Accept-Encoding: gzip, deflate, br" header is dropped:
    // without AutomaticDecompression a compressed reply would be read as raw
    // bytes, and HttpWebRequest has no brotli ("br") decoder.
    request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

    request.Headers.Add("Cookie: __jbsrcid=*86EE06D72F666815; ASP.NET_SessionId=5qa4np55qx52io555tgxpgmi; BIGipServerrew31.ultipro.com_http_pool=422280714.20480.0000");

    using (var stream = request.GetRequestStream())
    {
        stream.Write(data, 0, data.Length);
    }

    // Dispose the response and reader so the underlying connection is released.
    using (var response = (HttpWebResponse)request.GetResponse())
    using (var responseStream = response.GetResponseStream())
    using (var reader = new StreamReader(responseStream))
    {
        return reader.ReadToEnd();
    }
}

这是我的PHP代码(它可以正常工作):

/**
 * Fetches up to $numberOfPages pages of the UltiPro job board via POST and
 * merges the postings extracted from each page into one array.
 *
 * The loop stops early when $GLOBALS['isComplete'] is true (presumably set by
 * GetPageJobPostings once no more results are available - confirm).
 *
 * @param int   $numberOfPages Maximum number of pages to request (0-based page numbers are sent).
 * @param mixed $filter        Passed through unchanged to GetPageJobPostings().
 * @return array Merged postings from every page that was fetched successfully.
 */
function GetJobPostings($numberOfPages, $filter){
    $JobPostings = array();
    $url = "https://rew31.ultipro.com/PAC1016/JobBoard/ListJobs.aspx?Page=Browse";

    for($num = 0; $num < $numberOfPages; $num++){
        if($GLOBALS['isComplete'] == true)
        {
            break;
        }

        // NOTE(review): the view-state, PXDO and cookie values are hard-coded
        // literals, presumably captured from a browser session - they may expire.
        $postFields = "__VIEWSTATE=%2FwEPDwUENTM4MWRkp7zhXpqU%2FH4WkDJ4KHitD4Wf%2Bu0%3D&__VIEWSTATEGENERATOR=161526B4&__EVENTVALIDATION=%2FwEWDQKRpdUcAtmYs8gHAtGa0qYHAsm9hqkDAp244ZYGAoL2oZIFAv31w5QIArbGqZ8NAqP3quANAsu8sZMPAsCstNoNAoP5qokBAtSmv8sPu3VhvhoU5as%2BywvA67Bi%2F1lQdjA%3D&__PXPOSTBACK=1&__PXLBN=&__PXLBV=&__PXDO=*5B193482696D86A46BE27A09BF9D44EF1FEF5044CF8196FBF1B81AABDB2580DA35D2E0E945D769E69249FCDCF130A4B1D3104011C807F147712D6EA5E4CC1ABC99A8DA4AA272D02FEAC1C3B02E5FF2966724009B422B7DC9C150EE4EB8BBDEE66D889DF3F2ED31300B672A6BD6778DCBA44779F751744AF48012C9EAFA64A09ABD6FBF990399A95BF122EB7806A3917135BE313A12FBCE63&__Keywords=&__RecordsPerPage=5&__PreparedPXDO=*501A15E038216814AFF20DE0ED6A92EF6202B33F038B51D97EA415797B8CA4E5F003978EDCEBB459EC1C6E0D8270762723E59844D1984D81300F50C6AB73C565843CDEE40789E31922A4317BE7E2FFFE60F6E468A882428F14510542C9B45CFEA0590D7B1DDE8BB16DEDFA49C6E13B8383EE1F010F7077267F2252B4C292D62AC532DBCE9A2FB51589343F83E842A9EB30DD800E4C2349156C7B206FB7CBE62B04EAC8196AAC51B873BBEABC1B6B866E12A309EEE428C94673859D0B369920F351F42E891D6770AFF60BF9EF0FEC9923BB59D7E020D48883FD9408640F8B66AD770C68EFD1110832496D391A556BDFE3AAC1B237CB869C1280726F76C682600C1DA1DF67DE1B48F1EFF865CC776CE83F8F9F7A0E512B5EE57636BEE01B78297C236EDA93BC028096FFDD18D728D7F46057B78653C997C10797BD5AC895065120FAE3E928110394CD88E805A1835AB3BCD402041499201659D272F84C43E242590D2CBAA3E0FDC3BD22479E9EFDFA425D9CF6D8286C8275A6C9C910E5B3EA808CF2C47F928007837E90FF5FDC169D98D03DF4CF0F04D2D4B6D870078B65CF5DF1A66EDDD14575EEB9AADDA7D7B93BB528D3CE4489F6D6787105D81054A37C7FDD217250A592FD9995&__PageNumber=$num&__Previous=+%3C+";

        $curlHeaders = array(
        "Host: rew31.ultipro.com",
        "User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36",
        "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language: en-US,en;q=0.5",
        "Accept-Encoding: gzip, deflate, br",
        "Referer: https://rew31.ultipro.com/PAC1016/JobBoard/listjobs.aspx?Page=Browse",
        "Cookie: __jbsrcid=*86EE06D72F666815; ASP.NET_SessionId=5qa4np55qx52io555tgxpgmi; BIGipServerrew31.ultipro.com_http_pool=422280714.20480.0000",
        "Connection: keep-alive",
        "Content-Type: application/x-www-form-urlencoded",
        "Content-Length: " . strlen($postFields)
        );

        $ch = curl_init();
        curl_setopt($ch,CURLOPT_URL, $url);
        curl_setopt($ch,CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($ch,CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch,CURLOPT_HTTPHEADER, $curlHeaders);
        curl_setopt($ch,CURLOPT_POSTFIELDS, $postFields);
        // NOTE(review): peer verification is disabled, which allows
        // man-in-the-middle interception; enable it once a CA bundle is configured.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);

        // Execute exactly once and inspect that same result; calling
        // curl_exec() again for the error check would POST the page twice.
        $result = curl_exec($ch);

        if($result === false)
        {
            echo 'Curl error: ' . curl_error($ch);
            curl_close($ch);
            // Nothing to parse for this page; move on to the next one.
            continue;
        }

        curl_close($ch);

        $pagePostings = GetPageJobPostings($filter,  str_get_html($result));

        $JobPostings = array_merge($JobPostings, (array)$pagePostings);
    }

    return $JobPostings;
}

如何让我的C#代码正常工作以获得正确的响应?一旦得到正确的响应,我计划使用htmlagilitypack将html解析为自定义对象。

0 个答案:

没有答案