Scraping an ASPX website with Python

Posted: 2015-02-18 18:42:53

Tags: python scrapy response python-requests

I am able to get the HTTP headers and parameters, but I cannot get a response object back with the search results. The site is https://www.sacmembership.ca/Search/Search.aspx and I want the details for every practitioner. This is the code I have so far:

import cookielib
import urllib
import urllib2

url = 'https://www.sacmembership.ca/Search/Search.aspx'
http_header = {
                #"POST" : "https://www.sacmembership.ca/Search/Results.aspx HTTP/1.1",
                "Host" : "www.sacmembership.ca",
                "Connection" : "keep-alive",
                "Content-Length" : "16581",
                "Cache-Control" :"max-age=0",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Origin": "https://www.sacmembership.ca",
                "User-Agent" : "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36",
                "Content-Type" : "application/x-www-form-urlencoded",
                "Referer" : "https://www.sacmembership.ca/Search/Search.aspx",
                "Accept-Encoding" : "gzip, deflate",
                "Accept-Language" : "en-US,en;q=0.8"
                }

params = {
    'ctl00$ContentPlaceHolder1$ddlProfession' : "",
    'ctl00$ContentPlaceHolder1$ddlFacility' : "",
    'ctl00$ContentPlaceHolder1$txtCity' : "",
    'ctl00$ContentPlaceHolder1$ddlProvince' : "",
    'ctl00$ContentPlaceHolder1$ddlSortBy' : "LastName",
    'ctl00$ContentPlaceHolder1$ddlLanguageOfPractice' : "",
    'ctl00$ContentPlaceHolder1$txtEmployerCompanyName' : "",
    'ctl00$ContentPlaceHolder1$txtFirstName' : "",
    'ctl00$ContentPlaceHolder1$txtLastName' : "",
    'ctl00$ContentPlaceHolder1$btnSearch' : "Search"
    }

# Carry cookies across requests so the ASP.NET session survives the POST.
cookie_jar = cookielib.LWPCookieJar()
cookie = urllib2.HTTPCookieProcessor(cookie_jar)

opener = urllib2.build_opener(cookie)


# Supplying a data argument makes urllib2 send a POST with that body.
req = urllib2.Request(url, urllib.urlencode(params), http_header)


res = opener.open(req)
html = res.read()
print html
"""
open("tmp.html", "w").write(html)
body = html
"""

Please help me figure out what is going wrong.
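
The usual stumbling block with ASP.NET WebForms pages like this one is that a postback is rejected unless the hidden __VIEWSTATE and __EVENTVALIDATION fields issued with the initial GET are echoed back in the POST body, and neither appears in params above. Here is a minimal sketch of that round trip with the same urllib2 stack; the regex assumes the standard WebForms markup where each hidden input has matching name and id attributes, so treat it as an illustration rather than a tested fix.

import re

# Fetch the search page once so the server hands out its hidden state fields.
page = opener.open(url).read()

def hidden_field(field_id, html):
    # Pull the value attribute of a hidden input located by its id.
    m = re.search(r'id="%s" value="([^"]*)"' % field_id, html)
    return m.group(1) if m else ''

# Echo the server-generated state back along with the search parameters.
params['__VIEWSTATE'] = hidden_field('__VIEWSTATE', page)
params['__EVENTVALIDATION'] = hidden_field('__EVENTVALIDATION', page)

req = urllib2.Request(url, urllib.urlencode(params), http_header)
res = opener.open(req)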

1 answer:

Answer 0 (score: 1)

I was able to achieve what I was looking for using Selenium.

import time

from scrapy import Selector
from selenium import webdriver

driver = webdriver.Firefox()
links = ['', '', '', '', '']  # target URLs were left blank in the original post
for each in links:
    driver.get(each)
    time.sleep(2)
    # Expand the full list of results before scraping the page.
    driver.find_element_by_id("showAll").click()
    time.sleep(4)
    source = driver.page_source
    sel = Selector(text=source, type="html")
    apartment_listing = sel.xpath('//section[@class="placardHeader"]//a[@class="placardTitle"]//@href').extract()
    with open(r"C:\Users\ssamant\Desktop\Client\Anida\Phase_II\Apartments\apartment_listing.csv", "ab") as export:
        for each1 in apartment_listing:
            export.write('{}\n'.format(each1))
    # Click through the next 21 pages of results and append their listings.
    i = 0
    while i < 21:
        driver.find_element_by_class_name('next').click()
        time.sleep(2)
        source1 = driver.page_source
        sel1 = Selector(text=source1, type="html")
        apartment_listing1 = sel1.xpath('//section[@class="placardHeader"]//a[@class="placardTitle"]//@href').extract()
        with open(r"C:\Users\ssamant\Desktop\Client\Anida\Phase_II\Apartments\apartment_listing.csv", "ab") as export:
            for each2 in apartment_listing1:
                export.write('{}\n'.format(each2))
        i = i + 1
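
One fragile spot in the code above is the fixed time.sleep() calls, which race the page load. A sketch of the same wait done explicitly with Selenium's WebDriverWait (the placardTitle class is taken from the XPath above; link stands in for one of the elided URLs, and the 10-second timeout is an arbitrary choice):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Firefox()
driver.get(link)  # hypothetical: one of the listing URLs from the loop above
# Block until at least one listing title is present, up to 10 seconds.
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CLASS_NAME, "placardTitle")))
source = driver.page_source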