通过传递参数从网站读取数据

时间:2018-02-28 17:12:28

标签: python python-3.x beautifulsoup python-requests

import requests
from lxml import html
from bs4 import BeautifulSoup

# NOTE: this session is created but never used below -- both requests go
# through the module-level requests.post / requests.get helpers instead.
session_requests = requests.session()

sw_url = "https://www.southwest.com"
sw_url2 = "https://www.southwest.com/flight/select-flight.html?displayOnly=&int=HOMEQBOMAIR"

# Disabled earlier attempt, kept for reference:
#result = session_requests.get(sw_url)
#tree = html.fromstring(result.text)

# Search criteria for the flight query (MCI -> DAL, one adult).
payload = {
    "name": "AirFormModel",
    "origin": "MCI",
    "destination": "DAL",
    "departDate": "2018-02-28T06:00:00.000Z",
    "returnDate": "2018-03-03T06:00:00.000Z",
    "tripType": "true",
    "priceType": "DOLLARS",
    "adult": 1,
    "senior": 0,
    "promoCode": "",
}

# Disabled alternative payload, kept for reference:
#{
#    'origin': 'MCI',
#   'destination': 'DAL',
#   'departDate':'2018-02-28T06:00:00.000Z',
#   'returnDate':'2018-03-01T06:00:00.000Z',
#   'adult':'1'
#}

# The payload is handed over as params=, i.e. as URL query parameters of the
# POST rather than as a request body.
p = requests.post(sw_url, params=payload)
#print(p.text)
print(p.content)
p1 = requests.get(sw_url2)

# Run the same look-ups over both responses: the POST result first, then the
# plain GET of the select-flight page.
for response in (p, p1):
    soup = BeautifulSoup(response.text, 'html.parser')

    print(soup.find("div", {"class": "productPricing"}))
    pr = soup.find_all("span", {"class": "currency_symbol"})

    # Show each currency-symbol span followed by the node right after it.
    for tag in pr:
        print(tag)
        print('++++')
        print(tag.next_sibling)

    print(soup.find("div", {"class": "twoSegments"}))

我需要查询特定日期两个地点之间的航班价格。我通过浏览器开发者工具(检查器)中的会话信息确定了所需的参数,并将它们放进了POST请求中。

我不确定自己哪里做错了,但我无法正确读取这些标签中的数据,什么都没有打印出来。

编辑:4/25/2018

我现在正在使用以下代码,但它似乎没有帮助。请指教。

import threading
from lxml import html
from bs4 import BeautifulSoup
import time
import datetime 
import requests 


def worker(oa,da,ods):
    """Thread worker: POST one one-way fare search and print the raw reply.

    Prints a timestamped start line, the response object, the raw response
    body and a timestamped end line. Nothing is parsed or returned.

    Args:
        oa: Origin airport code, sent as ``originAirport``.
        da: Destination airport code, sent as ``destinationAirport``.
        ods: Outbound date string, sent as ``outboundDateString`` (the caller
            in this file formats it as MM/DD/YYYY).

    Returns:
        None.
    """
    print (oa + ' ' + da + ' ' + ods + ' ' +  str(datetime.datetime.now()))
    url = "https://www.southwest.com/api/air-booking/v1/air-booking/page/air/booking/shopping"
    # No 'content-length' entry on purpose: the value captured from the
    # browser ('454') does not describe the body built here, and requests
    # fills in the correct length for the body it actually sends.
    rh = {
    'accept': 'application/json,text/javascript,*/*;q=0.01',
    # NOTE(review): 'br' is advertised here; confirm the installed HTTP stack
    # can decode brotli, otherwise r.content may not be readable.
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.5',
    'cache-control': 'max-age=0',
    'content-type': 'application/json',
    # NOTE(review): the referer is pinned to an MCI -> LAS search on
    # 2018-05-29 regardless of oa/da/ods -- confirm the server does not care.
    'referer': 'https://www.southwest.com/air/booking/select.html?originationAirportCode=MCI&destinationAirportCode=LAS&returnAirportCode=&departureDate=2018-05-29&departureTimeOfDay=ALL_DAY&returnDate=&returnTimeOfDay=ALL_DAY&adultPassengersCount=1&seniorPassengersCount=0&fareType=USD&passengerType=ADULT&tripType=oneway&promoCode=&reset=true&redirectToVision=true&int=HOMEQBOMAIR&leapfrogRequest=true',
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
    }

    # Search criteria for a one-way trip with a single adult passenger.
    # NOTE(review): field names are taken over unchanged; whether this JSON
    # endpoint expects exactly these keys is not visible here -- confirm.
    fd = {
    'returnAirport':'',
    'twoWayTrip':'false',
    'fareType':'DOLLARS',
    'originAirport':oa,
    'destinationAirport':da,
    'outboundDateString':ods,
    'returnDateString':'',
    'adultPassengerCount':'1',
    'seniorPassengerCount':'0',
    'promoCode':'',
    'submitButton':'true'
    }
    with requests.Session() as s:
        # Fix: the search criteria were built but never sent. The declared
        # content type is application/json, so the dict goes out as a JSON
        # body via json=.
        r = s.post(url, headers=rh, json=fd)
#        soup = BeautifulSoup(r.content,'html.parser')
#        soup = BeautifulSoup(r.content,'lxml')
        print(r)
        print(r.content)
    print (oa + ' ' + da + ' ' + ods + ' ' + str(datetime.datetime.now()))
    return


# Disabled DB connection kept for reference; the inline password has been
# redacted because credentials do not belong in source text.
#db = MySQLdb.connect(host="localhost",user="root",passwd="<redacted>",db="garmin")
rcount = 0  # not used anywhere in the visible code

tdelta = 55  # how many days ahead of today the search date lies
#print(strt_date)
threads = []
count = 1
thr_max = 2  # not used anywhere in the visible code
# Airport codes laid out as consecutive (origin, destination) pairs.
r = ["MCI","DEN","MCI","MDW","MCI","DAL"]

search_day = datetime.date.today() + datetime.timedelta(days=tdelta)
strt_date = search_day.strftime("%m/%d/%Y")

# count starts at 1 and advances by 2, so with the `< 2` bound the loop body
# runs exactly once, for the first pair only (r[0] -> r[1]).
while count < 2:
    origin, destination = r[count - 1], r[count]
    t = threading.Thread(
        target=worker,
        name=origin + destination,
        args=(origin, destination, strt_date),
    )
    threads.append(t)
    t.start()
    count += 2

1 个答案:

答案 0 :(得分:2)

当您说在浏览器的检查器中查看会话信息时,我假设您指的是“网络”(Network)选项卡。如果是这样,您确定记下的是实际发送的正确数据吗?

以下是浏览器发送请求并随后获取到您所需页面的URL:

# Endpoint the browser POSTs the flight-search form to.
url = 'https://www.southwest.com/flight/search-flight.html'

您在请求中没有使用请求头(headers),而在我看来,某些情况下必须传递它们。以下是浏览器发送的请求头:

:authority:www.southwest.com
:method:POST
:path:/flight/search-flight.html
:scheme:https
accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
accept-encoding:gzip, deflate, br
accept-language:en-US,en;q=0.9
cache-control:max-age=0
content-length:564
content-type:application/x-www-form-urlencoded
origin:https://www.southwest.com
referer:https://www.southwest.com/flight/search-flight.html?int=HOMEQBOMAIR
upgrade-insecure-requests:1
user-agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36

注意:

  1. 我删除了cookie请求头,因为如果您使用会话(session),requests会自动处理它。

  2. 前四个请求头(以冒号(':')开头的那些)无法通过Python的requests传递,所以我跳过了它们。

  3. 以下是我用来传递请求头的dict

    # Request headers mirroring the browser's form POST to search-flight.html.
    rh = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9',
        'cache-control': 'max-age=0',
        # NOTE(review): fixed length copied from the captured browser request;
        # requests is expected to compute Content-Length from the body it
        # sends, so this entry is presumably redundant -- confirm.
        'content-length': '564',
        # Form-encoded body, matching a post that passes the fields via data=.
        'content-type': 'application/x-www-form-urlencoded',
        'origin': 'https://www.southwest.com',
        'referer': 'https://www.southwest.com/flight/search-flight.html?int=HOMEQBOMAIR',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
    }
    

    以下是浏览器发送的表单数据:

    # Form fields for a round-trip search MCI -> DAL, out 02/28/2018 and back
    # 03/01/2018, one adult, fares in dollars.
    fd = {
        'toggle_selfltnew': '',
        'toggle_AggressiveDrawers': '',
        'transitionalAwardSelected': 'false',
        'twoWayTrip': 'true',
        'originAirport': 'MCI',
        # Display-name field left out; only the airport code is sent.
        # 'originAirport_displayed': 'Kansas City, MO - MCI',
        'destinationAirport': 'DAL',
        # Display-name field left out; only the airport code is sent.
        # 'destinationAirport_displayed': 'Dallas (Love Field), TX - DAL',
        'airTranRedirect': '',
        'returnAirport': 'RoundTrip',
        'returnAirport_displayed': '',
        # Dates are MM/DD/YYYY strings.
        'outboundDateString': '02/28/2018',
        'outboundTimeOfDay': 'ANYTIME',
        'returnDateString': '03/01/2018',
        'returnTimeOfDay': 'ANYTIME',
        'adultPassengerCount': '1',
        'seniorPassengerCount': '0',
        'promoCode': '',
        'fareType': 'DOLLARS',
        'awardCertificateToggleSelected': 'false',
        'awardCertificateProductId': ''
    }
    

    请注意,我注释掉了上面的两个项目,但它没有任何区别。我假设你只有位置代码,而不是全名。如果你有它们或者你可以从页面中提取它们,你也可以将它们与其他数据一起发送。

    我不知道这是否有影响,但我是用data而不是params来传递表单数据的

    # Post the form through a session (which looks after cookies), then parse
    # the returned page with the lxml parser.
    with requests.Session() as session:
        response = session.post(url, data=fd, headers=rh)
        soup = BeautifulSoup(response.content, 'lxml')
    

    最后,结果如下:

    >>> soup.find('span', {'class': 'currency_symbol'}).text
    '$'