使用 requests 抓取网站(POST 请求,JS?)

时间:2017-11-13 19:46:34

标签: javascript python-3.x cookies web-scraping python-requests

我正在尝试抓取这个网站:

http://www.finanzen.net/historische-kurse/Daimler

并使用以下源代码:

from cookies import cookies
import datetime
import requests
import time
import webbrowser


def download(number,
             isin,
             start=datetime.date(1998,1,1),
             end=None,
             dst="raw"):
    """Request the historical-prices page for a date range and save the HTML.

    Sends a GET request with the form fields as query parameters and the
    imported ``cookies`` mapping, prints "OK" or "FAIL!" depending on
    whether the hard-coded marker date "08.11.2017" occurs in the response
    text, and writes the response text to ``output.html``.

    Args:
        number: Accepted for the caller's convenience; not used by the body.
        isin: Accepted for the caller's convenience; not used by the body.
        start: First day of the requested range (``datetime.date``).
        end: Last day of the requested range (``datetime.date``). ``None``
            means "today", resolved at call time rather than once when the
            function is defined.
        dst: Accepted for the caller's convenience; not used by the body.
    """
    # ToDo -- I want to give a tag and not an url!
    url = "http://www.finanzen.net/historische-kurse/daimler"

    if end is None:
        end = datetime.date.today()

    # Field names are sent exactly as the original script sent them; the
    # values are now derived from the start/end arguments.
    payload = {"inTag1"    : str(start.day),
               "inMonat1"  : str(start.month),
               "inJahr1"   : str(start.year),
               "inTag2"    : str(end.day),
               "inMonat2"  : str(end.month),
               "inJahr2"   : str(end.year),
               "strBoerse" : "XETRA",
               "pkBHTs"    : int(time.time())}

    r = requests.get(url, params=payload, cookies=cookies)

    # Crude success check: looks for one known trading date in the page.
    if "08.11.2017" in r.text:
        print("OK")
    else:
        print("FAIL!")

    # Explicit encoding so non-ASCII page text (e.g. German umlauts) does not
    # depend on the platform's default encoding.
    with open("output.html", "w", encoding="utf-8") as f:
        f.write(r.text)


if __name__ == "__main__":
    # Manual smoke test: fetch the page once, then show the saved result.
    print("Test: download()")

    first_day = datetime.date(1998, 1, 1)
    last_day = datetime.date.today()
    download(1510210323, "DE0007100000", start=first_day, end=last_day, dst="raw")

    print("Done.")

    # Open the file written by download() in the default browser.
    webbrowser.open("output.html")

我还有第二个脚本(cookies.py),它提供从我的浏览器中导出的 cookie 数据:

cookies = {'CAP' : 'data=44a1e1f46fef0411bf06d9bfc501913f',
           'CUID' : 'N,1510569302850:ALHGLuQAAAAPTiwxNTEwNTY5MzAyODUwVdj35/i8kfuLw5RmnsCECh6uWduJEJHPHe44+gmS5k1OeVohiY2UE0s8Toc6Z1KsPkSIOyvb0rHFvfBB5GtZD0BeUVeUq8xKkDIkDqq2RsE7AvdO9c+GoqElRytvxjPuoExKFUZ7sMl3+ugTDvQsjM0q6iEkcfYTCjZcqRhGJ2JicnT0yZI8NIINqvt1OUufo4jtHTgznYHCgSG8lxydqzv+Cax90XRsvKoUEzTfJCxzqryt3rkXiy4IMEOrTMxZOZCoT0HO3hgghkd3XyzOhhr70tLnPbY4GxPkWrcXy4y+7xHwwoX+jmJGiNvEJod8mQF3QkkDSN+uwmTlAgy7Yg==',
           'MI' : '1',
           'OPTOUTMULTI' : '0:0%7Cc2:0',
           '__utma' : '99761801.1635127051.1447939999.1510562258.1510569245.10',
           '__utmb' : '99761801.1.10.1510569245',
           '__utmc' : '99761801',
           '__utmt_UA-1858090-1' : '1',
           '__utmz' : '99761801.1503872185.2.1.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided)',
           '_ga' : 'GA1.2.1635127051.1447939999',
           '_gid' : 'GA1.2.216894857.1510518215',
           'finpopup2' : 'n=3&d=08%2E09%2E2017',
           'fintargeting' : 'v=1&h=0x000000&hd=131117091117091117091117091117091117',
           'finvisit' : 'v=5&p=16&d=13%2E11%2E2017',
           'mjdkyj' : 'AKsRol8bjClrOgYBI9F53uwo0572ZvzG_ifsQhL0W6CGIPDli067aLH682nhCzAvZJLwgmI_hfnp1G_cO6_R7La4pHyjXqGz7w',
           'utag_main' : 'v_id:015e25c45b49001bad6f158e01b004066002a05e00838$_sn:10$_ss:0$_st:1510571043449$dc_visit:10$dc_visit_dip-main:10$dip_times_empty_enrichment:26$ses_id:1510569240676%3Bexp-session$_pn:1%3Bexp-session$collectCookieMode:3rdParty%3Bexp-session$dc_event:1%3Bexp-session$dip_events_this_session:1%3Bexp-session$dc_event_dip-main:1%3Bexp-session$dc_region_dip-main:eu-central-1%3Bexp-session$dc_region:eu-central-1%3Bexp-session',
           'xdefcc' : 'G18e8ffb3a46fc000807955c49556bf4cc',
           '1P_JAR' : '2017-11-13-10',
           'AID' : 'AJHaeXJrF9XjJOxZm8l4doMQZS4yMOPes3h5NKKcWJZxIyGSOAjtzw',
           'APISID' : 'VILFXotMggFxtb9h/AcPvkxMR5pWR0AzUo',
           'ASPSESSIONIDQSRTTSCC' : 'GLLJJMHAIDCBGEEODOLMADEN',
           'ASPSESSIONIDSSRSRSBD' : 'PGHHNMHANNCJJKIPNALFCCMO',
           'CAP' : 'data=44a1e1f46fef0411bf06d9bfc501913f',
           'CONSENT' : 'YES+DE.de+20150726-13-0',
           'CUID' : 'N,1510569302850:ALHGLuQAAAAPTiwxNTEwNTY5MzAyODUwVdj35/i8kfuLw5RmnsCECh6uWduJEJHPHe44+gmS5k1OeVohiY2UE0s8Toc6Z1KsPkSIOyvb0rHFvfBB5GtZD0BeUVeUq8xKkDIkDqq2RsE7AvdO9c+GoqElRytvxjPuoExKFUZ7sMl3+ugTDvQsjM0q6iEkcfYTCjZcqRhGJ2JicnT0yZI8NIINqvt1OUufo4jtHTgznYHCgSG8lxydqzv+Cax90XRsvKoUEzTfJCxzqryt3rkXiy4IMEOrTMxZOZCoT0HO3hgghkd3XyzOhhr70tLnPbY4GxPkWrcXy4y+7xHwwoX+jmJGiNvEJod8mQF3QkkDSN+uwmTlAgy7Yg==',
           'HSID' : 'AkD2AyVb5Z9wR9QT-',
           'MI' : '1',
           'NID' : '117=IbpFdbi-srSVK7HAD-b_ENXheFKQqlI0MtwsBKkQ5kMzKE_YMKHN7THu3mgYrYZZ45mSgYb1r67FfyX30QDu3pT22YEzK4Ylj_DbQ0BwbLx36NJV68KmDoivU9zD38hSPe9oPS89Z8spzbp9mPSEuYOhhznowFxCS0ZnZDhYIKsnwdMEYWqDVhQ0fUqnpnTQFV1VWb1-7dZJ5LvnKKWf5xHi5wNhL5YQtaFdYEb2rY0L3HTR0IeZMu3ZIR-2pOYQYspwyg',
           'NID' : '110=Chz_C4sXWBfkLwySlpc-od0DauOGbWPjjZ_UYJgVYSeOXLWaQayrm_PHSMfnI4bkHwFQjWq0atuDmXEGq39-uJHLpanLu7kQDs9WTm004KBx7nWacN1-_x_p05gbw8wx',
           'OPTOUTMULTI' : '0:0%7Cc2:0',
           'POPUPCHECK' : '1510604610799',
           'SAPISID' : 'e-07IvwMgsqnc1KL/AWrA0YaffjZEXetqg',
           'SID' : 'FwUbfPob8j7z4QAbMUsPD7HI4FwPoOM7wPo9cyTwyAveBe0fy86idhBKLMz8mn93l-pyPA.',
           'SIDCC' : 'AE4kn7_ynLaaOiVILBfgv5-_j3I-18GWRw0_rgIiJGSRP0YLV8zwv3Me80u9dqJI_a58y8xXxMCyLlu7qBw2rQ',
           'SSID' : 'ADQhejhWBta7RIl3q',
           '__utma' : '99761801.1635127051.1447939999.1510562258.1510569245.10',
           '__utmb' : '99761801.1.10.1510569245',
           '__utmc' : '99761801',
           '__utmt_UA-1858090-1' : '1',
           '__utmz' : '99761801.1503872185.2.1.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided)',
           '_ga' : 'GA1.2.1635127051.1447939999',
           '_gid' : 'GA1.2.216894857.1510518215',
           'anj' : """dTM7k!M4/C@-S3@:5]qk`_#I2PQ7s]@!7PT-Q!$Rkf%Tam0H]QFpc6s(!H!k>jP5fr<Pn!Sse@DnFc!m#J[!_?'td[$Y+EO30JW48u`al*p]+pQ%.IaIz(eLt1a=RpN+IIuSI6>2pVG*(l/YR]55R_8*)f_>Bgu2u'nh]2pe>#n88><bUjXaW]vuR5bchvb:ei<F/=Ow02i-18D:GO_nsX`cEj282$Vr6Zz2%R'kHi(@n0veK7e2'oM:Nd`*Ju!l/-nAbGM6F3YP-^KnNXwhV1.yggNVGk!<5]]1?:av7hOQ/L#d9Q``r=MjWhED9SGjg%*!X1%w5kp?T<s?7j=KIM8Y$p1^aigtynxfq/5MBw3X2Bc8E5EILM(^k=p=B+%ra1D^CUB]=ExTXvZx7go4f/B(8iGBZRtY(q7_H=>?rY$IpGd).%<kME<mKK:vp7SPIctG0J8vv3y6MW*)3'5N9/Ju_Oiw98wU4016EdBD9et%n*bF_uZv<jF>'E9@VDsp3i^mu'h2R4Y8:2]ko2YdA.(6d3^+<+4Dyo=Z-Mi./o1lsVmD61*JVd'oKHFeNRD*7d:wbm(7!4vs*Ahw+.XI2d?c4#.oV1%5pnBbLG^!1N1qfcZMzhBp?xs_#Ch3o$L$hlWShx4HF@u5e9Agc*S[s.gz6MTIX6:[Rm`v-pH_<(#8YlG`P_(PHOI*v*N^mA!>5Nmh8U?<WbeUp48=ux1N@quBj7U2(Kz1W7/>7`Q2A-`C7X6N_.bl/V/43$YC8N6zsH:M$KCuLYMOsnBP<3v3h)rg2aA2V?PAMJ71Lc*oW?!I^KQ<CPZ#RByeri1*v-M93nj?6#tLa?5o-[E*.ys9IO/2KA)?C^vETDT%wlq#n=7kK9_U`Q=HTXf]UwV(h8H2]_x^UElV<-Jd.hF$pC#O#+2f1I6i44C<NiDE$C2Lhv=z#pWxmO?o4I*aMM2t])V=p>>i[n-caqAWXA`#7(h2Ka8_dV7bgcPn_h./y#T[!AyHDHC-4!t3+wnhevgtid(Y=^JnIDS6@J$jP$wO`bUt5^neF+TZMJNpbwFfE=<dL.627>80_)>!ViHO9vIz7]ebKf-WPz3-%7p]w_1nPBQvyJ_r*^cg['SV3T/t<?UK_ZqgOio<1W])V^'PW^Ev7^_`McptJ3`gig?Hh$DFMgh[yImN`wByK71`iKGSxl_8Fq!?p7WW)]R:8XVy+fJZ6zYJc@ro-!w@'_WN`A7'O]Slqu3gh/RPX!Ps[gJp9IA439Ki3Ak!II7Lgy997x07G][')<2wjcG9H3f'SwP/wKC>o=F7#3mMF9R#1U[5^SQCb:VD*7$G9pj9i7hJq:DMe)rg`lO-kf#2>mv3gzo_Jf?`*]*afAw3#21o/#YC<A9@/!41/zOjJ'<W2Z@?6EN2's]ZiejVf$y!mpqR7b2[ulW:gR.5Bcv[6>C9'Gl7uJHY1%@j!sP>>!bCYTvKIL>wV*`Hq.2q/FBSY?F%_iT7I%AZ<.`7@*)oo+DWqHvHvnK?4OQEzdX+4g.Mrhj(H<WjfG<4gIa([228)?u)*nhKni6oaXD!1YJEEVg07mn1_Ad:6cO37jyJ3?2L?+)$PaLfTKocjN55L@E+cMm4)k3jv.$z_rTgYiga4/""",
           'finpopup2' : 'n=3&d=08%2E09%2E2017',
           'fintargeting' : 'v=1&h=0x000000&hd=131117091117091117091117091117091117',
           'finvisit' : 'v=5&p=16&d=13%2E11%2E2017',
           'i00' : '002963d5e4108c32058a5679f0001%3B5a097588%3B5a3b7d3c',
           'icu' : 'ChgI95w7EAoYAiACKAIwuPaX0AU4AkACSAIQuPaX0AUYAQ..',
           'mjdkyj' : 'AKsRol8bjClrOgYBI9F53uwo0572ZvzG_ifsQhL0W6CGIPDli067aLH682nhCzAvZJLwgmI_hfnp1G_cO6_R7La4pHyjXqGz7w',
           'sess' : '1',
           'utag_main' : 'v_id:015e25c45b49001bad6f158e01b004066002a05e00838$_sn:10$_ss:0$_st:1510571043449$dc_visit:10$dc_visit_dip-main:10$dip_times_empty_enrichment:26$ses_id:1510569240676%3Bexp-session$_pn:1%3Bexp-session$collectCookieMode:3rdParty%3Bexp-session$dc_event:1%3Bexp-session$dip_events_this_session:1%3Bexp-session$dc_event_dip-main:1%3Bexp-session$dc_region_dip-main:eu-central-1%3Bexp-session$dc_region:eu-central-1%3Bexp-session',
           'uuid2' : '869611879133359501',
           'xdefcc' : 'G18e8ffb3a46fc000807955c49556bf4cc'}

如果我在这些 cookie 中泄露了任何个人信息,请告诉我。谢谢。

我运行脚本,期望得到与在浏览器中手动提交表单时相同的数据,但这并不奏效。

在我的控制台中,我得到:

Test: download()
FAIL!
Done.

由我的脚本生成的 output.html 文件包含“Bitte wählen Sie das Start- und Enddatum des Zeitraumes, für den Sie historische Kursdaten anzeigen möchten.”(即“请选择您要显示历史价格的时间段的开始和结束日期”),而不是数据。这是一条错误消息,提示我需要向表单提交数据——但正如你所见,我已经把数据发送给网站了!

目前我不知道应该修改什么,才能得到数据而不是这条消息。能请您帮帮我吗?感谢您的帮助,也请原谅我的英语!

1 个答案:

答案 0 :(得分:2)

您可以改用 selenium 来得到与浏览器中相同的结果。因此我修改了您的代码,以获取您想要的数据。

# -*- coding: utf-8 -*-
import datetime
import requests
import time
import webbrowser
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import io

def getBoersenIndex(boerse):
    """Return the 1-based position of an exchange in the dropdown list.

    The exchange dropdown on the website is addressed by option position,
    so the caller needs the position that belongs to an exchange name.

    Keyword arguments:
    boerse -- name of the exchange exactly as shown in the website dropdown

    Returns the position (the first entry is 1), or None when the name is
    not in the list.
    """
    # Order matters: the position in this tuple is the returned index.
    knownBoersen = (
        'Budapest',
        'Berlin',
        'Baader Bank',
        'BX Swiss',
        'Bats',
        'Chi-X',
        'Düsseldorf',
        'Frankfurt',
        'Hamburg',
        'Hannover',
        'Lang und Schwarz',
        'München',
        'Mexiko',
        'Nasdaq OTC',
        'XETRA',
        'Prag',
        'Stuttgart',
        'Swiss Exchange',
        'Tradegate',
        'Wien',
        'Quotrix',
    )

    matches = (
        position
        for position, candidate in enumerate(knownBoersen, start=1)
        if candidate == boerse
    )
    # First hit wins; None signals "no such exchange".
    return next(matches, None)


def download(startDay, startMonth, startYear, boerse, url, driverPath="chromedriver"):
    """Submit the historical-prices form in headless Chrome and save the page.

    Loads *url*, picks the start date, today's date as end date and the
    exchange in the form's dropdowns, clicks the submit button, prints "OK"
    or "FAIL!" depending on whether the hard-coded marker date "08.11.2017"
    occurs in the resulting page source, and writes that page source to
    ``output.html`` (UTF-8).

    Args:
        startDay: Day of month of the range start; used as option position.
        startMonth: Month of the range start; used as option position.
        startYear: Year of the range start (1998 maps to option 1).
        boerse: Exchange name as understood by ``getBoersenIndex``.
        url: Address of the page that contains the form.
        driverPath: Path to the chromedriver executable handed to
            ``webdriver.Chrome``. The default "chromedriver" relies on the
            executable being found on PATH.

    Raises:
        ValueError: If ``getBoersenIndex`` does not know *boerse*.
    """
    boersenIndex = getBoersenIndex(boerse)
    if boersenIndex is None:
        # Without this check the XPath below would silently become
        # "option[None]" and fail later with a confusing browser error.
        raise ValueError("unknown boerse: {!r}".format(boerse))

    today = datetime.date.today()

    # (select name, option position) in the order the form is filled in.
    # The year dropdowns use the index as a value starting with 1998 = 1.
    selections = [
        ("inTag1", startDay),
        ("inMonat1", startMonth),
        ("inJahr1", startYear + 1 - 1998),
        ("inTag2", today.day),
        ("inMonat2", today.month),
        ("inJahr2", today.year + 1 - 1998),
        ("strBoerse", boersenIndex),
    ]

    # this is my webdriver implementation, you may use another one
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    browser = webdriver.Chrome(driverPath, chrome_options=options)

    try:
        browser.get(url)
        # Fixed one-second pause after loading; presumably gives the page
        # time to finish rendering before the form is touched.
        time.sleep(1)

        for selectName, optionIndex in selections:
            xpath = "//select[@name='{}']/option[{}]".format(selectName, optionIndex)
            browser.find_element_by_xpath(xpath).click()
        browser.find_element_by_css_selector("span.button").click()

        # Read the page once so the check and the saved file see the same text.
        pageSource = browser.page_source
    finally:
        # Always shut the headless browser down, even if a lookup failed.
        browser.quit()

    # Crude success check: looks for one known trading date in the page.
    if "08.11.2017" in pageSource:
        print("OK")
    else:
        print("FAIL!")

    with io.open("output.html", "w", encoding='utf8') as f:
        f.write(pageSource)


if __name__ == "__main__":
    # Manual smoke test: fetch XETRA prices since 1 January 1998, then view them.
    print("Test: download()")

    download(
        startDay=1,
        startMonth=1,
        startYear=1998,
        boerse='XETRA',
        url='http://www.finanzen.net/historische-kurse/daimler',
    )

    print("Done.")

    # Open the file written by download() in the default browser.
    webbrowser.open("output.html")