我正尝试使用以下脚本从一个网页获取一些表格内容。要手动得到这些内容,必须先在下拉菜单(如这张图片所示)中选择选项,
然后再点击 Submit 按钮。我试图相应地模拟这个 HTTP POST 请求。但是,我可能在某些地方出错了,导致脚本无法正常工作。具体来说,这就是我要获取的内容。
这是我尝试过的方法:
import requests
from bs4 import BeautifulSoup
# Store-locator page; the search form posts back to this same address.
URL = 'https://www.lgindiasocial.com/microsites/brand-store-web-five/locate.aspx'

# Extra request headers applied to the session just before the form POST.
headers = {
    'x-microsoftajax': 'Delta=true',
    'origin': 'https://www.lgindiasocial.com',
    'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'referer': 'https://www.lgindiasocial.com/microsites/brand-store-web-five/locate.aspx',
}

with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'

    # Fetch the page once and seed the payload with every named <input>
    # (name -> value, empty string when the input has no value attribute).
    landing = s.get(URL)
    form_soup = BeautifulSoup(landing.text, "lxml")
    payload = {
        field['name']: field.get('value', '')
        for field in form_soup.select('input[name]')
    }

    # Overlay the script-manager marker and the three dropdown selections.
    payload['ScriptManager1'] = 'UpdatePanel1|btnsubmit'
    # NOTE(review): this key ends with a colon ('ddlState:'), unlike the
    # other dropdown keys; it is reproduced exactly as written.
    payload['ddlState:'] = 'Assam'
    payload['ddlCity'] = 'Golaghat'
    payload['ddllocation'] = 'Golaghat'

    # Submit the form and print the first <table> found in the reply
    # (select_one yields None when there is no match).
    s.headers.update(headers)
    reply = s.post(URL, data=payload)
    result_soup = BeautifulSoup(reply.text, "lxml")
    item = result_soup.select_one("table")
    print(item)
运行脚本时,输出为None。
如何使用 POST 请求从搜索结果中获取表格内容?
编辑:如果我直接从开发者工具中复制请求负载(payload)的内容,并在脚本中原样使用这份负载,则会得到预期的结果。
import requests
from bs4 import BeautifulSoup
URL = 'https://www.lgindiasocial.com/microsites/brand-store-web-five/locate.aspx'
payload = "ScriptManager1=UpdatePanel1%7Cbtnsubmit&hidcity=&ddlState=Assam&ddlCity=Golaghat&ddllocation=Golaghat&__EVENTTARGET=&__EVENTARGUMENT=&__LASTFOCUS=&__VIEWSTATE=M%2BqldpZhV90EX2sawXMrHD7jYtOMXnrPuP8XfVtS21GKmxK0YYuBnqm3I7tU%2BKMtFGZgzWpsYK%2FYJtfTBUK%2F0WobR21tjbWjdrZiXS5FlLcS6qgYMNKfqyZRcK13dbz667H7T6QZqpITTRSqsM%2BrM91VW989KXoknFdx0H6EkRFCJRu4WsBsUxeJnd5Lf5IAUN%2BTNKDYE5GuclDNKnmU1pMmHhrjKQysvYtw8cjD5DdDkNb7NDkLiVxm7DISyXZtVJyOBV6dFa%2Blm1%2FR9M7F2nyepARAl0XIiNP9dhFvomLNdlP%2BU%2FNyllJ5IXW4D%2Fl5Kfx5yaRP8XSKURtAc915i%2F2T48a0dyAR42tJ40eit1IWs7MCwgesNtF35zkuKN1SRhyhHqcnKjcMYW%2BkLqKsLvKpLQcDuXrIAzYyqlgJZ%2FlBQJo%2BiM4tTOH4mEqDkSZW%2Fk94KX1OM70s9%2FS%2Fd5trrHIgNoKw1bCRI8IQ41ZEicMsJPTp67KnqoMZz0F0cCmo%2F49zYkuHw0kqaZmKCrRUNW8Xcr%2F5A3AfNg%2FB8WURD0g2x%2BwzcLXDcVCJ6ngf0LdOc%2BTppM6EOZpTGJGjjDqK116tzWAOPfiJHgBuIPkiZJTaEHnwwjcYXuuLN%2FTgPFUJkXVjBSyRdCnPXsebInNd4Wsu2lnNdwZUO3rnNuu5eY%2FHf7YemcmCEzji%2FxLG%2FynnG0sG61TC1bJCyFw2E3V6ZGshbuqDfh7QQyxqPDEt2uaCN7s%2FOZ%2FwiXeVY2henUVBZSVrxUvF6QT0eO4SIY0OlNYBLK7cO4YG4zC0tURSBr7lZwR%2B%2FowLieNGSO7sOeLQVwL71GKnzBAOZVQH1hw%2B8FIRPoc0pn3v7RjK5CMgTtrZlar67Cv1lTi2nUyAIpX%2BhGkaQeOsg%2ByaIqDIo%2FWwcrg9VV9QP%2FdmwP8hTtq3KTVs0Ncja4Yvizm12BkEwWtMJ9fqzLBXt%2F2J2EjsG7GudgXypwSU7U8oY%2Fq%2BCk93y%2FeTr1ftEFbpGRTRm4hNVXeoCYRyuJceU%2BvO4U5E29ZPqBIolidYtKKH7lnRxKNk2BHtY93VNHPZEjTEDnHcGbgtHmxlBjHRQZlzJKWTjY5ccdFABihGx%2FzY0VCwaehpx2BWxy5qXqW1fX7e5uxxxHteYVt7YyrzYPsX%2B%2FlKiYwt23fsJzmmVkHwmu5%2FTSk1Ms9yJmBE%2B8pEF%2Bum01L8jRH4zxyTaD4s779uLZwLAUUzpi5cfseKTrjGv7uNjCpNci9BXbSdCdqrKa8aPiJX0lWUH9zid%2B8Jc7Jhx%2Bb6nzJpbZ8E9sPpUlcHVGUSzqixsiK91W%2FDDk2LCOvTqJJ9JXmy5cwRhL9r95okWq%2BDImTetFhdYk9%2F9VH3JsACpv4dqqdviEjjFpvmEp7SBMLSWw7toPUIRortPtriz3u9velTqNpHgmbmig8Znb%2F4Q8JrYfjPZzfRxN%2FuQXQyxUNUY2IsYbC5Bm7JWTMZe869muBdE%2FlMLujUkOFCXaOwZXuZHbr7neq0nro3RvYUggBLqxGFlG1Bp52iDNklcx8nfjVMOhOybfCMcxz6mq4Ew2hdLv4IslLRawI5u%2FPQe0vu0TG9LeBeR6Ok1sf72rWpvhD6yl4GTy8oJC1UglabWo8i5aMprxxAWuz%2BzLzizI3aRTQsl1MFKsD9gIGZsaFNAIb7gEgFgw%2B%2BSjTGR51mGES3sOUYXscI
JVBciBs3F9vnr8u5gfKD3hLwqvc4djKMBxVQfjLEs%2FQwb7mlOx8XodaV6uOrkiZpw2WZNja5RPBIp4VXeXKXIxqBNsNA4eGT%2Bx2b2JadVB8%3D&__VIEWSTATEGENERATOR=06ED1D24&__VIEWSTATEENCRYPTED=&__ASYNCPOST=true&btnsubmit=Submit"
# Replay the payload string captured from the browser's developer tools and
# print the first <table> in the response (None when no table is present).
with requests.Session() as s:
    s.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36',
        'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
    })
    reply = s.post(URL, data=payload)
    print(BeautifulSoup(reply.text, "lxml").select_one("table"))
答案 0 :(得分:2)
首先,您的代码中有一个小的拼写错误(键名里多写了一个冒号):
payload['ddlState:'] = 'Assam'
更大的问题与页面的构造方式有关。该页面有三个下拉菜单,每个下拉菜单在选择后都会发送一个POST请求。每个POST请求都会返回一个修改后的__VIEWSTATE,后续请求必须在其表单数据(payload)中携带这个新的__VIEWSTATE。
您的代码只从最初GET请求返回页面的表单input中获取了__VIEWSTATE,而实际上需要从上一个POST请求的响应中获取__VIEWSTATE。因此,以下方法应该可行:
# Walk the form in three POSTs (state, city, final submit). Before each POST
# the payload is rebuilt from the named <input> elements of the previous
# response, so whatever hidden fields that response carried are sent back.
with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
    landing = s.get(URL)
    soup = BeautifulSoup(landing.text, "lxml")

    # first POST = Select State
    payload = {
        field['name']: field.get('value', '')
        for field in soup.select('input[name]')
    }
    payload.update({
        'ScriptManager1': 'UpdatePanel1|btnsubmit',
        'ddlState': 'Assam',
        'ddlCity': 'Select City',
        'ddllocation': 'Select Location',
        '__EVENTTARGET': 'ddlState',
    })
    state_reply = s.post(URL, data=payload)
    soup = BeautifulSoup(state_reply.text, "lxml")

    # second POST = Select City
    payload = {
        field['name']: field.get('value', '')
        for field in soup.select('input[name]')
    }
    payload.update({
        'ScriptManager1': 'UpdatePanel1|btnsubmit',
        'ddlCity': 'Golaghat',
        'ddllocation': 'Select Location',
        '__EVENTTARGET': 'ddlCity',
    })
    city_reply = s.post(URL, data=payload)
    soup = BeautifulSoup(city_reply.text, "lxml")

    # third POST = Select Location
    payload = {
        field['name']: field.get('value', '')
        for field in soup.select('input[name]')
    }
    payload.update({
        'ScriptManager1': 'UpdatePanel1|btnsubmit',
        'ddlCity': 'Golaghat',
        'ddllocation': 'Golaghat',
        '__EVENTTARGET': '',
    })
    # The extra headers (defined with the question's script) are only added
    # for this last request.
    s.headers.update(headers)
    final_reply = s.post(URL, data=payload)
    soup = BeautifulSoup(final_reply.text, "lxml")

    # Print the first <table> of the final response (None if absent).
    item = soup.select_one("table")
    print(item)
这段代码还有优化的空间;我刻意写得直白一些,以便清楚地展示问题所在。
答案 1 :(得分:1)
您最后一次POST请求返回的结果如下文所示。它不包含您期望的表格,也不是完整的HTML文档。我怀疑这个页面通常是通过AJAX加载的,因此只更新了DOM的一部分,而在处理完这部分之后,又执行了更多JavaScript来加载您要查找的表格。我建议您使用selenium
之类的工具来驱动浏览器。例如,使用Chrome:
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
class WaitUntilElementIsStale:
    """Context manager that, on exit, waits for an element to become stale.

    The element to watch is captured when the manager is created. The body of
    the ``with`` statement is expected to trigger whatever replaces that
    element; on a clean exit the manager polls the captured element every
    0.1 seconds until a call on it raises ``StaleElementReferenceException``
    or ``timeout`` seconds have elapsed.

    Args:
        driver: Object used to look up the ``<html>`` element when no
            ``element`` is given.
        element: The element to watch. Takes precedence over ``driver``.
        timeout: Maximum number of seconds to keep polling.

    Raises:
        ValueError: If neither ``driver`` nor ``element`` is supplied.
    """

    def __init__(self, *, driver=None, element=None, timeout=10):
        # Explicit check rather than ``assert`` so the validation still runs
        # when Python is started with -O.
        if driver is None and element is None:
            raise ValueError("either driver or element must be provided")
        # find_element(By..., value) is used instead of the
        # find_element_by_* helpers so the class runs on Selenium 3 and 4.
        self.element = driver.find_element(By.TAG_NAME, 'html') if element is None else element
        self.timeout = timeout

    def __enter__(self):
        return None

    def __exit__(self, exc_type, exc_value, exc_traceback):
        # The with-body raised: do not wait, and return None so the
        # exception propagates unchanged.
        if exc_type is not None:
            return
        # A monotonic clock keeps the timeout immune to wall-clock changes.
        deadline = time.monotonic() + self.timeout
        while time.monotonic() < deadline:
            try:
                # poll the element with an arbitrary call
                self.element.find_elements(By.ID, "doesn't-matter")
            except StaleElementReferenceException:
                return
            time.sleep(0.1)
        # Timed out: fall through silently, as the caller's next lookup is
        # left to deal with the page state.
# Drive a headless Chrome through the three dropdowns, submit the form and
# print the first <table> of the rendered page.
options = webdriver.ChromeOptions()
options.add_argument("headless")  # comment out to see the browser
options.add_experimental_option('excludeSwitches', ['enable-logging'])
driver = webdriver.Chrome(options=options)
try:
    driver.implicitly_wait(10)  # 10 seconds is a bit of "overkill" but it doesn't hurt
    driver.get('https://www.lgindiasocial.com/microsites/brand-store-web-five/locate.aspx')

    # Elements are located with find_element(By.NAME / By.TAG_NAME, ...),
    # which works on Selenium 3 and 4, instead of find_element_by_*.

    # Capture the current city dropdown, pick the state, then wait for that
    # captured dropdown to go stale before touching the city dropdown.
    with WaitUntilElementIsStale(element=driver.find_element(By.NAME, "ddlCity")):
        Select(driver.find_element(By.NAME, "ddlState")).select_by_value('Assam')

    # Same pattern for the location dropdown after picking the city.
    with WaitUntilElementIsStale(element=driver.find_element(By.NAME, "ddllocation")):
        Select(driver.find_element(By.NAME, "ddlCity")).select_by_value('Golaghat')

    Select(driver.find_element(By.NAME, "ddllocation")).select_by_value('Golaghat')
    driver.find_element(By.NAME, 'btnsubmit').click()
    driver.find_element(By.TAG_NAME, 'table')  # implicitly wait for up to 10 seconds for this element to appear

    soup = BeautifulSoup(driver.page_source, "lxml")
    item = soup.select_one("table")
    print(item)
finally:
    # Always shut the browser down, even if a lookup above failed.
    driver.quit()
打印:
<table border="0" cellspacing="0" id="dladdress" rules="all" style="background-color:White;border-width:0px;width:500px;border-collapse:collapse;border:0px solid red;">
<tbody><tr>
<td style="border-width:0px;">
<div style="padding:5px;text-align: left;width: 85%;">
<div style="padding-top:10px; padding-bottom:5px; color:#d80546; ">LG BRAND STORE - BRAHMAPUTRA ELECTRONICS</div>
<span id="dladdress_ctl02_divaddress" style="font-weight:bold; padding-bottom:10px;">Complete Address : </span>
<span style="text-transform:capitalize">g.f. road, near das & co., golaghat, assam</span><br/>
<span id="dladdress_ctl02_divcontact" style="font-weight:bold; padding-bottom:10px;">Contact Person :</span>
<span style="text-transform:capitalize;">pravin kankani<br/>
<span id="dladdress_ctl02_divphone" style="font-weight:bold; padding-bottom:10px;">Contact No. :</span> <span id="dladdress_ctl02_lblcontact">9954305770 , </span><br/>
<span id="dladdress_ctl02_divemail" style="font-weight:bold; padding-bottom:10px;"> Email ID:</span> brahmaputralgshoppe@yahoo.com<br/></span>
</div>
</td>
</tr>
</tbody></table>
更新:
我看到您已经更新了问题。我假设您已经做过网络抓包,弄清了为得到最终结果,后续还发出了哪些AJAX POST请求。这样做非常依赖网站应用程序的内部实现保持不变,不过我想只要它们不变,您的代码就可以正常工作。
我已经更新了我的解决方案:不必在每次从一个下拉列表中做出选择之后、在下一个下拉列表中进行选择之前都固定等待1秒。例如,在州下拉列表中做出选择之后,当前的城市下拉列表会被重新创建(原来的城市下拉列表将变为“陈旧”(stale))。新代码只是反复轮询,等待当前的城市下拉列表变为陈旧,然后再去查找新的下拉列表。因此,浪费的时间应该很少。
第二个更改只是像您一样将完整的HTML加载到Beautiful Soup中。
当然,这段代码依赖于用户界面保持不变。不过,用户界面一旦发生变化是很容易察觉的,而且这段代码也很容易针对这种变化做出调整。
原始POST请求的结果
3256|updatePanel|UpdatePanel1|
<div class="container-fluid">
<div class="row">
<div class="col-md-12 text-center">Locate nearest LG Brand Store</div>
<div class="col-md-12 text-center">
<div class="row">
<div class="col-md-3"><select name="ddlState" onchange="javascript:setTimeout('__doPostBack(\'ddlState\',\'\')', 0)" id="ddlState" class="select">
<option selected="selected" value="Select State">Select State</option>
<option value="Andhra Pradesh">Andhra Pradesh</option>
<option value="Arunachal Pradesh">Arunachal Pradesh</option>
<option value="Assam">Assam</option>
<option value="Bangalore">Bangalore</option>
<option value="Bihar">Bihar</option>
<option value="Chandigarh">Chandigarh</option>
<option value="Chhattisgarh">Chhattisgarh</option>
<option value="Delhi">Delhi</option>
<option value="Goa">Goa</option>
<option value="Gujarat">Gujarat</option>
<option value="Haryana">Haryana</option>
<option value="Himachal Pradesh">Himachal Pradesh</option>
<option value="Jammu and Kashmir">Jammu and Kashmir</option>
<option value="Jharkhand">Jharkhand</option>
<option value="Karnataka">Karnataka</option>
<option value="Kerala">Kerala</option>
<option value="Madhya Pradesh">Madhya Pradesh</option>
<option value="Maharashtra">Maharashtra</option>
<option value="Manipur">Manipur</option>
<option value="Mizoram">Mizoram</option>
<option value="Nagaland">Nagaland</option>
<option value="Orissa">Orissa</option>
<option value="Punjab">Punjab</option>
<option value="Rajasthan">Rajasthan</option>
<option value="Tamilnadu">Tamilnadu</option>
<option value="Telangana">Telangana</option>
<option value="Tripura">Tripura</option>
<option value="Uttar Pradesh">Uttar Pradesh</option>
<option value="Uttarakhand">Uttarakhand</option>
<option value="West Bengal">West Bengal</option>
</select></div>
<div class="col-md-3"><select name="ddlCity" onchange="javascript:setTimeout('__doPostBack(\'ddlCity\',\'\')', 0)" id="ddlCity" class="select">
<option selected="selected" value="Select City">Select City</option>
</select></div>
<div class="col-md-3"><select name="ddllocation" id="ddllocation" class="select">
<option selected="selected" value="Select Location">Select Location</option>
</select></div>
<div class="col-md-3"><input type="submit" name="btnsubmit" value="Submit" onclick="return validate();_gaq.push(['_trackEvent', 'LOCATE US', 'Click', 'SUBMIT']);" id="btnsubmit" class="submit" /></div>
</div>
</div>
<div class="col-md-12 text-center mt-5">
<div class="col-md-12 text-center">
<div class="row">
<div class="col-md-6">
<div id="map" style="display:none;height: 360px; width:100%;margin-left:10px;border:1px solid #cccccc;"></div>
</div>
<div class="col-md-6"><!----------------------------MAP START---------------------------------->
<div id="divgrid" style="display:block;background-color:White;width:100%;height:360px;border:1px solid #cccccc;overflow-x:hidden;overflow-y:auto;">
<div>
</div>
</div>
</div>
</div>
<!----------------------------MAP END---------------------------------->
</div>
</div>
</div>
</div>
|0|hiddenField|__EVENTTARGET||0|hiddenField|__EVENTARGUMENT||0|hiddenField|__LASTFOCUS||1648|hiddenField|__VIEWSTATE|9aji4H+sHZgnrG7tl/cn055Jcu74N6ZhHGoSzOvQqlBrKXUD9/cBQ156dQZjy+TjfXw0Fkg3939tdE4izxjfd/hvGfV4nC0wmZIv3wKCnQdxKzH0RFeDbEqZ6m1kn1GeiQCcDTOQ0UySTp+gTFcA7Jih/UbFnTjOsDJ7tk1JEMlsw7O8Ss/g1Fvc/Vx1Oh4xVEvtS0hlmp6WbU2dpm8dMvVdcy1Or8IgSDoVoNQbqVDXcRkZEPHk9uV4p/VuoMU9LgSDVlrd9FCxV5rP4xkW1+THBX1FqCUYQfk8VfayTQTg1FyQfs0lWHMpXmBNpUhSGxXkLL6GQ9s4dNzKcJa8NWcwjTu78b168nL3TB2aw0iVARxkIUXOn+E51MFFfzdDJYgBhar7XJiF9AUQh2k4wwgTB4FmNGxlO9bGtGZMdB7NR19OmLoP3Tz1xZ9wJaJI2rBdbGGvpjUJHlVnoAx9Pn8jmzVtuqA/KFmgJZd2XfQAlez8qXZz/Nakl22uhzgCZDdl77H25PU4aZhHemR8Ut4sYNposzdkUuD1FxVlTfkqRcFMVpSdLWpQZeowAJx12wbv/ny8ohsEMuTs7dfhAqXw01HlEh22SXoWDzrTeeFPNn2ryqyloczBleGoXDHqeAAygeIHmnjM1eHujrcIER5dijk1cU7OH7FpRyBp3ACvvZSprRAQBy4ORiu5F8LBbPiTPVxWwFjNijL2hmW9yCia54tLdaSmx3zNlQXjel9Mo2D9QhrMs45p4srk2iFuItuUCdiA+oCsb+BHGP5WhypWYR/z6WDnY8fjBnRkIxrLNehve9qN2DLtAZGoO+UuVmqPooMBTv0RPw9cHdF52Yu52ZtHtFat7BTH2vrneUsIsEBzMc19/NBswNuugvJQDJ/OHUEx1f7uB6cdqFTUyTc+LOK8kW5lZMKyUzS4Bzc5g5XIi4e9c9/B1Bgvj16aT8oKyryK92uhjOrMNP5uSh/izGBB3WH1semoIih72Jy8ydLjOsk3qSylCqoYzQSSCTXRXeCZcengG4dgFTVwW32wA7WjbpUGZw1/YKENhiesda8ZP/nrFNmXDFqMHNm5Oe059jCxk+GpsKRRK3eN5nnXTez3uXx+XdJNrgVBR+gufhsAW/Z7N4b1SKpBnKa3t7O7KBUhrn60PnluMlmrAq6yppcJ6SSyBLVbJPTZLORUKmp2ha3FUSRBl/Ei8bx1FdJXFFcWP0k/c+vWmsBpiyvkD6azYWadVRcL6OdTb9iBl2BP4qktMGxQj7xtFbP8Jk3iAbf/CULABw6wUrl+JdCITSNPfypfsM+MWFKEb1M4a+PGtRcYBIcWe7hmzyjau9j1XCxZJuUNMgiwQYv/Zdl8pMzKEFOM9Rt9oUQvkOnOVLggqqK+rLhiSbwZfI7HRvoC3eLdta9NY7D+6PNrwGkfOnnjCeyWTRMHD61rA2tt/bHOJNr+HEgIMimYgsfMW6q244E/OLDDLPCYcBvHpDsJilHJCY9olAm6+j/8jAyYzHvDzUJSDpf6PImydtv7er2gvryRRBAeImlSLasgP+3ujnrfgG3DA9L6PoV2P9Ft6INQGhrHIwmxtCUnl7THLHCLcqFGW6mG4Sn5CJTp2BnqtM0xFecL/ytDHA+xYHnEs4ky|8|hiddenField|__VIEWSTATEGENERATOR|06ED1D24|0|hiddenField|__VIEWSTATEENCRYPTED||0|asyncPostBackControlIDs|||0|postBackControlIDs|||13|updatePanelIDs||tUpdatePanel1|0|childUpdatePanelIDs|||12|panelsToRefreshIDs||UpdatePanel1|2|asyncPos
tBackTimeout||90|11|formAction||locate.aspx|14|scriptBlock|ScriptContentNoTags|var player=[];|13|scriptStartupBlock|ScriptContentNoTags|initialize();|