以下是我用来抓取 BSE 网站的代码。除了一个小问题之外,一切正常:内层(第二个)for 循环没有继续迭代,程序就直接结束了。任何帮助我都非常感谢。
# Question snippet: drives Chrome through Selenium, picks each dropdown option,
# submits the form, clicks the report links and collects table cell texts.
# NOTE(review): indentation was lost in this listing, so the loop nesting has to
# be inferred by the reader; the statements below are reproduced unchanged.
# Launch Chrome and open the BSE page.
browser=webdriver.Chrome()
browser.get('http://www.bseindia.com/markets/keystatics/Keystat_index.aspx')
# Hard-coded option positions 1..47 of the dropdown with id ..._ddltype.
for i in range(1,48):
# Select the i-th <option>, then press the submit button.
browser.find_element_by_xpath("//*[@id='ctl00_ContentPlaceHolder1_ddltype']/option["+str(i)+"]").click()
browser.find_element_by_xpath('//*[@id="ctl00_ContentPlaceHolder1_btnSubmit"]').click()
# Accumulates one list of cell texts per non-empty th/td group.
data = []
# Link ids are built from j zero-padded to two digits (ctl02 .. ctl20).
for j in range(2,21):
browser.find_element_by_xpath("//*[@id='ctl00_ContentPlaceHolder1_gvReport_ctl"+str(j).zfill(2)+"_Linkbtn']").click()
# NOTE(review): this XPath selects the element carrying the id itself
# (reportedly the <table>, not its <tr> rows), so this loop presumably sees a
# single element whose th/td lookups span the whole table - confirm on the page.
for tr in browser.find_elements_by_xpath('//*[@id="ctl00_ContentPlaceHolder1_gvYearwise"]'):
ths = tr.find_elements_by_tag_name('th')
tds = tr.find_elements_by_tag_name('td')
# Header cells and data cells are appended as separate lists, skipping empty ones.
if ths:
data.append([th.text for th in ths])
if tds:
data.append([td.text for td in tds])
# NOTE(review): `f` is never defined in this snippet; an output file must be
# opened elsewhere for this line to work.
f.write(str(data) + "\n")
答案 0 :(得分:3)
点击操作经常会导致服务器返回 HTTP 500 错误,所以我把点击放进了 try/except 块中,失败时递归地重试。
以下是整个代码:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import time
# Page that every (re)try starts from.
base_url="http://www.bseindia.com/markets/keystatics/Keystat_index.aspx"
# Alternative kept for reference: pass an explicit chromedriver path instead of
# relying on the driver being discoverable on PATH.
#browser = webdriver.Chrome('/Users/qriyoinfolabs/ahlat/chromedriver')
# Shared Selenium session used by the helper functions and the main loop below.
browser=webdriver.Chrome()
browser.get(base_url)
# Module-level accumulator: one list of cell texts per non-empty th/td group.
data = []
def fetch_this_erroful_page_for_me(id):
    """Select dropdown option number ``id`` and submit the form, retrying on failure.

    Uses the module-level ``browser`` session and ``base_url``. Whenever
    selecting the option or pressing the submit button raises, the function
    waits two seconds, reloads ``base_url`` and tries again; it only returns
    once both clicks have succeeded, so a permanently broken page makes it
    loop forever.

    Args:
        id: 1-based position of the ``<option>`` inside the dropdown with id
            ``ctl00_ContentPlaceHolder1_ddltype``. (The name shadows the
            builtin ``id`` but is kept so existing callers keep working.)
    """
    option_xpath = "//*[@id='ctl00_ContentPlaceHolder1_ddltype']/option[" + str(id) + "]"
    submit_xpath = '//*[@id="ctl00_ContentPlaceHolder1_btnSubmit"]'
    # Iterative retry instead of recursion: a long run of failures can no
    # longer exhaust the interpreter's recursion limit.
    while True:
        try:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("Trying " + str(id) + "...")
            browser.find_element_by_xpath(option_xpath).click()
            browser.find_element_by_xpath(submit_xpath).click()
            return
        except Exception:
            # `except Exception` rather than a bare `except:` so that
            # KeyboardInterrupt / SystemExit still stop the script.
            print("Retrying " + str(id) + "...")
            time.sleep(2)
            browser.get(base_url)
def click_on_this_link_for_me(year_id, option_id):
    """Click the report link whose id embeds ``year_id`` and report whether it existed.

    Uses the module-level ``browser`` session. The link id is
    ``ctl00_ContentPlaceHolder1_gvReport_ctlNN_Linkbtn`` where ``NN`` is
    ``year_id`` zero-padded to two digits.

    Args:
        year_id: Row number used to build the link id.
        option_id: Dropdown option currently selected. Unused here; kept so
            the signature stays compatible with existing callers.

    Returns:
        1 if the link was found and clicked, 0 if no such element exists.
        Any other exception raised by the lookup or the click propagates.
    """
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("Trying year" + str(year_id) + "...")
    zfilled_id = str(year_id).zfill(2)
    link_xpath = "//*[@id='ctl00_ContentPlaceHolder1_gvReport_ctl" + zfilled_id + "_Linkbtn']"
    try:
        browser.find_element_by_xpath(link_xpath).click()
    except NoSuchElementException:
        return 0
    # The previous version had a sleep/re-fetch/retry sequence in an `else:`
    # clause after a `try` body that always returned, so it could never run;
    # that dead code has been removed without changing behavior.
    return 1
# Walk every dropdown option (positions 1..47) and, for each, every report
# link (ctl02..ctl20) until one is missing.
for option_index in range(1, 48):
    fetch_this_erroful_page_for_me(option_index)
    for year_index in range(2, 21):
        link_found = click_on_this_link_for_me(year_index, option_index)
        if not link_found:
            # No link with this id: stop scanning links for this option.
            print("valid0")
            break
        matches = browser.find_elements_by_xpath('//*[@id="ctl00_ContentPlaceHolder1_gvYearwise"]')
        for element in matches:
            header_cells = element.find_elements_by_tag_name('th')
            body_cells = element.find_elements_by_tag_name('td')
            # Headers first, then data cells; empty groups are skipped.
            for cells in (header_cells, body_cells):
                if cells:
                    data.append([cell.text for cell in cells])

# Dump everything collected as the repr of the nested list.
with open('str.txt', 'w') as out:
    out.write(str(data))
答案 1 :(得分:0)
'//*[@id="ctl00_ContentPlaceHolder1_gvYearwise"]' 匹配到的不是 tr 标签,而是 table 元素,所以 browser.find_elements_by_xpath(..) 只会返回一个元素。
试试 '//*[@id="ctl00_ContentPlaceHolder1_gvYearwise"]//tr'。
顺便说一句,像 for i in range(1,48) 这样硬编码索引范围的写法并不好。建议改为直接遍历元素本身,或者写一个逐个产出元素的生成器,得到一个可迭代对象。
例如(下面的代码可能并不完善,因为遇到了 HTTP ERROR 500 问题,我没能充分测试):
def get_next_row(driver, xpath):
    """Yield the elements matching ``xpath`` one at a time, re-querying each step.

    Before every yield the full match list is fetched again with
    ``driver.find_elements_by_xpath(xpath)`` and the element at the current
    position is produced, so each yielded element comes from a fresh lookup.
    Iteration ends as soon as the current position is past the end of the
    freshly fetched list.
    """
    position = 0
    while True:
        matches = driver.find_elements_by_xpath(xpath)
        if position >= len(matches):
            return
        yield matches[position]
        position += 1
# Answer snippet: same scrape as the question, but iterating over elements
# produced by get_next_row() instead of hard-coded index ranges.
# NOTE(review): indentation was lost in this listing, so the loop nesting has to
# be inferred by the reader; the statements below are reproduced unchanged.
browser=webdriver.Chrome()
# Implicit wait of 0.5 applied to element lookups on this session.
browser.implicitly_wait(0.5)
browser.get('http://www.bseindia.com/markets/keystatics/Keystat_index.aspx')
# Every <option> of the dropdown, whatever their number.
for list_item in get_next_row(browser, "//*[@id='ctl00_ContentPlaceHolder1_ddltype']/option"):
list_item.click()
browser.find_element_by_xpath('//*[@id="ctl00_ContentPlaceHolder1_btnSubmit"]').click()
# Accumulates one list of cell texts per non-empty th/td group.
data = []
# Every <a> whose id contains the gvReport link prefix.
for next_button in get_next_row(browser, '//a[contains(@id, "ctl00_ContentPlaceHolder1_gvReport_ct")]'):
next_button.click()
# The trailing //tr makes this iterate over the rows inside the gvYearwise element.
for tr in get_next_row(browser, '//*[@id="ctl00_ContentPlaceHolder1_gvYearwise"]//tr'):
ths = tr.find_elements_by_tag_name('th')
tds = tr.find_elements_by_tag_name('td')
# Header cells and data cells are appended as separate lists, skipping empty ones.
if ths:
data.append([th.text for th in ths])
if tds:
data.append([td.text for td in tds])
# NOTE(review): `f` is never defined in this snippet; an output file must be
# opened elsewhere for this line to work.
f.write(str(data) + "\n")