I'm trying to scrape the contents of this site: http://media.ethics.ga.gov/search/Lobbyist/Lobbyist_results.aspx?&Year=2016&LastName=A&FirstName=&City=&FilerID= with LastName set to each letter A-Z, to pull lobbyist information. It's an open State of Georgia site.
I've used a combination of mechanize and Selenium (one or the other, really) to get the basic information I need, sorting through each letter in a basic for loop (code below). Where I'm running into trouble is getting Selenium or mechanize to click the "View Lobbyist" link associated with each lobbyist.
With Selenium, it clicks the first link and then fails with a "selenium.common.exceptions.NoSuchElementException: Message: no such element: Unable to locate element: {"method":"id","selector":"ctl00_ContentPlaceHolder1_Results_ctl03_lnkView"}" message.
With mechanize, since each "View Lobbyist" link is an href rather than a form, any br.submit() fails.
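For context, each "View Lobbyist" anchor appears to be a standard ASP.NET postback link rather than a plain URL. Assuming the usual WebForms markup, its href would look something like javascript:__doPostBack('ctl00$ContentPlaceHolder1$Results$ctl03$lnkView','') (the same control that shows up in the Selenium error above, with underscores in the element id and dollar signs in the postback target), which is what the [25:-5] slice in my code is trying to strip down to:

# Hypothetical href value - the exact control name is my assumption
newstr = "javascript:__doPostBack('ctl00$ContentPlaceHolder1$Results$ctl03$lnkView','')"
newctl = newstr[25:-5]  # -> "ctl00$ContentPlaceHolder1$Results$ctl03$lnkView"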
Here's an abbreviated version of the Selenium code:
def __init__(self):
    self.url = "http://media.ethics.ga.gov/search/Lobbyist/Lobbyist_ByName.aspx"
    self.br = mechanize.Browser()
    self.br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]

def scrape_lobbyists(self, letter):
    urlstr = "http://media.ethics.ga.gov/search/Lobbyist/Lobbyist_results.aspx?Year=2016&LastName=" + letter + "&FirstName=&City=&FilerID="
    driver.get(urlstr)
    soup = BS(driver.page_source)
    table = soup.find("table", {"id": "ctl00_ContentPlaceHolder1_Results"})  # Need to add error check here...
    if table is None:  # No lobbyist with last name starting with 'X' :-)
        return
    records = table.find_all('tr')  # List of all results for this letter
    for row in records:
        rec_print = ""
        span = row.find_all('span', 'lblentry', 'value')
        for sname in span:
            stext = sname.get_text()
            if ',' in stext:
                continue
            rec_print = rec_print + stext + ","  # Create comma-delimited output
        print(rec_print[:-1])  # Strip final comma
        lnks = row.find_all('a', 'lblentrylink')
        for lnk in lnks:
            if lnk is None:  # For some reason, first record is blank.
                continue
            newlnk = lnk['id']  # id of the "View Lobbyist" link
            newstr = lnk['href']
            newctl = newstr[25:-5]  # Matching postback target (strip the javascript wrapper)
            print("Lnk: ", lnk)
            print("NewLnk: ", newlnk)  # Just look at various elements
            print("LnkStr: ", newstr)
            print("LnkCtl: ", newctl)
            driver.find_element_by_id(newlnk).click()  # newlnk seems to be the right one...
And here's the mechanize code:
br.open("http://media.ethics.ga.gov/search/Lobbyist/Lobbyist_results.aspx?&Year=2016&LastName=" + letter + "&FirstName=&City=&FilerID=")
soup = BS(br.response().read())
table = soup.find("table", {"id": "ctl00_ContentPlaceHolder1_Results"})  # Need to add error check here...
if table is None:  # No lobbyist with last name starting with 'X' :-)
    continue
records = table.find_all('tr')  # List of all results for this letter
for form in br.forms():
    print("Form name:", form.name)
    print(form)
for row in records:
    rec_print = ""
    span = row.find_all('span', 'lblentry', 'value')
    for sname in span:
        if ',' in sname.get_text():  # They actually have a field named 'comma'!!
            continue
        rec_print = rec_print + sname.get_text() + ","  # Create comma-delimited output
    print(rec_print[:-1])  # Strip final comma
    lnk = row.find('a', 'lblentrylink')
    if lnk is None:  # For some reason, first record is blank.
        continue
    print("Lnk: ", lnk)
    newlnk = lnk['id']
    print("NEWLNK: ", newlnk)
    newstr = lnk['href']
    newctl = newstr[25:-5]  # Matching postback target (strip the javascript wrapper)
    br.select_form('aspnetForm')  # Tried (nr=0) also...
    print("NEWCTL: ", newctl)
    br["__EVENTTARGET"] = newctl
    response = br.submit(name=newlnk).read()
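For reference, the pattern I've been trying to imitate for faking an ASP.NET postback in mechanize is to make the hidden fields writable, set __EVENTTARGET, and submit the whole form. A sketch only - the hidden-field handling is my assumption from reading about WebForms, not something I've gotten working against this site:

br.select_form("aspnetForm")
br.form.set_all_readonly(False)  # __EVENTTARGET and friends are readonly by default
br["__EVENTTARGET"] = newctl     # e.g. "ctl00$ContentPlaceHolder1$Results$ctl03$lnkView"
br["__EVENTARGUMENT"] = ""
response = br.submit().read()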
Anyway, I'm pretty stumped here, so any guidance is appreciated!
Answer 0 (score: 0)
The problem is that once you click a "View Lobbyist" link, you are redirected to a different URL in the same browser window. To click the second "View Lobbyist" link, you need to go back to the lobbyist list first - and since going back invalidates previously located elements, the result rows have to be re-located on every pass of the loop.
Here is the implementation - it collects the lobbyist name, follows the profile link to get the filer id, then goes back and repeats:
from pprint import pprint
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Firefox()
driver.get("http://media.ethics.ga.gov/search/Lobbyist/Lobbyist_results.aspx?&Year=2016&LastName=A&FirstName=&City=&FilerID=")

wait = WebDriverWait(driver, 15)
results = []

# iterate over the results skipping the header row
for index in range(1, len(driver.find_elements_by_css_selector("table#ctl00_ContentPlaceHolder1_Results tr"))):
    # get the current row (re-locate it every pass)
    rows = driver.find_elements_by_css_selector("table#ctl00_ContentPlaceHolder1_Results tr")
    lobbyist = rows[index]

    # extract some data and follow the link
    name = lobbyist.find_element_by_css_selector("[id$=lblFName]").text
    profile_link = lobbyist.find_element_by_css_selector("[id$=lnkView]")
    profile_link.click()

    # wait for the page to load
    filer_id = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "[id$=lblFilerID]"))).text

    # go back
    driver.back()
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "table#ctl00_ContentPlaceHolder1_Results tr")))

    results.append({"name": name,
                    "filer_id": filer_id})

driver.close()
pprint(results)
Prints:
[{'filer_id': 'L20160009', 'name': 'ASHLEY'},
{'filer_id': 'L20120018', 'name': 'ANNA'},
{'filer_id': 'L20050601', 'name': 'BILLY'},
{'filer_id': 'L20090142', 'name': 'CHANDON'},
{'filer_id': 'L20130009', 'name': 'CHARLES'},
{'filer_id': 'L20140179', 'name': 'MARY PAIGE'},
{'filer_id': 'L20050237', 'name': 'NORMER'},
{'filer_id': 'L20060195', 'name': 'PAMELA'},
{'filer_id': 'L20090281', 'name': 'SHAUN'},
{'filer_id': 'L20150090', 'name': 'TYLER'},
{'filer_id': 'L20160162', 'name': 'SARKIS'},
{'filer_id': 'L20150045', 'name': 'SAMUEL'},
{'filer_id': 'L20160098', 'name': 'JOSHUA'},
{'filer_id': 'L20130110', 'name': 'TIMOTHY'},
{'filer_id': 'L20060300', 'name': 'JENNIFER'},
{'filer_id': 'L20080329', 'name': 'BRAD'},
{'filer_id': 'L20130177', 'name': 'ELIZABETH'},
{'filer_id': 'L20120102', 'name': 'C.'},
{'filer_id': 'L20050996', 'name': 'STEVE'},
{'filer_id': 'L20110128', 'name': 'TRACY'},
{'filer_id': 'L20100284', 'name': 'JASON'},
{'filer_id': 'L20150052', 'name': 'MOLLY'},
{'filer_id': 'L20050253', 'name': 'ELIZABETH'},
{'filer_id': 'L20150016', 'name': 'BLAKE'}]
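If you need all of the letters rather than just "A", the same logic can be wrapped in a loop over string.ascii_uppercase. A rough sketch - only the outer loop and the "letter" key are new relative to the code above, and letters with no results simply produce an empty row list:

import string
from pprint import pprint
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Firefox()
wait = WebDriverWait(driver, 15)
results = []

for letter in string.ascii_uppercase:
    driver.get("http://media.ethics.ga.gov/search/Lobbyist/Lobbyist_results.aspx"
               "?&Year=2016&LastName=" + letter + "&FirstName=&City=&FilerID=")

    # no results table for this letter -> zero rows -> inner loop is skipped
    row_count = len(driver.find_elements_by_css_selector("table#ctl00_ContentPlaceHolder1_Results tr"))

    for index in range(1, row_count):
        # re-locate the rows on every pass - old references go stale after driver.back()
        rows = driver.find_elements_by_css_selector("table#ctl00_ContentPlaceHolder1_Results tr")
        lobbyist = rows[index]

        name = lobbyist.find_element_by_css_selector("[id$=lblFName]").text
        lobbyist.find_element_by_css_selector("[id$=lnkView]").click()

        filer_id = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "[id$=lblFilerID]"))).text

        driver.back()
        wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "table#ctl00_ContentPlaceHolder1_Results tr")))

        results.append({"letter": letter, "name": name, "filer_id": filer_id})

driver.close()
pprint(results)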