I am trying to get data from every page of the link shown in the code below, but it doesn't work.
# -*- coding: utf-8 -*-
from time import sleep
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import pandas as pd

driver = webdriver.Chrome(r'D:\chromedriver_win32\chromedriver.exe')
driver.get('http://buyersguide.recyclingtoday.com/search')
results = list()

# scrape data here
rows = driver.find_elements_by_xpath("//td[@style='font-weight:bold;']//parent::tr")

# get more pages
while True:
    try:
        driver.find_element_by_partial_link_text('Next').click()
        sleep(15)
        # scrape data here
        rows = driver.find_elements_by_xpath("//td[@style='font-weight:bold;']//parent::tr")
        for i in range(0, len(rows)):
            print(rows[i])
            results.append(rows[i])
        print('---')
    except NoSuchElementException:
        break

# get all the wanted data
records = []
for result in results:
    company = result.find_element_by_xpath('./td[1]').text
    address = result.find_element_by_xpath('./td[2]').text
    contact = result.find_element_by_xpath('./td[3]//a').text
    number = result.find_element_by_xpath('./td[5]').text
    records.append((company, address, contact, number))

# output as a table
df = pd.DataFrame(records, columns=['company', 'address', 'contact', 'number'])
Answer 0 (score: 1)

This doesn't answer your question, since you haven't explained what problem you're facing. However, I took a shot at it. It is not unit-tested; if you plan to build on this implementation, use WebDriverWait instead of time.sleep (a sketch follows the code below) and add unit tests.

I did get data back, but I have not verified that all of it made it into the DataFrame.
import time
from selenium.webdriver import Chrome
from selenium.common.exceptions import NoSuchElementException
import pandas as pd

driver = Chrome(executable_path=<path>)
driver.get('http://buyersguide.recyclingtoday.com/search')
results = []

while True:
    time.sleep(2)
    results_table = driver.find_element_by_css_selector('#Body_tbl>tbody')
    rows = results_table.find_elements_by_tag_name('tr')
    del rows[:2]   # delete header
    del rows[-2:]  # delete footer
    for row in rows:
        data = row.find_elements_by_tag_name('td')
        results.append([data[0].text, data[1].text, data[2].text, data[4].text])
    time.sleep(2)
    try:
        next_ = driver.find_element_by_link_text('Next >')
        next_.click()
    except NoSuchElementException:
        break

df = pd.DataFrame(results, columns=['Address', 'Company', 'Contact', 'Phone Number'])
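
As a minimal sketch of the WebDriverWait suggestion above (assuming the same '#Body_tbl>tbody' locator; not tested against the live page), the fixed time.sleep(2) calls could be replaced with an explicit wait:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# wait up to 10 seconds for the results table instead of sleeping a fixed 2 seconds
wait = WebDriverWait(driver, 10)
results_table = wait.until(
    EC.presence_of_element_located((By.CSS_SELECTOR, '#Body_tbl>tbody')))

This returns as soon as the element appears, so the loop spends no longer per page than it has to.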
Answer 1 (score: 0)

Try this (the indentation may have been mangled):
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

driver.find_element_by_partial_link_text('Next').click()
# sleep(15)  # you can use an explicit wait instead, as below
WebDriverWait(driver, 10).until(EC.presence_of_element_located(
    (By.XPATH, "//td[@style='font-weight:bold;']//parent::tr")))
rows = driver.find_elements_by_xpath("//td[@style='font-weight:bold;']//parent::tr")
for i in rows:
    text = i.text
    print(text)
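
If the 'Next' link itself renders slowly, the same explicit-wait pattern can guard the click as well; a sketch, assuming the link text still contains 'Next':

# wait until the pager link is actually clickable before following it
next_link = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.PARTIAL_LINK_TEXT, 'Next')))
next_link.click()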