我尝试从此网站上抓取公司的联系数据:
我可以使用以下代码执行此操作:
import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Scrape the contact block of the first two companies in the search result
# table and print the collected texts.
#
# NOTE: the two company links are addressed by hard-coded table rows
# (tr[2] and tr[3]), so this version handles exactly two companies.

# Absolute XPath of the contact-data elements on a company detail page.
CONTACT_XPATH = "/html/body/div[3]/div[4]/section[6]/div/div[2]/div[2]/div/div"

company_list = []  # one entry per contact element text

driver = webdriver.Chrome('/Users/rieder/Anaconda3/chromedriver_win32/chromedriver.exe')  # define driver
driver.get('https://de.statista.com/companydb/suche?idCountry=276&idBranch=0&revenueFrom=-1000000000000000000&revenueTo=1000000000000000000&employeesFrom=0&employeesTo=100000000&sortMethod=revenueDesc&p=1')  # open website
driver.find_element(By.ID, "cookiesNotificationConfirm").click()  # accept cookies

# First company: open its detail page and collect the contact data.
driver.find_element(By.XPATH, "//*[@id='content']/section[3]/div/div/form/div/div[2]/div[2]/table/tr[2]/td[1]/a").click()
contact_data = WebDriverWait(driver, 20).until(
    EC.visibility_of_all_elements_located((By.XPATH, CONTACT_XPATH)))
for cn in contact_data:
    company_list.append(cn.text)  # store the text in the list

driver.back()  # navigate to the previous page
time.sleep(5)  # wait for the pop-up window to appear
driver.find_element(By.XPATH, "/html/body/div[15]/div[3]/div[3]/div[1]/button[1]").click()  # deny the website's popup
time.sleep(5)  # wait for the popup to vanish

# Second company: same steps, next table row.
driver.find_element(By.XPATH, "//*[@id='content']/section[3]/div/div/form/div/div[2]/div[2]/table/tr[3]/td[1]/a").click()
contact_data2 = WebDriverWait(driver, 20).until(
    EC.visibility_of_all_elements_located((By.XPATH, CONTACT_XPATH)))
for cn in contact_data2:
    company_list.append(cn.text)  # store the text in the list

print(company_list)  # show the list
我的输出如下:
['GUTex GmbH\nGerhard-Unland-Str. 1\n26683\nSaterland\nDeutschland', 'Robert Bosch GmbH\nRobert-Bosch-Platz 1\n70839\nGerlingen\nDeutschland']
问题:
我希望我的代码对第 1 页的整个列表执行此操作,然后转到下一页继续执行同样的操作,直到列表中有 100 个地址为止。我打算用 while 循环来实现,但我用来查找地址的 XPath 写得过于具体,因此循环只会一直处理同一家公司。
非常感谢
答案 0 :(得分:1)
尝试下面的代码提取一页数据。更新代码以遍历下一页记录。
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

# Visit every company listed on the first result page, read its contact box,
# and print "name : contact data" for each of them.

SEARCH_URL = 'https://de.statista.com/companydb/suche?idCountry=276&idBranch=0&revenueFrom=-1000000000000000000&revenueTo=1000000000000000000&employeesFrom=0&employeesTo=100000000&sortMethod=revenueDesc&p=1'
# Name link in the first column of each row of the company table.
COMPANY_LINK_XPATH = '//table[@class="zebraTable zebraTable--companies"]//td[1]/a'
CONTACT_BOX_XPATH = '//*[@id="contactInformation"]//div[@class="companyContactBox"]'

company_list = []  # create empty list
driver = webdriver.Chrome()  # define driver
driver.get(SEARCH_URL)  # open website

# The cookie banner is not always present; click it only when it was found.
cookie_buttons = driver.find_elements(By.ID, "cookiesNotificationConfirm")
if cookie_buttons:
    cookie_buttons[0].click()  # accept cookies

wait = WebDriverWait(driver, 20)
wait.until(EC.element_to_be_clickable((By.XPATH, COMPANY_LINK_XPATH)))
# Count the links themselves so the index used below can never exceed the
# list that is indexed.
link_count = len(driver.find_elements(By.XPATH, COMPANY_LINK_XPATH))

# Click each company name, capture the address on the detail page, then go
# back to the result list and repeat with the next row.
for i in range(link_count):
    wait.until(EC.element_to_be_clickable((By.XPATH, COMPANY_LINK_XPATH)))
    # Re-locate the links on every pass: the page is loaded again after
    # driver.back(), so element references from the previous pass are not reused.
    links = driver.find_elements(By.XPATH, COMPANY_LINK_XPATH)
    company_name = links[i].text
    links[i].click()  # open the i-th company's detail page
    wait.until(EC.element_to_be_clickable((By.XPATH, CONTACT_BOX_XPATH)))
    contact_data = driver.execute_script("return document.getElementsByClassName('companyContactBox')[0].innerText")
    company_list.append(company_name + " : " + contact_data)
    driver.back()  # navigate to the previous page

print(company_list)
答案 1 :(得分:0)
感谢上面 Dilip Meghwal 的评论,我得以完成我的代码:
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Walk through the paginated company list, collect the contact box text of
# every company until TARGET_COUNT entries are gathered (or no further page
# can be reached), then print the result and export it to an Excel file.

TARGET_COUNT = 1000  # stop once this many contact entries were collected
SEARCH_URL = 'https://de.statista.com/companydb/suche?idCountry=276&idBranch=0&revenueFrom=-1000000000000000000&revenueTo=1000000000000000000&employeesFrom=0&employeesTo=100000000&sortMethod=revenueDesc&p=1'
# Name link in the first column of each row of the company table.
COMPANY_LINK_XPATH = '//table[@class="zebraTable zebraTable--companies"]//td[1]/a'
CONTACT_BOX_XPATH = '//*[@id="contactInformation"]//div[@class="companyContactBox"]'
# Button that is clicked to move on to the next result page.
NEXT_PAGE_XPATH = "//*[@id='content']/section[3]/div/div/form/div/div[2]/div[2]/div[2]/div/button[2]"

company_list = []  # create empty list

chrome_options = webdriver.ChromeOptions()
# NOTE(review): value 2 for the notifications content setting presumably means
# "block", so the browser does not show the site's notification prompt - confirm.
prefs = {"profile.default_content_setting_values.notifications": 2}
chrome_options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome('/Users/rieder/Anaconda3/chromedriver_win32/chromedriver.exe', chrome_options=chrome_options)  # define driver
driver.get(SEARCH_URL)  # open website

# The cookie banner is not always present; click it only when it was found.
cookie_buttons = driver.find_elements(By.ID, "cookiesNotificationConfirm")
if cookie_buttons:
    cookie_buttons[0].click()  # accept cookies

wait = WebDriverWait(driver, 20)

while len(company_list) < TARGET_COUNT:
    wait.until(EC.element_to_be_clickable((By.XPATH, COMPANY_LINK_XPATH)))
    link_count = len(driver.find_elements(By.XPATH, COMPANY_LINK_XPATH))

    # Click each company name, capture the address on the detail page, then
    # go back to the result list and repeat with the next row.
    for i in range(link_count):
        wait.until(EC.element_to_be_clickable((By.XPATH, COMPANY_LINK_XPATH)))
        # Re-locate the links on every pass: the page is loaded again after
        # driver.back(), so element references from the previous pass are not reused.
        links = driver.find_elements(By.XPATH, COMPANY_LINK_XPATH)
        links[i].click()  # open the i-th company's detail page
        wait.until(EC.element_to_be_clickable((By.XPATH, CONTACT_BOX_XPATH)))
        contact_data = driver.execute_script("return document.getElementsByClassName('companyContactBox')[0].innerText")
        company_list.append(contact_data)
        driver.back()  # navigate to the previous page
        if len(company_list) >= TARGET_COUNT:
            break

    if len(company_list) >= TARGET_COUNT:
        break  # enough data; no need to load another page

    # Move on to the next result page. If the button never becomes clickable
    # (for example on the last page), stop and keep what was collected so the
    # export below still runs.
    try:
        wait.until(EC.element_to_be_clickable((By.XPATH, NEXT_PAGE_XPATH))).click()
    except TimeoutException:
        break

# Flatten each multi-line contact block into a single comma-separated line.
company_list = [w.replace('\n', ', ') for w in company_list]
print(company_list)

df_company_name = pd.DataFrame(company_list, columns=['Name'])
df_company_name.to_excel("company_name.xlsx")