使用 Selenium 重复执行 Python 网页抓取

时间:2020-09-25 11:09:08

标签: python selenium google-chrome web-scraping

我尝试从此网站上抓取公司的联系数据:

https://de.statista.com/companydb/suche?idCountry=276&idBranch=0&revenueFrom=-1000000000000000000&revenueTo=1000000000000000000&employeesFrom=0&employeesTo=100000000&sortMethod=revenueDesc&p=4

我可以使用以下代码执行此操作:

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
import pandas as pd
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

# Scrape the contact block of the first two companies on page 1 of the
# Statista company search and print the collected text.

# Collected contact blocks, one string per matched element.
company_list = []

# Launch Chrome through the chromedriver binary at this local path.
driver = webdriver.Chrome('/Users/rieder/Anaconda3/chromedriver_win32/chromedriver.exe')

# Open page 1 of the company search results.
driver.get('https://de.statista.com/companydb/suche?idCountry=276&idBranch=0&revenueFrom=-1000000000000000000&revenueTo=1000000000000000000&employeesFrom=0&employeesTo=100000000&sortMethod=revenueDesc&p=1')

# Accept the cookie notice.
cookie_button = driver.find_element_by_id("cookiesNotificationConfirm")
cookie_button.click()

# Follow the name link of the first company (table row 2).
first_company_link = driver.find_element_by_xpath("//*[@id='content']/section[3]/div/div/form/div/div[2]/div[2]/table/tr[2]/td[1]/a")
first_company_link.click()

# Wait up to 20 s for the contact elements to become visible, then keep
# the text of each one.
first_contact_boxes = WebDriverWait(driver, 20).until(
    EC.visibility_of_all_elements_located(
        (By.XPATH, "/html/body/div[3]/div[4]/section[6]/div/div[2]/div[2]/div/div")
    )
)
company_list.extend(box.text for box in first_contact_boxes)

# Go back to the results page.
driver.back()

# Give the site's pop-up time to appear, deny it, then give it time to vanish.
time.sleep(5)
deny_button = driver.find_element_by_xpath("/html/body/div[15]/div[3]/div[3]/div[1]/button[1]")
deny_button.click()
time.sleep(5)

# Follow the name link of the next company (table row 3).
second_company_link = driver.find_element_by_xpath("//*[@id='content']/section[3]/div/div/form/div/div[2]/div[2]/table/tr[3]/td[1]/a")
second_company_link.click()

# Same wait-and-collect step for the second company's detail page.
second_contact_boxes = WebDriverWait(driver, 20).until(
    EC.visibility_of_all_elements_located(
        (By.XPATH, "/html/body/div[3]/div[4]/section[6]/div/div[2]/div[2]/div/div")
    )
)
company_list.extend(box.text for box in second_contact_boxes)

# Show everything that was collected.
print(company_list)

我的输出如下:

['GUTex GmbH\nGerhard-Unland-Str. 1\n26683\nSaterland\nDeutschland', 'Robert Bosch GmbH\nRobert-Bosch-Platz 1\n70839\nGerlingen\nDeutschland']

问题:

我想让我的代码对第 1 页的整个列表执行此操作,然后翻到下一页继续执行,直到列表中有 100 个地址为止。我打算使用 while 循环来实现,但是我用于查找地址的 XPath 写得太具体了(行号是写死的),因此循环只会反复访问同一家公司。

非常感谢

2 个答案:

答案 0 :(得分:1)

请尝试用下面的代码提取一页的数据;如需遍历后续页面的记录,请在此基础上扩展代码。

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

# Scrape one results page of the Statista company search: open every company
# linked from the first table column, read its contact box, and collect
# "<company name> : <contact text>" strings in company_list.

# Locators shared by the waits and lookups below.
COMPANY_CELL_XPATH = '//table[@class="zebraTable zebraTable--companies"]//td[1]'
COMPANY_LINK_XPATH = COMPANY_CELL_XPATH + '/a'
CONTACT_BOX_XPATH = '//*[@id="contactInformation"]//div[@class="companyContactBox"]'

company_list = []  # one entry per visited company

driver = webdriver.Chrome()  # define driver

try:
    driver.get('https://de.statista.com/companydb/suche?idCountry=276&idBranch=0&revenueFrom=-1000000000000000000&revenueTo=1000000000000000000&employeesFrom=0&employeesTo=100000000&sortMethod=revenueDesc&p=1')  # open website

    # Accept cookies only if the banner is present; find_elements returns an
    # empty list instead of raising when nothing matches.
    cookie_buttons = driver.find_elements(By.ID, "cookiesNotificationConfirm")
    if cookie_buttons:
        cookie_buttons[0].click()

    wait = WebDriverWait(driver, 20)
    wait.until(EC.element_to_be_clickable((By.XPATH, COMPANY_CELL_XPATH)))

    # Count the links that are actually indexed below (not the bare cells),
    # so a cell without a link cannot cause an IndexError.
    link_count = len(driver.find_elements(By.XPATH, COMPANY_LINK_XPATH))

    for index in range(link_count):
        wait.until(EC.element_to_be_clickable((By.XPATH, COMPANY_CELL_XPATH)))
        # Look the links up again on every pass: the page is reloaded by
        # driver.back(), so references from the previous pass are not reused.
        links = driver.find_elements(By.XPATH, COMPANY_LINK_XPATH)
        company_name = links[index].text
        links[index].click()  # open the company's detail page

        # Wait for the contact box, then read its text via JavaScript.
        wait.until(EC.element_to_be_clickable((By.XPATH, CONTACT_BOX_XPATH)))
        contact_data = driver.execute_script("return document.getElementsByClassName('companyContactBox')[0].innerText")
        company_list.append(company_name + " : " + contact_data)

        driver.back()  # return to the results page
finally:
    # Always close the browser and the chromedriver process, even on errors.
    driver.quit()

print(company_list)

答案 1 :(得分:0)

感谢上面 Dilip Meghwal 的评论,我得以完成我的代码:

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Scrape company contact blocks from the Statista company search, page after
# page, until MAX_COMPANIES entries are collected; then write them to Excel.

MAX_COMPANIES = 1000  # stop once this many contact blocks have been collected

# Locators shared by the waits and lookups below.
COMPANY_CELL_XPATH = '//table[@class="zebraTable zebraTable--companies"]//td[1]'
COMPANY_LINK_XPATH = COMPANY_CELL_XPATH + '/a'
CONTACT_BOX_XPATH = '//*[@id="contactInformation"]//div[@class="companyContactBox"]'
NEXT_PAGE_XPATH = "//*[@id='content']/section[3]/div/div/form/div/div[2]/div[2]/div[2]/div/button[2]"

company_list = []  # one contact-block string per visited company

# Chrome preference for notification prompts.
# NOTE(review): value 2 presumably means "block" — confirm against Chrome prefs.
chrome_options = webdriver.ChromeOptions()
prefs = {"profile.default_content_setting_values.notifications": 2}
chrome_options.add_experimental_option("prefs", prefs)

# `options=` is the non-deprecated spelling of the old `chrome_options=` keyword.
driver = webdriver.Chrome('/Users/rieder/Anaconda3/chromedriver_win32/chromedriver.exe', options=chrome_options)  # define driver

try:
    driver.get('https://de.statista.com/companydb/suche?idCountry=276&idBranch=0&revenueFrom=-1000000000000000000&revenueTo=1000000000000000000&employeesFrom=0&employeesTo=100000000&sortMethod=revenueDesc&p=1')  # open website

    # Accept cookies only if the banner is present; find_elements returns an
    # empty list instead of raising when nothing matches.
    cookie_buttons = driver.find_elements(By.ID, "cookiesNotificationConfirm")
    if cookie_buttons:
        cookie_buttons[0].click()

    wait = WebDriverWait(driver, 20)

    # The scraping loop is deliberately NOT nested under the cookie check:
    # it must run whether or not the banner was shown.
    while len(company_list) < MAX_COMPANIES:
        wait.until(EC.element_to_be_clickable((By.XPATH, COMPANY_CELL_XPATH)))

        # Count the links that are actually indexed below (not the bare
        # cells), so a cell without a link cannot cause an IndexError.
        link_count = len(driver.find_elements(By.XPATH, COMPANY_LINK_XPATH))
        if link_count == 0:
            break  # nothing to visit on this page; avoid paging forever

        for index in range(link_count):
            wait.until(EC.element_to_be_clickable((By.XPATH, COMPANY_CELL_XPATH)))
            # Look the links up again on every pass: the page is reloaded by
            # driver.back(), so references from the previous pass are not reused.
            links = driver.find_elements(By.XPATH, COMPANY_LINK_XPATH)
            links[index].click()  # open the company's detail page

            # Wait for the contact box, then read its text via JavaScript.
            wait.until(EC.element_to_be_clickable((By.XPATH, CONTACT_BOX_XPATH)))
            contact_data = driver.execute_script("return document.getElementsByClassName('companyContactBox')[0].innerText")
            company_list.append(contact_data)

            driver.back()  # return to the results page

            if len(company_list) >= MAX_COMPANIES:
                break

        if len(company_list) >= MAX_COMPANIES:
            break  # target reached: do not request another results page

        time.sleep(5)

        # Advance to the next results page.
        driver.find_element(By.XPATH, NEXT_PAGE_XPATH).click()
finally:
    # Always close the browser and the chromedriver process, even on errors.
    driver.quit()

# Flatten each multi-line contact block into a single comma-separated line.
company_list = [w.replace('\n', ', ') for w in company_list]

print(company_list)

df_company_name = pd.DataFrame(company_list, columns=['Name'])

df_company_name.to_excel("company_name.xlsx")