我正在尝试从一个网站上抓取一些文本,即各公司的分支机构名称。代码本身运行正常,但我有 1000 多个公司名称需要查询:在成功获取近 100 个分支机构名称之后,网站 www.firmenwissen.de 就屏蔽了我的 IP 地址。下面的代码中只列出了部分公司,并非全部。这是我的代码:
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as BS
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
# Look up each company on firmenwissen.de through a Selenium-driven Chrome
# session and collect the text of the first element with the CSS class
# 'margin-bottom-05' from the page shown after submitting the search form.
# The collected texts end up in the 'Branch' column of `df`.

# Sample of the company names to look up (the full list is longer).
companies = ['olly UG (haftungsbeschränkt)','BLUECHILLED Verwaltungs GmbH', 'Albert Barufe GmbH','ITERGO Informationstechnologie GmbH', 'Rheinbahn AG', 'AWS Personalmarketing GmbH', 'POX Media Verwaltungs GmbH','HB-DIGITAL GmbH']
n = 0  # number of companies processed so far

## creating environment
pd.set_option('display.max_column', None)
ua = UserAgent()
# NOTE(review): `headers` is never handed to the Selenium driver below, so
# this random user agent does not affect the browser's requests.
headers = {'User-Agent': str(ua.chrome)}
url = 'https://www.firmenwissen.de/index.html'

# Candidate pause lengths in seconds; one is drawn at random after each search.
delays = [7, 8, 6, 5, 10, 12, 15, 9]

# The loop reads the names from df['name'], so the frame has to exist first;
# build it from the list above.
df = pd.DataFrame({'name': companies})
branchname = []

# NOTE(review): passing the driver path positionally is the Selenium 3 style;
# confirm it matches the installed Selenium version.
driver = webdriver.Chrome('chromedriver_win32/chromedriver')
try:
    for n, company in enumerate(df['name'], start=1):
        driver.get(url)

        # Type the company name into the search field and submit the form.
        search_box = driver.find_element(By.ID, "searchPhrase0")
        search_box.clear()
        search_box.send_keys(company)
        search_box.submit()

        try:
            branch_text = driver.find_element(By.CLASS_NAME, 'margin-bottom-05').text
        except NoSuchElementException:
            # Keep exactly one entry per company so that the column
            # assignment after the loop still lines up with `df`.
            branch_text = None
        branchname.append(branch_text)

        # Delay
        time.sleep(np.random.choice(delays))
        print(n)
finally:
    # Always close the browser, even when a lookup raised.
    driver.quit()

df['Branch'] = branchname