I'm working on my first web scraper, and thanks to help from Stack Overflow I've managed to put together the code below. It works well: it clicks through each page and then follows every link to pull the information I need. However, it gets stuck on one of the links because that exhibitor has no website information to extract.
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
import time
binary = FirefoxBinary('geckodriver.exe')
driver = webdriver.Firefox()
driver.get('http://www.interzum.com/exhibitors-and-products/exhibitor-index/exhibitor-index-15.php')
url = 'http://www.interzum.com/exhibitors-and-products/exhibitor-index/exhibitor-index-15.php'
text = requests.get(url).text
page1 = BeautifulSoup(text, "html.parser")
def get_data(url, tries=0, max_tries=3):
    text_test2 = requests.get(url).text
    page2 = BeautifulSoup(text_test2, "html.parser")
    try:
        title = page2.find('h1', attrs={'class':'hl_2'}).text
        content = page2.find('div', attrs={'class':'cont'}).text
        phone = page2.find('div', attrs={'class':'sico ico_phone'}).text
        email_div = page2.find('div', attrs={'class':'sico ico_email'})
        email = email_div.find('a', attrs={'class': 'xsecondarylink'})['href']
        web_div = page2.find('div', attrs={'class':'sico ico_link'})
        web = web_div.find('a', attrs={'class':'xsecondarylink'})
        if web != None:
            web = web['href']
    except:
        if tries < max_tries:
            tries += 1
            print("try {}".format(tries))
            return get_data(url, tries)
    data = {'Name': [title],
            'Street address': [content],
            'Phone number': [phone],
            'Email': [email],
            'Web': [web]
            }
    return pd.DataFrame(data=data)
df = pd.DataFrame()
for i in range(0,80):
    print(i)
    page1 = BeautifulSoup(driver.page_source, 'html.parser')
    for div in page1.findAll('div', attrs={'class':'item'}):
        for a in div.findAll('a', attrs={'class':'initial_noline'}):
            if 'kid=' not in a['href']: continue
            print('http://www.interzum.com' + a['href'])
            data = get_data('http://www.interzum.com' + a['href'])
            df = pd.concat([df, data])
    next_button = driver.find_element_by_class_name('slick-next')
    next_button.click()
    time.sleep(20)
df.to_csv('results.csv')
I've tried lots of different ways of saying: if the website doesn't exist, carry on with the loop, and if it does exist, pull the href for me. But I keep getting the error message UnboundLocalError: local variable 'web' referenced before assignment
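From what I can tell, the error happens because web (and the other fields) are only assigned inside the try block, so when every retry fails the names are never bound by the time data = {...} is built. A stripped-down, hypothetical sketch of the same situation (not my actual parsing code):

def get_data_sketch(has_web_link):
    try:
        if not has_web_link:
            raise AttributeError("no a.xsecondarylink inside div.sico ico_link")
        web = 'http://example.com'
    except AttributeError:
        pass                      # the failure is swallowed, but 'web' was never assigned
    return {'Web': [web]}         # UnboundLocalError when has_web_link is False

get_data_sketch(True)   # fine
get_data_sketch(False)  # UnboundLocalError: local variable 'web' referenced before assignment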
I just can't seem to get this working properly for the website information. Any insight into what I'm doing wrong would be greatly appreciated!
Thanks everyone.
Answer 0 (score: 0)
I think you need to switch to selenium to retrieve the information from each detail page, as the content isn't always loaded for requests. You can use the following as a framework.
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
baseLink = 'http://www.interzum.com'
varLink = 'http://www.interzum.com/exhibitors-and-products/exhibitor-index/exhibitor-index-15.php?fw_goto=aussteller/blaettern&fw_ajax=1&paginatevalues=%7B%22stichwort%22%3A%22%22%7D&start={}&dat=231518'
startUrl = 'http://www.interzum.com/exhibitors-and-products/exhibitor-index/exhibitor-index-15.php'
resultsPerPage = 20
i = 0
headers = {'User-Agent' : 'Mozilla/5.0'}
results = []
final = []
with requests.Session() as s:
    r = s.get(startUrl, headers = headers)
    soup = bs(r.content, 'lxml')
    numPages = int(soup.select('a[rel=next]')[-2].text)
    links = list((baseLink + link['href'] for link in soup.select('[href*="fw_goto=aussteller/details&&kid="]')))
    results.append(links)
    for j in range(1, numPages):
        i+=20
        url = varLink.format(i)
        r = s.get(url, headers = headers)
        soup = bs(r.content, 'lxml')
        links = list((baseLink + link['href'] for link in soup.select('[href*="fw_goto=aussteller/details&&kid="]')))
        results.append(links)

totalList = [item for sublist in results for item in sublist]

for link in totalList:
    driver.get(link)
    try:
        title = driver.find_element_by_css_selector('h1.hl_2').text
        content = driver.find_element_by_css_selector('div.cont').text
        phone = driver.find_element_by_css_selector('div.sico.ico_phone').text
        email = driver.find_element_by_css_selector('div.sico.ico_email a.xsecondarylink').get_attribute('href')
        web = driver.find_element_by_css_selector('div.sico.ico_link a.xsecondarylink').get_attribute('href')
        final.append([title, content, phone, email, web])
    except Exception as e:
        print(link)
        print(e)
        continue
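One adjustment worth considering, since the original problem was an exhibitor page with no website link: find_element_by_css_selector raises when an element is missing, so the except block skips that exhibitor entirely. Using the plural find_elements_by_css_selector for the optional fields (and the already-imported WebDriverWait / expected_conditions to wait for the title) lets a missing field become an empty string instead. This is just a sketch of an alternative to the loop above, assuming the same selectors:

def first_or_none(css):
    # return the first matching WebElement, or None when nothing matches
    elems = driver.find_elements_by_css_selector(css)
    return elems[0] if elems else None

for link in totalList:
    driver.get(link)
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'h1.hl_2')))
    title = driver.find_element_by_css_selector('h1.hl_2').text
    content = driver.find_element_by_css_selector('div.cont').text
    phone = first_or_none('div.sico.ico_phone')
    email = first_or_none('div.sico.ico_email a.xsecondarylink')
    web = first_or_none('div.sico.ico_link a.xsecondarylink')
    final.append([title,
                  content,
                  phone.text if phone else '',
                  email.get_attribute('href') if email else '',
                  web.get_attribute('href') if web else ''])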