我是网络爬虫的新手。我正在尝试从《福布斯》“最佳跨国公司”榜单中提取表格数据。我已经能够成功提取一部分数据,但只能获取榜单中的前 10 名。该表格的各行之间夹杂着广告。如何才能获取全部数据?
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Scrape the Forbes "Top Multinational Performers" table with Selenium and
# save Rank / Link / Company / Industry / Country to Forbes_TEST.csv.
#
# NOTE: only rows that are already present in the DOM are collected here; the
# script does not scroll the page.

# The argument is the path to the local chromedriver executable.
# NOTE(review): passing the path positionally looks like the Selenium 3
# calling convention - confirm against the installed Selenium version.
driver = webdriver.Chrome(r'C:/Users/Shirly.Ang3/Desktop/BUSINESS STAT/GGR/chromedriver_win32/chromedriver.exe')
url = "https://www.forbes.com/top-multinational-performers/list/"

try:
    driver.get(url)

    # Wait (up to 30 s) for the table body to show up in the DOM.
    wait_row = WebDriverWait(driver, 30)
    rows = wait_row.until(EC.presence_of_all_elements_located((
        By.XPATH, './/*[@id="the_list"]/tbody[@id="list-table-body"]')))

    data = []
    for row in rows:
        # Real list entries carry the "data" class.
        for tr in row.find_elements(By.CLASS_NAME, "data"):
            try:
                if not tr.is_displayed():
                    continue
                # XPath positions are 1-based: td[2] is the rank column.
                row_dict = {
                    'Rank': tr.find_element(By.XPATH, './/td[2]').text,
                    'Link': tr.find_element(
                        By.XPATH, './/td[3]/a[@href]').get_attribute("href"),
                    'Company': tr.find_element(By.XPATH, './/td[3]').text,
                    'Industry': tr.find_element(By.XPATH, './/td[4]').text,
                    'Country': tr.find_element(By.XPATH, './/td[5]').text,
                }
            except (NoSuchElementException, StaleElementReferenceException):
                # Skip rows that lack the expected cells or that were
                # re-rendered while being read; any other error is a real
                # failure and must not be hidden.
                continue
            data.append(row_dict)
finally:
    # Always end the browser session, even if scraping failed part-way.
    driver.quit()

df = pd.DataFrame(data)
df.to_csv("Forbes_TEST.csv", sep=",", index=False)
答案 0 :(得分:0)
要获取全部 250 条记录,您只需在现有代码中加入滚动到页面底部的代码即可。因此请添加:
# Jump to the bottom of the page so that the rest of the table gets loaded,
# then pause for a fixed 5 seconds to let the new rows render.
scroll_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"
driver.execute_script(scroll_to_bottom)
time.sleep(5)
并把它们放在下面这一行之前:
data = []
同时在文件开头的导入部分添加 import time。
不过,您的代码确实很慢。即使把 wait_row 的超时时间设置为 3 秒,它在我的机器上仍需要 1m5.933s 才能运行完。下面的代码只需 0m12.978s。
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from bs4 import BeautifulSoup
import csv
# Faster variant: let Selenium load and scroll the page once, then parse the
# resulting HTML with BeautifulSoup instead of querying the browser per cell.
# Output goes to Forbes_TEST_02.csv.

# The argument is the path to the local chromedriver executable.
# NOTE(review): passing the path positionally looks like the Selenium 3
# calling convention - confirm against the installed Selenium version.
driver = webdriver.Chrome(r'C:/Users/Shirly.Ang3/Desktop/BUSINESS STAT/GGR/chromedriver_win32/chromedriver.exe')
url = "https://www.forbes.com/top-multinational-performers/list/"

try:
    driver.get(url)

    # Wait (up to 3 s) for the table body to exist before scrolling.
    wait_row = WebDriverWait(driver, 3)
    wait_row.until(EC.presence_of_all_elements_located(
        (By.XPATH, './/*[@id="the_list"]/tbody[@id="list-table-body"]')))

    # Scroll to the bottom so the remaining rows get loaded, then pause for
    # a fixed 5 seconds to let them render.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(5)

    # Take the snapshot while the session is still alive.
    page_html = driver.page_source
finally:
    # Always end the browser session, even if loading failed part-way.
    driver.quit()

soup = BeautifulSoup(page_html, "lxml")
table = soup.find("table", {"id": "the_list"})

# One tuple per list entry: (rank, link, company, industry, country).
data = []
for tr in table.find_all("tr", {"class": "data"}):
    tds = tr.find_all("td")
    data.append((
        tds[1].text,
        tds[2].find('a')['href'],
        tds[2].text,
        tds[3].text,
        tds[4].text,
    ))

# newline='' is required by the csv module; without it every row is followed
# by a blank line on Windows. UTF-8 keeps non-ASCII company names writable
# regardless of the system locale.
with open('Forbes_TEST_02.csv', 'w', newline='', encoding='utf-8') as csvfile:
    csv_out = csv.writer(csvfile)
    csv_out.writerow(['Rank', 'Link', 'Company', 'Industry', 'Country'])
    csv_out.writerows(data)