我是Python的新手,但是我正在寻找一个Web抓取工具,该工具将在线提取HTML表格中的数据并将其以相同格式打印为CSV。
我正在尝试从网页中抓取数据。但这给了我一个空数组,尽管其中存在数据。如何提取这些数据?
我的代码如下:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Fetch the Forbes Global 2000 list page and load its table rows into a
# pandas DataFrame.
# NOTE: urlopen only returns the static HTML; any rows the page fills in with
# JavaScript after loading will not be present in this response.
url = "https://www.forbes.com/global2000/list/#tab:overall"

# Close the HTTP response as soon as the markup has been parsed.
with urlopen(url) as page:
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parsers happen to be installed on the machine.
    soup = BeautifulSoup(page, "html.parser")

# Debug output: the full document and every <tbody> found in it.
print(soup.prettify())
all_tables = soup.find_all('tbody')
print(all_tables)

right_table = soup.find('tbody', {"class": 'list-item-template'})

columns = ['IMAGE', 'RANK', 'NAME', 'COUNTRY', 'REVENUE', 'PROFITS', 'ASSETS']
records = []

# find() returns None when no matching <tbody> exists; treat that as "no rows"
# instead of failing with an AttributeError.
table_rows = right_table.find_all("tr") if right_table is not None else []
for row in table_rows:
    cells = row.find_all('td')
    print(len(cells))
    if len(cells) == 8:  # Only extract table body rows, not the heading
        # Rows carry eight cells, but only the first seven are exported.
        # find(string=True) yields the first text node of the cell, or None.
        records.append(
            [cell.find(string=True) for cell in cells[:len(columns)]]
        )

df = pd.DataFrame(records, columns=columns)
print(df)
我得到以下输出:
答案 0 :(得分:1)
该页面需要运行 JavaScript 才能填充数据,并且可能会先弹出 Cookie 同意窗口。下面的代码使用 Selenium 打开页面,并在执行各项操作之前等待相应元素加载完成:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
# Open the Forbes Global 2000 list in Chrome, get past the pop-up frame,
# and print the text of the rendered list element.
url = 'https://www.forbes.com/global2000/list/#tab:overall'
d = webdriver.Chrome()
try:
    d.get(url)

    # Enter the iframe whose id starts with "pop-frame" (presumably the
    # cookie-consent dialog) and click its ".call" button once it is clickable.
    # find_element(By..., ...) replaces the find_element_by_* helpers, which
    # were removed in Selenium 4.
    d.switch_to.frame(d.find_element(By.CSS_SELECTOR, '[id^="pop-frame"]'))
    WebDriverWait(d, 5).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, ".call"))
    ).click()

    # Wait up to 5 seconds for the ".legalese" element, then move the pointer
    # to it.
    element = WebDriverWait(d, 5).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '.legalese'))
    )
    actions = ActionChains(d)
    actions.move_to_element(element).perform()

    table = d.find_element(By.CSS_SELECTOR, '#the_list')
    print(table.text)
finally:
    # Always shut the browser down, even when a lookup or wait fails.
    d.quit()
答案 1 :(得分:0)
您之所以抓取不到数据,是因为在抓取时页面的 HTML 尚未加载完成。您必须等待 JavaScript 部分执行完毕。最简单的方法之一是使用 Selenium 并等待页面加载完成:
#!/usr/bin/python3
# coding: utf8
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from urllib.request import urlopen
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
# Render the Forbes Global 2000 list in Firefox so its JavaScript runs, then
# parse the resulting HTML into a pandas DataFrame.
url = "https://www.forbes.com/global2000/list/#tab:overall"

# Load the javascript part
driver = webdriver.Firefox()
try:
    driver.get(url)
    # Fixed pause to give the page's scripts time to run before the DOM is
    # captured.
    time.sleep(10)
    html = driver.page_source
finally:
    # Always shut the browser down, even when loading the page fails.
    driver.quit()

# Name the parser explicitly so the parse tree does not depend on which
# optional parsers happen to be installed on the machine.
soup = BeautifulSoup(html, "html.parser")

# Debug output: the full document and every <tbody> found in it.
print(soup.prettify())
all_tables = soup.find_all('tbody')
print(all_tables)

right_table = soup.find('tbody', {"class": 'list-item-template'})
print(right_table)

columns = ['IMAGE', 'RANK', 'NAME', 'COUNTRY', 'REVENUE', 'PROFITS', 'ASSETS']
records = []

# find() returns None when no matching <tbody> exists; treat that as "no rows"
# instead of failing with an AttributeError.
table_rows = right_table.find_all("tr") if right_table is not None else []
for row in table_rows:
    cells = row.find_all('td')
    print(len(cells))
    if len(cells) == 8:  # Only extract table body rows, not the heading
        # Rows carry eight cells, but only the first seven are exported.
        # find(string=True) yields the first text node of the cell, or None.
        records.append(
            [cell.find(string=True) for cell in cells[:len(columns)]]
        )

df = pd.DataFrame(records, columns=columns)
print(df)