我有一个包含登记表格的网站,需要抓取其中的表格。在该表格中,电子邮件地址只有在新标签页中打开对应链接时才能看到,不过它们确实存在于页面的 HTML 源码中。用下面的代码我无法抓取到这些电子邮件。
class HTMLTableParser:
    """Download a web page and convert each of its HTML tables to a DataFrame."""

    def parse_url(self, url, timeout=30):
        """Fetch *url* and parse every ``<table>`` element found in it.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server; passed straight to
                ``requests.get`` so a stalled connection cannot hang forever.

        Returns:
            A list of ``(table_id, DataFrame)`` tuples, one per table.
            ``table_id`` is ``None`` for a table without an ``id`` attribute.
        """
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'lxml')
        # Tag.get() yields None for a missing attribute, whereas
        # table['id'] raises KeyError on any table that has no id.
        return [(table.get('id'), self.parse_html_table(table))
                for table in soup.find_all('table')]

    def parse_html_table(self, table):
        """Convert one BeautifulSoup ``<table>`` element into a DataFrame.

        Only the text of each ``<td>`` cell is stored.

        Args:
            table: The ``<table>`` element to convert.

        Returns:
            A DataFrame with one row per ``<tr>`` that contains ``<td>``
            cells. Columns are named after the first group of ``<th>`` cells
            found, or numbered from 0 when the table has no ``<th>`` cells.
            Columns whose values all convert to ``float`` are converted.

        Raises:
            Exception: If header cells exist but their count differs from
                the number of cells in the first data row.
        """
        rows = table.find_all('tr')

        # First pass: size the frame and pick up the header titles.
        n_columns = 0
        n_rows = 0
        column_names = []
        for row in rows:
            td_tags = row.find_all('td')
            if td_tags:
                n_rows += 1
                if n_columns == 0:
                    # The first data row fixes the column count.
                    n_columns = len(td_tags)

            th_tags = row.find_all('th')
            if th_tags and not column_names:
                column_names = [th.get_text() for th in th_tags]

        if column_names and len(column_names) != n_columns:
            raise Exception("Column titles do not match the number of columns")

        columns = column_names if column_names else range(n_columns)
        df = pd.DataFrame(columns=columns, index=range(n_rows))

        # Second pass: copy the cell text; rows without <td> cells
        # (e.g. header-only rows) do not occupy a row in the frame.
        row_marker = 0
        for row in rows:
            cells = row.find_all('td')
            if not cells:
                continue
            for column_marker, cell in enumerate(cells):
                df.iat[row_marker, column_marker] = cell.get_text()
            row_marker += 1

        # Best-effort numeric conversion: a column holding any
        # non-numeric text is deliberately left as it is.
        for col in df:
            try:
                df[col] = df[col].astype(float)
            except ValueError:
                pass
        return df
答案 0 :(得分:0)
编辑:最初,我只回答了如何获取电子邮件。现已调整答案,以便在获取电子邮件的同时也获取所有其他数据。 编辑2:与 BS4 4.6 系列兼容。
之所以无法获取电子邮件,是因为它位于锚点(`<a>` 标签)的 `href` 属性中。如果单元格中存在锚点,我们就从其 `href` 中提取电子邮件;否则,直接提取单元格的文本。
由于我并不完全清楚这段代码的最终目标是什么,所以这里只是提取所有单元格,重点是获取原始代码中未能捕获的电子邮件。
from bs4 import BeautifulSoup
import requests
import pandas as pd
# Directory A-to-Z listing on adelaide.edu.au; the query string asks for
# 50 entries per page.
url = 'https://www.adelaide.edu.au/directory/atoz?dsn=directory.phonebook;orderby=last%2Cfirst%2Cposition_n;m=atoz;page=;perpage=50'
def parse_url(url, timeout=30):
    """Fetch *url* and parse every ``<table>`` element found in it.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server; passed straight to
            ``requests.get`` so a stalled connection cannot hang forever.

    Returns:
        A list of ``(table_id, DataFrame)`` tuples, one per table.
        ``table_id`` is ``None`` for a table without an ``id`` attribute.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'lxml')
    # Tag.get() yields None for a missing attribute, whereas
    # table['id'] raises KeyError on any table that has no id.
    return [(table.get('id'), parse_html_table(table))
            for table in soup.find_all('table')]
def parse_html_table(table):
    """Convert a BeautifulSoup ``<table>`` element into a DataFrame.

    Cells whose ``data-th`` attribute is ``'Email'`` take their value from
    the ``href`` of the anchor inside them, with ``mailto:`` removed,
    because the address is stored in the link target. Every other cell,
    and any Email cell without a usable anchor, keeps its plain text.

    Args:
        table: The ``<table>`` element to convert.

    Returns:
        A DataFrame whose columns are the texts of the table's ``<th>``
        cells, with one row per ``<tr>`` after the first.
    """
    column_names = [th.get_text() for th in table.select('th')]
    # The first <tr> is skipped on the assumption that it is the header row.
    rows = table.select('tr')[1:]
    df = pd.DataFrame(columns=column_names, index=range(len(rows)))

    for r_index, row in enumerate(rows):
        for c_index, cell in enumerate(row.select('td')):
            value = cell.get_text()
            if cell.get('data-th') == 'Email':
                anchor = cell.select_one('a')
                # An anchor without an href yields None here; fall back to
                # the cell text rather than failing on None.replace().
                href = anchor.get('href') if anchor else None
                if href is not None:
                    value = href.replace('mailto:', '')
            df.iat[r_index, c_index] = value
    return df
# Download the page and print the list of (table id, DataFrame) pairs.
print(parse_url(url))