Question

我有一个包含登记表格的网站，需要抓取其中的表格。表格中的电子邮件地址只有在新标签页中打开链接时才能看到，但它们确实出现在页面的 HTML 源码中。我无法抓取到这些电子邮件地址。

class HTMLTableParser:
    """Parse the HTML tables of a web page into pandas DataFrames."""

    def parse_url(self, url):
        """Download *url* and parse every ``<table>`` element found in it.

        Returns a list of ``(table_id, DataFrame)`` tuples, one per table.
        ``table_id`` is the table's ``id`` attribute, or ``None`` when the
        table does not have one.
        """
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'lxml')
        # ``table.get('id')`` instead of ``table['id']``: a single table
        # without an id must not abort parsing of the whole page.
        return [(table.get('id'), self.parse_html_table(table))
                for table in soup.find_all('table')]

    def parse_html_table(self, table):
        """Convert one parsed ``<table>`` element into a DataFrame.

        The first row containing ``<th>`` cells supplies the column names;
        without any header the columns are numbered from 0. Every row that
        contains ``<td>`` cells becomes one DataFrame row holding the cell
        texts. Columns whose values all parse as numbers are converted to
        float.

        Raises:
            Exception: if a header row exists but its length differs from
                the number of cells in the first data row.
        """
        n_columns = 0
        n_rows = 0
        column_names = []

        # First pass: determine the table dimensions and the header names.
        for row in table.find_all('tr'):
            td_tags = row.find_all('td')
            if td_tags:
                n_rows += 1
                if n_columns == 0:
                    # The first data row defines the table width.
                    n_columns = len(td_tags)

            th_tags = row.find_all('th')
            if th_tags and not column_names:
                column_names = [th.get_text() for th in th_tags]

        if column_names and len(column_names) != n_columns:
            raise Exception("Column titles do not match the number of columns")

        columns = column_names if column_names else range(n_columns)
        df = pd.DataFrame(columns=columns, index=range(n_rows))

        # Second pass: copy the cell texts into the pre-sized frame.
        row_marker = 0
        for row in table.find_all('tr'):
            cells = row.find_all('td')
            if not cells:
                # Header-only or empty rows do not produce a DataFrame row.
                continue
            for column_marker, cell in enumerate(cells):
                df.iat[row_marker, column_marker] = cell.get_text()
            row_marker += 1

        # Best-effort numeric conversion: columns that are not entirely
        # numeric deliberately keep their text values.
        for col in df:
            try:
                df[col] = df[col].astype(float)
            except ValueError:
                pass

        return df

Answer 1

编辑：最初我只回答了如何获取电子邮件。现已调整答案，可同时获取电子邮件和所有其他数据。编辑2：已兼容 BS4 4.6 系列。

之所以抓取不到电子邮件，是因为它位于锚点（<a> 标签）的 href 属性中，而不在单元格的文本里。如果单元格中有锚点，我们就从锚点的 href 中提取电子邮件；否则提取单元格的文本。

由于我并不完全清楚这段代码的最终目标，这里只是提取所有单元格，重点是获取原始代码未能抓取到的电子邮件。

from bs4 import BeautifulSoup
import requests
import pandas as pd

# Directory listing page passed to parse_url() below; the query string
# carries the ordering fields and perpage=50.
url = 'https://www.adelaide.edu.au/directory/atoz?dsn=directory.phonebook;orderby=last%2Cfirst%2Cposition_n;m=atoz;page=;perpage=50'

def parse_url(url):
    """Download *url* and parse every ``<table>`` element found in it.

    Returns a list of ``(table_id, DataFrame)`` tuples, one per table.
    ``table_id`` is the table's ``id`` attribute, or ``None`` when the
    table does not have one.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    # ``table.get('id')`` instead of ``table['id']``: a single table without
    # an id must not abort parsing of the whole page with a KeyError.
    return [(table.get('id'), parse_html_table(table))
            for table in soup.find_all('table')]

def _cell_value(cell):
    """Return the value to store for one ``<td>`` cell.

    For cells labelled ``data-th="Email"`` the address is taken from the
    ``href`` of the first anchor, with a leading ``mailto:`` removed. Any
    other cell, or an Email cell without a usable link, yields its text.
    """
    if cell.get('data-th') == 'Email':
        anchor = cell.select_one('a')
        # An anchor may lack an href attribute; fall back to the cell text
        # instead of failing on ``None.replace``.
        href = anchor.get('href') if anchor else None
        if href:
            prefix = 'mailto:'
            # Strip the scheme only when it really is a prefix.
            if href[:len(prefix)].lower() == prefix:
                return href[len(prefix):]
            return href
    return cell.get_text()


def parse_html_table(table):
    """Convert one parsed ``<table>`` element into a DataFrame.

    Column names are the texts of all ``<th>`` cells in the table. The first
    ``<tr>`` is treated as the header row and skipped; every following row
    becomes one DataFrame row. Cells are stored as text, except Email cells,
    whose address is read from the link target (see ``_cell_value``).
    """
    column_names = [th.get_text() for th in table.select('th')]
    rows = table.select('tr')[1:]

    df = pd.DataFrame(columns=column_names, index=range(len(rows)))

    for r_index, row in enumerate(rows):
        for c_index, cell in enumerate(row.select('td')):
            df.iat[r_index, c_index] = _cell_value(cell)

    return df

# Fetch the page and print the list of (table id, DataFrame) pairs.
print(parse_url(url))

当电子邮件被列为链接时，如何从给定的链接中获取表格中的电子邮件？

1 个答案: