Scraping data from a website using Beautiful Soup and Pandas

Date: 2021-03-03 23:08:14

Tags: python pandas beautifulsoup

I have a Python script that uses the BeautifulSoup and Pandas packages to scrape data from a list of URLs, convert the data into a dataframe, save it as an Excel file, and then email it as an attachment.

The problem is that when the script runs and finishes scraping the first item, it crashes with the following error:

ValueError: 15 columns passed, passed data had 14 columns

I think this means an HTML tag is missing, right?

The list contains 3 URLs.

Code:

import time
from datetime import date
import smtplib

import requests
import pandas as pd
from bs4 import BeautifulSoup
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.base import MIMEBase
from email import encoders
from email.utils import formatdate

def scrape_website():
    url_list = ["https://www.bayt.com/en/international/jobs/executive-chef-jobs/",
                "https://www.bayt.com/en/international/jobs/head-chef-jobs/",
                "https://www.bayt.com/en/international/jobs/executive-sous-chef-jobs/"]
    for url in url_list:
        soup = BeautifulSoup(requests.get(url).content, "lxml")

        links = []
        for a in soup.select("h2.m0.t-regular a"):
            if a['href'] not in links:
                links.append("https://www.bayt.com" + a['href'])
        joineddd = []

        for link in links:
            s = BeautifulSoup(requests.get(link).content, "lxml")
            alldd = [dd.text for dd in s.select(
                "div[class='card-content is-spaced'] dd")]
            alldd.insert(0, link)
            joineddd.append(alldd)
        print("Web Crawling is Done for  {}".format(url))
        convert_to_dataFrame(joineddd)
    send_email()

def remove_unwanted_cols(dataset, cols):
    for col in cols:
        del dataset[col]
    return dataset

def convert_to_dataFrame(joineddd):
    df = pd.DataFrame(joineddd, columns=[
        "link", "location", "Company_Industry", "Company_Type",
        "Job_Role", "Employment_Type", "Monthly_Salary_Range",
        "Number_of_Vacancies", "Career_Level",
        "Years_of_Experience", "Residence_Location",
        "Gender","Nationality","Degree","Age"])
    df = remove_unwanted_cols(df, ["Company_Industry","Company_Type","Job_Role","Number_of_Vacancies"])
    df_to_excel = df.to_excel(r"F:\\AIenv\web_scrapping\\jobDesc.xlsx", index = False, header=True)
    send_email()

def send_email():
    '''send email '''
    today = date.today()
    file = 'F:\\AIenv\web_scrapping\\jobDesc.xlsx'
    username='XXXXXXXXXXX'
    password='XXXXXXXXXXXXX'
    send_from = 'XXXXXXXXXXXXX'
    send_to = 'XXXXXXXXXXXXXX'
    Cc = 'recipient'
    msg = MIMEMultipart()
    msg['From'] = send_from
    msg['To'] = send_to
    msg['Cc'] = Cc
    msg['Date'] = formatdate(localtime=True)
    msg['Subject'] = 'Hello, This is a test mail {}'.format(today)
    server = smtplib.SMTP('smtp.gmail.com')
    port = '587'
    fp = open(file, 'rb')
    part = MIMEBase('application', 'vnd.ms-excel')
    part.set_payload(fp.read())
    fp.close()
    encoders.encode_base64(part)
    part.add_header('Content-Disposition', 'attachment', filename='jobs Description--{}'.format(today))
    msg.attach(part)
    smtp = smtplib.SMTP('smtp.gmail.com')
    smtp.ehlo()
    smtp.starttls()
    smtp.login(username, password)
    smtp.sendmail(send_from, send_to.split(',') + msg['Cc'].split(','), msg.as_string())
    smtp.quit()
    print('Mail Sent')


if __name__ == "__main__":
    scrape_website()

3 Answers:

Answer 0 (score: 1)

Update the scrape_website() function so that alldd is saved as a dictionary:

for link in links:
    s = BeautifulSoup(requests.get(link).content, "lxml") 
    ### update Start ###
    alldd = dict()
    alldd['link'] = link
    dd_div = [i for i in s.select("div[class='card-content is-spaced'] div") 
              if ('<dd>' in str(i) ) and ( "<dt>" in str(i))]
    for div in dd_div:
        k = div.select_one('dt').get_text(';', True)
        v = div.select_one('dd').get_text(';', True)
        alldd[k] = v
    ### update End  ###    
    joineddd.append(alldd)


# result
df = pd.DataFrame(joineddd)

Sample alldd:


{
         'link': 'https://www.bayt.com/en/qatar/jobs/executive-chef-4298309/',       
         'Job Location': 'Doha, Qatar',
         'Company Industry': 'Real Estate; Hospitality & Accomodation; Catering, Food Service, & Restaurant',
         'Company Type': 'Employer (Private Sector)',
         'Job Role': 'Hospitality and Tourism',
         'Employment Type': 'Unspecified',
         'Monthly Salary Range': 'Unspecified',
         'Number of Vacancies': 'Unspecified',
         'Career Level': 'Mid Career',
         'Years of Experience': 'Min: 7',
         'Residence Location': 'Qatar',
         'Degree': "Bachelor's degree / higher diploma"
}

Answer 1 (score: 0)

ValueError: 15 columns passed, passed data had 14 columns

What I read here is that you told the dataframe it has 15 columns, but the data you passed in only has 14 fields. You need to check the scraped source to make sure it really contains the data you expect, or adjust the columns you expect, and their names, to match the data.
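For illustration, here is a minimal sketch of how that mismatch arises and one way to guard against it (the column names and URL are placeholders, not the real Bayt fields):

import pandas as pd

columns = ["link", "location", "salary"]        # 3 expected columns
row = ["https://example.com/job/1", "Doha"]     # only 2 values scraped, e.g. a <dd> was missing

# pd.DataFrame([row], columns=columns)
# -> ValueError: 3 columns passed, passed data had 2 columns

# one defensive option: pad short rows with None before building the frame
rows = [row]
padded = [r + [None] * (len(columns) - len(r)) for r in rows]
df = pd.DataFrame(padded, columns=columns)      # short rows now line up with the 3 columns

Padding only works around the symptom, though; building each row as a dictionary, as the other answers suggest, addresses the root cause.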

Answer 2 (score: 0)

Let's clean up some of the code.

  1. You don't need to write a function to remove columns; there is already a method for that, .drop(). So delete the remove_unwanted_cols(dataset, cols) function and simply change the line:

df = remove_unwanted_cols(df, ["Company_Industry","Company_Type","Job_Role","Number_of_Vacancies"])

to:

df = df.drop(["Company_Industry","Company_Type","Job_Role","Number_of_Vacancies"], axis=1)

  2. Did you mean to send the email twice? You do it in both the scrape_website() function and the convert_to_dataFrame() function.

  3. When pulling data to build a dataframe, I usually try to avoid lists, because that is exactly how you end up with the error that one page has x columns but the next scrape has an extra one (or a mismatch). A dictionary is a better way to handle it: the keys are the column names and the values are the data, so you end up with a list of dictionaries. Each item in the list is a row, and each dictionary entry is the value for one column (a short sketch follows this list). Pandas can then build the dataframe for you, so you could drop the convert_to_dataFrame() function entirely, but I've left it in; keep it or remove it as you like.

  4. If you use r'' for the string, you don't need to escape the \ characters. Either do r"F:\AIenv\web_scrapping\jobDesc.xlsx" or "F:\\AIenv\\web_scrapping\\jobDesc.xlsx".
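To illustrate point 3, a minimal sketch (the fields and URLs are made up, not the actual Bayt data): when you build the frame from a list of dictionaries, pandas takes the columns from the union of the keys and fills anything missing with NaN, so a page that lacks one field no longer breaks the whole dataframe.

import pandas as pd

# each scraped page becomes one dict; the keys are the column names
rows = [
    {"link": "https://example.com/job/1", "Career_Level": "Mid Career", "Degree": "Bachelor's"},
    {"link": "https://example.com/job/2", "Career_Level": "Management"},  # "Degree" missing on this page
]

df = pd.DataFrame(rows)   # columns are the union of all keys: link, Career_Level, Degree
# the second row simply gets NaN in the "Degree" column instead of raising a ValueError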

Code:

import time
from datetime import date
import smtplib

import requests
import pandas as pd
from bs4 import BeautifulSoup
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.base import MIMEBase
from email import encoders
from email.utils import formatdate

def scrape_website(url):
    soup = BeautifulSoup(requests.get(url).content, "lxml")
    subject = url.split('/')
    subject = [x for x in subject if x != ''][-1]
    links = []
    for a in soup.select("h2.m0.t-regular a"):
        if a['href'] not in links:
            links.append("https://www.bayt.com" + a['href'])
    
    joineddd = []
    for link in links:
        row = {}
        s = BeautifulSoup(requests.get(link).content, "lxml")
        job_description = s.find('h2', text='Job Description').find_next('dl')
        data_titles = job_description.find_all('dt')
        for data_title in data_titles:
            dt = '_'.join(data_title.text.split())
            dd = data_title.find_next('dd').text.strip()
            row.update({dt: dd})
                    
        if s.find('h2', text='Preferred Candidate'):
            preferred_candidate = s.find('h2', text='Preferred Candidate').find_next('dl')
            data_titles = preferred_candidate.find_all('dt')
            for data_title in data_titles:
                dt = '_'.join(data_title.text.split())
                dd = data_title.find_next('dd').text.strip()
                row.update({dt: dd})
            
        joineddd.append(row)
            
    print("Web Crawling is Done for  {}".format(url))
    convert_to_dataFrame(joineddd, subject)
    #send_email(subject) #<-- did you want to send here?

def convert_to_dataFrame(joineddd, subject):
    df = pd.DataFrame(joineddd)
    df = df.drop(["Company_Industry","Company_Type","Job_Role","Number_of_Vacancies"], axis=1)
    df.to_excel(r"F:\AIenv\web_scrapping\jobDesc.xlsx", index=False, header=True)
    send_email(subject)  #<--or do you want to send here??

def send_email(subject):
    '''send email '''
    today = date.today()
    file = r'F:\AIenv\web_scrapping\jobDesc.xlsx'
    username='XXXXXXXXXXX'
    password='XXXXXXXXXXXXX'
    send_from = 'XXXXXXXXXXXXX'
    send_to = 'XXXXXXXXXXXXXX'
    Cc = 'recipient'
    msg = MIMEMultipart()
    msg['From'] = send_from
    msg['To'] = send_to
    msg['Cc'] = Cc
    msg['Date'] = formatdate(localtime=True)
    msg['Subject'] = 'Hello, This is a test mail {} - {}'.format(today,subject)
    fp = open(file, 'rb')
    part = MIMEBase('application', 'vnd.ms-excel')
    part.set_payload(fp.read())
    fp.close()
    encoders.encode_base64(part)
    part.add_header('Content-Disposition', 'attachment', filename='jobs Description--{}'.format(today))
    msg.attach(part)
    smtp = smtplib.SMTP('smtp.gmail.com', 587)
    smtp.ehlo()
    smtp.starttls()
    smtp.login(username, password)
    smtp.sendmail(send_from, send_to.split(',') + msg['Cc'].split(','), msg.as_string())
    smtp.quit()
    print('Mail Sent')


url_list = ["https://www.bayt.com/en/international/jobs/executive-chef-jobs/",
            "https://www.bayt.com/en/international/jobs/head-chef-jobs/",
            "https://www.bayt.com/en/international/jobs/executive-sous-chef-jobs/"]

if __name__ == "__main__":
    for url in url_list:
        scrape_website(url)