I need to scrape the job description text for every job on the page mentioned below

Date: 2019-09-09 06:30:02

Tags: python-3.x web-scraping beautifulsoup

I need to use Python's BeautifulSoup module to scrape, from the page (), the job description for every job title, with each field in a different column of a CSV file, for example: section (Accounting), job title (Staff Accountant), job description text.

I am new to BeautifulSoup. I have tried a few approaches, but they do not work. Could you please help with the code?

# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

start = time.time()

url = ""
data = []
resp = requests.get(url)
soup = BeautifulSoup(resp.content, 'lxml')
jobdesc = soup.find("li", {'class': 'col-xs-12 col-sm-4'})   # find() returns only the first matching <li>
section = soup.find("h4")                                    # find() returns only the first <h4>
jd = {"jobdescription": jobdesc.text, "topic": section.text}
data.append(jd)
df = pd.DataFrame(data)
df.to_csv("JD.csv")

3 Answers:

Answer 0 (score: 1)

Here is one approach that uses the :has pseudo-class selector (bs4 4.7.1+) to isolate each section for looping. zip_longest is used so the section heading can be joined to every job in that section.

import requests, csv
from bs4 import BeautifulSoup as bs
from itertools import zip_longest

r = requests.get('https://resources.workable.com/job-descriptions/#', headers = {'User-Agent':'Mozilla/5.0'})
soup = bs(r.content, 'lxml')

with open("data.csv", "w", encoding="utf-8-sig", newline='') as csv_file:

    w = csv.writer(csv_file, delimiter = ",", quoting=csv.QUOTE_MINIMAL)
    w.writerow(['Section','Job Title'])

    # each <section> containing at least one .job element is a job category
    for section in soup.select('section:has(.job)'):
        title = section.select_one('a').text.strip()                  # section heading, e.g. "Accounting"
        jobs = [job.text for job in section.select('li a')]           # all job titles inside that section
        rows = list(zip_longest([title], jobs, fillvalue = title))    # pair the heading with every job
        for row in rows:
            w.writerow(row)
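To see why zip_longest pairs the heading with every job, here is a minimal, self-contained sketch with made-up values (the titles below are illustrative only): the single-element list [title] is exhausted after the first pair, so each remaining job is matched against the fillvalue, which is again the section heading.

from itertools import zip_longest

title = 'Accounting'                                       # hypothetical section heading
jobs = ['Staff Accountant', 'Payroll Clerk', 'Auditor']    # hypothetical job titles

rows = list(zip_longest([title], jobs, fillvalue=title))
print(rows)
# [('Accounting', 'Staff Accountant'),
#  ('Accounting', 'Payroll Clerk'),
#  ('Accounting', 'Auditor')]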

Answer 1 (score: 0)

With the requests package I got a 403 Forbidden, so I decided to use selenium.

You can try the following:

from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
from selenium import webdriver

url = "https://resources.workable.com/job-descriptions/#"
data = []
#resp = requests.get(url)
#soup = BeautifulSoup(resp.text, 'html.parser')
driver = webdriver.Firefox()
driver.get(url)
soup = BeautifulSoup(driver.page_source, 'html.parser')
section = soup.find_all('section',{'class':'box-white'})
for s in section:
    title = s.find('h4').text
    lis = s.find_all("li", {'class': 'col-xs-12 col-sm-4'})  # search inside this section only, not the whole page
    for li in lis:
        jd = {"jobdescription":li.text,"topic":title}
        data.append(jd)
df = pd.DataFrame(data)
df.to_csv("JD.csv")
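As a side note, the 403 from requests can often be avoided by sending a browser-like User-Agent header, as the first answer does. Whether that is enough for every page on this site is not guaranteed, but it is worth trying before switching to selenium; a minimal sketch:

import requests
from bs4 import BeautifulSoup

url = "https://resources.workable.com/job-descriptions/#"
resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
resp.raise_for_status()                         # raises if the server still answers 403
soup = BeautifulSoup(resp.text, 'html.parser')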

Edit: to get the description of every job:

from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
from selenium import webdriver

url = "https://resources.workable.com/job-descriptions/#"
data = []
#resp = requests.get(url)
#soup = BeautifulSoup(resp.text, 'html.parser')
driver = webdriver.Firefox()
driver.get(url)
soup = BeautifulSoup(driver.page_source, 'html.parser')
section = soup.find_all('section',{'class':'box-white'})
for s in section:
    title = s.find('h4').text
    lis = s.find_all("li",{'class':'col-xs-12 col-sm-4'})
    for li in lis:
        job = li.text
        driver.get(li.find('a').get('href'))                       # open the individual job page
        soup2 = BeautifulSoup(driver.page_source, 'html.parser')
        jd = {"job": job, "topic": title,
              "description": soup2.find('div', {'class': 'entry-content article-content'}).text}
        data.append(jd)

df = pd.DataFrame(data)
df.to_csv("JD.csv")
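Neither snippet closes the browser when it finishes, so Firefox processes can pile up across runs. A small sketch of the usual pattern, with the scraping body elided:

from selenium import webdriver

url = "https://resources.workable.com/job-descriptions/#"
driver = webdriver.Firefox()
try:
    driver.get(url)
    # ... scrape as above ...
finally:
    driver.quit()   # always release the browser, even if parsing raises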

Answer 2 (score: -1)

Scrape data from Monster jobs and upload it to MongoDB.

from time import sleep
from selenium import webdriver
import pymongo


client = pymongo.MongoClient()
mydb =  client['jobs']
collection  = mydb['med_title']

# path to a local chromedriver binary; adjust for your machine
driver = webdriver.Chrome("C:/Users/91798/Desktop/pythn_files/chromedriver.exe")
driver.get("https://www.monsterindia.com/")

driver.implicitly_wait(9)
# type the search keywords into the home-page search box
driver.find_element_by_id("SE_home_autocomplete").send_keys("nursing , Therapist , docter , medical ,nurse , hospital")

# for normal search use this: click the search button
driver.find_element_by_xpath("//body/div[@id='themeDefault']/section[1]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/form[1]/div[1]/div[2]/input[1]").click()
driver.implicitly_wait(20)
temp = 1
while True:
    if temp == 5:        # stop after 4 result pages
        break
    all_jobs =  driver.find_elements_by_class_name("card-apply-content")
    link_list = []
    for job in all_jobs:
        try:
            company = ""
            com_name = job.find_elements_by_class_name("job-tittle")
            driver.implicitly_wait(1)
            for ele in com_name:
                company = ele.find_element_by_class_name('company-name').text
            job_title = ""
            for ele in com_name:
                job_title = ele.find_element_by_class_name('medium').text
       
            location = job.find_element_by_class_name("loc").text
            driver.implicitly_wait(1)
            lnks= job.find_elements_by_tag_name("a")
            for lnk in lnks:
                link_list.append(lnk.get_attribute('href'))
                break
            driver.implicitly_wait(1)
            desc = job.find_element_by_class_name("job-descrip").text
            driver.implicitly_wait(1)
            skills = job.find_element_by_class_name("descrip-skills").text

        except Exception:   # fall back to placeholders if a field is missing on this card
            desc =  'desc Not Specified'
            skills =  'skills Not Specified'  
            location = ' location Not Specified'
            company = 'company  Not Specified'
            job_title = 'job_title not specified'
        
        # drop stray comma tokens from the skills list (removing items while iterating is unreliable)
        s = [token for token in skills.split(' ') if token != ',']
        data = {"job_title": job_title, "company_name": company, "job_location": location,
                "job_desc": desc, "skills": s[2:], "card_link": link_list[0]}
        link_list.clear()
        y =  collection.insert_one(data)
        print(y.inserted_id)
    driver.find_element_by_xpath("//button[contains(text(),'Next')]").click()
    sleep(25)
    temp = temp +1
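Once the loop finishes, the stored documents can be checked directly with pymongo; a quick sketch assuming the same jobs database and med_title collection used above:

import pymongo

client = pymongo.MongoClient()
collection = client['jobs']['med_title']

print(collection.count_documents({}))     # number of job cards stored
# peek at a few records, showing only the title and location fields
for doc in collection.find({}, {"job_title": 1, "job_location": 1, "_id": 0}).limit(5):
    print(doc)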