I need to use Python's BeautifulSoup module to scrape the job descriptions from this page (https://resources.workable.com/job-descriptions/) and write each job title under its heading into separate columns of a CSV file, for example: section (Accounting), job title (Staff Accountant), and the job description text.
I'm new to BeautifulSoup and have tried a few approaches, but none of them work. Could you please help me with the code?
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
start = time.time()
url = ""
data = []
while True:
    resp = requests.get(url)
    soup = BeautifulSoup(resp.content, 'lxml')
    jobdesc = soup.find("li", {'class': 'col-xs-12 col-sm-4'})
    section = soup.find("h4")
    jd = {"jobdescription": jobdesc.text, "topic": section.text}
    data.append(jd)
df = pd.DataFrame(data)
df.to_csv("JD.csv")
Answer 0 (score: 1)
Here is one way, using the :has selector (bs4 4.7.1+), to isolate the sections to loop over. zip_longest is used so the section title can be joined to every job within it.
import requests, csv
from bs4 import BeautifulSoup as bs
from itertools import zip_longest
r = requests.get('https://resources.workable.com/job-descriptions/#', headers = {'User-Agent':'Mozilla/5.0'})
soup = bs(r.content, 'lxml')
with open("data.csv", "w", encoding="utf-8-sig", newline='') as csv_file:
    w = csv.writer(csv_file, delimiter=",", quoting=csv.QUOTE_MINIMAL)
    w.writerow(['Section', 'Job Title'])
    for section in soup.select('section:has(.job)'):
        title = section.select_one('a').text.strip()
        jobs = [job.text for job in section.select('li a')]
        rows = list(zip_longest([title], jobs, fillvalue=title))
        for row in rows:
            w.writerow(row)
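To see what the zip_longest call is doing, here is a tiny standalone illustration (the job titles below are made-up examples, not scraped data):
from itertools import zip_longest

title = 'Accounting'
jobs = ['Staff Accountant', 'Bookkeeper', 'Payroll Clerk']  # hypothetical titles, for illustration only

# [title] is shorter than jobs, so zip_longest pads it with fillvalue=title,
# giving every job its section title in the first column:
print(list(zip_longest([title], jobs, fillvalue=title)))
# [('Accounting', 'Staff Accountant'), ('Accounting', 'Bookkeeper'), ('Accounting', 'Payroll Clerk')]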
Answer 1 (score: 0)
I was getting a 403 (forbidden) with the requests package, so I decided to use selenium instead. You can try the following:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
from selenium import webdriver
url = "https://resources.workable.com/job-descriptions/#"
data = []
#resp = requests.get(url)
#soup = BeautifulSoup(resp.text, 'html.parser')
driver = webdriver.Firefox()
driver.get(url)
soup = BeautifulSoup(driver.page_source, 'html.parser')
section = soup.find_all('section', {'class': 'box-white'})
for s in section:
    title = s.find('h4').text
    # search within the current section (not the whole soup), otherwise every
    # job on the page would be attached to every section title
    lis = s.find_all("li", {'class': 'col-xs-12 col-sm-4'})
    for li in lis:
        jd = {"jobdescription": li.text, "topic": title}
        data.append(jd)
df = pd.DataFrame(data)
df.to_csv("JD.csv")
Edit: to get the description for every job:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
from selenium import webdriver
url = "https://resources.workable.com/job-descriptions/#"
data = []
#resp = requests.get(url)
#soup = BeautifulSoup(resp.text, 'html.parser')
driver = webdriver.Firefox()
driver.get(url)
soup = BeautifulSoup(driver.page_source, 'html.parser')
section = soup.find_all('section',{'class':'box-white'})
for s in section:
    title = s.find('h4').text
    lis = s.find_all("li", {'class': 'col-xs-12 col-sm-4'})
    for li in lis:
        job = li.text
        # open each job's page to read its full description
        driver.get(li.find('a').get('href'))
        soup2 = BeautifulSoup(driver.page_source, 'html.parser')
        jd = {"job": job, "topic": title, "description": soup2.find('div', {'class': 'entry-content article-content'}).text}
        data.append(jd)
df = pd.DataFrame(data)
df.to_csv("JD.csv")
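If you would rather avoid selenium altogether, the 403 can often be worked around with a browser User-Agent header, as the first answer does. Below is a sketch combining that idea with the per-job requests above, assuming the page still responds to plain requests with such a header and keeps the class names used in these answers; it writes the three columns the question asks for:
import csv
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}
url = "https://resources.workable.com/job-descriptions/#"

r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.content, 'html.parser')

with open("JD.csv", "w", encoding="utf-8-sig", newline='') as f:
    w = csv.writer(f)
    w.writerow(["Section", "Job Title", "Job Description"])
    for s in soup.find_all('section', {'class': 'box-white'}):
        title = s.find('h4').text.strip()
        for li in s.find_all("li", {'class': 'col-xs-12 col-sm-4'}):
            link = li.find('a')
            # follow each job link and pull the description text
            r2 = requests.get(link.get('href'), headers=headers)
            soup2 = BeautifulSoup(r2.content, 'html.parser')
            desc = soup2.find('div', {'class': 'entry-content article-content'})
            w.writerow([title, link.text.strip(), desc.text.strip() if desc else ''])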
Answer 2 (score: -1)
Scraping data from Monster job listings and uploading it to MongoDB.
from time import *
from selenium import webdriver
import pymongo
from pymongo.results import InsertManyResult
import os

# MongoDB connection: database 'jobs', collection 'med_title'
client = pymongo.MongoClient()
mydb = client['jobs']
collection = mydb['med_title']

driver = webdriver.Chrome("C:/Users/91798/Desktop/pythn_files/chromedriver.exe")
driver.get("https://www.monsterindia.com/")
driver.implicitly_wait(9)

# type the search keywords and submit the search form
driver.find_element_by_id("SE_home_autocomplete").send_keys("nursing , Therapist , docter , medical ,nurse , hospital")
#for normal search use this
driver.find_element_by_xpath("//body/div[@id='themeDefault']/section[1]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/form[1]/div[1]/div[2]/input[1]").click()
driver.implicitly_wait(20)

temp = 1
while True:
    if temp == 5:  # stop after four result pages
        break
    all_jobs = driver.find_elements_by_class_name("card-apply-content")
    link_list = []
    for job in all_jobs:
        try:
            company = ""
            com_name = job.find_elements_by_class_name("job-tittle")
            driver.implicitly_wait(1)
            for ele in com_name:
                company = ele.find_element_by_class_name('company-name').text
            job_title = ""
            for ele in com_name:
                job_title = ele.find_element_by_class_name('medium').text
            location = job.find_element_by_class_name("loc").text
            driver.implicitly_wait(1)
            # keep only the first link on the card
            lnks = job.find_elements_by_tag_name("a")
            for lnk in lnks:
                link_list.append(lnk.get_attribute('href'))
                break
            driver.implicitly_wait(1)
            desc = job.find_element_by_class_name("job-descrip").text
            driver.implicitly_wait(1)
            skills = job.find_element_by_class_name("descrip-skills").text
        except:
            # fall back to placeholders when a field is missing on the card
            desc = 'desc Not Specified'
            skills = 'skills Not Specified'
            location = 'location Not Specified'
            company = 'company Not Specified'
            job_title = 'job_title not specified'
        # drop stray comma tokens from the skills text
        s = skills.split(' ')
        for i in s:
            if i == ',':
                s.remove(',')
        data = {"job_title": job_title, "company_name": company, "job_location": location,
                "job_desc": desc, "skills": s[2::], "card_link": link_list[0]}
        link_list.clear()
        y = collection.insert_one(data)
        print(y.inserted_id)
    # go to the next results page
    driver.find_element_by_xpath("//button[contains(text(),'Next')]").click()
    sleep(25)
    temp = temp + 1
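Once the documents are inserted, they can be checked with a simple pymongo query. A minimal sketch, using the same database and collection names as above:
import pymongo

client = pymongo.MongoClient()
collection = client['jobs']['med_title']

# how many job cards were stored, and a peek at the first few
print(collection.count_documents({}))
for doc in collection.find().limit(5):
    print(doc['job_title'], '-', doc['company_name'])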