我具有从确实提取工作信息的代码,但是现在我想从工作标题中提取链接,以便我可以打开一个新页面并提取工作描述信息。
我可以在html页面上的href标记内看到指向职位发布的链接,但没有提起如何提取它的链接?
import requests
import time
from random import randint
from bs4 import BeautifulSoup
import urllib, requests, re, pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
webdriver.DesiredCapabilities.CHROME["unexpectedAlertBehaviour"] = "accept"
webdriver.Chrome(chrome_options=options,executable_path=CHROMEDRIVER_PATH)
options = Options()
options.add_argument('start-maximized')
options.add_argument('disable-infobars')
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(chrome_options=options,executable_path='chromedriver')
driver.get("https://www.indeed.co.uk/automotive-engineer-jobs-in-uk")
soup=BeautifulSoup(driver.page_source, "lxml")
title = [tag.text.strip() for tag in soup.select('.jobtitle')]
company = [tag.text.strip() for tag in soup.select('.company')]
location = [tag.text.strip() for tag in soup.select('.location')]
for y in range (len(title)):
tmpstring = (title[y] + ',' + company[y] + ',' + location[y] + ",0")
tmpstring = tmpstring.encode("utf-8")
f = open('FileDump','a')
f.write(tmpstring)
f.close
答案 0 :(得分:0)
您可以使用以下代码提取链接
from BeautifulSoup import BeautifulSoup
import urllib2
import re
html_page = urllib2.urlopen("http://arstechnica.com")
soup = BeautifulSoup(html_page)
for link in soup.findAll('a', attrs={'href': re.compile("^http://")}):
print link.get('href')
参考 https://pythonspot.com/extract-links-from-webpage-beautifulsoup/
答案 1 :(得分:0)
您可以使用此代码获取子元素。
title_href = [tag.find("a")["href"] for tag in soup.findAll("h2",{"class":"jobtitle"})]
我尝试了您的代码并修改了一些地方。因为我发现它可以从<a>
获得全名
import requests
import time
from random import randint
from bs4 import BeautifulSoup
import urllib, requests, re, pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
webdriver.DesiredCapabilities.CHROME["unexpectedAlertBehaviour"] = "accept"
options = Options()
options.add_argument('start-maximized')
options.add_argument('disable-infobars')
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(chrome_options=options,executable_path='chromedriver')
driver.get("https://www.indeed.co.uk/automotive-engineer-jobs-in-uk")
domain = "https://www.indeed.co.uk"
soup=BeautifulSoup(driver.page_source, "lxml")
title = [tag.find("a")["title"] for tag in soup.findAll("h2",{"class":"jobtitle"})]
title_href = [domain + tag.find("a")["href"] for tag in soup.findAll("h2",{"class":"jobtitle"})]
company = [tag.text.strip() for tag in soup.findAll("span",{"class":"company"})]
location = [tag.text.strip() for tag in soup.findAll("span",{"class":"location"})]
print(title_href)
driver.close()