如何仅打印该页面描述中包含特定文本的链接?

时间:2017-07-06 03:29:24

标签: python python-3.x beautifulsoup

我想只打开页面内容中包含某些特定词语的链接。如果目标页面上出现了指定的词(例如 "engineering"),就返回该链接;否则跳过。

这是我目前写的代码。运行时我输入的职位类型是 engineering,地点是 North York:

import time
import webbrowser
from urllib.parse import urlencode, urljoin

import requests
from bs4 import BeautifulSoup

# Ask the user what to search for; both answers go into the query string.
jobsearch = input("What type of job?: ")
location = input("What is your location: ")

# Percent-encode the user input so that spaces and reserved characters such
# as '&', '+' or '#' (e.g. "C++", "R&D") cannot corrupt the query string.
url = "https://ca.indeed.com/jobs?" + urlencode(
    {"q": jobsearch, "l": location}
)
base_url = 'https://ca.indeed.com/'

# Fetch the search-results page once and parse it. The timeout keeps the
# script from hanging forever on a stalled connection.
r = requests.get(url, timeout=10)
rcontent = r.content
prettify = BeautifulSoup(rcontent, "html.parser")

# Phrases a job page must contain (case-sensitive substring match) to be kept.
filter_words = ['chemical engineering', 'instrumentation', 'QA']
all_job_url = []  # hrefs of the job-title anchors found on the search page
filtered_job_links = []  # not used by the code shown in this file
http_flinks = []  # canonical links of matching pages identified as http: URLs
flinks = []  # remaining canonical links of matching pages

def get_all_joblinks():
    """Append the href of every job-title anchor on the search page.

    Reads the parsed search page from the module-level ``prettify`` and
    extends the module-level ``all_job_url`` list in document order.
    """
    title_anchors = prettify.find_all('a', {'data-tn-element': "jobTitle"})
    all_job_url.extend(anchor['href'] for anchor in title_anchors)

def filter_links():
    """Collect canonical links of job pages that mention a filter word.

    Every href in ``all_job_url`` is resolved against ``base_url`` and
    fetched. When the page text contains any entry of ``filter_words``
    (case-sensitive substring match), the href of each
    ``<link rel="canonical">`` on that page is appended to ``http_flinks``
    if it starts with ``http:``, otherwise to ``flinks``, and the updated
    list is printed. Pages without a match print ``"nothing"``. Pages that
    cannot be fetched are reported and skipped.
    """
    for eachurl in all_job_url:
        # base_url ends with '/', so concatenating a root-relative href
        # would yield "...com//path", and an absolute href would yield a
        # malformed URL. urljoin resolves both forms correctly.
        job_url = urljoin(base_url, eachurl)
        try:
            # Without a timeout one stalled connection blocks the scan.
            rurl = requests.get(job_url, timeout=10)
        except requests.RequestException as exc:
            # A single unreachable job page should not abort the whole run.
            print("could not fetch {}: {}".format(job_url, exc))
            continue

        soup = BeautifulSoup(rurl.content, "html.parser")
        summary = soup.get_text()

        # Guard clause: skip pages that mention none of the filter words.
        if not any(word in summary for word in filter_words):
            print("nothing")
            continue

        # href=True skips canonical tags lacking an href attribute, which
        # would otherwise raise KeyError on the subscript below.
        canonical_tags = soup.find_all(
            'link', {'rel': 'canonical'}, href=True
        )
        for filtered_link in canonical_tags:
            flink = filtered_link['href']
            # Prefix test: a substring test would also match "http:" that
            # merely appears inside a query parameter of another URL.
            if flink.startswith("http:"):
                http_flinks.append(flink)
                print(http_flinks)
            else:
                flinks.append(flink)
                # NOTE(review): this pause sat next to a disabled
                # webbrowser.open_new(base_url + flink) call and presumably
                # throttled tab opening - confirm whether it is still needed.
                time.sleep(3)
                print(flinks)


def search_job():
    """Open the search-results page in the browser if it lists any jobs.

    Prints "no job matches found" when the parsed search page contains a
    ``div.no_results`` element; otherwise opens ``url`` in a new browser
    window.
    """
    no_results = prettify.select('div.no_results')
    if no_results:
        print("no job matches found")
        return

    # Entries were found, so show the search page itself.
    webbrowser.open_new(url)

# Run the scraper: first collect the job links from the search page, then
# fetch each one and keep those whose text contains a filter word.
# search_job() is defined above but is not called here.
get_all_joblinks()
filter_links()

0 个答案:

没有答案