How do I insert a colon into a Python string?

Asked: 2016-04-05 17:28:12

Tags: python string colon

I need to pass a string containing the expression "inurl:something here" to a variable, so that I can run an advanced Google search restricted to URLs containing that term.

For example, my program passes this variable to an automated Google search routine:

search_terms = 'python developer inurl:jobs'

But this doesn't work, and I've narrowed the problem down to the colon. How can I insert the expression "inurl:whatever" into my string as a piece of plain text, just like everything else in the string, without Python interpreting it as a programming symbol? I've tried everything I can think of, including square brackets, parentheses, and related Stack Overflow questions. Nothing works. Thanks!
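To illustrate, here is a minimal standalone example (the variable names are just for illustration, not from my program):

import urllib.parse

# A colon only acts as Python syntax outside of quotes (dict literals,
# slices, block headers); inside a string it is plain text.
search_terms = 'python developer inurl:jobs'
print(search_terms)           # python developer inurl:jobs
print(':' in search_terms)    # True

# If the terms end up inside a URL, percent-encoding them with the
# standard library is the usual safe approach (a suggestion, not part
# of my original code):
print(urllib.parse.quote_plus(search_terms))   # python+developer+inurl%3Ajobs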

More code, as requested: maybe I'm getting the error when writing the file?

import requests,re,bs4
from selenium import webdriver
from bs4 import BeautifulSoup
import time,random    

search_terms = 'python developer inurl:jobs'
added_terms = 'contact email'

This then runs through a function that finds emails... (I know this part works fine) and then it should write to a text file:

number_of_sites = 1   #NUMBER OF SITES (SEARCH RESULTS) TO PARSE FOR EMAILS
number_of_search_pages = 1

def google_this_for_emails():                #Googles and gets the first few links

    global scrapedEmails
    scrapedEmails = []
    global emails_not_found
    emails_not_found = []

    #This searches for certain keywords in Google and parses results with BS
    for el in search_terms:
        webpage = 'http://google.com/search?q=' + str(el) + str(added_terms)
        print('\n Searching for the terms...', el,added_terms)
        headers = {'User-agent':'Mozilla/5.0'}
        res = requests.get(webpage, headers=headers)
        #res.raise_for_status()

        statusCode = res.status_code
        if statusCode == 200:
            soup = bs4.BeautifulSoup(res.text,'lxml')
            serp_res_rawlink = soup.select('.r a')

            dicti = []                  #This gets the href links
            for link in serp_res_rawlink:
                url = link.get('href')
                if 'pdf' not in url:
                    dicti.append(url)

            dicti_url = []              #This cleans the "url?q=" from link
            for el in dicti:
                if '/url?q=' in el:
                    result = (el.strip('/url?q='))
                    dicti_url.append(result)
            #print(dicti_url)

            global dicti_pretty_links
            dicti_pretty_links = []     #This cleans the gibberish at end of url
            for el in dicti_url[0:(number_of_sites)]:
                pretty_url = el.partition('&')[0]
                dicti_pretty_links.append(pretty_url)
            print(dicti_pretty_links)


            for el in dicti_pretty_links:
            #######START OF THE BS CHECK FOR EMAILS BY REGEX #################
                #This opens page in BS for parsing emails
                webpage = (el)
                headers = {'User-agent':'Mozilla/5.0'}
                res = requests.get(webpage, headers=headers)

                statusCode = res.status_code
                if statusCode == 200:
                    soup = bs4.BeautifulSoup(res.text,'lxml')
                    #if "</form>" in soup:

                    #This is the first way to search for an email in soup, "MO"
                    emailRegex = re.compile(r'([a-zA-Z0-9_.+]+@[a-zA-Z0-9_.+.+]+)', re.VERBOSE)
                    mo = emailRegex.findall(res.text)
                    print('THIS BELOW IS MO')
                    print(mo,'EMAILS COMING FROM: ',el)
                    for el in mo:
                        if el not in scrapedEmails:
                            scrapedEmails.append(el)

                    #This is the second way to search for an email in soup, "MAILTOS":
                    # mailtos = soup.select('a[href^=mailto]')
                    # print('THIS BELOW IS MAILTOS')
                    # print(mailtos, el, 'THIS IS THE WEBSITE IT IS COMING FROM')
                    #
                    # dicti_cleaner = []
                    # target = re.compile(r'mailto')
                    # for el in mailtos:
                    #     mo = target.search(str(el))
                    #     dicti_cleaner.append(el)
                    #
                    # temp = []
                    # for el in dicti_cleaner:
                    #     pretty_url = str(el).partition(':')[2]
                    #     second_url = str(pretty_url).partition('"')[0]
                    #     temp.append(second_url)
                    #
                    # for el in temp:
                    #     if el not in scrapedEmails:
                    #         scrapedEmails.append(el)
            #     #######END OF THE BS CHECK FOR EMAILS BY REGEX #################

            for el in dicti_pretty_links:
            #######START OF THE SELENIUM CHECK FOR "CONTACT" PAGES #################
                browser = webdriver.Firefox()  #This converts page into Selenium object
                page = browser.get(el)
                time.sleep(random.uniform(0.5,1.5))
                try:                                #Tries to open "contact" link
                    contact_link = browser.find_element_by_partial_link_text('ontact')
                    if contact_link:
                        contact_link.click()
                except:
                    pass    #Silently ignores exception
                html = browser.page_source          #Loads up the page for Regex search
                soup = BeautifulSoup(html,'lxml')
                time.sleep(random.uniform(0.5,1.5))
                emailRegex = re.compile(r'([a-zA-Z0-9_.+]+@[a-zA-Z0-9_.+.+]+)', re.VERBOSE)
                mo = emailRegex.findall(html)
                print('THIS BELOW IS SEL_emails_MO for',el)
                print(mo,'EMAILS COMING FROM: ',el)
                if not mo:
                    print('no emails found in ',el)
                    emails_not_found.append(el)
                for el in mo:
                    if el not in scrapedEmails:     #Checks if emails is/adds to ddbb
                        scrapedEmails.append(el)
                browser.close()
                #######END OF THE SELENIUM CHECK FOR "CONTACT" PAGES #################

            print('EMAILS SCRAPED SO FAR: \n', scrapedEmails)

    time.sleep(random.uniform(0.5,1.5))    #INSERTS HUMAN-LIKE RANDOM DELAY


def google_nextpage_for_emails():                #Googles and gets the first few links
    print(60*'-')
    print('STARTING FUNCTION NEXTPAGE FOR EMAILS')
    counter = 10
    for i in range(0,(number_of_search_pages)):
        #This searches for certain keywords in Google and parses results with BS
        for el in search_terms:
            webpage = 'https://www.google.com/search?q='+str(el)+str(added_terms)+'&start='+str(counter)
            print('\n Searching for the terms...', el,added_terms, 'on', webpage)
            headers = {'User-agent':'Mozilla/5.0'}
            res = requests.get(webpage, headers=headers)
            #res.raise_for_status()

            statusCode = res.status_code
            if statusCode == 200:
                soup = bs4.BeautifulSoup(res.text,'lxml')
                serp_res_rawlink = soup.select('.r a')

                dicti = []                  #This gets the href links
                for link in serp_res_rawlink:
                    url = link.get('href')
                    if 'pdf' not in url:
                        dicti.append(url)

                dicti_url = []              #This cleans the "url?q=" from link
                for el in dicti:
                    if '/url?q=' in el:
                        result = (el.strip('/url?q='))
                        dicti_url.append(result)
                #print(dicti_url)

                global dicti_pretty_links
                dicti_pretty_links = []     #This cleans the gibberish at end of url
                for el in dicti_url[0:(number_of_sites)]:
                    pretty_url = el.partition('&')[0]
                    dicti_pretty_links.append(pretty_url)
                print(dicti_pretty_links)


                for el in dicti_pretty_links:
                #######START OF THE BS CHECK FOR EMAILS BY REGEX #################
                    #This opens page in BS for parsing emails
                    webpage = (el)
                    headers = {'User-agent':'Mozilla/5.0'}
                    res = requests.get(webpage, headers=headers)

                    statusCode = res.status_code
                    if statusCode == 200:
                        soup = bs4.BeautifulSoup(res.text,'lxml')
                        #if "</form>" in soup:

                        #This is the first way to search for an email in soup, "MO"
                        emailRegex = re.compile(r'([a-zA-Z0-9_.+]+@[a-zA-Z0-9_.+.+]+)', re.VERBOSE)
                        mo = emailRegex.findall(res.text)
                        print('THIS BELOW IS MO')
                        print(mo, el, 'THIS IS THE WEBSITE IT IS COMING FROM')
                        for el in mo:
                            if el not in scrapedEmails:
                                scrapedEmails.append(el)

                        #This is the second way to search for an email in soup, "MAILTOS":
                        # mailtos = soup.select('a[href^=mailto]')
                        # print('THIS BELOW IS MAILTOS')
                        # print(mailtos, el, 'THIS IS THE WEBSITE IT IS COMING FROM')
                        #
                        # dicti_cleaner = []
                        # target = re.compile(r'mailto')
                        # for el in mailtos:
                        #     mo = target.search(str(el))
                        #     dicti_cleaner.append(el)
                        #
                        # temp = []
                        # for el in dicti_cleaner:
                        #     pretty_url = str(el).partition(':')[2]
                        #     second_url = str(pretty_url).partition('"')[0]
                        #     temp.append(second_url)
                        #
                        # for el in temp:
                        #     if el not in scrapedEmails:
                        #         scrapedEmails.append(el)
                #     #######END OF THE BS CHECK FOR EMAILS BY REGEX #################

                try:
                    for el in dicti_pretty_links:
                    #######START OF THE SELENIUM CHECK FOR "CONTACT" PAGES #################
                        browser = webdriver.Firefox()  #This converts page into Selenium object
                        page = browser.get(el)
                        time.sleep(random.uniform(1,2))
                        try:                                #Tries to open "contact" link
                            contact_link = browser.find_element_by_partial_link_text('ontact')
                            if contact_link:
                                contact_link.click()
                        except Exception as e:
                            print (e)
                            continue
                            #pass    #Silently ignores exception
                        html = browser.page_source          #Loads up the page for Regex search
                        soup = BeautifulSoup(html,'lxml')
                        time.sleep(random.uniform(1,2))
                        emailRegex = re.compile(r'([a-zA-Z0-9_.+]+@[a-zA-Z0-9_.+.+]+)', re.VERBOSE)
                        mo = emailRegex.findall(html)
                        print('THIS BELOW IS SEL_emails_MO for',el)
                        print(mo, el, 'THIS IS THE WEBSITE IT IS COMING FROM')
                        if not mo:
                            print('no emails found in ',el)
                            emails_not_found.append(el)
                        for el in mo:
                            if el not in scrapedEmails:     #Checks if emails is/adds to ddbb
                                scrapedEmails.append(el)
                        browser.close()
                        #######END OF THE SELENIUM CHECK FOR "CONTACT" PAGES #################
                except Exception as e:
                    print(e)
                    continue

        counter += 10
        time.sleep(random.uniform(1,2.5))    #INSERTS HUMAN-LIKE RANDOM DELAY
        print('EMAILS SCRAPED SO FAR \n', scrapedEmails)
        report()

def open_emails_lost():
    for el in emails_not_found:
        print(el)
        browser = webdriver.Firefox()  #This converts page into Selenium object
        try:
            browser.get(el)
            time.sleep(random.uniform(1,2))
        except:
            pass

def report():
    filename = str(search_terms) + '_' + str(added_terms)
    testFile = open(filename + '.txt', 'w')
    #testFile = open('test_google_tabs.txt', 'w')
    testFile.write('SEARCH: ')
    testFile.write(str(search_terms).upper())
    testFile.write(str(added_terms).upper())
    testFile.write('\n')
    testFile.write(str(len(search_terms)))
    testFile.write(' Google results parsed')
    testFile.write('\n')
    testFile.write(str(len(scrapedEmails)))
    testFile.write(' emails found')
    testFile.write('\n')
    testFile.write(60*'*')
    testFile.write('\n')
    testFile.write(str(scrapedEmails)[1:-1])  #last part deletes the square brackets
    testFile.write('\n')
    testFile.write('\n')
    testFile.write('And these below are the pages where emails were not found_____________')
    testFile.write('\n')
    testFile.write(str(emails_not_found)[1:-1])
    testFile.close()
    #print('The information has been successfully written to "test_google_tabs.txt"')
    print('The information has been successfully written to', filename)
    print(60*'-')



google_this_for_emails()

google_nextpage_for_emails()

report()

open = input('Press any key to open the webpages that did not contain email addresses, or type "quit" to end program')

if open == 'quit':
    pass
else:
    open_emails_lost()

1 Answer:

Answer 0 (score: 0)

Python does not interpret a colon : inside a string as a programming symbol.
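A quick interpreter session shows this (any Python 3 shell should reproduce it):

>>> s = 'python developer inurl:jobs'
>>> print(s)
python developer inurl:jobs
>>> ':' in s
True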

But if you want to display something that Python really does treat as a programming symbol, for example "\n", you can use this:

exampleString = r"\n"
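Printed back, the raw string keeps the backslash and the n as two literal characters, while the un-prefixed version is interpreted as a newline:

>>> exampleString = r"\n"
>>> print(exampleString)
\n
>>> print("\n")   # without the r prefix this just prints a blank line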