I need to pass a string to a variable that contains the term "inurl:something here", so that I can run an advanced Google search on that term in the URL.
For example, my program passes this variable to an automated Google-search routine:
search_terms = 'python developer inurl:jobs'
But this doesn't work, and I have narrowed the problem down to the colon. How can I insert the expression "inurl:whatever" into my string as a piece of plain text, just like everything else in the string, without Python interpreting it as a programming symbol? I have tried everything I can think of, including square brackets, parentheses, and suggestions from Stack Overflow questions. Nothing has worked. Thanks!
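Concretely, the variable ends up concatenated into a search URL (simplified from the full code below):
search_terms = 'python developer inurl:jobs'
webpage = 'http://google.com/search?q=' + search_terms
# intended result: http://google.com/search?q=python developer inurl:jobs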
Providing more code as requested: maybe the error happens when the file is written?
import requests,re,bs4
from selenium import webdriver
from bs4 import BeautifulSoup
import time,random
search_terms = 'python developer inurl:jobs'
added_terms = 'contact email'
Then a function goes through the results looking for emails... (I know this part works fine), and then it should write them to a notepad file:
number_of_sites = 1 #NUMBER OF SITES (SEARCH RESULTS) TO PARSE FOR EMAILS
number_of_search_pages = 1
def google_this_for_emails(): #Googles and gets the first few links
    global scrapedEmails
    scrapedEmails = []
    global emails_not_found
    emails_not_found = []
    #This searches for certain keywords in Google and parses results with BS
    for el in search_terms:
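        # caution: search_terms is a single string, so this loop iterates over
        # its individual characters, not over whole search phrases; a list such
        # as ['python developer inurl:jobs'] may be what was intended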
        webpage = 'http://google.com/search?q=' + str(el) + str(added_terms)
        print('\n Searching for the terms...', el,added_terms)
        headers = {'User-agent':'Mozilla/5.0'}
        res = requests.get(webpage, headers=headers)
        #res.raise_for_status()
        statusCode = res.status_code
        if statusCode == 200:
            soup = bs4.BeautifulSoup(res.text,'lxml')
            serp_res_rawlink = soup.select('.r a')
            dicti = [] #This gets the href links
            for link in serp_res_rawlink:
                url = link.get('href')
                if 'pdf' not in url:
                    dicti.append(url)
            dicti_url = [] #This cleans the "url?q=" from link
            for el in dicti:
                if '/url?q=' in el:
                    result = (el.strip('/url?q='))
                    dicti_url.append(result)
            #print(dicti_url)
            global dicti_pretty_links
            dicti_pretty_links = [] #This cleans the gibberish at end of url
            for el in dicti_url[0:(number_of_sites)]:
                pretty_url = el.partition('&')[0]
                dicti_pretty_links.append(pretty_url)
            print(dicti_pretty_links)
        for el in dicti_pretty_links:
            #######START OF THE BS CHECK FOR EMAILS BY REGEX #################
            #This opens page in BS for parsing emails
            webpage = (el)
            headers = {'User-agent':'Mozilla/5.0'}
            res = requests.get(webpage, headers=headers)
            statusCode = res.status_code
            if statusCode == 200:
                soup = bs4.BeautifulSoup(res.text,'lxml')
                #if "</form>" in soup:
                #This is the first way to search for an email in soup, "MO"
                emailRegex = re.compile(r'([a-zA-Z0-9_.+]+@[a-zA-Z0-9_.+.+]+)', re.VERBOSE)
                mo = emailRegex.findall(res.text)
                print('THIS BELOW IS MO')
                print(mo,'EMAILS COMING FROM: ',el)
                for el in mo:
                    if el not in scrapedEmails:
                        scrapedEmails.append(el)
            #This is the second way to search for an email in soup, "MAILTOS":
            # mailtos = soup.select('a[href^=mailto]')
            # print('THIS BELOW IS MAILTOS')
            # print(mailtos, el, 'THIS IS THE WEBSITE IT IS COMING FROM')
            #
            # dicti_cleaner = []
            # target = re.compile(r'mailto')
            # for el in mailtos:
            #     mo = target.search(str(el))
            #     dicti_cleaner.append(el)
            #
            # temp = []
            # for el in dicti_cleaner:
            #     pretty_url = str(el).partition(':')[2]
            #     second_url = str(pretty_url).partition('"')[0]
            #     temp.append(second_url)
            #
            # for el in temp:
            #     if el not in scrapedEmails:
            #         scrapedEmails.append(el)
            # #######END OF THE BS CHECK FOR EMAILS BY REGEX #################
        for el in dicti_pretty_links:
            #######START OF THE SELENIUM CHECK FOR "CONTACT" PAGES #################
            browser = webdriver.Firefox() #This converts page into Selenium object
            page = browser.get(el)
            time.sleep(random.uniform(0.5,1.5))
            try: #Tries to open "contact" link
                contact_link = browser.find_element_by_partial_link_text('ontact')
                if contact_link:
                    contact_link.click()
            except:
                pass #Silently ignores exception
            html = browser.page_source #Loads up the page for Regex search
            soup = BeautifulSoup(html,'lxml')
            time.sleep(random.uniform(0.5,1.5))
            emailRegex = re.compile(r'([a-zA-Z0-9_.+]+@[a-zA-Z0-9_.+.+]+)', re.VERBOSE)
            mo = emailRegex.findall(html)
            print('THIS BELOW IS SEL_emails_MO for',el)
            print(mo,'EMAILS COMING FROM: ',el)
            if not mo:
                print('no emails found in ',el)
                emails_not_found.append(el)
            for el in mo:
                if el not in scrapedEmails: #Checks if emails is/adds to ddbb
                    scrapedEmails.append(el)
            browser.close()
            #######END OF THE SELENIUM CHECK FOR "CONTACT" PAGES #################
        print('EMAILS SCRAPED SO FAR: \n', scrapedEmails)
        time.sleep(random.uniform(0.5,1.5)) #INSERTS HUMAN-LIKE RANDOM DELAY
def google_nextpage_for_emails(): #Googles and gets the first few links
    print(60*'-')
    print('STARTING FUNCTION NEXTPAGE FOR EMAILS')
    counter = 10
    for i in range(0,(number_of_search_pages)):
        #This searches for certain keywords in Google and parses results with BS
        for el in search_terms:
            webpage = 'https://www.google.com/search?q='+str(el)+str(added_terms)+'&start='+str(counter)
            print('\n Searching for the terms...', el,added_terms, 'on', webpage)
            headers = {'User-agent':'Mozilla/5.0'}
            res = requests.get(webpage, headers=headers)
            #res.raise_for_status()
            statusCode = res.status_code
            if statusCode == 200:
                soup = bs4.BeautifulSoup(res.text,'lxml')
                serp_res_rawlink = soup.select('.r a')
                dicti = [] #This gets the href links
                for link in serp_res_rawlink:
                    url = link.get('href')
                    if 'pdf' not in url:
                        dicti.append(url)
                dicti_url = [] #This cleans the "url?q=" from link
                for el in dicti:
                    if '/url?q=' in el:
                        result = (el.strip('/url?q='))
                        dicti_url.append(result)
                #print(dicti_url)
                global dicti_pretty_links
                dicti_pretty_links = [] #This cleans the gibberish at end of url
                for el in dicti_url[0:(number_of_sites)]:
                    pretty_url = el.partition('&')[0]
                    dicti_pretty_links.append(pretty_url)
                print(dicti_pretty_links)
            for el in dicti_pretty_links:
                #######START OF THE BS CHECK FOR EMAILS BY REGEX #################
                #This opens page in BS for parsing emails
                webpage = (el)
                headers = {'User-agent':'Mozilla/5.0'}
                res = requests.get(webpage, headers=headers)
                statusCode = res.status_code
                if statusCode == 200:
                    soup = bs4.BeautifulSoup(res.text,'lxml')
                    #if "</form>" in soup:
                    #This is the first way to search for an email in soup, "MO"
                    emailRegex = re.compile(r'([a-zA-Z0-9_.+]+@[a-zA-Z0-9_.+.+]+)', re.VERBOSE)
                    mo = emailRegex.findall(res.text)
                    print('THIS BELOW IS MO')
                    print(mo, el, 'THIS IS THE WEBSITE IT IS COMING FROM')
                    for el in mo:
                        if el not in scrapedEmails:
                            scrapedEmails.append(el)
                #This is the second way to search for an email in soup, "MAILTOS":
                # mailtos = soup.select('a[href^=mailto]')
                # print('THIS BELOW IS MAILTOS')
                # print(mailtos, el, 'THIS IS THE WEBSITE IT IS COMING FROM')
                #
                # dicti_cleaner = []
                # target = re.compile(r'mailto')
                # for el in mailtos:
                #     mo = target.search(str(el))
                #     dicti_cleaner.append(el)
                #
                # temp = []
                # for el in dicti_cleaner:
                #     pretty_url = str(el).partition(':')[2]
                #     second_url = str(pretty_url).partition('"')[0]
                #     temp.append(second_url)
                #
                # for el in temp:
                #     if el not in scrapedEmails:
                #         scrapedEmails.append(el)
                # #######END OF THE BS CHECK FOR EMAILS BY REGEX #################
            try:
                for el in dicti_pretty_links:
                    #######START OF THE SELENIUM CHECK FOR "CONTACT" PAGES #################
                    browser = webdriver.Firefox() #This converts page into Selenium object
                    page = browser.get(el)
                    time.sleep(random.uniform(1,2))
                    try: #Tries to open "contact" link
                        contact_link = browser.find_element_by_partial_link_text('ontact')
                        if contact_link:
                            contact_link.click()
                    except Exception as e:
                        print(e)
                        continue
                        #pass #Silently ignores exception
                    html = browser.page_source #Loads up the page for Regex search
                    soup = BeautifulSoup(html,'lxml')
                    time.sleep(random.uniform(1,2))
                    emailRegex = re.compile(r'([a-zA-Z0-9_.+]+@[a-zA-Z0-9_.+.+]+)', re.VERBOSE)
                    mo = emailRegex.findall(html)
                    print('THIS BELOW IS SEL_emails_MO for',el)
                    print(mo, el, 'THIS IS THE WEBSITE IT IS COMING FROM')
                    if not mo:
                        print('no emails found in ',el)
                        emails_not_found.append(el)
                    for el in mo:
                        if el not in scrapedEmails: #Checks if emails is/adds to ddbb
                            scrapedEmails.append(el)
                    browser.close()
                    #######END OF THE SELENIUM CHECK FOR "CONTACT" PAGES #################
            except Exception as e:
                print(e)
                continue
        counter += 10
        time.sleep(random.uniform(1,2.5)) #INSERTS HUMAN-LIKE RANDOM DELAY
        print('EMAILS SCRAPED SO FAR \n', scrapedEmails)
    report()
def open_emails_lost():
    for el in emails_not_found:
        print(el)
        browser = webdriver.Firefox() #This converts page into Selenium object
        try:
            browser.get(el)
            time.sleep(random.uniform(1,2))
        except:
            pass
def report():
    filename = (str(search_terms)+str('_')+str(added_terms))
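    # caution: search_terms contains a colon ('inurl:jobs') and ':' is not a
    # legal character in Windows filenames, so the open() call below can raise
    # an OSError on Windows; e.g. filename.replace(':', '_') would avoid that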
    testFile = open(filename + '.txt', 'w')
    #testFile = open('test_google_tabs.txt', 'w')
    testFile.write('SEARCH: ')
    testFile.write(str(search_terms).upper())
    testFile.write(str(added_terms).upper())
    testFile.write('\n')
    testFile.write(str(len(search_terms)))
    testFile.write(' Google results parsed')
    testFile.write('\n')
    testFile.write(str(len(scrapedEmails)))
    testFile.write(' emails found')
    testFile.write('\n')
    testFile.write(60*'*')
    testFile.write('\n')
    testFile.write(str(scrapedEmails)[1:-1]) #last part deletes the square brackets
    testFile.write('\n')
    testFile.write('\n')
    testFile.write(str('And these below are the pages where emails were not found_____________'))
    testFile.write('\n')
    testFile.write(str(emails_not_found)[1:-1])
    testFile.close()
    #print('The information has been successfully written to "test_google_tabs.txt"')
    print('The information has been successfully written to', filename)
    print(60*'-')
google_this_for_emails()
google_nextpage_for_emails()
report()
open = input('Press any key to open the webpages that did not contain email addresses, or type "quit" to end program')
if open == 'quit':
    pass
else:
    open_emails_lost()
Answer 0 (score: 0):
Python does not interpret the colon : inside a string as a programming symbol.
But if you want a string to show something that really is used as a programming symbol, for example "\n", you can use a raw string:
exampleString = r"\n"