I am building a web scraper and I have run into a problem: the search page only provides links to the items, so I want:
function 1 to read postcodes from a .txt file and search for the links to items
function 2 to take an item link and scrape it for the details
I have both scrapers as separate .py files.
I combined them and turned each script into a function.
I have implemented a deque to add and retrieve the data, and that works fine. But how do I get the two of them to run together?
# -*- coding: UTF-8 -*-
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup
import time
from time import sleep
import csv
from collections import deque
dq = deque([])
# Search the links via postcode
def linkScrape():
    recordnum = 0
    pagnum = 0
    with open("catlist.txt") as catlist:
        postkeys = []
        for line in catlist:
            postkeys.append(line.strip())
    with open("pcodnum.txt") as pagesnum:
        postpages = []
        for line in pagesnum:
            postpages.append(line.strip())
    with open("pcodes.txt") as pcodes:
        postcodes = []
        for line in pcodes:
            postcodes.append(line.strip())
    for y in postcodes:
        for z in postkeys:
            for x in postpages:
                surl = 'https://www.checkatrade.com/Search/?location={}&cat={}&page={}'.format(y, z, x)
                options = Options()
                options.headless = True
                driver = webdriver.Firefox(options=options)
                #driver = webdriver.Firefox()
                driver.implicitly_wait(10)  # seconds
                driver.get(surl)
                print("Link Scraper: Headless Firefox Scraping: " + surl)
                html = driver.page_source
                soup = BeautifulSoup(html, 'html.parser')
                questions = soup.select('.ch-listing__result')
                for question in questions:
                    comlink = question.find('a', attrs={"class": "catnow-search-click"})
                    if comlink is None:
                        comlink = 'None'
                    else:
                        comlink = comlink.attrs['href']
                        comlink = 'https://www.checkatrade.com' + comlink
                        recordnum += 1
                    dq.appendleft(str(comlink))
                pagnum += 1
                print("Link Scraper: " + str(pagnum) + ' pages finished with ' + str(recordnum) + ' records')
                print(list(dq))
                driver.close()
# Scrape company details from a URL
def datScrape(xurl):
    f = csv.writer(open('companydetails.csv', 'w'))
    f.writerow(['Business Name', 'Business Owner', 'Business Telephone', 'Business Mobile', 'Business Email', 'Business Managed Email'])
    surl = xurl
    options = Options()
    options.headless = True
    driver = webdriver.Firefox(options=options)
    #driver = webdriver.Firefox()
    driver.implicitly_wait(5)  # seconds
    driver.get(surl)
    print("Company Details Scraper: Headless Firefox Scraping: " + surl)
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    details = soup.select('.contact-card__details')
    #print(questions)
    for detail in details:
        busname = detail.select('h1')[0].get_text()
        #print(busname)
        #busowner = question.find(class_='contact-card__contact-name').get_text()
        busowner = detail.find('div', attrs={"class": "contact-card__contact-name"})
        if busowner is None:
            busowner = 'None'
        else:
            busowner = busowner.text
        #print(busowner)
        comtelephone = detail.find('a', attrs={"id": "ctl00_ctl00_content_ctlTel"})
        if comtelephone is None:
            comtelephone = 'None'
        else:
            comtelephone = comtelephone.attrs['href'].rsplit(":", 1)[-1]
        #print(comtelephone)
        comtelemob = detail.find('a', attrs={"id": "ctl00_ctl00_content_ctlMobile"})
        if comtelemob is None:
            comtelemob = 'None'
        else:
            comtelemob = comtelemob.attrs['href'].rsplit(":", 1)[-1]
        #print(comtelemob)
        comemail = detail.find('a', attrs={"id": "ctl00_ctl00_content_ctlEmail"})
        if comemail is None:
            comemail = 'None'
        else:
            comemail = comemail.attrs['href'].rsplit(":", 1)[-1]
        comemanmail = detail.find('a', attrs={"id": "ctl00_ctl00_content_managedEmail"})
        if comemanmail is None:
            comemanmail = 'None'
        else:
            comemanmail = comemanmail.attrs['href'].rsplit(":", 1)[-1]
        #print(comemail)
        print("Company Details Scraper: " + busname + "\n" + busowner + "\n" + comtelephone + "\n" + comtelemob + "\n" + comemail + "\n" + comemanmail)
        f.writerow([busname, busowner, comtelephone, comtelemob, comemail, comemanmail])
    data_list = []
    driver.close()
    driver.quit()
from multiprocessing import Process
p = Process(target=linkScrape)
p.start()
p2 = Process(target=datScrape)
sleep(20)
p2.start(dq.pop())
p.join()
p2.join()
Updated code with multiprocessing, and the new error:
Traceback (most recent call last):
File "script.py", line 120, in <module>
p2.start(dq.pop())
IndexError: pop from an empty deque
even though the deque definitely has data in it at this point.
Answer 0 (score: 1)
To accomplish this, you will probably need to use some multiprocessing; see this page for more details. You might also consider looking at this Stack Overflow post, which solves a similar problem. What you need to do is create a Process for each piece of work.
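For reference, the basic pattern looks something like this (a minimal sketch; producer and consumer are placeholder names standing in for your two functions). Note that arguments are passed through the args tuple when the Process is constructed; start() itself takes no arguments, which is what the traceback above tripped over:

from multiprocessing import Process

def producer(postcode):
    # placeholder for your linkScrape
    print("searching", postcode)

def consumer(url):
    # placeholder for your datScrape
    print("scraping", url)

if __name__ == '__main__':
    # arguments go in the args tuple at construction time, not into start()
    p1 = Process(target=producer, args=("AB1 2CD",))
    p2 = Process(target=consumer, args=("https://example.com/item",))
    p1.start()
    p2.start()
    p1.join()
    p2.join()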
I would consider changing your script to something like the following:
# -*- coding: UTF-8 -*-
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup
import time
from time import sleep
import csv
from collections import deque
# Added these libraries
from multiprocessing import Process
from functools import partial
dq = deque([])
# Search the links via postcode
def linkScrape(scrapeInput):
    recordnum = 0
    pagnum = 0
    spost = scrapeInput
    with open("catlist.txt") as catlist:
        postkeys = []
        for line in catlist:
            postkeys.append(line.strip())
    with open("pcodnum.txt") as pagesnum:
        postpages = []
        for line in pagesnum:
            postpages.append(line.strip())
    for z in postkeys:
        for x in postpages:
            surl = 'https://www.checkatrade.com/Search/?location=' + spost + '&cat=' + str(z) + '&page=' + str(x)
            options = Options()
            options.headless = True
            driver = webdriver.Firefox(options=options)
            #driver = webdriver.Firefox()
            driver.implicitly_wait(10)  # seconds
            driver.get(surl)
            print("Headless Firefox Scraping: " + surl)
            html = driver.page_source
            soup = BeautifulSoup(html, 'html.parser')
            questions = soup.select('.ch-listing__result')
            for question in questions:
                comlink = question.find('a', attrs={"class": "catnow-search-click"})
                if comlink is None:
                    comlink = 'None'
                else:
                    comlink = comlink.attrs['href']
                    comlink = 'https://www.checkatrade.com' + comlink
                    recordnum += 1
                dq.appendleft(comlink)
            pagnum += 1
            print("Link Scraper: " + str(pagnum) + ' pages finished with ' + str(recordnum) + ' records')
            driver.close()
# Scrape company details from a URL
def datScrape(xurl):
    f = csv.writer(open('companydetails.csv', 'w'))
    f.writerow(['Business Name', 'Business Owner', 'Business Telephone', 'Business Mobile', 'Business Email', 'Business Managed Email'])
    surl = xurl
    options = Options()
    options.headless = True
    driver = webdriver.Firefox(options=options)
    #driver = webdriver.Firefox()
    driver.implicitly_wait(5)  # seconds
    driver.get(surl)
    print("Headless Firefox Scraping: " + surl)
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    details = soup.select('.contact-card__details')
    #print(questions)
    for detail in details:
        busname = detail.select('h1')[0].get_text()
        #print(busname)
        #busowner = question.find(class_='contact-card__contact-name').get_text()
        busowner = detail.find('div', attrs={"class": "contact-card__contact-name"})
        if busowner is None:
            busowner = 'None'
        else:
            busowner = busowner.text
        #print(busowner)
        comtelephone = detail.find('a', attrs={"id": "ctl00_ctl00_content_ctlTel"})
        if comtelephone is None:
            comtelephone = 'None'
        else:
            comtelephone = comtelephone.attrs['href'].rsplit(":", 1)[-1]
        #print(comtelephone)
        comtelemob = detail.find('a', attrs={"id": "ctl00_ctl00_content_ctlMobile"})
        if comtelemob is None:
            comtelemob = 'None'
        else:
            comtelemob = comtelemob.attrs['href'].rsplit(":", 1)[-1]
        #print(comtelemob)
        comemail = detail.find('a', attrs={"id": "ctl00_ctl00_content_ctlEmail"})
        if comemail is None:
            comemail = 'None'
        else:
            comemail = comemail.attrs['href'].rsplit(":", 1)[-1]
        comemanmail = detail.find('a', attrs={"id": "ctl00_ctl00_content_managedEmail"})
        if comemanmail is None:
            comemanmail = 'None'
        else:
            comemanmail = comemanmail.attrs['href'].rsplit(":", 1)[-1]
        #print(comemail)
        print("Company Details Scraper: " + busname + "\n" + busowner + "\n" + comtelephone + "\n" + comtelemob + "\n" + comemail + "\n" + comemanmail)
        f.writerow([busname, busowner, comtelephone, comtelemob, comemail, comemanmail])
    data_list = []
    driver.close()
    driver.quit()
# Added in this function to run two functions in parallel
# Taken from: https://stackoverflow.com/questions/7207309/python-how-can-i-run-python-functions-in-parallel
# Credit to NPE
def runInParallel(*fns):
    proc = []
    for fn in fns:
        p = Process(target=fn)
        p.start()
        proc.append(p)
    for p in proc:
        p.join()
with open("pcodes.txt") as pcodes:
    postcodes = []
    for line in pcodes:
        postcodes.append(line.strip())
# You will probably need to edit the below...
for postcode in postcodes:
    # Pass the functions themselves, wrapped with their argument via
    # functools.partial, rather than the results of calling them: runInParallel
    # needs callables to hand to Process(target=...), and calling the functions
    # here would run them one after the other in the parent process instead.
    runInParallel(partial(linkScrape, postcode), partial(datScrape, postcode))
Since it is not entirely clear whether you actually want both things happening at the same time (given the if/else logic involved), this may need some editing, but it will run both functions in parallel (or as close to it as possible).
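One further caveat: a plain collections.deque lives only in the memory of the process that created it, so links that linkScrape appends inside a child process are never visible to the parent, which is why dq.pop() raised IndexError even though the child had clearly collected data. If the two scrapers need to hand URLs to each other across process boundaries, a multiprocessing.Queue is the usual tool. Below is a minimal sketch of that pattern; the function names, the example URLs, and the sentinel are illustrative, not taken from the code above:

from multiprocessing import Process, Queue

SENTINEL = None  # marker telling the consumer that no more links are coming

def link_producer(q):
    # stand-in for linkScrape: push each discovered link onto the shared queue
    for link in ["https://example.com/a", "https://example.com/b"]:
        q.put(link)
    q.put(SENTINEL)

def detail_consumer(q):
    # stand-in for datScrape: block until a link arrives, stop on the sentinel
    while True:
        link = q.get()
        if link is SENTINEL:
            break
        print("scraping", link)

if __name__ == '__main__':
    q = Queue()
    producer = Process(target=link_producer, args=(q,))
    consumer = Process(target=detail_consumer, args=(q,))
    producer.start()
    consumer.start()
    producer.join()
    consumer.join()

Because Queue.get() blocks until an item is available, the consumer no longer needs a sleep(20) guess, and the sentinel gives it a clean way to know when to stop.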