我创建了一个程序来计算一个网站中某个单词的出现次数,但是为了对此进行优化,我决定使用多线程来同时计算多个网站中一个单词的出现次数!
`
import threading
from multiprocessing.pool import ThreadPool as Pool
from bs4 import BeautifulSoup, SoupStrainer
from urllib.request import Request, urlopen
from requests_html import HTMLSession
import urllib.parse
import re
import time
import unidecode
# Substrings used to drop unwanted search-result links
# (search-engine pages, social networks, media sites, PDFs, ...).
remove_prefix = ["google.", "webcache.", ".youtube.", "blogger.com", "pinterest.", "dailymotion", "linkedin.com", ".pdf", "deezer.com", "facebook.", "twitter.", "paroles", ".aliexpress", "dictionnaire", "video"]
keyword = "capital france"
a1 = "paris"  # word whose occurrences are counted; lowered to match lowered page text
a1 = a1.lower()
values = {'q': keyword}
data = urllib.parse.urlencode(values)
url = "http://www.google.fr/search?" + data
print(url)
session = HTMLSession()
r = session.get(url)
# De-duplicate the absolute links found on the results page.
listlink = list(set(r.html.absolute_links))
# Keep only links containing none of the blacklisted substrings.
# Single pass with any() instead of rebuilding the whole list once per
# prefix through a redundant nested comprehension.
listlink = [link for link in listlink if not any(p in link for p in remove_prefix)]
scorea1 = 0
# `scorea1 += ...` is a read-modify-write, which is NOT atomic: two threads
# can read the same old value and one update is lost. Serialize with a lock.
score_lock = threading.Lock()

def compter(number):
    """Fetch listlink[number], count whole-word, accent-insensitive
    occurrences of the global word `a1` in the page's <body> text, and add
    the count to the shared global `scorea1` (printed after each update)."""
    global scorea1
    # Some sites reject the default urllib User-Agent, hence the header.
    req = Request(listlink[number], headers={'User-Agent': 'Mozilla/5.0'})
    # Parse only <body> (SoupStrainer) and lowercase to match the lowered `a1`.
    resp = BeautifulSoup(urlopen(req).read(), "lxml", parse_only=SoupStrainer('body')).get_text().lower()
    # unidecode strips accents on both sides so "français" matches "francais".
    occurrences = sum(1 for _ in re.finditer(r'\b%s\b' % re.escape(unidecode.unidecode(a1)), unidecode.unidecode(resp)))
    with score_lock:
        scorea1 += occurrences
        print(scorea1)
def process():
    """Count occurrences on the first few result links concurrently,
    one thread per link, and wait for all of them to finish."""
    threads = []
    # Guard with len(listlink): the original hard-coded range(5), which
    # raises IndexError inside compter() when fewer than 5 links survive
    # the prefix filtering.
    for i in range(min(5, len(listlink))):
        t = threading.Thread(target=compter, args=(i,))
        threads.append(t)
        t.start()
    # Join so the final score below is only printed once all counts are in.
    for t in threads:
        t.join()

process()
print(scorea1)
`
import time
from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup, SoupStrainer
from urllib.request import Request, urlopen
from requests_html import HTMLSession
import urllib.parse
import re
import unidecode
start = time.time()  # wall-clock start for the timing printed at the end
# Substrings used to drop unwanted search-result links
# (search-engine pages, social networks, media sites, PDFs, ...).
remove_prefix = ["google.", "webcache.", ".youtube.", "blogger.com", "pinterest.", "dailymotion", "linkedin.com", ".pdf", "deezer.com", "facebook.", "twitter.", "paroles", ".aliexpress", "dictionnaire", "video"]
keyword = "capital france"
a1 = "paris"  # word whose occurrences are counted; lowered to match lowered page text
a1 = a1.lower()
values = {'q': keyword}
data = urllib.parse.urlencode(values)
url = "http://www.google.fr/search?" + data
session = HTMLSession()
r = session.get(url)
# De-duplicate the absolute links found on the results page.
listlink = list(set(r.html.absolute_links))
# Keep only links containing none of the blacklisted substrings.
# Single pass with any() instead of rebuilding the whole list once per
# prefix through a redundant nested comprehension.
listlink = [link for link in listlink if not any(p in link for p in remove_prefix)]
scorea1 = 0
# `scorea1 += ...` from several pool workers is a non-atomic
# read-modify-write: concurrent updates can be lost. Guard with a lock.
import threading  # not among this snippet's imports; needed for the lock
score_lock = threading.Lock()

def compte(url):
    """Fetch `url`, count whole-word, accent-insensitive occurrences of the
    global word `a1` in the page's <body> text, and add the count to the
    shared global `scorea1` (printed after each update)."""
    global scorea1
    # Some sites reject the default urllib User-Agent, hence the header.
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # Parse only <body> (SoupStrainer) and lowercase to match the lowered `a1`.
    resp = BeautifulSoup(urlopen(req).read(), "lxml", parse_only=SoupStrainer('body')).get_text().lower()
    # unidecode strips accents on both sides so "français" matches "francais".
    occurrences = sum(1 for _ in re.finditer(r'\b%s\b' % re.escape(unidecode.unidecode(a1)), unidecode.unidecode(resp)))
    with score_lock:
        scorea1 += occurrences
        print(scorea1)
with ThreadPoolExecutor(max_workers=5) as executor:
    # Slice instead of the original hard-coded listlink[i] for i in range(5),
    # which raised IndexError when fewer than 5 links survive the filtering.
    futures = [executor.submit(compte, link) for link in listlink[:5]]
    # The original discarded the futures, so any exception raised inside a
    # worker was silently swallowed; .result() re-raises it here.
    for future in futures:
        future.result()
# The `with` block waits for all workers, so the final score is complete here.
print(scorea1)
print(time.time() - start)
两个方法运行所需的时间大致相同(例如 2.5 秒)。我想知道:是否还能让它执行得更快?以及当 scorea1 > x 时,如何中断(停止)其余线程?
(PS:如果有什么需要改进的地方,请告诉我)