我正在研究可旋转ips的刮刀,我已经在笔记本电脑上创建了一个可以按预期工作的小mvp:
import logging
import time
import random
import threading
from datetime import datetime
from datetime import timedelta
logging.basicConfig(
level=logging.DEBUG,
format='(%(threadName)-10s) %(message)s',
)
class Controller(object):
def __init__(self, event):
self.start_time = datetime.now()
self.event = event
def worker(self):
while True:
if self.event.is_set():
rand_sleep_time = random.randint(1, 10) / 5
logging.debug("Sleeping for %.2f secs" % rand_sleep_time)
time.sleep(rand_sleep_time)
logging.debug("Werking")
else:
time.sleep(1)
def blocker(self):
while True:
rand_sleep_time = random.randint(3, 6)
logging.debug("Sleeping for %.2f secs" % rand_sleep_time)
time.sleep(rand_sleep_time)
if datetime.now() > self.start_time + timedelta(seconds=10):
self.event.clear() # only stop the execution for when the ip is updated
logging.debug("ALL THREADS SLEEP NOW!")
time.sleep(10)
self.event.set() # you can now proceed with the computations
self.start_time = datetime.now()
start_time = datetime.now()
e = threading.Event()
e.set()
c = Controller(e)
for thread in range(NUM_THREADS):
t = threading.Thread(target=c.worker, name='Thread-Worker-{}'.format(thread+1))
t.start()
threading.Thread(target=c.blocker, name='Thread-Blocker-1').start()
因此,上方的工作人员进行了一些工作,然后阻止程序在短时间内暂停所有工作,同时更新“ ip”,然后工作人员再次开始工作。采取这种逻辑并将其实施到生产中,这将失败(因为我认为工人不会停下来)。不幸的是,我无法包含所有代码,但这是主要部分。希望这已经足够,因为其他部分与Ip-Updater不会停止其他线程的事实无关。这种实现方式的唯一区别是,我使用了类,也许应该更改类(因为方法具有self
arg并且正在对其进行更改。但是,如果Ip-Updater成功地停止了其他线程,那么应该没有问题,不是吗?):
class ThreadedNewsParser(object):
"""
This little guy parses the news with multiple threads and dynamically changes the ip of the sessions
"""
def __init__(self, update_ip_in, num_threads, date_start, date_end):
assert isinstance(num_threads, int)
assert num_threads > 0
assert any(isinstance(date_start, type_) for type_ in [datetime, date])
assert any(isinstance(date_end, type_) for type_ in [datetime, date])
self.start_time = datetime.now()
self.event = threading.Event()
self.event.set()
self.update_ip_in = update_ip_in
self.check_ip_url = 'https://httpbin.org/ip'
autolog("STARTING WORK ON IP: {}".format(session.get(self.check_ip_url).text), logging.debug)
self.num_threads = num_threads
self.date_start = date_start
self.date_end = date_end
self.dates = [date for date in date_range(date_start, date_end)]
self.p = DailyCompanyNewsParser(2008, 1, 1) # the date here does not matter
def worker(self):
while len(self.dates) > 0:
if self.event.is_set():
print("THREAD WERKING!")
pause = random.randint(1, 5) / 5
autolog('THREAD SLEEPING %.2f' % pause, logging.debug)
time.sleep(pause)
if len(self.dates) > 0:
date = self.dates.pop(0)
self.p.get_news_for_all_stocks(verbose=True, date_=date)
else:
print("THREAD SLEEPING")
time.sleep(10) # so that the threads do not check if the event is set instantaneously
def ip_updater(self): # this is the blocker
while len(self.dates) > 0:
autolog("IP_UPDATER SLEEPING FOR: {}".format(self.update_ip_in / 4), logging.debug)
time.sleep(self.update_ip_in / 4) # do not check the condition every instance
if datetime.now() > self.start_time + timedelta(seconds=self.update_ip_in):
print("ALL THREADS SLEEP NOW!")
autolog("ALL THREADS SLEEP NOW!", logging.info)
self.event.clear() # Make all other threads sleep so that we can update the IP
time.sleep(10)
get_new_ip()
self.start_time = datetime().now()
# autolog("Obtained new IP address: {}".format(session.get(self.check_ip_url).text), logging.debug)
autolog("ALL THREADS WAKE UP NOW!", logging.info)
print("ALL THREADS WAKE UP NOW!")
self.event.set()
def run(self):
for thread in range(self.num_threads):
t = threading.Thread(target=self.worker, name='Thread-Worker-{}'.format(thread+1))
t.start()
threading.Thread(target=self.ip_updater, name='Thread-IPUpdater-1').start()
重写所有内容以使event
和start_time
成为全局变量不能解决该问题。例如:
class ThreadedNewsParser(object):
"""
This little guy parses the news with multiple threads and dynamically changes the ip of the sessions
"""
def __init__(self, update_ip_in, num_threads, date_start, date_end):
assert isinstance(num_threads, int)
assert num_threads > 0
assert any(isinstance(date_start, type_) for type_ in [datetime, date])
assert any(isinstance(date_end, type_) for type_ in [datetime, date])
self.update_ip_in = update_ip_in
self.check_ip_url = 'https://httpbin.org/ip'
autolog("STARTING WORK ON IP: {}".format(session.get(self.check_ip_url).text), logging.debug)
self.num_threads = num_threads
self.date_start = date_start
self.date_end = date_end
self.dates = [date for date in date_range(date_start, date_end)]
self.p = DailyCompanyNewsParser(2008, 1, 1) # the date here does not matter
def worker(self):
global event
while len(self.dates) > 0:
if event.is_set():
print("THREAD WERKING!")
pause = random.randint(1, 5) / 5
autolog('THREAD SLEEPING %.2f' % pause, logging.debug)
time.sleep(pause)
if len(self.dates) > 0:
date = self.dates.pop(0)
self.p.get_news_for_all_stocks(verbose=True, date_=date)
else:
print("THREAD SLEEPING")
time.sleep(10) # so that the threads do not check if the event is set instantaneously
def ip_updater(self): # this is the blocker
global start_time
global event
while len(self.dates) > 0:
autolog("IP_UPDATER SLEEPING FOR: {}".format(self.update_ip_in / 4), logging.debug)
time.sleep(self.update_ip_in / 4) # do not check the condition every instance
if datetime.now() > start_time + timedelta(seconds=self.update_ip_in):
print("ALL THREADS SLEEP NOW!")
autolog("ALL THREADS SLEEP NOW!", logging.info)
event.clear() # Make all other threads sleep so that we can update the IP
time.sleep(10)
get_new_ip()
start_time = datetime().now()
# autolog("Obtained new IP address: {}".format(session.get(self.check_ip_url).text), logging.debug)
autolog("ALL THREADS WAKE UP NOW!", logging.info)
print("ALL THREADS WAKE UP NOW!")
event.set()
def run(self):
for thread in range(self.num_threads):
t = threading.Thread(target=self.worker, name='Thread-Worker-{}'.format(thread+1))
t.start()
threading.Thread(target=self.ip_updater, name='Thread-IPUpdater-1').start()