I am building a class that lets me scrape a set of links from a site and then use concurrent.futures to check whether those links are valid.
When I run the function multithreaded_link_checking on its own, it works fine.
However, when it is inside a class, it returns this error:
Traceback (most recent call last):
File "datahandler.py", line 236, in <module>
data.multithreaded_link_checking(links)
File "datahandler.py", line 209, in multithreaded_link_checking
with concurrent.futures.ThreadPoolExecutor(max_workers) as executor:
File "/anaconda3/lib/python3.7/concurrent/futures/thread.py", line 128, in __init__
if max_workers <= 0:
TypeError: '<=' not supported between instances of 'list' and 'int'
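The traceback already hints at the cause: max_workers somehow received a list. A minimal, made-up reproduction (a hypothetical Demo class, nothing from my actual code) shows how a method defined without self produces exactly this shift of arguments:

class Demo:
    def run(links, max_workers=99):  # note: no self parameter
        # the same check ThreadPoolExecutor.__init__ performs
        if max_workers <= 0:
            raise ValueError("max_workers must be greater than 0")
        print("would use", max_workers, "workers")

demo = Demo()
demo.run(["http://a", "http://b"])
# Python passes the instance as the first argument, so 'links' receives
# the Demo object, the list lands in max_workers, and the comparison raises:
# TypeError: '<=' not supported between instances of 'list' and 'int'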
My code is as follows:
import concurrent.futures
import csv
import re

import requests
import urllib3
from bs4 import BeautifulSoup


class DataHandler:
    def __init__(self, url, file=None):
        self.file = file
        self.url = url

    def get_links_a_on_page(self):
        """
        Gets the <a> links from the igm website
        :return list -> links:
        """
        # TODO: Check if URL is correct
        print('checking site')
        site = requests.get(self.url)
        soup = BeautifulSoup(site.text, 'html.parser')
        print("URL received, cleaning links.")
        # Find all the hrefs on self.url
        links = [a_link['href'] for a_link in soup.find_all("a", href=True)]
        for n, i in enumerate(links):
            clean_link = re.search("http:(.*)", i)
            links[n] = clean_link.group(0)
        print("Cleaning URL")
        return links

    def get_links_from_csv(self):
        """
        :return list -> links:
        """
        # TODO: Check if File is CSV
        # TODO: Check if File has links
        links = []
        try:
            with open(self.file, newline='') as csvfile:
                spamreader = csv.reader(csvfile, delimiter=' ', quotechar='|')
                for row in spamreader:
                    links.append(', '.join(row))
        except FileNotFoundError:
            print("File not found")
        return links

    def check_links_urllib3_helper(link, return_links=True):
        """
        Checks the response code of the url
        :param link:
        :return response code:
        """
        if return_links is True:
            # Initialize the urllib3 pool manager -- highly efficient
            http = urllib3.PoolManager()
            # HEAD request to get only the header values -- much faster
            r = http.request("HEAD", link)
            if r.status == 200:
                return link
        if return_links is False:
            # Initialize the urllib3 pool manager -- highly efficient
            http = urllib3.PoolManager()
            # HEAD request to get only the header values -- much faster
            r = http.request("HEAD", link)
            return r.status

    def multithreaded_link_checking(links, max_workers=99):
        """
        Multithreaded operation that reviews a set of links and separates
        working links from 404 responses.
        :param links:
        :return list -> downloadable_links:
        """
        # list that logs the working links
        downloadable_links = []
        # Use the ThreadPoolExecutor to run the checks concurrently
        with concurrent.futures.ThreadPoolExecutor(max_workers) as executor:
            # track the size to know how many links are left
            size = len(links)
            # map(function, iterable)
            for i in executor.map(check_links_urllib3_helper, links):
                print("Links Left: ", size)
                downloadable_links.append(i)
                size -= 1
        return downloadable_links
Answer 0 (score: 0)

Fixed it. self needs to be added as a positional parameter to multithreaded_link_checking().
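In other words, data.multithreaded_link_checking(links) passes the instance as the first argument, so without self the list ends up in max_workers. A minimal sketch of the two corrected methods (the helper also needs self once it is called through the instance, a detail the answer does not spell out; imports as in the question):

class DataHandler:
    # ... __init__, get_links_a_on_page and get_links_from_csv unchanged ...

    def check_links_urllib3_helper(self, link, return_links=True):
        # HEAD request to fetch only the headers -- much faster than GET
        http = urllib3.PoolManager()
        r = http.request("HEAD", link)
        if return_links:
            return link if r.status == 200 else None
        return r.status

    def multithreaded_link_checking(self, links, max_workers=99):
        downloadable_links = []
        with concurrent.futures.ThreadPoolExecutor(max_workers) as executor:
            size = len(links)
            # self.check_links_urllib3_helper is a bound method, so
            # executor.map only needs to supply the link argument
            for i in executor.map(self.check_links_urllib3_helper, links):
                print("Links Left: ", size)
                downloadable_links.append(i)
                size -= 1
        return downloadable_links

With these signatures, data.multithreaded_link_checking(links) binds data to self, links to links, and max_workers keeps its default of 99.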