在类内运行并发期货时发生TypeError

时间:2018-11-23 02:09:23

标签: python python-3.x multithreading concurrency

我正在构建一个类,该类使我可以从站点上挖掘出一组链接,然后使用并发.futures来检查链接的有效性。

当我运行函数multithreaded_link_checking时,它运行正常。

However, when its inside a class, it returns this error:

Traceback (most recent call last):
  File "datahandler.py", line 236, in <module>
    data.multithreaded_link_checking(links)
  File "datahandler.py", line 209, in multithreaded_link_checking
    with concurrent.futures.ThreadPoolExecutor(max_workers) as executor:
  File "/anaconda3/lib/python3.7/concurrent/futures/thread.py", line 128, in __init__
    if max_workers <= 0:
TypeError: '<=' not supported between instances of 'list' and 'int'

我的代码如下:

class DataHandler:

    def __init__(self, url, file=None,):
        self.file = file
        self.url = url

    def get_links_a_on_page(self):
        """
        Gets a links from the igm website
        returns list -> links
        """
        # TODO: Check if URL is correct
        print('checking site')
        site = requests.get(self.url)
        soup = BeautifulSoup(site.text, 'html.parser')
        print("URL received, cleaning links.")

        # Find all the href on self.url
        links = [a_link['href'] for a_link in soup.find_all("a", href=True)]

        for n, i in enumerate(links):
            clean_link = re.search("http:(.*)", i)
            links[n] = clean_link.group(0)
        print("Cleaning URL")
        return links

    def get_links_from_csv(self):
        """

        :return list -> links:
        """

        # TODO: Check if File is CSV
        # TODO: Check if File has links
        links = []
        try:
            with open(self.file, newline='') as csvfile:
                spamreader = csv.reader(csvfile, delimiter=' ', quotechar='|')
                for row in spamreader:
                    links.append(', '.join(row))
        except FileNotFoundError:
            print("File not found")
        return links

    def check_links_urllib3_helper(link, return_links=True):
        """
        Checks the response code of the url
        :param site_url:
        :return response code:
        """
        if return_links is True:
            # Initialize urllib manager -- highly efficient
            http = urllib3.PoolManager()
            # HEAD to get header values -- much faster
            r = http.request("HEAD", link)
            if r.status == 200:
                return link
        if return_links is False:
            # Initialize urllib manager -- highly efficient
            http = urllib3.PoolManager()
            # HEAD to get header values -- much faster
            r = http.request("HEAD", link)
            return r.status


    def multithreaded_link_checking(links, max_workers=99):
        """
        multithreaded operation to review a set of links and identifies working links vs. 404 codes
        :param links:
        :return None:
        """
        # codes list in order to log the response
        downloadable_links = []
        # Use the ThreadPoolExecutor to run concurrent processes
        with concurrent.futures.ThreadPoolExecutor(max_workers) as executor:
            # set the size to know how many links are left -- do this to make sure the threads dont bug out
            size = len(links)
            # map(function,iterable)
            for i in executor.map(check_links_urllib3_helper, links):
                print("Links Left: ", size)
                downloadable_links.append(i)
                size -= 1
        return downloadable_links

1 个答案:

答案 0 :(得分:0)

固定。

需要将self添加为multithreaded_link_checking()的位置参数