这是我进行网页抓取的代码。我使用 concurrent.futures 来加快速度。但是有一个我不知道如何解决的问题。
import logging
import json
import requests
import re
from lxml import etree
from concurrent.futures import ThreadPoolExecutor
import time
def chem960(cas_no=str(64173962), time_out=(10, 20)):
    """Scrape the SMILES string for *cas_no* from chem960.com.

    Args:
        cas_no: compound identifier embedded in the site URL.
        time_out: (connect, read) timeout tuple passed to requests.get.

    Returns:
        ``{"smiles": ..., "source": "chem960"}`` on success, or just
        ``{"source": "chem960"}`` when the lookup fails for any reason
        (best-effort scraper: callers inspect the dict for the key).
    """
    try:
        r = requests.get(
            "http://www.chem960.com/structure/sc{cas_no}".format(cas_no=cas_no),
            timeout=time_out)
        root = etree.HTML(r.text)
        # Absolute XPath copied from the page layout; brittle if the site changes.
        selector = "/html/body/div[6]/div[2]/div[1]/div[2]/p[1]/text()"
        smiles_key = root.xpath(selector)
        smiles = smiles_key[1].strip()
        logging.info("finish chem960")
        return {"smiles": smiles, "source": "chem960"}
    except Exception:
        # Was a bare ``except:`` which also swallowed SystemExit/KeyboardInterrupt
        # and hid every error. Keep the deliberate best-effort behaviour, but
        # catch only Exception and record the traceback for diagnosis.
        logging.exception("chem960 failed for cas_no=%s", cas_no)
        return {"source": "chem960"}
def chemexper(cas_no="2040906", time_out=(10, 20)):
    """Scrape the InChIKey for *cas_no* from chemexper.com.

    Follows three pages: the search page, the results iframe it embeds,
    and the final compound page linked as "here".

    Args:
        cas_no: compound identifier substituted into the search query.
        time_out: (connect, read) timeout tuple passed to requests.get.

    Returns:
        ``{"InChIKey": ..., "source": "chemexper"}`` on success, or just
        ``{"source": "chemexper"}`` when any step fails (best-effort).
    """
    try:
        url = "http://www.chemexper.com/searchResult.shtml?format=ccd2013%2Cccd&target=structure&options=brandqtyoffercrm&searchValue="+cas_no+"&searchTemplate=rn.value%3D%22%3F%22&Search=Search"
        start_html = requests.get(url, timeout=time_out).text
        # The search page embeds the real results in an iframe.
        index_link = re.findall('<iframe id="searchResultsFrame" src="(.*?)">', start_html)
        index_link = index_link[0]
        index_html = requests.get(index_link, timeout=time_out).text
        # The iframe page links to the compound detail page as "here".
        final_url = re.findall('<a href="(.*?)">here</a>', index_html)
        final_url = "http://newsearch.chemexper.com/"+final_url[0]
        final_html = requests.get(final_url, timeout=time_out).text
        root = etree.HTML(final_html)
        # Absolute XPath into the detail table; brittle if the site changes.
        InChIKey = root.xpath('/html/body/table[1]//tr[2]/td[1]/table//tr[2]/td/text()')[0]
        logging.info("finish chemexper")
        return {"InChIKey": InChIKey, "source": "chemexper"}
    except Exception:
        # Was a bare ``except:``; narrow to Exception and keep the traceback
        # instead of silently discarding every error.
        logging.exception("chemexper failed for cas_no=%s", cas_no)
        return {"source": "chemexper"}
def ncbi(cas_no="2040906", time_out=(10, 20)):
    """Scrape the SMILES string for *cas_no* from PubChem via NCBI search.

    Searches pccompound for the identifier, extracts the first PubChem
    compound id from the result page, then pulls the compound record as
    JSON from the PUG-View REST API.

    Args:
        cas_no: compound identifier appended to the search query.
        time_out: (connect, read) timeout tuple passed to requests.get.

    Returns:
        ``{"smiles": ..., "source": "ncbi"}`` on success, or just
        ``{"source": "ncbi"}`` when any step fails (best-effort).
    """
    try:
        url = "https://www.ncbi.nlm.nih.gov/pccompound?term="+cas_no
        start_html = requests.get(url, timeout=time_out).text
        # Raw string: ``\d`` in a plain literal is an invalid escape sequence
        # (DeprecationWarning, SyntaxError in future Python versions).
        findword = r"(https://pubchem.ncbi.nlm.nih.gov/compound/\d{5,10})"
        pattern = re.compile(findword)
        results = [i.split('/')[-1] for i in pattern.findall(start_html)]
        pid = results[0]
        pull_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/"+str(pid)+"/JSON/?"
        index_html = requests.get(pull_url, timeout=time_out).text
        root = json.loads(index_html)
        # Hard-coded path into the PUG-View JSON tree; breaks if the record
        # layout shifts — the except below then returns the fallback dict.
        smiles = root['Record']['Section'][2]['Section'][1]['Section'][3]['Information'][0]['StringValue']
        logging.info("finish ncbi")
        return {"smiles": smiles, "source": "ncbi"}
    except Exception:
        # Was a bare ``except:``; narrow to Exception and log the traceback
        # instead of silently discarding every error.
        logging.exception("ncbi failed for cas_no=%s", cas_no)
        return {"source": "ncbi"}
if __name__ == "__main__":
    # Run the three scrapers concurrently and return as soon as the FIRST
    # one finishes: wait(..., FIRST_COMPLETED) unblocks on the first result,
    # cancel() drops the futures that have not started yet, and
    # shutdown(wait=False) lets the process exit without blocking on the
    # threads that are still running (they cannot be forcibly killed).
    from concurrent.futures import wait, FIRST_COMPLETED
    cas_no = "2040906"
    pools = ThreadPoolExecutor(max_workers=3)
    futures = [pools.submit(chem960, cas_no),
               pools.submit(chemexper, cas_no),
               pools.submit(ncbi, cas_no)]
    done, not_done = wait(futures, return_when=FIRST_COMPLETED)
    for future in not_done:
        future.cancel()  # only cancels tasks not yet picked up by a worker
    pools.shutdown(wait=False)
    data = [f.result() for f in done]
我希望只要任何一个 pools.submit 的任务先返回结果，就立即退出并关闭线程池。我应该如何修改我的代码？ThreadPoolExecutor 有什么方法可以实现吗？
答案 0（得分：0）
使用以下内容（注意 FIRST_COMPLETED 需要从 concurrent.futures 导入）：
from concurrent.futures import wait, FIRST_COMPLETED
done, not_done = wait(data, return_when=FIRST_COMPLETED)
pools.shutdown(wait=False)