这是我进行网页抓取的代码。我使用 concurrent.futures 来加快速度。但是有一个我不知道如何解决的问题。
import logging
import json
import requests
import re
from lxml import etree
from concurrent.futures import ThreadPoolExecutor
import time
def chem960(cas_no=str(64173962), time_out=(10, 20)):
    """Scrape the SMILES string for *cas_no* from chem960.com.

    Args:
        cas_no: compound identifier embedded in the site URL.
        time_out: (connect, read) timeout tuple passed to requests.get.

    Returns:
        ``{"smiles": ..., "source": "chem960"}`` on success, or just
        ``{"source": "chem960"}`` when the lookup fails for any reason
        (best-effort scraper: callers inspect the dict for the key).
    """
    try:
        r = requests.get(
            "http://www.chem960.com/structure/sc{cas_no}".format(cas_no=cas_no),
            timeout=time_out)
        root = etree.HTML(r.text)
        # Absolute XPath copied from the page layout; brittle if the site changes.
        selector = "/html/body/div[6]/div[2]/div[1]/div[2]/p[1]/text()"
        smiles_key = root.xpath(selector)
        smiles = smiles_key[1].strip()
        logging.info("finish chem960")
        return {"smiles": smiles, "source": "chem960"}
    except Exception:
        # Was a bare ``except:`` which also swallowed SystemExit/KeyboardInterrupt
        # and hid every error. Keep the deliberate best-effort behaviour, but
        # catch only Exception and record the traceback for diagnosis.
        logging.exception("chem960 failed for cas_no=%s", cas_no)
        return {"source": "chem960"}
def chemexper(cas_no="2040906", time_out=(10, 20)):
    """Scrape the InChIKey for *cas_no* from chemexper.com.

    Follows three pages: the search page, the results iframe it embeds,
    and the final compound page linked as "here".

    Args:
        cas_no: compound identifier substituted into the search query.
        time_out: (connect, read) timeout tuple passed to requests.get.

    Returns:
        ``{"InChIKey": ..., "source": "chemexper"}`` on success, or just
        ``{"source": "chemexper"}`` when any step fails (best-effort).
    """
    try:
        url = "http://www.chemexper.com/searchResult.shtml?format=ccd2013%2Cccd&target=structure&options=brandqtyoffercrm&searchValue="+cas_no+"&searchTemplate=rn.value%3D%22%3F%22&Search=Search"
        start_html = requests.get(url, timeout=time_out).text
        # The search page embeds the real results in an iframe.
        index_link = re.findall('<iframe id="searchResultsFrame" src="(.*?)">', start_html)
        index_link = index_link[0]
        index_html = requests.get(index_link, timeout=time_out).text
        # The iframe page links to the compound detail page as "here".
        final_url = re.findall('<a href="(.*?)">here</a>', index_html)
        final_url = "http://newsearch.chemexper.com/"+final_url[0]
        final_html = requests.get(final_url, timeout=time_out).text
        root = etree.HTML(final_html)
        # Absolute XPath into the detail table; brittle if the site changes.
        InChIKey = root.xpath('/html/body/table[1]//tr[2]/td[1]/table//tr[2]/td/text()')[0]
        logging.info("finish chemexper")
        return {"InChIKey": InChIKey, "source": "chemexper"}
    except Exception:
        # Was a bare ``except:``; narrow to Exception and keep the traceback
        # instead of silently discarding every error.
        logging.exception("chemexper failed for cas_no=%s", cas_no)
        return {"source": "chemexper"}
def ncbi(cas_no="2040906", time_out=(10, 20)):
    """Scrape the SMILES string for *cas_no* from PubChem via NCBI search.

    Searches pccompound for the identifier, extracts the first PubChem
    compound id from the result page, then pulls the compound record as
    JSON from the PUG-View REST API.

    Args:
        cas_no: compound identifier appended to the search query.
        time_out: (connect, read) timeout tuple passed to requests.get.

    Returns:
        ``{"smiles": ..., "source": "ncbi"}`` on success, or just
        ``{"source": "ncbi"}`` when any step fails (best-effort).
    """
    try:
        url = "https://www.ncbi.nlm.nih.gov/pccompound?term="+cas_no
        start_html = requests.get(url, timeout=time_out).text
        # Raw string: ``\d`` in a plain literal is an invalid escape sequence
        # (DeprecationWarning, SyntaxError in future Python versions).
        findword = r"(https://pubchem.ncbi.nlm.nih.gov/compound/\d{5,10})"
        pattern = re.compile(findword)
        results = [i.split('/')[-1] for i in pattern.findall(start_html)]
        pid = results[0]
        pull_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/"+str(pid)+"/JSON/?"
        index_html = requests.get(pull_url, timeout=time_out).text
        root = json.loads(index_html)
        # Hard-coded path into the PUG-View JSON tree; breaks if the record
        # layout shifts — the except below then returns the fallback dict.
        smiles = root['Record']['Section'][2]['Section'][1]['Section'][3]['Information'][0]['StringValue']
        logging.info("finish ncbi")
        return {"smiles": smiles, "source": "ncbi"}
    except Exception:
        # Was a bare ``except:``; narrow to Exception and log the traceback
        # instead of silently discarding every error.
        logging.exception("ncbi failed for cas_no=%s", cas_no)
        return {"source": "ncbi"}
if __name__ == "__main__":
    # Run the three scrapers concurrently and return as soon as the FIRST
    # one finishes: wait(..., FIRST_COMPLETED) unblocks on the first result,
    # cancel() drops the futures that have not started yet, and
    # shutdown(wait=False) lets the process exit without blocking on the
    # threads that are still running (they cannot be forcibly killed).
    from concurrent.futures import wait, FIRST_COMPLETED
    cas_no = "2040906"
    pools = ThreadPoolExecutor(max_workers=3)
    futures = [pools.submit(chem960, cas_no),
               pools.submit(chemexper, cas_no),
               pools.submit(ncbi, cas_no)]
    done, not_done = wait(futures, return_when=FIRST_COMPLETED)
    for future in not_done:
        future.cancel()  # only cancels tasks not yet picked up by a worker
    pools.shutdown(wait=False)
    data = [f.result() for f in done]
我希望只要任何一个 pools.submit 的任务先返回结果，就立即退出并关闭线程池。我应该如何修改我的代码？ThreadPoolExecutor 有什么方法可以实现吗？
答案 0（得分：0）
使用以下内容（注意 FIRST_COMPLETED 需要从 concurrent.futures 导入）：
from concurrent.futures import wait, FIRST_COMPLETED
done, not_done = wait(data, return_when=FIRST_COMPLETED)
pools.shutdown(wait=False)