Question

我需要从一个网站收集研究，而一次只能以xml显示1000个研究。我有一个案例，其中有1000多个研究，因此我需要进行迭代，直到掌握所有内容为止。就我而言，我尝试使用递归函数来实现。我知道这不是唯一的方法，但是我是如此亲密，以至于我想知道这里出了什么问题。

from xml.etree import ElementTree as ET

def collect_ct_ids(nct_ids=[], start=1):    
    url = f'https://www.clinicaltrials.gov/ct2/results?&type=Intr&intr=%22NICOTINE+BITARTRATE%22%20OR%20%22PROSTEP%22%20OR%20%22NICOTROL%22%20OR%20%22NICOTINE+POLACRILEX%22%20OR%20%22NICORETTE%22%20OR%20%22NICODERM+CQ%22%20OR%20%22HABITROL%22%20OR%20%22NICOTINE%22&down_fmt=csv&down_flds=all?displayxml=true&start={start}&count=10000'
    print(url)

    # read xml
    response = requests.get(url)
    root = ET.fromstring(response.content)

    for child in root.iter('search_results'):
        n_studies = int(child.attrib['count'])

    # store studies
    nct_ids = nct_ids + [x.find('nct_id').text for x in root.findall('clinical_study')]

    print(n_studies, len(nct_ids))

    ## check that the default url allows to collect all the studies
    if n_studies == len(nct_ids):
        # return the studies
        print('if')
        print(nct_ids)
        return(nct_ids)   

    else:
        # reiterate until every study has been collected
        print('else')
        start += 1000
        return(nct_ids.append(collect_ct_ids(nct_ids, start)))

基本上，当nct_ids对象包含1342个研究时，该函数应返回一个包含1342个研究的列表。

欢迎任何帮助。

编辑2：

解决方案是使用.__ add__在列表中添加元素，并且无需在第二次返回中添加到列表中，也无需复制元素。

from xml.etree import ElementTree as ET

def collect_ct_ids(nct_ids=[], start=1):    
    url = f'https://www.clinicaltrials.gov/ct2/results?&type=Intr&intr=%22NICOTINE+BITARTRATE%22%20OR%20%22PROSTEP%22%20OR%20%22NICOTROL%22%20OR%20%22NICOTINE+POLACRILEX%22%20OR%20%22NICORETTE%22%20OR%20%22NICODERM+CQ%22%20OR%20%22HABITROL%22%20OR%20%22NICOTINE%22&down_fmt=csv&down_flds=all?displayxml=true&start={start}&count=1000'
    print(url)

    # read xml
    response = requests.get(url)
    root = ET.fromstring(response.content)

    for child in root.iter('search_results'):
        n_studies = int(child.attrib['count'])

    # store studies
    nct_ids = nct_ids.__add__([x.find('nct_id').text for x in root.findall('clinical_study')])

    print(n_studies, len(nct_ids))

    ## check that the default url allows to collect all the studies
    if n_studies == len(nct_ids):
        # return the studies
        print('if')
        #print(nct_ids)
        return(nct_ids)   

    else:
        # reiterate until every study has been collected
        print('else')
        start += 1000
        return(nct_ids.__add__(collect_ct_ids(nct_ids, start)))

尽管列表不为空，Python递归函数也不会返回列表对象

0 个答案: