I need to get the contents of a selected folder as an array of strings, so that I can run a script on each file in a for loop. The way I currently do it, the contents come back as a 2-D char array.
Is there a way to get the contents directly as an array of strings, rather than looping over each row and converting it to a string?
directory = uigetdir;
list = ls(strcat(directory, '\*.extension'))
for i = 1: ??
Answer (score: 1)
You want to use the dir function:
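Here is a minimal sketch of that approach (the '*.extension' pattern is a placeholder taken from your own code, and the variable names are only for illustration). dir returns a struct array with a name field for every matching file, which you can collect into a cell array in one step:

directory = uigetdir;
listing = dir(fullfile(directory, '*.extension'));   % struct array, one element per matching file
names = {listing.name};                              % cell array of file name char vectors
for k = 1:numel(names)
    filepath = fullfile(directory, names{k});
    % run your script on filepath here
end

If your MATLAB release supports string arrays, string({listing.name}) should give you the contents as an actual string array rather than a cell array of char vectors.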
import gc
import os
import re
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import Select

script_path = os.path.dirname(os.path.realpath(__file__))
driver = webdriver.PhantomJS(executable_path="/usr/local/bin/bin/phantomjs", service_args=['--ignore-ssl-errors=true', '--ssl-protocol=any'])
case_list = []
# this function launches the headless browser and gets us to the first page of results, which we'll scrape using main
def search():
    driver.get('https://www.courts.mo.gov/casenet/cases/nameSearch.do')
    if 'Service Unavailable' in driver.page_source:
        # log() is assumed to be defined elsewhere in the original script
        log('Casenet website seems to be down. Receiving "service unavailable"')
        driver.quit()
        gc.collect()
        return False
    time.sleep(2)
    # fill in the search form and submit it
    court = Select(driver.find_element_by_id('courtId'))
    court.select_by_visible_text('All Participating Courts')
    case_enter = driver.find_element_by_id('inputVO.lastName')
    case_enter.send_keys('Wakefield & Associates')
    year_enter = driver.find_element_by_id('inputVO.yearFiled')
    year_enter.send_keys('2018')
    driver.find_element_by_id('findButton').click()
    time.sleep(3)
# scrapes the results table and stores what we need in a list of lists
def main():
    parties = []
    dates = []
    case_nums = []
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.findAll('table', {'class': 'outerTable'})
    for row in table:
        col = row.find_all('td', attrs={'class': 'td1'})
        col2 = row.find_all('td', attrs={'class': 'td2'})
        all_links = soup.findAll('a')
        # party names appear in both the td1 and td2 columns
        for cols in col:
            if 'V' in cols.text:
                cols = cols.string
                cols = re.sub(u"\xa0", u"", cols).strip()
                parties.append(cols)
        for cols in col2:
            if 'V' in cols.text:
                cols = cols.string
                cols = re.sub(u"\xa0", u"", cols).strip()
                parties.append(cols)
        # case numbers are embedded in the goToThisCase(...) javascript links
        for link in all_links:
            raw_html = str(link)
            if 'goToThisCase' in raw_html:
                start = raw_html.find("('") + 2
                end = raw_html.find("',")
                case = raw_html[start:end].strip()
                case_nums.append(case)
        # filing dates appear in both columns as well
        for i in col:
            if '/2018' in i.text:
                i = i.string
                i = re.sub(u"\xa0", u"", i).strip()
                dates.append(i)
        for j in col2:
            if '/2018' in j.text:
                j = j.string
                j = re.sub(u"\xa0", u"", j).strip()
                dates.append(j)
    case_list.append(parties)
    case_list.append(case_nums)
    case_list.append(dates)
    return case_list
def page_looper():
    # scrape the first page of results, which search() already loaded
    main()
    count = '1'
    print "page %s fully scraped" % count
    count = str(int(count) + 1)
    print len(case_list), " cases so far"
    print case_list
    # pages 2 through 8 are reachable through numbered links
    for count in range(2, 9):
        link = driver.find_element_by_link_text(str(count))
        link.click()
        time.sleep(2)
        main()
        print "page %s fully scraped" % count
        count = str(int(count) + 1)
        print len(case_list), " cases so far"
        print case_list
    # move on to the next block of result pages and recurse;
    # when no 'Next' link is left we are done
    try:
        next_page_link = driver.find_element_by_partial_link_text('Next')
        print "Next 10 pages found"
        next_page_link.click()
        time.sleep(2)
        page_looper()
    except Exception:
        print "no more cases"
#pprint.pprint(case_list)
#data = zip(case_list[0],case_list[1],case_list[2])
#pprint.pprint(data)
# with open(script_path + "/cases.csv", "w") as f:
# writer = csv.writer(f)
# for d in data:
# writer.writerow(d)
search()
page_looper()