从“创建的”网址中提取数据

时间:2014-02-07 23:27:08

标签: python html json csv time

我有以下定义的函数,并试图弄清楚如何从csv中提取值并将它们放入URL的“theid”部分。

def get_csv_column(csv_fname, col, **kwargs):
    """Return a list of every value in column *col* of the CSV file.

    Any extra keyword arguments (delimiter, quotechar, ...) are passed
    straight through to csv.reader.
    """
    with open(csv_fname, 'rb') as inf:
        reader = csv.reader(inf, **kwargs)
        return [record[col] for record in reader]

def withid(theid):
    """Fetch and memoize the OMDB record for movie id *theid*.

    Builds the request URL from the module-level OMDBURL, downloads and
    JSON-decodes the response, and appends the record to the global
    ``cache`` list unless an entry with the same ``MKEY[1]`` value is
    already cached.  Returns the decoded record either way.
    """
    global cache

    # NOTE(review): OMDBURL, MKEY and cache are assumed to be defined at
    # module level elsewhere in this file.
    theurl = "{0}{1}{2}".format(OMDBURL, "?i=", theid)
    response = urllib2.urlopen(theurl)
    movdata = json.load(response)

    # any() short-circuits on the first duplicate, unlike the original
    # flag variable that kept scanning the whole cache.
    if not any(movdata[MKEY[1]] == mov[MKEY[1]] for mov in cache):
        cache.append(movdata)
    return movdata
###I thought this loop below could reach in, pull the data and add a 5 second delay after each request

 # NOTE(review): this is the broken part of the question, kept verbatim.
 # `withid` is a function object, so `for column in withid` raises
 # TypeError (functions are not iterable) — it was presumably meant to
 # iterate over a list of ids and call withid(id) for each one.
 # `movdata` is also undefined in this scope, and the `with` line is
 # indented one space, which is a SyntaxError at top level.
 with open('step3_desired_output.txt','w') as step3:
    for column in withid:
       step3.write(movdata)
       time.sleep(5)

这似乎不起作用,我只是迷失了如何继续。

1 个答案:

答案 0（得分：0）

接续此前的问题 Trying to run a defined function with a delay：

import csv
import json
import time
import urllib2

# Tuning knobs and file locations for the scraping run.
PAGE_DELAY = 5.    # time between loading pages
PAGE_LOAD  = 0.3   # how long it takes to load a page
INFILE     = 'outputrows2.csv'            # CSV whose column 0 holds the title ids
OUTFILE    = 'step3_desired_output.txt'   # downloaded page bodies are written here

# Bound method: make_url(id) fills the id into the IMDb title URL template.
make_url = 'http://www.imdb.com/title/tt{}/'.format

def get_csv_column(csv_fname, col, **kwargs):
    """Collect column *col* from every row of *csv_fname* into a list.

    Extra keyword arguments are forwarded verbatim to csv.reader.
    """
    with open(csv_fname, 'rb') as inf:
        rows = csv.reader(inf, **kwargs)
        return [row[col] for row in rows]

def get_data_by_id(id):
    """Download and return the raw IMDb page body for the given title id."""
    response = urllib2.urlopen(make_url(id))
    return response.read()

def delayed(delay, fn, *args, **kwargs):
    """Sleep *delay* seconds, then call ``fn(*args, **kwargs)`` and return its result.

    Generalized to also forward keyword arguments; existing positional-only
    callers are unaffected.
    """
    time.sleep(delay)
    return fn(*args, **kwargs)

def human_time(seconds):
    """Render a duration in seconds as a rough human-readable string.

    Picks the largest unit (days, hours, minutes, seconds) the duration
    reaches and formats the value with one decimal place.
    """
    if seconds >= 86400:
        return '{:0.1f} days'.format(seconds / 86400.)
    elif seconds >= 3600:
        return '{:0.1f} hours'.format(seconds / 3600.)
    elif seconds >= 60:
        # BUG FIX: the original formatted the undefined name `minutes`,
        # raising NameError for any 60 <= seconds < 3600.
        return '{:0.1f} minutes'.format(seconds / 60.)
    else:
        return '{:0.1f} seconds'.format(seconds)

def main():
    """Scrape every id listed in INFILE and write the pages to OUTFILE.

    Prints a rough time estimate first, then waits PAGE_DELAY seconds
    before each request.
    """
    ids = get_csv_column(INFILE, 0)

    estimate = len(ids) * (PAGE_DELAY + PAGE_LOAD)
    print('This will take about {}.'.format(human_time(estimate)))

    with open(OUTFILE, 'w') as outf:
        for movie_id in ids:
            outf.write(delayed(PAGE_DELAY, get_data_by_id, movie_id))

# Standard entry-point guard: run the scrape only when executed as a script.
if __name__=="__main__":
    main()