处理数据块 - 请求/ Python

时间:2016-01-07 15:44:48

标签: python dictionary python-requests bigdata

我有一些巨大的文件需要处理和搜索,我发现requests将是一个很好的库。 要处理的文件(小一个)的示例是:

https://storage.googleapis.com/tlc-trip-data/2015/green_tripdata_2015-06.csv

这是我处理此类数据的代码:

import requests
import csv

def consumeTaxiData(url):
    """
    Given a url, reads its content and process its data.
    :param url: the url to be readen.
    :return: a list of tuples in the form (long, lat, hour).
    """
    print "Processing", url
    points = []

    r = requests.get(url, stream=True)
    keys = None
    i = 0
    for chunk in r.iter_content(chunk_size=1024):
        if chunk:
            if i == 0:
                reader = csv.DictReader(chunk.splitlines(), delimiter=',')
            else:
                reader = csv.DictReader(chunk.splitlines(), fieldnames=keys, delimiter=',')
            for line in reader:
                if i == 0:
                    keys = line.keys()
                    print "Keys", keys

                latitude = line.get('dropoff_latitude', None)
                if latitude is None:
                    latitude = line.get('Dropoff_latitude', None)

                longitude = line.get('dropoff_longitude', None)
                if longitude is None:
                    longitude = line.get('Dropoff_longitude', None)

                time = line.get('tpep_dropoff_datetime', None)
                if time is None:
                    time = line.get('Lpep_dropoff_datetime', None)
                if time is not None and latitude is not None and longitude is not None and \
                   datetime.strptime(time, '%Y-%m-%d %H:%M:%S') >= datetime.strptime(date, '%Y-%m-%d'):
                    time = roundTime(datetime.strptime(time, '%Y-%m-%d %H:%M:%S'), roundTo=60 * 60).hour
                    points.append((longitude, latitude, time))
                i+=1

    return points

数据的第一行包含其字段名称,因为我只对其中三个感兴趣,但我使用的是DictReader。 但是,print语句给了我这个:

 Keys ['VendorID', 'Total_amount', 'lpep_pickup_datetime', 'Passenger_count', 'Payment_type', 'Store_and_fwd_flag', 'Pickup_latitude', 'Trip_type ', 'Lpep_dropoff_datetime', 'RateCodeID', 'Trip_distance', 'Fare_amount', 'Pickup_longitude', 'Dropoff_latitude', 'Tolls_amount', 'improvement_surcharge', 'Tip_amount', None, 'Extra', 'MTA_tax', 'Ehail_fee', 'Dropoff_longitude']
{'VendorID': '2', 'Total_amount': '11.8', 'lpep_pickup_datetime': '2015-06-01 00:00:00', 'Passenger_count': '1', 'Payment_type': '2', 'Store_and_fwd_flag': 'N', 'Pickup_latitude': '40.881328582763672', 'Trip_type ': '1', 'Lpep_dropoff_datetime': '2015-06-01 00:09:32', 'RateCodeID': '1', 'Trip_distance': '2.64', 'Fare_amount': '10.5', 'Pickup_longitude': '-73.878700256347656', 'Dropoff_latitude': '40.884838104248047', 'Tolls_amount': '0', 'improvement_surcharge': '0.3', 'Tip_amount': '0', None: ['', ''], 'Extra': '0.5', 'MTA_tax': '0.5', 'Ehail_fee': '', 'Dropoff_longitude': '-73.838386535644531'}
{'VendorID': '2', 'Total_amount': '17.3', 'lpep_pickup_datetime': '2015-06-01 00:00:05', 'Passenger_count': '1', 'Payment_type': '2', 'Store_and_fwd_flag': 'N', 'Pickup_latitude': '40.876182556152344', 'Trip_type ': '1', 'Lpep_dropoff_datetime': '2015-06-01 00:12:41', 'RateCodeID': '1', 'Trip_distance': '4.79', 'Fare_amount': '16', 'Pickup_longitude': '-73.906356811523438', 'Dropoff_latitude': '40.830490112304688', 'Tolls_amount': '0', 'improvement_surcharge': '0.3', 'Tip_amount': '0', None: ['', ''], 'Extra': '0.5', 'MTA_tax': '0.5', 'Ehail_fee': '', 'Dropoff_longitude': '-73.944488525390625'}
{'VendorID': '2', 'Total_amount': '10.3', 'lpep_pickup_datetime': '2015-06-01 00:00:09', 'Passenger_count': '1', 'Payment_type': '2', 'Store_and_fwd_flag': 'N', 'Pickup_latitude': '40.747196197509766', 'Trip_type ': '1', 'Lpep_dropoff_datetime': '2015-06-01 00:11:29', 'RateCodeID': '1', 'Trip_distance': '1.45', 'Fare_amount': '9', 'Pickup_longitude': '-73.887863159179688', 'Dropoff_latitude': '40.738815307617188', 'Tolls_amount': '0', 'improvement_surcharge': '0.3', 'Tip_amount': '0', None: ['', ''], 'Extra': '0.5', 'MTA_tax': '0.5', 'Ehail_fee': '', 'Dropoff_longitude': '-73.888786315917969'}
{'VendorID': '2', 'Total_amount': '5.8', 'lpep_pickup_datetime': '2015-06-01 00:00:26', 'Passenger_count': '1', 'Payment_type': '2', 'Store_and_fwd_flag': 'N', 'Pickup_latitude': '40.770065307617187', 'Trip_type ': '1', 'Lpep_dropoff_datetime': '2015-06-01 00:03:51', 'RateCodeID': '1', 'Trip_distance': '.74', 'Fare_amount': '4.5', 'Pickup_longitude': '-73.917800903320312', 'Dropoff_latitude': '40.766143798828125', 'Tolls_amount': '0', 'improvement_surcharge': '0.3', 'Tip_amount': '0', None: ['', ''], 'Extra': '0.5', 'MTA_tax': '0.5', 'Ehail_fee': '', 'Dropoff_longitude': '-73.907890319824219'}
{'Trip_distance': None, 'VendorID': '1', 'improvement_surcharge': None, 'Tip_amount': None, 'Total_amount': None, 'lpep_pickup_datetime': '2015-06-01 00:00:18', 'Extra': None, 'Pickup_latitude': '40.717', 'Ehail_fee': None, 'Fare_amount': None, 'Pickup_longitude': '-73.956329345703125', 'Tolls_amount': None, 'Dropoff_longitude': None, 'Passenger_count': None, 'Payment_type': None, 'MTA_tax': None, 'Lpep_dropoff_datetime': '2015-06-01 00:04:31', 'Store_and_fwd_flag': 'N', 'RateCodeID': '1', 'Dropoff_latitude': None, 'Trip_type ': None}
['121124267578,-73.950599670410156,40.723434448242187,1,.80,5,0.5,0.5,1.25,0,,0.3,7.55,1,1,,', '2,2015-06-01 00:00:16,2015-06-01 00:10:29,N,1,-73.939163208007812,40.816555023193359,-73.938468933105469,40.796218872070313,1,1.94,9.5,0.5,0.5,0,0,,0.3,10.8,2,1,,', '2,2015-06-01 00:00:29,2015-06-01 00:26:47,N,1,-73.941329956054687,40.813583374023438,-73.918571472167969,40.811511993408203,1,6.26,22.5,0.5,0.5,0,0,,0.3,23.8,2,1,,', '2,2015-06-01 00:01:15,2015-06-01 00:04:11,N,1,-73.997383117675781,40.674507141113281,-73.98590087890625,40.67755126953125,1,.90,5,0.5,0.5,1.26,0,,0.3,7.56,1,1,,', '2,2015-06-01 00:00:39,2015-06-01 00:06:35,N,1,-73.891006469726563,40.746994018554687,-73.880416870117187,40.749176025390625,1,.71,5.5,0.5,0.5,0,0,,0.3,6.8,2,1,,', '2,2015-06-01 00:00:34,2015-06-01 00:10:13,N,1,-73.969017028808594,40.693115234375,-73.950355529785156,40.706508636474609,2,1.96,9,0.5,0.5,0,0,,0.3,10.3,2,1,,', '2,2015-06-01 00:01:06,2015-06-01 00:32:00,N,1,-73.928153991699219,40.695011138916016,-73.954338073730469,40.773025512695']
{'VendorID': '121124267578', 'Total_amount': '-73.950599670410156', 'lpep_pickup_datetime': '40.723434448242187', 'Passenger_count': '1', 'Payment_type': '.80', 'Store_and_fwd_flag': '5', 'Pickup_latitude': '0.5', 'Trip_type ': '0.5', 'Lpep_dropoff_datetime': '1.25', 'RateCodeID': '0', 'Trip_distance': '', 'Fare_amount': '0.3', 'Pickup_longitude': '7.55', 'Dropoff_latitude': '1', 'Tolls_amount': '1', 'improvement_surcharge': '', 'Tip_amount': '', None: None, 'Ehail_fee': None, 'MTA_tax': None, 'Extra': None, 'Dropoff_longitude': None}
Traceback (most recent call last):
  File "/Users/paulaceccon/Documents/Projetos/NYCNoise/Scripts/noiseInference.py", line 490, in <module>
    taxi_dropoffs = getTaxiTrips(date)
  File "/Users/paulaceccon/Documents/Projetos/NYCNoise/Scripts/noiseInference.py", line 300, in getTaxiTrips
{'VendorID': '2', 'Total_amount': '2015-06-01 00:00:16', 'lpep_pickup_datetime': '2015-06-01 00:10:29', 'Passenger_count': 'N', 'Payment_type': '1', 'Store_and_fwd_flag': '-73.939163208007812', 'Pickup_latitude': '40.816555023193359', 'Trip_type ': '-73.938468933105469', 'Lpep_dropoff_datetime': '40.796218872070313', 'RateCodeID': '1', 'Trip_distance': '1.94', 'Fare_amount': '9.5', 'Pickup_longitude': '0.5', 'Dropoff_latitude': '0.5', 'Tolls_amount': '0', 'improvement_surcharge': '0', 'Tip_amount': '', None: [''], 'Ehail_fee': '1', 'MTA_tax': '2', 'Extra': '10.8', 'Dropoff_longitude': ''}
    result = pool.map(consumeTaxiData, data)
  File "/usr/local/Cellar/python/2.7.11/Frameworks/Python.framework/Versions/2.7/lib/python2.7/multiprocessing/pool.py", line 251, in map
    return self.map_async(func, iterable, chunksize).get()
  File "/usr/local/Cellar/python/2.7.11/Frameworks/Python.framework/Versions/2.7/lib/python2.7/multiprocessing/pool.py", line 567, in get
    raise self._value
ValueError: time data '40.796218872070313' does not match format '%Y-%m-%d %H:%M:%S'

我想知道造成这种情况的原因,为什么它在5号dict打印线上弄乱了,我怎么能解决它。

1 个答案:

答案 0 :(得分:0)

您还应该检查

from dateutil.parser import parse

def is_date(string):
    try: 
        parse(string)
        return true
    except ValueError:
        return false

然后

if is_date(time) and latitude is not None and longitude is not None:
   if datetime.strptime(time, '%Y-%m-%d %H:%M:%S') >= datetime.strptime(date, '%Y-%m-%d'):
        time = roundTime(datetime.strptime(time, '%Y-%m-%d %H:%M:%S'), roundTo=60 * 60).hour
        points.append((longitude, latitude, time))   

它将解决错误

ValueError: time data '40.796218872070313' does not match format '%Y-%m-%d %H:%M:%S'