我正在把自行车租赁数据整合到一个存储在HDF文件中的DataFrame里,并想了解添加日期列会使文件增大多少。令我惊讶的是,文件并没有变大:为日期添加一列之后,生成的文件反而更小。
以下是我的代码和一些实验结果。在我看来,后两个文件的大小合乎逻辑,但第一个文件的大小令人意外。为什么在添加了新数据之后,文件反而变小了?
In [1]:
import pandas as pd
import re
import os
from requests import get
from bs4 import BeautifulSoup, SoupStrainer
def get_data_links():
    """Return the station snapshot names listed at the citybikes index page.

    Downloads http://dev.hsl.fi/tmp/citybikes/ and collects the ``href`` of
    every anchor whose value starts with ``stations_yyyymmddThhmmssZ``.
    ``re.match`` anchors only at the start of the string, so an ``href`` with
    extra trailing characters is returned as well.

    Returns:
        list of str: the matching ``href`` values, in the order ``find_all``
        yields them.
    """
    res = get('http://dev.hsl.fi/tmp/citybikes/')
    soup = BeautifulSoup(res.content, 'lxml')
    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    pattern = re.compile(r'stations_\d{8}T\d{6}Z')
    # href=True skips anchors that carry no href attribute; indexing those
    # with link['href'] would raise KeyError.
    return [
        link['href']
        for link in soup.find_all('a', href=True)
        if pattern.match(link['href'])
    ]
def save_new_data(new_data, filename):
    """Append ``new_data`` to the ``data`` frame stored in an HDF file.

    Opens (or creates) the HDF store at ``filename``, reads the frame stored
    under ``data`` if one exists, concatenates ``new_data`` after it and
    writes the combined frame back under the same key.

    Args:
        new_data: DataFrame with the rows to add.
        filename: path of the HDF file to update.
    """
    # The context manager closes the store even when reading or writing
    # raises, so the file handle is never leaked.
    with pd.HDFStore(filename) as store:
        if '/data' in store.keys():
            existing = store['data']
        else:
            # Start from an empty frame in memory instead of writing a
            # throwaway empty frame to disk first.
            existing = pd.DataFrame()
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        store['data'] = pd.concat([existing, new_data])
def get_data(link_list, with_date):
    """Download station snapshots and stack them into a single DataFrame.

    Each entry of ``link_list`` is appended to
    ``http://dev.hsl.fi/tmp/citybikes/`` and read with ``pd.read_json``; the
    ``result`` column of the parsed JSON is expanded into one column per
    field. A link that cannot be read or expanded is counted as bad and
    skipped. A short summary (result shape, good/bad file counts) is printed.

    Args:
        link_list: iterable of file names relative to the citybikes index.
        with_date: when true, add a ``date`` column holding the timestamp
            parsed from the part of the link after the first underscore.

    Returns:
        DataFrame with the rows of all readable files; an empty DataFrame
        when no file could be read.
    """
    frames = []
    bad_data = 0
    for link in link_list:
        filename = 'http://dev.hsl.fi/tmp/citybikes/' + link
        try:
            new_data = pd.read_json(filename)
            new_data = new_data['result'].apply(pd.Series)
            if with_date:
                new_data['date'] = pd.Timestamp(link.split("_")[1])
        except Exception:
            # Best effort: skip unreadable or malformed files, but let
            # KeyboardInterrupt / SystemExit propagate (a bare except would
            # swallow them too).
            bad_data += 1
            continue
        frames.append(new_data)
    # Concatenate once at the end: appending inside the loop recopies the
    # accumulated frame every iteration, and DataFrame.append no longer
    # exists in pandas 2.0. pd.concat rejects an empty list, hence the guard.
    data = pd.concat(frames) if frames else pd.DataFrame()
    print("Result shape: " + str(data.shape))
    print("Number of good files: " + str(len(link_list) - bad_data))
    print("Number of bad files: " + str(bad_data))
    return data
def print_shape_size(filename):
    """Print the shape of the stored ``data`` frame and the file's size.

    Args:
        filename: path of an HDF file that holds a frame under ``data``.
    """
    # The context manager closes the store even if reading 'data' raises.
    with pd.HDFStore(filename) as store:
        shape = store['data'].shape
        # Size is taken while the store is still open, as before.
        size = os.path.getsize(filename)
        print(filename + " data shape: " + str(shape) + ", size: " + str(size))
# Experiment driver: build the same data set with and without a date column,
# store three variants as HDF files and report each file's shape and size.
links = get_data_links()
sample_links = links[:1000]
data_without_dates = get_data(sample_links, False)
data_with_dates = get_data(sample_links, True)

save_new_data(data_without_dates, 'data_without_dates.h5')
save_new_data(data_with_dates, 'data_with_dates.h5')
# Third variant: the dated frame with its date column removed again.
dates_dropped = data_with_dates.drop('date', axis=1)
save_new_data(dates_dropped, 'data_with_dates_dropped.h5')

for h5_name in ('data_without_dates.h5',
                'data_with_dates.h5',
                'data_with_dates_dropped.h5'):
    print_shape_size(h5_name)
结果:
Out[1]:
Result shape: (136177, 7)
Number of good files: 958
Number of bad files: 42
Result shape: (136177, 8)
Number of good files: 958
Number of bad files: 42
data_without_dates.h5 data shape: (136177, 7), size: 12964632
data_with_dates.h5 data shape: (136177, 8), size: 10253592
data_with_dates_dropped.h5 data shape: (136177, 7), size: 9559352
(Python 3.6.1,Pandas 0.20.1)