当我尝试使用大量数据运行我的代码时,我在all_rows = [[x[0], x[1]] for x in cur]
中得到了一个内存错误。我有200M行。我怎么能避免它?
binwidth = 1
latitudes = []
userids = []
info = []
densities = []
with lite.connect(databasepath) as con:
cur = con.execute('SELECT latitude, userid FROM dynamicMessage WHERE latitude>45')
print "executed"
all_rows = [[x[0], x[1]] for x in cur]
all_rows = sorted(all_rows, key=itemgetter(0))
print "sorted"
for x in all_rows:
latitudes.append(x[0])
userids.append(x[1])
min_lat = -100
max_lat = 100
binwidth = 1
bin_range = np.arange(min_lat,max_lat,binwidth)
binned_latitudes = np.digitize(latitudes,bin_range)
all_in_bins = zip(binned_latitudes,userids)
unique_in_bins = list(set(all_in_bins))
all_in_bins.sort()
unique_in_bins.sort()
bin_count_all = []
for bin, group in groupby(all_in_bins, lambda x: x[0]):
bin_count_all += [(bin, len([k for k in group]))]
bin_count_unique = []
for bin, group in groupby(unique_in_bins, lambda x: x[0]):
bin_count_unique += [(bin, len([ k for k in group]))]
bin_density = [(bin_range[b-1],a*1.0/u) for ((b,a),(_,u)) in zip(bin_count_all, bin_count_unique)]
bin_density = np.array(bin_density).transpose()
# all_in_bins and unique_in_bins now contain the data
# corresponding to the SQL / pseudocode in your question
# plot as standard bar - note you can put uneven widths in as an array-like here if necessary
plt.bar(*bin_density, width=binwidth)
plt.savefig('latlongstats'+'t'+str(time.strftime("%H:%M:%S")), format='png')