import networkx as nx
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import gc
import numpy as np
from mpl_toolkits.basemap import Basemap
from math import radians, cos, sin, asin, sqrt
import pandas as pd
from sklearn import preprocessing
# NOTE: some of the modules imported above are not used in this script.
# Load the raw trajectory table (tab-separated file).
df_raw = pd.read_csv("AllUserTraj.csv", sep='\t')
# Remove the 'Unnamed: 0' column (presumably an index column saved by an
# earlier export -- confirm). The keyword form is used because pandas >= 2.0
# no longer accepts `axis` as a positional argument to DataFrame.drop.
df_raw = df_raw.drop(columns='Unnamed: 0')  # You may start from df_raw
# Keep only the columns used by the analysis below.
df = df_raw[['Latitude', 'Longitude', 'UserID', 'Domestic', 'Weekday', 'WaitingTime', 'DayHour']]
def haversine(lon1, lat1, lon2, lat2, radius=6371):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).

    Parameters
    ----------
    lon1, lat1 : float
        Longitude and latitude of the first point, in decimal degrees.
    lon2, lat2 : float
        Longitude and latitude of the second point, in decimal degrees.
    radius : float, optional
        Radius of the sphere; the result is expressed in the same unit.
        Defaults to 6371 (radius of earth in kilometers). Use 3956 for miles.

    Returns
    -------
    float
        Great circle distance between the two points, in the unit of
        ``radius`` (kilometers by default).
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally above 1 for
    # (near-)antipodal points, which would make asin() raise ValueError.
    # An explicit comparison is used so that NaN input still yields NaN.
    if a > 1.0:
        a = 1.0
    c = 2 * asin(sqrt(a))
    return c * radius
def ODM_cal(df_user, ID):
    """
    Build the location table, visit counts, origin-destination matrix and
    pairwise distance matrix for a set of trajectory records.

    Parameters
    ----------
    df_user : pandas.DataFrame
        Records with at least the columns 'Latitude', 'Longitude' and
        'UserID'. Rows are processed in their stored order and every pair of
        consecutive rows is treated as one move.
        NOTE(review): presumably the rows of a single user -- confirm.
    ID :
        Not used by the computation; kept so existing callers keep working.

    Returns
    -------
    tuple
        ``(aggregate_features, frequency, ODM, DistanceM)`` where

        * ``aggregate_features`` is a DataFrame with one row per distinct
          (Latitude, Longitude) pair, columns 'UserID' (number of records at
          that location), 'Latitude' and 'Longitude', sorted by 'UserID'
          in descending order. Its 0..n-1 index is the location index used
          by the other three results.
        * ``frequency`` maps location index -> number of rows at that
          location.
        * ``ODM`` is an (n, n) array; ``ODM[o, d]`` counts consecutive rows
          that go from location ``o`` to a different location ``d``.
        * ``DistanceM`` is an (n, n) array whose upper triangle holds the
          haversine distance between locations ``i`` and ``j`` (``i < j``);
          the diagonal and the lower triangle stay 0.

    Raises
    ------
    KeyError
        If a row's coordinates do not match any grouped location (for
        example missing values, which ``groupby`` drops).

    Notes
    -----
    ``ODM`` and ``DistanceM`` are dense float64 arrays, i.e. roughly
    ``16 * n**2`` bytes together for ``n`` distinct locations. A very large
    ``n`` therefore ends in ``MemoryError`` at ``np.zeros((n, n))``.
    """
    # One row per distinct coordinate pair; 'UserID' holds the record count.
    aggregate_features = (
        df_user.groupby(['Latitude', 'Longitude'])['UserID']
        .count()
        .reset_index()
    )
    aggregate_features = aggregate_features[['UserID', 'Latitude', 'Longitude']]
    aggregate_features = aggregate_features.sort_values('UserID', ascending=False)
    aggregate_features = aggregate_features.reset_index(drop=True)
    n = len(aggregate_features)

    # Plain Python lists avoid per-element pandas indexing in the loops below.
    lats = aggregate_features['Latitude'].tolist()
    lons = aggregate_features['Longitude'].tolist()
    # (lat, lon) -> location index, so each record is resolved with one dict
    # lookup instead of a scan over the whole location table.
    location_of = {coords: idx for idx, coords in enumerate(zip(lats, lons))}

    ODM = np.zeros((n, n))
    frequency = dict()
    previous_index = None  # no origin exists before the first record
    for coords in zip(df_user['Latitude'].tolist(), df_user['Longitude'].tolist()):
        location_index = location_of[coords]
        if previous_index is not None and location_index != previous_index:
            ODM[previous_index, location_index] += 1  # Count trip number
        previous_index = location_index
        frequency[location_index] = frequency.get(location_index, 0) + 1

    ## Calculate distance matrix
    DistanceM = np.zeros((n, n))
    for i in range(n):
        # The distance from a location to itself is 0, which np.zeros already
        # provides, so the diagonal is skipped.
        for j in range(i + 1, n):
            DistanceM[i, j] = haversine(lons[i], lats[i], lons[j], lats[j])
    return (aggregate_features, frequency, ODM, DistanceM)
我用 np.zeros 创建了一个矩阵，但运行时抛出了 MemoryError，报错信息如下：
"ODM = np.zeros((n,n))
MemoryError"
我想问：(1) 问题的实际原因是什么？(2) 如何避免？
我使用 64 位的 Python 3.6 和 64 位的 numpy 运行这段代码，所以我认为运行环境不是问题。
是不是循环迭代占用了太多内存？应该如何优化？用 gc 模块可以解决吗？
df 是从 csv 文件读取的 DataFrame，包含一些时空信息。
请告诉我详细的原因，谢谢！