用迭代方式创建 ODM = np.zeros((n,n)) 时出现 MemoryError

时间:2017-12-10 07:00:30

标签: python numpy out-of-memory

  import networkx as nx
  import matplotlib
  import matplotlib.pyplot as plt
  import matplotlib.cm as cm
  import gc
  import numpy as np
  from mpl_toolkits.basemap import Basemap
  from math import radians, cos, sin, asin, sqrt
  import pandas as pd
  from sklearn import preprocessing
  #some modules are unused

# Load the tab-separated trajectory table.
df_raw = pd.read_csv("AllUserTraj.csv", sep='\t')
# Drop the 'Unnamed: 0' column. `axis` is passed by keyword because
# pandas 2.0 no longer accepts it as a positional argument.
df_raw = df_raw.drop('Unnamed: 0', axis=1) # You may start from df_raw
# Keep only the columns listed here for the rest of the script.
df = df_raw[['Latitude','Longitude','UserID','Domestic','Weekday','WaitingTime','DayHour']]

df 包含以下列:'Latitude'(纬度)、'Longitude'(经度)、'UserID'(用户 ID)、'Domestic'(0 或 1)、'Weekday'(0 或 1)、'WaitingTime'(单位:分钟)、'DayHour'(取值 1 到 24)。

def haversine(lon1, lat1, lon2, lat2):
    """Return the great-circle distance in kilometres between two points.

    Args:
        lon1: Longitude of the first point, in decimal degrees.
        lat1: Latitude of the first point, in decimal degrees.
        lon2: Longitude of the second point, in decimal degrees.
        lat2: Latitude of the second point, in decimal degrees.

    Returns:
        The distance along the surface of a sphere of radius 6371 km.
    """
    # Convert decimal degrees to radians.
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # Haversine formula.
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally above 1 for
    # near-antipodal points, which would make asin() raise ValueError.
    c = 2 * asin(sqrt(min(1.0, a)))
    r = 6371  # Radius of earth in kilometers. Use 3956 for miles
    return c * r

def ODM_cal(df_user, ID):
    """Build the origin-destination and distance matrices for one trajectory.

    Distinct (Latitude, Longitude) pairs of ``df_user`` become locations.
    They are numbered 0..n-1 in descending order of how many rows fall on
    them, and that numbering indexes every returned structure.

    Memory note: ``ODM`` and ``DistanceM`` are dense ``n x n`` arrays of
    NumPy's default float type, so each needs about ``8 * n**2`` bytes.
    With many distinct coordinates this allocation is what raises
    ``MemoryError``.

    Args:
        df_user: DataFrame with at least the columns 'Latitude',
            'Longitude' and 'UserID'; rows are walked in their stored
            order as consecutive visits.
        ID: Not used by the body; kept so existing callers keep working.

    Returns:
        A tuple ``(aggregate_features, frequency, ODM, DistanceM)``:

        * ``aggregate_features`` - DataFrame with columns 'UserID' (row
          count per location), 'Latitude' and 'Longitude', one row per
          location, indexed 0..n-1.
        * ``frequency`` - dict mapping a location index to the number of
          times it was reached by a row other than the first one.
        * ``ODM`` - ``ODM[o, d]`` counts consecutive rows that move from
          location ``o`` to a different location ``d``.
        * ``DistanceM`` - haversine distance for every pair ``i <= j``;
          entries below the diagonal stay 0.

    Raises:
        KeyError: If a row's coordinates are not among the grouped
            locations (for example NaN, which ``groupby`` leaves out).
    """
    visit_counts = df_user.groupby(['Latitude', 'Longitude'])['UserID'].count()
    n = len(visit_counts)
    # One row per location, most visited first, renumbered from 0.
    aggregate_features = visit_counts.reset_index()
    aggregate_features = aggregate_features[['UserID', 'Latitude', 'Longitude']]
    aggregate_features = aggregate_features.sort_values('UserID', ascending=False)
    aggregate_features = aggregate_features.reset_index(drop=True)

    latitudes = aggregate_features['Latitude'].tolist()
    longitudes = aggregate_features['Longitude'].tolist()
    # Coordinates -> location index, so each row is resolved with a dict
    # lookup instead of scanning the whole location table.
    location_of = {
        coords: position
        for position, coords in enumerate(zip(latitudes, longitudes))
    }

    ODM = np.zeros((n, n))
    frequency = dict()
    previous_index = None
    for coords in zip(df_user['Latitude'], df_user['Longitude']):
        location_index = location_of[coords]
        # The first row only sets the starting point: it is neither a trip
        # nor counted in `frequency`.
        if previous_index is not None:
            if location_index != previous_index:
                ODM[previous_index, location_index] += 1  # Count trip number
            frequency[location_index] = frequency.get(location_index, 0) + 1
        previous_index = location_index

    # Calculate distance matrix (upper triangle including the diagonal).
    DistanceM = np.zeros((n, n))
    for i in range(n):
        for j in range(i, n):
            DistanceM[i, j] = haversine(
                longitudes[i], latitudes[i], longitudes[j], latitudes[j]
            )
    # Return only after every pair has been filled in.
    return (aggregate_features, frequency, ODM, DistanceM)

我用 np.zeros 创建了一个矩阵,但运行时出现了 MemoryError,报错信息如下:

"ODM = np.zeros((n,n))
    MemoryError"

我想问: (1)实际上有什么问题? (2)如何避免?

我用 Python 3.6(64 位)和 NumPy(64 位)运行它,我相信环境不是问题。

可能是迭代占用了太多内存?该如何优化?使用 gc 模块有帮助吗?

df是从csv文件收集的数据帧,包含一些时空信息。

告诉我细节,谢谢!

0 个答案:

没有答案