Avoiding "Memory Error" when working with large arrays

Date: 2016-06-15 19:54:55

Tags: python arrays numpy memory out-of-memory

I sometimes hit a MemoryError; sometimes the code runs fine and sometimes the error pops up, especially when trying to subtract a large array from one. I have tried several ways of doing this subtraction. Is there any way to avoid the error, and could other parts of my code raise it as well?

Here is my code:

# Imports this view relies on (the code uses both the `numpy` and `np` aliases)
import csv
import io

import numpy
import numpy as np
import pandas as pd
from django.conf import settings
from django.shortcuts import render
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Normalizer

# UploadForm, Upload, number_cluster and send_file come from elsewhere in the
# project (forms, models and helper views) and are assumed to be importable here.

def home(request):
    if request.method=="POST":
        img = UploadForm(request.POST, request.FILES)
        no_clus = int(request.POST.get('num_clusters', 10))
        if img.is_valid():

            paramFile =io.TextIOWrapper(request.FILES['pic'].file)
            portfolio1 = csv.DictReader(paramFile)

            users = []

            users = [row["BASE_NAME"] for row in portfolio1]
            print(len(users))

            my_list = users
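            # Build a bag-of-words document-term matrix from the names and reduce it
            # to 100 latent dimensions with LSA (TruncatedSVD), then L2-normalize the rows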
            vectorizer = CountVectorizer()
            dtm = vectorizer.fit_transform(my_list)

            lsa = TruncatedSVD(n_components=100)
            dtm_lsa = lsa.fit_transform(dtm)
            dtm_lsa = Normalizer(copy=False).fit_transform(dtm_lsa)
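            # Pairwise distance matrix (1 - cosine similarity), size k x k;
            # this allocation is where the MemoryError is raised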
            dist1 = (1- np.asarray(numpy.asmatrix(dtm_lsa) * numpy.asmatrix(dtm_lsa).T))
            # print(1-similarity)
            k = len(my_list)
         #   dist1 = (1- similarity)
            # dist1=similarity
            # dist1.astype(float)
            #print(dist1)
            # print(cosine_similarity(tfidf_matrix[3:4], tfidf_matrix))
            # float dist = 1 - similarity;
            data2 = np.asarray(dist1)
            arr_3d = data2.reshape((1, k, k))
            # arr_3d= 1- arr_3d
            #print(arr_3d)

            no_cluster = number_cluster(len(my_list))
            print(no_cluster)
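            # arr_3d has shape (1, k, k), so this loop runs exactly once over the full distance matrix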
            for i in range(len(arr_3d)):
                # print (i+1910)
                # km = AgglomerativeClustering(n_clusters=no_clus, linkage='ward').fit(arr_3d[i])
                km = AgglomerativeClustering(n_clusters=no_cluster, linkage='average').fit(arr_3d[i])
                # km = AgglomerativeClustering(n_clusters=no_clus, linkage='complete').fit(arr_3d[i])
                # km = MeanShift()
                # km = KMeans(n_clusters=no_clus, init='k-means++')
                # km = MeanShift()
                #  km = km.fit(arr_3d[i])
                # print km
                labels = km.labels_

            csvfile = settings.MEDIA_ROOT +'\\'+ 'images\\export.csv'

            csv_input = pd.read_csv(csvfile, encoding='latin-1')
            csv_input['cluster_ID'] = labels
            csv_input['BASE_NAME'] = my_list
            csv_input.to_csv(settings.MEDIA_ROOT +'/'+ 'output.csv', index=False)
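            # Group the names belonging to each cluster into one string per cluster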
            clus_groups = list()
            for j in range(no_cluster):
                # print(" cluster no %i:%s" % (j, [my_list[i] for i, x in enumerate(labels) if x == j]))
                list_of_ints = ([my_list[i] for i, x in enumerate(labels) if x == j])
                clus_groups.append('  '.join(list_of_ints))
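            # Second pass: rebuild the LSA distance matrix and cluster again,
            # this time with KMeans and the user-requested number of clusters (no_clus)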
            vectorizer = CountVectorizer()
            dtm = vectorizer.fit_transform(my_list)

            lsa = TruncatedSVD(n_components=100)
            dtm_lsa = lsa.fit_transform(dtm)
            dtm_lsa = Normalizer(copy=False).fit_transform(dtm_lsa)
            dist1 = (1 - np.asarray(numpy.asmatrix(dtm_lsa) * numpy.asmatrix(dtm_lsa).T))
           # similarity = np.asarray(numpy.asmatrix(dtm_lsa) * numpy.asmatrix(dtm_lsa).T)
            k = len(my_list)
          #  dist1 = 1 - similarity

            data2 = np.asarray(dist1)
            arr_3d = data2.reshape((1, k, k))
            # arr_3d= 1- arr_3d

            #no_clus = 5
           # no_clus=get_name(request)
            for i in range(len(arr_3d)):
                # print (i+1910)
                # km = AgglomerativeClustering(n_clusters=no_clus, linkage='ward').fit(arr_3d[i])
                # km = AgglomerativeClustering(n_clusters=no_clus, linkage='average').fit(arr_3d[i])
                # km = AgglomerativeClustering(n_clusters=no_clus, linkage='complete').fit(arr_3d[i])
                km = KMeans(n_clusters=no_clus, init='k-means++')
                km = km.fit(arr_3d[i])
                # print km
                labels2 = km.labels_
                # error = km.inertia_
                print(labels2)

            labels = labels.tolist()
            labels2 = labels2.tolist()
            # new=list()


            csv_input = pd.read_csv(settings.MEDIA_ROOT +'/'+ 'output.csv',encoding='latin-1')
            labels1 = csv_input['cluster_ID']
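            # For each row, look up the second-pass label at the index given by its first-pass cluster ID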
            new_list = []
            for k in labels1:
                new_list.append(labels2[k])  # lookup the value in list2 at the index given by list1

            print(new_list)
            print(len(new_list))
            csv_input = pd.read_csv(settings.MEDIA_ROOT +'/'+ 'output.csv',encoding='latin-1')
            csv_input['cluster_ID'] = labels
            csv_input['BASE_NAME'] = my_list
            csv_input['User_Map'] = new_list
            csv_input.to_csv(settings.MEDIA_ROOT + '/' + 'output1.csv', index=False)
            #filename= settings.MEDIA_ROOT +'/'+ 'output.csv'
            send_file(request)
           # my_list = portfolio
            #save_file('output1.csv')
          #  csv(request)
          #  return HttpResponseRedirect(reverse('labels'))
            return render(request, 'new.html', {'labels': labels})
    else:
        img=UploadForm()
    images=Upload.objects.all()
    return render(request,'new.html',{'form':img,'images':images})

The error is raised while executing dist1 = (1- np.asarray(numpy.asmatrix(dtm_lsa) * numpy.asmatrix(dtm_lsa).T)). I also tried creating a new array of the same size and then doing the subtraction, without success. How should I modify this to prevent the error? Note that the UI that runs this code may be used on any PC!

1 Answer:

Answer 0 (score: 0)

I'm not sure, but on the offending line you call numpy.asmatrix(dtm_lsa), which is a function call that allocates memory.

You do that twice, so twice as much memory is allocated (it is garbage-collected eventually, but in some cases that happens too late).

(No offence meant: this is a common pitfall with mathematical formulas, which often have to be restructured when they are turned into code.)

I suggest replacing that line with these lines:

temp_matrix = numpy.asmatrix(dtm_lsa)
product = temp_matrix * temp_matrix.T
# maybe force garbage collection at this point: import gc; gc.collect()
dist1 = 1 - np.asarray(product)

That way you 1) avoid copy/paste duplication and 2) avoid several large matrix allocations on a single line.
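A possible further step, sketched here under the assumption that dtm_lsa is a plain dense 2-D NumPy array (which is what TruncatedSVD.fit_transform returns), is to skip the asmatrix conversions entirely and do the subtraction in place, so no extra full-size copy of the k x k matrix is created:

import numpy as np

product = dtm_lsa @ dtm_lsa.T            # Gram matrix as an ordinary ndarray, no asmatrix copies
np.subtract(1.0, product, out=product)   # compute 1 - product in place, reusing the same buffer
dist1 = product

Whether this is enough depends on k: the distance matrix itself holds k x k floats, so for very large inputs it may simply not fit in memory regardless of how it is computed.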