如何将大文本矩阵转换为稀疏格式

时间:2016-05-05 18:09:12

标签: python memory sparse-matrix

我有100万* 4096矩阵作为文本文件。我想将其转换为稀疏矩阵格式以节省内存并进行一些计算。我写了下面的代码,但由于内存问题而被杀死。我的电脑有32GB内存和8GB交换。这还不够。有人可以给出一些修改建议。感谢。

import numpy as np
from scipy.sparse import csr_matrix
from datetime import datetime as time
from scipy import sparse, io
from numpy import array
import os, sys, getopt

def process(input_file_name):
    crt_row_no = 0 # Current row number
    crt_col_no = 0 # Current column number
    row = []
    col = []
    data = []
    count = 0
    line = 0
    with open(input_file_name) as infile:
        for crt_line in infile:  
            value = crt_line.split(' ')
            for v in value:
                float_v = float(v)
                #print float_v
                if float_v != 0:
                    row.append(crt_row_no)
                    col.append(crt_col_no)
                    data.append(float_v)
                    count += 1
                    if count%1000 == 0:
                        print count, line 
                crt_col_no +=1
            crt_row_no += 1
            crt_col_no = 0
            line += 1
    return sparse.coo_matrix((data,(row,col))).tocsr()

def main(argv):
    inputfile = ''
    outputfile = ''

    try:
        opts, args = getopt.getopt(argv,"hi:o:",["ifile=","ofile="])
    except getopt.GetoptError:
        print 'python <script name> -i <inputfile> -o <outputfile>'
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print 'python <script name> -i <inputfile> -o <outputfile>'
            sys.exit()
        elif opt in ("-i"):
            inputfile = arg
        elif opt in ("-o"):
            outputfile = arg

    print 'Reading images from "', inputfile
    print 'Writing vectors to "', outputfile

    input_file_name = inputfile

    """ ################### START PROGRAM ############################ """  

    print "--------------STRAT-----------------"
    running_time = time.now()
    data_matrix = process(input_file_name)
    print "Total time:" + str(time.now() - running_time)
    print "Writing to file."
    io.mmwrite(outputfile, data_matrix)

if __name__ == '__main__':   
    main(sys.argv[1:])

0 个答案:

没有答案