可视化矩阵

时间:2016-03-07 23:43:23

标签: apache-spark pyspark

使用 Python 可视化 org.apache.spark.mllib.linalg.Matrix / DenseMatrix 内容的最佳方法是什么?

例如混淆矩阵(confusionMatrix):

from pyspark.mllib.linalg import Vectors, DenseMatrix 

# DenseMatrix takes its values column by column, so each line below is one
# column of the resulting 3x3 matrix.
m = DenseMatrix(
    3,
    3,
    [
        33347.0, 244.0, 16.0,    # column 0
        2935.0, 1014.0, 19.0,    # column 1
        3503.0, 124.0, 12.0,     # column 2
    ],
)
print(m)
# Bare expression: in a notebook this echoes the NumPy array as the cell output.
m.toArray()

DenseMatrix([[  3.33470000e+04,   2.93500000e+03,   3.50300000e+03],
             [  2.44000000e+02,   1.01400000e+03,   1.24000000e+02],
             [  1.60000000e+01,   1.90000000e+01,   1.20000000e+01]])
Out[58]:
array([[  3.33470000e+04,   2.93500000e+03,   3.50300000e+03],
       [  2.44000000e+02,   1.01400000e+03,   1.24000000e+02],
       [  1.60000000e+01,   1.90000000e+01,   1.20000000e+01]])

1 个答案:

答案 0 :(得分:1)

可以使用 seaborn 的热力图(heatmap)来可视化:

from pyspark.mllib.linalg import Vectors, DenseMatrix 
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
# IPython/Jupyter magic (not plain Python): render matplotlib figures inline in the notebook.
%matplotlib inline
# Switch on seaborn's default plot styling for all subsequent figures.
sns.set() 


    from pyspark.mllib.linalg import Vectors, DenseMatrix 
import numpy as np

def display_cm(m, labels=None):
    """Plot a row-normalised heatmap of a confusion matrix.

    Prints the raw array, the matrix object and the row-normalised array,
    then draws the normalised values as an annotated seaborn heatmap on a
    new 3x3-inch matplotlib figure titled 'Confusion Matrix'.

    Args:
        m: Matrix-like object whose ``toArray()`` returns a 2-D NumPy array
            (for example a pyspark ``DenseMatrix``).
        labels: Optional sequence of tick labels applied to both axes.
            When omitted, the labels are the indices ``'0'``, ``'1'``, ...
            derived from the matrix shape.
    """
    # ``np.float`` was only an alias of the builtin ``float`` and was removed
    # in NumPy 1.24, so use the builtin directly.
    counts = m.toArray().astype(float)
    print(counts)
    print(m)

    # keepdims=True yields an (n, 1) column so each row is divided by its
    # own total when broadcast against the (n, n) counts.
    row_totals = counts.sum(axis=1, keepdims=True)
    # Rows whose total is zero are left at 0 instead of becoming NaN.
    percentage_matrix = np.divide(
        counts,
        row_totals,
        out=np.zeros_like(counts),
        where=row_totals != 0,
    )
    print(percentage_matrix)

    if labels is None:
        y_labels = [str(i) for i in range(counts.shape[0])]
        x_labels = [str(j) for j in range(counts.shape[1])]
    else:
        x_labels = y_labels = list(labels)

    plt.figure(figsize=(3, 3))
    sns.heatmap(
        percentage_matrix,
        annot=True,
        fmt='.2f',
        xticklabels=x_labels,
        yticklabels=y_labels,
    )
    plt.title('Confusion Matrix')


    # DenseMatrix fills its values column by column, so they are listed in
    # column-major order to rebuild the confusion matrix from the question:
    #   [[33347, 2935, 3503],
    #    [  244, 1014,  124],
    #    [   16,   19,   12]]
    # (Listing them row by row would plot the transpose instead.)
    m = DenseMatrix(3, 3, [
        33347.0, 244.0, 16.0,    # column 0
        2935.0, 1014.0, 19.0,    # column 1
        3503.0, 124.0, 12.0,     # column 2
    ])

    display_cm(m)

(图片:display_cm 绘制的混淆矩阵热力图)