分析图像信息并将其保存在.arff文件中

时间:2020-05-16 18:12:21

标签: python pandas dataframe weka arff

  1. 我有一个图片文件夹(在下面链接)
  2. 对此进行一些分析并以此为基础制作数据框
  3. 我想创建一个arff文件,该文件在一行中包含单个图像信息。这意味着如果文件夹中有20张图像,那么那里将有20行。
import itertools
import numpy as np
import cv2
from collections import Counter
import pandas as pd
from sklearn import preprocessing
import arff
import itertools
import glob

def pandas2arff(df,filename,wekaname = "pandasdata",cleanstringdata=True,cleannan=True):
    """
    converts the pandas dataframe to a weka compatible file
    df: dataframe in pandas format
    filename: the filename you want the weka compatible file to be in
    wekaname: the name you want to give to the weka dataset (this will be visible to you when you open it in Weka)
    cleanstringdata: clean up data which may have spaces and replace with "_", special characters etc which seem to annoy Weka. 
                     To suppress this, set this to False
    cleannan: replaces all nan values with "?" which is Weka's standard for missing values. 
              To suppress this, set this to False
    """
    import re

    def cleanstring(s):
        if s!="?":
            return re.sub('[^A-Za-z0-9]+', "_", str(s))
        else:
            return "?"

    dfcopy = df #all cleaning operations get done on this copy


    if cleannan!=False:
        dfcopy = dfcopy.fillna(-999999999) #this is so that we can swap this out for "?"
        #this makes sure that certain numerical columns with missing values don't get stuck with "object" type

    f = open(filename,"w")
    arffList = []
    arffList.append("@relation " + wekaname + "\n")
    #look at each column's dtype. If it's an "object", make it "nominal" under Weka for now (can be changed in source for dates.. etc)
    for i in range(df.shape[1]):
        if dfcopy.dtypes[i]=='O' or (df.columns[i] in ["Class","CLASS","class"]):
            if cleannan!=False:
                dfcopy.iloc[:,i] = dfcopy.iloc[:,i].replace(to_replace=-999999999, value="?")
            if cleanstringdata!=False:
                dfcopy.iloc[:,i] = dfcopy.iloc[:,i].apply(cleanstring)
            _uniqueNominalVals = [str(_i) for _i in np.unique(dfcopy.iloc[:,i])]
            _uniqueNominalVals = ",".join(_uniqueNominalVals)
            _uniqueNominalVals = _uniqueNominalVals.replace("[","")
            _uniqueNominalVals = _uniqueNominalVals.replace("]","")
            _uniqueValuesString = "{" + _uniqueNominalVals +"}" 
            arffList.append("@attribute " + df.columns[i] + _uniqueValuesString + "\n")
        else:
            arffList.append("@attribute " + df.columns[i] + " real\n") 
            #even if it is an integer, let's just deal with it as a real number for now
    arffList.append("@data\n")           
    for i in range(dfcopy.shape[0]):#instances
        _instanceString = ""
        for j in range(df.shape[1]):#features
                if dfcopy.dtypes[j]=='O':
                    _instanceString+="\"" + str(dfcopy.iloc[i,j]) + "\""
                else:
                    _instanceString+=str(dfcopy.iloc[i,j])
                if j!=dfcopy.shape[1]-1:#if it's not the last feature, add a comma
                    _instanceString+=","
        _instanceString+="\n"
        if cleannan!=False:
            _instanceString = _instanceString.replace("-999999999.0","?") #for numeric missing values
            _instanceString = _instanceString.replace("\"?\"","?") #for categorical missing values
        arffList.append(_instanceString)
    f.writelines(arffList)
    f.close()
    del dfcopy
    return True


labels = ['h0','h1','h2','h3','h4','h5','h6','h7','h8','h9','s0','s1','s2','s3','s4','s5','s6','s7','s8','s9','v0','v1','v2','v3','v4','v5','v6','v7','v8','v9']
df_HSV = pd.DataFrame(columns=labels)
i=0

def main():
    trainDset = 'NTU_Color_HandImages/*.jpg'
    for file in glob.glob(trainDset):
        #read image
        img = cv2.imread(file)

        #convert in HSV
        hsv = cv2.cvtColor(img,cv2.COLOR_BGR2HSV)
        #convert in YCrCb
        YCrCb = cv2.cvtColor(img, cv2.COLOR_RGB2YCrCb)

        '''cv2.imshow('Original image',img)
        cv2.imshow('converted image',hsv)
        cv2.imshow('converted image',YCrCb)
        '''

        #Splitting H,S,V channel values
        H, S, V = cv2.split(hsv)
        Y, Cr, Cb = cv2.split(YCrCb)
        '''cv2.imshow('H image',H)
        cv2.imshow('S image',S)
        cv2.imshow('V image', V)'''

        '''cv2.imshow('Y image',Y)
        cv2.imshow('Cr image',Cr)
        cv2.imshow('Cb image', Cb)'''

        #Normalising H S V values in 0 to 1 range
        norm_image_H = cv2.normalize(H, None, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)
        norm_image_S = cv2.normalize(S, None, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)
        norm_image_V = cv2.normalize(V, None, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)

        #Normalising Y Cr Cb values in 0 to 1 range
        norm_image_Y = cv2.normalize(Y, None, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)
        norm_image_Cr = cv2.normalize(Cr, None, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)
        norm_image_Cb = cv2.normalize(Cb, None, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)


        '''cv2.imshow('Y normalised image', norm_image_Y)

        cv2.imshow('Cr normalised image', norm_image_Cr)

        cv2.imshow('Cb normalised image', norm_image_Cb)
        '''
        #Getting the pixels where the it is not black for H and S . basically they have black background and we are eliminating it
        rows,cols = norm_image_H.shape
        rows_Y,cols_Y = norm_image_Y.shape

        '''print(rows,rows_Y)
        print(cols,cols_Y)'''

        non_zero_H=[]
        non_zero_S = []
        non_one_V = []

        non_zero_Cr = []
        non_zero_Cb=[]
        non_one_Y=[]

        for i in range(rows):
            for j in range(cols):
                k = norm_image_H[i,j]
                l = norm_image_S[i,j]
                m = norm_image_V[i,j]
                n = norm_image_Y[i,j]
                o = norm_image_Cb[i,j]
                p = norm_image_Cr[i,j]
                if k !=0:
                    non_zero_H.append(k)
                if l !=0:
                    non_zero_S.append(l)
                if m!=1:
                    non_one_V.append(m)
                if o !=0:
                    non_zero_Cb.append(o)
                if p !=0:
                    non_zero_Cr.append(p)
                if n != 1:
                    non_one_Y.append(n)

        pix_h = len(non_zero_H)
        pix_S = len(non_zero_S)
        pix_v = len(non_one_V)

        pix_Y = len(non_one_Y)
        pix_Cb = len(non_zero_Cb)
        pix_Cr = len(non_zero_Cr)

        # converting non zero values of normalised H, S, V image into a dataFrame
        df = pd.DataFrame(non_zero_H,columns=['pixels_H'])
        df_s = pd.DataFrame(non_zero_S,columns=['pixels_S'])
        df_v = pd.DataFrame(non_one_V,columns=['pixels_V'])

        # converting non zero values of normalised Y, Cb, Cr image into a dataFrame
        df_y = pd.DataFrame(non_one_Y,columns=['pixels_Y'])
        df_cb = pd.DataFrame(non_zero_Cb,columns=['pixels_Cb'])
        df_cr = pd.DataFrame(non_zero_Cr,columns=['pixels_Cr'])

        # Performing the creation of bins for H, S, V
        interval_range = pd.interval_range(start=0, freq=0.1, end=1.1)
        df['bins_H'] = pd.cut(df['pixels_H'], bins=interval_range, labels=[1,2,3])
        df_s['bins_S'] = pd.cut(df_s['pixels_S'], bins=interval_range, labels=[1,2,3])
        df_s['bins_V'] = pd.cut(df_v['pixels_V'], bins=interval_range, labels=[1,2,3])
        '''#print(df['bins_H'])'''
        #print(df)

        # Performing the creation of bins for Y, Cb,Cr
        interval_range = pd.interval_range(start=0, freq=0.1, end=1.1)
        df_y['bins_Y'] = pd.cut(df_y['pixels_Y'], bins=interval_range, labels=[1,2,3])
        df_cb['bins_Cb'] = pd.cut(df_cb['pixels_Cb'], bins=interval_range, labels=[1,2,3])
        df_cr['bins_Cr'] = pd.cut(df_cr['pixels_Cr'], bins=interval_range, labels=[1,2,3])

        #Getting the frequency of values in each bin for normalised H image
        count_H_bin= df['bins_H'].value_counts()
        result_H = count_H_bin.reset_index()
        result_H.sort_values("index", axis = 0, ascending = True, inplace = True, na_position ='last')
        #print(result_H)

        #Getting the frequency of values in each bin for normalised S image
        count_S_bin= df_s['bins_S'].value_counts()
        result_S = count_S_bin.reset_index()
        result_S.sort_values("index", axis = 0, ascending = True, inplace = True, na_position ='last')
        #print(result_S)

        #Getting the frequency of values in each bin for normalised V image
        count_V_bin= df_s['bins_V'].value_counts()
        result_V = count_V_bin.reset_index()
        result_V.sort_values("index", axis = 0, ascending = True, inplace = True, na_position ='last')
        #print(result_V)

        #Getting the frequency of values in each bin for normalised Y image
        count_Y_bin= df_y['bins_Y'].value_counts()
        result_Y = count_Y_bin.reset_index()
        result_Y.sort_values("index", axis = 0, ascending = True, inplace = True, na_position ='last')
        #print(result_Y)


        #Getting the frequency of values in each bin for normalised Cb image
        count_Cb_bin= df_cb['bins_Cb'].value_counts()
        result_Cb = count_Cb_bin.reset_index()
        result_Cb.sort_values("index", axis = 0, ascending = True, inplace = True, na_position ='last')
        #print(result_Cb)

        #Getting the frequency of values in each bin for normalised Cr image
        count_Cr_bin= df_cr['bins_Cr'].value_counts()
        result_Cr = count_Cr_bin.reset_index()
        result_Cr.sort_values("index", axis = 0, ascending = True, inplace = True, na_position ='last')
        #print(result_Cr)


        H_val_bin = list(result_H['bins_H'].values)
        H_val_bin.pop()
        #print(H_val_bin)
        S_val_bin = list(result_S['bins_S'].values)
        S_val_bin.pop()
        #print(S_val_bin)
        V_val_bin = list(result_V['bins_V'].values)
        V_val_bin.pop()
        #print(V_val_bin)

        Y_val_bin = list(result_Y['bins_Y'].values)
        #print(Y_val_bin)
        Cb_val_bin = list(result_Cb['bins_Cb'].values)
        #print(Cb_val_bin)
        Cr_val_bin = list(result_Cr['bins_Cr'].values)
        #print(Cr_val_bin)

        # determining the frequency distribution against the central object of the image
        freq_distr_H = [x/pix_h for x in H_val_bin]
        freq_distr_H = list(np.around(np.array(freq_distr_H),4))
        #freq_distr_H= freq_distr_H.pop()
        '''print("Frequency distribution for H channel:")
        print(freq_distr_H)'''

        freq_distr_S = [x/pix_S for x in S_val_bin]
        freq_distr_S = list(np.around(np.array(freq_distr_S),4))
        #freq_distr_S=freq_distr_S.pop()
        '''print("Frequency distribution for S channel:")
        print(freq_distr_S)
        '''
        freq_distr_V = [x/pix_v for x in V_val_bin]
        freq_distr_V = list(np.around(np.array(freq_distr_V),4))
        #freq_distr_V = freq_distr_V.pop()
        '''print("Frequency distribution for V channel:")
        print(freq_distr_V)'''

        freq_distr_Y = [x/pix_Y for x in Y_val_bin]
        freq_distr_Y = list(np.around(np.array(freq_distr_Y),4))
        '''print("Frequency distribution for Y channel:")
        print(freq_distr_Y)'''

        freq_distr_Cb = [x/pix_Cb for x in Cb_val_bin]
        freq_distr_Cb = list(np.around(np.array(freq_distr_Cb),4))
        '''print("Frequency distribution for Cb channel:")
        print(freq_distr_Cb)'''

        freq_distr_Cr = [x/pix_Cr for x in Cr_val_bin]
        freq_distr_Cr = list(np.around(np.array(freq_distr_Cr),4))
        '''print("Frequency distribution for Cr channel:")
        print(freq_distr_Cr)'''

        allList = list(itertools.chain(freq_distr_H, freq_distr_S, freq_distr_V))
        df_HSV.loc[i] = allList
        i=i+1

    print(df_HSV)
    pandas2arff(df_HSV, "HSVNEW.arff")

main()







  • 创建的arff文件仅包含最后一张图像的数据。 请告诉我应该怎么做。我也想提及每个班级 每行图像。上课的样子,如果图片显示0 则为0类,如果图片显示为1,则为1类。

Link to the small dataset

0 个答案:

没有答案