- 我有一个图片文件夹(在下面链接)
- 对此进行一些分析并以此为基础制作数据框
- 我想创建一个arff文件,该文件在一行中包含单个图像信息。这意味着如果文件夹中有20张图像,那么那里将有20行。
import itertools
import numpy as np
import cv2
from collections import Counter
import pandas as pd
from sklearn import preprocessing
import arff
import itertools
import glob
def pandas2arff(df,filename,wekaname = "pandasdata",cleanstringdata=True,cleannan=True):
"""
converts the pandas dataframe to a weka compatible file
df: dataframe in pandas format
filename: the filename you want the weka compatible file to be in
wekaname: the name you want to give to the weka dataset (this will be visible to you when you open it in Weka)
cleanstringdata: clean up data which may have spaces and replace with "_", special characters etc which seem to annoy Weka.
To suppress this, set this to False
cleannan: replaces all nan values with "?" which is Weka's standard for missing values.
To suppress this, set this to False
"""
import re
def cleanstring(s):
if s!="?":
return re.sub('[^A-Za-z0-9]+', "_", str(s))
else:
return "?"
dfcopy = df #all cleaning operations get done on this copy
if cleannan!=False:
dfcopy = dfcopy.fillna(-999999999) #this is so that we can swap this out for "?"
#this makes sure that certain numerical columns with missing values don't get stuck with "object" type
f = open(filename,"w")
arffList = []
arffList.append("@relation " + wekaname + "\n")
#look at each column's dtype. If it's an "object", make it "nominal" under Weka for now (can be changed in source for dates.. etc)
for i in range(df.shape[1]):
if dfcopy.dtypes[i]=='O' or (df.columns[i] in ["Class","CLASS","class"]):
if cleannan!=False:
dfcopy.iloc[:,i] = dfcopy.iloc[:,i].replace(to_replace=-999999999, value="?")
if cleanstringdata!=False:
dfcopy.iloc[:,i] = dfcopy.iloc[:,i].apply(cleanstring)
_uniqueNominalVals = [str(_i) for _i in np.unique(dfcopy.iloc[:,i])]
_uniqueNominalVals = ",".join(_uniqueNominalVals)
_uniqueNominalVals = _uniqueNominalVals.replace("[","")
_uniqueNominalVals = _uniqueNominalVals.replace("]","")
_uniqueValuesString = "{" + _uniqueNominalVals +"}"
arffList.append("@attribute " + df.columns[i] + _uniqueValuesString + "\n")
else:
arffList.append("@attribute " + df.columns[i] + " real\n")
#even if it is an integer, let's just deal with it as a real number for now
arffList.append("@data\n")
for i in range(dfcopy.shape[0]):#instances
_instanceString = ""
for j in range(df.shape[1]):#features
if dfcopy.dtypes[j]=='O':
_instanceString+="\"" + str(dfcopy.iloc[i,j]) + "\""
else:
_instanceString+=str(dfcopy.iloc[i,j])
if j!=dfcopy.shape[1]-1:#if it's not the last feature, add a comma
_instanceString+=","
_instanceString+="\n"
if cleannan!=False:
_instanceString = _instanceString.replace("-999999999.0","?") #for numeric missing values
_instanceString = _instanceString.replace("\"?\"","?") #for categorical missing values
arffList.append(_instanceString)
f.writelines(arffList)
f.close()
del dfcopy
return True
labels = ['h0','h1','h2','h3','h4','h5','h6','h7','h8','h9','s0','s1','s2','s3','s4','s5','s6','s7','s8','s9','v0','v1','v2','v3','v4','v5','v6','v7','v8','v9']
df_HSV = pd.DataFrame(columns=labels)
i=0
def main():
trainDset = 'NTU_Color_HandImages/*.jpg'
for file in glob.glob(trainDset):
#read image
img = cv2.imread(file)
#convert in HSV
hsv = cv2.cvtColor(img,cv2.COLOR_BGR2HSV)
#convert in YCrCb
YCrCb = cv2.cvtColor(img, cv2.COLOR_RGB2YCrCb)
'''cv2.imshow('Original image',img)
cv2.imshow('converted image',hsv)
cv2.imshow('converted image',YCrCb)
'''
#Splitting H,S,V channel values
H, S, V = cv2.split(hsv)
Y, Cr, Cb = cv2.split(YCrCb)
'''cv2.imshow('H image',H)
cv2.imshow('S image',S)
cv2.imshow('V image', V)'''
'''cv2.imshow('Y image',Y)
cv2.imshow('Cr image',Cr)
cv2.imshow('Cb image', Cb)'''
#Normalising H S V values in 0 to 1 range
norm_image_H = cv2.normalize(H, None, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)
norm_image_S = cv2.normalize(S, None, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)
norm_image_V = cv2.normalize(V, None, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)
#Normalising Y Cr Cb values in 0 to 1 range
norm_image_Y = cv2.normalize(Y, None, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)
norm_image_Cr = cv2.normalize(Cr, None, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)
norm_image_Cb = cv2.normalize(Cb, None, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)
'''cv2.imshow('Y normalised image', norm_image_Y)
cv2.imshow('Cr normalised image', norm_image_Cr)
cv2.imshow('Cb normalised image', norm_image_Cb)
'''
#Getting the pixels where the it is not black for H and S . basically they have black background and we are eliminating it
rows,cols = norm_image_H.shape
rows_Y,cols_Y = norm_image_Y.shape
'''print(rows,rows_Y)
print(cols,cols_Y)'''
non_zero_H=[]
non_zero_S = []
non_one_V = []
non_zero_Cr = []
non_zero_Cb=[]
non_one_Y=[]
for i in range(rows):
for j in range(cols):
k = norm_image_H[i,j]
l = norm_image_S[i,j]
m = norm_image_V[i,j]
n = norm_image_Y[i,j]
o = norm_image_Cb[i,j]
p = norm_image_Cr[i,j]
if k !=0:
non_zero_H.append(k)
if l !=0:
non_zero_S.append(l)
if m!=1:
non_one_V.append(m)
if o !=0:
non_zero_Cb.append(o)
if p !=0:
non_zero_Cr.append(p)
if n != 1:
non_one_Y.append(n)
pix_h = len(non_zero_H)
pix_S = len(non_zero_S)
pix_v = len(non_one_V)
pix_Y = len(non_one_Y)
pix_Cb = len(non_zero_Cb)
pix_Cr = len(non_zero_Cr)
# converting non zero values of normalised H, S, V image into a dataFrame
df = pd.DataFrame(non_zero_H,columns=['pixels_H'])
df_s = pd.DataFrame(non_zero_S,columns=['pixels_S'])
df_v = pd.DataFrame(non_one_V,columns=['pixels_V'])
# converting non zero values of normalised Y, Cb, Cr image into a dataFrame
df_y = pd.DataFrame(non_one_Y,columns=['pixels_Y'])
df_cb = pd.DataFrame(non_zero_Cb,columns=['pixels_Cb'])
df_cr = pd.DataFrame(non_zero_Cr,columns=['pixels_Cr'])
# Performing the creation of bins for H, S, V
interval_range = pd.interval_range(start=0, freq=0.1, end=1.1)
df['bins_H'] = pd.cut(df['pixels_H'], bins=interval_range, labels=[1,2,3])
df_s['bins_S'] = pd.cut(df_s['pixels_S'], bins=interval_range, labels=[1,2,3])
df_s['bins_V'] = pd.cut(df_v['pixels_V'], bins=interval_range, labels=[1,2,3])
'''#print(df['bins_H'])'''
#print(df)
# Performing the creation of bins for Y, Cb,Cr
interval_range = pd.interval_range(start=0, freq=0.1, end=1.1)
df_y['bins_Y'] = pd.cut(df_y['pixels_Y'], bins=interval_range, labels=[1,2,3])
df_cb['bins_Cb'] = pd.cut(df_cb['pixels_Cb'], bins=interval_range, labels=[1,2,3])
df_cr['bins_Cr'] = pd.cut(df_cr['pixels_Cr'], bins=interval_range, labels=[1,2,3])
#Getting the frequency of values in each bin for normalised H image
count_H_bin= df['bins_H'].value_counts()
result_H = count_H_bin.reset_index()
result_H.sort_values("index", axis = 0, ascending = True, inplace = True, na_position ='last')
#print(result_H)
#Getting the frequency of values in each bin for normalised S image
count_S_bin= df_s['bins_S'].value_counts()
result_S = count_S_bin.reset_index()
result_S.sort_values("index", axis = 0, ascending = True, inplace = True, na_position ='last')
#print(result_S)
#Getting the frequency of values in each bin for normalised V image
count_V_bin= df_s['bins_V'].value_counts()
result_V = count_V_bin.reset_index()
result_V.sort_values("index", axis = 0, ascending = True, inplace = True, na_position ='last')
#print(result_V)
#Getting the frequency of values in each bin for normalised Y image
count_Y_bin= df_y['bins_Y'].value_counts()
result_Y = count_Y_bin.reset_index()
result_Y.sort_values("index", axis = 0, ascending = True, inplace = True, na_position ='last')
#print(result_Y)
#Getting the frequency of values in each bin for normalised Cb image
count_Cb_bin= df_cb['bins_Cb'].value_counts()
result_Cb = count_Cb_bin.reset_index()
result_Cb.sort_values("index", axis = 0, ascending = True, inplace = True, na_position ='last')
#print(result_Cb)
#Getting the frequency of values in each bin for normalised Cr image
count_Cr_bin= df_cr['bins_Cr'].value_counts()
result_Cr = count_Cr_bin.reset_index()
result_Cr.sort_values("index", axis = 0, ascending = True, inplace = True, na_position ='last')
#print(result_Cr)
H_val_bin = list(result_H['bins_H'].values)
H_val_bin.pop()
#print(H_val_bin)
S_val_bin = list(result_S['bins_S'].values)
S_val_bin.pop()
#print(S_val_bin)
V_val_bin = list(result_V['bins_V'].values)
V_val_bin.pop()
#print(V_val_bin)
Y_val_bin = list(result_Y['bins_Y'].values)
#print(Y_val_bin)
Cb_val_bin = list(result_Cb['bins_Cb'].values)
#print(Cb_val_bin)
Cr_val_bin = list(result_Cr['bins_Cr'].values)
#print(Cr_val_bin)
# determining the frequency distribution against the central object of the image
freq_distr_H = [x/pix_h for x in H_val_bin]
freq_distr_H = list(np.around(np.array(freq_distr_H),4))
#freq_distr_H= freq_distr_H.pop()
'''print("Frequency distribution for H channel:")
print(freq_distr_H)'''
freq_distr_S = [x/pix_S for x in S_val_bin]
freq_distr_S = list(np.around(np.array(freq_distr_S),4))
#freq_distr_S=freq_distr_S.pop()
'''print("Frequency distribution for S channel:")
print(freq_distr_S)
'''
freq_distr_V = [x/pix_v for x in V_val_bin]
freq_distr_V = list(np.around(np.array(freq_distr_V),4))
#freq_distr_V = freq_distr_V.pop()
'''print("Frequency distribution for V channel:")
print(freq_distr_V)'''
freq_distr_Y = [x/pix_Y for x in Y_val_bin]
freq_distr_Y = list(np.around(np.array(freq_distr_Y),4))
'''print("Frequency distribution for Y channel:")
print(freq_distr_Y)'''
freq_distr_Cb = [x/pix_Cb for x in Cb_val_bin]
freq_distr_Cb = list(np.around(np.array(freq_distr_Cb),4))
'''print("Frequency distribution for Cb channel:")
print(freq_distr_Cb)'''
freq_distr_Cr = [x/pix_Cr for x in Cr_val_bin]
freq_distr_Cr = list(np.around(np.array(freq_distr_Cr),4))
'''print("Frequency distribution for Cr channel:")
print(freq_distr_Cr)'''
allList = list(itertools.chain(freq_distr_H, freq_distr_S, freq_distr_V))
df_HSV.loc[i] = allList
i=i+1
print(df_HSV)
pandas2arff(df_HSV, "HSVNEW.arff")
main()
- 创建的arff文件仅包含最后一张图像的数据。
请告诉我应该怎么做。我也想提及每个班级
每行图像。上课的样子,如果图片显示0
则为0类,如果图片显示为1,则为1类。
Link to the small dataset