我不知道为什么我的模型总是给出相同的预测值

时间:2019-12-09 06:02:21

标签: python machine-learning keras

我训练了模型来预测排名。 功能有4列。现在排名,出价金额,关键字和时间。 我猜想,我在建立模型时犯了一些错误。 当我在数据集中检查nan值时,它没有nan值。 请检查我的代码,如果您想在执行代码时发现问题,我将附加一个数据集。 ps。当我获得一个时间戳的预测值时,将使用确定等级函数。

import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from pandas import DataFrame
import numpy as np
import re
from sklearn.model_selection import train_test_split, StratifiedKFold
import tensorflow as tf
from keras.models import Sequential, Model
from keras.layers import Dense, GRU, Activation
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.models import load_model
from keras.optimizers import Adam
import keras.backend as K
import sklearn.preprocessing


label_binarizer = sklearn.preprocessing.LabelBinarizer()
model = load_model("C:/Users/gyuri/Downloads/01-0.582880.hdf5")

def categorize(time):
    new_series = []
    for i in time:
        if re.match('[0-9]*:[0-9]*:[0-9]', i):   # hh: mm: ss형식과 일치하는 경우에만 카테고리화 하기 위함.
        if i == '24:00:00':
            num = 0
        else:
            hr, mi, se = map(int, i.split(':'))   # 리스트 내 모든 원소들에 int형 적용. : 기준으로 split함.
            num = hr * 6 + mi // 10   # 10분 단위로 넘버링이 1씩 증가 하니까 분 나누기 10의 몫을 더해줌.
    else:
        num = -1
    new_series += [num]
df['categorizedTime'] = new_series
label_binarizer.fit(range(max(df['categorizedTime']) + 1))
one_hot_time = label_binarizer.transform(df['categorizedTime'])
del df['SubmittedTime']


def categorize_time(time):
    if re.match('[0-9]*:[0-9]*:[0-9]', time):  # hh: mm: ss형식과 일치하는 경우에만 카테고리화 하기 위함.
    if time == '24:00:00':
        num = 0
    else:
        hr, mi, se = map(int, time.split(':'))  # 리스트 내 모든 원소들에 int형 적용. : 기준으로 split함.
        num = hr * 6 + mi // 10  # 10분 단위로 넘버링이 1씩 증가 하니까 분 나누기 10의 몫을 더해줌.
else:
    num = -1
return num

def determineRank(t, n, bid_t, w, h, k):
#    t = str(input())   # 시간
#    n = int(input())   # 현재 랭킹
#    bid_t = int(input())   # 비딩 가격
#    w = int(input())   # 요일
#    h = int(input())   #  주말 여부
#    k = str(input())   # 키워드

encode = LabelEncoder()
k = encode.fit_transform([k])
bid_t = normalization(bid_t)
t = categorize_time(t)

new_list = []
new_list = [t, n, bid_t, w, h, k]

new_list = np.array(new_list)
new_list = new_list.reshape(1, 6, 1)
rank = load_model.predict(new_list)
#rank = round(rank.item(0))
return rank.item(0)


def determinehamsu(t, n, bid_t, w, h, k):
encode = LabelEncoder()
rank = determineRank(t, n, bid_t, w, h, k)
df = pd.DataFrame(
    data={'Time': [t], 'Now_Rank': [n], 'Amount': [bid_t], 'Weekday': [w], 'Holiday': [h], 'Keyword': [k]},
    columns=['Time', 'Now_Rank', 'Amount', 'Weekday', 'Holiday', 'Keyword'])

if bid_t <= 6010:

    for bid_t in range(bid_t, 6010, 10):
        rank = determineRank(t, n, bid_t, w, h, k)

        df = df.append({'Time': t,
                        'Now_Rank': n,
                        'Amount': bid_t,
                        'Weekday': w,
                        'Holiday': h,
                        'Keyword': k,
                        'Target': rank}, ignore_index=True)
        df.to_csv("new_sample5.csv")
        print(rank)
        print(bid_t)
    return df

elif bid_t > 6010:
    for bid_t in range(bid_t, 0, -10):
        rank = determineRank(t, n, bid_t, w, h, k)
        df = df.append({'Time': t,
                        'Now_Rank': n,
                        'Amount': bid_t,
                        'Weekday': w,
                        'Holiday': h,
                        'Keyword': k,
                        'Target': rank}, ignore_index=True)
        df.to_csv("sample6.csv")
        print(rank)
        print(bid_t)
    return df.item(0)

def normalization(x):
   x_np = np.asarray(x)
   return (x_np - x_np.min()) / (x_np.max() - x_np.min())

filename = 'C:/Users/gyuri/Desktop/진짜수정완료.csv'
df = pd.read_csv('C:/Users/gyuri/Desktop/진짜수정완료.csv', encoding='euc-kr', delimiter=',')
df = pd.DataFrame(df)
df.info()
del df['ExecutedDate']
del df['queueName']
del df['cycleMinutes']
del df['shoudBid']
del df['fetchMethod']
del df['siteUrl']
del df['retryQueueName']
del df['retryCount']
del df['Submitted']
del df['submittedAt']
del df['SubmittedDate']
del df['nccKeywordId']
del df['statDt']
del df['statDtTimestamp']
del df['delaySeconds']
del df['customerId']
del df['bidStatusCode']

df.groupby('currentRank').count()
df.groupby('targetRank').count()

df.groupby('bidAmt').count()
scaler = MinMaxScaler()
encoder = LabelEncoder()


normalization(df['bidAmt'])

categorize(df['SubmittedTime'])
label_binarizer = sklearn.preprocessing.LabelBinarizer()
label_binarizer.fit(range(max(df['categorizedTime'])+1))
one_hot_time = label_binarizer.transform(df['categorizedTime'])
print('{0}'.format(one_hot_time))

one_hot_time.shape

print(df['categorizedTime'].shape)
#del df['SubmittedTime']
df['keyword'] = encoder.fit_transform(df['keyword'])
df.info()
#df['Holiday'].loc[df.Holiday=='Y']=1
#df['Holiday'].loc[df.Holiday == 'N']=0

df.info()
seed = tf.set_random_seed(42)

first_file_info = df.values[0:].astype(np.float)
print("file info.shape : ", first_file_info.shape)
print("file info[0] : ", first_file_info[0])


time = one_hot_time
print("date.shape : ", time.shape)
print("date[0] : ", time[0])
print("=" * 120)

now = first_file_info[:,:1]
print("now.shape : ", now.shape)
print("now[0] : ", now[0])
print("="*120)

target = first_file_info[:,1:2]
print("target.shape : ", target.shape)
print("target[0] : ", target[0])
print("="*120)

amount = first_file_info[:, 2:3]
#norm_amount = scaler.fit_transform(amount)
print("amount.shape : ", amount.shape)
print("amount[0] : ", amount[0])
print("norm_amount[0]: ", norm_amount[0])
print("="*120)



keyword = first_file_info[:, 3:4]
print("keyword.shape : ", keyword.shape)
print("keyword[0]: ", keyword[0])
print("="*120)

x = np.concatenate((time, now, amount, keyword), axis = 1)
print("x.shape : ", x.shape)
print("x[0] : ", x[0])
print("x[-1] : ", x[-1])
print("="*100)

y = target
print("y[0] : ", y[0])
print("y[-1] : ", y[-1])

X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size = 0.3, random_state = seed, stratify = y)

X_train = X_train.reshape(89334, 1, 4)
X_test  = X_test.reshape (38287, 1, 4)

modelpath = "C:/Users/gyuri/ML_PATH/논문 파일/model/랭킹모델4/{epoch:02d}-{val_loss:4f}.hdf5"
early_stopping_callback = EarlyStopping(monitor = 'acc', patience = 10)
checkpointer = ModelCheckpoint(filepath = modelpath, monitor = 'acc', verbose = 1, save_best_only = True)



model = Sequential()
model.add(GRU(32, batch_input_shape=(1, 1, 4), return_sequences=True, stateful=True))
model.add(GRU(16, return_sequences=True, stateful=True))
model.add(GRU(8, activation='relu', return_sequences=True, stateful=True))
model.add(GRU(4, activation='relu', stateful=True))
model.add(Dense(1))
model.compile(optimizer=Adam(lr=0.001), loss='mae', metrics=['acc'])
model.fit(X_train, Y_train, epochs=30, batch_size=1, verbose=0,
      callbacks=[early_stopping_callback, checkpointer],
      validation_data=(X_test, Y_test))

0 个答案:

没有答案