使用 TensorFlow 的 DNNRegressor 预测房价指数

时间:2019-12-12 21:36:57

标签: tensorflow regression prediction

我有2012年到2017年的许多变量,其中包括这些年份的房价指数(hpi)值。我使用 TensorFlow 的 DNNRegressor 构建了一个神经网络,结果还可以,但不算很好。我的任务是预测未来3年、5年和7年的 hpi,但我不确定该怎么做。

以下是我目前的神经网络实现和得到的结果:

import pandas as pd
from IPython.display import display
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import warnings
with warnings.catch_warnings():  
    warnings.filterwarnings("ignore",category=FutureWarning)
    import tensorflow as tf
import pickle       # Used to save the model
import re
import csv
import logging
import os
from sklearn.model_selection import train_test_split
regex = re.compile(r"\[|\]|<", re.IGNORECASE)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix

# Import data
all_data = pd.read_csv('new_data.csv')

# 2) Series of helper functions
####################################################################################################################
#2 A) This function takes a state name, a county name and two years, and subsets the CSV dataframe to that county's rows for those years
####################################################################################################################
def county_data_from_df(State,County,dataframe, end_year,start_year):
    """Return the rows of *dataframe* for one county in the requested years.

    Args:
        State: Value to match against the ``'state'`` column.
        County: Value to match against the ``'County_name'`` column.
        dataframe: Frame with ``'state'``, ``'County_name'`` and ``'Year'``
            columns.
        end_year: Year whose rows come first in the result. It is compared
            to ``'Year'`` with ``==``, so it must have the same type as the
            values stored in that column.
        start_year: Year whose rows are appended after the ``end_year``
            rows. Ignored when ``end_year == '2012'``.

    Returns:
        A new DataFrame holding the ``end_year`` rows followed by the
        ``start_year`` rows (only the ``end_year`` rows when ``end_year`` is
        the string ``'2012'``). Original index labels are preserved.
    """
    # The state/county filter is shared by both year lookups.
    in_county = (dataframe['state'] == State) & (dataframe['County_name'] == County)
    end_rows = dataframe.loc[in_county & (dataframe['Year'] == end_year)]
    if end_year == '2012':
        return end_rows
    start_rows = dataframe.loc[in_county & (dataframe['Year'] == start_year)]
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # equivalent and keeps the same row order (end year first).
    return pd.concat([end_rows, start_rows])

####################################################################################################################
#2 B) Once the data is pulled, rename and format in the desired way
####################################################################################################################
def ACS_data_formatting(dataframe):
    """Derive summary features from raw ACS columns and rename them readably.

    The input frame is left untouched; all work happens on a copy.

    Args:
        dataframe: Frame holding the raw ACS variable columns referenced
            below (``B15003_*``, ``B19001_*``, ``B25024_*``, ``B02001_*``,
            ``B01003_001E``, ``B07013_002E``/``_003E``) plus the columns
            that are dropped near the end (``B25017_001E``, ``B25017_010E``,
            ``B17026_013E``, ``Tract_number``, ``state``, ``Census_tract``,
            ``County_name``).

    Returns:
        A new DataFrame with the derived share/ratio columns added, the ACS
        codes listed in the rename table replaced by descriptive names, the
        identifier columns removed and the index labels converted to ``str``.

    Raises:
        KeyError: If a column that is read or dropped is missing.
    """
    # Work on a copy so the column assignments below do not leak into the
    # caller's frame.
    dataframe = dataframe.copy()

    dataframe['Bachelors+']=(dataframe['B15003_022E']+dataframe['B15003_023E']+
                             dataframe['B15003_024E']+dataframe['B15003_025E'])/dataframe['B15003_001E']

    dataframe['Households with Income <$35k']=(dataframe['B19001_002E']+dataframe['B19001_003E']+
                                               dataframe['B19001_004E']+dataframe['B19001_005E']+
                                               dataframe['B19001_006E']+dataframe['B19001_007E'])

    # B19001_013E is the '$75,000 - $99,999' bracket (see the rename table
    # below), so it does not belong in the $100k+ total.
    dataframe['Households with Income $100k+']=(dataframe['B19001_014E']+dataframe['B19001_015E']+
                                               dataframe['B19001_016E']+dataframe['B19001_017E'])

    # NOTE(review): B25024_006E-009E are used here as housing-unit counts but
    # are renamed below to 'Total people at home ages ...'; one of the two
    # labels is presumably wrong. The names are kept because downstream
    # feature lists refer to them - confirm against the ACS variable list.
    dataframe['Pct of housing units in 4+ unit buildings'] = (dataframe['B25024_006E']+dataframe['B25024_007E']+
                                               dataframe['B25024_008E']+dataframe['B25024_009E'])/dataframe['B25024_001E']

    # The individual education counts are only needed for 'Bachelors+'.
    dataframe=dataframe.drop(['B15003_022E','B15003_023E','B15003_024E','B15003_025E'],axis=1)
    # index=str turns the row labels into strings as part of the rename.
    dataframe=dataframe.rename(index=str, 
            columns={'B01003_001E': "Total Population",
            'B25026_001E':'Total population in occupied housing units',          
            'B25010_001E':'Avg household size of occupied housing units',         
            'B25077_001E':'Median Estimated Home Value owner occupied units)',         
            'B02001_002E':'White Population',
            'B02001_003E':'Black/African American Population',             
            'B02001_005E':'Asian Population',
            'B02001_004E':'Native American Population',
            'B02001_006E':'Pacific Islander Population',          
            'B02001_009E':'Mixed Race Population',
            'B02001_007E':'Some other race Population',          
            'B01002_001E':'Median Age',
            'B19013_001E':'Median Household Income',
            'B15003_001E':'Total Population over 25',
            'B25064_001E':'Median Gross Rent',
            'B07013_002E':'Homeowner households',
            'B07013_003E':'Renter households',
            'B25027_002E':'Housing units with mortgage',
            'B25071_001E':'Median Gross rent as % of household inc',
            'B25024_001E':'Total housing Units',
            'B00001_001E': 'Total population estimate',
            'B19001_002E':'Total income Less than $10,000',
            'B19001_003E':'Total income $10,000 - $14,999',
            'B19001_004E':'Total income $15,000 - $19,999',
            'B19001_005E':'Total income $20,000 - $24,999',
            'B19001_006E':'Total income $25,000 - $29,999',
            'B19001_007E':'Total income $30,000 - $34,999',
            'B19001_008E':'Total income $35,000 - $39,999',
            'B19001_009E':'Total income $40,000 - $44,999',
            'B19001_010E':'Total income $45,000 - $49,999',
            'B19001_011E':'Total income $50,000 - $59,999',
            'B19001_012E':'Total income $60,000 - $74,999',
            'B19001_013E':'Total income $75,000 - $99,999',
            'B19001_014E':'Total income $100,000 - $124,999',
            'B19001_015E':'Total income $125,000 - $149,999',
            'B19001_016E':'Total income $150,000 - $199,999',
            'B19001_017E':'Total income $200,000 or more',
            'B25024_006E':'Total people at home ages 5 to 9',
            'B25024_007E':'Total people at home ages 10 to 19',
            'B25024_008E':'Total people at home ages 20 to 49',
            'B25024_009E':'Total people at home ages 50 or more',
            'B07001_033E':'Total Moved within same county',
            'B07001_049E':'Total Moved from different county within same state',
            'B07001_065E':'Total Moved from different state',
            'B07001_081E':'Total Moved from abroad',
            'B07002_005E':'Median age Total Moved from different state',
            'B07009_017E':'Total Moved within same county with Bachelors degree',
            'B07012_020E':'Total Moved from different state At or above 150 percent of the poverty level',
            'B08006_017E':'Total Worked at home',
            'B08006_015E':'Total Walked',
            'B01001A_024E':'Total Female 25 to 29 years',
            'B05006_017E':'Total Europe Western Europe Germany',
            'B08006_008E':'Total Public transportation (excluding taxicab)',
            'B08007_003E':'Total Worked in state of residence Worked in county of residence',
            'B08134_002E':'Total Less than 10 minutes to place of work',
            'B08012_011E':'Total 45 to 59 minutes to place of work',
            'B08012_012E':'Total 60 to 89 minutes to place of work',
            'B08012_013E':'Total 90 or more minutes to place of work',
            'B08124_004E':'Total Sales and office occupations',
            'B08124_002E':'Total Management, business, science, and arts occupations',
            'B08124_003E':'Total Service occupations',
            'B08126_011E':'Total Educational services, and health care and social assistance',
            'B08126_012E':'Total Arts, entertainment, and recreation, and accommodation and food services',
            'B08126_009E':'Total Finance and insurance, and real estate and rental and leasing',
            'B08126_008E':'Total Information',
            'B08128_003E':'Total Private for-profit wage and salary workers Employee of private company workers',
            'B08128_004E':'Total Private for-profit wage and salary workers Self-employed in own incorporated business workers',
            'B08128_006E':'Total Local government workers',
            'B08128_008E':'Total Federal government workers',
            'B08128_009E':'Total Self-employed in own not incorporated business workers',
            'B09010_002E':'Total Living in household with Supplemental Security Income (SSI), cash public assistance income, or Food Stamps/SNAP in the past 12 months',
            'B23006_024E':'Total Bachelors degree or higher In labor force',
            'B24022_011E':'Total Male Management business, science, and arts occupations Education, legal, community service, arts, and media occupations',
            'B24022_024E':'Total Male Service occupations Food preparation and serving related occupations'
            })
    # Remove unused ACS variables and the identifier columns.
    dataframe = dataframe.drop(['B25017_001E', 'B25017_010E', 'B17026_013E','Tract_number',
                               'state', 'Census_tract', 'County_name'],axis=1)

    # Ratio and share features built from the renamed columns. A zero
    # denominator yields inf/NaN here; nothing in this function cleans
    # those values up.
    dataframe['High income to Low income household ratio'] = dataframe['Households with Income $100k+']/dataframe['Households with Income <$35k']
    dataframe['Total Households'] = dataframe['Homeowner households']+dataframe['Renter households']
    dataframe['Pct White'] = dataframe['White Population']/dataframe['Total Population']
    dataframe['Pct Asian'] = dataframe['Asian Population']/dataframe['Total Population']
    dataframe['Pct Black'] = dataframe['Black/African American Population']/dataframe['Total Population']
    dataframe['Pct Native American'] = dataframe['Native American Population']/dataframe['Total Population']
    dataframe['Pct Pacific Islander'] = dataframe['Pacific Islander Population']/dataframe['Total Population']
    dataframe['Pct Other Race'] = dataframe['Some other race Population']/dataframe['Total Population']
    dataframe['Pct Mixed'] = dataframe['Mixed Race Population']/dataframe['Total Population']

    return dataframe

    # More data preprocessing using the above functions
    all_data2 = ACS_data_formatting(all_data)
    # all_data2.to_csv("Census_data.csv")
    print(len(all_data2))
    all_data2.head()
    all_data3 = all_data2.drop(['index','Location+Type','Year'],axis=1)
    all_data3 = all_data3.dropna(subset = ['hpi'])
    all_data3 = all_data3.dropna(axis=1, how='all')
    all_data3 = all_data3.fillna(0)
    all_data3 = all_data3.replace([np.inf, -np.inf], np.nan)
    all_data3 = all_data3.fillna(0)
    print(len(all_data3))

# Create training and test set
# Copy first so that renaming columns never touches all_data3.
all_data4 = all_data3.copy()

# Sanitize the column names. The replacements are applied in this exact order
# (e.g. ',' -> ' ' -> '_' and then '__' -> '_'), and regex=False makes every
# pattern a literal string: '+', '(', ')' and '$' are regex metacharacters and
# the default of `regex` differs between pandas versions.
for old, new in (('+', 'plus'), (')', ' '), ('(', ' '), (',', ' '),
                 (' ', '_'), ('__', '_'), ('%', 'percentage'),
                 ('$', ''), ('<', 'lessthan')):
    all_data4.columns = all_data4.columns.str.replace(old, new, regex=False)

# NOTE(review): the keys below still contain ')', ',' or ' ', which the loop
# above has already replaced, so these renames presumably match nothing -
# confirm before relying on them.
all_data4 = all_data4.rename(columns={"Median_Estimated_Home_Value_owner_occupied_units)": "Median_Estimated_Home_Value_owner_occupied_units"})
all_data4 = all_data4.rename(columns={"Median_Estimated_Home_Value_owner_occupied_units)_Pct_Change" : "Median_Estimated_Home_Value_owner_occupied_units_Pct_Change", "Total_Arts,_entertainment,_and_recreation,_and_accommodation_and_food_services": "Total number in Arts, Entertainment, recreation, accomodation, and food services", "Asian Population": "Asian_Population"})  
all_data4 = all_data4.dropna(subset = ['hpi'])

# Reproducible 80/20 split: the test set is every row not sampled for training.
train_dataset = all_data4.sample(frac=0.8,random_state=42)
test_dataset = all_data4.drop(train_dataset.index)

# Per-feature statistics of the training set (target excluded), one row per
# feature after the transpose; used later for normalization.
train_stats = train_dataset.describe()
train_stats.pop("hpi")
train_stats = train_stats.transpose()

# Separate the target column from the features.
train_labels = train_dataset.pop('hpi')
test_labels = test_dataset.pop('hpi')
# Names of the columns offered to the model as numeric features. They are
# written in the sanitized form (spaces and punctuation replaced), so each
# entry presumably has to match a column of the training frame exactly.
# NOTE(review): the '$35,000 - $74,999' income brackets and 'hpi' do not
# appear in this list - confirm the bracket omission is intentional.
feature_columns = ['Total_population_estimate', 'Avg_household_size_of_occupied_housing_units', 'Total_population_in_occupied_housing_units', 'Median_Estimated_Home_Value_owner_occupied_units_', 'Total_Population', 'Median_Gross_rent_as_percentage_of_household_inc', 'White_Population', 'Black/African_American_Population', 'Native_American_Population', 'Asian_Population', 'Pacific_Islander_Population', 'Some_other_race_Population', 'Mixed_Race_Population', 'Median_Age', 'Median_Household_Income', 'Total_Population_over_25', 'Median_Gross_Rent', 'Homeowner_households', 'Renter_households', 'Housing_units_with_mortgage', 'Total_income_Less_than_10_000', 'Total_income_10_000_-_14_999', 'Total_income_15_000_-_19_999', 'Total_income_20_000_-_24_999', 'Total_income_25_000_-_29_999', 'Total_income_30_000_-_34_999', 'Total_income_75_000_-_99_999', 'Total_income_100_000_-_124_999', 'Total_income_125_000_-_149_999', 'Total_income_150_000_-_199_999', 'Total_income_200_000_or_more', 'Total_housing_Units', 'Total_people_at_home_ages_5_to_9', 'Total_people_at_home_ages_10_to_19', 'Total_people_at_home_ages_20_to_49', 'Total_people_at_home_ages_50_or_more', 'Total_Moved_within_same_county', 'Total_Moved_from_different_county_within_same_state', 'Total_Moved_from_different_state', 'Total_Moved_from_abroad', 'Median_age_Total_Moved_from_different_state', 'Total_Moved_within_same_county_with_Bachelors_degree', 'Total_Moved_from_different_state_At_or_above_150_percent_of_the_poverty_level', 'Total_Worked_at_home', 'Total_Walked', 'Total_Female_25_to_29_years', 'Total_Europe_Western_Europe_Germany', 'Total_Public_transportation_excluding_taxicab_', 'Total_Worked_in_state_of_residence_Worked_in_county_of_residence', 'Total_Less_than_10_minutes_to_place_of_work', 'Total_45_to_59_minutes_to_place_of_work', 'Total_60_to_89_minutes_to_place_of_work', 'Total_90_or_more_minutes_to_place_of_work', 'Total_Sales_and_office_occupations', 'Total_Management_business_science_and_arts_occupations', 
'Total_Service_occupations', 'Total_Educational_services_and_health_care_and_social_assistance', 'Total_Arts_entertainment_and_recreation_and_accommodation_and_food_services', 'Total_Finance_and_insurance_and_real_estate_and_rental_and_leasing', 'Total_Information', 'Total_Private_for-profit_wage_and_salary_workers_Employee_of_private_company_workers', 'Total_Private_for-profit_wage_and_salary_workers_Self-employed_in_own_incorporated_business_workers', 'Total_Local_government_workers', 'Total_Federal_government_workers', 'Total_Self-employed_in_own_not_incorporated_business_workers', 'Total_Living_in_household_with_Supplemental_Security_Income_SSI__cash_public_assistance_income_or_Food_Stamps/SNAP_in_the_past_12_months', 'Total_Bachelors_degree_or_higher_In_labor_force', 'Total_Male_Management_business_science_and_arts_occupations_Education_legal_community_service_arts_and_media_occupations', 'Total_Male_Service_occupations_Food_preparation_and_serving_related_occupations', 'Bachelorsplus', 'Households_with_Income_lessthan35k', 'Households_with_Income_100kplus', 'Pct_of_housing_units_in_4plus_unit_buildings', 'High_income_to_Low_income_household_ratio', 'Total_Households', 'Pct_White', 'Pct_Asian', 'Pct_Black', 'Pct_Native_American', 'Pct_Pacific_Islander', 'Pct_Other_Race', 'Pct_Mixed']
# Wrap every feature name in a numeric TensorFlow feature column.
# str.strip() returns a new string (strings are immutable), so its result
# has to be used directly; calling it as a bare statement has no effect.
feat_cols = [tf.feature_column.numeric_column(name.strip()) for name in feature_columns]
# print(train_dataset.columns.tolist())

# Normalize data
def norm(x, stats=None):
    """Standardize the columns of *x* to zero mean and unit variance.

    Args:
        x: DataFrame whose column labels match the index of *stats*.
        stats: Frame with ``'mean'`` and ``'std'`` columns indexed by feature
            name. Defaults to the module-level ``train_stats``.

    Returns:
        A DataFrame of the same shape as *x* holding ``(x - mean) / std``.
        A feature whose standard deviation is 0 is only centered; dividing
        by 0 would turn the whole column into NaN.
    """
    if stats is None:
        stats = train_stats
    # Treat a zero spread as 1 so constant columns become 0 rather than NaN.
    safe_std = stats['std'].replace(0, 1)
    return (x - stats['mean']) / safe_std

# Scale both feature sets with norm(); the targets are passed through unscaled.
X_train, X_test = norm(train_dataset), norm(test_dataset)
y_train, y_test = train_labels, test_labels

# Define the input functions that feed pandas data to the estimator.

# Training: shuffled mini-batches of 10, repeated for up to 1000 epochs.
input_func = tf.estimator.inputs.pandas_input_fn(
    x=X_train,
    y=y_train,
    batch_size=10,
    num_epochs=1000,
    shuffle=True,
)

# Evaluation: a single ordered pass over the test features and labels.
eval_input_func = tf.estimator.inputs.pandas_input_fn(
    x=X_test,
    y=y_test,
    batch_size=10,
    num_epochs=1,
    shuffle=False,
)

# Label-free single pass over the test features, in batches of 100.
test_input_func = tf.estimator.inputs.pandas_input_fn(
    x=X_test,
    batch_size=100,
    num_epochs=1,
    shuffle=False,
)

# Proximal Adagrad optimizer with a small L1 regularization strength.
adagrad_optimizer = tf.train.ProximalAdagradOptimizer(
    learning_rate=0.1,
    l1_regularization_strength=0.001,
)

# Fully connected regressor with three hidden layers (1024 -> 512 -> 256 units).
dnn_regressor = tf.estimator.DNNRegressor(
    feature_columns=feat_cols,
    hidden_units=[1024, 512, 256],
    optimizer=adagrad_optimizer,
)

# Train model
dnn_regressor.train(input_fn=input_func,steps=10000)

# Predictions
pred_input_func=tf.estimator.inputs.pandas_input_fn(x=X_test,batch_size=10,num_epochs=1,shuffle=False)

# predict() returns a generator; materialize it once so it can be reused.
predictions=list(dnn_regressor.predict(input_fn=pred_input_func))
# Second, lazy generator over the same inputs (nothing runs until it is iterated).
prediction=dnn_regressor.predict(input_fn=pred_input_func)

# Each prediction is a dict whose 'predictions' entry is a length-1 array;
# flatten them into a plain list of scalars aligned with y_test.
final_preds = [p['predictions'][0] for p in predictions]

metrics = dnn_regressor.evaluate(input_fn=eval_input_func)

from sklearn.metrics import mean_squared_error
# RMSE is a single square root of the MSE. Applying `** 0.5` on top of
# np.sqrt would give the fourth root and make the error look far smaller.
rmse = np.sqrt(mean_squared_error(y_test, final_preds))
print(rmse)

9.528585837872207

我在机器学习/深度学习方面还是新手,因此如能得到任何帮助,将不胜感激。

0 个答案:

没有答案