Will Rothpletz
Will’s bestest predicting model ever!

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import linear_model
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler 
from sklearn.preprocessing import OneHotEncoder 
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.compose import make_column_transformer
from sklearn.model_selection import KFold, cross_validate, GridSearchCV
from sklearn.metrics import r2_score

from df_after_transform import df_after_transform
# import data
housing_df = pd.read_csv('input_data2/housing_train.csv')
# split into target and features; predict the log of sale price
y_train = np.log(housing_df.v_SalePrice)
X_train = housing_df.drop('v_SalePrice', axis = 1)
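Logging the target is meant to tame the right skew of sale prices. A quick optional check with pandas' skew(), a sketch only (the commented values are assumptions, not output from this data):

# optional: confirm the skew motivation for the log transform
print(housing_df.v_SalePrice.skew())          # raw prices: strongly positive skew (assumed)
print(np.log(housing_df.v_SalePrice).skew())  # logged prices: much closer to zero (assumed)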
# get all numerical variables
numerical = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

numericalVars = X_train.select_dtypes(include = numerical).columns.tolist()
# create preprocessing pipelines: impute + scale numerics, one-hot encode categoricals
num_pipe = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())
cat_pipe = make_pipeline(OneHotEncoder(handle_unknown='ignore'))  # don't crash on holdout categories unseen in training

preproc_pipe = ColumnTransformer(
    [("num_impute", num_pipe, numericalVars), 
     ("cat_trans", cat_pipe, ['v_Lot_Config'])
    ], 
    remainder = 'drop'
)
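make_column_selector and make_column_transformer are imported above but never used; for reference, a minimal equivalent sketch of the same preprocessor built with them (not run below; behavior assumed identical):

# equivalent preprocessor via the convenience helpers (a sketch)
preproc_pipe_alt = make_column_transformer(
    (num_pipe, make_column_selector(dtype_include=np.number)),
    (cat_pipe, ['v_Lot_Config']),
    remainder='drop',
)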
# preprocess X_train
preproc_df = df_after_transform(preproc_pipe, X_train)

# data descriptors
#print(f'There are {preproc_df.shape[1]} columns in the preprocessed data.')
#preproc_df.describe().T.round(2)
# other candidate regressors considered (see the comparison sketch after this list)

# Linear Regression
# Support Vector Regression
# KNeighborsRegressor
# RandomForestRegressor
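A minimal sketch of how those alternatives could be screened against gradient boosting under the same preprocessing; the SVR and KNeighborsRegressor imports are assumptions (they aren't loaded above):

# rough screen of candidate estimators with default settings (sketch, not part of the original run)
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

candidates = {
    'linear': linear_model.LinearRegression(),
    'svr': SVR(),
    'knn': KNeighborsRegressor(),
    'forest': ensemble.RandomForestRegressor(),
    'gbr': ensemble.GradientBoostingRegressor(),
}
for name, est in candidates.items():
    pipe = make_pipeline(preproc_pipe, est)
    scores = cross_validate(pipe, X_train, y_train, cv=KFold(10), scoring='r2')
    print(name, round(scores['test_score'].mean(), 4))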
# create template pipeline with a GradientBoostingRegressor estimator

temp_pipe = Pipeline([
    ('preproc', preproc_pipe), 
    ('feature_create', 'passthrough'), 
    ('feature_select', 'passthrough'), 
    ('estim', ensemble.GradientBoostingRegressor())
])
# display template pipeline

#temp_pipe.get_params()
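get_params() is where the estim__learning_rate key used in the grid below comes from; a one-liner to list the tunable estimator parameters:

# list the estimator's tunable parameter names (e.g. to confirm 'estim__learning_rate')
print(sorted(k for k in temp_pipe.get_params() if k.startswith('estim__')))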
# hyperparameter grid: a fine sweep of the learning rate around 0.113
params = [
    {'estim__learning_rate': [0.1128, 0.1129, 0.1130, 0.1131, 0.1127]}
]
# run grid search
grid_search = GridSearchCV(estimator = temp_pipe, 
                           param_grid = params, 
                           scoring = 'r2', 
                           cv = KFold(10))

results = grid_search.fit(X_train, y_train)

resultsDF = pd.DataFrame(results.cv_results_)
# print results
resultsDF
   mean_fit_time  std_fit_time  mean_score_time  std_score_time  param_estim__learning_rate
0       0.770772      0.020499         0.005782        0.000587                      0.1128
1       0.774695      0.016596         0.005799        0.000378                      0.1129
2       0.826950      0.022032         0.006107        0.000698                      0.1130
3       0.828358      0.027019         0.006108        0.000546                      0.1131
4       0.821501      0.034226         0.006174        0.000663                      0.1127

   split0_test_score  split1_test_score  split2_test_score  split3_test_score  split4_test_score
0           0.921781           0.864645           0.886064           0.880770           0.767492
1           0.910405           0.862323           0.886096           0.880246           0.763579
2           0.921323           0.861661           0.887722           0.870243           0.758338
3           0.919507           0.865085           0.882495           0.876536           0.762048
4           0.919244           0.865185           0.886335           0.879806           0.766001

   split5_test_score  split6_test_score  split7_test_score  split8_test_score  split9_test_score
0           0.870976           0.877148           0.784574           0.843924           0.877631
1           0.860863           0.877658           0.788509           0.847795           0.845040
2           0.874552           0.878520           0.781549           0.850395           0.874005
3           0.871911           0.880452           0.770487           0.846752           0.870076
4           0.858832           0.878918           0.780664           0.867277           0.836307

   mean_test_score  std_test_score  rank_test_score
0         0.857500        0.044852                1
1         0.852251        0.042579                5
2         0.855831        0.046686                2
3         0.854535        0.047426                3
4         0.853857        0.045110                4
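The winning setting can also be read straight off the fitted search object; the expected values below are taken from the table above:

# best hyperparameter and its mean CV R^2
print(results.best_params_)           # {'estim__learning_rate': 0.1128} per the table
print(round(results.best_score_, 4))  # ~0.8575 per the table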
# grab the best pipeline found by the search
best_model = results.best_estimator_
# load the holdout features
X_test = pd.read_csv('input_data2/housing_holdout.csv')

# refit the best model on the full training set
best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_test)
y_pred = pd.DataFrame(y_pred).reset_index()

parcel = pd.DataFrame(X_test['parcel']).reset_index()
# merge predicted (log) sale prices with their parcel IDs via the shared row index
submission = parcel.merge(y_pred, on='index')
submission = submission.drop(columns=['index'])
submission = submission.rename(columns={0: 'prediction'})  # y_pred's only column is named 0
# export predictions
submission.to_csv('submission/MY_PREDICTIONS.csv', index=False)
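An optional sanity check that the exported file looks right; the column names follow from the code above, and the row count depends on the holdout file:

# optional: re-read the submission and confirm its structure
check = pd.read_csv('submission/MY_PREDICTIONS.csv')
print(check.shape)             # (number of holdout parcels, 2)
print(check.columns.tolist())  # ['parcel', 'prediction']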