from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import ensemble
from sklearn import linear_model
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold, cross_validate, GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from df_after_transform import df_after_transform
# Load the training data.
housing_df = pd.read_csv('input_data2/housing_train.csv')

# The target is the natural log of the sale price, so every score and
# prediction produced below is on the log scale. All remaining columns are
# candidate features.
y_train = np.log(housing_df.v_SalePrice)
X_train = housing_df.drop('v_SalePrice', axis=1)

# Collect the names of the numeric feature columns.
numerical = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numericalVars = X_train.select_dtypes(include=numerical).columns.tolist()

# Preprocessing:
#   * numeric columns  -> median imputation, then standardization
#   * 'v_Lot_Config'   -> one-hot encoding
#   * everything else  -> dropped
num_pipe = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())
# handle_unknown="ignore": a category that is absent from a CV training fold
# (or that first appears in the holdout file) is encoded as all zeros instead
# of raising an error at transform time.
cat_pipe = make_pipeline(OneHotEncoder(handle_unknown="ignore"))
preproc_pipe = ColumnTransformer(
    [
        ("num_impute", num_pipe, numericalVars),
        ("cat_trans", cat_pipe, ['v_Lot_Config']),
    ],
    remainder='drop',
)

# Preprocessed view of X_train, kept for inspection only; the model pipeline
# below applies preproc_pipe itself.
# NOTE(review): df_after_transform is a project helper -- presumably it fits
# the transformer and returns the result as a DataFrame; confirm.
preproc_df = df_after_transform(preproc_pipe, X_train)

# data descriptors
#print(f'There are {preproc_df.shape[1]} columns in the preprocessed data.')
#preproc_df.describe().T.round(2)

# Other regressor models that could be swapped into the 'estim' step:
# Linear Regression
# Support Vector Regression
# KNeighborsRegressor
# RandomForestRegressor

# Template pipeline with a GradientBoostingRegressor as the estimator.
# 'feature_create' and 'feature_select' are placeholder steps that currently
# pass the data through unchanged.
# random_state is fixed so that repeated runs give the same scores; without
# it, comparing learning rates that differ in the 4th decimal is not
# repeatable from run to run.
temp_pipe = Pipeline([
    ('preproc', preproc_pipe),
    ('feature_create', 'passthrough'),
    ('feature_select', 'passthrough'),
    ('estim', ensemble.GradientBoostingRegressor(random_state=0)),
])

# display template pipeline
#temp_pipe.get_params()

# Hyperparameter grid: candidate learning rates for the boosting estimator.
params = [
    {'estim__learning_rate': [0.1128, 0.1129, 0.1130, 0.1131, 0.1127]},
]

# Run the grid search: 10-fold cross-validation, scored by R^2.
grid_search = GridSearchCV(
    estimator=temp_pipe,
    param_grid=params,
    scoring='r2',
    cv=KFold(10),
)
results = grid_search.fit(X_train, y_train)
resultsDF = pd.DataFrame(results.cv_results_)

# Show the cross-validation results (displayed when run as a notebook cell).
resultsDF
# mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_estim__learning_rate | params | split0_test_score | split1_test_score | split2_test_score | split3_test_score | split4_test_score | split5_test_score | split6_test_score | split7_test_score | split8_test_score | split9_test_score | mean_test_score | std_test_score | rank_test_score | |
# ---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
# 0 | 0.770772 | 0.020499 | 0.005782 | 0.000587 | 0.1128 | {'estim__learning_rate': 0.1128} | 0.921781 | 0.864645 | 0.886064 | 0.880770 | 0.767492 | 0.870976 | 0.877148 | 0.784574 | 0.843924 | 0.877631 | 0.857500 | 0.044852 | 1 |
# 1 | 0.774695 | 0.016596 | 0.005799 | 0.000378 | 0.1129 | {'estim__learning_rate': 0.1129} | 0.910405 | 0.862323 | 0.886096 | 0.880246 | 0.763579 | 0.860863 | 0.877658 | 0.788509 | 0.847795 | 0.845040 | 0.852251 | 0.042579 | 5 |
# 2 | 0.826950 | 0.022032 | 0.006107 | 0.000698 | 0.113 | {'estim__learning_rate': 0.113} | 0.921323 | 0.861661 | 0.887722 | 0.870243 | 0.758338 | 0.874552 | 0.878520 | 0.781549 | 0.850395 | 0.874005 | 0.855831 | 0.046686 | 2 |
# 3 | 0.828358 | 0.027019 | 0.006108 | 0.000546 | 0.1131 | {'estim__learning_rate': 0.1131} | 0.919507 | 0.865085 | 0.882495 | 0.876536 | 0.762048 | 0.871911 | 0.880452 | 0.770487 | 0.846752 | 0.870076 | 0.854535 | 0.047426 | 3 |
# 4 | 0.821501 | 0.034226 | 0.006174 | 0.000663 | 0.1127 | {'estim__learning_rate': 0.1127} | 0.919244 | 0.865185 | 0.886335 | 0.879806 | 0.766001 | 0.858832 | 0.878918 | 0.780664 | 0.867277 | 0.836307 | 0.853857 | 0.045110 | 4 |
# Best pipeline found by the grid search. GridSearchCV is run with its
# default refit=True, so best_estimator_ has already been refit on the full
# training data and does not need to be fitted again here.
best_model = results.best_estimator_

# Predict on the holdout set. The model was trained on log(v_SalePrice), so
# the predictions are on the log scale as well.
X_test = pd.read_csv('input_data2/housing_holdout.csv')
y_pred = best_model.predict(X_test)

# Pair each parcel id with its prediction. predict() returns one value per
# input row, in row order, so the two columns line up positionally.
submission = pd.DataFrame({
    'parcel': X_test['parcel'].to_numpy(),
    'prediction': y_pred,
})

# Export the predictions, creating the output directory if it is missing so
# the write cannot fail after the model has been trained.
output_path = Path('submission/MY_PREDICTIONS.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(output_path, index=False)