Loading packages
library(caret)
library(ggplot2)
library(knitr)
library(plyr)
library(dplyr)
library(corrplot)
library(plotly)
library(randomForest)
library(h2o)
Loading datasets
# Read the training and testing splits from CSV files in the working directory.
# stringsAsFactors is set explicitly: its default changed from TRUE to FALSE in
# R 4.0.0, and the structure printed further down (date, WeekStatus and
# Day_of_week as factors) relies on the factor behaviour.
df_train <- read.csv("projeto8-training.csv", stringsAsFactors = TRUE)
df_test <- read.csv("projeto8-testing.csv", stringsAsFactors = TRUE)
Merging test and train dataframes
# Tag every row with the split it came from, then stack both splits so that
# exploration and preprocessing run once over the full data set.
df_train[["label"]] <- "train"
df_test[["label"]] <- "test"
df_merged <- rbind(df_train, df_test)
Showing some information about the data
# Print the type and first values of every column of the merged data frame.
str(df_merged)
## 'data.frame': 19735 obs. of 33 variables:
## $ date : Factor w/ 19735 levels "2016-01-11 17:00:00",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Appliances : int 60 60 50 60 50 60 60 70 430 250 ...
## $ lights : int 30 30 30 40 40 50 40 40 50 40 ...
## $ T1 : num 19.9 19.9 19.9 19.9 19.9 ...
## $ RH_1 : num 47.6 46.7 46.3 46.3 46 ...
## $ T2 : num 19.2 19.2 19.2 19.2 19.2 ...
## $ RH_2 : num 44.8 44.7 44.6 44.5 44.5 ...
## $ T3 : num 19.8 19.8 19.8 19.8 19.8 ...
## $ RH_3 : num 44.7 44.8 44.9 45 44.9 ...
## $ T4 : num 19 19 18.9 18.9 18.9 ...
## $ RH_4 : num 45.6 46 45.9 45.5 45.7 ...
## $ T5 : num 17.2 17.2 17.2 17.2 17.1 ...
## $ RH_5 : num 55.2 55.2 55.1 55.1 55 ...
## $ T6 : num 7.03 6.83 6.56 6.37 6.3 ...
## $ RH_6 : num 84.3 84.1 83.2 84.9 85.8 ...
## $ T7 : num 17.2 17.2 17.2 17.2 17.1 ...
## $ RH_7 : num 41.6 41.6 41.4 41.2 41.3 ...
## $ T8 : num 18.2 18.2 18.2 18.1 18.1 ...
## $ RH_8 : num 48.9 48.9 48.7 48.6 48.6 ...
## $ T9 : num 17 17.1 17 17 17 ...
## $ RH_9 : num 45.5 45.6 45.5 45.4 45.3 ...
## $ T_out : num 6.6 6.48 6.37 6.13 6.02 ...
## $ Press_mm_hg: num 734 734 734 734 734 ...
## $ RH_out : num 92 92 92 92 92 ...
## $ Windspeed : num 7 6.67 6.33 5.67 5.33 ...
## $ Visibility : num 63 59.2 55.3 47.7 43.8 ...
## $ Tdewpoint : num 5.3 5.2 5.1 4.9 4.8 ...
## $ rv1 : num 13.3 18.6 28.6 10.1 44.9 ...
## $ rv2 : num 13.3 18.6 28.6 10.1 44.9 ...
## $ NSM : int 61200 61800 62400 63600 64200 65400 66000 66600 68400 69000 ...
## $ WeekStatus : Factor w/ 2 levels "Weekday","Weekend": 1 1 1 1 1 1 1 1 1 1 ...
## $ Day_of_week: Factor w/ 7 levels "Friday","Monday",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ label : chr "train" "train" "train" "train" ...
# Print per-column summary statistics (quartiles/mean for numeric columns,
# level counts for factors).
summary(df_merged)
## date Appliances lights T1
## 2016-01-11 17:00:00: 1 Min. : 10.00 Min. : 0.000 Min. :16.79
## 2016-01-11 17:10:00: 1 1st Qu.: 50.00 1st Qu.: 0.000 1st Qu.:20.76
## 2016-01-11 17:20:00: 1 Median : 60.00 Median : 0.000 Median :21.60
## 2016-01-11 17:40:00: 1 Mean : 97.69 Mean : 3.802 Mean :21.69
## 2016-01-11 17:50:00: 1 3rd Qu.: 100.00 3rd Qu.: 0.000 3rd Qu.:22.60
## 2016-01-11 18:10:00: 1 Max. :1080.00 Max. :70.000 Max. :26.26
## (Other) :19729
## RH_1 T2 RH_2 T3
## Min. :27.02 Min. :16.10 Min. :20.46 Min. :17.20
## 1st Qu.:37.33 1st Qu.:18.79 1st Qu.:37.90 1st Qu.:20.79
## Median :39.66 Median :20.00 Median :40.50 Median :22.10
## Mean :40.26 Mean :20.34 Mean :40.42 Mean :22.27
## 3rd Qu.:43.07 3rd Qu.:21.50 3rd Qu.:43.26 3rd Qu.:23.29
## Max. :63.36 Max. :29.86 Max. :56.03 Max. :29.24
##
## RH_3 T4 RH_4 T5
## Min. :28.77 Min. :15.10 Min. :27.66 Min. :15.33
## 1st Qu.:36.90 1st Qu.:19.53 1st Qu.:35.53 1st Qu.:18.28
## Median :38.53 Median :20.67 Median :38.40 Median :19.39
## Mean :39.24 Mean :20.86 Mean :39.03 Mean :19.59
## 3rd Qu.:41.76 3rd Qu.:22.10 3rd Qu.:42.16 3rd Qu.:20.62
## Max. :50.16 Max. :26.20 Max. :51.09 Max. :25.80
##
## RH_5 T6 RH_6 T7
## Min. :29.82 Min. :-6.065 Min. : 1.00 Min. :15.39
## 1st Qu.:45.40 1st Qu.: 3.627 1st Qu.:30.02 1st Qu.:18.70
## Median :49.09 Median : 7.300 Median :55.29 Median :20.03
## Mean :50.95 Mean : 7.911 Mean :54.61 Mean :20.27
## 3rd Qu.:53.66 3rd Qu.:11.256 3rd Qu.:83.23 3rd Qu.:21.60
## Max. :96.32 Max. :28.290 Max. :99.90 Max. :26.00
##
## RH_7 T8 RH_8 T9
## Min. :23.20 Min. :16.31 Min. :29.60 Min. :14.89
## 1st Qu.:31.50 1st Qu.:20.79 1st Qu.:39.07 1st Qu.:18.00
## Median :34.86 Median :22.10 Median :42.38 Median :19.39
## Mean :35.39 Mean :22.03 Mean :42.94 Mean :19.49
## 3rd Qu.:39.00 3rd Qu.:23.39 3rd Qu.:46.54 3rd Qu.:20.60
## Max. :51.40 Max. :27.23 Max. :58.78 Max. :24.50
##
## RH_9 T_out Press_mm_hg RH_out
## Min. :29.17 Min. :-5.000 Min. :729.3 Min. : 24.00
## 1st Qu.:38.50 1st Qu.: 3.667 1st Qu.:750.9 1st Qu.: 70.33
## Median :40.90 Median : 6.917 Median :756.1 Median : 83.67
## Mean :41.55 Mean : 7.412 Mean :755.5 Mean : 79.75
## 3rd Qu.:44.34 3rd Qu.:10.408 3rd Qu.:760.9 3rd Qu.: 91.67
## Max. :53.33 Max. :26.100 Max. :772.3 Max. :100.00
##
## Windspeed Visibility Tdewpoint rv1
## Min. : 0.000 Min. : 1.00 Min. :-6.600 Min. : 0.00532
## 1st Qu.: 2.000 1st Qu.:29.00 1st Qu.: 0.900 1st Qu.:12.49789
## Median : 3.667 Median :40.00 Median : 3.433 Median :24.89765
## Mean : 4.040 Mean :38.33 Mean : 3.761 Mean :24.98803
## 3rd Qu.: 5.500 3rd Qu.:40.00 3rd Qu.: 6.567 3rd Qu.:37.58377
## Max. :14.000 Max. :66.00 Max. :15.500 Max. :49.99653
##
## rv2 NSM WeekStatus Day_of_week
## Min. : 0.00532 Min. : 0 Weekday:14263 Friday :2845
## 1st Qu.:12.49789 1st Qu.:21600 Weekend: 5472 Monday :2778
## Median :24.89765 Median :43200 Saturday :2736
## Mean :24.98803 Mean :42907 Sunday :2736
## 3rd Qu.:37.58377 3rd Qu.:64200 Thursday :2880
## Max. :49.99653 Max. :85800 Tuesday :2880
## Wednesday:2880
## label
## Length:19735
## Class :character
## Mode :character
##
##
##
##
# Parse the timestamp column into POSIXct. The format and time zone are given
# explicitly so the result does not depend on the session's local time zone
# (a zone with daylight saving time could otherwise shift or invalidate some
# timestamps). as.character() makes the call work whether the column was read
# as a factor or as a character vector.
# NOTE(review): the recorded wall-clock times are interpreted as UTC here;
# confirm this matches how the data was logged.
df_merged$date <- as.POSIXct(as.character(df_merged$date),
                             format = "%Y-%m-%d %H:%M:%S", tz = "UTC")
# Count missing values per column over the merged data (train and test rows),
# after the date conversion so unparseable timestamps are counted as well.
# vapply() guarantees exactly one integer per column.
vapply(df_merged, function(x) sum(is.na(x)), integer(1))
## date Appliances lights T1 RH_1 T2
## 0 0 0 0 0 0
## RH_2 T3 RH_3 T4 RH_4 T5
## 0 0 0 0 0 0
## RH_5 T6 RH_6 T7 RH_7 T8
## 0 0 0 0 0 0
## RH_8 T9 RH_9 T_out Press_mm_hg RH_out
## 0 0 0 0 0 0
## Windspeed Visibility Tdewpoint rv1 rv2 NSM
## 0 0 0 0 0 0
## WeekStatus Day_of_week label
## 0 0 0
# Keep only the numeric features by dropping the timestamp, the two calendar
# factors and the train/test label. Columns are excluded by name rather than
# by position (previously 1, 31, 32 and 33) so the selection survives any
# change in column order.
non_numeric_cols <- c("date", "WeekStatus", "Day_of_week", "label")
num_cols_df <- df_merged[, !(names(df_merged) %in% non_numeric_cols)]
# Distribution of the target variable.
hist(df_merged$Appliances, breaks = 30, col = "darkblue",
     xlab = "Energy consumption", main = "Frequencies - Energy consumption")
Unique values from Appliances
# List the distinct target values in ascending order.
sort(unique(df_merged$Appliances))
## [1] 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150
## [16] 160 170 180 190 200 210 220 230 240 250 260 270 280 290 300
## [31] 310 320 330 340 350 360 370 380 390 400 410 420 430 440 450
## [46] 460 470 480 490 500 510 520 530 540 550 560 570 580 590 600
## [61] 610 620 630 640 650 660 670 680 690 700 710 720 730 740 750
## [76] 760 770 780 790 800 820 830 840 850 860 870 880 890 900 910
## [91] 1070 1080
Top Frequencies table
# Bucket the target into width-10 intervals between 1 and 201 and tabulate the
# number of observations per interval. Values above 201 fall outside the
# breaks, become NA and are therefore not counted.
bin_edges <- seq(1, 201, 10)
bin_counts <- table(cut(df_merged$Appliances, breaks = bin_edges))
kable(bin_counts, align = "c")
Var1 | Freq |
---|---|
(1,11] | 9 |
(11,21] | 343 |
(21,31] | 723 |
(31,41] | 2019 |
(41,51] | 4368 |
(51,61] | 3282 |
(61,71] | 1560 |
(71,81] | 1205 |
(81,91] | 1015 |
(91,101] | 978 |
(101,111] | 736 |
(111,121] | 502 |
(121,131] | 330 |
(131,141] | 223 |
(141,151] | 144 |
(151,161] | 90 |
(161,171] | 70 |
(171,181] | 78 |
(181,191] | 86 |
(191,201] | 58 |
Plot time series Energy consumption - Each point measured
# Interactive scatter plot: one marker per reading, timestamp on the x axis
# and Appliances on the y axis.
plot_ly(x = df_merged$date , y = df_merged$Appliances, type="scatter", mode="markers")
We can see the target variable is right-skewed: it has a long tail towards high values, and its mean (97.69) is well above its median (60). Let’s keep an eye on that for a later transformation.
# Pairwise correlation matrix of the numeric features, drawn as a grid of
# coloured squares with black axis labels.
m_cor <- cor(num_cols_df)
corrplot(m_cor, method = "square",tl.col = "black")
Here we see that the temperatures are positively correlated with each other, and so are the humidity readings. RH_6, however, behaves differently:
it is negatively correlated with the temperatures. Maybe it is measured in a part of the house that is affected by temperature differently from the others. Also, note that rv1 and rv2 are highly correlated; they seem to be the same variable. Let’s check this later and remove one of them if applicable.
# One histogram per temperature column (T1..T9, dark red) followed by one per
# humidity column (RH_1..RH_9, blue). The loops replace eighteen copy-pasted
# calls that differed only in the column name; plots are drawn in the same
# order and with the same titles as before.
for (col_name in paste0("T", 1:9)) {
  hist(df_merged[[col_name]], breaks = 30, col = "darkred",
       xlab = "", main = col_name)
}
for (col_name in paste0("RH_", 1:9)) {
  hist(df_merged[[col_name]], breaks = 30, col = "#0c5069",
       xlab = "", main = col_name)
}
# Split the readings by WeekStatus and overlay the two outside-temperature
# histograms on one set of axes.
weekend <- df_merged %>%
  filter(WeekStatus == "Weekend")
weekday <- df_merged %>%
  filter(WeekStatus == "Weekday")

# Semi-transparent fills so the overlapping bars stay visible; defined once
# and reused for the legend so bars and legend cannot drift apart.
weekend_col <- rgb(1, 0, 0, 0.5)
weekday_col <- rgb(0, 0, 1, 0.5)

hist(weekend$T_out, ylim = c(0, 1300), breaks = 30, col = weekend_col,
     xlab = "Temperature outside", ylab = "Frequency",
     main = "Distribution of Out temperatures by Weekend and Weekday")
# add = TRUE (not the reassignable shorthand T) draws onto the existing plot.
hist(weekday$T_out, ylim = c(0, 1300), breaks = 30, col = weekday_col,
     add = TRUE)
legend("topright", legend = c("weekend", "weekday"),
       col = c(weekend_col, weekday_col), pt.cex = 2, pch = 15)
# Daily mean dew point.
# The calendar day is taken from the timestamp as it is displayed (format()
# honours the object's own time zone). as.Date() applied directly to a POSIXct
# converts through UTC in the R version used for this report (3.6.1), which
# would move late-evening readings to the following day whenever the
# timestamps are held in a zone behind UTC.
dewp_time <- df_merged %>%
  group_by(date = as.Date(format(date, "%Y-%m-%d"))) %>%
  dplyr::summarise(tot = mean(Tdewpoint))
# The bar colour is set through `marker`: passing color = "red" is treated by
# plotly as a one-level data mapping, which triggers the RColorBrewer palette
# warnings and does not actually paint the bars red.
plot_ly(x = dewp_time$date, y = dewp_time$tot, type = "bar",
        marker = list(color = "red"))
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
# Bar chart counting how often each value of `lights` occurs.
ggplot(df_merged, aes(x = lights)) +
  geom_bar(fill = "#E69F00") +
  labs(x = "Lights (Wh) ", y = "Count", title = "Lights by Wh")
Checking and removing identical features
# Compare every pair of numeric features and record the second member of each
# pair whose values are identical row by row.
features_pair <- combn(names(num_cols_df), 2, simplify = FALSE)
toRemove <- character(0)
for (pair in features_pair) {
  f1 <- pair[1]
  f2 <- pair[2]
  # Skip pairs in which either feature is already marked as a duplicate.
  # `||` is the scalar, short-circuiting operator intended for `if`.
  if (f1 %in% toRemove || f2 %in% toRemove) {
    next
  }
  # isTRUE() keeps the condition a plain TRUE/FALSE even if a column contains
  # NA, in which case all() would return NA and `if` would fail.
  if (isTRUE(all(num_cols_df[[f1]] == num_cols_df[[f2]]))) {
    cat(f1, "and", f2, "are equals.\n")
    toRemove <- c(toRemove, f2)
  }
}
## rv1 and rv2 are equals.
# rv2 duplicates rv1 (see the message printed by the check above), so drop it.
df_merged <- subset(df_merged, select = -c(rv2))
# Log-transform the target to reduce its skew.
log_Appliances <- log(df_merged$Appliances)
# The axis label and title name the log scale so this plot cannot be mistaken
# for the raw-scale histogram, which previously carried identical labels.
hist(log_Appliances, breaks = 30, col = "darkblue",
     xlab = "log(Energy consumption)",
     main = "Frequencies - log(Energy consumption)")
A huge difference from the first, right-skewed histogram.
Normalize data
# Drop the target and the duplicated rv2 column from the predictor set.
num_cols_df <- subset(num_cols_df, select = -c(Appliances, rv2))
# Standardise the predictors. The centre and spread are estimated from the
# training rows only and then applied to every row, so no information from
# the test split leaks into the preprocessing. Rows of num_cols_df are in the
# same order as df_merged, which still carries the train/test label.
is_train <- df_merged$label == "train"
train_part <- num_cols_df[is_train, , drop = FALSE]
train_center <- colMeans(train_part)
train_sd <- vapply(train_part, sd, numeric(1))
# scale() returns a numeric matrix with the same column names.
scaled_num_cols <- scale(num_cols_df, center = train_center, scale = train_sd)
One-hot encoding - WeekStatus feature
# Expand WeekStatus into indicator columns: fit the encoder on the single
# categorical column, apply it to that same column, and keep the result as a
# data frame.
categorical_f <- df_merged["WeekStatus"]
dmy <- dummyVars(" ~ .", data = categorical_f)
encoded_matrix <- predict(dmy, newdata = categorical_f)
categorical_enc <- data.frame(encoded_matrix)
head(categorical_enc)
# Rank the scaled numeric predictors with a random forest fitted on the log
# target.
# Fixed seed so the ranking is reproducible between runs.
set.seed(36)
# Only training rows are used: ranking features with the test rows (and their
# target values) would leak test information into feature selection.
train_rows <- df_merged$label == "train"
rf_data <- data.frame(
  log_Appliances = log_Appliances[train_rows],
  scaled_num_cols[train_rows, , drop = FALSE]
)
# NOTE: the object keeps its original name `importance`; it holds the fitted
# forest, not an importance table.
importance <- randomForest(log_Appliances ~ .,
                           data = rf_data,
                           ntree = 100,
                           nodesize = 10, importance = TRUE)
varImpPlot(importance)
Interesting! NSM is considered the most valuable feature. The two measures shown are the increase in MSE (mean squared error) when a feature’s values are permuted, and the mean decrease in node impurity. Let’s now choose the best of the analyzed features.
Removing variables rv1, T6, RH_6, T_out, RH_out and Visibility because they showed less importance related to Energy consumption.
# Drop the predictors judged least important in the previous step.
scaled_num_cols <- subset(
  scaled_num_cols,
  select = -c(rv1, T6, RH_6, T_out, RH_out, Visibility)
)
# Assemble the modelling table: log target, scaled numeric predictors,
# encoded WeekStatus columns and the train/test label.
df_final <- cbind(log_Appliances, scaled_num_cols, categorical_enc,
                  df_merged["label"])
head(df_final)
# Restore the original split; the label column is only a marker and is left
# out of both resulting frames.
keep_cols <- names(df_final) != "label"
train_d <- df_final[df_final$label == "train", keep_cols]
test_d <- df_final[df_final$label == "test", keep_cols]
I’ll use H2O AutoML here. It automates the machine learning workflow and trains many algorithms, including Stacked Ensemble models. For more information, I recommend accessing the documentation at the following link: http://docs.h2o.ai/h2o/latest-stable/h2o-docs/automl.html
Initializing and transforming data to a H2OFrame object
# Connect this R session to an H2O cluster. In the log that follows, an
# already-running local cluster (uptime of about 3 hours) is being reused.
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 3 hours 4 seconds
## H2O cluster timezone: -03:00
## H2O data parsing timezone: UTC
## H2O cluster version: 3.30.0.1
## H2O cluster version age: 1 month and 17 days
## H2O cluster name: H2O_started_from_R_joaop_cdv303
## H2O cluster total nodes: 1
## H2O cluster total memory: 1.22 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Amazon S3, Algos, AutoML, Core V3, TargetEncoder, Core V4
## R Version: R version 3.6.1 (2019-07-05)
# Upload the training data frame to the H2O cluster as an H2OFrame.
train_frame <- as.h2o(train_d)
##
|
| | 0%
|
|======================================================================| 100%
# Upload the held-out test data frame to the H2O cluster as an H2OFrame.
test_frame <- as.h2o(test_d)
##
|
| | 0%
|
|======================================================================| 100%
# Name of the response column: the log-transformed target.
y <- "log_Appliances"
When using H2O AutoML, all columns of ‘training_frame’ except the response are considered predictors, so we can skip setting the x
argument explicitly. Here I’m using the parameter ‘max_models’ to set the maximum number of models, excluding the Stacked Ensemble models. For testing, you can also limit the run time with the ‘max_runtime_secs’ parameter (by default the time is unlimited).
# Run AutoML on the training frame with `y` as the response: at most 10 models
# (per the text above, Stacked Ensembles are not counted), a 1200-second time
# budget, a fixed seed, and cross-validation predictions kept.
# NOTE(review): project_name is a fixed string. The log that follows shows
# "New models will be added to existing leaderboard ..." on each re-run, so
# repeated runs against the same cluster accumulate models in one leaderboard
# (84 rows in the listing further down). Confirm this is intended.
aml <- h2o.automl(y = y,
training_frame = train_frame,
max_models = 10,
max_runtime_secs = 1200,
seed = 36,
keep_cross_validation_predictions = TRUE,
project_name = "energy_cons_lb_frame")
##
|
| | 0%
## 14:35:07.839: AutoML: XGBoost is not available; skipping it.
## 14:56:37.457: New models will be added to existing leaderboard energy_cons_lb_frame@@log_Appliances (leaderboard frame=null) with already 12 models.
## 14:56:37.458: AutoML: XGBoost is not available; skipping it.
## 15:05:55.684: New models will be added to existing leaderboard energy_cons_lb_frame@@log_Appliances (leaderboard frame=null) with already 24 models.
## 15:05:55.684: AutoML: XGBoost is not available; skipping it.
## 15:12:36.638: New models will be added to existing leaderboard energy_cons_lb_frame@@log_Appliances (leaderboard frame=null) with already 36 models.
## 15:12:36.639: AutoML: XGBoost is not available; skipping it.
## 17:18:28.293: New models will be added to existing leaderboard energy_cons_lb_frame@@log_Appliances (leaderboard frame=null) with already 48 models.
## 17:18:28.318: AutoML: XGBoost is not available; skipping it.
## 17:26:41.394: New models will be added to existing leaderboard energy_cons_lb_frame@@log_Appliances (leaderboard frame=null) with already 60 models.
## 17:26:41.395: AutoML: XGBoost is not available; skipping it.
## 17:35:07.330: New models will be added to existing leaderboard energy_cons_lb_frame@@log_Appliances (leaderboard frame=null) with already 72 models.
## 17:35:07.331: AutoML: XGBoost is not available; skipping it.
|
|=== | 5%
|
|==== | 5%
|
|==== | 6%
|
|===== | 7%
|
|===== | 8%
|
|====== | 9%
|
|======== | 11%
|
|======== | 12%
|
|========= | 12%
|
|========= | 13%
|
|=========== | 16%
|
|============ | 17%
|
|============= | 19%
|
|============== | 20%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 23%
|
|================= | 25%
|
|================== | 25%
|
|================== | 26%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 30%
|
|====================== | 32%
|
|======================== | 35%
|
|========================= | 35%
|
|========================= | 36%
|
|========================== | 37%
|
|========================== | 38%
|
|=========================== | 38%
|
|=========================== | 39%
|
|============================== | 43%
|
|======================================== | 57%
|
|=========================================== | 61%
|
|============================================== | 65%
|
|================================================= | 70%
|
|==================================================== | 74%
|
|======================================================================| 100%
View the AutoML Leaderboard
# Fetch the AutoML leaderboard and print every row rather than a truncated
# preview.
lb <- aml@leaderboard
print(lb, n = nrow(lb))
## model_id mean_residual_deviance
## 1 StackedEnsemble_BestOfFamily_AutoML_20200521_151236 0.1245203
## 2 StackedEnsemble_AllModels_AutoML_20200521_145637 0.1245203
## 3 StackedEnsemble_BestOfFamily_AutoML_20200521_171828 0.1245203
## 4 StackedEnsemble_BestOfFamily_AutoML_20200521_172641 0.1245203
## 5 StackedEnsemble_BestOfFamily_AutoML_20200521_173507 0.1245203
## 6 StackedEnsemble_BestOfFamily_AutoML_20200521_150555 0.1245203
## 7 StackedEnsemble_BestOfFamily_AutoML_20200521_145637 0.1245203
## 8 StackedEnsemble_AllModels_AutoML_20200521_150555 0.1245556
## 9 StackedEnsemble_AllModels_AutoML_20200521_151236 0.1245556
## 10 StackedEnsemble_AllModels_AutoML_20200521_171828 0.1245563
## 11 StackedEnsemble_AllModels_AutoML_20200521_173507 0.1245563
## 12 StackedEnsemble_AllModels_AutoML_20200521_172641 0.1245563
## 13 DRF_1_AutoML_20200521_145637 0.1262736
## 14 DRF_1_AutoML_20200521_151236 0.1262736
## 15 DRF_1_AutoML_20200521_172641 0.1262736
## 16 DRF_1_AutoML_20200521_173507 0.1262736
## 17 DRF_1_AutoML_20200521_150555 0.1262736
## 18 DRF_1_AutoML_20200521_171828 0.1264583
## 19 StackedEnsemble_AllModels_AutoML_20200521_143507 0.1273468
## 20 StackedEnsemble_BestOfFamily_AutoML_20200521_143507 0.1273468
## 21 XRT_1_AutoML_20200521_173507 0.1278341
## 22 XRT_1_AutoML_20200521_171828 0.1278341
## 23 XRT_1_AutoML_20200521_150555 0.1278341
## 24 XRT_1_AutoML_20200521_151236 0.1278341
## 25 XRT_1_AutoML_20200521_145637 0.1278341
## 26 XRT_1_AutoML_20200521_172641 0.1278341
## 27 DRF_1_AutoML_20200521_143507 0.1294502
## 28 XRT_1_AutoML_20200521_143507 0.1298912
## 29 GBM_4_AutoML_20200521_150555 0.1302949
## 30 GBM_4_AutoML_20200521_151236 0.1302949
## 31 GBM_4_AutoML_20200521_172641 0.1302949
## 32 GBM_4_AutoML_20200521_173507 0.1302949
## 33 GBM_4_AutoML_20200521_145637 0.1302949
## 34 GBM_4_AutoML_20200521_171828 0.1302949
## 35 GBM_4_AutoML_20200521_143507 0.1320602
## 36 GBM_3_AutoML_20200521_171828 0.1365176
## 37 GBM_3_AutoML_20200521_150555 0.1365176
## 38 GBM_3_AutoML_20200521_151236 0.1365176
## 39 GBM_3_AutoML_20200521_173507 0.1365176
## 40 GBM_3_AutoML_20200521_172641 0.1365176
## 41 GBM_3_AutoML_20200521_145637 0.1365176
## 42 GBM_grid__1_AutoML_20200521_173507_model_1 0.1377044
## 43 GBM_grid__1_AutoML_20200521_150555_model_1 0.1377044
## 44 GBM_grid__1_AutoML_20200521_151236_model_1 0.1377044
## 45 GBM_grid__1_AutoML_20200521_145637_model_1 0.1377044
## 46 GBM_grid__1_AutoML_20200521_171828_model_1 0.1377044
## 47 GBM_grid__1_AutoML_20200521_172641_model_1 0.1377044
## 48 GBM_grid__1_AutoML_20200521_143507_model_1 0.1377534
## 49 GBM_3_AutoML_20200521_143507 0.1389048
## 50 GBM_1_AutoML_20200521_171828 0.1441560
## 51 GBM_1_AutoML_20200521_172641 0.1441560
## 52 GBM_1_AutoML_20200521_173507 0.1441560
## 53 GBM_1_AutoML_20200521_151236 0.1441560
## 54 GBM_1_AutoML_20200521_150555 0.1441560
## 55 GBM_1_AutoML_20200521_145637 0.1441560
## 56 GBM_2_AutoML_20200521_143507 0.1442442
## 57 GBM_2_AutoML_20200521_151236 0.1445701
## 58 GBM_2_AutoML_20200521_171828 0.1445701
## 59 GBM_2_AutoML_20200521_145637 0.1445701
## 60 GBM_2_AutoML_20200521_150555 0.1445701
## 61 GBM_2_AutoML_20200521_173507 0.1445701
## 62 GBM_2_AutoML_20200521_172641 0.1445701
## 63 GBM_5_AutoML_20200521_143507 0.1463852
## 64 GBM_5_AutoML_20200521_171828 0.1470363
## 65 GBM_5_AutoML_20200521_145637 0.1470363
## 66 GBM_5_AutoML_20200521_172641 0.1470363
## 67 GBM_5_AutoML_20200521_173507 0.1470363
## 68 GBM_5_AutoML_20200521_150555 0.1470363
## 69 GBM_5_AutoML_20200521_151236 0.1470363
## 70 GBM_1_AutoML_20200521_143507 0.1476148
## 71 DeepLearning_1_AutoML_20200521_150555 0.2606408
## 72 DeepLearning_1_AutoML_20200521_173507 0.2607621
## 73 DeepLearning_1_AutoML_20200521_171828 0.2613775
## 74 DeepLearning_1_AutoML_20200521_151236 0.2623785
## 75 DeepLearning_1_AutoML_20200521_145637 0.2627400
## 76 DeepLearning_1_AutoML_20200521_172641 0.2653141
## 77 DeepLearning_1_AutoML_20200521_143507 0.2669415
## 78 GLM_1_AutoML_20200521_143507 0.3137547
## 79 GLM_1_AutoML_20200521_151236 0.3146925
## 80 GLM_1_AutoML_20200521_172641 0.3146925
## 81 GLM_1_AutoML_20200521_145637 0.3146925
## 82 GLM_1_AutoML_20200521_173507 0.3146925
## 83 GLM_1_AutoML_20200521_171828 0.3146925
## 84 GLM_1_AutoML_20200521_150555 0.3146925
## rmse mse mae rmsle
## 1 0.3528744 0.1245203 0.2359063 0.06283139
## 2 0.3528744 0.1245203 0.2359063 0.06283139
## 3 0.3528744 0.1245203 0.2359063 0.06283139
## 4 0.3528744 0.1245203 0.2359063 0.06283139
## 5 0.3528744 0.1245203 0.2359063 0.06283139
## 6 0.3528744 0.1245203 0.2359063 0.06283139
## 7 0.3528744 0.1245203 0.2359063 0.06283139
## 8 0.3529244 0.1245556 0.2359408 0.06284026
## 9 0.3529244 0.1245556 0.2359408 0.06284026
## 10 0.3529253 0.1245563 0.2359414 0.06284044
## 11 0.3529253 0.1245563 0.2359414 0.06284044
## 12 0.3529253 0.1245563 0.2359414 0.06284044
## 13 0.3553500 0.1262736 0.2352619 0.06307621
## 14 0.3553500 0.1262736 0.2352619 0.06307621
## 15 0.3553500 0.1262736 0.2352619 0.06307621
## 16 0.3553500 0.1262736 0.2352619 0.06307621
## 17 0.3553500 0.1262736 0.2352619 0.06307621
## 18 0.3556098 0.1264583 0.2354422 0.06312241
## 19 0.3568568 0.1273468 0.2389199 0.06349190
## 20 0.3568568 0.1273468 0.2389199 0.06349190
## 21 0.3575390 0.1278341 0.2363905 0.06345866
## 22 0.3575390 0.1278341 0.2363905 0.06345866
## 23 0.3575390 0.1278341 0.2363905 0.06345866
## 24 0.3575390 0.1278341 0.2363905 0.06345866
## 25 0.3575390 0.1278341 0.2363905 0.06345866
## 26 0.3575390 0.1278341 0.2363905 0.06345866
## 27 0.3597919 0.1294502 0.2382631 0.06379927
## 28 0.3604042 0.1298912 0.2375793 0.06390898
## 29 0.3609638 0.1302949 0.2424119 0.06441495
## 30 0.3609638 0.1302949 0.2424119 0.06441495
## 31 0.3609638 0.1302949 0.2424119 0.06441495
## 32 0.3609638 0.1302949 0.2424119 0.06441495
## 33 0.3609638 0.1302949 0.2424119 0.06441495
## 34 0.3609638 0.1302949 0.2424119 0.06441495
## 35 0.3634009 0.1320602 0.2432499 0.06476702
## 36 0.3694829 0.1365176 0.2484454 0.06585417
## 37 0.3694829 0.1365176 0.2484454 0.06585417
## 38 0.3694829 0.1365176 0.2484454 0.06585417
## 39 0.3694829 0.1365176 0.2484454 0.06585417
## 40 0.3694829 0.1365176 0.2484454 0.06585417
## 41 0.3694829 0.1365176 0.2484454 0.06585417
## 42 0.3710854 0.1377044 0.2484933 0.06595956
## 43 0.3710854 0.1377044 0.2484933 0.06595956
## 44 0.3710854 0.1377044 0.2484933 0.06595956
## 45 0.3710854 0.1377044 0.2484933 0.06595956
## 46 0.3710854 0.1377044 0.2484933 0.06595956
## 47 0.3710854 0.1377044 0.2484933 0.06595956
## 48 0.3711514 0.1377534 0.2487664 0.06595801
## 49 0.3726994 0.1389048 0.2508116 0.06644421
## 50 0.3796788 0.1441560 0.2569958 0.06772334
## 51 0.3796788 0.1441560 0.2569958 0.06772334
## 52 0.3796788 0.1441560 0.2569958 0.06772334
## 53 0.3796788 0.1441560 0.2569958 0.06772334
## 54 0.3796788 0.1441560 0.2569958 0.06772334
## 55 0.3796788 0.1441560 0.2569958 0.06772334
## 56 0.3797950 0.1442442 0.2564710 0.06764931
## 57 0.3802237 0.1445701 0.2571722 0.06774982
## 58 0.3802237 0.1445701 0.2571722 0.06774982
## 59 0.3802237 0.1445701 0.2571722 0.06774982
## 60 0.3802237 0.1445701 0.2571722 0.06774982
## 61 0.3802237 0.1445701 0.2571722 0.06774982
## 62 0.3802237 0.1445701 0.2571722 0.06774982
## 63 0.3826032 0.1463852 0.2610442 0.06815408
## 64 0.3834531 0.1470363 0.2615666 0.06834008
## 65 0.3834531 0.1470363 0.2615666 0.06834008
## 66 0.3834531 0.1470363 0.2615666 0.06834008
## 67 0.3834531 0.1470363 0.2615666 0.06834008
## 68 0.3834531 0.1470363 0.2615666 0.06834008
## 69 0.3834531 0.1470363 0.2615666 0.06834008
## 70 0.3842067 0.1476148 0.2593622 0.06840914
## 71 0.5105299 0.2606408 0.3596753 0.09108533
## 72 0.5106487 0.2607621 0.3580805 0.09098965
## 73 0.5112509 0.2613775 0.3609867 0.09116738
## 74 0.5122289 0.2623785 0.3671524 0.09173722
## 75 0.5125817 0.2627400 0.3620742 0.09149177
## 76 0.5150864 0.2653141 0.3663252 0.09189685
## 77 0.5166638 0.2669415 0.3675624 0.09218277
## 78 0.5601381 0.3137547 0.3948662 0.09977844
## 79 0.5609746 0.3146925 0.3948982 0.09993106
## 80 0.5609746 0.3146925 0.3948982 0.09993106
## 81 0.5609746 0.3146925 0.3948982 0.09993106
## 82 0.5609746 0.3146925 0.3948982 0.09993106
## 83 0.5609746 0.3146925 0.3948982 0.09993106
## 84 0.5609746 0.3146925 0.3948982 0.09993106
##
## [84 rows x 6 columns]
Performance scores on test data
# Score the leader model on the held-out test frame. The response in that
# frame is log_Appliances, so the reported errors are on the log scale.
perf <- h2o.performance(aml@leader, test_frame)
perf
## H2ORegressionMetrics: stackedensemble
##
## MSE: 0.1270943
## RMSE: 0.3565028
## MAE: 0.2376857
## RMSLE: 0.06355658
## Mean Residual Deviance : 0.1270943
Predictions (H2O AutoML uses the leader model)
# Predict the log-scale response for every row of the test frame.
pred <- h2o.predict(aml, test_frame)
##
|
| | 0%
|
|======================================================================| 100%
# Print a preview of the predictions (still on the log scale).
pred
## predict
## 1 4.006523
## 2 4.022654
## 3 4.815933
## 4 5.024039
## 5 4.707223
## 6 4.579227
##
## [4932 rows x 1 column]
# Undo the log transform and pull the predictions out of the H2O cluster into
# a plain R vector, then wrap them in a one-column data frame.
predictions <- as.vector(exp(pred))
df_result <- data.frame(predictions)
head(df_result)
# Persist the leader model in the current working directory.
h2o.saveModel(object = aml@leader, path = getwd())
## [1] "C:\\FCD\\04.MachineLearning\\Projetos\\Projeto08\\StackedEnsemble_BestOfFamily_AutoML_20200521_151236"
# Write the back-transformed predictions to predictions.csv in the working
# directory (row names are written too, as write.csv does by default).
write.csv(df_result, "predictions.csv")