Loading packages

library(caret)
library(ggplot2)
library(knitr)
library(plyr)
library(dplyr)
library(corrplot)
library(plotly)
library(randomForest)
library(h2o)

Loading datasets

df_train <- read.csv("projeto8-training.csv")
df_test <- read.csv("projeto8-testing.csv")

Merging test and train dataframes

df_train$label <- "train" # creating column for label
df_test$label <- "test"
df_merged <- rbind(df_train, df_test)

Showing some information about the data

str(df_merged)
## 'data.frame':    19735 obs. of  33 variables:
##  $ date       : Factor w/ 19735 levels "2016-01-11 17:00:00",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Appliances : int  60 60 50 60 50 60 60 70 430 250 ...
##  $ lights     : int  30 30 30 40 40 50 40 40 50 40 ...
##  $ T1         : num  19.9 19.9 19.9 19.9 19.9 ...
##  $ RH_1       : num  47.6 46.7 46.3 46.3 46 ...
##  $ T2         : num  19.2 19.2 19.2 19.2 19.2 ...
##  $ RH_2       : num  44.8 44.7 44.6 44.5 44.5 ...
##  $ T3         : num  19.8 19.8 19.8 19.8 19.8 ...
##  $ RH_3       : num  44.7 44.8 44.9 45 44.9 ...
##  $ T4         : num  19 19 18.9 18.9 18.9 ...
##  $ RH_4       : num  45.6 46 45.9 45.5 45.7 ...
##  $ T5         : num  17.2 17.2 17.2 17.2 17.1 ...
##  $ RH_5       : num  55.2 55.2 55.1 55.1 55 ...
##  $ T6         : num  7.03 6.83 6.56 6.37 6.3 ...
##  $ RH_6       : num  84.3 84.1 83.2 84.9 85.8 ...
##  $ T7         : num  17.2 17.2 17.2 17.2 17.1 ...
##  $ RH_7       : num  41.6 41.6 41.4 41.2 41.3 ...
##  $ T8         : num  18.2 18.2 18.2 18.1 18.1 ...
##  $ RH_8       : num  48.9 48.9 48.7 48.6 48.6 ...
##  $ T9         : num  17 17.1 17 17 17 ...
##  $ RH_9       : num  45.5 45.6 45.5 45.4 45.3 ...
##  $ T_out      : num  6.6 6.48 6.37 6.13 6.02 ...
##  $ Press_mm_hg: num  734 734 734 734 734 ...
##  $ RH_out     : num  92 92 92 92 92 ...
##  $ Windspeed  : num  7 6.67 6.33 5.67 5.33 ...
##  $ Visibility : num  63 59.2 55.3 47.7 43.8 ...
##  $ Tdewpoint  : num  5.3 5.2 5.1 4.9 4.8 ...
##  $ rv1        : num  13.3 18.6 28.6 10.1 44.9 ...
##  $ rv2        : num  13.3 18.6 28.6 10.1 44.9 ...
##  $ NSM        : int  61200 61800 62400 63600 64200 65400 66000 66600 68400 69000 ...
##  $ WeekStatus : Factor w/ 2 levels "Weekday","Weekend": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Day_of_week: Factor w/ 7 levels "Friday","Monday",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ label      : chr  "train" "train" "train" "train" ...
summary(df_merged)
##                   date         Appliances          lights             T1       
##  2016-01-11 17:00:00:    1   Min.   :  10.00   Min.   : 0.000   Min.   :16.79  
##  2016-01-11 17:10:00:    1   1st Qu.:  50.00   1st Qu.: 0.000   1st Qu.:20.76  
##  2016-01-11 17:20:00:    1   Median :  60.00   Median : 0.000   Median :21.60  
##  2016-01-11 17:40:00:    1   Mean   :  97.69   Mean   : 3.802   Mean   :21.69  
##  2016-01-11 17:50:00:    1   3rd Qu.: 100.00   3rd Qu.: 0.000   3rd Qu.:22.60  
##  2016-01-11 18:10:00:    1   Max.   :1080.00   Max.   :70.000   Max.   :26.26  
##  (Other)            :19729                                                     
##       RH_1             T2             RH_2             T3       
##  Min.   :27.02   Min.   :16.10   Min.   :20.46   Min.   :17.20  
##  1st Qu.:37.33   1st Qu.:18.79   1st Qu.:37.90   1st Qu.:20.79  
##  Median :39.66   Median :20.00   Median :40.50   Median :22.10  
##  Mean   :40.26   Mean   :20.34   Mean   :40.42   Mean   :22.27  
##  3rd Qu.:43.07   3rd Qu.:21.50   3rd Qu.:43.26   3rd Qu.:23.29  
##  Max.   :63.36   Max.   :29.86   Max.   :56.03   Max.   :29.24  
##                                                                 
##       RH_3             T4             RH_4             T5       
##  Min.   :28.77   Min.   :15.10   Min.   :27.66   Min.   :15.33  
##  1st Qu.:36.90   1st Qu.:19.53   1st Qu.:35.53   1st Qu.:18.28  
##  Median :38.53   Median :20.67   Median :38.40   Median :19.39  
##  Mean   :39.24   Mean   :20.86   Mean   :39.03   Mean   :19.59  
##  3rd Qu.:41.76   3rd Qu.:22.10   3rd Qu.:42.16   3rd Qu.:20.62  
##  Max.   :50.16   Max.   :26.20   Max.   :51.09   Max.   :25.80  
##                                                                 
##       RH_5             T6              RH_6             T7       
##  Min.   :29.82   Min.   :-6.065   Min.   : 1.00   Min.   :15.39  
##  1st Qu.:45.40   1st Qu.: 3.627   1st Qu.:30.02   1st Qu.:18.70  
##  Median :49.09   Median : 7.300   Median :55.29   Median :20.03  
##  Mean   :50.95   Mean   : 7.911   Mean   :54.61   Mean   :20.27  
##  3rd Qu.:53.66   3rd Qu.:11.256   3rd Qu.:83.23   3rd Qu.:21.60  
##  Max.   :96.32   Max.   :28.290   Max.   :99.90   Max.   :26.00  
##                                                                  
##       RH_7             T8             RH_8             T9       
##  Min.   :23.20   Min.   :16.31   Min.   :29.60   Min.   :14.89  
##  1st Qu.:31.50   1st Qu.:20.79   1st Qu.:39.07   1st Qu.:18.00  
##  Median :34.86   Median :22.10   Median :42.38   Median :19.39  
##  Mean   :35.39   Mean   :22.03   Mean   :42.94   Mean   :19.49  
##  3rd Qu.:39.00   3rd Qu.:23.39   3rd Qu.:46.54   3rd Qu.:20.60  
##  Max.   :51.40   Max.   :27.23   Max.   :58.78   Max.   :24.50  
##                                                                 
##       RH_9           T_out         Press_mm_hg        RH_out      
##  Min.   :29.17   Min.   :-5.000   Min.   :729.3   Min.   : 24.00  
##  1st Qu.:38.50   1st Qu.: 3.667   1st Qu.:750.9   1st Qu.: 70.33  
##  Median :40.90   Median : 6.917   Median :756.1   Median : 83.67  
##  Mean   :41.55   Mean   : 7.412   Mean   :755.5   Mean   : 79.75  
##  3rd Qu.:44.34   3rd Qu.:10.408   3rd Qu.:760.9   3rd Qu.: 91.67  
##  Max.   :53.33   Max.   :26.100   Max.   :772.3   Max.   :100.00  
##                                                                   
##    Windspeed        Visibility      Tdewpoint           rv1          
##  Min.   : 0.000   Min.   : 1.00   Min.   :-6.600   Min.   : 0.00532  
##  1st Qu.: 2.000   1st Qu.:29.00   1st Qu.: 0.900   1st Qu.:12.49789  
##  Median : 3.667   Median :40.00   Median : 3.433   Median :24.89765  
##  Mean   : 4.040   Mean   :38.33   Mean   : 3.761   Mean   :24.98803  
##  3rd Qu.: 5.500   3rd Qu.:40.00   3rd Qu.: 6.567   3rd Qu.:37.58377  
##  Max.   :14.000   Max.   :66.00   Max.   :15.500   Max.   :49.99653  
##                                                                      
##       rv2                NSM          WeekStatus       Day_of_week  
##  Min.   : 0.00532   Min.   :    0   Weekday:14263   Friday   :2845  
##  1st Qu.:12.49789   1st Qu.:21600   Weekend: 5472   Monday   :2778  
##  Median :24.89765   Median :43200                   Saturday :2736  
##  Mean   :24.98803   Mean   :42907                   Sunday   :2736  
##  3rd Qu.:37.58377   3rd Qu.:64200                   Thursday :2880  
##  Max.   :49.99653   Max.   :85800                   Tuesday  :2880  
##                                                     Wednesday:2880  
##     label          
##  Length:19735      
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
df_merged$date <- as.POSIXct(df_merged$date) # convert date from factor to POSIXct date/time
sapply(df_merged, function(x) sum(is.na(x))) # checking for missing values
##        date  Appliances      lights          T1        RH_1          T2 
##           0           0           0           0           0           0 
##        RH_2          T3        RH_3          T4        RH_4          T5 
##           0           0           0           0           0           0 
##        RH_5          T6        RH_6          T7        RH_7          T8 
##           0           0           0           0           0           0 
##        RH_8          T9        RH_9       T_out Press_mm_hg      RH_out 
##           0           0           0           0           0           0 
##   Windspeed  Visibility   Tdewpoint         rv1         rv2         NSM 
##           0           0           0           0           0           0 
##  WeekStatus Day_of_week       label 
##           0           0           0
num_cols_df <- df_merged[,-c(1,31,32,33)] # numeric features (drop date, WeekStatus, Day_of_week, label)
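An index-free alternative selects the numeric columns by type rather than by position; it is equivalent here and more robust if the column order ever changes:

num_cols_df <- df_merged[sapply(df_merged, is.numeric)] # same selection, no hard-coded positions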

Exploratory Analysis

Target variable ‘Appliances’

hist(df_merged$Appliances, breaks = 30,  col='darkblue',
     xlab="Energy consumption", main = "Frequencies - Energy consumption")

Unique values of Appliances

sort(unique(df_merged$Appliances))
##  [1]   10   20   30   40   50   60   70   80   90  100  110  120  130  140  150
## [16]  160  170  180  190  200  210  220  230  240  250  260  270  280  290  300
## [31]  310  320  330  340  350  360  370  380  390  400  410  420  430  440  450
## [46]  460  470  480  490  500  510  520  530  540  550  560  570  580  590  600
## [61]  610  620  630  640  650  660  670  680  690  700  710  720  730  740  750
## [76]  760  770  780  790  800  820  830  840  850  860  870  880  890  900  910
## [91] 1070 1080

Frequency table for the most common range (consumption binned in 10 Wh steps up to 201 Wh)

kable(table(cut(df_merged$Appliances, breaks=seq(1, 201, 10))), align = 'c')
Var1        Freq
(1,11]         9
(11,21]      343
(21,31]      723
(31,41]     2019
(41,51]     4368
(51,61]     3282
(61,71]     1560
(71,81]     1205
(81,91]     1015
(91,101]     978
(101,111]    736
(111,121]    502
(121,131]    330
(131,141]    223
(141,151]    144
(151,161]     90
(161,171]     70
(171,181]     78
(181,191]     86
(191,201]     58

Time series plot of energy consumption - each measured point

plot_ly(x = df_merged$date , y = df_merged$Appliances, type="scatter", mode="markers")

We can see the target variable is right-skewed: most readings sit between roughly 40 and 100 Wh, with a long tail of high-consumption spikes. Let's keep an eye on that for a later transformation.
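To back the visual impression with a number, here is a quick moment-based skewness check, written inline to avoid adding a package such as e1071 (a positive value indicates a right skew):

skew <- function(x) mean((x - mean(x))^3) / sd(x)^3 # moment-based sample skewness
skew(df_merged$Appliances)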

Correlation plot

m_cor <- cor(num_cols_df)
corrplot(m_cor, method = "square",tl.col = "black")

Here we see that the temperatures are positively correlated with each other, and so are the humidity readings. RH_6 behaves differently, correlating negatively with the temperatures; it probably corresponds to a part of the house more exposed to outdoor conditions than the others. Also note that rv1 and rv2 are perfectly correlated; they appear to be identical. Let's check that later and remove one of them if applicable.
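As a programmatic complement to the plot, caret's findCorrelation flags columns involved in highly correlated pairs; the 0.95 cutoff below is an assumption, chosen only to surface near-duplicates such as rv1/rv2:

findCorrelation(m_cor, cutoff = 0.95, names = TRUE) # columns suggested for removal at |r| > 0.95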

Temperatures frequencies

hist(df_merged$T1, breaks = 30,  col='darkred', xlab = "", main = "T1")

hist(df_merged$T2, breaks = 30,  col='darkred', xlab = "", main = "T2")

hist(df_merged$T3, breaks = 30,  col='darkred', xlab = "", main = "T3")

hist(df_merged$T4, breaks = 30,  col='darkred', xlab = "", main = "T4")

hist(df_merged$T5, breaks = 30,  col='darkred', xlab = "", main = "T5")

hist(df_merged$T6, breaks = 30,  col='darkred', xlab = "", main = "T6")

hist(df_merged$T7, breaks = 30,  col='darkred', xlab = "", main = "T7")

hist(df_merged$T8, breaks = 30,  col='darkred', xlab = "", main = "T8")

hist(df_merged$T9, breaks = 30,  col='darkred', xlab = "", main = "T9")

RH - Humidity histograms

hist(df_merged$RH_1, breaks = 30,  col='#0c5069', xlab = "", main = "RH_1")

hist(df_merged$RH_2, breaks = 30,  col='#0c5069', xlab = "", main = "RH_2")

hist(df_merged$RH_3, breaks = 30,  col='#0c5069', xlab = "", main = "RH_3")

hist(df_merged$RH_4, breaks = 30,  col='#0c5069', xlab = "", main = "RH_4")

hist(df_merged$RH_5, breaks = 30,  col='#0c5069', xlab = "", main = "RH_5")

hist(df_merged$RH_6, breaks = 30,  col='#0c5069', xlab = "", main = "RH_6")

hist(df_merged$RH_7, breaks = 30,  col='#0c5069', xlab = "", main = "RH_7")

hist(df_merged$RH_8, breaks = 30,  col='#0c5069', xlab = "", main = "RH_8")

hist(df_merged$RH_9, breaks = 30,  col='#0c5069', xlab = "", main = "RH_9")

Histograms of T_out by WeekStatus

weekend <- df_merged %>% 
  filter(WeekStatus == "Weekend")
weekday <- df_merged %>% 
  filter(WeekStatus == "Weekday")
hist(weekend$T_out, ylim = c(0,1300), breaks=30, col=rgb(1,0,0,0.5), xlab="Outside temperature", 
     ylab="Frequency", main="Distribution of outside temperatures: weekend vs. weekday" )
hist(weekday$T_out, ylim = c(0,1300) ,breaks=30, col=rgb(0,0,1,0.5), add=T)
legend("topright", legend=c("weekend","weekday"), col=c(rgb(1,0,0,0.5), 
                                                      rgb(0,0,1,0.5)), pt.cex=2, pch=15)
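A numeric companion to the overlaid histograms, just grouped summaries rather than a formal test:

tapply(df_merged$T_out, df_merged$WeekStatus, summary) # per-group summaries of outside temperature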

Examining the dew point (Tdewpoint)

dewp_time <- df_merged %>% 
  group_by(date = as.Date(date)) %>%
  summarize(avg_dewp = mean(Tdewpoint)) # daily mean dew point

plot_ly(x = dewp_time$date, y = dewp_time$avg_dewp, type = "bar",
        marker = list(color = "red")) # marker= sets the bar colour; color= would be treated as a grouping variable and raise RColorBrewer warnings
Lights - we see most records are 0 Wh

ggplot(data=df_merged, aes(x=lights)) +
  geom_bar(fill= '#E69F00') + 
  ylab("Count") + xlab("Lights (Wh)") + 
  ggtitle("Lights by Wh")
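To quantify the visual impression from the bar chart:

mean(df_merged$lights == 0) # fraction of observations where the lights submeter reads 0 Wh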

Feature engineering

Checking and removing identical features

features_pair <- combn(names(num_cols_df), 2, simplify = F)
toRemove <- c()
for(pair in features_pair) {
  f1 <- pair[1]
  f2 <- pair[2]
  
  if (!(f1 %in% toRemove) & !(f2 %in% toRemove)) {
    if (all(num_cols_df[[f1]] == num_cols_df[[f2]])) {
      cat(f1, "and", f2, "are identical.\n")
      toRemove <- c(toRemove, f2)
    }
  }
}
## rv1 and rv2 are identical.
df_merged <- subset(df_merged, select = -c(rv2)) # Removing rv2 

Log-transform to target variable

log_Appliances <- log(df_merged$Appliances)

hist(log_Appliances, breaks = 30,  col='darkblue',
     xlab="log(Energy consumption)", main = "Frequencies - log(Energy consumption)")

A huge difference from the original right-skewed histogram.
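Re-running the skewness helper defined earlier on the log values quantifies the improvement:

skew(log_Appliances) # should now be much closer to 0 than the raw series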

Normalize data

num_cols_df <- subset(num_cols_df, select = -c(Appliances, rv2))
scaled_num_cols <- scale(num_cols_df) # center to mean 0 and scale to sd 1 (z-score)
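One detail worth keeping in mind: scale() stores the constants it used as attributes, which would let us apply the identical transform to any future data:

centers <- attr(scaled_num_cols, "scaled:center") # column means used for centering
sds     <- attr(scaled_num_cols, "scaled:scale")  # column standard deviations used for scaling
head(centers); head(sds)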

One-hot encoding - WeekStatus feature

categorical_f <- subset(df_merged, select = c(WeekStatus))
dmy <- dummyVars(" ~ .", data = categorical_f)
categorical_enc <- data.frame(predict(dmy, newdata = categorical_f))
head(categorical_enc)
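Since WeekStatus has only two levels, encoding both produces perfectly collinear columns. If that matters for a downstream model, dummyVars accepts fullRank = TRUE to drop one level (shown as an alternative, not used below):

dmy_fr <- dummyVars(" ~ .", data = categorical_f, fullRank = TRUE) # full-rank encoding
head(data.frame(predict(dmy_fr, newdata = categorical_f)))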

Feature Selection

# scale() returns a matrix, so bind the response and predictors into a data.frame;
# the model object is named rf_model to avoid masking randomForest::importance()
rf_model <- randomForest(log_Appliances ~ ., 
                         data = data.frame(log_Appliances, scaled_num_cols), 
                         ntree = 100, 
                         nodesize = 10, importance = TRUE)

varImpPlot(rf_model)

Interesting! NSM is considered the most valuable feature. The two measures are %IncMSE (the increase in mean squared error when the variable's values are permuted) and IncNodePurity (the total decrease in node impurity across all splits). Let's now choose the best features based on this analysis.
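To see the exact numbers behind the plot, the importance matrix can be ranked directly (using the rf_model object fit above):

imp <- importance(rf_model) # matrix with %IncMSE and IncNodePurity columns
imp[order(imp[, "%IncMSE"], decreasing = TRUE), ]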

Removing the variables rv1, T6, RH_6, T_out, RH_out and Visibility, since they showed the least importance for predicting energy consumption.

scaled_num_cols <- subset(scaled_num_cols, select = -c(rv1, T6, RH_6, T_out, RH_out, Visibility))
df_final <- cbind(log_Appliances, scaled_num_cols, categorical_enc, df_merged['label'])
head(df_final)

Machine learning modeling

Train and test split

train_d <- df_final[df_final$label == 'train', !names(df_final) %in% c("label")] 
test_d <- df_final[df_final$label == 'test', !names(df_final) %in% c("label")] 
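A quick sanity check that the split matches the original files (19735 rows in total, of which 4932 belong to the test set):

dim(train_d) # expect 14803 rows
dim(test_d)  # expect 4932 rows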

Create ML model

I’ll use H2O AutoML here. It automates the machine learning workflow, training and tuning many algorithms, including Stacked Ensemble models. For more information I recommend the documentation at the following link: http://docs.h2o.ai/h2o/latest-stable/h2o-docs/automl.html

Initializing H2O and converting the data to H2OFrame objects

h2o.init()
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         3 hours 4 seconds 
##     H2O cluster timezone:       -03:00 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.30.0.1 
##     H2O cluster version age:    1 month and 17 days  
##     H2O cluster name:           H2O_started_from_R_joaop_cdv303 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   1.22 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  4 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Amazon S3, Algos, AutoML, Core V3, TargetEncoder, Core V4 
##     R Version:                  R version 3.6.1 (2019-07-05)
train_frame <- as.h2o(train_d)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
test_frame <- as.h2o(test_d)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
y <- "log_Appliances"

When using H2O AutoML, all columns of ‘training_frame’ except the response are treated as predictors, so we can skip setting the x argument explicitly. Here I’m using the ‘max_models’ parameter to set the maximum number of models to train, excluding the Stacked Ensemble models; you can also cap the total training time with the ‘max_runtime_secs’ parameter (with max_models set, the default is no time limit).
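For reference, the fully explicit form would list the predictors via x; the following is equivalent to omitting it:

x <- setdiff(names(train_d), y) # every column except the response
# passing x = x to h2o.automl() below would behave identically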

aml <- h2o.automl(y = y,
                  training_frame = train_frame,
                  max_models = 10,
                  max_runtime_secs = 1200,
                  seed = 36,
                  keep_cross_validation_predictions = TRUE,
                  project_name = "energy_cons_lb_frame")
## 
  |                                                                            
  |                                                                      |   0%
## 14:35:07.839: AutoML: XGBoost is not available; skipping it.
## 14:56:37.457: New models will be added to existing leaderboard energy_cons_lb_frame@@log_Appliances (leaderboard frame=null) with already 12 models.
## 14:56:37.458: AutoML: XGBoost is not available; skipping it.
## 15:05:55.684: New models will be added to existing leaderboard energy_cons_lb_frame@@log_Appliances (leaderboard frame=null) with already 24 models.
## 15:05:55.684: AutoML: XGBoost is not available; skipping it.
## 15:12:36.638: New models will be added to existing leaderboard energy_cons_lb_frame@@log_Appliances (leaderboard frame=null) with already 36 models.
## 15:12:36.639: AutoML: XGBoost is not available; skipping it.
## 17:18:28.293: New models will be added to existing leaderboard energy_cons_lb_frame@@log_Appliances (leaderboard frame=null) with already 48 models.
## 17:18:28.318: AutoML: XGBoost is not available; skipping it.
## 17:26:41.394: New models will be added to existing leaderboard energy_cons_lb_frame@@log_Appliances (leaderboard frame=null) with already 60 models.
## 17:26:41.395: AutoML: XGBoost is not available; skipping it.
## 17:35:07.330: New models will be added to existing leaderboard energy_cons_lb_frame@@log_Appliances (leaderboard frame=null) with already 72 models.
## 17:35:07.331: AutoML: XGBoost is not available; skipping it.
  |                                                                            
  |======================================================================| 100%

View the AutoML Leaderboard

Note: because the run reuses the project_name "energy_cons_lb_frame", models from earlier runs accumulate on the same leaderboard (see the "New models will be added to existing leaderboard" messages above), which is why far more than 10 models appear below.

lb <- aml@leaderboard
print(lb, n = nrow(lb))
##                                               model_id mean_residual_deviance
## 1  StackedEnsemble_BestOfFamily_AutoML_20200521_151236              0.1245203
## 2     StackedEnsemble_AllModels_AutoML_20200521_145637              0.1245203
## 3  StackedEnsemble_BestOfFamily_AutoML_20200521_171828              0.1245203
## 4  StackedEnsemble_BestOfFamily_AutoML_20200521_172641              0.1245203
## 5  StackedEnsemble_BestOfFamily_AutoML_20200521_173507              0.1245203
## 6  StackedEnsemble_BestOfFamily_AutoML_20200521_150555              0.1245203
## 7  StackedEnsemble_BestOfFamily_AutoML_20200521_145637              0.1245203
## 8     StackedEnsemble_AllModels_AutoML_20200521_150555              0.1245556
## 9     StackedEnsemble_AllModels_AutoML_20200521_151236              0.1245556
## 10    StackedEnsemble_AllModels_AutoML_20200521_171828              0.1245563
## 11    StackedEnsemble_AllModels_AutoML_20200521_173507              0.1245563
## 12    StackedEnsemble_AllModels_AutoML_20200521_172641              0.1245563
## 13                        DRF_1_AutoML_20200521_145637              0.1262736
## 14                        DRF_1_AutoML_20200521_151236              0.1262736
## 15                        DRF_1_AutoML_20200521_172641              0.1262736
## 16                        DRF_1_AutoML_20200521_173507              0.1262736
## 17                        DRF_1_AutoML_20200521_150555              0.1262736
## 18                        DRF_1_AutoML_20200521_171828              0.1264583
## 19    StackedEnsemble_AllModels_AutoML_20200521_143507              0.1273468
## 20 StackedEnsemble_BestOfFamily_AutoML_20200521_143507              0.1273468
## 21                        XRT_1_AutoML_20200521_173507              0.1278341
## 22                        XRT_1_AutoML_20200521_171828              0.1278341
## 23                        XRT_1_AutoML_20200521_150555              0.1278341
## 24                        XRT_1_AutoML_20200521_151236              0.1278341
## 25                        XRT_1_AutoML_20200521_145637              0.1278341
## 26                        XRT_1_AutoML_20200521_172641              0.1278341
## 27                        DRF_1_AutoML_20200521_143507              0.1294502
## 28                        XRT_1_AutoML_20200521_143507              0.1298912
## 29                        GBM_4_AutoML_20200521_150555              0.1302949
## 30                        GBM_4_AutoML_20200521_151236              0.1302949
## 31                        GBM_4_AutoML_20200521_172641              0.1302949
## 32                        GBM_4_AutoML_20200521_173507              0.1302949
## 33                        GBM_4_AutoML_20200521_145637              0.1302949
## 34                        GBM_4_AutoML_20200521_171828              0.1302949
## 35                        GBM_4_AutoML_20200521_143507              0.1320602
## 36                        GBM_3_AutoML_20200521_171828              0.1365176
## 37                        GBM_3_AutoML_20200521_150555              0.1365176
## 38                        GBM_3_AutoML_20200521_151236              0.1365176
## 39                        GBM_3_AutoML_20200521_173507              0.1365176
## 40                        GBM_3_AutoML_20200521_172641              0.1365176
## 41                        GBM_3_AutoML_20200521_145637              0.1365176
## 42          GBM_grid__1_AutoML_20200521_173507_model_1              0.1377044
## 43          GBM_grid__1_AutoML_20200521_150555_model_1              0.1377044
## 44          GBM_grid__1_AutoML_20200521_151236_model_1              0.1377044
## 45          GBM_grid__1_AutoML_20200521_145637_model_1              0.1377044
## 46          GBM_grid__1_AutoML_20200521_171828_model_1              0.1377044
## 47          GBM_grid__1_AutoML_20200521_172641_model_1              0.1377044
## 48          GBM_grid__1_AutoML_20200521_143507_model_1              0.1377534
## 49                        GBM_3_AutoML_20200521_143507              0.1389048
## 50                        GBM_1_AutoML_20200521_171828              0.1441560
## 51                        GBM_1_AutoML_20200521_172641              0.1441560
## 52                        GBM_1_AutoML_20200521_173507              0.1441560
## 53                        GBM_1_AutoML_20200521_151236              0.1441560
## 54                        GBM_1_AutoML_20200521_150555              0.1441560
## 55                        GBM_1_AutoML_20200521_145637              0.1441560
## 56                        GBM_2_AutoML_20200521_143507              0.1442442
## 57                        GBM_2_AutoML_20200521_151236              0.1445701
## 58                        GBM_2_AutoML_20200521_171828              0.1445701
## 59                        GBM_2_AutoML_20200521_145637              0.1445701
## 60                        GBM_2_AutoML_20200521_150555              0.1445701
## 61                        GBM_2_AutoML_20200521_173507              0.1445701
## 62                        GBM_2_AutoML_20200521_172641              0.1445701
## 63                        GBM_5_AutoML_20200521_143507              0.1463852
## 64                        GBM_5_AutoML_20200521_171828              0.1470363
## 65                        GBM_5_AutoML_20200521_145637              0.1470363
## 66                        GBM_5_AutoML_20200521_172641              0.1470363
## 67                        GBM_5_AutoML_20200521_173507              0.1470363
## 68                        GBM_5_AutoML_20200521_150555              0.1470363
## 69                        GBM_5_AutoML_20200521_151236              0.1470363
## 70                        GBM_1_AutoML_20200521_143507              0.1476148
## 71               DeepLearning_1_AutoML_20200521_150555              0.2606408
## 72               DeepLearning_1_AutoML_20200521_173507              0.2607621
## 73               DeepLearning_1_AutoML_20200521_171828              0.2613775
## 74               DeepLearning_1_AutoML_20200521_151236              0.2623785
## 75               DeepLearning_1_AutoML_20200521_145637              0.2627400
## 76               DeepLearning_1_AutoML_20200521_172641              0.2653141
## 77               DeepLearning_1_AutoML_20200521_143507              0.2669415
## 78                        GLM_1_AutoML_20200521_143507              0.3137547
## 79                        GLM_1_AutoML_20200521_151236              0.3146925
## 80                        GLM_1_AutoML_20200521_172641              0.3146925
## 81                        GLM_1_AutoML_20200521_145637              0.3146925
## 82                        GLM_1_AutoML_20200521_173507              0.3146925
## 83                        GLM_1_AutoML_20200521_171828              0.3146925
## 84                        GLM_1_AutoML_20200521_150555              0.3146925
##         rmse       mse       mae      rmsle
## 1  0.3528744 0.1245203 0.2359063 0.06283139
## 2  0.3528744 0.1245203 0.2359063 0.06283139
## 3  0.3528744 0.1245203 0.2359063 0.06283139
## 4  0.3528744 0.1245203 0.2359063 0.06283139
## 5  0.3528744 0.1245203 0.2359063 0.06283139
## 6  0.3528744 0.1245203 0.2359063 0.06283139
## 7  0.3528744 0.1245203 0.2359063 0.06283139
## 8  0.3529244 0.1245556 0.2359408 0.06284026
## 9  0.3529244 0.1245556 0.2359408 0.06284026
## 10 0.3529253 0.1245563 0.2359414 0.06284044
## 11 0.3529253 0.1245563 0.2359414 0.06284044
## 12 0.3529253 0.1245563 0.2359414 0.06284044
## 13 0.3553500 0.1262736 0.2352619 0.06307621
## 14 0.3553500 0.1262736 0.2352619 0.06307621
## 15 0.3553500 0.1262736 0.2352619 0.06307621
## 16 0.3553500 0.1262736 0.2352619 0.06307621
## 17 0.3553500 0.1262736 0.2352619 0.06307621
## 18 0.3556098 0.1264583 0.2354422 0.06312241
## 19 0.3568568 0.1273468 0.2389199 0.06349190
## 20 0.3568568 0.1273468 0.2389199 0.06349190
## 21 0.3575390 0.1278341 0.2363905 0.06345866
## 22 0.3575390 0.1278341 0.2363905 0.06345866
## 23 0.3575390 0.1278341 0.2363905 0.06345866
## 24 0.3575390 0.1278341 0.2363905 0.06345866
## 25 0.3575390 0.1278341 0.2363905 0.06345866
## 26 0.3575390 0.1278341 0.2363905 0.06345866
## 27 0.3597919 0.1294502 0.2382631 0.06379927
## 28 0.3604042 0.1298912 0.2375793 0.06390898
## 29 0.3609638 0.1302949 0.2424119 0.06441495
## 30 0.3609638 0.1302949 0.2424119 0.06441495
## 31 0.3609638 0.1302949 0.2424119 0.06441495
## 32 0.3609638 0.1302949 0.2424119 0.06441495
## 33 0.3609638 0.1302949 0.2424119 0.06441495
## 34 0.3609638 0.1302949 0.2424119 0.06441495
## 35 0.3634009 0.1320602 0.2432499 0.06476702
## 36 0.3694829 0.1365176 0.2484454 0.06585417
## 37 0.3694829 0.1365176 0.2484454 0.06585417
## 38 0.3694829 0.1365176 0.2484454 0.06585417
## 39 0.3694829 0.1365176 0.2484454 0.06585417
## 40 0.3694829 0.1365176 0.2484454 0.06585417
## 41 0.3694829 0.1365176 0.2484454 0.06585417
## 42 0.3710854 0.1377044 0.2484933 0.06595956
## 43 0.3710854 0.1377044 0.2484933 0.06595956
## 44 0.3710854 0.1377044 0.2484933 0.06595956
## 45 0.3710854 0.1377044 0.2484933 0.06595956
## 46 0.3710854 0.1377044 0.2484933 0.06595956
## 47 0.3710854 0.1377044 0.2484933 0.06595956
## 48 0.3711514 0.1377534 0.2487664 0.06595801
## 49 0.3726994 0.1389048 0.2508116 0.06644421
## 50 0.3796788 0.1441560 0.2569958 0.06772334
## 51 0.3796788 0.1441560 0.2569958 0.06772334
## 52 0.3796788 0.1441560 0.2569958 0.06772334
## 53 0.3796788 0.1441560 0.2569958 0.06772334
## 54 0.3796788 0.1441560 0.2569958 0.06772334
## 55 0.3796788 0.1441560 0.2569958 0.06772334
## 56 0.3797950 0.1442442 0.2564710 0.06764931
## 57 0.3802237 0.1445701 0.2571722 0.06774982
## 58 0.3802237 0.1445701 0.2571722 0.06774982
## 59 0.3802237 0.1445701 0.2571722 0.06774982
## 60 0.3802237 0.1445701 0.2571722 0.06774982
## 61 0.3802237 0.1445701 0.2571722 0.06774982
## 62 0.3802237 0.1445701 0.2571722 0.06774982
## 63 0.3826032 0.1463852 0.2610442 0.06815408
## 64 0.3834531 0.1470363 0.2615666 0.06834008
## 65 0.3834531 0.1470363 0.2615666 0.06834008
## 66 0.3834531 0.1470363 0.2615666 0.06834008
## 67 0.3834531 0.1470363 0.2615666 0.06834008
## 68 0.3834531 0.1470363 0.2615666 0.06834008
## 69 0.3834531 0.1470363 0.2615666 0.06834008
## 70 0.3842067 0.1476148 0.2593622 0.06840914
## 71 0.5105299 0.2606408 0.3596753 0.09108533
## 72 0.5106487 0.2607621 0.3580805 0.09098965
## 73 0.5112509 0.2613775 0.3609867 0.09116738
## 74 0.5122289 0.2623785 0.3671524 0.09173722
## 75 0.5125817 0.2627400 0.3620742 0.09149177
## 76 0.5150864 0.2653141 0.3663252 0.09189685
## 77 0.5166638 0.2669415 0.3675624 0.09218277
## 78 0.5601381 0.3137547 0.3948662 0.09977844
## 79 0.5609746 0.3146925 0.3948982 0.09993106
## 80 0.5609746 0.3146925 0.3948982 0.09993106
## 81 0.5609746 0.3146925 0.3948982 0.09993106
## 82 0.5609746 0.3146925 0.3948982 0.09993106
## 83 0.5609746 0.3146925 0.3948982 0.09993106
## 84 0.5609746 0.3146925 0.3948982 0.09993106
## 
## [84 rows x 6 columns]
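The leaderboard is an H2OFrame; converting it to a regular data.frame makes it easier to filter or export:

lb_df <- as.data.frame(lb) # pull the leaderboard into R memory
head(lb_df[, c("model_id", "rmse", "mae")])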

Performance scores on test data

perf <- h2o.performance(aml@leader, test_frame)
perf
## H2ORegressionMetrics: stackedensemble
## 
## MSE:  0.1270943
## RMSE:  0.3565028
## MAE:  0.2376857
## RMSLE:  0.06355658
## Mean Residual Deviance :  0.1270943

Predictions (H2O AutoML uses the leader model)

pred <- h2o.predict(aml, test_frame)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
pred
##    predict
## 1 4.006523
## 2 4.022654
## 3 4.815933
## 4 5.024039
## 5 4.707223
## 6 4.579227
## 
## [4932 rows x 1 column]
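All of the metrics so far are on the log scale. As a rough check in the original Wh units (a sketch; it assumes test_d still holds the log response computed earlier):

actual_wh <- exp(test_d$log_Appliances)   # back-transform the actual values
pred_wh   <- exp(as.vector(pred$predict)) # back-transform the predictions
sqrt(mean((actual_wh - pred_wh)^2))       # RMSE on the original Wh scale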

Convert predictions to the original scale, save the predictions and the model

predictions <- exp(pred) # invert the log transform (applied element-wise on the H2OFrame)

predictions <- as.vector(predictions) # pull the results from H2O into an R vector
df_result <- data.frame(predictions)
head(df_result)
h2o.saveModel(object = aml@leader, path=getwd())
## [1] "C:\\FCD\\04.MachineLearning\\Projetos\\Projeto08\\StackedEnsemble_BestOfFamily_AutoML_20200521_151236"
write.csv(df_result, "predictions.csv")
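To reuse the saved model in a later session, h2o.loadModel takes the path printed by h2o.saveModel above (left commented out since the exact path is machine-specific):

# model_path <- "<path printed by h2o.saveModel above>"
# best_model <- h2o.loadModel(model_path)
# h2o.predict(best_model, test_frame)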