# Real datasets
## loading packages
#------ Install "pacman" only when it is not already available
# requireNamespace() probes for this one package directly; listing the whole
# library with installed.packages() just to look up a single name is slow and
# is discouraged by its own documentation.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}


#------ To be installed or loaded
pacman::p_load(magrittr)
pacman::p_load(tidyverse)
pacman::p_load(devtools)
## package for "generateMachines"

# importing function from github

devtools::source_url("https://raw.githubusercontent.com/hassothea/AggregationMethods/main/KernelAggReg.R")

# Abalone data
# ------------
# Download the abalone table, split its rows at random (80% flagged for
# training, the rest for testing) and pass both parts to kernelAggReg().
pacman::p_load(readr)
colname <- c(
  "Id", "Type", "LongestShell", "Diameter", "Height",
  "WholeWeight", "ShuckedWeight", "VisceraWeight", "ShellWeight", "Rings"
)
df1 <- readr::read_delim(
  "https://raw.githubusercontent.com/hassothea/AggregationMethods/main/datasets/abalone.txt",
  col_names = colname,
  delim = " ",
  show_col_types = FALSE
)
# Keep columns 2 to 10, i.e. drop the first column (named "Id" above).
df1 <- df1[, 2:10]
n1 <- nrow(df1)
# Logical mask over the rows: TRUE for the randomly drawn floor(80%) of them.
train1 <- seq_len(n1) %in% sample(n1, floor(n1 * 0.8))


# Columns 2:8 of the trimmed table are the design columns and "Rings" is the
# response; rows flagged in train1 go to the train_* arguments, all others to
# the test_* arguments.
agg1 <- kernelAggReg(
  train_design = df1[train1, 2:8],
  train_response = df1$Rings[train1],
  test_design = df1[!train1, 2:8],
  test_response = df1$Rings[!train1],
  machines = c("lasso", "ridge", "knn", "tree", "rf"),
  splits = 0.5,
  n_cv = 10,
  kernels = c("naive", "epan", "biw", "triw", "gaussian"),
  optimizeMethod = c("grid", "grid", "grid", "grid", "grad"),
  setBasicMachineParam = setBasicParameter(k = 5, ntree = 300),
  setGradParam = setGradParameter(
    max_val = 10,
    rate = 0.5,
    print_step = TRUE
  ),
  setGridParam = setGridParameter(max_val = 0.5, n_val = 300)
)

# RMSE on Abalone data (square root of the `mse` element of the result)
sqrt(agg1$mse)

 
# -------------------------------------------------------------------------------------------------------------------------
# # Red wine data
# # -------------
# df2 <- readr::read_delim("https://raw.githubusercontent.com/hassothea/AggregationMethods/main/datasets/wine-red.txt", 
#                          col_names = TRUE, 
#                          delim = " ", 
#                          show_col_types = FALSE)
# n2 <- nrow(df2)
# train2 <- logical(n2)
# train2[sample(n2,  floor(n2*0.8))] <- TRUE
# 
# 
# agg2 <- kernelAggReg(train_design = df2[train2, 2:11],
#                     train_response = df2$quality[train2],
#                     test_design = df2[!train2, 2:11],
#                     test_response = df2$quality[!train2],
#                     machines = c("lasso", "ridge", "knn", "tree", "rf"),
#                     splits = .5,
#                     n_cv = 10,
#                     kernels = c("naive", "epan", "biw", "triw", "gaussian"),
#                     optimizeMethod = c("grid", "grid", "grid", "grid", "grad"),
#                     setBasicMachineParam = setBasicParameter(k = 5,
#                                                              ntree = 300),
#                     setGradParam = setGradParameter(max_val = 10,
#                                                     rate = 0.5,
#                                                     print_step = TRUE),
#                     setGridParam = setGridParameter(max_val = 3,
#                                                     n_val = 300))
# # RMSE on Wine data
# sqrt(agg2$mse)

 # -------------------------------------------------------------------------------------------------------------------------

# House sales in King County data
# --------------------------------
# df3 <- readr::read_delim("https://raw.githubusercontent.com/hassothea/AggregationMethods/main/datasets/kc_house.txt", 
#                         col_names = TRUE,
#                         delim = " ",
#                         show_col_types = FALSE)
# n3 <- nrow(df3)
# train3 <- logical(n3)
# train3[sample(n3,  floor(n3*0.7))] <- TRUE
# var_predict <- c("bedrooms", "bathrooms", "sqft_living", "sqft_lot", "floors", "grade", "sqft_above", "sqft_basement", "sqft_living15", "sqft_lot15")
# 
# agg3 <- kernelAggReg(train_design = df3[train3, var_predict],
#                     train_response = df3$price[train3]/1e+5,
#                     test_design = df3[!train3, var_predict],
#                     test_response = df3$price[!train3]/1e+5,
#                     machines = c("lasso", "ridge", "knn", "tree", "rf"),
#                     splits = .5,
#                     n_cv = 10,
#                     kernels = c("naive", "epan", "biw", "triw", "gaussian"),
#                     optimizeMethod = c("grid", "grid", "grid", "grid", "grad"),
#                     setBasicMachineParam = setBasicParameter(k = 5,
#                                                              ntree = 300),
#                     setGradParam = setGradParameter(max_val = 10,
#                                                     rate = 1,
#                                                     print_step = TRUE),
#                     setGridParam = setGridParameter(max_val = 0.5,
#                                                     n_val = 300))
# # RMSE on House data (scaled back by 1e+5 to the original price units)
# sqrt(agg3$mse)*1e+5


# NOTE(review): alpha, ivl0, iv1, all_predict_V0, all_predict_VL, N, N_test and
# y_test_VL are not defined anywhere in this script; they must already exist
# in the session before this section is run.
alpha[ivl0] # D1 - TrainLR - TrainHR - extrapolation

pred_test_V1 <- all_predict_V0[iv1, ]
pred_test_VL <- all_predict_VL[ivl0, ]

# Values drawn by sample(N, N_test) index the rows passed as training data;
# every other row is passed as test data.
# NOTE(review): the size argument is called N_test although it sets the number
# of *training* rows - confirm this is intended.
train <- sample(N, N_test)

# build_machine = FALSE: presumably the design columns are already-computed
# predictions that are aggregated directly - confirm against kernelAggReg().
agg <- kernelAggReg(
  train_design = pred_test_VL[train, ],
  train_response = y_test_VL[train],
  test_design = pred_test_VL[-train, ],
  test_response = y_test_VL[-train],
  build_machine = FALSE,
  splits = 0.5,
  n_cv = 10,
  kernels = "gaussian",
  optimizeMethod = "grad",
  setBasicMachineParam = setBasicParameter(k = 5, ntree = 300),
  setGradParam = setGradParameter(
    max_val = 100,
    coef_auto = 1,
    print_step = TRUE
  ),
  silent = FALSE
)
# RMSE of this aggregation (square root of the `mse` element of the result)
sqrt(agg$mse)
