---
title: "Formativno vrednovanje KOL1"
output:
flexdashboard::flex_dashboard:
social: menu
orientation: columns
vertical_layout: fill
source_code: embed
---
```{css, echo=FALSE}
.sidebar { overflow: auto; }
.dataTables_scrollBody {
height:95% !important;
max-height:95% !important;
}
.chart-stage-flex {
overflow:auto !important;
}
```
```{r setup, include=FALSE}
library(tidyverse)
library(readxl)
library(tidymodels)
library(vip)
library(corrplot)
library(doParallel)
set.seed(11042023)
ocjene1 <- read_excel("ocjene_MAT1_2021_2022.xlsx") %>%
mutate_at(vars(KOL1:UKUPNO), as.numeric) %>%
unite("Puno ime", Ime, Prezime, sep = " ") %>%
mutate(KVIZ = rowSums(across(KP0:KP4)), DZ = ZAD1 + ZAD2) %>%
select(`Puno ime`, KVIZ, DZ, KOL1) %>%
mutate(klasa = case_when(KOL1 <= 5 ~ "1",
KOL1 <= 10 ~ "2",
KOL1 <= 15 ~ "3",
.default = "4")) %>%
mutate_at(vars(klasa), ~fct_relevel(., c("1","2","3","4")))
logovi1 <- read_csv("all_logs_MAT1_2021_2022.csv") %>%
mutate(VIDEO = rowSums(across(P1:V3))) %>%
select(`Puno ime`, VIDEO)
train_data <- ocjene1 %>%
left_join(logovi1, by = "Puno ime") %>%
select(-c("Puno ime", "KOL1")) %>%
relocate(klasa, .after = last_col())
ocjene2 <- read_excel("ocjene_MAT1_2022_2023.xlsx") %>%
mutate_at(vars(KOL1:UKUPNO), as.numeric) %>%
unite("Puno ime", Ime, Prezime, sep = " ") %>%
mutate(KVIZ = rowSums(across(KP0:KP4)), DZ = ZAD1 + ZAD2) %>%
select(`Puno ime`, KVIZ, DZ, KOL1) %>%
mutate(klasa = case_when(KOL1 <= 5 ~ "1",
KOL1 <= 10 ~ "2",
KOL1 <= 15 ~ "3",
.default = "4")) %>%
mutate_at(vars(klasa), ~fct_relevel(., c("1","2","3","4")))
logovi2 <- read_csv("all_logs_MAT1_2022_2023.csv") %>%
mutate(VIDEO = rowSums(across(P1:V3))) %>%
select(`Puno ime`, VIDEO)
test_data <- ocjene2 %>%
left_join(logovi2, by = "Puno ime") %>%
select(-c("Puno ime", "KOL1")) %>%
relocate(klasa, .after = last_col())
# treba napraviti split objekt (zbog last_fit)
podaci <- bind_rows(train_data, test_data)
omjer <- nrow(train_data) / (nrow(train_data) + nrow(test_data))
podaci_split <- initial_time_split(podaci, prop = omjer)
# definiranje modela
rf_model <- rand_forest() %>%
set_args(mtry = tune(), trees = tune(), min_n = tune()) %>%
set_engine("ranger", importance = "impurity") %>%
set_mode("classification")
# recept za transformiranje podataka
rf_recipe <- recipe(klasa ~ ., data = train_data) %>%
step_normalize(all_numeric_predictors())
# workflow
rf_work <- workflow() %>%
add_model(rf_model) %>%
add_recipe(rf_recipe)
# cross validation folds
rf_folds <- vfold_cv(train_data, v = 10, strata = klasa)
# metrike
rf_metrike <- metric_set(roc_auc, sens, spec, accuracy, f_meas, mcc)
rf_grid <- grid_random(extract_parameter_set_dials(rf_model) %>% finalize(train_data), size = 100)
# paralelizacija za tuning random grid
cl <- makePSOCKcluster(8)
registerDoParallel(cl)
set.seed(2023)
rf_tuning <- rf_work %>%
tune_grid(resamples = rf_folds, grid = rf_grid, metrics = rf_metrike)
stopCluster(cl)
best_rf_model <- rf_tuning %>% select_best(metric = 'roc_auc')
final_rf_work <- rf_work %>% finalize_workflow(best_rf_model)
rf_fit <- final_rf_work %>% last_fit(split = podaci_split)
update_geom_defaults(geom = "tile", new = list(color = "black"))
```
# Hiperparametri
Column
-----------------------------------------------------------------------
### Najbolji modeli za roc_auc metriku
```{r}
rf_tuning %>% show_best(metric = 'roc_auc', n = 5)
```
### Najbolji modeli za mcc metriku
```{r}
rf_tuning %>% show_best(metric = 'mcc', n = 5)
```
### Najbolji modeli za sensitivity metriku
```{r}
rf_tuning %>% show_best(metric = 'sens', n = 5)
```
Column
-----------------------------------------------------------------------
### Vrijednosti testiranih hipeparametara
```{r}
rf_tuning %>%
collect_metrics() %>%
filter(.metric == "roc_auc", trees > 0) %>%
pivot_longer(cols = mtry:min_n) %>%
mutate(best_mod = mean == max(mean)) %>%
ggplot(aes(x = value, y = mean)) +
#geom_line(alpha = 0.5, size = 1.5) +
geom_point(aes(color = best_mod), size = 1) +
facet_wrap(~name, scales = "free_x") +
scale_x_continuous(breaks = scales::pretty_breaks()) +
labs(y = "roc auc", x = "", color = "Best Model")
```
# Model
Column
-----------------------------------------------------------------------
### Metrike na testnom skupu {data-height=200}
```{r}
rf_fit %>% collect_predictions() %>%
rf_metrike(truth = klasa, estimate = .pred_class, .pred_1:.pred_4)
```
### Confusion matrix {data-height=500}
```{r}
rf_fit %>% collect_predictions() %>%
conf_mat(truth = klasa, estimate = .pred_class) %>% autoplot("heatmap") +
scale_fill_gradient2(low = "#075AFF", mid = "#FFFFCC", high = "#FF0000")
```
Column
-----------------------------------------------------------------------
### Metrike na trening skupu {data-height=200}
```{r}
rf_fit %>% extract_workflow() %>%
augment(training(podaci_split)) %>%
rf_metrike(truth = klasa, estimate = .pred_class, .pred_1:.pred_4)
```
### ROC curve {data-height=500}
```{r}
rf_fit %>% collect_predictions() %>% roc_curve(klasa, .pred_1:.pred_4) %>% autoplot()
```
# Varijable
Column
-----------------------------------------------------------------------
### Važnost varijabli
```{r}
rf_fit %>% extract_fit_parsnip() %>% vip()
```