Korelacije

Column

Godina 2021 / 2022

Najveće korelacije

# A tibble: 6 × 3
  rowname varijabla korelacija
  <chr>   <chr>          <dbl>
1 V4      V6             0.780
2 V9      V11            0.779
3 P14     V7             0.769
4 P6      P7             0.748
5 V12     V13            0.742
6 V7      V10            0.712

Column

Godina 2022 / 2023

Najveće korelacije

# A tibble: 5 × 3
  rowname varijabla korelacija
  <chr>   <chr>          <dbl>
1 P2      P3             0.802
2 P8      P9             0.738
3 V7      V10            0.727
4 V9      V11            0.715
5 P6      P7             0.713

Hiperparametri

Column

Najbolji modeli za roc_auc metriku

# A tibble: 5 × 9
   mtry trees min_n .metric .estimator  mean     n std_err .config              
  <int> <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
1     2  1636    27 roc_auc binary     0.905    10  0.0156 Preprocessor1_Model0…
2     2  1058    35 roc_auc binary     0.905    10  0.0155 Preprocessor1_Model0…
3     2  1905    18 roc_auc binary     0.903    10  0.0150 Preprocessor1_Model0…
4     5   534    35 roc_auc binary     0.901    10  0.0183 Preprocessor1_Model0…
5     2   242    29 roc_auc binary     0.901    10  0.0166 Preprocessor1_Model0…

Najbolji modeli za F1 metriku

# A tibble: 5 × 9
   mtry trees min_n .metric .estimator  mean     n std_err .config              
  <int> <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
1     2  1905    18 f_meas  binary     0.800    10  0.0225 Preprocessor1_Model0…
2     5  1335     7 f_meas  binary     0.785    10  0.0209 Preprocessor1_Model0…
3     2  1636    27 f_meas  binary     0.783    10  0.0195 Preprocessor1_Model0…
4     5   534    35 f_meas  binary     0.781    10  0.0239 Preprocessor1_Model0…
5     3  1709    36 f_meas  binary     0.780    10  0.0236 Preprocessor1_Model0…

Najbolji modeli za sensitivity metriku

# A tibble: 5 × 9
   mtry trees min_n .metric .estimator  mean     n std_err .config              
  <int> <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
1     2  1905    18 sens    binary     0.806    10  0.0325 Preprocessor1_Model0…
2     5  1335     7 sens    binary     0.797    10  0.0328 Preprocessor1_Model0…
3     3  1709    36 sens    binary     0.790    10  0.0359 Preprocessor1_Model0…
4     5   534    35 sens    binary     0.790    10  0.0363 Preprocessor1_Model0…
5     5  1231     7 sens    binary     0.782    10  0.0339 Preprocessor1_Model0…

Column

Vrijednosti testiranih hiperparametara

Model

Column

Metrike na testnom skupu

# A tibble: 6 × 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 sens     binary         0.451
2 spec     binary         0.976
3 accuracy binary         0.734
4 f_meas   binary         0.610
5 mcc      binary         0.514
6 roc_auc  binary         0.870

Confusion matrix

Column

Metrike na trening skupu

# A tibble: 6 × 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 sens     binary         0.914
2 spec     binary         0.948
3 accuracy binary         0.934
4 f_meas   binary         0.921
5 mcc      binary         0.864
6 roc_auc  binary         0.987

ROC curve

Varijable

Column

Važnost svih varijabli

Column

10 najvažnijih varijabli

---
title: "Formativno vrednovanje"
output: 
  flexdashboard::flex_dashboard:
    social: menu
    orientation: columns
    vertical_layout: fill
    source_code: embed
---

```{css, echo=FALSE}
/* Let the sidebar scroll when its content does not fit. */
.sidebar { overflow: auto; }
/* Override the DataTables scroll body so it takes 95% height;
   !important is used to win over rules set elsewhere. */
.dataTables_scrollBody {
    height:95% !important;
    max-height:95% !important;
}
/* Allow scrolling inside chart containers when their content overflows. */
.chart-stage-flex {
    overflow:auto !important;
}
```

```{r setup, include=FALSE}
library(tidyverse)
library(readxl)
library(tidymodels)
library(vip)
library(corrplot)
library(doParallel)

set.seed(2023)

# Reads one academic year's grade sheet and derives the binary target.
#
# putanja: path to the Excel workbook with the grades.
# Returns the sheet with columns KOL1..UKUPNO coerced to numeric, Ime and
# Prezime merged into "Puno ime", and a factor `klasa` that is "1" when
# UKUPNO >= 50 and "0" otherwise.
ucitaj_ocjene <- function(putanja) {
  read_excel(putanja) %>%
    mutate(across(KOL1:UKUPNO, as.numeric)) %>%
    unite("Puno ime", Ime, Prezime, sep = " ") %>%
    # Explicit levels keep "1" first and keep both levels present even if a
    # year happens to contain only one class, so both years always share the
    # same factor definition.
    mutate(
      klasa = factor(ifelse(UKUPNO >= 50, "1", "0"), levels = c("1", "0"))
    )
}

# Builds the modelling table for one year.
#
# ocjene: result of ucitaj_ocjene(); logovi: activity logs keyed by "Puno ime".
# Returns columns KP0..ZAD6 followed by the log columns, with `klasa` moved to
# the last position and the name column dropped.
slozi_skup <- function(ocjene, logovi) {
  ocjene %>%
    select(`Puno ime`, KP0:ZAD6, klasa) %>%
    left_join(logovi, by = "Puno ime") %>%
    select(-c("Puno ime")) %>%
    relocate(klasa, .after = last_col())
}

# Academic year 2021/2022 is used as the training set.
ocjene1 <- ucitaj_ocjene("ocjene_MAT1_2021_2022.xlsx")
logovi1 <- read_csv("all_logs_MAT1_2021_2022.csv")
train_data <- slozi_skup(ocjene1, logovi1)

# Academic year 2022/2023 is used as the test set.
ocjene2 <- ucitaj_ocjene("ocjene_MAT1_2022_2023.xlsx")
logovi2 <- read_csv("all_logs_MAT1_2022_2023.csv")
test_data <- slozi_skup(ocjene2, logovi2)

# Predictor-only table for 2021/2022 (grades KP0..ZAD6 joined with the logs,
# name column removed) and its correlation matrix as a tibble whose first
# column `rowname` holds the variable names.
kor1_tablica <- left_join(
  select(ocjene1, `Puno ime`, KP0:ZAD6),
  logovi1,
  by = "Puno ime"
) %>%
  select(-all_of("Puno ime"))
korelacije1 <- as_tibble(cor(kor1_tablica), rownames = "rowname")

# Same construction for 2022/2023.
kor2_tablica <- left_join(
  select(ocjene2, `Puno ime`, KP0:ZAD6),
  logovi2,
  by = "Puno ime"
) %>%
  select(-all_of("Puno ime"))
korelacije2 <- as_tibble(cor(kor2_tablica), rownames = "rowname")

# A split object is required because last_fit() works on an rsplit.
podaci <- bind_rows(train_data, test_data)
omjer <- nrow(train_data) / (nrow(train_data) + nrow(test_data))
# Build the split from explicit row indices instead of passing `omjer` as a
# proportion: recovering the row boundary from n * prop depends on
# floating-point rounding, which could move a boundary row to the wrong side.
# The first nrow(train_data) rows of `podaci` are the training year, the
# remaining rows are the test year.
podaci_split <- make_splits(
  list(
    analysis = seq_len(nrow(train_data)),
    assessment = nrow(train_data) + seq_len(nrow(test_data))
  ),
  data = podaci
)

# Model definition: random forest classifier (ranger engine, impurity-based
# importance) with mtry, trees and min_n left open for tuning.
rf_model <- rand_forest() %>%
  set_args(mtry = tune(), trees = tune(), min_n = tune()) %>%
  set_engine("ranger", importance = "impurity") %>%
  set_mode("classification")

# Recipe for transforming the data: normalize every numeric predictor.
rf_recipe <- recipe(klasa ~ ., data = train_data) %>%
  step_normalize(all_numeric_predictors())

# Workflow bundling the model and the recipe.
rf_work <- workflow() %>%
  add_model(rf_model) %>%
  add_recipe(rf_recipe)

# 10-fold cross-validation on the training year, stratified by the target.
rf_folds <- vfold_cv(train_data, v = 10, strata = klasa)

# Metrics computed during tuning and for the final model.
rf_metrike <- metric_set(roc_auc, sens, spec, accuracy, f_meas, mcc)

# Random grid of 100 candidates. The data-dependent range (mtry) is finalized
# on the predictors only; passing the full table would count the outcome
# column `klasa` as a predictor when deriving the upper bound.
rf_grid <- rf_model %>%
  extract_parameter_set_dials() %>%
  finalize(select(train_data, -klasa)) %>%
  grid_random(size = 100)

# Parallel back end for tuning over the random grid.
cl <- makePSOCKcluster(8)
registerDoParallel(cl)
set.seed(2023)
rf_tuning <- tryCatch(
  rf_work %>%
    tune_grid(resamples = rf_folds, grid = rf_grid, metrics = rf_metrike),
  finally = {
    # Always release the workers, even when tuning fails, and switch foreach
    # back to the sequential back end so that later code does not try to use
    # the stopped cluster.
    stopCluster(cl)
    registerDoSEQ()
  }
)

# Hyperparameter combination with the best cross-validated ROC AUC.
best_rf_model <- select_best(rf_tuning, metric = "roc_auc")

# Plug the chosen hyperparameters into the workflow, fit on the training part
# of the split and evaluate on the test part.
final_rf_work <- finalize_workflow(rf_work, best_rf_model)
rf_fit <- last_fit(final_rf_work, split = podaci_split)

# Draw tiles with a black border by default.
update_geom_defaults(geom = "tile", new = list(color = "black"))
```

# Korelacije

Column 
-----------------------------------------------------------------------

### Godina 2021 / 2022 {data-height=500}

```{r}
# Correlation matrix of the 2021/2022 predictors, rounded to two decimals and
# drawn as an upper triangle without the diagonal.
kor1_zaokruzeno <- round(cor(kor1_tablica), 2)
corrplot(
  kor1_zaokruzeno,
  type = "upper", order = "FPC", diag = FALSE,
  tl.col = "black", tl.srt = 90, tl.cex = 0.55
)
```

### Najveće korelacije {data-height=200}

```{r}
# Variable order of the correlation matrix (rows and columns share it).
poredak1 <- korelacije1$rowname

# Pairs with |r| >= 0.7, strongest positive first. Each unordered pair is kept
# once by selecting the upper triangle by position. This also removes the
# diagonal, so no "< 1" cut-off is needed, and distinct pairs that happen to
# have the same correlation are no longer collapsed into one row.
korelacije1 %>%
  pivot_longer(-rowname, names_to = "varijabla", values_to = "korelacija") %>%
  filter(
    match(rowname, poredak1) < match(varijabla, poredak1),
    abs(korelacija) >= 0.7
  ) %>%
  arrange(desc(korelacija))
```

Column
-----------------------------------------------------------------------

### Godina 2022 / 2023 {data-height=500}

```{r}
# Correlation matrix of the 2022/2023 predictors, rounded to two decimals and
# drawn as an upper triangle without the diagonal.
kor2_zaokruzeno <- round(cor(kor2_tablica), 2)
corrplot(
  kor2_zaokruzeno,
  type = "upper", order = "FPC", diag = FALSE,
  tl.col = "black", tl.srt = 90, tl.cex = 0.55
)
```

### Najveće korelacije {data-height=200}

```{r}
# Variable order of the correlation matrix (rows and columns share it).
poredak2 <- korelacije2$rowname

# Pairs with |r| >= 0.7, strongest positive first. Each unordered pair is kept
# once by selecting the upper triangle by position. This also removes the
# diagonal, so no "< 1" cut-off is needed, and distinct pairs that happen to
# have the same correlation are no longer collapsed into one row.
korelacije2 %>%
  pivot_longer(-rowname, names_to = "varijabla", values_to = "korelacija") %>%
  filter(
    match(rowname, poredak2) < match(varijabla, poredak2),
    abs(korelacija) >= 0.7
  ) %>%
  arrange(desc(korelacija))
```

# Hiperparametri

Column
-----------------------------------------------------------------------

### Najbolji modeli za roc_auc metriku

```{r}
# Five tuning candidates with the best mean ROC AUC.
show_best(rf_tuning, metric = "roc_auc", n = 5)
```

### Najbolji modeli za F1 metriku

```{r}
# Five tuning candidates with the best mean F1 score.
show_best(rf_tuning, metric = "f_meas", n = 5)
```

### Najbolji modeli za sensitivity metriku

```{r}
# Five tuning candidates with the best mean sensitivity.
show_best(rf_tuning, metric = "sens", n = 5)
```

Column
-----------------------------------------------------------------------

### Vrijednosti testiranih hiperparametara

```{r}
# Mean ROC AUC of every tuning candidate in long form: one row per
# (candidate, hyperparameter), with the best-scoring candidate flagged.
roc_po_hiperparametru <- collect_metrics(rf_tuning) %>%
  filter(.metric == "roc_auc", trees > 0) %>%
  pivot_longer(cols = mtry:min_n) %>%
  mutate(best_mod = mean == max(mean))

# One panel per hyperparameter: tested value against mean ROC AUC.
ggplot(roc_po_hiperparametru, aes(x = value, y = mean)) +
  geom_point(aes(color = best_mod), size = 1) +
  facet_wrap(~name, scales = "free_x") +
  scale_x_continuous(breaks = scales::pretty_breaks()) +
  labs(y = "roc auc", x = "", color = "Best Model")
```

# Model

Column
-----------------------------------------------------------------------

###  Metrike na testnom skupu {data-height=200}

```{r}
# Metrics of the final model on the test part of the split.
predikcije_test <- collect_predictions(rf_fit)
rf_metrike(predikcije_test, truth = klasa, estimate = .pred_class, .pred_1)
```

### Confusion matrix {data-height=500}

```{r}
# Confusion matrix of the test predictions, drawn as a heatmap.
matrica_zabune <- rf_fit %>%
  collect_predictions() %>%
  conf_mat(truth = klasa, estimate = .pred_class)

autoplot(matrica_zabune, "heatmap") +
  scale_fill_gradient2(low = "#075AFF", mid = "#FFFFCC", high = "#FF0000")
```

Column
-----------------------------------------------------------------------

### Metrike na trening skupu {data-height=200}

```{r}
# Metrics of the fitted workflow on the training part of the split, i.e. on
# the same rows it was fitted on.
predikcije_trening <- augment(
  extract_workflow(rf_fit),
  training(podaci_split)
)
rf_metrike(predikcije_trening, truth = klasa, estimate = .pred_class, .pred_1)
```

### ROC curve {data-height=500}

```{r}
# ROC curve of the test predictions, using the predicted probability of
# class "1".
roc_tocke <- roc_curve(collect_predictions(rf_fit), klasa, .pred_1)
autoplot(roc_tocke)
```

# Varijable

Column
-----------------------------------------------------------------------

### Važnost svih varijabli

```{r fig.height=8}
# Importance of every predictor: all columns of train_data except the target.
broj_prediktora <- ncol(train_data) - 1
vip(extract_fit_parsnip(rf_fit), num_features = broj_prediktora)
```

Column
-----------------------------------------------------------------------

### 10 najvažnijih varijabli

```{r}
# Importance of the ten highest-ranked predictors.
vip(extract_fit_parsnip(rf_fit), num_features = 10)
```