Skip to contents
library(xplainfi)

# learners, tasks, etc.
library(mlr3)
library(mlr3learners)

Defining a simple example case:

  • Breast cancer task
  • Random forests with 100 trees
  • Holdout split (default)
  • Measure: Brier score
# Shared objects for the examples below: binary classification task,
# probability-predicting random forest, and the binary Brier score.
task = tsk("breast_cancer")
learner = lrn("classif.ranger", predict_type = "prob")
measure = msr("classif.bbrier")

PFI

Simple case without resampling

The default behavior will internally construct a standard holdout resampling with the default ratio.

Calculating PFI:

# Construct a PFI instance; nothing is computed yet — scores are produced
# lazily by $compute(). No resampling given, so holdout is constructed internally.
pfi = PFI$new(
  task = task,
  learner = learner,
  measure = measure
)

# Stores parameter set to calculate PFI in different ways
# ("relation": difference vs. ratio; "iters_perm": permutations per feature)
pfi$param_set
#> <ParamSet(2)>
#>            id    class lower upper nlevels    default      value
#>        <char>   <char> <num> <num>   <num>     <list>     <list>
#> 1:   relation ParamFct    NA    NA       2 difference difference
#> 2: iters_perm ParamInt     1   Inf     Inf          1          1

# Default behavior should be sane
pfi$compute()
#> Key: <feature>
#>            feature   importance
#>             <char>        <num>
#> 1:     bare_nuclei 0.0311134047
#> 2:     bl_cromatin 0.0039695264
#> 3:      cell_shape 0.0181463463
#> 4:       cell_size 0.0260011523
#> 5:    cl_thickness 0.0068208143
#> 6:    epith_c_size 0.0027591783
#> 7:   marg_adhesion 0.0026618505
#> 8:         mitoses 0.0004096992
#> 9: normal_nucleoli 0.0088215131

Q: Should $compute() be run on construction? Between the call to $new() and $compute() there’s nothing that needs to happen technically, as long as the relation param could be set directly.

Does not recompute if not needed, as "difference" is the default:

# Passing the default "difference" explicitly returns the cached result —
# the identical scores below show no recomputation happened.
pfi$compute(relation = "difference")
#> Key: <feature>
#>            feature   importance
#>             <char>        <num>
#> 1:     bare_nuclei 0.0311134047
#> 2:     bl_cromatin 0.0039695264
#> 3:      cell_shape 0.0181463463
#> 4:       cell_size 0.0260011523
#> 5:    cl_thickness 0.0068208143
#> 6:    epith_c_size 0.0027591783
#> 7:   marg_adhesion 0.0026618505
#> 8:         mitoses 0.0004096992
#> 9: normal_nucleoli 0.0088215131

Recomputes if param changes, stores new param

# Changing the "relation" param triggers recomputation; the new value
# is stored in the object's param_set (see below).
pfi$compute(relation = "ratio")
#> Key: <feature>
#>            feature importance
#>             <char>      <num>
#> 1:     bare_nuclei   2.005166
#> 2:     bl_cromatin   1.360535
#> 3:      cell_shape   1.787501
#> 4:       cell_size   1.892795
#> 5:    cl_thickness   1.233292
#> 6:    epith_c_size   1.073837
#> 7:   marg_adhesion   1.143984
#> 8:         mitoses   1.038286
#> 9: normal_nucleoli   1.377058
# The "value" column now reflects the updated relation = "ratio"
pfi$param_set
#> <ParamSet(2)>
#>            id    class lower upper nlevels    default  value
#>        <char>   <char> <num> <num>   <num>     <list> <list>
#> 1:   relation ParamFct    NA    NA       2 difference  ratio
#> 2: iters_perm ParamInt     1   Inf     Inf          1      1

Q: When $compute() is called again its default value for "relation" (i.e. "difference") is used, which doesn’t seem ideal. Maybe this default should be the param stored in the object itself rather than feel like a separate function.

# Calling $compute() without arguments falls back to the function default
# "difference", overriding the "ratio" stored on the object — note the
# scores below are differences again (and differ from the first run because
# the permutation is stochastic).
pfi$compute()
#> Key: <feature>
#>            feature   importance
#>             <char>        <num>
#> 1:     bare_nuclei 0.0269669118
#> 2:     bl_cromatin 0.0080938632
#> 3:      cell_shape 0.0192081960
#> 4:       cell_size 0.0281257099
#> 5:    cl_thickness 0.0061718034
#> 6:    epith_c_size 0.0010423567
#> 7:   marg_adhesion 0.0018831063
#> 8:         mitoses 0.0006549462
#> 9: normal_nucleoli 0.0086903167

Retrieve aggregated scores manually:

# The most recently computed aggregated scores are cached on the object
# and can be retrieved without recomputing.
pfi$importance
#> Key: <feature>
#>            feature   importance
#>             <char>        <num>
#> 1:     bare_nuclei 0.0269669118
#> 2:     bl_cromatin 0.0080938632
#> 3:      cell_shape 0.0192081960
#> 4:       cell_size 0.0281257099
#> 5:    cl_thickness 0.0061718034
#> 6:    epith_c_size 0.0010423567
#> 7:   marg_adhesion 0.0018831063
#> 8:         mitoses 0.0006549462
#> 9: normal_nucleoli 0.0086903167

With resampling

# Fresh learner and an explicit 3-fold cross-validation resampling
learner = lrn("classif.ranger", predict_type = "prob")
resampling = rsmp("cv", folds = 3)
measure = msr("classif.bbrier")

pfi = PFI$new(
  task = task,
  learner = learner,
  resampling = resampling,
  measure = measure
)

# The resampling is instantiated on construction ...
pfi$resampling
#> <ResamplingCV>: Cross-Validation
#> * Iterations: 3
#> * Instantiated: TRUE
#> * Parameters: folds=3
# ... but the resample result is only filled once $compute() has run
pfi$resample_result
#> NULL

# Scores are now aggregated across the 3 resampling iterations
pfi$compute(relation = "difference")
#> Key: <feature>
#>            feature   importance
#>             <char>        <num>
#> 1:     bare_nuclei 0.0312390148
#> 2:     bl_cromatin 0.0075122543
#> 3:      cell_shape 0.0161138358
#> 4:       cell_size 0.0308948666
#> 5:    cl_thickness 0.0156788348
#> 6:    epith_c_size 0.0018646652
#> 7:   marg_adhesion 0.0029774891
#> 8:         mitoses 0.0006869458
#> 9: normal_nucleoli 0.0055014621

# After $compute(), the underlying ResampleResult is stored for inspection
pfi$resample_result
#> <ResampleResult> with 3 resampling iterations
#>        task_id     learner_id resampling_id iteration     prediction_test
#>  breast_cancer classif.ranger            cv         1 <PredictionClassif>
#>  breast_cancer classif.ranger            cv         2 <PredictionClassif>
#>  breast_cancer classif.ranger            cv         3 <PredictionClassif>
#>  warnings errors
#>         0      0
#>         0      0
#>         0      0

# Aggregated scores, cached as before
pfi$importance
#> Key: <feature>
#>            feature   importance
#>             <char>        <num>
#> 1:     bare_nuclei 0.0312390148
#> 2:     bl_cromatin 0.0075122543
#> 3:      cell_shape 0.0161138358
#> 4:       cell_size 0.0308948666
#> 5:    cl_thickness 0.0156788348
#> 6:    epith_c_size 0.0018646652
#> 7:   marg_adhesion 0.0029774891
#> 8:         mitoses 0.0006869458
#> 9: normal_nucleoli 0.0055014621

Different measure:

Q: Maybe it would be worth allowing the measure to be changed post hoc?

# Same setup but scored with AUC instead of the Brier score.
# Note: for AUC (higher = better) the sign/direction of "difference"
# importances is flipped relative to loss measures.
pfi = PFI$new(
  task = task,
  learner = learner,
  resampling = resampling,
  measure = msr("classif.auc")
)

pfi$compute(relation = "ratio")
#> Key: <feature>
#>            feature importance
#>             <char>      <num>
#> 1:     bare_nuclei   1.017920
#> 2:     bl_cromatin   1.001659
#> 3:      cell_shape   1.005498
#> 4:       cell_size   1.006188
#> 5:    cl_thickness   1.006749
#> 6:    epith_c_size   1.001143
#> 7:   marg_adhesion   1.001822
#> 8:         mitoses   1.000341
#> 9: normal_nucleoli   1.002252
pfi$compute(relation = "difference")
#> Key: <feature>
#>            feature   importance
#>             <char>        <num>
#> 1:     bare_nuclei 1.487792e-02
#> 2:     bl_cromatin 3.758886e-04
#> 3:      cell_shape 3.792059e-03
#> 4:       cell_size 3.903575e-03
#> 5:    cl_thickness 2.910220e-03
#> 6:    epith_c_size 7.057850e-04
#> 7:   marg_adhesion 1.691431e-03
#> 8:         mitoses 3.936168e-04
#> 9: normal_nucleoli 3.428817e-05

With multiple permutation iterations

# Averaging over multiple permutations per feature reduces the variance
# introduced by any single random permutation.
pfi = PFI$new(
  task = task,
  learner = learner,
  resampling = resampling,
  measure = msr("classif.ce"),
  iters_perm = 5 # permute each feature 5 times in each resampling iteration
)

pfi$compute(relation = "ratio")
#> Key: <feature>
#>            feature importance
#>             <char>      <num>
#> 1:     bare_nuclei   2.577778
#> 2:     bl_cromatin   1.114286
#> 3:      cell_shape   1.221164
#> 4:       cell_size   1.341799
#> 5:    cl_thickness   1.259259
#> 6:    epith_c_size   1.024339
#> 7:   marg_adhesion   1.019048
#> 8:         mitoses   1.000000
#> 9: normal_nucleoli   1.057143

LOCO

Same setup but now using LOCO, which differs in that it internally needs to refit the model.
Notably, the Task object does not need to be modified, as it suffices to adjust the .$col_roles$feature property.

# LOCO refits the model once per left-out feature, so this is considerably
# more expensive than PFI.
learner = lrn("classif.ranger", predict_type = "prob")

loco = LOCO$new(
  task = task,
  learner = learner,
  resampling = resampling,
  measure = msr("classif.bbrier")
)

# Ratio of the refit model's loss to the full model's loss;
# values > 1 indicate the feature's removal hurt performance.
loco$compute(relation = "ratio")
#> Key: <feature>
#>            feature importance
#>             <char>      <num>
#> 1:     bare_nuclei  1.0905448
#> 2:     bl_cromatin  0.9838583
#> 3:      cell_shape  0.9994163
#> 4:       cell_size  0.9634430
#> 5:    cl_thickness  1.0666010
#> 6:    epith_c_size  0.9740075
#> 7:   marg_adhesion  0.9867229
#> 8:         mitoses  0.9999505
#> 9: normal_nucleoli  0.9977767

# Per-feature, per-resampling-iteration scores before aggregation:
# original-model loss, leave-one-covariate-out loss, and their ratio.
loco$scores
#> Key: <feature, iter_rsmp>
#>             feature iter_rsmp classif.bbrier_orig classif.bbrier_loco
#>              <char>     <int>               <num>               <num>
#>  1:     bare_nuclei         1          0.03014357          0.03379549
#>  2:     bare_nuclei         2          0.02458370          0.02650079
#>  3:     bare_nuclei         3          0.02599303          0.02787756
#>  4:     bl_cromatin         1          0.03014357          0.02771802
#>  5:     bl_cromatin         2          0.02458370          0.02518596
#>  6:     bl_cromatin         3          0.02599303          0.02618910
#>  7:      cell_shape         1          0.03014357          0.03003903
#>  8:      cell_shape         2          0.02458370          0.02503626
#>  9:      cell_shape         3          0.02599303          0.02555915
#> 10:       cell_size         1          0.03014357          0.02948189
#> 11:       cell_size         2          0.02458370          0.02431988
#> 12:       cell_size         3          0.02599303          0.02399186
#> 13:    cl_thickness         1          0.03014357          0.03258453
#> 14:    cl_thickness         2          0.02458370          0.02592081
#> 15:    cl_thickness         3          0.02599303          0.02766789
#> 16:    epith_c_size         1          0.03014357          0.03046220
#> 17:    epith_c_size         2          0.02458370          0.02186273
#> 18:    epith_c_size         3          0.02599303          0.02656836
#> 19:   marg_adhesion         1          0.03014357          0.03014357
#> 20:   marg_adhesion         2          0.02458370          0.02513951
#> 21:   marg_adhesion         3          0.02599303          0.02437002
#> 22:         mitoses         1          0.03014357          0.03025939
#> 23:         mitoses         2          0.02458370          0.02436608
#> 24:         mitoses         3          0.02599303          0.02611939
#> 25: normal_nucleoli         1          0.03014357          0.02924740
#> 26: normal_nucleoli         2          0.02458370          0.02512095
#> 27: normal_nucleoli         3          0.02599303          0.02602437
#>             feature iter_rsmp classif.bbrier_orig classif.bbrier_loco
#>     importance
#>          <num>
#>  1:  1.1211509
#>  2:  1.0779822
#>  3:  1.0725013
#>  4:  0.9195336
#>  5:  1.0244983
#>  6:  1.0075430
#>  7:  0.9965321
#>  8:  1.0184089
#>  9:  0.9833078
#> 10:  0.9780490
#> 11:  0.9892686
#> 12:  0.9230114
#> 13:  1.0809778
#> 14:  1.0543901
#> 15:  1.0644352
#> 16:  1.0105704
#> 17:  0.8893181
#> 18:  1.0221339
#> 19:  0.9999999
#> 20:  1.0226088
#> 21:  0.9375599
#> 22:  1.0038423
#> 23:  0.9911478
#> 24:  1.0048612
#> 25:  0.9702701
#> 26:  1.0218541
#> 27:  1.0012059
#>     importance

Aggregating results

# Join both methods' aggregated scores on the feature column.
# NOTE(review): pfi was last computed with classif.ce (ratio) while loco
# used classif.bbrier (ratio) — the merged scores are not on a common
# measure; confirm this comparison is intended.
importance_combined = merge(
  pfi$importance,
  loco$importance,
  by = "feature"
)

# merge() suffixes the clashing "importance" columns; rename for clarity
data.table::setnames(
  importance_combined,
  old = c("importance.x", "importance.y"),
  new = c("pfi", "loco")
)

importance_combined |>
  knitr::kable(digits = 4, caption = "Importance scores (ratio)")
Importance scores (ratio)
feature pfi loco
bare_nuclei 2.5778 1.0905
bl_cromatin 1.1143 0.9839
cell_shape 1.2212 0.9994
cell_size 1.3418 0.9634
cl_thickness 1.2593 1.0666
epith_c_size 1.0243 0.9740
marg_adhesion 1.0190 0.9867
mitoses 1.0000 1.0000
normal_nucleoli 1.0571 0.9978
library(ggplot2)

# Reshape to long format (one row per feature x method) and plot both
# methods' scores side by side as dodged bars.
importance_combined |>
  data.table::melt(id.vars = "feature", value.name = "score", variable.name = "method") |>
  ggplot(aes(x = score, y = feature, color = method, fill = method)) +
  geom_col(position = "dodge", alpha = 0.5) +
  scale_color_brewer(palette = "Dark2", aesthetics = c("color", "fill")) +
  labs(
    title = "Feature Importance Scores",
    # Fixed typo: "relativ" -> "relative".
    # NOTE(review): measure$id is classif.bbrier here, but the plotted PFI
    # scores were computed with classif.ce — the subtitle may misstate the
    # measure; confirm which measure should be reported.
    subtitle = sprintf("For task %s and measure %s, using relative scores", task$id, measure$id),
    x = "Score", y = "Feature", color = "Method", fill = "Method",
    caption = sprintf("Using %i-fold %s", resampling$iters, resampling$id)
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    plot.title.position = "plot"
  )