calculate metrics for a rulelist — calculate.rulelist • tidyrules

Computes metrics (chosen according to the rulelist's estimation_type) cumulatively, in the style of a window function, over the rules of the rulelist in their existing order, ignoring the keys.

# S3 method for class 'rulelist'
calculate(x, metrics_to_exclude = NULL, ...)

Arguments

x: A rulelist
metrics_to_exclude: (character vector) Names of metrics to exclude
...: Named list of custom metrics. See 'details'.

Value

A dataframe of metrics with a rule_nbr column.

Details

Default Metrics

These metrics are calculated by default:

cumulative_coverage: For the nth rule in the rulelist, the number of distinct row_nbrs (of new_data) covered by the nth and all preceding rules (in order). In the weighted case, we sum the weights corresponding to the distinct row_nbrs.
cumulative_overlap: Up to the nth rule in the rulelist, the number of distinct row_nbrs (of new_data) that were already covered by some preceding rule (in order). In the weighted case, we sum the weights corresponding to the distinct row_nbrs.

For classification:

cumulative_accuracy: For the nth rule in the rulelist, the fraction of row_nbrs covered by the nth and all preceding rules (in order) for which RHS matches the y_name column (of new_data). In the weighted case, weighted accuracy is computed.

For regression:

cumulative_RMSE: For nth rule in the rulelist, weighted RMSE of all predictions (RHS) predicted by nth rule and all preceding rules.

Custom metrics

Custom metrics to be computed should be passed as a named list of function(s) in `...`. Each custom metric function should take these arguments, in this order: rulelist, new_data, y_name, weight. It should return a numeric vector whose length equals the number of rows of the rulelist.

Examples

library("magrittr")
# Fit a C5.0 rule-based classifier on the attrition data, convert it with
# tidy(), then pipe the result through set_validation_data() and
# set_keys(NULL).
model_c5 <- C50::C5.0(
  Attrition ~ .,
  data = modeldata::attrition,
  rules = TRUE
)
tidy_c5 <-
  tidy(model_c5) %>%
  set_validation_data(modeldata::attrition, "Attrition") %>%
  set_keys(NULL)

# calculate default metrics (classification)
calculate(tidy_c5)
#> # A tidytable: 24 × 4
#>    rule_nbr cumulative_coverage cumulative_overlap cumulative_accuracy
#>       <int>               <dbl>              <dbl>               <dbl>
#>  1        1                  16                  0               1    
#>  2        2                 537                  0               0.944
#>  3        3                 545                  5               0.945
#>  4        4                 656                 89               0.941
#>  5        5                 664                 89               0.941
#>  6        6                 667                 94               0.942
#>  7        7                 681                 94               0.941
#>  8        8                 687                 94               0.942
#>  9        9                 699                 95               0.941
#> 10       10                1425                633               0.900
#> # ℹ 14 more rows

# Fit an rpart regression tree for MonthlyIncome, convert it with tidy(),
# then pipe the result through set_validation_data() and set_keys(NULL).
model_rpart <- rpart::rpart(MonthlyIncome ~ ., data = modeldata::attrition)
tidy_rpart <-
  tidy(model_rpart) %>%
  set_validation_data(modeldata::attrition, "MonthlyIncome") %>%
  set_keys(NULL)

# calculate default metrics (regression)
calculate(tidy_rpart)
#> # A tidytable: 6 × 4
#>   rule_nbr cumulative_coverage cumulative_overlap cumulative_RMSE
#>      <int>               <dbl>              <dbl>           <dbl>
#> 1        1                 543                  0            748.
#> 2        2                1077                  0           1125.
#> 3        3                1255                  0           1153.
#> 4        4                1397                  0           1198.
#> 5        5                1437                  0           1191.
#> 6        6                1470                  0           1182.

# calculate default metrics with a custom metric
# custom function to get cumulative MAE
library("tidytable")
#> 
#> Attaching package: ‘tidytable’
#> The following object is masked from ‘package:magrittr’:
#> 
#>     extract
#> The following objects are masked from ‘package:stats’:
#> 
#>     dt, filter, lag
#> The following object is masked from ‘package:base’:
#> 
#>     %in%
# Custom metric for `calculate()`: cumulative weighted mean absolute error.
#
# For each position n in the rulelist, the MAE is computed over the
# predictions made by the first n rules (rules are ranked by their position
# in the rulelist).
#
# Arguments (in the order `calculate()` requires for custom metrics):
#   rulelist: the rulelist; its `rule_nbr` and `RHS` columns are used
#   new_data: data frame to predict on; must contain the `y_name` column
#   y_name:   (string) name of the response column in `new_data`
#   weight:   weights forwarded to MetricsWeighted::mae()
#             NOTE(review): presumably aligned with the rows returned by
#             predict(rulelist, new_data) -- confirm against the caller.
#
# Returns a numeric vector with one value per row of `rulelist`.
get_cumulative_MAE <- function(rulelist, new_data, y_name, weight) {

  # seq_len() instead of 1:n so that a zero-row rulelist yields numeric(0)
  # rather than iterating over c(1, 0).
  n_rules <- nrow(rulelist)

  # Priority of a rule is its position in the rulelist.
  priority_df <-
    rulelist %>%
    select(rule_nbr) %>%
    mutate(priority = seq_len(n_rules))

  pred_df <-
    predict(rulelist, new_data) %>%
    left_join(priority_df, by = "rule_nbr") %>%
    mutate(weight = local(weight)) %>%
    select(rule_nbr, row_nbr, weight, priority)

  # Row-indexed copy of new_data, built once (it does not depend on the rule
  # position) and reused by every call of mae_till_rule().
  new_data_indexed <-
    new_data %>%
    mutate(row_nbr = seq_len(nrow(new_data)))

  # Only the response column is needed when RHS is already numeric.
  new_data2 <-
    new_data_indexed %>%
    select(all_of(c("row_nbr", y_name)))

  # Weighted MAE over the predictions of the first `rn` rules.
  mae_till_rule <- function(rn) {

    if (is.character(rulelist$RHS)) {
      # RHS is stored as text and evaluated as an R expression against the
      # matching row of new_data, hence the join with all columns.
      inter_df <-
        pred_df %>%
        tidytable::filter(priority <= rn) %>%
        left_join(new_data_indexed, by = "row_nbr") %>%
        left_join(select(rulelist, rule_nbr, RHS), by = "rule_nbr") %>%
        nest(.by = c("RHS", "rule_nbr", "row_nbr", "priority", "weight")) %>%
        mutate(RHS = purrr::map2_dbl(RHS,
                                     data,
                                     ~ eval(parse(text = .x), envir = .y)
                                     )
               ) %>%
        unnest(data)
    } else {
      inter_df <-
        pred_df %>%
        tidytable::filter(priority <= rn) %>%
        left_join(new_data2, by = "row_nbr") %>%
        left_join(select(rulelist, rule_nbr, RHS), by = "rule_nbr")
    }

    inter_df %>%
      summarise(mae = MetricsWeighted::mae(RHS,
                                           .data[[y_name]],
                                           weight,
                                           na.rm = TRUE
                                           )
                ) %>%
      `[[`("mae")
  }

  purrr::map_dbl(seq_len(n_rules), mae_till_rule)
}

# Default metrics plus the custom cumulative MAE; the custom metric is
# supplied as a named list through `...`.
calculate(
  tidy_rpart,
  metrics_to_exclude = NULL,
  list("cumulative_mae" = get_cumulative_MAE)
)
#> # A tidytable: 6 × 5
#>   rule_nbr cumulative_coverage cumulative_overlap cumulative_RMSE cumulative_mae
#>      <int>               <dbl>              <dbl>           <dbl>          <dbl>
#> 1        1                 543                  0            748.           577.
#> 2        2                1077                  0           1125.           812.
#> 3        3                1255                  0           1153.           860.
#> 4        4                1397                  0           1198.           907.
#> 5        5                1437                  0           1191.           905.
#> 6        6                1470                  0           1182.           896.

`calculate` metrics for a rulelist