woebin generates optimal binning for numerical, factor and categorical variables using methods including tree-like segmentation or chi-square merge. woebin can also customize breakpoints if breaks_list is provided. The default woe is defined as ln(Pos_i/Neg_i). If you prefer ln(Neg_i/Pos_i), please set the argument positive to the value of the negative class, such as '0' or 'good'. If there is a zero-frequency class when calculating woe, the zero will be replaced by 0.99 to make the woe calculable.

woebin(dt, y, x = NULL, var_skip = NULL, breaks_list = NULL,
  special_values = NULL, missing_join = "left", stop_limit = 0.1,
  count_distr_limit = 0.05, bin_num_limit = 8, positive = "bad|1",
  no_cores = 2, print_step = 0L, method = "tree",
  ignore_const_cols = TRUE, ignore_datetime_cols = TRUE,
  check_cate_num = TRUE, replace_blank_inf = TRUE, save_as = NULL, ...)

Arguments

dt

A data frame with both x (predictor/feature) and y (response/label) variables.

y

Name of y variable.

x

Name of x variables. Defaults to NULL. If x is NULL, then all columns except y and var_skip are counted as x variables.

var_skip

Names of variables that will be skipped for binning. Defaults to NULL.

breaks_list

List of break points. Defaults to NULL. If it is not NULL, variable binning will be based on the provided breaks.

special_values

The values specified in special_values will be placed in separate bins. Defaults to NULL.

missing_join

Missing values are joined with the left non-missing bin if their share is lower than the threshold. Accepted values include 'left' and 'right'. Defaults to 'left'. If it is set to NULL, the missing values will be placed in a separate bin.

stop_limit

Stop binning segmentation when the information value gain ratio is less than 'stop_limit' if using the tree method; or stop binning merge when the chi-square of each pair of neighboring bins is larger than the threshold under a significance level of 'stop_limit' and 1 degree of freedom if using the chimerge method. Accepted range: 0-0.5; Defaults to 0.1. If it is 'N', each x value is a bin.

count_distr_limit

The minimum count distribution percentage. Accepted range: 0.01-0.2; Defaults to 0.05.

bin_num_limit

Integer. The maximum number of bins. Defaults to 8.

positive

Value of positive class, defaults to "bad|1".

no_cores

Number of CPU cores for parallel computation. Defaults to 2. If it is set to NULL, then 90 percent of total CPU cores will be used.

print_step

A non-negative integer. Defaults to 0. If print_step>0, print variable names by each print_step-th iteration. If print_step=0 or no_cores>1, no message is printed.

method

Four methods are provided, "tree" and "chimerge" for optimal binning that support both numerical and categorical variables, and 'width' and 'freq' for equal binning that support numerical variables only. Defaults to "tree".

ignore_const_cols

Logical. Ignore constant columns. Defaults to TRUE.

ignore_datetime_cols

Logical. Ignore datetime columns. Defaults to TRUE.

check_cate_num

Logical. Check whether the number of unique values in categorical columns is larger than 50. It might make the binning process slow if there are too many unique categories. Defaults to TRUE.

replace_blank_inf

Logical. Replace blank values with NA and infinite with -1. Defaults to TRUE.

save_as

A string. The file name to save breaks_list. Defaults to NULL.

...

Additional parameters.

Value

A list of data frames that include binning information for each x variable.

Examples

# load germancredit data
data(germancredit)

# Example I
# binning of two variables in germancredit dataset
# using tree method
bins2_tree = woebin(germancredit, y="creditability",
   x=c("credit.amount","housing"), method="tree")
#>  Creating woe binning ...
#>  Binning on 1000 rows and 3 columns in 00:00:00
bins2_tree
#> $credit.amount
#>         variable         bin count count_distr   neg   pos   posprob
#>           <char>      <char> <int>       <num> <int> <int>     <num>
#> 1: credit.amount [-Inf,1400)   267       0.267   185    82 0.3071161
#> 2: credit.amount [1400,1800)   105       0.105    87    18 0.1714286
#> 3: credit.amount [1800,4000)   382       0.382   287    95 0.2486911
#> 4: credit.amount [4000,9200)   196       0.196   120    76 0.3877551
#> 5: credit.amount [9200, Inf)    50       0.050    21    29 0.5800000
#>            woe       bin_iv  total_iv breaks is_special_values
#>          <num>        <num>     <num> <char>            <lgcl>
#> 1:  0.03366128 0.0003045545 0.1812204   1400             FALSE
#> 2: -0.72823850 0.0468153322 0.1812204   1800             FALSE
#> 3: -0.25830746 0.0241086966 0.1812204   4000             FALSE
#> 4:  0.39053946 0.0319870413 0.1812204   9200             FALSE
#> 5:  1.17007125 0.0780047502 0.1812204    Inf             FALSE
#> 
#> $housing
#>    variable      bin count count_distr   neg   pos   posprob        woe
#>      <char>   <char> <int>       <num> <int> <int>     <num>      <num>
#> 1:  housing     rent   179       0.179   109    70 0.3910615  0.4044452
#> 2:  housing      own   713       0.713   527   186 0.2608696 -0.1941560
#> 3:  housing for free   108       0.108    64    44 0.4074074  0.4726044
#>        bin_iv   total_iv   breaks is_special_values
#>         <num>      <num>   <char>            <lgcl>
#> 1: 0.03139265 0.08329343     rent             FALSE
#> 2: 0.02579501 0.08329343      own             FALSE
#> 3: 0.02610577 0.08329343 for free             FALSE
#> 

if (FALSE) {
# using chimerge method
bins2_chi = woebin(germancredit, y="creditability",
   x=c("credit.amount","housing"), method="chimerge")

# binning in equal freq/width # only supports numerical variables
numeric_cols = c("duration.in.month", "credit.amount",
  "installment.rate.in.percentage.of.disposable.income", "present.residence.since",
  "age.in.years", "number.of.existing.credits.at.this.bank",
  "number.of.people.being.liable.to.provide.maintenance.for")
bins_freq  = woebin(germancredit, y="creditability", x=numeric_cols, method="freq")
bins_width = woebin(germancredit, y="creditability", x=numeric_cols, method="width")

# y can be NULL if no label column in dataset
bins_freq_noy  = woebin(germancredit, y=NULL, x=numeric_cols)

# Example II
# setting of stop_limit
# stop_limit = 0.1 (by default)
bins_x1 = woebin(germancredit, y = 'creditability', x = 'foreign.worker', stop_limit = 0.1)
# stop_limit = 'N', each x value is a bin
bins_x1_N = woebin(germancredit, y = 'creditability', x = 'foreign.worker', stop_limit = 'N')

# Example III
# binning of the germancredit dataset
bins_germ = woebin(germancredit, y = "creditability")
# converting bins_germ into a data frame
# bins_germ_df = data.table::rbindlist(bins_germ)

# Example IV
# customizing the breakpoints of binning
library(data.table)
dat = rbind(
  setDT(germancredit),
  data.table(creditability=sample(c("good","bad"),10,replace=TRUE)),
  fill=TRUE)

breaks_list = list(
  age.in.years = c(26, 35, 37, "Inf%,%missing"),
  housing = c("own", "for free%,%rent")
)

special_values = list(
  credit.amount = c(2600, 9960, "6850%,%missing"),
  purpose = c("education", "others%,%missing")
)

bins_cus_brk = woebin(dat, y="creditability",
  x=c("age.in.years","credit.amount","housing","purpose"),
  breaks_list=breaks_list, special_values=special_values)

# Example V
# save breaks_list as an R file
bins2 = woebin(germancredit, y="creditability",
   x=c("credit.amount","housing"), save_as='breaks_list')

# Example VI
# setting bin closed on the right
options(scorecard.bin_close_right = TRUE)
binsRight = woebin(germancredit, y = 'creditability', x = 'age.in.years')
binsRight
# setting bin closed on the left, the default setting
options(scorecard.bin_close_right = FALSE)
}