This function filters variables based on specified conditions, such as missing rate, identical value rate, and information value.

var_filter(dt, y, x = NULL, lims = list(missing_rate = 0.95, identical_rate
  = 0.95, info_value = 0.02), var_rm = NULL, var_kp = NULL,
  var_rm_reason = FALSE, positive = "bad|1", ...)

Arguments

dt

A data frame with both x (predictor/feature) and y (response/label) variables.

y

Name of y variable.

x

Names of x variables. Defaults to NULL. If x is NULL, then all columns except y are counted as x variables.

lims

A list of variable filters' thresholds.

  • missing_rate: The missing rate of kept variables should be <= 0.95 by default.

  • identical_rate: The identical value rate (excluding NAs) of kept variables should be <= 0.95 by default.

  • info_value: The information value (iv) of kept variables should be >= 0.02 by default.

var_rm

Names of variables that are forcibly removed. Defaults to NULL.

var_kp

Names of variables that are forcibly kept. Defaults to NULL.

var_rm_reason

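Logical. Defaults to FALSE. If TRUE, a data frame giving the reason each variable was removed is returned alongside the filtered data.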

positive

Value of the positive class. Defaults to "bad|1".

...

Additional parameters.

Value

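A data frame with columns for y and the selected x variables. If var_rm_reason is TRUE, a list of two data frames is returned instead: dt, with columns for y and the selected x variables, and rm, with the filter statistics and the removal reason for each variable.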

Examples

# Load German credit data
data(germancredit)

# variable filter
dt_sel = var_filter(germancredit, y = "creditability")
#>  Filtering variables via missing_rate, identical_rate, info_value ...
#>  1 variables are removed via identical_rate
#>  6 variables are removed via info_value
#>  Variable filtering on 1000 rows and 20 columns in 00:00:00
#>  7 variables are removed in total
dim(dt_sel)
#> [1] 1000   14

# return the reason each variable was removed
dt_sel2 = var_filter(germancredit, y = "creditability", var_rm_reason = TRUE)
#>  Filtering variables via missing_rate, identical_rate, info_value ...
#>  1 variables are removed via identical_rate
#>  6 variables are removed via info_value
#>  Variable filtering on 1000 rows and 20 columns in 00:00:00
#>  7 variables are removed in total
lapply(dt_sel2, dim)
#> $dt
#> [1] 1000   14
#> 
#> $rm
#> [1] 20  5
#> 

str(dt_sel2$dt)
#> Classes ‘data.table’ and 'data.frame':	1000 obs. of  14 variables:
#>  $ status.of.existing.checking.account                : Factor w/ 4 levels "... < 0 DM","0 <= ... < 200 DM",..: 1 2 4 1 1 4 4 2 4 2 ...
#>  $ duration.in.month                                  : num  6 48 12 42 24 36 24 36 12 30 ...
#>  $ credit.history                                     : Factor w/ 5 levels "no credits taken/ all credits paid back duly",..: 5 3 5 3 4 3 3 3 3 5 ...
#>  $ purpose                                            : chr  "radio/television" "radio/television" "education" "furniture/equipment" ...
#>  $ credit.amount                                      : num  1169 5951 2096 7882 4870 ...
#>  $ savings.account.and.bonds                          : Factor w/ 5 levels "... < 100 DM",..: 5 1 1 1 1 5 3 1 4 1 ...
#>  $ present.employment.since                           : Factor w/ 5 levels "unemployed","... < 1 year",..: 5 3 4 4 3 3 5 3 4 1 ...
#>  $ installment.rate.in.percentage.of.disposable.income: num  4 2 2 2 3 2 3 2 2 4 ...
#>  $ other.debtors.or.guarantors                        : Factor w/ 3 levels "none","co-applicant",..: 1 1 1 3 1 1 1 1 1 1 ...
#>  $ property                                           : Factor w/ 4 levels "real estate",..: 1 1 1 2 4 4 2 3 1 3 ...
#>  $ age.in.years                                       : num  67 22 49 45 53 35 53 35 61 28 ...
#>  $ other.installment.plans                            : Factor w/ 3 levels "bank","stores",..: 3 3 3 3 3 3 3 3 3 3 ...
#>  $ housing                                            : Factor w/ 3 levels "rent","own","for free": 2 2 2 3 3 3 2 1 2 2 ...
#>  $ creditability                                      : int  0 1 0 0 1 0 0 0 0 1 ...
#>  - attr(*, ".internal.selfref")=<externalptr> 
str(dt_sel2$rm)
#> Classes ‘data.table’ and 'data.frame':	20 obs. of  5 variables:
#>  $ variable      : chr  "foreign.worker" "job" "number.of.existing.credits.at.this.bank" "number.of.people.being.liable.to.provide.maintenance.for" ...
#>  $ missing_rate  : num  0 0 0 0 0 0 0 0 0 0 ...
#>  $ identical_rate: num  0.963 0.63 0.633 0.845 0.548 0.413 0.596 0.051 0.003 0.53 ...
#>  $ info_value    : num  0.0439 0.0088 0.0133 0 0.0088 ...
#>  $ rm_reason     : chr  "identical_rate>0.95" "info_value<0.02" "info_value<0.02" "info_value<0.02" ...
#>  - attr(*, ".internal.selfref")=<externalptr> 

# keep columns manually, such as rowid
germancredit$rowid = row.names(germancredit)
dt_sel3 = var_filter(germancredit, y = "creditability", var_kp = 'rowid')
#>  Filtering variables via missing_rate, identical_rate, info_value ...
#>  1 variables are removed via identical_rate
#>  6 variables are removed via info_value
#>  Variable filtering on 1000 rows and 21 columns in 00:00:00
#>  7 variables are removed in total

# remove columns manually
dt_sel4 = var_filter(germancredit, y = "creditability", var_rm = 'rowid')
#>  Filtering variables via missing_rate, identical_rate, info_value ...
#>  1 variables are removed via identical_rate
#>  6 variables are removed via info_value
#>  Variable filtering on 1000 rows and 21 columns in 00:00:00
#>  8 variables are removed in total