DataCamp Fraud Detection in R
Introduction & Motivation
FRAUD DETECTION IN R
Introduction & Motivation Bart Baesens Professor Data Science - - PowerPoint PPT Presentation
DataCamp Fraud Detection in R FRAUD DETECTION IN R Introduction & Motivation Bart Baesens Professor Data Science at KU Leuven DataCamp Fraud Detection in R Instructors DataCamp Fraud Detection in R Instructors DataCamp Fraud
DataCamp Fraud Detection in R
FRAUD DETECTION IN R
DataCamp Fraud Detection in R
DataCamp Fraud Detection in R
DataCamp Fraud Detection in R
DataCamp Fraud Detection in R
DataCamp Fraud Detection in R
DataCamp Fraud Detection in R
DataCamp Fraud Detection in R
DataCamp Fraud Detection in R
DataCamp Fraud Detection in R
DataCamp Fraud Detection in R
DataCamp Fraud Detection in R
DataCamp Fraud Detection in R
DataCamp Fraud Detection in R
DataCamp Fraud Detection in R
DataCamp Fraud Detection in R
prop.table(table()) to determine percentage of fraud
> prop.table(table(fraud_label)) 0 1 0.9911 0.0089
DataCamp Fraud Detection in R
> labels <- c("no fraud", "fraud") > labels <- paste(labels, round(100*prop.table(table(fraud_label)), 2)) > labels <- paste0(labels, "%") > pie(table(fraud_label), labels, col = c("blue", "red"), main = "Pie chart of storm claims")
DataCamp Fraud Detection in R
DataCamp Fraud Detection in R
> predictions <- rep.int(0, nrow(claims)) > predictions <- factor(predictions, levels = c(0, 1)) > library(caret) > confusionMatrix(data = predictions, reference = fraud_label) Confusion Matrix and Statistics Reference Prediction 0 1 0 614 14 1 0 0 Accuracy : 0.9777
DataCamp Fraud Detection in R
> total_cost <- sum(claim_amount[fraud_label == "fraud"]) > print(total_cost) [1] 2301508
DataCamp Fraud Detection in R
FRAUD DETECTION IN R
DataCamp Fraud Detection in R
FRAUD DETECTION IN R
DataCamp Fraud Detection in R
DataCamp Fraud Detection in R
> data(timestamps) > head(timestamps) [1] "20:27:28" "21:08:41" "01:30:16" "00:57:04" "23:12:14" "22:54:16" > library(lubridate) > ts <- as.numeric(hms(timestamps)) / 3600 > head(ts) [1] 20.4577778 21.1447222 1.5044444 0.9511111 23.2038889 22.9044444
DataCamp Fraud Detection in R
> library(ggplot2) > clock <- ggplot(data.frame(ts), aes(x = ts)) + geom_histogram(breaks = seq(0, 24), colour = "blue", fill = "lightblue") + coord_polar() > arithmetic_mean <- mean(ts) > clock + geom_vline(xintercept = arithmetic_mean, linetype = 2, color = "red", size = 2)
DataCamp Fraud Detection in R
DataCamp Fraud Detection in R
1 2 n
DataCamp Fraud Detection in R
# Convert the decimal timestamps to class "circular" > library(circular) > ts <- circular(ts, units = "hours", template = "clock24") > head(ts) Circular Data: [1] 20.457889 21.144607 1.504422 0.950982 23.203917 4.904397 > estimates <- mle.vonmises(ts) > p_mean <- estimates$mu %% 24 > concentration <- estimates$kappa
DataCamp Fraud Detection in R
DataCamp Fraud Detection in R
i time
> estimates <- mle.vonmises(ts) > p_mean <- estimates$mu %% 24 > concentration <- estimates$kappa > densities <- dvonmises(ts, mu = p_mean, kappa = concentration)
DataCamp Fraud Detection in R
> alpha <- 0.90 > quantile <- qvonmises((1 - alpha)/2, mu = p_mean, kappa = concentration) %% 24 > cutoff <- dvonmises(quantile, mu = p_mean, kappa = concentration) > time_feature <- densities >= cutoff
DataCamp Fraud Detection in R
DataCamp Fraud Detection in R
DataCamp Fraud Detection in R
DataCamp Fraud Detection in R
> print(ts) [1] 18.42 20.45 20.88 0.75 19.20 23.65 6.08 > time_feature = c(NA, NA) > for (i in 3:length(ts)) { # Previous timestamps ts_history <- ts[1:(i-1)] # Estimate mu and kappa on historic timestamps estimates <- mle.vonmises(ts_history) p_mean <- estimates$mu %% 24 concentration <- estimates$kappa # Estimate density of current timestamp dens_i <- dvonmises(ts[i], mu = p_mean, kappa = concentration) # Check if density is larger than cutoff with confidence level 90% alpha <- 0.90 quantile <- qvonmises((1-alpha)/2, mu=p_mean, kappa=concentration) %% 24 cutoff <- dvonmises(quantile, mu = p_mean, kappa = concentration) time_feature[i] <- dens_i >= cutoff } > print(time_feature) [1] NA NA TRUE FALSE TRUE TRUE FALSE
DataCamp Fraud Detection in R
FRAUD DETECTION IN R
DataCamp Fraud Detection in R
FRAUD DETECTION IN R
DataCamp Fraud Detection in R
> trans %>% select(fraud_flag, orig_account_id, benef_country, authentication_cd, channel_cd, amount) fraud_flag account_name benef_country authentication_cd channel_cd amount 1 0 Bob ISO03 AU02 CH07 549 2 0 Alice ISO03 AU03 CH04 37 3 0 Bob ISO03 AU04 CH07 25 4 0 Bob ISO03 AU02 CH06 25 5 0 Alice ISO03 AU01 CH07 13 6 0 Bob ISO03 AU02 CH06 785 7 0 Alice ISO03 AU03 CH04 49 8 0 Bob ISO03 AU02 CH07 35 ... ... ... ... ... ... ... 36 0 Alice ISO03 AU05 CH04 126 37 0 Bob ISO03 AU02 CH06 22 38 0 Alice ISO03 AU03 CH04 41 39 1 Bob ISO03 AU03 CH05 3779 40 1 Alice ISO03 AU04 CH05 1531
DataCamp Fraud Detection in R
fraud_flag authentication_cd 0 1 AU01 6 0 AU02 0 0 AU03 7 0 AU04 0 1 AU05 9 0
DataCamp Fraud Detection in R
fraud_flag authentication_cd 0 1 AU01 6 0 AU02 0 0 AU03 7 0 AU04 0 1 AU05 9 0 fraud_flag authentication_cd 0 1 AU01 1 0 AU02 8 0 AU03 0 1 AU04 7 0 AU05 0 0
DataCamp Fraud Detection in R
> library(dplyr) > trans <- trans %>% arrange(timestamp)
DataCamp Fraud Detection in R
> library(dplyr) > trans <- trans %>% arrange(timestamp) > trans_Alice <- trans %>% filter(account_name == "Alice")
DataCamp Fraud Detection in R
> library(dplyr) > trans <- trans %>% arrange(timestamp) > trans_Alice <- trans %>% filter(account_name == "Alice") steps authentication_cd freq_auth AU03 0
DataCamp Fraud Detection in R
> frequency_fun <- function(steps, auth_method) { n <- length(steps) frequency <- sum(auth_method[1:n] == auth_method[n + 1]) return(frequency) } steps authentication_cd freq_auth AU03 0 1 AU03 1
DataCamp Fraud Detection in R
> frequency_fun <- function(steps, auth_method) { n <- length(steps) frequency <- sum(auth_method[1:n] == auth_method[n + 1]) return(frequency) } steps authentication_cd freq_auth AU03 0 1 AU03 1 2 AU03 2
DataCamp Fraud Detection in R
> frequency_fun <- function(steps, auth_method) { n <- length(steps) frequency <- sum(auth_method[1:n] == auth_method[n + 1]) return(frequency) } steps authentication_cd freq_auth AU03 0 1 AU03 1 2 AU03 2 3 AU01 0
DataCamp Fraud Detection in R
> frequency_fun <- function(steps, auth_method) { n <- length(steps) frequency <- sum(auth_method[1:n] == auth_method[n + 1]) return(frequency) } steps authentication_cd freq_auth AU03 0 1 AU03 1 2 AU03 2 3 AU01 0 4 AU01 1
DataCamp Fraud Detection in R
> library(zoo) > freq_auth <- rollapply(trans_Alice$transfer_id, width = list(-1:-length(trans_Alice$transfer_id)), partial = TRUE, FUN = frequency_fun, trans_Alice$authentication_cd)
DataCamp Fraud Detection in R
> library(zoo) > freq_auth <- rollapply(trans_Alice$transfer_id, width = list(-1:-length(trans_Alice$transfer_id)), partial = TRUE, FUN = frequency_fun, trans_Alice$authentication_cd) > freq_auth <- c(0, freq_auth)
DataCamp Fraud Detection in R
authentication_cd freq_auth fraud_flag 1 AU03 0 0 2 AU03 1 0 3 AU03 2 0 4 AU01 0 0 5 AU01 1 0 6 AU05 0 0 7 AU05 1 0 8 AU05 2 0 9 AU01 2 0 10 AU05 3 0 11 AU05 4 0 12 AU05 5 0 13 AU03 3 0 14 AU05 6 0 15 AU01 3 0 16 AU05 7 0 17 AU03 4 0 18 AU01 4 0 19 AU01 5 0 20 AU03 5 0 21 AU05 8 0 22 AU03 6 0 23 AU04 0 1
DataCamp Fraud Detection in R
> trans %>% group_by(account_name) > trans <- trans %>% group_by(account_name) %>% mutate(freq_auth = c(0, rollapplyr(transfer_id, width = list(-1:-length(transfer_id)), partial = TRUE, FUN = frequency_fun, authentication_cd) ) )
DataCamp Fraud Detection in R
account_name authentication_cd freq_auth fraud_flag 1 Bob AU02 0 0 2 Alice AU03 0 0 3 Bob AU04 0 0 4 Bob AU02 1 0 5 Alice AU01 0 0 6 Bob AU02 2 0 7 Alice AU03 1 0 8 Bob AU02 3 0 9 Alice AU01 1 0 10 Bob AU04 1 0 11 Bob AU02 4 0 12 Alice AU01 2 0 13 Alice AU05 0 0 14 Alice AU05 1 0 15 Alice AU05 2 0 16 Bob AU02 5 0 17 Bob AU04 2 0 18 Bob AU02 6 0 ... ... ... ... ... 37 Bob AU02 7 0 38 Alice AU03 5 0 39 Bob AU03 0 1 40 Alice AU04 0 1
DataCamp Fraud Detection in R
FRAUD DETECTION IN R
DataCamp Fraud Detection in R
FRAUD DETECTION IN R
DataCamp Fraud Detection in R
DataCamp Fraud Detection in R
DataCamp Fraud Detection in R
DataCamp Fraud Detection in R
DataCamp Fraud Detection in R
DataCamp Fraud Detection in R
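$\text{recency} = e^{-\gamma t}$, where $t$ is the time elapsed since the previous transfer with the same authentication method and $\gamma$ is the decay rate.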
DataCamp Fraud Detection in R
DataCamp Fraud Detection in R
> gamma <- -log(0.01)/180 > gamma [1] 0.02558428
DataCamp Fraud Detection in R
#' Recency of the authentication method used by the current transfer
#'
#' Written to be used as the `FUN` of a rolling-window call whose window
#' starts at the current observation and extends backwards (offsets
#' `0, -1, -2, ...`), so the window is ordered newest-first while `auth_cd`
#' and `freq_auth` are the full, chronologically ordered vectors.
#'
#' @param t Timestamps in the window: `t[1]` is the current timestamp and
#'   `t[2:length(t)]` are the earlier ones, most recent first.
#' @param gamma Decay rate applied to the elapsed time.
#' @param auth_cd Authentication codes in chronological order; element
#'   `length(t)` is the code of the current transfer.
#' @param freq_auth Frequency feature aligned with `auth_cd`; element
#'   `length(t)` is the frequency for the current transfer.
#' @return `0` when the current frequency is zero, otherwise
#'   `exp(-gamma * time_diff)`, where `time_diff` is the time elapsed since
#'   the previous transfer with the same authentication code.
recency_fun <- function(t, gamma, auth_cd, freq_auth) {
  n_t <- length(t)

  # recency = 0 when frequency = 0
  if (freq_auth[n_t] == 0) {
    return(0)
  }

  # Earlier codes are reversed so they line up with t[2:n_t] (newest first).
  same_auth <- auth_cd[(n_t - 1):1] == auth_cd[n_t]

  # time-interval = current timestamp
  #                 - timestamp of previous transfer with same auth_cd
  time_diff <- t[1] - max(t[2:n_t][same_auth])

  exp(-gamma * time_diff)
}
DataCamp Fraud Detection in R
> gamma <- -log(0.01)/180 # = 0.0256 > library(dplyr) # needed for group_by() and mutate() > library(zoo) # needed for rollapply() > trans <- trans %>% group_by(account_name) %>% mutate(rec_auth = rollapply(timestamp, width = list(0:-length(transfer_id)), partial = TRUE, FUN = recency_fun, gamma, authentication_cd, freq_auth))
DataCamp Fraud Detection in R
account_name timestamp authentication_cd rec_auth fraud_flag 1 Bob 44.25 AU02 0.000 0 2 Alice 54.12 AU03 0.000 0 3 Bob 57.45 AU04 0.000 0 4 Bob 64.29 AU02 0.599 0 5 Alice 64.29 AU03 0.771 0 6 Bob 64.29 AU02 1.000 0 7 Alice 70.25 AU03 0.859 0 8 Bob 70.25 AU02 0.859 0 9 Alice 74.08 AU01 0.000 0 10 Bob 74.08 AU04 0.653 0 11 Bob 74.08 AU02 0.907 0 12 Alice 83.93 AU01 0.777 0 13 Alice 96.21 AU05 0.000 0 14 Alice 96.21 AU05 1.000 0 15 Alice 98.25 AU05 0.949 0 16 Bob 109.27 AU02 0.406 0 17 Bob 123.89 AU04 0.280 0 18 Bob 155.95 AU02 0.303 0 ... ... ... ... ... ... 37 Bob 407.17 AU02 0.002 0 38 Alice 420.17 AU03 0.717 0 39 Bob 441.34 AU03 0.000 1 40 Alice 443.24 AU04 0.000 1
DataCamp Fraud Detection in R
DataCamp Fraud Detection in R
FRAUD DETECTION IN R