Supervised Learning Techniques in R

R Source Code by Leonid Shpaner

################################################################################
################################ Part One ######################################
############################ Building A Model ##################################
################################################################################

# function for loading necessary libraries and installing them if they have not
# yet been installed

pack <- function(lib) {
   new.lib <- lib[!(lib %in% installed.packages()[, 'Package'])]
   if (length(new.lib))
      install.packages(new.lib, dependencies = TRUE)
   sapply(lib, require, character.only = TRUE)
}

packages <- c('partykit', 'e1071', 'caret', 'corrplot', 'MASS', 'car', 'DT', 
              'ggplot2', 'cowplot', 'ggpubr', 'rms', 'pander', 'ROCR', 'pROC')

pack(packages) # run function

# set working directory by concatenating long string
string1 <- 'C:/Users/lshpaner/OneDrive/Cornell University/Coursework'
string2 <- '/Data Science Certificate Program/CEEM585 '
string3 <- '- Supervised Learning Techniques'

# concatenate each string
working_dir <- paste(string1, string2, string3, sep = '')

# set the working directory by calling function
setwd(working_dir) 

# confirm working directory
getwd() 

# In this part of the project, you will focus on building a model to
# understand who might make a good product technician if hired, using linear
# discriminant analysis, logit, and ordered logit modeling. The data set you
# will be using is in the file HRdata2groups.csv, contained in the RStudio
# instance.

# 1.	The four performance scores in PerfScore have been mapped into two new 
# categories of Satisfactory and Unsatisfactory under the heading of 
# CollapseScore. Assume that levels 1 and 2 are unacceptable and levels 3 and 4 
# are acceptable. Build a linear discriminant analysis using regression with 
# these two categories as the dependent variable. The purpose of this question 
# is for you to examine the independent variables and conclude which one to 
# include in the regression model. Several are not useful. Remember that when we 
# do this, only the coefficients in the model are useful. You may use the 
# function lm(), which has the syntax lm(dependent variable ~ independent
# variable 1 + independent variable 2 + ..., data = frame). Note that lm() is
# part of base R's stats package, not caret; caret is loaded here for its
# other helper functions via library(caret).

# Notice that you have several variables that might be used as independent 
# variables.  You should pick the variables to include based on how effective 
# they are at explaining the variability in the dependent variable as well as 
# which variables might be available should you need to use this model to 
# determine if a candidate is likely to make a good employee. You may assume 
# that the verbal and mechanical scores will be available at the point where a 
# decision about hiring is to be made. In this question, please give us the 
# linear discriminant model you have developed.

# read in the data
hr_data <- read.csv('HRdata2groups.csv')

# inspect the dataset in an interactive table
datatable(hr_data) 

# map the performance scores into two categorical classes
hr_data$CollapseScore <- ifelse(hr_data$PerfScoreID >= 3, 'Acceptable', 
                                                          'Unacceptable')
# numerically binarize these performance scores
hr_data$Score <- ifelse(hr_data$CollapseScore == 'Acceptable', 1, 0)
datatable(hr_data, options = list(scrollX = TRUE))

# extract the numeric predictors (i.e., remove identifier and target columns)
hr_data_numeric <- subset(hr_data, select = -c(EmpID, CollapseScore, Score))

# create function for plotting histograms to check for near-zero variance in
# distributions; inputs --> df: dataframe of interest, x/y: plot grid rows/cols
nearzerohist <- function(df, x, y) {
  
   # x rows by y columns & adjust margins
   par(mfrow = c(x, y), mar = c(4, 4, 4, 0)) 
   for (i in 1:ncol(df)){
     hist(df[, i], 
          xlab = names(df[i]), 
          main = paste(names(df[i]), ''), 
          col = 'gray18')
   }  
  
   # check for near-zero variance predictors using an if-else statement
   nearzero_names <- nearZeroVar(df, names = TRUE)
   if (length(nearzero_names) == 0) {
      print('There are no near-zero variance predictors.')
   } else {
     cat('The following near-zero variance predictors exist:',
         nearzero_names)
   }
}

# call the `nearzerohist()` function
nearzerohist(hr_data_numeric, x = 2, y = 3) 


# function for generating class balance table and barplot
# inputs -->   feat: feature or column of interest
#              title: plot title
#                 x: label for the first class bar
#                 y: label for the second class bar

class_balance <- function(feat, title, x, y) {
  
   # check target column's class balance
   # parse target variable into table showcasing class distribution
   feat_table <- table(unname(feat)) # generate table for column
   
   # fix plot margins
   par(mar = c (2, 2, 2, 1)) 
      # plot the counts (values) of each respective class on barplot
   barplot(feat_table, main = title, space = c(0), horiz = FALSE,
           names.arg = c(x, y), 
           col = c('cornflowerblue', 'brown2'))
    
   return (feat_table)
    
}

class_balance(feat = hr_data$CollapseScore, title = 'HR by Class', 
                 x = 'Acceptable', y = 'Unacceptable')

# 2. Explain the variables you decided to use in the model 
# described above and why.    

# create function to plot correlation matrix and establish multicollinearity
# takes one input (df) to pass in dataframe of interest
multicollinearity <- function(df) {
  
      # Examine between predictor correlations/multicollinearity
      corr <- cor(df, use = 'pairwise.complete.obs')
      corrplot(corr, mar = c(0, 0, 0, 0), method = 'color', 
                     col = colorRampPalette(c('#FC0320', '#FFFFFF', 
                                              '#FF0000'))(100), 
                     addCoef.col = 'black', tl.srt = 45, tl.col = 'black', 
                     type = 'lower')
      
      # find variables that are highly correlated based on a 0.75 threshold
      highCorr_names <- findCorrelation(corr, cutoff = 0.75, names = TRUE)
      cat(' The following variables should be omitted:',
          paste('\n', unlist(highCorr_names)))
    
}      

# determine multicollinearity
multicollinearity(hr_data[c(1:7)])

# use a linear model to confirm multicollinearity with VIF
# remove CollapseScore since it is the target in categorical form, and we are
# only interested in comparing between-predictor relationships
model_all <- lm(Score ~ . - CollapseScore, data = hr_data)

# use car library to extract VIF and parse it into a pandoc table using the 
# linear model as a proxy for analysis
pandoc.table(vif(model_all), style = 'simple', split.table = Inf)

# create vector of VIF values for plotting
vif_values <- vif(model_all) 

par(mar = c(7.5, 2, 1, 1)) # fix plot margins
# create column chart to display each VIF value
barplot(vif_values, main = 'VIF Values', horiz = FALSE, col = 'steelblue', 
        las = 2) 

# add horizontal line at 5; VIF values above 5 indicate severe multicollinearity
abline(h = 5, lwd = 3, lty = 2)   
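
# For reference, VIF_j = 1 / (1 - R_j^2), where R_j^2 comes from regressing
# predictor j on the remaining predictors. A minimal sketch (not part of the
# original analysis) verifying this for MechanicalApt; Score and CollapseScore
# are dropped so the auxiliary regression uses the same predictors as model_all
r2_mech <- summary(lm(MechanicalApt ~ . - Score - CollapseScore,
                      data = hr_data))$r.squared
1 / (1 - r2_mech) # should match vif(model_all)['MechanicalApt']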

# average the two aptitude scores, since keeping both creates multicollinearity
hr_data$Aptitude <- rowMeans(hr_data[, c(6, 7)], na.rm = TRUE) 

# create a final dataframe with selected columns of interest for modeling
hr_data_final <- hr_data[, c(3, 5, 8, 9, 10)]

# Re-examine between predictor correlations/multicollinearity
highCorr <- findCorrelation(cor(hr_data_final[c(1, 2, 5)]), cutoff = 0.75, 
                            names = TRUE)
cat(' The following variables should be omitted:',   
paste('\n', unlist(highCorr)))

# create function for plotting correlations between variables
# inputs: df: dataframe, xvar: independent variable, yvar: dependent variable,
#         title: plot title, xlab: x-axis label, ylab: y-axis label
correl_plot <- function(df, xvar, yvar, title, xlab, ylab) {
  
   ggplot(df, aes(x = xvar, y = yvar)) +
   ggtitle(title) +
   xlab(xlab) + ylab(ylab) +
   geom_point(pch = 1) + ylim(0, 1.25) +
   geom_smooth(method = 'lm', se = FALSE) +
   theme_classic() +
   stat_cor(method = 'pearson', label.x = 0.15, label.y = 0.20) # correl coeff. 
  
}

# create three correlation plots on same grid
plot1 <- correl_plot(hr_data_final, xvar = hr_data_final$EmpStatusID, 
                     yvar = hr_data_final$Score, title = 'Score vs. EmpStatusID', 
                     xlab = 'EmpStatusID', ylab = 'Score')

plot2 <- correl_plot(hr_data_final, xvar = hr_data_final$EmpSatisfaction, 
                     yvar = hr_data_final$Score, 
                     title = 'Score vs. EmpSatisfaction', 
                     xlab = 'EmpSatisfaction', ylab = 'Score')

plot3 <- correl_plot(hr_data_final, xvar = hr_data_final$Aptitude,
                     yvar = hr_data_final$Score, title = 'Score vs. Aptitude',
                     xlab = 'Aptitude', ylab = 'Score')

# plot all correlations together
plot_grid(plot1, plot2, plot3, labels = 'AUTO', ncol = 3)

par(mar = c(4, 2, 0, 0)) # fix plot margins
# Fit the Linear Discriminant Analysis (LDA) model
lda_fit <- lda(Score ~  EmpStatusID + EmpSatisfaction + Aptitude, 
               data = hr_data_final); lda_fit
plot(lda_fit) # plot the lda model

# 3. The regression model can be used to classify each of the individuals in the 
# dataset. As discussed in the videos, you will need to find the cutoff value 
# for the regression value that separates the unsatisfactory performers from the 
# satisfactory performers. Find this value and determine whether individual 5 is 
# predicted to be satisfactory or not.

# Fit a regression model
reg_model <- lm(Score ~ EmpStatusID + EmpSatisfaction + Aptitude, 
                data = hr_data_final)

# store the predicted values from the regression model (assigned to reg_model)
# in the variable pred
pred <- predict(reg_model, hr_data_final) 

# find the mean value of the regression for all observations of unsatisfactory 
# and satisfactory employees
meanunsat <- mean(pred[hr_data_final$Score == 0]) 
meansat <- mean(pred[hr_data_final$Score == 1]) 
cat(' Mean of Satisfactory Results =', meansat, '\n', 
    'Mean of Unsatisfactory Results =', meanunsat, '\n')

# determine the cutoff value
cutoff <- 0.5*(meanunsat + meansat)
cat(' Cutoff Value =', cutoff)
cbind_hrdatafinal <- cbind(hr_data_final, pred)
datatable(cbind_hrdatafinal)
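
# classify individual 5 by comparing the fifth fitted value against the cutoff
# (a minimal sketch; row 5 is assumed to correspond to individual 5)
ifelse(pred[5] >= cutoff, 'Satisfactory', 'Unsatisfactory')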

# 4.	Construct a logit model using the two performance groups. Compare this 
# model and the discriminant analysis done in step 1. To construct the logit 
# model, use the function `lrm()` in the library rms. 

# Construct a logit model using the two performance groups
logit <- lrm(Score ~ MechanicalApt + VerbalApt, data = hr_data); logit
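
# A minimal sketch (not part of the original analysis): compare the logit
# classifications (0.5 probability threshold) with the LDA classifications
# from question 1 by measuring how often the two models agree
logit_prob <- predict(logit, hr_data, type = 'fitted') # P(Score = 1)
logit_class <- ifelse(logit_prob >= 0.5, 1, 0)
lda_class <- as.numeric(as.character(predict(lda_fit, hr_data_final)$class))
cat('Agreement between logit and LDA:', mean(logit_class == lda_class))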

# Build an ordered logit model for the full four categories for performance
ologit <- lrm(PerfScoreID ~ Termd + EmpStatusID + EmpSatisfaction, 
              data = hr_data)
ologit
# probability that individual two is in each of the four performance categories
pred_ologit <- predict(ologit, newdata = hr_data, type = 'fitted.ind')

# inspect the dataframe
pandoc.table(head(pred_ologit), style = 'grid', split.table = Inf, round = 4) 
# get predictions only for second individual
individual2 <- pred_ologit[2, ]; cat('\n') 
par(mar = c(4, 4, 1, 1)) # fix plot margins
plot(individual2, type = 'l', main = 'Predictions for Individual 2', 
     xlab = 'Category', ylab = 'Probability')
pandoc.table(individual2, style = 'simple', split.table = Inf, round = 4)

################################################################################
################################ Part Two ######################################
############## Using Naïve Bayes to Predict a Performance Score ################
################################################################################

# In this part of the project, you will use naïve Bayes to predict a performance 
# score. This part continues the scenario from Part One and uses the same 
# modified version of the human resources data set available on the Kaggle 
# website. The data set you will be using is in the file `NaiveBayesHW.csv` 
# file. Over the course of this project, your task is to gain insight into who 
# might be a “high” performer if hired.

# 1.	Using only the mechanical aptitude score, use naïve Bayes to predict the 
# performance score for each employee. Professor Nozick discretized the 
# mechanical scores into four classes. Notice only three of four classes have 
# observations. This discretization is in the data file `NaiveBayesHW.csv`. The 
# function to create the model is `naiveBayes()`. 

naive_df <- read.csv('NaiveBayesHW.csv') # read in the dataset

# inspect the dataset
datatable(naive_df) 

# fit the naive Bayes model and assign it to a variable
nbmodel <- naiveBayes(PerfScore ~ MechanicalApt, data = naive_df)
print(nbmodel)

# generate predictions from the naive Bayes model
# type = 'raw' specifies that R should return the probability that a point is
# in each performance class. Not specifying a type would instead return the
# most likely class for each point.

pred_bayes <- predict(nbmodel, naive_df, type = 'raw')  
head(pred_bayes, 20) # inspect the first 20 rows

# 2.	Using this modeling approach, what is your assessment of the probability 
# that individual 10 will evolve into each of the four performance classes if
# hired? This can be done using the model created above and the `predict()`
# function. The arguments for that function are the model name, the data, and for
# type use “raw”. This question is parallel to the Practice using Naïve Bayes 
# activity you completed in R.

# table the probabilities of each respective class for the individual
# get the 10th row only
individual10 <- pred_bayes[10, ] 
# assign to a dataframe
individual10 <- data.frame(individual10) 
# rename the column
colnames(individual10) <- c('Probability') 
# show the table
individual10 

################################################################################
############################### Part Three #####################################
####################### Building Classification Trees ##########################
################################################################################

# In this part of the project, you will build classification trees. This part 
# continues the scenario from Parts One and Two, as it uses the same modified 
# version of the human resources data set available on the Kaggle website. Use 
# the `HRdata4groups.csv` data set to predict each individual's performance 
# (Performance Score ID) using classification trees. In the space below, you 
# will explain the model you have developed and describe how well it performs.

# 1.	In the space below, explain the model you developed. It is sufficient to 
# use the function `ctree()` in R to accomplish this in the style of the codio 
# exercise Practice: Building a Classification Tree in R—Small Example.

hrdata_groups <- read.csv('HRdata4groups.csv') # read in the dataset

# inspect the first five rows of the dataset
datatable(hrdata_groups, options = list(scrollX = TRUE)) 

str(hrdata_groups) # print out the structure of the dataframe

# Examine between predictor correlations/multicollinearity
highCorr <- findCorrelation(cor(hrdata_groups[c(-2)]), cutoff = 0.75, 
                            names = TRUE)
cat(' The following variables should be omitted: \n', paste(unlist(highCorr)))

# create aptitude from averaged MechanicalApt and VerbalApt scores
hrdata_groups$Aptitude <- rowMeans(hrdata_groups[, c(9, 10)], na.rm = TRUE)
# mechanical aptitude, and verbal aptitude are omitted
hrgroups_final <- hrdata_groups[, c(-9, -10)] # finalize dataframe for modeling

# Re-examine between predictor correlations/multicollinearity
highCorr <- findCorrelation(cor(hrgroups_final[, c(-2)]), cutoff = 0.75, 
                            names = TRUE)
cat(' The following variables should be omitted:', '\n',  
paste(unlist(highCorr)))

# build the classification tree
ctout <- ctree(PerfScoreID ~ ., data = hrgroups_final) 
ctout 

# predict the performance score based on all input features of final df
ctpred <- predict(ctout, hrgroups_final)

# Check the percentage of time that the classification tree correctly classifies 
# a data point
cat('Correct Classification of Data Point:', 
    mean(ctpred == hrgroups_final$PerfScoreID)) 
plot(ctout) # plot the classification tree

# 2. In the space below, describe how well your model performs. 
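
# A minimal sketch (not part of the original analysis): cross-tabulate the
# tree's predictions against the actual scores to see where any
# misclassifications fall (with a numeric response, ctree predicts node means)
table(Predicted = ctpred, Actual = hrgroups_final$PerfScoreID)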

################################################################################
############################### Part Four ######################################
############################ Support Vector Machines ##########################
################################################################################

# In this part of the project, you will apply SVM to a data set. The RStudio 
# instance contains the file `acquisitionacceptanceSVM.csv`, which includes 
# information about whether or not homeowners accepted a government offer to 
# purchase their home. 

# 1.	Apply the tool SVM to the acquisition data set in the CSV file 
# `acquisitionacceptanceSVM.csv` to predict which homeowners will most likely 
# accept the government’s offer. What variables did you choose to use in your 
# analysis?

acquisition <- read.csv('acquisitionacceptanceSVM.csv') # read in the dataset

# inspect the dataframe
datatable(acquisition, options = list(scrollX = TRUE)) 

str(acquisition) # obtain the structure of the dataframe

nearzerohist(acquisition[c(-12)], x = 4, y = 3)

multicollinearity(acquisition)

# recode the target as a labeled factor
acquisition$Accept <- ifelse(acquisition$Accept == 1, 'Accept', 'Not Accept')
acquisition$Accept <- as.factor(acquisition$Accept) 

set.seed(222) # set seed for reproducibility

# Use 70% of dataset as training set and remaining 30% as testing set
sample <- sample(c(TRUE, FALSE), nrow(acquisition), replace = TRUE, 
                 prob = c(0.7, 0.3))
train_acquisition <- acquisition[sample, ] # training set
test_acquisition <- acquisition[!sample, ] # test set

cat(' Training Dimensions:', dim(train_acquisition),
    '\n Testing Dimensions:', dim(test_acquisition), '\n',
    '\n Training Dimensions Percentage:', round(nrow(train_acquisition) /
                                                nrow(acquisition), 2),
    '\n Testing Dimensions Percentage:', round(nrow(test_acquisition) /
                                               nrow(acquisition), 2))

predictors <- train_acquisition[, c(-12)] # extract ind. var. from train set
target <- train_acquisition[, c(12)] # extract dep. var. from train set
target <- as.factor(target) # cast target as factor

# Support Vector Machines via caret (method 'svmLinear' uses kernlab)
model_svm  <- train(predictors, target, method = 'svmLinear')

# plot the variable importance
svm_varimp <- varImp(object = model_svm)
ggplot2::ggplot(svm_varimp) + 
  ggtitle('SVM - Variable Importance') + 
  scale_y_continuous(expand = c(0, 0)) +
  theme_classic() +
  theme(plot.margin = unit(c(1, 1, 0, 0), 'cm')) +
  theme(axis.text = element_text(color = 'black'),
        axis.title = element_text(color = 'black'))

train_df <- train_acquisition[, c(8, 11, 12)]
test_df <- test_acquisition[, c(8, 11, 12)]

# column names of df to confirm cols
pandoc.table(colnames(train_df), style = 'simple') 

# tune the support vector machine over a grid of cost values and kernel choices
set.seed (222) # set seed for reproducibility
tune.out <- tune(svm, Accept ~ Price75 + Price125, data = train_df, 
                 ranges = list(cost = 10 ^ seq(-3, 3), 
                               kernel = c('linear', 'polynomial', 
                                          'radial')))

bestparam <- tune.out$best.parameters # best hyperparameters
bestmod <- tune.out$best.model # best model based on tuning parameters
bestparam  # print out the best hyperparameters
summary(tune.out)

# Construct Soft Margin SVM
acquisition_result <- svm(Accept ~ Price125 + Price75, kernel = 'linear', 
                          gamma = 0.001, cost = 0.01, epsilon = 0, 
                          data = train_df, decision.values = TRUE)
print(acquisition_result)

# Visualize the SVM decision boundary on the training data, with Price75
# and Price125 as the features
plot(acquisition_result, data = train_df)

# create function for outputting a confusion matrix in a pandoc-style format
# where inputs --> df1: fitted model object
#                  df2: dataset to predict on
#                  feat: target column
#                  x: H0 label (e.g., 'yes', 'accept', '1', etc.)
#                  y: H1 label (e.g., 'no', 'not accept', '0', etc.)
#                  custom_name: any string you want to pass into table name
      
conf_matrix <- function (df1, df2, feat, x, y, custom_name) {
  
    prediction <- predict(df1, newdata = df2)
    # cross-tabulate predictions against the actual labels
    pred_table <- table(prediction, feat)
    # build a pander-style confusion matrix table
    metrics <- c(x, y)
    h0 <- c(pred_table[1], pred_table[2])
    h1 <- c(pred_table[3], pred_table[4])
    # create table as dataframe from above variables
    table <- data.frame(metrics, h0, h1)
    # change column names of table
    colnames(table) <- c(' ', x, y)
    table %>% pander(style = 'simple', justify = c('center'),
                     caption = sprintf('Confusion Matrix for %s', 
                                                       custom_name))

}

conf_matrix(df1 = acquisition_result, df2 = train_df, feat = train_df$Accept, 
             x = 'Accept', y = 'Not Accept', custom_name = 'Train Set')

conf_matrix(df1 = acquisition_result, df2 = test_df, feat = test_df$Accept, 
             x = 'Accept', y = 'Not Accept', custom_name = 'Test Set')

# create function for calculating model performance metrics that takes in the 
# following inputs --> df1: fitted model object
#                      df2: dataset to predict on
#                      feat: target column
#                      custom_name: any string you want to pass into table name
#

perf_metrics <- function(df1, df2, feat, custom_name) {
  
   prediction <- predict(df1, newdata = df2)
   # cross-tabulate predictions against the actual labels
   df <- table(prediction, feat)  
  
   tp <- df[1] # position of true positives
   tn <- df[4] # position of true negatives
   fp <- df[3] # position of false positives
   fn <- df[2] # position of false negatives
  
   # calculate model performance metrics ('Accept' treated as the positive class)
   accuracy <- round((tp + tn) / (tp + tn + fp + fn), 2) # overall accuracy
   prec <- round(tp / (tp + fp), 2) # precision (positive predictive value)
   sens <- round(tp / (tp + fn), 2) # sensitivity (recall)
   f1 <- round(tp / (tp + 0.5 * (fp + fn)), 2) # F1-score

   # print out pander-style table with performance results
   metrics <- c('Accuracy', 'Precision', 'Sensitivity', 'F1-Score')
   values <- c(accuracy, prec, sens, f1)
   table <- data.frame(Metric = metrics, Value = values)
   table %>% pander(style = 'simple', justify = c('center'),
                    caption = sprintf('Performance Metrics for %s', custom_name))
  
}

# call the `perf_metrics` function to establish performance metrics for train set
perf_metrics(df1 = acquisition_result, df2 = train_df, feat = train_df$Accept,
             custom_name = 'Training Set')

# call the `perf_metrics` function to establish performance metrics for test set
perf_metrics(df1 = acquisition_result, df2 = test_df, feat = test_df$Accept, 
             custom_name = 'Test Set')

# 2. How good was your model at correctly predicting who would and who would not
# accept the offer?

# extract SVM decision values for the test set ('type' is not a predict.svm
# argument; decision.values = TRUE returns them as an attribute)
test_pred <- predict(acquisition_result, test_df, decision.values = TRUE)
test_dv <- as.numeric(attributes(test_pred)$decision.values)
# decision values point toward the first factor level, so make 'Accept' positive
pr <- prediction(test_dv, test_df$Accept,
                 label.ordering = c('Not Accept', 'Accept'))
prf <- performance(pr, measure = 'tpr', x.measure = 'fpr')
test_roc <- roc(test_df$Accept ~ test_dv)
auc <- round(as.numeric(test_roc$auc), 2); par(mar = c(4, 4, 2, 1))
plot(prf, main = 'SVM ROC Curve', col = 'red', xlab = 'False Positive Rate',
                                               ylab = 'True Positive Rate')
abline(0, 1, col = 'black', lty = 2, lwd = 1)
legend(0.73, 0.2, legend = paste('AUC =', auc), lty = 1, col = 'red')

# 3.	When building models, we often use part of the data to estimate the model
# and use the remainder for prediction. Why do we do this? It is not necessary
# to do so for each of the problems above, but it is essential to realize that
# you will need to do this in practice.
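
# A minimal sketch (illustration only, not required by the prompt): k-fold
# cross-validation with caret generalizes the single train/test split used
# above by rotating the held-out portion, so every observation is used for
# both fitting and validation
set.seed(222)
cv_ctrl <- trainControl(method = 'cv', number = 5) # 5-fold cross-validation
cv_svm <- train(Accept ~ Price75 + Price125, data = acquisition,
                method = 'svmLinear', trControl = cv_ctrl)
cv_svm$results # accuracy averaged over the five held-out folds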