lapse rate

Coordinator
Jul 28, 2014 at 5:56 PM
library(rattle)
building <- TRUE
scoring <- ! building

A pre-defined value is used to reset the random seed so that results are repeatable.

seed <- 42
TrainingFile<-"C:/Code/R/FundingPattern/RFinput/RF_DEVELOP_SET.csv"
TestFolder <-"C:/Code/R/FundingPattern/"
TestFile <-"C:/Code/R/FundingPattern/RFinput/RF_VAL_SET.csv"
ModelFile<-"C:/Code/R/FundingPattern/Result/RF_lapse_July28.rda"
QuantFile <-"C:/Code/R/FundingPattern/Result/lapse_Quant.csv"
SummaryFile <-"C:/Code/R/FundingPattern/Result/lapse_Summary.csv"
SampleSetFile<-"C:/Code/R/FundingPattern/Result/RF_lapse_sample.csv"
ValidSetFile<-"C:/Code/R/FundingPattern/Result/RF_lapse_validate.csv"
TestSetFile<-"C:/Code/R/FundingPattern/Result/RF_lapse_test.csv"
importanceFile<-"C:/Code/R/FundingPattern/Result/RF_lapse_importance.csv"
PredictionFile<-"C:/Code/R/FundingPattern/result/RF_policies(lapse).csv"

NUM_OF_TEST <- 1
NUM_OF_MEASURE <- 17

crs$dataset <- read.csv(TrainingFile, na.strings=c(".", "NA", "", "?"), strip.white=TRUE, encoding="UTF-8")
crs$nobs <- nrow(crs$dataset) # 86037 observations
crs$sample <- crs$train <- sample(nrow(crs$dataset), 0.5crs$nobs) # 60225 observations
crs$validate <- sample(setdiff(seq_len(nrow(crs$dataset)), crs$train), 0.1
crs$nobs) # 12905 observations
crs$test <- setdiff(setdiff(seq_len(nrow(crs$dataset)), crs$train),crs$validate) # 12907 observations

The following variable selections have been noted.

crs$input <- c("nduration", "j", "DB_OPTION", "ln",
           "ln_cnt", "lr", "lr_cnt", "lb",
           "fc", "fc_cnt", "ps", "ps_cnt",
           "prem_path", "sum_paid", "end_zeros", "runout",
           "ANNUAL_PREMIUM_AMT", "POLICY_AGE", "FACE_AMOUNT", "LN_INT_RATE",
           "MIN_PREM_AMT", "MODE_PREM_AMT", "OWNER_AGE_AT_ISSUE", "level1",
           "tphase", "level2", "alpha", "slope",
           "stderr", "drops", "mf_dump", "proj_y1",
           "level31", "proj_y", "level3", "lifetime_prem_dev",
           "single_prem_dev", "ten_prem_dev", "ISSUE_YR", "duration",
           "cash_value", "INTEREST_RATE", "LAST_TRANSACTION_AMOUNT", "LOAN_AMOUNT",
           "REDUCTION_RATING", "TOTAL_PREMIUMS", "ACCT_VALUE", "BIRTHDATE",
           "CURRENT_FACE", "SURR_VALUE", "TARGET_PREMIUM", "cashpercnt",
           "accntpercnt", "IBE_9152", "IBE_2058", "IBE_2059",
           "IBE_2060", "IBE_2061", "IBE_2062", "IBE_4000",
           "IBE_8165", "IBE_8167", "IBE_8433", "IBE_8434",
           "IBE_8463", "IBE_8562", "IBE_8579", "IBE_8580",
           "IBE_8590", "IBE_8592", "IBE_8600", "IBE_8601",
           "IBE_8611", "IBE_8614", "IBE_8616", "IBE_8617",
           "IBE_8618", "IBE_8621", "IBE_8626", "IBE_8628",
           "IBE_8640", "IBE_8641", "IBE_8643", "IBE_8647",
           "IBE_8652", "IBE_8655", "IBE_8682", "IBE_8685",
           "IBE_8686", "IBE_8689", "IBE_8690", "IBE_8691",
           "IBE_8704", "IBE_8707", "IBE_8816", "IBE_9153",
           "IBE_9509", "IBE_9510", "IBE_9511", "IBE_9512",
           "IBE_9513", "IBE_9514", "ECONOMIC_STABILITY_IND", "HEAVYTRANSACTORS",
           "PROPENSITY_SCORE_INTERNET", "PROPENSITY_SCORE_MAIL", "PROPENSITY_SCORE_PHONE", "MEDIA_CHNL_INTERNET",
           "MEDIA_CHNL_CELLPHONE", "MEDIA_CHNL_PRIMETIME_TV", "MEDIA_CHNL_DAYTIME_TV", "MEDIA_CHNL_OUTDOOR",
           "MEDIA_CHNL_YELLOWPAGES", "MEDIA_CHNL_RADIO", "MEDIA_CHNL_MAGAZINE", "MEDIA_CHNL_NEWSPAPER",
           "AFFORDABILITY_IND", "PHONE_APPEND", "UNDERBANKED_9351", "TOT_OFLN_ORD_100_250_6698",
           "TOT_ONLN_PURCHASES_6718", "WKS_SNC_LST_OLN_ORD_6843", "WKS_SNC_LST_ORD_JWRY_6870", "FaceHousePercnt",
           "BILLING_FREQ")
crs$categoric <- NULL
crs$target <- "LAPSE"
crs$risk <- NULL
crs$ident <- NULL
crs$ignore <- c("SURRENDER", "status2", "termination_date", "single_ratio9", "single_ratio10", "paid_premium9", "paid_premium10")
crs$weights <- NULL

training_set<-crs$dataset[crs$sample,]
write.csv(training_set,SampleSetFile)

num_of_lapse_sample<-sum(training_set$LAPSE)
num_of_trainingset<-nrow(training_set)

valid_set<-crs$dataset[crs$validate,]
write.csv(valid_set,ValidSetFile)

testing_set<-crs$dataset[crs$test,]
write.csv(testing_set,TestSetFile)

num_of_lapse_test<-sum(testing_set$LAPSE)
num_of_testingset<-nrow(testing_set)

require(randomForest, quietly=TRUE)

Build the Random Forest model.

set.seed(crv$seed)
crs$rf <- randomForest(as.factor(LAPSE) ~ .,
                   data=crs$dataset[crs$sample,c(crs$input, crs$target)], 
                   ntree=500,
                   mtry=9,
                   importance=TRUE,
                   na.action=na.roughfix,
                   replace=FALSE)
rf<-crs$rf
save(rf,file=ModelFile)
load(ModelFile)

rn <- round(importance(crs$rf), 2)
var_importance<-rn[order(rn[,3], decreasing=TRUE),]
write.csv(var_importance,importanceFile)


testdata <- read.csv(TestFile, na.strings=c(".", "NA", "", "?"), header=TRUE, sep=",", encoding="UTF-8")
crs$testset<-testdata
i<-1

prob <- predict(crs$rf, (crs$testset[crs$validate,c(crs$input)]), type="prob")[,2]
valid_result<-cbind(prob,valid_set)
PROB_THREAD=quantile(valid_result[valid_result$LAPSE==1,]$prob,0.8,na.rm=TRUE)

PROB_THREAD=0.4

num_sample<-nrow(testdata)
crs$testset<-testdata
crs$pr <- predict(crs$rf, (crs$testset[,c(crs$input)]), type="prob")[,2]
rfdata<-cbind(crs$pr,crs$testset)
write.csv(rfdata,PredictionFile)
summary<-matrix(nrow=390,ncol=24)

for (i in 10:20)
{
idx=i
3-2
PROB_THREAD=i/100
num_policies=num_event_actual=num_event_pred=num_event_pos=num_event_neg=num_normal_pos=num_normal_neg=0;
num_train_event_pos=num_train_event_neg=num_train_event_actual=num_test_event_actual=num_test_event_pos=num_test_event_neg=0;
num_normal_pos_train=num_normal_neg_train=0;
num_normal_pos_test=num_normal_neg_test=0;
sum_face_total=sum_face_event=sum_face_pred=0;
sum_face_total_train=sum_face_event_train=sum_face_pred_train=0;
sum_face_total_test=sum_face_event_test=sum_face_pred_test=0;
sum_test_normal_pos=sum_test_normal_neg=0;
sum_test_event_pos=sum_test_event_neg=0;

for ( j in 1:nrow(rfdata))
{
num_policies=num_policies+1

#total face amount
sum_face_total=sum_face_total+rfdata$"CURRENT_FACE"[j]
if(rfdata$POLICY_NUMBER[j] %in% training_set$POLICY_NUMBER)
{
sum_face_total_train=sum_face_total_train+rfdata$"CURRENT_FACE"[j]
}
if(rfdata$POLICY_NUMBER[j] %in% testing_set$POLICY_NUMBER)
{
sum_face_total_test=sum_face_total_test+rfdata$"CURRENT_FACE"[j]
}


#predicted face amount
if(is.na(rfdata$"crs$pr"[j])==FALSE & rfdata$"crs$pr"[j]>=PROB_THREAD)
{
num_event_pred=num_event_pred+1;
sum_face_pred=sum_face_pred+rfdata$"CURRENT_FACE"[j]
if(rfdata$POLICY_NUMBER[j] %in% training_set$POLICY_NUMBER)
{
  sum_face_pred_train=sum_face_pred_train+rfdata$"CURRENT_FACE"[j]

}
if(rfdata$POLICY_NUMBER[j] %in% testing_set$POLICY_NUMBER)
{
  sum_face_pred_test=sum_face_pred_test+rfdata$"CURRENT_FACE"[j]

}
}



if (rfdata$LAPSE[j]==1)
{
num_event_actual=num_event_actual+1;
sum_face_event=sum_face_event+rfdata$"CURRENT_FACE"[j]

if(rfdata$POLICY_NUMBER[j] %in% training_set$POLICY_NUMBER)
{
  num_train_event_actual=num_train_event_actual+1
  sum_face_event_train=sum_face_event_train+rfdata$"CURRENT_FACE"[j]

}
if(rfdata$POLICY_NUMBER[j] %in% testing_set$POLICY_NUMBER)
{
  num_test_event_actual=num_test_event_actual+1
  sum_face_event_test=sum_face_event_test+rfdata$"CURRENT_FACE"[j]
}   


if(is.na(rfdata$"crs$pr"[j])==FALSE & rfdata$"crs$pr"[j]>=PROB_THREAD)
{
  num_event_pos=num_event_pos+1
  if(rfdata$POLICY_NUMBER[j] %in% training_set$POLICY_NUMBER)
  {
    num_train_event_pos=num_train_event_pos+1


  }
  if(rfdata$POLICY_NUMBER[j] %in% testing_set$POLICY_NUMBER)
  {
    num_test_event_pos=num_test_event_pos+1
    sum_test_event_pos=sum_test_event_pos+rfdata$"CURRENT_FACE"[j]

  }               


}
if(is.na(rfdata$"crs$pr"[j])==FALSE & rfdata$"crs$pr"[j]<PROB_THREAD)
{
  num_event_neg=num_event_neg+1
  if(rfdata$POLICY_NUMBER[j] %in% training_set$POLICY_NUMBER)
  {
    num_train_event_neg=num_train_event_neg+1

  }
  if(rfdata$POLICY_NUMBER[j] %in% testing_set$POLICY_NUMBER)
  {
    num_test_event_neg=num_test_event_neg+1
    sum_test_event_neg=sum_test_event_neg+rfdata$"CURRENT_FACE"[j]

  } 
}    
}
else
{
if(is.na(rfdata$"crs$pr"[j])==FALSE & rfdata$"crs$pr"[j]<PROB_THREAD)
{
  num_normal_pos=num_normal_pos+1
  if(rfdata$POLICY_NUMBER[j] %in% training_set$POLICY_NUMBER)
  {
    num_normal_pos_train=num_normal_pos_train+1

  }
  if(rfdata$POLICY_NUMBER[j] %in% testing_set$POLICY_NUMBER)
  {
    num_normal_pos_test=num_normal_pos_test+1
    sum_test_normal_pos=sum_test_normal_pos+rfdata$"CURRENT_FACE"[j]

  }
}
else
{
  num_normal_neg=num_normal_neg+1
  if(rfdata$POLICY_NUMBER[j] %in% training_set$POLICY_NUMBER)
  {
    num_normal_neg_train=num_normal_neg_train+1

  }
  if(rfdata$POLICY_NUMBER[j] %in% testing_set$POLICY_NUMBER)
  {
    num_normal_neg_test=num_normal_neg_test+1
    sum_test_normal_neg=sum_test_normal_neg+rfdata$"CURRENT_FACE"[j]

  }
}

}

}

summary[idx,1]="Testing Set"
summary[idx,2]=PROB_THREAD
summary[idx,3]=num_of_testingset
summary[idx,4]=num_of_lapse_test
summary[idx,5]=num_test_event_pos
summary[idx,6]=num_test_event_neg
summary[idx,7]=num_test_event_neg/(num_test_event_pos+num_test_event_neg)
summary[idx,8]=num_normal_pos_test
summary[idx,9]=num_normal_neg_test
summary[idx,10]=num_normal_neg_test/(num_normal_pos_test+num_normal_neg_test)
summary[idx,11]=n
Coordinator
Jul 28, 2014 at 9:08 PM
avoid the inefficient for loop

library(rattle)
building <- TRUE
scoring <- ! building

A pre-defined value is used to reset the random seed so that results are repeatable.

seed <- 42
TrainingFile<-"C:/Code/R/FundingPattern/RFinput/RF_DEVELOP_SET.csv"
TestFolder <-"C:/Code/R/FundingPattern/"
TestFile <-"C:/Code/R/FundingPattern/RFinput/RF_VAL_SET.csv"
ModelFile<-"C:/Code/R/FundingPattern/Result/RF_lapse_July28.rda"
QuantFile <-"C:/Code/R/FundingPattern/Result/lapse_Quant.csv"
SummaryFile <-"C:/Code/R/FundingPattern/Result/lapse_Summary.csv"
SampleSetFile<-"C:/Code/R/FundingPattern/Result/RF_lapse_sample.csv"
ValidSetFile<-"C:/Code/R/FundingPattern/Result/RF_lapse_validate.csv"
TestSetFile<-"C:/Code/R/FundingPattern/Result/RF_lapse_test.csv"
importanceFile<-"C:/Code/R/FundingPattern/Result/RF_lapse_importance.csv"
PredictionFile<-"C:/Code/R/FundingPattern/result/RF_policies(lapse).csv"

NUM_OF_TEST <- 1
NUM_OF_MEASURE <- 17

crs$dataset <- read.csv(TrainingFile, na.strings=c(".", "NA", "", "?"), strip.white=TRUE, encoding="UTF-8")
crs$nobs <- nrow(crs$dataset) # 86037 observations
crs$sample <- crs$train <- sample(nrow(crs$dataset), 0.5crs$nobs) # 60225 observations
crs$validate <- sample(setdiff(seq_len(nrow(crs$dataset)), crs$train), 0.1
crs$nobs) # 12905 observations
crs$test <- setdiff(setdiff(seq_len(nrow(crs$dataset)), crs$train),crs$validate) # 12907 observations

The following variable selections have been noted.

crs$input <- c("nduration", "j", "DB_OPTION", "ln",
           "ln_cnt", "lr", "lr_cnt", "lb",
           "fc", "fc_cnt", "ps", "ps_cnt",
           "prem_path", "sum_paid", "end_zeros", "runout",
           "ANNUAL_PREMIUM_AMT", "POLICY_AGE", "FACE_AMOUNT", "LN_INT_RATE",
           "MIN_PREM_AMT", "MODE_PREM_AMT", "OWNER_AGE_AT_ISSUE", "level1",
           "tphase", "level2", "alpha", "slope",
           "stderr", "drops", "mf_dump", "proj_y1",
           "level31", "proj_y", "level3", "lifetime_prem_dev",
           "single_prem_dev", "ten_prem_dev", "ISSUE_YR", "duration",
           "cash_value", "INTEREST_RATE", "LAST_TRANSACTION_AMOUNT", "LOAN_AMOUNT",
           "REDUCTION_RATING", "TOTAL_PREMIUMS", "ACCT_VALUE", "BIRTHDATE",
           "CURRENT_FACE", "SURR_VALUE", "TARGET_PREMIUM", "cashpercnt",
           "accntpercnt", "IBE_9152", "IBE_2058", "IBE_2059",
           "IBE_2060", "IBE_2061", "IBE_2062", "IBE_4000",
           "IBE_8165", "IBE_8167", "IBE_8433", "IBE_8434",
           "IBE_8463", "IBE_8562", "IBE_8579", "IBE_8580",
           "IBE_8590", "IBE_8592", "IBE_8600", "IBE_8601",
           "IBE_8611", "IBE_8614", "IBE_8616", "IBE_8617",
           "IBE_8618", "IBE_8621", "IBE_8626", "IBE_8628",
           "IBE_8640", "IBE_8641", "IBE_8643", "IBE_8647",
           "IBE_8652", "IBE_8655", "IBE_8682", "IBE_8685",
           "IBE_8686", "IBE_8689", "IBE_8690", "IBE_8691",
           "IBE_8704", "IBE_8707", "IBE_8816", "IBE_9153",
           "IBE_9509", "IBE_9510", "IBE_9511", "IBE_9512",
           "IBE_9513", "IBE_9514", "ECONOMIC_STABILITY_IND", "HEAVYTRANSACTORS",
           "PROPENSITY_SCORE_INTERNET", "PROPENSITY_SCORE_MAIL", "PROPENSITY_SCORE_PHONE", "MEDIA_CHNL_INTERNET",
           "MEDIA_CHNL_CELLPHONE", "MEDIA_CHNL_PRIMETIME_TV", "MEDIA_CHNL_DAYTIME_TV", "MEDIA_CHNL_OUTDOOR",
           "MEDIA_CHNL_YELLOWPAGES", "MEDIA_CHNL_RADIO", "MEDIA_CHNL_MAGAZINE", "MEDIA_CHNL_NEWSPAPER",
           "AFFORDABILITY_IND", "PHONE_APPEND", "UNDERBANKED_9351", "TOT_OFLN_ORD_100_250_6698",
           "TOT_ONLN_PURCHASES_6718", "WKS_SNC_LST_OLN_ORD_6843", "WKS_SNC_LST_ORD_JWRY_6870", "FaceHousePercnt",
           "BILLING_FREQ")
crs$categoric <- NULL
crs$target <- "LAPSE"
crs$risk <- NULL
crs$ident <- NULL
crs$ignore <- c("SURRENDER", "status2", "termination_date", "single_ratio9", "single_ratio10", "paid_premium9", "paid_premium10")
crs$weights <- NULL

training_set<-crs$dataset[crs$sample,]
write.csv(training_set,SampleSetFile)

num_of_lapse_sample<-sum(training_set$LAPSE)
num_of_trainingset<-nrow(training_set)

valid_set<-crs$dataset[crs$validate,]
write.csv(valid_set,ValidSetFile)

testing_set<-crs$dataset[crs$test,]
write.csv(testing_set,TestSetFile)



require(randomForest, quietly=TRUE)

Build the Random Forest model.

set.seed(crv$seed)
crs$rf <- randomForest(as.factor(LAPSE) ~ .,
                   data=crs$dataset[crs$sample,c(crs$input, crs$target)], 
                   ntree=500,
                   mtry=9,
                   importance=TRUE,
                   na.action=na.roughfix,
                   replace=FALSE)
rf<-crs$rf
save(rf,file=ModelFile)
load(ModelFile)

rn <- round(importance(crs$rf), 2)
var_importance<-rn[order(rn[,3], decreasing=TRUE),]
write.csv(var_importance,importanceFile)


nrow()
crs$testset<-testdata
testing_set<-testdata
num_of_lapse_test<-sum(testing_set$LAPSE)
num_of_testingset<-nrow(testing_set)
i<-1

prob <- predict(crs$rf, (crs$testset[crs$validate,c(crs$input)]), type="prob")[,2]
valid_result<-cbind(prob,valid_set)
PROB_THREAD=quantile(valid_result[valid_result$LAPSE==1,]$prob,0.8,na.rm=TRUE)

PROB_THREAD=0.4

num_sample<-nrow(testdata)
crs$testset<-testdata
testing_set<-testdata
crs$pr <- predict(crs$rf, (crs$testset[,c(crs$input)]), type="prob")[,2]
n
write.csv(rfdata,PredictionFile)
summary<-matrix(nrow=390,ncol=24)


ptm<-proc.time()
i=20
idx=i
3-2
PROB_THREAD=i/100

num_of_testingset=nrow(rfdata)
num_of_lapse_test=length(which(rfdata$LAPSE==1))
num_test_event_pos=length(which(rfdata$"crs$pr">PROB_THREAD & rfdata$LAPSE==1))
num_test_event_neg=length(which(rfdata$"crs$pr">PROB_THREAD & rfdata$LAPSE==0))
num_normal_pos_test=length(which(rfdata$"crs$pr"<=PROB_THREAD & rfdata$LAPSE==0))
num_normal_neg_test=length(which(rfdata$"crs$pr"<=PROB_THREAD & rfdata$LAPSE==1))

sum_test_event_pos=sum(rfdata[which(rfdata$"crs$pr">PROB_THREAD & rfdata$LAPSE==1),]$"CURRENT_FACE")
sum_test_event_neg=sum(rfdata[which(rfdata$"crs$pr">PROB_THREAD & rfdata$LAPSE==0),]$"CURRENT_FACE")
sum_test_normal_pos=sum(as.numeric(rfdata[which(rfdata$"crs$pr"<=PROB_THREAD & rfdata$LAPSE==0),]$"CURRENT_FACE"))
sum_test_normal_neg=sum(as.numeric(rfdata[which(rfdata$"crs$pr"<=PROB_THREAD & rfdata$LAPSE==1),]$"CURRENT_FACE"))

summary[idx,1]="Testing Set"
summary[idx,2]=PROB_THREAD
summary[idx,3]=num_of_testingset
summary[idx,4]=num_of_lapse_test
summary[idx,5]=num_test_event_pos
summary[idx,6]=num_test_event_neg
summary[idx,7]=num_test_event_neg/(num_test_event_pos+num_test_event_neg)
summary[idx,8]=num_normal_pos_test
summary[idx,9]=num_normal_neg_test
summary[idx,10]=num_normal_neg_test/(num_normal_pos_test+num_normal_neg_test)
summary[idx,11]=num_test_event_pos+num_normal_neg_test
summary[idx,12]=num_of_lapse_test/num_of_testingset
summary[idx,13]=(num_test_event_pos+num_normal_neg_test)/num_of_testingset

summary[idx,14]=sum_test_event_pos/1000000
summary[idx,15]=sum_test_event_neg/1000000
summary[idx,16]=sum_test_event_neg/(sum_test_event_pos+sum_test_event_neg)
summary[idx,17]=sum_test_normal_pos/1000000
summary[idx,18]=sum_test_normal_neg/1000000
summary[idx,19]=sum_test_normal_neg/(sum_test_normal_pos+sum_test_normal_neg)



time_elpased<-proc.time()-ptm

colname<-c("Dataset","Voting Prob","#policies","#Lapse",
       "LP pos","LP neg","LP error rate",
       "Normal pos","Normal neg","Normal error rate",
       "# of predicted LP","actual LP rate","pred LP rate",
       "LP pos Amt","LP neg Amt","LP error rate",
       "Normal pos Amt","Normal neg Amt","Normal error rate",
       "total face","total event face","total pred face",
       "actual rate by face","pred rate by face")
colnames(summary)<-colname

write.csv(summary,SummaryFile,row.names=FALSE)