# r program
#
# Coordinator
# May 15, 2014 at 9:58 PM
# Build the 39-element feature record for a single customer.
#
# Args:
#   x: data frame holding all rows of one customer. It is read by name for
#      `record_type`, `shopping_pt` and `customer_ID`, and by position for
#      columns 10:14, 16, 17 and 18:25.
#
# Returns:
#   A list of length 39:
#     1:8   column means of columns 18:25 over the quote rows
#           (record_type == 0)
#     9:16  column standard deviations of the same columns (NA when the
#           customer has fewer than two quote rows)
#     17:21 columns 10:14 of the last quote
#           (car_age car_value risk_factor age_oldest age_youngest)
#     22    column 16 of the last quote (c_previous)
#     23    column 17 of the last quote (previous duration)
#     24:31 columns 18:25 of the last quote (last A-G)
#     32:39 columns 18:25 of the purchase row, record_type == 1
#           (target A-G); NA when the customer has no purchase row
#
# Side effects:
#   Prints the customer_ID of the first row as a progress indicator.
extract_feature <- function(x) {
  option_cols <- 18:25

  quotes <- x[x$record_type == 0, , drop = FALSE]
  purchase <- x[x$record_type == 1, , drop = FALSE]

  # The last quote is the quote row with the highest shopping_pt. It is picked
  # by value rather than by using shopping_pt as a row position, so the result
  # does not depend on the rows being sorted or numbered from 1.
  last_quote <- quotes[which.max(quotes$shopping_pt), , drop = FALSE]

  # Values of `cols` from the first row of `rows` as a plain list, or a list
  # of NA of the same length when `rows` is empty.
  first_row_values <- function(rows, cols) {
    if (nrow(rows) == 0L) {
      return(rep(list(NA), length(cols)))
    }
    unname(as.list(rows[1L, cols, drop = FALSE]))
  }

  ret_vec <- vector("list", 39L)

  ret_vec[1:8] <- unname(colMeans(quotes[, option_cols, drop = FALSE]))
  # sd() does not accept a data frame, so it is applied column by column.
  ret_vec[9:16] <- unname(
    vapply(quotes[, option_cols, drop = FALSE], sd, numeric(1))
  )

  ret_vec[17:21] <- first_row_values(last_quote, 10:14)
  ret_vec[22:23] <- first_row_values(last_quote, 16:17)
  ret_vec[24:31] <- first_row_values(last_quote, option_cols)
  ret_vec[32:39] <- first_row_values(purchase, option_cols)

  print(x[1, "customer_ID"])
  ret_vec
}


# Build the per-customer training feature table and save it as CSV.
train <- read.csv("C:/Code/R/Policies/train.csv")

# Every missing value is replaced by 0 before features are computed.
train[is.na(train)] <- 0

# extract_feature() runs once per customer_ID; the per-customer records are
# stacked row-wise into one table.
features_train <- do.call(
  rbind,
  as.list(by(train, train$customer_ID, extract_feature))
)

# 39 column labels, in the order extract_feature() fills its result:
# means, standard deviations, customer attributes, last quote, purchase.
colnames(features_train) <- c(
  paste0(LETTERS[1:7], "_mean"), "cost_mean",
  paste0(LETTERS[1:7], "_sd"), "cost_sd",
  "car_age", "car_value", "risk_factor", "age_oldest", "age_youngest",
  "C_Previous", "previous_duration",
  paste0(LETTERS[1:7], "_Last"), "Last_cost",
  LETTERS[1:7], "Cost"
)

write.csv(features_train, "C:/Code/R/Policies/features_train.csv")

print("Training features done!")

# Build the per-customer test feature table and save it as CSV.
test <- read.csv("C:/Code/R/Policies/test.csv")

# Every missing value is replaced by 0 before features are computed.
test[is.na(test)] <- 0

# Features are computed from `test`. The earlier version passed `train` here,
# so features_test.csv was a copy of the training features.
features_test <- do.call(
  "rbind",
  as.list(by(test, test$customer_ID, extract_feature))
)

# 39 column labels, in the order extract_feature() fills its result.
colnames(features_test) <- c(
  "A_mean", "B_mean", "C_mean", "D_mean", "E_mean", "F_mean", "G_mean",
  "cost_mean",
  "A_sd", "B_sd", "C_sd", "D_sd", "E_sd", "F_sd", "G_sd", "cost_sd",
  "car_age", "car_value", "risk_factor", "age_oldest", "age_youngest",
  "C_Previous", "previous_duration",
  "A_Last", "B_Last", "C_Last", "D_Last", "E_Last", "F_Last", "G_Last",
  "Last_cost",
  "A", "B", "C", "D", "E", "F", "G", "Cost"
)

write.csv(features_test, "C:/Code/R/Policies/features_test.csv")
print("Testing features done!")


# Model setup: load the feature tables, draw the training sample and declare
# the input and target columns used by the model-fitting code.
# NOTE(review): `crs` is never created in this script; it is presumably the
# environment provided by the rattle package - confirm.
library(rattle)
building <- TRUE
scoring <- !building

# A pre-defined value is used to reset the random seed so that results are
# repeatable.
seed <- 42

crs$dataset <- read.csv(
  "C:/Code/R/Policies/features_train.csv",
  na.strings = c(".", "NA", "", "?"),
  strip.white = TRUE,
  encoding = "UTF-8"
)
crs$testset <- read.csv(
  "C:/Code/R/Policies/features_test.csv",
  na.strings = c(".", "NA", "", "?"),
  header = TRUE,
  sep = ",",
  encoding = "UTF-8"
)

# Seed from the `seed` value defined above (this script never assigns
# crv$seed), so the sample below is drawn from the same seed the script uses
# everywhere else.
set.seed(seed)

crs$nobs <- nrow(crs$dataset) # 58555 observations
# Half of the rows, drawn at random, form the training sample.
crs$sample <- crs$train <- sample(nrow(crs$dataset), 0.5 * crs$nobs)

crs$input <- c(
  "A_mean", "B_mean", "C_mean", "D_mean", "E_mean", "F_mean", "G_mean",
  "cost_mean",
  "A_sd", "B_sd", "C_sd", "D_sd", "E_sd", "F_sd", "G_sd", "cost_sd",
  "car_age", "car_value", "risk_factor", "age_oldest", "age_youngest",
  "C_Previous", "previous_duration",
  "A_Last", "B_Last", "C_Last", "D_Last", "E_Last", "F_Last", "G_Last",
  "Last_cost"
)
crs$target <- c("A", "B", "C", "D", "E", "F", "G")

# One classification formula per target, in the same order as crs$target.
formula <- c(
  as.factor(A) ~ .,
  as.factor(B) ~ .,
  as.factor(C) ~ .,
  as.factor(D) ~ .,
  as.factor(E) ~ .,
  as.factor(F) ~ .,
  as.factor(G) ~ .
)
crs$risk <- NULL
crs$ident <- NULL

# library() stops immediately when the package is missing; require() would
# only return FALSE and let the script fail later at the randomForest() call.
library(randomForest, quietly = TRUE)

# Build the Random Forest model.
#
# One forest is fitted per target on the training sample and used to predict
# that target for every row of the test set. Predictions are collected next
# to the test-set row identifier column "X".
result <- crs$testset[, "X", drop = FALSE]
for (i in seq_along(crs$target)) {
  # Re-seed before every fit so each forest is reproducible on its own.
  set.seed(seed)
  crs$rf <- randomForest(
    formula[[i]],
    data = crs$dataset[crs$sample, c(crs$input, crs$target[i])],
    ntree = 500,
    mtry = 9,
    importance = TRUE,
    na.action = na.roughfix,
    replace = FALSE
  )
  print(crs$rf)
  crs$pr <- predict(crs$rf, crs$testset[, c(crs$input)], type = "class")
  # Store the prediction under its target name; previously all seven
  # prediction columns were written with the same header "crs$pr".
  result[[crs$target[i]]] <- crs$pr
}

# Final layout: input features, then "X", then the predictions for each target.
result <- cbind(crs$testset[, c(crs$input)], result)
write.csv(result, "C:/Code/R/Policies/result10.csv")