# Bowling Green State University
# (419) 378 - 9131
# ebaltay@bgsu.edu
library(dplyr)
library(dummies) # to change categorical into dummies
# Author: Endale B Altaye
# Dec 2014
# BGSU, for regression class
# Forward selection of predictors for a linear model, driven by p-values.
#
# Starting from an empty model, every step fits
#   response ~ <already selected variables> + candidate
# for each remaining candidate, takes the candidate whose own (first)
# coefficient has the smallest p-value, and adds it to the model when that
# p-value is <= alphaToEnter. The search stops when no remaining candidate
# qualifies or when every candidate has been added.
#
# Arguments:
#   data              object passed through data.frame(); every column other
#                     than the response is treated as a candidate predictor.
#   responceVariable  single character string used as the left-hand side of
#                     the model formula (normally a column name of data).
#   alphaToEnter      p-value threshold a candidate must not exceed to enter.
#
# Side effects: reports each step with message()/print().
#
# Returns: the coefficient matrix (summary(lm(...))$coefficients) of the
# finally selected model, or, when no variable enters at all, the string
# "No significant variable at the given alpha" (which is also printed).
forwardSelection <- function(data, responceVariable, alphaToEnter) {
  stopifnot(is.character(responceVariable), length(responceVariable) == 1L)

  dataname <- data.frame(data)
  response <- responceVariable
  varname <- names(dataname)
  predVarname <- varname[varname != response]

  # Build `response ~ t1 + t2 + ...` from a character vector of term names.
  buildFormula <- function(terms) {
    as.formula(sprintf("%s ~ %s", response, paste(terms, collapse = " + ")))
  }

  # p-value of `candidate` when added to a model already holding `selected`.
  # The candidate is placed LAST so that, under perfect collinearity, it is
  # the candidate (not an already selected variable) that lm() aliases.
  candidatePValue <- function(candidate, selected) {
    modelFormula <- buildFormula(c(selected, candidate))
    fit <- lm(modelFormula, data = dataname)
    est <- coef(fit)
    # First model-matrix column belonging to the last term, i.e. the candidate.
    pos <- match(max(fit$assign), fit$assign)
    if (is.na(est[pos])) {
      # Aliased / inestimable coefficient: summary() drops the row, so there
      # is no p-value and the candidate must not be allowed to enter.
      return(NA_real_)
    }
    # Look the row up by name; positions shift when aliased rows are dropped.
    summary(fit)$coefficients[names(est)[pos], 4L]
  }

  selectedvar <- character(0)
  step <- 0L

  while (length(predVarname) > 0L) {
    pvalueVector <- vapply(
      predVarname, candidatePValue, numeric(1),
      selected = selectedvar
    )
    # which.min() skips NA/NaN and returns one index even when values tie.
    best <- which.min(pvalueVector)
    if (length(best) == 0L || !(pvalueVector[[best]] <= alphaToEnter)) {
      # Remaining candidates are all insignificant (or inestimable).
      break
    }

    if (step == 0L) {
      message(sprintf(
        "Variable entered per each step with alpha to enter = %s:",
        alphaToEnter
      ))
    }
    step <- step + 1L
    print(sprintf(
      "step %s Selected variable=%s, the p-value was %s",
      step, predVarname[best], pvalueVector[[best]]
    ))

    selectedvar <- c(selectedvar, predVarname[best])
    predVarname <- predVarname[-best]

    if (length(predVarname) == 0L) {
      message("All Variables Selected")
    }
  }

  if (length(selectedvar) == 0L) {
    print("No significant variable at the given alpha")
  } else {
    message("Summary of Finally Selected Model is:")
    # Always refit the selected model so the return value has the same shape
    # regardless of which stopping rule ended the search.
    summary(lm(buildFormula(selectedvar), data = dataname))$coefficients
  }
}
# Read the tab-delimited housing data and show its structure.
housingprice <- read.delim(".../housingprice.txt")
# colnames(housingprice)
str(housingprice)

# Convert the categorical columns into dummy (indicator) columns.
housingpriceWithDummy <- dummy.data.frame(housingprice, verbose = TRUE)
str(housingpriceWithDummy)

# Remove the listed indicator columns (presumably one baseline level per
# categorical variable, so the remaining dummies are not collinear -- confirm).
housingpriceWithDummy <- select(
  housingpriceWithDummy,
  -c(BEDS3, BATHS2, HEAT0, STYLE0, GARAGE1, BASEMENT0, FIRE0, SCHOOL0)
)

# Keep variables in the model whose p-value to enter is at most 20%.
forwardSelection(housingpriceWithDummy, "PRICE", 0.2)