# Bowling Green State University
# (419) 378 - 9131
# ebaltay@bgsu.edu
library(dplyr)
library(dummies)
# Author: Endale B Altaye
# Dec 2014
# BGSU, For regression class
#' Stepwise linear regression (forward selection with backward elimination)
#'
#' Starting from an empty model, repeatedly (1) adds the remaining predictor
#' whose coefficient has the smallest p-value when added to the current model,
#' provided that p-value is <= `alphaToEnter`, and then (2) removes, one at a
#' time, the included predictor with the largest p-value while that p-value is
#' > `alphaToRemove`. Progress is reported with `print()`/`message()`.
#'
#' @param data Object coercible with `data.frame()`; every column other than
#'   the response is treated as a candidate predictor.
#' @param responceVariable Name (character string) of the response column.
#' @param alphaToEnter A candidate enters when its p-value is <= this value.
#' @param alphaToRemove An included variable is dropped when its p-value is
#'   > this value.
#' @return The `summary()` of the `lm` fit of the finally selected model.
#'   Stops with "No Significant Variable to include." when no predictor
#'   qualifies in the first step (or backward elimination empties the model).
StepwiseAlgorithm <- function(data, responceVariable, alphaToEnter, alphaToRemove) {
  dataname <- data.frame(data)
  response <- responceVariable
  varname <- names(dataname)
  if (!(response %in% varname)) {
    stop(sprintf("Response variable '%s' is not a column of the data.", response))
  }
  predVarname <- varname[varname != response]
  if (length(predVarname) == 0) {
    stop("The data contain no predictor variables.")
  }
  if (alphaToRemove < alphaToEnter) {
    # A variable entered with alphaToRemove < p <= alphaToEnter would be
    # removed again immediately and re-entered on the next pass.
    warning("alphaToRemove is smaller than alphaToEnter; the procedure may cycle.",
            call. = FALSE)
  }

  # p-value of `candidate` when added to a model already holding `included`.
  # The candidate is placed first in the formula, so its coefficient is the
  # second one (right after the intercept). Returns NA when that coefficient
  # cannot be estimated (e.g. constant or aliased column), because
  # summary() drops such rows and row 2 would then be missing or belong to
  # another variable.
  candidate_pvalue <- function(candidate, included = character(0)) {
    fml <- as.formula(sprintf("%s ~ %s", response,
                              paste(c(candidate, included), collapse = "+")))
    fit <- lm(fml, data = dataname)
    if (is.na(coef(fit)[2])) {
      return(NA_real_)
    }
    summary(fit)$coefficients[2, 4]
  }

  # ---- Step 1: best single predictor -------------------------------------
  pvalueVector <- vapply(predVarname, candidate_pvalue, numeric(1),
                         USE.NAMES = FALSE)
  # which.min() returns a single index even on ties and ignores NA/NaN.
  m <- which.min(pvalueVector)
  if (length(m) == 0 || pvalueVector[m] > alphaToEnter) {
    stop("No Significant Variable to include.") # stops the algorithm returning no sig variable to include
  }
  Xnew <- predVarname[m]     # variables currently in the model
  Xreduc <- predVarname[-m]  # variables still available to enter
  message (sprintf("Variable entered/removed per each step with alpha to enter = %s and alpha to stay =%s :",alphaToEnter,alphaToRemove))
  print(sprintf("step 1 Selected variable=%s, the p-value was %s", Xnew, pvalueVector[m]))

  k <- 1
  repeat {
    k <- k + 1
    # Every predictor is already in the model: nothing left to try, so finish
    # with the current selection instead of raising an error.
    if (length(Xreduc) == 0) {
      break
    }

    # ---- Forward step: try each remaining variable on top of Xnew --------
    pvalueVector <- vapply(Xreduc, candidate_pvalue, numeric(1),
                           included = Xnew, USE.NAMES = FALSE)
    m <- which.min(pvalueVector)
    if (length(m) == 0 || pvalueVector[m] > alphaToEnter) {
      break
    }
    print(sprintf("step %s Selected variable=%s, the p-value was %s", k, Xreduc[m], pvalueVector[m]))
    Xnew <- c(Xnew, Xreduc[m])

    # ---- Backward elimination: drop the worst variable until all stay ----
    i <- 0
    while (length(Xnew) > 0) {
      i <- i + 1
      mymodel <- lm(as.formula(paste(response, paste(Xnew, collapse = "+"), sep = "~")),
                    data = dataname)
      # Drop the intercept row; remaining rows line up with Xnew.
      pvalue <- summary(mymodel)$coefficients[-1, 4]
      worst <- unname(which.max(pvalue))
      if (length(worst) == 0 || pvalue[worst] <= alphaToRemove) {
        break
      }
      print(sprintf("step %s.%s Removed variable=%s, the p-value was %s", k, i, Xnew[worst], unname(pvalue[worst])))
      Xnew <- Xnew[-worst]
    }
    if (length(Xnew) == 0) {
      stop("No Significant Variable to include.")
    }
    # Removed variables become candidates again for later forward steps.
    Xreduc <- setdiff(predVarname, Xnew)
  }

  # Local names are kept as in the original so the printed `Call:` of the
  # summary is unchanged.
  finalmodel <- lm(as.formula(paste(response, paste(Xnew, collapse = "+"), sep = "~")), data = dataname)
  message("Summary of Finally Selected Model is:")
  summary(finalmodel)
}
# ---- Example run on the housing price data ----
# Read the delimited text file and show the structure of the raw data.
housingprice <- read.delim(".../housingprice.txt")
str(housingprice)

# Expand the categorical columns into indicator (dummy) columns and show the
# structure of the result.
housingpriceWithDummy <- dummy.data.frame(housingprice, verbose = TRUE)
str(housingpriceWithDummy)

# Drop one indicator column per dummy-coded variable
# (presumably the reference level of each; confirm against the data).
housingpriceWithDummy <- select(
  housingpriceWithDummy,
  -BEDS3, -BATHS2, -HEAT0, -STYLE0, -GARAGE1, -BASEMENT0, -FIRE0, -SCHOOL0
)

# Stepwise selection for PRICE: a variable enters the model when its p-value
# is at most 15% and is dropped again when its p-value exceeds 25%.
StepwiseAlgorithm(
  housingpriceWithDummy,
  "PRICE",
  alphaToEnter = 0.15,
  alphaToRemove = 0.25
)