Monday, November 9, 2015

Bayesian Model Averaging examples

#
############### Bayesian Model Averaging examples ###############
#
# This script assumes you have worked through all the previous notes from 
# the web page and you have downloaded, installed, and updated all available
# R packages. 

### Import the SAS SEM example data ("SEMData.sav", an SPSS file); these are SIMULATED data.

library(foreign)

exsem <- read.spss("http://www.unt.edu/rss/class/Jon/R_SC/Module10/SEMData.sav", use.value.labels=TRUE, 
  max.value.labels=Inf, to.data.frame=TRUE)

summary(exsem)
head(exsem)

cor(exsem)
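
# A rounded copy of the correlation matrix is easier to scan:
round(cor(exsem), 2)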

# The goal of this example is to choose the best set of predictor variables for a linear (OLS) prediction 
# model with Extroversion (extro) as the outcome. 
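
# With 12 candidate predictors there are 2^12 = 4096 possible subset models, far 
# too many to compare by hand, which is what motivates an automated approach:
2^12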

# Basic linear (OLS) regression model. 

reg.1 <- lm(extro~abstruse+agree+block+cognitive+common+cultural+open+physical+series+sets+social+vocab, data=exsem)
summary(reg.1)

# The BMA (Bayesian Model Averaging) package was designed specifically to use Bayesian model 
# averaging to address the variable selection problem in the context of several types of models 
# (e.g., linear models, generalized linear models, and survival models). 
 
library(BMA)

# The function for conducting BMA with respect to linear regression is 'bicreg'. 
# The 'bicreg' function requires a matrix of predictor variables as input. 

attach(exsem)
predictors <- as.matrix(cbind(open, agree, social, cognitive, physical, cultural, vocab, abstruse, 
                              block, common, sets, series))
detach(exsem)
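
# The same matrix can be built without attach()/detach() by indexing the data 
# frame directly (an equivalent base-R alternative):
predictors <- as.matrix(exsem[, c("open", "agree", "social", "cognitive",
                                  "physical", "cultural", "vocab", "abstruse",
                                  "block", "common", "sets", "series")])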

# Conduct the BMA using the 'bicreg' function by submitting the matrix of predictors (predictors) 
# and the outcome variable (exsem$extro).

bma1 <- bicreg(predictors, exsem$extro)
summary(bma1)

# Based on the first column of the output we can see that "open", "agree", and "series" are the most 
# important variables; the 'p!=0' column gives the posterior probability (expressed as a percentage) 
# that the coefficient for a given predictor is NOT zero. We can also see that the first model, 
# 'model 1' (which includes only 'open', 'agree', & 'series'), is the best because it has the lowest 
# BIC and the largest posterior probability. 
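
# These quantities can also be pulled from the fitted object directly: 'probne0' 
# holds the P(coefficient != 0) values, 'postprob' the posterior model 
# probabilities, and 'bic' the BIC value of each retained model.

bma1$probne0
bma1$postprob
bma1$bic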

# The 'ols' part of the output (not printed by default) gives a matrix with each model as a row and 
# each predictor variable as a column, listing the estimated OLS coefficient for each variable in 
# a given model. 

bma1$ols

# Likewise, the 'se' part of the output is a similar matrix containing the standard error of each 
# coefficient (for each variable/model combination). 

bma1$se

# The 'postmean' part of the output (not printed by default) contains the model-averaged posterior 
# mean of each coefficient. The 'postsd' part provides the corresponding posterior standard deviation.

bma1$postmean

bma1$postsd
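
# A compact way to view the model-averaged estimates side by side (both vectors 
# include the intercept as their first element, so the rows align):
round(cbind(post.mean = bma1$postmean, post.sd = bma1$postsd), 3)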

# The 'which' part of the output (not printed by default) contains a logical matrix with each model 
# as a row and each predictor variable as a column, indicating whether a variable was included in 
# each model. 

bma1$which

# The BMA package also contains a plot function for displaying the posterior distributions of the 
# coefficients. 

plot(bma1)
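
# The package's 'imageplot.bma' function shows, for each selected model, which 
# predictors were included:
imageplot.bma(bma1)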

# For a complete description of the 'bicreg' function:

help(bicreg)

# Bayesian model averaging can also be conducted when attempting to identify the best set of predictors 
# for a Generalized Linear Model. The 'bic.glm' function is very similar to 'bicreg'; you must 
# supply a matrix or data frame of the independent variables (predictors) and the outcome variable. 
# Results of the following model mirror those above. However, the obvious benefit of the 'bic.glm' function 
# is the ability to specify non-normal error distributions (i.e. non-Gaussian; e.g. binomial).

bma2 <- bic.glm(predictors, exsem$extro, glm.family = "gaussian")
summary(bma2)

# Notice that when specifying family = "gaussian" the estimated posterior standard 
# deviations are slightly off; therefore, it is best to use 'bicreg' when the 
# family is Gaussian.

bma1$postsd
bma2$postsd
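
# A side-by-side comparison makes the discrepancy easier to see; this sketch 
# assumes the two 'postsd' vectors have the same length and ordering (intercept 
# first):
cbind(bicreg = bma1$postsd, bic.glm = bma2$postsd)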

plot(bma2)

# For a complete description of the 'bic.glm' function and its arguments:

help(bic.glm)

### Example of Binomial Logistic Regression using 'bic.glm'.

# Read in the data:

logreg <- read.table("http://www.unt.edu/rss/class/Jon/R_SC/Module9/logreg1.txt",
          header=TRUE, sep="", na.strings="NA", dec=".", strip.white=TRUE)
summary(logreg)
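
# A quick check of the data and the outcome coding before fitting the logistic 
# model:
head(logreg)
table(logreg$y)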

# Create a matrix of the predictor variables.

attach(logreg)
predictors.logist <- as.matrix(cbind(x1,x2,x3,x4))
detach(logreg)

# Run the 'bic.glm' function specifying the binomial family. 

bma2 <- bic.glm(predictors.logist, logreg$y, glm.family = "binomial")
summary(bma2)
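
# As with 'bicreg', the posterior inclusion probabilities and model-averaged 
# coefficients can be extracted from the 'bic.glm' object:
bma2$probne0
bma2$postmean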

plot(bma2)

### Example of Multinomial Logistic Regression using library 'mlogitBMA' and function 'bic.mlogit'.

library(mlogitBMA)

# Read in the data from the web (the data are stored in an SPSS .sav file, so the 'foreign' package is required). 

library(foreign)
mdata1 <- 
  read.spss("http://www.unt.edu/rss/class/Jon/R_SC/Module9/MultiNomReg.sav", 
  use.value.labels=TRUE, max.value.labels=Inf, to.data.frame=TRUE)
summary(mdata1)

# Apply the 'bic.mlogit' function, supplying the formula in standard format; the 'choices' argument 
# specifies the categories of the outcome variable. 

mlog.1 <- bic.mlogit(y ~ x1 + x2 + x3, data = mdata1, choices = 1:3)
summary(mlog.1)

# To see all the arguments of the 'bic.mlogit' function:

help(bic.mlogit)



# End; Feb, 2011
