Assignment 2—Solution

Question 1

I read in the data and calculate Simpson's diversity index. The species data lie in columns 2 through 76 of the data frame, so I select only those columns when calculating the index.

rikz <- read.table( 'ecol 562/RIKZ.txt', header=TRUE)
library(vegan)
rikz$simpson <- diversity(rikz[,2:76], 'simpson', 1)
model3 <- lm(simpson~NAP*factor(week), data=rikz)

To force R to return estimates of the individual week-specific intercepts and slopes, I refit the full interaction model without the overall intercept and without the NAP main effect.

model3a <- lm(simpson~factor(week)+factor(week):NAP - 1, data=rikz)
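
As a quick consistency check (a sketch only, using the objects defined above; it is not required for the solution), the week-specific estimates from model3a can be recovered by summing the treatment-contrast coefficients of model3. For example, for week 2:

#week 2 intercept and slope reconstructed from the treatment contrasts of model3
b <- coef(model3)
c(intercept=unname(b['(Intercept)'] + b['factor(week)2']),
  slope=unname(b['NAP'] + b['NAP:factor(week)2']))
#compare with coef(model3a)[c('factor(week)2', 'factor(week)2:NAP')]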

I extract the coefficient estimates, calculate confidence intervals with the confint function, and create labels for the week and the estimate type (intercept or slope).

out.mod <- data.frame(est=coef(model3a), confint(model3a), week=rep(1:4,2), parm=rep(c('Intercept', 'Slope'), c(4,4)))
names(out.mod)[2:3] <- c('low95','up95')
out.mod
                           est      low95        up95 week      parm
factor(week)1      0.601960078  0.4579173  0.74600288    1 Intercept
factor(week)2      0.326222041  0.1939818  0.45846226    2 Intercept
factor(week)3      0.599122838  0.4734142  0.72483152    3 Intercept
factor(week)4      0.668685772  0.4094507  0.92792085    4 Intercept
factor(week)1:NAP -0.214123822 -0.3753447 -0.05290296    1     Slope
factor(week)2:NAP -0.213231377 -0.3439623 -0.08250049    2     Slope
factor(week)3:NAP  0.004919613 -0.1015062  0.11134545    3     Slope
factor(week)4:NAP  0.050147353 -0.2177414  0.31803614    4     Slope

Finally I use the lattice code developed in class to plot the point estimates and 95% confidence intervals.

myprepanel.ci <- function(x, y, lx, ux, subscripts, ...) {
  list(xlim=range(x, ux[subscripts], lx[subscripts], finite=TRUE))
}
library(lattice)
dotplot(week~est|parm, data=out.mod, xlab='Estimate', ylab='Week', prepanel=myprepanel.ci, lx=out.mod$low95, ux=out.mod$up95, layout=c(2,1), panel=function(x, y, subscripts){
  panel.dotplot(x, y, col=4, pch='+', cex=.6)
  panel.segments(out.mod$low95[subscripts], y, out.mod$up95[subscripts], y, lwd=3, col='dodgerblue4', lineend=1)
  panel.points(x, y, col='white', pch=16, cex=1.1)
  panel.points(x, y, col='dodgerblue4', pch=1, cex=1.2)
  panel.abline(v=0, col=2, lty=2)
}, scales=list(x='free'))
Fig. 1  Point estimates and 95% confidence intervals for the week-specific intercepts and slopes.

Question 2

I start by calculating the appropriate confidence level to use in displaying the intercepts.

#probability that two (1-alpha)*100% confidence intervals overlap when the true difference is zero
nor.func1 <- function(alpha, model, sig) 1 - pt(-abs(qt(alpha/2, model$df.residual)) * sum(sqrt(diag(sig))) / sqrt(c(1,-1) %*% sig %*% c(1,-1)), model$df.residual) - pt(abs(qt(alpha/2, model$df.residual)) * sum(sqrt(diag(sig))) / sqrt(c(1,-1) %*% sig %*% c(1,-1)), model$df.residual, lower.tail=F)
#function whose root we need: equals zero when the overlap probability is 95%
nor.func2 <- function(a, model, sigma) nor.func1(a, model, sigma) - .95
#obtain confidence level for intercepts
#initialize storage vector
xvec1b <- numeric(6)
#create sigma for intercepts
vmat <- vcov(model3a)[1:4,1:4]
#running index
ind <- 1
for(i in 1:3) {
for(j in (i+1):4){
sig <- vmat[c(i,j),c(i,j)]
#solve for alpha
xvec1b[ind] <- uniroot(function(x) nor.func2(x,model3a, sig), c(.001,.999))$root
ind <- ind+1
}}
#average the confidence levels
ci1b <- round(mean(1-xvec1b),3)
1-xvec1b
[1] 0.8400148 0.8405799 0.8553249 0.8397747 0.8595149 0.8621142
ci1b
[1] 0.85
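
To see why a level below 95% is needed here: for a given α, nor.func1 returns the probability that two (1 − α)100% confidence intervals overlap when the true difference between the coefficients is zero, an event that occurs exactly when the two estimates differ by less than t(α/2, df) times the sum of their standard errors. nor.func2 then locates the α at which this probability equals 0.95, so that non-overlap of the plotted intervals corresponds to a test of equality at roughly the 5% level. With ordinary 95% intervals (α = 0.05) the overlap probability is well above 0.95, as a quick check shows (a sketch using the week 1 and week 2 intercepts; not part of the original calculation):

#overlap probability for the week 1 and week 2 intercepts using plain 95% intervals:
#this exceeds 0.95, so judging differences by non-overlap of 95% intervals is too conservative
nor.func1(.05, model3a, vcov(model3a)[1:2, 1:2])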

The actual values for the six pairwise comparisons vary from 84% to 86%, so the mean of 85% looks reasonable. I repeat this for the slopes.

#obtain confidence level for slopes
#initialize storage
xvec1c <- numeric(6)
#create sigma for slopes
vmat <- vcov(model3a)[5:8,5:8]
#location in storage vector
ind <- 1
for(i in 1:3) {
for(j in (i+1):4) {
sig <- vmat[c(i,j),c(i,j)]
#solve for alpha
xvec1c[ind] <- uniroot(function(x) nor.func2(x,model3a, sig), c(.001,.999))$root
ind <- ind+1
}}
#average the confidence levels
ci1c <- round(mean(1-xvec1c),3)
1-xvec1c
[1] 0.8418427 0.8479629 0.8517045 0.8417620 0.8617851 0.8728127
ci1c
[1] 0.853

The slope confidence levels are a little more variable ranging from 84% to 87%. Once again the mean of 85.3% seems like a reasonable choice.
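
As an aside, the same double loop was used for both the intercepts and the slopes, so it could be wrapped in a small helper function. The following is only a sketch (the name pairwise.level is mine and is not used in the rest of the solution):

#hypothetical helper: average overlap-adjusted confidence level over all
#pairwise comparisons among the model coefficients selected by index
pairwise.level <- function(model, index) {
  vmat <- vcov(model)[index, index]
  prs <- combn(seq_along(index), 2)
  levs <- apply(prs, 2, function(ij) {
    sig <- vmat[ij, ij]
    1 - uniroot(function(x) nor.func2(x, model, sig), c(.001, .999))$root
  })
  mean(levs)
}
#pairwise.level(model3a, 1:4)  #intercepts: roughly 0.85
#pairwise.level(model3a, 5:8)  #slopes: roughly 0.853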

I use these levels, 85% for the intercepts and 85.3% for the slopes, to calculate a new set of confidence intervals and add them to the data frame constructed in Question 1.

# calculate confidence intervals
slope.ci <- confint(model3a,level=ci1c)[5:8,]
int.ci <- confint(model3a,level=ci1b)[1:4,]
# add CI to the data frame from before
out.mod$low50 <- c(int.ci[,1],slope.ci[,1])
out.mod$up50 <- c(int.ci[,2],slope.ci[,2])
out.mod
                           est      low95        up95 week      parm      low50        up50
factor(week)1      0.601960078  0.4579173  0.74600288    1 Intercept  0.4974533  0.70646683
factor(week)2      0.326222041  0.1939818  0.45846226    2 Intercept  0.2302784  0.42216572
factor(week)3      0.599122838  0.4734142  0.72483152    3 Intercept  0.5079180  0.69032772
factor(week)4      0.668685772  0.4094507  0.92792085    4 Intercept  0.4806041  0.85676749
factor(week)1:NAP -0.214123822 -0.3753447 -0.05290296    1     Slope -0.3319865 -0.09626118
factor(week)2:NAP -0.213231377 -0.3439623 -0.08250049    2     Slope -0.3088039 -0.11765883
factor(week)3:NAP  0.004919613 -0.1015062  0.11134545    3     Slope -0.0728844  0.08272363
factor(week)4:NAP  0.050147353 -0.2177414  0.31803614    4     Slope -0.1456963  0.24599099

I generate the plot.

#add a new line to the panel function from before
dotplot(week~est|parm, data=out.mod, xlab='Estimate', ylab='Week', prepanel=myprepanel.ci, lx=out.mod$low95, ux=out.mod$up95, layout=c(2,1), panel=function(x, y, subscripts){
  panel.dotplot(x, y, col=4, pch='+', cex=.6)
  panel.segments(out.mod$low95[subscripts], y, out.mod$up95[subscripts], y, lwd=3, col='dodgerblue4', lineend=1)
  #new line
  panel.segments(out.mod$low50[subscripts], y, out.mod$up50[subscripts], y, lwd=6, col='dodgerblue1', lineend=1)
  panel.points(x, y, col='white', pch=16, cex=1.1)
  panel.points(x, y, col='dodgerblue4', pch=1, cex=1.2)
  panel.abline(v=0, col=2, lty=2)
}, scales=list(x='free'))

Fig. 2  Point estimates with 95% confidence intervals (thin dark bars) and the overlap-adjusted ~85% intervals (thick light bars) for the week-specific intercepts and slopes.

From the graph it is clear that the slopes in weeks 1 and 2 are not different from each other, nor are the slopes in weeks 3 and 4. The slopes in weeks 1 and 2 are both different from the slope in week 3 (the thick bars fail to overlap). If you scrutinize the graph carefully you'll see that the thick blue bars for weeks 1 and 2 do overlap the thick blue bar for week 4, so those slopes are not significantly different. Thus only two of the six possible pairwise slope differences are statistically significant: week 1 versus week 3 and week 2 versus week 3.

Using the 95% confidence intervals we see that the slopes in weeks 1 and 2 are significantly different from zero while the slopes in weeks 3 and 4 are not.

Remember that this graph is supposed to be consistent with what we would obtain if we formally carried out these six individual tests. Three of the tests, comparing the slopes in weeks 2, 3, and 4 against the slope in week 1, are carried out in the summary table of the original model.

printCoefmat(summary(model3)$coefficients)
                     Estimate  Std. Error t value  Pr(>|t|)   
(Intercept)        0.60196008  0.07109038  8.4675 3.477e-10 ***
NAP               -0.21412382  0.07956839 -2.6911  0.010628 * 
factor(week)2     -0.27573804  0.09650602 -2.8572  0.006975 **
factor(week)3     -0.00283724  0.09435587 -0.0301  0.976173   
factor(week)4      0.06672569  0.14636596  0.4559  0.651139   
NAP:factor(week)2  0.00089244  0.10244032  0.0087  0.993096   
NAP:factor(week)3  0.21904343  0.09534153  2.2975  0.027348 * 
NAP:factor(week)4  0.26427117  0.15430936  1.7126  0.095155 . 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Notice that the last line of the output shows that the slopes in weeks 1 and 4 are not significantly different (p = 0.095), while the slopes in weeks 1 and 3 are (second to the last line, p = 0.027). To get additional pairwise tests we can refit the model choosing week 2 as the reference group.

model3.1 <- lm(simpson~NAP*factor(week, levels=c(2,1,3,4)), data=rikz)
printCoefmat(summary(model3.1)$coefficients)
                                             Estimate  Std. Error t value  Pr(>|t|)   
(Intercept)                                0.32622204  0.06526538  4.9984 1.418e-05 ***
NAP                                       -0.21323138  0.06452047 -3.3049  0.002118 **
factor(week, levels = c(2, 1, 3, 4))1      0.27573804  0.09650602  2.8572  0.006975 **
factor(week, levels = c(2, 1, 3, 4))3      0.27290080  0.09004864  3.0306  0.004437 **
factor(week, levels = c(2, 1, 3, 4))4      0.34246373  0.14362702  2.3844  0.022347 * 
NAP:factor(week, levels = c(2, 1, 3, 4))1 -0.00089244  0.10244032 -0.0087  0.993096   
NAP:factor(week, levels = c(2, 1, 3, 4))3  0.21815099  0.08319718  2.6221  0.012612 * 
NAP:factor(week, levels = c(2, 1, 3, 4))4  0.26337873  0.14711608  1.7903  0.081595 . 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

The last line of the output shows that the slopes in weeks 2 and 4 are not significantly different (p = 0.082), while the slopes in weeks 2 and 3 are (second to the last line, p = 0.013). This agrees with our graphical display.
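
As a further check (a base R sketch, not part of the original solution), all six pairwise slope comparisons can be computed directly from the coefficient covariance matrix of model3a; five of the six p-values can be compared against the two summary tables above.

#all six pairwise slope comparisons computed from model3a's covariance matrix
bs <- coef(model3a)[5:8]
Vs <- vcov(model3a)[5:8, 5:8]
prs <- combn(1:4, 2)
tests <- apply(prs, 2, function(ij) {
  d <- unname(bs[ij[1]] - bs[ij[2]])
  se <- sqrt(Vs[ij[1], ij[1]] + Vs[ij[2], ij[2]] - 2*Vs[ij[1], ij[2]])
  c(diff=d, t=d/se, p.value=2*pt(-abs(d/se), model3a$df.residual))
})
colnames(tests) <- apply(prs, 2, paste, collapse=' vs ')
round(t(tests), 4)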

Question 3

By definition Simpson's diversity index reports the probability that two randomly chosen individuals from the same site are of different species. Being a probability, Simpson's diversity lies between zero and one. We can demonstrate that there is a problem with the normal model by showing that it routinely assigns non-negligible probability to values outside both of these boundaries.
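
As a quick check on this definition (a sketch using the rikz data frame from Question 1), the index for the first site can be computed by hand as one minus the sum of squared relative abundances and compared with the value returned by vegan:

#Simpson's index for site 1 computed by hand: 1 - sum of squared species proportions
counts <- as.numeric(rikz[1, 2:76])
p <- counts/sum(counts)
c(by.hand=1 - sum(p^2), vegan=rikz$simpson[1])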

The X = 0 boundary. I calculate P(X ≤ 0) for each of the 45 predicted means. The largest of these probabilities is 0.60, and at 12 of the 45 predicted means there is a greater than 5% chance of obtaining a negative value for diversity.

max(pnorm(0, predict(model3), summary(model3)$sigma))
[1] 0.5965873
sum(pnorm(0, predict(model3), summary(model3)$sigma)>.05)
[1] 12

I display all 45 probabilities in a box plot.

boxplot(pnorm(0, predict(model3), summary(model3)$sigma), horizontal=T, outline=F, ylim=c(0,.6), xlab='P(X <= 0)')
#jitter the points vertically to prevent overlap
points(pnorm(0, predict(model3), summary(model3)$sigma), jitter(rep(1,45), a=.2), col='seagreen', cex=.8, pch=16)
abline(v=.05, lty=2, col=2)

Fig. 3  Box plot of P(X ≤ 0) at the 45 predicted means, with the individual probabilities overlaid and a reference line at 0.05.

The X = 1 boundary. I calculate P(X > 1) using the lower.tail=F option of pnorm. This time the largest probability of a value exceeding 1 is 0.31. At 9 of the 45 means there is a greater than 5% chance of obtaining a diversity value that exceeds 1.

max(pnorm(1, predict(model3), summary(model3)$sigma, lower.tail=F))
[1] 0.3080553
sum(pnorm(1, predict(model3), summary(model3)$sigma, lower.tail=F)>.05)
[1] 9

I display the 45 probabilities in a box plot.

boxplot(pnorm(1, predict(model3), summary(model3)$sigma, lower.tail=F), horizontal=T, outline=F, ylim=c(0,.35), xlab='P(X > 1)')
#jitter the points vertically to prevent overlap
points(pnorm(1, predict(model3), summary(model3)$sigma, lower.tail=F), jitter(rep(1,45), a=.2), col='seagreen', cex=.8, pch=16)
abline(v=.05, lty=2, col=2)

Fig. 4  Box plot of P(X > 1) at the 45 predicted means, with the individual probabilities overlaid and a reference line at 0.05.

For completeness I collect both sets of probabilities in a single data frame and display the two box plots together in one graph.

newdat <- data.frame(prob = c(pnorm(0, predict(model3), summary(model3)$sigma), pnorm(1, predict(model3), summary(model3)$sigma, lower.tail=F)), cat=rep(c(0,1), c(45,45)))
boxplot(prob~factor(cat), data=newdat, horizontal=T, outline=F, ylim=c(0,.6), axes=F, xlab='Probability')
axis(1)
axis(2, las=2, at=1:2, labels=c(expression(P(X<=0)), expression(P(X>1))), cex.axis=.9)
box()
points(newdat$prob,jitter(as.numeric(newdat$cat)+1,a=0.2), col='seagreen', cex=.8, pch=16)
abline(v=.05, lty=2, col=2)

Fig. 5  Box plots of P(X ≤ 0) and P(X > 1) at the 45 predicted means displayed in a single graph.

