Data Visualization

The yield data come from Chen, O’Leary, and Evans (2019). The data were collected at two sites, CNT and NVT; we focus on the NVT site only.

NVTData <- read.table("NVTData.txt",header = TRUE)
plot(NVTData$Wavail,NVTData$yield,cex=0.1)

datOR    <- NVTData[order(NVTData$Wavail),]
knotsNum <- length(table(datOR$Wavail)) # number of unique Wavail values

The NVT data comprise lat, longi, year, soils and Wavail as explanatory variables, plus the response yield; Wavail (available water) takes 362 unique values.
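
A quick check of the data structure and the number of distinct Wavail values (a short sketch; the count of 362 is quoted from the description above):

str(NVTData)
length(unique(NVTData$Wavail))  # expected: 362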

Model Fitting with gam()

gam() fits generalized additive models with integrated smoothness estimation (Wood 2006). The function is provided by the mgcv package (Wood 2015).

library(mgcv)
fitgam.tp1 <- gam(yield ~ s(Wavail, bs = "tp"),data=datOR)
plot(datOR$Wavail,datOR$yield,cex=0.1,col="gray")
points(datOR$Wavail,fitgam.tp1$fitted.values,type="l",lwd=2)

In the above model the basis is tp, a thin plate regression spline. In the following models, cr denotes a cubic regression spline and gp a Gaussian process smooth with one of a small set of simple correlation functions.

fitgam.tp2 <- gam(yield ~ s(Wavail,k=3, bs = "tp"), data = datOR)
fitgam.tp3 <- gam(yield ~ s(Wavail,k=10, bs = "tp"),data = datOR)
fitgam.tp4 <- gam(yield ~ s(Wavail,k=50, bs = "tp"),data = datOR)
fitgam.tp5 <- gam(yield ~ s(Wavail,k=100, bs = "tp"),data = datOR)
plot(datOR$Wavail,datOR$yield,cex=0.1,col="gray")
points(datOR$Wavail,fitgam.tp2$fitted.values,type="l",col="red",lwd=2)
points(datOR$Wavail,fitgam.tp3$fitted.values,type="l",col="blue",lwd=2)
points(datOR$Wavail,fitgam.tp4$fitted.values,type="l",col="green",lwd=2)
points(datOR$Wavail,fitgam.tp5$fitted.values,type="l",col="brown",lwd=2)

AIC(fitgam.tp2,fitgam.tp3,fitgam.tp4,fitgam.tp5)
##                    df      AIC
## fitgam.tp2   3.998908 30748.05
## fitgam.tp3  10.933138 30492.71
## fitgam.tp4  50.120859 29182.53
## fitgam.tp5 100.266214 26895.97
BIC(fitgam.tp2,fitgam.tp3,fitgam.tp4,fitgam.tp5)
##                    df      BIC
## fitgam.tp2   3.998908 30777.73
## fitgam.tp3  10.933138 30573.87
## fitgam.tp4  50.120859 29554.61
## fitgam.tp5 100.266214 27640.31
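
Both criteria keep decreasing as k grows. One way to see how much of each basis is actually used is to sum the effective degrees of freedom stored on every fitted gam object; a minimal sketch:

# total effective degrees of freedom used by each thin plate fit
sapply(list(tp2=fitgam.tp2, tp3=fitgam.tp3, tp4=fitgam.tp4, tp5=fitgam.tp5),
       function(m) sum(m$edf))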

The same fits with a different basis, cr.

fitgam.cr1 <- gam(yield ~ s(Wavail, bs = "cr"),data=datOR)
plot(datOR$Wavail,datOR$yield,cex=0.1,col="gray")
points(datOR$Wavail,fitgam.cr1$fitted.values,type="l",lwd=2)

fitgam.cr2 <- gam(yield ~ s(Wavail,k=3, bs = "cr"), data = datOR)
fitgam.cr3 <- gam(yield ~ s(Wavail,k=10, bs = "cr"),data = datOR)
fitgam.cr4 <- gam(yield ~ s(Wavail,k=50, bs = "cr"),data = datOR)
fitgam.cr5 <- gam(yield ~ s(Wavail,k=100, bs = "cr"),data = datOR)
plot(datOR$Wavail,datOR$yield,cex=0.1,col="gray")
points(datOR$Wavail,fitgam.cr2$fitted.values,type="l",col="red",lwd=2)
points(datOR$Wavail,fitgam.cr3$fitted.values,type="l",col="blue",lwd=2)
points(datOR$Wavail,fitgam.cr4$fitted.values,type="l",col="green",lwd=2)
points(datOR$Wavail,fitgam.cr5$fitted.values,type="l",col="brown",lwd=2)

AIC(fitgam.cr2,fitgam.cr3,fitgam.cr4,fitgam.cr5)
##                    df      AIC
## fitgam.cr2   3.999819 30755.26
## fitgam.cr3  10.967775 30579.91
## fitgam.cr4  50.613471 29010.34
## fitgam.cr5 100.517310 27075.68
BIC(fitgam.cr2,fitgam.cr3,fitgam.cr4,fitgam.cr5)
##                    df      BIC
## fitgam.cr2   3.999819 30784.96
## fitgam.cr3  10.967775 30661.33
## fitgam.cr4  50.613471 29386.08
## fitgam.cr5 100.517310 27821.88

The same fits with a different basis, gp.

fitgam.gp2 <- gam(yield ~ s(Wavail,k=3, bs = "gp"), data = datOR)
fitgam.gp3 <- gam(yield ~ s(Wavail,k=10, bs = "gp"),data = datOR)
fitgam.gp4 <- gam(yield ~ s(Wavail,k=50, bs = "gp"),data = datOR)
fitgam.gp5 <- gam(yield ~ s(Wavail,k=100, bs = "gp"),data = datOR)
plot(datOR$Wavail,datOR$yield,cex=0.1,col="gray")
points(datOR$Wavail,fitgam.gp2$fitted.values,type="l",col="red",lwd=2)
points(datOR$Wavail,fitgam.gp3$fitted.values,type="l",col="blue",lwd=2)
points(datOR$Wavail,fitgam.gp4$fitted.values,type="l",col="green",lwd=2)
points(datOR$Wavail,fitgam.gp5$fitted.values,type="l",col="brown",lwd=2)

AIC(fitgam.gp2,fitgam.gp3,fitgam.gp4,fitgam.gp5)
##                   df      AIC
## fitgam.gp2  3.999795 30774.97
## fitgam.gp3 10.968502 30525.49
## fitgam.gp4 21.906114 29926.96
## fitgam.gp5 22.203024 29924.30
BIC(fitgam.gp2,fitgam.gp3,fitgam.gp4,fitgam.gp5)
##                   df      BIC
## fitgam.gp2  3.999795 30804.67
## fitgam.gp3 10.968502 30606.92
## fitgam.gp4 21.906114 30089.58
## fitgam.gp5 22.203024 30089.13

Comparison of different selection criteria

aics.tp <- numeric(knotsNum)
gcvs.tp <- numeric(knotsNum)
devs.tp <- numeric(knotsNum)
aics.gp <- numeric(knotsNum)
gcvs.gp <- numeric(knotsNum)
devs.gp <- numeric(knotsNum)
for(kts in 3:knotsNum){
  fitgam.tp.temp <- gam(yield ~ s(Wavail,k=kts, bs = "tp"), 
                        data = datOR)
  aics.tp[kts] <- fitgam.tp.temp$aic
  gcvs.tp[kts] <- fitgam.tp.temp$gcv.ubre
  devs.tp[kts] <- fitgam.tp.temp$deviance
  fitgam.gp.temp <- gam(yield ~ s(Wavail,k=kts, bs = "gp"),
                        data = datOR)
  aics.gp[kts] <- fitgam.gp.temp$aic
  gcvs.gp[kts] <- fitgam.gp.temp$gcv.ubre
  devs.gp[kts] <- fitgam.gp.temp$deviance
  cat(kts);
}
knotsNum <- 50 # zoom in on k <= 50 for plotting
par(mfrow=c(1,3))
plot(3:knotsNum,aics.tp[3:knotsNum])
plot(3:knotsNum,gcvs.tp[3:knotsNum])
plot(3:knotsNum,devs.tp[3:knotsNum])

par(mfrow=c(1,1))

par(mfrow=c(1,3))
plot(3:knotsNum,aics.gp[3:knotsNum])
plot(3:knotsNum,gcvs.gp[3:knotsNum])
plot(3:knotsNum,devs.gp[3:knotsNum])

par(mfrow=c(1,1))

fitgam.gp6 <- gam(yield ~ s(Wavail,k=22, bs = "gp"),data = datOR)
plot(datOR$Wavail,datOR$yield,cex=0.1,col="gray")
points(datOR$Wavail,fitgam.gp6$fitted.values,type="l",col="red",lwd=2)

However, the contradiction remains: the selection criteria keep favouring larger k, while the fitted curves below become increasingly wiggly.

plot(datOR$Wavail,datOR$yield,cex=0.1,col="gray")
points(datOR$Wavail,fitgam.gp2$fitted.values,type="l",col="red",lwd=2)
points(datOR$Wavail,fitgam.gp3$fitted.values,type="l",col="blue",lwd=2)
points(datOR$Wavail,fitgam.gp4$fitted.values,type="l",col="green",lwd=2)
points(datOR$Wavail,fitgam.gp5$fitted.values,type="l",col="brown",lwd=2)
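
mgcv also ships a diagnostic, gam.check(), that reports a k-index test of whether the basis dimension is adequate, alongside residual plots; a sketch applied to the largest gp fit:

gam.check(fitgam.gp5)  # prints k' vs edf and the k-index test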

A possible solution: aggregated data

datOR.agg <- aggregate(datOR[c("lat","longi","year","Wavail","yield")],list(datOR$Wavail),mean)
names(datOR.agg)[1] <- c("WavailAgg")
plot(datOR$Wavail,datOR$yield,cex=0.1,pch=20,col="grey")
points(datOR.agg$WavailAgg,datOR.agg$yield,pch=20,col="red")

By taking the mean for each unique Wavail value, we obtain the aggregated data, which have 362 rows rather than 12377.
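
A quick size check:

nrow(datOR)      # 12377 rows in the original data
nrow(datOR.agg)  # 362 rows, one per unique Wavail value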

aics.agg <- numeric(knotsNum)
gcvs.agg <- numeric(knotsNum)
devs.agg <- numeric(knotsNum)
for(kts in 3:knotsNum){
  fitgam.agg    <- gam(yield ~ s(Wavail,k=kts,bs = "gp"), 
                       data=datOR.agg)
  aics.agg[kts] <- fitgam.agg$aic
  gcvs.agg[kts] <- fitgam.agg$gcv.ubre
  devs.agg[kts] <- fitgam.agg$deviance
  cat(kts);
}
par(mfrow=c(1,3))
plot(3:knotsNum,aics.agg[-c(1:2)],type="b")
plot(3:knotsNum,gcvs.agg[-c(1:2)],type="b")
plot(3:knotsNum,devs.agg[-c(1:2)],type="b")

par(mfrow=c(1,1))
fitgam.gp.agg1 <- gam(yield ~ s(Wavail,k=3, bs = "gp"), data=datOR.agg)
fitgam.gp.agg2 <- gam(yield ~ s(Wavail,k=5, bs = "gp"), data=datOR.agg)
fitgam.gp.agg3 <- gam(yield ~ s(Wavail,k=10,bs = "gp"), data=datOR.agg)
plot(datOR.agg$Wavail,datOR.agg$yield,cex=0.1,pch=20)
points(datOR.agg$Wavail,fitgam.gp.agg1$fitted.values,type="l",col="red",lwd=2)
points(datOR.agg$Wavail,fitgam.gp.agg2$fitted.values,type = "l",col="blue",lwd=2)
points(datOR.agg$Wavail,fitgam.gp.agg3$fitted.values,type = "l",col="green",lwd=2)

Another solution: taking a subset

By taking a subset of size \(N\) from the original data set, we may reduce the variance.

set.seed(2019)
s<- sort(sample(1:nrow(datOR),1000))
datOR.subset  <- datOR[s,]
knotsNum  <- 50
aics.subset <- numeric(knotsNum)
gcvs.subset <- numeric(knotsNum)
divs.subset <- numeric(knotsNum)
for(kts in 3:knotsNum){
  fitgam.subset    <- gam(yield ~ s(Wavail,k=kts, bs = "gp"), 
                            data=datOR.subset)
  aics.subset[kts] <- fitgam.subset$aic
  gcvs.subset[kts] <- fitgam.subset$gcv.ubre
  divs.subset[kts] <- fitgam.subset$deviance
}
par(mfrow=c(1,3))
plot(aics.subset[-c(1,2)])
plot(gcvs.subset[-c(1,2)])
plot(divs.subset[-c(1,2)])

par(mfrow=c(1,1))
fitgam.subset <- gam(yield ~ s(Wavail,k=20, bs = "gp"), 
                        data=datOR.subset)
plot(datOR.subset$Wavail,datOR.subset$yield,cex=0.5,pch=20,col="grey")
points(datOR.subset$Wavail,fitgam.subset$fitted.values,type="l")

Bootstrap and naive resampling

datOR.sample <- NULL
wavs <- names(table(datOR$Wavail)) # unique Wavail values, stored as character
set.seed(2019)
for(k in wavs){
  tempdat<- datOR[which(datOR$Wavail == k),]
  s<- sort(sample(1:nrow(tempdat),1))
  datOR.sample<-rbind(datOR.sample,tempdat[s,])
}
fitgam.sample <- gam(yield  ~ s(Wavail,k=30,bs = "gp"), 
                     data=datOR.sample)

Taking one observation from each unique Wavail value gives a subset of 362 rows; this is similar in spirit to the aggregated-data approach.
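
Again a quick size check:

nrow(datOR.sample)  # 362 rows, one per unique Wavail value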

plot(datOR.sample$Wavail,datOR.sample$yield,cex=0.5,pch=20,col="grey")
points(datOR.sample$Wavail,fitgam.sample$fitted.values,type = "l")

fitgam.gp.sample1 <- gam(yield ~ s(Wavail,k=3,bs = "gp"), data=datOR.sample)
fitgam.gp.sample2 <- gam(yield ~ s(Wavail,k=10,bs = "gp"), data=datOR.sample)
fitgam.gp.sample3 <- gam(yield ~ s(Wavail,k=50,bs = "gp"), data=datOR.sample)
fitgam.gp.sample4 <- gam(yield ~ s(Wavail,k=100,bs = "gp"), data=datOR.sample)
plot(datOR.sample$Wavail,datOR.sample$yield,cex=0.1,pch=20,col="grey")
points(datOR.sample$Wavail,fitgam.gp.sample1$fitted.values,type = "l",col="red",lwd=2)
points(datOR.sample$Wavail,fitgam.gp.sample2$fitted.values,type = "l",col="blue",lwd=2)
points(datOR.sample$Wavail,fitgam.gp.sample3$fitted.values,type = "l",col="green",lwd=2)
points(datOR.sample$Wavail,fitgam.gp.sample4$fitted.values,type = "l",col="brown",lwd=2)

AIC(fitgam.gp.sample1,fitgam.gp.sample2,fitgam.gp.sample3,fitgam.gp.sample4)
##                         df      AIC
## fitgam.gp.sample1 3.979491 886.6777
## fitgam.gp.sample2 5.735994 884.1414
## fitgam.gp.sample3 6.161752 883.8681
## fitgam.gp.sample4 6.720663 883.9109
BIC(fitgam.gp.sample1,fitgam.gp.sample2,fitgam.gp.sample3,fitgam.gp.sample4)
##                         df      BIC
## fitgam.gp.sample1 3.979491 902.1644
## fitgam.gp.sample2 5.735994 906.4638
## fitgam.gp.sample3 6.161752 907.8474
## fitgam.gp.sample4 6.720663 910.0654

Repeat the naive resampling 30 times.

N <- 30
boost.max1 <- matrix(0,N,362)
boost.max2 <- matrix(0,N,362)
boost.max3 <- matrix(0,N,362)
boost.max4 <- matrix(0,N,362)
rowNms <- matrix(0,N,362)
# selection criteria for each resample, used in the comparison below
aics.max1 <- aics.max2 <- aics.max3 <- aics.max4 <- numeric(N)
bics.max1 <- bics.max2 <- bics.max3 <- bics.max4 <- numeric(N)
devs.max1 <- devs.max2 <- devs.max3 <- devs.max4 <- numeric(N)
set.seed(2019)
for(i in 1:N){
  datOR.boost <- NULL
  for(k in wavs){
    tempdat<- datOR[which(datOR$Wavail == k),]
    s<- sort(sample(1:nrow(tempdat),1))
    datOR.boost<-rbind(datOR.boost,tempdat[s,])
  }
  fitgam.gp.boost1 <- gam(yield ~ s(Wavail,k=3,bs="gp"),  data=datOR.boost)
  fitgam.gp.boost2 <- gam(yield ~ s(Wavail,k=10,bs="gp"), data=datOR.boost)
  fitgam.gp.boost3 <- gam(yield ~ s(Wavail,k=50,bs="gp"), data=datOR.boost)
  fitgam.gp.boost4 <- gam(yield ~ s(Wavail,k=100,bs="gp"),data=datOR.boost)
  boost.max1[i,] <- fitgam.gp.boost1$fitted.values
  boost.max2[i,] <- fitgam.gp.boost2$fitted.values
  boost.max3[i,] <- fitgam.gp.boost3$fitted.values
  boost.max4[i,] <- fitgam.gp.boost4$fitted.values
  aics.max1[i] <- fitgam.gp.boost1$aic; bics.max1[i] <- BIC(fitgam.gp.boost1)
  aics.max2[i] <- fitgam.gp.boost2$aic; bics.max2[i] <- BIC(fitgam.gp.boost2)
  aics.max3[i] <- fitgam.gp.boost3$aic; bics.max3[i] <- BIC(fitgam.gp.boost3)
  aics.max4[i] <- fitgam.gp.boost4$aic; bics.max4[i] <- BIC(fitgam.gp.boost4)
  devs.max1[i] <- fitgam.gp.boost1$deviance
  devs.max2[i] <- fitgam.gp.boost2$deviance
  devs.max3[i] <- fitgam.gp.boost3$deviance
  devs.max4[i] <- fitgam.gp.boost4$deviance
  rowNms[i,]     <- row.names(datOR.boost)
}
hist(as.integer(rowNms),30)

wavsNum <- as.numeric(wavs) # numeric copy of the unique Wavail values for plotting
plot(datOR$Wavail,datOR$yield,cex=0.5,pch=20,col="grey")
points(wavsNum,apply(boost.max1,2,mean),type="l",col="red",lwd=2)
points(wavsNum,apply(boost.max2,2,mean),type="l",col="blue",lwd=2)
points(wavsNum,apply(boost.max3,2,mean),type="l",col="green",lwd=2)
points(wavsNum,apply(boost.max4,2,mean),type="l",col="brown",lwd=2)

plot(datOR$Wavail,datOR$yield,cex=0.1,pch=20,col="grey")
for(k in 1:N) points(wavsNum,boost.max1[k,],col=grey(0.5),type="l")

plot(datOR$Wavail,datOR$yield,cex=0.1)
for(k in 1:N) points(wavsNum,boost.max2[k,],col=grey(0.5),type="l")

plot(datOR$Wavail,datOR$yield,cex=0.1)
for(k in 1:N) points(wavsNum,boost.max3[k,],col=grey(0.5),type="l")

plot(datOR$Wavail,datOR$yield,cex=0.1)
for(k in 1:N) points(wavsNum,boost.max4[k,],col=grey(0.5),type="l")

Comparison of \(AIC\)s, \(BIC\)s and deviances.

plot(aics.max1,type="b",col="red")
points(aics.max2,type="b",col="blue")
points(aics.max3,type="b",col="green")
points(aics.max4,type="b",col="brown")

mean(aics.max1);mean(aics.max2);mean(aics.max3);mean(aics.max4)
## [1] 910.6492
## [1] 909.2509
## [1] 908.8717
## [1] 908.7469
plot(bics.max1,type="b",col="red")
points(bics.max2,type="b",col="blue")
points(bics.max3,type="b",col="green")
points(bics.max4,type="b",col="brown")

mean(bics.max1);mean(bics.max2);mean(bics.max3);mean(bics.max4)
## [1] 926.1302
## [1] 930.5354
## [1] 941.4827
## [1] 938.555
plot(devs.max1,type="b",col="red")
points(devs.max2,type="b",col="blue")
points(devs.max3,type="b",col="green")
points(devs.max4,type="b",col="brown")

mean(devs.max1);mean(devs.max2);mean(devs.max3);mean(devs.max4)
## [1] 256.7962
## [1] 253.7053
## [1] 249.4922
## [1] 250.3439

This approach is NOT robust: the fitted curves vary noticeably across resamples.

Cutting-off CV or stratified CV

In stratified cross-validation (SCV), the values \(\mathbf{y}_i\), denoting all measurements corresponding to an individual \(i\), are removed from the data set for testing. The remaining data \(\mathbf{y}^{-i}\) are used to train the model.

Cutting-off CV instead removes a block of consecutive individuals \(i,\ldots,i+k\) from the data set for testing.

For example

tempTests <- subset(datOR,datOR$Wavail %in% wavs[91:100])
plot(datOR$Wavail,datOR$yield,pch=20,cex=0.5,col="grey")
points(tempTests$Wavail,tempTests$yield,pch=20,col="red")

RMSE <- function(pred, obs) sqrt(mean((pred - obs)^2)) # standard RMSE helper

wavs <- names(table(datOR$Wavail))
knotsNum <- length(wavs)
cv.train <- matrix(0,knotsNum,knotsNum)
cv.tests <- matrix(0,knotsNum,knotsNum)

kfo <- 5                # size of the held-out block of Wavail values
sq1 <- seq(3,30,by=1)   # candidate basis dimensions
sq2 <- 2:(knotsNum-kfo) # starting positions of the held-out block

for(nkts in sq1){
  for(st in sq2){
    s         <- st:(st+kfo-1)
    tempTrain <- subset(datOR,!(datOR$Wavail %in% wavs[s]))
    tempTests <- subset(datOR,datOR$Wavail %in% wavs[s])
    fitgam.gp.cv <- gam(yield ~ s(Wavail,k=nkts), data=tempTrain)
    cv.train[nkts,st] <- RMSE(fitgam.gp.cv$fitted.values,tempTrain$yield)
    preTest <- predict(fitgam.gp.cv,data.frame(Wavail = tempTests$Wavail))
    cv.tests[nkts,st] <- RMSE(preTest,tempTests$yield)
  }
  cat(nkts);
}
plot(sq1,apply(cv.train[sq1,sq2],1,mean),ylim=c(0.75,0.9),pch=20,type="b")
points(sq1,apply(cv.tests[sq1,sq2],1,mean),col="red",pch=20,type="b")
legend("top", legend=c("Training Error", "Testing error"), col=c("black", "red"),pch=c(20,20))

The training error and the testing error balance at about k = 15.
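
As a check, the model can be refitted at the balanced basis dimension (a sketch; fitgam.cv15 is a name introduced here):

fitgam.cv15 <- gam(yield ~ s(Wavail, k=15), data=datOR)
plot(datOR$Wavail,datOR$yield,cex=0.1,col="gray")
points(datOR$Wavail,fitgam.cv15$fitted.values,type="l",lwd=2)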

Constrained Generalized Additive Model Fitting

The cgam package contains routines to fit generalized additive models in which the components may be modeled with shape and smoothness assumptions. The function s.incr.conc() specifies a smooth, increasing and concave shape restriction in a cgam() formula (Liao and Meyer 2019).

According to the value of the vector itself and its shape, numknots, knots and space attributes, the cone edges are made of C-spline basis functions, where C-spline refers to a convex regression spline (Meyer 2008; Meyer 2013).

library(cgam)
fitcgam1 <- cgam(yield ~ s.incr.conc(Wavail,numknots=3),  data=datOR)
fitcgam2 <- cgam(yield ~ s.incr.conc(Wavail,numknots=10), data=datOR)
fitcgam3 <- cgam(yield ~ s.incr.conc(Wavail,numknots=50), data=datOR)
fitcgam4 <- cgam(yield ~ s.incr.conc(Wavail,numknots=100),data=datOR)
plot(datOR$Wavail,datOR$yield,cex=0.5,pch=20,col="grey")
points(datOR$Wavail,fitcgam1$muhat,type = "l",lwd=2,col="red")
points(datOR$Wavail,fitcgam2$muhat,type = "l",lwd=2,col="blue")
points(datOR$Wavail,fitcgam3$muhat,type = "l",lwd=2,col="green")
points(datOR$Wavail,fitcgam4$muhat,type = "l",lwd=2,col="brown")

fitcgam1$deviance
## [1] 8815.735
fitcgam2$deviance
## [1] 8630.178
fitcgam3$deviance
## [1] 8584.179
fitcgam4$deviance
## [1] 8579.372

A drawback of cgam is that it does not handle smooth interactions of two variables, whereas gam does.

## Not run: this call crashed the R session
# fitcgam1.s <- cgam(yield ~ s.incr.conc(Wavail,numknots=3) +
#                    s(lat,longi, numknots = 6), data=datOR)

Comparison of gam() and cgam() on deviance

kts     <- 3:40
gamDev  <- numeric(max(kts))
cgamDev <- numeric(max(kts))
for(k in kts){
  gamfit    <- gam(yield ~ s(Wavail,k=k,bs="gp"), data=datOR)
  gamDev[k] <- gamfit$deviance
  cat(k);
}
for(k in kts){
  cgamfit   <- cgam(yield ~ s.incr.conc(Wavail,numknots=k), data=datOR)
  cgamDev[k]<- cgamfit$deviance
  cat(k);
}
plot(kts,gamDev[kts],type="b",col="red",ylim=c(8000,9000))
points(kts,cgamDev[kts],type="b",col="blue")

The deviance from cgam() is more stable than that from gam().

Shape constrained additive models (SCAM)

Similar to cgam, scam fits shape constrained additive models (SCAMs). A SCAM is a generalized additive model in which the linear predictor is given by strictly parametric components plus a sum of smooth functions of the covariates, some of which are assumed to be shape constrained (Pya and Wood 2015; Pya 2010).

library(scam)
fitscam1 <- scam(yield ~ s(Wavail,k=4, bs="cv"),  data=datOR)
fitscam2 <- scam(yield ~ s(Wavail,k=10, bs="cv"), data=datOR)
fitscam3 <- scam(yield ~ s(Wavail,k=50, bs="cv"), data=datOR)
fitscam4 <- scam(yield ~ s(Wavail,k=100, bs="cv"),data=datOR)

A SCAM formula is exactly like a GAM formula, except that some smooth terms may be shape constrained; bs="cv" indicates concave P-splines.
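
Since cgam above imposed an increasing-and-concave restriction, the closest scam analogue appears to be bs="micv" (monotone increasing and concave P-splines); a minimal sketch, where fitscam.micv is a name introduced here:

fitscam.micv <- scam(yield ~ s(Wavail,k=10, bs="micv"), data=datOR)
plot(datOR$Wavail,datOR$yield,cex=0.5,pch=20,col="grey")
points(datOR$Wavail,fitscam.micv$fitted.values,type="l",lwd=2)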

plot(datOR$Wavail,datOR$yield,cex=0.5,pch=20,col="grey")
points(datOR$Wavail,fitscam1$fitted.values,type = "l",lwd=2,col="red")
points(datOR$Wavail,fitscam2$fitted.values,type = "l",lwd=2,col="blue")
points(datOR$Wavail,fitscam3$fitted.values,type = "l",lwd=2,col="green")
points(datOR$Wavail,fitscam4$fitted.values,type = "l",lwd=2,col="brown")

fitscam1$aic;fitscam2$aic;fitscam3$aic;fitscam4$aic
## [1] 30828.17
## [1] 30670.38
## [1] 30596.57
## [1] 30590.81
# BIC(fitscam1,fitscam2,fitscam3,fitscam4)

If we compare cgam and scam with the same number of knots:

fitcgam2 <- cgam(yield ~ s.incr.conc(Wavail,numknots=10), data=datOR)
fitscam2 <- scam(yield ~ s(Wavail,k=10, bs="cv"), data=datOR)
fitcgam2$deviance;fitscam2$deviance
## [1] 8630.178
## [1] 8627.923
plot(datOR$Wavail,datOR$yield,cex=0.5,pch=20,col="grey")
points(datOR$Wavail,fitcgam2$muhat,type = "l",lwd=2,col="red")
points(datOR$Wavail,fitscam2$fitted.values,type = "l",lwd=2,col="blue")

Here we propose a research topic: compare gam, cgam and scam (and asreml) on their performance in fitting longitudinal (repeated-measures) data sets.

An additive Gaussian process regression model

Gaussian processes extend the multivariate Gaussian distribution to infinite collections of real-valued variables, any finite number of which have a joint Gaussian distribution. A Gaussian process defines a probability distribution over functions. It is fully specified by its mean \(m(t)\) and covariance \(K(s,t)\) functions, \[\begin{align} m(t)&=\mathrm{E} \lbrack f(t)\rbrack \\ K(s,t)&=\mathrm{E} \lbrack \left(f(s)-m(s)\right) \left(f(t)-m(t)\right)\rbrack, \end{align}\] where \(s\) and \(t\) are two inputs. A function \(f\) distributed in this way is written \[\begin{equation} f \sim \mathrm{GP}\left(m(t),K(s,t) \right). \end{equation}\] Usually the mean function is assumed to be zero everywhere.

Given a set of inputs \(\mathbf{t} = \left\lbrace t_1,\ldots,t_n\right\rbrace\) for the function \(f(t)\) and the outputs \(\mathbf{y}=f(\mathbf{t})+\varepsilon\) with i.i.d. Gaussian noise \(\varepsilon\) of variance \(\sigma_n^2\), we can use the above definition to predict the value of the function \(f_*=f(t_*)\) at a particular input \(t_*\). Since the covariance of the noisy observations is \[\begin{equation} \mathrm{Cov} (y_p,y_q) = K(t_p,t_q)+\sigma_n^2 \delta_{pq}, \end{equation}\] where \(\delta_{pq}\) is the Kronecker delta, equal to one if and only if \(p=q\) and zero otherwise, the joint prior distribution of the observed outputs \(\mathbf{y}\) and the predicted output \(f_*\) is \[\begin{equation} \begin{bmatrix} \mathbf{y}\\ f_*\end{bmatrix} \sim N \left( 0, \begin{bmatrix} K(\mathbf{t},\mathbf{t}) +\sigma_n^2 I& K(\mathbf{t},t_*) \\ K(t_*,\mathbf{t}) & K(t_*,t_*) \end{bmatrix} \right). \end{equation}\] The posterior distribution of the predicted value is obtained by conditioning on the observed data, \[\begin{equation} f_* \mid \mathbf{y},\mathbf{t},t_* \sim N\left(\bar{f_*},\mathrm{Cov} (f_*)\right), \end{equation}\] where \[\begin{align} \bar{f_*}&=\mathrm{E} \left( f_* \mid \mathbf{y},\mathbf{t},t_*\right) = K(t_*,\mathbf{t})\left( K(\mathbf{t},\mathbf{t})+\sigma_n^2 I\right) ^{-1}\mathbf{y},\\ \mathrm{Cov}(f_*)&=K(t_*,t_*)-K(t_*,\mathbf{t})\left( K(\mathbf{t},\mathbf{t})+\sigma_n^2 I\right) ^{-1}K(\mathbf{t},t_*). \end{align}\] It can therefore be seen that the Bayesian estimate of a smoothing spline is a special case of Gaussian process regression with a diffuse prior and covariance matrix \(R(s,t)\).
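
The posterior formulas above can be coded directly. Below is a minimal sketch with a squared-exponential kernel on simulated data; the kernel choice and the hyperparameters ell, tau and sigma_n are illustrative assumptions, not part of the original analysis.

# squared-exponential covariance between two input vectors
sqexpK <- function(s, t, ell=1, tau=1)
  tau^2 * exp(-outer(s, t, function(a,b) (a-b)^2)/(2*ell^2))

# posterior mean and covariance of f_* given noisy observations y at t_train
gpPosterior <- function(t_train, y, t_star, ell=1, tau=1, sigma_n=0.1){
  Ktt <- sqexpK(t_train, t_train, ell, tau)
  Kts <- sqexpK(t_train, t_star,  ell, tau)
  Kss <- sqexpK(t_star,  t_star,  ell, tau)
  A   <- solve(Ktt + sigma_n^2*diag(length(t_train)))
  list(mean = t(Kts) %*% A %*% y,         # \bar{f_*}
       cov  = Kss - t(Kts) %*% A %*% Kts) # Cov(f_*)
}

# toy usage on a noisy sine curve
set.seed(1)
tr   <- sort(runif(40, 0, 2*pi))
yv   <- sin(tr) + rnorm(40, sd=0.1)
ts   <- seq(0, 2*pi, length.out=200)
post <- gpPosterior(tr, yv, ts)
plot(tr, yv, pch=20)
lines(ts, drop(post$mean), col="red", lwd=2)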

Cheng et al. (2019) proposed an additive Gaussian process regression model for fitting longitudinal data in biomedical (biostatistical) research.

For example,

\[\begin{align*} \mbox{AGPM1: } y&=f^{(1)}(id)+\varepsilon \\ \mbox{AGPM2: } y&=f^{(1)}(id)+f^{(2)}(age)+f^{(3)}(id\times age)+\varepsilon \\ \mbox{AGPM3: } y&=f^{(1)}(id)+f^{(2)}(age)+f^{(3)}(id\times age)+f^{(4)}(loc)+f^{(5)}(loc\times age)+\varepsilon \end{align*}\]

knitr::include_graphics("agpm.png",dpi=100)

However, LonGP, a software package implementing additive Gaussian process regression, is only available in Matlab on Linux (or Mac) systems.

Here we propose another research topic: additive Gaussian process regression modelling of longitudinal agricultural data, together with an R package that statisticians can use on their own PCs.

References

Chen, Kefei, Rebecca A. O’Leary, and Fiona H. Evans. 2019. “A Simple and Parsimonious Generalised Additive Model for Predicting Wheat Yield in a Decision Support Tool.” Agricultural Systems 173: 140–50.

Cheng, Lu, Siddharth Ramchandran, Tommi Vatanen, Niina Lietzén, Riitta Lahesmaa, Aki Vehtari, and Harri Lähdesmäki. 2019. “An Additive Gaussian Process Regression Model for Interpretable Non-Parametric Analysis of Longitudinal Data.” Nature Communications 10 (1): 1798.

Meyer, Mary C. 2008. “Inference Using Shape-Restricted Regression Splines.” The Annals of Applied Statistics 2 (3): 1013–33.

———. 2013. “Semi-Parametric Additive Constrained Regression.” Journal of Nonparametric Statistics 25 (3): 715–30.

Pya, Natalya. 2010. “Additive Models with Shape Constraints.” PhD thesis, University of Bath.

Pya, Natalya, and Simon N. Wood. 2015. “Shape Constrained Additive Models.” Statistics and Computing 25 (3): 543–59.

Wood, Simon. 2015. “Package ‘mgcv’.” R package.

Wood, Simon N. 2006. Generalized Additive Models: An Introduction with R. Chapman and Hall/CRC.

Liao, Xiyue, and Mary C. Meyer. 2019. “cgam: An R Package for the Constrained Generalized Additive Model.” Journal of Statistical Software 89 (5). https://doi.org/10.18637/jss.v089.i05.