1 Visualizing data (Chen, O’Leary, and Evans 2019)

NVTData <- read.table("NVTData.txt",header = TRUE)
plot(NVTData$Wavail,NVTData$yield,cex=0.1)

2 Fitting data

fitgam0 <- gam(yield ~ s(Wavail), data=NVTData)
plot(NVTData$Wavail,NVTData$yield,cex=0.1)
points(NVTData$Wavail,fitgam0$fitted.values,type="l")

3 Ordering data

datOR    <- NVTData[order(NVTData$Wavail),]
knotsNum <- length(table(datOR$Wavail)) # unique knots
print(knotsNum)
## [1] 362
fitgam1 <- gam(yield ~ s(Wavail), data=datOR)
plot(datOR$Wavail,datOR$yield,cex=0.1)
points(datOR$Wavail,fitgam1$fitted.values,type="l")

4 Comparison

fitgam2 <- gam(yield ~ s(Wavail,k=3),   data = datOR)
fitgam3 <- gam(yield ~ s(Wavail,k=100), data = datOR)
fitgam4 <- gam(yield ~ s(Wavail,k=300), data = datOR)
plot(datOR$Wavail,datOR$yield,cex=0.1)
points(datOR$Wavail,fitgam2$fitted.values,type="l",col="red")
points(datOR$Wavail,fitgam3$fitted.values,type="l",col="blue")
points(datOR$Wavail,fitgam4$fitted.values,type="l",col="green")

compTable <- AIC(fitgam0,fitgam1,fitgam2,fitgam3,fitgam4)
compTable$BIC <- BIC(fitgam0,fitgam1,fitgam2,fitgam3,fitgam4)$BIC
compTable$GCV <- c(fitgam0$gcv.ubre, fitgam1$gcv.ubre, fitgam2$gcv.ubre, fitgam3$gcv.ubre, fitgam4$gcv.ubre)
compTable$deviance <- c(fitgam0$deviance, fitgam1$deviance, fitgam2$deviance, fitgam3$deviance, fitgam4$deviance)
compTable$RMSE<- c(sqrt(mean((fitgam0$fitted.values-NVTData$yield)^2)), sqrt(mean((fitgam1$fitted.values-datOR$yield)^2)), sqrt(mean((fitgam2$fitted.values-datOR$yield)^2)), sqrt(mean((fitgam3$fitted.values-datOR$yield)^2)), sqrt(mean((fitgam4$fitted.values-datOR$yield)^2)))

print(compTable)
##                 df      AIC      BIC       GCV deviance      RMSE
## fitgam0  10.933138 30492.71 30573.87 0.6877164 8498.209 0.8286211
## fitgam1  10.933138 30492.71 30573.87 0.6877164 8498.209 0.8286211
## fitgam2   3.998908 30748.05 30777.73 0.7020508 8685.073 0.8376817
## fitgam3 100.266214 26895.97 27640.31 0.5143182 6264.017 0.7114080
## fitgam4 300.368361 14930.74 17160.55 0.1957113 2306.556 0.4316923

From the criteria, such as AIC, GCV, deviance and root mean squared error (RMSE), they indicate that the more knots are used in the model, the better it should be.

for(kts in 3:knotsNum){
  for(st in 1:5){  # 5-fold cross-validation
    s         <- seq(st,dim(dat3)[1],by=5)
    
    tempTrain <- datOR[-s,]
    tempTests <- datOR[s,]
    
    fitgam <- gam(yield ~ s(Wavail,k=kts), data=tempTrain)
    
    cv.train[kts,st] <- mean((fitgam$fitted.values-tempTrain$yield)^2)
    
    preTest <- predict(fitgam,data.frame(Wavail = tempTests$Wavail))
    
    cv.tests[kts,st] <- mean((preTest-tempTests$yield)^2)
  }
}
# aics   <- numeric(knotsNum)
# gcvs   <- numeric(knotsNum)
# mses   <- numeric(knotsNum)
for(kts in 3:knotsNum){
  fitgam    <- gam(yield ~ s(Wavail,k=kts), data=dat3)
  aics[kts] <- fitgam$aic
  gcvs[kts] <- fitgam$gcv.ubre
  mses[kts] <- sqrt(mean((fitgam$fitted.values-dat3$yield)^2))
}
cvtrain <- read.table("cvtrain.txt",header = TRUE)
cvtests <- read.table("cvtests.txt",header = TRUE)
par(mfrow=c(1,2));
plot(apply(cvtrain[-c(1,2),],1,mean),main="cv train");
plot(apply(cvtests[-c(1,2),],1,mean),main="cv test");

par(mfrow=c(1,1))

In the package mgcv, the spline method s() of the function gam() is the default penalized thin plate regression spline, which tends to give the best MSE performance (Wood 2003, 2006).

Zhu, Li, and Liang (2009) conclude that:

  1. \(R^2\) and \(RMSE\) have the same properties and perform worst in their simulation.
  2. The sample size has an effect on the performance \(AIC\), \(AIC_c\) and \(BIC\).
  3. \(AIC\) is not suitable to use in small samples and tends to select more complex model when the sample size becomes large.
  4. \(AIC_c\) and \(BIC\) have best performance in small and large sample cases, respectively.

Here is the contradiction:

The optimal model, which is found by either AIC(), BIC(), etc., is not the best we want (by visualization).

5 Potential solutions

5.1 Aggregating data

datOR.agg <- aggregate(datOR[c("lat","longi","Wavail","yield")],list(datOR$Wavail),mean)

plot(datOR$Wavail,datOR$yield,cex=0.1,pch=20)
points(datOR.agg$Wavail,datOR.agg$yield,pch=20,col="red")

# aics.agg <- numeric(knotsNum)
# gcvs.agg <- numeric(knotsNum)
# divs.agg <- numeric(knotsNum)
# mses.agg <- numeric(knotsNum)
for(kts in 3:knotsNum){
  fitgam.agg    <- gam(yield ~ s(Wavail,k=kts), data=datOR.agg)
  aics.agg[kts] <- fitgam.agg$aic
  gcvs.agg[kts] <- fitgam.agg$gcv.ubre
  divs.agg[kts] <- fitgam.agg$deviance
  mses.agg[kts] <- sqrt(mean((fitgam.agg$fitted.values-datOR.agg$yield)^2))
}
par(mfrow=c(2,2))
plot(aics.agg[-c(1:2)],type="b")
plot(gcvs.agg[-c(1:2)],type="b")
plot(divs.agg[-c(1:2)],type="b")
plot(mses.agg[-c(1:2)],type="b")

par(mfrow=c(1,1))
fitgam.agg1 <- gam(yield ~ s(Wavail,k=3), data=datOR.agg)
fitgam.agg2 <- gam(yield ~ s(Wavail,k=5), data=datOR.agg)
fitgam.agg3 <- gam(yield ~ s(Wavail,k=100), data=datOR.agg)
fitgam.agg4 <- gam(yield ~ s(Wavail,k=300), data=datOR.agg)

plot(datOR.agg$Wavail,datOR.agg$yield,cex=0.1,pch=20)
points(datOR.agg$Wavail,fitgam.agg1$fitted.values,type="l",col="red",lwd=2)
points(datOR.agg$Wavail,fitgam.agg2$fitted.values,type = "l",col="blue",lwd=2)
points(datOR.agg$Wavail,fitgam.agg3$fitted.values,type = "l",col="green",lwd=2)
points(datOR.agg$Wavail,fitgam.agg4$fitted.values,type = "l",col="brown",lwd=2)

compTable.agg <- AIC(fitgam.agg1,fitgam.agg2,fitgam.agg3,fitgam.agg4)
compTable.agg$BIC <- BIC(fitgam.agg1,fitgam.agg2,fitgam.agg3,fitgam.agg4)$BIC
compTable.agg$GCV <- c(fitgam.agg1$gcv.ubre, fitgam.agg2$gcv.ubre, fitgam.agg3$gcv.ubre, fitgam.agg4$gcv.ubre)
compTable.agg$deviance <- c(fitgam.agg1$deviance, fitgam.agg2$deviance, fitgam.agg3$deviance, fitgam.agg4$deviance)
compTable.agg$RMSE<- c(sqrt(mean((fitgam.agg1$fitted.values-datOR.agg$yield)^2)), sqrt(mean((fitgam.agg2$fitted.values-datOR.agg$yield)^2)), sqrt(mean((fitgam.agg3$fitted.values-datOR.agg$yield)^2)), sqrt(mean((fitgam.agg4$fitted.values-datOR.agg$yield)^2)))
print(compTable.agg)
##                   df      AIC      BIC       GCV deviance      RMSE
## fitgam.agg1 3.981404 860.8931 876.3873 0.6280246 223.6155 0.7859532
## fitgam.agg2 5.259226 860.9008 881.3679 0.6280828 222.0472 0.7831921
## fitgam.agg3 5.549870 861.3912 882.9893 0.6289465 221.9914 0.7830937
## fitgam.agg4 5.549814 861.3912 882.9891 0.6289465 221.9914 0.7830939

Drawbacks: some valuable data are missing; aggregated by mean() might not be the best option.

5.2 Weighted values

datOR.wts <- datOR
plot(datOR$Wavail,datOR$yield,cex=0.5)
points(datOR$Wavail,fitgam2$fitted.values,type="l",col="red",lwd=2)

sres0<-(datOR$yield - fitgam2$fitted.values)^2
sres <- 1/sres0
datOR.wts$Weights <- 0
wavs <- names(table(datOR.wts$Wavail))

for(k in wavs){
  wh <- which(datOR.wts$Wavail == k)
  datOR.wts$Weights[wh] <- sres[wh]/sum(sres[wh])
}

fitgam.wts1 <- gam(yield ~ s(Wavail,k=3),weights = datOR.wts$Weights, data=datOR.wts)
fitgam.wts2 <- gam(yield ~ s(Wavail,k=5),weights = datOR.wts$Weights, data=datOR.wts)
fitgam.wts3 <- gam(yield ~ s(Wavail,k=100),weights = datOR.wts$Weights, data=datOR.wts)
fitgam.wts4 <- gam(yield ~ s(Wavail,k=300),weights = datOR.wts$Weights, data=datOR.wts)

plot(datOR.wts$Wavail,datOR.wts$yield,cex=0.1,pch=20)
points(datOR.wts$Wavail,fitgam.wts1$fitted.values,type = "l",col="red",lwd=2)
points(datOR.wts$Wavail,fitgam.wts2$fitted.values,type = "l",col="blue",lwd=2)
points(datOR.wts$Wavail,fitgam.wts3$fitted.values,type = "l",col="green",lwd=2)
points(datOR.wts$Wavail,fitgam.wts4$fitted.values,type = "l",col="brown",lwd=2)

AIC(fitgam.wts1,fitgam.wts2,fitgam.wts3,fitgam.wts4)
##                     df      AIC
## fitgam.wts1   3.999472 49964.63
## fitgam.wts2   5.995467 49841.01
## fitgam.wts3 100.426600 45655.24
## fitgam.wts4 300.412109 31041.80
BIC(fitgam.wts1,fitgam.wts2,fitgam.wts3,fitgam.wts4)
##                     df      BIC
## fitgam.wts1   3.999472 49994.32
## fitgam.wts2   5.995467 49885.51
## fitgam.wts3 100.426600 46400.77
## fitgam.wts4 300.412109 33271.94

5.3 Taking a subset

set.seed(2019)
datOR.sample <- NULL
for(k in wavs){
  tempdat<- datOR[which(datOR$Wavail == k),]
  s<- sort(sample(1:nrow(tempdat),1))
  datOR.sample<-rbind(datOR.sample,tempdat[s,])
}

fitgam.sample <- gam(yield  ~ s(Wavail,k=300), data=datOR.sample)
plot(datOR.sample$Wavail,datOR.sample$yield,cex=0.5)
points(datOR.sample$Wavail,fitgam.sample$fitted.values,type = "l")

AIC(fitgam.sample)
## [1] 884.1104
fitgam.sample1 <- gam(yield ~ s(Wavail,k=3), data=datOR.sample)
fitgam.sample2 <- gam(yield ~ s(Wavail,k=5), data=datOR.sample)
fitgam.sample3 <- gam(yield ~ s(Wavail,k=100), data=datOR.sample)
fitgam.sample4 <- gam(yield ~ s(Wavail,k=300), data=datOR.sample)

plot(datOR.sample$Wavail,datOR.sample$yield,cex=0.1,pch=20)
points(datOR.sample$Wavail,fitgam.sample1$fitted.values,type = "l",col="red",lwd=2)
points(datOR.sample$Wavail,fitgam.sample2$fitted.values,type = "l",col="blue",lwd=2)
points(datOR.sample$Wavail,fitgam.sample3$fitted.values,type = "l",col="green",lwd=2)
points(datOR.sample$Wavail,fitgam.sample4$fitted.values,type = "l",col="brown",lwd=2)

AIC(fitgam.sample1,fitgam.sample2,fitgam.sample3,fitgam.sample4)
##                      df      AIC
## fitgam.sample1 3.980366 884.9641
## fitgam.sample2 5.404610 884.2511
## fitgam.sample3 6.515357 884.1104
## fitgam.sample4 6.515493 884.1104
BIC(fitgam.sample1,fitgam.sample2,fitgam.sample3,fitgam.sample4)
##                      df      BIC
## fitgam.sample1 3.980366 900.4543
## fitgam.sample2 5.404610 905.2840
## fitgam.sample3 6.515357 909.4659
## fitgam.sample4 6.515493 909.4664

5.3.1 A naive resampling approach

N <- 30
boost.max1 <- matrix(0,N,362)
boost.max2 <- matrix(0,N,362)
boost.max3 <- matrix(0,N,362)
boost.max4 <- matrix(0,N,362)
for(i in 1:N){
  datOR.boost <- NULL
  for(k in wavs){
    tempdat<- datOR[which(datOR$Wavail == k),]
    s<- sort(sample(1:nrow(tempdat),1))
    datOR.boost <- rbind(datOR.boost,tempdat[s,])
  }
  fitgam.boost1 <- gam(yield ~ s(Wavail,k=3),   data=datOR.boost)
  fitgam.boost2 <- gam(yield ~ s(Wavail,k=5),   data=datOR.boost)
  fitgam.boost3 <- gam(yield ~ s(Wavail,k=100), data=datOR.boost)
  fitgam.boost4 <- gam(yield ~ s(Wavail,k=300), data=datOR.boost)
  
  boost.max1[i,] <- fitgam.boost1$fitted.values
  boost.max2[i,] <- fitgam.boost2$fitted.values
  boost.max3[i,] <- fitgam.boost3$fitted.values
  boost.max4[i,] <- fitgam.boost4$fitted.values
  cat(i);
}
plot(datOR$Wavail,datOR$yield,cex=0.1)
points(datOR$Wavail,fitgam2$fitted.values,type="l",lwd=2)
points(datOR.sample$Wavail,apply(boost.max1,2,mean),type="l",col="red")
points(datOR.sample$Wavail,apply(boost.max2,2,mean),type="l",col="blue")
points(datOR.sample$Wavail,apply(boost.max3,2,mean),type="l",col="green")
points(datOR.sample$Wavail,apply(boost.max4,2,mean),type="l",col="brown")

6 A question

Is it worth spending some time on finding an approach, which leads to the consistency: the optimal is the best, for large size longitudinal data?

References

Chen, Kefei, Rebecca A. O’Leary, and Fiona H. Evans. 2019. “A Simple and Parsimonious Generalised Additive Model for Predicting Wheat Yield in a Decision Support Tool.” Agricultural Systems 173: 140–50.

Wood, Simon N. 2003. “Thin Plate Regression Splines.” Journal of the Royal Statistical Society: Series B (Statistical Methodology) 65 (1): 95–114.

Wood, S. N. 2006. “Generalized Additive Models: An Introduction with R.”

Zhu, Lixin, Lifang Li, and Zhenlin Liang. 2009. “Comparison of Six Statistical Approaches in the Selection of Appropriate Fish Growth Models.” Chinese Journal of Oceanology and Limnology 27 (3): 457. https://doi.org/10.1007/s00343-009-9236-6.