NVTData <- read.table("NVTData.txt",header = TRUE)
plot(NVTData$Wavail,NVTData$yield,cex=0.1)
fitgam0 <- gam(yield ~ s(Wavail), data=NVTData)
plot(NVTData$Wavail,NVTData$yield,cex=0.1)
points(NVTData$Wavail,fitgam0$fitted.values,type="l")
datOR <- NVTData[order(NVTData$Wavail),]
knotsNum <- length(table(datOR$Wavail)) # unique knots
print(knotsNum)
## [1] 362
fitgam1 <- gam(yield ~ s(Wavail), data=datOR)
plot(datOR$Wavail,datOR$yield,cex=0.1)
points(datOR$Wavail,fitgam1$fitted.values,type="l")
fitgam2 <- gam(yield ~ s(Wavail,k=3), data = datOR)
fitgam3 <- gam(yield ~ s(Wavail,k=100), data = datOR)
fitgam4 <- gam(yield ~ s(Wavail,k=300), data = datOR)
plot(datOR$Wavail,datOR$yield,cex=0.1)
points(datOR$Wavail,fitgam2$fitted.values,type="l",col="red")
points(datOR$Wavail,fitgam3$fitted.values,type="l",col="blue")
points(datOR$Wavail,fitgam4$fitted.values,type="l",col="green")
compTable <- AIC(fitgam0,fitgam1,fitgam2,fitgam3,fitgam4)
compTable$BIC <- BIC(fitgam0,fitgam1,fitgam2,fitgam3,fitgam4)$BIC
compTable$GCV <- c(fitgam0$gcv.ubre, fitgam1$gcv.ubre, fitgam2$gcv.ubre, fitgam3$gcv.ubre, fitgam4$gcv.ubre)
compTable$deviance <- c(fitgam0$deviance, fitgam1$deviance, fitgam2$deviance, fitgam3$deviance, fitgam4$deviance)
compTable$RMSE<- c(sqrt(mean((fitgam0$fitted.values-NVTData$yield)^2)), sqrt(mean((fitgam1$fitted.values-datOR$yield)^2)), sqrt(mean((fitgam2$fitted.values-datOR$yield)^2)), sqrt(mean((fitgam3$fitted.values-datOR$yield)^2)), sqrt(mean((fitgam4$fitted.values-datOR$yield)^2)))
print(compTable)
## df AIC BIC GCV deviance RMSE
## fitgam0 10.933138 30492.71 30573.87 0.6877164 8498.209 0.8286211
## fitgam1 10.933138 30492.71 30573.87 0.6877164 8498.209 0.8286211
## fitgam2 3.998908 30748.05 30777.73 0.7020508 8685.073 0.8376817
## fitgam3 100.266214 26895.97 27640.31 0.5143182 6264.017 0.7114080
## fitgam4 300.368361 14930.74 17160.55 0.1957113 2306.556 0.4316923
From the criteria, such as AIC, GCV, deviance and root mean squared error (RMSE), they indicate that the more knots are used in the model, the better it should be.
for(kts in 3:knotsNum){
for(st in 1:5){ # 5-fold cross-validation
s <- seq(st,dim(dat3)[1],by=5)
tempTrain <- datOR[-s,]
tempTests <- datOR[s,]
fitgam <- gam(yield ~ s(Wavail,k=kts), data=tempTrain)
cv.train[kts,st] <- mean((fitgam$fitted.values-tempTrain$yield)^2)
preTest <- predict(fitgam,data.frame(Wavail = tempTests$Wavail))
cv.tests[kts,st] <- mean((preTest-tempTests$yield)^2)
}
}
# aics <- numeric(knotsNum)
# gcvs <- numeric(knotsNum)
# mses <- numeric(knotsNum)
for(kts in 3:knotsNum){
fitgam <- gam(yield ~ s(Wavail,k=kts), data=dat3)
aics[kts] <- fitgam$aic
gcvs[kts] <- fitgam$gcv.ubre
mses[kts] <- sqrt(mean((fitgam$fitted.values-dat3$yield)^2))
}
cvtrain <- read.table("cvtrain.txt",header = TRUE)
cvtests <- read.table("cvtests.txt",header = TRUE)
par(mfrow=c(1,2));
plot(apply(cvtrain[-c(1,2),],1,mean),main="cv train");
plot(apply(cvtests[-c(1,2),],1,mean),main="cv test");
par(mfrow=c(1,1))
In the package mgcv
, the spline method s()
of the function gam()
is the default penalized thin plate regression spline, which tends to give the best MSE performance (Wood 2003, 2006).
Zhu, Li, and Liang (2009) conclude that:
Here is the contradiction:
The optimal model, which is found by either AIC()
, BIC()
, etc., is not the best we want (by visualization).
datOR.agg <- aggregate(datOR[c("lat","longi","Wavail","yield")],list(datOR$Wavail),mean)
plot(datOR$Wavail,datOR$yield,cex=0.1,pch=20)
points(datOR.agg$Wavail,datOR.agg$yield,pch=20,col="red")
# aics.agg <- numeric(knotsNum)
# gcvs.agg <- numeric(knotsNum)
# divs.agg <- numeric(knotsNum)
# mses.agg <- numeric(knotsNum)
for(kts in 3:knotsNum){
fitgam.agg <- gam(yield ~ s(Wavail,k=kts), data=datOR.agg)
aics.agg[kts] <- fitgam.agg$aic
gcvs.agg[kts] <- fitgam.agg$gcv.ubre
divs.agg[kts] <- fitgam.agg$deviance
mses.agg[kts] <- sqrt(mean((fitgam.agg$fitted.values-datOR.agg$yield)^2))
}
par(mfrow=c(2,2))
plot(aics.agg[-c(1:2)],type="b")
plot(gcvs.agg[-c(1:2)],type="b")
plot(divs.agg[-c(1:2)],type="b")
plot(mses.agg[-c(1:2)],type="b")
par(mfrow=c(1,1))
fitgam.agg1 <- gam(yield ~ s(Wavail,k=3), data=datOR.agg)
fitgam.agg2 <- gam(yield ~ s(Wavail,k=5), data=datOR.agg)
fitgam.agg3 <- gam(yield ~ s(Wavail,k=100), data=datOR.agg)
fitgam.agg4 <- gam(yield ~ s(Wavail,k=300), data=datOR.agg)
plot(datOR.agg$Wavail,datOR.agg$yield,cex=0.1,pch=20)
points(datOR.agg$Wavail,fitgam.agg1$fitted.values,type="l",col="red",lwd=2)
points(datOR.agg$Wavail,fitgam.agg2$fitted.values,type = "l",col="blue",lwd=2)
points(datOR.agg$Wavail,fitgam.agg3$fitted.values,type = "l",col="green",lwd=2)
points(datOR.agg$Wavail,fitgam.agg4$fitted.values,type = "l",col="brown",lwd=2)
compTable.agg <- AIC(fitgam.agg1,fitgam.agg2,fitgam.agg3,fitgam.agg4)
compTable.agg$BIC <- BIC(fitgam.agg1,fitgam.agg2,fitgam.agg3,fitgam.agg4)$BIC
compTable.agg$GCV <- c(fitgam.agg1$gcv.ubre, fitgam.agg2$gcv.ubre, fitgam.agg3$gcv.ubre, fitgam.agg4$gcv.ubre)
compTable.agg$deviance <- c(fitgam.agg1$deviance, fitgam.agg2$deviance, fitgam.agg3$deviance, fitgam.agg4$deviance)
compTable.agg$RMSE<- c(sqrt(mean((fitgam.agg1$fitted.values-datOR.agg$yield)^2)), sqrt(mean((fitgam.agg2$fitted.values-datOR.agg$yield)^2)), sqrt(mean((fitgam.agg3$fitted.values-datOR.agg$yield)^2)), sqrt(mean((fitgam.agg4$fitted.values-datOR.agg$yield)^2)))
print(compTable.agg)
## df AIC BIC GCV deviance RMSE
## fitgam.agg1 3.981404 860.8931 876.3873 0.6280246 223.6155 0.7859532
## fitgam.agg2 5.259226 860.9008 881.3679 0.6280828 222.0472 0.7831921
## fitgam.agg3 5.549870 861.3912 882.9893 0.6289465 221.9914 0.7830937
## fitgam.agg4 5.549814 861.3912 882.9891 0.6289465 221.9914 0.7830939
Drawbacks: some valuable data are missing; aggregated by mean()
might not be the best option.
datOR.wts <- datOR
plot(datOR$Wavail,datOR$yield,cex=0.5)
points(datOR$Wavail,fitgam2$fitted.values,type="l",col="red",lwd=2)
sres0<-(datOR$yield - fitgam2$fitted.values)^2
sres <- 1/sres0
datOR.wts$Weights <- 0
wavs <- names(table(datOR.wts$Wavail))
for(k in wavs){
wh <- which(datOR.wts$Wavail == k)
datOR.wts$Weights[wh] <- sres[wh]/sum(sres[wh])
}
fitgam.wts1 <- gam(yield ~ s(Wavail,k=3),weights = datOR.wts$Weights, data=datOR.wts)
fitgam.wts2 <- gam(yield ~ s(Wavail,k=5),weights = datOR.wts$Weights, data=datOR.wts)
fitgam.wts3 <- gam(yield ~ s(Wavail,k=100),weights = datOR.wts$Weights, data=datOR.wts)
fitgam.wts4 <- gam(yield ~ s(Wavail,k=300),weights = datOR.wts$Weights, data=datOR.wts)
plot(datOR.wts$Wavail,datOR.wts$yield,cex=0.1,pch=20)
points(datOR.wts$Wavail,fitgam.wts1$fitted.values,type = "l",col="red",lwd=2)
points(datOR.wts$Wavail,fitgam.wts2$fitted.values,type = "l",col="blue",lwd=2)
points(datOR.wts$Wavail,fitgam.wts3$fitted.values,type = "l",col="green",lwd=2)
points(datOR.wts$Wavail,fitgam.wts4$fitted.values,type = "l",col="brown",lwd=2)
AIC(fitgam.wts1,fitgam.wts2,fitgam.wts3,fitgam.wts4)
## df AIC
## fitgam.wts1 3.999472 49964.63
## fitgam.wts2 5.995467 49841.01
## fitgam.wts3 100.426600 45655.24
## fitgam.wts4 300.412109 31041.80
BIC(fitgam.wts1,fitgam.wts2,fitgam.wts3,fitgam.wts4)
## df BIC
## fitgam.wts1 3.999472 49994.32
## fitgam.wts2 5.995467 49885.51
## fitgam.wts3 100.426600 46400.77
## fitgam.wts4 300.412109 33271.94
set.seed(2019)
datOR.sample <- NULL
for(k in wavs){
tempdat<- datOR[which(datOR$Wavail == k),]
s<- sort(sample(1:nrow(tempdat),1))
datOR.sample<-rbind(datOR.sample,tempdat[s,])
}
fitgam.sample <- gam(yield ~ s(Wavail,k=300), data=datOR.sample)
plot(datOR.sample$Wavail,datOR.sample$yield,cex=0.5)
points(datOR.sample$Wavail,fitgam.sample$fitted.values,type = "l")
AIC(fitgam.sample)
## [1] 884.1104
fitgam.sample1 <- gam(yield ~ s(Wavail,k=3), data=datOR.sample)
fitgam.sample2 <- gam(yield ~ s(Wavail,k=5), data=datOR.sample)
fitgam.sample3 <- gam(yield ~ s(Wavail,k=100), data=datOR.sample)
fitgam.sample4 <- gam(yield ~ s(Wavail,k=300), data=datOR.sample)
plot(datOR.sample$Wavail,datOR.sample$yield,cex=0.1,pch=20)
points(datOR.sample$Wavail,fitgam.sample1$fitted.values,type = "l",col="red",lwd=2)
points(datOR.sample$Wavail,fitgam.sample2$fitted.values,type = "l",col="blue",lwd=2)
points(datOR.sample$Wavail,fitgam.sample3$fitted.values,type = "l",col="green",lwd=2)
points(datOR.sample$Wavail,fitgam.sample4$fitted.values,type = "l",col="brown",lwd=2)
AIC(fitgam.sample1,fitgam.sample2,fitgam.sample3,fitgam.sample4)
## df AIC
## fitgam.sample1 3.980366 884.9641
## fitgam.sample2 5.404610 884.2511
## fitgam.sample3 6.515357 884.1104
## fitgam.sample4 6.515493 884.1104
BIC(fitgam.sample1,fitgam.sample2,fitgam.sample3,fitgam.sample4)
## df BIC
## fitgam.sample1 3.980366 900.4543
## fitgam.sample2 5.404610 905.2840
## fitgam.sample3 6.515357 909.4659
## fitgam.sample4 6.515493 909.4664
N <- 30
boost.max1 <- matrix(0,N,362)
boost.max2 <- matrix(0,N,362)
boost.max3 <- matrix(0,N,362)
boost.max4 <- matrix(0,N,362)
for(i in 1:N){
datOR.boost <- NULL
for(k in wavs){
tempdat<- datOR[which(datOR$Wavail == k),]
s<- sort(sample(1:nrow(tempdat),1))
datOR.boost <- rbind(datOR.boost,tempdat[s,])
}
fitgam.boost1 <- gam(yield ~ s(Wavail,k=3), data=datOR.boost)
fitgam.boost2 <- gam(yield ~ s(Wavail,k=5), data=datOR.boost)
fitgam.boost3 <- gam(yield ~ s(Wavail,k=100), data=datOR.boost)
fitgam.boost4 <- gam(yield ~ s(Wavail,k=300), data=datOR.boost)
boost.max1[i,] <- fitgam.boost1$fitted.values
boost.max2[i,] <- fitgam.boost2$fitted.values
boost.max3[i,] <- fitgam.boost3$fitted.values
boost.max4[i,] <- fitgam.boost4$fitted.values
cat(i);
}
plot(datOR$Wavail,datOR$yield,cex=0.1)
points(datOR$Wavail,fitgam2$fitted.values,type="l",lwd=2)
points(datOR.sample$Wavail,apply(boost.max1,2,mean),type="l",col="red")
points(datOR.sample$Wavail,apply(boost.max2,2,mean),type="l",col="blue")
points(datOR.sample$Wavail,apply(boost.max3,2,mean),type="l",col="green")
points(datOR.sample$Wavail,apply(boost.max4,2,mean),type="l",col="brown")
Is it worth spending some time on finding an approach, which leads to the consistency: the optimal is the best, for large size longitudinal data?
Chen, Kefei, Rebecca A. O’Leary, and Fiona H. Evans. 2019. “A Simple and Parsimonious Generalised Additive Model for Predicting Wheat Yield in a Decision Support Tool.” Agricultural Systems 173: 140–50.
Wood, Simon N. 2003. “Thin Plate Regression Splines.” Journal of the Royal Statistical Society: Series B (Statistical Methodology) 65 (1): 95–114.
Wood, S. N. 2006. “Generalized Additive Models: An Introduction with R.”
Zhu, Lixin, Lifang Li, and Zhenlin Liang. 2009. “Comparison of Six Statistical Approaches in the Selection of Appropriate Fish Growth Models.” Chinese Journal of Oceanology and Limnology 27 (3): 457. https://doi.org/10.1007/s00343-009-9236-6.