## HEADER ######################
## R code to validate SDMs
## using the PresenceAbsence package
## T.A.Groen (T.A.Groen@UTwente.com)
## May 2014

## NOTE(review): the path separators appear to have been lost when this
## script was copied; the original was most likely
## "D:\mod12\day-9_model validation\script" -- confirm the directory.
setwd("D:/mod12/day-9_model validation/script")

## LOAD THE REQUIRED PACKAGES #######
library("PresenceAbsence")
library("rpart")  # needed for rpart() further down
library("gbm")    # needed by the BRT helper functions sourced later

## Load the data (semicolon-separated CSV)
d <- read.csv("dataBase.csv", header = TRUE, sep = ";")
## ORGANIZE AND SPLIT THE DATA
## To split the data randomly, we're going to assign a
## random variable with a uniform distribution to the
## database, called "random", using the runif() function.
## runif() stands for r[ANDOM]unif[ORM] and returns
## a list of random numbers between 0 and 1. Other minima
## and maxima can be set using additional arguments.
## Check ?runif if you want to know more about this.
d$random <- runif(nrow(d))
train <- d[d$random < 0.70, ]   ## the training database (~70%)
evalu <- d[d$random >= 0.70, ]  ## the evaluation database (~30%)
## FIT THE MODELS (make sure all packages and functions are loaded)

## 1) LOGISTIC REGRESSION
## Fit the full model, then let step() perform stepwise AIC selection
## to drop uninformative predictors.
model.glm0 <- glm(Observation ~ tmean + tmed_ja + tr + south + west +
                    slope + alt + p_tot + p_win + p_sum,
                  family = "binomial", data = train)
model.glm <- step(model.glm0)

## 2) CLASSIFICATION TREE (requires the rpart package)
model.tree <- rpart(Observation ~ tmean + tmed_ja + tr + south + west +
                      slope + alt + p_tot + p_win + p_sum,
                    data = train, method = "class")
## 3) BOOSTED REGRESSION TREE
## Load the BRT helper functions (Elith & Leathwick's gbm wrappers).
source("brt.functions.R")
## gbm.x gives the COLUMN INDICES of the predictor variables in
## "train"; gbm.y is the column index of the response (Observation).
## NOTE(review): these indices are hard-coded -- confirm they still
## match the column order of dataBase.csv.
model.brt <- gbm.step(data=train,
gbm.x = c(10,9,7,11,5,12,36,21,20,22),
gbm.y = 1,
family = "bernoulli",
tree.complexity = 5,
learning.rate = 0.01,
bag.fraction = 0.5)
## CALCULATE THE PROBABILITIES FOR THE EVALUATION TRAININGSET
## we will now "predict" the new data for the points that
## were not used (30% randomly selected) and calculate
## how accurately the models can predict for these
## points.
## type="response" returns probabilities on the 0-1 scale rather
## than on the logit (link) scale.
evalu$p.logistic<-predict(model.glm,type="response",newdata=evalu)
## predict() on an rpart classification tree returns one probability
## column per class; [,2] keeps the second column -- presumably the
## probability of presence (class "1"). Verify the class ordering.
evalu$p.tree<-predict(model.tree,newdata=evalu)[,2]
## gbm.predict.grids() comes from the sourced brt.functions.R;
## want.grids = F returns plain point predictions instead of grids.
evalu$p.brt<-gbm.predict.grids(model.brt,evalu, want.grids = F)
## ASSESS THE VARIOUS EVALUATION CRITERIA (ROC, AUC, KAPPA ETC)
## THE "PresenceAbsence" PACKAGE HAS A NUMBER
## OF FUNCTIONS TO SHOW THESE. THESE FUNCTIONS MOST OF THE
## TIME REQUIRE THE DATA TO BE OFFERED AS A DATAFRAME WITH
## THE FOLLOWING STRUCTURE:
## DATA[,1] plot ID text
## DATA[,2] observed values zero-one values
## DATA[,3] predicted probabilities from first model numeric (between 0 and 1)
## DATA[,4] predicted probabilities from second model, etc...
## (SEE ALSO THE HELP INFORMATION FOR THE PresenceAbsence PACKAGE)
## SO LETS FIRST CREATE SUCH A DATAFRAME
pa.validate <- data.frame(ID = seq_len(nrow(evalu)),
                          PA = evalu$Observation,
                          logistic = evalu$p.logistic,
                          tree = evalu$p.tree,
                          brt = evalu$p.brt)
## QUESTION: Do you understand the code provided above to create a new data
## set called "pa.validate"? How many variables should this dataset
## contain?
## NOW WE CAN START EVALUATING THE MODELS BY USING THE FUNCTIONS
## THAT THE PresenceAbsence PACKAGE PROVIDES US WITH
presence.absence.accuracy(pa.validate, threshold = 0.5)
## SO BY SETTING A THRESHOLD YOU CAN ASK THIS COMMAND TO CALCULATE
## A LOT OF POSSIBLE EVALUATION PARAMETERS
## THERE ARE ALSO QUALITY PARAMETERS THAT DETERMINE THEIR
## OWN THRESHOLD (FOR EXAMPLE MAXIMUM KAPPA)
## YOU CAN USE THE FOLLOWING COMMAND FOR THAT:
optimal.thresholds(pa.validate)
## We can also have a look at relevant plots
error.threshold.plot(pa.validate, which.model = 1)
error.threshold.plot(pa.validate, which.model = 2)
error.threshold.plot(pa.validate, which.model = 3)
## the "which.model" argument specifies which
## of the models in the dataset should be plotted
auc.roc.plot(pa.validate)
## this function can plot ROC curves for all models
## at once, but you can also use the which.model argument
## when you want to select just one model.
save.image()
## ASSIGNMENT 1
## fit models for at least three of the techniques
## you have learned so far
## on a random subset of your data
## using a 50-50 distribution. Look at the accuracy indicators
## of these models when you use predictions based on
## the training data and also when you use the predictions
## based on evaluation data.
## QUESTION 1
## Which of the chosen modelling techniques performs best?
## QUESTION 2
## Which dataset (training or validation)
## gives the highest accuracy results?
## Which dataset of these gives the most honest
## assessment of the model quality?
## CASE STUDY ASSIGNMENT
## NOTE(review): the path separators appear to have been lost; the
## original was most likely
## "D:\mod12\day-13_casestudy\assignments\script" -- confirm.
setwd("D:/mod12/day-13_casestudy/assignments/script")
d <- read.csv("viper_database.csv", header = TRUE, sep = ",")
## Split into training (~70%) and evaluation (~30%) sets.
## NOTE(review): this assumes viper_database.csv already contains a
## "random" column; otherwise run d$random <- runif(nrow(d)) first.
train <- d[d$random < 0.70, ]
evalu <- d[d$random >= 0.70, ]  # fixed: the closing "]" was missing
## Logistic regression -- fitted on the TRAINING data so that "evalu"
## remains an independent evaluation set (the original fitted on the
## full database "d", which defeats the purpose of the split above).
my.glm <- glm(SPECIES ~ alt + geology + ecoclim + rad_w + temp_med_jun,
              data = train, family = "binomial")
## Boosted regression tree on the training data.
## NOTE(review): the column indices below are copied from the earlier
## exercise -- confirm they match the columns of viper_database.csv.
model.brt <- gbm.step(data = train,
                      gbm.x = c(10, 9, 7, 11, 5, 12, 36, 21, 20, 22),
                      gbm.y = 1,
                      family = "bernoulli",
                      tree.complexity = 5,
                      learning.rate = 0.01,
                      bag.fraction = 0.5)