SlideShare a Scribd company logo
1 of 2
Download to read offline
## HEADER ######################
## R CODE TO Validate SDM's
## Using the PresenceAbsence package
## T.A.Groen (T.A.Groen@UTwente.com)
## May 2014

## Set the working directory.
## NOTE(review): the original path had its separators stripped during
## extraction ("D:mod12day-9_model validationscript"); reconstructed
## with forward slashes -- confirm against the actual folder layout.
setwd("D:/mod12/day-9_model validation/script")

## LOAD THE REQUIRED PACKAGES #######
library("PresenceAbsence")  # evaluation metrics: AUC, kappa, thresholds
library("rpart")            # BUG FIX: rpart() is called below but was never loaded

## Load the data: one row per plot, semicolon-separated, with a 0/1
## "Observation" response column plus environmental predictor columns.
d <- read.csv("dataBase.csv", header = TRUE, sep = ";")
## ORGANIZE AND SPLIT THE DATA
## To split the data randomly, we assign a uniformly distributed
## random variable, called "random", to the database using runif().
## runif() stands for r[ANDOM]unif[ORM] and returns a vector of
## random numbers between 0 and 1. Other minima and maxima can be
## set using additional arguments. Check ?runif for more details.
set.seed(2014)  # make the random 70/30 split reproducible across runs
d$random <- runif(nrow(d))  # one draw per row (was length(d$Observation))
train <- d[d$random < 0.70, ]   # the training database (~70%)
evalu <- d[d$random >= 0.70, ]  # the evaluation database (~30%)
## NOTE(review): the original repeated the two assignments above
## verbatim; the redundant duplicates were removed.
## FIT THE MODELS (make sure all packages and functions are loaded)

## 1) LOGISTIC REGRESSION
## Fit a full binomial GLM on the training data, then let step()
## prune it with AIC-based stepwise selection.
model.glm0 <- glm(
  Observation ~ tmean + tmed_ja + tr + south + west + slope + alt +
    p_tot + p_win + p_sum,
  family = "binomial", data = train
)
model.glm <- step(model.glm0)

## 2) REGRESSION TREE
## method = "class" fits a classification tree on the 0/1 response.
model.tree <- rpart(
  Observation ~ tmean + tmed_ja + tr + south + west + slope + alt +
    p_tot + p_win + p_sum,
  data = train, method = "class"
)

## 3) BOOSTED REGRESSION TREE
## brt.functions.R provides gbm.step() and gbm.predict.grids()
## (BRT helper functions built on the gbm package).
## NOTE(review): the original source()d this file twice, once
## misplaced under the LOGISTIC section; one call is enough.
source("brt.functions.R")
model.brt <- gbm.step(
  data = train,
  gbm.x = c(10, 9, 7, 11, 5, 12, 36, 21, 20, 22),  # predictor column indices
  gbm.y = 1,                 # response column ("Observation")
  family = "bernoulli",      # 0/1 presence-absence response
  tree.complexity = 5,       # max interaction depth per tree
  learning.rate = 0.01,
  bag.fraction = 0.5
)
## CALCULATE THE PROBABILITIES FOR THE EVALUATION SET
## We now "predict" for the points that were not used in fitting
## (the ~30% randomly selected rows) so we can later assess how
## accurately each model predicts these independent points.
## GLM: type="response" returns probabilities on the 0-1 scale
## (rather than on the default link/logit scale).
evalu$p.logistic<-predict(model.glm,type="response",newdata=evalu)
## rpart with method="class": predict() returns a matrix of class
## probabilities; column 2 is presumably the "presence" (1) class --
## confirm against the factor level order of Observation.
evalu$p.tree<-predict(model.tree,newdata=evalu)[,2]
## gbm.predict.grids() comes from brt.functions.R; want.grids = F
## apparently returns predictions directly instead of writing grid
## files. NOTE(review): prefer FALSE over F (F can be reassigned).
evalu$p.brt<-gbm.predict.grids(model.brt,evalu, want.grids = F)
## ASSESS THE VARIOUS EVALUATION CRITERIA (ROC, AUC, KAPPA, ETC.)
## The "PresenceAbsence" package has a number of functions to show
## these. Most of them require the data as a data frame with the
## following structure:
##   DATA[,1] plot ID                                    text
##   DATA[,2] observed values                            zero-one values
##   DATA[,3] predicted probabilities from first model   numeric in [0, 1]
##   DATA[,4] predicted probabilities from second model, etc...
## (see also the help information for the PresenceAbsence package)
## So let's first create such a data frame.
pa.validate <- data.frame(
  ID = seq_len(nrow(evalu)),    # safe even when evalu has zero rows
  PA = evalu$Observation,       # observed presence/absence (0/1)
  logistic = evalu$p.logistic,  # stepwise-GLM probabilities
  tree = evalu$p.tree,          # rpart class probabilities
  brt = evalu$p.brt             # boosted-regression-tree predictions
)
## QUESTION: Do you understand the code provided above to create a new
## dataset called "pa.validate"? How many variables should this
## dataset contain?

## NOW WE CAN START EVALUATING THE MODELS BY USING THE FUNCTIONS
## THAT THE PresenceAbsence PACKAGE PROVIDES US WITH
presence.absence.accuracy(pa.validate, threshold = 0.5)
## By setting a threshold you can ask presence.absence.accuracy() to
## calculate a lot of possible evaluation parameters. There are also
## quality parameters that determine their own threshold (for example
## maximum kappa); you can use the following command for that:
optimal.thresholds(pa.validate)

## We can also have a look at relevant plots. The "which.model"
## argument specifies which of the models in the dataset is plotted;
## here we plot all three (logistic, tree, brt) in turn.
for (model.nr in 1:3) {
  error.threshold.plot(pa.validate, which.model = model.nr)
}

## auc.roc.plot() can draw ROC curves for all models at once, but you
## can also use the which.model argument to select just one model.
auc.roc.plot(pa.validate)

## Save the entire workspace so results survive this session.
save.image()
## ASSIGNMENT 1
## Fit models for at least three of the techniques you have learned
## so far on a random subset of your data using a 50-50 distribution.
## Look at the accuracy indicators of these models when you use
## predictions based on the training data and also when you use the
## predictions based on evaluation data.
## QUESTION 1
## Which of the chosen modelling techniques performs best?
## QUESTION 2
## Which dataset (training or validation) gives the highest accuracy
## results? Which dataset of these gives the most honest assessment
## of the model quality?

## NOTE(review): the original path had its separators stripped during
## extraction ("D:mod12day-13_casestudyassignmentsscript");
## reconstructed with forward slashes -- confirm the real layout.
setwd("D:/mod12/day-13_casestudy/assignments/script")
d <- read.csv("viper_database.csv", header = TRUE, sep = ",")
train <- d[d$random < 0.70, ]
## BUG FIX: the original line was missing the closing "]", which made
## everything after it a syntax error.
evalu <- d[d$random >= 0.70, ]
## NOTE(review): this GLM is fitted on the full dataset "d" rather
## than on "train" -- possibly intentional for the assignment, but
## confirm; also changed "=" to the conventional "<-" assignment.
my.glm <- glm(SPECIES ~ alt + geology + ecoclim + rad_w + temp_med_jun,
              data = d, family = "binomial")
## NOTE(review): the gbm.x column indices below were copied verbatim
## from the earlier dataBase.csv example -- verify that they match the
## columns of viper_database.csv and that column 1 (gbm.y) really is
## the response.
model.brt <- gbm.step(
  data = train,
  gbm.x = c(10, 9, 7, 11, 5, 12, 36, 21, 20, 22),
  gbm.y = 1,
  family = "bernoulli",
  tree.complexity = 5,
  learning.rate = 0.01,
  bag.fraction = 0.5
)

More Related Content

What's hot

Webmontag Berlin "coffee script"
Webmontag Berlin "coffee script"Webmontag Berlin "coffee script"
Webmontag Berlin "coffee script"Webmontag Berlin
 
Perl 5.28 new features
Perl 5.28 new featuresPerl 5.28 new features
Perl 5.28 new featuresbrian d foy
 
Ruby on Rails testing with Rspec
Ruby on Rails testing with RspecRuby on Rails testing with Rspec
Ruby on Rails testing with RspecBunlong Van
 
Perl v5.26 Features (AmsterdamX.pm)
Perl v5.26 Features (AmsterdamX.pm)Perl v5.26 Features (AmsterdamX.pm)
Perl v5.26 Features (AmsterdamX.pm)brian d foy
 
Переход на Scala: босиком по граблям
Переход на Scala: босиком по граблямПереход на Scala: босиком по граблям
Переход на Scala: босиком по граблямSveta Bozhko
 
6 more things about Perl 6
6 more things about Perl 66 more things about Perl 6
6 more things about Perl 6brian d foy
 

What's hot (9)

Webmontag Berlin "coffee script"
Webmontag Berlin "coffee script"Webmontag Berlin "coffee script"
Webmontag Berlin "coffee script"
 
Perl6 grammars
Perl6 grammarsPerl6 grammars
Perl6 grammars
 
Perl 5.28 new features
Perl 5.28 new featuresPerl 5.28 new features
Perl 5.28 new features
 
Wp query
Wp queryWp query
Wp query
 
Ruby on Rails testing with Rspec
Ruby on Rails testing with RspecRuby on Rails testing with Rspec
Ruby on Rails testing with Rspec
 
Perl 6 talk
Perl 6 talkPerl 6 talk
Perl 6 talk
 
Perl v5.26 Features (AmsterdamX.pm)
Perl v5.26 Features (AmsterdamX.pm)Perl v5.26 Features (AmsterdamX.pm)
Perl v5.26 Features (AmsterdamX.pm)
 
Переход на Scala: босиком по граблям
Переход на Scala: босиком по граблямПереход на Scala: босиком по граблям
Переход на Scala: босиком по граблям
 
6 more things about Perl 6
6 more things about Perl 66 more things about Perl 6
6 more things about Perl 6
 

Similar to Validation

Getfilestruct zbksh
Getfilestruct zbkshGetfilestruct zbksh
Getfilestruct zbkshBen Pope
 
Getfilestruct zbksh(1)
Getfilestruct zbksh(1)Getfilestruct zbksh(1)
Getfilestruct zbksh(1)Ben Pope
 
Ao assignment sanmeet dhokay
Ao assignment   sanmeet dhokayAo assignment   sanmeet dhokay
Ao assignment sanmeet dhokaySanmeet Dhokay
 
Using the following code Install Packages pip install .pdf
Using the following code Install Packages   pip install .pdfUsing the following code Install Packages   pip install .pdf
Using the following code Install Packages pip install .pdfpicscamshoppe
 
Real-Time Web Programming with PrismTech Vortex Web
Real-Time Web Programming with PrismTech Vortex WebReal-Time Web Programming with PrismTech Vortex Web
Real-Time Web Programming with PrismTech Vortex WebADLINK Technology IoT
 
Building Real-Time Web Applications with Vortex-Web
Building Real-Time Web Applications with Vortex-WebBuilding Real-Time Web Applications with Vortex-Web
Building Real-Time Web Applications with Vortex-WebAngelo Corsaro
 
How to generate a 100+ page website using parameterisation in R
How to generate a 100+ page website using parameterisation in RHow to generate a 100+ page website using parameterisation in R
How to generate a 100+ page website using parameterisation in RPaul Bradshaw
 
Testing Ruby with Rspec (a beginner's guide)
Testing Ruby with Rspec (a beginner's guide)Testing Ruby with Rspec (a beginner's guide)
Testing Ruby with Rspec (a beginner's guide)Vysakh Sreenivasan
 
Apache Drill with Oracle, Hive and HBase
Apache Drill with Oracle, Hive and HBaseApache Drill with Oracle, Hive and HBase
Apache Drill with Oracle, Hive and HBaseNag Arvind Gudiseva
 
Testing Code and Assuring Quality
Testing Code and Assuring QualityTesting Code and Assuring Quality
Testing Code and Assuring QualityKent Cowgill
 
Into to DBI with DBD::Oracle
Into to DBI with DBD::OracleInto to DBI with DBD::Oracle
Into to DBI with DBD::Oraclebyterock
 
R and Athena … there is another way!?
R and Athena … there is another way!?R and Athena … there is another way!?
R and Athena … there is another way!?Dyfan Jones
 
Introduction to unit testing
Introduction to unit testingIntroduction to unit testing
Introduction to unit testingArtem Shoobovych
 
4.7 Automate Loading Data with Little to No Duplicates!
4.7 Automate Loading Data with Little to No Duplicates!4.7 Automate Loading Data with Little to No Duplicates!
4.7 Automate Loading Data with Little to No Duplicates!TargetX
 
Advanced Perl Techniques
Advanced Perl TechniquesAdvanced Perl Techniques
Advanced Perl TechniquesDave Cross
 
Perl courseparti
Perl coursepartiPerl courseparti
Perl coursepartiernlow
 

Similar to Validation (20)

Getfilestruct zbksh
Getfilestruct zbkshGetfilestruct zbksh
Getfilestruct zbksh
 
Getfilestruct zbksh(1)
Getfilestruct zbksh(1)Getfilestruct zbksh(1)
Getfilestruct zbksh(1)
 
Ao assignment sanmeet dhokay
Ao assignment   sanmeet dhokayAo assignment   sanmeet dhokay
Ao assignment sanmeet dhokay
 
Using the following code Install Packages pip install .pdf
Using the following code Install Packages   pip install .pdfUsing the following code Install Packages   pip install .pdf
Using the following code Install Packages pip install .pdf
 
Real-Time Web Programming with PrismTech Vortex Web
Real-Time Web Programming with PrismTech Vortex WebReal-Time Web Programming with PrismTech Vortex Web
Real-Time Web Programming with PrismTech Vortex Web
 
Building Real-Time Web Applications with Vortex-Web
Building Real-Time Web Applications with Vortex-WebBuilding Real-Time Web Applications with Vortex-Web
Building Real-Time Web Applications with Vortex-Web
 
Lecture19-20
Lecture19-20Lecture19-20
Lecture19-20
 
Lecture19-20
Lecture19-20Lecture19-20
Lecture19-20
 
How to generate a 100+ page website using parameterisation in R
How to generate a 100+ page website using parameterisation in RHow to generate a 100+ page website using parameterisation in R
How to generate a 100+ page website using parameterisation in R
 
Testing Ruby with Rspec (a beginner's guide)
Testing Ruby with Rspec (a beginner's guide)Testing Ruby with Rspec (a beginner's guide)
Testing Ruby with Rspec (a beginner's guide)
 
Apache Drill with Oracle, Hive and HBase
Apache Drill with Oracle, Hive and HBaseApache Drill with Oracle, Hive and HBase
Apache Drill with Oracle, Hive and HBase
 
Testing Code and Assuring Quality
Testing Code and Assuring QualityTesting Code and Assuring Quality
Testing Code and Assuring Quality
 
Into to DBI with DBD::Oracle
Into to DBI with DBD::OracleInto to DBI with DBD::Oracle
Into to DBI with DBD::Oracle
 
R and Athena … there is another way!?
R and Athena … there is another way!?R and Athena … there is another way!?
R and Athena … there is another way!?
 
Having Fun with Play
Having Fun with PlayHaving Fun with Play
Having Fun with Play
 
Introduction to unit testing
Introduction to unit testingIntroduction to unit testing
Introduction to unit testing
 
Postman On Steroids
Postman On SteroidsPostman On Steroids
Postman On Steroids
 
4.7 Automate Loading Data with Little to No Duplicates!
4.7 Automate Loading Data with Little to No Duplicates!4.7 Automate Loading Data with Little to No Duplicates!
4.7 Automate Loading Data with Little to No Duplicates!
 
Advanced Perl Techniques
Advanced Perl TechniquesAdvanced Perl Techniques
Advanced Perl Techniques
 
Perl courseparti
Perl coursepartiPerl courseparti
Perl courseparti
 

Validation

  • 1. ## HEADER ###################### ## R CODE TO Validate SDM's ## Using the PresentAbsent package ## T.A.Groen (T.A.Groen@UTwente.com) ## May 2014 setwd("D:mod12day-9_model validationscript") ## LOAD THE REQUIRED PACKAGES ####### library("PresenceAbsence") ## Load the Data d<-read.csv("dataBase.csv",header=TRUE, sep=";") ## ORGANIZE AND SPLIT THE DATA ## To split the data randomly, we're going to assign a ## random variable with a uniform distribution to the ## database, caled "random", using the runif() function. ## runif() stands for r[ANDOM]unif[ORM] and returns ## a list of random numbers between 0 and 1. Other minima ## and maxima can be set using additional arguments. ## Check ?runif if you want to know more about this. d$random<-runif(length(d$Observation)) train<-d[d$random<0.70,] ## the training database (70%) evalu<-d[d$random>=0.70,] ## the evaluation database (30%) train<-d[d$random<0.70,] evalu<-d[d$random>=0.70,] ## FIT THE MODELS (Make sure all packages and functions are loaded) ## 1) LOGISTIC source("brt.functions.R") model.glm0 <- glm(Observation ~ tmean+tmed_ja+ tr+ south+ west+slope+alt+p_tot+p_win + p_sum, family="binomial",data=train) model.glm <- step(model.glm0) ## 2) REGRESSION TREE model.tree<-rpart(Observation ~ tmean+tmed_ja+ tr+ south+ west+slope+alt+p_tot+p_win + p_sum,data=train, method="class") ## 3) BOOSTED REGRESSION TREE source("brt.functions.R") model.brt <- gbm.step(data=train, gbm.x = c(10,9,7,11,5,12,36,21,20,22), gbm.y = 1, family = "bernoulli", tree.complexity = 5, learning.rate = 0.01, bag.fraction = 0.5) ## CALCULATE THE PROBABILITIES FOR THE EVALUATION TRAININGSET ## we will now "predict" the new data for the the points that ## were not used (30% randomly selected) and calculate ## how acurately the models can predict for these ## points. 
evalu$p.logistic<-predict(model.glm,type="response",newdata=evalu) evalu$p.tree<-predict(model.tree,newdata=evalu)[,2] evalu$p.brt<-gbm.predict.grids(model.brt,evalu, want.grids = F) ## ASSESS THE VARIOUS EVALUATION CRITERIA (ROC, AUC, KAPPA ETC) ## THE "PresenceAbsence" PACKAGE HAS A NUMBER ## OF FUNCTIONS TO SHOW THESE. THESE FUNCTIONS MOST OF THE ## TIME REQUIRE THE DATA TO BE OFFERED AS A DATAFRAME WITH ## THE FOLLOWING STRUCTURE: ## DATA[,1] plot ID text ## DATA[,2] observed values zero-one values ## DATA[,3] predicted probabilities from first model numeric (between 0 and 1) ## DATA[,4] predicted probabilities from second model, etc... ## (SEE ALSO THE HELP INFORMATION FOR THE PresenceAbsence PACKAGE) ## SO LETS FIRST CREATE SUCH A DATAFRAME pa.validate<-data.frame(ID=1:length(evalu$Observation), PA=evalu$Observation, logistic=evalu$p.logistic, tree=evalu$p.tree, brt=evalu$p.brt) ## QUESTION: Do you understan the code provided above to create a new data ## set called "pa.validate"? How many variables should this dataset ## contain? ## NOW WE CAN START EVALUATING THE MODELS BY USING THE FUNCTIONS ## THAT THE PresenceAbsence PACKAGE PROVIDES US WITH presence.absence.accuracy(pa.validate, threshold=0.5) ## SO BY SETTING A THRESHOLD YOU CAN ASK THIS COMMAND TO CALCULATE ## A LOT OF POSSIBLE EVALUATION PARAMETERS ## THERE ARE ALSO QUALITY PARAMETERS THAT DETERMINE THEIR ## OWN THRESHOLD (FOR EXAMPLE MAXIMUM KAPPA) ## YOU CAN USE THE FOLLOWING COMMAND FOR THAT:
  • 2. optimal.thresholds(pa.validate) ## We can also have a look at relevant plots error.threshold.plot(pa.validate,which.model=1) error.threshold.plot(pa.validate,which.model=2) error.threshold.plot(pa.validate,which.model=3) ## the "which.model" argument specifies which ## of the models in the dataset should be plotted auc.roc.plot(pa.validate) ## this function can plot ROC curves for all models ## at once, but you can also use the which.model argument ## when you want to select just one model. save.image() ## ASSIGNMENT 1 ## fit models for at least three of thethe techniques ## you have learned so far ## on a random subset of your data ## using a 50-50 distribution. Look at the acuracy indicators ## of these models when you use predictions based on ## the training data and also when you use the predictions ## based on evaluation data. ## QUESTION 1 ## Which of the chosen modelling techniques performs best? ## QUESTION 2 ## WHich dataset (training or validation) ## gives the highest accuracy results? ## Which dataset of these gives the most honest ## assessment of the model quality? setwd("D:mod12day-13_casestudyassignmentsscript") d<-read.csv("viper_database.csv",header=TRUE, sep=",") train<-d[d$random<0.70,] evalu<-d[d$random>=0.70, my.glm=glm(SPECIES~alt+geology+ecoclim+rad_w+temp_med_jun,data=d,family="binomial") model.brt <- gbm.step(data=train, gbm.x = c(10,9,7,11,5,12,36,21,20,22), gbm.y = 1, family = "bernoulli", tree.complexity = 5, learning.rate = 0.01, bag.fraction = 0.5)