SlideShare a Scribd company logo
#install.packages("ISOweek")
#install.packages("stringr")
#install.packages("caTools")
#install.packages("TTR")
#install.packages("gam")
#install.packages("data.table")
#install.packages("Metrics")
#install.packages("randomForest")
#install.packages("MASS")
#install.packages("e1071")
#install.packages("fpc", dependencies = TRUE)
#install.packages("geosphere")
#install.packages("scatterplot3d")
#install.packages("party")
#install.packages("adabag")
#install.packages("DeducerExtras")
#install.packages("h2o")
library(ISOweek)
library(stringr)
library(caTools)
library(TTR)
require(gam)
library(data.table)
library(Metrics)
require(randomForest)
require(MASS)
require(e1071)
library(fpc)
library(geosphere)
library(scatterplot3d)
require(party)
require(adabag)
require(DeducerExtras)
require(h2o)
#Read Raw
Data------------------------------------------------------------------
------------------------------------------------
train <- fread("C:/Users/Maxie/Documents/Kaggle/West Nile Virus/
Working Folder/train.csv")
test <- fread("C:/Users/Maxie/Documents/Kaggle/West Nile Virus/Working
Folder/test.csv")
weather <- fread("C:/Users/Maxie/Documents/Kaggle/West Nile Virus/
Working Folder/weather.csv")
spray <- fread("C:/Users/Maxie/Documents/Kaggle/West Nile Virus/
Working Folder/spray.csv")
#Get Week Number and ReCategorize
Species---------------------------------------------------------------
------------------------
vSpecies<-c(as.character(train$Species),as.character(test$Species))
vSpecies[vSpecies=="UNSPECIFIED CULEX"]<-"CULEX ERRATICUS"
vSpecies[-which(vSpecies == "CULEX PIPIENS" |
vSpecies == "CULEX PIPIENS/RESTUANS" |
vSpecies == "CULEX RESTUANS")] = "CULEX OTHER"
vSpecies<-factor(vSpecies,levels=unique(vSpecies))
## data.table syntax for adding a column; could overwrite the existing
column as well
train[,Species2:=factor(vSpecies[1:nrow(train)],levels=unique(vSpecies
))]
test[,Species2:=factor(vSpecies[(nrow(train)
+1):length(vSpecies)],levels=unique(vSpecies))]
## also add some fields for components of the date using simple
substrings
train[,dMonth:=as.factor(paste(substr(train$Date,6,7)))]
train[,dYear:=as.factor(paste(substr(train$Date,1,4)))]
train$Date = as.Date(train$Date, format="%Y-%m-%d")
xsDate = as.Date(paste0(train$dYear, "0101"), format="%Y%m%d")
train$dWeek = as.numeric(paste(floor((train$Date - xsDate + 1)/7)))
test[,dMonth:=as.factor(paste(substr(test$Date,6,7)))]
test[,dYear:=as.factor(paste(substr(test$Date,1,4)))]
test$Date = as.Date(test$Date, format="%Y-%m-%d")
tsDate = as.Date(paste0(test$dYear, "0101"), format="%Y%m%d")
test$dWeek = as.numeric(paste(floor((test$Date - tsDate + 1)/7)))
weather[,dMonth:=as.factor(paste(substr(weather$Date,6,7)))]
weather[,dYear:=as.factor(paste(substr(weather$Date,1,4)))]
weather$Date = as.Date(weather$Date, format="%Y-%m-%d")
tsDate = as.Date(paste0(weather$dYear, "0101"), format="%Y%m%d")
weather$dWeek = as.numeric(paste(floor((weather$Date - tsDate + 1)/
7)))
#Train Data Pre-
processing------------------------------------------------------------
------------------------------------------
wnv.train <- data.frame(train[,list(Date, dWeek, Species, Trap,
NumMosquitos, Latitude, Longitude, Species2, WnvPresent)])
#Cast Date Type
wnv.train$Date <- as.Date(wnv.train$Date)
oh.lat <- 41.995
oh.long <- -87.933
mw.lat <- 41.786
mw.long <- -87.752
for (i in 1:nrow(wnv.train)) {
#Euclidean Distances
wnv.train$dist.oh[i] <- sqrt((wnv.train$Latitude[i]-oh.lat)**2+
(wnv.train$Longitude[i]-oh.long)**2)*111
wnv.train$dist.mw[i] <- sqrt((wnv.train$Latitude[i]-mw.lat)**2+
(wnv.train$Longitude[i]-mw.long)**2)*111
#Haversine Distance
wnv.train$disth.oh[i] <- distHaversine(c(wnv.train$Longitude[i],
wnv.train$Latitude[i]), c(oh.long, oh.lat))/1000
wnv.train$disth.mw[i] <- distHaversine(c(wnv.train$Longitude[i],
wnv.train$Latitude[i]), c(mw.long, mw.lat))/1000
if (wnv.train$dist.oh[i] >= wnv.train$dist.mw[i]) {
wnv.train$Station.Euc[i] <- 2 #2 is Midway
}
else {
wnv.train$Station.Euc[i] <- 1 #1 is O'Hare
}
if (wnv.train$disth.oh[i] >= wnv.train$disth.mw[i]) {
wnv.train$Station[i] <- 2 #2 is Midway
}
else {
wnv.train$Station[i] <- 1 #1 is O'Hare
}
#Creating ISO Week Number
#wnv.train$week.num[i] <- substring(ISOweek(wnv.train$Date[i]), 7,
8)
}
#Weather Data
Cleaning--------------------------------------------------------------
----------------------------------------------
#Choose Variables
wnv.weather <- data.frame(weather[,list(Station, Date, Tmax, Tmin,
Tavg, Depart, DewPoint, WetBulb, Heat, Cool, PrecipTotal, StnPressure,
SeaLevel, ResultSpeed,
ResultDir, AvgSpeed)])
#Cast Date Type
wnv.weather$Date <- as.Date(wnv.weather$Date)
#Create Week Number Variable
for (i in 1:nrow(wnv.weather)) {
wnv.weather$week.num[i] <- as.numeric(substring(ISOweek(wnv.weather
$Date[i]), 7, 8)) #Create Week Markers
wnv.weather$date.year[i] <- as.numeric(format(wnv.weather$Date[i],
"%Y"))
#wnv.weather$date.month[i] <- as.numeric(format(wnv.weather$Date[i],
"%m"))
#wnv.weather$date.day[i] <- as.numeric(format(wnv.weather$Date[i],
"%d"))
}
#Clean Temperature
wnv.weather$Tmax <- as.numeric(as.character(wnv.weather$Tmax))
wnv.weather$Tmin <- as.numeric(as.character(wnv.weather$Tmin))
wnv.weather$Tavg <- as.numeric(as.character(wnv.weather$Tavg))
wnv.weather$Tavg[is.na(wnv.weather$Tavg)] <- (wnv.weather
$Tmax[is.na(wnv.weather$Tavg)] + wnv.weather$Tmin[is.na(wnv.weather
$Tavg)])/2
#Clean DewPoint
wnv.weather$DewPoint <- as.numeric(as.character(wnv.weather$DewPoint))
#Clean Depart
wnv.weather$Depart[wnv.weather$Depart=="M"] <- NA
wnv.weather$Depart <- as.numeric(as.character(wnv.weather$Depart))
#Clean WetBulb
wnv.weather$WetBulb <- as.numeric(as.character(wnv.weather$WetBulb))
wnv.weather$WetBulb[849]<- wnv.weather$WetBulb[850]
wnv.weather$WetBulb[2411]<- wnv.weather$WetBulb[2412]
wnv.weather$WetBulb[2413]<- wnv.weather$WetBulb[2414]
wnv.weather$WetBulb[2416]<- wnv.weather$WetBulb[2415]
#Clean Heat
wnv.weather$Heat <- as.numeric(as.character(wnv.weather$Heat))
wnv.weather$Heat[8]<- wnv.weather$Heat[7]
wnv.weather$Heat[506]<- wnv.weather$Heat[505]
wnv.weather$Heat[676]<- wnv.weather$Heat[675]
wnv.weather$Heat[1638]<- wnv.weather$Heat[1637]
wnv.weather$Heat[2068]<- wnv.weather$Heat[2067]
wnv.weather$Heat[2212]<- wnv.weather$Heat[2411]
wnv.weather$Heat[2502]<- wnv.weather$Heat[2501]
wnv.weather$Heat[2512]<- wnv.weather$Heat[2511]
wnv.weather$Heat[2526]<- wnv.weather$Heat[2525]
wnv.weather$Heat[2580]<- wnv.weather$Heat[2579]
wnv.weather$Heat[2812]<- wnv.weather$Heat[2811]
#Clean Cool
wnv.weather$Cool <- as.numeric(as.character(wnv.weather$Cool))
wnv.weather$Cool[8]<- wnv.weather$Cool[7]
wnv.weather$Cool[506]<- wnv.weather$Cool[505]
wnv.weather$Cool[676]<- wnv.weather$Cool[675]
wnv.weather$Cool[1638]<- wnv.weather$Cool[1637]
wnv.weather$Cool[2068]<- wnv.weather$Cool[2067]
wnv.weather$Cool[2212]<- wnv.weather$Cool[2411]
wnv.weather$Cool[2502]<- wnv.weather$Cool[2501]
wnv.weather$Cool[2512]<- wnv.weather$Cool[2511]
wnv.weather$Cool[2526]<- wnv.weather$Cool[2525]
wnv.weather$Cool[2580]<- wnv.weather$Cool[2579]
wnv.weather$Cool[2812]<- wnv.weather$Cool[2811]
#Clean PrecipTotal
wnv.weather$PrecipTotal <- as.numeric(as.character(wnv.weather
$PrecipTotal))
wnv.weather$PrecipTotal[is.na(wnv.weather$PrecipTotal)] <- 0
#Clean StnPressure
wnv.weather$StnPressure <- as.numeric(as.character(wnv.weather
$StnPressure))
wnv.weather$StnPressure[88]<- wnv.weather$StnPressure[87]
wnv.weather$StnPressure[849]<- wnv.weather$StnPressure[850]
wnv.weather$StnPressure[2411]<- wnv.weather$StnPressure[2410]
wnv.weather$StnPressure[2412]<- wnv.weather$StnPressure[2410]
#Clean SeaLevel
wnv.weather$SeaLevel <- as.numeric(as.character(wnv.weather$SeaLevel))
wnv.weather$SeaLevel[88]<- wnv.weather$SeaLevel[87]
wnv.weather$SeaLevel[833]<- wnv.weather$SeaLevel[834]
wnv.weather$SeaLevel[995]<- wnv.weather$SeaLevel[996]
wnv.weather$SeaLevel[1733]<- wnv.weather$SeaLevel[1734]
wnv.weather$SeaLevel[1746]<- wnv.weather$SeaLevel[1745]
wnv.weather$SeaLevel[1757]<- wnv.weather$SeaLevel[1758]
wnv.weather$SeaLevel[2068]<- wnv.weather$SeaLevel[2067]
wnv.weather$SeaLevel[2091]<- wnv.weather$SeaLevel[2092]
wnv.weather$SeaLevel[2744]<- wnv.weather$SeaLevel[2743]
#Clean ResultSpeed
wnv.weather$ResultSpeed <- as.numeric(as.character(wnv.weather
$ResultSpeed))
#Clean ResultDir
wnv.weather$ResultDir <- as.numeric(as.character(wnv.weather
$ResultDir))
#Clean AvgSpeed
wnv.weather$AvgSpeed <- as.numeric(as.character(wnv.weather$AvgSpeed))
wnv.weather$AvgSpeed[is.na(wnv.weather$AvgSpeed)] <- mean(wnv.weather
$AvgSpeed, na.rm =TRUE)
#Split weather data into O'Hare and Midway
wnv.weather.oh <- subset(wnv.weather, wnv.weather$Station == 1)
wnv.weather.mw <- subset(wnv.weather, wnv.weather$Station == 2)
#Create New Weather Features: temp.mov.avg, prec.mov.avg, deg.day,
accumulated degree day for each year
Tbase <- 71.6 #22deg C = 71.6deg F
wnv.weather.oh$deg.day <- 0
wnv.weather.oh$deg.day.c <- 0
wnv.weather.oh$acc.deg.day <- 0
wnv.weather.oh$acc.deg.day.c <- 0
newyear <- TRUE
#For O'Hare
start <- 1
for (year in 2007:2014) {
iterations <- nrow(subset(wnv.weather.oh, wnv.weather.oh$date.year
== year ))
for (i in start:(start+iterations-1)) {
Tmean <- (wnv.weather.oh$Tmax[i] + wnv.weather.oh$Tmin[i])/2
if ( Tmean > Tbase) {
wnv.weather.oh$deg.day[i] <- Tmean - Tbase
wnv.weather.oh$deg.day.c[i] <- 0
} else {
wnv.weather.oh$deg.day[i] <- 0
wnv.weather.oh$deg.day.c[i] <- Tbase - Tmean
}
if (newyear == TRUE) {
newyear <- FALSE
wnv.weather.oh$acc.deg.day[i] <- 0
} else {
wnv.weather.oh$acc.deg.day[i] <- wnv.weather.oh$deg.day[i] +
wnv.weather.oh$acc.deg.day[i-1]
wnv.weather.oh$acc.deg.day.c[i] <- wnv.weather.oh$deg.day.c[i] +
wnv.weather.oh$acc.deg.day.c[i-1]
}
}
start <- start + iterations
newyear <- TRUE
}
#For Midway
Tbase <- 71.6 #22deg C = 71.6deg F
wnv.weather.mw$deg.day <- 0
wnv.weather.mw$deg.day.c <- 0
wnv.weather.mw$acc.deg.day <- 0
wnv.weather.mw$acc.deg.day.c <- 0
newyear <- TRUE
start <- 1
for (year in 2007:2014) {
iterations <- nrow(subset(wnv.weather.mw, wnv.weather.mw$date.year
== year ))
for (i in start:(start+iterations-1)) {
Tmean <- (wnv.weather.mw$Tmax[i] + wnv.weather.mw$Tmin[i])/2
if ( Tmean > Tbase) {
wnv.weather.mw$deg.day[i] <- Tmean - Tbase
wnv.weather.mw$deg.day.c[i] <- 0
} else {
wnv.weather.mw$deg.day[i] <- 0
wnv.weather.mw$deg.day.c[i] <- Tbase - Tmean
}
if (newyear == TRUE) {
newyear <- FALSE
wnv.weather.mw$acc.deg.day[i] <- 0
} else {
wnv.weather.mw$acc.deg.day[i] <- wnv.weather.mw$deg.day[i] +
wnv.weather.mw$acc.deg.day[i-1]
wnv.weather.mw$acc.deg.day.c[i] <- wnv.weather.mw$deg.day.c[i] +
wnv.weather.mw$acc.deg.day.c[i-1]
}
}
start <- start + iterations
newyear <- TRUE
}
#Creating Moving Averages and Moving Sums
oh.temp.mov.avg.wk <- vector()
oh.temp.mov.avg.2wk <- vector()
oh.prec.mov.avg.wk <- vector()
oh.prec.mov.avg.2wk <- vector()
oh.prec.mov.sum.wk <- vector()
oh.prec.mov.sum.2wk <- vector()
oh.dd.mov.sum.wk <- vector()
oh.dd.mov.sum.2wk <- vector()
mw.temp.mov.avg.wk <- vector()
mw.temp.mov.avg.2wk <- vector()
mw.prec.mov.avg.wk <- vector()
mw.prec.mov.avg.2wk <- vector()
mw.prec.mov.sum.wk <- vector()
mw.prec.mov.sum.2wk <- vector()
mw.dd.mov.sum.wk <- vector()
mw.dd.mov.sum.2wk <- vector()
for (year in 2007:2014) {
#O'Hare
assign(paste("oh.temp.mov.avg.wk", year, sep=""),
runmean(wnv.weather.oh$Tavg[wnv.weather.oh$date.year ==
year], 7, alg="C", endrule="mean"))
assign(paste("oh.temp.mov.avg.2wk", year, sep=""),
runmean(wnv.weather.oh$Tavg[wnv.weather.oh$date.year ==
year], 14, alg="C", endrule="mean"))
assign(paste("oh.prec.mov.avg.wk", year, sep=""),
runmean(wnv.weather.oh$PrecipTotal[wnv.weather.oh$date.year
== year], 7, alg="C", endrule="mean"))
assign(paste("oh.prec.mov.avg.2wk", year, sep=""),
runmean(wnv.weather.oh$PrecipTotal[wnv.weather.oh$date.year
== year], 14, alg="C", endrule="mean"))
assign(paste("oh.prec.mov.sum.wk", year, sep=""),
runSum(wnv.weather.oh$PrecipTotal[wnv.weather.oh$date.year ==
year], n = 7, cumulative = FALSE))
assign(paste("oh.prec.mov.sum.2wk", year, sep=""),
runSum(wnv.weather.oh$PrecipTotal[wnv.weather.oh$date.year ==
year], n = 14, cumulative = FALSE))
assign(paste("oh.dd.mov.sum.wk", year, sep=""),
runSum(wnv.weather.oh$deg.day[wnv.weather.oh$date.year ==
year], n = 7, cumulative = FALSE))
assign(paste("oh.dd.mov.sum.2wk", year, sep=""),
runSum(wnv.weather.oh$deg.day[wnv.weather.oh$date.year ==
year], n = 14, cumulative = FALSE))
oh.temp.mov.avg.wk <- c(oh.temp.mov.avg.wk,
get(paste("oh.temp.mov.avg.wk", year, sep="")))
oh.temp.mov.avg.2wk <- c(oh.temp.mov.avg.2wk,
get(paste("oh.temp.mov.avg.2wk", year, sep="")))
oh.prec.mov.avg.wk <- c(oh.prec.mov.avg.wk,
get(paste("oh.prec.mov.avg.wk", year, sep="")))
oh.prec.mov.avg.2wk <- c(oh.prec.mov.avg.2wk,
get(paste("oh.prec.mov.avg.2wk", year, sep="")))
oh.prec.mov.sum.wk <- c(oh.prec.mov.sum.wk,
get(paste("oh.prec.mov.sum.wk", year, sep="")))
oh.prec.mov.sum.2wk <- c(oh.prec.mov.sum.2wk,
get(paste("oh.prec.mov.sum.2wk", year, sep="")))
oh.dd.mov.sum.wk <- c(oh.dd.mov.sum.wk,
get(paste("oh.dd.mov.sum.wk", year, sep="")))
oh.dd.mov.sum.2wk <- c(oh.dd.mov.sum.2wk, get(paste("oh.dd.mov.sum.
2wk", year, sep="")))
#Midway
assign(paste("mw.temp.mov.avg.wk", year, sep=""),
runmean(wnv.weather.mw$Tavg[wnv.weather.mw$date.year ==
year], 7, alg="C", endrule="mean"))
assign(paste("mw.temp.mov.avg.2wk", year, sep=""),
runmean(wnv.weather.mw$Tavg[wnv.weather.mw$date.year ==
year], 14, alg="C", endrule="mean"))
assign(paste("mw.prec.mov.avg.wk", year, sep=""),
runmean(wnv.weather.mw$PrecipTotal[wnv.weather.mw$date.year
== year], 7, alg="C", endrule="mean"))
assign(paste("mw.prec.mov.avg.2wk", year, sep=""),
runmean(wnv.weather.mw$PrecipTotal[wnv.weather.mw$date.year
== year], 14, alg="C", endrule="mean"))
assign(paste("mw.prec.mov.sum.wk", year, sep=""),
runSum(wnv.weather.mw$PrecipTotal[wnv.weather.mw$date.year ==
year], n = 7, cumulative = FALSE))
assign(paste("mw.prec.mov.sum.2wk", year, sep=""),
runSum(wnv.weather.mw$PrecipTotal[wnv.weather.mw$date.year ==
year], n = 14, cumulative = FALSE))
assign(paste("mw.dd.mov.sum.wk", year, sep=""),
runSum(wnv.weather.mw$deg.day[wnv.weather.mw$date.year ==
year], n = 7, cumulative = FALSE))
assign(paste("mw.dd.mov.sum.2wk", year, sep=""),
runSum(wnv.weather.mw$deg.day[wnv.weather.mw$date.year ==
year], n = 14, cumulative = FALSE))
mw.temp.mov.avg.wk <- c(mw.temp.mov.avg.wk,
get(paste("mw.temp.mov.avg.wk", year, sep="")))
mw.temp.mov.avg.2wk <- c(mw.temp.mov.avg.2wk,
get(paste("mw.temp.mov.avg.2wk", year, sep="")))
mw.prec.mov.avg.wk <- c(mw.prec.mov.avg.wk,
get(paste("mw.prec.mov.avg.wk", year, sep="")))
mw.prec.mov.avg.2wk <- c(mw.prec.mov.avg.2wk,
get(paste("mw.prec.mov.avg.2wk", year, sep="")))
mw.prec.mov.sum.wk <- c(mw.prec.mov.sum.wk,
get(paste("mw.prec.mov.sum.wk", year, sep="")))
mw.prec.mov.sum.2wk <- c(mw.prec.mov.sum.2wk,
get(paste("mw.prec.mov.sum.2wk", year, sep="")))
mw.dd.mov.sum.wk <- c(mw.dd.mov.sum.wk,
get(paste("mw.dd.mov.sum.wk", year, sep="")))
mw.dd.mov.sum.2wk <- c(mw.dd.mov.sum.2wk, get(paste("mw.dd.mov.sum.
2wk", year, sep="")))
}
#Add Variable Matrices to Weather Data Frame
wnv.weather.oh$temp.mov.avg.wk <- oh.temp.mov.avg.wk
wnv.weather.oh$temp.mov.avg.2wk <- oh.temp.mov.avg.2wk
wnv.weather.oh$prec.mov.avg.wk <- oh.prec.mov.avg.wk
wnv.weather.oh$prec.mov.avg.2wk <- oh.prec.mov.avg.2wk
wnv.weather.oh$prec.mov.sum.wk <- oh.prec.mov.sum.wk
wnv.weather.oh$prec.mov.sum.2wk <- oh.prec.mov.sum.2wk
wnv.weather.oh$dd.mov.sum.wk <- oh.dd.mov.sum.wk
wnv.weather.oh$dd.mov.sum.2wk <- oh.dd.mov.sum.2wk
wnv.weather.oh$prec.mov.sum.wk[is.na(wnv.weather.oh$prec.mov.sum.wk)]
<- 0
wnv.weather.oh$prec.mov.sum.2wk[is.na(wnv.weather.oh$prec.mov.sum.
2wk)] <- 0
wnv.weather.oh$dd.mov.sum.wk[is.na(wnv.weather.oh$dd.mov.sum.wk)] <- 0
wnv.weather.oh$dd.mov.sum.2wk[is.na(wnv.weather.oh$dd.mov.sum.2wk)] <-
0
wnv.weather.mw$temp.mov.avg.wk <- mw.temp.mov.avg.wk
wnv.weather.mw$temp.mov.avg.2wk <- mw.temp.mov.avg.2wk
wnv.weather.mw$prec.mov.avg.wk <- mw.prec.mov.avg.wk
wnv.weather.mw$prec.mov.avg.2wk <- mw.prec.mov.avg.2wk
wnv.weather.mw$prec.mov.sum.wk <- mw.prec.mov.sum.wk
wnv.weather.mw$prec.mov.sum.2wk <- mw.prec.mov.sum.2wk
wnv.weather.mw$dd.mov.sum.wk <- mw.dd.mov.sum.wk
wnv.weather.mw$dd.mov.sum.2wk <- mw.dd.mov.sum.2wk
wnv.weather.mw$prec.mov.sum.wk[is.na(wnv.weather.mw$prec.mov.sum.wk)]
<- 0
wnv.weather.mw$prec.mov.sum.2wk[is.na(wnv.weather.mw$prec.mov.sum.
2wk)] <- 0
wnv.weather.mw$dd.mov.sum.wk[is.na(wnv.weather.mw$dd.mov.sum.wk)] <- 0
wnv.weather.mw$dd.mov.sum.2wk[is.na(wnv.weather.mw$dd.mov.sum.2wk)] <-
0
options(scipen=100)
#Test Data Pre-
processing------------------------------------------------------------
------------------------------------------
wnv.test <- data.frame(test[,list(Date, Id, dWeek, Species, Trap,
Latitude, Longitude, Species2)])
#Cast Date Type
wnv.test$Date <- as.Date(wnv.test$Date)
oh.lat <- 41.995
oh.long <- -87.933
mw.lat <- 41.786
mw.long <- -87.752
for (i in 1:nrow(wnv.test)) {
#Euclidean Distances
wnv.test$dist.oh[i] <- sqrt((wnv.test$Latitude[i]-oh.lat)**2+
(wnv.test$Longitude[i]-oh.long)**2)*111
wnv.test$dist.mw[i] <- sqrt((wnv.test$Latitude[i]-mw.lat)**2+
(wnv.test$Longitude[i]-mw.long)**2)*111
#Haversine Distance
wnv.test$disth.oh[i] <- distHaversine(c(wnv.test$Longitude[i],
wnv.test$Latitude[i]), c(oh.long, oh.lat))/1000
wnv.test$disth.mw[i] <- distHaversine(c(wnv.test$Longitude[i],
wnv.test$Latitude[i]), c(mw.long, mw.lat))/1000
if (wnv.test$dist.oh[i] >= wnv.test$dist.mw[i]) {
wnv.test$Station.Euc[i] <- 2 #2 is Midway
}
else {
wnv.test$Station.Euc[i] <- 1 #1 is O'Hare
}
if (wnv.test$disth.oh[i] >= wnv.test$disth.mw[i]) {
wnv.test$Station[i] <- 2 #2 is Midway
}
else {
wnv.test$Station[i] <- 1 #1 is O'Hare
}
#Creating ISO Week Number
#wnv.test$week.num[i] <- substring(ISOweek(wnv.test$Date[i]), 7, 8)
}
#Concatenating Clean Weather Datasets and Merging with Training/Test
Data----------------------------------------------------------
#wnv.train$Species2 <- train$Species2
#wnv.test$Species2 <- test$Species2
wnv.weather.clean <- rbind(wnv.weather.oh, wnv.weather.mw)
wnv.train.weather <- merge(wnv.train, wnv.weather.clean, by=c("Date",
"Station"))
wnv.test.weather <- merge(wnv.test, wnv.weather.clean, by=c("Date",
"Station"))
#Just use one station...
#Sorting wnv.test.weather
wnv.test.weather.sorted <- wnv.test.weather[order(wnv.test.weather
$Id),]
#Training/Testing
Split-----------------------------------------------------------------
------------------------------------
wnv.train.weather.1 <- wnv.train.weather[wnv.train.weather$date.year !
= 2007,]
wnv.train.weather.2 <- wnv.train.weather[wnv.train.weather$date.year
== 2007,]
#-----------------------------------------------------
wnv.train.weather.1 <- wnv.train.weather[wnv.train.weather$date.year !
= 2009,]
wnv.train.weather.2 <- wnv.train.weather[wnv.train.weather$date.year
== 2009,]
#-----------------------------------------------------
#Set aside 2011 data as test, and train on the remaining
wnv.train.weather.1 <- wnv.train.weather[wnv.train.weather$date.year !
= 2011,]
wnv.train.weather.2 <- wnv.train.weather[wnv.train.weather$date.year
== 2011,]
#-----------------------------------------------------
wnv.train.weather.1 <- wnv.train.weather[wnv.train.weather$date.year !
= 2013,]
wnv.train.weather.2 <- wnv.train.weather[wnv.train.weather$date.year
== 2013,]
#Clustering Locations -
DBSCAN----------------------------------------------------------------
---------------------------
#Potential epsilon values:
#1/4 mile = 0.003623
#1/2 mile = 0.007246
#1 mile = 0.014493
#2 mile = 0.028986
#3 mile = 0.043479
#Based on 1deg Latitude/Longitude = 69 miles
x.train <- cbind(wnv.train.weather$Longitude, wnv.train.weather
$Latitude)
dbs.train <- dbscan(x.train, 0.003623)
plot(dbs.train, x.train)
print(dbs.train)
prediction.train <- predict(dbs.train, x.train)
#length(prediction.train)
#unique(prediction.train)
wnv.train.weather$loc.cluster.qmile <- as.factor(prediction.train)
x.test <- cbind(wnv.test.weather.sorted$Longitude,
wnv.test.weather.sorted$Latitude)
prediction.test <- predict(dbs.train, x.train, x.test)
#length(prediction.test)
#unique(prediction.test)
wnv.test.weather.sorted$loc.cluster.2mile <-
as.factor(prediction.test)
#Clustering Locations - K-
Means-----------------------------------------------------------------
-----------------------
#With normalized Longitude & Latitude
train.means <- apply(wnv.train.weather[c("Longitude", "Latitude")], 2,
mean)
train.sds <-apply(wnv.train.weather[c("Longitude", "Latitude")], 2,
sd)
train.loc.norm <- scale(wnv.train.weather[c("Longitude",
"Latitude")],center=means,scale=sds)
test.means <- apply(wnv.test.weather.sorted[c("Longitude",
"Latitude")], 2, mean)
test.sds <-apply(wnv.test.weather.sorted[c("Longitude", "Latitude")],
2, sd)
test.loc.norm <- scale(wnv.test.weather.sorted[c("Longitude",
"Latitude")],center=means,scale=sds)
set.seed(123)
clusters.kmeans.norm <- kmeans(train.loc.norm, 20)
clusters.kmeans.norm$centers
plot(clusters.kmeans.norm$centers)
pred.cluster.norm.train <- predict(clusters.kmeans.norm,
data=train.loc.norm)
pred.cluster.norm.test <- predict(clusters.kmeans.norm,
data=test.loc.norm)
#With un-normalized Longitude & Latitude
train.loc <- wnv.train.weather[c("Longitude", "Latitude")]
test.loc <- wnv.test.weather.sorted[c("Longitude", "Latitude")]
set.seed(123)
clusters.kmeans <- kmeans(train.loc, 25)
clusters.kmeans$centers
plot(clusters.kmeans$centers)
pred.cluster.train <- predict(clusters.kmeans.norm,
data=train.loc.norm)
pred.cluster.test <- predict(clusters.kmeans.norm, data=test.loc.norm)
#Assign to training/testing set
wnv.train.weather$kmeans.cluster.n <-
as.factor(pred.cluster.norm.train)
wnv.test.weather.sorted$kmeans.cluster.n <-
as.factor(pred.cluster.norm.test)
#PCA------------------------------------------------------------------
-------------------------------------------------
keeps1 <- c("Tavg", "DewPoint", "PrecipTotal", "WetBulb",
"StnPressure", "SeaLevel", "ResultSpeed", "ResultDir", "AvgSpeed",
"acc.deg.day", "temp.mov.avg.wk", "prec.mov.avg.wk",
"prec.mov.sum.wk", "dd.mov.sum.wk" )
keeps2 <- c("Tavg", "DewPoint", "PrecipTotal", "WetBulb",
"StnPressure", "SeaLevel", "ResultSpeed", "ResultDir", "AvgSpeed")
#model.pca <- princomp(wnv.train.weather[keeps1], cor=TRUE,
scores=TRUE)
#model.pca
model.pca.train1 <- princomp(wnv.train.weather[keeps1], cor=TRUE,
scores=TRUE)
model.pca.test1 <- princomp(wnv.test.weather.sorted[keeps1], cor=TRUE,
scores=TRUE)
model.pca.train <- princomp(wnv.train.weather[keeps2], cor=TRUE,
scores=TRUE)
model.pca.test <- princomp(wnv.test.weather.sorted[keeps2], cor=TRUE,
scores=TRUE)
plot(model.pca.train)
summary(model.pca.train)
plot(model.pca.test)
summary(model.pca.test)
plot(model.pca.train1)
summary(model.pca.train1)
model.pca.train$scores
model.pca.test$scores
model.pca.train$loadings
model.pca.test$loadings
model.pca.train1$loadings
pca.out.train <- data.frame(model.pca.train$scores)
pca.out.test <- data.frame(model.pca.test$scores)
wnv.train.weather <- cbind(wnv.train.weather, pca.out.train)
wnv.test.weather.sorted <- cbind(wnv.test.weather.sorted,
pca.out.test)
#wnv.train.weather.pca <- cbind(wnv.train.weather, pca.out)
#Dealing with unbalanced
classes---------------------------------------------------------------
------------------------
wnv.train.no <- wnv.train.weather.1[wnv.train.weather.1$WnvPresent ==
0,]
wnv.train.yes <- wnv.train.weather.1[wnv.train.weather.1$WnvPresent ==
1,]
set.seed(1234)
wnv.train.no.75 <- wnv.train.no[sample(1:nrow(wnv.train.no), 5852,
replace=FALSE),]
wnv.train.weather.1 <- rbind(wnv.train.no.75, wnv.train.yes)
set.seed(1234)
wnv.train.no.50 <- wnv.train.no[sample(1:nrow(wnv.train.no), 3901,
replace=FALSE),]
wnv.train.weather.1 <- rbind(wnv.train.no.50, wnv.train.yes)
set.seed(1234)
wnv.train.no.25 <- wnv.train.no[sample(1:nrow(wnv.train.no), 1951,
replace=FALSE),]
wnv.train.weather.1 <- rbind(wnv.train.no.25, wnv.train.yes)
set.seed(1234)
wnv.train.no.10 <- wnv.train.no[sample(1:nrow(wnv.train.no), 780,
replace=FALSE),]
wnv.train.weather.1 <- rbind(wnv.train.no.10, wnv.train.yes)
set.seed(1234)
wnv.train.no.5 <- wnv.train.no[sample(1:nrow(wnv.train.no), 312,
replace=FALSE),]
wnv.train.weather.1 <- rbind(wnv.train.no.5, wnv.train.yes)
#70/30 Split--------------------------------------------
set.seed(1234)
#set.seed(4321)
ind <- sample(2, nrow(wnv.train.weather), replace=TRUE, prob=c(0.7,
0.3))
trainData <- wnv.train.weather[ind==1,]
testData <- wnv.train.weather[ind==2,]
#GAM
Modeling--------------------------------------------------------------
--------------------------------------------------
#Experiment with PCA and Kmeans-------------------------------
train.model <- gam(WnvPresent ~ s(dWeek) + Species2 + kmeans.cluster.n
+ s(Comp.1)
+ s(Comp.2)
+ s(Comp.3)
+ s(Comp.4)
+ s(Comp.5)
+ s(acc.deg.day)
+ s(temp.mov.avg.wk)
+ s(prec.mov.avg.2wk)
#+ s(prec.mov.sum.wk)
, data = trainData, family="binomial", bf.maxit =
1000)
prediction.gam <-predict(train.model, newdata = testData, type =
"response")
auc(testData$WnvPresent, prediction.gam)
#-------------------------------------------------------------
fitSubmit <- update(train.model, data=wnv.train.weather)
pSubmit.gam<-predict(fitSubmit, newdata = wnv.test.weather.sorted,
type = "response")
submissionFile<-cbind(test$Id,pSubmit.gam)
colnames(submissionFile)<-c("Id","WnvPresent")
options("scipen"=100, "digits"=8)
write.csv(submissionFile,"Submission_MaxLi134prec.csv",row.names=FALSE
,quote=FALSE)
#GLM------------------------------------------------------------------
--------------------------------------------------------
#Create Squared Term deg.day, dWeek, dd.mov.sum.2wk
wnv.train.weather$acc.deg.day2 <- (wnv.train.weather$acc.deg.day)**2
wnv.train.weather$dWeek2 <- (wnv.train.weather$dWeek)**2
wnv.train.weather$dd.mov.sum.2wk2 <- (wnv.train.weather$dd.mov.sum.
2wk)**2
wnv.test.weather.sorted$acc.deg.day2 <- (wnv.test.weather.sorted
$acc.deg.day)**2
wnv.test.weather.sorted$dWeek2 <- (wnv.test.weather.sorted$dWeek)**2
wnv.test.weather.sorted$dd.mov.sum.2wk2 <- (wnv.test.weather.sorted
$dd.mov.sum.2wk)**2
#Experiment with PCA and Kmeans--------------------------------
log.model <- glm(WnvPresent ~ dWeek + Species2 + kmeans.cluster.n
+ dWeek2
+ Comp.1
#+ Comp.2
#+ Comp.3
#+ Comp.4
#+ Comp.5
#+ acc.deg.day
#+ acc.deg.day2
+ temp.mov.avg.2wk
+ prec.mov.avg.2wk
, data = trainData, family=binomial())
summary(log.model)
prediction.glm <-predict(log.model, newdata = testData, type =
"response")
auc(testData$WnvPresent, prediction.glm)
#------------------------------------------------------------
fitSubmit <- update(log.model, data=wnv.train.weather)
pSubmit.glm <-predict(fitSubmit, newdata = wnv.test.weather.sorted,
type = "response")
#summary(pSubmit.glm)
submissionFile<-cbind(test$Id,pSubmit.glm)
colnames(submissionFile)<-c("Id","WnvPresent")
options("scipen"=100, "digits"=8)
write.csv(submissionFile,"Submission_MaxLi.csv",row.names=FALSE,quote=
FALSE)
#Base Model Submission AUC: 0.64569
#Base Train AUC: 0.78259055
#LDA------------------------------------------------------------------
--------------------------------------------------------
lda.model <- lda(WnvPresent ~ dWeek + Species2 + kmeans.cluster.n
+ Comp.1
+ Comp.2
+ Comp.3
+ Comp.4
+ Comp.5
+ acc.deg.day
+ acc.deg.day2
#+ temp.mov.avg.2wk
+ prec.mov.avg.wk
+ prec.mov.sum.2wk
,data = trainData)
prediction.lda <- predict(lda.model, testData)
lda.df <- data.frame(prediction.lda$posterior)
auc(testData$WnvPresent, lda.df[,2])
#-------------------------------------------------------------------
fitSubmit <- update(lda.model, data=wnv.train.weather)
pSubmit<-predict(fitSubmit, newdata = wnv.test.weather.sorted, type =
"probability")
lda.test <- data.frame(pSubmit$posterior)
submissionFile<-cbind(test$Id,lda.df.test[,2])
colnames(submissionFile)<-c("Id","WnvPresent")
options("scipen"=100, "digits"=8)
write.csv(submissionFile,"Submission_MaxLi.csv",row.names=FALSE,quote=
FALSE)
#Random
Forest----------------------------------------------------------------
------------------------------------------------
set.seed(1234)
rf.model <- randomForest(WnvPresent.C ~ dWeek + Species2 +
kmeans.cluster.n
+ Comp.1
+ Comp.2
+ Comp.3
+ Comp.4
+ Comp.5
+ acc.deg.day
#+ temp.mov.avg.2wk
#+ prec.mov.avg.2wk
#+ prec.mov.sum.2wk
, data = trainData, nTree = 1000)
prediction.rf <- predict(rf.model, testData, type = "prob")
rf.df <- data.frame(prediction.rf)
auc(testData$WnvPresent.C, rf.df[,2])
fitSubmit <- update(rf.model, data=wnv.train.weather)
pSubmit.rf<-predict(fitSubmit, newdata = wnv.test.weather.sorted, type
= "prob")
rf.df.test <- data.frame(pSubmit.rf)
submissionFile<-cbind(test$Id, rf.df.test[,2])
colnames(submissionFile)<-c("Id","WnvPresent")
options("scipen"=100, "digits"=8)
write.csv(submissionFile,"Submission_MaxLi.csv",row.names=FALSE,quote=
FALSE)
#PCA and Kmeans-------------------
set.seed(1234)
rf.model <- cforest(WnvPresent.C ~ dWeek + Species2 + kmeans.cluster.n
+ Comp.1
+ Comp.2
+ Comp.3
+ Comp.4
+ Comp.5
#+ acc.deg.day
#+ temp.mov.avg.2wk
#+ prec.mov.avg.2wk
#+ prec.mov.sum.wk
,data = trainData,
controls=cforest_unbiased(ntree=200, mtry=2))
prediction.rf <- predict(rf.model, newdata=testData, type = "prob")
rf.df <- data.frame(prediction.rf)
x <- vector()
for (i in seq(2, 6232, 2)) {
x <- c(x, rf.df[1, i])
}
auc(testData$WnvPresent.C, x)
set.seed(1234)
rf.model <- cforest(WnvPresent.C ~ dWeek + Species2 + kmeans.cluster.n
+ Comp.1
+ Comp.2
#+ Comp.3
#+ Comp.4
#+ Comp.5
#+ acc.deg.day
#+ temp.mov.avg.2wk
#+ prec.mov.avg.2wk
,data = wnv.train.weather,
controls=cforest_unbiased(ntree=10, mtry=2, maxdepth = 10))
prediction.rf <- predict(rf.model, newdata=wnv.test.weather.sorted,
type = "prob")
rf.df <- data.frame(prediction.rf)
#H20 Random Forest--------------
library(h2o)
localH2O <- h2o.init(nthreads = -1,max_mem_size = '7g')
rf.model <- h2o.randomForest(x=c(3, 9, 43, 44:48),y = 53,data =
trainData,
mtries = 18,
sample.rate = 0.5,
classification = T,ntree = 200,verbose =
T)
prediction.rf <- h2o.predict(rf.model,testData)
#Support Vector
Machines--------------------------------------------------------------
----------------------------------------
#PCA and kmeans----------------------------------------------------
set.seed(1234)
svm.model <- svm(WnvPresent.C ~ dWeek + Species2 + kmeans.cluster.n
+ Comp.1
+ Comp.2
+ Comp.3
+ Comp.4
+ Comp.5
+ acc.deg.day
+ temp.mov.avg.2wk
#+ prec.mov.avg.wk
#+ prec.mov.sum.2wk
, data = trainData, probability = TRUE)
#summary(svm.model)
prediction.svm <- predict(svm.model, testData, probability = TRUE)
svm.df <- data.frame(attr(prediction.svm, "probabilities"))
auc(testData$WnvPresent.C, svm.df[,2])
#------------------------------------------------------------------
set.seed(1234)
fitSubmit <- update(svm.model, data=wnv.train.weather)
pSubmit.svm<-predict(fitSubmit, newdata = wnv.test.weather.sorted,
probability = TRUE)
svm.df.test <- data.frame(attr(pSubmit.svm, "probabilities"))
submissionFile<-cbind(test$Id, svm.df.test[,2])
colnames(submissionFile)<-c("Id","WnvPresent")
options("scipen"=100, "digits"=8)
write.csv(submissionFile,"Submission_MaxLi.csv",row.names=FALSE,quote=
FALSE)
#Ensemble-------------------------------------------------------------
--------------------------------------------------------
gam.train <- as.matrix(prediction.gam)
glm.train <- as.matrix(prediction.glm)
lda.train <- as.matrix(lda.df[,2])
rf.train <- as.matrix(rf.df[,2])
svm.train <- as.matrix(svm.df[,2])
ens.train <- cbind(gam.train, lda.train, rf.train, svm.train)
gam.test <- as.matrix(pSubmit.gam)
glm.test <- as.matrix(pSubmit.glm)
lda.test <- as.matrix(lda.df.test[,2])
rf.test <- as.matrix(rf.df.test[,2])
svm.test <- as.matrix(svm.df.test[,2])
ens.test <- cbind(lda.test, rf.test, svm.test)
w <- matrix(c(0.0, 0.3, 0.1, 0.6), nrow = 4, ncol = 1)
ens.model <- ens.train %*% w
auc(testData$WnvPresent, ens.model)
w <- matrix(c(0.3, 0.1, 0.6), nrow = 3, ncol = 1)
ens.model.test <- ens.test %*% w
submissionFile<-cbind(test$Id,ens.model.test)
colnames(submissionFile)<-c("Id","WnvPresent")
options("scipen"=100, "digits"=8)
write.csv(submissionFile,"Submission_MaxLiBest3.csv",row.names=FALSE,q
uote=FALSE)
#Bagging--------------------------------------------------------------
------------------------------------------------------
set.seed(1234)
ind <- sample(2, nrow(wnv.train.weather), replace=TRUE, prob=c(0.7,
0.3))
trainData <- wnv.train.weather[ind==1,]
testData <- wnv.train.weather[ind==2,]
set.seed(4321)
all_data <- wnv.train.weather
positions <- sample(nrow(all_data),size=floor((nrow(all_data)/10)*7))
trainBag<- all_data[positions,]
testBag<- all_data[-positions,]
library(foreach)
length_divisor<-6
iterations<-50
predictions<-foreach(m=1:iterations,.combine=cbind) %do% {
training_positions <- sample(nrow(trainData),
size=floor((nrow(trainData)/length_divisor)))
train_pos<-1:nrow(trainData) %in% training_positions
gam.bag <- gam(WnvPresent ~ lo(dWeek) + Species2 + lo(Longitude,
Latitude) + s(acc.deg.day), data = trainData[train_pos,]
, family="binomial", bf.maxit = 45)
predict(gam.bag, newdata = testData, type = "response")
}
predictions<-data.frame(rowMeans(predictions))
auc(testData$WnvPresent.C, predictions)
#--------------------------------------------------------------------
library(foreach)
length_divisor<-6
iterations<-1000
predictions<-foreach(m=1:iterations,.combine=cbind) %do% {
training_positions <- sample(nrow(trainData),
size=floor((nrow(trainData)/length_divisor)))
train_pos<-1:nrow(trainData) %in% training_positions
glm.bag <- glm(WnvPresent ~ dWeek + dWeek2 + Species2 + Longitude +
acc.deg.day + acc.deg.day2
, data = trainData[train_pos,], family=binomial())
#summary(log.model)
predict(glm.bag, newdata = testData, type = "response")
}
predictions<-data.frame(rowMeans(predictions))
auc(testData$WnvPresent.C, predictions)
#Vs.
gam.test <- gam(WnvPresent ~ lo(dWeek) + Species2 + loc.cluster.qmile,
data = trainBag, family="binomial", bf.maxit = 200)
prediction.gam <-predict(gam.test, newdata = testBag, type =
"response")
auc(testBag$WnvPresent, prediction.gam)
#Adabag
Exploration-----------------------------------------------------------
---------------------------------------------
wnv.adaboost <- boosting(WnvPresent.C ~ dWeek, data = trainData,
boos=TRUE, mfinal = 5)

More Related Content

What's hot

Unittesting JavaScript with Evidence
Unittesting JavaScript with EvidenceUnittesting JavaScript with Evidence
Unittesting JavaScript with Evidence
Tobie Langel
 
The elements of a functional mindset
The elements of a functional mindsetThe elements of a functional mindset
The elements of a functional mindset
Eric Normand
 
The Ring programming language version 1.3 book - Part 63 of 88
The Ring programming language version 1.3 book - Part 63 of 88The Ring programming language version 1.3 book - Part 63 of 88
The Ring programming language version 1.3 book - Part 63 of 88
Mahmoud Samir Fayed
 
Kotlin Coroutines in Practice @ KotlinConf 2018
Kotlin Coroutines in Practice @ KotlinConf 2018Kotlin Coroutines in Practice @ KotlinConf 2018
Kotlin Coroutines in Practice @ KotlinConf 2018
Roman Elizarov
 
Ejemplo radio
Ejemplo radioEjemplo radio
Ejemplo radio
lupe ga
 
EMFPath
EMFPathEMFPath
EMFPath
mikaelbarbero
 
The Ring programming language version 1.7 book - Part 12 of 196
The Ring programming language version 1.7 book - Part 12 of 196The Ring programming language version 1.7 book - Part 12 of 196
The Ring programming language version 1.7 book - Part 12 of 196
Mahmoud Samir Fayed
 
Knowledge is Power: Getting out of trouble by understanding Git - Steve Smith...
Knowledge is Power: Getting out of trouble by understanding Git - Steve Smith...Knowledge is Power: Getting out of trouble by understanding Git - Steve Smith...
Knowledge is Power: Getting out of trouble by understanding Git - Steve Smith...
Codemotion
 
State of the CFEngine 2018
State of the CFEngine 2018State of the CFEngine 2018
State of the CFEngine 2018
Nick Anderson
 
JJUG CCC 2011 Spring
JJUG CCC 2011 SpringJJUG CCC 2011 Spring
JJUG CCC 2011 Spring
Kiyotaka Oku
 
Registro de venta
Registro de ventaRegistro de venta
Registro de venta
lupe ga
 
Procesos
ProcesosProcesos
Procesos
PublioScipion
 
Message-based communication patterns in distributed Akka applications
Message-based communication patterns in distributed Akka applicationsMessage-based communication patterns in distributed Akka applications
Message-based communication patterns in distributed Akka applications
Andrii Lashchenko
 
XTW_Import
XTW_ImportXTW_Import
XTW_Import
Luther Quinn
 
Dmxedit
DmxeditDmxedit
Dmxedit
Eddy_TKJ
 
Benchy, python framework for performance benchmarking of Python Scripts
Benchy, python framework for performance benchmarking  of Python ScriptsBenchy, python framework for performance benchmarking  of Python Scripts
Benchy, python framework for performance benchmarking of Python Scripts
Marcel Caraciolo
 
3rd Proj. Update: Integrating SWI-Prolog for Semantic Reasoning in Bioclipse
3rd Proj. Update: Integrating SWI-Prolog for Semantic Reasoning in Bioclipse3rd Proj. Update: Integrating SWI-Prolog for Semantic Reasoning in Bioclipse
3rd Proj. Update: Integrating SWI-Prolog for Semantic Reasoning in Bioclipse
Samuel Lampa
 
RHadoop, R meets Hadoop
RHadoop, R meets HadoopRHadoop, R meets Hadoop
RHadoop, R meets Hadoop
Revolution Analytics
 
mobl - model-driven engineering lecture
mobl - model-driven engineering lecturemobl - model-driven engineering lecture
mobl - model-driven engineering lecture
zefhemel
 
Writing Hadoop Jobs in Scala using Scalding
Writing Hadoop Jobs in Scala using ScaldingWriting Hadoop Jobs in Scala using Scalding
Writing Hadoop Jobs in Scala using Scalding
Toni Cebrián
 

What's hot (20)

Unittesting JavaScript with Evidence
Unittesting JavaScript with EvidenceUnittesting JavaScript with Evidence
Unittesting JavaScript with Evidence
 
The elements of a functional mindset
The elements of a functional mindsetThe elements of a functional mindset
The elements of a functional mindset
 
The Ring programming language version 1.3 book - Part 63 of 88
The Ring programming language version 1.3 book - Part 63 of 88The Ring programming language version 1.3 book - Part 63 of 88
The Ring programming language version 1.3 book - Part 63 of 88
 
Kotlin Coroutines in Practice @ KotlinConf 2018
Kotlin Coroutines in Practice @ KotlinConf 2018Kotlin Coroutines in Practice @ KotlinConf 2018
Kotlin Coroutines in Practice @ KotlinConf 2018
 
Ejemplo radio
Ejemplo radioEjemplo radio
Ejemplo radio
 
EMFPath
EMFPathEMFPath
EMFPath
 
The Ring programming language version 1.7 book - Part 12 of 196
The Ring programming language version 1.7 book - Part 12 of 196The Ring programming language version 1.7 book - Part 12 of 196
The Ring programming language version 1.7 book - Part 12 of 196
 
Knowledge is Power: Getting out of trouble by understanding Git - Steve Smith...
Knowledge is Power: Getting out of trouble by understanding Git - Steve Smith...Knowledge is Power: Getting out of trouble by understanding Git - Steve Smith...
Knowledge is Power: Getting out of trouble by understanding Git - Steve Smith...
 
State of the CFEngine 2018
State of the CFEngine 2018State of the CFEngine 2018
State of the CFEngine 2018
 
JJUG CCC 2011 Spring
JJUG CCC 2011 SpringJJUG CCC 2011 Spring
JJUG CCC 2011 Spring
 
Registro de venta
Registro de ventaRegistro de venta
Registro de venta
 
Procesos
ProcesosProcesos
Procesos
 
Message-based communication patterns in distributed Akka applications
Message-based communication patterns in distributed Akka applicationsMessage-based communication patterns in distributed Akka applications
Message-based communication patterns in distributed Akka applications
 
XTW_Import
XTW_ImportXTW_Import
XTW_Import
 
Dmxedit
DmxeditDmxedit
Dmxedit
 
Benchy, python framework for performance benchmarking of Python Scripts
Benchy, python framework for performance benchmarking  of Python ScriptsBenchy, python framework for performance benchmarking  of Python Scripts
Benchy, python framework for performance benchmarking of Python Scripts
 
3rd Proj. Update: Integrating SWI-Prolog for Semantic Reasoning in Bioclipse
3rd Proj. Update: Integrating SWI-Prolog for Semantic Reasoning in Bioclipse3rd Proj. Update: Integrating SWI-Prolog for Semantic Reasoning in Bioclipse
3rd Proj. Update: Integrating SWI-Prolog for Semantic Reasoning in Bioclipse
 
RHadoop, R meets Hadoop
RHadoop, R meets HadoopRHadoop, R meets Hadoop
RHadoop, R meets Hadoop
 
mobl - model-driven engineering lecture
mobl - model-driven engineering lecturemobl - model-driven engineering lecture
mobl - model-driven engineering lecture
 
Writing Hadoop Jobs in Scala using Scalding
Writing Hadoop Jobs in Scala using ScaldingWriting Hadoop Jobs in Scala using Scalding
Writing Hadoop Jobs in Scala using Scalding
 

Similar to West-Nile-Virus | Kaggle

Ns2programs
Ns2programsNs2programs
Ns2programs
Meenakshi Devi
 
Introduction to R
Introduction to RIntroduction to R
Introduction to R
Sander Kieft
 
Analysis crop trials using climate data
Analysis crop trials using climate dataAnalysis crop trials using climate data
Analysis crop trials using climate data
Alberto Labarga
 
Phylogenetics in R
Phylogenetics in RPhylogenetics in R
Phylogenetics in R
schamber
 
RではじめるTwitter解析
RではじめるTwitter解析RではじめるTwitter解析
RではじめるTwitter解析
Takeshi Arabiki
 
Environmental analysis of crop trials - Van Etten
Environmental analysis of crop trials - Van EttenEnvironmental analysis of crop trials - Van Etten
R (Shiny Package) - Server Side Code for Decision Support System
R (Shiny Package) - Server Side Code for Decision Support SystemR (Shiny Package) - Server Side Code for Decision Support System
R (Shiny Package) - Server Side Code for Decision Support System
Maithreya Chakravarthula
 
From grep to BERT
From grep to BERTFrom grep to BERT
From grep to BERT
QAware GmbH
 
Everything About PowerShell
Everything About PowerShellEverything About PowerShell
Everything About PowerShell
Gaetano Causio
 
Parallel Computing With Dask - PyDays 2017
Parallel Computing With Dask - PyDays 2017Parallel Computing With Dask - PyDays 2017
Parallel Computing With Dask - PyDays 2017
Christian Aichinger
 
Clojure: Practical functional approach on JVM
Clojure: Practical functional approach on JVMClojure: Practical functional approach on JVM
Clojure: Practical functional approach on JVM
sunng87
 
Introduction to Scalding and Monoids
Introduction to Scalding and MonoidsIntroduction to Scalding and Monoids
Introduction to Scalding and Monoids
Hugo Gävert
 
Meet Up - Spark Stream Processing + Kafka
Meet Up - Spark Stream Processing + KafkaMeet Up - Spark Stream Processing + Kafka
Meet Up - Spark Stream Processing + Kafka
Knoldus Inc.
 
Apache Velocity 1.6
Apache Velocity 1.6Apache Velocity 1.6
Apache Velocity 1.6
Henning Schmiedehausen
 
From Query Plan to Query Performance: Supercharging your Apache Spark Queries...
From Query Plan to Query Performance: Supercharging your Apache Spark Queries...From Query Plan to Query Performance: Supercharging your Apache Spark Queries...
From Query Plan to Query Performance: Supercharging your Apache Spark Queries...
Databricks
 
Metaprogramming in Haskell
Metaprogramming in HaskellMetaprogramming in Haskell
Metaprogramming in Haskell
Hiromi Ishii
 
User Defined Aggregation in Apache Spark: A Love Story
User Defined Aggregation in Apache Spark: A Love StoryUser Defined Aggregation in Apache Spark: A Love Story
User Defined Aggregation in Apache Spark: A Love Story
Databricks
 
User Defined Aggregation in Apache Spark: A Love Story
User Defined Aggregation in Apache Spark: A Love StoryUser Defined Aggregation in Apache Spark: A Love Story
User Defined Aggregation in Apache Spark: A Love Story
Databricks
 
Scala 2 + 2 > 4
Scala 2 + 2 > 4Scala 2 + 2 > 4
Scala 2 + 2 > 4
Emil Vladev
 
Watch out: Observables are here to stay
Watch out: Observables are here to stayWatch out: Observables are here to stay
Watch out: Observables are here to stay
Guilherme Ventura
 

Similar to West-Nile-Virus | Kaggle (20)

Ns2programs
Ns2programsNs2programs
Ns2programs
 
Introduction to R
Introduction to RIntroduction to R
Introduction to R
 
Analysis crop trials using climate data
Analysis crop trials using climate dataAnalysis crop trials using climate data
Analysis crop trials using climate data
 
Phylogenetics in R
Phylogenetics in RPhylogenetics in R
Phylogenetics in R
 
RではじめるTwitter解析
RではじめるTwitter解析RではじめるTwitter解析
RではじめるTwitter解析
 
Environmental analysis of crop trials - Van Etten
Environmental analysis of crop trials - Van EttenEnvironmental analysis of crop trials - Van Etten
Environmental analysis of crop trials - Van Etten
 
R (Shiny Package) - Server Side Code for Decision Support System
R (Shiny Package) - Server Side Code for Decision Support SystemR (Shiny Package) - Server Side Code for Decision Support System
R (Shiny Package) - Server Side Code for Decision Support System
 
From grep to BERT
From grep to BERTFrom grep to BERT
From grep to BERT
 
Everything About PowerShell
Everything About PowerShellEverything About PowerShell
Everything About PowerShell
 
Parallel Computing With Dask - PyDays 2017
Parallel Computing With Dask - PyDays 2017Parallel Computing With Dask - PyDays 2017
Parallel Computing With Dask - PyDays 2017
 
Clojure: Practical functional approach on JVM
Clojure: Practical functional approach on JVMClojure: Practical functional approach on JVM
Clojure: Practical functional approach on JVM
 
Introduction to Scalding and Monoids
Introduction to Scalding and MonoidsIntroduction to Scalding and Monoids
Introduction to Scalding and Monoids
 
Meet Up - Spark Stream Processing + Kafka
Meet Up - Spark Stream Processing + KafkaMeet Up - Spark Stream Processing + Kafka
Meet Up - Spark Stream Processing + Kafka
 
Apache Velocity 1.6
Apache Velocity 1.6Apache Velocity 1.6
Apache Velocity 1.6
 
From Query Plan to Query Performance: Supercharging your Apache Spark Queries...
From Query Plan to Query Performance: Supercharging your Apache Spark Queries...From Query Plan to Query Performance: Supercharging your Apache Spark Queries...
From Query Plan to Query Performance: Supercharging your Apache Spark Queries...
 
Metaprogramming in Haskell
Metaprogramming in HaskellMetaprogramming in Haskell
Metaprogramming in Haskell
 
User Defined Aggregation in Apache Spark: A Love Story
User Defined Aggregation in Apache Spark: A Love StoryUser Defined Aggregation in Apache Spark: A Love Story
User Defined Aggregation in Apache Spark: A Love Story
 
User Defined Aggregation in Apache Spark: A Love Story
User Defined Aggregation in Apache Spark: A Love StoryUser Defined Aggregation in Apache Spark: A Love Story
User Defined Aggregation in Apache Spark: A Love Story
 
Scala 2 + 2 > 4
Scala 2 + 2 > 4Scala 2 + 2 > 4
Scala 2 + 2 > 4
 
Watch out: Observables are here to stay
Watch out: Observables are here to stayWatch out: Observables are here to stay
Watch out: Observables are here to stay
 

West-Nile-Virus | Kaggle

  • 1. #install.packages("ISOweek") #install.packages("stringr") #install.packages("caTools") #install.packages("TTR") #install.packages("gam") #install.packages("data.table") #install.packages("Metrics") #install.packages("randomForest") #install.packages("MASS") #install.packages("e1071") #install.packages("fpc", dependencies = TRUE) #install.packages("geosphere") #install.packages("scatterplot3d") #install.packages("party") #install.packages("adabag") #install.packages("DeducerExtras") #install.packages("h2o") library(ISOweek) library(stringr) library(caTools) library(TTR) require(gam) library(data.table) library(Metrics) require(randomForest) require(MASS) require(e1071) library(fpc) library(geosphere) library(scatterplot3d) require(party) require(adabag) require(DeducerExtras) require(h2o) #Read Raw Data------------------------------------------------------------------ ------------------------------------------------ train <- fread("C:/Users/Maxie/Documents/Kaggle/West Nile Virus/ Working Folder/train.csv") test <- fread("C:/Users/Maxie/Documents/Kaggle/West Nile Virus/Working Folder/test.csv") weather <- fread("C:/Users/Maxie/Documents/Kaggle/West Nile Virus/
  • 2. Working Folder/weather.csv") spray <- fread("C:/Users/Maxie/Documents/Kaggle/West Nile Virus/ Working Folder/spray.csv") #Get Week Number and ReCategorize Species--------------------------------------------------------------- ------------------------ vSpecies<-c(as.character(train$Species),as.character(test$Species)) vSpecies[vSpecies=="UNSPECIFIED CULEX"]<-"CULEX ERRATICUS" vSpecies[-which(vSpecies == "CULEX PIPIENS" | vSpecies == "CULEX PIPIENS/RESTUANS" | vSpecies == "CULEX RESTUANS")] = "CULEX OTHER" vSpecies<-factor(vSpecies,levels=unique(vSpecies)) ## data.table syntax for adding a column; could overwrite the existing column as well train[,Species2:=factor(vSpecies[1:nrow(train)],levels=unique(vSpecies ))] test[,Species2:=factor(vSpecies[(nrow(train) +1):length(vSpecies)],levels=unique(vSpecies))] ## also add some fields for components of the date using simple substrings train[,dMonth:=as.factor(paste(substr(train$Date,6,7)))] train[,dYear:=as.factor(paste(substr(train$Date,1,4)))] train$Date = as.Date(train$Date, format="%Y-%m-%d") xsDate = as.Date(paste0(train$dYear, "0101"), format="%Y%m%d") train$dWeek = as.numeric(paste(floor((train$Date - xsDate + 1)/7))) test[,dMonth:=as.factor(paste(substr(test$Date,6,7)))] test[,dYear:=as.factor(paste(substr(test$Date,1,4)))] test$Date = as.Date(test$Date, format="%Y-%m-%d") tsDate = as.Date(paste0(test$dYear, "0101"), format="%Y%m%d") test$dWeek = as.numeric(paste(floor((test$Date - tsDate + 1)/7))) weather[,dMonth:=as.factor(paste(substr(weather$Date,6,7)))] weather[,dYear:=as.factor(paste(substr(weather$Date,1,4)))] weather$Date = as.Date(weather$Date, format="%Y-%m-%d") tsDate = as.Date(paste0(weather$dYear, "0101"), format="%Y%m%d") weather$dWeek = as.numeric(paste(floor((weather$Date - tsDate + 1)/ 7))) #Train Data Pre-
  • 3. processing------------------------------------------------------------ ------------------------------------------ wnv.train <- data.frame(train[,list(Date, dWeek, Species, Trap, NumMosquitos, Latitude, Longitude, Species2, WnvPresent)]) #Cast Date Type wnv.train$Date <- as.Date(wnv.train$Date) oh.lat <- 41.995 oh.long <- -87.933 mw.lat <- 41.786 mw.long <- -87.752 for (i in 1:nrow(wnv.train)) { #Euclidean Distances wnv.train$dist.oh[i] <- sqrt((wnv.train$Latitude[i]-oh.lat)**2+ (wnv.train$Longitude[i]-oh.long)**2)*111 wnv.train$dist.mw[i] <- sqrt((wnv.train$Latitude[i]-mw.lat)**2+ (wnv.train$Longitude[i]-mw.long)**2)*111 #Haversine Distance wnv.train$disth.oh[i] <- distHaversine(c(wnv.train$Longitude[i], wnv.train$Latitude[i]), c(oh.long, oh.lat))/1000 wnv.train$disth.mw[i] <- distHaversine(c(wnv.train$Longitude[i], wnv.train$Latitude[i]), c(mw.long, mw.lat))/1000 if (wnv.train$dist.oh[i] >= wnv.train$dist.mw[i]) { wnv.train$Station.Euc[i] <- 2 #2 is Midway } else { wnv.train$Station.Euc[i] <- 1 #1 is O'Hare } if (wnv.train$disth.oh[i] >= wnv.train$disth.mw[i]) { wnv.train$Station[i] <- 2 #2 is Midway } else { wnv.train$Station[i] <- 1 #1 is O'Hare } #Creating ISO Week Number #wnv.train$week.num[i] <- substring(ISOweek(wnv.train$Date[i]), 7, 8)
  • 4. } #Weather Data Cleaning-------------------------------------------------------------- ---------------------------------------------- #Choose Variables wnv.weather <- data.frame(weather[,list(Station, Date, Tmax, Tmin, Tavg, Depart, DewPoint, WetBulb, Heat, Cool, PrecipTotal, StnPressure, SeaLevel, ResultSpeed, ResultDir, AvgSpeed)]) #Cast Date Type wnv.weather$Date <- as.Date(wnv.weather$Date) #Create Week Number Variable for (i in 1:nrow(wnv.weather)) { wnv.weather$week.num[i] <- as.numeric(substring(ISOweek(wnv.weather $Date[i]), 7, 8)) #Create Week Markers wnv.weather$date.year[i] <- as.numeric(format(wnv.weather$Date[i], "%Y")) #wnv.weather$date.month[i] <- as.numeric(format(wnv.weather$Date[i], "%m")) #wnv.weather$date.day[i] <- as.numeric(format(wnv.weather$Date[i], "%d")) } #Clean Temperature wnv.weather$Tmax <- as.numeric(as.character(wnv.weather$Tmax)) wnv.weather$Tmin <- as.numeric(as.character(wnv.weather$Tmin)) wnv.weather$Tavg <- as.numeric(as.character(wnv.weather$Tavg)) wnv.weather$Tavg[is.na(wnv.weather$Tavg)] <- (wnv.weather $Tmax[is.na(wnv.weather$Tavg)] + wnv.weather$Tmin[is.na(wnv.weather $Tavg)])/2 #Clean DewPoint wnv.weather$DewPoint <- as.numeric(as.character(wnv.weather$DewPoint))
  • 5. #Clean Depart wnv.weather$Depart[wnv.weather$Depart=="M"] <- NA wnv.weather$Depart <- as.numeric(as.character(wnv.weather$Depart)) #Clean WetBulb wnv.weather$WetBulb <- as.numeric(as.character(wnv.weather$WetBulb)) wnv.weather$WetBulb[849]<- wnv.weather$WetBulb[850] wnv.weather$WetBulb[2411]<- wnv.weather$WetBulb[2412] wnv.weather$WetBulb[2413]<- wnv.weather$WetBulb[2414] wnv.weather$WetBulb[2416]<- wnv.weather$WetBulb[2415] #Clean Heat wnv.weather$Heat <- as.numeric(as.character(wnv.weather$Heat)) wnv.weather$Heat[8]<- wnv.weather$Heat[7] wnv.weather$Heat[506]<- wnv.weather$Heat[505] wnv.weather$Heat[676]<- wnv.weather$Heat[675] wnv.weather$Heat[1638]<- wnv.weather$Heat[1637] wnv.weather$Heat[2068]<- wnv.weather$Heat[2067] wnv.weather$Heat[2212]<- wnv.weather$Heat[2411] wnv.weather$Heat[2502]<- wnv.weather$Heat[2501] wnv.weather$Heat[2512]<- wnv.weather$Heat[2511] wnv.weather$Heat[2526]<- wnv.weather$Heat[2525] wnv.weather$Heat[2580]<- wnv.weather$Heat[2579] wnv.weather$Heat[2812]<- wnv.weather$Heat[2811] #Clean Cool wnv.weather$Cool <- as.numeric(as.character(wnv.weather$Cool)) wnv.weather$Cool[8]<- wnv.weather$Cool[7] wnv.weather$Cool[506]<- wnv.weather$Cool[505] wnv.weather$Cool[676]<- wnv.weather$Cool[675] wnv.weather$Cool[1638]<- wnv.weather$Cool[1637] wnv.weather$Cool[2068]<- wnv.weather$Cool[2067] wnv.weather$Cool[2212]<- wnv.weather$Cool[2411] wnv.weather$Cool[2502]<- wnv.weather$Cool[2501] wnv.weather$Cool[2512]<- wnv.weather$Cool[2511] wnv.weather$Cool[2526]<- wnv.weather$Cool[2525] wnv.weather$Cool[2580]<- wnv.weather$Cool[2579] wnv.weather$Cool[2812]<- wnv.weather$Cool[2811]
  • 6. #Clean PrecipTotal wnv.weather$PrecipTotal <- as.numeric(as.character(wnv.weather $PrecipTotal)) wnv.weather$PrecipTotal[is.na(wnv.weather$PrecipTotal)] <- 0 #Clean StnPressure wnv.weather$StnPressure <- as.numeric(as.character(wnv.weather $StnPressure)) wnv.weather$StnPressure[88]<- wnv.weather$StnPressure[87] wnv.weather$StnPressure[849]<- wnv.weather$StnPressure[850] wnv.weather$StnPressure[2411]<- wnv.weather$StnPressure[2410] wnv.weather$StnPressure[2412]<- wnv.weather$StnPressure[2410] #Clean SeaLevel wnv.weather$SeaLevel <- as.numeric(as.character(wnv.weather$SeaLevel)) wnv.weather$SeaLevel[88]<- wnv.weather$SeaLevel[87] wnv.weather$SeaLevel[833]<- wnv.weather$SeaLevel[834] wnv.weather$SeaLevel[995]<- wnv.weather$SeaLevel[996] wnv.weather$SeaLevel[1733]<- wnv.weather$SeaLevel[1734] wnv.weather$SeaLevel[1746]<- wnv.weather$SeaLevel[1745] wnv.weather$SeaLevel[1757]<- wnv.weather$SeaLevel[1758] wnv.weather$SeaLevel[2068]<- wnv.weather$SeaLevel[2067] wnv.weather$SeaLevel[2091]<- wnv.weather$SeaLevel[2092] wnv.weather$SeaLevel[2744]<- wnv.weather$SeaLevel[2743] #Clean ResultSpeed wnv.weather$ResultSpeed <- as.numeric(as.character(wnv.weather $ResultSpeed)) #Clean ResultDir wnv.weather$ResultDir <- as.numeric(as.character(wnv.weather $ResultDir)) #Clean AvgSpeed wnv.weather$AvgSpeed <- as.numeric(as.character(wnv.weather$AvgSpeed)) wnv.weather$AvgSpeed[is.na(wnv.weather$AvgSpeed)] <- mean(wnv.weather $AvgSpeed, na.rm =TRUE)
  • 7. #Split weather data into O'Hare and Midway wnv.weather.oh <- subset(wnv.weather, wnv.weather$Station == 1) wnv.weather.mw <- subset(wnv.weather, wnv.weather$Station == 2) #Create New Weather Features: temp.mov.avg, prec.mov.avg, deg.day, accumulated degree day for each year Tbase <- 71.6 #22deg C = 71.6deg F wnv.weather.oh$deg.day <- 0 wnv.weather.oh$deg.day.c <- 0 wnv.weather.oh$acc.deg.day <- 0 wnv.weather.oh$acc.deg.day.c <- 0 newyear <- TRUE #For O'Hare start <- 1 for (year in 2007:2014) { iterations <- nrow(subset(wnv.weather.oh, wnv.weather.oh$date.year == year )) for (i in start:(start+iterations-1)) { Tmean <- (wnv.weather.oh$Tmax[i] + wnv.weather.oh$Tmin[i])/2 if ( Tmean > Tbase) { wnv.weather.oh$deg.day[i] <- Tmean - Tbase wnv.weather.oh$deg.day.c[i] <- 0 } else { wnv.weather.oh$deg.day[i] <- 0 wnv.weather.oh$deg.day.c[i] <- Tbase - Tmean } if (newyear == TRUE) { newyear <- FALSE wnv.weather.oh$acc.deg.day[i] <- 0 } else { wnv.weather.oh$acc.deg.day[i] <- wnv.weather.oh$deg.day[i] + wnv.weather.oh$acc.deg.day[i-1] wnv.weather.oh$acc.deg.day.c[i] <- wnv.weather.oh$deg.day.c[i] + wnv.weather.oh$acc.deg.day.c[i-1] } } start <- start + iterations newyear <- TRUE }
  • 8. #For Midway Tbase <- 71.6 #22deg C = 71.6deg F wnv.weather.mw$deg.day <- 0 wnv.weather.mw$deg.day.c <- 0 wnv.weather.mw$acc.deg.day <- 0 wnv.weather.mw$acc.deg.day.c <- 0 newyear <- TRUE start <- 1 for (year in 2007:2014) { iterations <- nrow(subset(wnv.weather.mw, wnv.weather.mw$date.year == year )) for (i in start:(start+iterations-1)) { Tmean <- (wnv.weather.mw$Tmax[i] + wnv.weather.mw$Tmin[i])/2 if ( Tmean > Tbase) { wnv.weather.mw$deg.day[i] <- Tmean - Tbase wnv.weather.mw$deg.day.c[i] <- 0 } else { wnv.weather.mw$deg.day[i] <- 0 wnv.weather.mw$deg.day.c[i] <- Tbase - Tmean } if (newyear == TRUE) { newyear <- FALSE wnv.weather.mw$acc.deg.day[i] <- 0 } else { wnv.weather.mw$acc.deg.day[i] <- wnv.weather.mw$deg.day[i] + wnv.weather.mw$acc.deg.day[i-1] wnv.weather.mw$acc.deg.day.c[i] <- wnv.weather.mw$deg.day.c[i] + wnv.weather.mw$acc.deg.day.c[i-1] } } start <- start + iterations newyear <- TRUE } #Creating Moving Averages and Moving Sums oh.temp.mov.avg.wk <- vector() oh.temp.mov.avg.2wk <- vector() oh.prec.mov.avg.wk <- vector()
  • 9. oh.prec.mov.avg.2wk <- vector() oh.prec.mov.sum.wk <- vector() oh.prec.mov.sum.2wk <- vector() oh.dd.mov.sum.wk <- vector() oh.dd.mov.sum.2wk <- vector() mw.temp.mov.avg.wk <- vector() mw.temp.mov.avg.2wk <- vector() mw.prec.mov.avg.wk <- vector() mw.prec.mov.avg.2wk <- vector() mw.prec.mov.sum.wk <- vector() mw.prec.mov.sum.2wk <- vector() mw.dd.mov.sum.wk <- vector() mw.dd.mov.sum.2wk <- vector() for (year in 2007:2014) { #O'Hare assign(paste("oh.temp.mov.avg.wk", year, sep=""), runmean(wnv.weather.oh$Tavg[wnv.weather.oh$date.year == year], 7, alg="C", endrule="mean")) assign(paste("oh.temp.mov.avg.2wk", year, sep=""), runmean(wnv.weather.oh$Tavg[wnv.weather.oh$date.year == year], 14, alg="C", endrule="mean")) assign(paste("oh.prec.mov.avg.wk", year, sep=""), runmean(wnv.weather.oh$PrecipTotal[wnv.weather.oh$date.year == year], 7, alg="C", endrule="mean")) assign(paste("oh.prec.mov.avg.2wk", year, sep=""), runmean(wnv.weather.oh$PrecipTotal[wnv.weather.oh$date.year == year], 14, alg="C", endrule="mean")) assign(paste("oh.prec.mov.sum.wk", year, sep=""), runSum(wnv.weather.oh$PrecipTotal[wnv.weather.oh$date.year == year], n = 7, cumulative = FALSE)) assign(paste("oh.prec.mov.sum.2wk", year, sep=""), runSum(wnv.weather.oh$PrecipTotal[wnv.weather.oh$date.year == year], n = 14, cumulative = FALSE)) assign(paste("oh.dd.mov.sum.wk", year, sep=""), runSum(wnv.weather.oh$deg.day[wnv.weather.oh$date.year == year], n = 7, cumulative = FALSE)) assign(paste("oh.dd.mov.sum.2wk", year, sep=""), runSum(wnv.weather.oh$deg.day[wnv.weather.oh$date.year == year], n = 14, cumulative = FALSE))
  • 10. oh.temp.mov.avg.wk <- c(oh.temp.mov.avg.wk, get(paste("oh.temp.mov.avg.wk", year, sep=""))) oh.temp.mov.avg.2wk <- c(oh.temp.mov.avg.2wk, get(paste("oh.temp.mov.avg.2wk", year, sep=""))) oh.prec.mov.avg.wk <- c(oh.prec.mov.avg.wk, get(paste("oh.prec.mov.avg.wk", year, sep=""))) oh.prec.mov.avg.2wk <- c(oh.prec.mov.avg.2wk, get(paste("oh.prec.mov.avg.2wk", year, sep=""))) oh.prec.mov.sum.wk <- c(oh.prec.mov.sum.wk, get(paste("oh.prec.mov.sum.wk", year, sep=""))) oh.prec.mov.sum.2wk <- c(oh.prec.mov.sum.2wk, get(paste("oh.prec.mov.sum.2wk", year, sep=""))) oh.dd.mov.sum.wk <- c(oh.dd.mov.sum.wk, get(paste("oh.dd.mov.sum.wk", year, sep=""))) oh.dd.mov.sum.2wk <- c(oh.dd.mov.sum.2wk, get(paste("oh.dd.mov.sum. 2wk", year, sep=""))) #Midway assign(paste("mw.temp.mov.avg.wk", year, sep=""), runmean(wnv.weather.mw$Tavg[wnv.weather.mw$date.year == year], 7, alg="C", endrule="mean")) assign(paste("mw.temp.mov.avg.2wk", year, sep=""), runmean(wnv.weather.mw$Tavg[wnv.weather.mw$date.year == year], 14, alg="C", endrule="mean")) assign(paste("mw.prec.mov.avg.wk", year, sep=""), runmean(wnv.weather.mw$PrecipTotal[wnv.weather.mw$date.year == year], 7, alg="C", endrule="mean")) assign(paste("mw.prec.mov.avg.2wk", year, sep=""), runmean(wnv.weather.mw$PrecipTotal[wnv.weather.mw$date.year == year], 14, alg="C", endrule="mean")) assign(paste("mw.prec.mov.sum.wk", year, sep=""), runSum(wnv.weather.mw$PrecipTotal[wnv.weather.mw$date.year == year], n = 7, cumulative = FALSE)) assign(paste("mw.prec.mov.sum.2wk", year, sep=""), runSum(wnv.weather.mw$PrecipTotal[wnv.weather.mw$date.year == year], n = 14, cumulative = FALSE)) assign(paste("mw.dd.mov.sum.wk", year, sep=""), runSum(wnv.weather.mw$deg.day[wnv.weather.mw$date.year == year], n = 7, cumulative = FALSE)) assign(paste("mw.dd.mov.sum.2wk", year, sep=""), runSum(wnv.weather.mw$deg.day[wnv.weather.mw$date.year == year], n = 14, cumulative = FALSE))
  • 11. mw.temp.mov.avg.wk <- c(mw.temp.mov.avg.wk, get(paste("mw.temp.mov.avg.wk", year, sep=""))) mw.temp.mov.avg.2wk <- c(mw.temp.mov.avg.2wk, get(paste("mw.temp.mov.avg.2wk", year, sep=""))) mw.prec.mov.avg.wk <- c(mw.prec.mov.avg.wk, get(paste("mw.prec.mov.avg.wk", year, sep=""))) mw.prec.mov.avg.2wk <- c(mw.prec.mov.avg.2wk, get(paste("mw.prec.mov.avg.2wk", year, sep=""))) mw.prec.mov.sum.wk <- c(mw.prec.mov.sum.wk, get(paste("mw.prec.mov.sum.wk", year, sep=""))) mw.prec.mov.sum.2wk <- c(mw.prec.mov.sum.2wk, get(paste("mw.prec.mov.sum.2wk", year, sep=""))) mw.dd.mov.sum.wk <- c(mw.dd.mov.sum.wk, get(paste("mw.dd.mov.sum.wk", year, sep=""))) mw.dd.mov.sum.2wk <- c(mw.dd.mov.sum.2wk, get(paste("mw.dd.mov.sum. 2wk", year, sep=""))) } #Add Variable Matrices to Weather Data Frame wnv.weather.oh$temp.mov.avg.wk <- oh.temp.mov.avg.wk wnv.weather.oh$temp.mov.avg.2wk <- oh.temp.mov.avg.2wk wnv.weather.oh$prec.mov.avg.wk <- oh.prec.mov.avg.wk wnv.weather.oh$prec.mov.avg.2wk <- oh.prec.mov.avg.2wk wnv.weather.oh$prec.mov.sum.wk <- oh.prec.mov.sum.wk wnv.weather.oh$prec.mov.sum.2wk <- oh.prec.mov.sum.2wk wnv.weather.oh$dd.mov.sum.wk <- oh.dd.mov.sum.wk wnv.weather.oh$dd.mov.sum.2wk <- oh.dd.mov.sum.2wk wnv.weather.oh$prec.mov.sum.wk[is.na(wnv.weather.oh$prec.mov.sum.wk)] <- 0 wnv.weather.oh$prec.mov.sum.2wk[is.na(wnv.weather.oh$prec.mov.sum. 2wk)] <- 0 wnv.weather.oh$dd.mov.sum.wk[is.na(wnv.weather.oh$dd.mov.sum.wk)] <- 0 wnv.weather.oh$dd.mov.sum.2wk[is.na(wnv.weather.oh$dd.mov.sum.2wk)] <- 0 wnv.weather.mw$temp.mov.avg.wk <- mw.temp.mov.avg.wk
  • 12. wnv.weather.mw$temp.mov.avg.2wk <- mw.temp.mov.avg.2wk wnv.weather.mw$prec.mov.avg.wk <- mw.prec.mov.avg.wk wnv.weather.mw$prec.mov.avg.2wk <- mw.prec.mov.avg.2wk wnv.weather.mw$prec.mov.sum.wk <- mw.prec.mov.sum.wk wnv.weather.mw$prec.mov.sum.2wk <- mw.prec.mov.sum.2wk wnv.weather.mw$dd.mov.sum.wk <- mw.dd.mov.sum.wk wnv.weather.mw$dd.mov.sum.2wk <- mw.dd.mov.sum.2wk wnv.weather.mw$prec.mov.sum.wk[is.na(wnv.weather.mw$prec.mov.sum.wk)] <- 0 wnv.weather.mw$prec.mov.sum.2wk[is.na(wnv.weather.mw$prec.mov.sum. 2wk)] <- 0 wnv.weather.mw$dd.mov.sum.wk[is.na(wnv.weather.mw$dd.mov.sum.wk)] <- 0 wnv.weather.mw$dd.mov.sum.2wk[is.na(wnv.weather.mw$dd.mov.sum.2wk)] <- 0 options(scipen=100) #Test Data Pre- processing------------------------------------------------------------ ------------------------------------------ wnv.test <- data.frame(test[,list(Date, Id, dWeek, Species, Trap, Latitude, Longitude, Species2)]) #Cast Date Type wnv.test$Date <- as.Date(wnv.test$Date) oh.lat <- 41.995 oh.long <- -87.933 mw.lat <- 41.786 mw.long <- -87.752 for (i in 1:nrow(wnv.test)) { #Euclidean Distances wnv.test$dist.oh[i] <- sqrt((wnv.test$Latitude[i]-oh.lat)**2+ (wnv.test$Longitude[i]-oh.long)**2)*111 wnv.test$dist.mw[i] <- sqrt((wnv.test$Latitude[i]-mw.lat)**2+ (wnv.test$Longitude[i]-mw.long)**2)*111 #Haversine Distance
  • 13. wnv.test$disth.oh[i] <- distHaversine(c(wnv.test$Longitude[i], wnv.test$Latitude[i]), c(oh.long, oh.lat))/1000 wnv.test$disth.mw[i] <- distHaversine(c(wnv.test$Longitude[i], wnv.test$Latitude[i]), c(mw.long, mw.lat))/1000 if (wnv.test$dist.oh[i] >= wnv.test$dist.mw[i]) { wnv.test$Station.Euc[i] <- 2 #2 is Midway } else { wnv.test$Station.Euc[i] <- 1 #1 is O'Hare } if (wnv.test$disth.oh[i] >= wnv.test$disth.mw[i]) { wnv.test$Station[i] <- 2 #2 is Midway } else { wnv.test$Station[i] <- 1 #1 is O'Hare } #Creating ISO Week Number #wnv.test$week.num[i] <- substring(ISOweek(wnv.test$Date[i]), 7, 8) } #Concatenating Clean Weather Datasets and Merging with Training/Test Data---------------------------------------------------------- #wnv.train$Species2 <- train$Species2 #wnv.test$Species2 <- test$Species2 wnv.weather.clean <- rbind(wnv.weather.oh, wnv.weather.mw) wnv.train.weather <- merge(wnv.train, wnv.weather.clean, by=c("Date", "Station")) wnv.test.weather <- merge(wnv.test, wnv.weather.clean, by=c("Date", "Station")) #Just use one station... #Sorting wnv.test.weather wnv.test.weather.sorted <- wnv.test.weather[order(wnv.test.weather $Id),]
  • 14. #Training/Testing Split----------------------------------------------------------------- ------------------------------------ wnv.train.weather.1 <- wnv.train.weather[wnv.train.weather$date.year ! = 2007,] wnv.train.weather.2 <- wnv.train.weather[wnv.train.weather$date.year == 2007,] #----------------------------------------------------- wnv.train.weather.1 <- wnv.train.weather[wnv.train.weather$date.year ! = 2009,] wnv.train.weather.2 <- wnv.train.weather[wnv.train.weather$date.year == 2009,] #----------------------------------------------------- #Set aside 2011 data as test, and train on the remaining wnv.train.weather.1 <- wnv.train.weather[wnv.train.weather$date.year ! = 2011,] wnv.train.weather.2 <- wnv.train.weather[wnv.train.weather$date.year == 2011,] #----------------------------------------------------- wnv.train.weather.1 <- wnv.train.weather[wnv.train.weather$date.year ! = 2013,] wnv.train.weather.2 <- wnv.train.weather[wnv.train.weather$date.year == 2013,] #Clustering Locations - DBSCAN---------------------------------------------------------------- --------------------------- #Potential epsilon values: #1/4 mile = 0.003623 #1/2 mile = 0.007246 #1 mile = 0.014493 #2 mile = 0.028986 #3 mile = 0.043479
  • 15. #Based on 1deg Latitude/Longitude = 69 miles x.train <- cbind(wnv.train.weather$Longitude, wnv.train.weather $Latitude) dbs.train <- dbscan(x.train, 0.003623) plot(dbs.train, x.train) print(dbs.train) prediction.train <- predict(dbs.train, x.train) #length(prediction.train) #unique(prediction.train) wnv.train.weather$loc.cluster.qmile <- as.factor(prediction.train) x.test <- cbind(wnv.test.weather.sorted$Longitude, wnv.test.weather.sorted$Latitude) prediction.test <- predict(dbs.train, x.train, x.test) #length(prediction.test) #unique(prediction.test) wnv.test.weather.sorted$loc.cluster.2mile <- as.factor(prediction.test) #Clustering Locations - K- Means----------------------------------------------------------------- ----------------------- #With normalized Longitude & Latitude train.means <- apply(wnv.train.weather[c("Longitude", "Latitude")], 2, mean) train.sds <-apply(wnv.train.weather[c("Longitude", "Latitude")], 2, sd) train.loc.norm <- scale(wnv.train.weather[c("Longitude", "Latitude")],center=means,scale=sds) test.means <- apply(wnv.test.weather.sorted[c("Longitude", "Latitude")], 2, mean) test.sds <-apply(wnv.test.weather.sorted[c("Longitude", "Latitude")], 2, sd) test.loc.norm <- scale(wnv.test.weather.sorted[c("Longitude", "Latitude")],center=means,scale=sds) set.seed(123) clusters.kmeans.norm <- kmeans(train.loc.norm, 20)
  • 16. clusters.kmeans.norm$centers plot(clusters.kmeans.norm$centers) pred.cluster.norm.train <- predict(clusters.kmeans.norm, data=train.loc.norm) pred.cluster.norm.test <- predict(clusters.kmeans.norm, data=test.loc.norm) #With un-normalized Longitude & Latitude train.loc <- wnv.train.weather[c("Longitude", "Latitude")] test.loc <- wnv.test.weather.sorted[c("Longitude", "Latitude")] set.seed(123) clusters.kmeans <- kmeans(train.loc, 25) clusters.kmeans$centers plot(clusters.kmeans$centers) pred.cluster.train <- predict(clusters.kmeans.norm, data=train.loc.norm) pred.cluster.test <- predict(clusters.kmeans.norm, data=test.loc.norm) #Assign to training/testing set wnv.train.weather$kmeans.cluster.n <- as.factor(pred.cluster.norm.train) wnv.test.weather.sorted$kmeans.cluster.n <- as.factor(pred.cluster.norm.test) #PCA------------------------------------------------------------------ ------------------------------------------------- keeps1 <- c("Tavg", "DewPoint", "PrecipTotal", "WetBulb", "StnPressure", "SeaLevel", "ResultSpeed", "ResultDir", "AvgSpeed", "acc.deg.day", "temp.mov.avg.wk", "prec.mov.avg.wk", "prec.mov.sum.wk", "dd.mov.sum.wk" ) keeps2 <- c("Tavg", "DewPoint", "PrecipTotal", "WetBulb", "StnPressure", "SeaLevel", "ResultSpeed", "ResultDir", "AvgSpeed") #model.pca <- princomp(wnv.train.weather[keeps1], cor=TRUE,
  • 17. scores=TRUE) #model.pca model.pca.train1 <- princomp(wnv.train.weather[keeps1], cor=TRUE, scores=TRUE) model.pca.test1 <- princomp(wnv.test.weather.sorted[keeps1], cor=TRUE, scores=TRUE) model.pca.train <- princomp(wnv.train.weather[keeps2], cor=TRUE, scores=TRUE) model.pca.test <- princomp(wnv.test.weather.sorted[keeps2], cor=TRUE, scores=TRUE) plot(model.pca.train) summary(model.pca.train) plot(model.pca.test) summary(model.pca.test) plot(model.pca.train1) summary(model.pca.train1) model.pca.train$scores model.pca.test$scores model.pca.train$loadings model.pca.test$loadings model.pca.train1$loadings pca.out.train <- data.frame(model.pca.train$scores) pca.out.test <- data.frame(model.pca.test$scores) wnv.train.weather <- cbind(wnv.train.weather, pca.out.train) wnv.test.weather.sorted <- cbind(wnv.test.weather.sorted, pca.out.test) #wnv.train.weather.pca <- cbind(wnv.train.weather, pca.out) #Dealing with unbalanced classes---------------------------------------------------------------
  • 18. ------------------------ wnv.train.no <- wnv.train.weather.1[wnv.train.weather.1$WnvPresent == 0,] wnv.train.yes <- wnv.train.weather.1[wnv.train.weather.1$WnvPresent == 1,] set.seed(1234) wnv.train.no.75 <- wnv.train.no[sample(1:nrow(wnv.train.no), 5852, replace=FALSE),] wnv.train.weather.1 <- rbind(wnv.train.no.75, wnv.train.yes) set.seed(1234) wnv.train.no.50 <- wnv.train.no[sample(1:nrow(wnv.train.no), 3901, replace=FALSE),] wnv.train.weather.1 <- rbind(wnv.train.no.50, wnv.train.yes) set.seed(1234) wnv.train.no.25 <- wnv.train.no[sample(1:nrow(wnv.train.no), 1951, replace=FALSE),] wnv.train.weather.1 <- rbind(wnv.train.no.25, wnv.train.yes) set.seed(1234) wnv.train.no.10 <- wnv.train.no[sample(1:nrow(wnv.train.no), 780, replace=FALSE),] wnv.train.weather.1 <- rbind(wnv.train.no.10, wnv.train.yes) set.seed(1234) wnv.train.no.5 <- wnv.train.no[sample(1:nrow(wnv.train.no), 312, replace=FALSE),] wnv.train.weather.1 <- rbind(wnv.train.no.5, wnv.train.yes) #70/30 Split-------------------------------------------- set.seed(1234) #set.seed(4321) ind <- sample(2, nrow(wnv.train.weather), replace=TRUE, prob=c(0.7, 0.3)) trainData <- wnv.train.weather[ind==1,] testData <- wnv.train.weather[ind==2,] #GAM Modeling--------------------------------------------------------------
  • 19. -------------------------------------------------- #Experiment with PCA and Kmeans------------------------------- train.model <- gam(WnvPresent ~ s(dWeek) + Species2 + kmeans.cluster.n + s(Comp.1) + s(Comp.2) + s(Comp.3) + s(Comp.4) + s(Comp.5) + s(acc.deg.day) + s(temp.mov.avg.wk) + s(prec.mov.avg.2wk) #+ s(prec.mov.sum.wk) , data = trainData, family="binomial", bf.maxit = 1000) prediction.gam <-predict(train.model, newdata = testData, type = "response") auc(testData$WnvPresent, prediction.gam) #------------------------------------------------------------- fitSubmit <- update(train.model, data=wnv.train.weather) pSubmit.gam<-predict(fitSubmit, newdata = wnv.test.weather.sorted, type = "response") submissionFile<-cbind(test$Id,pSubmit.gam) colnames(submissionFile)<-c("Id","WnvPresent") options("scipen"=100, "digits"=8) write.csv(submissionFile,"Submission_MaxLi134prec.csv",row.names=FALSE ,quote=FALSE) #GLM------------------------------------------------------------------ -------------------------------------------------------- #Create Squared Term deg.day, dWeek, dd.mov.sum.2wk wnv.train.weather$acc.deg.day2 <- (wnv.train.weather$acc.deg.day)**2 wnv.train.weather$dWeek2 <- (wnv.train.weather$dWeek)**2 wnv.train.weather$dd.mov.sum.2wk2 <- (wnv.train.weather$dd.mov.sum. 2wk)**2 wnv.test.weather.sorted$acc.deg.day2 <- (wnv.test.weather.sorted
  • 20. $acc.deg.day)**2 wnv.test.weather.sorted$dWeek2 <- (wnv.test.weather.sorted$dWeek)**2 wnv.test.weather.sorted$dd.mov.sum.2wk2 <- (wnv.test.weather.sorted $dd.mov.sum.2wk)**2 #Experiment with PCA and Kmeans-------------------------------- log.model <- glm(WnvPresent ~ dWeek + Species2 + kmeans.cluster.n + dWeek2 + Comp.1 #+ Comp.2 #+ Comp.3 #+ Comp.4 #+ Comp.5 #+ acc.deg.day #+ acc.deg.day2 + temp.mov.avg.2wk + prec.mov.avg.2wk , data = trainData, family=binomial()) summary(log.model) prediction.glm <-predict(log.model, newdata = testData, type = "response") auc(testData$WnvPresent, prediction.glm) #------------------------------------------------------------ fitSubmit <- update(log.model, data=wnv.train.weather) pSubmit.glm <-predict(fitSubmit, newdata = wnv.test.weather.sorted, type = "response") #summary(pSubmit.glm) submissionFile<-cbind(test$Id,pSubmit.glm) colnames(submissionFile)<-c("Id","WnvPresent") options("scipen"=100, "digits"=8) write.csv(submissionFile,"Submission_MaxLi.csv",row.names=FALSE,quote= FALSE) #Base Model Submission AUC: 0.64569 #Base Train AUC: 0.78259055 #LDA------------------------------------------------------------------ --------------------------------------------------------
  • 21. lda.model <- lda(WnvPresent ~ dWeek + Species2 + kmeans.cluster.n + Comp.1 + Comp.2 + Comp.3 + Comp.4 + Comp.5 + acc.deg.day + acc.deg.day2 #+ temp.mov.avg.2wk + prec.mov.avg.wk + prec.mov.sum.2wk ,data = trainData) prediction.lda <- predict(lda.model, testData) lda.df <- data.frame(prediction.lda$posterior) auc(testData$WnvPresent, lda.df[,2]) #------------------------------------------------------------------- fitSubmit <- update(lda.model, data=wnv.train.weather) pSubmit<-predict(fitSubmit, newdata = wnv.test.weather.sorted, type = "probability") lda.test <- data.frame(pSubmit$posterior) submissionFile<-cbind(test$Id,lda.df.test[,2]) colnames(submissionFile)<-c("Id","WnvPresent") options("scipen"=100, "digits"=8) write.csv(submissionFile,"Submission_MaxLi.csv",row.names=FALSE,quote= FALSE) #Random Forest---------------------------------------------------------------- ------------------------------------------------ set.seed(1234) rf.model <- randomForest(WnvPresent.C ~ dWeek + Species2 + kmeans.cluster.n + Comp.1 + Comp.2 + Comp.3 + Comp.4 + Comp.5 + acc.deg.day #+ temp.mov.avg.2wk
  • 22. #+ prec.mov.avg.2wk #+ prec.mov.sum.2wk , data = trainData, nTree = 1000) prediction.rf <- predict(rf.model, testData, type = "prob") rf.df <- data.frame(prediction.rf) auc(testData$WnvPresent.C, rf.df[,2]) fitSubmit <- update(rf.model, data=wnv.train.weather) pSubmit.rf<-predict(fitSubmit, newdata = wnv.test.weather.sorted, type = "prob") rf.df.test <- data.frame(pSubmit.rf) submissionFile<-cbind(test$Id, rf.df.test[,2]) colnames(submissionFile)<-c("Id","WnvPresent") options("scipen"=100, "digits"=8) write.csv(submissionFile,"Submission_MaxLi.csv",row.names=FALSE,quote= FALSE) #PCA and Kmeans------------------- set.seed(1234) rf.model <- cforest(WnvPresent.C ~ dWeek + Species2 + kmeans.cluster.n + Comp.1 + Comp.2 + Comp.3 + Comp.4 + Comp.5 #+ acc.deg.day #+ temp.mov.avg.2wk #+ prec.mov.avg.2wk #+ prec.mov.sum.wk ,data = trainData, controls=cforest_unbiased(ntree=200, mtry=2)) prediction.rf <- predict(rf.model, newdata=testData, type = "prob") rf.df <- data.frame(prediction.rf) x <- vector() for (i in seq(2, 6232, 2)) { x <- c(x, rf.df[1, i]) } auc(testData$WnvPresent.C, x) set.seed(1234)
  • 23. rf.model <- cforest(WnvPresent.C ~ dWeek + Species2 + kmeans.cluster.n + Comp.1 + Comp.2 #+ Comp.3 #+ Comp.4 #+ Comp.5 #+ acc.deg.day #+ temp.mov.avg.2wk #+ prec.mov.avg.2wk ,data = wnv.train.weather, controls=cforest_unbiased(ntree=10, mtry=2, maxdepth = 10)) prediction.rf <- predict(rf.model, newdata=wnv.test.weather.sorted, type = "prob") rf.df <- data.frame(prediction.rf) #H20 Random Forest-------------- library(h2o) localH2O <- h2o.init(nthreads = -1,max_mem_size = '7g') rf.model <- h2o.randomForest(x=c(3, 9, 43, 44:48),y = 53,data = trainData, mtries = 18, sample.rate = 0.5, classification = T,ntree = 200,verbose = T) prediction.rf <- h2o.predict(rf.model,testData) #Support Vector Machines-------------------------------------------------------------- ---------------------------------------- #PCA and kmeans---------------------------------------------------- set.seed(1234) svm.model <- svm(WnvPresent.C ~ dWeek + Species2 + kmeans.cluster.n + Comp.1 + Comp.2 + Comp.3 + Comp.4 + Comp.5 + acc.deg.day + temp.mov.avg.2wk #+ prec.mov.avg.wk
  • 24. #+ prec.mov.sum.2wk , data = trainData, probability = TRUE) #summary(svm.model) prediction.svm <- predict(svm.model, testData, probability = TRUE) svm.df <- data.frame(attr(prediction.svm, "probabilities")) auc(testData$WnvPresent.C, svm.df[,2]) #------------------------------------------------------------------ set.seed(1234) fitSubmit <- update(svm.model, data=wnv.train.weather) pSubmit.svm<-predict(fitSubmit, newdata = wnv.test.weather.sorted, probability = TRUE) svm.df.test <- data.frame(attr(pSubmit.svm, "probabilities")) submissionFile<-cbind(test$Id, svm.df.test[,2]) colnames(submissionFile)<-c("Id","WnvPresent") options("scipen"=100, "digits"=8) write.csv(submissionFile,"Submission_MaxLi.csv",row.names=FALSE,quote= FALSE) #Ensemble------------------------------------------------------------- -------------------------------------------------------- gam.train <- as.matrix(prediction.gam) glm.train <- as.matrix(prediction.glm) lda.train <- as.matrix(lda.df[,2]) rf.train <- as.matrix(rf.df[,2]) svm.train <- as.matrix(svm.df[,2]) ens.train <- cbind(gam.train, lda.train, rf.train, svm.train) gam.test <- as.matrix(pSubmit.gam) glm.test <- as.matrix(pSubmit.glm) lda.test <- as.matrix(lda.df.test[,2]) rf.test <- as.matrix(rf.df.test[,2]) svm.test <- as.matrix(svm.df.test[,2]) ens.test <- cbind(lda.test, rf.test, svm.test) w <- matrix(c(0.0, 0.3, 0.1, 0.6), nrow = 4, ncol = 1) ens.model <- ens.train %*% w auc(testData$WnvPresent, ens.model) w <- matrix(c(0.3, 0.1, 0.6), nrow = 3, ncol = 1) ens.model.test <- ens.test %*% w
  • 25. submissionFile<-cbind(test$Id,ens.model.test) colnames(submissionFile)<-c("Id","WnvPresent") options("scipen"=100, "digits"=8) write.csv(submissionFile,"Submission_MaxLiBest3.csv",row.names=FALSE,q uote=FALSE) #Bagging-------------------------------------------------------------- ------------------------------------------------------ set.seed(1234) ind <- sample(2, nrow(wnv.train.weather), replace=TRUE, prob=c(0.7, 0.3)) trainData <- wnv.train.weather[ind==1,] testData <- wnv.train.weather[ind==2,] set.seed(4321) all_data <- wnv.train.weather positions <- sample(nrow(all_data),size=floor((nrow(all_data)/10)*7)) trainBag<- all_data[positions,] testBag<- all_data[-positions,] library(foreach) length_divisor<-6 iterations<-50 predictions<-foreach(m=1:iterations,.combine=cbind) %do% { training_positions <- sample(nrow(trainData), size=floor((nrow(trainData)/length_divisor))) train_pos<-1:nrow(trainData) %in% training_positions gam.bag <- gam(WnvPresent ~ lo(dWeek) + Species2 + lo(Longitude, Latitude) + s(acc.deg.day), data = trainData[train_pos,] , family="binomial", bf.maxit = 45) predict(gam.bag, newdata = testData, type = "response") } predictions<-data.frame(rowMeans(predictions)) auc(testData$WnvPresent.C, predictions) #-------------------------------------------------------------------- library(foreach) length_divisor<-6 iterations<-1000 predictions<-foreach(m=1:iterations,.combine=cbind) %do% { training_positions <- sample(nrow(trainData), size=floor((nrow(trainData)/length_divisor)))
  • 26. train_pos<-1:nrow(trainData) %in% training_positions glm.bag <- glm(WnvPresent ~ dWeek + dWeek2 + Species2 + Longitude + acc.deg.day + acc.deg.day2 , data = trainData[train_pos,], family=binomial()) #summary(log.model) predict(glm.bag, newdata = testData, type = "response") } predictions<-data.frame(rowMeans(predictions)) auc(testData$WnvPresent.C, predictions) #Vs. gam.test <- gam(WnvPresent ~ lo(dWeek) + Species2 + loc.cluster.qmile, data = trainBag, family="binomial", bf.maxit = 200) prediction.gam <-predict(gam.test, newdata = testBag, type = "response") auc(testBag$WnvPresent, prediction.gam) #Adabag Exploration----------------------------------------------------------- --------------------------------------------- wnv.adaboost <- boosting(WnvPresent.C ~ dWeek, data = trainData, boos=TRUE, mfinal = 5)