Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
1 of 31 10/18/2020, 4:30 AM
https://www.r-project.org/
Essential tools:
RGui for basic interactive R processing,
Rscript to run batch scripts,
R CMD (to install on Unix/Linux) and its variants.
RStudio is a compelling tool – though defer RStudio until you know R very well – the tool ends up limiting you – it is a bad idea to start with RStudio to learn the language, IMHO.
Reference sites (that I often use; don’t leave home without them):
https://www.r-bloggers.com
https://nabble.com/
http://rfunction.com
https://stackoverflow.com/
https://stats.stackexchange.com/
https://www.datasciencemadesimple.com/
http://www.r-tutor.com/
There are thousands, if not more, of useful R sites you can learn from.
Again, use them to do what you want to get done… otherwise you will be sucked into a vortex.
…
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
2 of 31 10/18/2020, 4:30 AM
ls()
## character(0)
X<-5
7->Y
ifelse(X<Y,'X is Less than Y', 'X is atleast equal to Y')
## [1] "X is Less than Y"
vec<-1:13
is.vector(vec)
## [1] TRUE
vec[4]
## [1] 4
by2<-seq(1,13,2)
(xy2<-seq(1,13,2))
## [1] 1 3 5 7 9 11 13
xy2[4]
## [1] 7
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
3 of 31 10/18/2020, 4:30 AM
is.vector(xy2[4])
## [1] TRUE
length(xy2[4])
## [1] 1
vec[vec %in% by2]
## [1] 1 3 5 7 9 11 13
(xyeven<-seq(0,13,2))
## [1] 0 2 4 6 8 10 12
length(vec)
## [1] 13
mean(vec)
## [1] 7
sd(vec)
## [1] 3.89444
sum(vec)
## [1] 91
cumprod(vec)
## [1] 1 2 6 24 120 720
## [7] 5040 40320 362880 3628800 39916800 479001600
## [13] 6227020800
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
4 of 31 10/18/2020, 4:30 AM
L<-list(X=5,reason="I like 5")
L
## $X
## [1] 5
##
## $reason
## [1] "I like 5"
# Column-major fill (the default): the ten values run down each column in turn,
# so the five zeros occupy columns 1-2 and the top of column 3.
mx<-matrix(c(rep(0,5),1:5),nrow=2,ncol=5) # fixed the error: mx now has the correct values, not ALL zeros
mx
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0 0 0 2 4
## [2,] 0 0 1 3 5
# byrow=TRUE fills row by row instead: zeros in row 1, then 1..5 in row 2.
mxbyr<-matrix(c(rep(0,5),1:5),nrow=2,ncol=5,byrow=TRUE)
mxbyr
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0 0 0 0 0
## [2,] 1 2 3 4 5
# Two-row data frame with a numeric Pop column and a character Continent
# column. Built with data.frame() so the column names are stated once and
# the row names / class attributes are set by R rather than by hand.
dd <- data.frame(
  Pop = c(4.560667108, 1.275920972)
  ,Continent = c('Asia', 'Africa')
  ,stringsAsFactors = FALSE)
dd
## Pop Continent
## 1 4.560667 Asia
## 2 1.275921 Africa
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
5 of 31 10/18/2020, 4:30 AM
# Append each new row as a one-row data frame. Binding c(number, 'text')
# instead first collapses the row to a character vector, which silently
# turns the numeric Pop column into text.
dd<-rbind(dd,data.frame(Pop=4.1570842,Continent='Oceania',stringsAsFactors=FALSE))
dd
## Pop Continent
## 1 4.560667 Asia
## 2 1.275921 Africa
## 3 4.157084 Oceania
# Add a new column; its length must match the number of rows.
dd<-cbind(dd,density=c(100,36,4))
# Every column is given explicitly, so nothing depends on recycling a
# too-short vector across the three columns.
dd<-rbind(dd,data.frame(Pop=0,Continent='pangea',density=0,stringsAsFactors=FALSE))
dd
## Pop Continent density
## 1 4.560667 Asia 100
## 2 1.275921 Africa 36
## 3 4.157084 Oceania 4
## 4 0.000000 pangea 0
which(dd$Pop==0)
## [1] 4
# Drop the placeholder row with a logical mask: dd[-which(cond),] would
# remove EVERY row if no row matched, because -integer(0) selects nothing.
dd<-dd[dd$Pop!=0,]
dd
## Pop Continent density
## 1 4.560667 Asia 100
## 2 1.275921 Africa 36
## 3 4.157084 Oceania 4
# One hand-written seed row per species: leg count, flight flag (0/1),
# height and color.
birds <- data.frame(
  nlegs = rep(2, 5),
  can_fly = c(0, 1, 1, 0, 1),
  height = c(25, 40, 20, 150, 10),
  color = c('black', 'black', 'blue', 'black', 'brown')
)
# Same rows with the species label attached as a fifth, named column.
birds2 <- cbind(
  birds,
  species = c('chicken', 'vulture', 'parrot', 'ostrich', 'sparrow')
)
birds2
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
6 of 31 10/18/2020, 4:30 AM
## nlegs can_fly height color species
## 1 2 0 25 black chicken
## 2 2 1 40 black vulture
## 3 2 1 20 blue parrot
## 4 2 0 150 black ostrich
## 5 2 1 10 brown sparrow
…
# Candidate colors per species; sampled with replacement below.
chickencolors<-c('black','white','red','mixed')
vulturecolors<-c('grey','black','white')
parrotcolors<-c('teal','green','blue','mixed','pink')
ostrichcolors<-c('grey','black')
sparrowcolors<-c('dark cement','brown')
# Five random heights per species: draw 10 normal deviates with the given
# mean and sd, then keep 5 of them. No seed is set in this block, so the
# values differ from run to run.
hchicken<-sample(rnorm(10,25,6),5)
hvulture<-sample(rnorm(10,40,4),5)
hparrot<-sample(rnorm(10,20,2),5)
hostrich<-sample(rnorm(10,150,20),5)
hsparrow<-sample(rnorm(10,10,1),5)
# Stack the seed rows in birds2 on top of five simulated rows per species.
cdset<-rbind(birds2,
  data.frame(nlegs=rep(2,5),can_fly=rep(0,5),height=hchicken,
    color=sample(chickencolors,5,replace=TRUE),species=rep('chicken',5)),
  data.frame(nlegs=rep(2,5),can_fly=rep(1,5),height=hvulture,
    color=sample(vulturecolors,5,replace=TRUE),species=rep('vulture',5)),
  data.frame(nlegs=rep(2,5),can_fly=rep(1,5),height=hparrot,
    color=sample(parrotcolors,5,replace=TRUE),species=rep('parrot',5)),
  data.frame(nlegs=rep(2,5),can_fly=rep(0,5),height=hostrich,
    color=sample(ostrichcolors,5,replace=TRUE),species=rep('ostrich',5)),
  data.frame(nlegs=rep(2,5),can_fly=rep(1,5),height=hsparrow,
    color=sample(sparrowcolors,5,replace=TRUE),species=rep('sparrow',5)))
cdset # just print out the contents
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
7 of 31 10/18/2020, 4:30 AM
## nlegs can_fly height color species
## 1 2 0 25.000000 black chicken
## 2 2 1 40.000000 black vulture
## 3 2 1 20.000000 blue parrot
## 4 2 0 150.000000 black ostrich
## 5 2 1 10.000000 brown sparrow
## 6 2 0 21.795787 red chicken
## 7 2 0 39.459162 mixed chicken
## 8 2 0 22.981968 black chicken
## 9 2 0 17.744720 black chicken
## 10 2 0 25.911222 mixed chicken
## 11 2 1 39.016163 white vulture
## 12 2 1 40.037789 white vulture
## 13 2 1 42.251693 grey vulture
## 14 2 1 39.014589 grey vulture
## 15 2 1 38.475420 white vulture
## 16 2 1 20.316044 mixed parrot
## 17 2 1 22.712721 teal parrot
## 18 2 1 22.840455 mixed parrot
## 19 2 1 14.934359 blue parrot
## 20 2 1 21.195914 blue parrot
## 21 2 0 160.085412 black ostrich
## 22 2 0 140.594205 black ostrich
## 23 2 0 174.088029 grey ostrich
## 24 2 0 157.684178 grey ostrich
## 25 2 0 135.249085 grey ostrich
## 26 2 1 9.295639 dark cement sparrow
## 27 2 1 11.266186 dark cement sparrow
## 28 2 1 9.336063 brown sparrow
## 29 2 1 10.169087 brown sparrow
## 30 2 1 11.060101 brown sparrow
dim(cdset) # what are the dimensions
## [1] 30 5
nrow(cdset) # number of rows
## [1] 30
ncol(cdset) # number of columns
## [1] 5
names(cdset) # data.frames have names matrices dont
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
8 of 31 10/18/2020, 4:30 AM
## [1] "nlegs" "can_fly" "height" "color" "species"
head(cdset)
## nlegs can_fly height color species
## 1 2 0 25.00000 black chicken
## 2 2 1 40.00000 black vulture
## 3 2 1 20.00000 blue parrot
## 4 2 0 150.00000 black ostrich
## 5 2 1 10.00000 brown sparrow
## 6 2 0 21.79579 red chicken
tail(cdset)
## nlegs can_fly height color species
## 25 2 0 135.249085 grey ostrich
## 26 2 1 9.295639 dark cement sparrow
## 27 2 1 11.266186 dark cement sparrow
## 28 2 1 9.336063 brown sparrow
## 29 2 1 10.169087 brown sparrow
## 30 2 1 11.060101 brown sparrow
row.names(cdset)
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "1
4" "15"
## [16] "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28" "2
9" "30"
cdset[1,3]# just one cell
## [1] 25
cdset[1,] # entire observation
## nlegs can_fly height color species
## 1 2 0 25 black chicken
cdset[,3]# entire column or the feature
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
9 of 31 10/18/2020, 4:30 AM
## [1] 25.000000 40.000000 20.000000 150.000000 10.000000 21.795787
## [7] 39.459162 22.981968 17.744720 25.911222 39.016163 40.037789
## [13] 42.251693 39.014589 38.475420 20.316044 22.712721 22.840455
## [19] 14.934359 21.195914 160.085412 140.594205 174.088029 157.684178
## [25] 135.249085 9.295639 11.266186 9.336063 10.169087 11.060101
cdset[cdset$species=='sparrow',]# review just the sparrow data: entire observations
## nlegs can_fly height color species
## 5 2 1 10.000000 brown sparrow
## 26 2 1 9.295639 dark cement sparrow
## 27 2 1 11.266186 dark cement sparrow
## 28 2 1 9.336063 brown sparrow
## 29 2 1 10.169087 brown sparrow
## 30 2 1 11.060101 brown sparrow
cdset[cdset$species=='sparrow',c(1,3,5)]# just some of the columns
## nlegs height species
## 5 2 10.000000 sparrow
## 26 2 9.295639 sparrow
## 27 2 11.266186 sparrow
## 28 2 9.336063 sparrow
## 29 2 10.169087 sparrow
## 30 2 11.060101 sparrow
cdset[cdset$species=='sparrow',c('nlegs','species')]# or by column names
## nlegs species
## 5 2 sparrow
## 26 2 sparrow
## 27 2 sparrow
## 28 2 sparrow
## 29 2 sparrow
## 30 2 sparrow
# Exclude a column with a logical mask over the names; -which(...) would
# drop every column if the name were not present.
cdset[cdset$species=='sparrow',names(cdset)!='species']# filter OUT some columns
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
10 of 31 10/18/2020, 4:30 AM
## nlegs can_fly height color
## 5 2 1 10.000000 brown
## 26 2 1 9.295639 dark cement
## 27 2 1 11.266186 dark cement
## 28 2 1 9.336063 brown
## 29 2 1 10.169087 brown
## 30 2 1 11.060101 brown
# Same idea for several columns: keep every column whose name is NOT listed.
cdset[cdset$species=='sparrow',!(names(cdset)%in%c('nlegs','species'))]
## can_fly height color
## 5 1 10.000000 brown
## 26 1 9.295639 dark cement
## 27 1 11.266186 dark cement
## 28 1 9.336063 brown
## 29 1 10.169087 brown
## 30 1 11.060101 brown
…
…
…
# lapply always returns a list: here, the square of each of 1, 2, 3.
exl <- lapply(1:3, function(value) value * value)
exl
## [[1]]
## [1] 1
##
## [[2]]
## [1] 4
##
## [[3]]
## [1] 9
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
11 of 31 10/18/2020, 4:30 AM
mx
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0 0 0 2 4
## [2,] 0 0 1 3 5
apply(mx,2,sd)
## [1] 0.0000000 0.0000000 0.7071068 0.7071068 0.7071068
(mx1<-sapply(mx,FUN=function(x)x+1))
## [1] 1 1 1 1 1 2 3 4 5 6
…
…
# Zero-padded id pools: products P01..P99 and customers C01..C22.
prodidlist<-sprintf("P%02d",1:99)
cidlist<-sprintf("C%02d",1:22)
# Example basket: one customer buying between 1 and 20 distinct products.
# The outer parentheses print the result of the assignment.
(df<-data.frame(DID=1,CID="C01",
PID=sample(prodidlist,sample(1:20,1),replace=FALSE),
stringsAsFactors=FALSE))
## DID CID PID
## 1 1 C01 P88
## 2 1 C01 P53
## 3 1 C01 P86
## 4 1 C01 P90
## 5 1 C01 P21
## 6 1 C01 P34
(sample(prodidlist,sample(1:20,1),replace=F))
## [1] "P75" "P34" "P40" "P27" "P71" "P06" "P72"
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
12 of 31 10/18/2020, 4:30 AM
# Synthetic purchase log. For each DID in 1..50 (presumably a day id --
# confirm), pick a random non-empty subset of customers, and for each chosen
# customer a basket of 1..20 distinct products. The per-customer frames are
# row-bound per DID, then all DIDs are row-bound into mdf.
mdf<-do.call('rbind',lapply(1:50,FUN=function(x)
{
  # Named differently from the global cidlist so it is not shadowed.
  chosen<-sample(cidlist,sample(seq_along(cidlist),1),replace=FALSE)
  do.call('rbind',lapply(chosen,FUN=
    function(cid)data.frame(DID=x,CID=cid,
      PID=sample(prodidlist,sample(1:20,1),replace=FALSE))))
}
))
# Persist as a plain comma-separated file: header row, no row names, no quoting.
write.table(mdf,
file='purchases.csv',
sep=',',row.names=FALSE,
col.names=TRUE,
quote=FALSE)
head(mdf)
## DID CID PID
## 1 1 C19 P19
## 2 1 C19 P33
## 3 1 C19 P91
## 4 1 C19 P78
## 5 1 C19 P66
## 6 1 C19 P64
nrow(mdf)
## [1] 6436
…
# Read back the file written above so it can be compared with mdf.
rmdf<-read.csv('purchases.csv',header=TRUE,sep=',')
# Titanic training set fetched over HTTP; the URL must be a single unbroken string.
titanic<-read.csv("http://christianherta.de/lehre/dataScience/machineLearning/data/titanic-train.csv",header=TRUE)
head(titanic)
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
13 of 31 10/18/2020, 4:30 AM
## PassengerId Survived Pclass
## 1 1 0 3
## 2 2 1 1
## 3 3 1 3
## 4 4 1 1
## 5 5 0 3
## 6 6 0 3
## Name Sex Age SibSp Pa
rch
## 1 Braund, Mr. Owen Harris male 22 1
0
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1
0
## 3 Heikkinen, Miss. Laina female 26 0
0
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1
0
## 5 Allen, Mr. William Henry male 35 0
0
## 6 Moran, Mr. James male NA 0
0
## Ticket Fare Cabin Embarked
## 1 A/5 21171 7.2500 S
## 2 PC 17599 71.2833 C85 C
## 3 STON/O2. 3101282 7.9250 S
## 4 113803 53.1000 C123 S
## 5 373450 8.0500 S
## 6 330877 8.4583 Q
dim(titanic)
## [1] 891 12
table(mdf==rmdf)
##
## TRUE
## 19308
cumprod(dim(mdf)) # rows, then rows * columns = the number of elements; all of them match, as they should
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
14 of 31 10/18/2020, 4:30 AM
## [1] 6436 19308
nrow(rmdf)* ncol(rmdf)
## [1] 19308
quantmod::getSymbols(c("IBM","SPY"),from='2020-01-01')
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## 'getSymbols' currently uses auto.assign=TRUE by default, but will
## use auto.assign=FALSE in 0.5-0. You will still be able to use
## 'loadSymbols' to automatically load data. getOption("getSymbols.env")
## and getOption("getSymbols.auto.assign") will still be checked for
## alternate defaults.
##
## This message is shown once per session and may be disabled by setting
## options("getSymbols.warning4.0"=FALSE). See ?getSymbols for details.
## [1] "IBM" "SPY"
dim(IBM)
## [1] 201 6
#dim(JNJ)
#quantmod::getSymbols(c("SPY"),from='2020-01-01')
head(IBM)
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
15 of 31 10/18/2020, 4:30 AM
## IBM.Open IBM.High IBM.Low IBM.Close IBM.Volume IBM.Adjusted
## 2020-01-02 135.00 135.92 134.77 135.42 3148600 130.5377
## 2020-01-03 133.57 134.86 133.56 134.34 2373700 129.4967
## 2020-01-06 133.42 134.24 133.20 134.10 2425500 129.2654
## 2020-01-07 133.69 134.96 133.40 134.19 3090800 129.3521
## 2020-01-08 134.51 135.86 133.92 135.31 4346000 130.4317
## 2020-01-09 135.74 136.79 135.31 136.74 3730600 131.8102
head(IBM$IBM.Adjusted)
## IBM.Adjusted
## 2020-01-02 130.5377
## 2020-01-03 129.4967
## 2020-01-06 129.2654
## 2020-01-07 129.3521
## 2020-01-08 130.4317
## 2020-01-09 131.8102
# Previous-day series: the first price repeated once, followed by every price
# except the last. head(x,-1) keeps this the same length as the input however
# many rows getSymbols() returns (a hard-coded 200 only fits 201 rows).
DIBM<-c(head(IBM$IBM.Adjusted,1),head(IBM$IBM.Adjusted,-1)) ## fixed the error
# Daily percentage return: (today / previous day - 1) * 100.
# NOTE(review): the result carries DIBM's index, so each return appears to be
# labelled with the previous trading day's date (the first date is printed
# twice in the output) -- confirm whether that labelling matters downstream.
head((dailyIBMReturns<-(((as.numeric(IBM$IBM.Adjusted)/DIBM) -1)*100))) ## fixed the error: results are now full precision
## IBM.Adjusted
## 2020-01-02 0.00000000
## 2020-01-02 -0.79751719
## 2020-01-03 -0.17863237
## 2020-01-06 0.06709531
## 2020-01-07 0.83464824
## 2020-01-08 1.05682335
dailyReturnIBM<-dailyIBMReturns#(IBM[[6]]/DIBM)-1
head(dailyReturnIBM)
## IBM.Adjusted
## 2020-01-02 0.00000000
## 2020-01-02 -0.79751719
## 2020-01-03 -0.17863237
## 2020-01-06 0.06709531
## 2020-01-07 0.83464824
## 2020-01-08 1.05682335
# Previous-day series for SPY, built the same way as for IBM: first price
# repeated, then all prices but the last. head(x,-1) adapts to the number of
# rows downloaded, where a hard-coded 200 would not.
DSPY<-c(head(SPY$SPY.Adjusted,1),head(SPY$SPY.Adjusted,-1)) ## fixed the error
# Daily percentage return: (today / previous day - 1) * 100.
head((dailySPYReturns<-(((as.numeric(SPY$SPY.Adjusted)/DSPY) -1)*100))) ## fixed the error: results are now full precision
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
16 of 31 10/18/2020, 4:30 AM
## SPY.Adjusted
## 2020-01-02 0.0000000
## 2020-01-02 -0.7572182
## 2020-01-03 0.3815075
## 2020-01-06 -0.2811862
## 2020-01-07 0.5329669
## 2020-01-08 0.6780544
dailyReturnSPY<-dailySPYReturns#(SPY[[6]]/DSPY)-1
lmModel<-lm(dailyReturnIBM~dailyReturnSPY)
summary(lmModel)
##
## Call:
## lm(formula = dailyReturnIBM ~ dailyReturnSPY)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.7497 -0.7827 -0.1046 0.6906 7.0070
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.04934 0.09647 -0.511 0.61
## dailyReturnSPY 1.02923 0.04193 24.547 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.367 on 199 degrees of freedom
## Multiple R-squared: 0.7517, Adjusted R-squared: 0.7505
## F-statistic: 602.6 on 1 and 199 DF, p-value: < 2.2e-16
# Draw the two return series in a 2x1 panel layout. par() returns the previous
# settings, which are restored afterwards so later plots are not squeezed into
# the same layout.
oldPar<-par(mfrow=c(2,1))
plot(dailyReturnIBM,col='black') # 'col' is the graphical parameter; 'color' is not one
plot(dailyReturnSPY,col='blue')
par(oldPar)
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
17 of 31 10/18/2020, 4:30 AM
#abline(dailyReturnIBM,dailyReturnSPY)
cor(dailyReturnIBM,dailyReturnSPY)
## SPY.Adjusted
## IBM.Adjusted 0.8670258
#plot(dailyReturnIBM,pch=3,color='black')
#points(dailyReturnSPY,pch=3,color='blue')
library(sqldf)
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
18 of 31 10/18/2020, 4:30 AM
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
purchases<-mdf
rpt01<-sqldf('select DID,count(distinct(CID)) from mdf group by DID')
head(rpt01)
## DID count(distinct(CID))
## 1 1 14
## 2 2 19
## 3 3 6
## 4 4 16
## 5 5 16
## 6 6 9
tail(rpt01)
## DID count(distinct(CID))
## 45 45 22
## 46 46 15
## 47 47 15
## 48 48 19
## 49 49 10
## 50 50 10
sqldf('select distinct CID from mdf where DID=50')
## CID
## 1 C06
## 2 C21
## 3 C03
## 4 C11
## 5 C04
## 6 C17
## 7 C08
## 8 C14
## 9 C18
## 10 C15
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
19 of 31 10/18/2020, 4:30 AM
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
20 of 31 10/18/2020, 4:30 AM
SELECT A.p1,
A.p2,
A.p1p2c / B.p1c AS condProb
FROM (SELECT apid P1,
bpid P2,
Count(*) P1P2C
FROM (SELECT A.did AS ADID,
A.cid AS ACID,
A.pid AS APID,
B.did AS BDID,
B.cid AS BCID,
B.pid AS BPID
FROM purchases A
JOIN purchases B
ON A.cid = B.cid
AND A.did = B.did
AND A.pid < B.pid) X
GROUP BY apid,
bpid) A
JOIN (SELECT pid AS P1,
Count(*) P1C
FROM purchases
GROUP BY pid) B
ON A.p1 = B.p1
ORDER BY condprob DESC;
# Query text for sqldf: self-join purchases on (CID, DID) to count how often
# each ordered product pair (P1 < P2) appears together, then divide by how
# often P1 was purchased at all.
# NOTE: both operands of "/" are integer counts, so the division is an integer
# division and condProb comes back as a whole-number percentage.
# The string is assembled from pieces so that no SQL token is split by line
# wrapping; the two "\n" are the line breaks the original literal contained.
sqlstr<-paste0(
"select A.P1,A.P2, (A.P1P2C*100)/B.P1C as condProb from ( ",
"Select APID P1,BPID P2,count(*) P1P2C from ( ",
"select A.DID as\nADID, A.CID as ACID , A.PID as APID , ",
"B.DID as BDID, B.CID as BCID , B.PID as BPID ",
"from purchases A join purchases B on\n",
"A.CID=B.CID AND A.DID=B.DID AND A.PID < B.PID ) X ",
"group by APID,BPID ) A ",
"join (select PID as P1, count(*) P1C from purchases group by PID) B ",
"on A.P1=B.P1 order by condProb desc")
sqlstr
## [1] "select A.P1,A.P2, (A.P1P2C*100)/B.P1C as condProb from ( Select APID
P1,BPID P2,count(*) P1P2C from ( select A.DID asnADID, A.CID as ACID , A.P
ID as APID , B.DID as BDID, B.CID as BCID , B.PID as BPID from purchases A j
oin purchases B onnA.CID=B.CID AND A.DID=B.DID AND A.PID < B.PID ) X group
by APID,BPID ) A join (select PID as P1, count(*) P1C from purchases group b
y PID) B on A.P1=B.P1 order by condProb desc"
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
21 of 31 10/18/2020, 4:30 AM
condprob<-sqldf(sqlstr)
#condprob
head(condprob)
## P1 P2 condProb
## 1 P27 P61 31
## 2 P66 P97 31
## 3 P04 P92 29
## 4 P27 P53 29
## 5 P27 P95 29
## 6 P52 P79 29
require(rpart)
## Loading required package: rpart
require(rpart.plot)
## Loading required package: rpart.plot
require(klaR)
## Loading required package: klaR
## Loading required package: MASS
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
22 of 31 10/18/2020, 4:30 AM
# Fixed seed so the train/test split drawn below is repeatable.
set.seed(43)
# 20 randomly chosen rows for training; the remaining rows form the test set.
# The row pool follows the actual size of cdset rather than a hard-coded 30.
tridx<-sample(seq_len(nrow(cdset)),20,replace=FALSE)
trdata<-cdset[tridx,]
tstdata<-cdset[-tridx,]
# Classification tree predicting species from all other columns; minsplit=2
# lets rpart attempt a split on nodes holding as few as two observations.
trmodel.rpart<-rpart(species~.,data=trdata,minsplit=2)
rpart.plot(trmodel.rpart)
#compare this to
table(trdata$species)/nrow(trdata)
##
## chicken ostrich parrot sparrow vulture
## 0.20 0.25 0.15 0.20 0.20
predicted.trmodel.rpart<-predict(trmodel.rpart,trdata[,-5],type='class')
table(trdata[,5],predicted.trmodel.rpart)
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
23 of 31 10/18/2020, 4:30 AM
## predicted.trmodel.rpart
## chicken ostrich parrot sparrow vulture
## chicken 4 0 0 0 0
## ostrich 0 5 0 0 0
## parrot 0 0 3 0 0
## sparrow 0 0 0 4 0
## vulture 0 0 0 0 4
# removing colors that are present in test but not in train -- in a small
# data set the tree cannot process that
tstdatnw<-tstdata[tstdata$color %in% trdata$color,]
tstdatnw
## nlegs can_fly height color species
## 10 2 0 25.91122 mixed chicken
## 11 2 1 39.01616 white vulture
## 15 2 1 38.47542 white vulture
## 18 2 1 22.84045 mixed parrot
## 20 2 1 21.19591 blue parrot
## 25 2 0 135.24908 grey ostrich
## 29 2 1 10.16909 brown sparrow
## 30 2 1 11.06010 brown sparrow
predicted.tstdatnw.rpart<-predict(trmodel.rpart,tstdatnw[,-5],type='class')
table(tstdatnw[,5],predicted.tstdatnw.rpart)
## predicted.tstdatnw.rpart
## chicken ostrich parrot sparrow vulture
## chicken 1 0 0 0 0
## ostrich 0 1 0 0 0
## parrot 0 0 2 0 0
## sparrow 0 0 0 2 0
## vulture 0 0 0 0 2
caret::confusionMatrix( table(tstdatnw[,5],predicted.tstdatnw.rpart))
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
24 of 31 10/18/2020, 4:30 AM
## Confusion Matrix and Statistics
##
## predicted.tstdatnw.rpart
## chicken ostrich parrot sparrow vulture
## chicken 1 0 0 0 0
## ostrich 0 1 0 0 0
## parrot 0 0 2 0 0
## sparrow 0 0 0 2 0
## vulture 0 0 0 0 2
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.6306, 1)
## No Information Rate : 0.25
## P-Value [Acc > NIR] : 1.526e-05
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: chicken Class: ostrich Class: parrot Class: s
parrow
## Sensitivity 1.000 1.000 1.00
1.00
## Specificity 1.000 1.000 1.00
1.00
## Pos Pred Value 1.000 1.000 1.00
1.00
## Neg Pred Value 1.000 1.000 1.00
1.00
## Prevalence 0.125 0.125 0.25
0.25
## Detection Rate 0.125 0.125 0.25
0.25
## Detection Prevalence 0.125 0.125 0.25
0.25
## Balanced Accuracy 1.000 1.000 1.00
1.00
## Class: vulture
## Sensitivity 1.00
## Specificity 1.00
## Pos Pred Value 1.00
## Neg Pred Value 1.00
## Prevalence 0.25
## Detection Rate 0.25
## Detection Prevalence 0.25
## Balanced Accuracy 1.00
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
25 of 31 10/18/2020, 4:30 AM
tstdatnw[,5]
## [1] "chicken" "vulture" "vulture" "parrot" "parrot" "ostrich" "sparrow"
## [8] "sparrow"
predicted.tstdatnw.rpart
## 10 11 15 18 20 25 29 30
## chicken vulture vulture parrot parrot ostrich sparrow sparrow
## Levels: chicken ostrich parrot sparrow vulture
table(as.character(tstdatnw[,5]),as.character(predicted.tstdatnw.rpart))
##
## chicken ostrich parrot sparrow vulture
## chicken 1 0 0 0 0
## ostrich 0 1 0 0 0
## parrot 0 0 2 0 0
## sparrow 0 0 0 2 0
## vulture 0 0 0 0 2
caret::confusionMatrix( table(tstdatnw[,5],predicted.tstdatnw.rpart))
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
26 of 31 10/18/2020, 4:30 AM
## Confusion Matrix and Statistics
##
## predicted.tstdatnw.rpart
## chicken ostrich parrot sparrow vulture
## chicken 1 0 0 0 0
## ostrich 0 1 0 0 0
## parrot 0 0 2 0 0
## sparrow 0 0 0 2 0
## vulture 0 0 0 0 2
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.6306, 1)
## No Information Rate : 0.25
## P-Value [Acc > NIR] : 1.526e-05
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: chicken Class: ostrich Class: parrot Class: s
parrow
## Sensitivity 1.000 1.000 1.00
1.00
## Specificity 1.000 1.000 1.00
1.00
## Pos Pred Value 1.000 1.000 1.00
1.00
## Neg Pred Value 1.000 1.000 1.00
1.00
## Prevalence 0.125 0.125 0.25
0.25
## Detection Rate 0.125 0.125 0.25
0.25
## Detection Prevalence 0.125 0.125 0.25
0.25
## Balanced Accuracy 1.000 1.000 1.00
1.00
## Class: vulture
## Sensitivity 1.00
## Specificity 1.00
## Pos Pred Value 1.00
## Neg Pred Value 1.00
## Prevalence 0.25
## Detection Rate 0.25
## Detection Prevalence 0.25
## Balanced Accuracy 1.00
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
27 of 31 10/18/2020, 4:30 AM
…
cdset
## nlegs can_fly height color species
## 1 2 0 25.000000 black chicken
## 2 2 1 40.000000 black vulture
## 3 2 1 20.000000 blue parrot
## 4 2 0 150.000000 black ostrich
## 5 2 1 10.000000 brown sparrow
## 6 2 0 21.795787 red chicken
## 7 2 0 39.459162 mixed chicken
## 8 2 0 22.981968 black chicken
## 9 2 0 17.744720 black chicken
## 10 2 0 25.911222 mixed chicken
## 11 2 1 39.016163 white vulture
## 12 2 1 40.037789 white vulture
## 13 2 1 42.251693 grey vulture
## 14 2 1 39.014589 grey vulture
## 15 2 1 38.475420 white vulture
## 16 2 1 20.316044 mixed parrot
## 17 2 1 22.712721 teal parrot
## 18 2 1 22.840455 mixed parrot
## 19 2 1 14.934359 blue parrot
## 20 2 1 21.195914 blue parrot
## 21 2 0 160.085412 black ostrich
## 22 2 0 140.594205 black ostrich
## 23 2 0 174.088029 grey ostrich
## 24 2 0 157.684178 grey ostrich
## 25 2 0 135.249085 grey ostrich
## 26 2 1 9.295639 dark cement sparrow
## 27 2 1 11.266186 dark cement sparrow
## 28 2 1 9.336063 brown sparrow
## 29 2 1 10.169087 brown sparrow
## 30 2 1 11.060101 brown sparrow
klaR::partimat(Species~.,data=iris,method="lda")
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
28 of 31 10/18/2020, 4:30 AM
cat ("the sample proportions are")
## the sample proportions are
table(cdset$species)
##
## chicken ostrich parrot sparrow vulture
## 6 6 6 6 6
table(cdset$species)/sum(table(cdset$species))
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
29 of 31 10/18/2020, 4:30 AM
##
## chicken ostrich parrot sparrow vulture
## 0.2 0.2 0.2 0.2 0.2
cat ("the training set proportions are")
## the training set proportions are
table(trdata$species)
##
## chicken ostrich parrot sparrow vulture
## 4 5 3 4 4
table(trdata$species)/sum(table(trdata$species))
##
## chicken ostrich parrot sparrow vulture
## 0.20 0.25 0.15 0.20 0.20
cat ("the test set proportions are")
## the test set proportions are
table(tstdata$species)
##
## chicken ostrich parrot sparrow vulture
## 2 1 3 2 2
table(tstdata$species)/sum(table(tstdata$species))
##
## chicken ostrich parrot sparrow vulture
## 0.2 0.1 0.3 0.2 0.2
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
30 of 31 10/18/2020, 4:30 AM
…
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
31 of 31 10/18/2020, 4:30 AM

Chapter 2: R tutorial Handbook for Data Science and Machine Learning Practitioners

  • 1.
  • 2.
    https://www.r-project.org/ Essential tools: RGUI BasicR processing, RScript to run batch scripts, RCMD (to install in Unix/Linux) variants RStudio is a compelling tool – though defer RStudio until you know R very we ll – tools are limiting you – bad idea to start with RStudio to learn the la nguage, IMHO. Reference Sites: (that I often use, don’t leave home without it) https://www.r-bloggers.com https://nabble.com/ http://rfunction.com https://stackoverflow.com/ https://stats.stackexchange.com/ https://www.datasciencemadesimple.com/ http://www.r-tutor.com/ There are thousands if not more, useful R sites you can learn from Again to do what you want to get done…otherwise you will be sucked into vortex.. … Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D... 2 of 31 10/18/2020, 4:30 AM
  • 3.
    ls() ## character(0) X<-5 7->Y ifelse(X<Y,'X isLess than Y', 'X is atleast equal to Y') ## [1] "X is Less than Y" vec<-1:13 is.vector(vec) ## [1] TRUE vec[4] ## [1] 4 by2<-seq(1,13,2) (xy2<-seq(1,13,2)) ## [1] 1 3 5 7 9 11 13 xy2[4] ## [1] 7 Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D... 3 of 31 10/18/2020, 4:30 AM
  • 4.
    is.vector(xy2[4]) ## [1] TRUE length(xy2[4]) ##[1] 1 vec[vec %in% by2] ## [1] 1 3 5 7 9 11 13 (xyeven<-seq(0,13,2)) ## [1] 0 2 4 6 8 10 12 length(vec) ## [1] 13 mean(vec) ## [1] 7 sd(vec) ## [1] 3.89444 sum(vec) ## [1] 91 cumprod(vec) ## [1] 1 2 6 24 120 720 ## [7] 5040 40320 362880 3628800 39916800 479001600 ## [13] 6227020800 Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D... 4 of 31 10/18/2020, 4:30 AM
  • 5.
    L<-list(X=5,reason="I like 5") L ##$X ## [1] 5 ## ## $reason ## [1] "I like 5" mx<-matrix(c(rep(0,5),seq(1:5)),nrow=2,ncol=5) # fixed the error now the mx should have correct values not ALL zeros mx ## [,1] [,2] [,3] [,4] [,5] ## [1,] 0 0 0 2 4 ## [2,] 0 0 1 3 5 mxbyr<-matrix(c(rep(0,5),seq(1:5)),nrow=2,ncol=5,byrow=TRUE) mxbyr ## [,1] [,2] [,3] [,4] [,5] ## [1,] 0 0 0 0 0 ## [2,] 1 2 3 4 5 dd <- structure(list( population = c(4.560667108, 1.275920972) ,continents = c('Asia', 'Africa')) ,.Names = c("Pop", "Continent") ,row.names = c(NA, -2L) ,class = "data.frame") dd ## Pop Continent ## 1 4.560667 Asia ## 2 1.275921 Africa Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D... 5 of 31 10/18/2020, 4:30 AM
  • 6.
    dd<-rbind(dd,c(4.1570842,'Oceania')) dd ## Pop Continent ##1 4.560667108 Asia ## 2 1.275920972 Africa ## 3 4.1570842 Oceania dd<-cbind(dd,density=c(100,36,4)) dd<-rbind(dd,c(0,'pangea')) dd ## Pop Continent density ## 1 4.560667108 Asia 100 ## 2 1.275920972 Africa 36 ## 3 4.1570842 Oceania 4 ## 4 0 pangea 0 which(dd$Pop==0) ## [1] 4 dd<-dd[-which(dd$Pop==0),] dd ## Pop Continent density ## 1 4.560667108 Asia 100 ## 2 1.275920972 Africa 36 ## 3 4.1570842 Oceania 4 birds<-data.frame(nlegs=rep(2,5),can_fly=c(0,1,1,0,1),height=c(25,40,20,150, 10), color=c('black','black','blue','black','brown')) birds2<-cbind(birds,c('chicken','vulture','parrot','ostrich','sparrow')) names(birds2)<-c('nlegs','can_fly','height','color','species') birds2 Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D... 6 of 31 10/18/2020, 4:30 AM
  • 7.
    ## nlegs can_flyheight color species ## 1 2 0 25 black chicken ## 2 2 1 40 black vulture ## 3 2 1 20 blue parrot ## 4 2 0 150 black ostrich ## 5 2 1 10 brown sparrow … chickencolors<-c('black','white','red','mixed') vulturecolors<-c('grey','black','white') parrotcolors<-c('teal','green','blue','mixed','pink') ostrichcolors<-c('grey','black') sparrowcolors<-c('dark cement','brown') hchicken<-sample(rnorm(10,25,6),5) hvulture<-sample(rnorm(10,40,4),5) hparrot<-sample(rnorm(10,20,2),5) hostrich<-sample(rnorm(10,150,20),5) hsparrow<-sample(rnorm(10,10,1),5) cdset<-rbind(birds2,data.frame(nlegs=rep(2,5),can_fly=rep(0,5), height=hchic ken, color=sample(chickencolors,5,replace=T),species=rep('chicken',5)), data.frame(nlegs=rep(2,5),can_fly=rep(1,5), height=hvulture, color=sample(vulturecolors,5,replace=T),species=rep('vulture',5)), data.frame(nlegs=rep(2,5),can_fly=rep(1,5), height=hparrot, color=sample(parrotcolors,5,replace=T),species=rep('parrot',5)), data.frame(nlegs=rep(2,5),can_fly=rep(0,5), height=hostrich, color=sample(ostrichcolors,5,replace=T),species=rep('ostrich',5)), data.frame(nlegs=rep(2,5),can_fly=rep(1,5), height=hsparrow, color=sample(sparrowcolors,5,replace=T),species=rep('sparrow',5))) cdset # just print out the contents Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D... 7 of 31 10/18/2020, 4:30 AM
  • 8.
    ## nlegs can_flyheight color species ## 1 2 0 25.000000 black chicken ## 2 2 1 40.000000 black vulture ## 3 2 1 20.000000 blue parrot ## 4 2 0 150.000000 black ostrich ## 5 2 1 10.000000 brown sparrow ## 6 2 0 21.795787 red chicken ## 7 2 0 39.459162 mixed chicken ## 8 2 0 22.981968 black chicken ## 9 2 0 17.744720 black chicken ## 10 2 0 25.911222 mixed chicken ## 11 2 1 39.016163 white vulture ## 12 2 1 40.037789 white vulture ## 13 2 1 42.251693 grey vulture ## 14 2 1 39.014589 grey vulture ## 15 2 1 38.475420 white vulture ## 16 2 1 20.316044 mixed parrot ## 17 2 1 22.712721 teal parrot ## 18 2 1 22.840455 mixed parrot ## 19 2 1 14.934359 blue parrot ## 20 2 1 21.195914 blue parrot ## 21 2 0 160.085412 black ostrich ## 22 2 0 140.594205 black ostrich ## 23 2 0 174.088029 grey ostrich ## 24 2 0 157.684178 grey ostrich ## 25 2 0 135.249085 grey ostrich ## 26 2 1 9.295639 dark cement sparrow ## 27 2 1 11.266186 dark cement sparrow ## 28 2 1 9.336063 brown sparrow ## 29 2 1 10.169087 brown sparrow ## 30 2 1 11.060101 brown sparrow dim(cdset) # what are the dimensions ## [1] 30 5 nrow(cdset) # number of rows ## [1] 30 ncol(cdset) # number of columns ## [1] 5 names(cdset) # data.frames have names matrices dont Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D... 8 of 31 10/18/2020, 4:30 AM
  • 9.
    ## [1] "nlegs""can_fly" "height" "color" "species" head(cdset) ## nlegs can_fly height color species ## 1 2 0 25.00000 black chicken ## 2 2 1 40.00000 black vulture ## 3 2 1 20.00000 blue parrot ## 4 2 0 150.00000 black ostrich ## 5 2 1 10.00000 brown sparrow ## 6 2 0 21.79579 red chicken tail(cdset) ## nlegs can_fly height color species ## 25 2 0 135.249085 grey ostrich ## 26 2 1 9.295639 dark cement sparrow ## 27 2 1 11.266186 dark cement sparrow ## 28 2 1 9.336063 brown sparrow ## 29 2 1 10.169087 brown sparrow ## 30 2 1 11.060101 brown sparrow row.names(cdset) ## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "1 4" "15" ## [16] "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28" "2 9" "30" cdset[1,3]# just one cell ## [1] 25 cdset[1,] # entire observation ## nlegs can_fly height color species ## 1 2 0 25 black chicken cdset[,3]# entire column or the feature Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D... 9 of 31 10/18/2020, 4:30 AM
  • 10.
    ## [1] 25.000000 40.000000 20.000000 150.000000 10.000000 21.795787 ## [7] 39.459162 22.981968 17.744720 25.911222 39.016163 40.037789 ## [13] 42.251693 39.014589 38.475420 20.316044 22.712721 22.840455 ## [19] 14.934359 21.195914 160.085412 140.594205 174.088029 157.684178 ## [25] 135.249085 9.295639 11.266186 9.336063 10.169087 11.060101 cdset[cdset$species=='sparrow',]# review just the sparrow data.entire observations ## nlegs can_fly height color species ## 5 2 1 10.000000 brown sparrow ## 26 2 1 9.295639 dark cement sparrow ## 27 2 1 11.266186 dark cement sparrow ## 28 2 1 9.336063 brown sparrow ## 29 2 1 10.169087 brown sparrow ## 30 2 1 11.060101 brown sparrow cdset[cdset$species=='sparrow',c(1,3,5)]# just some of the columns ## nlegs height species ## 5 2 10.000000 sparrow ## 26 2 9.295639 sparrow ## 27 2 11.266186 sparrow ## 28 2 9.336063 sparrow ## 29 2 10.169087 sparrow ## 30 2 11.060101 sparrow cdset[cdset$species=='sparrow',c('nlegs','species')]# or by column names ## nlegs species ## 5 2 sparrow ## 26 2 sparrow ## 27 2 sparrow ## 28 2 sparrow ## 29 2 sparrow ## 30 2 sparrow cdset[cdset$species=='sparrow',-which(names(cdset)=='species')]# filter OUT some columns Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D... 10 of 31 10/18/2020, 4:30 AM
  • 11.
    ## nlegs can_fly height color ## 5 2 1 10.000000 brown ## 26 2 1 9.295639 dark cement ## 27 2 1 11.266186 dark cement ## 28 2 1 9.336063 brown ## 29 2 1 10.169087 brown ## 30 2 1 11.060101 brown cdset[cdset$species=='sparrow',-which(names(cdset)%in%c('nlegs','species'))] ## can_fly height color ## 5 1 10.000000 brown ## 26 1 9.295639 dark cement ## 27 1 11.266186 dark cement ## 28 1 9.336063 brown ## 29 1 10.169087 brown ## 30 1 11.060101 brown … … … lapply(1:3,FUN=function(x)x*x) -> exl exl ## [[1]] ## [1] 1 ## ## [[2]] ## [1] 4 ## ## [[3]] ## [1] 9 Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D... 11 of 31 10/18/2020, 4:30 AM
  • 12.
    mx ## [,1] [,2] [,3] [,4] [,5] ## [1,] 0 0 0 2 4 ## [2,] 0 0 1 3 5 apply(mx,2,sd) ## [1] 0.0000000 0.0000000 0.7071068 0.7071068 0.7071068 (mx1<-sapply(mx,FUN=function(x)x+1)) ## [1] 1 1 1 1 1 2 3 4 5 6 … … prodidlist<-c(paste("P0",1:9,sep=''),paste("P",10:99,sep='')) cidlist<-c(paste("C0",1:9,sep=''),paste("C",10:22,sep='')) (df<-data.frame(DID=1,CID="C01", PID=sample(prodidlist,sample(1:20,1),replace=F), stringsAsFactors=F)) ## DID CID PID ## 1 1 C01 P88 ## 2 1 C01 P53 ## 3 1 C01 P86 ## 4 1 C01 P90 ## 5 1 C01 P21 ## 6 1 C01 P34 (sample(prodidlist,sample(1:20,1),replace=F)) ## [1] "P75" "P34" "P40" "P27" "P71" "P06" "P72" Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D... 12 of 31 10/18/2020, 4:30 AM
  • 13.
    mdf<-do.call('rbind',lapply(1:50,FUN=function(x) { cidlist<-sample(cidlist,sample(1:length(cidlist),1),replace=F) dfi<-do.call('rbind',lapply(cidlist,FUN= function(cid)data.frame(DID=x,CID=cid, PID=sample(prodidlist,sample(1:20,1),replace=F)))) } )) write.table(mdf, file='purchases.csv', sep=',',row.names=F, col.names=T, quote=F) head(mdf) ## DID CID PID ## 1 1 C19 P19 ## 2 1 C19 P33 ## 3 1 C19 P91 ## 4 1 C19 P78 ## 5 1 C19 P66 ## 6 1 C19 P64 nrow(mdf) ## [1] 6436 … read.csv('purchases.csv',head=T,sep=',')->rmdf titanic<-read.csv("http://christianherta.de/lehre/dataScience/machineLearning/data/titanic-train.csv",header=T) head(titanic) Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D... 13 of 31 10/18/2020, 4:30 AM
  • 14.
    ## PassengerId Survived Pclass ## 1 1 0 3 ## 2 2 1 1 ## 3 3 1 3 ## 4 4 1 1 ## 5 5 0 3 ## 6 6 0 3 ## Name Sex Age SibSp Parch ## 1 Braund, Mr. Owen Harris male 22 1 0 ## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1 0 ## 3 Heikkinen, Miss. Laina female 26 0 0 ## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0 ## 5 Allen, Mr. William Henry male 35 0 0 ## 6 Moran, Mr. James male NA 0 0 ## Ticket Fare Cabin Embarked ## 1 A/5 21171 7.2500 S ## 2 PC 17599 71.2833 C85 C ## 3 STON/O2. 3101282 7.9250 S ## 4 113803 53.1000 C123 S ## 5 373450 8.0500 S ## 6 330877 8.4583 Q dim(titanic) ## [1] 891 12 table(mdf==rmdf) ## ## TRUE ## 19308 cumprod(dim(mdf)) #rows * columns the number of elements all of them match as they should Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D... 14 of 31 10/18/2020, 4:30 AM
  • 15.
    ## [1] 6436 19308 nrow(rmdf)* ncol(rmdf) ## [1] 19308 quantmod::getSymbols(c("IBM","SPY"),from='2020-01-01') ## Registered S3 method overwritten by 'quantmod': ## method from ## as.zoo.data.frame zoo ## 'getSymbols' currently uses auto.assign=TRUE by default, but will ## use auto.assign=FALSE in 0.5-0. You will still be able to use ## 'loadSymbols' to automatically load data. getOption("getSymbols.env") ## and getOption("getSymbols.auto.assign") will still be checked for ## alternate defaults. ## ## This message is shown once per session and may be disabled by setting ## options("getSymbols.warning4.0"=FALSE). See ?getSymbols for details. ## [1] "IBM" "SPY" dim(IBM) ## [1] 201 6 #dim(JNJ) #quantmod::getSymbols(c("SPY"),from='2020-01-01') head(IBM) Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D... 15 of 31 10/18/2020, 4:30 AM
  • 16.
    ## IBM.Open IBM.High IBM.Low IBM.Close IBM.Volume IBM.Adjusted ## 2020-01-02 135.00 135.92 134.77 135.42 3148600 130.5377 ## 2020-01-03 133.57 134.86 133.56 134.34 2373700 129.4967 ## 2020-01-06 133.42 134.24 133.20 134.10 2425500 129.2654 ## 2020-01-07 133.69 134.96 133.40 134.19 3090800 129.3521 ## 2020-01-08 134.51 135.86 133.92 135.31 4346000 130.4317 ## 2020-01-09 135.74 136.79 135.31 136.74 3730600 131.8102 head(IBM$IBM.Adjusted) ## IBM.Adjusted ## 2020-01-02 130.5377 ## 2020-01-03 129.4967 ## 2020-01-06 129.2654 ## 2020-01-07 129.3521 ## 2020-01-08 130.4317 ## 2020-01-09 131.8102 DIBM<-c(head(IBM$IBM.Adjusted,1),head(IBM$IBM.Adjusted,200)) ## fixed the error head((dailyIBMReturns<-(((as.numeric(IBM$IBM.Adjusted)/DIBM) -1)*100))) ## fixed the error now results are full precision ## IBM.Adjusted ## 2020-01-02 0.00000000 ## 2020-01-02 -0.79751719 ## 2020-01-03 -0.17863237 ## 2020-01-06 0.06709531 ## 2020-01-07 0.83464824 ## 2020-01-08 1.05682335 dailyReturnIBM<-dailyIBMReturns#(IBM[[6]]/DIBM)-1 head(dailyReturnIBM) ## IBM.Adjusted ## 2020-01-02 0.00000000 ## 2020-01-02 -0.79751719 ## 2020-01-03 -0.17863237 ## 2020-01-06 0.06709531 ## 2020-01-07 0.83464824 ## 2020-01-08 1.05682335 DSPY<-c(head(SPY$SPY.Adjusted,1),head(SPY$SPY.Adjusted,200)) ## fixed the error head((dailySPYReturns<-(((as.numeric(SPY$SPY.Adjusted)/DSPY) -1)*100))) ## fixed the error now results are full precision Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D... 16 of 31 10/18/2020, 4:30 AM
  • 17.
    ## SPY.Adjusted ## 2020-01-02 0.0000000 ## 2020-01-02 -0.7572182 ## 2020-01-03 0.3815075 ## 2020-01-06 -0.2811862 ## 2020-01-07 0.5329669 ## 2020-01-08 0.6780544 dailyReturnSPY<-dailySPYReturns#(SPY[[6]]/DSPY)-1 lmModel<-lm(dailyReturnIBM~dailyReturnSPY) summary(lmModel) ## ## Call: ## lm(formula = dailyReturnIBM ~ dailyReturnSPY) ## ## Residuals: ## Min 1Q Median 3Q Max ## -3.7497 -0.7827 -0.1046 0.6906 7.0070 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) -0.04934 0.09647 -0.511 0.61 ## dailyReturnSPY 1.02923 0.04193 24.547 <2e-16 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 1.367 on 199 degrees of freedom ## Multiple R-squared: 0.7517, Adjusted R-squared: 0.7505 ## F-statistic: 602.6 on 1 and 199 DF, p-value: < 2.2e-16 oldPar<-par(mfrow=c(2,1)) plot(dailyReturnIBM,color='black') plot(dailyReturnSPY,color='blue') Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D... 17 of 31 10/18/2020, 4:30 AM
  • 18.
    #abline(dailyReturnIBM,dailyReturnSPY) cor(dailyReturnIBM,dailyReturnSPY) ## SPY.Adjusted ## IBM.Adjusted 0.8670258 #plot(dailyReturnIBM,pch=3,color='black') #points(dailyReturnSPY,pch=3,color='blue') library(sqldf) Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D... 18 of 31 10/18/2020, 4:30 AM
  • 19.
    ## Loading required package: gsubfn ## Loading required package: proto ## Loading required package: RSQLite purchases<-mdf rpt01<-sqldf('select DID,count(distinct(CID)) from mdf group by DID') head(rpt01) ## DID count(distinct(CID)) ## 1 1 14 ## 2 2 19 ## 3 3 6 ## 4 4 16 ## 5 5 16 ## 6 6 9 tail(rpt01) ## DID count(distinct(CID)) ## 45 45 22 ## 46 46 15 ## 47 47 15 ## 48 48 19 ## 49 49 10 ## 50 50 10 sqldf('select distinct CID from mdf where DID=50') ## CID ## 1 C06 ## 2 C21 ## 3 C03 ## 4 C11 ## 5 C04 ## 6 C17 ## 7 C08 ## 8 C14 ## 9 C18 ## 10 C15 Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D... 19 of 31 10/18/2020, 4:30 AM
  • 20.
  • 21.
    SELECT A.p1, A.p2, A.p1p2c /B.p1c AS condProb FROM (SELECT apid P1, bpid P2, Count(*) P1P2C FROM (SELECT A.did AS ADID, A.cid AS ACID, A.pid AS APID, B.did AS BDID, B.cid AS BCID, B.pid AS BPID FROM purchases A JOIN purchases B ON A.cid = B.cid AND A.did = B.did AND A.pid < B.pid) X GROUP BY apid, bpid) A JOIN (SELECT pid AS P1, Count(*) P1C FROM purchases GROUP BY pid) B ON A.p1 = B.p1 ORDER BY condprob DESC; sqlstr<-"select A.P1,A.P2, (A.P1P2C*100)/B.P1C as condProb from ( Select APID P1,BPID P2,count(*) P1P2C from ( select A.DID as ADID, A.CID as ACID , A.PID as APID , B.DID as BDID, B.CID as BCID , B.PID as BPID from purchases A join purchases B on A.CID=B.CID AND A.DID=B.DID AND A.PID < B.PID ) X group by APID,BPID ) A join (select PID as P1, count(*) P1C from purchases group by PID) B on A.P1=B.P1 order by condProb desc" sqlstr ## [1] "select A.P1,A.P2, (A.P1P2C*100)/B.P1C as condProb from ( Select APID P1,BPID P2,count(*) P1P2C from ( select A.DID as\nADID, A.CID as ACID , A.PID as APID , B.DID as BDID, B.CID as BCID , B.PID as BPID from purchases A join purchases B on\nA.CID=B.CID AND A.DID=B.DID AND A.PID < B.PID ) X group by APID,BPID ) A join (select PID as P1, count(*) P1C from purchases group by PID) B on A.P1=B.P1 order by condProb desc" Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D... 21 of 31 10/18/2020, 4:30 AM
  • 22.
    condprob<-sqldf(sqlstr) #condprob head(condprob) ## P1 P2 condProb ## 1 P27 P61 31 ## 2 P66 P97 31 ## 3 P04 P92 29 ## 4 P27 P53 29 ## 5 P27 P95 29 ## 6 P52 P79 29 require(rpart) ## Loading required package: rpart require(rpart.plot) ## Loading required package: rpart.plot require(klaR) ## Loading required package: klaR ## Loading required package: MASS Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D... 22 of 31 10/18/2020, 4:30 AM
  • 23.
    set.seed(43) tridx<-sample(1:30,20,replace=F) trdata<-cdset[tridx,] tstdata<-cdset[-tridx,] trmodel.rpart<-rpart(species~.,data=trdata,minsplit=2) rpart.plot(trmodel.rpart) #compare this to table(trdata$species)/nrow(trdata) ## ## chicken ostrich parrot sparrow vulture ## 0.20 0.25 0.15 0.20 0.20 predicted.trmodel.rpart<-predict(trmodel.rpart,trdata[,-5],type='class') table(trdata[,5],predicted.trmodel.rpart) Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D... 23 of 31 10/18/2020, 4:30 AM
  • 24.
    ## predicted.trmodel.rpart ## chicken ostrich parrot sparrow vulture ## chicken 4 0 0 0 0 ## ostrich 0 5 0 0 0 ## parrot 0 0 3 0 0 ## sparrow 0 0 0 4 0 ## vulture 0 0 0 0 4 # removing colors that are present in test but not in train -- in small data set # tree cannot process that tstdatnw<-tstdata[tstdata$color %in% trdata$color,] tstdatnw ## nlegs can_fly height color species ## 10 2 0 25.91122 mixed chicken ## 11 2 1 39.01616 white vulture ## 15 2 1 38.47542 white vulture ## 18 2 1 22.84045 mixed parrot ## 20 2 1 21.19591 blue parrot ## 25 2 0 135.24908 grey ostrich ## 29 2 1 10.16909 brown sparrow ## 30 2 1 11.06010 brown sparrow predicted.tstdatnw.rpart<-predict(trmodel.rpart,tstdatnw[,-5],type='class') table(tstdatnw[,5],predicted.tstdatnw.rpart) ## predicted.tstdatnw.rpart ## chicken ostrich parrot sparrow vulture ## chicken 1 0 0 0 0 ## ostrich 0 1 0 0 0 ## parrot 0 0 2 0 0 ## sparrow 0 0 0 2 0 ## vulture 0 0 0 0 2 caret::confusionMatrix( table(tstdatnw[,5],predicted.tstdatnw.rpart)) Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D... 24 of 31 10/18/2020, 4:30 AM
  • 25.
    ## Confusion Matrixand Statistics ## ## predicted.tstdatnw.rpart ## chicken ostrich parrot sparrow vulture ## chicken 1 0 0 0 0 ## ostrich 0 1 0 0 0 ## parrot 0 0 2 0 0 ## sparrow 0 0 0 2 0 ## vulture 0 0 0 0 2 ## ## Overall Statistics ## ## Accuracy : 1 ## 95% CI : (0.6306, 1) ## No Information Rate : 0.25 ## P-Value [Acc > NIR] : 1.526e-05 ## ## Kappa : 1 ## ## Mcnemar's Test P-Value : NA ## ## Statistics by Class: ## ## Class: chicken Class: ostrich Class: parrot Class: s parrow ## Sensitivity 1.000 1.000 1.00 1.00 ## Specificity 1.000 1.000 1.00 1.00 ## Pos Pred Value 1.000 1.000 1.00 1.00 ## Neg Pred Value 1.000 1.000 1.00 1.00 ## Prevalence 0.125 0.125 0.25 0.25 ## Detection Rate 0.125 0.125 0.25 0.25 ## Detection Prevalence 0.125 0.125 0.25 0.25 ## Balanced Accuracy 1.000 1.000 1.00 1.00 ## Class: vulture ## Sensitivity 1.00 ## Specificity 1.00 ## Pos Pred Value 1.00 ## Neg Pred Value 1.00 ## Prevalence 0.25 ## Detection Rate 0.25 ## Detection Prevalence 0.25 ## Balanced Accuracy 1.00 Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D... 25 of 31 10/18/2020, 4:30 AM
  • 26.
    tstdatnw[,5] ## [1] "chicken" "vulture" "vulture" "parrot" "parrot" "ostrich" "sparrow" ## [8] "sparrow" predicted.tstdatnw.rpart ## 10 11 15 18 20 25 29 30 ## chicken vulture vulture parrot parrot ostrich sparrow sparrow ## Levels: chicken ostrich parrot sparrow vulture table(as.character(tstdatnw[,5]),as.character(predicted.tstdatnw.rpart)) ## ## chicken ostrich parrot sparrow vulture ## chicken 1 0 0 0 0 ## ostrich 0 1 0 0 0 ## parrot 0 0 2 0 0 ## sparrow 0 0 0 2 0 ## vulture 0 0 0 0 2 caret::confusionMatrix( table(tstdatnw[,5],predicted.tstdatnw.rpart)) Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D... 26 of 31 10/18/2020, 4:30 AM
  • 27.
    ## Confusion Matrixand Statistics ## ## predicted.tstdatnw.rpart ## chicken ostrich parrot sparrow vulture ## chicken 1 0 0 0 0 ## ostrich 0 1 0 0 0 ## parrot 0 0 2 0 0 ## sparrow 0 0 0 2 0 ## vulture 0 0 0 0 2 ## ## Overall Statistics ## ## Accuracy : 1 ## 95% CI : (0.6306, 1) ## No Information Rate : 0.25 ## P-Value [Acc > NIR] : 1.526e-05 ## ## Kappa : 1 ## ## Mcnemar's Test P-Value : NA ## ## Statistics by Class: ## ## Class: chicken Class: ostrich Class: parrot Class: s parrow ## Sensitivity 1.000 1.000 1.00 1.00 ## Specificity 1.000 1.000 1.00 1.00 ## Pos Pred Value 1.000 1.000 1.00 1.00 ## Neg Pred Value 1.000 1.000 1.00 1.00 ## Prevalence 0.125 0.125 0.25 0.25 ## Detection Rate 0.125 0.125 0.25 0.25 ## Detection Prevalence 0.125 0.125 0.25 0.25 ## Balanced Accuracy 1.000 1.000 1.00 1.00 ## Class: vulture ## Sensitivity 1.00 ## Specificity 1.00 ## Pos Pred Value 1.00 ## Neg Pred Value 1.00 ## Prevalence 0.25 ## Detection Rate 0.25 ## Detection Prevalence 0.25 ## Balanced Accuracy 1.00 Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D... 27 of 31 10/18/2020, 4:30 AM
  • 28.
    … cdset ## nlegs can_flyheight color species ## 1 2 0 25.000000 black chicken ## 2 2 1 40.000000 black vulture ## 3 2 1 20.000000 blue parrot ## 4 2 0 150.000000 black ostrich ## 5 2 1 10.000000 brown sparrow ## 6 2 0 21.795787 red chicken ## 7 2 0 39.459162 mixed chicken ## 8 2 0 22.981968 black chicken ## 9 2 0 17.744720 black chicken ## 10 2 0 25.911222 mixed chicken ## 11 2 1 39.016163 white vulture ## 12 2 1 40.037789 white vulture ## 13 2 1 42.251693 grey vulture ## 14 2 1 39.014589 grey vulture ## 15 2 1 38.475420 white vulture ## 16 2 1 20.316044 mixed parrot ## 17 2 1 22.712721 teal parrot ## 18 2 1 22.840455 mixed parrot ## 19 2 1 14.934359 blue parrot ## 20 2 1 21.195914 blue parrot ## 21 2 0 160.085412 black ostrich ## 22 2 0 140.594205 black ostrich ## 23 2 0 174.088029 grey ostrich ## 24 2 0 157.684178 grey ostrich ## 25 2 0 135.249085 grey ostrich ## 26 2 1 9.295639 dark cement sparrow ## 27 2 1 11.266186 dark cement sparrow ## 28 2 1 9.336063 brown sparrow ## 29 2 1 10.169087 brown sparrow ## 30 2 1 11.060101 brown sparrow klaR::partimat(Species~.,data=iris,method="lda") Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D... 28 of 31 10/18/2020, 4:30 AM
  • 29.
    cat ("the sample proportions are") ## the sample proportions are table(cdset$species) ## ## chicken ostrich parrot sparrow vulture ## 6 6 6 6 6 table(cdset$species)/sum(table(cdset$species)) Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D... 29 of 31 10/18/2020, 4:30 AM
  • 30.
    ## ## chicken ostrich parrot sparrow vulture ## 0.2 0.2 0.2 0.2 0.2 cat ("the training set proportions are") ## the training set proportions are table(trdata$species) ## ## chicken ostrich parrot sparrow vulture ## 4 5 3 4 4 table(trdata$species)/sum(table(trdata$species)) ## ## chicken ostrich parrot sparrow vulture ## 0.20 0.25 0.15 0.20 0.20 cat ("the test set proportions are") ## the test set proportions are table(tstdata$species) ## ## chicken ostrich parrot sparrow vulture ## 2 1 3 2 2 table(tstdata$species)/sum(table(tstdata$species)) ## ## chicken ostrich parrot sparrow vulture ## 0.2 0.1 0.3 0.2 0.2 Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D... 30 of 31 10/18/2020, 4:30 AM
  • 31.