R Basics and Simulation
https://www4.stat.ncsu.edu/~eceyhan/Forensic.html[3/27/2018 11:23:07 AM]
R Basics and Simulation
About R
R is a free software environment for statistical computing and graphics.
It provides a wide variety of statistical and graphical techniques.
Many classical and modern statistical techniques have been implemented.
A few of these are built into the base R environment, but many are supplied as packages.
A convenient interface is RStudio, an integrated development environment (IDE) for R. It includes a
console, a syntax-highlighting editor that supports direct code execution, and tools for plotting, history,
debugging and workspace management.
Open RStudio
Frequently Used Data Types and R-Objects
Variables are not declared with a data type.
Instead, a variable is assigned an R-object, and the data type of that R-object becomes the data type of the
variable.
Data Type: Numeric, Integer, Character
## Numeric
## Assign a number to the variable num
num <- 3.14
print(num)
## Simple calculation using the variable
print(num + 1)
## Check the class (data type) that num now has
print(class(num))
## Integer
## Convert num to integer type (3.14 becomes 3) and store it in num.int
num.int <- as.integer(num)
num.int
## Check the class (data type) of num.int
class(num.int)
## Character
## Assign a character string to the variable char
char <- "Hello"
R Basics and Simulation
https://www4.stat.ncsu.edu/~eceyhan/Forensic.html[3/27/2018 11:23:07 AM]
## Print the value of char
char
## Check the class (data type) of char
class(char)
R-Objects: Vectors, Matrices, Data Frames
# R objects: vectors, matrices and data frames.
# c() combines its arguments into a single vector.

# A character vector
col <- c("red", "green", "yellow")
col
# A numeric vector
num <- c(1, 2, 3)
num
# Extract the first element of a vector
num[1]

# A matrix can be built from vectors in several ways.
# cbind() binds the vectors together as columns
Mcol <- cbind(num, num, num)
Mcol
# rbind() binds the vectors together as rows
Mrow <- rbind(num, num, num)
Mrow
# matrix() fills a matrix of the requested shape, column by column
M <- matrix(1:9, nrow = 3, ncol = 3)
M
# Now let's try combining a numeric vector and a character vector into a matrix
Mtry <- rbind(num, col)
class(Mtry) # Do you notice what has been changed here?
# Extract the element in row 1, column 3 of a matrix
M[1, 3]

# Create a data frame with one column per vector.
# stringsAsFactors = TRUE stores x as a factor, which was the default before
# R 4.0.0. With a factor column, arithmetic on the whole data frame (df^2)
# gives a warning and NA values; with a character column it stops with an
# error instead.
df <- data.frame(x = col, y = num, stringsAsFactors = TRUE)
df
# Extract a whole column, then a single element of a column
df$x
df$y[3]
R Basics and Simulation
https://www4.stat.ncsu.edu/~eceyhan/Forensic.html[3/27/2018 11:23:07 AM]
Calculation with R: Multiplication, Log, Exponential, Power, Square Root,
and Some Useful Statistics
# Arithmetic on a scalar
x <- 2
x * x
# Arithmetic on a vector is applied element by element
num
num + num
y <- c(0, 1, 2, 3, 4)
log(y)
# Functions such as exp() and sqrt() are applied to every matrix element
M <- matrix(1:9, nrow = 3, ncol = 3)
exp(M)
sqrt(M)
# Arithmetic on a data frame
df
df^2 # why is there a warning message?
# Useful statistics computed over all elements of the matrix
mean(M)
sum(M)
Simulate Random Variables in R
# Draw 10 uniform random numbers between 0 and 1
u <- runif(10)
# Plot the values in the order they were drawn, then as a histogram
plot(u)
hist(u)
# Draw a much larger sample and plot it the same way
u <- runif(1000)
plot(u)
hist(u) # Do you see what has changed?
# Sample from a vector
# Type help(sample) to see the function arguments
# Draw a single number from 1 to 100. The logical is spelled out as FALSE
# because F is an ordinary variable that can be reassigned.
sampx <- sample(x = 1:100, size = 1, replace = FALSE)
sampx
# sample 10 numbers from 1-100
R Basics and Simulation
https://www4.stat.ncsu.edu/~eceyhan/Forensic.html[3/27/2018 11:23:07 AM]
sampx <- sample(x = 1:100, size = 10, replace = FALSE)
sampx
# Rank of each sampled value within the sample (1 = smallest)
rank(sampx)
# Normal random variable, a very useful random variable used in statistics.
# Two separate draws give two different values.
n1 <- rnorm(1)
n1
n2 <- rnorm(1)
n2
# Generate a larger sample to see its distribution
n <- rnorm(1000)
plot(n)
hist(n)
# Remember the shape of this distribution
plot(density(n), main = "Density of n", xlab = "n")
# Setting the same seed before each draw reproduces the same random number
set.seed(123)
n1 <- rnorm(1)
n1
set.seed(123)
n2 <- rnorm(1)
n2
Why Is the Normal Distribution So Useful?
# Let's look at some examples.
# If we flip 30 coins and count the number of heads,
# what do you think the distribution of the count will look like?
# Simulate 30 coin flips
x <- sample(c("head", "tail"), 30, replace = TRUE)
x
# Count the number of heads: each TRUE counts as 1 when summed
x == "head"
sum(x == "head")
# Repeat this 1000 times.
# The result vector is allocated once instead of being grown inside the loop.
headcount <- integer(1000)
for (i in seq_along(headcount)) {
  x <- sample(c("head", "tail"), 30, replace = TRUE)
  headcount[i] <- sum(x == "head")
}
# Plot the distribution, what do you see?
hist(headcount, main = "Head Count in 30 Coin Flips")
# How about we simulate from a different distribution?
# Simulate 30 draws from the uniform distribution and add them up
x <- runif(30)
sum(x)
R Basics and Simulation
https://www4.stat.ncsu.edu/~eceyhan/Forensic.html[3/27/2018 11:23:07 AM]
# Repeat this 1000 times: each repetition sums 30 uniform random numbers.
# The result vector is allocated once instead of being grown inside the loop.
sumunif <- numeric(1000)
for (i in seq_along(sumunif)) {
  x <- runif(30)
  sumunif[i] <- sum(x)
}
# Plot the distribution, what do you see?
hist(sumunif, main = "Sum of Uniform Random Variables")
# The bean machine (quincunx); uncomment the lines below to run the animation
# install.packages("animation")
# library(animation)
# balls = 200
# layers = 15
# ani.options(nmax=balls+layers-2)
# quincunx(balls, layers)
Data Visualization to Classify Glass Fragments Found at a “Crime Scene”
library(MASS)
data(fgl)
# See the data description
help(fgl)
# Print the first rows of the data set
head(fgl)
# Plot each composition versus glass type
plot(RI ~ type, data = fgl, col = 1:6)
plot(Al ~ type, data = fgl, col = 1:6)
plot(K ~ type, data = fgl, col = 1:6)
plot(Ca ~ type, data = fgl, col = 1:6)
# Visualize two compositions versus glass type.
# Pick two compositions: "Ca" and "K"
# Pick two glass types: non-float window glass "WinNF" and vehicle window
# glass "Veh"
# Can we distinguish these two types of glass based on these two compositions?
# droplevels() removes the glass types that are no longer present, so the
# boxplots below show only the two selected types.
fgl_subset <- droplevels(subset(fgl, type %in% c("WinNF", "Veh")))
# One colour per glass type, looked up by type name so that the boxplot
# colours and the per-observation colours always agree.
type_cols <- c(WinNF = "red", Veh = "blue")
cols <- unname(type_cols[as.character(fgl_subset$type)])
# NOTE(review): no random numbers are drawn in this section, so the seed does
# not affect the fixed choice of `test`; presumably left over from a random
# pick. Kept so the random number generator state is unchanged.
set.seed(12)
# Pick one observation, pretending that we don't know the glass type
test <- 22
fgl_subset[test, c("Ca", "K")]
# Remake the boxplots with a focus on these two types of glass; the horizontal
# line marks the value of the test observation.
box_cols <- unname(type_cols[levels(fgl_subset$type)])
plot(Ca ~ type, data = fgl_subset, col = box_cols)
abline(h = fgl_subset[test, "Ca"])
plot(K ~ type, data = fgl_subset, col = box_cols)
abline(h = fgl_subset[test, "K"])
R Basics and Simulation
https://www4.stat.ncsu.edu/~eceyhan/Forensic.html[3/27/2018 11:23:07 AM]
# Scatter plot of the two compositions for every observation except the test
# case. The colour vector is subset with the same index as the data, so each
# point keeps the colour of its own glass type.
plot(fgl_subset[-test, c("Ca", "K")], col = cols[-test], pch = 19,
     main = "Glass type by compositions")
# Add the data point that we left out to see where it lands.
points(fgl_subset[test, c("Ca", "K")], col = "black", pch = 17, cex = 1.5)
legend("topright",
       col = c("red", "blue", "black"),
       legend = c("WinNF", "Veh", "test case"),
       pch = c(19, 19, 17))
# Based on these two compositions of the test glass, can you predict what type
# of glass it is?
print(fgl_subset[test, "type"])

NCCU: Statistics in the Criminal Justice System, R basics and Simulation - Presented by Yawen Guan, Mar 24, 2018

  • 1.
    R Basics andSimulation https://www4.stat.ncsu.edu/~eceyhan/Forensic.html[3/27/2018 11:23:07 AM] R Basics and Simulation About R R is a free software environment for statistical computing and graphics. Provides a wide variety of statistical and graphical techniques Many classical and modern statistical techniques have been implemented. A few of these are built into the base R environment, but many are supplied as packages. Convinient interface, RStudio. It is an integrated development environment (IDE) for R. It includes a console, syntax-highlighting editor that supports direct code execution, as well as tools for plotting, history, debugging and workspace management. Open R Studio Frequently Used Data Types and R-Objects The variables are not declared as some data type. The variables are assigned with R-Objects and the data type of the R-object becomes the data type of the variable. Data Type: Numeric, Integer, Character ## Numeric ## Assign a number to variable num num <- 3.14 print(num) ## simple calculation by calling the variable print(num + 1) ## Let's check the data type that has been assigned to num print(class(num)) ## Integer ## Assign the integer part variable num.int num.int <- as.integer(num) num.int ## Let's check the data type class(num.int) ## Character ## Assign the integer part variable num.int char <- "Hello"
  • 2.
    R Basics andSimulation https://www4.stat.ncsu.edu/~eceyhan/Forensic.html[3/27/2018 11:23:07 AM] char ## Let's check the data type class(char) R-Objects: Vectors, Matrices, Data Frames # Create a vector with more than one element # We use c() function which means to combine the elements into a vector # create a vector of characters col <- c('red','green',"yellow") col # create a vector of numeric num <- c(1,2,3) num # extract elements from vectors num[1] # Create a matrices of vectors # Several way to do this # Use cbind() function which means column combines Mcol <- cbind(num,num,num) Mcol # Use rbind() function which means row combine Mrow <- rbind(num,num,num) Mrow # Use matrix function to fill in each element M <- matrix(1:9,nrow=3,ncol=3) M # Now lets try combining numeric vector and character vector into a matrix Mtry <- rbind(num,col) class(Mtry) # Do you notice what has been changed here? # extract elements from a matrix M[1,3] # Create a data frame df <- data.frame(x = col, y = num) df # extract element from data frame df$x df$y[3]
  • 3.
    R Basics andSimulation https://www4.stat.ncsu.edu/~eceyhan/Forensic.html[3/27/2018 11:23:07 AM] Calculation with R: Multiplication, Log, Exponential ,Power, Square Root and Some Useful Statistics # for scaler x <- 2 x*x # for vector num num + num y <- c(0,1,2,3,4) log(y) # for matrix M <- matrix(1:9,ncol=3,nrow=3) exp(M) sqrt(M) # for data frame df df^2 # why is there a warning message? # Useful statistics mean(M) sum(M) Simulate Random Variables in R # generate uniform variable between 0,1 u <- runif(10) # plot to see what it looks like plot(u) hist(u) # generate more data u <- runif(1000) plot(u) hist(u) # Do you see what has changed? # sample from a vector # Type help(sample) to see the function arguments sampx = sample(x = 1:100, size=1, replace=F) sampx # sample 10 numbers from 1-100
  • 4.
    R Basics andSimulation https://www4.stat.ncsu.edu/~eceyhan/Forensic.html[3/27/2018 11:23:07 AM] sampx = sample(x = 1:100, size=10, replace=F) sampx rank(sampx) # Normal random variable, a very useful random variable used in statistics n1 <- rnorm(1) n1 n2 <- rnorm(1) n2 # Generate a larger sample to see its distribution n <- rnorm(1000) plot(n) hist(n) plot(density(n), main="Density of n",xlab="n") # remember the shape of the distribution # set seed to generate the same random number set.seed(123) n1 <- rnorm(1) n1 set.seed(123) n2 <- rnorm(1) n2 Why is Normal Distribution so Useful? # Let's look at some examples # If we flip 10 coints and count the number of heads. # what do you think the distribution of the count will look like. # simulate 30 coin flips x = sample(c("head","tail"),30,replace = T) x # count the number of heads x == "head" sum( x == "head" ) # repeat this 1000 times headcount <- c() # create an empty vector for (i in 1:1000){ x = sample(c("head","tail"),30,replace = T) headcount[i] <- sum( x == "head" ) } hist(headcount,main="Head Count in 30 Coin Flips") # plot the distribution, what do you see? # How about we simulation from a different distribution? # simulation from uniform distribution x = runif(30) sum(x)
  • 5.
    R Basics andSimulation https://www4.stat.ncsu.edu/~eceyhan/Forensic.html[3/27/2018 11:23:07 AM] # repeat this 1000 times sumunif <- c() # create an empty vector for (i in 1:1000){ x = runif(30) sumunif[i] <- sum(x) } hist(sumunif,main="Sum of Uniform Random Variables") # plot the distribution, what do you see? # The beam machine # install.packages("animation") # library(animation) # balls = 200 # layers = 15 # ani.options(nmax=balls+layers-2) # quincunx(balls, layers) Data Visualization to Classify Glass Fragment Found on “Crime Scene” library(MASS) data(fgl) # See data description help(fgl) # Print out part of the data set head(fgl) # Plot each composition versus glass type plot(RI ~ type, data=fgl, col=c(1:6)) plot(Al ~ type, data=fgl, col=c(1:6)) plot(K ~ type, data=fgl, col=c(1:6)) plot(Ca ~ type, data=fgl, col=c(1:6)) # visualize two compositions versus glass type. # Pick two composition: "Ca" and "K" # Pick two class types: vehical window glass "Veh", and vehicle hadlamps "Head" # Can we distinguish these two types of glass based on these two compositions fgl_subset = subset(fgl,type %in% c("WinNF", "Veh")) cols = ifelse(fgl_subset[,"type"]=="Veh","blue","red") set.seed(12) # pick one observation, pretending that we don't know the glass type test = 22 fgl_subset[test,c("Ca","K")] # remake boxplot with a focus on these two type of glass plot(Ca ~ type, data=fgl_subset, col=c("blue","red")) abline(h=fgl_subset[test,c("Ca")]) plot(K ~ type, data=fgl_subset, col=c("blue","red")) abline(h=fgl_subset[test,c("K")])
  • 6.
    R Basics andSimulation https://www4.stat.ncsu.edu/~eceyhan/Forensic.html[3/27/2018 11:23:07 AM] plot(fgl_subset[-test,c("Ca","K")], col=cols, pch=19,main="Glass type by compositions") # Add the data points that we left out to see where it lands. points(fgl_subset[test,c("Ca","K")], col="black", pch=17,cex = 1.5) legend("topright",col=c("red","blue","black"),legend=c("WinNF","Veh","test case"),pch=c(19,19,17)) # Based on these two composition of the test glass. Can you predict what type of glass it is? print(fgl_subset[test,"type"])