BBA 205 BE UNIT 2 economic systems prof dr kanchan.pptx
Introduction to R for Data Science :: Session 6 [Linear Regression in R]
1. Introduction to R for Data Science
Lecturers
dipl. ing Branko Kovač
Data Analyst at CUBE/Data Science Mentor
at Springboard
Data Science zajednica Srbije
branko.kovac@gmail.com
dr Goran S. Milovanović
Data Scientist at DiploFoundation
Data Science zajednica Srbije
goran.s.milovanovic@gmail.com
goranm@diplomacy.edu
2. Linear Regression in R
• Exploratory Data Analysis
• Assumptions of the Linear Model
• Correlation
• Normality Tests
• Linear Regression
• Prediction, Confidence
Intervals, Residuals
• Influential Cases and
the Influence Plot
Intro to R for Data Science
Session 6: Linear Regression in R
3. # Introduction to R for Data Science
# SESSION 6 :: 02 June, 2016
# Clear the workspace. NOTE(review): rm(list = ls()) wipes the user's global
# environment -- tolerable in a teaching slide, avoid it in reusable scripts.
rm(list=ls())
#### Read data: the built-in iris data set from {datasets}
library(datasets)
data(iris)
### iris data set description:
# https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/iris.html
### Exploratory Data Analysis (EDA): inspect structure and summary statistics
str(iris)
summary(iris)
Linear Regression in R
• Before modeling: Assumptions and Exploratory Data Analysis (EDA)
Intro to R for Data Science
Session 6: Linear Regression in R
4. # Introduction to R for Data Science
# SESSION 6 :: 02 June, 2016
### EDA plots
# Plot layout: 2 x 2 panels, filled column-wise
par(mfcol = c(2, 2))
# Boxplot of iris$Sepal.Length
boxplot(iris$Sepal.Length,
        horizontal = TRUE,
        xlab = "Sepal Length")
# Histogram of iris$Sepal.Length on the density (probability) scale,
# so the kernel density overlay below shares the same vertical scale
hist(iris$Sepal.Length,
     main = "",
     xlab = "Sepal.Length",
     prob = TRUE) # was prob = T; always spell out TRUE/FALSE -- T is reassignable
# Overlay the kernel density estimate over the empirical distribution
lines(density(iris$Sepal.Length),
      lty = "dashed",
      lwd = 2.5,
      col = "red")
EDA
Intro to R for Data Science
Session 6: Linear Regression in R
5. Linear Regression in R
• EDA
Intro to R for Data Science
Session 6: Linear Regression in R
6. Intro to R for Data Science
Session 6: Linear Regression in R
7. # Introduction to R for Data Science
# SESSION 6 :: 02 June, 2016
## Pearson correlation in R {base}
cor1 <- cor(iris$Sepal.Length, iris$Petal.Length,
            method = "pearson")
cor1
# Back to a single-panel plot layout
par(mfcol = c(1, 1))
plot(iris$Sepal.Length, iris$Petal.Length,
     main = "Sepal Length vs Petal Length",
     xlab = "Sepal Length", ylab = "Petal Length")
## Correlation matrix and the treatment of missing data
dSet <- iris
# Remove the one discrete (factor) variable: cor() needs numeric columns
dSet$Species <- NULL
# Introduce an NA in dSet$Sepal.Length[5] to demonstrate deletion strategies
dSet$Sepal.Length[5] <- NA
# Pairwise and Listwise Deletion:
cor1a <- cor(dSet, use = "complete.obs")          # listwise deletion
cor1a <- cor(dSet, use = "pairwise.complete.obs") # pairwise deletion
# use = "all.obs" signals an error whenever NAs are present; wrapped in
# tryCatch() so the demo script can run end-to-end instead of aborting here
cor1a <- tryCatch(cor(dSet, use = "all.obs"),
                  error = function(e) {
                    message("cor(use = 'all.obs') failed: ",
                            conditionMessage(e))
                    cor1a # keep the pairwise-deletion result
                  })
Correlation
Intro to R for Data Science
Session 6: Linear Regression in R
8. # Introduction to R for Data Science
# SESSION 6 :: 02 June, 2016
# {Hmisc} rcorr(): correlations, p-values, and n in one call
library(Hmisc)
cor2 <- rcorr(iris$Sepal.Length,
              iris$Petal.Length,
              type = "pearson")
cor2$r       # correlation matrix
cor2$r[1, 2] # that's what you need, right
cor2$P       # p-values of the correlations
cor2$n       # number of observations used
# NOTE: rcorr uses pairwise deletion!
# Correlation matrix over the whole (numeric) data set
cor2a <- rcorr(as.matrix(dSet),
               type = "pearson") # NOTE: rcorr() wants a matrix, not a data.frame
# Keep only the correlations significant at alpha == .05:
# NA-out every r whose p-value is NOT below .05 (the negated test also
# catches the NA diagonal of cor2a$P)
w <- which(!(cor2a$P < .05), arr.ind = TRUE) # was arr.ind = T; spell out TRUE
cor2a$r[w] <- NA
cor2a$P # compare with
cor2a$r
Correlation {Hmisc}
Intro to R for Data Science
Session 6: Linear Regression in R
9. # Introduction to R for Data Science
# SESSION 6 :: 02 June, 2016
# Simple linear regression with lm():
# predicting Petal.Length from Sepal.Length
reg <- lm(Petal.Length ~ Sepal.Length, data = iris)
class(reg)   # an object of class "lm"
summary(reg) # coefficients, R^2, F-test, residual summary
# Extract the fitted coefficients
coefsReg <- coefficients(reg)
coefsReg
slopeReg <- coefsReg[2]     # slope (the Sepal.Length coefficient)
interceptReg <- coefsReg[1] # intercept
# Prediction from this model: 100 new predictor values drawn
# uniformly across the observed range of Sepal.Length
sLengthRange <- range(iris$Sepal.Length)
newSLength <- data.frame(
  Sepal.Length = runif(100, sLengthRange[1], sLengthRange[2])
) # the column name must match the predictor name used in the model!
predictPLength <- predict(reg, newSLength)
predictPLength
Linear Regression with lm()
Intro to R for Data Science
Session 6: Linear Regression in R
10. # Introduction to R for Data Science
# SESSION 6 :: 02 June, 2016
# Standardized regression coefficients {QuantPsyc}
library(QuantPsyc)
lm.beta(reg)
# Reminder: standardized regression coefficients are...
# what you would obtain upon performing linear regression over standardized variables
# z-scores in R: scale() centers and divides by the standard deviation
zSLength <- scale(iris$Sepal.Length, center = TRUE, scale = TRUE) # computes z-score
zPLength <- scale(iris$Petal.Length, center = TRUE, scale = TRUE) # again; ?scale
# New dSet w. standardized variables.
# BUG FIX: the original used `<-` inside the data.frame() call, which assigns
# into the calling environment and mangles the column names; `=` is the
# correct way to name columns in a constructor call.
dSet <- data.frame(Sepal.Length = zSLength,
                   Petal.Length = zPLength)
# Linear regression w. lm() over the standardized variables
reg1 <- lm(Petal.Length ~ Sepal.Length, data = dSet)
summary(reg1)
# Compare:
coefficients(reg1)[2] # beta from reg1
lm.beta(reg) # standardized beta w. QuantPsyc lm.beta from reg
Standardized Regression Coefficients
Intro to R for Data Science
Session 6: Linear Regression in R
11. # Introduction to R for Data Science
# SESSION 6 :: 02 June, 2016
# Plots w. {base} and {ggplot2}
library(ggplot2)
# Predictor vs criterion, {base} graphics
plot(iris$Sepal.Length, iris$Petal.Length,
     main = "Petal Length vs Sepal Length",
     xlab = "Sepal Length",
     ylab = "Petal Length"
)
abline(reg, col = "red") # add the fitted regression line
# Predictor vs criterion, {ggplot2}
# BUG FIX: colour was set as a constant string INSIDE aes(); aes() maps it
# through the default discrete palette (so the line was not actually red)
# and creates a spurious legend. Constant aesthetics belong outside aes().
ggplot(data = iris,
       aes(x = Sepal.Length, y = Petal.Length)) +
  geom_point(size = 2, colour = "black") +
  geom_point(size = 1, colour = "white") +
  geom_smooth(colour = "red",
              method = 'lm') +
  ggtitle("Sepal Length vs Petal Length") +
  xlab("Sepal Length") + ylab("Petal Length") +
  theme(legend.position = "none")
Plots {base} vs {ggplot2}
Intro to R for Data Science
Session 6: Linear Regression in R
12. Plots {base} vs {ggplot2}
Intro to R for Data Science
Session 6: Linear Regression in R
13. # Introduction to R for Data Science
# SESSION 6 :: 02 June, 2016
# Predicted vs. residuals {ggplot2}
predReg <- predict(reg)    # get predictions from reg
resReg <- residuals(reg)   # get raw residuals from reg
resStReg <- rstandard(reg) # standardized residuals; a later slide needs these
plotFrame <- data.frame(predicted = predReg,
                        residual = resReg)
# BUG FIX: colour was a constant inside aes(), so ggplot mapped it through the
# default palette (the line was not blue) and added a spurious legend;
# constant aesthetics go outside aes(). Also se = F -> se = FALSE.
ggplot(data = plotFrame,
       aes(x = predicted, y = residual)) +
  geom_point(size = 2, colour = "black") +
  geom_point(size = 1, colour = "white") +
  geom_smooth(colour = "blue",
              method = 'lm',
              se = FALSE) +
  ggtitle("Predicted vs Residual Lengths") +
  xlab("Predicted Lengths") + ylab("Residual") +
  theme(legend.position = "none")
Predicted vs Residuals
Intro to R for Data Science
Session 6: Linear Regression in R
15. # Introduction to R for Data Science
# SESSION 6 :: 02 June, 2016
## Detect influential cases
infReg <- as.data.frame(influence.measures(reg)$infmat)
# Cook's Distance: Cook and Weisberg (1982):
# values greater than 1 are troublesome
wCook <- which(infReg$cook.d > 1) # we're fine here
# Average Leverage = (k+1)/n, k = num. of predictors, n = num. of observations
# Also termed: hat values; range: 0 - 1
# see: https://en.wikipedia.org/wiki/Leverage_%28statistics%29
# Various criteria (twice the average leverage, three times the average...)
# BUG FIX: the original divided by length(iris$price), but iris has no `price`
# column, so the threshold was 2 * (2/0) == Inf and no case could ever be
# flagged. Use the number of observations, nrow(iris).
wLev <- which(infReg$hat > 2 * (2 / nrow(iris))) # twice the average leverage
## Influence plot
# BUG FIX: resStReg was referenced but never computed (the rstandard() call
# was commented out on an earlier slide); compute the standardized residuals
# here so plotFrame can be built. The redundant recomputation of infReg that
# followed was dropped -- it was identical to the assignment above.
resStReg <- rstandard(reg)
plotFrame <- data.frame(residual = resStReg,
                        leverage = infReg$hat,
                        cookD = infReg$cook.d)
Influential Cases + Influence Plot
Intro to R for Data Science
Session 6: Linear Regression in R
16. # Introduction to R for Data Science
# SESSION 6 :: 02 June, 2016
# Influence plot: standardized residual vs leverage,
# circle size proportional to Cook's distance
ggplot(plotFrame,
       aes(y = residual,
           x = leverage)) +
  geom_point(size = plotFrame$cookD * 100, shape = 1) +
  # BUG FIX: the newline escape was lost in the title ("PlotnSize");
  # restore the intended "\n" line break
  ggtitle("Influence Plot\nSize of the circle corresponds to Cook's distance") +
  theme(plot.title = element_text(size = 8, face = "bold")) +
  ylab("Standardized Residual") + xlab("Leverage")
Influence Plot
Intro to R for Data Science
Session 6: Linear Regression in R