## Quinn McFee Work Sample (R programming)
## This is a sample of performing partial/robust regression, hypothesis testing, and
## other residual diagnostics / remedial measures on various datasets.
## The analysis is written out, but I suggest running the plots in R if you'd like to
## better visualize what I am talking about when referring to outliers, influential points, leverage, etc.
library(faraway)
prostate <- prostate  ## force faraway's lazy-loaded prostate data into the workspace
lcavol<-as.numeric(prostate[,'lcavol'])
lpsa<-as.numeric(prostate[,'lpsa'])
plot(lcavol, lpsa)  ## predictor on the x-axis so abline(model) overlays correctly
model <- lm(lpsa ~ lcavol)
summary(model)
## fitting the regression line:
abline(model)
n <- length(lcavol)  ## n = 97 observations
h <- 1/n + (lcavol - mean(lcavol))^2/sum((lcavol - mean(lcavol))^2)  ## leverage values
lpsa.hat <- fitted(model)
SSE <- sum((lpsa - lpsa.hat)^2)
## externally studentized residuals (df = n - 3 = 94):
ExtStRes <- (lpsa - lpsa.hat)*((n - 3)/(SSE*(1 - h) - (lpsa - lpsa.hat)^2))^(1/2)
plot(lcavol, lpsa, col = ifelse(abs(ExtStRes) >= qt(.975, df = n - 3), 'red', 'black'), main = 'Outliers (red)')
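## As a cross-check (a sketch, not in the original write-up): base R's rstudent() computes
## the same externally studentized residuals, so the hand-rolled values can be verified directly.
all.equal(as.numeric(ExtStRes), as.numeric(rstudent(model)))  ## should be TRUE
which(abs(rstudent(model)) >= qt(.975, df = n - 3))           ## indices of flagged outliers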
## H1: B1-B2=0 and B1-B3=0  (Scientist A)
## H2: B1-B2=0 and B2-B3=0  (Scientist B)
## Test: reject if 0 is not contained in either of the CIs given below.
### We notice that the only thing (or the only difference) that could make
### Scientist A fail to reject her null hypothesis while Scientist B rejects his
### would be when B2.hat and B3.hat are the furthest apart of the estimates.
a<-.05
## let
n<-100
B.hat1<- 1
B.hat2<- 4
B.hat3<- -2
MSE <- 1/qt(1-a/4, n-3)  # chosen so that qt(1-a/4, n-3)*MSE = 1, for simplicity's sake
sB1<-2
sB2<-2
sB3<-2
# s^2(x-y) = s^2(x) + s^2(y), so the standard error of each difference is:
sB12 <- sqrt(sB1^2 + sB2^2)
sB13 <- sqrt(sB1^2 + sB3^2)
sB23 <- sqrt(sB2^2 + sB3^2)
## Scientist A's intervals (1-a/4: two-sided, with a Bonferroni correction for her 2 comparisons):
ci1a <- c(B.hat1 - B.hat2 - qt(1-a/4, n-3)*MSE*sB12, B.hat1 - B.hat2 + qt(1-a/4, n-3)*MSE*sB12)
ci2a <- c(B.hat1 - B.hat3 - qt(1-a/4, n-3)*MSE*sB13, B.hat1 - B.hat3 + qt(1-a/4, n-3)*MSE*sB13)
## Scientist B's intervals:
ci1b <- c(B.hat1 - B.hat2 - qt(1-a/4, n-3)*MSE*sB12, B.hat1 - B.hat2 + qt(1-a/4, n-3)*MSE*sB12)
ci2b <- c(B.hat2 - B.hat3 - qt(1-a/4, n-3)*MSE*sB23, B.hat2 - B.hat3 + qt(1-a/4, n-3)*MSE*sB23)
## Return values, Scientist A:
# > ci1a
# [1] -9.0000000 -0.1715729
# > ci2a
# [1] -4.000000 5.828427
### 0 is included in the second confidence interval (ci2a), so she cannot reject H1
## Return values, Scientist B:
# > ci1b
# [1] -9.0000000 -0.1715729
# > ci2b
# [1] -1.000000 5.828427
### The first confidence interval (ci1b) doesn't include 0, so he can reject H2
### Scientists A and B should each use a Bonferroni correction of two,
## since A and B each make 2 different comparisons in their hypotheses (B1=B2 U B1=B3, and B1=B2 U B2=B3),
## while, to be consistent with each other, they should really be testing
### the union of all three (B1=B2 U B1=B3 U B2=B3).
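## A sketch (not in the original analysis) of that consistent version: all three pairwise
## differences at once, with a Bonferroni correction for g = 3 comparisons.
g <- 3
crit <- qt(1 - a/(2*g), n - 3)
diffs <- c(B1.minus.B2 = B.hat1 - B.hat2, B1.minus.B3 = B.hat1 - B.hat3, B2.minus.B3 = B.hat2 - B.hat3)
ses <- c(sB12, sB13, sB23)
cbind(lower = diffs - crit*MSE*ses, upper = diffs + crit*MSE*ses)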
##3a)
## data table written to a csv file and read in:
deathcigs<-read.csv("Deathcigs.csv")
X<-deathcigs[,'X']
Y<-deathcigs[,'Y']
deathmodel<-lm(Y~X)
plot(X,Y)
abline(deathmodel)
## We see that Great Britain (465, 1145) and the US (190, 1280) are influential points,
## as they are far away from the regression line.
## Deleting the United States:
XnoUS<-X[-c(4)]
YnoUS<-Y[-c(4)]
plot(XnoUS,YnoUS)
deathmodelnoUS<-lm(YnoUS~XnoUS)
abline(deathmodelnoUS)
## Now without Great Britain:
XnoGB<-X[-c(6)]
YnoGB<-Y[-c(6)]
plot(XnoGB,YnoGB)
deathmodelnoGB<-lm(YnoGB~XnoGB)
abline(deathmodelnoGB)
## Note that when comparing the regression fit for each of the scenarios (all points, without US, and without GB),
## the least-squares regression fit is closest to the data when the US is removed, so the US has the largest
## influence on the regression.
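## A quick quantitative check (a sketch, not in the original write-up): Cook's distance
## measures each country's influence on the fitted line directly.
cd <- cooks.distance(deathmodel)
sort(cd, decreasing = TRUE)[1:3]  ## the US and Great Britain should rank near the top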
##4a)
Y <- pipeline[,'Lab']
X <- pipeline[,'Field']
pipemodel <- lm(Y ~ X)
plot(X, Y, main = 'Field vs Lab Regression', xlab = 'Field', ylab = 'Lab')
abline(pipemodel)
### Using the Brown-Forsythe (Levene) test
n <- length(X)  ## use this dataset's own n (the 97 and 94 above were specific to the prostate data)
h <- 1/n + (X - mean(X))^2/sum((X - mean(X))^2)
Y.hat <- fitted(pipemodel)
SSE <- sum((Y - Y.hat)^2)
ExtStRes <- (Y - Y.hat)*((n - 3)/(SSE*(1 - h) - (Y - Y.hat)^2))^(1/2)
# partitioning the data at the median of X:
cutoff<-as.numeric(summary(X)['Median'])
I1<-as.numeric(X[which(X<=cutoff)])
I2<-as.numeric(X[which(X>cutoff)])
Y1<-as.numeric(Y[which(X<=cutoff)])
Y1.hat<-fitted(lm(Y1~I1))
Y2<-as.numeric(Y[which(X>cutoff)])
Y2.hat<-fitted(lm(Y2~I2))
e1<-Y1-Y1.hat
e2<-Y2-Y2.hat
e1tild<-median(e1)
e2tild<-median(e2)
d1<-mean(abs(e1-e1tild))
d2<-mean(abs(e2-e2tild))
s1<-sd(abs(e1-e1tild))
s2<-sd(abs(e2-e2tild))
n1<-length(I1)
n2<-length(I2)
s<-sqrt(((n1-1)*s1^2+(n2-1)*s2^2)/(n1+n2-2))
tBF<-(d1-d2)/(s*sqrt(1/n1+1/n2)) ## = -4.95
threshold<-qt(.975,n1+n2-2) ## = 1.98
## since abs(tBF) > threshold,
## the BF test suggests nonconstant error variance
## Trying a few transformations, I found the logarithmic transformation of the
## response and predictor to give a nicer linear regression (see the plot).
var(log(X), log(Y)) ## note: with two arguments var() returns their covariance;
## as a rough quantitative check, it is much smaller than on the original scale
plot(log(X), log(Y), main = "Log of Distribution (nice!)")
abline(lm(log(Y)~log(X)))
## BF test on the log scale:
XL<-log(X)
YL<-log(Y)
cutoff<-as.numeric(summary(XL)['Median'])
I1<-as.numeric(XL[which(XL<=cutoff)])
I2<-as.numeric(XL[which(XL>cutoff)])
Y1<-as.numeric(YL[which(XL<=cutoff)])
Y1.hat<-fitted(lm(Y1~I1))
Y2<-as.numeric(YL[which(XL>cutoff)])
Y2.hat<-fitted(lm(Y2~I2))
e1<-Y1-Y1.hat
e2<-Y2-Y2.hat
e1tild<-median(e1)
e2tild<-median(e2)
d1<-mean(abs(e1-e1tild))
d2<-mean(abs(e2-e2tild))
s1<-sd(abs(e1-e1tild))
s2<-sd(abs(e2-e2tild))
n1<-length(I1)
n2<-length(I2)
s<-sqrt(((n1-1)*s1^2+(n2-1)*s2^2)/(n1+n2-2))
tBF<-(d1-d2)/(s*sqrt(1/n1+1/n2)) ## = .33857
threshold<-qt(.975,n1+n2-2) ## = 1.98
## the test statistic is smaller than the threshold, suggesting
## constant variance on the log scale
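## The same Brown-Forsythe computation is run twice above; a small helper function
## (a sketch, not part of the original script) does the identical steps once:
bf_test <- function(x, y) {
  g1 <- x <= median(x)                  ## partition at the median of the predictor
  e1 <- residuals(lm(y[g1] ~ x[g1]))    ## within-group regression residuals
  e2 <- residuals(lm(y[!g1] ~ x[!g1]))
  d1 <- abs(e1 - median(e1)); d2 <- abs(e2 - median(e2))
  n1 <- length(d1); n2 <- length(d2)
  s <- sqrt(((n1-1)*var(d1) + (n2-1)*var(d2))/(n1 + n2 - 2))
  (mean(d1) - mean(d2))/(s*sqrt(1/n1 + 1/n2))
}
bf_test(X, Y)           ## ~ -4.95, matching the hand computation above
bf_test(log(X), log(Y)) ## ~  0.34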
## sockeye spawner/recruit data (assumed to be already loaded, e.g. read in from a csv as above)
Xs <- as.numeric(sockeye[,'spawners'])
Yr <- as.numeric(sockeye[,'recruits'])
plot(Xs, Yr, main = 'Spawners vs Recruits Regression')
sockmodel<-lm(Yr~Xs)
abline(sockmodel)
Yr.hat<-fitted(sockmodel)
e<-as.numeric(Yr-Yr.hat)
## plotting residuals vs the predictor:
plot(Xs,e)
abline(lm(e~Xs))
## Shortcut BF/Levene test for constant variance
library(lawstat)
levene.test(e,group=Xs>=534.5)
## Test statistic = 3.1035, p-value = .08988, so at the 10% level the BF test rejects constant
## variance; the Gauss-Markov condition requiring constant variance is not satisfied, and a
## least-squares simple regression will not represent the data in a meaningful way.
plot(Xs, Yr)
## Ricker model R = B0 * S * exp(-B1 * S); curve() needs an expression in x:
curve(3*x*exp(-0.001*x), add = TRUE)  ## using the rough estimates B0 ~ 3, B1 ~ .001 found below
## taking the log of both sides of the Ricker model:
## log(Y) = log(B0) + log(X) - B1*X
## log(Y/X) = log(B0) - B1*X
## letting our new response be log(Y/X), we have a linear regression fit,
## and solving the system we find
## B0 ~ 3 and B1 ~ .001
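## A sketch of fitting the linearized Ricker model directly (not in the original script):
ricker <- lm(log(Yr/Xs) ~ Xs)
B0 <- exp(coef(ricker)[1])  ## intercept estimates log(B0)
B1 <- -coef(ricker)[2]      ## slope estimates -B1
c(B0 = unname(B0), B1 = unname(B1))  ## expected to be roughly 3 and .001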
sat <- sat  ## force faraway's lazy-loaded sat data into the workspace
Y<-sat$total
X1<-sat$expend
X2<-sat$salary
X3<-sat$ratio
model1<-lm(Y~X1+X2+X3)
# calling model1:
## Coefficients:
## (Intercept)     X1     X2     X3
##    1069.234 16.469 -8.823  6.330
## This model says that increasing expenditure and/or pupils per teacher
## makes SAT scores go up, and increasing teacher salaries makes the
## SAT scores go down. Note that it doesn't make much sense
## that more pupils per teacher and lower salaries for teachers
## would increase the average total SAT score, so we must have left out
## an important predictor variable.
X4<-sat$takers
model2<-lm(Y~X1+X2+X3+X4)
### Coefficients:
### (Intercept)    X1    X2     X3     X4
###    1045.972 4.463 1.638 -3.624 -2.904
# Now we're talking. This model makes more sense: now, when we increase the number of
# teachers per pupil and the teachers' salary, the SAT scores go up (in the model).
## H0: B2=0   Ha: B2 != 0
## significance level a = .05, using T = B.hat2 / s{B.hat2}
B.hat0<-1069.234
B.hat1<-4.463
B.hat2<-1.638
B.hat3<--3.624
B.hat4<--2.904
K <- 4  ## number of predictors
Y.hat <- as.numeric(fitted(model2))
n <- length(X3)
MSE <- 1/(n-2)*sum((Y - Y.hat)^2) ## 1002.6
## note: n-2 and the SE formula below are the simple-regression versions, used here as a rough approximation
sB.hat2 <- sqrt(MSE/sum((X2 - mean(X2))^2)) ## .761
T <- B.hat2/sB.hat2 ## 2.15
t <- abs(qt(.975, n-2)) ## 2.01
## Because T > |t|, we reject H0
sB.hat3<-sqrt(MSE/sum((X3-mean(X3))^2))
T<-B.hat3/sB.hat3
t<-abs(qt(.975,n-2))
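## The exact multiple-regression standard errors and t-values (a cross-check, not in the
## original script) come straight from the fitted model:
summary(model2)$coefficients  ## columns: Estimate, Std. Error, t value, Pr(>|t|)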
## H0: B1=B2=B3=0   Ha: at least one of B1, B2, B3 != 0
## significance level a = .05
## B2=0 was already rejected, so we do not need to check B3 or B1:
## H0 requires B1=B2=B3=0 simultaneously, and since B2=0 was rejected,
## the joint hypothesis is rejected.
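## Equivalently (a sketch, not in the original script), the joint null can be tested with a
## partial F-test comparing the reduced model (X4 only) against the full model:
anova(lm(Y ~ X4), model2)  ## a small p-value => reject B1=B2=B3=0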
res<-Y-Y.hat
plot(model2,which=c(1:6))
# From the residuals-vs-fitted plot we observe the fitted curve to be non-linear, suggesting
# nonconstant variance; the constant-variance condition of Gauss-Markov is therefore not satisfied
# (normality of the errors would need a separate check, e.g. the normal Q-Q plot).
# From the residuals-vs-fitted plot we see that 29, 24, and 48 are potential outliers.
# From the Cook's distance plot we see that 44 has the greatest influence on the regression,
# so 44 is most likely an influential point. 48 also has a relatively high Cook's distance;
# however, this data point is not clearly an outlier in the residuals-vs-leverage plot.
