## QuinnMcFeeWorkSample(R)

## Quinn McFee Work Sample (R programming)
## This is a sample of performing partial/robust regression, hypothesis testing, and
## other residual diagnostics/remedial measures
## on various datasets. The analysis is written out, but I suggest running the plots in R if you'd like to
## better visualize what I am talking about when referring to outliers, influential points, leverage, etc.
library(faraway)
## ---- Prostate data: regress lpsa on lcavol and flag outlying residuals ----
## Copy the dataset into the workspace; `prostate` itself is expected to be
## supplied by the faraway package loaded above.
prostate <- prostate
lcavol <- as.numeric(prostate[, "lcavol"])
lpsa <- as.numeric(prostate[, "lpsa"])

## Scatter plot with the predictor on the x-axis and the response on the
## y-axis, i.e. the coordinates in which `model` (lpsa ~ lcavol) is defined,
## so that abline(model) overlays the fitted line on the data.
plot(lcavol, lpsa)
model <- lm(lpsa ~ lcavol)
summary(model)
## Fitting regression line:
abline(model)

## Leverage for a simple linear regression:
##   h_i = 1/n + (x_i - mean(x))^2 / sum((x - mean(x))^2)
## n is taken from the data instead of being typed in.
n_prostate <- length(lpsa)
lcavol_dev <- lcavol - mean(lcavol)
h <- 1 / n_prostate + lcavol_dev^2 / sum(lcavol_dev^2)

lpsa.hat <- fitted(model)
resid_prostate <- lpsa - lpsa.hat
SSE <- sum(resid_prostate^2)

## Externally studentized (deleted) residuals:
##   t_i = e_i * sqrt((n - p - 1) / (SSE * (1 - h_i) - e_i^2)),  p = 2
df_deleted <- n_prostate - 3
ExtStRes <- resid_prostate *
  sqrt(df_deleted / (SSE * (1 - h) - resid_prostate^2))

## Colour red every observation whose studentized residual reaches the
## two-sided 5% t critical value.
is_outlier <- abs(ExtStRes) >= qt(.975, df = df_deleted)
plot(lcavol, lpsa, col = ifelse(is_outlier, "red", "black"),
     main = "Outliers(red)")
## ---- Two scientists, two families of pairwise comparisons ----
## H1: B1 - B2 = 0 and B1 - B3 = 0   (Scientist A)
## H2: B1 - B2 = 0 and B2 - B3 = 0   (Scientist B)

## Test: reject if 0 is not contained in either of the CIs given below.
### We notice then that the only thing (or the only difference) that could
### make Scientist A fail to reject the null hypothesis and Scientist B reject
### would be when B2.hat and B3.hat are the furthest apart of the estimates.
a <- .05
## let
n <- 100
B.hat1 <- 1
B.hat2 <- 4
B.hat3 <- -2
## Critical value: level a shared by the two intervals of a family
## (a/2 per interval, a/4 per tail), n - 3 degrees of freedom.
t_crit <- qt(1 - a / 4, n - 3)
MSE <- 1 / t_crit # for simplicity's sake: makes t_crit * MSE equal to 1
sB1 <- 2
sB2 <- 2
sB3 <- 2

## Confidence interval for the difference of two estimates.
##   est_i, est_j  the two point estimates
##   s_i, s_j      their standard errors
##   multiplier    factor applied to the standard error of the difference
## Uses s^2(X - Y) = s^2(X) + s^2(Y), i.e. the two estimates are treated as
## uncorrelated. Returns c(lower, upper), symmetric about est_i - est_j.
diff_ci <- function(est_i, est_j, s_i, s_j, multiplier) {
  half_width <- multiplier * sqrt(s_i^2 + s_j^2)
  (est_i - est_j) + c(-1, 1) * half_width
}

ci_multiplier <- t_crit * MSE
ci1a <- diff_ci(B.hat1, B.hat2, sB1, sB2, ci_multiplier)
ci2a <- diff_ci(B.hat1, B.hat3, sB1, sB3, ci_multiplier)
ci1b <- diff_ci(B.hat1, B.hat2, sB1, sB2, ci_multiplier)
ci2b <- diff_ci(B.hat2, B.hat3, sB2, sB3, ci_multiplier)

## Return values recorded in the original write-up. They are NOT reproducible
## from this file: the lower bounds were computed from sB12 and sB13, which
## are never defined here, and the upper bound of ci2b was centred on
## B.hat1 - B.hat3 instead of B.hat2 - B.hat3.
## Scientist A:
# > ci1a
# [1] -9.0000000 -0.1715729
# > ci2a
# [1] -4.000000 5.828427
## Scientist B:
# > ci1b
# [1] -9.0000000 -0.1715729
# > ci2b
# [1] -1.000000 5.828427
## Original conclusions drawn from those values:
######### 0 is included in the first confidence interval so she cannot
######### reject H1.
### The first confidence interval doesn't include 0 so he can reject H2.
##
## Values produced by the symmetric intervals computed above:
# ci1a, ci1b: -5.8284271 -0.1715729
# ci2a:        0.1715729  5.8284271
# ci2b:        3.1715729  8.8284271
## NOTE(review): with sB1 = sB2 = sB3 = 2 none of these intervals contains 0,
## so both scientists reject. For A to fail to reject while B rejects, the
## half-width must lie between |B.hat1 - B.hat2| = |B.hat1 - B.hat3| = 3 and
## |B.hat2 - B.hat3| = 6; the standard errors need revisiting if the example
## is meant to show that case.

### The scientists A and B should use a Bonferroni correction of two.
## A and B make 2 different comparisons in the hypothesis
## (B1=B2 U B1=B3, B1=B2 U B2=B3),
## while they should be testing the union of all of these
## (B1=B2 U B1=B3 U B2=B3) in order to be consistent.
## 3a)
## Data table written to a csv file and read in.
deathcigs <- read.csv("Deathcigs.csv")
X <- deathcigs[, "X"]
Y <- deathcigs[, "Y"]
deathmodel <- lm(Y ~ X)
plot(X, Y)
abline(deathmodel)
## We see that Great Britain (465, 1145) and the US (190, 1280) are
## influential points, as they are far away from the regression line.

## Row positions of the two countries, named instead of repeated as literals.
## NOTE(review): rows 4 and 6 are carried over from the original script;
## confirm they still match the US and Great Britain in Deathcigs.csv.
us_row <- 4
gb_row <- 6

## Deleting United States:
XnoUS <- X[-us_row]
YnoUS <- Y[-us_row]
plot(XnoUS, YnoUS)
deathmodelnoUS <- lm(YnoUS ~ XnoUS)
abline(deathmodelnoUS)

## Now without Great Britain:
XnoGB <- X[-gb_row]
YnoGB <- Y[-gb_row]
plot(XnoGB, YnoGB)
deathmodelnoGB <- lm(YnoGB ~ XnoGB)
abline(deathmodelnoGB)

## Note that when comparing the regression fit for each of the scenarios
## (all, without US, and without GB), the least squares regression fit is the
## closest to the data when the US is removed, so the US has the largest
## influence on the regression.
## 4a)
## `pipeline` is not created in this file; it is expected to be supplied by
## the faraway package loaded at the top.
Y <- pipeline[, "Lab"]
X <- pipeline[, "Field"]
plot(X)
pipemodel <- lm(Y ~ X)
plot(X, Y, main = "FieldvsLabRegression", xlab = "Field", ylab = "Lab")
abline(pipemodel)

## Leverages and externally studentized (deleted) residuals for this fit.
## The sample size comes from the data, so the constants match this dataset.
n_pipe <- length(X)
h <- 1 / n_pipe + (X - mean(X))^2 / sum((X - mean(X))^2)
Y.hat <- fitted(pipemodel)
SSE <- sum((Y - Y.hat)^2)
ExtStRes <- (Y - Y.hat) *
  sqrt((n_pipe - 3) / (SSE * (1 - h) - (Y - Y.hat)^2))

### Using Brown-Forsythe test (Levene)
## Brown-Forsythe style test for constant error variance.
##   x, y    predictor and response (numeric vectors of equal length)
##   level   two-sided significance level for the threshold
## The data are split at the median of x. As in the original script, a
## separate least-squares line is fitted within each half; the absolute
## deviations of each half's residuals from their median are then compared
## with a pooled two-sample t statistic.
## Returns a list: statistic, threshold, n1, n2.
bf_test <- function(x, y, level = .05) {
  x <- as.numeric(x)
  y <- as.numeric(y)
  lower <- x <= median(x)
  e1 <- residuals(lm(y[lower] ~ x[lower]))
  e2 <- residuals(lm(y[!lower] ~ x[!lower]))
  dev1 <- abs(e1 - median(e1))
  dev2 <- abs(e2 - median(e2))
  n1 <- length(dev1)
  n2 <- length(dev2)
  pooled_sd <- sqrt(
    ((n1 - 1) * var(dev1) + (n2 - 1) * var(dev2)) / (n1 + n2 - 2)
  )
  list(
    statistic = (mean(dev1) - mean(dev2)) / (pooled_sd * sqrt(1 / n1 + 1 / n2)),
    threshold = qt(1 - level / 2, n1 + n2 - 2),
    n1 = n1,
    n2 = n2
  )
}

bf_raw <- bf_test(X, Y)
tBF <- bf_raw$statistic       ## recorded in the original write-up: -4.95
threshold <- bf_raw$threshold ## recorded in the original write-up: 1.98
## Since abs(tBF) > threshold,
## this suggests nonconstant variance by the BF test.
## NOTE(review): the recorded values could not be regenerated from the
## original lines (d2, s2 and Y2.hat were never defined); re-run to confirm.

## Trying a few transformations, I found the logarithmic transformation
## of the response and predictor to have a nicer linear regression (see plot
## for a beautiful regression).
var(log(X), log(Y)) ## and as a side note or quantitative check, the variance
## is much smaller than in the original distribution.
## NOTE(review): var() called with two vectors returns their covariance, not
## a variance; confirm which quantity was intended here.
plot(log(X), log(Y), main = "Logof Distribution(nice!)")
abline(lm(log(Y) ~ log(X)))

## BF test on the log scale. Residuals and deviations are recomputed from the
## transformed data rather than reused from the untransformed fit.
XL <- log(X)
YL <- log(Y)
bf_log <- bf_test(XL, YL)
tBF <- bf_log$statistic       ## recorded in the original write-up: .33857
threshold <- bf_log$threshold ## recorded in the original write-up: 1.98
## The test statistic is smaller than the threshold, suggesting
## constant variance.
## ---- Sockeye data: recruits against spawners ----
## `sockeye` is not created in this file and must already be available.
Xs <- as.numeric(sockeye[, "spawners"])
Yr <- as.numeric(sockeye[, "recruits"])
plot(Xs, Yr, main = "SpawnersvsRecruitsRegression")
sockmodel <- lm(Yr ~ Xs)
abline(sockmodel)
Yr.hat <- fitted(sockmodel)
e <- as.numeric(Yr - Yr.hat)
## Plotting residuals vs the predictor (spawners):
plot(Xs, e)
abline(lm(e ~ Xs))

## Shortcut BF/Levene test for the test of variance.
## NOTE(review): 534.5 is the split point used in the original script,
## presumably the median of Xs; confirm.
library("lawstat")
levene.test(e, group = Xs >= 534.5)
## Test Statistic = 3.1035, p-value = .08988. So we don't satisfy the
## condition of Gauss-Markov that requires constant variance, so a least
## squares simple regression will not represent the data in a meaningful way.
## NOTE(review): a p-value of .08988 rejects constant variance at the 10%
## level but not at the 5% level; state the level this conclusion relies on.

## Ricker model Y = B0 * X * exp(-B1 * X).
## Taking the log of both sides of the Ricker model:
##   log(Y) = log(B0) + log(X) - B1 * X
##   log(Y / X) = log(B0) - B1 * X
## Letting our new Y = log(Y / X) we have a new (linear) regression fit, and
## solving the system we find B0 ~ 3 and B1 ~ .001 (values recorded in the
## original write-up).
## Assumes every spawner and recruit count is positive so the log is defined.
ricker_fit <- lm(log(Yr / Xs) ~ Xs)
B0 <- exp(coef(ricker_fit)[[1]])
B1 <- -coef(ricker_fit)[[2]]

## Overlay the fitted Ricker curve on the scatter plot. curve() needs its
## expression written in terms of `x`.
plot(Xs, Yr)
curve(B0 * x * exp(-B1 * x), add = TRUE)
## ---- SAT data: which predictors explain the average total score? ----
## Copy the dataset into the workspace; `sat` itself is expected to be
## supplied by the faraway package loaded at the top of the file.
sat <- sat
Y <- sat$total
X1 <- sat$expend
X2 <- sat$salary
X3 <- sat$ratio
model1 <- lm(Y ~ X1 + X2 + X3)
# Calling model1 (output recorded in the original write-up):
## Coefficients:
## (Intercept)       X1       X2      X3
##    1069.234   16.469   -8.823   6.330
## This model says that when expenditure and/or pupils per teacher increase,
## SAT scores go up, and that increasing teacher salaries makes the SAT
## scores go down. It doesn't make much sense that more pupils per teacher
## and lower salaries for teachers would increase the average total SAT
## score, so we must have left out an important predictor variable.
X4 <- sat$takers
model2 <- lm(Y ~ X1 + X2 + X3 + X4)
### Coefficients (output recorded in the original write-up):
### (Intercept)      X1      X2       X3       X4
###    1045.972   4.463   1.638   -3.624   -2.904
# Now we're talking. This model makes more sense: when we increase the
# teachers per pupil and the teachers' salary, the SAT scores go up (in the
# model).

## H0: B2 = 0   Ha: B2 != 0
## Level alpha, a = .05, using T = B.hat2 / s{B.hat2}.
## Estimates and standard errors are read from the fitted model2 instead of
## being retyped, so they always belong to the model under test.
coef_table <- summary(model2)$coefficients
B.hat0 <- coef_table["(Intercept)", "Estimate"]
B.hat1 <- coef_table["X1", "Estimate"]
B.hat2 <- coef_table["X2", "Estimate"]
B.hat3 <- coef_table["X3", "Estimate"]
B.hat4 <- coef_table["X4", "Estimate"]
K <- 4 # number of predictors in model2
Y.hat <- as.numeric(fitted(model2))
n <- length(X3)
## Residual degrees of freedom: n minus the K slopes and the intercept.
df_resid <- n - K - 1
MSE <- sum((Y - Y.hat)^2) / df_resid
t_crit <- qt(.975, df_resid)

sB.hat2 <- coef_table["X2", "Std. Error"]
t_stat2 <- B.hat2 / sB.hat2
## Reject H0: B2 = 0 when this is TRUE.
abs(t_stat2) > t_crit
## NOTE(review): the original write-up recorded MSE = 1002.6,
## s{B.hat2} = .761, T = 2.15, t = 2.01 and rejected H0. Those figures divided
## SSE by n - 2 and used the simple-regression formula sqrt(MSE / Sxx), so
## they are not expected to match the values computed here; take the decision
## from the output above.

sB.hat3 <- coef_table["X3", "Std. Error"]
t_stat3 <- B.hat3 / sB.hat3
## Reject H0: B3 = 0 when this is TRUE.
abs(t_stat3) > t_crit

## H0: B1 = B2 = B3 = 0   Ha: at least one of B1, B2, B3 != 0
## Level alpha, a = .05.
## The original write-up argued that, B2 = 0 having been rejected, B1 and B3
## need not be checked and the joint hypothesis is rejected. The joint
## hypothesis is tested here directly with the partial F-test of model2
## against the model that keeps only X4.
model_takers_only <- lm(Y ~ X4)
anova(model_takers_only, model2)

res <- Y - Y.hat
plot(model2, which = 1:6)
# From the residuals vs fitted plot we observe the fitted curve to be
# non-linear, suggesting nonconstant variance.
# The variance is nonconstant, so the Gauss-Markov condition is not
# satisfied, meaning the error is not normal.
# NOTE(review): the Gauss-Markov conditions do not involve normality; confirm
# what this sentence was meant to conclude.
# From the residuals vs fitted plot we see that 29, 24, and 48 are potential
# outliers.
# From the Cook's distance we see that 44 has the greatest influence on the
# regression of the data, so 44 is most likely an influential point. 48 also
# has a relatively high Cook's distance; however, in the residuals vs
# leverage plot this data point is not clearly an outlier (original wording
# garbled - confirm against the plot).

## QuinnMcFeeWorkSample(R)

## Recommended

## Recommended

## More Related Content

## What's hot

## What's hot (7)

## Similar to QuinnMcFeeWorkSample(R)

## Similar to QuinnMcFeeWorkSample(R) (20)

## QuinnMcFeeWorkSample(R)