Credit Card Data Statistical Analysis

cc_proj
group 2
December 3, 2016
R Markdown
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS
Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com
(http://rmarkdown.rstudio.com).
When you click the Knit button a document will be generated that includes both content as well as the output of
any embedded R code chunks within the document. You can embed an R code chunk like this:
##Data Preprocessing
library(faraway)
## Warning: package 'faraway' was built under R version 3.2.5
library(AER)
## Warning: package 'AER' was built under R version 3.2.5
## Loading required package: car
##
## Attaching package: 'car'
## The following objects are masked from 'package:faraway':
##
##     logit, vif
## Loading required package: lmtest
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
##     as.Date, as.Date.numeric

## Loading required package: sandwich
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:faraway':
##
##     rats
# Load the CreditCard data set into the workspace and preview its first rows.
data("CreditCard")
head(CreditCard, n = 6L)
##   card reports      age income       share expenditure owner selfemp
## 1  yes       0 37.66667 4.5200 0.033269910  124.983300   yes      no
## 2  yes       0 33.25000 2.4200 0.005216942    9.854167    no      no
## 3  yes       0 33.66667 4.5000 0.004155556   15.000000   yes      no
## 4  yes       0 30.50000 2.5400 0.065213780  137.869200    no      no
## 5  yes       0 32.16667 9.7867 0.067050590  546.503300   yes      no
## 6  yes       0 23.25000 2.5000 0.044438400   91.996670    no      no
##   dependents months majorcards active
## 1          3     54          1     12
## 2          3     34          1     13
## 3          4     58          1      5
## 4          0     25          1      7
## 5          2     64          1      5
## 6          0     54          1      1
# Per-column summaries of every variable in CreditCard.
summary(CreditCard)

##   card         reports             age              income
##  no : 296   Min.   : 0.0000   Min.   : 0.1667   Min.   : 0.210
##  yes:1023   1st Qu.: 0.0000   1st Qu.:25.4167   1st Qu.: 2.244
##             Median : 0.0000   Median :31.2500   Median : 2.900
##             Mean   : 0.4564   Mean   :33.2131   Mean   : 3.365
##             3rd Qu.: 0.0000   3rd Qu.:39.4167   3rd Qu.: 4.000
##             Max.   :14.0000   Max.   :83.5000   Max.   :13.500
##      share            expenditure       owner     selfemp
##  Min.   :0.0001091   Min.   :   0.000   no :738   no :1228
##  1st Qu.:0.0023159   1st Qu.:   4.583   yes:581   yes:  91
##  Median :0.0388272   Median : 101.298
##  Mean   :0.0687322   Mean   : 185.057
##  3rd Qu.:0.0936168   3rd Qu.: 249.036
##  Max.   :0.9063205   Max.   :3099.505
##    dependents         months         majorcards         active
##  Min.   :0.0000   Min.   :  0.00   Min.   :0.0000   Min.   : 0.000
##  1st Qu.:0.0000   1st Qu.: 12.00   1st Qu.:1.0000   1st Qu.: 2.000
##  Median :1.0000   Median : 30.00   Median :1.0000   Median : 6.000
##  Mean   :0.9939   Mean   : 55.27   Mean   :0.8173   Mean   : 6.997
##  3rd Qu.:2.0000   3rd Qu.: 72.00   3rd Qu.:1.0000   3rd Qu.:11.000
##  Max.   :6.0000   Max.   :540.00   Max.   :1.0000   Max.   :46.000
# Round income and expenditure to two decimals and age to a whole number.
# The assignment arrows are plain ASCII `<-`; the typographic hyphen that
# appeared in them is not valid R syntax.
CreditCard$income <- round(CreditCard$income, digits = 2)
CreditCard$expenditure <- round(CreditCard$expenditure, digits = 2)
# The 0.01 offset pushes exact halves (e.g. 30.5) upward before rounding.
# NOTE(review): presumably meant to avoid round-half-to-even; confirm.
CreditCard$age <- round(CreditCard$age + 0.01)

##Knowing Our Data
library("ggplot2")
## Warning: package 'ggplot2' was built under R version 3.2.5
# One plot per device page; the previous graphics settings are kept in
# `oldpar`. The assignment arrow is plain ASCII `<-` (the typographic hyphen
# in the original is not valid R syntax).
oldpar <- par(mfrow = c(1, 1))
## Density plot for expenditure
plot(density(CreditCard$expenditure), main = "expenditure")

## Density plot for income
plot(density(CreditCard$income), main = "income")

## Normal Q-Q plot for expenditure
qqnorm(CreditCard$expenditure, xlab = "Theoretical", ylab = "Actual values")
qqline(CreditCard$expenditure, col = "blue")

## Normal Q-Q plot for income
qqnorm(CreditCard$income, xlab = "Theoretical", ylab = "Actual values")
qqline(CreditCard$income, col = "blue")

## Box plot of expenditure by home ownership
boxplot(expenditure ~ owner, data = CreditCard,
        xlab = "House Owner", ylab = "Expenditure")

##Linear Model Selection
##Adjusted R square=0.8108
# Full model: expenditure regressed on every other column of CreditCard.
# (ASCII `<-` replaces the mis-encoded assignment arrow.)
g1 <- lm(expenditure ~ ., data = CreditCard)
summary(g1)

##
## Call:
## lm(formula = expenditure ~ ., data = CreditCard)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## ‐671.01  ‐31.86    4.00   31.42 1870.37
##
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept) ‐173.21651   15.10314 ‐11.469   <2e‐16 ***
## cardyes        0.12999    9.75430   0.013   0.9894
## reports       ‐1.55943    2.84062  ‐0.549   0.5831
## age            0.57416    0.38897   1.476   0.1402
## income        51.27075    2.18281  23.488   <2e‐16 ***
## share       2465.69482   37.87678  65.098   <2e‐16 ***
## owneryes      ‐4.60817    7.72632  ‐0.596   0.5510
## selfempyes    ‐5.01405   13.02388  ‐0.385   0.7003
## dependents     4.39018    2.86144   1.534   0.1252
## months        ‐0.13467    0.05526  ‐2.437   0.0149 *
## majorcards    ‐3.08646    8.59266  ‐0.359   0.7195
## active         0.82229    0.56700   1.450   0.1472
## ‐‐‐
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 118.4 on 1307 degrees of freedom
## Multiple R‐squared:  0.8124, Adjusted R‐squared:  0.8108
## F‐statistic: 514.4 on 11 and 1307 DF,  p‐value: < 2.2e‐16
# Subset model using income, share and months only.
# (ASCII `<-` replaces the mis-encoded assignment arrow.)
g2 <- lm(expenditure ~ income + share + months, data = CreditCard)
summary(g2)

##
## Call:
## lm(formula = expenditure ~ income + share + months, data = CreditCard)
##
## Residuals:
## ‐666.87  ‐28.91    3.47   30.12 1872.37
##
## Coefficients:
## (Intercept) ‐157.3660     8.0093 ‐19.648   <2e‐16 ***
## income        53.1772     1.9442  27.352   <2e‐16 ***
## share       2460.0934    34.5478  71.208   <2e‐16 ***
## months        ‐0.1018     0.0497  ‐2.049   0.0407 *
## ‐‐‐
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## F‐statistic:  1884 on 3 and 1315 DF,  p‐value: < 2.2e‐16
# Subset model using age, owner and income only.
# (ASCII `<-` replaces the mis-encoded assignment arrow.)
g3 <- lm(expenditure ~ age + owner + income, data = CreditCard)
summary(g3)
##
## Call:
## lm(formula = expenditure ~ age + owner + income, data = CreditCard)
##
## Residuals:
## ‐538.70 ‐137.16  ‐67.59   60.48 2509.26
##
## Coefficients:
## (Intercept)   99.011     25.888   3.825 0.000137 ***
## age           ‐2.547      0.782  ‐3.257 0.001155 **
## owneryes      16.422     15.979   1.028 0.304277
## income        48.581      4.607  10.545  < 2e‐16 ***
## ‐‐‐
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##

##We reject the subset models and choose the bigger model g1 for our further analysis.

##Checking non‐constant variance, non‐normality
##We can see the variance of the residuals is increasing as the fitted values get larger
# Residuals-versus-fitted plot for g1 on a single-panel device.
par(mfrow = c(1, 1))
plot(x = fitted(g1), y = residuals(g1), xlab = "Fitted", ylab = "Residuals")
abline(h = 0)  # zero reference line
## An approximate test of non-constant variance: regress the absolute
## residuals on the fitted values and inspect the slope.
## There is a linear relationship between the residuals and the fitted values.
summary(lm(abs(residuals(g1)) ~ fitted(g1)))

##
## Call:
## lm(formula = abs(residuals(g1)) ~ fitted(g1))
##
## Residuals:
## ‐288.72  ‐41.06   ‐9.76   25.68 1569.37
##
## Coefficients:
## (Intercept) 15.250276   2.938698   5.189 2.44e‐07 ***
## fitted(g1)   0.232487   0.009565  24.307  < 2e‐16 ***
## ‐‐‐
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
##Checking for non‐normal errors
##QQ‐plots for detecting nonnormality
# Normality diagnostics for the residuals of g1.
par(mfrow = c(1, 1))
# Normal Q-Q plot with its reference line.
qqnorm(y = residuals(g1), ylab = "Residuals")
qqline(y = residuals(g1))

## Histogram for detecting non-normality
hist(x = residuals(g1))
## Shapiro-Wilk test of normal versus non-normal errors
shapiro.test(x = residuals(g1))
##
##  Shapiro‐Wilk normality test
##
## data:  residuals(g1)
## W = 0.63898, p‐value < 2.2e‐16
##Checking for unusual observations
##Checking for influential outliers
##The leverage measure for detecting influential outliers
# Leverage and Cook's distance diagnostics for the full model g1.
library("faraway")
# Row names of CreditCard, used as point labels in the half-normal plots.
sales <- row.names(CreditCard)
halfnorm(lm.influence(g1)$hat, labs = sales, ylab = "Leverages")

## Cook's distance for detecting influential outliers
cook <- cooks.distance(g1)
## Half-normal plot of Cook's distance, labelling the three largest values
halfnorm(cook, 3, labs = sales, ylab = "Cook's distance")

## Model fit excluding the observation with the largest Cook's distance.
## The row is located with which.max() rather than the hard-coded position
## 447 used originally, so this step stays correct if the data or the model
## change.
CreditCard2 <- CreditCard[-which.max(cook), ]
g4 <- lm(expenditure ~ ., data = CreditCard2)

## Comparison of model fits with and without the influential observation
coef(g1)
coef(g4)
summary(g1)
summary(g4)
##  (Intercept)      cardyes      reports          age       income
## ‐173.2165091    0.1299925   ‐1.5594275    0.5741566   51.2707456
##        share     owneryes   selfempyes   dependents       months
## 2465.6948184   ‐4.6081681   ‐5.0140531    4.3901807   ‐0.1346725
##   majorcards       active
##   ‐3.0864627    0.8222948
## ‐157.4470327    7.0470689   ‐0.2303960    0.4941578   45.9336451
## 2415.3819791   ‐2.2514973   ‐0.2994188    6.4901069   ‐0.1411835
##   ‐1.9361978    0.3704255

##
## Call:
## lm(formula = expenditure ~ ., data = CreditCard)
##
## Residuals:
## ‐671.01  ‐31.86    4.00   31.42 1870.37
##
## Coefficients:
## (Intercept) ‐173.21651   15.10314 ‐11.469   <2e‐16 ***
## cardyes        0.12999    9.75430   0.013   0.9894
## reports       ‐1.55943    2.84062  ‐0.549   0.5831
## age            0.57416    0.38897   1.476   0.1402
## income        51.27075    2.18281  23.488   <2e‐16 ***
## share       2465.69482   37.87678  65.098   <2e‐16 ***
## owneryes      ‐4.60817    7.72632  ‐0.596   0.5510
## selfempyes    ‐5.01405   13.02388  ‐0.385   0.7003
## dependents     4.39018    2.86144   1.534   0.1252
## months        ‐0.13467    0.05526  ‐2.437   0.0149 *
## majorcards    ‐3.08646    8.59266  ‐0.359   0.7195
## active         0.82229    0.56700   1.450   0.1472
## ‐‐‐
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##

##
## Call:
## lm(formula = expenditure ~ ., data = CreditCard2)
##
## Residuals:
## ‐634.84  ‐31.07    2.77   29.50 1083.73
##
## Coefficients:
## (Intercept) ‐157.44703   13.56086 ‐11.610  < 2e‐16 ***
## cardyes        7.04707    8.74830   0.806  0.42066
## reports       ‐0.23040    2.54626  ‐0.090  0.92792
## age            0.49416    0.34855   1.418  0.15650
## income        45.93365    1.97828  23.219  < 2e‐16 ***
## share       2415.38198   34.05304  70.930  < 2e‐16 ***
## owneryes      ‐2.25150    6.92400  ‐0.325  0.74510
## selfempyes    ‐0.29942   11.67230  ‐0.026  0.97954
## dependents     6.49011    2.56651   2.529  0.01156 *
## months        ‐0.14118    0.04951  ‐2.851  0.00442 **
## majorcards    ‐1.93620    7.69926  ‐0.251  0.80148
## active         0.37043    0.50865   0.728  0.46659
## ‐‐‐
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
##Indicator of Model fit, adjusted R square increases by eliminating the 447th point.

##Added Variable Plot
# Added-variable (partial regression) plot for income.
# Both auxiliary regressions must contain every predictor of g1 except income.
# `months` was missing from the original formulas, which is why the slope of
# d on m (51.3606) did not reproduce coef(g1)["income"] (51.2707).
# NOTE: the coef(lm(d ~ m)) transcript that follows in this document was
# recorded before `months` was added.
d <- residuals(lm(
  expenditure ~ card + reports + age + share + owner + selfemp +
    dependents + months + majorcards + active,
  data = CreditCard
))

m <- residuals(lm(
  income ~ card + reports + age + share + owner + selfemp +
    dependents + months + majorcards + active,
  data = CreditCard
))

plot(m, d, xlab = "income residuals", ylab = "expenditure residuals")
coef(lm(d ~ m))
##  (Intercept)            m
## 7.617251e‐15 5.136060e+01
# Coefficients of the full model, for comparison with the added-variable slope.
coef(g1)

## ‐173.2165091    0.1299925   ‐1.5594275    0.5741566   51.2707456
## 2465.6948184   ‐4.6081681   ‐5.0140531    4.3901807   ‐0.1346725
##   ‐3.0864627    0.8222948
# Overlay the line through the origin whose slope is g1's income coefficient.
abline(a = 0, b = coef(g1)["income"])
# The plot indicates that income should be entered linearly into the model.

#ROBUST REGRESSION

#OLS
# Ordinary least squares fit, the baseline for the robust comparison.
# (ASCII `<-` replaces the mis-encoded assignment arrows, and the wrapped
# trailing comment no longer leaves a stray `e` token to be evaluated.)
g.ols <- lm(expenditure ~ ., CreditCard)

library(MASS)
# M-estimation with the Huber psi function.
g.huber <- rlm(expenditure ~ ., psi = psi.huber, data = CreditCard)

# M-estimation with the Hampel psi function, initialised from an LTS fit.
g.hampel <- rlm(expenditure ~ ., psi = psi.hampel, init = "lts",
                data = CreditCard)

# M-estimation with the bisquare psi function, initialised from an LTS fit.
# NOTE(review): the transcript records an "'rlm' failed to converge in 20
# steps" warning after these fits, presumably from this call; consider
# raising `maxit` and re-checking the estimates.
g.bisquare <- rlm(expenditure ~ ., psi = psi.bisquare, init = "lts",
                  data = CreditCard)

## Warning in rlm.default(x, y, weights, method = method, wt.method =
## wt.method, : 'rlm' failed to converge in 20 steps
library(robustbase)
##
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
##
##     heart
##
##     epilepsy
# Least trimmed squares (LTS) regression on all predictors.
# (ASCII `<-` replaces the mis-encoded assignment arrow.)
g.lts <- ltsReg(expenditure ~ ., data = CreditCard)
## Warning in covMcd(X, alpha = alpha, use.correction = use.correction): The 665‐th order statis
tic of the absolute deviation of variable 1
## is zero.
## There are 1023 observations (in the entire dataset of 1319 obs.)
## lying on the hyperplane with equation a_1*(x_i1 ‐ m_1) + ... +
## a_p*(x_ip ‐ m_p) = 0 with (m_1, ..., m_p) the mean of these
## observations and coefficients a_i from the vector a <‐ c(1, 0, 0,
## 0, 0, 0, 0, 0, 0, 0, 0)
library(quantreg)
## Warning: package 'quantreg' was built under R version 3.2.5
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
##     backsolve
##
## Attaching package: 'quantreg'

##
##     untangle.specials
# Least absolute deviations (LAD) fit via quantile regression.
# (ASCII `<-` replaces the mis-encoded assignment arrows.)
g.lad <- rq(expenditure ~ ., data = CreditCard)

library(car)
# Side-by-side coefficient table for all six fits, without standard errors.
coefs <- compareCoefs(g.ols, g.huber, g.hampel, g.bisquare, g.lts, g.lad,
                      se = FALSE)
## Warning in compareCoefs(g.ols, g.huber, g.hampel, g.bisquare, g.lts,
## g.lad, : models to be compared are of different classes
##
## Call:
## 1: lm(formula = expenditure ~ ., data = CreditCard)
## 2: rlm(formula = expenditure ~ ., data = CreditCard, psi = psi.huber)
## 3: rlm(formula = expenditure ~ ., data = CreditCard, psi = psi.hampel,
##   init = "lts")
## 4: rlm(formula = expenditure ~ ., data = CreditCard, psi =
##   psi.bisquare, init = "lts")
## 5: ltsReg.formula(formula = expenditure ~ ., data = CreditCard)
## 6: rq(formula = expenditure ~ ., data = CreditCard)
##                Est. 1    Est. 2    Est. 3    Est. 4    Est. 5    Est. 6
## (Intercept) ‐1.73e+02 ‐9.84e+01 ‐7.68e+01 ‐5.69e+01           ‐8.11e+01
## cardyes      1.30e‐01  1.66e+00  9.70e+00  4.75e+00  6.00e+00 ‐4.53e+00
## reports     ‐1.56e+00 ‐7.13e‐01 ‐4.49e‐01 ‐6.43e‐01 ‐1.40e‐01 ‐7.52e‐01
## age          5.74e‐01  1.69e‐01  2.24e‐01  1.21e‐01  7.18e‐02  9.04e‐02
## income       5.13e+01  3.33e+01  2.29e+01  1.80e+01  4.01e+00  3.07e+01
## share        2.47e+03  2.26e+03  2.19e+03  2.17e+03  2.07e+03  2.25e+03
## owneryes    ‐4.61e+00  1.37e‐01  4.48e‐01 ‐3.70e‐01 ‐7.00e‐01 ‐2.15e‐01
## selfempyes  ‐5.01e+00 ‐4.06e+00 ‐6.27e+00 ‐3.08e+00  9.24e‐01 ‐1.08e+00
## dependents   4.39e+00  2.50e+00  4.04e+00  1.67e+00  5.42e‐01  6.63e‐01
## months      ‐1.35e‐01 ‐2.22e‐02 ‐2.73e‐03  1.36e‐02 ‐6.80e‐03 ‐1.10e‐02
## majorcards  ‐3.09e+00  2.92e‐01  1.33e+00  1.95e+00 ‐1.15e‐01  3.60e‐01
## active       8.22e‐01 ‐1.85e‐01 ‐3.52e‐01 ‐1.58e‐01  1.80e‐02 ‐1.04e‐01
## Intercept                                           ‐1.56e+01
# Label the columns of the comparison table by estimation method, then print.
# (ASCII `<-` replaces the mis-encoded assignment arrow.)
colnames(coefs) <- c("OLS", "Huber", "Hampel", "Bisquare", "LTS", "LAD")
coefs

##                      OLS         Huber        Hampel      Bisquare
## (Intercept) ‐173.2165091  ‐98.42281790 ‐7.683350e+01  ‐56.93246290
## cardyes        0.1299925    1.66296788  9.699189e+00    4.75250636
## reports       ‐1.5594275   ‐0.71321874 ‐4.492695e‐01   ‐0.64294070
## age            0.5741566    0.16876448  2.235484e‐01    0.12083572
## income        51.2707456   33.32804912  2.287674e+01   17.99766048
## share       2465.6948184 2263.93528379  2.191596e+03 2174.47842947
## owneryes      ‐4.6081681    0.13661070  4.484377e‐01   ‐0.36960925
## selfempyes    ‐5.0140531   ‐4.06380494 ‐6.270421e+00   ‐3.07652355
## dependents     4.3901807    2.49797375  4.035395e+00    1.66759545
## months        ‐0.1346725   ‐0.02224799 ‐2.733177e‐03    0.01360463
## majorcards    ‐3.0864627    0.29179580  1.331047e+00    1.94670914
## active         0.8222948   ‐0.18469793 ‐3.517037e‐01   ‐0.15819744
## Intercept             NA            NA            NA            NA
##                       LTS           LAD
## (Intercept)            NA  ‐81.07011726
## cardyes      6.003973e+00   ‐4.53115627
## reports     ‐1.396422e‐01   ‐0.75240228
## age          7.180315e‐02    0.09038923
## income       4.007000e+00   30.69346829
## share        2.074940e+03 2250.32089452
## owneryes    ‐6.996942e‐01   ‐0.21510403
## selfempyes   9.237160e‐01   ‐1.07628232
## dependents   5.419621e‐01    0.66323380
## months      ‐6.796691e‐03   ‐0.01099476
## majorcards  ‐1.151345e‐01    0.36028882
## active       1.801497e‐02   ‐0.10395183
## Intercept   ‐1.555717e+01            NA
## Comments: All three M-estimation methods (Huber, Hampel, and bisquare) give
## quite similar coefficients, and they differ from both OLS and LTS. In the
## table above, the LAD coefficients are closest to the Huber fit rather than
## to OLS. LTS is recommended since it has the best breakdown point.

#CHECK FOR LEAST MODEL

# Stepwise selection by AIC, starting from the full model g1.
# (ASCII `<-` replaces the mis-encoded assignment arrow.)
g.step <- step(g1)

## Start:  AIC=12606.27
## expenditure ~ card + reports + age + income + share + owner +
##     selfemp + dependents + months + majorcards + active
##
##              Df Sum of Sq      RSS   AIC
## ‐ card        1         2 18326814 12604
## ‐ majorcards  1      1809 18328620 12604
## ‐ selfemp     1      2078 18328890 12604
## ‐ reports     1      4226 18331037 12605
## ‐ owner       1      4988 18331799 12605
## <none>                    18326811 12606
## ‐ active      1     29492 18356303 12606
## ‐ age         1     30551 18357363 12606
## ‐ dependents  1     33007 18359818 12607
## ‐ months      1     83288 18410100 12610
## ‐ income      1   7736017 26062828 13069
## ‐ share       1  59421549 77748361 14510
##
## Step:  AIC=12604.27
## expenditure ~ reports + age + income + share + owner + selfemp +
##     dependents + months + majorcards + active
##
## ‐ majorcards  1      1809 18328623 12602
## ‐ selfemp     1      2088 18328902 12602
## ‐ owner       1      5009 18331823 12603
## ‐ reports     1      5432 18332246 12603
## <none>                    18326814 12604
## ‐ active      1     30355 18357168 12604
## ‐ age         1     30551 18357364 12604
## ‐ dependents  1     33155 18359969 12605
## ‐ months      1     83286 18410100 12608
## ‐ income      1   7807034 26133848 13070
## ‐ share       1  68280506 86607320 14651
##
## Step:  AIC=12602.4
## expenditure ~ reports + age + income + share + owner + selfemp +
##     dependents + months + active
##
## ‐ selfemp     1      2088 18330710 12600
## ‐ owner       1      5170 18333793 12601
## ‐ reports     1      5328 18333951 12601
## <none>                    18328623 12602
## ‐ active      1     29157 18357780 12602
## ‐ age         1     30715 18359337 12603
## ‐ dependents  1     33672 18362295 12603
## ‐ months      1     82170 18410792 12606
## ‐ income      1   7853017 26181640 13071
## ‐ share       1  68411348 86739970 14651
##
## Step:  AIC=12600.55
## expenditure ~ reports + age + income + share + owner + dependents +

##     months + active
##
## ‐ owner       1      5074 18335784 12599
## ‐ reports     1      5341 18336052 12599
## <none>                    18330710 12600
## ‐ active      1     29117 18359828 12601
## ‐ age         1     30039 18360749 12601
## ‐ dependents  1     33683 18364393 12601
## ‐ months      1     82936 18413647 12604
## ‐ income      1   7885891 26216602 13070
## ‐ share       1  68749553 87080263 14654
##
## Step:  AIC=12598.91
## expenditure ~ reports + age + income + share + dependents + months +
##     active
##
## ‐ reports     1      4186 18339970 12597
## ‐ active      1     25223 18361007 12599
## ‐ age         1     26418 18362202 12599
## <none>                    18335784 12599
## ‐ dependents  1     29711 18365495 12599
## ‐ months      1     88821 18424605 12603
## ‐ income      1   8016819 26352603 13075
## ‐ share       1  68772952 87108736 14652
##
## Step:  AIC=12597.21
## expenditure ~ age + income + share + dependents + months + active
##
## ‐ active      1     22090 18362060 12597
## ‐ age         1     26693 18366663 12597
## <none>                    18339970 12597
## ‐ dependents  1     29784 18369754 12597
## ‐ months      1     89980 18429950 12602
## ‐ income      1   8036240 26376210 13074
## ‐ share       1  70719037 89059006 14680
##
## Step:  AIC=12596.8
## expenditure ~ age + income + share + dependents + months
##
## <none>                    18362060 12597
## ‐ dependents  1     31895 18393955 12597
## ‐ age         1     32293 18394353 12597
## ‐ months      1     87594 18449654 12601
## ‐ income      1   8245145 26607205 13084
## ‐ share       1  70725375 89087435 14678
# Coefficient table and fit statistics of the model selected by step().
summary(g.step)

##
## Call:
## lm(formula = expenditure ~ age + income + share + dependents +
##     months, data = CreditCard)
##
## Residuals:
## ‐674.42  ‐29.70    4.84   31.05 1879.22
##
## Coefficients:
## (Intercept) ‐172.91959   12.35476 ‐13.996   <2e‐16 ***
## age            0.57505    0.37842   1.520   0.1289
## income        51.27696    2.11179  24.281   <2e‐16 ***
## share       2468.67451   34.71399  71.115   <2e‐16 ***
## dependents     4.20062    2.78151   1.510   0.1312
## months        ‐0.13683    0.05467  ‐2.503   0.0124 *
## ‐‐‐
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## F‐statistic:  1134 on 5 and 1313 DF,  p‐value: < 2.2e‐16
# Use the coef() accessor: `g.step$coef` only works through partial matching
# of the `coefficients` component.
coef(g.step)
##  (Intercept)          age       income        share   dependents
## ‐172.9195852    0.5750485   51.2769573 2468.6745073    4.2006201
##       months
##   ‐0.1368346
# ANOVA (F) test comparing the stepwise-selected model g.step with the full
# model g1.

anova(g.step, g1)
## Analysis of Variance Table
##
## Model 1: expenditure ~ age + income + share + dependents + months
## Model 2: expenditure ~ card + reports + age + income + share + owner +
##     selfemp + dependents + months + majorcards + active
##   Res.Df      RSS Df Sum of Sq     F Pr(>F)
## 1   1313 18362060
## 2   1307 18326811  6     35249 0.419 0.8668
# The p-value is 0.8668, greater than 0.05, which means the 6 extra variables
# in the bigger model do not contribute significantly to determining credit
# card expenditure.

#CROSS VALIDATION
library(DAAG)

## Warning: package 'DAAG' was built under R version 3.2.5
## Loading required package: lattice
##
## Attaching package: 'lattice'
##
##     melanoma
##
## Attaching package: 'DAAG'
## The following object is masked from 'package:robustbase':
##
##     milk
## The following object is masked from 'package:MASS':
##
##     hills
##
##     lung
## The following object is masked from 'package:car':
##
##     vif
## The following objects are masked from 'package:faraway':
##
##     orings, ozone, vif
# Cross-validate the full model and the stepwise model side by side.
# `FALSE` is spelled out (the shorthand `F` can be reassigned), and the
# assignment arrows are plain ASCII `<-`.
oldpar <- par(mfrow = c(1, 2))
p1 <- CVlm(data = CreditCard,
           form.lm = g1,
           printit = FALSE)
## Warning in CVlm(data = CreditCard, form.lm = g1, printit = F):
##
##  As there is >1 explanatory variable, cross‐validation
##  predicted values for a fold are not a linear function
##  of corresponding overall predicted values.  Lines that
##  are shown for the different folds are approximate

p2 <- CVlm(data = CreditCard,
           form.lm = g.step,
           printit = FALSE)
## Warning in CVlm(data = CreditCard, form.lm = g.step, printit = F):
##

# Restore the graphics settings saved above.
par(oldpar)

#Compare the two models using the mse's from the cross‐validations
#with number of folds equal to 3.
#Which model gives the better mse?

# Fixed seed shared by both CVlm() calls, so the comparison can be reproduced.
# The original drew the seed with runif() and never recorded it, so the MSE
# table shown after this block cannot be regenerated exactly.
seed <- 29
oldpar1 <- par(mfrow = c(1, 2))
# 3-fold cross-validation of the full model.
mse.g1 <- CVlm(data = CreditCard,
               form.lm = g1,
               m = 3,
               seed = seed,
               printit = FALSE,
               main = "g1")
## Warning in CVlm(data = CreditCard, form.lm = g1, m = 3, seed = seed, printit = F, :
##
# 3-fold cross-validation of the stepwise-selected model.
mse.g.step <- CVlm(data = CreditCard,
                   form.lm = g.step,
                   m = 3,
                   seed = seed,
                   printit = FALSE,
                   main = "g.step")
## Warning in CVlm(data = CreditCard, form.lm = g.step, m = 3, seed = seed, :
##

# Restore the graphics settings saved above.
par(oldpar1)

# Cross-validation mean squares of the two models, side by side.
data.frame(mse.g1 = attr(mse.g1, "ms"),
           mse.g.step = attr(mse.g.step, "ms"))
##     mse.g1 mse.g.step
## 1 14325.28   14186.83
library("scatterplot3d")
# 3D scatter plot of expenditure against income and dependents.
# with() evaluates the bare column names inside CreditCard, so the data frame
# no longer has to be attach()ed (the original never detached it, leaving its
# columns on the search path).
s3d <- with(
  CreditCard,
  scatterplot3d(income, dependents, expenditure,
                pch = 16,
                highlight.3d = TRUE,
                type = "h",
                main = "3D Scatterplot")
)
# Regression plane of expenditure on income and dependents.
fit <- lm(expenditure ~ income + dependents, data = CreditCard)
s3d$plane3d(fit)

## This helps us detect outliers and influential points, and it helps us
## understand a confidence region for average and predicted response values.

## Individual 95% confidence intervals for each coefficient of g1
confint(object = g1, level = 0.95)
##                    2.5 %      97.5 %
## (Intercept) ‐202.8455645 ‐143.587454
## cardyes      ‐19.0057968   19.265782
## reports       ‐7.1320925    4.013238
## age           ‐0.1889249    1.337238
## income        46.9885446   55.552947
## share       2391.3888920 2540.000745
## owneryes     ‐19.7655171   10.549181
## selfempyes   ‐30.5640541   20.535948
## dependents    ‐1.2233452   10.003707
## months        ‐0.2430761   ‐0.026269
## majorcards   ‐19.9433837   13.770458
## active        ‐0.2900267    1.934616
##Joint Confidence Interval for share and active credit cards
library(ellipse)

##
## Attaching package: 'ellipse'
## The following object is masked from 'package:car':
##
##     ellipse
# Joint confidence region for the share and active coefficients of g1.
plot(
  ellipse(g1, c("share", "active")),
  type = "l",
  main = "Joint Confidence Region"
)
points(x = 0, y = 0)  # the origin, i.e. both coefficients equal to zero
# Mark the point estimate.
points(x = coef(g1)["share"], y = coef(g1)["active"], pch = 18)
# Dashed lines at the individual confidence limits of each coefficient.
abline(v = confint(g1)["share", ], lty = 2)
abline(h = confint(g1)["active", ], lty = 2)
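## The origin (0, 0) lies outside the joint confidence ellipse, so we reject the
## joint null hypothesis that both coefficients are zero and accept the
## alternative hypothesis. Note, however, that the individual 95% interval for
## active (-0.29, 1.93) contains zero.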
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that
generated the plot.

Credit Card Data Statistical Analysis

Recommended

Recommended

More Related Content

What's hot

What's hot (14)

Viewers also liked

Viewers also liked (14)

Similar to Credit Card Data Statistical Analysis

Similar to Credit Card Data Statistical Analysis (20)

Credit Card Data Statistical Analysis