Successfully reported this slideshow.
We use your LinkedIn profile and activity data to personalize ads and to show you more relevant ads. You can change your ad preferences anytime.

R for Everything

9,709 views

Published on

Delivered by Jared Lander (Author, R for Everyone) at the 2016 New York R Conference on April 8th and 9th at Work-Bench.

Published in: Data & Analytics
  • Be the first to comment

  • Be the first to like this

R for Everything

  1. 1. R for Everything Jared P. Lander
  2. 2. 2/45
  3. 3. 3/45
  4. 4. 4/45
  5. 5. Giants 5/45
  6. 6. Compressed Data Online 6/45
  7. 7. 7/45
  8. 8. Create Directory # See if directory exists  dir.exists('FootballTemp') [1] FALSE # create it  dir.create('FootballTemp')  # check again  dir.exists('FootballTemp') [1] TRUE 8/45
  9. 9. 9/45
  10. 10. Download Files download.file('http://www.jaredlander.com/data/Football1415.tar.gz',                 destfile='FootballTemp/football.tar.gz',                 method='curl') 10/45
  11. 11. Untar 11/45
  12. 12. getXKCD('1168') 12/45
  13. 13. Untar the File untar('FootballTemp/football.tar.gz', exdir='FootballFiles') 13/45
  14. 14. Did They Extract? dir('FootballFiles') [1] "pbp-2014.csv" "pbp-2015.csv" 14/45
  15. 15. Delete Tar unlink('FootballTemp/football.tar.gz')  dir('FootballTemp') character(0) 15/45
  16. 16. Inspect One File file.info('FootballFiles/pbp‐2014.csv')                                size isdir mode               mtime  FootballFiles/pbp‐2014.csv 10280324 FALSE  666 2016‐03‐25 00:14:23                                           ctime               atime exe  FootballFiles/pbp‐2014.csv 2016‐04‐04 22:48:47 2016‐04‐04 22:48:47  no 16/45
  17. 17. Inspect All Files dir('FootballFiles') %>% file.info              size isdir mode mtime ctime atime  exe  pbp‐2014.csv   NA    NA <NA>  <NA>  <NA>  <NA> <NA>  pbp‐2015.csv   NA    NA <NA>  <NA>  <NA>  <NA> <NA> 17/45
  18. 18. Inspect All Files dir('FootballFiles', full.names=TRUE) %>% file.info                                size isdir mode               mtime  FootballFiles/pbp‐2014.csv 10280324 FALSE  666 2016‐03‐25 00:14:23  FootballFiles/pbp‐2015.csv 10671016 FALSE  666 2016‐03‐25 00:14:23                                           ctime               atime exe  FootballFiles/pbp‐2014.csv 2016‐04‐04 22:48:47 2016‐04‐04 22:48:47  no  FootballFiles/pbp‐2015.csv 2016‐04‐04 22:48:47 2016‐04‐04 22:48:47  no 18/45
  19. 19. Better Names file.rename(from=dir('FootballFiles', full.names=TRUE),               to=sprintf('FootballFiles/Football%s.csv', 14:15)) [1] TRUE TRUE 19/45
  20. 20. Better Names dir('FootballFiles') [1] "Football14.csv" "Football15.csv" 20/45
  21. 21. Make Copies dir.create('FootballFiles/Backup')  file.copy(dir('FootballFiles', full.names=TRUE, pattern='.csv'),             sprintf('FootballFiles/Backup/Footballl%s.csv', 14:15)) [1] TRUE TRUE 21/45
  22. 22. Make Copies dir('FootballFiles', recursive=TRUE) [1] "Backup/Footballl14.csv" "Backup/Footballl15.csv" "Football14.csv"          [4] "Football15.csv"         22/45
  23. 23. Count Columns count.fields('FootballFiles/Football14.csv', sep=',') %>% head(15)  [1] 45 45 45 45 15 45 45 45 45 45 45 45 45 45 45 count.fields('FootballFiles/Football15.csv', sep=',') %>% head(15)  [1] 45 45 45 45 45 45 45 45 45 NA 15 45 45 45 45 23/45
  24. 24. Line Count system('wc -l FootballFiles/Football14.csv') 45696 FootballFiles/Football14.csv system('wc -l FootballFiles/Football15.csv') 46278 FootballFiles/Football15.csv 24/45
  25. 25. Reference Files dataPath <- 'FootballFiles'  file.path(dataPath, 'Football14.csv') [1] "FootballFiles/Football14.csv" file.path(dataPath, 'Football15.csv') [1] "FootballFiles/Football15.csv" 25/45
  26. 26. Read Data theFiles <- dir(dataPath, pattern='.csv', full.names=TRUE)  games <- theFiles %>% map_df(read.csv2, sep=',', header=TRUE, stringsAsFactors=FALSE) 26/45
  27. 27. See the Data DT::datatable(data=games%>% slice(sample(nrow(games), size=500, replace=FALSE)),                 rownames=FALSE,                options = list(                    dom = "rtiS",                    scrollY = 400, scrollX=TRUE,                    scrollCollapse = TRUE),                filter=list(position='top')  ) 27/45
  28. 28. See the Data Showing 1 to 10 of 500 entries 2016010300 2016-01-03 1 12 31 BUF NYJ 2015120608 2015-12-06 2 15 0 ATL TB 2015122100 2015-12-21 1 15 0 DET NO 2014111610 11/16/2014 1 6 6 DET ARI 2015112904 2015-11-29 2 1 23 IND TB 2015122710 2015-12-27 2 14 7 GB ARI 2014101203 10/12/2014 1 11 20 PIT CLE 2015101102 2015-10-11 2 8 2 CIN SEA GameId GameDate Quarter Minute Second OffenseTeam DefenseTeam Down ToGo All All All All All All All All All 28/45
  29. 29. Pass vs Rush 29/45
  30. 30. Focus on One Team's Offense oneOff <‐ games %>%      filter(OffenseTeam == 'NYG', PlayType %in% c('PASS', 'RUSH')) %>%      mutate(PlayType=factor(PlayType, levels=c('RUSH', 'PASS')),              Down=factor(Down, levels=c(1, 2, 3, 4))) 30/45
  31. 31. Probability of a Pass passRushMod <‐ glm(PlayType ~ Down + ToGo ‐ 1, data=oneOff, family=binomial)  coefplot(passRushMod, trans=arm::invlogit, title='Probability of Pass') 31/45
  32. 32. Scenarios # make grid of scenarios  scenarios <‐ expand.grid(ToGo=1:15, Down=1:4) %>% as.tbl %>%       mutate(Down=factor(Down, levels=c(1, 2, 3, 4)))  # make prediction based on model  scenarioPredict <‐ predict(passRushMod,                              newdata=scenarios, type='response', se.fit=TRUE)  # build confidence intervals  scenarios <‐ scenarios %>% mutate(Prediction=scenarioPredict$fit,                                     Lower=Prediction ‐ 2*scenarioPredict$se.fit,                                    Upper=Prediction + 2*scenarioPredict$se.fit) 32/45
  33. 33. Scenarios ToGo Down Prediction Lower Upper 1 1 0.2754536 0.2135514 0.3373558 2 1 0.2959441 0.2371832 0.3547051 3 1 0.3172914 0.2621339 0.3724488 4 1 0.3394361 0.2882498 0.3906223 5 1 0.3623061 0.3153154 0.4092968 6 1 0.3858171 0.3430379 0.4285962 knitr::kable(head(scenarios)) 33/45
  34. 34. Probability of Pass ggplot(scenarios, aes(x=ToGo)) + scale_y_continuous(label=scales::percent) +      geom_ribbon(aes(ymin=Lower, ymax=Upper), fill='lightgrey') +      geom_line(aes(y=Prediction)) + facet_wrap(~Down, nrow=2) 34/45
  35. 35. Get Eli's Stats eliPage <- read_html('http://www.pro-football-reference.com/players/M/MannEl00.htm') eliStats <- eliPage %>% html_nodes("#passing") %>%       html_table(header=TRUE) %>% getElement(1)  useful::topleft(eliStats, c=7, r=8)    Year Age  Tm Pos No.  G GS  1  2004  23 NYG  qb  10  9  7  2  2005  24 NYG  QB  10 16 16  3  2006  25 NYG  QB  10 16 16  4  2007  26 NYG  QB  10 16 16  5 2008*  27 NYG  QB  10 16 16  6  2009  28 NYG  QB  10 16 16  7  2010  29 NYG  QB  10 16 16  8 2011*  30 NYG  QB  10 16 16 35/45
  36. 36. 36/45
  37. 37. Save Them dir.create('results')  ggsave('results/EliPass.png')  write.table(eliStats, file='results/eliStats.csv', sep=',', row.names=FALSE) [1] TRUE [1] TRUE 37/45
  38. 38. 38/45
  39. 39. Commit Them repo <‐ repository(getwd())  add(repo, file.path('results', c('eliPass.png', 'eliStats.csv')))  commit(repo, message='Tracking plot and csv')  push(repo) 39/45
  40. 40. 40/45
  41. 41. Email Them footballResults <‐ mime(      To = "jared@landeranalytics.com",      From = "jared@jaredlander.com",      Subject = "Eli Results",      body = "See the attached graph and data.") %>%       attach_file('results/EliPass.png') %>%       attach_file('results/eliStats.csv')  send_message(footballResults) 41/45
  42. 42. Things We've Done Create Directories Query Directories Untar Files Read XKCD Delete Files Get File Info Move Files Copy Files Count Columns Run System Commands · · · · · · · · · · Build File Paths Read Data Munge Data Fit a GLM Make Predictions Generate Plots Save Files Scrape a Website Commit and Push to Git Send an Email · · · · · · · · · · 42/45
  43. 43. Jared P. Lander Chief Data Scientist of Lander Analytics Author of R for Everyone Adjunct Professor at Columbia University Organizer of New York Open Statistical Programming (The R) Meetup Website: http://www.jaredlander.com · · · · · 43/45
  44. 44. Packages rvest ggplot2 dplyr purrr coefplot magrittr useful · · · · · · · 44/45
  45. 45. The Tools R RStudio knitr Pandoc ioslides · · · · · 45/45

×