dplyr
@romain_francois
• Use R since 2002
• #rcatladies
• R Enthusiast
• R/C++ hero
• Performance
• dplyr
• Occasional comedy
%>% from magrittr
enjoy(cool(bake(shape(beat(append(bowl(rep("flour",
2), "yeast", "water", "milk", "oil"), "flour", until
= "soft"), duration = "3mins"), as = "balls", style =
"slightly-flat"), degrees = 200, duration =
"15mins"), duration = "5mins"))
bowl(rep("flour", 2), "yeast", "water", "milk", "oil") %>%
append("flour", until = "soft") %>%

beat(duration = "3mins") %>%

shape(as = "balls", style = "slightly-flat") %>%

bake(degrees = 200, duration = "15mins") %>%

cool(buns, duration = "5mins") %>%
enjoy()
nycflights13
> flights
Source: local data frame [336,776 x 16]
year month day dep_time dep_delay arr_time arr_delay carrier tailnum flight
1 2013 1 1 517 2 830 11 UA N14228 1545
2 2013 1 1 533 4 850 20 UA N24211 1714
.. ... ... ... ... ... ... ... ... ... ...
Variables not shown: origin (chr), dest (chr), air_time (dbl), distance (dbl),
hour (dbl), minute (dbl)
nycflights13
> glimpse(flights)
Observations: 336,776
Variables: 16
$ year (int) 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 201...
$ month (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
$ day (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
$ dep_time (int) 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, 55...
$ dep_delay (dbl) 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1, ...
$ arr_time (int) 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849, 8...
$ arr_delay (dbl) 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -14,...
$ carrier (chr) "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "AA...
$ tailnum (chr) "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N39463...
$ flight (int) 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 49,...
$ origin (chr) "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA", "...
$ dest (chr) "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD", "...
$ air_time (dbl) 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 158...
$ distance (dbl) 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, 10...
$ hour (dbl) 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, ...
$ minute (dbl) 17, 33, 42, 44, 54, 54, 55, 57, 57, 58, 58, 58, 58, 58, 5...
filter: A subset of the rows of the data frame
flights %>%
filter( dep_delay < 10 )
flights %>%
filter( arr_delay < dep_delay )
slice: filter rows by position
flights %>%
slice( 1:10 )
arrange: reorder a data frame
flights %>%
filter( hour < 8 ) %>%
arrange( year, month, day )
select: select certain columns from the data frame
select(flights, year, month, day)
select(flights, year:day)
select(flights, -(year:day))
mutate: modify or create columns based on others
flights %>%
mutate(
gain = arr_delay - dep_delay,
speed = distance / air_time * 60
) %>%
filter( gain > 0 ) %>%
arrange( desc(speed) ) %>%
select( year, month, day, dest, gain, speed )
summarise: collapse a data frame into one row …
flights %>%
summarise(delay = mean(dep_delay, na.rm = TRUE))
flights %>%
filter( dep_delay > 0 ) %>%
summarise(arr_delay = mean(arr_delay, na.rm = TRUE))
group_by: Group observations by one or more variables
flights %>%
group_by( tailnum ) %>%
summarise(
count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)
) %>%
filter( is.finite(delay) ) %>%
arrange( desc(count) )
bind_rows
bind_rows( , )
color num
green 1
yellow 2
red 3
blue 4
pink 5
color num
green 1
yellow 2
color num
red 3
blue 4
pink 5
joins
a <- data_frame(
color = c("green", "yellow", "red"),
num = 1:3
)
b <- data_frame(
color = c("green", "yellow", "pink"),
size = c("S", "M", "L")
)
color num
green 1
yellow 2
red 3
color size
green S
yellow M
pink L
inner_join
color num
green 1
yellow 2
red 3
color size
green S
yellow M
pink L
inner_join( , )
color num size
green 1 S
yellow 2 M
left_join
color num
green 1
yellow 2
red 3
color size
green S
yellow M
pink L
left_join( , )
color num size
green 1 S
yellow 2 M
red 3
right_join
color num
green 1
yellow 2
red 3
color size
green S
yellow M
pink L
right_join( , )
color num size
green 1 S
yellow 2 M
pink L
full_join
color num
green 1
yellow 2
red 3
color size
green S
yellow M
pink L
full_join( , )
color num size
green 1 S
yellow 2 M
red 3
pink L
data_frame: Just like data.frame, but better
> data_frame( x = 1:5, y = letters[1:5] ) %>% glimpse
Observations: 5
Variables: 2
$ x (int) 1, 2, 3, 4, 5
$ y (chr) "a", "b", "c", "d", "e"
> data_frame( x = 1:5, y = letters[1:5] , z = x + 1) %>% glimpse
Observations: 5
Variables: 3
$ x (int) 1, 2, 3, 4, 5
$ y (chr) "a", "b", "c", "d", "e"
$ z (dbl) 2, 3, 4, 5, 6
frame_data aka tibble
> frame_data(
+ ~colA, ~colB,
+ "a", 1,
+ "b", 2
+ )
Source: local data frame [2 x 2]
colA colB
(chr) (dbl)
1 a 1
2 b 2
_
g <- c("origin", "dest")
v <- "dep_delay"
flights %>%
group_by( g ) %>%
summarise( result = mean(v, na.rm = TRUE) )
🙀
🙀
g <- c("origin", "dest")
v <- "dep_delay"
flights %>%
group_by_( .dots = g ) %>%
summarise_( .dots =
interp(~ mean(var, na.rm = TRUE), var = as.name(v))
)
Future
• Performance improvements (parallel C++)
• Alternative back ends
• Different types of groupings (e.g. bootstrap)
As soon as we get hoverboard ...
dplyr
Romain François
@romain_francois
romain@r-enthusiasts.com

dplyr

  • 1.
  • 2.
    • Use R since 2002 • #rcatladies • R Enthusiast • R/C++ hero • Performance • dplyr • Occasional comedy
  • 5.
  • 6.
    enjoy(cool(bake(shape(beat(append(bowl(rep("flour", 2), "yeast", "water", "milk", "oil"), "flour", until = "soft"), duration = "3mins"), as = "balls", style = "slightly-flat"), degrees = 200, duration = "15mins"), duration = "5mins")) bowl(rep("flour", 2), "yeast", "water", "milk", "oil") %>% append("flour", until = "soft") %>%
 beat(duration = "3mins") %>%
 shape(as = "balls", style = "slightly-flat") %>%
 bake(degrees = 200, duration = "15mins") %>%
 cool(buns, duration = "5mins") %>% enjoy()
  • 7.
    nycflights13 > flights Source: local data frame [336,776 x 16] year month day dep_time dep_delay arr_time arr_delay carrier tailnum flight 1 2013 1 1 517 2 830 11 UA N14228 1545 2 2013 1 1 533 4 850 20 UA N24211 1714 .. ... ... ... ... ... ... ... ... ... ... Variables not shown: origin (chr), dest (chr), air_time (dbl), distance (dbl), hour (dbl), minute (dbl)
  • 8.
    nycflights13 > glimpse(flights) Observations: 336,776 Variables: 16 $ year (int) 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 201... $ month (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... $ day (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... $ dep_time (int) 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, 55... $ dep_delay (dbl) 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1, ... $ arr_time (int) 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849, 8... $ arr_delay (dbl) 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -14,... $ carrier (chr) "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "AA... $ tailnum (chr) "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N39463... $ flight (int) 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 49,... $ origin (chr) "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA", "... $ dest (chr) "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD", "... $ air_time (dbl) 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 158... $ distance (dbl) 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, 10... $ hour (dbl) 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, ... $ minute (dbl) 17, 33, 42, 44, 54, 54, 55, 57, 57, 58, 58, 58, 58, 58, 5...
  • 9.
    filter: A subset of the rows of the data frame flights %>% filter( dep_delay < 10 ) flights %>% filter( arr_delay < dep_delay )
  • 10.
    slice: filter rows by position flights %>% slice( 1:10 )
  • 11.
    arrange: reorder a data frame flights %>% filter( hour < 8 ) %>% arrange( year, month, day )
  • 12.
    select: select certain columns from the data frame select(flights, year, month, day) select(flights, year:day) select(flights, -(year:day))
  • 13.
    mutate: modify or create columns based on others flights %>% mutate( gain = arr_delay - dep_delay, speed = distance / air_time * 60 ) %>% filter( gain > 0 ) %>% arrange( desc(speed) ) %>% select( year, month, day, dest, gain, speed )
  • 14.
    summarise: collapse a data frame into one row … flights %>% summarise(delay = mean(dep_delay, na.rm = TRUE)) flights %>% filter( dep_delay > 0 ) %>% summarise(arr_delay = mean(arr_delay, na.rm = TRUE))
  • 15.
    group_by: Group observations by one or more variables flights %>% group_by( tailnum ) %>% summarise( count = n(), dist = mean(distance, na.rm = TRUE), delay = mean(arr_delay, na.rm = TRUE) ) %>% filter( is.finite(delay) ) %>% arrange( desc(count) )
  • 16.
    bind_rows bind_rows( , ) color num green 1 yellow 2 red 3 blue 4 pink 5 color num green 1 yellow 2 color num red 3 blue 4 pink 5
  • 17.
    joins a <- data_frame( color = c("green", "yellow", "red"), num = 1:3 ) b <- data_frame( color = c("green", "yellow", "pink"), size = c("S", "M", "L") ) color num green 1 yellow 2 red 3 color size green S yellow M pink L
  • 18.
    inner_join color num green 1 yellow 2 red 3 color size green S yellow M pink L inner_join( , ) color num size green 1 S yellow 2 M
  • 19.
    left_join color num green 1 yellow 2 red 3 color size green S yellow M pink L left_join( , ) color num size green 1 S yellow 2 M red 3
  • 20.
    right_join color num green 1 yellow 2 red 3 color size green S yellow M pink L right_join( , ) color num size green 1 S yellow 2 M pink L
  • 21.
    full_join color num green 1 yellow 2 red 3 color size green S yellow M pink L full_join( , ) color num size green 1 S yellow 2 M red 3 pink L
  • 22.
    data_frame: Just like data.frame, but better > data_frame( x = 1:5, y = letters[1:5] ) %>% glimpse Observations: 5 Variables: 2 $ x (int) 1, 2, 3, 4, 5 $ y (chr) "a", "b", "c", "d", "e" > data_frame( x = 1:5, y = letters[1:5] , z = x + 1) %>% glimpse Observations: 5 Variables: 3 $ x (int) 1, 2, 3, 4, 5 $ y (chr) "a", "b", "c", "d", "e" $ z (dbl) 2, 3, 4, 5, 6
  • 23.
    frame_data aka tibble > frame_data( + ~colA, ~colB, + "a", 1, + "b", 2 + ) Source: local data frame [2 x 2] colA colB (chr) (dbl) 1 a 1 2 b 2
  • 24.
  • 25.
    g <- c("origin", "dest") v <- "dep_delay" flights %>% group_by( g ) %>% summarise( result = mean(v, na.rm = TRUE) ) 🙀 🙀
  • 26.
    g <- c("origin", "dest") v <- "dep_delay" flights %>% group_by_( .dots = g ) %>% summarise_( .dots = interp(~ mean(var, na.rm = TRUE), var = as.name(v)) )
  • 30.
    Future • Performance improvements (parallel C++) • Alternative back ends • Different types of groupings (e.g. bootstrap) As soon as we get hoverboard ...
  • 31.