SlideShare a Scribd company logo
1 of 33
Download to read offline
dplyr use case
Romain François
@romain_francois
#satRday/Budapest
romainfrancois
%>%
enjoy(cool(bake(shape(beat(append(bowl(rep("flour",
2), "yeast", "water", "milk", "oil"), "flour", until
= "soft"), duration = "3mins"), as = "balls", style =
"slightly-flat"), degrees = 200, duration =
"15mins"), duration = "5mins"))
bowl(rep("flour", 2), "yeast", "water", "milk", "oil") %>%
append("flour", until = "soft") %>%

beat(duration = "3mins") %>%

shape(as = "balls", style = "slightly-flat") %>%

bake(degrees = 200, duration = "15mins") %>%

cool(buns, duration = "5mins") %>%
enjoy()
tmp <- tempfile( fileext = ".xlsx")
download.file("http://bit.ly/2bPaiyU", tmp )
data <- readxl::read_excel(tmp)
All caps column names with spaces
> names(data)
[1] "DESTINATION" "COMMERCIAL FLAG" "CITY" "REGION"
[5] "COUNTRY" "FLIGH DIRECTION" "FLIGHT TYPE" "DATE"
[9] "DATE YEAR" "DATE HALF YEAR" "DATE YEAR QUARTER" "DATE YEAR MONTH"
[13] "NBR OF PASSENGERS" "CARGO WEIGHT" "NBR OF FLIGHTS" "SEAT CAPACITY"
> unique( data[["COMMERCIAL FLAG"]] )
[1] "Commercial"
> summary( as.factor( data[["REGION"]] ) )
China VH China ZB China ZS Germany ED Germany ET India VA India VI
68 127 3 2160 9 3 1
Japan RJ Russia UI Russia UL Russia UN Russia UR Russia US Russia UU
63 1 167 7 62 59 340
Russia UW Spain GC Spain LE USA KA USA KB USA KC USA KD
6 205 870 4 7 5 19
USA KE USA KI USA KJ USA KM USA KP USA KT USA KW
3 1 114 2 4 2 5
NA's
14520
> data %>% select( starts_with( "DATE") )
# A tibble: 18,837 x 5
DATE DATE YEAR DATE HALF YEAR DATE YEAR QUARTER DATE YEAR MONTH
<time> <dbl> <chr> <chr> <chr>
1 2010-11-01 2010 2010H2 2010Q4 201011
2 2010-12-01 2010 2010H2 2010Q4 201012
3 2011-01-01 2011 2011H1 2011Q1 201101
4 2007-05-01 2007 2007H1 2007Q2 200705
5 2011-07-01 2011 2011H2 2011Q3 201107
6 2010-07-01 2010 2010H2 2010Q3 201007
7 2010-04-01 2010 2010H1 2010Q2 201004
8 2012-06-01 2012 2012H1 2012Q2 201206
9 2009-02-01 2009 2009H1 2009Q1 200902
10 2009-10-01 2009 2009H2 2009Q4 200910
# ... with 18,827 more rows
> data %>%
+ filter( `NBR OF PASSENGERS` == 0 & `CARGO WEIGHT` == 0 ) %>%
+ group_by( CITY, `DATE YEAR MONTH`, `FLIGH DIRECTION` ) %>%
+ summarise( flights = sum( `NBR OF FLIGHTS` ) ) %>%
+ arrange( desc(flights) )
Source: local data frame [2,417 x 4]
Groups: CITY, DATE YEAR MONTH [2,023]
CITY DATE YEAR MONTH FLIGH DIRECTION flights
<chr> <chr> <chr> <dbl>
1 Timisoara 200911 Incoming 26
2 Arad 200701 Incoming 23
3 Arad 200801 Outgoing 22
4 Cluj Napoca 201110 Incoming 22
5 Arad 200801 Incoming 21
# ...
Empty flights
> data %>%
+ filter( `SEAT CAPACITY` < `NBR OF PASSENGERS` ) %>%
+ group_by( CITY ) %>%
+ summarise(
+ flights = sum( `NBR OF FLIGHTS` ),
+ passengers = sum( `NBR OF PASSENGERS`),
+ capacity = sum( `SEAT CAPACITY`)
+ ) %>%
+ arrange( desc(flights) )
# A tibble: 8 x 4
CITY flights passengers capacity
<chr> <dbl> <dbl> <dbl>
1 Hurghada 32 5229 4665
2 Tenerife Sur 4 760 756
3 Djerba 3 530 510
Empty flights
flights <- data %>%
`names<-`( casefold(gsub( " ", "_", names(data))) ) %>%
rename( direction = fligh_direction, type = flight_type ) %>%
mutate(
year = date_year,
month = as.numeric(substr(date_year_month, 5, 6)),
no_passengers = nbr_of_passengers==0,
no_weight = cargo_weight==0
) %>%
group_by( year, month, country, city, direction ) %>%
summarise(
passengers = sum(nbr_of_passengers),
weight = sum(cargo_weight),
capacity = sum(seat_capacity),
flights = sum(nbr_of_flights),
scheduled = sum(nbr_of_flights[type == "Scheduled"] ),
non_scheduled = flights - scheduled,
flights_no_passengers = sum( nbr_of_flights[no_passengers] ),
flights_no_weight = sum( nbr_of_flights[no_weight] ),
flights_empty = sum( nbr_of_flights[no_passengers & no_weight]),
filling = passengers / capacity
) %>%
arrange( year, month, country, city, direction ) %>%
ungroup
> glimpse(flights)
Observations: 14,772
Variables: 14
$ year <dbl> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007,...
$ month <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
$ country <chr> "Albania", "Albania", "Austria", "Austria", "Azerbaijan", "...
$ city <chr> "Tirana", "Tirana", "Wien", "Wien", "Baku", "Brussels", "Br...
$ direction <chr> "Incoming", "Outgoing", "Incoming", "Outgoing", "Incoming",...
$ passengers <dbl> 2448, 1429, 2530, 3126, 0, 4310, 5362, 2760, 3269, 0, 37, 1...
$ weight <dbl> 0, 0, 0, 0, 597393, 598235, 7343, 0, 0, 0, 0, 0, 0, 0, 0, 2...
$ capacity <dbl> 4024, 4024, 5684, 5673, 2365, 8951, 8922, 4320, 4320, 50, 5...
$ flights <dbl> 44, 44, 91, 90, 15, 95, 87, 24, 24, 1, 1, 24, 24, 3, 3, 68,...
$ scheduled <dbl> 44, 44, 91, 89, 15, 95, 87, 24, 24, 0, 0, 24, 24, 0, 0, 68,...
$ non_scheduled <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 3, 3, 0, 3, 0, 0, 0,...
$ flights_no_passengers <dbl> 0, 0, 0, 1, 15, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
$ flights_no_weight <dbl> 44, 44, 91, 90, 0, 0, 0, 24, 24, 1, 1, 24, 24, 3, 3, 0, 3, ...
$ flights_empty <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
> places <- with( distinct( flights, city, country ), paste(city, country, sep = ", ") )
>
> p <- progress_estimated(length(places))
> coords <- tbl_df( bind_rows( lapply(places, function(place){
+ res <- geocode( place )[1, ]
+ Sys.sleep( .2 ) # otherwise geocode wont let us get all the data
+ p$tick()$print()
+ res
+ }) ) )
|==== | 8% ~2 m remaining
> glimpse(cities)
Observations: 484
Variables: 10
$ city <chr> "Mazari Sharif", "Bagram Air Base", "Kabul", "Kandahar", "Tirana...
$ country <chr> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Alb...
$ interpretedPlace <chr> "Mazari Sharif, Afghanistan", "Bagram Airfield, Bagram, Afghanis...
$ longitude <dbl> 67.117951, 69.273567, 69.207486, 65.737175, 19.818698, -2.214323...
$ latitude <dbl> 36.69262, 34.94448, 34.55535, 31.62887, 41.32755, 31.61825, 22.7...
$ xmin <dbl> 67.060633, 69.272218, 68.949509, 65.669574, 19.753568, -2.404461...
$ xmax <dbl> 67.235212, 69.274916, 69.445953, 65.830765, 19.882078, -1.352005...
$ ymin <dbl> 36.65382, 34.94313, 34.34549, 31.59740, 41.29512, 31.18248, 20.8...
$ ymax <dbl> 36.75264, 34.94582, 34.76192, 31.67252, 41.36684, 32.04475, 23.6...
$ uncertainty <dbl> 6692, 194, 31670, 7302, 6535, 51589, 150488, 239610, 12129, 1738...
Who went to Budapest
in summer 2011 ?
flights %>%
filter(
direction == "Incoming",
month %in% c(7,8),
year == 2011
) %>%
group_by( country ) %>%
summarise(
passengers = sum(passengers),
flights = sum(flights)
) %>%
arrange( desc(passengers) )
Summer
Goes to Budapest
flights %>%
filter(
direction == "Incoming",
month %in% c(7,8),
year == 2011
) %>%
group_by( country ) %>%
summarise(
passengers = sum(passengers),
flights = sum(flights)
) %>%
arrange( desc(passengers) )
Who
flights %>%
filter(
direction == "Incoming",
month %in% c(7,8),
year == 2011
) %>%
group_by( country ) %>%
summarise(
passengers = sum(passengers),
flights = sum(flights)
) %>%
arrange( desc(passengers) )
Summarise
flights %>%
filter(
direction == "Incoming",
month %in% c(7,8),
year == 2011
) %>%
group_by( country ) %>%
summarise(
passengers = sum(passengers),
flights = sum(flights)
) %>%
arrange( desc(passengers) ) Organise
flights %>%
filter(
direction == "Incoming",
month %in% c(7,8),
year == 2011
) %>%
group_by( country ) %>%
summarise(
passengers = sum(passengers),
flights = sum(flights)
) %>%
arrange( desc(passengers) ) Organise
When are people
leaving Budapest ?
data <- flights %>%
filter( direction == "Outgoing" ) %>%
mutate(
year = as.factor(year),
month = factor( month.abb[month], levels = month.abb )
) %>%
group_by( year, month ) %>%
summarise( passengers = sum( passengers ))
ggplot( data ) +
aes( x = month, y = passengers,
group = year, color = year
) +
geom_line() +
geom_point()
Leaving Budapest
data <- flights %>%
filter( direction == "Outgoing" ) %>%
mutate(
year = as.factor(year),
month = factor( month.abb[month], levels = month.abb )
) %>%
group_by( year, month ) %>%
summarise( passengers = sum( passengers ))
ggplot( data ) +
aes( x = month, y = passengers,
group = year, color = year
) +
geom_line() +
geom_point()
Collapse #passengers
by year/month
data <- flights %>%
filter( direction == "Outgoing" ) %>%
mutate(
year = as.factor(year),
month = factor( month.abb[month], levels = month.abb )
) %>%
group_by( year, month ) %>%
summarise( passengers = sum( passengers ))
ggplot( data ) +
aes( x = month, y = passengers,
group = year, color = year
) +
geom_line() +
geom_point()
Make year and month factors
data <- flights %>%
filter( direction == "Outgoing" ) %>%
mutate(
year = as.factor(year),
month = factor( month.abb[month], levels = month.abb )
) %>%
group_by( year, month ) %>%
summarise( passengers = sum( passengers ))
ggplot( data ) +
aes( x = month, y = passengers,
group = year, color = year
) +
geom_line() +
geom_point()
viz with ggplot2
When are people leaving Budapest ?
satRday flights fever
Flights from/to Budapest
Budapest #satRday dataviz competition
Romain François Cecile Sauder
Questions
Question

More Related Content

What's hot

The groovy puzzlers (as Presented at JavaOne 2014)
The groovy puzzlers (as Presented at JavaOne 2014)The groovy puzzlers (as Presented at JavaOne 2014)
The groovy puzzlers (as Presented at JavaOne 2014)
GroovyPuzzlers
 
FrontendLab: Programming UI with FRP and Bacon js - Вячеслав Ворончук
FrontendLab: Programming UI with FRP and Bacon js - Вячеслав ВорончукFrontendLab: Programming UI with FRP and Bacon js - Вячеслав Ворончук
FrontendLab: Programming UI with FRP and Bacon js - Вячеслав Ворончук
GeeksLab Odessa
 
eJADA web development the Ruby way
eJADA web development the Ruby wayeJADA web development the Ruby way
eJADA web development the Ruby way
Mustafah Elbanna
 
Vim Hacks (OSSF)
Vim Hacks (OSSF)Vim Hacks (OSSF)
Vim Hacks (OSSF)
Lin Yo-An
 

What's hot (20)

The groovy puzzlers (as Presented at JavaOne 2014)
The groovy puzzlers (as Presented at JavaOne 2014)The groovy puzzlers (as Presented at JavaOne 2014)
The groovy puzzlers (as Presented at JavaOne 2014)
 
Escuela Nº 839
Escuela Nº 839Escuela Nº 839
Escuela Nº 839
 
FrontendLab: Programming UI with FRP and Bacon js - Вячеслав Ворончук
FrontendLab: Programming UI with FRP and Bacon js - Вячеслав ВорончукFrontendLab: Programming UI with FRP and Bacon js - Вячеслав Ворончук
FrontendLab: Programming UI with FRP and Bacon js - Вячеслав Ворончук
 
Yahoo! JAPANとKotlin
Yahoo! JAPANとKotlinYahoo! JAPANとKotlin
Yahoo! JAPANとKotlin
 
Extending Operators in Perl with Operator::Util
Extending Operators in Perl with Operator::UtilExtending Operators in Perl with Operator::Util
Extending Operators in Perl with Operator::Util
 
Bloqueador cmd-sh
Bloqueador cmd-shBloqueador cmd-sh
Bloqueador cmd-sh
 
Communities - Perl edition (RioJS)
Communities - Perl edition (RioJS)Communities - Perl edition (RioJS)
Communities - Perl edition (RioJS)
 
Programming Lisp Clojure - 2장 : 클로저 둘러보기
Programming Lisp Clojure - 2장 : 클로저 둘러보기Programming Lisp Clojure - 2장 : 클로저 둘러보기
Programming Lisp Clojure - 2장 : 클로저 둘러보기
 
Swift - 혼자 공부하면 분명히 안할테니까 같이 공부하기
Swift - 혼자 공부하면 분명히 안할테니까 같이 공부하기Swift - 혼자 공부하면 분명히 안할테니까 같이 공부하기
Swift - 혼자 공부하면 분명히 안할테니까 같이 공부하기
 
FRP and Bacon.js
FRP and Bacon.jsFRP and Bacon.js
FRP and Bacon.js
 
Java Unicode with Cool GUI Examples
Java Unicode with Cool GUI ExamplesJava Unicode with Cool GUI Examples
Java Unicode with Cool GUI Examples
 
Java Unicode with Live GUI Examples
Java Unicode with Live GUI ExamplesJava Unicode with Live GUI Examples
Java Unicode with Live GUI Examples
 
Damn Fine CoffeeScript
Damn Fine CoffeeScriptDamn Fine CoffeeScript
Damn Fine CoffeeScript
 
eJADA web development the Ruby way
eJADA web development the Ruby wayeJADA web development the Ruby way
eJADA web development the Ruby way
 
Ruby 1.9
Ruby 1.9Ruby 1.9
Ruby 1.9
 
Islam House
Islam HouseIslam House
Islam House
 
Elixir
ElixirElixir
Elixir
 
Vim Hacks (OSSF)
Vim Hacks (OSSF)Vim Hacks (OSSF)
Vim Hacks (OSSF)
 
1024+ Seconds of JS Wizardry - JSConf.eu 2013
1024+ Seconds of JS Wizardry - JSConf.eu 20131024+ Seconds of JS Wizardry - JSConf.eu 2013
1024+ Seconds of JS Wizardry - JSConf.eu 2013
 
The Marvelous Land of Higher Order Functions
The Marvelous Land of Higher Order FunctionsThe Marvelous Land of Higher Order Functions
The Marvelous Land of Higher Order Functions
 

Similar to dplyr use case

Let’s Talk About Ruby
Let’s Talk About RubyLet’s Talk About Ruby
Let’s Talk About Ruby
Ian Bishop
 
Useful javascript
Useful javascriptUseful javascript
Useful javascript
Lei Kang
 
Refactoring to Macros with Clojure
Refactoring to Macros with ClojureRefactoring to Macros with Clojure
Refactoring to Macros with Clojure
Dmitry Buzdin
 

Similar to dplyr use case (20)

dplyr
dplyrdplyr
dplyr
 
R meets Hadoop
R meets HadoopR meets Hadoop
R meets Hadoop
 
Using a mobile phone as a therapist - Superweek 2018
Using a mobile phone as a therapist - Superweek 2018Using a mobile phone as a therapist - Superweek 2018
Using a mobile phone as a therapist - Superweek 2018
 
Using R for Building a Simple and Effective Dashboard
Using R for Building a Simple and Effective DashboardUsing R for Building a Simple and Effective Dashboard
Using R for Building a Simple and Effective Dashboard
 
Let’s Talk About Ruby
Let’s Talk About RubyLet’s Talk About Ruby
Let’s Talk About Ruby
 
A Shiny Example-- R
A Shiny Example-- RA Shiny Example-- R
A Shiny Example-- R
 
Useful javascript
Useful javascriptUseful javascript
Useful javascript
 
JQuery Flot
JQuery FlotJQuery Flot
JQuery Flot
 
Advanced Data Visualization in R- Somes Examples.
Advanced Data Visualization in R- Somes Examples.Advanced Data Visualization in R- Somes Examples.
Advanced Data Visualization in R- Somes Examples.
 
{tidygraph}と{ggraph}によるモダンなネットワーク分析
{tidygraph}と{ggraph}によるモダンなネットワーク分析{tidygraph}と{ggraph}によるモダンなネットワーク分析
{tidygraph}と{ggraph}によるモダンなネットワーク分析
 
Super Advanced Python –act1
Super Advanced Python –act1Super Advanced Python –act1
Super Advanced Python –act1
 
Joclad 2010 d
Joclad 2010 dJoclad 2010 d
Joclad 2010 d
 
Refactoring to Macros with Clojure
Refactoring to Macros with ClojureRefactoring to Macros with Clojure
Refactoring to Macros with Clojure
 
Send + More = Money – Let’s mash 2 monads to solve a simple CSP
Send + More = Money – Let’s mash 2 monads to solve a simple CSPSend + More = Money – Let’s mash 2 monads to solve a simple CSP
Send + More = Money – Let’s mash 2 monads to solve a simple CSP
 
R for you
R for youR for you
R for you
 
Apache big-data-2017-scala-sql
Apache big-data-2017-scala-sqlApache big-data-2017-scala-sql
Apache big-data-2017-scala-sql
 
JavaOne2010 Groovy/Spring Roo
JavaOne2010 Groovy/Spring RooJavaOne2010 Groovy/Spring Roo
JavaOne2010 Groovy/Spring Roo
 
Τα Πολύ Βασικά για την Python
Τα Πολύ Βασικά για την PythonΤα Πολύ Βασικά για την Python
Τα Πολύ Βασικά για την Python
 
Pop3ck sh
Pop3ck shPop3ck sh
Pop3ck sh
 
R programming language
R programming languageR programming language
R programming language
 

More from Romain Francois

Rcpp: Seemless R and C++
Rcpp: Seemless R and C++Rcpp: Seemless R and C++
Rcpp: Seemless R and C++
Romain Francois
 
RProtoBuf: protocol buffers for R
RProtoBuf: protocol buffers for RRProtoBuf: protocol buffers for R
RProtoBuf: protocol buffers for R
Romain Francois
 
Rcpp: Seemless R and C++
Rcpp: Seemless R and C++Rcpp: Seemless R and C++
Rcpp: Seemless R and C++
Romain Francois
 
Rcpp: Seemless R and C++
Rcpp: Seemless R and C++Rcpp: Seemless R and C++
Rcpp: Seemless R and C++
Romain Francois
 

More from Romain Francois (19)

R/C++
R/C++R/C++
R/C++
 
user2015 keynote talk
user2015 keynote talkuser2015 keynote talk
user2015 keynote talk
 
SevillaR meetup: dplyr and magrittr
SevillaR meetup: dplyr and magrittrSevillaR meetup: dplyr and magrittr
SevillaR meetup: dplyr and magrittr
 
Data manipulation with dplyr
Data manipulation with dplyrData manipulation with dplyr
Data manipulation with dplyr
 
R/C++ talk at earl 2014
R/C++ talk at earl 2014R/C++ talk at earl 2014
R/C++ talk at earl 2014
 
Rcpp11 genentech
Rcpp11 genentechRcpp11 genentech
Rcpp11 genentech
 
Rcpp11 useR2014
Rcpp11 useR2014Rcpp11 useR2014
Rcpp11 useR2014
 
Rcpp11
Rcpp11Rcpp11
Rcpp11
 
R and C++
R and C++R and C++
R and C++
 
R and cpp
R and cppR and cpp
R and cpp
 
Rcpp attributes
Rcpp attributesRcpp attributes
Rcpp attributes
 
Rcpp is-ready
Rcpp is-readyRcpp is-ready
Rcpp is-ready
 
Rcpp
RcppRcpp
Rcpp
 
Integrating R with C++: Rcpp, RInside and RProtoBuf
Integrating R with C++: Rcpp, RInside and RProtoBufIntegrating R with C++: Rcpp, RInside and RProtoBuf
Integrating R with C++: Rcpp, RInside and RProtoBuf
 
Object Oriented Design(s) in R
Object Oriented Design(s) in RObject Oriented Design(s) in R
Object Oriented Design(s) in R
 
Rcpp: Seemless R and C++
Rcpp: Seemless R and C++Rcpp: Seemless R and C++
Rcpp: Seemless R and C++
 
RProtoBuf: protocol buffers for R
RProtoBuf: protocol buffers for RRProtoBuf: protocol buffers for R
RProtoBuf: protocol buffers for R
 
Rcpp: Seemless R and C++
Rcpp: Seemless R and C++Rcpp: Seemless R and C++
Rcpp: Seemless R and C++
 
Rcpp: Seemless R and C++
Rcpp: Seemless R and C++Rcpp: Seemless R and C++
Rcpp: Seemless R and C++
 

Recently uploaded

Cloud Frontiers: A Deep Dive into Serverless Spatial Data and FME
Cloud Frontiers:  A Deep Dive into Serverless Spatial Data and FMECloud Frontiers:  A Deep Dive into Serverless Spatial Data and FME
Cloud Frontiers: A Deep Dive into Serverless Spatial Data and FME
Safe Software
 
Why Teams call analytics are critical to your entire business
Why Teams call analytics are critical to your entire businessWhy Teams call analytics are critical to your entire business
Why Teams call analytics are critical to your entire business
panagenda
 

Recently uploaded (20)

Spring Boot vs Quarkus the ultimate battle - DevoxxUK
Spring Boot vs Quarkus the ultimate battle - DevoxxUKSpring Boot vs Quarkus the ultimate battle - DevoxxUK
Spring Boot vs Quarkus the ultimate battle - DevoxxUK
 
"I see eyes in my soup": How Delivery Hero implemented the safety system for ...
"I see eyes in my soup": How Delivery Hero implemented the safety system for ..."I see eyes in my soup": How Delivery Hero implemented the safety system for ...
"I see eyes in my soup": How Delivery Hero implemented the safety system for ...
 
DBX First Quarter 2024 Investor Presentation
DBX First Quarter 2024 Investor PresentationDBX First Quarter 2024 Investor Presentation
DBX First Quarter 2024 Investor Presentation
 
Apidays New York 2024 - APIs in 2030: The Risk of Technological Sleepwalk by ...
Apidays New York 2024 - APIs in 2030: The Risk of Technological Sleepwalk by ...Apidays New York 2024 - APIs in 2030: The Risk of Technological Sleepwalk by ...
Apidays New York 2024 - APIs in 2030: The Risk of Technological Sleepwalk by ...
 
Six Myths about Ontologies: The Basics of Formal Ontology
Six Myths about Ontologies: The Basics of Formal OntologySix Myths about Ontologies: The Basics of Formal Ontology
Six Myths about Ontologies: The Basics of Formal Ontology
 
Mcleodganj Call Girls 🥰 8617370543 Service Offer VIP Hot Model
Mcleodganj Call Girls 🥰 8617370543 Service Offer VIP Hot ModelMcleodganj Call Girls 🥰 8617370543 Service Offer VIP Hot Model
Mcleodganj Call Girls 🥰 8617370543 Service Offer VIP Hot Model
 
How to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected WorkerHow to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected Worker
 
Cloud Frontiers: A Deep Dive into Serverless Spatial Data and FME
Cloud Frontiers:  A Deep Dive into Serverless Spatial Data and FMECloud Frontiers:  A Deep Dive into Serverless Spatial Data and FME
Cloud Frontiers: A Deep Dive into Serverless Spatial Data and FME
 
FWD Group - Insurer Innovation Award 2024
FWD Group - Insurer Innovation Award 2024FWD Group - Insurer Innovation Award 2024
FWD Group - Insurer Innovation Award 2024
 
CNIC Information System with Pakdata Cf In Pakistan
CNIC Information System with Pakdata Cf In PakistanCNIC Information System with Pakdata Cf In Pakistan
CNIC Information System with Pakdata Cf In Pakistan
 
Corporate and higher education May webinar.pptx
Corporate and higher education May webinar.pptxCorporate and higher education May webinar.pptx
Corporate and higher education May webinar.pptx
 
Introduction to Multilingual Retrieval Augmented Generation (RAG)
Introduction to Multilingual Retrieval Augmented Generation (RAG)Introduction to Multilingual Retrieval Augmented Generation (RAG)
Introduction to Multilingual Retrieval Augmented Generation (RAG)
 
AWS Community Day CPH - Three problems of Terraform
AWS Community Day CPH - Three problems of TerraformAWS Community Day CPH - Three problems of Terraform
AWS Community Day CPH - Three problems of Terraform
 
Introduction to use of FHIR Documents in ABDM
Introduction to use of FHIR Documents in ABDMIntroduction to use of FHIR Documents in ABDM
Introduction to use of FHIR Documents in ABDM
 
Connector Corner: Accelerate revenue generation using UiPath API-centric busi...
Connector Corner: Accelerate revenue generation using UiPath API-centric busi...Connector Corner: Accelerate revenue generation using UiPath API-centric busi...
Connector Corner: Accelerate revenue generation using UiPath API-centric busi...
 
Artificial Intelligence Chap.5 : Uncertainty
Artificial Intelligence Chap.5 : UncertaintyArtificial Intelligence Chap.5 : Uncertainty
Artificial Intelligence Chap.5 : Uncertainty
 
Why Teams call analytics are critical to your entire business
Why Teams call analytics are critical to your entire businessWhy Teams call analytics are critical to your entire business
Why Teams call analytics are critical to your entire business
 
Understanding the FAA Part 107 License ..
Understanding the FAA Part 107 License ..Understanding the FAA Part 107 License ..
Understanding the FAA Part 107 License ..
 
Apidays New York 2024 - The Good, the Bad and the Governed by David O'Neill, ...
Apidays New York 2024 - The Good, the Bad and the Governed by David O'Neill, ...Apidays New York 2024 - The Good, the Bad and the Governed by David O'Neill, ...
Apidays New York 2024 - The Good, the Bad and the Governed by David O'Neill, ...
 
AI+A11Y 11MAY2024 HYDERBAD GAAD 2024 - HelloA11Y (11 May 2024)
AI+A11Y 11MAY2024 HYDERBAD GAAD 2024 - HelloA11Y (11 May 2024)AI+A11Y 11MAY2024 HYDERBAD GAAD 2024 - HelloA11Y (11 May 2024)
AI+A11Y 11MAY2024 HYDERBAD GAAD 2024 - HelloA11Y (11 May 2024)
 

dplyr use case

  • 1. dplyr use case Romain François @romain_francois #satRday/Budapest romainfrancois
  • 2. %>%
  • 3.
  • 4. enjoy(cool(bake(shape(beat(append(bowl(rep("flour", 2), "yeast", "water", "milk", "oil"), "flour", until = "soft"), duration = "3mins"), as = "balls", style = "slightly-flat"), degrees = 200, duration = "15mins"), duration = "5mins")) bowl(rep("flour", 2), "yeast", "water", "milk", "oil") %>% append("flour", until = "soft") %>%
 beat(duration = "3mins") %>%
 shape(as = "balls", style = "slightly-flat") %>%
 bake(degrees = 200, duration = "15mins") %>%
 cool(buns, duration = "5mins") %>% enjoy()
  • 5.
  • 6. tmp <- tempfile( fileext = ".xlsx") download.file("http://bit.ly/2bPaiyU", tmp ) data <- readxl::read_excel(tmp)
  • 7. All caps column names with spaces > names(data) [1] "DESTINATION" "COMMERCIAL FLAG" "CITY" "REGION" [5] "COUNTRY" "FLIGH DIRECTION" "FLIGHT TYPE" "DATE" [9] "DATE YEAR" "DATE HALF YEAR" "DATE YEAR QUARTER" "DATE YEAR MONTH" [13] "NBR OF PASSENGERS" "CARGO WEIGHT" "NBR OF FLIGHTS" "SEAT CAPACITY"
  • 8. > unique( data[["COMMERCIAL FLAG"]] ) [1] "Commercial" > summary( as.factor( data[["REGION"]] ) ) China VH China ZB China ZS Germany ED Germany ET India VA India VI 68 127 3 2160 9 3 1 Japan RJ Russia UI Russia UL Russia UN Russia UR Russia US Russia UU 63 1 167 7 62 59 340 Russia UW Spain GC Spain LE USA KA USA KB USA KC USA KD 6 205 870 4 7 5 19 USA KE USA KI USA KJ USA KM USA KP USA KT USA KW 3 1 114 2 4 2 5 NA's 14520
  • 9. > data %>% select( starts_with( "DATE") ) # A tibble: 18,837 x 5 DATE DATE YEAR DATE HALF YEAR DATE YEAR QUARTER DATE YEAR MONTH <time> <dbl> <chr> <chr> <chr> 1 2010-11-01 2010 2010H2 2010Q4 201011 2 2010-12-01 2010 2010H2 2010Q4 201012 3 2011-01-01 2011 2011H1 2011Q1 201101 4 2007-05-01 2007 2007H1 2007Q2 200705 5 2011-07-01 2011 2011H2 2011Q3 201107 6 2010-07-01 2010 2010H2 2010Q3 201007 7 2010-04-01 2010 2010H1 2010Q2 201004 8 2012-06-01 2012 2012H1 2012Q2 201206 9 2009-02-01 2009 2009H1 2009Q1 200902 10 2009-10-01 2009 2009H2 2009Q4 200910 # ... with 18,827 more rows
  • 10. > data %>% + filter( `NBR OF PASSENGERS` == 0 & `CARGO WEIGHT` == 0 ) %>% + group_by( CITY, `DATE YEAR MONTH`, `FLIGH DIRECTION` ) %>% + summarise( flights = sum( `NBR OF FLIGHTS` ) ) %>% + arrange( desc(flights) ) Source: local data frame [2,417 x 4] Groups: CITY, DATE YEAR MONTH [2,023] CITY DATE YEAR MONTH FLIGH DIRECTION flights <chr> <chr> <chr> <dbl> 1 Timisoara 200911 Incoming 26 2 Arad 200701 Incoming 23 3 Arad 200801 Outgoing 22 4 Cluj Napoca 201110 Incoming 22 5 Arad 200801 Incoming 21 # ... Empty flights
  • 11. > data %>% + filter( `SEAT CAPACITY` < `NBR OF PASSENGERS` ) %>% + group_by( CITY ) %>% + summarise( + flights = sum( `NBR OF FLIGHTS` ), + passengers = sum( `NBR OF PASSENGERS`), + capacity = sum( `SEAT CAPACITY`) + ) %>% + arrange( desc(flights) ) # A tibble: 8 x 4 CITY flights passengers capacity <chr> <dbl> <dbl> <dbl> 1 Hurghada 32 5229 4665 2 Tenerife Sur 4 760 756 3 Djerba 3 530 510 Empty flights
  • 12.
  • 13. flights <- data %>% `names<-`( casefold(gsub( " ", "_", names(data))) ) %>% rename( direction = fligh_direction, type = flight_type ) %>% mutate( year = date_year, month = as.numeric(substr(date_year_month, 5, 6)), no_passengers = nbr_of_passengers==0, no_weight = cargo_weight==0 ) %>% group_by( year, month, country, city, direction ) %>% summarise( passengers = sum(nbr_of_passengers), weight = sum(cargo_weight), capacity = sum(seat_capacity), flights = sum(nbr_of_flights), scheduled = sum(nbr_of_flights[type == "Scheduled"] ), non_scheduled = flights - scheduled, flights_no_passengers = sum( nbr_of_flights[no_passengers] ), flights_no_weight = sum( nbr_of_flights[no_weight] ), flights_empty = sum( nbr_of_flights[no_passengers & no_weight]), filling = passengers / capacity ) %>% arrange( year, month, country, city, direction ) %>% ungroup
  • 14. > glimpse(flights) Observations: 14,772 Variables: 14 $ year <dbl> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007,... $ month <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,... $ country <chr> "Albania", "Albania", "Austria", "Austria", "Azerbaijan", "... $ city <chr> "Tirana", "Tirana", "Wien", "Wien", "Baku", "Brussels", "Br... $ direction <chr> "Incoming", "Outgoing", "Incoming", "Outgoing", "Incoming",... $ passengers <dbl> 2448, 1429, 2530, 3126, 0, 4310, 5362, 2760, 3269, 0, 37, 1... $ weight <dbl> 0, 0, 0, 0, 597393, 598235, 7343, 0, 0, 0, 0, 0, 0, 0, 0, 2... $ capacity <dbl> 4024, 4024, 5684, 5673, 2365, 8951, 8922, 4320, 4320, 50, 5... $ flights <dbl> 44, 44, 91, 90, 15, 95, 87, 24, 24, 1, 1, 24, 24, 3, 3, 68,... $ scheduled <dbl> 44, 44, 91, 89, 15, 95, 87, 24, 24, 0, 0, 24, 24, 0, 0, 68,... $ non_scheduled <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 3, 3, 0, 3, 0, 0, 0,... $ flights_no_passengers <dbl> 0, 0, 0, 1, 15, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0... $ flights_no_weight <dbl> 44, 44, 91, 90, 0, 0, 0, 24, 24, 1, 1, 24, 24, 3, 3, 0, 3, ... $ flights_empty <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
  • 15.
  • 16. > places <- with( distinct( flights, city, country ), paste(city, country, sep = ", ") ) > > p <- progress_estimated(length(places)) > coords <- tbl_df( bind_rows( lapply(places, function(place){ + res <- geocode( place )[1, ] + Sys.sleep( .2 ) # otherwise geocode wont let us get all the data + p$tick()$print() + res + }) ) ) |==== | 8% ~2 m remaining > glimpse(cities) Observations: 484 Variables: 10 $ city <chr> "Mazari Sharif", "Bagram Air Base", "Kabul", "Kandahar", "Tirana... $ country <chr> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Alb... $ interpretedPlace <chr> "Mazari Sharif, Afghanistan", "Bagram Airfield, Bagram, Afghanis... $ longitude <dbl> 67.117951, 69.273567, 69.207486, 65.737175, 19.818698, -2.214323... $ latitude <dbl> 36.69262, 34.94448, 34.55535, 31.62887, 41.32755, 31.61825, 22.7... $ xmin <dbl> 67.060633, 69.272218, 68.949509, 65.669574, 19.753568, -2.404461... $ xmax <dbl> 67.235212, 69.274916, 69.445953, 65.830765, 19.882078, -1.352005... $ ymin <dbl> 36.65382, 34.94313, 34.34549, 31.59740, 41.29512, 31.18248, 20.8... $ ymax <dbl> 36.75264, 34.94582, 34.76192, 31.67252, 41.36684, 32.04475, 23.6... $ uncertainty <dbl> 6692, 194, 31670, 7302, 6535, 51589, 150488, 239610, 12129, 1738...
  • 17.
  • 18. Who went to Budapest in summer 2011 ?
  • 19. flights %>% filter( direction == "Incoming", month %in% c(7,8), year == 2011 ) %>% group_by( country ) %>% summarise( passengers = sum(passengers), flights = sum(flights) ) %>% arrange( desc(passengers) ) Summer Goes to Budapest
  • 20. flights %>% filter( direction == "Incoming", month %in% c(7,8), year == 2011 ) %>% group_by( country ) %>% summarise( passengers = sum(passengers), flights = sum(flights) ) %>% arrange( desc(passengers) ) Who
  • 21. flights %>% filter( direction == "Incoming", month %in% c(7,8), year == 2011 ) %>% group_by( country ) %>% summarise( passengers = sum(passengers), flights = sum(flights) ) %>% arrange( desc(passengers) ) Summarise
  • 22. flights %>% filter( direction == "Incoming", month %in% c(7,8), year == 2011 ) %>% group_by( country ) %>% summarise( passengers = sum(passengers), flights = sum(flights) ) %>% arrange( desc(passengers) ) Organise
  • 23. flights %>% filter( direction == "Incoming", month %in% c(7,8), year == 2011 ) %>% group_by( country ) %>% summarise( passengers = sum(passengers), flights = sum(flights) ) %>% arrange( desc(passengers) ) Organise
  • 25. data <- flights %>% filter( direction == "Outgoing" ) %>% mutate( year = as.factor(year), month = factor( month.abb[month], levels = month.abb ) ) %>% group_by( year, month ) %>% summarise( passengers = sum( passengers )) ggplot( data ) + aes( x = month, y = passengers, group = year, color = year ) + geom_line() + geom_point() Leaving Budapest
  • 26. data <- flights %>% filter( direction == "Outgoing" ) %>% mutate( year = as.factor(year), month = factor( month.abb[month], levels = month.abb ) ) %>% group_by( year, month ) %>% summarise( passengers = sum( passengers )) ggplot( data ) + aes( x = month, y = passengers, group = year, color = year ) + geom_line() + geom_point() Collapse #passengers by year/month
  • 27. data <- flights %>% filter( direction == "Outgoing" ) %>% mutate( year = as.factor(year), month = factor( month.abb[month], levels = month.abb ) ) %>% group_by( year, month ) %>% summarise( passengers = sum( passengers )) ggplot( data ) + aes( x = month, y = passengers, group = year, color = year ) + geom_line() + geom_point() Make year and month factors
  • 28. data <- flights %>% filter( direction == "Outgoing" ) %>% mutate( year = as.factor(year), month = factor( month.abb[month], levels = month.abb ) ) %>% group_by( year, month ) %>% summarise( passengers = sum( passengers )) ggplot( data ) + aes( x = month, y = passengers, group = year, color = year ) + geom_line() + geom_point() viz with ggplot2
  • 29. When are people leaving Budapest ?
  • 30. satRday flights fever Flights from/to Budapest Budapest #satRday dataviz competition Romain François Cecile Sauder
  • 31.