SlideShare a Scribd company logo
1 of 84
How to use open source tools and data science to get insights on business
and customers
 The goal of this talk is
 Give you a flavour of what you can do with open
source data analysis tools like R or Python
 Give you some useful «code snippets» to
practise with
 Provide a way of reasoning while commenting
code and slides
 The setting
 You are an up-and-coming Data Scientist
 Someone wants to start a new business in NY and
create a taxi company (or the new Uber!) and asks
you for advice
 You want to prepare a beautiful and simple
dashboard with the most relevant insights and KPI
 First things first… Get some Data
 http://www.nyc.gov/html/tlc/html/home/home.shtml
Customer behaviour
Economics
Insights & Graphics
Other Insights
 Sketch an idea of your Dashboard/Report
 Start Exploring Data
 Trip Details Data
▪ medallion, hack_license, vendor_id, rate_code,
store_and_fwd_flag, Pickup_datetime, Drop-
off_datetime, passenger_count, trip_time_in_secs,
trip_distance, Pickup_longitude, Pickup_latitude, Drop-
off_longitude, Drop-off_latitude
 Trip Fare Data:
▪ medallion, hack_license, vendor_id, Pickup_datetime,
payment_type, fare_amount, surcharge, mta_tax,
tip_amount, tolls_amount, total_amount
In the following I’ll make extensive use of R
(https://www.r-project.org), Rstudio
(https://www.rstudio.com) and the following R
libraries
library(psych)
library(dplyr)
library(ggmap)
library(lattice)
 Download data in <your folder> from here:
 Unzip
 Import into an R DataFrame:
# Point R at the folder that holds the unzipped CSV files.
setwd("<your folder>")
Import them in a Dataframe:
# Load the first 500000 rows of each comma-separated file; the first line
# of each file supplies the column names.
# read trip_data.csv
data_trip <- read.csv("trip_data.csv", sep = ",", header = TRUE,
                      nrows = 500000)
# read trip_fare.csv
data_fares <- read.csv("trip_fare.csv", sep = ",", header = TRUE,
                       nrows = 500000)
 Let’s do some Cleansing, for example
# Cleansing: each step keeps only the rows of data_trip that pass a filter.
# exclude trip with time less than 60 seconds
data_trip <- data_trip[data_trip$trip_time_in_secs > 60, ]
# exclude trip with distance less than 0.1 miles
data_trip <- data_trip[data_trip$trip_distance > 0.1, ]
# exclude trips whose pickup latitude or longitude is exactly 0
data_trip <- data_trip[
  !(data_trip$pickup_latitude == 0 | data_trip$pickup_longitude == 0), ]
# work on a selection of the NYC area: pickup and drop-off must both fall
# inside the same box (latitude 40.62..40.9, longitude -74.1..-73.75).
# Longitudes here are negative, so the upper bound is -73.75 on both ends.
data_trip <- data_trip[
  data_trip$pickup_latitude > 40.62 &
    data_trip$pickup_latitude < 40.9 &
    data_trip$pickup_longitude > -74.1 &
    data_trip$pickup_longitude < -73.75 &
    data_trip$dropoff_latitude > 40.62 &
    data_trip$dropoff_latitude < 40.9 &
    data_trip$dropoff_longitude > -74.1 &
    data_trip$dropoff_longitude < -73.75, ]
 Build new variables,
# Derived columns: the hour component of the pickup and drop-off
# timestamps, plus a constant column of ones that later aggregations sum
# up to count rows.
data_trip[["pickup_hour"]] <-
  as.POSIXlt(data_trip$pickup_datetime)$hour
data_trip[["dropoff_hour"]] <-
  as.POSIXlt(data_trip$dropoff_datetime)$hour
data_trip[["ones"]] <- 1
 Remove some variables,
# Remove some variables from both tables by keeping every column whose
# name is not in the drop list.
data_fares <- data_fares[
  setdiff(names(data_fares), c("medallion", "vendor_id"))
]
data_trip <- data_trip[
  setdiff(
    names(data_trip),
    c("dropoff_datetime", "medallion", "vendor_id",
      "store_and_fwd_flag", "rate_code")
  )
]
 Plot some Histograms
# Every chart below follows the same three steps: draw the chart once to
# set up the axes, cover the plot region with a grey rectangle, then draw
# the chart again with add = TRUE so the bars sit on the grey background.

# Fill the current plot region (as reported by par("usr")) with grey.
grey_panel <- function() {
  usr <- par("usr")
  rect(usr[1], usr[3], usr[2], usr[4], col = "grey")
}

# Distribution of number of passengers per trip
hist(data_trip$passenger_count, breaks = 6,
     main = "Distribution of Number of Passengers\nper Trip",
     xlab = "Number of Passengers\np/Trip")
grey_panel()
# the colour name must not carry padding spaces, or R rejects it
hist(data_trip$passenger_count, breaks = 6, add = TRUE,
     col = "lightgoldenrod2")

# Distribution of payment_type
payment_counts <- sort(table(data_fares$payment_type), decreasing = TRUE)
barplot(payment_counts, xaxt = "n")
grey_panel()
barplot(payment_counts, ylab = "Frequency", col = "lightgoldenrod2",
        add = TRUE, main = "Distribution of Payement Type")

# Distribution of trip time length
hist(data_trip$trip_time_in_secs / 60, breaks = 10, xlim = c(0, 100),
     main = "Distribution of\nTrip Time", xlab = "Trip Time in minutes")
grey_panel()
hist(data_trip$trip_time_in_secs / 60, breaks = 10, add = TRUE,
     col = "lightgoldenrod2")

# Distribution of trip distance
hist(data_trip$trip_distance, breaks = 100, xlim = c(0, 40),
     main = "Distribution of Trip Distance", xlab = "Trip Distance")
grey_panel()
hist(data_trip$trip_distance, breaks = 100, add = TRUE,
     col = "lightgoldenrod2")

# Distribution of fare amount (full domain)
hist(data_fares$fare_amount,
     main = "Distribution of Fare Amount", xlab = "Fare Amount")
grey_panel()
hist(data_fares$fare_amount, add = TRUE, col = "lightgoldenrod2")

# Distribution of fare amount (restricted domain)
hist(data_fares$fare_amount, breaks = 200, xlim = c(0, 80),
     main = "Distribution of Fare Amount", xlab = "Fare Amount")
grey_panel()
hist(data_fares$fare_amount, breaks = 200, xlim = c(0, 80), add = TRUE,
     col = "lightgoldenrod2")

# Distribution of tip amount
hist(data_fares$tip_amount, breaks = 500, xlim = c(0, 20),
     main = "Distribution of Tip Amount", xlab = "Tip Amount")
grey_panel()
hist(data_fares$tip_amount, breaks = 500, xlim = c(0, 20), add = TRUE,
     col = "lightgoldenrod2")

# Distribution of Total Amount
hist(data_fares$total_amount, breaks = 1000, xlim = c(0, 100),
     main = "Distribution of Total Amount", xlab = "Total Amount")
grey_panel()
hist(data_fares$total_amount, breaks = 1000, xlim = c(0, 100), add = TRUE,
     col = "lightgoldenrod2")

# Distribution of pickups during the day
pickups_by_hour <- table(data_trip$pickup_hour)
barplot(pickups_by_hour)
grey_panel()
barplot(pickups_by_hour, add = TRUE, col = "lightgoldenrod2",
        main = "Distribution of Pickups in 24H", ylab = "Frequency")

# Distribution of pickups during the day (ordered)
pickups_ranked <- sort(pickups_by_hour, decreasing = TRUE)
barplot(pickups_ranked)
grey_panel()
barplot(pickups_ranked, add = TRUE, col = "lightgoldenrod2",
        main = "Distribution of Pickups in 24H", ylab = "Frequency")
# Top 5 busiest hours of the day
# Sum the per-row counter for every pickup hour.
busy_hours <- aggregate(data_trip$ones ~ data_trip$pickup_hour,
                        data = data_trip, FUN = sum)
# Order the hours by descending count (column 2), then let top_n() keep the
# five largest values of the last column.
busy_hours.top5 <- top_n(arrange(busy_hours, desc(busy_hours[, 2])), 5)
# aggregate() named the columns after the formula terms; swap in readable
# names for both at once.
names(busy_hours.top5)[
  match(c("data_trip$pickup_hour", "data_trip$ones"),
        names(busy_hours.top5))
] <- c("pickup_hour", "nr_runs")
busy_hours.top5
pickup_hour nr_runs
1 23 32829
2 0 31392
3 22 27887
4 1 26800
5 12 25711
# Distribution of pickups during the day in %
# Give the columns produced by aggregate() readable names.
names(busy_hours)[names(busy_hours) == "data_trip$pickup_hour"] <-
  "pickup_hour"
names(busy_hours)[names(busy_hours) == "data_trip$ones"] <- "counter"
# share of all pickups that falls in each hour
hoursum <- sum(busy_hours$counter)
busy_hours$perc <- busy_hours$counter / hoursum
# Filled area from 0 up to the hourly percentage, with points and a line
# on top and one x-axis tick per hour.
ggplot(busy_hours, aes(x = pickup_hour, y = perc * 100)) +
  geom_ribbon(aes(ymin = 0, ymax = perc * 100),
              fill = "lightgoldenrod2", color = "lightgoldenrod2") +
  scale_x_continuous(breaks = seq(from = 0, to = 23, by = 1)) +
  geom_point(size = 3, color = "burlywood3") +
  geom_line(color = "burlywood3", lwd = 0.5) +
  ggtitle("Number of Pickups per Hour every 100\nDaily Pickups") +
  xlab("Hour of the Day") +
  theme(axis.title.y = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        text = element_text(size = 22))
# Top 10 busiest locations of the city
# A «location» is a pickup point rounded to the nearest 0.005 degrees; the
# rounded latitude and longitude are joined into one "lat|lon" string that
# serves as the trip-start identifier. paste() converts the numbers to text
# itself, so no per-row toString() pass or temporary columns are needed.
data_trip$trip_start <- paste(
  round(data_trip$pickup_latitude / 0.005) * 0.005,
  round(data_trip$pickup_longitude / 0.005) * 0.005,
  sep = "|"
)
# group by trip identifier and count
busy_locations <- aggregate(data_trip$ones ~ data_trip$trip_start,
                            data = data_trip, FUN = sum)
names(busy_locations)[names(busy_locations) == "data_trip$trip_start"] <-
  "location"
names(busy_locations)[names(busy_locations) == "data_trip$ones"] <-
  "counter"
# total number of trips
tripsum <- sum(busy_locations$counter)
# share of all trips that start at each location
busy_locations$perc <- busy_locations$counter / tripsum
# Pipe and arithmetic operators must end a line: a line that begins with
# `%>%` or `/` cannot be parsed as a continuation.
top10_loc <- busy_locations %>%
  arrange(desc(counter)) %>%
  top_n(10, counter)
# print top 10 busiest location
top10_loc
location counter perc
1 40.75|-73.99 8937 0.01846335
2 40.74|-74.005 7705 0.01591811
3 40.76|-73.985 7108 0.01468474
4 40.745|-73.98 6990 0.01444096
5 40.735|-73.99 6585 0.01360425
6 40.725|-73.99 6295 0.01300512
7 40.745|-73.985 6289 0.01299273
8 40.75|-73.975 6287 0.01298860
9 40.765|-73.98 6187 0.01278200
10 40.72|-73.99 6183 0.01277374
# get address of busy locations
# Split every "lat|lon" identifier back into its two numbers. The matrix
# has one row per location (latitude in column 1, longitude in column 2);
# its row count follows the data instead of being fixed at 10.
location_parts <- unlist(strsplit(as.character(top10_loc$location), "[|]"))
coordinates <- matrix(as.double(location_parts), ncol = 2, byrow = TRUE)
top10_loc$lat <- coordinates[, 1]
top10_loc$lon <- coordinates[, 2]
# reverse-geocode each point; the coordinates are passed to revgeocode()
# as c(longitude, latitude)
top10_loc$address <- mapply(
  FUN = function(lon, lat) revgeocode(c(lon, lat)),
  top10_loc$lon, top10_loc$lat
)
top10_loc$address
[1] "137 W 33rd St, New York, NY 10120, USA"
[2] "345 W 13th St, New York, NY 10014, USA"
[3] "1585-1589 Broadway, New York, NY 10036, USA"
[4] "145 E 32nd St, New York, NY 10016, USA"
[5] "10 Union Square E, New York, NY 10003, USA"
[6] "42 2nd Ave, New York, NY 10003, USA"
[7] "110-112 Madison Ave, New York, NY 10016, USA"
[8] "633-637 3rd Ave, New York, NY 10017, USA"
[9] "Carnegie Hall, 152 W 57th St, New York, NY 10019,
USA"
[10] "129-131 Allen St, New York, NY 10002, USA"
# represent busiest addresses in a barchart
# Addresses are ordered by pickup count and the bars are flipped to run
# horizontally. Each `+` ends its line: a line that starts with `+` is
# parsed as a new statement, so that layer would never reach the plot.
ggplot(top10_loc, aes(x = reorder(address, counter), y = perc * 1000)) +
  geom_bar(stat = "identity", fill = "lightgoldenrod2") +
  coord_flip() +
  ggtitle("Top 10 Locations with\nHighest Number of Pickups p/1000\nTrips")
# build map for busy locations
# Three base maps are fetched: a satellite view at zoom 11, a satellite
# view at zoom 13 and a road map at zoom 13, each centred on its own
# coordinate pair.
ny_map <- get_map(
  location = c(-73.9308, 40.7336),
  zoom = 11,
  maptype = "satellite"
)
ny_map2 <- get_map(
  location = c(-73.9874, 40.7539),
  zoom = 13,
  maptype = "satellite"
)
ny_map3 <- get_map(
  location = c(-73.99, 40.75),
  zoom = 13,
  maptype = "roadmap"
)
# represent busiest location in a map
# One point per top-10 location, sized by its pickup count. The aesthetics
# name the columns directly because top10_loc is supplied through `data`.
ggmap(ny_map3) +
  geom_point(aes(x = lon, y = lat, size = counter), data = top10_loc)
# build map for a sample of pickups
# sample() without replacement fails when asked for more rows than exist,
# and the table can hold fewer than 400000 rows, so the sample size is
# capped at the number of rows available.
data_sample <- data_trip[
  sample(nrow(data_trip), min(400000, nrow(data_trip))), ]
ggmap(ny_map, extent = "device") +
  geom_point(aes(x = pickup_longitude, y = pickup_latitude),
             colour = "yellow", alpha = 0.1, size = 1,
             data = data_sample)
# the same scatter of sampled pickups, drawn a second time (this is a
# point map, not a heat map)
ggmap(ny_map, extent = "device") +
  geom_point(aes(x = pickup_longitude, y = pickup_latitude),
             colour = "yellow", alpha = 0.1, size = 1,
             data = data_sample)
# build a heat map of pickups
# Density contour lines plus filled density polygons for the sampled
# pickups, with the top-10 locations overlaid as points sized by pickup
# count. Every `+` ends its line so that the final point layer is part of
# the plot rather than a separate statement.
ggmap(ny_map3, extent = "device") +
  geom_density2d(data = data_sample,
                 aes(x = pickup_longitude, y = pickup_latitude),
                 size = 0.3) +
  stat_density2d(data = data_sample,
                 aes(x = pickup_longitude, y = pickup_latitude,
                     fill = ..level.., alpha = ..level..),
                 size = 0.01, geom = "polygon") +
  scale_fill_gradient(low = "yellow", high = "red") +
  scale_alpha(range = c(0.4, 0.9), guide = FALSE) +
  geom_point(aes(x = lon, y = lat, size = counter), data = top10_loc)
# Trip with highest standard deviation of travel time
# I assume "trip" means "a taxi run with a given trip_start and trip_end".
# Drop-off points are rounded to the nearest 0.005 degrees and joined into
# a "lat|lon" string; paste() converts the numbers to text itself, so no
# temporary columns are needed.
data_trip$trip_end <- paste(
  round(data_trip$dropoff_latitude / 0.005) * 0.005,
  round(data_trip$dropoff_longitude / 0.005) * 0.005,
  sep = "|"
)
# trip_id variable: "start|end", i.e. "lat|lon|lat|lon"
data_trip$trip_id <- paste(data_trip$trip_start, data_trip$trip_end,
                           sep = "|")
# compute standard deviation of travel time for every trip
trips <- aggregate(data_trip$trip_time_in_secs ~ data_trip$trip_id,
                   data = data_trip, FUN = sd)
# get the trips with the highest standard deviation; `%>%` and `<-` must
# end a line, not start one, for the statement to continue
trips.topsd <- trips %>%
  arrange(desc(trips[, 2])) %>%
  top_n(10)
names(trips.topsd)[names(trips.topsd) == "data_trip$trip_id"] <- "trip_id"
names(trips.topsd)[names(trips.topsd) == "data_trip$trip_time_in_secs"] <-
  "trip_sd"
# recover from google maps and print top 10 trip by sd
# Never loop past the rows that actually exist in trips.topsd.
n_trips <- min(10, nrow(trips.topsd))
trip_text <- vector("list", n_trips)
for (i in seq_len(n_trips)) {
  # trip_id is "lat|lon|lat|lon": row 1 is the pickup, row 2 the drop-off
  id_parts <- strsplit(as.character(trips.topsd$trip_id[i]), "[|]")
  coords <- matrix(as.double(unlist(id_parts)),
                   nrow = 2, ncol = 2, byrow = TRUE)
  from <- coords[1, ]
  to <- coords[2, ]
  # coordinates are handed to revgeocode() as c(longitude, latitude)
  origin <- revgeocode(c(from[2], from[1]))
  destination <- revgeocode(c(to[2], to[1]))
  trip_text[[i]] <- paste("Trip", i, "from", origin, "to", destination,
                          "has", round(trips.topsd$trip_sd[i], 2), " SD.")
}
print(trip_text)
[[1]] [1] "Trip 1 from JFK Expressway, Jamaica, NY
11430, USA to JFK Expressway, Jamaica, NY 11430, USA
has 3660.94 SD."
[[2]] [1] "Trip 2 from Perimeter Rd, Jamaica, NY 11430,
USA to 826 Greene Ave, Brooklyn, NY 11221, USA has
3436.54 SD."
[[3]] [1] "Trip 3 from 46-36 54th Rd, Flushing, NY
11378, USA to 107-11 Van Wyck Expy, Jamaica, NY 11435,
USA has 3181.98 SD.”
…
…
[[10]] [1] "Trip 10 from Central Terminal Area,
Jamaica, NY 11430, USA to 34-40 E Houston St, New York,
NY 10012, USA has 2206.17 SD."
# Trip with the lowest standard deviation of fares
# Each taxi run is assumed to be uniquely identified by hack licence plus
# pickup time, so the same run_id key is built in both tables and used to
# join them.
data_fares[["run_id"]] <- paste(data_fares$hack_license,
                                data_fares$pickup_datetime, sep = "|")
data_trip[["run_id"]] <- paste(data_trip$hack_license,
                               data_trip$pickup_datetime, sep = "|")
# left join on run_id: every row of data_trip is kept
df_merge <- merge(data_trip, data_fares, by = "run_id", all.x = TRUE)
# standard deviation of the fare amount for every trip
fares <- aggregate(df_merge$fare_amount ~ df_merge$trip_id,
                   data = df_merge, FUN = sd)
# total number of runs for every trip
fares_c <- aggregate(df_merge$ones ~ df_merge$trip_id,
                     data = df_merge, FUN = sum)
fares_merge <- merge(fares, fares_c, by = "df_merge$trip_id", all.x = TRUE)
# the merged table holds the key, the fare SD and the run count, in that
# order; replace the formula-derived names with readable ones
names(fares_merge) <- c("trip_id", "fare_sd", "trip_count")
# keep trips with more than 30 runs, ordered by ascending fare SD
fares_merge <- fares_merge[fares_merge$trip_count > 30, ]
fares_merge <- arrange(fares_merge, fare_sd)
# get some extra information beyond numbers
# Describe the first (up to) 10 rows of fares_merge by street address;
# never loop past the rows that actually exist.
n_trips <- min(10, nrow(fares_merge))
trip_text <- vector("list", n_trips)
for (i in seq_len(n_trips)) {
  # trip_id is "lat|lon|lat|lon": row 1 is the pickup, row 2 the drop-off
  id_parts <- strsplit(as.character(fares_merge$trip_id[i]), "[|]")
  coords <- matrix(as.double(unlist(id_parts)),
                   nrow = 2, ncol = 2, byrow = TRUE)
  from <- coords[1, ]
  to <- coords[2, ]
  # coordinates are handed to revgeocode() as c(longitude, latitude)
  origin <- revgeocode(c(from[2], from[1]))
  destination <- revgeocode(c(to[2], to[1]))
  # wording matches the printed result: "... starts from X and end to Y"
  trip_text[[i]] <- paste("Trip", i, "starts from", origin,
                          "and end to", destination)
}
print(trip_text)
[[1]] [1] "Trip 1 starts from 1585-1589 Broadway, New
York, NY 10036, USA and end to 107-11 Van Wyck Expy,
Jamaica, NY 11435, USA"
[[2]] [1] "Trip 2 starts from 1700 3rd Ave, New York,
NY 10128, USA and end to 53 E 124th St, New York, NY
10035, USA"
[[3]] [1] "Trip 3 starts from 330 W 95th St, New York,
NY 10025, USA and end to 534 W 112th St, New York, NY
10025, USA"
…
…
[[10]][1] "Trip 10 starts from 762 Amsterdam Ave, New
York, NY 10025, USA and end to 192 Claremont Ave, New
York, NY 10027, USA"
# prepare points to visualize
nr_points <- 100
# Take the first nr_points trip ids, or fewer when fares_merge is shorter.
trip_ids <- as.character(head(fares_merge$trip_id, nr_points))
# Every id is "lat|lon|lat|lon", so each trip fills one 4-column row:
# pickup latitude, pickup longitude, drop-off latitude, drop-off longitude.
coords <- matrix(as.double(unlist(strsplit(trip_ids, "[|]"))),
                 ncol = 4, byrow = TRUE)
ffrom <- coords[, 1:2, drop = FALSE]
tto <- coords[, 3:4, drop = FALSE]
# transform points in a matrix to points in a data frame
start_end <- as_tibble(list(from.lat = ffrom[, 1], from.lon = ffrom[, 2],
                            to.lat = tto[, 1], to.lon = tto[, 2]))
# plot the trip with the lowest fare's SD
# Only the first row of start_end is plotted here. Passing just that row
# as `data` draws each marker once; with the whole table as `data` the
# same point would be drawn once per row and the stacked copies would
# hide the transparency.
lowest_sd_trip <- start_end[1, ]
ggmap(ny_map, extent = "device") +
  geom_point(aes(x = to.lon, y = to.lat),
             colour = "red", alpha = 0.6, size = 10,
             data = lowest_sd_trip) +
  geom_point(aes(x = from.lon, y = from.lat),
             colour = "yellow", alpha = 0.6, size = 10,
             data = lowest_sd_trip)
# plot the other trips around Manhattan area
# drop-off points (red) are shifted slightly in longitude relative to the
# pickup points (green)
ggmap(ny_map3, extent = "device") +
  geom_point(aes(x = to.lon + 0.00085, y = to.lat),
             colour = "red", alpha = 0.2, size = 10,
             data = start_end) +
  geom_point(aes(x = from.lon, y = from.lat),
             colour = "green", alpha = 0.2, size = 10,
             data = start_end)
Customer behaviour
Economics
Insights & Graphics
Other Insights
 We can fill our mockup now
Customer behaviour
Economics
Insights & Graphics
Other Insights
 We can fill our mockup
 Let’s use some descriptive stats instead of
graphs in the Customer Behaviour Section
> summary(data_trip$passenger_count)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.00 1.000 1.000 2.182 3.000 6.000
> summary(data_trip$trip_time_in_secs/60)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.083 6.000 10.000 11.97 15.000 128.0
> summary(data_trip$trip_distance)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.110 1.160 1.930 2.943 3.420 45.46
> summary(data_fares$payment_type)
CRD CSH DIS NOC UNK
257247 242503 2 16 232
 Customer Behaviour entries
Average Number of Passengers p/Trip Average Time Spent on Taxi p/Trip
2.18 12'
25th Percentile Median 75th Percentile 25th Percentile Median 75th Percentile
1 1.0 3 6' 10' 15'
Average Number of Miles p/Trip Payment Type
2.94 miles Credit Card (51%)
25th Percentile Median 75th Percentile Cash NOC Other
1.2 1.9 3.4 48% 0.00% 1%
Customer behaviour
Economics
Insights & Graphics
Other Insights
 We can fill our mockup
 Let’s use some descriptive statistics instead of
graphs in the Economics Section
> summary(data_fares$fare_amount)
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.50 6.50 9.50 12.18 14.00 385.00
> summary(data_fares$tip_amount)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 0.00 0.00 1.22 1.90 200.00
> summary(data_fares$total_amount)
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.50 8.00 11.00 14.31 16.10 490.80
> summary(data_fares$total_amount-
data_fares$tip_amount-data_fares$fare_amount)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0000 0.5000 0.5000 0.9158 1.0000 20.0000
Average Tip p/Trip Average Other Earnings p/Trip
1.22 $ 0.92 $
25th Percentile Median 75th Percentile 25th Percentile Median 75th Percentile
0 $ 0 $ 1.9 $ 0.50 $ 0.50 $ 1.00 $
Average Amount Earned p/Trip Average Fare p/Trip
14.31 $ 12.18 $
25th Percentile Median 75th Percentile 25th Percentile Median 75th Percentile
8.00 $ 11.00 $ 16.10 $ 6.5 $ 9.50 $ 14 $
Customer behaviour
Economics
Insights & Graphics
Other Insights
 We can fill our mockup
Customer behaviour
Economics
Insights & Graphics
Other Insights
 We can fill our mockup
 Include some facts from which you can infer something
interesting
Top 5 Busiest Hours
The Busiest Hours are from 22:00 to 02:00
Trip with Most Volatile Travel Time
Trip from JFK Expressway, Jamaica, NY 11430, USA to JFK Expressway, Jamaica, NY 11430, USA
has 3660.94 SD.
Trip With Most Consistent Fares
From 1585-1589 Broadway, NY 10036 to 107-11 Van Wyck Expy, Jamaica, NY 11435
Customer Habits on a Taxi Trip
25th Percentile Median 75th Percentile 25th Percentile Median 75th Percentile 25th Percentile Median 75th Percentile Cash NOC Other
1 1.0 3 6' 10' 15' 1.2 1.9 3.4 48% 0.00% 1%
Economics
25th Percentile Median 75th Percentile 25th Percentile Median 75th Percentile 25th Percentile Median 75th Percentile 25th Percentile Median 75th Percentile
8.00 $ 11.00 $ 16.10 $ 6.5 $ 9.50 $ 14 $ 0 $ 0 $ 1.9 $ 0.50 $ 0.50 $ 1.00 $
Taxi Life Insights
Top 10 Busiest Locations
Trip from JFK Expressway, Jamaica, NY 11430, USA to JFK Expressway,
Jamaica, NY 11430, USA has 3660.94 SD.
Trip With Most Consistent Fares
From 1585-1589 Broadway, NY 10036 to 107-11 Van Wyck Expy, Jamaica,
NY 11435
Pickup Points Busy Areas Top 10 Busiest Locations
Top 5 Busiest Hours
The Busiest Hours are from 22:00 to 02:00
Trip with Most Volatile Travel Time
Average Amount Earned p/Trip Average Fare p/Trip Average Tip p/Trip Average Other Earnings p/Trip
14.31 $ 12.18 $ 1.22 $ 0.92 $
Average Number of Passengers p/Trip Average Time Spent on Taxi p/Trip Average Number of Miles p/Trip Payment Type
2.18 12' 2.94 miles Credit Card (51%)
NYC Taxi Data Insights
Using R for Building a Simple and Effective Dashboard

More Related Content

What's hot

The Aggregation Framework
The Aggregation FrameworkThe Aggregation Framework
The Aggregation FrameworkMongoDB
 
Agg framework selectgroup feb2015 v2
Agg framework selectgroup feb2015 v2Agg framework selectgroup feb2015 v2
Agg framework selectgroup feb2015 v2MongoDB
 
Doing More with MongoDB Aggregation
Doing More with MongoDB AggregationDoing More with MongoDB Aggregation
Doing More with MongoDB AggregationMongoDB
 
Data Governance with JSON Schema
Data Governance with JSON SchemaData Governance with JSON Schema
Data Governance with JSON SchemaMongoDB
 
ETL for Pros: Getting Data Into MongoDB
ETL for Pros: Getting Data Into MongoDBETL for Pros: Getting Data Into MongoDB
ETL for Pros: Getting Data Into MongoDBMongoDB
 
MongoDB Europe 2016 - Advanced MongoDB Aggregation Pipelines
MongoDB Europe 2016 - Advanced MongoDB Aggregation PipelinesMongoDB Europe 2016 - Advanced MongoDB Aggregation Pipelines
MongoDB Europe 2016 - Advanced MongoDB Aggregation PipelinesMongoDB
 
Powerful Analysis with the Aggregation Pipeline
Powerful Analysis with the Aggregation PipelinePowerful Analysis with the Aggregation Pipeline
Powerful Analysis with the Aggregation PipelineMongoDB
 
[MongoDB.local Bengaluru 2018] Tutorial: Pipeline Power - Doing More with Mon...
[MongoDB.local Bengaluru 2018] Tutorial: Pipeline Power - Doing More with Mon...[MongoDB.local Bengaluru 2018] Tutorial: Pipeline Power - Doing More with Mon...
[MongoDB.local Bengaluru 2018] Tutorial: Pipeline Power - Doing More with Mon...MongoDB
 
MongoDB .local Toronto 2019: Aggregation Pipeline Power++: How MongoDB 4.2 Pi...
MongoDB .local Toronto 2019: Aggregation Pipeline Power++: How MongoDB 4.2 Pi...MongoDB .local Toronto 2019: Aggregation Pipeline Power++: How MongoDB 4.2 Pi...
MongoDB .local Toronto 2019: Aggregation Pipeline Power++: How MongoDB 4.2 Pi...MongoDB
 
MongoDB .local Chicago 2019: Aggregation Pipeline Power++: How MongoDB 4.2 Pi...
MongoDB .local Chicago 2019: Aggregation Pipeline Power++: How MongoDB 4.2 Pi...MongoDB .local Chicago 2019: Aggregation Pipeline Power++: How MongoDB 4.2 Pi...
MongoDB .local Chicago 2019: Aggregation Pipeline Power++: How MongoDB 4.2 Pi...MongoDB
 
MongoDB Aggregation Framework
MongoDB Aggregation FrameworkMongoDB Aggregation Framework
MongoDB Aggregation FrameworkCaserta
 
ETL for Pros: Getting Data Into MongoDB
ETL for Pros: Getting Data Into MongoDBETL for Pros: Getting Data Into MongoDB
ETL for Pros: Getting Data Into MongoDBMongoDB
 
Devs for Leokz e 7Masters - WTF Oriented Programming
Devs for Leokz e 7Masters - WTF Oriented ProgrammingDevs for Leokz e 7Masters - WTF Oriented Programming
Devs for Leokz e 7Masters - WTF Oriented ProgrammingFabio Akita
 
Quill + Spark = Better Together
Quill + Spark = Better TogetherQuill + Spark = Better Together
Quill + Spark = Better TogetherAlexander Ioffe
 
"Powerful Analysis with the Aggregation Pipeline (Tutorial)"
"Powerful Analysis with the Aggregation Pipeline (Tutorial)""Powerful Analysis with the Aggregation Pipeline (Tutorial)"
"Powerful Analysis with the Aggregation Pipeline (Tutorial)"MongoDB
 
Data Visualization — Le funzionalità matematiche di Sage per la visualizzazio...
Data Visualization — Le funzionalità matematiche di Sage per la visualizzazio...Data Visualization — Le funzionalità matematiche di Sage per la visualizzazio...
Data Visualization — Le funzionalità matematiche di Sage per la visualizzazio...Andrea Lazzarotto
 
Google Visualization API
Google  Visualization  APIGoogle  Visualization  API
Google Visualization APIJason Young
 
Deep dive to PostgreSQL Indexes
Deep dive to PostgreSQL IndexesDeep dive to PostgreSQL Indexes
Deep dive to PostgreSQL IndexesIbrar Ahmed
 

What's hot (20)

Drupal Mobile
Drupal MobileDrupal Mobile
Drupal Mobile
 
The Aggregation Framework
The Aggregation FrameworkThe Aggregation Framework
The Aggregation Framework
 
Agg framework selectgroup feb2015 v2
Agg framework selectgroup feb2015 v2Agg framework selectgroup feb2015 v2
Agg framework selectgroup feb2015 v2
 
Doing More with MongoDB Aggregation
Doing More with MongoDB AggregationDoing More with MongoDB Aggregation
Doing More with MongoDB Aggregation
 
Data Governance with JSON Schema
Data Governance with JSON SchemaData Governance with JSON Schema
Data Governance with JSON Schema
 
ETL for Pros: Getting Data Into MongoDB
ETL for Pros: Getting Data Into MongoDBETL for Pros: Getting Data Into MongoDB
ETL for Pros: Getting Data Into MongoDB
 
MongoDB Europe 2016 - Advanced MongoDB Aggregation Pipelines
MongoDB Europe 2016 - Advanced MongoDB Aggregation PipelinesMongoDB Europe 2016 - Advanced MongoDB Aggregation Pipelines
MongoDB Europe 2016 - Advanced MongoDB Aggregation Pipelines
 
Powerful Analysis with the Aggregation Pipeline
Powerful Analysis with the Aggregation PipelinePowerful Analysis with the Aggregation Pipeline
Powerful Analysis with the Aggregation Pipeline
 
[MongoDB.local Bengaluru 2018] Tutorial: Pipeline Power - Doing More with Mon...
[MongoDB.local Bengaluru 2018] Tutorial: Pipeline Power - Doing More with Mon...[MongoDB.local Bengaluru 2018] Tutorial: Pipeline Power - Doing More with Mon...
[MongoDB.local Bengaluru 2018] Tutorial: Pipeline Power - Doing More with Mon...
 
MongoDB .local Toronto 2019: Aggregation Pipeline Power++: How MongoDB 4.2 Pi...
MongoDB .local Toronto 2019: Aggregation Pipeline Power++: How MongoDB 4.2 Pi...MongoDB .local Toronto 2019: Aggregation Pipeline Power++: How MongoDB 4.2 Pi...
MongoDB .local Toronto 2019: Aggregation Pipeline Power++: How MongoDB 4.2 Pi...
 
MongoDB .local Chicago 2019: Aggregation Pipeline Power++: How MongoDB 4.2 Pi...
MongoDB .local Chicago 2019: Aggregation Pipeline Power++: How MongoDB 4.2 Pi...MongoDB .local Chicago 2019: Aggregation Pipeline Power++: How MongoDB 4.2 Pi...
MongoDB .local Chicago 2019: Aggregation Pipeline Power++: How MongoDB 4.2 Pi...
 
MongoDB Aggregation Framework
MongoDB Aggregation FrameworkMongoDB Aggregation Framework
MongoDB Aggregation Framework
 
ETL for Pros: Getting Data Into MongoDB
ETL for Pros: Getting Data Into MongoDBETL for Pros: Getting Data Into MongoDB
ETL for Pros: Getting Data Into MongoDB
 
Devs for Leokz e 7Masters - WTF Oriented Programming
Devs for Leokz e 7Masters - WTF Oriented ProgrammingDevs for Leokz e 7Masters - WTF Oriented Programming
Devs for Leokz e 7Masters - WTF Oriented Programming
 
Quill + Spark = Better Together
Quill + Spark = Better TogetherQuill + Spark = Better Together
Quill + Spark = Better Together
 
"Powerful Analysis with the Aggregation Pipeline (Tutorial)"
"Powerful Analysis with the Aggregation Pipeline (Tutorial)""Powerful Analysis with the Aggregation Pipeline (Tutorial)"
"Powerful Analysis with the Aggregation Pipeline (Tutorial)"
 
Dplyr and Plyr
Dplyr and PlyrDplyr and Plyr
Dplyr and Plyr
 
Data Visualization — Le funzionalità matematiche di Sage per la visualizzazio...
Data Visualization — Le funzionalità matematiche di Sage per la visualizzazio...Data Visualization — Le funzionalità matematiche di Sage per la visualizzazio...
Data Visualization — Le funzionalità matematiche di Sage per la visualizzazio...
 
Google Visualization API
Google  Visualization  APIGoogle  Visualization  API
Google Visualization API
 
Deep dive to PostgreSQL Indexes
Deep dive to PostgreSQL IndexesDeep dive to PostgreSQL Indexes
Deep dive to PostgreSQL Indexes
 

Similar to Using R for Building a Simple and Effective Dashboard

All I know about rsc.io/c2go
All I know about rsc.io/c2goAll I know about rsc.io/c2go
All I know about rsc.io/c2goMoriyoshi Koizumi
 
Map reduce hackerdojo
Map reduce hackerdojoMap reduce hackerdojo
Map reduce hackerdojonagwww
 
Spark + Clojure for Topic Discovery - Zalando Tech Clojure/Conj Talk
Spark + Clojure for Topic Discovery - Zalando Tech Clojure/Conj TalkSpark + Clojure for Topic Discovery - Zalando Tech Clojure/Conj Talk
Spark + Clojure for Topic Discovery - Zalando Tech Clojure/Conj TalkZalando Technology
 
Dataiku - Paris JUG 2013 - Hadoop is a batch
Dataiku - Paris JUG 2013 - Hadoop is a batch Dataiku - Paris JUG 2013 - Hadoop is a batch
Dataiku - Paris JUG 2013 - Hadoop is a batch Dataiku
 
Monitoring Your ISP Using InfluxDB Cloud and Raspberry Pi
Monitoring Your ISP Using InfluxDB Cloud and Raspberry PiMonitoring Your ISP Using InfluxDB Cloud and Raspberry Pi
Monitoring Your ISP Using InfluxDB Cloud and Raspberry PiInfluxData
 
Refactoring to Macros with Clojure
Refactoring to Macros with ClojureRefactoring to Macros with Clojure
Refactoring to Macros with ClojureDmitry Buzdin
 
A Map of the PyData Stack
A Map of the PyData StackA Map of the PyData Stack
A Map of the PyData StackPeadar Coyle
 
Advanced Data Visualization in R- Somes Examples.
Advanced Data Visualization in R- Somes Examples.Advanced Data Visualization in R- Somes Examples.
Advanced Data Visualization in R- Somes Examples.Dr. Volkan OBAN
 
Aggregating In Accumulo
Aggregating In AccumuloAggregating In Accumulo
Aggregating In AccumuloBill Slacum
 
Malli: inside data-driven schemas
Malli: inside data-driven schemasMalli: inside data-driven schemas
Malli: inside data-driven schemasMetosin Oy
 
Mapreduce in Search
Mapreduce in SearchMapreduce in Search
Mapreduce in SearchAmund Tveit
 
R getting spatial
R getting spatialR getting spatial
R getting spatialFAO
 
Historical Finance Data
Historical Finance DataHistorical Finance Data
Historical Finance DataJEE HYUN PARK
 
Beyond PHP - It's not (just) about the code
Beyond PHP - It's not (just) about the codeBeyond PHP - It's not (just) about the code
Beyond PHP - It's not (just) about the codeWim Godden
 

Similar to Using R for Building a Simple and Effective Dashboard (20)

A Shiny Example-- R
A Shiny Example-- RA Shiny Example-- R
A Shiny Example-- R
 
All I know about rsc.io/c2go
All I know about rsc.io/c2goAll I know about rsc.io/c2go
All I know about rsc.io/c2go
 
R meets Hadoop
R meets HadoopR meets Hadoop
R meets Hadoop
 
Map reduce hackerdojo
Map reduce hackerdojoMap reduce hackerdojo
Map reduce hackerdojo
 
Php functions
Php functionsPhp functions
Php functions
 
Spark + Clojure for Topic Discovery - Zalando Tech Clojure/Conj Talk
Spark + Clojure for Topic Discovery - Zalando Tech Clojure/Conj TalkSpark + Clojure for Topic Discovery - Zalando Tech Clojure/Conj Talk
Spark + Clojure for Topic Discovery - Zalando Tech Clojure/Conj Talk
 
Dataiku - Paris JUG 2013 - Hadoop is a batch
Dataiku - Paris JUG 2013 - Hadoop is a batch Dataiku - Paris JUG 2013 - Hadoop is a batch
Dataiku - Paris JUG 2013 - Hadoop is a batch
 
Monitoring Your ISP Using InfluxDB Cloud and Raspberry Pi
Monitoring Your ISP Using InfluxDB Cloud and Raspberry PiMonitoring Your ISP Using InfluxDB Cloud and Raspberry Pi
Monitoring Your ISP Using InfluxDB Cloud and Raspberry Pi
 
10. R getting spatial
10.  R getting spatial10.  R getting spatial
10. R getting spatial
 
Refactoring to Macros with Clojure
Refactoring to Macros with ClojureRefactoring to Macros with Clojure
Refactoring to Macros with Clojure
 
A Map of the PyData Stack
A Map of the PyData StackA Map of the PyData Stack
A Map of the PyData Stack
 
Advanced Data Visualization in R- Somes Examples.
Advanced Data Visualization in R- Somes Examples.Advanced Data Visualization in R- Somes Examples.
Advanced Data Visualization in R- Somes Examples.
 
Aggregating In Accumulo
Aggregating In AccumuloAggregating In Accumulo
Aggregating In Accumulo
 
CPP Homework Help
CPP Homework HelpCPP Homework Help
CPP Homework Help
 
Malli: inside data-driven schemas
Malli: inside data-driven schemasMalli: inside data-driven schemas
Malli: inside data-driven schemas
 
Mapreduce in Search
Mapreduce in SearchMapreduce in Search
Mapreduce in Search
 
R getting spatial
R getting spatialR getting spatial
R getting spatial
 
dplyr use case
dplyr use casedplyr use case
dplyr use case
 
Historical Finance Data
Historical Finance DataHistorical Finance Data
Historical Finance Data
 
Beyond PHP - It's not (just) about the code
Beyond PHP - It's not (just) about the codeBeyond PHP - It's not (just) about the code
Beyond PHP - It's not (just) about the code
 

More from Andrea Gigli

How organizations can become data-driven: three main rules
How organizations can become data-driven: three main rulesHow organizations can become data-driven: three main rules
How organizations can become data-driven: three main rulesAndrea Gigli
 
Equity Value for Startups.pdf
Equity Value for Startups.pdfEquity Value for Startups.pdf
Equity Value for Startups.pdfAndrea Gigli
 
Introduction to recommender systems
Introduction to recommender systemsIntroduction to recommender systems
Introduction to recommender systemsAndrea Gigli
 
Data Analytics per Manager
Data Analytics per ManagerData Analytics per Manager
Data Analytics per ManagerAndrea Gigli
 
Balance-sheet dynamics impact on FVA, MVA, KVA
Balance-sheet dynamics impact on FVA, MVA, KVABalance-sheet dynamics impact on FVA, MVA, KVA
Balance-sheet dynamics impact on FVA, MVA, KVAAndrea Gigli
 
Reasons behind XVAs
Reasons behind XVAs Reasons behind XVAs
Reasons behind XVAs Andrea Gigli
 
Recommendation Systems in banking and Financial Services
Recommendation Systems in banking and Financial ServicesRecommendation Systems in banking and Financial Services
Recommendation Systems in banking and Financial ServicesAndrea Gigli
 
Mine the Wine by Andrea Gigli
Mine the Wine by Andrea GigliMine the Wine by Andrea Gigli
Mine the Wine by Andrea GigliAndrea Gigli
 
Fast Feature Selection for Learning to Rank - ACM International Conference on...
Fast Feature Selection for Learning to Rank - ACM International Conference on...Fast Feature Selection for Learning to Rank - ACM International Conference on...
Fast Feature Selection for Learning to Rank - ACM International Conference on...Andrea Gigli
 
Feature Selection for Document Ranking
Feature Selection for Document RankingFeature Selection for Document Ranking
Feature Selection for Document RankingAndrea Gigli
 
Impact of Valuation Adjustments (CVA, DVA, FVA, KVA) on Bank's Processes - An...
Impact of Valuation Adjustments (CVA, DVA, FVA, KVA) on Bank's Processes - An...Impact of Valuation Adjustments (CVA, DVA, FVA, KVA) on Bank's Processes - An...
Impact of Valuation Adjustments (CVA, DVA, FVA, KVA) on Bank's Processes - An...Andrea Gigli
 
Comparing Machine Learning Algorithms in Text Mining
Comparing Machine Learning Algorithms in Text MiningComparing Machine Learning Algorithms in Text Mining
Comparing Machine Learning Algorithms in Text MiningAndrea Gigli
 
Master in Big Data Analytics and Social Mining 20015
Master in Big Data Analytics and Social Mining 20015Master in Big Data Analytics and Social Mining 20015
Master in Big Data Analytics and Social Mining 20015Andrea Gigli
 
Electricity Derivatives
Electricity DerivativesElectricity Derivatives
Electricity DerivativesAndrea Gigli
 
Crawling Tripadvisor Attracion Reviews - Italiano
Crawling Tripadvisor Attracion Reviews - ItalianoCrawling Tripadvisor Attracion Reviews - Italiano
Crawling Tripadvisor Attracion Reviews - ItalianoAndrea Gigli
 
Search Engine for World Recipes Expo 2015
Search Engine for World Recipes Expo 2015Search Engine for World Recipes Expo 2015
Search Engine for World Recipes Expo 2015Andrea Gigli
 
A Data Scientist Job Map Visualization Tool using Python, D3.js and MySQL
A Data Scientist Job Map Visualization Tool using Python, D3.js and MySQLA Data Scientist Job Map Visualization Tool using Python, D3.js and MySQL
A Data Scientist Job Map Visualization Tool using Python, D3.js and MySQLAndrea Gigli
 
Search Engine Query Suggestion Application
Search Engine Query Suggestion ApplicationSearch Engine Query Suggestion Application
Search Engine Query Suggestion ApplicationAndrea Gigli
 
From real to risk neutral probability measure for pricing and managing cva
From real to risk neutral probability measure for pricing and managing cvaFrom real to risk neutral probability measure for pricing and managing cva
From real to risk neutral probability measure for pricing and managing cvaAndrea Gigli
 
Startup Saturday Internet Festival 2014
Startup Saturday Internet Festival 2014Startup Saturday Internet Festival 2014
Startup Saturday Internet Festival 2014Andrea Gigli
 

More from Andrea Gigli (20)

How organizations can become data-driven: three main rules
How organizations can become data-driven: three main rulesHow organizations can become data-driven: three main rules
How organizations can become data-driven: three main rules
 
Equity Value for Startups.pdf
Equity Value for Startups.pdfEquity Value for Startups.pdf
Equity Value for Startups.pdf
 
Introduction to recommender systems
Introduction to recommender systemsIntroduction to recommender systems
Introduction to recommender systems
 
Data Analytics per Manager
Data Analytics per ManagerData Analytics per Manager
Data Analytics per Manager
 
Balance-sheet dynamics impact on FVA, MVA, KVA
Balance-sheet dynamics impact on FVA, MVA, KVABalance-sheet dynamics impact on FVA, MVA, KVA
Balance-sheet dynamics impact on FVA, MVA, KVA
 
Reasons behind XVAs
Reasons behind XVAs Reasons behind XVAs
Reasons behind XVAs
 
Recommendation Systems in banking and Financial Services
Recommendation Systems in banking and Financial ServicesRecommendation Systems in banking and Financial Services
Recommendation Systems in banking and Financial Services
 
Mine the Wine by Andrea Gigli
Mine the Wine by Andrea GigliMine the Wine by Andrea Gigli
Mine the Wine by Andrea Gigli
 
Fast Feature Selection for Learning to Rank - ACM International Conference on...
Fast Feature Selection for Learning to Rank - ACM International Conference on...Fast Feature Selection for Learning to Rank - ACM International Conference on...
Fast Feature Selection for Learning to Rank - ACM International Conference on...
 
Feature Selection for Document Ranking
Feature Selection for Document RankingFeature Selection for Document Ranking
Feature Selection for Document Ranking
 
Impact of Valuation Adjustments (CVA, DVA, FVA, KVA) on Bank's Processes - An...
Impact of Valuation Adjustments (CVA, DVA, FVA, KVA) on Bank's Processes - An...Impact of Valuation Adjustments (CVA, DVA, FVA, KVA) on Bank's Processes - An...
Impact of Valuation Adjustments (CVA, DVA, FVA, KVA) on Bank's Processes - An...
 
Comparing Machine Learning Algorithms in Text Mining
Comparing Machine Learning Algorithms in Text MiningComparing Machine Learning Algorithms in Text Mining
Comparing Machine Learning Algorithms in Text Mining
 
Master in Big Data Analytics and Social Mining 20015
Master in Big Data Analytics and Social Mining 20015Master in Big Data Analytics and Social Mining 20015
Master in Big Data Analytics and Social Mining 20015
 
Electricity Derivatives
Electricity DerivativesElectricity Derivatives
Electricity Derivatives
 
Crawling Tripadvisor Attracion Reviews - Italiano
Crawling Tripadvisor Attracion Reviews - ItalianoCrawling Tripadvisor Attracion Reviews - Italiano
Crawling Tripadvisor Attracion Reviews - Italiano
 
Search Engine for World Recipes Expo 2015
Search Engine for World Recipes Expo 2015Search Engine for World Recipes Expo 2015
Search Engine for World Recipes Expo 2015
 
A Data Scientist Job Map Visualization Tool using Python, D3.js and MySQL
A Data Scientist Job Map Visualization Tool using Python, D3.js and MySQLA Data Scientist Job Map Visualization Tool using Python, D3.js and MySQL
A Data Scientist Job Map Visualization Tool using Python, D3.js and MySQL
 
Search Engine Query Suggestion Application
Search Engine Query Suggestion ApplicationSearch Engine Query Suggestion Application
Search Engine Query Suggestion Application
 
From real to risk neutral probability measure for pricing and managing cva
From real to risk neutral probability measure for pricing and managing cvaFrom real to risk neutral probability measure for pricing and managing cva
From real to risk neutral probability measure for pricing and managing cva
 
Startup Saturday Internet Festival 2014
Startup Saturday Internet Festival 2014Startup Saturday Internet Festival 2014
Startup Saturday Internet Festival 2014
 

Recently uploaded

Predictive Precipitation: Advanced Rain Forecasting Techniques
Predictive Precipitation: Advanced Rain Forecasting TechniquesPredictive Precipitation: Advanced Rain Forecasting Techniques
Predictive Precipitation: Advanced Rain Forecasting TechniquesBoston Institute of Analytics
 
Harnessing the Power of GenAI for BI and Reporting.pptx
Harnessing the Power of GenAI for BI and Reporting.pptxHarnessing the Power of GenAI for BI and Reporting.pptx
Harnessing the Power of GenAI for BI and Reporting.pptxParas Gupta
 
5CL-ADBA,5cladba, Chinese supplier, safety is guaranteed
5CL-ADBA,5cladba, Chinese supplier, safety is guaranteed5CL-ADBA,5cladba, Chinese supplier, safety is guaranteed
5CL-ADBA,5cladba, Chinese supplier, safety is guaranteedamy56318795
 
Credit Card Fraud Detection: Safeguarding Transactions in the Digital Age
Credit Card Fraud Detection: Safeguarding Transactions in the Digital AgeCredit Card Fraud Detection: Safeguarding Transactions in the Digital Age
Credit Card Fraud Detection: Safeguarding Transactions in the Digital AgeBoston Institute of Analytics
 
Jual Obat Aborsi Bandung (Asli No.1) Wa 082134680322 Klinik Obat Penggugur Ka...
Jual Obat Aborsi Bandung (Asli No.1) Wa 082134680322 Klinik Obat Penggugur Ka...Jual Obat Aborsi Bandung (Asli No.1) Wa 082134680322 Klinik Obat Penggugur Ka...
Jual Obat Aborsi Bandung (Asli No.1) Wa 082134680322 Klinik Obat Penggugur Ka...Klinik Aborsi
 
DBMS UNIT 5 46 CONTAINS NOTES FOR THE STUDENTS
DBMS UNIT 5 46 CONTAINS NOTES FOR THE STUDENTSDBMS UNIT 5 46 CONTAINS NOTES FOR THE STUDENTS
DBMS UNIT 5 46 CONTAINS NOTES FOR THE STUDENTSSnehalVinod
 
Las implicancias del memorándum de entendimiento entre Codelco y SQM según la...
Las implicancias del memorándum de entendimiento entre Codelco y SQM según la...Las implicancias del memorándum de entendimiento entre Codelco y SQM según la...
Las implicancias del memorándum de entendimiento entre Codelco y SQM según la...Voces Mineras
 
Bios of leading Astrologers & Researchers
Bios of leading Astrologers & ResearchersBios of leading Astrologers & Researchers
Bios of leading Astrologers & Researchersdarmandersingh4580
 
RESEARCH-FINAL-DEFENSE-PPT-TEMPLATE.pptx
RESEARCH-FINAL-DEFENSE-PPT-TEMPLATE.pptxRESEARCH-FINAL-DEFENSE-PPT-TEMPLATE.pptx
RESEARCH-FINAL-DEFENSE-PPT-TEMPLATE.pptxronsairoathenadugay
 
如何办理(WashU毕业证书)圣路易斯华盛顿大学毕业证成绩单本科硕士学位证留信学历认证
如何办理(WashU毕业证书)圣路易斯华盛顿大学毕业证成绩单本科硕士学位证留信学历认证如何办理(WashU毕业证书)圣路易斯华盛顿大学毕业证成绩单本科硕士学位证留信学历认证
如何办理(WashU毕业证书)圣路易斯华盛顿大学毕业证成绩单本科硕士学位证留信学历认证acoha1
 
sourabh vyas1222222222222222222244444444
sourabh vyas1222222222222222222244444444sourabh vyas1222222222222222222244444444
sourabh vyas1222222222222222222244444444saurabvyas476
 
原件一样(UWO毕业证书)西安大略大学毕业证成绩单留信学历认证
原件一样(UWO毕业证书)西安大略大学毕业证成绩单留信学历认证原件一样(UWO毕业证书)西安大略大学毕业证成绩单留信学历认证
原件一样(UWO毕业证书)西安大略大学毕业证成绩单留信学历认证pwgnohujw
 
DATA SUMMIT 24 Building Real-Time Pipelines With FLaNK
DATA SUMMIT 24  Building Real-Time Pipelines With FLaNKDATA SUMMIT 24  Building Real-Time Pipelines With FLaNK
DATA SUMMIT 24 Building Real-Time Pipelines With FLaNKTimothy Spann
 
jll-asia-pacific-capital-tracker-1q24.pdf
jll-asia-pacific-capital-tracker-1q24.pdfjll-asia-pacific-capital-tracker-1q24.pdf
jll-asia-pacific-capital-tracker-1q24.pdfjaytendertech
 
如何办理澳洲拉筹伯大学毕业证(LaTrobe毕业证书)成绩单原件一模一样
如何办理澳洲拉筹伯大学毕业证(LaTrobe毕业证书)成绩单原件一模一样如何办理澳洲拉筹伯大学毕业证(LaTrobe毕业证书)成绩单原件一模一样
如何办理澳洲拉筹伯大学毕业证(LaTrobe毕业证书)成绩单原件一模一样wsppdmt
 
如何办理(Dalhousie毕业证书)达尔豪斯大学毕业证成绩单留信学历认证
如何办理(Dalhousie毕业证书)达尔豪斯大学毕业证成绩单留信学历认证如何办理(Dalhousie毕业证书)达尔豪斯大学毕业证成绩单留信学历认证
如何办理(Dalhousie毕业证书)达尔豪斯大学毕业证成绩单留信学历认证zifhagzkk
 
Capstone in Interprofessional Informatic // IMPACT OF COVID 19 ON EDUCATION
Capstone in Interprofessional Informatic  // IMPACT OF COVID 19 ON EDUCATIONCapstone in Interprofessional Informatic  // IMPACT OF COVID 19 ON EDUCATION
Capstone in Interprofessional Informatic // IMPACT OF COVID 19 ON EDUCATIONLakpaYanziSherpa
 

Recently uploaded (20)

Predictive Precipitation: Advanced Rain Forecasting Techniques
Predictive Precipitation: Advanced Rain Forecasting TechniquesPredictive Precipitation: Advanced Rain Forecasting Techniques
Predictive Precipitation: Advanced Rain Forecasting Techniques
 
Harnessing the Power of GenAI for BI and Reporting.pptx
Harnessing the Power of GenAI for BI and Reporting.pptxHarnessing the Power of GenAI for BI and Reporting.pptx
Harnessing the Power of GenAI for BI and Reporting.pptx
 
5CL-ADBA,5cladba, Chinese supplier, safety is guaranteed
5CL-ADBA,5cladba, Chinese supplier, safety is guaranteed5CL-ADBA,5cladba, Chinese supplier, safety is guaranteed
5CL-ADBA,5cladba, Chinese supplier, safety is guaranteed
 
Credit Card Fraud Detection: Safeguarding Transactions in the Digital Age
Credit Card Fraud Detection: Safeguarding Transactions in the Digital AgeCredit Card Fraud Detection: Safeguarding Transactions in the Digital Age
Credit Card Fraud Detection: Safeguarding Transactions in the Digital Age
 
Jual Obat Aborsi Bandung (Asli No.1) Wa 082134680322 Klinik Obat Penggugur Ka...
Jual Obat Aborsi Bandung (Asli No.1) Wa 082134680322 Klinik Obat Penggugur Ka...Jual Obat Aborsi Bandung (Asli No.1) Wa 082134680322 Klinik Obat Penggugur Ka...
Jual Obat Aborsi Bandung (Asli No.1) Wa 082134680322 Klinik Obat Penggugur Ka...
 
DBMS UNIT 5 46 CONTAINS NOTES FOR THE STUDENTS
DBMS UNIT 5 46 CONTAINS NOTES FOR THE STUDENTSDBMS UNIT 5 46 CONTAINS NOTES FOR THE STUDENTS
DBMS UNIT 5 46 CONTAINS NOTES FOR THE STUDENTS
 
Las implicancias del memorándum de entendimiento entre Codelco y SQM según la...
Las implicancias del memorándum de entendimiento entre Codelco y SQM según la...Las implicancias del memorándum de entendimiento entre Codelco y SQM según la...
Las implicancias del memorándum de entendimiento entre Codelco y SQM según la...
 
Abortion pills in Jeddah | +966572737505 | Get Cytotec
Abortion pills in Jeddah | +966572737505 | Get CytotecAbortion pills in Jeddah | +966572737505 | Get Cytotec
Abortion pills in Jeddah | +966572737505 | Get Cytotec
 
Bios of leading Astrologers & Researchers
Bios of leading Astrologers & ResearchersBios of leading Astrologers & Researchers
Bios of leading Astrologers & Researchers
 
RESEARCH-FINAL-DEFENSE-PPT-TEMPLATE.pptx
RESEARCH-FINAL-DEFENSE-PPT-TEMPLATE.pptxRESEARCH-FINAL-DEFENSE-PPT-TEMPLATE.pptx
RESEARCH-FINAL-DEFENSE-PPT-TEMPLATE.pptx
 
如何办理(WashU毕业证书)圣路易斯华盛顿大学毕业证成绩单本科硕士学位证留信学历认证
如何办理(WashU毕业证书)圣路易斯华盛顿大学毕业证成绩单本科硕士学位证留信学历认证如何办理(WashU毕业证书)圣路易斯华盛顿大学毕业证成绩单本科硕士学位证留信学历认证
如何办理(WashU毕业证书)圣路易斯华盛顿大学毕业证成绩单本科硕士学位证留信学历认证
 
sourabh vyas1222222222222222222244444444
sourabh vyas1222222222222222222244444444sourabh vyas1222222222222222222244444444
sourabh vyas1222222222222222222244444444
 
Abortion pills in Doha {{ QATAR }} +966572737505) Get Cytotec
Abortion pills in Doha {{ QATAR }} +966572737505) Get CytotecAbortion pills in Doha {{ QATAR }} +966572737505) Get Cytotec
Abortion pills in Doha {{ QATAR }} +966572737505) Get Cytotec
 
Abortion pills in Riyadh Saudi Arabia| +966572737505 | Get Cytotec, Unwanted Kit
Abortion pills in Riyadh Saudi Arabia| +966572737505 | Get Cytotec, Unwanted KitAbortion pills in Riyadh Saudi Arabia| +966572737505 | Get Cytotec, Unwanted Kit
Abortion pills in Riyadh Saudi Arabia| +966572737505 | Get Cytotec, Unwanted Kit
 
原件一样(UWO毕业证书)西安大略大学毕业证成绩单留信学历认证
原件一样(UWO毕业证书)西安大略大学毕业证成绩单留信学历认证原件一样(UWO毕业证书)西安大略大学毕业证成绩单留信学历认证
原件一样(UWO毕业证书)西安大略大学毕业证成绩单留信学历认证
 
DATA SUMMIT 24 Building Real-Time Pipelines With FLaNK
DATA SUMMIT 24  Building Real-Time Pipelines With FLaNKDATA SUMMIT 24  Building Real-Time Pipelines With FLaNK
DATA SUMMIT 24 Building Real-Time Pipelines With FLaNK
 
jll-asia-pacific-capital-tracker-1q24.pdf
jll-asia-pacific-capital-tracker-1q24.pdfjll-asia-pacific-capital-tracker-1q24.pdf
jll-asia-pacific-capital-tracker-1q24.pdf
 
如何办理澳洲拉筹伯大学毕业证(LaTrobe毕业证书)成绩单原件一模一样
如何办理澳洲拉筹伯大学毕业证(LaTrobe毕业证书)成绩单原件一模一样如何办理澳洲拉筹伯大学毕业证(LaTrobe毕业证书)成绩单原件一模一样
如何办理澳洲拉筹伯大学毕业证(LaTrobe毕业证书)成绩单原件一模一样
 
如何办理(Dalhousie毕业证书)达尔豪斯大学毕业证成绩单留信学历认证
如何办理(Dalhousie毕业证书)达尔豪斯大学毕业证成绩单留信学历认证如何办理(Dalhousie毕业证书)达尔豪斯大学毕业证成绩单留信学历认证
如何办理(Dalhousie毕业证书)达尔豪斯大学毕业证成绩单留信学历认证
 
Capstone in Interprofessional Informatic // IMPACT OF COVID 19 ON EDUCATION
Capstone in Interprofessional Informatic  // IMPACT OF COVID 19 ON EDUCATIONCapstone in Interprofessional Informatic  // IMPACT OF COVID 19 ON EDUCATION
Capstone in Interprofessional Informatic // IMPACT OF COVID 19 ON EDUCATION
 

Using R for Building a Simple and Effective Dashboard

  • 1. How to use open source tools and data science to get insights on business and customers
  • 2.  The goal of this talk is  Give you a flavour of what you can do with open source data analysis tools like R or Python  Give you some useful «code snippets» to practise with  Provide a way of reasoning while commenting code and slides
  • 3.
  • 4.  The setting  You are a rampant Data Scientist  Someone wants to start a new business in NY and create a taxi company (or the new Uber!) and asks you for advice  You want to prepare a beautiful and simple dashboard with the most relevant insights and KPIs
  • 5.  First things first… Get some Data  http://www.nyc.gov/html/tlc/html/home/home.shtml
  • 6. Customer behaviour Economics Insights & Graphics Other Insights  Sketch an idea of your Dashboard/Report
  • 7.  Start Exploring Data  Trip Details Data ▪ medallion, hack_license, vendor_id, rate_code, store_and_fwd_flag, Pickup_datetime, Drop- off_datetime, passenger_count, trip_time_in_secs, trip_distance, Pickup_longitude, Pickup_latitude, Drop- off_longitude, Drop-off_latitude  Trip Fare Data: ▪ medallion, hack_license, vendor_id, Pickup_datetime, payment_type, fare_amount, surcharge, mta_tax, tip_amount, tolls_amount, total_amount
  • 8. In the following I’ll make extensive use of R (https://www.r-project.org), Rstudio (https://www.rstudio.com) and the following R libraries library(psych) library(dplyr) library(ggmap) library(lattice)
  • 9.  Download data in <your folder> from here:  Unzip  Import in an R DataFrame: setwd("<your folder>") Import them in a Dataframe: #read trip_data.csv data_trip<-read.csv("trip_data.csv",sep=',', header=1,nrows=500000) #read trip_fare.csv data_fares<-read.csv("trip_fare.csv",sep=',', header=1,nrows=500000)
  • 10.  Let’s do some Cleansing, for example #exclude trip with time less than 60 seconds data_trip<-data_trip[( data_trip$trip_time_in_secs)>60,] #exclude trip with distance less than 0.1 miles data_trip<-data_trip[( data_trip$trip_distance)>0.1,] data_trip<-data_trip [!(data_trip$pickup_latitude==0 | data_trip$pickup_longitude==0),]
  • 11. #work on a selection of the NYC area data_trip<-data_trip[( data_trip$pickup_latitude>(40.62)& data_trip$pickup_latitude<40.9 & data_trip$pickup_longitude>(-74.1)& data_trip$pickup_longitude<(-73.75)& data_trip$dropoff_latitude>(40.62)& data_trip$dropoff_latitude<40.9& data_trip$dropoff_longitude>(-74.1)& data_trip$dropoff_longitude<(-73.75)) ,]
  • 12.  Build new variables, #create a column for pickup_hour data_trip$pickup_hour<-as.POSIXlt( data_trip$pickup_datetime)$hour #create a column for dropoff_hour data_trip$dropoff_hour<-as.POSIXlt( data_trip$dropoff_datetime)$hour #create a column for counting data_trip$ones<-1
  • 13.  Remove some variables, data_fares$medallion<-NULL data_fares$vendor_id<-NULL data_trip$dropoff_datetime<-NULL data_trip$medallion<-NULL data_trip$vendor_id<-NULL data_trip$store_and_fwd_flag<-NULL data_trip$rate_code<-NULL
  • 14.  Plot some Histograms #Distribution of number of passengers per trip hist(data_trip$passenger_count,6, main="Distribution of Number of Passengers per Trip",xlab="Number of Passengers p/Trip") rect(par("usr")[1], par("usr")[3], par("usr")[2], par("usr")[4], col = "grey") hist(data_trip$passenger_count,6, add = TRUE,col="lightgoldenrod2")
  • 15.
  • 16. #Distribution of payment_type barplot(sort(table(data_fares$payment_type), decreasing = TRUE), xaxt = 'n') rect(par("usr")[1], par("usr")[3], par("usr")[2], par("usr")[4], col = "grey") barplot(sort(table(data_fares$payment_type), decreasing = TRUE), ylab="Frequency", col="lightgoldenrod2", add =TRUE, main="Distribution of Payment Type")
  • 17.
  • 18. #Distribution of number of trip time length hist(data_trip$trip_time_in_secs/60,10, xlim=c(0,100),main="Distribution of Trip Time",xlab="Trip Time in minutes") rect(par("usr")[1], par("usr")[3], par("usr")[2], par("usr")[4], col = "grey") hist(data_trip$trip_time_in_secs/60,10, add = TRUE,col="lightgoldenrod2")
  • 19.
  • 20. #Distribution of number of trip distance hist(data_trip$trip_distance,100,xlim=c(0,40), main="Distribution of Trip Distance", xlab="Trip Distance") rect(par("usr")[1], par("usr")[3], par("usr")[2], par("usr")[4], col = "grey") hist(data_trip$trip_distance,100, add =TRUE, col="lightgoldenrod2")
  • 21.
  • 22. #Distribution of fare amount (full domain) hist(data_fares$fare_amount, main="Distribution of Fare Amount", xlab="Fare Amount") rect(par("usr")[1], par("usr")[3], par("usr")[2], par("usr")[4], col = "grey") hist(data_fares$fare_amount,add = TRUE,col="lightgoldenrod2")
  • 23.
  • 24. #Distribution of fare amount (restricted domain) hist(data_fares$fare_amount,xlim=c(0,80),200, main="Distribution of Fare Amount", xlab="Fare Amount") rect(par("usr")[1], par("usr")[3], par("usr")[2], par("usr")[4], col = "grey") hist(data_fares$fare_amount,200, xlim=c(0,80),add = TRUE,col="lightgoldenrod2")
  • 25.
  • 26. #Distribution of tip amount hist(data_fares$tip_amount,500,xlim=c(0,20), main="Distribution of Tip Amount", xlab="Tip Amount") rect(par("usr")[1], par("usr")[3], par("usr")[2], par("usr")[4], col = "grey") hist(data_fares$tip_amount,500,xlim=c(0,20),add = TRUE,col="lightgoldenrod2")
  • 27.
  • 28. #Distribution of Total Amount hist(data_fares$total_amount,1000,xlim=c(0,100), main="Distribution of Total Amount", xlab="Total Amount") rect(par("usr")[1], par("usr")[3], par("usr")[2], par("usr")[4], col = "grey") hist(data_fares$total_amount,add = TRUE, col="lightgoldenrod2",1000,xlim=c(0,100))
  • 29.
  • 30. #Distribution of pickups during the day barplot(table(data_trip$pickup_hour)) rect(par("usr")[1], par("usr")[3], par("usr")[2], par("usr")[4], col = "grey") barplot(table(data_trip$pickup_hour), add = TRUE, col="lightgoldenrod2", main="Distribution of Pickups in 24H", ylab="Frequency")
  • 31.
  • 32. #Distribution of pickups during the day (ordered) barplot(sort(table(data_trip$pickup_hour), decreasing = TRUE)) rect(par("usr")[1], par("usr")[3], par("usr")[2], par("usr")[4], col = "grey") barplot(sort(table(data_trip$pickup_hour), decreasing = TRUE),add = TRUE, col="lightgoldenrod2", main="Distribution of Pickups in 24H", ylab="Frequency")
  • 33.
  • 34. #Top 5 busiest hours of the day busy_hours<-aggregate(data_trip$ones ~ data_trip$pickup_hour, data_trip, sum) #select top 5 pickup_hours busy_hours.top5<- busy_hours %>% arrange(desc(busy_hours[,2])) %>% top_n(5) names(busy_hours.top5)[names(busy_hours.top5)== "data_trip$pickup_hour"]<-"pickup_hour" names(busy_hours.top5)[names(busy_hours.top5)== "data_trip$ones"] <- "nr_runs"
  • 35. busy_hours.top5 pickup_hour nr_runs 1 23 32829 2 0 31392 3 22 27887 4 1 26800 5 12 25711
  • 36. #Distribution of pickups during the day in % names(busy_hours)[names(busy_hours)== "data_trip$pickup_hour"]<-"pickup_hour" names(busy_hours)[names(busy_hours)== "data_trip$ones"] <- "counter" hoursum<-sum(busy_hours$counter) busy_hours$perc<-busy_hours$counter/hoursum
  • 37. ggplot(busy_hours,aes(x = pickup_hour, y = perc*100))+ geom_ribbon(aes(ymin=0, ymax=perc*100), fill="lightgoldenrod2", color="lightgoldenrod2")+ scale_x_continuous(breaks = seq(from = 0, to = 23, by = 1))+ geom_point(size=3, color="burlywood3")+ geom_line(color="burlywood3", lwd=0.5)+ ggtitle("Number of Pickups per Hour every 100 Daily Pickups")+ xlab("Hour of the Day")+ theme(axis.title.y=element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), text=element_text(size=22))
  • 38.
  • 39. #Top 10 busiest locations of the city #Build variables to define «locations» data_trip$latpickup<- round(data_trip$pickup_latitude/0.005)*0.005 data_trip$slatpickup<- lapply(data_trip$latpickup,toString) data_trip$lonpickup<- round(data_trip$pickup_longitude/0.005)*0.005 data_trip$slonpickup<- lapply(data_trip$lonpickup,toString) data_trip$trip_start<- paste(data_trip$slatpickup, data_trip$slonpickup,sep="|")
  • 40. #build a trip identifier concatenating rounded #latitude and longitude in string format data_trip$trip_start<-paste(data_trip$slatpickup, data_trip$slonpickup,sep="|") #get rid of unuseful variables data_trip$latpickup<-NULL data_trip$lonpickup<-NULL data_trip$slatpickup<-NULL data_trip$slonpickup<-NULL
  • 41. #groupby trip identifier and count busy_locations <- aggregate(data_trip$ones ~ data_trip$trip_start, data_trip, sum) names(busy_locations)[names(busy_locations)== "data_trip$trip_start"] <- "location" names(busy_locations)[names(busy_locations)== "data_trip$ones"] <- "counter"
  • 42. #total number of trips tripsum <- sum(busy_locations$counter) #share of total trips for each location busy_locations$perc <- busy_locations$counter /tripsum top10_loc <- busy_locations %>% arrange( desc(busy_locations[,2])) %>% top_n(10)
  • 43. #print top 10 busiest location top10_loc location counter perc 1 40.75|-73.99 8937 0.01846335 2 40.74|-74.005 7705 0.01591811 3 40.76|-73.985 7108 0.01468474 4 40.745|-73.98 6990 0.01444096 5 40.735|-73.99 6585 0.01360425 6 40.725|-73.99 6295 0.01300512 7 40.745|-73.985 6289 0.01299273 8 40.75|-73.975 6287 0.01298860 9 40.765|-73.98 6187 0.01278200 10 40.72|-73.99 6183 0.01277374
  • 44. #get address of busy locations C <- unlist(strsplit(top10_loc$location, "[|]")) coordinates = matrix(as.double(C), nrow=10, ncol=2,byrow=TRUE) top10_loc$lat<-coordinates[,1] top10_loc$lon<-coordinates[,2] top10_loc$address<-mapply(FUN = function(lon, lat) revgeocode(c(lon, lat)), top10_loc$lon, top10_loc$lat)
  • 45. top10_loc$address [1] "137 W 33rd St, New York, NY 10120, USA" [2] "345 W 13th St, New York, NY 10014, USA" [3] "1585-1589 Broadway, New York, NY 10036, USA" [4] "145 E 32nd St, New York, NY 10016, USA" [5] "10 Union Square E, New York, NY 10003, USA" [6] "42 2nd Ave, New York, NY 10003, USA" [7] "110-112 Madison Ave, New York, NY 10016, USA" [8] "633-637 3rd Ave, New York, NY 10017, USA" [9] "Carnegie Hall, 152 W 57th St, New York, NY 10019, USA" [10] "129-131 Allen St, New York, NY 10002, USA"
  • 46. #represent busiest addresses in a barchart ggplot(top10_loc, aes(x=reorder(address, counter), y=perc*1000)) + geom_bar(stat='identity',fill="lightgoldenrod2") + coord_flip() + ggtitle("Top 10 Locations with Highest Number\nof Pickups p/1000 Trips")
  • 47.
  • 48. #build map for busy locations ny_map<-get_map(location = c(-73.9308, 40.7336),maptype = "satellite", zoom=11) ny_map2<-get_map(location=c(-73.9874, 40.7539),maptype = "satellite", zoom=13) ny_map3<-get_map(location=c(-73.99,40.75), maptype = "roadmap", zoom=13) #represent busiest location in a map ggmap(ny_map3)+geom_point(aes(x=top10_loc$lon,y=top10_loc$lat,size=top10_loc$counter),data=top10_loc)
  • 49.
  • 50. #build map for a sample of pickups data_sample<-data_trip[sample(nrow(data_trip), 400000), ] ggmap(ny_map, extent = "device") + geom_point(aes(x = data_sample$pickup_longitude, y = data_sample$pickup_latitude), colour = "yellow", alpha = 0.1, size = 1, data = data_sample)
  • 51.
  • 52. #build a heat map of pickups ggmap(ny_map, extent = "device") + geom_point( aes(x = data_sample$pickup_longitude, y = data_sample$pickup_latitude), colour = "yellow", alpha = 0.1, size = 1, data = data_sample)
  • 53.
  • 54. #build a heat map of pickups ggmap(ny_map3, extent = "device") + geom_density2d(data = data_sample, aes(x = data_sample$pickup_longitude, y = data_sample$pickup_latitude), size = 0.3) + stat_density2d(data = data_sample, aes(x = data_sample$pickup_longitude, y = data_sample$pickup_latitude, fill = ..level.., alpha = ..level..), size = 0.01, geom = "polygon") + scale_fill_gradient(low = "yellow", high = "red") + scale_alpha(range = c(0.4, 0.9), guide = FALSE)
  • 55.
  • 57. #Trip with highest standard deviation of travel #time #I assume "trip" means "a taxi run with a given #trip_start and trip_end". data_trip$latdropoff<- round(data_trip$dropoff_latitude/0.005)*0.005 data_trip$slatdropoff<- lapply(data_trip$latdropoff,toString) data_trip$londropoff<- round(data_trip$dropoff_longitude/0.005)*0.005 data_trip$slondropoff<- lapply(data_trip$londropoff,toString) data_trip$trip_end<- paste(data_trip$slatdropoff,data_trip$slondropoff,sep="|")
  • 58. #get rid of not useful variables data_trip$latdropoff<-NULL data_trip$londropoff<-NULL data_trip$slatdropoff<-NULL data_trip$slondropoff<-NULL #trip_id variable data_trip$trip_id<-paste(data_trip$trip_start, data_trip$trip_end,sep="|")
  • 59. #compute standard deviation for every trip trips<-aggregate(data_trip$trip_time_in_secs ~ data_trip$trip_id, data_trip, sd) #get the trip with highest standard deviation #and find pickup and dropoff locations trips.topsd<-trips %>% arrange(desc(trips[,2])) %>% top_n(10) names(trips.topsd)[names(trips.topsd)== "data_trip$trip_id"] <- "trip_id" names(trips.topsd)[names(trips.topsd)== "data_trip$trip_time_in_secs"] <- "trip_sd"
  • 60. #recover from google maps and print top 10 trip by sd trip_text=list() for(i in 1:10) { coords=matrix(as.double(unlist(strsplit( trips.topsd$trip_id[i], "[|]"))), nrow=2,ncol=2,byrow=TRUE) from=coords[1,] to=coords[2,] origin<-mapply(FUN = function(lon, lat) revgeocode(c(lon, lat)), from[2], from[1]) destination<-mapply(FUN = function(lon, lat) revgeocode(c(lon, lat)), to[2], to[1]) trip_text[i]=paste("Trip",i,"from",origin,"to", destination,"has",round(trips.topsd$trip_sd[i],2), " SD.")}
  • 61. print(trip_text) [[1]] [1] "Trip 1 from JFK Expressway, Jamaica, NY 11430, USA to JFK Expressway, Jamaica, NY 11430, USA has 3660.94 SD." [[2]] [1] "Trip 2 from Perimeter Rd, Jamaica, NY 11430, USA to 826 Greene Ave, Brooklyn, NY 11221, USA has 3436.54 SD." [[3]] [1] "Trip 3 from 46-36 54th Rd, Flushing, NY 11378, USA to 107-11 Van Wyck Expy, Jamaica, NY 11435, USA has 3181.98 SD." … … [[10]] [1] "Trip 10 from Central Terminal Area, Jamaica, NY 11430, USA to 34-40 E Houston St, New York, NY 10012, USA has 2206.17 SD."
  • 62. #Trip with the lowest fare’s Standard Deviation #I assume each taxi run is uniquely identified #by "hack license" and "pickup time". #I can build unique run_id's for data_fares and #data_trip tables and join them data_fares$run_id<-paste(data_fares$hack_license, data_fares$pickup_datetime,sep="|") data_trip$run_id<-paste(data_trip$hack_license, data_trip$pickup_datetime,sep="|")
  • 63. #I create a new dataframe merging data_fares and #data_trip on run_id df_merge=merge(x=data_trip,y=data_fares, by.x="run_id", by.y="run_id", all.x=TRUE) #groupby and standard deviation computation for #fare amount fares<-aggregate(df_merge$fare_amount ~ df_merge$trip_id, df_merge, sd)
  • 64. #Keep track of tot number of runs for each trip fares_c<-aggregate(df_merge$ones ~ df_merge$trip_id, df_merge, sum) fares_merge=merge(x=fares,y=fares_c, by.x="df_merge$trip_id", by.y="df_merge$trip_id", all.x=TRUE) names(fares_merge)[names(fares_merge)== "df_merge$trip_id"] <- "trip_id" names(fares_merge)[names(fares_merge)== "df_merge$fare_amount"] <- "fare_sd" names(fares_merge)[names(fares_merge)== "df_merge$ones"] <- "trip_count" #exclude trips with 30 runs or fewer and order fares_merge<-fares_merge[(fares_merge$trip_count>30),] fares_merge<- fares_merge %>% arrange((fares_merge$fare_sd))
  • 65. #get some extra information beyond numbers trip_text=list() for(i in 1:10) { coords=matrix(as.double(unlist(strsplit( fares_merge$trip_id[i], "[|]"))), nrow=2, ncol=2,byrow=TRUE) from=coords[1,] to=coords[2,] origin<-mapply(FUN = function(lon, lat) revgeocode(c(lon, lat)), from[2], from[1]) destination<-mapply(FUN = function(lon, lat) revgeocode(c(lon, lat)), to[2], to[1]) trip_text[i]=paste("Trip",i,"starts from",origin,"and end to",destination) }
  • 66. print(trip_text) [[1]] [1] "Trip 1 starts from 1585-1589 Broadway, New York, NY 10036, USA and end to 107-11 Van Wyck Expy, Jamaica, NY 11435, USA" [[2]] [1] "Trip 2 starts from 1700 3rd Ave, New York, NY 10128, USA and end to 53 E 124th St, New York, NY 10035, USA" [[3]] [1] "Trip 3 starts from 330 W 95th St, New York, NY 10025, USA and end to 534 W 112th St, New York, NY 10025, USA" … … [[10]][1] "Trip 10 starts from 762 Amsterdam Ave, New York, NY 10025, USA and end to 192 Claremont Ave, New York, NY 10027, USA"
  • 67. #prepare points to visualize nr_points=100 ffrom=matrix(nr_points*2,nrow=nr_points,ncol=2) tto=matrix(nr_points*2,nrow=nr_points,ncol=2) for(i in 1:nr_points) {coords= matrix(as.double(unlist(strsplit( fares_merge$trip_id[i], "[|]"))), nrow=2, ncol=2,byrow=TRUE) from=coords[1,] to=coords[2,] ffrom[i,1]=coords[1,1] ffrom[i,2]=coords[1,2] tto[i,1]=coords[2,1] tto[i,2]=coords[2,2] }
  • 68. #transform points in a matrix to points in a dataframe start_end<-as_data_frame(list(from.lat= ffrom[,1],from.lon=ffrom[,2],to.lat=tto[,1], to.lon=tto[,2])) #plot the trip with the lowest fare’s SD ggmap(ny_map, extent = "device") + geom_point(aes(x = start_end$to.lon[1], y = start_end$to.lat[1]), colour = "red", alpha = 0.6, size = 10, data=start_end) + geom_point(aes(x = start_end$from.lon[1], y = start_end$from.lat[1]), colour = "yellow", alpha = 0.6, size = 10, data=start_end)
  • 69.
  • 70. #plot the other trips around Manhattan area ggmap(ny_map3, extent = "device") + geom_point(aes(x = start_end$to.lon+0.00085, y = start_end$to.lat), colour = "red", alpha = 0.2, size = 10, data=start_end) + geom_point(aes(x = start_end$from.lon, y = start_end$from.lat),colour = "green", alpha = 0.2, size = 10, data=start_end)
  • 71.
  • 72. Customer behaviour Economics Insights & Graphics Other Insights  We can fill our mockup now
  • 73. Customer behaviour Economics Insights & Graphics Other Insights  We can fill our mockup
  • 74.  Let’s use some descriptive stats instead of graph in the Customer’s Behavior Section > summary(data_trip$passenger_count) Min. 1st Qu. Median Mean 3rd Qu. Max. 1.00 1.000 1.000 2.182 3.000 6.000 > summary(data_trip$trip_time_in_secs/60) Min. 1st Qu. Median Mean 3rd Qu. Max. 1.083 6.000 10.000 11.97 15.000 128.0 > summary(data_trip$trip_distance) Min. 1st Qu. Median Mean 3rd Qu. Max. 0.110 1.160 1.930 2.943 3.420 45.46 > summary(data_fares$payment_type) CRD CSH DIS NOC UNK 257247 242503 2 16 232
  • 75.  Customer Behaviour entries Average Number of Passengers p/Trip Average Time Spent on Taxi p/Trip 2.18 12' 25th Percentile Median 75th Percentile 25th Percentile Median 75th Percentile 1 1.0 3 6' 10' 15' Average Number of Miles p/Trip Payments Type 2.94 miles Credit Card (51%) 25th Percentile Median 75th Percentile Cash NOC Other 1.2 1.9 3.4 48% 0.00% 1%
  • 76. Customer behaviour Economics Insights & Graphics Other Insights  We can fill our mockup
  • 77.  Let’s use some descriptive statistics instead of graph in the Economics Section > summary(data_fares$fare_amount) Min. 1st Qu. Median Mean 3rd Qu. Max. 2.50 6.50 9.50 12.18 14.00 385.00 > summary(data_fares$tip_amount) Min. 1st Qu. Median Mean 3rd Qu. Max. 0.00 0.00 0.00 1.22 1.90 200.00 > summary(data_fares$total_amount) Min. 1st Qu. Median Mean 3rd Qu. Max. 2.50 8.00 11.00 14.31 16.10 490.80 > summary(data_fares$total_amount- data_fares$tip_amount-data_fares$fare_amount) Min. 1st Qu. Median Mean 3rd Qu. Max. 0.0000 0.5000 0.5000 0.9158 1.0000 20.0000
  • 78. Average Tip p/Trip Average Other Earnings p/Trip 1.22 $ 0.92 $ 25th Percentile Median 75th Percentile 25th Percentile Median 75th Percentile 0 $ 0 $ 1.9 $ 0.50 $ 0.50 $ 1.00 $ Average Amount Earned p/Trip Average Fare p/Trip 14.31 $ 12.18 $ 25th Percentile Median 75th Percentile 25th Percentile Median 75th Percentile 8.00 $ 11.00 $ 16.10 $ 6.5 $ 9.50 $ 14 $
  • 79. Customer behaviour Economics Insights & Graphics Other Insights  We can fill our mockup
  • 80.
  • 81. Customer behaviour Economics Insights & Graphics Other Insights  We can fill our mockup
  • 82.  Include some facts from which you can infer something interesting Top 5 Busiest Hours The Busiest Hours are from 22:00 to 02:00 Trip with Most Volatile Travel Time Trip from JFK Expressway, Jamaica, NY 11430, USA to JFK Expressway, Jamaica, NY 11430, USA has 3660.94 SD. Trip With Most Consistent Fares From 1585-1589 Broadway, NY 10036 to 107-11 Van Wyck Expy, Jamaica, NY 11435
  • 83. Customer Habits on a Taxi Trip 25th Percentile Median 75th Percentile 25th Percentile Median 75th Percentile 25th Percentile Median 75th Percentile Cash NOC Other 1 1.0 3 6' 10' 15' 1.2 1.9 3.4 48% 0.00% 1% Economics 25th Percentile Median 75th Percentile 25th Percentile Median 75th Percentile 25th Percentile Median 75th Percentile 25th Percentile Median 75th Percentile 8.00 $ 11.00 $ 16.10 $ 6.5 $ 9.50 $ 14 $ 0 $ 0 $ 1.9 $ 0.50 $ 0.50 $ 1.00 $ Taxi Life Insights Top 10 Busiest Locations Trip from JFK Expressway, Jamaica, NY 11430, USA to JFK Expressway, Jamaica, NY 11430, USA has 3660.94 SD. Trip With Most Consistent Fares From 1585-1589 Broadway, NY 10036 to 107-11 Van Wyck Expy, Jamaica, NY 11435 Pickup Points Busy Areas Top 10 Busiest Locations Top 5 Busiest Hours The Busiest Hours are from 22:00 to 02:00 Trip with Most Volatile Travel Time Average Amount Earned p/Trip Average Fare p/Trip Average Tip p/Trip Average Other Earnings p/Trip 14.31 $ 12.18 $ 1.22 $ 0.92 $ Average Number of Passengers p/Trip Average Time Spent on Taxi p/Trip Average Number of Miles p/Trip Payments Type 2.18 12' 2.94 miles Credit Card (51%) NYC Taxi Data Insights