SlideShare a Scribd company logo
Better	Visualization	of		Trips	Through					
Agglomerative	Clustering	
Anbarasan S
February 2, 2016
1.	Problem	Statement	:
Visualization of flow/mobility data on a map always gives a cluttered view, even for small
dataset .Hence it is difficult , to derive any insights or make decisions out of it.
2.	Solution:
Devise a methodology to group or aggregate similar flows
3.	Methodology	:
Step 1: K-Nearest Neighbours
1.a. Find the k-nearest neighbors for the Origin location of particular Trip/Flow.
1.b.Find The k-nearest Neighbour For the destination location of particular
Trip/Flow.
Step 2: Contiguous Flows
Two flows/Trips ,are said to be contiguous if and only if, it satisfies both the
following conditions
1. K-NN of Origin1(Trip1) overlaps with k-NN of Origin2(Trip2).
2. K-NN of Destination1(Trip1) overlaps with k-NN of Destination2(Trip2).
Step 3: Agglomerative Clustering
Two flows are clustered in a agglomerative fashion, based on a distance measure
defined by the number of nearest neighbours shared
Dist(Trip1,Trip2) = 1- [KNN(O1) η KNN(O2)/k * KNN(D1) η KNN(D2)/k]
O1,O2- Origins Of Trip1 and Trip2 respectively
D1,D2 destination of Trip1 and Trip2 respectively
Very low value of dist , suggests that the flows are very nearer, and larger value >=1
suggests that flows cannot be clustered together.
Step 4: Visualization
Agglomerative Clusters , when projected on to map, gives meaningful insights
4. Implementation:
Dataset: Taxi Service Trajectory - Prediction Challenge, ECML PKDD 2015 Data Set
https://archive.ics.uci.edu/ml/datasets/Taxi+Service+Trajectory+-
+Prediction+Challenge,+ECML+PKDD+2015
tt_trajectory <- read.csv("G:/TaxiTrajectory/train.csv",
nrows = 500, stringsAsFactors = FALSE)
head(tt_trajectory,1)
## TRIP_ID CALL_TYPE ORIGIN_CALL ORIGIN_STAND TAXI_ID TIMESTAMP
## 1 1.372637e+18 C NA NA 20000589 137263 6858
## DAY_TYPE MISSING_DATA
## 1 A False
##
POLYLINE
## 1 [[-8.618643,41.141412],[-8.618499,41.141376],[-8.620326,41.14251],[-
8.622153,41.143815],[-8.623953,41.144373],[-8.62668,41.144778],[-
8.627373,41.144697],[-8.630226,41.14521],[-8.632746,41.14692],[-
8.631738,41.148225],[-8.629938,41.150385],[-8.62911,41.151213],[-
8.629128,41.15124],[-8.628786,41.152203],[-8.628687,41.152374],[-
8.628759,41.152518],[-8.630838,41.15268],[-8.632323,41.153022],[-
8.631144,41.154489],[-8.630829,41.154507],[-8.630829,41.154516],[-
8.630829,41.154498],[-8.630838,41.154489]]
4.1.Get	the	Origin	And	Destination	location	for	each	trip
∑ The Polyline field contains the Longitude Latitude location details of a trip taken every
15 secs.
∑ The polyline field is parsed using JSON library , to get the Origin and destination
locations of each trip
library(RJSONIO)
## Warning: package 'RJSONIO' was built under R version 3.2.3
# Function to get Origin and Destination Location of each trip
# using json library
positions_1 <- function(row){
trajectory_list <- fromJSON(row$POLYLINE)
no_of_points <- length(trajectory_list)
if(no_of_points>0){
lan_lat_pairs<-data.frame(rbind(trajectory_list[[1]],
trajectory_list[[no_of_points]]))
return(lan_lat_pairs)
}}
coordinates <- data.frame(TripId=c(), Ordinal=c(),Booking_time= c(), Lat=c(),
Lon=c(), Status=c())
for (i in 1:nrow(tt_trajectory)) {
lat_lon_df <- positions_1(tt_trajectory[i,])
status <- c("Origin","Destination")
if(!is.null(lat_lon_df)){
coordinates <- rbind(coordinates,
data.frame(TripId = tt_trajectory$TRIP_ID[i],
Booking_time = tt_trajectory$TIMESTAMP[i],
Lat = lat_lon_df$X2,
Lon = lat_lon_df$X1,
Status = status ))
}}
coordinates$Status <- factor(coordinates$Status, levels <-
c("Origin","Destination"))
head(coordinates)
## TripId Booking_time Lat Lon Status
## 1 1.372637e+18 1372636858 41.14141 -8.618643 Origin
## 2 1.372637e+18 1372636858 41.15449 -8.630838 Destination
## 3 1.372637e+18 1372637303 41.15983 -8.639847 Origin
## 4 1.372637e+18 1372637303 41.17067 -8.665740 Destination
## 5 1.372637e+18 1372636951 41.14036 -8.612964 Origin
## 6 1.372637e+18 1372636951 41.14053 -8.615970 Destination
4.2	Visualize	the	Trips
Visualize	the	trips	on	to	a	map
library(ggmap)
## Warning: package 'ggmap' was built under R version 3.2.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.2.3
lat.centre = median(coordinates$Lat)
lon.centre = median(coordinates$Lon)
map.portugal <-ggmap(get_map(location = c(lon.centre,lat.centre),
maptype = "roadmap",
source="google",
zoom = 12))
## Map from URL :
http://maps.googleapis.com/maps/api/staticmap?center=41.157607, -
8.615227&zoom=12&size=640x640&scale=2&maptype=roadmap&language=en -
EN&sensor=false
map.portugal+
geom_point(data = coordinates ,aes(x=Lon,y=Lat,color= Status), size= 1)
## Warning: Removed 5 rows containing missing values (geom_point).
# Connect SOurce and Destination for Each trip
map.portugal+
geom_point(data = coordinates ,aes(x=Lon,y=Lat,color= Status), size= 1)+
geom_line(data = coordinates,aes(x=Lon,y=Lat, group = TripId ), color =
"blue")
## Warning: Removed 5 rows containing missing values (geom_point).
## Warning: Removed 5 rows containing missin g values (geom_path).
map.portugal+
geom_line(data = coordinates,aes(x=Lon,y=Lat, group = TripId ),
color = "blue")
## Warning: Removed 5 rows containing missing values (geom_path).
Projection of 1/100 of the original dataset, gives a cluttered view on the map.
4.3.Find	K-NN	for	origin	points	and	destination	points	
separately
4.3.1. Knn for Origin
# Find distance using Haversiandistance
library(dplyr)
library(geosphere)
library(dbscan)
# Function to Convert Ids to TripIds
Ids_to_tripIds<- function(row) lat_lon_origin[row,"TripId"]
# Origin of all the trips
lat_lon_origin <- filter(coordinates,Status == "Origin") %>%
select(TripId,Lat,Lon) %>%
as.matrix()
# Find the actual distance in meters between the origin points
distance_matrix_origin <- distm(lat_lon_origin[,c(3,2)],
fun=distHaversine)
# Find kNN For origin
# No. of. nearest neighbours = 50
knn_origin <- kNN(distance_matrix_origin,k=50)
Origin_NN_ids <- apply(knn_origin$id,1,Ids_to_tripIds)
# Construct nearest neighbour for every origin point
origin_NN <- data.frame(FlowId = c(),Neighbour_Flows = c())
for (i in 1:nrow(lat_lon_origin)) {
origin_NN <- rbind(origin_NN,
data.frame(FlowId = lat_lon_origin[i,"TripId"],
Neighbour_Flows =
Origin_NN_ids[,i]))
}
head(origin_NN)
## FlowId Neighbour_Flows
## 1 1.372637e+18 1.372657e+18
## 2 1.372637e+18 1.372662e+18
## 3 1.372637e+18 1.372638e+18
## 4 1.372637e+18 1.372663e+18
## 5 1.372637e+18 1.372656e+18
## 6 1.372637e+18 1.372642e+18
4.3.2.Knn for destination
# Destination
lat_lon_dest <- filter(coordinates,Status == "Destination")%>%
select(TripId,Lat,Lon) %>% as.matrix()
distance_matrix_dest <- distm(lat_lon_dest[,c(3,2)], fun=distHaversine)
# Find kNN For destination
knn_destination <- kNN(distance_matrix_dest,k=50)
dest_NN_ids <- apply(knn_destination$id,1,Ids_to_tripIds)
dest_NN <- data.frame(FlowId = c(),Neighbour_Flows = c())
for (i in 1:nrow(lat_lon_dest)) {
dest_NN <- rbind(dest_NN,
data.frame(FlowId = lat_lon_dest[i,"TripId"],
Neighbour_Flows = dest_NN_ids[,i]))
}
4.4	Find	Contiguous	Flow	Pair
flow_pair <- data.frame(flow = c(), Neighbour_flow = c(),
snn_distance = c() )
flow_distance <- function(row,o_NN,d_NN){
onn <-
origin_NN[origin_NN$FlowId==row,]$Neighbour_Flows
dnn <-
dest_NN[dest_NN$FlowId==row,]$Neighbour_Flows
common_o_points <- sum(o_NN %in% onn)/50
common_d_points <- sum(d_NN %in% dnn)/50
snn <- 1-(common_o_points*common_d_points)
return(snn)
}
for(i in tt_trajectory$TRIP_ID){
o_NN <- filter(origin_NN,FlowId==i)%>% select(Neighbour_Flows) #nearest
neighbours of given origin
d_NN <- filter(dest_NN,FlowId == i )%>%select(Neighbour_Flows) #NNs of
given destination
NN_matches <- o_NN[,1] %in% d_NN[,1] #Flows having Common Origin and
Common destination
# Contiguous/Nearest Flows for a given flow
# Two Flows are said to be Contiguous if they are in
# Nearest neighbour of both origin and destination of a given flow
contiguous_flows <-o_NN$Neighbour_Flows[NN_matches]
#dist bw Flows
#Arguments Passed:
##1.List of Flows Found to be Contiguous to a given flow
##2.flow_distance -Function to calculate distance between 2 flows
if(length(contiguous_flows)!=0){
snn_flow_distance <-
sapply(contiguous_flows,flow_distance,o_NN[,1],d_NN[,1])
flow_pair <- rbind(flow_pair,
data.frame(flow = i,
Neighbour_flow = contiguous_flows,
snn_distance = snn_flow_distance ))
}
# print(length(contiguous_flows))
# print(length(snn_flow_distance))
}
flow_pair <- flow_pair[flow_pair$snn_distance!=1,]
flow_pair <-
flow_pair[order(flow_pair$flow,flow_pair$snn_distance),]
4.5	Agglomerative	Clustering
## Initialize Clusters
for(i in 1:nrow(tt_trajectory)){
tt_trajectory$Cluster[i] = paste("c",i,sep = "")
## Group Clusters
for(i in 1:nrow(flow_pair)){
flow_1 <- flow_pair$flow[i]
flow_2 <- flow_pair$Neighbour_flow[i]
#Determine the Cluster,the Flows associated with
cluster_label_flow_1 <- tt_trajectory[tt_trajectory$TRIP_ID
==flow_1,]$Cluster
cluster_label_flow_2 <- tt_trajectory[tt_trajectory$TRIP_ID
==flow_2,]$Cluster
#Find Distance Between Clusters
#Step 1:Get All points in a single cluster
flows_with_cluster_1 <- tt_trajectory[tt_trajectory$Cluster ==
cluster_label_flow_1,]$TRIP_ID
flows_with_cluster_2 <- tt_trajectory[tt_trajectory$Cluster ==
cluster_label_flow_2,]$TRIP_ID
#Step 2:Compute Distance Matrix
distance_bw_clusters = data.frame(distances = c())
for(j in flows_with_cluster_1){
for(k in flows_with_cluster_2 ){
rec_in_flowpair <- subset(flow_pair,flow == j &
Neighbour_flow == k)
if(nrow(rec_in_flowpair)!=0){
distance_bw_clusters = rbind(distance_bw_clusters,
data.frame(distances =
rec_in_flowpair$snn_distance))
}
}
}
#Step 3:Condition for grouping Clusters Together -Linkage
Measures
avg_dist_bw_clusters <- mean(distance_bw_clusters$distances)
if(!is.na(avg_dist_bw_clusters) & avg_dist_bw_clusters < 1){
tt_trajectory[tt_trajectory$Cluster ==
cluster_label_flow_2,]$Cluster <-
cluster_label_flow_1
}
}
4.6	Visuaize	the	clusters
cluster_table <- group_by(tt_trajectory,Cluster) %>%
count(Cluster) %>%
filter(n>1)%>% select(Cluster,n)
Cluster_centroids <- data.frame(ClusterId = c(),Lat = c(),Lon =
c(),no_of_flows = c())
for(i in cluster_table$Cluster){
tripids_of_cluster <-
tt_trajectory[tt_trajectory$Cluster==i,]$TRIP_ID
lat_lon_o_df <- data.frame(lat_lon_origin)
cluster_o_positions <- filter(lat_lon_o_df,
lat_lon_o_df$TripId %in%
tripids_of_cluster)%>%select(Lat,Lon)
Cluster_centroids <- rbind(Cluster_centroids,
data.frame(ClusterId=i,
Lat =
apply(cluster_o_positions,2,median)[1],
Lon =
apply(cluster_o_positions,2,median)[2],
no_of_flows =
cluster_table[cluster_table$Cluster==i,]$n))
lat_lon_d_df <- data.frame(lat_lon_dest)
cluster_d_positions <- filter(lat_lon_d_df,
lat_lon_d_df$TripId %in%
tripids_of_cluster) %>% select(Lat,Lon)
Cluster_centroids <- rbind(Cluster_centroids,
data.frame(ClusterId = i,
Lat =
apply(cluster_d_positions,2,median)[1],
Lon =
apply(cluster_d_positions,2,median)[2],
no_of_flows =
cluster_table[cluster_table$Cluster==i,]$n))
}
map.portugal+
geom_line(data = Cluster_centroids,
aes(x=Lon,y=Lat,group =ClusterId, size = no_of_flows,
alpha = 1.5),color = "red")
CONCLUSION
Flow	Clustering	of	trips	,	thus	provides	a	meaningful	insights	about	the	flow	
,the	more	broader	the	clustered	flow,	it	has	more	number	of	actual	flows.On	
increasing	the	number	of	nearest	neighbours	,	the	flow	clusters	gives	a	clear	
picture	of	the	flows.	Due	to	extensive	computation	required	for	such	analysis,	
I	restricted	to	reduced	number	of	trips(500)	and	reduced	number	of	nearest	
neighbours(50)

More Related Content

What's hot

Math426_Project3-1
Math426_Project3-1Math426_Project3-1
Math426_Project3-1
Yijun Zhou
 
RHadoop の紹介
RHadoop の紹介RHadoop の紹介
RHadoop の紹介
Hidekazu Tanaka
 
Geo Spatial Plot using R
Geo Spatial Plot using R Geo Spatial Plot using R
Geo Spatial Plot using R
Rupak Roy
 
Geospatial Data in R
Geospatial Data in RGeospatial Data in R
Geospatial Data in R
Barry Rowlingson
 
Maximum Likelihood Calibration of the Hercules Data Set
Maximum Likelihood Calibration of the Hercules Data SetMaximum Likelihood Calibration of the Hercules Data Set
Maximum Likelihood Calibration of the Hercules Data Set
Christopher Garling
 
imageCorrectionLinearDiffusion
imageCorrectionLinearDiffusionimageCorrectionLinearDiffusion
imageCorrectionLinearDiffusion
Kellen Betts
 
Data visualization using the grammar of graphics
Data visualization using the grammar of graphicsData visualization using the grammar of graphics
Data visualization using the grammar of graphics
Rupak Roy
 
Dplyr and Plyr
Dplyr and PlyrDplyr and Plyr
Dplyr and Plyr
Paul Richards
 
Data visualization with multiple groups using ggplot2
Data visualization with multiple groups using ggplot2Data visualization with multiple groups using ggplot2
Data visualization with multiple groups using ggplot2
Rupak Roy
 
CLUSTERGRAM
CLUSTERGRAMCLUSTERGRAM
CLUSTERGRAM
Dr. Volkan OBAN
 
Raster package jacob
Raster package jacobRaster package jacob
Group01_Project3
Group01_Project3Group01_Project3
Group01_Project3
Shashidhar Sanda
 
Ggplot2 ch2
Ggplot2 ch2Ggplot2 ch2
Ggplot2 ch2
heba_ahmad
 
Introduction to spatial data analysis in r
Introduction to spatial data analysis in rIntroduction to spatial data analysis in r
Introduction to spatial data analysis in r
Richard Wamalwa
 
Geolocation Databases in Ruby on Rails
Geolocation Databases in Ruby on RailsGeolocation Databases in Ruby on Rails
Geolocation Databases in Ruby on Rails
Ireneusz Skrobiś
 
Quiz 2
Quiz 2Quiz 2
Quiz 2
Gopi Saiteja
 
Move your data (Hans Rosling style) with googleVis + 1 line of R code
Move your data (Hans Rosling style) with googleVis + 1 line of R codeMove your data (Hans Rosling style) with googleVis + 1 line of R code
Move your data (Hans Rosling style) with googleVis + 1 line of R code
Jeffrey Breen
 
Optimisation random graph presentation
Optimisation random graph presentationOptimisation random graph presentation
Optimisation random graph presentation
Venkat Sai Sharath Mudhigonda
 
STATE SPACE GENERATION FRAMEWORK BASED ON BINARY DECISION DIAGRAM FOR DISTRIB...
STATE SPACE GENERATION FRAMEWORK BASED ON BINARY DECISION DIAGRAM FOR DISTRIB...STATE SPACE GENERATION FRAMEWORK BASED ON BINARY DECISION DIAGRAM FOR DISTRIB...
STATE SPACE GENERATION FRAMEWORK BASED ON BINARY DECISION DIAGRAM FOR DISTRIB...
csandit
 

What's hot (19)

Math426_Project3-1
Math426_Project3-1Math426_Project3-1
Math426_Project3-1
 
RHadoop の紹介
RHadoop の紹介RHadoop の紹介
RHadoop の紹介
 
Geo Spatial Plot using R
Geo Spatial Plot using R Geo Spatial Plot using R
Geo Spatial Plot using R
 
Geospatial Data in R
Geospatial Data in RGeospatial Data in R
Geospatial Data in R
 
Maximum Likelihood Calibration of the Hercules Data Set
Maximum Likelihood Calibration of the Hercules Data SetMaximum Likelihood Calibration of the Hercules Data Set
Maximum Likelihood Calibration of the Hercules Data Set
 
imageCorrectionLinearDiffusion
imageCorrectionLinearDiffusionimageCorrectionLinearDiffusion
imageCorrectionLinearDiffusion
 
Data visualization using the grammar of graphics
Data visualization using the grammar of graphicsData visualization using the grammar of graphics
Data visualization using the grammar of graphics
 
Dplyr and Plyr
Dplyr and PlyrDplyr and Plyr
Dplyr and Plyr
 
Data visualization with multiple groups using ggplot2
Data visualization with multiple groups using ggplot2Data visualization with multiple groups using ggplot2
Data visualization with multiple groups using ggplot2
 
CLUSTERGRAM
CLUSTERGRAMCLUSTERGRAM
CLUSTERGRAM
 
Raster package jacob
Raster package jacobRaster package jacob
Raster package jacob
 
Group01_Project3
Group01_Project3Group01_Project3
Group01_Project3
 
Ggplot2 ch2
Ggplot2 ch2Ggplot2 ch2
Ggplot2 ch2
 
Introduction to spatial data analysis in r
Introduction to spatial data analysis in rIntroduction to spatial data analysis in r
Introduction to spatial data analysis in r
 
Geolocation Databases in Ruby on Rails
Geolocation Databases in Ruby on RailsGeolocation Databases in Ruby on Rails
Geolocation Databases in Ruby on Rails
 
Quiz 2
Quiz 2Quiz 2
Quiz 2
 
Move your data (Hans Rosling style) with googleVis + 1 line of R code
Move your data (Hans Rosling style) with googleVis + 1 line of R codeMove your data (Hans Rosling style) with googleVis + 1 line of R code
Move your data (Hans Rosling style) with googleVis + 1 line of R code
 
Optimisation random graph presentation
Optimisation random graph presentationOptimisation random graph presentation
Optimisation random graph presentation
 
STATE SPACE GENERATION FRAMEWORK BASED ON BINARY DECISION DIAGRAM FOR DISTRIB...
STATE SPACE GENERATION FRAMEWORK BASED ON BINARY DECISION DIAGRAM FOR DISTRIB...STATE SPACE GENERATION FRAMEWORK BASED ON BINARY DECISION DIAGRAM FOR DISTRIB...
STATE SPACE GENERATION FRAMEWORK BASED ON BINARY DECISION DIAGRAM FOR DISTRIB...
 

Viewers also liked

Decomposing Object oriented class
Decomposing Object oriented classDecomposing Object oriented class
Decomposing Object oriented class
dineshppc
 
Data miningpresentation
Data miningpresentationData miningpresentation
Data miningpresentation
Manoj Krishna Yadavalli
 
Hierarchical clustering
Hierarchical clusteringHierarchical clustering
Hierarchical clustering
ishmecse13
 
Correspondence analysis(step by step)
Correspondence analysis(step by step)Correspondence analysis(step by step)
Correspondence analysis(step by step)
Nguyen Van Chuc
 
Hierarchical Clustering
Hierarchical ClusteringHierarchical Clustering
Hierarchical Clustering
Carlos Castillo (ChaTo)
 
The Top Skills That Can Get You Hired in 2017
The Top Skills That Can Get You Hired in 2017The Top Skills That Can Get You Hired in 2017
The Top Skills That Can Get You Hired in 2017
LinkedIn
 

Viewers also liked (6)

Decomposing Object oriented class
Decomposing Object oriented classDecomposing Object oriented class
Decomposing Object oriented class
 
Data miningpresentation
Data miningpresentationData miningpresentation
Data miningpresentation
 
Hierarchical clustering
Hierarchical clusteringHierarchical clustering
Hierarchical clustering
 
Correspondence analysis(step by step)
Correspondence analysis(step by step)Correspondence analysis(step by step)
Correspondence analysis(step by step)
 
Hierarchical Clustering
Hierarchical ClusteringHierarchical Clustering
Hierarchical Clustering
 
The Top Skills That Can Get You Hired in 2017
The Top Skills That Can Get You Hired in 2017The Top Skills That Can Get You Hired in 2017
The Top Skills That Can Get You Hired in 2017
 

Similar to Better Visualization of Trips through Agglomerative Clustering

algorithm Unit 3
algorithm Unit 3algorithm Unit 3
algorithm Unit 3
Monika Choudhery
 
Cab travel time prediction using ensemble models
Cab travel time prediction using ensemble modelsCab travel time prediction using ensemble models
Cab travel time prediction using ensemble models
Ayan Sengupta
 
Prob-Dist-Toll-Forecast-Uncertainty
Prob-Dist-Toll-Forecast-UncertaintyProb-Dist-Toll-Forecast-Uncertainty
Prob-Dist-Toll-Forecast-Uncertainty
Ankoor Bhagat
 
Unit 3 daa
Unit 3 daaUnit 3 daa
Unit 3 daa
Nv Thejaswini
 
Spark - Citi Bike NYC
Spark - Citi Bike NYCSpark - Citi Bike NYC
Spark - Citi Bike NYC
Sushmanth Sagala
 
Lecture set 5
Lecture set 5Lecture set 5
Lecture set 5
Gopi Saiteja
 
routing algorithm
routing algorithmrouting algorithm
routing algorithm
AnusuaBasu
 
ECCV2008: MAP Estimation Algorithms in Computer Vision - Part 2
ECCV2008: MAP Estimation Algorithms in Computer Vision - Part 2ECCV2008: MAP Estimation Algorithms in Computer Vision - Part 2
ECCV2008: MAP Estimation Algorithms in Computer Vision - Part 2
zukun
 
Where in the world
Where in the worldWhere in the world
Where in the world
Marcus Deglos
 
Supply chain logistics : vehicle routing and scheduling
Supply chain logistics : vehicle  routing and  schedulingSupply chain logistics : vehicle  routing and  scheduling
Supply chain logistics : vehicle routing and scheduling
Retigence Technologies
 
2013.11.14 Big Data Workshop Bruno Voisin
2013.11.14 Big Data Workshop Bruno Voisin 2013.11.14 Big Data Workshop Bruno Voisin
2013.11.14 Big Data Workshop Bruno Voisin
NUI Galway
 
10. R getting spatial
10.  R getting spatial10.  R getting spatial
10. R getting spatial
ExternalEvents
 
Branch and bounding : Data structures
Branch and bounding : Data structuresBranch and bounding : Data structures
Branch and bounding : Data structures
Kàŕtheek Jåvvàjí
 
IAP presentation-1.pptx
IAP presentation-1.pptxIAP presentation-1.pptx
IAP presentation-1.pptx
HirazNor
 
Page rank
Page rankPage rank
Page rank
Carlos
 
Mrongraphs acm-sig-2 (1)
Mrongraphs acm-sig-2 (1)Mrongraphs acm-sig-2 (1)
Mrongraphs acm-sig-2 (1)
Nima Sarshar
 
GEOMETRIC TAMPERING ESTIMATION BY MEANS OF A SIFT-BASED FORENSIC ANALYSIS
GEOMETRIC TAMPERING ESTIMATION  BY MEANS OF A SIFT-BASED FORENSIC ANALYSISGEOMETRIC TAMPERING ESTIMATION  BY MEANS OF A SIFT-BASED FORENSIC ANALYSIS
GEOMETRIC TAMPERING ESTIMATION BY MEANS OF A SIFT-BASED FORENSIC ANALYSIS
ICL - Image Communication Laboratory
 
A study on_contrast_and_comparison_between_bellman-ford_algorithm_and_dijkstr...
A study on_contrast_and_comparison_between_bellman-ford_algorithm_and_dijkstr...A study on_contrast_and_comparison_between_bellman-ford_algorithm_and_dijkstr...
A study on_contrast_and_comparison_between_bellman-ford_algorithm_and_dijkstr...
Khoa Mac Tu
 
Data manipulation with dplyr
Data manipulation with dplyrData manipulation with dplyr
Data manipulation with dplyr
Romain Francois
 
R getting spatial
R getting spatialR getting spatial
R getting spatial
FAO
 

Similar to Better Visualization of Trips through Agglomerative Clustering (20)

algorithm Unit 3
algorithm Unit 3algorithm Unit 3
algorithm Unit 3
 
Cab travel time prediction using ensemble models
Cab travel time prediction using ensemble modelsCab travel time prediction using ensemble models
Cab travel time prediction using ensemble models
 
Prob-Dist-Toll-Forecast-Uncertainty
Prob-Dist-Toll-Forecast-UncertaintyProb-Dist-Toll-Forecast-Uncertainty
Prob-Dist-Toll-Forecast-Uncertainty
 
Unit 3 daa
Unit 3 daaUnit 3 daa
Unit 3 daa
 
Spark - Citi Bike NYC
Spark - Citi Bike NYCSpark - Citi Bike NYC
Spark - Citi Bike NYC
 
Lecture set 5
Lecture set 5Lecture set 5
Lecture set 5
 
routing algorithm
routing algorithmrouting algorithm
routing algorithm
 
ECCV2008: MAP Estimation Algorithms in Computer Vision - Part 2
ECCV2008: MAP Estimation Algorithms in Computer Vision - Part 2ECCV2008: MAP Estimation Algorithms in Computer Vision - Part 2
ECCV2008: MAP Estimation Algorithms in Computer Vision - Part 2
 
Where in the world
Where in the worldWhere in the world
Where in the world
 
Supply chain logistics : vehicle routing and scheduling
Supply chain logistics : vehicle  routing and  schedulingSupply chain logistics : vehicle  routing and  scheduling
Supply chain logistics : vehicle routing and scheduling
 
2013.11.14 Big Data Workshop Bruno Voisin
2013.11.14 Big Data Workshop Bruno Voisin 2013.11.14 Big Data Workshop Bruno Voisin
2013.11.14 Big Data Workshop Bruno Voisin
 
10. R getting spatial
10.  R getting spatial10.  R getting spatial
10. R getting spatial
 
Branch and bounding : Data structures
Branch and bounding : Data structuresBranch and bounding : Data structures
Branch and bounding : Data structures
 
IAP presentation-1.pptx
IAP presentation-1.pptxIAP presentation-1.pptx
IAP presentation-1.pptx
 
Page rank
Page rankPage rank
Page rank
 
Mrongraphs acm-sig-2 (1)
Mrongraphs acm-sig-2 (1)Mrongraphs acm-sig-2 (1)
Mrongraphs acm-sig-2 (1)
 
GEOMETRIC TAMPERING ESTIMATION BY MEANS OF A SIFT-BASED FORENSIC ANALYSIS
GEOMETRIC TAMPERING ESTIMATION  BY MEANS OF A SIFT-BASED FORENSIC ANALYSISGEOMETRIC TAMPERING ESTIMATION  BY MEANS OF A SIFT-BASED FORENSIC ANALYSIS
GEOMETRIC TAMPERING ESTIMATION BY MEANS OF A SIFT-BASED FORENSIC ANALYSIS
 
A study on_contrast_and_comparison_between_bellman-ford_algorithm_and_dijkstr...
A study on_contrast_and_comparison_between_bellman-ford_algorithm_and_dijkstr...A study on_contrast_and_comparison_between_bellman-ford_algorithm_and_dijkstr...
A study on_contrast_and_comparison_between_bellman-ford_algorithm_and_dijkstr...
 
Data manipulation with dplyr
Data manipulation with dplyrData manipulation with dplyr
Data manipulation with dplyr
 
R getting spatial
R getting spatialR getting spatial
R getting spatial
 

Better Visualization of Trips through Agglomerative Clustering

  • 1. Better Visualization of Trips Through Agglomerative Clustering Anbarasan S February 2, 2016 1. Problem Statement : Visualization of flow/mobility data on a map always gives a cluttered view, even for small dataset .Hence it is difficult , to derive any insights or make decisions out of it. 2. Solution: Devise a methodology to group or aggregate similar flows 3. Methodology : Step 1: K-Nearest Neighbours 1.a. Find the k-nearest neighbors for the Origin location of particular Trip/Flow. 1.b.Find The k-nearest Neighbour For the destination location of particular Trip/Flow. Step 2: Contiguous Flows Two flows/Trips ,are said to be contiguous if and only if, it satisfies both the following conditions 1. K-NN of Origin1(Trip1) overlaps with k-NN of Origin2(Trip2). 2. K-NN of Destination1(Trip1) overlaps with k-NN of Destination2(Trip2). Step 3: Agglomerative Clustering Two flows are clustered in a agglomerative fashion, based on a distance measure defined by the number of nearest neighbours shared Dist(Trip1,Trip2) = 1- [KNN(O1) η KNN(O2)/k * KNN(D1) η KNN(D2)/k] O1,O2- Origins Of Trip1 and Trip2 respectively D1,D2 destination of Trip1 and Trip2 respectively Very low value of dist , suggests that the flows are very nearer, and larger value >=1 suggests that flows cannot be clustered together.
  • 2. Step 4: Visualization Agglomerative Clusters , when projected on to map, gives meaningful insights 4. Implementation: Dataset: Taxi Service Trajectory - Prediction Challenge, ECML PKDD 2015 Data Set https://archive.ics.uci.edu/ml/datasets/Taxi+Service+Trajectory+- +Prediction+Challenge,+ECML+PKDD+2015 tt_trajectory <- read.csv("G:/TaxiTrajectory/train.csv", nrows = 500, stringsAsFactors = FALSE) head(tt_trajectory,1) ## TRIP_ID CALL_TYPE ORIGIN_CALL ORIGIN_STAND TAXI_ID TIMESTAMP ## 1 1.372637e+18 C NA NA 20000589 137263 6858 ## DAY_TYPE MISSING_DATA ## 1 A False ## POLYLINE ## 1 [[-8.618643,41.141412],[-8.618499,41.141376],[-8.620326,41.14251],[- 8.622153,41.143815],[-8.623953,41.144373],[-8.62668,41.144778],[- 8.627373,41.144697],[-8.630226,41.14521],[-8.632746,41.14692],[- 8.631738,41.148225],[-8.629938,41.150385],[-8.62911,41.151213],[- 8.629128,41.15124],[-8.628786,41.152203],[-8.628687,41.152374],[- 8.628759,41.152518],[-8.630838,41.15268],[-8.632323,41.153022],[- 8.631144,41.154489],[-8.630829,41.154507],[-8.630829,41.154516],[- 8.630829,41.154498],[-8.630838,41.154489]] 4.1.Get the Origin And Destination location for each trip ∑ The Polyline field contains the Longitude Latitude location details of a trip taken every 15 secs. ∑ The polyline field is parsed using JSON library , to get the Origin and destination locations of each trip
  • 3. library(RJSONIO) ## Warning: package 'RJSONIO' was built under R version 3.2.3 # Function to get Origin and Destination Location of each trip # using json library positions_1 <- function(row){ trajectory_list <- fromJSON(row$POLYLINE) no_of_points <- length(trajectory_list) if(no_of_points>0){ lan_lat_pairs<-data.frame(rbind(trajectory_list[[1]], trajectory_list[[no_of_points]])) return(lan_lat_pairs) }} coordinates <- data.frame(TripId=c(), Ordinal=c(),Booking_time= c(), Lat=c(), Lon=c(), Status=c()) for (i in 1:nrow(tt_trajectory)) { lat_lon_df <- positions_1(tt_trajectory[i,]) status <- c("Origin","Destination") if(!is.null(lat_lon_df)){ coordinates <- rbind(coordinates, data.frame(TripId = tt_trajectory$TRIP_ID[i], Booking_time = tt_trajectory$TIMESTAMP[i], Lat = lat_lon_df$X2, Lon = lat_lon_df$X1, Status = status )) }} coordinates$Status <- factor(coordinates$Status, levels <- c("Origin","Destination")) head(coordinates) ## TripId Booking_time Lat Lon Status ## 1 1.372637e+18 1372636858 41.14141 -8.618643 Origin ## 2 1.372637e+18 1372636858 41.15449 -8.630838 Destination ## 3 1.372637e+18 1372637303 41.15983 -8.639847 Origin ## 4 1.372637e+18 1372637303 41.17067 -8.665740 Destination ## 5 1.372637e+18 1372636951 41.14036 -8.612964 Origin ## 6 1.372637e+18 1372636951 41.14053 -8.615970 Destination
  • 4. 4.2 Visualize the Trips Visualize the trips on to a map library(ggmap) ## Warning: package 'ggmap' was built under R version 3.2.3 ## Loading required package: ggplot2 ## Warning: package 'ggplot2' was built under R version 3.2.3 lat.centre = median(coordinates$Lat) lon.centre = median(coordinates$Lon) map.portugal <-ggmap(get_map(location = c(lon.centre,lat.centre), maptype = "roadmap", source="google", zoom = 12)) ## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=41.157607, - 8.615227&zoom=12&size=640x640&scale=2&maptype=roadmap&language=en - EN&sensor=false map.portugal+ geom_point(data = coordinates ,aes(x=Lon,y=Lat,color= Status), size= 1) ## Warning: Removed 5 rows containing missing values (geom_point).
  • 5. # Connect SOurce and Destination for Each trip map.portugal+ geom_point(data = coordinates ,aes(x=Lon,y=Lat,color= Status), size= 1)+ geom_line(data = coordinates,aes(x=Lon,y=Lat, group = TripId ), color = "blue") ## Warning: Removed 5 rows containing missing values (geom_point). ## Warning: Removed 5 rows containing missin g values (geom_path).
  • 6. map.portugal+ geom_line(data = coordinates,aes(x=Lon,y=Lat, group = TripId ), color = "blue") ## Warning: Removed 5 rows containing missing values (geom_path).
  • 7. Projection of 1/100 of the original dataset, gives a cluttered view on the map. 4.3.Find K-NN for origin points and destination points separately 4.3.1. Knn for Origin # Find distance using Haversiandistance library(dplyr) library(geosphere) library(dbscan) # Function to Convert Ids to TripIds Ids_to_tripIds<- function(row) lat_lon_origin[row,"TripId"] # Origin of all the trips lat_lon_origin <- filter(coordinates,Status == "Origin") %>% select(TripId,Lat,Lon) %>% as.matrix() # Find the actual distance in meters between the origin points distance_matrix_origin <- distm(lat_lon_origin[,c(3,2)], fun=distHaversine)
  • 8. # Find kNN For origin # No. of. nearest neighbours = 50 knn_origin <- kNN(distance_matrix_origin,k=50) Origin_NN_ids <- apply(knn_origin$id,1,Ids_to_tripIds) # Construct nearest neighbour for every origin point origin_NN <- data.frame(FlowId = c(),Neighbour_Flows = c()) for (i in 1:nrow(lat_lon_origin)) { origin_NN <- rbind(origin_NN, data.frame(FlowId = lat_lon_origin[i,"TripId"], Neighbour_Flows = Origin_NN_ids[,i])) } head(origin_NN) ## FlowId Neighbour_Flows ## 1 1.372637e+18 1.372657e+18 ## 2 1.372637e+18 1.372662e+18 ## 3 1.372637e+18 1.372638e+18 ## 4 1.372637e+18 1.372663e+18 ## 5 1.372637e+18 1.372656e+18 ## 6 1.372637e+18 1.372642e+18 4.3.2.Knn for destination # Destination lat_lon_dest <- filter(coordinates,Status == "Destination")%>% select(TripId,Lat,Lon) %>% as.matrix() distance_matrix_dest <- distm(lat_lon_dest[,c(3,2)], fun=distHaversine) # Find kNN For destination knn_destination <- kNN(distance_matrix_dest,k=50) dest_NN_ids <- apply(knn_destination$id,1,Ids_to_tripIds) dest_NN <- data.frame(FlowId = c(),Neighbour_Flows = c()) for (i in 1:nrow(lat_lon_dest)) { dest_NN <- rbind(dest_NN, data.frame(FlowId = lat_lon_dest[i,"TripId"],
  • 9. Neighbour_Flows = dest_NN_ids[,i])) } 4.4 Find Contiguous Flow Pair flow_pair <- data.frame(flow = c(), Neighbour_flow = c(), snn_distance = c() ) flow_distance <- function(row,o_NN,d_NN){ onn <- origin_NN[origin_NN$FlowId==row,]$Neighbour_Flows dnn <- dest_NN[dest_NN$FlowId==row,]$Neighbour_Flows common_o_points <- sum(o_NN %in% onn)/50 common_d_points <- sum(d_NN %in% dnn)/50 snn <- 1-(common_o_points*common_d_points) return(snn) } for(i in tt_trajectory$TRIP_ID){ o_NN <- filter(origin_NN,FlowId==i)%>% select(Neighbour_Flows) #nearest neighbours of given origin d_NN <- filter(dest_NN,FlowId == i )%>%select(Neighbour_Flows) #NNs of given destination NN_matches <- o_NN[,1] %in% d_NN[,1] #Flows having Common Origin and Common destination # Contiguous/Nearest Flows for a given flow # Two Flows are said to be Contiguous if they are in # Nearest neighbour of both origin and destination of a given flow contiguous_flows <-o_NN$Neighbour_Flows[NN_matches] #dist bw Flows #Arguments Passed: ##1.List of Flows Found to be Contiguous to a given flow ##2.flow_distance -Function to calculate distance between 2 flows if(length(contiguous_flows)!=0){ snn_flow_distance <- sapply(contiguous_flows,flow_distance,o_NN[,1],d_NN[,1]) flow_pair <- rbind(flow_pair, data.frame(flow = i, Neighbour_flow = contiguous_flows, snn_distance = snn_flow_distance )) } # print(length(contiguous_flows)) # print(length(snn_flow_distance))
  • 10. } flow_pair <- flow_pair[flow_pair$snn_distance!=1,] flow_pair <- flow_pair[order(flow_pair$flow,flow_pair$snn_distance),] 4.5 Agglomerative Clustering ## Initialize Clusters for(i in 1:nrow(tt_trajectory)){ tt_trajectory$Cluster[i] = paste("c",i,sep = "") ## Group Clusters for(i in 1:nrow(flow_pair)){ flow_1 <- flow_pair$flow[i] flow_2 <- flow_pair$Neighbour_flow[i] #Determine the Cluster,the Flows associated with cluster_label_flow_1 <- tt_trajectory[tt_trajectory$TRIP_ID ==flow_1,]$Cluster cluster_label_flow_2 <- tt_trajectory[tt_trajectory$TRIP_ID ==flow_2,]$Cluster #Find Distance Between Clusters #Step 1:Get All points in a single cluster flows_with_cluster_1 <- tt_trajectory[tt_trajectory$Cluster == cluster_label_flow_1,]$TRIP_ID flows_with_cluster_2 <- tt_trajectory[tt_trajectory$Cluster == cluster_label_flow_2,]$TRIP_ID
  • 11. #Step 2:Compute Distance Matrix distance_bw_clusters = data.frame(distances = c()) for(j in flows_with_cluster_1){ for(k in flows_with_cluster_2 ){ rec_in_flowpair <- subset(flow_pair,flow == j & Neighbour_flow == k) if(nrow(rec_in_flowpair)!=0){ distance_bw_clusters = rbind(distance_bw_clusters, data.frame(distances = rec_in_flowpair$snn_distance)) } } } #Step 3:Condition for grouping Clusters Together -Linkage Measures avg_dist_bw_clusters <- mean(distance_bw_clusters$distances) if(!is.na(avg_dist_bw_clusters) & avg_dist_bw_clusters < 1){ tt_trajectory[tt_trajectory$Cluster == cluster_label_flow_2,]$Cluster <- cluster_label_flow_1 } }
  • 12. 4.6 Visuaize the clusters cluster_table <- group_by(tt_trajectory,Cluster) %>% count(Cluster) %>% filter(n>1)%>% select(Cluster,n) Cluster_centroids <- data.frame(ClusterId = c(),Lat = c(),Lon = c(),no_of_flows = c()) for(i in cluster_table$Cluster){ tripids_of_cluster <- tt_trajectory[tt_trajectory$Cluster==i,]$TRIP_ID lat_lon_o_df <- data.frame(lat_lon_origin) cluster_o_positions <- filter(lat_lon_o_df, lat_lon_o_df$TripId %in% tripids_of_cluster)%>%select(Lat,Lon) Cluster_centroids <- rbind(Cluster_centroids, data.frame(ClusterId=i, Lat = apply(cluster_o_positions,2,median)[1], Lon = apply(cluster_o_positions,2,median)[2], no_of_flows = cluster_table[cluster_table$Cluster==i,]$n)) lat_lon_d_df <- data.frame(lat_lon_dest) cluster_d_positions <- filter(lat_lon_d_df, lat_lon_d_df$TripId %in% tripids_of_cluster) %>% select(Lat,Lon)
  • 13. Cluster_centroids <- rbind(Cluster_centroids, data.frame(ClusterId = i, Lat = apply(cluster_d_positions,2,median)[1], Lon = apply(cluster_d_positions,2,median)[2], no_of_flows = cluster_table[cluster_table$Cluster==i,]$n)) } map.portugal+ geom_line(data = Cluster_centroids, aes(x=Lon,y=Lat,group =ClusterId, size = no_of_flows, alpha = 1.5),color = "red")