SlideShare a Scribd company logo
1 of 54
Download to read offline
What More Can I Learn from My
OpenTelemetry Traces?
Analyzing traces with SQL
February 1, 2022
John Pruitt
What we will
cover today 👀
01
02
Context: Tracing..? 🤔
Our Example System
● Request Rates / Error Rates
● Request Durations
● Service Dependencies
● Upstream Spans
● Downstream Spans
04 Q&A!
03 Building Grafana Dashboards for Traces
tsdb.co/opentelemetry-demo
01
Context
Tracing?
A trace is a tree of spans
SPAN1
SPAN2 SPAN3 SPAN4
SPAN5 SPAN6 SPAN7 SPAN8
SPAN9 SPAN10
A trace is also a time series
SPAN1
SPAN2 SPAN3 SPAN4
SPAN5 SPAN6 SPAN7 SPAN8
SPAN9 SPAN10
TIME
A trace in Jaeger 👇
By the end of this talk, you will be able to build this 🙌
02
Our Example System
An Absurd Password Generator
LOAD
GENERATOR
DIGIT
UPPER
LOWER
SPECIAL
OTEL
COLLECTOR
PROMSCALE
COLLECTOR
PROMSCALE
03
Building Grafana
Dashboards for Traces
Using simple (but powerful) SQL
Request Rates / Error Rates
A simple dashboard to get started
1 Requests per second
-- Requests per second, averaged over 1-minute buckets for the last 5 minutes.
-- A root span (parent_span_id IS NULL) represents one incoming request, so
-- count(root spans)/60 is the per-second rate within the minute bucket.
SELECT
  time_bucket('1 minute', start_time) as time,
  count(*) / 60.0 as req_per_sec
FROM ps_trace.span s
WHERE s.start_time >= now() - interval '5 minutes'
  AND parent_span_id is null -- just the root spans
GROUP BY time_bucket('1 minute', start_time)  -- named, not positional, per SQL style guides
ORDER BY time
1 Requests per second
2 Requests per second
-- Requests per second at 1-second resolution (no averaging needed:
-- one bucket == one second, so count(*) is already the rate).
SELECT
  time_bucket('1 second', start_time) as time,
  count(*) as req_per_sec
FROM ps_trace.span s
WHERE s.start_time >= now() - interval '5 minutes'
  AND parent_span_id is null -- just the root spans
GROUP BY time_bucket('1 second', start_time)  -- named, not positional
ORDER BY time
2 Requests per second
Errors by service
Errors by service
-- Number of errored spans per service within the Grafana dashboard's
-- time range ($__timeFilter is expanded by Grafana's Postgres datasource).
SELECT
  service_name,
  count(*) as num_err
FROM ps_trace.span
WHERE $__timeFilter(start_time)
  AND status_code = 'STATUS_CODE_ERROR'
GROUP BY service_name  -- named, not positional
Errors by operation
Errors by operation
-- Error rate per (service, operation): errored spans divided by all spans.
-- num_total >= 1 for every group (each group exists because it has rows),
-- so the division cannot hit zero. ::numeric avoids integer division.
SELECT
  x.service_name,
  x.span_name,
  x.num_err::numeric / x.num_total as err_rate
FROM
(
  SELECT
    service_name,
    span_name,
    -- FILTER counts only error spans; plain count(*) counts all spans
    count(*) filter (where status_code = 'STATUS_CODE_ERROR') as num_err,
    count(*) as num_total
  FROM ps_trace.span
  WHERE $__timeFilter(start_time)
  GROUP BY service_name, span_name  -- named, not positional
) x
ORDER BY err_rate desc
Error rates by operation over time
Error rates by operation over time
-- Per-minute error rate for every (service, operation) pair over time,
-- suitable for a Grafana time-series panel.
SELECT
  x.time,
  x.service_name,
  x.span_name,
  x.num_err::numeric / x.num_total as err_rate  -- ::numeric avoids integer division
FROM
(
  SELECT
    time_bucket('1 minute', start_time) as time,
    service_name,
    span_name,
    count(*) filter (where status_code = 'STATUS_CODE_ERROR') as num_err,
    count(*) as num_total
  FROM ps_trace.span
  WHERE $__timeFilter(start_time)
  GROUP BY time_bucket('1 minute', start_time), service_name, span_name  -- named, not positional
) x
ORDER BY time
Request Durations
A somewhat more complex dashboard…
Request durations
Slowest requests
-- Ten slowest requests in the dashboard's time range: root spans only,
-- ordered by total request duration.
-- FIX: the original had a dangling comma after "duration_ms," before FROM,
-- which is a syntax error in PostgreSQL.
SELECT
  trace_id,
  start_time,
  duration_ms
FROM ps_trace.span
WHERE $__timeFilter(start_time)
  AND parent_span_id is null  -- root span == whole request
ORDER BY duration_ms DESC
LIMIT 10
Slowest requests
Histogram of request durations
Histogram of request durations
-- Raw root-span durations for Grafana's histogram panel
-- (the panel does the bucketing client-side, so no GROUP BY is needed).
SELECT duration_ms
FROM ps_trace.span
WHERE $__timeFilter(start_time)
AND parent_span_id is null
Distribution of request durations over time
Distribution of request durations over time
-- One point per request: scatter of root-span durations over time,
-- showing the full distribution rather than a single aggregate line.
SELECT
  start_time as time,
  duration_ms
FROM ps_trace.span
WHERE $__timeFilter(start_time)
  AND parent_span_id is null  -- root spans only
ORDER BY time  -- named, not positional
Request duration percentiles over time
-- Request-duration percentiles (p01..p99) per minute.
-- CROSS JOINing the bucketed durations with a list of percentile fractions
-- produces one row per (minute, percentile); approx_percentile/percentile_agg
-- are Timescale Toolkit hyperfunctions (approximate, not exact, percentiles).
SELECT
r.time,
-- e.g. 0.95 -> 'p95'; lpad keeps 0.01 -> 'p01' two digits wide
'p' || lpad((p.p * 100.0)::int::text, 2, '0') as percentile,
approx_percentile(p.p, percentile_agg(r.duration_ms)) as duration
FROM
(
SELECT
time_bucket('1 minute', start_time) as time,
duration_ms
FROM ps_trace.span
WHERE $__timeFilter(start_time)
AND parent_span_id is null
) r
CROSS JOIN
-- the percentile fractions to compute
(
SELECT unnest(ARRAY[.01, .5, .75, .9, .95, .99]) as p
) p
GROUP BY r.time, p.p
ORDER BY r.time
Request duration percentiles over time
Service Dependencies
A real-time minimap!
Service dependencies
-- Grafana node-graph "nodes" query: one node per known service name.
-- NOTE(review): this reads Promscale's internal _ps_trace.tag table (leading
-- underscore = private schema); value is jsonb and #>>'{}' unwraps it to text.
SELECT
value#>>'{}' as id,
value#>>'{}' as title
FROM _ps_trace.tag
WHERE key = 'service.name'
-- Grafana node-graph "edges" query: a self-join of the span table pairs each
-- parent span (p) with its child spans (k); keeping only pairs where the
-- service differs yields the cross-service call graph.
SELECT
  p.service_name || '->' || k.service_name as id,
  p.service_name as source,
  k.service_name as target,
  k.span_name as "mainStat",       -- quoted mixed-case names are Grafana's field contract
  count(*) as "secondaryStat"
FROM ps_trace.span p
INNER JOIN ps_trace.span k
  ON (p.trace_id = k.trace_id
  AND p.span_id = k.parent_span_id        -- k is a direct child of p
  AND p.service_name != k.service_name)   -- only cross-service hops
WHERE $__timeFilter(p.start_time)
GROUP BY p.service_name, k.service_name, k.span_name  -- named, not positional
Service dependencies
Service dependencies
-- Cross-service call statistics: for each caller->callee service pair and
-- callee operation, the call count plus total and average execution time.
SELECT
  p.service_name as source,
  k.service_name as target,
  k.span_name,
  count(*) as calls,
  sum(k.duration_ms) as total_exec_ms,
  avg(k.duration_ms) as avg_exec_ms
FROM ps_trace.span p
INNER JOIN ps_trace.span k
  ON (p.trace_id = k.trace_id
  AND p.span_id = k.parent_span_id        -- k is a direct child of p
  AND p.service_name != k.service_name)   -- only cross-service hops
WHERE $__timeFilter(p.start_time)
GROUP BY p.service_name, k.service_name, k.span_name  -- named, not positional
ORDER BY total_exec_ms DESC
Service dependencies
Upstream Spans
Analyzing the traces’ tree structure
Upstream spans
-- Upstream analysis (node-graph nodes): starting from the selected
-- service/operation, walk UP each trace tree to the root, counting how often
-- each (service, operation) appears as an ancestor.
WITH RECURSIVE x AS
(
-- anchor: the spans matching the Grafana template variables
SELECT
trace_id, span_id, parent_span_id,
service_name, span_name
FROM ps_trace.span
WHERE $__timeFilter(start_time)
AND service_name = '${service}'
AND span_name = '${operation}'
UNION ALL
-- recursive step: join each span to its PARENT (x.parent_span_id = s.span_id),
-- i.e. move one level toward the root of the trace
SELECT
s.trace_id, s.span_id, s.parent_span_id,
s.service_name, s.span_name
FROM x
INNER JOIN ps_trace.span s
ON (x.trace_id = s.trace_id
AND x.parent_span_id = s.span_id)
)
-- md5(service-span) makes a stable node id matching the edges query
SELECT
md5(service_name || '-' || span_name) as id,
span_name as title,
service_name as "subTitle",
count(*) as "mainStat"
FROM x
GROUP BY service_name, span_name
Upstream spans (nodes)
-- Upstream analysis (node-graph edges): same upward walk as the nodes query,
-- but each recursive step also emits an edge from the parent (source) to the
-- child (target); edges point in the upstream direction.
WITH RECURSIVE x AS
(
-- anchor rows carry NULL edge fields (the starting spans have no edge yet)
SELECT
trace_id, span_id, parent_span_id, service_name, span_name,
null::text as id,
null::text as target,
null::text as source
FROM ps_trace.span
WHERE $__timeFilter(start_time)
AND service_name = '${service}'
AND span_name = '${operation}'
UNION ALL
-- recursive step: s is the PARENT of x; the md5 ids here must match the
-- node ids produced by the companion "nodes" query
SELECT
s.trace_id, s.span_id, s.parent_span_id, s.service_name, s.span_name,
md5(s.service_name || '-' || s.span_name || '-' || x.service_name || '-' || x.span_name) as id,
md5(x.service_name || '-' || x.span_name) as target,
md5(s.service_name || '-' || s.span_name) as source
FROM x
INNER JOIN ps_trace.span s
ON (x.trace_id = s.trace_id
AND x.parent_span_id = s.span_id)
)
-- drop the NULL anchor rows; DISTINCT collapses repeated edges
SELECT DISTINCT x.id, x.target, x.source
FROM x
WHERE id is not null
Upstream spans (edges)
Downstream Spans
Analyzing both the tree + the time series
Downstream spans
-- Downstream analysis (node-graph nodes): starting from the selected
-- service/operation, walk DOWN each trace tree to the leaves, counting how
-- often each (service, operation) appears as a descendant.
WITH RECURSIVE x AS
(
-- anchor: the spans matching the Grafana template variables
SELECT trace_id, span_id, parent_span_id, service_name, span_name
FROM ps_trace.span
WHERE $__timeFilter(start_time)
AND service_name = '${service}' AND span_name = '${operation}'
UNION ALL
-- recursive step: join each span to its CHILDREN (x.span_id = s.parent_span_id),
-- i.e. move one level away from the root -- note the reversed join vs. upstream
SELECT s.trace_id, s.span_id, s.parent_span_id, s.service_name, s.span_name
FROM x
INNER JOIN ps_trace.span s
ON (x.trace_id = s.trace_id
AND x.span_id = s.parent_span_id)
)
-- md5(service-span) makes a stable node id matching the edges query
SELECT
md5(service_name || '-' || span_name) as id,
span_name as title, service_name as "subTitle", count(*) as "mainStat"
FROM x
GROUP BY service_name, span_name
Downstream spans (nodes)
-- Downstream analysis (node-graph edges): same downward walk as the nodes
-- query, emitting an edge per recursive step; here the known span (x) is the
-- source and its child (s) is the target -- the mirror of the upstream query.
WITH RECURSIVE x AS
(
-- anchor rows carry NULL edge fields (the starting spans have no edge yet)
SELECT trace_id, span_id, parent_span_id, service_name, span_name,
null::text as id,
null::text as source,
null::text as target
FROM ps_trace.span
WHERE $__timeFilter(start_time)
AND service_name = '${service}'
AND span_name = '${operation}'
UNION ALL
-- recursive step: s is a CHILD of x; md5 ids must match the nodes query
SELECT s.trace_id, s.span_id, s.parent_span_id, s.service_name, s.span_name,
md5(s.service_name || '-' || s.span_name || '-' || x.service_name || '-' ||
x.span_name) as id,
md5(x.service_name || '-' || x.span_name) as source,
md5(s.service_name || '-' || s.span_name) as target
FROM x
INNER JOIN ps_trace.span s
ON (x.trace_id = s.trace_id
AND x.span_id = s.parent_span_id)
)
-- drop the NULL anchor rows; DISTINCT collapses repeated edges
SELECT DISTINCT x.id, x.source, x.target
FROM x
WHERE id is not null
Downstream spans (edges)
Total execution time by operation
-- Total EXCLUSIVE execution time per operation over the downstream subtree of
-- the selected service/operation. Each span's duration has the sum of its
-- direct children's durations subtracted (the correlated subquery), so time is
-- attributed to the operation actually doing the work, not its callers.
WITH RECURSIVE x AS
(
  -- anchor: the selected spans, with child time subtracted out
  SELECT s.trace_id, s.span_id, s.parent_span_id, s.service_name, s.span_name,
    s.duration_ms - coalesce(
    (
      SELECT sum(z.duration_ms)
      FROM ps_trace.span z
      WHERE s.trace_id = z.trace_id AND s.span_id = z.parent_span_id
    ), 0.0) as duration_ms  -- coalesce: leaf spans have no children -> subtract 0
  FROM ps_trace.span s
  WHERE $__timeFilter(s.start_time) AND s.service_name = '${service}' AND s.span_name = '${operation}'
  UNION ALL
  -- recursive step: descend to children, same exclusive-time calculation
  SELECT s.trace_id, s.span_id, s.parent_span_id, s.service_name, s.span_name,
    s.duration_ms - coalesce(
    (
      SELECT sum(z.duration_ms)
      FROM ps_trace.span z
      WHERE s.trace_id = z.trace_id AND s.span_id = z.parent_span_id
    ), 0.0) as duration_ms
  FROM x
  INNER JOIN ps_trace.span s ON (x.trace_id = s.trace_id AND x.span_id = s.parent_span_id)
)
SELECT service_name, span_name, sum(duration_ms) as total_exec_time
FROM x
GROUP BY service_name, span_name  -- named, not positional
ORDER BY total_exec_time DESC
Total execution time by operation
Total execution time by operation over time
-- Exclusive execution time per operation bucketed into 15-second intervals,
-- for a stacked time-series panel. Same downstream walk + child-duration
-- subtraction as the non-bucketed "total execution time" query.
WITH RECURSIVE x AS
(
  -- anchor: selected spans, bucketed, with child time subtracted out
  SELECT time_bucket('15 seconds', s.start_time) as time,
    s.trace_id, s.span_id, s.parent_span_id, s.service_name, s.span_name,
    s.duration_ms - coalesce(
    (
      SELECT sum(z.duration_ms)
      FROM ps_trace.span z
      WHERE s.trace_id = z.trace_id AND s.span_id = z.parent_span_id
    ), 0.0) as duration_ms  -- coalesce: leaf spans have no children -> subtract 0
  FROM ps_trace.span s
  WHERE $__timeFilter(s.start_time) AND s.service_name = '${service}' AND s.span_name = '${operation}'
  UNION ALL
  -- recursive step: descend to children, same calculation
  SELECT time_bucket('15 seconds', s.start_time) as time,
    s.trace_id, s.span_id, s.parent_span_id, s.service_name, s.span_name,
    s.duration_ms - coalesce(
    (
      SELECT sum(z.duration_ms)
      FROM ps_trace.span z
      WHERE s.trace_id = z.trace_id AND s.span_id = z.parent_span_id
    ), 0.0) as duration_ms
  FROM x
  INNER JOIN ps_trace.span s ON (x.trace_id = s.trace_id AND x.span_id = s.parent_span_id)
)
-- 'service span' concatenation labels each Grafana series
SELECT time, service_name || ' ' || span_name as series, sum(duration_ms) as exec_ms
FROM x
GROUP BY time, series ORDER BY time  -- output-column names, not positions
Total execution time by operation over time
Operation execution times
-- Per-operation summary over the downstream subtree of the selected
-- service/operation: total exclusive time, p50/p95/p99 (Timescale Toolkit
-- approximate percentiles), and error count.
WITH RECURSIVE x AS
(
  -- anchor: selected spans with child time subtracted (exclusive duration)
  SELECT s.trace_id, s.span_id, s.parent_span_id, s.service_name, s.span_name,
    s.duration_ms - coalesce(
    (
      SELECT sum(z.duration_ms)
      FROM ps_trace.span z
      WHERE s.trace_id = z.trace_id AND s.span_id = z.parent_span_id
    ), 0.0) as duration_ms,  -- coalesce: leaf spans have no children -> subtract 0
    s.status_code = 'STATUS_CODE_ERROR' as is_err
  FROM ps_trace.span s
  WHERE $__timeFilter(s.start_time) AND s.service_name = '${service}' AND s.span_name = '${operation}'
  UNION ALL
  -- recursive step: descend to children, same calculation
  SELECT s.trace_id, s.span_id, s.parent_span_id, s.service_name, s.span_name,
    s.duration_ms - coalesce(
    (
      SELECT sum(z.duration_ms)
      FROM ps_trace.span z
      WHERE s.trace_id = z.trace_id AND s.span_id = z.parent_span_id
    ), 0.0) as duration_ms,
    s.status_code = 'STATUS_CODE_ERROR' as is_err
  FROM x INNER JOIN ps_trace.span s ON (x.trace_id = s.trace_id AND x.span_id = s.parent_span_id)
)
SELECT service_name, span_name as operation,
  sum(duration_ms) as total_exec_time,
  approx_percentile(0.5, percentile_agg(duration_ms)) as p50,
  approx_percentile(0.95, percentile_agg(duration_ms)) as p95,
  approx_percentile(0.99, percentile_agg(duration_ms)) as p99,
  count(*) FILTER (WHERE x.is_err) as num_errors
FROM x
GROUP BY service_name, span_name ORDER BY total_exec_time DESC  -- named, not positional
Operation execution times
04
Questions?
Thank you!
Chat with us 👉 slack.timescale.com
tsdb.co/opentelemetry-demo

More Related Content

Similar to Dok Talks #115 - What More Can I Learn From My OpenTelemetry Traces?

Easing the Complex with SPBench framework
Easing the Complex with SPBench frameworkEasing the Complex with SPBench framework
Easing the Complex with SPBench frameworkadriano1mg
 
Postgresql Database Administration- Day4
Postgresql Database Administration- Day4Postgresql Database Administration- Day4
Postgresql Database Administration- Day4PoguttuezhiniVP
 
Do snow.rwn
Do snow.rwnDo snow.rwn
Do snow.rwnARUN DN
 
Beyond PHP - It's not (just) about the code
Beyond PHP - It's not (just) about the codeBeyond PHP - It's not (just) about the code
Beyond PHP - It's not (just) about the codeWim Godden
 
HBaseCon 2013: How (and Why) Phoenix Puts the SQL Back into NoSQL
HBaseCon 2013: How (and Why) Phoenix Puts the SQL Back into NoSQLHBaseCon 2013: How (and Why) Phoenix Puts the SQL Back into NoSQL
HBaseCon 2013: How (and Why) Phoenix Puts the SQL Back into NoSQLCloudera, Inc.
 
More Stored Procedures and MUMPS for DivConq
More Stored Procedures and  MUMPS for DivConqMore Stored Procedures and  MUMPS for DivConq
More Stored Procedures and MUMPS for DivConqeTimeline, LLC
 
SWP - A Generic Language Parser
SWP - A Generic Language ParserSWP - A Generic Language Parser
SWP - A Generic Language Parserkamaelian
 
Postgres performance for humans
Postgres performance for humansPostgres performance for humans
Postgres performance for humansCraig Kerstiens
 
Powering Heap With PostgreSQL And CitusDB (PGConf Silicon Valley 2015)
Powering Heap With PostgreSQL And CitusDB (PGConf Silicon Valley 2015)Powering Heap With PostgreSQL And CitusDB (PGConf Silicon Valley 2015)
Powering Heap With PostgreSQL And CitusDB (PGConf Silicon Valley 2015)Dan Robinson
 
Wait Events 10g
Wait Events 10gWait Events 10g
Wait Events 10gsagai
 
fog or: How I Learned to Stop Worrying and Love the Cloud (OpenStack Edition)
fog or: How I Learned to Stop Worrying and Love the Cloud (OpenStack Edition)fog or: How I Learned to Stop Worrying and Love the Cloud (OpenStack Edition)
fog or: How I Learned to Stop Worrying and Love the Cloud (OpenStack Edition)Wesley Beary
 
fog or: How I Learned to Stop Worrying and Love the Cloud
fog or: How I Learned to Stop Worrying and Love the Cloudfog or: How I Learned to Stop Worrying and Love the Cloud
fog or: How I Learned to Stop Worrying and Love the CloudWesley Beary
 
Refactoring to symfony components
Refactoring to symfony componentsRefactoring to symfony components
Refactoring to symfony componentsMichael Peacock
 
Neo4j after 1 year in production
Neo4j after 1 year in productionNeo4j after 1 year in production
Neo4j after 1 year in productionAndrew Nikishaev
 
Next Generation Solutions with Neo4j
Next Generation Solutions with Neo4jNext Generation Solutions with Neo4j
Next Generation Solutions with Neo4jNeo4j
 

Similar to Dok Talks #115 - What More Can I Learn From My OpenTelemetry Traces? (20)

Easing the Complex with SPBench framework
Easing the Complex with SPBench frameworkEasing the Complex with SPBench framework
Easing the Complex with SPBench framework
 
Postgresql Database Administration- Day4
Postgresql Database Administration- Day4Postgresql Database Administration- Day4
Postgresql Database Administration- Day4
 
Php functions
Php functionsPhp functions
Php functions
 
Tt subtemplates-caching
Tt subtemplates-cachingTt subtemplates-caching
Tt subtemplates-caching
 
Do snow.rwn
Do snow.rwnDo snow.rwn
Do snow.rwn
 
Beyond PHP - It's not (just) about the code
Beyond PHP - It's not (just) about the codeBeyond PHP - It's not (just) about the code
Beyond PHP - It's not (just) about the code
 
HBaseCon 2013: How (and Why) Phoenix Puts the SQL Back into NoSQL
HBaseCon 2013: How (and Why) Phoenix Puts the SQL Back into NoSQLHBaseCon 2013: How (and Why) Phoenix Puts the SQL Back into NoSQL
HBaseCon 2013: How (and Why) Phoenix Puts the SQL Back into NoSQL
 
More Stored Procedures and MUMPS for DivConq
More Stored Procedures and  MUMPS for DivConqMore Stored Procedures and  MUMPS for DivConq
More Stored Procedures and MUMPS for DivConq
 
SWP - A Generic Language Parser
SWP - A Generic Language ParserSWP - A Generic Language Parser
SWP - A Generic Language Parser
 
Postgres performance for humans
Postgres performance for humansPostgres performance for humans
Postgres performance for humans
 
Powering Heap With PostgreSQL And CitusDB (PGConf Silicon Valley 2015)
Powering Heap With PostgreSQL And CitusDB (PGConf Silicon Valley 2015)Powering Heap With PostgreSQL And CitusDB (PGConf Silicon Valley 2015)
Powering Heap With PostgreSQL And CitusDB (PGConf Silicon Valley 2015)
 
Wait Events 10g
Wait Events 10gWait Events 10g
Wait Events 10g
 
Writing Faster Python 3
Writing Faster Python 3Writing Faster Python 3
Writing Faster Python 3
 
fog or: How I Learned to Stop Worrying and Love the Cloud (OpenStack Edition)
fog or: How I Learned to Stop Worrying and Love the Cloud (OpenStack Edition)fog or: How I Learned to Stop Worrying and Love the Cloud (OpenStack Edition)
fog or: How I Learned to Stop Worrying and Love the Cloud (OpenStack Edition)
 
fog or: How I Learned to Stop Worrying and Love the Cloud
fog or: How I Learned to Stop Worrying and Love the Cloudfog or: How I Learned to Stop Worrying and Love the Cloud
fog or: How I Learned to Stop Worrying and Love the Cloud
 
Refactoring to symfony components
Refactoring to symfony componentsRefactoring to symfony components
Refactoring to symfony components
 
Learning Dtrace
Learning DtraceLearning Dtrace
Learning Dtrace
 
Postman On Steroids
Postman On SteroidsPostman On Steroids
Postman On Steroids
 
Neo4j after 1 year in production
Neo4j after 1 year in productionNeo4j after 1 year in production
Neo4j after 1 year in production
 
Next Generation Solutions with Neo4j
Next Generation Solutions with Neo4jNext Generation Solutions with Neo4j
Next Generation Solutions with Neo4j
 

More from DoKC

Distributed Vector Databases - What, Why, and How
Distributed Vector Databases - What, Why, and HowDistributed Vector Databases - What, Why, and How
Distributed Vector Databases - What, Why, and HowDoKC
 
Is It Safe? Security Hardening for Databases Using Kubernetes Operators
Is It Safe? Security Hardening for Databases Using Kubernetes OperatorsIs It Safe? Security Hardening for Databases Using Kubernetes Operators
Is It Safe? Security Hardening for Databases Using Kubernetes OperatorsDoKC
 
Stop Worrying and Keep Querying, Using Automated Multi-Region Disaster Recovery
Stop Worrying and Keep Querying, Using Automated Multi-Region Disaster RecoveryStop Worrying and Keep Querying, Using Automated Multi-Region Disaster Recovery
Stop Worrying and Keep Querying, Using Automated Multi-Region Disaster RecoveryDoKC
 
Transforming Data Processing with Kubernetes: Journey Towards a Self-Serve Da...
Transforming Data Processing with Kubernetes: Journey Towards a Self-Serve Da...Transforming Data Processing with Kubernetes: Journey Towards a Self-Serve Da...
Transforming Data Processing with Kubernetes: Journey Towards a Self-Serve Da...DoKC
 
The State of Stateful on Kubernetes
The State of Stateful on KubernetesThe State of Stateful on Kubernetes
The State of Stateful on KubernetesDoKC
 
Colocating Data Workloads and Web Services on Kubernetes to Improve Resource ...
Colocating Data Workloads and Web Services on Kubernetes to Improve Resource ...Colocating Data Workloads and Web Services on Kubernetes to Improve Resource ...
Colocating Data Workloads and Web Services on Kubernetes to Improve Resource ...DoKC
 
Make Your Kafka Cluster Production-Ready
Make Your Kafka Cluster Production-ReadyMake Your Kafka Cluster Production-Ready
Make Your Kafka Cluster Production-ReadyDoKC
 
Dynamic Large Scale Spark on Kubernetes: Empowering the Community with Argo W...
Dynamic Large Scale Spark on Kubernetes: Empowering the Community with Argo W...Dynamic Large Scale Spark on Kubernetes: Empowering the Community with Argo W...
Dynamic Large Scale Spark on Kubernetes: Empowering the Community with Argo W...DoKC
 
Run PostgreSQL in Warp Speed Using NVMe/TCP in the Cloud
Run PostgreSQL in Warp Speed Using NVMe/TCP in the CloudRun PostgreSQL in Warp Speed Using NVMe/TCP in the Cloud
Run PostgreSQL in Warp Speed Using NVMe/TCP in the CloudDoKC
 
The Kubernetes Native Database
The Kubernetes Native DatabaseThe Kubernetes Native Database
The Kubernetes Native DatabaseDoKC
 
ING Data Services hosted on ICHP DoK Amsterdam 2023
ING Data Services hosted on ICHP DoK Amsterdam 2023ING Data Services hosted on ICHP DoK Amsterdam 2023
ING Data Services hosted on ICHP DoK Amsterdam 2023DoKC
 
Implementing data and databases on K8s within the Dutch government
Implementing data and databases on K8s within the Dutch governmentImplementing data and databases on K8s within the Dutch government
Implementing data and databases on K8s within the Dutch governmentDoKC
 
StatefulSets in K8s - DoK Talks #154
StatefulSets in K8s - DoK Talks #154StatefulSets in K8s - DoK Talks #154
StatefulSets in K8s - DoK Talks #154DoKC
 
Running PostgreSQL in Kubernetes: from day 0 to day 2 with CloudNativePG - Do...
Running PostgreSQL in Kubernetes: from day 0 to day 2 with CloudNativePG - Do...Running PostgreSQL in Kubernetes: from day 0 to day 2 with CloudNativePG - Do...
Running PostgreSQL in Kubernetes: from day 0 to day 2 with CloudNativePG - Do...DoKC
 
Analytics with Apache Superset and ClickHouse - DoK Talks #151
Analytics with Apache Superset and ClickHouse - DoK Talks #151Analytics with Apache Superset and ClickHouse - DoK Talks #151
Analytics with Apache Superset and ClickHouse - DoK Talks #151DoKC
 
Overcoming challenges with protecting and migrating data in multi-cloud K8s e...
Overcoming challenges with protecting and migrating data in multi-cloud K8s e...Overcoming challenges with protecting and migrating data in multi-cloud K8s e...
Overcoming challenges with protecting and migrating data in multi-cloud K8s e...DoKC
 
Evaluating Cloud Native Storage Vendors - DoK Talks #147
Evaluating Cloud Native Storage Vendors - DoK Talks #147Evaluating Cloud Native Storage Vendors - DoK Talks #147
Evaluating Cloud Native Storage Vendors - DoK Talks #147DoKC
 
Kubernetes Cluster Upgrade Strategies and Data: Best Practices for your State...
Kubernetes Cluster Upgrade Strategies and Data: Best Practices for your State...Kubernetes Cluster Upgrade Strategies and Data: Best Practices for your State...
Kubernetes Cluster Upgrade Strategies and Data: Best Practices for your State...DoKC
 
We will Dok You! - The journey to adopt stateful workloads on k8s
We will Dok You! - The journey to adopt stateful workloads on k8sWe will Dok You! - The journey to adopt stateful workloads on k8s
We will Dok You! - The journey to adopt stateful workloads on k8sDoKC
 
Mastering MongoDB on Kubernetes, the power of operators
Mastering MongoDB on Kubernetes, the power of operators Mastering MongoDB on Kubernetes, the power of operators
Mastering MongoDB on Kubernetes, the power of operators DoKC
 

More from DoKC (20)

Distributed Vector Databases - What, Why, and How
Distributed Vector Databases - What, Why, and HowDistributed Vector Databases - What, Why, and How
Distributed Vector Databases - What, Why, and How
 
Is It Safe? Security Hardening for Databases Using Kubernetes Operators
Is It Safe? Security Hardening for Databases Using Kubernetes OperatorsIs It Safe? Security Hardening for Databases Using Kubernetes Operators
Is It Safe? Security Hardening for Databases Using Kubernetes Operators
 
Stop Worrying and Keep Querying, Using Automated Multi-Region Disaster Recovery
Stop Worrying and Keep Querying, Using Automated Multi-Region Disaster RecoveryStop Worrying and Keep Querying, Using Automated Multi-Region Disaster Recovery
Stop Worrying and Keep Querying, Using Automated Multi-Region Disaster Recovery
 
Transforming Data Processing with Kubernetes: Journey Towards a Self-Serve Da...
Transforming Data Processing with Kubernetes: Journey Towards a Self-Serve Da...Transforming Data Processing with Kubernetes: Journey Towards a Self-Serve Da...
Transforming Data Processing with Kubernetes: Journey Towards a Self-Serve Da...
 
The State of Stateful on Kubernetes
The State of Stateful on KubernetesThe State of Stateful on Kubernetes
The State of Stateful on Kubernetes
 
Colocating Data Workloads and Web Services on Kubernetes to Improve Resource ...
Colocating Data Workloads and Web Services on Kubernetes to Improve Resource ...Colocating Data Workloads and Web Services on Kubernetes to Improve Resource ...
Colocating Data Workloads and Web Services on Kubernetes to Improve Resource ...
 
Make Your Kafka Cluster Production-Ready
Make Your Kafka Cluster Production-ReadyMake Your Kafka Cluster Production-Ready
Make Your Kafka Cluster Production-Ready
 
Dynamic Large Scale Spark on Kubernetes: Empowering the Community with Argo W...
Dynamic Large Scale Spark on Kubernetes: Empowering the Community with Argo W...Dynamic Large Scale Spark on Kubernetes: Empowering the Community with Argo W...
Dynamic Large Scale Spark on Kubernetes: Empowering the Community with Argo W...
 
Run PostgreSQL in Warp Speed Using NVMe/TCP in the Cloud
Run PostgreSQL in Warp Speed Using NVMe/TCP in the CloudRun PostgreSQL in Warp Speed Using NVMe/TCP in the Cloud
Run PostgreSQL in Warp Speed Using NVMe/TCP in the Cloud
 
The Kubernetes Native Database
The Kubernetes Native DatabaseThe Kubernetes Native Database
The Kubernetes Native Database
 
ING Data Services hosted on ICHP DoK Amsterdam 2023
ING Data Services hosted on ICHP DoK Amsterdam 2023ING Data Services hosted on ICHP DoK Amsterdam 2023
ING Data Services hosted on ICHP DoK Amsterdam 2023
 
Implementing data and databases on K8s within the Dutch government
Implementing data and databases on K8s within the Dutch governmentImplementing data and databases on K8s within the Dutch government
Implementing data and databases on K8s within the Dutch government
 
StatefulSets in K8s - DoK Talks #154
StatefulSets in K8s - DoK Talks #154StatefulSets in K8s - DoK Talks #154
StatefulSets in K8s - DoK Talks #154
 
Running PostgreSQL in Kubernetes: from day 0 to day 2 with CloudNativePG - Do...
Running PostgreSQL in Kubernetes: from day 0 to day 2 with CloudNativePG - Do...Running PostgreSQL in Kubernetes: from day 0 to day 2 with CloudNativePG - Do...
Running PostgreSQL in Kubernetes: from day 0 to day 2 with CloudNativePG - Do...
 
Analytics with Apache Superset and ClickHouse - DoK Talks #151
Analytics with Apache Superset and ClickHouse - DoK Talks #151Analytics with Apache Superset and ClickHouse - DoK Talks #151
Analytics with Apache Superset and ClickHouse - DoK Talks #151
 
Overcoming challenges with protecting and migrating data in multi-cloud K8s e...
Overcoming challenges with protecting and migrating data in multi-cloud K8s e...Overcoming challenges with protecting and migrating data in multi-cloud K8s e...
Overcoming challenges with protecting and migrating data in multi-cloud K8s e...
 
Evaluating Cloud Native Storage Vendors - DoK Talks #147
Evaluating Cloud Native Storage Vendors - DoK Talks #147Evaluating Cloud Native Storage Vendors - DoK Talks #147
Evaluating Cloud Native Storage Vendors - DoK Talks #147
 
Kubernetes Cluster Upgrade Strategies and Data: Best Practices for your State...
Kubernetes Cluster Upgrade Strategies and Data: Best Practices for your State...Kubernetes Cluster Upgrade Strategies and Data: Best Practices for your State...
Kubernetes Cluster Upgrade Strategies and Data: Best Practices for your State...
 
We will Dok You! - The journey to adopt stateful workloads on k8s
We will Dok You! - The journey to adopt stateful workloads on k8sWe will Dok You! - The journey to adopt stateful workloads on k8s
We will Dok You! - The journey to adopt stateful workloads on k8s
 
Mastering MongoDB on Kubernetes, the power of operators
Mastering MongoDB on Kubernetes, the power of operators Mastering MongoDB on Kubernetes, the power of operators
Mastering MongoDB on Kubernetes, the power of operators
 

Recently uploaded

EY_Graph Database Powered Sustainability
EY_Graph Database Powered SustainabilityEY_Graph Database Powered Sustainability
EY_Graph Database Powered SustainabilityNeo4j
 
Adobe Marketo Engage Deep Dives: Using Webhooks to Transfer Data
Adobe Marketo Engage Deep Dives: Using Webhooks to Transfer DataAdobe Marketo Engage Deep Dives: Using Webhooks to Transfer Data
Adobe Marketo Engage Deep Dives: Using Webhooks to Transfer DataBradBedford3
 
Salesforce Certified Field Service Consultant
Salesforce Certified Field Service ConsultantSalesforce Certified Field Service Consultant
Salesforce Certified Field Service ConsultantAxelRicardoTrocheRiq
 
Hand gesture recognition PROJECT PPT.pptx
Hand gesture recognition PROJECT PPT.pptxHand gesture recognition PROJECT PPT.pptx
Hand gesture recognition PROJECT PPT.pptxbodapatigopi8531
 
Der Spagat zwischen BIAS und FAIRNESS (2024)
Der Spagat zwischen BIAS und FAIRNESS (2024)Der Spagat zwischen BIAS und FAIRNESS (2024)
Der Spagat zwischen BIAS und FAIRNESS (2024)OPEN KNOWLEDGE GmbH
 
Project Based Learning (A.I).pptx detail explanation
Project Based Learning (A.I).pptx detail explanationProject Based Learning (A.I).pptx detail explanation
Project Based Learning (A.I).pptx detail explanationkaushalgiri8080
 
Steps To Getting Up And Running Quickly With MyTimeClock Employee Scheduling ...
Steps To Getting Up And Running Quickly With MyTimeClock Employee Scheduling ...Steps To Getting Up And Running Quickly With MyTimeClock Employee Scheduling ...
Steps To Getting Up And Running Quickly With MyTimeClock Employee Scheduling ...MyIntelliSource, Inc.
 
DNT_Corporate presentation know about us
DNT_Corporate presentation know about usDNT_Corporate presentation know about us
DNT_Corporate presentation know about usDynamic Netsoft
 
(Genuine) Escort Service Lucknow | Starting ₹,5K To @25k with A/C 🧑🏽‍❤️‍🧑🏻 89...
(Genuine) Escort Service Lucknow | Starting ₹,5K To @25k with A/C 🧑🏽‍❤️‍🧑🏻 89...(Genuine) Escort Service Lucknow | Starting ₹,5K To @25k with A/C 🧑🏽‍❤️‍🧑🏻 89...
(Genuine) Escort Service Lucknow | Starting ₹,5K To @25k with A/C 🧑🏽‍❤️‍🧑🏻 89...gurkirankumar98700
 
TECUNIQUE: Success Stories: IT Service provider
TECUNIQUE: Success Stories: IT Service providerTECUNIQUE: Success Stories: IT Service provider
TECUNIQUE: Success Stories: IT Service providermohitmore19
 
The Essentials of Digital Experience Monitoring_ A Comprehensive Guide.pdf
The Essentials of Digital Experience Monitoring_ A Comprehensive Guide.pdfThe Essentials of Digital Experience Monitoring_ A Comprehensive Guide.pdf
The Essentials of Digital Experience Monitoring_ A Comprehensive Guide.pdfkalichargn70th171
 
Unveiling the Tech Salsa of LAMs with Janus in Real-Time Applications
Unveiling the Tech Salsa of LAMs with Janus in Real-Time ApplicationsUnveiling the Tech Salsa of LAMs with Janus in Real-Time Applications
Unveiling the Tech Salsa of LAMs with Janus in Real-Time ApplicationsAlberto González Trastoy
 
Engage Usergroup 2024 - The Good The Bad_The Ugly
Engage Usergroup 2024 - The Good The Bad_The UglyEngage Usergroup 2024 - The Good The Bad_The Ugly
Engage Usergroup 2024 - The Good The Bad_The UglyFrank van der Linden
 
Asset Management Software - Infographic
Asset Management Software - InfographicAsset Management Software - Infographic
Asset Management Software - InfographicHr365.us smith
 
Optimizing AI for immediate response in Smart CCTV
Optimizing AI for immediate response in Smart CCTVOptimizing AI for immediate response in Smart CCTV
Optimizing AI for immediate response in Smart CCTVshikhaohhpro
 
Building Real-Time Data Pipelines: Stream & Batch Processing workshop Slide
Building Real-Time Data Pipelines: Stream & Batch Processing workshop SlideBuilding Real-Time Data Pipelines: Stream & Batch Processing workshop Slide
Building Real-Time Data Pipelines: Stream & Batch Processing workshop SlideChristina Lin
 
Learn the Fundamentals of XCUITest Framework_ A Beginner's Guide.pdf
Learn the Fundamentals of XCUITest Framework_ A Beginner's Guide.pdfLearn the Fundamentals of XCUITest Framework_ A Beginner's Guide.pdf
Learn the Fundamentals of XCUITest Framework_ A Beginner's Guide.pdfkalichargn70th171
 
HR Software Buyers Guide in 2024 - HRSoftware.com
HR Software Buyers Guide in 2024 - HRSoftware.comHR Software Buyers Guide in 2024 - HRSoftware.com
HR Software Buyers Guide in 2024 - HRSoftware.comFatema Valibhai
 
Unlocking the Future of AI Agents with Large Language Models
Unlocking the Future of AI Agents with Large Language ModelsUnlocking the Future of AI Agents with Large Language Models
Unlocking the Future of AI Agents with Large Language Modelsaagamshah0812
 

Recently uploaded (20)

Call Girls In Mukherjee Nagar 📱 9999965857 🤩 Delhi 🫦 HOT AND SEXY VVIP 🍎 SE...
Call Girls In Mukherjee Nagar 📱  9999965857  🤩 Delhi 🫦 HOT AND SEXY VVIP 🍎 SE...Call Girls In Mukherjee Nagar 📱  9999965857  🤩 Delhi 🫦 HOT AND SEXY VVIP 🍎 SE...
Call Girls In Mukherjee Nagar 📱 9999965857 🤩 Delhi 🫦 HOT AND SEXY VVIP 🍎 SE...
 
EY_Graph Database Powered Sustainability
EY_Graph Database Powered SustainabilityEY_Graph Database Powered Sustainability
EY_Graph Database Powered Sustainability
 
Adobe Marketo Engage Deep Dives: Using Webhooks to Transfer Data
Adobe Marketo Engage Deep Dives: Using Webhooks to Transfer DataAdobe Marketo Engage Deep Dives: Using Webhooks to Transfer Data
Adobe Marketo Engage Deep Dives: Using Webhooks to Transfer Data
 
Salesforce Certified Field Service Consultant
Salesforce Certified Field Service ConsultantSalesforce Certified Field Service Consultant
Salesforce Certified Field Service Consultant
 
Hand gesture recognition PROJECT PPT.pptx
Hand gesture recognition PROJECT PPT.pptxHand gesture recognition PROJECT PPT.pptx
Hand gesture recognition PROJECT PPT.pptx
 
Der Spagat zwischen BIAS und FAIRNESS (2024)
Der Spagat zwischen BIAS und FAIRNESS (2024)Der Spagat zwischen BIAS und FAIRNESS (2024)
Der Spagat zwischen BIAS und FAIRNESS (2024)
 
Project Based Learning (A.I).pptx detail explanation
Project Based Learning (A.I).pptx detail explanationProject Based Learning (A.I).pptx detail explanation
Project Based Learning (A.I).pptx detail explanation
 
Steps To Getting Up And Running Quickly With MyTimeClock Employee Scheduling ...
Steps To Getting Up And Running Quickly With MyTimeClock Employee Scheduling ...Steps To Getting Up And Running Quickly With MyTimeClock Employee Scheduling ...
Steps To Getting Up And Running Quickly With MyTimeClock Employee Scheduling ...
 
DNT_Corporate presentation know about us
DNT_Corporate presentation know about usDNT_Corporate presentation know about us
DNT_Corporate presentation know about us
 
(Genuine) Escort Service Lucknow | Starting ₹,5K To @25k with A/C 🧑🏽‍❤️‍🧑🏻 89...
(Genuine) Escort Service Lucknow | Starting ₹,5K To @25k with A/C 🧑🏽‍❤️‍🧑🏻 89...(Genuine) Escort Service Lucknow | Starting ₹,5K To @25k with A/C 🧑🏽‍❤️‍🧑🏻 89...
(Genuine) Escort Service Lucknow | Starting ₹,5K To @25k with A/C 🧑🏽‍❤️‍🧑🏻 89...
 
TECUNIQUE: Success Stories: IT Service provider
TECUNIQUE: Success Stories: IT Service providerTECUNIQUE: Success Stories: IT Service provider
TECUNIQUE: Success Stories: IT Service provider
 
The Essentials of Digital Experience Monitoring_ A Comprehensive Guide.pdf
The Essentials of Digital Experience Monitoring_ A Comprehensive Guide.pdfThe Essentials of Digital Experience Monitoring_ A Comprehensive Guide.pdf
The Essentials of Digital Experience Monitoring_ A Comprehensive Guide.pdf
 
Unveiling the Tech Salsa of LAMs with Janus in Real-Time Applications
Unveiling the Tech Salsa of LAMs with Janus in Real-Time ApplicationsUnveiling the Tech Salsa of LAMs with Janus in Real-Time Applications
Unveiling the Tech Salsa of LAMs with Janus in Real-Time Applications
 
Engage Usergroup 2024 - The Good The Bad_The Ugly
Engage Usergroup 2024 - The Good The Bad_The UglyEngage Usergroup 2024 - The Good The Bad_The Ugly
Engage Usergroup 2024 - The Good The Bad_The Ugly
 
Asset Management Software - Infographic
Asset Management Software - InfographicAsset Management Software - Infographic
Asset Management Software - Infographic
 
Optimizing AI for immediate response in Smart CCTV
Optimizing AI for immediate response in Smart CCTVOptimizing AI for immediate response in Smart CCTV
Optimizing AI for immediate response in Smart CCTV
 
Building Real-Time Data Pipelines: Stream & Batch Processing workshop Slide
Building Real-Time Data Pipelines: Stream & Batch Processing workshop SlideBuilding Real-Time Data Pipelines: Stream & Batch Processing workshop Slide
Building Real-Time Data Pipelines: Stream & Batch Processing workshop Slide
 
Learn the Fundamentals of XCUITest Framework_ A Beginner's Guide.pdf
Learn the Fundamentals of XCUITest Framework_ A Beginner's Guide.pdfLearn the Fundamentals of XCUITest Framework_ A Beginner's Guide.pdf
Learn the Fundamentals of XCUITest Framework_ A Beginner's Guide.pdf
 
HR Software Buyers Guide in 2024 - HRSoftware.com
HR Software Buyers Guide in 2024 - HRSoftware.comHR Software Buyers Guide in 2024 - HRSoftware.com
HR Software Buyers Guide in 2024 - HRSoftware.com
 
Unlocking the Future of AI Agents with Large Language Models
Unlocking the Future of AI Agents with Large Language ModelsUnlocking the Future of AI Agents with Large Language Models
Unlocking the Future of AI Agents with Large Language Models
 

Dok Talks #115 - What More Can I Learn From My OpenTelemetry Traces?

  • 1. What More Can I Learn from My OpenTelemetry Traces? Analyzing traces with SQL February 1, 2022 John Pruitt
  • 2. What we will cover today 👀 01 02 Context: Tracing..? 🤔 Our Example System ● Request Rates / Error Rates ● Request Durations ● Service Dependencies ● Upstream Spans ● Downstream Spans 04 Q&A! 03 Building Grafana Dashboards for Traces
  • 5. A trace is a tree of spans SPAN1 SPAN2 SPAN3 SPAN4 SPAN5 SPAN6 SPAN7 SPAN8 SPAN9 SPAN10
  • 6. A trace is also a time series SPAN1 SPAN2 SPAN3 SPAN4 SPAN5 SPAN6 SPAN7 SPAN8 SPAN9 SPAN10 TIME
  • 7. A trace in Jaeger 👇
  • 8. By the end of this talk, you will be able to build this 🙌
  • 9. 02 Our Example System An Absurd Password Generator
  • 11. 03 Building Grafana Dashboards for Traces Using simple (but powerful) SQL
  • 12.
  • 13. Request Rates / Error Rates A simple dashboard to get started
  • 14. 1 Requests per second
  • 15. SELECT time_bucket('1 minute', start_time) as time, count(*) / 60.0 as req_per_sec FROM ps_trace.span s WHERE s.start_time >= now() - interval '5 minutes' AND parent_span_id is null -- just the root spans GROUP BY 1 ORDER BY 1 1 Requests per second
  • 16. 2 Requests per second
  • 17. SELECT time_bucket('1 second', start_time) as time, count(*) as req_per_sec FROM ps_trace.span s WHERE s.start_time >= now() - interval '5 minutes' AND parent_span_id is null -- just the root spans GROUP BY 1 ORDER BY 1 2 Requests per second
  • 19. Errors by service SELECT service_name, count(*) as num_err FROM ps_trace.span WHERE $__timeFilter(start_time) AND status_code = 'STATUS_CODE_ERROR' GROUP BY 1
  • 21. Errors by operation SELECT x.service_name, x.span_name, x.num_err::numeric / x.num_total as err_rate FROM ( SELECT service_name, span_name, count(*) filter (where status_code = 'STATUS_CODE_ERROR') as num_err, count(*) as num_total FROM ps_trace.span WHERE $__timeFilter(start_time) GROUP BY 1, 2 ) x ORDER BY err_rate desc
  • 22. Error rates by operation over time
  • 23. Error rates by operation over time SELECT x.time, x.service_name, x.span_name, x.num_err::numeric / x.num_total as err_rate FROM ( SELECT time_bucket('1 minute', start_time) as time, service_name, span_name, count(*) filter (where status_code = 'STATUS_CODE_ERROR') as num_err, count(*) as num_total FROM ps_trace.span WHERE $__timeFilter(start_time) GROUP BY 1, 2, 3 ) x ORDER BY time
  • 24. Request Durations A somewhat more complex dashboard…
  • 27. SELECT trace_id, start_time, duration_ms FROM ps_trace.span WHERE $__timeFilter(start_time) AND parent_span_id is null ORDER BY duration_ms DESC LIMIT 10 Slowest requests
  • 28. Histogram of request durations
  • 29. Histogram of request durations SELECT duration_ms FROM ps_trace.span WHERE $__timeFilter(start_time) AND parent_span_id is null
  • 30. Distribution of request durations over time
  • 31. Distribution of request durations over time SELECT start_time as time, duration_ms FROM ps_trace.span WHERE $__timeFilter(start_time) AND parent_span_id is null ORDER BY 1
  • 33. SELECT r.time, 'p' || lpad((p.p * 100.0)::int::text, 2, '0') as percentile, approx_percentile(p.p, percentile_agg(r.duration_ms)) as duration FROM ( SELECT time_bucket('1 minute', start_time) as time, duration_ms FROM ps_trace.span WHERE $__timeFilter(start_time) AND parent_span_id is null ) r CROSS JOIN ( SELECT unnest(ARRAY[.01, .5, .75, .9, .95, .99]) as p ) p GROUP BY r.time, p.p ORDER BY r.time Request duration percentiles over time
  • 36. SELECT value#>>'{}' as id, value#>>'{}' as title FROM _ps_trace.tag WHERE key = 'service.name' SELECT p.service_name || '->' || k.service_name as id, p.service_name as source, k.service_name as target, k.span_name as "mainStat", count(*) as "secondaryStat" FROM ps_trace.span p INNER JOIN ps_trace.span k ON (p.trace_id = k.trace_id AND p.span_id = k.parent_span_id AND p.service_name != k.service_name) WHERE $__timeFilter(p.start_time) GROUP BY 1, 2, 3, 4 Service dependencies
  • 38. SELECT p.service_name as source, k.service_name as target, k.span_name, count(*) as calls, sum(k.duration_ms) as total_exec_ms, avg(k.duration_ms) as avg_exec_ms FROM ps_trace.span p INNER JOIN ps_trace.span k ON (p.trace_id = k.trace_id AND p.span_id = k.parent_span_id AND p.service_name != k.service_name) WHERE $__timeFilter(p.start_time) GROUP BY 1, 2, 3 ORDER BY total_exec_ms DESC Service dependencies
  • 39. Upstream Spans Analyzing the traces’ tree structure
  • 41. WITH RECURSIVE x AS ( SELECT trace_id, span_id, parent_span_id, service_name, span_name FROM ps_trace.span WHERE $__timeFilter(start_time) AND service_name = '${service}' AND span_name = '${operation}' UNION ALL SELECT s.trace_id, s.span_id, s.parent_span_id, s.service_name, s.span_name FROM x INNER JOIN ps_trace.span s ON (x.trace_id = s.trace_id AND x.parent_span_id = s.span_id) ) SELECT md5(service_name || '-' || span_name) as id, span_name as title, service_name as "subTitle", count(*) as "mainStat" FROM x GROUP BY service_name, span_name Upstream spans (nodes)
  • 42. WITH RECURSIVE x AS ( SELECT trace_id, span_id, parent_span_id, service_name, span_name, null::text as id, null::text as target, null::text as source FROM ps_trace.span WHERE $__timeFilter(start_time) AND service_name = '${service}' AND span_name = '${operation}' UNION ALL SELECT s.trace_id, s.span_id, s.parent_span_id, s.service_name, s.span_name, md5(s.service_name || '-' || s.span_name || '-' || x.service_name || '-' || x.span_name) as id, md5(x.service_name || '-' || x.span_name) as target, md5(s.service_name || '-' || s.span_name) as source FROM x INNER JOIN ps_trace.span s ON (x.trace_id = s.trace_id AND x.parent_span_id = s.span_id) ) SELECT DISTINCT x.id, x.target, x.source FROM x WHERE id is not null Upstream spans (edges)
  • 43. Downstream Spans Analyzing both the tree + the time series
  • 45. WITH RECURSIVE x AS ( SELECT trace_id, span_id, parent_span_id, service_name, span_name FROM ps_trace.span WHERE $__timeFilter(start_time) AND service_name = '${service}' AND span_name = '${operation}' UNION ALL SELECT s.trace_id, s.span_id, s.parent_span_id, s.service_name, s.span_name FROM x INNER JOIN ps_trace.span s ON (x.trace_id = s.trace_id AND x.span_id = s.parent_span_id) ) SELECT md5(service_name || '-' || span_name) as id, span_name as title, service_name as "subTitle", count(*) as "mainStat" FROM x GROUP BY service_name, span_name Downstream spans (nodes)
  • 46. WITH RECURSIVE x AS ( SELECT trace_id, span_id, parent_span_id, service_name, span_name, null::text as id, null::text as source, null::text as target FROM ps_trace.span WHERE $__timeFilter(start_time) AND service_name = '${service}' AND span_name = '${operation}' UNION ALL SELECT s.trace_id, s.span_id, s.parent_span_id, s.service_name, s.span_name, md5(s.service_name || '-' || s.span_name || '-' || x.service_name || '-' || x.span_name) as id, md5(x.service_name || '-' || x.span_name) as source, md5(s.service_name || '-' || s.span_name) as target FROM x INNER JOIN ps_trace.span s ON (x.trace_id = s.trace_id AND x.span_id = s.parent_span_id) ) SELECT DISTINCT x.id, x.source, x.target FROM x WHERE id is not null Downstream spans (edges)
  • 47. Total execution time by operation
  • 48. WITH RECURSIVE x AS ( SELECT s.trace_id, s.span_id, s.parent_span_id, s.service_name, s.span_name, s.duration_ms - coalesce( ( SELECT sum(z.duration_ms) FROM ps_trace.span z WHERE s.trace_id = z.trace_id AND s.span_id = z.parent_span_id ), 0.0) as duration_ms FROM ps_trace.span s WHERE $__timeFilter(s.start_time) AND s.service_name = '${service}' AND s.span_name = '${operation}' UNION ALL SELECT s.trace_id, s.span_id, s.parent_span_id, s.service_name, s.span_name, s.duration_ms - coalesce( ( SELECT sum(z.duration_ms) FROM ps_trace.span z WHERE s.trace_id = z.trace_id AND s.span_id = z.parent_span_id ), 0.0) as duration_ms FROM x INNER JOIN ps_trace.span s ON (x.trace_id = s.trace_id AND x.span_id = s.parent_span_id) ) SELECT service_name, span_name, sum(duration_ms) as total_exec_time FROM x GROUP BY 1, 2 ORDER BY 3 DESC Total execution time by operation
  • 49. Total execution time by operation over time
  • 50. WITH RECURSIVE x AS ( SELECT time_bucket('15 seconds', s.start_time) as time, s.trace_id, s.span_id, s.parent_span_id, s.service_name, s.span_name, s.duration_ms - coalesce( ( SELECT sum(z.duration_ms) FROM ps_trace.span z WHERE s.trace_id = z.trace_id AND s.span_id = z.parent_span_id ), 0.0) as duration_ms FROM ps_trace.span s WHERE $__timeFilter(s.start_time) AND s.service_name = '${service}' AND s.span_name = '${operation}' UNION ALL SELECT time_bucket('15 seconds', s.start_time) as time, s.trace_id, s.span_id, s.parent_span_id, s.service_name, s.span_name, s.duration_ms - coalesce( ( SELECT sum(z.duration_ms) FROM ps_trace.span z WHERE s.trace_id = z.trace_id AND s.span_id = z.parent_span_id ), 0.0) as duration_ms FROM x INNER JOIN ps_trace.span s ON (x.trace_id = s.trace_id AND x.span_id = s.parent_span_id) ) SELECT time, service_name || ' ' || span_name as series, sum(duration_ms) as exec_ms FROM x GROUP BY 1, 2 ORDER BY 1 Total execution time by operation over time
  • 52. WITH RECURSIVE x AS ( SELECT s.trace_id, s.span_id, s.parent_span_id, s.service_name, s.span_name, s.duration_ms - coalesce( ( SELECT sum(z.duration_ms) FROM ps_trace.span z WHERE s.trace_id = z.trace_id AND s.span_id = z.parent_span_id ), 0.0) as duration_ms, s.status_code = 'STATUS_CODE_ERROR' as is_err FROM ps_trace.span s WHERE $__timeFilter(s.start_time) AND s.service_name = '${service}' AND s.span_name = '${operation}' UNION ALL SELECT s.trace_id, s.span_id, s.parent_span_id, s.service_name, s.span_name, s.duration_ms - coalesce( ( SELECT sum(z.duration_ms) FROM ps_trace.span z WHERE s.trace_id = z.trace_id AND s.span_id = z.parent_span_id ), 0.0) as duration_ms, s.status_code = 'STATUS_CODE_ERROR' as is_err FROM x INNER JOIN ps_trace.span s ON (x.trace_id = s.trace_id AND x.span_id = s.parent_span_id) ) SELECT service_name, span_name as operation, sum(duration_ms) as total_exec_time, approx_percentile(0.5, percentile_agg(duration_ms)) as p50, approx_percentile(0.95, percentile_agg(duration_ms)) as p95, approx_percentile(0.99, percentile_agg(duration_ms)) as p99, count(*) FILTER (WHERE x.is_err) as num_errors FROM x GROUP BY 1, 2 ORDER BY 3 DESC Operation execution times
  • 54. Thank you! Chat with us 👉 slack.timescale.com tsdb.co/opentelemetry-demo