SlideShare a Scribd company logo
1 of 67
Download to read offline
November 13, 2014 | Las Vegas, NV 
Timon Karnezos, Director Infrastructure, Neustar 
Vidhya Srinivasan, Sr. Manager Software Development, Amazon Redshift
Petabyte scale 
Massively parallel
Relational data warehouse
Fully managed; zero admin
10 GigE 
(HPC) 
Ingestion 
Backup 
Restore 
JDBC/ODBC
Ad Tech Use Cases
692.8s 
34.9s 
< 0.76%
00 
01 
10 
11 
00 
01 
10 
11 
P 
Space filling Curve for Two Dimensions
Frequency
Attribution
Overlap
Ad-hoc
0.7B 
/ day 
2B 
/ week 
8B 
/ month 
21B 
/ quarter
-- Frequency report: buckets users by how many impressions they received and
-- sums cost and revenue per bucket.
-- NOTE: <...> is the slide's placeholder for the date range; fill it in before
-- running.

-- Number of ads seen per user
WITH frequency_intermediate AS (
    SELECT
        user_id,
        SUM(1)       AS impression_count,
        SUM(cost)    AS cost,
        SUM(revenue) AS revenue
    FROM impressions
    WHERE record_date BETWEEN <...>
    GROUP BY user_id
)
-- Number of people who saw N ads
SELECT
    impression_count,
    SUM(1),
    SUM(cost),
    SUM(revenue)
FROM frequency_intermediate
GROUP BY impression_count;
-- Table with date, campaign, site and user identifiers plus impression_count,
-- cost and revenue columns; user_id is the distribution key.
-- NOTE(review): the table name, every ENCODE value and the SORTKEY columns were
-- missing from the source text; <...> marks each gap. The name is presumably
-- frequency_intermediate (a later query reads exactly these columns from a
-- table of that name) -- confirm before use.
CREATE TABLE <...> (
    record_date      date   ENCODE <...> NOT NULL,
    campaign_id      bigint ENCODE <...> NOT NULL,
    site_id          bigint ENCODE <...> NOT NULL,
    user_id          bigint ENCODE <...> NOT NULL DISTKEY,
    impression_count int    ENCODE <...> NOT NULL,
    cost             bigint ENCODE <...> NOT NULL,
    revenue          bigint ENCODE <...> NOT NULL
)
SORTKEY (<...>, <...>, <...>, <...>);
-- Frequency report per campaign and site: first total each user's impressions,
-- cost and revenue per (campaign, site), then count users and sum cost and
-- revenue for every (campaign, site, frequency) combination.
-- NOTE: <...> is the slide's placeholder for the date range.
WITH user_frequency AS (
    SELECT
        user_id,
        campaign_id,
        site_id,
        SUM(impression_count) AS frequency,
        SUM(cost)             AS cost,
        SUM(revenue)          AS revenue
    FROM frequency_intermediate
    WHERE record_date BETWEEN <...>
    GROUP BY user_id, campaign_id, site_id
)
SELECT
    campaign_id,
    site_id,
    frequency,
    SUM(1),
    SUM(cost),
    SUM(revenue)
FROM user_frequency
GROUP BY campaign_id, site_id, frequency;
-- Basic sessionization query, assemble user activity
-- that ended in a conversion into a timeline.
-- Pairs every impression with each later conversion by the same user.
-- NOTE: <...> is the slide's placeholder for the select list.
SELECT <...>
FROM impressions AS i
INNER JOIN conversions AS c
    ON  i.user_id = c.user_id
    AND i.record_date < c.record_date
ORDER BY i.record_date;
Position: 1 
Position: 2 
Position: 3
Hour offset: 3 
Position: 1 
Position: 2 
Hour offset: 12 
Hour offset: 16 
Position: 3
-- Sessionize user activity per conversion, partition by campaign (45-day lookback window)
-- Emits one row per (impression, conversion) pair for the same user and
-- campaign where the impression falls strictly inside the 45 days before the
-- conversion.
-- NOTE: <...> is the slide's placeholder for the conversion date range.
SELECT
    c.record_date AS conversion_date,
    c.event_id    AS conversion_id,
    i.campaign_id AS campaign_id,
    i.site_id     AS site_id,
    i.user_id     AS user_id,
    c.revenue     AS conversion_revenue,
    DATEDIFF('hour', i.record_date, c.record_date) AS hour_offset,
    -- Running row count over impressions ordered newest-first, so position 1
    -- is the impression closest to the conversion.
    SUM(1) OVER (
        PARTITION BY i.user_id, i.campaign_id, c.event_id
        ORDER BY i.record_date DESC
        ROWS UNBOUNDED PRECEDING
    ) AS position
FROM impressions AS i
INNER JOIN conversions AS c
    ON  i.user_id = c.user_id
    AND i.campaign_id = c.campaign_id
    AND i.record_date < c.record_date
    AND i.record_date > (c.record_date - INTERVAL '45 days')
    AND c.record_date BETWEEN <...>;
-- Compute statistics on sessions (funnel placement, last-touch, site-count, etc...)
--
-- NOTE(review): assumes sessions holds the output of the sessionization query
-- (columns conversion_date, conversion_id, campaign_id, site_id, user_id,
-- conversion_revenue, hour_offset, position) -- confirm. Under that
-- assumption one session is a (user_id, campaign_id, conversion_id) group and
-- position = 1 is the impression closest to the conversion.
--
-- The original text referenced aliases i. and c. that do not exist here and
-- wrapped COUNT(DISTINCT ...) OVER (...) inside AVG(); PostgreSQL-family
-- engines reject both. The running distinct-site count is therefore built in
-- two steps: flag the earliest impression of each site within a session, then
-- take a running sum of those flags from the earliest impression forward.
WITH site_first_touches AS (
    SELECT
        campaign_id,
        site_id,
        conversion_date,
        user_id,
        conversion_id,
        conversion_revenue,
        position,
        -- Highest position = earliest impression, hence ORDER BY position DESC.
        CASE
            WHEN ROW_NUMBER() OVER (
                     PARTITION BY user_id, campaign_id, conversion_id, site_id
                     ORDER BY position DESC
                 ) = 1 THEN 1
            ELSE 0
        END AS is_first_site_touch
    FROM sessions
),
session_site_counts AS (
    SELECT
        campaign_id,
        site_id,
        conversion_date,
        conversion_revenue,
        position,
        -- Distinct sites seen in the session up to and including this row.
        SUM(is_first_site_touch) OVER (
            PARTITION BY user_id, campaign_id, conversion_id
            ORDER BY position DESC
            ROWS UNBOUNDED PRECEDING
        ) AS unique_preceding_site_count
    FROM site_first_touches
)
SELECT
    campaign_id,
    site_id,
    conversion_date,
    AVG(position) AS average_position,
    -- Last-touch attribution: only the position-1 row carries the revenue.
    SUM(conversion_revenue * (position = 1)::int) AS lta_attributed,
    AVG(unique_preceding_site_count) AS average_unique_preceding_site_count
FROM session_site_counts
GROUP BY campaign_id, site_id, conversion_date;
Site A 
Site B 
Site C
Site A 
20% 
60% 
Site B 
90% 
Site C 
CPM 
$0.06 
$1.05 
$9.50
Site A 
Site B 
Site C
Site A 
20% 
60% 
Site B 
90% 
Site C 
CPM 
$0.06 
$1.05 
$9.50
Site A 
Site B 
Site C
Site A 
20% 
60% 
Site B 
90% 
Site C 
CPM 
$0.06 
$1.05 
$9.50
-- Two-column (user_id, site_id) table; user_id is the distribution key.
-- NOTE(review): the table name, ENCODE values and SORTKEY column were missing
-- from the source text; <...> marks each gap. The name is presumably
-- overlap_intermediate (the table the co-occurrence query reads) -- confirm.
CREATE TABLE <...> (
    user_id bigint ENCODE <...> NOT NULL DISTKEY,
    site_id bigint ENCODE <...> NOT NULL
)
SORTKEY (<...>);
-- Site overlap: counts, for every pair of sites, the rows where the same user
-- appears on both. The strict > on site_id keeps each unordered pair once and
-- excludes a site paired with itself.
-- NOTE(review): the source joined on ak_user_id, but the table definition and
-- every other query in this file name the column user_id; user_id is used
-- here -- confirm against the real schema.
WITH co_occurences AS (
    SELECT
        oi.site_id  AS site1,
        oi2.site_id AS site2
    FROM overlap_intermediate AS oi
    INNER JOIN overlap_intermediate AS oi2
        ON  oi.site_id > oi2.site_id
        AND oi.user_id = oi2.user_id
)
SELECT
    site1,
    site2,
    SUM(1)
FROM co_occurences
GROUP BY site1, site2;
-- Table with record_date, campaign_id, site_id and user_id columns; user_id is
-- the distribution key.
-- NOTE(review): the table name, ENCODE values and SORTKEY columns were missing
-- from the source text; <...> marks each gap. The name is presumably
-- overlap_intermediate (the per-campaign overlap query filters a table of that
-- name on record_date) -- confirm.
CREATE TABLE <...> (
    record_date date   ENCODE <...> NOT NULL,
    campaign_id bigint ENCODE <...> NOT NULL,
    site_id     bigint ENCODE <...> NOT NULL,
    user_id     bigint ENCODE <...> NOT NULL DISTKEY
)
SORTKEY (<...>, <...>);
-- Per-campaign site overlap: counts users seen on both sites of each site pair
-- within the same campaign.
-- NOTE: <...> is the slide's placeholder for the date range.
WITH
-- One row per distinct (user, site, campaign) in the date range; the GROUP BY
-- only de-duplicates.
site_overlap_intermediate AS (
    SELECT
        user_id,
        site_id,
        campaign_id
    FROM overlap_intermediate
    WHERE record_date BETWEEN <...>
    GROUP BY user_id, site_id, campaign_id
),
-- Self-join on user and campaign; the strict > keeps each unordered site pair
-- once. The join uses user_id because that is the only user column the CTE
-- above exposes (the source text referenced ak_user_id, which does not exist
-- in it).
site_co_occurences AS (
    SELECT
        oi.campaign_id AS c_id,
        oi.site_id     AS site1,
        oi2.site_id    AS site2
    FROM site_overlap_intermediate AS oi
    INNER JOIN site_overlap_intermediate AS oi2
        ON  oi.site_id > oi2.site_id
        AND oi.user_id = oi2.user_id
        AND oi.campaign_id = oi2.campaign_id
)
SELECT
    c_id,
    site1,
    site2,
    SUM(1)
FROM site_co_occurences
GROUP BY c_id, site1, site2;
8 
fact tables 
26 
dimension tables 
7 
mapping tables
42 
views 
121 
joins 
1100 
sloc
# Console transcript: "$" lines are shell commands, ">" lines are typed at the
# SQL client prompt. Dump two tables, split the archive into schema and data,
# copy both to S3, then load them.
# NOTE(review): "\i" restores a backslash that appears to have been lost in
# extraction (the source read "i schema.sql") -- confirm.
$ pg_dump -Fc some_file --table=foo --table=bar
$ pg_restore --schema-only --clean -Fc some_file > schema.sql
$ pg_restore --data-only --table=foo -Fc some_file > foo.tsv
$ aws s3 cp schema.sql s3://metadata-bucket/YYYYMMDD/schema.sql
$ aws s3 cp foo.tsv s3://metadata-bucket/YYYYMMDD/foo.tsv
> \i schema.sql
> COPY foo FROM 's3://metadata-bucket/YYYYMMDD/foo.tsv' <...>
# or combine 'COPY <...> FROM <...> SSH' and pg_restore/psql
-- Export the previous day's impressions for one client to S3 as gzipped,
-- quoted, comma-delimited files with a manifest.
-- {{...}} tokens are template variables and <...> is the slide's placeholder
-- for the client id; both must be substituted before the statement is run.
-- The query is passed to UNLOAD as a string literal, so every single quote
-- inside it is doubled.
-- NOTE(review): NULL 'N' is kept as found; the original may have been '\N'
-- with the backslash lost in extraction -- confirm.
UNLOAD ('
    SELECT i.*
    FROM impressions i
    JOIN client_to_campaign_mapping m ON
        m.campaign_id = i.campaign_id
    WHERE i.record_date >= ''{{yyyy}}-{{mm}}-{{dd}}'' - interval ''1 day'' AND
          i.record_date <  ''{{yyyy}}-{{mm}}-{{dd}}'' AND
          m.client_id = <...>
')
TO 's3://{{bucket}}/us_eastern/{{yyyy}}/{{mm}}/{{dd}}/dsdk_events/{{vers}}/impressions/'
WITH CREDENTIALS 'aws_access_key_id={{key}};aws_secret_access_key={{secret}}'
DELIMITER ',' NULL 'N' ADDQUOTES ESCAPE GZIP MANIFEST;
Workload 
Node Count 
Node Type 
Restore 
Maint. 
Exec. 
Frequency 
& Attribution 
& Overlap 
& Ad Hoc
16 
dw2.8xlarge 
2h 
1h 
6h 
= $691.20
Workload 
Node Count 
Node Type 
Restore 
Maint. 
Exec. 
Frequency 
8 
dw2.8xlarge 
1.5h 
0.5h 
2.5h 
Attribution 
8 
dw2.8xlarge 
1.5h 
0.5h 
2h 
Overlap 
8 
dw2.8xlarge 
1h 
0.5h 
2.5h 
Ad-hoc 
8 
dw2.8xlarge 
0h 
0.5h 
1.5h 
= $556.80 
(-19%)
http://bit.ly/awsevals

More Related Content

Viewers also liked

Machine learning with Spark
Machine learning with SparkMachine learning with Spark
Machine learning with SparkKhalid Salama
 
(DVO203) The Life of a Netflix Engineer Using 37% of the Internet
(DVO203) The Life of a Netflix Engineer Using 37% of the Internet(DVO203) The Life of a Netflix Engineer Using 37% of the Internet
(DVO203) The Life of a Netflix Engineer Using 37% of the InternetAmazon Web Services
 
Programmatic Media Scenario
Programmatic Media ScenarioProgrammatic Media Scenario
Programmatic Media ScenarioMediaMath
 
(BDT309) Data Science & Best Practices for Apache Spark on Amazon EMR
(BDT309) Data Science & Best Practices for Apache Spark on Amazon EMR(BDT309) Data Science & Best Practices for Apache Spark on Amazon EMR
(BDT309) Data Science & Best Practices for Apache Spark on Amazon EMRAmazon Web Services
 
Best Practices for Using Apache Spark on AWS
Best Practices for Using Apache Spark on AWSBest Practices for Using Apache Spark on AWS
Best Practices for Using Apache Spark on AWSAmazon Web Services
 
Consolidate MySQL Shards Into Amazon Aurora Using AWS Database Migration Serv...
Consolidate MySQL Shards Into Amazon Aurora Using AWS Database Migration Serv...Consolidate MySQL Shards Into Amazon Aurora Using AWS Database Migration Serv...
Consolidate MySQL Shards Into Amazon Aurora Using AWS Database Migration Serv...Amazon Web Services
 
(ISM213) Building and Deploying a Modern Big Data Architecture on AWS
(ISM213) Building and Deploying a Modern Big Data Architecture on AWS(ISM213) Building and Deploying a Modern Big Data Architecture on AWS
(ISM213) Building and Deploying a Modern Big Data Architecture on AWSAmazon Web Services
 
10 R Packages to Win Kaggle Competitions
10 R Packages to Win Kaggle Competitions10 R Packages to Win Kaggle Competitions
10 R Packages to Win Kaggle CompetitionsDataRobot
 
Myths and Mathemagical Superpowers of Data Scientists
Myths and Mathemagical Superpowers of Data ScientistsMyths and Mathemagical Superpowers of Data Scientists
Myths and Mathemagical Superpowers of Data ScientistsDavid Pittman
 
How to Become a Data Scientist
How to Become a Data ScientistHow to Become a Data Scientist
How to Become a Data Scientistryanorban
 
Artificial neural network
Artificial neural networkArtificial neural network
Artificial neural networkDEEPASHRI HK
 
Artificial Intelligence Presentation
Artificial Intelligence PresentationArtificial Intelligence Presentation
Artificial Intelligence Presentationlpaviglianiti
 
Tips for data science competitions
Tips for data science competitionsTips for data science competitions
Tips for data science competitionsOwen Zhang
 
Tutorial on Deep learning and Applications
Tutorial on Deep learning and ApplicationsTutorial on Deep learning and Applications
Tutorial on Deep learning and ApplicationsNhatHai Phan
 
Getting Started with Amazon Redshift
Getting Started with Amazon RedshiftGetting Started with Amazon Redshift
Getting Started with Amazon RedshiftAmazon Web Services
 
Hadoop and Machine Learning
Hadoop and Machine LearningHadoop and Machine Learning
Hadoop and Machine Learningjoshwills
 
Deep Learning for Natural Language Processing
Deep Learning for Natural Language ProcessingDeep Learning for Natural Language Processing
Deep Learning for Natural Language ProcessingDevashish Shanker
 
Data By The People, For The People
Data By The People, For The PeopleData By The People, For The People
Data By The People, For The PeopleDaniel Tunkelang
 

Viewers also liked (20)

Machine learning with Spark
Machine learning with SparkMachine learning with Spark
Machine learning with Spark
 
Amazon Machine Learning
Amazon Machine LearningAmazon Machine Learning
Amazon Machine Learning
 
(DVO203) The Life of a Netflix Engineer Using 37% of the Internet
(DVO203) The Life of a Netflix Engineer Using 37% of the Internet(DVO203) The Life of a Netflix Engineer Using 37% of the Internet
(DVO203) The Life of a Netflix Engineer Using 37% of the Internet
 
Programmatic Media Scenario
Programmatic Media ScenarioProgrammatic Media Scenario
Programmatic Media Scenario
 
(BDT309) Data Science & Best Practices for Apache Spark on Amazon EMR
(BDT309) Data Science & Best Practices for Apache Spark on Amazon EMR(BDT309) Data Science & Best Practices for Apache Spark on Amazon EMR
(BDT309) Data Science & Best Practices for Apache Spark on Amazon EMR
 
Best Practices for Using Apache Spark on AWS
Best Practices for Using Apache Spark on AWSBest Practices for Using Apache Spark on AWS
Best Practices for Using Apache Spark on AWS
 
Consolidate MySQL Shards Into Amazon Aurora Using AWS Database Migration Serv...
Consolidate MySQL Shards Into Amazon Aurora Using AWS Database Migration Serv...Consolidate MySQL Shards Into Amazon Aurora Using AWS Database Migration Serv...
Consolidate MySQL Shards Into Amazon Aurora Using AWS Database Migration Serv...
 
(ISM213) Building and Deploying a Modern Big Data Architecture on AWS
(ISM213) Building and Deploying a Modern Big Data Architecture on AWS(ISM213) Building and Deploying a Modern Big Data Architecture on AWS
(ISM213) Building and Deploying a Modern Big Data Architecture on AWS
 
Amazon Machine Learning
Amazon Machine LearningAmazon Machine Learning
Amazon Machine Learning
 
10 R Packages to Win Kaggle Competitions
10 R Packages to Win Kaggle Competitions10 R Packages to Win Kaggle Competitions
10 R Packages to Win Kaggle Competitions
 
Myths and Mathemagical Superpowers of Data Scientists
Myths and Mathemagical Superpowers of Data ScientistsMyths and Mathemagical Superpowers of Data Scientists
Myths and Mathemagical Superpowers of Data Scientists
 
How to Become a Data Scientist
How to Become a Data ScientistHow to Become a Data Scientist
How to Become a Data Scientist
 
Artificial neural network
Artificial neural networkArtificial neural network
Artificial neural network
 
Artificial Intelligence Presentation
Artificial Intelligence PresentationArtificial Intelligence Presentation
Artificial Intelligence Presentation
 
Tips for data science competitions
Tips for data science competitionsTips for data science competitions
Tips for data science competitions
 
Tutorial on Deep learning and Applications
Tutorial on Deep learning and ApplicationsTutorial on Deep learning and Applications
Tutorial on Deep learning and Applications
 
Getting Started with Amazon Redshift
Getting Started with Amazon RedshiftGetting Started with Amazon Redshift
Getting Started with Amazon Redshift
 
Hadoop and Machine Learning
Hadoop and Machine LearningHadoop and Machine Learning
Hadoop and Machine Learning
 
Deep Learning for Natural Language Processing
Deep Learning for Natural Language ProcessingDeep Learning for Natural Language Processing
Deep Learning for Natural Language Processing
 
Data By The People, For The People
Data By The People, For The PeopleData By The People, For The People
Data By The People, For The People
 

Similar to (ADV403) Dynamic Ad Perf. Reporting w/ Redshift: Data Science, Queries at Scale | AWS re:Invent 2014

Goal Based Data Production with Sim Simeonov
Goal Based Data Production with Sim SimeonovGoal Based Data Production with Sim Simeonov
Goal Based Data Production with Sim SimeonovDatabricks
 
Flexible Event Tracking (Paul Gebheim)
Flexible Event Tracking (Paul Gebheim)Flexible Event Tracking (Paul Gebheim)
Flexible Event Tracking (Paul Gebheim)MongoSF
 
Powering Heap With PostgreSQL And CitusDB (PGConf Silicon Valley 2015)
Powering Heap With PostgreSQL And CitusDB (PGConf Silicon Valley 2015)Powering Heap With PostgreSQL And CitusDB (PGConf Silicon Valley 2015)
Powering Heap With PostgreSQL And CitusDB (PGConf Silicon Valley 2015)Dan Robinson
 
Cassandra SF 2015 - Repeatable, Scalable, Reliable, Observable Cassandra
Cassandra SF 2015 - Repeatable, Scalable, Reliable, Observable CassandraCassandra SF 2015 - Repeatable, Scalable, Reliable, Observable Cassandra
Cassandra SF 2015 - Repeatable, Scalable, Reliable, Observable Cassandraaaronmorton
 
The Last Pickle: Repeatable, Scalable, Reliable, Observable: Cassandra
The Last Pickle: Repeatable, Scalable, Reliable, Observable: CassandraThe Last Pickle: Repeatable, Scalable, Reliable, Observable: Cassandra
The Last Pickle: Repeatable, Scalable, Reliable, Observable: CassandraDataStax Academy
 
Digital analytics with R - Sydney Users of R Forum - May 2015
Digital analytics with R - Sydney Users of R Forum - May 2015Digital analytics with R - Sydney Users of R Forum - May 2015
Digital analytics with R - Sydney Users of R Forum - May 2015Johann de Boer
 
Database Development Replication Security Maintenance Report
Database Development Replication Security Maintenance ReportDatabase Development Replication Security Maintenance Report
Database Development Replication Security Maintenance Reportnyin27
 
ClickHouse Materialized Views: The Magic Continues
ClickHouse Materialized Views: The Magic ContinuesClickHouse Materialized Views: The Magic Continues
ClickHouse Materialized Views: The Magic ContinuesAltinity Ltd
 
Platform agnostic information systems development
Platform agnostic information systems developmentPlatform agnostic information systems development
Platform agnostic information systems developmentMark Jayson Fuentes
 
ClickHouse and the Magic of Materialized Views, By Robert Hodges and Altinity...
ClickHouse and the Magic of Materialized Views, By Robert Hodges and Altinity...ClickHouse and the Magic of Materialized Views, By Robert Hodges and Altinity...
ClickHouse and the Magic of Materialized Views, By Robert Hodges and Altinity...Altinity Ltd
 
Google Analytics for Beginners - Training
Google Analytics for Beginners - TrainingGoogle Analytics for Beginners - Training
Google Analytics for Beginners - TrainingRuben Vezzoli
 
ClickHouse Data Warehouse 101: The First Billion Rows, by Alexander Zaitsev a...
ClickHouse Data Warehouse 101: The First Billion Rows, by Alexander Zaitsev a...ClickHouse Data Warehouse 101: The First Billion Rows, by Alexander Zaitsev a...
ClickHouse Data Warehouse 101: The First Billion Rows, by Alexander Zaitsev a...Altinity Ltd
 
Scaling Experimentation & Data Capture at Grab
Scaling Experimentation & Data Capture at GrabScaling Experimentation & Data Capture at Grab
Scaling Experimentation & Data Capture at GrabRoman
 
REX Hadoop et R
REX Hadoop et RREX Hadoop et R
REX Hadoop et Rpkernevez
 
Startup Safary | Fight against robots with enbrite.ly data platform
Startup Safary | Fight against robots with enbrite.ly data platformStartup Safary | Fight against robots with enbrite.ly data platform
Startup Safary | Fight against robots with enbrite.ly data platformMészáros József
 
Budapest Spark Meetup - Apache Spark @enbrite.ly
Budapest Spark Meetup - Apache Spark @enbrite.lyBudapest Spark Meetup - Apache Spark @enbrite.ly
Budapest Spark Meetup - Apache Spark @enbrite.lyMészáros József
 
MongoDB World 2014 - BillRun, Billing on top of MongoDB
MongoDB World 2014 - BillRun, Billing on top of MongoDBMongoDB World 2014 - BillRun, Billing on top of MongoDB
MongoDB World 2014 - BillRun, Billing on top of MongoDBOfer Cohen
 
A Practical Approach to Building a Streaming Processing Pipeline for an Onlin...
A Practical Approach to Building a Streaming Processing Pipeline for an Onlin...A Practical Approach to Building a Streaming Processing Pipeline for an Onlin...
A Practical Approach to Building a Streaming Processing Pipeline for an Onlin...Databricks
 

Similar to (ADV403) Dynamic Ad Perf. Reporting w/ Redshift: Data Science, Queries at Scale | AWS re:Invent 2014 (20)

Goal Based Data Production with Sim Simeonov
Goal Based Data Production with Sim SimeonovGoal Based Data Production with Sim Simeonov
Goal Based Data Production with Sim Simeonov
 
Flexible Event Tracking (Paul Gebheim)
Flexible Event Tracking (Paul Gebheim)Flexible Event Tracking (Paul Gebheim)
Flexible Event Tracking (Paul Gebheim)
 
Powering Heap With PostgreSQL And CitusDB (PGConf Silicon Valley 2015)
Powering Heap With PostgreSQL And CitusDB (PGConf Silicon Valley 2015)Powering Heap With PostgreSQL And CitusDB (PGConf Silicon Valley 2015)
Powering Heap With PostgreSQL And CitusDB (PGConf Silicon Valley 2015)
 
Cassandra SF 2015 - Repeatable, Scalable, Reliable, Observable Cassandra
Cassandra SF 2015 - Repeatable, Scalable, Reliable, Observable CassandraCassandra SF 2015 - Repeatable, Scalable, Reliable, Observable Cassandra
Cassandra SF 2015 - Repeatable, Scalable, Reliable, Observable Cassandra
 
The Last Pickle: Repeatable, Scalable, Reliable, Observable: Cassandra
The Last Pickle: Repeatable, Scalable, Reliable, Observable: CassandraThe Last Pickle: Repeatable, Scalable, Reliable, Observable: Cassandra
The Last Pickle: Repeatable, Scalable, Reliable, Observable: Cassandra
 
Digital analytics with R - Sydney Users of R Forum - May 2015
Digital analytics with R - Sydney Users of R Forum - May 2015Digital analytics with R - Sydney Users of R Forum - May 2015
Digital analytics with R - Sydney Users of R Forum - May 2015
 
Database Development Replication Security Maintenance Report
Database Development Replication Security Maintenance ReportDatabase Development Replication Security Maintenance Report
Database Development Replication Security Maintenance Report
 
ORACLE_23-03-31_en.pdf
ORACLE_23-03-31_en.pdfORACLE_23-03-31_en.pdf
ORACLE_23-03-31_en.pdf
 
ClickHouse Materialized Views: The Magic Continues
ClickHouse Materialized Views: The Magic ContinuesClickHouse Materialized Views: The Magic Continues
ClickHouse Materialized Views: The Magic Continues
 
Platform agnostic information systems development
Platform agnostic information systems developmentPlatform agnostic information systems development
Platform agnostic information systems development
 
ClickHouse and the Magic of Materialized Views, By Robert Hodges and Altinity...
ClickHouse and the Magic of Materialized Views, By Robert Hodges and Altinity...ClickHouse and the Magic of Materialized Views, By Robert Hodges and Altinity...
ClickHouse and the Magic of Materialized Views, By Robert Hodges and Altinity...
 
Google Analytics for Beginners - Training
Google Analytics for Beginners - TrainingGoogle Analytics for Beginners - Training
Google Analytics for Beginners - Training
 
ClickHouse Data Warehouse 101: The First Billion Rows, by Alexander Zaitsev a...
ClickHouse Data Warehouse 101: The First Billion Rows, by Alexander Zaitsev a...ClickHouse Data Warehouse 101: The First Billion Rows, by Alexander Zaitsev a...
ClickHouse Data Warehouse 101: The First Billion Rows, by Alexander Zaitsev a...
 
Scaling Experimentation & Data Capture at Grab
Scaling Experimentation & Data Capture at GrabScaling Experimentation & Data Capture at Grab
Scaling Experimentation & Data Capture at Grab
 
REX Hadoop et R
REX Hadoop et RREX Hadoop et R
REX Hadoop et R
 
Startup Safary | Fight against robots with enbrite.ly data platform
Startup Safary | Fight against robots with enbrite.ly data platformStartup Safary | Fight against robots with enbrite.ly data platform
Startup Safary | Fight against robots with enbrite.ly data platform
 
Talk MongoDB - Amil
Talk MongoDB - AmilTalk MongoDB - Amil
Talk MongoDB - Amil
 
Budapest Spark Meetup - Apache Spark @enbrite.ly
Budapest Spark Meetup - Apache Spark @enbrite.lyBudapest Spark Meetup - Apache Spark @enbrite.ly
Budapest Spark Meetup - Apache Spark @enbrite.ly
 
MongoDB World 2014 - BillRun, Billing on top of MongoDB
MongoDB World 2014 - BillRun, Billing on top of MongoDBMongoDB World 2014 - BillRun, Billing on top of MongoDB
MongoDB World 2014 - BillRun, Billing on top of MongoDB
 
A Practical Approach to Building a Streaming Processing Pipeline for an Onlin...
A Practical Approach to Building a Streaming Processing Pipeline for an Onlin...A Practical Approach to Building a Streaming Processing Pipeline for an Onlin...
A Practical Approach to Building a Streaming Processing Pipeline for an Onlin...
 

More from Amazon Web Services

Come costruire servizi di Forecasting sfruttando algoritmi di ML e deep learn...
Come costruire servizi di Forecasting sfruttando algoritmi di ML e deep learn...Come costruire servizi di Forecasting sfruttando algoritmi di ML e deep learn...
Come costruire servizi di Forecasting sfruttando algoritmi di ML e deep learn...Amazon Web Services
 
Big Data per le Startup: come creare applicazioni Big Data in modalità Server...
Big Data per le Startup: come creare applicazioni Big Data in modalità Server...Big Data per le Startup: come creare applicazioni Big Data in modalità Server...
Big Data per le Startup: come creare applicazioni Big Data in modalità Server...Amazon Web Services
 
Esegui pod serverless con Amazon EKS e AWS Fargate
Esegui pod serverless con Amazon EKS e AWS FargateEsegui pod serverless con Amazon EKS e AWS Fargate
Esegui pod serverless con Amazon EKS e AWS FargateAmazon Web Services
 
Costruire Applicazioni Moderne con AWS
Costruire Applicazioni Moderne con AWSCostruire Applicazioni Moderne con AWS
Costruire Applicazioni Moderne con AWSAmazon Web Services
 
Come spendere fino al 90% in meno con i container e le istanze spot
Come spendere fino al 90% in meno con i container e le istanze spot Come spendere fino al 90% in meno con i container e le istanze spot
Come spendere fino al 90% in meno con i container e le istanze spot Amazon Web Services
 
Rendi unica l’offerta della tua startup sul mercato con i servizi Machine Lea...
Rendi unica l’offerta della tua startup sul mercato con i servizi Machine Lea...Rendi unica l’offerta della tua startup sul mercato con i servizi Machine Lea...
Rendi unica l’offerta della tua startup sul mercato con i servizi Machine Lea...Amazon Web Services
 
OpsWorks Configuration Management: automatizza la gestione e i deployment del...
OpsWorks Configuration Management: automatizza la gestione e i deployment del...OpsWorks Configuration Management: automatizza la gestione e i deployment del...
OpsWorks Configuration Management: automatizza la gestione e i deployment del...Amazon Web Services
 
Microsoft Active Directory su AWS per supportare i tuoi Windows Workloads
Microsoft Active Directory su AWS per supportare i tuoi Windows WorkloadsMicrosoft Active Directory su AWS per supportare i tuoi Windows Workloads
Microsoft Active Directory su AWS per supportare i tuoi Windows WorkloadsAmazon Web Services
 
Database Oracle e VMware Cloud on AWS i miti da sfatare
Database Oracle e VMware Cloud on AWS i miti da sfatareDatabase Oracle e VMware Cloud on AWS i miti da sfatare
Database Oracle e VMware Cloud on AWS i miti da sfatareAmazon Web Services
 
Crea la tua prima serverless ledger-based app con QLDB e NodeJS
Crea la tua prima serverless ledger-based app con QLDB e NodeJSCrea la tua prima serverless ledger-based app con QLDB e NodeJS
Crea la tua prima serverless ledger-based app con QLDB e NodeJSAmazon Web Services
 
API moderne real-time per applicazioni mobili e web
API moderne real-time per applicazioni mobili e webAPI moderne real-time per applicazioni mobili e web
API moderne real-time per applicazioni mobili e webAmazon Web Services
 
Database Oracle e VMware Cloud™ on AWS: i miti da sfatare
Database Oracle e VMware Cloud™ on AWS: i miti da sfatareDatabase Oracle e VMware Cloud™ on AWS: i miti da sfatare
Database Oracle e VMware Cloud™ on AWS: i miti da sfatareAmazon Web Services
 
Tools for building your MVP on AWS
Tools for building your MVP on AWSTools for building your MVP on AWS
Tools for building your MVP on AWSAmazon Web Services
 
How to Build a Winning Pitch Deck
How to Build a Winning Pitch DeckHow to Build a Winning Pitch Deck
How to Build a Winning Pitch DeckAmazon Web Services
 
Building a web application without servers
Building a web application without serversBuilding a web application without servers
Building a web application without serversAmazon Web Services
 
AWS_HK_StartupDay_Building Interactive websites while automating for efficien...
AWS_HK_StartupDay_Building Interactive websites while automating for efficien...AWS_HK_StartupDay_Building Interactive websites while automating for efficien...
AWS_HK_StartupDay_Building Interactive websites while automating for efficien...Amazon Web Services
 
Introduzione a Amazon Elastic Container Service
Introduzione a Amazon Elastic Container ServiceIntroduzione a Amazon Elastic Container Service
Introduzione a Amazon Elastic Container ServiceAmazon Web Services
 

More from Amazon Web Services (20)

Come costruire servizi di Forecasting sfruttando algoritmi di ML e deep learn...
Come costruire servizi di Forecasting sfruttando algoritmi di ML e deep learn...Come costruire servizi di Forecasting sfruttando algoritmi di ML e deep learn...
Come costruire servizi di Forecasting sfruttando algoritmi di ML e deep learn...
 
Big Data per le Startup: come creare applicazioni Big Data in modalità Server...
Big Data per le Startup: come creare applicazioni Big Data in modalità Server...Big Data per le Startup: come creare applicazioni Big Data in modalità Server...
Big Data per le Startup: come creare applicazioni Big Data in modalità Server...
 
Esegui pod serverless con Amazon EKS e AWS Fargate
Esegui pod serverless con Amazon EKS e AWS FargateEsegui pod serverless con Amazon EKS e AWS Fargate
Esegui pod serverless con Amazon EKS e AWS Fargate
 
Costruire Applicazioni Moderne con AWS
Costruire Applicazioni Moderne con AWSCostruire Applicazioni Moderne con AWS
Costruire Applicazioni Moderne con AWS
 
Come spendere fino al 90% in meno con i container e le istanze spot
Come spendere fino al 90% in meno con i container e le istanze spot Come spendere fino al 90% in meno con i container e le istanze spot
Come spendere fino al 90% in meno con i container e le istanze spot
 
Open banking as a service
Open banking as a serviceOpen banking as a service
Open banking as a service
 
Rendi unica l’offerta della tua startup sul mercato con i servizi Machine Lea...
Rendi unica l’offerta della tua startup sul mercato con i servizi Machine Lea...Rendi unica l’offerta della tua startup sul mercato con i servizi Machine Lea...
Rendi unica l’offerta della tua startup sul mercato con i servizi Machine Lea...
 
OpsWorks Configuration Management: automatizza la gestione e i deployment del...
OpsWorks Configuration Management: automatizza la gestione e i deployment del...OpsWorks Configuration Management: automatizza la gestione e i deployment del...
OpsWorks Configuration Management: automatizza la gestione e i deployment del...
 
Microsoft Active Directory su AWS per supportare i tuoi Windows Workloads
Microsoft Active Directory su AWS per supportare i tuoi Windows WorkloadsMicrosoft Active Directory su AWS per supportare i tuoi Windows Workloads
Microsoft Active Directory su AWS per supportare i tuoi Windows Workloads
 
Computer Vision con AWS
Computer Vision con AWSComputer Vision con AWS
Computer Vision con AWS
 
Database Oracle e VMware Cloud on AWS i miti da sfatare
Database Oracle e VMware Cloud on AWS i miti da sfatareDatabase Oracle e VMware Cloud on AWS i miti da sfatare
Database Oracle e VMware Cloud on AWS i miti da sfatare
 
Crea la tua prima serverless ledger-based app con QLDB e NodeJS
Crea la tua prima serverless ledger-based app con QLDB e NodeJSCrea la tua prima serverless ledger-based app con QLDB e NodeJS
Crea la tua prima serverless ledger-based app con QLDB e NodeJS
 
API moderne real-time per applicazioni mobili e web
API moderne real-time per applicazioni mobili e webAPI moderne real-time per applicazioni mobili e web
API moderne real-time per applicazioni mobili e web
 
Database Oracle e VMware Cloud™ on AWS: i miti da sfatare
Database Oracle e VMware Cloud™ on AWS: i miti da sfatareDatabase Oracle e VMware Cloud™ on AWS: i miti da sfatare
Database Oracle e VMware Cloud™ on AWS: i miti da sfatare
 
Tools for building your MVP on AWS
Tools for building your MVP on AWSTools for building your MVP on AWS
Tools for building your MVP on AWS
 
How to Build a Winning Pitch Deck
How to Build a Winning Pitch DeckHow to Build a Winning Pitch Deck
How to Build a Winning Pitch Deck
 
Building a web application without servers
Building a web application without serversBuilding a web application without servers
Building a web application without servers
 
Fundraising Essentials
Fundraising EssentialsFundraising Essentials
Fundraising Essentials
 
AWS_HK_StartupDay_Building Interactive websites while automating for efficien...
AWS_HK_StartupDay_Building Interactive websites while automating for efficien...AWS_HK_StartupDay_Building Interactive websites while automating for efficien...
AWS_HK_StartupDay_Building Interactive websites while automating for efficien...
 
Introduzione a Amazon Elastic Container Service
Introduzione a Amazon Elastic Container ServiceIntroduzione a Amazon Elastic Container Service
Introduzione a Amazon Elastic Container Service
 

Recently uploaded

My INSURER PTE LTD - Insurtech Innovation Award 2024
My INSURER PTE LTD - Insurtech Innovation Award 2024My INSURER PTE LTD - Insurtech Innovation Award 2024
My INSURER PTE LTD - Insurtech Innovation Award 2024The Digital Insurer
 
My Hashitalk Indonesia April 2024 Presentation
My Hashitalk Indonesia April 2024 PresentationMy Hashitalk Indonesia April 2024 Presentation
My Hashitalk Indonesia April 2024 PresentationRidwan Fadjar
 
WordPress Websites for Engineers: Elevate Your Brand
WordPress Websites for Engineers: Elevate Your BrandWordPress Websites for Engineers: Elevate Your Brand
WordPress Websites for Engineers: Elevate Your Brandgvaughan
 
"Federated learning: out of reach no matter how close",Oleksandr Lapshyn
"Federated learning: out of reach no matter how close",Oleksandr Lapshyn"Federated learning: out of reach no matter how close",Oleksandr Lapshyn
"Federated learning: out of reach no matter how close",Oleksandr LapshynFwdays
 
"LLMs for Python Engineers: Advanced Data Analysis and Semantic Kernel",Oleks...
"LLMs for Python Engineers: Advanced Data Analysis and Semantic Kernel",Oleks..."LLMs for Python Engineers: Advanced Data Analysis and Semantic Kernel",Oleks...
"LLMs for Python Engineers: Advanced Data Analysis and Semantic Kernel",Oleks...Fwdays
 
"ML in Production",Oleksandr Bagan
"ML in Production",Oleksandr Bagan"ML in Production",Oleksandr Bagan
"ML in Production",Oleksandr BaganFwdays
 
Scanning the Internet for External Cloud Exposures via SSL Certs
Scanning the Internet for External Cloud Exposures via SSL CertsScanning the Internet for External Cloud Exposures via SSL Certs
Scanning the Internet for External Cloud Exposures via SSL CertsRizwan Syed
 
Connect Wave/ connectwave Pitch Deck Presentation
Connect Wave/ connectwave Pitch Deck PresentationConnect Wave/ connectwave Pitch Deck Presentation
Connect Wave/ connectwave Pitch Deck PresentationSlibray Presentation
 
Install Stable Diffusion in windows machine
Install Stable Diffusion in windows machineInstall Stable Diffusion in windows machine
Install Stable Diffusion in windows machinePadma Pradeep
 
APIForce Zurich 5 April Automation LPDG
APIForce Zurich 5 April  Automation LPDGAPIForce Zurich 5 April  Automation LPDG
APIForce Zurich 5 April Automation LPDGMarianaLemus7
 
CloudStudio User manual (basic edition):
CloudStudio User manual (basic edition):CloudStudio User manual (basic edition):
CloudStudio User manual (basic edition):comworks
 
Kotlin Multiplatform & Compose Multiplatform - Starter kit for pragmatics
Kotlin Multiplatform & Compose Multiplatform - Starter kit for pragmaticsKotlin Multiplatform & Compose Multiplatform - Starter kit for pragmatics
Kotlin Multiplatform & Compose Multiplatform - Starter kit for pragmaticscarlostorres15106
 
New from BookNet Canada for 2024: BNC BiblioShare - Tech Forum 2024
New from BookNet Canada for 2024: BNC BiblioShare - Tech Forum 2024New from BookNet Canada for 2024: BNC BiblioShare - Tech Forum 2024
New from BookNet Canada for 2024: BNC BiblioShare - Tech Forum 2024BookNet Canada
 
Key Features Of Token Development (1).pptx
Key  Features Of Token  Development (1).pptxKey  Features Of Token  Development (1).pptx
Key Features Of Token Development (1).pptxLBM Solutions
 
Human Factors of XR: Using Human Factors to Design XR Systems
Human Factors of XR: Using Human Factors to Design XR SystemsHuman Factors of XR: Using Human Factors to Design XR Systems
Human Factors of XR: Using Human Factors to Design XR SystemsMark Billinghurst
 
"Debugging python applications inside k8s environment", Andrii Soldatenko
"Debugging python applications inside k8s environment", Andrii Soldatenko"Debugging python applications inside k8s environment", Andrii Soldatenko
"Debugging python applications inside k8s environment", Andrii SoldatenkoFwdays
 
Advanced Test Driven-Development @ php[tek] 2024
Advanced Test Driven-Development @ php[tek] 2024Advanced Test Driven-Development @ php[tek] 2024
Advanced Test Driven-Development @ php[tek] 2024Scott Keck-Warren
 
Streamlining Python Development: A Guide to a Modern Project Setup
Streamlining Python Development: A Guide to a Modern Project SetupStreamlining Python Development: A Guide to a Modern Project Setup
Streamlining Python Development: A Guide to a Modern Project SetupFlorian Wilhelm
 
SIP trunking in Janus @ Kamailio World 2024
SIP trunking in Janus @ Kamailio World 2024SIP trunking in Janus @ Kamailio World 2024
SIP trunking in Janus @ Kamailio World 2024Lorenzo Miniero
 

Recently uploaded (20)

My INSURER PTE LTD - Insurtech Innovation Award 2024
My INSURER PTE LTD - Insurtech Innovation Award 2024My INSURER PTE LTD - Insurtech Innovation Award 2024
My INSURER PTE LTD - Insurtech Innovation Award 2024
 
My Hashitalk Indonesia April 2024 Presentation
My Hashitalk Indonesia April 2024 PresentationMy Hashitalk Indonesia April 2024 Presentation
My Hashitalk Indonesia April 2024 Presentation
 
WordPress Websites for Engineers: Elevate Your Brand
WordPress Websites for Engineers: Elevate Your BrandWordPress Websites for Engineers: Elevate Your Brand
WordPress Websites for Engineers: Elevate Your Brand
 
"Federated learning: out of reach no matter how close",Oleksandr Lapshyn
"Federated learning: out of reach no matter how close",Oleksandr Lapshyn"Federated learning: out of reach no matter how close",Oleksandr Lapshyn
"Federated learning: out of reach no matter how close",Oleksandr Lapshyn
 
"LLMs for Python Engineers: Advanced Data Analysis and Semantic Kernel",Oleks...
"LLMs for Python Engineers: Advanced Data Analysis and Semantic Kernel",Oleks..."LLMs for Python Engineers: Advanced Data Analysis and Semantic Kernel",Oleks...
"LLMs for Python Engineers: Advanced Data Analysis and Semantic Kernel",Oleks...
 
"ML in Production",Oleksandr Bagan
"ML in Production",Oleksandr Bagan"ML in Production",Oleksandr Bagan
"ML in Production",Oleksandr Bagan
 
Scanning the Internet for External Cloud Exposures via SSL Certs
Scanning the Internet for External Cloud Exposures via SSL CertsScanning the Internet for External Cloud Exposures via SSL Certs
Scanning the Internet for External Cloud Exposures via SSL Certs
 
Connect Wave/ connectwave Pitch Deck Presentation
Connect Wave/ connectwave Pitch Deck PresentationConnect Wave/ connectwave Pitch Deck Presentation
Connect Wave/ connectwave Pitch Deck Presentation
 
Install Stable Diffusion in windows machine
Install Stable Diffusion in windows machineInstall Stable Diffusion in windows machine
Install Stable Diffusion in windows machine
 
APIForce Zurich 5 April Automation LPDG
APIForce Zurich 5 April  Automation LPDGAPIForce Zurich 5 April  Automation LPDG
APIForce Zurich 5 April Automation LPDG
 
CloudStudio User manual (basic edition):
CloudStudio User manual (basic edition):CloudStudio User manual (basic edition):
CloudStudio User manual (basic edition):
 
Kotlin Multiplatform & Compose Multiplatform - Starter kit for pragmatics
Kotlin Multiplatform & Compose Multiplatform - Starter kit for pragmaticsKotlin Multiplatform & Compose Multiplatform - Starter kit for pragmatics
Kotlin Multiplatform & Compose Multiplatform - Starter kit for pragmatics
 
New from BookNet Canada for 2024: BNC BiblioShare - Tech Forum 2024
New from BookNet Canada for 2024: BNC BiblioShare - Tech Forum 2024New from BookNet Canada for 2024: BNC BiblioShare - Tech Forum 2024
New from BookNet Canada for 2024: BNC BiblioShare - Tech Forum 2024
 
Key Features Of Token Development (1).pptx
Key  Features Of Token  Development (1).pptxKey  Features Of Token  Development (1).pptx
Key Features Of Token Development (1).pptx
 
Human Factors of XR: Using Human Factors to Design XR Systems
Human Factors of XR: Using Human Factors to Design XR SystemsHuman Factors of XR: Using Human Factors to Design XR Systems
Human Factors of XR: Using Human Factors to Design XR Systems
 
"Debugging python applications inside k8s environment", Andrii Soldatenko
"Debugging python applications inside k8s environment", Andrii Soldatenko"Debugging python applications inside k8s environment", Andrii Soldatenko
"Debugging python applications inside k8s environment", Andrii Soldatenko
 
Advanced Test Driven-Development @ php[tek] 2024
Advanced Test Driven-Development @ php[tek] 2024Advanced Test Driven-Development @ php[tek] 2024
Advanced Test Driven-Development @ php[tek] 2024
 
Streamlining Python Development: A Guide to a Modern Project Setup
Streamlining Python Development: A Guide to a Modern Project SetupStreamlining Python Development: A Guide to a Modern Project Setup
Streamlining Python Development: A Guide to a Modern Project Setup
 
SIP trunking in Janus @ Kamailio World 2024
SIP trunking in Janus @ Kamailio World 2024SIP trunking in Janus @ Kamailio World 2024
SIP trunking in Janus @ Kamailio World 2024
 
E-Vehicle_Hacking_by_Parul Sharma_null_owasp.pptx
E-Vehicle_Hacking_by_Parul Sharma_null_owasp.pptxE-Vehicle_Hacking_by_Parul Sharma_null_owasp.pptx
E-Vehicle_Hacking_by_Parul Sharma_null_owasp.pptx
 

(ADV403) Dynamic Ad Perf. Reporting w/ Redshift: Data Science, Queries at Scale | AWS re:Invent 2014

  • 1. November 13, 2014 | Las Vegas, NV Timon Karnezos, Director Infrastructure, Neustar Vidhya Srinivasan, Sr. Manager Software Development, Amazon Redshift
  • 2. Petabyte scale; Massively parallel; Relational data warehouse; Fully managed, zero admin
  • 3. 10 GigE (HPC) Ingestion Backup Restore JDBC/ODBC
  • 5.
  • 6.
  • 8.
  • 9. 00 01 10 11 00 01 10 11 P Space filling Curve for Two Dimensions
  • 10.
  • 11.
  • 12.
  • 13.
  • 14.
  • 16.
  • 18.
  • 20.
  • 22.
  • 23.
  • 24.
  • 25. 0.7B / day 2B / week 8B / month 21B / quarter
  • 26. --Number of ads seen per user WITH frequency_intermediate AS ( SELECT user_id, SUM(1) AS impression_count, SUM(cost) AS cost, SUM(revenue) AS revenue FROM impressions WHERE record_date BETWEEN <...> GROUP BY 1 ) --Number of people who saw N ads SELECT impression_count, SUM(1), SUM(cost), SUM(revenue) FROM frequency_intermediate GROUP BY 1;
  • 27.
  • 28.
  • 29. CREATE TABLE ( record_date date ENCODE NOT NULL, campaign_id bigint ENCODE NOT NULL, site_id bigint ENCODE NOT NULL, user_id bigint ENCODE NOT NULL DISTKEY, impression_count int ENCODE NOT NULL, cost bigint ENCODE NOT NULL, revenue bigint ENCODE NOT NULL ) SORTKEY(,,,);
  • 30. WITH user_frequency AS ( SELECT user_id, campaign_id, site_id, SUM(impression_count) AS frequency, SUM(cost) AS cost, SUM(revenue) AS revenue FROM frequency_intermediate WHERE record_date BETWEEN <...> GROUP BY 1,2,3 ) SELECT campaign_id, site_id, frequency, SUM(1), SUM(cost), SUM(revenue) FROM user_frequency GROUP BY 1,2,3;
  • 31.
  • 32.
  • 33.
  • 34.
  • 35.
  • 36. --Basic sessionization query, assemble user activity --that ended in a conversion into a timeline. SELECT <...> FROM impressions i JOIN conversions c ON i.user_id = c.user_id AND i.record_date < c.record_date ORDER BY i.record_date;
  • 37. Position: 1 Position: 2 Position: 3
  • 38. Hour offset: 3 Position: 1 Position: 2 Hour offset: 12 Hour offset: 16 Position: 3
  • 39. --Sessionize user activity per conversion, partition by campaign (45-day lookback window) SELECT c.record_date AS conversion_date, c.event_id AS conversion_id, i.campaign_id AS campaign_id, i.site_id AS site_id, i.user_id AS user_id, c.revenue AS conversion_revenue, DATEDIFF('hour', i.record_date, c.record_date) AS hour_offset, SUM(1) OVER (PARTITION BY i.user_id, i.campaign_id, c.event_id ORDER BY i.record_date DESC ROWS UNBOUNDED PRECEDING) AS position FROM impressions i JOIN conversions c ON i.user_id = c.user_id AND i.campaign_id = c.campaign_id AND i.record_date < c.record_date AND i.record_date > (c.record_date - interval '45 days') AND c.record_date BETWEEN <...>;
  • 40. --Compute statistics on sessions (funnel placement, last-touch, site-count, etc...) SELECT campaign_id, site_id, conversion_date, AVG(position) AS average_position, SUM(conversion_revenue * (position = 1)::int) AS lta_attributed, AVG(COUNT(DISTINCT site_id) OVER (PARTITION BY i.user_id, i.campaign_id, c.event_id ORDER BY i.record_date ASC ROWS UNBOUNDED PRECEDING)) AS average_unique_preceding_site_count FROM sessions GROUP BY 1,2,3;
  • 41.
  • 42.
  • 43.
  • 44. Site A Site B Site C Site A 20% 60% Site B 90% Site C CPM $0.06 $1.05 $9.50
  • 45. Site A Site B Site C Site A 20% 60% Site B 90% Site C CPM $0.06 $1.05 $9.50
  • 46. Site A Site B Site C Site A 20% 60% Site B 90% Site C CPM $0.06 $1.05 $9.50
  • 47. CREATE TABLE ( user_id bigint ENCODE NOT NULL DISTKEY, site_id bigint ENCODE NOT NULL ) SORTKEY();
  • 48. WITH co_occurences AS ( SELECT oi.site_id AS site1, oi2.site_id AS site2 FROM overlap_intermediate oi JOIN overlap_intermediate oi2 ON oi.site_id > oi2.site_id AND oi.ak_user_id = oi2.ak_user_id ) SELECT site1, site2, SUM(1) FROM co_occurences GROUP BY 1,2;
  • 49. CREATE TABLE ( record_date date ENCODE NOT NULL, campaign_id bigint ENCODE NOT NULL, site_id bigint ENCODE NOT NULL, user_id bigint ENCODE NOT NULL DISTKEY ) SORTKEY(,);
  • 50. WITH site_overlap_intermediate AS ( SELECT user_id, site_id, campaign_id FROM overlap_intermediate WHERE record_date BETWEEN <...> GROUP BY 1,2,3 ), site_co_occurences AS ( SELECT oi.campaign_id AS c_id, oi.site_id AS site1, oi2.site_id AS site2 FROM site_overlap_intermediate oi JOIN site_overlap_intermediate oi2 ON oi.site_id > oi2.site_id AND oi.ak_user_id = oi2.ak_user_id AND oi.campaign_id = oi2.campaign_id ) SELECT c_id, site1, site2, SUM(1) FROM site_co_occurences GROUP BY 1,2,3;
  • 51.
  • 52.
  • 53.
  • 54.
  • 55.
  • 56. 8 fact tables 26 dimension tables 7 mapping tables
  • 57. 42 views 121 joins 1100 sloc
  • 58. $ pg_dump -Fc some_file --table=foo --table=bar $ pg_restore --schema-only --clean -Fc some_file > schema.sql $ pg_restore --data-only --table=foo -Fc some_file > foo.tsv $ aws s3 cp schema.sql s3://metadata-bucket/YYYYMMDD/schema.sql $ aws s3 cp foo.tsv s3://metadata-bucket/YYYYMMDD/foo.tsv > \i schema.sql > COPY foo FROM 's3://metadata-bucket/YYYYMMDD/foo.tsv' <...> # or combine 'COPY <..> FROM <...> SSH' and pg_restore/psql
  • 59. UNLOAD (' SELECT i.* FROM impressions i JOIN client_to_campaign_mapping m ON m.campaign_id = i.campaign_id WHERE i.record_date >= '{{yyyy}}-{{mm}}-{{dd}}' - interval '1 day' AND i.record_date < '{{yyyy}}-{{mm}}-{{dd}}' AND m.client_id = <...> ') TO 's3://{{bucket}}/us_eastern/{{yyyy}}/{{mm}}/{{dd}}/dsdk_events/{{vers}}/impressions/' WITH CREDENTIALS 'aws_access_key_id={{key}};aws_secret_access_key={{secret}}' DELIMITER ',' NULL 'N' ADDQUOTES ESCAPE GZIP MANIFEST;
  • 60.
  • 61.
  • 62.
  • 63.
  • 64. Workload Node Count Node Type Restore Maint. Exec. Frequency & Attribution & Overlap & Ad Hoc 16 dw2.8xlarge 2h 1h 6h = $691.20
  • 65. Workload Node Count Node Type Restore Maint. Exec. Frequency 8 dw2.8xlarge 1.5h 0.5h 2.5h Attribution 8 dw2.8xlarge 1.5h 0.5h 2h Overlap 8 dw2.8xlarge 1h 0.5h 2.5h Ad-hoc 8 dw2.8xlarge 0h 0.5h 1.5h = $556.80 (-19%)
  • 66.