SlideShare a Scribd company logo
Assignment 7.1.a

airline: struct<airline_id: int64, name: string, alias: string, iata: string, icao:
string, callsign: string, country: string, active: bool>

child 0, airline_id: int64

child 1, name: string

child 2, alias: string

child 3, iata: string

In [93]: import os

import json

from pathlib import Path

import gzip

import hashlib

import shutil

import pandas as pd

import pygeohash

import s3fs


current_dir = Path(os.getcwd()).absolute()

results_dir = current_dir.joinpath('results')

if results_dir.exists():


results_dir.mkdir(parents=True, exist_ok=True)

def read_jsonl_data():

s3 = s3fs.S3FileSystem(



'endpoint_url': endpoint_url



src_data_path = 'data/processed/openflights/routes.jsonl.gz'

with, 'rb') as f_gz:

with, 'rb') as f:

records = [json.loads(line) for line in f.readlines()]

return records

In [94]: from pyarrow.json import read_json

import pyarrow.parquet as pq

def create_parquet_dataset():

src_data_path = 'data/processed/openflights/routes.jsonl.gz'

parquet_output_path = results_dir.joinpath('routes.parquet')

s3 = s3fs.S3FileSystem(



'endpoint_url': endpoint_url



with, 'rb') as f_gz:

with, 'rb') as f:

## TODO: Use Apache Arrow to create Parquet table and save the dataset

table = read_json(f)


pq.write_table(table, parquet_output_path, compression='none')

child 4, icao: string

child 5, callsign: string

child 6, country: string

child 7, active: bool

src_airport: struct<airport_id: int64, name: string, city: string, country: string,
iata: string, icao: string, latitude: double, longitude: double, altitude: int64, ti
mezone: double, dst: string, tz_id: string, type: string, source: string>

child 0, airport_id: int64

child 1, name: string

child 2, city: string

child 3, country: string

child 4, iata: string

child 5, icao: string

child 6, latitude: double

child 7, longitude: double

child 8, altitude: int64

child 9, timezone: double

child 10, dst: string

child 11, tz_id: string

child 12, type: string

child 13, source: string

dst_airport: struct<airport_id: int64, name: string, city: string, country: string,
iata: string, icao: string, latitude: double, longitude: double, altitude: int64, ti
mezone: double, dst: string, tz_id: string, type: string, source: string>

child 0, airport_id: int64

child 1, name: string

child 2, city: string

child 3, country: string

child 4, iata: string

child 5, icao: string

child 6, latitude: double

child 7, longitude: double

child 8, altitude: int64

child 9, timezone: double

child 10, dst: string

child 11, tz_id: string

child 12, type: string

child 13, source: string

codeshare: bool

equipment: list<item: string>

child 0, item: string

['codeshare', 'equipment', 'airline.airline_id', '', 'airline.alias', 'a
irline.iata', 'airline.icao', 'airline.callsign', '', 'airline.activ
e', 'src_airport.airport_id', '', '', 'src_airport.c
ountry', 'src_airport.iata', 'src_airport.icao', 'src_airport.latitude', 'src_airpor
t.longitude', 'src_airport.altitude', 'src_airport.timezone', 'src_airport.dst', 'sr
c_airport.tz_id', 'src_airport.type', 'src_airport.source', 'dst_airport.airport_i
d', '', '', '', 'dst_airport.iat
a', 'dst_airport.icao', 'dst_airport.latitude', 'dst_airport.longitude', 'dst_airpor
t.altitude', 'dst_airport.timezone', 'dst_airport.dst', 'dst_airport.tz_id', 'dst_ai
rport.type', 'dst_airport.source']
In [95]: parquet_output_path = results_dir.joinpath('routes.parquet')

pq = pd.read_parquet(parquet_output_path, engine='fastparquet')


In [96]: partitions = (

('A', 'A'), ('B', 'B'), ('C', 'D'), ('E', 'F'),

('G', 'H'), ('I', 'J'), ('K', 'L'), ('M', 'M'),

('N', 'N'), ('O', 'P'), ('Q', 'R'), ('S', 'T'),

('U', 'U'), ('V', 'V'), ('W', 'X'), ('Y', 'Z')


In [97]: partitions_keys = (

'A', 'B', 'C-D', 'E-F',

'G-H', 'I-J', 'K-L', 'M',

'N', 'O-P', 'Q-R', 'S-T',
{'A': ('A', 'A'), 'B': ('B', 'B'), 'C-D': ('C', 'D'), 'E-F': ('E', 'F'), 'G-H':
('G', 'H'), 'I-J': ('I', 'J'), 'K-L': ('K', 'L'), 'M': ('M', 'M'), 'N': ('N', 'N'),
'O-P': ('O', 'P'), 'Q-R': ('Q', 'R'), 'S-T': ('S', 'T'), 'U': ('U', 'U'), 'V': ('V',
'V'), 'W-X': ('W', 'X'), 'Y-Z': ('Y', 'Z')}

Assignment 7.1.b
codeshare equipment airline.airline_id airline.alias airline.iata airline.icao airline.
0 0.0 [CR2] 410.0 Aerocondor
1 0.0 [CR2] 410.0 Aerocondor
2 0.0 [CR2] 410.0 Aerocondor
3 0.0 [CR2] 410.0 Aerocondor
4 0.0 [CR2] 410.0 Aerocondor
5 rows × 41 columns
'U', 'V', 'W-X', 'Y-Z'


In [98]: parts_k_v = dict(zip(partitions_keys, partitions))


In [99]: def get_key(val):

for key, value in parts_k_v.items():

if val in value:

return key

return "0"

In [100… pq['key'] = pq['src_airport.iata'] + pq['dst_airport.iata'] + pq['airline.iata']

pq['partition_value'] = pq['key'].str[:1]

pq['kv_key'] = pq.apply(lambda x: get_key(x.partition_value), axis=1)

In [101… # remove invalid keys

pq = pq[pq.kv_key != "0"].astype('float32', errors='ignore')

In [102… pq.head()

In [104… import pyarrow as pa

import pyarrow.parquet as parpq

pq_tab = pa.Table.from_pandas(pq)





Assignment 7.1.c
{'west': 'c21g6s0rs4c7', 'central': '9z7dnebnj8kb', 'east': 'dqby34cjw922'}

Assignment 7.1.d
In [105… import hashlib

def hash_key(key):

m = hashlib.sha256()


return m.hexdigest()

In [106… pq['key'] = pq['src_airport.iata']+pq['dst_airport.iata']+pq['airline.iata']

pq['hashed'] = pq.apply(lambda x: hash_key(x.key), axis=1)

pq['hash_key'] = pq['hashed'].str[:1]

In [107… pq_tab1 = pa.Table.from_pandas(pq)






In [109… #get hash for datacenters

datacenters = {}

datacenters['west'] = pygeohash.encode(45.5945645, -121.1786823)

datacenters['central'] = pygeohash.encode(41.1544433, -96.0422378)

datacenters['east'] = pygeohash.encode(39.08344, -77.6497145)


In [110… def closest_datacenter(latitude, longitude):

geohash = pygeohash.encode(latitude, longitude)

dist_dict = {}

closest_datacenter = ''

last_distance = None

for key, value in datacenters.items():

dist = pygeohash.geohash_approximate_distance(str(geohash), str(value))

dist_dict[key] = dist

if (last_distance == None) or (dist < last_distance):
closest_datacenter = key

last_distance = dist

return closest_datacenter

In [113… pq['datacenter'] = pq[['src_airport.latitude', 'src_airport.longitude']].apply(lambd
In [114… pq_tab2 = pa.Table.from_pandas(pq)





0 410.0

1 410.0

2 410.0

3 410.0

4 410.0

Name: airline.airline_id, dtype: float32
[{-1.0: 1}, {10.0: 1}, {21.0: 1}, {24.0: 1}, {28.0: 1}, {29.0: 1}, {32.0: 1}, {35.0:
1}, {42.0: 1}, {43.0: 1}, {55.0: 1}, {68.0: 1}, {83.0: 1}, {90.0: 1}, {96.0: 1}, {10
6.0: 1}, {109.0: 1}, {116.0: 1}, {125.0: 1}, {130.0: 1}, {132.0: 1}, {137.0: 1}, {13
9.0: 1}, {146.0: 1}, {153.0: 1}, {179.0: 1}, {197.0: 1}, {214.0: 1}, {218.0: 1}, {22
0.0: 1}, {221.0: 1}, {225.0: 1}, {231.0: 1}, {240.0: 1}, {241.0: 1}, {242.0: 1}, {24
6.0: 1}, {312.0: 1}, {316.0: 1}, {319.0: 1}, {321.0: 1}, {324.0: 1}, {328.0: 1}, {32
9.0: 1}, {330.0: 1}, {333.0: 1}, {336.0: 1}, {338.0: 1}, {341.0: 1}, {345.0: 1}, {38
6.0: 1}, {397.0: 1}, {410.0: 1}, {412.0: 1}, {426.0: 1}, {439.0: 1}, {442.0: 1}, {46
2.0: 1}, {470.0: 1}, {476.0: 1}, {477.0: 1}, {491.0: 1}, {502.0: 1}, {503.0: 1}, {50
8.0: 1}, {515.0: 1}, {524.0: 1}, {543.0: 1}, {563.0: 1}, {567.0: 1}, {569.0: 1}, {57
6.0: 1}, {595.0: 1}, {596.0: 1}, {603.0: 1}, {608.0: 1}, {622.0: 1}, {641.0: 1}, {68
3.0: 1}, {690.0: 2}, {692.0: 2}, {751.0: 2}, {753.0: 2}, {794.0: 2}, {807.0: 2}, {83
7.0: 2}, {879.0: 2}, {881.0: 2}, {882.0: 2}, {896.0: 2}, {897.0: 2}, {921.0: 2}, {97
0.0: 2}, {995.0: 2}, {998.0: 2}, {1006.0: 2}, {1008.0: 2}, {1034.0: 2}, {1048.0: 2},
{1057.0: 2}, {1066.0: 2}, {1073.0: 2}, {1109.0: 2}, {1173.0: 2}, {1191.0: 2}, {1203.
0: 2}, {1206.0: 2}, {1230.0: 2}, {1266.0: 2}, {1287.0: 2}, {1290.0: 2}, {1299.0: 2},
{1308.0: 2}, {1316.0: 2}, {1317.0: 2}, {1338.0: 2}, {1340.0: 2}, {1355.0: 2}, {1359.
0: 2}, {1392.0: 2}, {1401.0: 2}, {1403.0: 2}, {1422.0: 2}, {1434.0: 2}, {1441.0: 2},
{1463.0: 2}, {1469.0: 2}, {1472.0: 2}, {1478.0: 2}, {1492.0: 2}, {1500.0: 2}, {1508.
0: 2}, {1531.0: 2}, {1539.0: 2}, {1548.0: 2}, {1581.0: 2}, {1611.0: 2}, {1623.0: 2},
{1629.0: 2}, {1654.0: 2}, {1663.0: 2}, {1669.0: 2}, {1680.0: 2}, {1682.0: 2}, {1683.
0: 2}, {1729.0: 2}, {1750.0: 2}, {1756.0: 2}, {1758.0: 2}, {1767.0: 2}, {1769.0: 2},
{1775.0: 2}, {1790.0: 2}, {1792.0: 2}, {1829.0: 2}, {1844.0: 2}, {1868.0: 2}, {1886.
0: 2}, {1889.0: 3}, {1908.0: 3}, {1909.0: 3}, {1925.0: 3}, {1936.0: 3}, {1942.0: 3},
{1943.0: 3}, {1946.0: 3}, {1954.0: 3}, {1966.0: 3}, {2009.0: 3}, {2056.0: 3}, {2058.
0: 3}, {2091.0: 3}, {2094.0: 3}, {2104.0: 3}, {2117.0: 3}, {2143.0: 3}, {2150.0: 3},
{2183.0: 3}, {2193.0: 3}, {2217.0: 3}, {2218.0: 3}, {2220.0: 3}, {2222.0: 3}, {2226.
0: 3}, {2245.0: 3}, {2260.0: 3}, {2264.0: 3}, {2293.0: 3}, {2297.0: 3}, {2324.0: 3},
{2350.0: 3}, {2353.0: 3}, {2354.0: 3}, {2395.0: 3}, {2409.0: 3}, {2417.0: 3}, {2418.
0: 3}, {2419.0: 3}, {2420.0: 3}, {2421.0: 3}, {2439.0: 3}, {2468.0: 3}, {2520.0: 3},
{2524.0: 3}, {2538.0: 3}, {2541.0: 3}, {2547.0: 3}, {2548.0: 3}, {2575.0: 3}, {2585.
0: 3}, {2607.0: 3}, {2622.0: 3}, {2638.0: 3}, {2660.0: 3}, {2681.0: 3}, {2682.0: 3},
{2684.0: 3}, {2688.0: 3}, {2692.0: 3}, {2731.0: 3}, {2748.0: 3}, {2750.0: 3}, {2757.
0: 3}, {2765.0: 3}, {2773.0: 3}, {2774.0: 3}, {2822.0: 3}, {2825.0: 3}, {2826.0: 3},
{2835.0: 3}, {2850.0: 3}, {2857.0: 3}, {2881.0: 3}, {2896.0: 3}, {2916.0: 3}, {2922.
0: 3}, {2923.0: 3}, {2942.0: 4}, {2951.0: 4}, {2954.0: 4}, {2987.0: 4}, {2989.0: 4},
In [128… pq['airline.airline_id'].head()

In [133… def balance_partitions(keys, num_partitions):

ac = keys.cumsum()

#sum of the entire array

partsum = ac[-1]//num_partitions 

#generates the cumulative sums of each part

cum_part_sums = np.array(range(1,p))*partsum

#finds the indices 

inds = np.searchsorted(ac,cum_part_sums) 

#split into approximately equal-sum arrays

parts = np.split(arr,inds)

return parts

In [134… keys = list(pq['airline.airline_id'])


In [135… print(balance_partitions(keys, num_partitions))
{2990.0: 4}, {2993.0: 4}, {2994.0: 4}, {3000.0: 4}, {3021.0: 4}, {3026.0: 4}, {3029.
0: 4}, {3052.0: 4}, {3081.0: 4}, {3090.0: 4}, {3097.0: 4}, {3123.0: 4}, {3126.0: 4},
{3148.0: 4}, {3163.0: 4}, {3179.0: 4}, {3197.0: 4}, {3200.0: 4}, {3201.0: 4}, {3210.
0: 4}, {3233.0: 4}, {3251.0: 4}, {3258.0: 4}, {3287.0: 4}, {3290.0: 4}, {3320.0: 4},
{3329.0: 4}, {3342.0: 4}, {3354.0: 4}, {3370.0: 4}, {3378.0: 4}, {3386.0: 4}, {3391.
0: 4}, {3392.0: 4}, {3393.0: 4}, {3432.0: 4}, {3437.0: 4}, {3463.0: 4}, {3490.0: 4},
{3498.0: 4}, {3534.0: 4}, {3539.0: 4}, {3545.0: 4}, {3547.0: 4}, {3574.0: 4}, {3589.
0: 4}, {3613.0: 4}, {3618.0: 4}, {3637.0: 4}, {3652.0: 4}, {3661.0: 4}, {3674.0: 4},
{3721.0: 4}, {3734.0: 4}, {3737.0: 4}, {3740.0: 4}, {3754.0: 4}, {3764.0: 4}, {3776.
0: 4}, {3778.0: 4}, {3781.0: 4}, {3783.0: 4}, {3788.0: 4}, {3805.0: 4}, {3811.0: 4},
{3826.0: 4}, {3834.0: 4}, {3835.0: 4}, {3850.0: 4}, {3856.0: 4}, {3857.0: 4}, {3865.
0: 4}, {3871.0: 4}, {3926.0: 4}, {3935.0: 5}, {3952.0: 5}, {3969.0: 5}, {3976.0: 5},
{4021.0: 5}, {4026.0: 5}, {4031.0: 5}, {4044.0: 5}, {4066.0: 5}, {4089.0: 5}, {4091.
0: 5}, {4165.0: 5}, {4178.0: 5}, {4234.0: 5}, {4248.0: 5}, {4255.0: 5}, {4259.0: 5},
{4292.0: 5}, {4296.0: 5}, {4304.0: 5}, {4305.0: 5}, {4311.0: 5}, {4319.0: 5}, {4329.
0: 5}, {4335.0: 5}, {4349.0: 5}, {4356.0: 5}, {4375.0: 5}, {4388.0: 5}, {4429.0: 5},
{4435.0: 5}, {4436.0: 5}, {4438.0: 5}, {4454.0: 5}, {4475.0: 5}, {4496.0: 5}, {4513.
0: 5}, {4521.0: 5}, {4533.0: 5}, {4547.0: 5}, {4550.0: 5}, {4559.0: 5}, {4573.0: 5},
{4599.0: 5}, {4608.0: 5}, {4609.0: 5}, {4611.0: 5}, {4687.0: 5}, {4691.0: 5}, {4735.
0: 5}, {4737.0: 5}, {4740.0: 5}, {4750.0: 5}, {4752.0: 5}, {4797.0: 5}, {4805.0: 5},
{4808.0: 5}, {4822.0: 5}, {4840.0: 5}, {4863.0: 5}, {4867.0: 5}, {4869.0: 5}, {4870.
0: 5}, {4897.0: 5}, {4936.0: 5}, {4937.0: 5}, {4940.0: 5}, {4947.0: 5}, {4951.0: 5},
{4965.0: 5}, {5002.0: 5}, {5013.0: 5}, {5016.0: 5}, {5038.0: 5}, {5039.0: 5}, {5041.
0: 5}, {5067.0: 5}, {5083.0: 5}, {5085.0: 5}, {5097.0: 6}, {5133.0: 6}, {5156.0: 6},
{5179.0: 6}, {5188.0: 6}, {5209.0: 6}, {5234.0: 6}, {5265.0: 6}, {5281.0: 6}, {5282.
0: 6}, {5297.0: 6}, {5309.0: 6}, {5325.0: 6}, {5331.0: 6}, {5333.0: 6}, {5347.0: 6},
{5354.0: 6}, {5360.0: 6}, {5368.0: 6}, {5399.0: 6}, {5416.0: 6}, {5439.0: 6}, {5461.
0: 6}, {5479.0: 6}, {5484.0: 6}, {5496.0: 6}, {5521.0: 6}, {5523.0: 6}, {5651.0: 6},
{5813.0: 6}, {5982.0: 6}, {6557.0: 6}, {8359.0: 6}, {8463.0: 6}, {8576.0: 6}, {8745.
0: 6}, {8809.0: 6}, {9082.0: 6}, {9531.0: 6}, {9541.0: 6}, {9620.0: 6}, {9666.0: 6},
{9764.0: 6}, {9784.0: 6}, {9809.0: 6}, {9810.0: 6}, {9818.0: 6}, {9828.0: 6}, {9829.
0: 6}, {10121.0: 6}, {10122.0: 6}, {10128.0: 6}, {10646.0: 6}, {10650.0: 6}, {10675.
0: 6}, {10737.0: 6}, {10739.0: 6}, {10741.0: 6}, {10758.0: 6}, {10765.0: 6}, {10776.
0: 6}, {10800.0: 6}, {10912.0: 6}, {10955.0: 6}, {11741.0: 6}, {11763.0: 6}, {11794.
0: 6}, {11806.0: 6}, {11808.0: 6}, {11811.0: 6}, {11814.0: 6}, {11838.0: 6}, {11857.
0: 6}, {11948.0: 6}, {11963.0: 6}, {12978.0: 6}, {13088.0: 6}, {13108.0: 6}, {13200.
0: 6}, {13335.0: 7}, {13704.0: 7}, {13757.0: 7}, {13899.0: 7}, {13983.0: 7}, {14061.
0: 7}, {14118.0: 7}, {14485.0: 7}, {14849.0: 7}, {15814.0: 7}, {15837.0: 7}, {15893.
0: 7}, {15999.0: 7}, {16120.0: 7}, {16133.0: 7}, {16136.0: 7}, {16149.0: 7}, {16150.
0: 7}, {16262.0: 7}, {16415.0: 7}, {16475.0: 7}, {16508.0: 7}, {16615.0: 7}, {16624.
0: 7}, {16660.0: 7}, {16707.0: 7}, {16725.0: 7}, {16726.0: 7}, {16844.0: 7}, {16882.
0: 7}, {16942.0: 7}, {16960.0: 7}, {16963.0: 7}, {17023.0: 7}, {17083.0: 7}, {17094.
0: 7}, {17095.0: 7}, {17099.0: 7}, {17408.0: 7}, {17519.0: 7}, {17675.0: 7}, {17885.
0: 7}, {17891.0: 7}, {18169.0: 7}, {18232.0: 7}, {18529.0: 7}, {18543.0: 7}, {18553.
0: 7}, {18700.0: 7}, {18732.0: 7}, {18825.0: 7}, {18828.0: 7}, {18944.0: 7}, {18946.
0: 7}, {19016.0: 7}, {19305.0: 7}, {19582.0: 7}, {19610.0: 7}, {19676.0: 7}, {19804.
0: 7}, {19810.0: 7}, {19944.0: 7}, {20004.0: 7}, {20047.0: 7}, {20160.0: 7}, {20270.
0: 7}, {20565.0: 7}, {20577.0: 7}, {20686.0: 7}, {20710.0: 7}, {20963.0: 7}, {20976.
0: 7}, {21012.0: 7}]

In [ ]:

More Related Content

What's hot

CSS3 notes
CSS3 notesCSS3 notes
CSS3 notes
Rex Wang
World of CSS Grid
World of CSS GridWorld of CSS Grid
World of CSS Grid
Elad Shechter
Window functions in MySQL 8.0
Window functions in MySQL 8.0Window functions in MySQL 8.0
Window functions in MySQL 8.0
MySQL: Indexing for Better Performance
MySQL: Indexing for Better PerformanceMySQL: Indexing for Better Performance
MySQL: Indexing for Better Performance
Oracle basic queries
Oracle basic queriesOracle basic queries
Oracle basic queries
HTML and CSS.pptx
HTML and CSS.pptxHTML and CSS.pptx
HTML and CSS.pptx
Css Display Property
Css Display PropertyCss Display Property
Css Display Property
Webtech Learning
Norvald Ryeng
How to Analyze and Tune MySQL Queries for Better Performance
How to Analyze and Tune MySQL Queries for Better PerformanceHow to Analyze and Tune MySQL Queries for Better Performance
How to Analyze and Tune MySQL Queries for Better Performance
Practica de la Sesión 3: Programación de Códigos de PHP
Practica de la Sesión 3: Programación de Códigos de PHPPractica de la Sesión 3: Programación de Códigos de PHP
Practica de la Sesión 3: Programación de Códigos de PHP
Google scholar
Google scholarGoogle scholar
Google scholar
Bubble in link list
Bubble in link listBubble in link list
Bubble in link list
university of Gujrat, pakistan
How to Analyze and Tune MySQL Queries for Better Performance
How to Analyze and Tune MySQL Queries for Better PerformanceHow to Analyze and Tune MySQL Queries for Better Performance
How to Analyze and Tune MySQL Queries for Better Performance
07 Using Oracle-Supported Package in Application Development
07 Using Oracle-Supported Package in Application Development07 Using Oracle-Supported Package in Application Development
07 Using Oracle-Supported Package in Application Development

What's hot (20)

CSS3 notes
CSS3 notesCSS3 notes
CSS3 notes
World of CSS Grid
World of CSS GridWorld of CSS Grid
World of CSS Grid
Window functions in MySQL 8.0
Window functions in MySQL 8.0Window functions in MySQL 8.0
Window functions in MySQL 8.0
MySQL: Indexing for Better Performance
MySQL: Indexing for Better PerformanceMySQL: Indexing for Better Performance
MySQL: Indexing for Better Performance
Oracle basic queries
Oracle basic queriesOracle basic queries
Oracle basic queries
HTML and CSS.pptx
HTML and CSS.pptxHTML and CSS.pptx
HTML and CSS.pptx
RESTful API Design, Second Edition
RESTful API Design, Second EditionRESTful API Design, Second Edition
RESTful API Design, Second Edition
Css Display Property
Css Display PropertyCss Display Property
Css Display Property
How to Analyze and Tune MySQL Queries for Better Performance
How to Analyze and Tune MySQL Queries for Better PerformanceHow to Analyze and Tune MySQL Queries for Better Performance
How to Analyze and Tune MySQL Queries for Better Performance
Introduction to PHP
Introduction to PHPIntroduction to PHP
Introduction to PHP
Practica de la Sesión 3: Programación de Códigos de PHP
Practica de la Sesión 3: Programación de Códigos de PHPPractica de la Sesión 3: Programación de Códigos de PHP
Practica de la Sesión 3: Programación de Códigos de PHP
Google scholar
Google scholarGoogle scholar
Google scholar
Bubble in link list
Bubble in link listBubble in link list
Bubble in link list
How to Analyze and Tune MySQL Queries for Better Performance
How to Analyze and Tune MySQL Queries for Better PerformanceHow to Analyze and Tune MySQL Queries for Better Performance
How to Analyze and Tune MySQL Queries for Better Performance
07 Using Oracle-Supported Package in Application Development
07 Using Oracle-Supported Package in Application Development07 Using Oracle-Supported Package in Application Development
07 Using Oracle-Supported Package in Application Development

Similar to Assignment7.pdf

node.js and the AR.Drone: building a real-time dashboard using
node.js and the AR.Drone: building a real-time dashboard using socket.ionode.js and the AR.Drone: building a real-time dashboard using
node.js and the AR.Drone: building a real-time dashboard using
Steven Beeckman
MySQL flexible schema and JSON for Internet of Things
MySQL flexible schema and JSON for Internet of ThingsMySQL flexible schema and JSON for Internet of Things
MySQL flexible schema and JSON for Internet of Things
Alexander Rubin
Cdr stats-vo ip-analytics_solution_mongodb_meetup
Cdr stats-vo ip-analytics_solution_mongodb_meetupCdr stats-vo ip-analytics_solution_mongodb_meetup
Cdr stats-vo ip-analytics_solution_mongodb_meetup
CDR-Stats : VoIP Analytics Solution for Asterisk and FreeSWITCH with MongoDB
CDR-Stats : VoIP Analytics Solution for Asterisk and FreeSWITCH with MongoDBCDR-Stats : VoIP Analytics Solution for Asterisk and FreeSWITCH with MongoDB
CDR-Stats : VoIP Analytics Solution for Asterisk and FreeSWITCH with MongoDB
Areski Belaid
The Ring programming language version 1.9 book - Part 53 of 210
The Ring programming language version 1.9 book - Part 53 of 210The Ring programming language version 1.9 book - Part 53 of 210
The Ring programming language version 1.9 book - Part 53 of 210
Mahmoud Samir Fayed
Is HTML5 Ready? (workshop)
Is HTML5 Ready? (workshop)Is HTML5 Ready? (workshop)
Is HTML5 Ready? (workshop)Remy Sharp
Is html5-ready-workshop-110727181512-phpapp02
Is html5-ready-workshop-110727181512-phpapp02Is html5-ready-workshop-110727181512-phpapp02
Is html5-ready-workshop-110727181512-phpapp02PL dream
10 Excellent Ways to Secure Your Spring Boot Application - Devoxx Morocco 2019
10 Excellent Ways to Secure Your Spring Boot Application - Devoxx Morocco 201910 Excellent Ways to Secure Your Spring Boot Application - Devoxx Morocco 2019
10 Excellent Ways to Secure Your Spring Boot Application - Devoxx Morocco 2019
Matt Raible
Version1.0 StartHTML000000232 EndHTML000065057 StartFragment0000.docx
Version1.0 StartHTML000000232 EndHTML000065057 StartFragment0000.docxVersion1.0 StartHTML000000232 EndHTML000065057 StartFragment0000.docx
Version1.0 StartHTML000000232 EndHTML000065057 StartFragment0000.docx
How to Hack a Road Trip with a Webcam, a GSP and Some Fun with Node
How to Hack a Road Trip  with a Webcam, a GSP and Some Fun with NodeHow to Hack a Road Trip  with a Webcam, a GSP and Some Fun with Node
How to Hack a Road Trip with a Webcam, a GSP and Some Fun with Node
Web+GISという視点から見たGISの方向性Hidenori Fujimura
Exploring Canvas
Exploring CanvasExploring Canvas
Exploring CanvasKevin Hoyt
The Ring programming language version 1.5.2 book - Part 52 of 181
The Ring programming language version 1.5.2 book - Part 52 of 181The Ring programming language version 1.5.2 book - Part 52 of 181
The Ring programming language version 1.5.2 book - Part 52 of 181
Mahmoud Samir Fayed
Bonnes pratiques de développement avec Node js
Bonnes pratiques de développement avec Node jsBonnes pratiques de développement avec Node js
Bonnes pratiques de développement avec Node js
Francois Zaninotto
The Ring programming language version 1.4.1 book - Part 13 of 31
The Ring programming language version 1.4.1 book - Part 13 of 31The Ring programming language version 1.4.1 book - Part 13 of 31
The Ring programming language version 1.4.1 book - Part 13 of 31
Mahmoud Samir Fayed
Detection of errors and potential vulnerabilities in C and C++ code using the...
Detection of errors and potential vulnerabilities in C and C++ code using the...Detection of errors and potential vulnerabilities in C and C++ code using the...
Detection of errors and potential vulnerabilities in C and C++ code using the...
Andrey Karpov
The Ring programming language version 1.10 book - Part 54 of 212
The Ring programming language version 1.10 book - Part 54 of 212The Ring programming language version 1.10 book - Part 54 of 212
The Ring programming language version 1.10 book - Part 54 of 212
Mahmoud Samir Fayed
AWS IoTで家庭内IoTをやってみた【JAWS DAYS 2016】
AWS IoTで家庭内IoTをやってみた【JAWS DAYS 2016】AWS IoTで家庭内IoTをやってみた【JAWS DAYS 2016】
AWS IoTで家庭内IoTをやってみた【JAWS DAYS 2016】

Similar to Assignment7.pdf (20)

node.js and the AR.Drone: building a real-time dashboard using
node.js and the AR.Drone: building a real-time dashboard using socket.ionode.js and the AR.Drone: building a real-time dashboard using
node.js and the AR.Drone: building a real-time dashboard using
MySQL flexible schema and JSON for Internet of Things
MySQL flexible schema and JSON for Internet of ThingsMySQL flexible schema and JSON for Internet of Things
MySQL flexible schema and JSON for Internet of Things
Cdr stats-vo ip-analytics_solution_mongodb_meetup
Cdr stats-vo ip-analytics_solution_mongodb_meetupCdr stats-vo ip-analytics_solution_mongodb_meetup
Cdr stats-vo ip-analytics_solution_mongodb_meetup
CDR-Stats : VoIP Analytics Solution for Asterisk and FreeSWITCH with MongoDB
CDR-Stats : VoIP Analytics Solution for Asterisk and FreeSWITCH with MongoDBCDR-Stats : VoIP Analytics Solution for Asterisk and FreeSWITCH with MongoDB
CDR-Stats : VoIP Analytics Solution for Asterisk and FreeSWITCH with MongoDB
The Ring programming language version 1.9 book - Part 53 of 210
The Ring programming language version 1.9 book - Part 53 of 210The Ring programming language version 1.9 book - Part 53 of 210
The Ring programming language version 1.9 book - Part 53 of 210
Is HTML5 Ready? (workshop)
Is HTML5 Ready? (workshop)Is HTML5 Ready? (workshop)
Is HTML5 Ready? (workshop)
Is html5-ready-workshop-110727181512-phpapp02
Is html5-ready-workshop-110727181512-phpapp02Is html5-ready-workshop-110727181512-phpapp02
Is html5-ready-workshop-110727181512-phpapp02
10 Excellent Ways to Secure Your Spring Boot Application - Devoxx Morocco 2019
10 Excellent Ways to Secure Your Spring Boot Application - Devoxx Morocco 201910 Excellent Ways to Secure Your Spring Boot Application - Devoxx Morocco 2019
10 Excellent Ways to Secure Your Spring Boot Application - Devoxx Morocco 2019
Version1.0 StartHTML000000232 EndHTML000065057 StartFragment0000.docx
Version1.0 StartHTML000000232 EndHTML000065057 StartFragment0000.docxVersion1.0 StartHTML000000232 EndHTML000065057 StartFragment0000.docx
Version1.0 StartHTML000000232 EndHTML000065057 StartFragment0000.docx
How to Hack a Road Trip with a Webcam, a GSP and Some Fun with Node
How to Hack a Road Trip  with a Webcam, a GSP and Some Fun with NodeHow to Hack a Road Trip  with a Webcam, a GSP and Some Fun with Node
How to Hack a Road Trip with a Webcam, a GSP and Some Fun with Node
Exploring Canvas
Exploring CanvasExploring Canvas
Exploring Canvas
The Ring programming language version 1.5.2 book - Part 52 of 181
The Ring programming language version 1.5.2 book - Part 52 of 181The Ring programming language version 1.5.2 book - Part 52 of 181
The Ring programming language version 1.5.2 book - Part 52 of 181
Bonnes pratiques de développement avec Node js
Bonnes pratiques de développement avec Node jsBonnes pratiques de développement avec Node js
Bonnes pratiques de développement avec Node js
The Ring programming language version 1.4.1 book - Part 13 of 31
The Ring programming language version 1.4.1 book - Part 13 of 31The Ring programming language version 1.4.1 book - Part 13 of 31
The Ring programming language version 1.4.1 book - Part 13 of 31
Detection of errors and potential vulnerabilities in C and C++ code using the...
Detection of errors and potential vulnerabilities in C and C++ code using the...Detection of errors and potential vulnerabilities in C and C++ code using the...
Detection of errors and potential vulnerabilities in C and C++ code using the...
The Ring programming language version 1.10 book - Part 54 of 212
The Ring programming language version 1.10 book - Part 54 of 212The Ring programming language version 1.10 book - Part 54 of 212
The Ring programming language version 1.10 book - Part 54 of 212
AWS IoTで家庭内IoTをやってみた【JAWS DAYS 2016】
AWS IoTで家庭内IoTをやってみた【JAWS DAYS 2016】AWS IoTで家庭内IoTをやってみた【JAWS DAYS 2016】
AWS IoTで家庭内IoTをやってみた【JAWS DAYS 2016】

More from dash41

Assignment 6.3.pdf
Assignment 6.3.pdfAssignment 6.3.pdf
Assignment 6.3.pdf
Assignment 6.2a.pdf
Assignment 6.2a.pdfAssignment 6.2a.pdf
Assignment 6.2a.pdf
Assignment 6.1.pdf
Assignment 6.1.pdfAssignment 6.1.pdf
Assignment 6.1.pdf
Assignment 5.3.pdf
Assignment 5.3.pdfAssignment 5.3.pdf
Assignment 5.3.pdf
Assignment 5.2.pdf
Assignment 5.2.pdfAssignment 5.2.pdf
Assignment 5.2.pdf
Assignment 5.1.pdf
Assignment 5.1.pdfAssignment 5.1.pdf
Assignment 5.1.pdf
Assignment 3.pdf
Assignment 3.pdfAssignment 3.pdf
Assignment 3.pdf

More from dash41 (9)

Assignment 6.3.pdf
Assignment 6.3.pdfAssignment 6.3.pdf
Assignment 6.3.pdf
Assignment 6.2a.pdf
Assignment 6.2a.pdfAssignment 6.2a.pdf
Assignment 6.2a.pdf
Assignment 6.1.pdf
Assignment 6.1.pdfAssignment 6.1.pdf
Assignment 6.1.pdf
Assignment 5.3.pdf
Assignment 5.3.pdfAssignment 5.3.pdf
Assignment 5.3.pdf
Assignment 5.2.pdf
Assignment 5.2.pdfAssignment 5.2.pdf
Assignment 5.2.pdf
Assignment 5.1.pdf
Assignment 5.1.pdfAssignment 5.1.pdf
Assignment 5.1.pdf
Assignment 3.pdf
Assignment 3.pdfAssignment 3.pdf
Assignment 3.pdf

Recently uploaded

Predicting Product Ad Campaign Performance: A Data Analysis Project Presentation
Predicting Product Ad Campaign Performance: A Data Analysis Project PresentationPredicting Product Ad Campaign Performance: A Data Analysis Project Presentation
Predicting Product Ad Campaign Performance: A Data Analysis Project Presentation
Boston Institute of Analytics
Best best suvichar in gujarati english meaning of this sentence as Silk road ...
Best best suvichar in gujarati english meaning of this sentence as Silk road ...Best best suvichar in gujarati english meaning of this sentence as Silk road ...
Best best suvichar in gujarati english meaning of this sentence as Silk road ...
Empowering Data Analytics Ecosystem.pptx
Empowering Data Analytics Ecosystem.pptxEmpowering Data Analytics Ecosystem.pptx
Empowering Data Analytics Ecosystem.pptx
Sample_Global Non-invasive Prenatal Testing (NIPT) Market, 2019-2030.pdf
Sample_Global Non-invasive Prenatal Testing (NIPT) Market, 2019-2030.pdfSample_Global Non-invasive Prenatal Testing (NIPT) Market, 2019-2030.pdf
Sample_Global Non-invasive Prenatal Testing (NIPT) Market, 2019-2030.pdf
Criminal IP - Threat Hunting Webinar.pdf
Criminal IP - Threat Hunting Webinar.pdfCriminal IP - Threat Hunting Webinar.pdf
Criminal IP - Threat Hunting Webinar.pdf
Criminal IP
standardisation of garbhpala offhgfffghh
standardisation of garbhpala offhgfffghhstandardisation of garbhpala offhgfffghh
standardisation of garbhpala offhgfffghh
Criminal IP - Threat Hunting Webinar.pdf
Criminal IP - Threat Hunting Webinar.pdfCriminal IP - Threat Hunting Webinar.pdf
Criminal IP - Threat Hunting Webinar.pdf
Criminal IP
Q1’2024 Update: MYCI’s Leap Year Rebound
Q1’2024 Update: MYCI’s Leap Year ReboundQ1’2024 Update: MYCI’s Leap Year Rebound
Q1’2024 Update: MYCI’s Leap Year Rebound
1.Seydhcuxhxyxhccuuxuxyxyxmisolids 2019.pptx
1.Seydhcuxhxyxhccuuxuxyxyxmisolids 2019.pptx1.Seydhcuxhxyxhccuuxuxyxyxmisolids 2019.pptx
1.Seydhcuxhxyxhccuuxuxyxyxmisolids 2019.pptx
Opendatabay - Open Data Marketplace.pptx
Opendatabay - Open Data Marketplace.pptxOpendatabay - Open Data Marketplace.pptx
Opendatabay - Open Data Marketplace.pptx
Algorithmic optimizations for Dynamic Levelwise PageRank (from STICD) : SHORT...
Algorithmic optimizations for Dynamic Levelwise PageRank (from STICD) : SHORT...Algorithmic optimizations for Dynamic Levelwise PageRank (from STICD) : SHORT...
Algorithmic optimizations for Dynamic Levelwise PageRank (from STICD) : SHORT...
Subhajit Sahu

Recently uploaded (20)

Predicting Product Ad Campaign Performance: A Data Analysis Project Presentation
Predicting Product Ad Campaign Performance: A Data Analysis Project PresentationPredicting Product Ad Campaign Performance: A Data Analysis Project Presentation
Predicting Product Ad Campaign Performance: A Data Analysis Project Presentation
Best best suvichar in gujarati english meaning of this sentence as Silk road ...
Best best suvichar in gujarati english meaning of this sentence as Silk road ...Best best suvichar in gujarati english meaning of this sentence as Silk road ...
Best best suvichar in gujarati english meaning of this sentence as Silk road ...
Empowering Data Analytics Ecosystem.pptx
Empowering Data Analytics Ecosystem.pptxEmpowering Data Analytics Ecosystem.pptx
Empowering Data Analytics Ecosystem.pptx
Sample_Global Non-invasive Prenatal Testing (NIPT) Market, 2019-2030.pdf
Sample_Global Non-invasive Prenatal Testing (NIPT) Market, 2019-2030.pdfSample_Global Non-invasive Prenatal Testing (NIPT) Market, 2019-2030.pdf
Sample_Global Non-invasive Prenatal Testing (NIPT) Market, 2019-2030.pdf
Criminal IP - Threat Hunting Webinar.pdf
Criminal IP - Threat Hunting Webinar.pdfCriminal IP - Threat Hunting Webinar.pdf
Criminal IP - Threat Hunting Webinar.pdf
standardisation of garbhpala offhgfffghh
standardisation of garbhpala offhgfffghhstandardisation of garbhpala offhgfffghh
standardisation of garbhpala offhgfffghh
Criminal IP - Threat Hunting Webinar.pdf
Criminal IP - Threat Hunting Webinar.pdfCriminal IP - Threat Hunting Webinar.pdf
Criminal IP - Threat Hunting Webinar.pdf
Q1’2024 Update: MYCI’s Leap Year Rebound
Q1’2024 Update: MYCI’s Leap Year ReboundQ1’2024 Update: MYCI’s Leap Year Rebound
Q1’2024 Update: MYCI’s Leap Year Rebound
1.Seydhcuxhxyxhccuuxuxyxyxmisolids 2019.pptx
1.Seydhcuxhxyxhccuuxuxyxyxmisolids 2019.pptx1.Seydhcuxhxyxhccuuxuxyxyxmisolids 2019.pptx
1.Seydhcuxhxyxhccuuxuxyxyxmisolids 2019.pptx
Opendatabay - Open Data Marketplace.pptx
Opendatabay - Open Data Marketplace.pptxOpendatabay - Open Data Marketplace.pptx
Opendatabay - Open Data Marketplace.pptx
Algorithmic optimizations for Dynamic Levelwise PageRank (from STICD) : SHORT...
Algorithmic optimizations for Dynamic Levelwise PageRank (from STICD) : SHORT...Algorithmic optimizations for Dynamic Levelwise PageRank (from STICD) : SHORT...
Algorithmic optimizations for Dynamic Levelwise PageRank (from STICD) : SHORT...


  • 1. Assignment 7.1.a pyarrow.Table airline: struct<airline_id: int64, name: string, alias: string, iata: string, icao: string, callsign: string, country: string, active: bool> child 0, airline_id: int64 child 1, name: string child 2, alias: string child 3, iata: string In [93]: import os import json from pathlib import Path import gzip import hashlib import shutil import pandas as pd import pygeohash import s3fs endpoint_url='' current_dir = Path(os.getcwd()).absolute() results_dir = current_dir.joinpath('results') if results_dir.exists(): shutil.rmtree(results_dir) results_dir.mkdir(parents=True, exist_ok=True) def read_jsonl_data(): s3 = s3fs.S3FileSystem( anon=True, client_kwargs={ 'endpoint_url': endpoint_url } ) src_data_path = 'data/processed/openflights/routes.jsonl.gz' with, 'rb') as f_gz: with, 'rb') as f: records = [json.loads(line) for line in f.readlines()] return records In [94]: from pyarrow.json import read_json import pyarrow.parquet as pq def create_parquet_dataset(): src_data_path = 'data/processed/openflights/routes.jsonl.gz' parquet_output_path = results_dir.joinpath('routes.parquet') s3 = s3fs.S3FileSystem( anon=True, client_kwargs={ 'endpoint_url': endpoint_url } ) with, 'rb') as f_gz: with, 'rb') as f: ## TODO: Use Apache Arrow to create Parquet table and save the dataset table = read_json(f) print(table) pq.write_table(table, parquet_output_path, compression='none') create_parquet_dataset()
  • 2. child 4, icao: string child 5, callsign: string child 6, country: string child 7, active: bool src_airport: struct<airport_id: int64, name: string, city: string, country: string, iata: string, icao: string, latitude: double, longitude: double, altitude: int64, ti mezone: double, dst: string, tz_id: string, type: string, source: string> child 0, airport_id: int64 child 1, name: string child 2, city: string child 3, country: string child 4, iata: string child 5, icao: string child 6, latitude: double child 7, longitude: double child 8, altitude: int64 child 9, timezone: double child 10, dst: string child 11, tz_id: string child 12, type: string child 13, source: string dst_airport: struct<airport_id: int64, name: string, city: string, country: string, iata: string, icao: string, latitude: double, longitude: double, altitude: int64, ti mezone: double, dst: string, tz_id: string, type: string, source: string> child 0, airport_id: int64 child 1, name: string child 2, city: string child 3, country: string child 4, iata: string child 5, icao: string child 6, latitude: double child 7, longitude: double child 8, altitude: int64 child 9, timezone: double child 10, dst: string child 11, tz_id: string child 12, type: string child 13, source: string codeshare: bool equipment: list<item: string> child 0, item: string ['codeshare', 'equipment', 'airline.airline_id', '', 'airline.alias', 'a irline.iata', 'airline.icao', 'airline.callsign', '', 'airline.activ e', 'src_airport.airport_id', '', '', 'src_airport.c ountry', 'src_airport.iata', 'src_airport.icao', 'src_airport.latitude', 'src_airpor t.longitude', 'src_airport.altitude', 'src_airport.timezone', 'src_airport.dst', 'sr c_airport.tz_id', 'src_airport.type', 'src_airport.source', 'dst_airport.airport_i d', '', '', '', 'dst_airport.iat a', 'dst_airport.icao', 'dst_airport.latitude', 'dst_airport.longitude', 'dst_airpor t.altitude', 'dst_airport.timezone', 'dst_airport.dst', 'dst_airport.tz_id', 'dst_ai rport.type', 'dst_airport.source'] In [95]: parquet_output_path = results_dir.joinpath('routes.parquet') pq = pd.read_parquet(parquet_output_path, engine='fastparquet') print(list(pq.columns.values)) In [96]: partitions = ( ('A', 'A'), ('B', 'B'), ('C', 'D'), ('E', 'F'), ('G', 'H'), ('I', 'J'), ('K', 'L'), ('M', 'M'), ('N', 'N'), ('O', 'P'), ('Q', 'R'), ('S', 'T'), ('U', 'U'), ('V', 'V'), ('W', 'X'), ('Y', 'Z') ) In [97]: partitions_keys = ( 'A', 'B', 'C-D', 'E-F', 'G-H', 'I-J', 'K-L', 'M', 'N', 'O-P', 'Q-R', 'S-T',
  • 3. {'A': ('A', 'A'), 'B': ('B', 'B'), 'C-D': ('C', 'D'), 'E-F': ('E', 'F'), 'G-H': ('G', 'H'), 'I-J': ('I', 'J'), 'K-L': ('K', 'L'), 'M': ('M', 'M'), 'N': ('N', 'N'), 'O-P': ('O', 'P'), 'Q-R': ('Q', 'R'), 'S-T': ('S', 'T'), 'U': ('U', 'U'), 'V': ('V', 'V'), 'W-X': ('W', 'X'), 'Y-Z': ('Y', 'Z')} Assignment 7.1.b codeshare equipment airline.airline_id airline.alias airline.iata airline.icao airline. 0 0.0 [CR2] 410.0 Aerocondor ANA All Nippon Airways 2B ARD AEROC 1 0.0 [CR2] 410.0 Aerocondor ANA All Nippon Airways 2B ARD AEROC 2 0.0 [CR2] 410.0 Aerocondor ANA All Nippon Airways 2B ARD AEROC 3 0.0 [CR2] 410.0 Aerocondor ANA All Nippon Airways 2B ARD AEROC 4 0.0 [CR2] 410.0 Aerocondor ANA All Nippon Airways 2B ARD AEROC 5 rows × 41 columns 'U', 'V', 'W-X', 'Y-Z' ) In [98]: parts_k_v = dict(zip(partitions_keys, partitions)) print(parts_k_v) In [99]: def get_key(val): for key, value in parts_k_v.items(): if val in value: return key return "0" In [100… pq['key'] = pq['src_airport.iata'] + pq['dst_airport.iata'] + pq['airline.iata'] pq['partition_value'] = pq['key'].str[:1] pq['kv_key'] = pq.apply(lambda x: get_key(x.partition_value), axis=1) In [101… # remove invalid keys pq = pq[pq.kv_key != "0"].astype('float32', errors='ignore') In [102… pq.head() Out[102… In [104… import pyarrow as pa import pyarrow.parquet as parpq pq_tab = pa.Table.from_pandas(pq) parpq.write_to_dataset( pq_tab, root_path=results_dir.joinpath('kv'), partition_cols=['kv_key'], )
  • 4. Assignment 7.1.c {'west': 'c21g6s0rs4c7', 'central': '9z7dnebnj8kb', 'east': 'dqby34cjw922'} Assignment 7.1.d In [105… import hashlib def hash_key(key): m = hashlib.sha256() m.update(str(key).encode('utf-8')) return m.hexdigest() In [106… pq['key'] = pq['src_airport.iata']+pq['dst_airport.iata']+pq['airline.iata'] pq['hashed'] = pq.apply(lambda x: hash_key(x.key), axis=1) pq['hash_key'] = pq['hashed'].str[:1] In [107… pq_tab1 = pa.Table.from_pandas(pq) parpq.write_to_dataset( pq_tab1, root_path=results_dir.joinpath('hash'), partition_cols=['hash_key'], ) In [109… #get hash for datacenters datacenters = {} datacenters['west'] = pygeohash.encode(45.5945645, -121.1786823) datacenters['central'] = pygeohash.encode(41.1544433, -96.0422378) datacenters['east'] = pygeohash.encode(39.08344, -77.6497145) print(datacenters) In [110… def closest_datacenter(latitude, longitude): geohash = pygeohash.encode(latitude, longitude) dist_dict = {} closest_datacenter = '' last_distance = None for key, value in datacenters.items(): dist = pygeohash.geohash_approximate_distance(str(geohash), str(value)) dist_dict[key] = dist if (last_distance == None) or (dist < last_distance): closest_datacenter = key last_distance = dist return closest_datacenter In [113… pq['datacenter'] = pq[['src_airport.latitude', 'src_airport.longitude']].apply(lambd In [114… pq_tab2 = pa.Table.from_pandas(pq) parpq.write_to_dataset( pq_tab2, root_path=results_dir.joinpath('geo'), partition_cols=['datacenter'], )
  • 5. 0 410.0 1 410.0 2 410.0 3 410.0 4 410.0 Name: airline.airline_id, dtype: float32 [{-1.0: 1}, {10.0: 1}, {21.0: 1}, {24.0: 1}, {28.0: 1}, {29.0: 1}, {32.0: 1}, {35.0: 1}, {42.0: 1}, {43.0: 1}, {55.0: 1}, {68.0: 1}, {83.0: 1}, {90.0: 1}, {96.0: 1}, {10 6.0: 1}, {109.0: 1}, {116.0: 1}, {125.0: 1}, {130.0: 1}, {132.0: 1}, {137.0: 1}, {13 9.0: 1}, {146.0: 1}, {153.0: 1}, {179.0: 1}, {197.0: 1}, {214.0: 1}, {218.0: 1}, {22 0.0: 1}, {221.0: 1}, {225.0: 1}, {231.0: 1}, {240.0: 1}, {241.0: 1}, {242.0: 1}, {24 6.0: 1}, {312.0: 1}, {316.0: 1}, {319.0: 1}, {321.0: 1}, {324.0: 1}, {328.0: 1}, {32 9.0: 1}, {330.0: 1}, {333.0: 1}, {336.0: 1}, {338.0: 1}, {341.0: 1}, {345.0: 1}, {38 6.0: 1}, {397.0: 1}, {410.0: 1}, {412.0: 1}, {426.0: 1}, {439.0: 1}, {442.0: 1}, {46 2.0: 1}, {470.0: 1}, {476.0: 1}, {477.0: 1}, {491.0: 1}, {502.0: 1}, {503.0: 1}, {50 8.0: 1}, {515.0: 1}, {524.0: 1}, {543.0: 1}, {563.0: 1}, {567.0: 1}, {569.0: 1}, {57 6.0: 1}, {595.0: 1}, {596.0: 1}, {603.0: 1}, {608.0: 1}, {622.0: 1}, {641.0: 1}, {68 3.0: 1}, {690.0: 2}, {692.0: 2}, {751.0: 2}, {753.0: 2}, {794.0: 2}, {807.0: 2}, {83 7.0: 2}, {879.0: 2}, {881.0: 2}, {882.0: 2}, {896.0: 2}, {897.0: 2}, {921.0: 2}, {97 0.0: 2}, {995.0: 2}, {998.0: 2}, {1006.0: 2}, {1008.0: 2}, {1034.0: 2}, {1048.0: 2}, {1057.0: 2}, {1066.0: 2}, {1073.0: 2}, {1109.0: 2}, {1173.0: 2}, {1191.0: 2}, {1203. 0: 2}, {1206.0: 2}, {1230.0: 2}, {1266.0: 2}, {1287.0: 2}, {1290.0: 2}, {1299.0: 2}, {1308.0: 2}, {1316.0: 2}, {1317.0: 2}, {1338.0: 2}, {1340.0: 2}, {1355.0: 2}, {1359. 0: 2}, {1392.0: 2}, {1401.0: 2}, {1403.0: 2}, {1422.0: 2}, {1434.0: 2}, {1441.0: 2}, {1463.0: 2}, {1469.0: 2}, {1472.0: 2}, {1478.0: 2}, {1492.0: 2}, {1500.0: 2}, {1508. 0: 2}, {1531.0: 2}, {1539.0: 2}, {1548.0: 2}, {1581.0: 2}, {1611.0: 2}, {1623.0: 2}, {1629.0: 2}, {1654.0: 2}, {1663.0: 2}, {1669.0: 2}, {1680.0: 2}, {1682.0: 2}, {1683. 0: 2}, {1729.0: 2}, {1750.0: 2}, {1756.0: 2}, {1758.0: 2}, {1767.0: 2}, {1769.0: 2}, {1775.0: 2}, {1790.0: 2}, {1792.0: 2}, {1829.0: 2}, {1844.0: 2}, {1868.0: 2}, {1886. 0: 2}, {1889.0: 3}, {1908.0: 3}, {1909.0: 3}, {1925.0: 3}, {1936.0: 3}, {1942.0: 3}, {1943.0: 3}, {1946.0: 3}, {1954.0: 3}, {1966.0: 3}, {2009.0: 3}, {2056.0: 3}, {2058. 0: 3}, {2091.0: 3}, {2094.0: 3}, {2104.0: 3}, {2117.0: 3}, {2143.0: 3}, {2150.0: 3}, {2183.0: 3}, {2193.0: 3}, {2217.0: 3}, {2218.0: 3}, {2220.0: 3}, {2222.0: 3}, {2226. 0: 3}, {2245.0: 3}, {2260.0: 3}, {2264.0: 3}, {2293.0: 3}, {2297.0: 3}, {2324.0: 3}, {2350.0: 3}, {2353.0: 3}, {2354.0: 3}, {2395.0: 3}, {2409.0: 3}, {2417.0: 3}, {2418. 0: 3}, {2419.0: 3}, {2420.0: 3}, {2421.0: 3}, {2439.0: 3}, {2468.0: 3}, {2520.0: 3}, {2524.0: 3}, {2538.0: 3}, {2541.0: 3}, {2547.0: 3}, {2548.0: 3}, {2575.0: 3}, {2585. 0: 3}, {2607.0: 3}, {2622.0: 3}, {2638.0: 3}, {2660.0: 3}, {2681.0: 3}, {2682.0: 3}, {2684.0: 3}, {2688.0: 3}, {2692.0: 3}, {2731.0: 3}, {2748.0: 3}, {2750.0: 3}, {2757. 0: 3}, {2765.0: 3}, {2773.0: 3}, {2774.0: 3}, {2822.0: 3}, {2825.0: 3}, {2826.0: 3}, {2835.0: 3}, {2850.0: 3}, {2857.0: 3}, {2881.0: 3}, {2896.0: 3}, {2916.0: 3}, {2922. 0: 3}, {2923.0: 3}, {2942.0: 4}, {2951.0: 4}, {2954.0: 4}, {2987.0: 4}, {2989.0: 4}, In [128… pq['airline.airline_id'].head() Out[128… In [133… def balance_partitions(keys, num_partitions): ac = keys.cumsum() #sum of the entire array partsum = ac[-1]//num_partitions #generates the cumulative sums of each part cum_part_sums = np.array(range(1,p))*partsum #finds the indices inds = np.searchsorted(ac,cum_part_sums) #split into approximately equal-sum arrays parts = np.split(arr,inds) return parts In [134… keys = list(pq['airline.airline_id']) num_partitions=7 In [135… print(balance_partitions(keys, num_partitions))
  • 6. {2990.0: 4}, {2993.0: 4}, {2994.0: 4}, {3000.0: 4}, {3021.0: 4}, {3026.0: 4}, {3029. 0: 4}, {3052.0: 4}, {3081.0: 4}, {3090.0: 4}, {3097.0: 4}, {3123.0: 4}, {3126.0: 4}, {3148.0: 4}, {3163.0: 4}, {3179.0: 4}, {3197.0: 4}, {3200.0: 4}, {3201.0: 4}, {3210. 0: 4}, {3233.0: 4}, {3251.0: 4}, {3258.0: 4}, {3287.0: 4}, {3290.0: 4}, {3320.0: 4}, {3329.0: 4}, {3342.0: 4}, {3354.0: 4}, {3370.0: 4}, {3378.0: 4}, {3386.0: 4}, {3391. 0: 4}, {3392.0: 4}, {3393.0: 4}, {3432.0: 4}, {3437.0: 4}, {3463.0: 4}, {3490.0: 4}, {3498.0: 4}, {3534.0: 4}, {3539.0: 4}, {3545.0: 4}, {3547.0: 4}, {3574.0: 4}, {3589. 0: 4}, {3613.0: 4}, {3618.0: 4}, {3637.0: 4}, {3652.0: 4}, {3661.0: 4}, {3674.0: 4}, {3721.0: 4}, {3734.0: 4}, {3737.0: 4}, {3740.0: 4}, {3754.0: 4}, {3764.0: 4}, {3776. 0: 4}, {3778.0: 4}, {3781.0: 4}, {3783.0: 4}, {3788.0: 4}, {3805.0: 4}, {3811.0: 4}, {3826.0: 4}, {3834.0: 4}, {3835.0: 4}, {3850.0: 4}, {3856.0: 4}, {3857.0: 4}, {3865. 0: 4}, {3871.0: 4}, {3926.0: 4}, {3935.0: 5}, {3952.0: 5}, {3969.0: 5}, {3976.0: 5}, {4021.0: 5}, {4026.0: 5}, {4031.0: 5}, {4044.0: 5}, {4066.0: 5}, {4089.0: 5}, {4091. 0: 5}, {4165.0: 5}, {4178.0: 5}, {4234.0: 5}, {4248.0: 5}, {4255.0: 5}, {4259.0: 5}, {4292.0: 5}, {4296.0: 5}, {4304.0: 5}, {4305.0: 5}, {4311.0: 5}, {4319.0: 5}, {4329. 0: 5}, {4335.0: 5}, {4349.0: 5}, {4356.0: 5}, {4375.0: 5}, {4388.0: 5}, {4429.0: 5}, {4435.0: 5}, {4436.0: 5}, {4438.0: 5}, {4454.0: 5}, {4475.0: 5}, {4496.0: 5}, {4513. 0: 5}, {4521.0: 5}, {4533.0: 5}, {4547.0: 5}, {4550.0: 5}, {4559.0: 5}, {4573.0: 5}, {4599.0: 5}, {4608.0: 5}, {4609.0: 5}, {4611.0: 5}, {4687.0: 5}, {4691.0: 5}, {4735. 0: 5}, {4737.0: 5}, {4740.0: 5}, {4750.0: 5}, {4752.0: 5}, {4797.0: 5}, {4805.0: 5}, {4808.0: 5}, {4822.0: 5}, {4840.0: 5}, {4863.0: 5}, {4867.0: 5}, {4869.0: 5}, {4870. 0: 5}, {4897.0: 5}, {4936.0: 5}, {4937.0: 5}, {4940.0: 5}, {4947.0: 5}, {4951.0: 5}, {4965.0: 5}, {5002.0: 5}, {5013.0: 5}, {5016.0: 5}, {5038.0: 5}, {5039.0: 5}, {5041. 0: 5}, {5067.0: 5}, {5083.0: 5}, {5085.0: 5}, {5097.0: 6}, {5133.0: 6}, {5156.0: 6}, {5179.0: 6}, {5188.0: 6}, {5209.0: 6}, {5234.0: 6}, {5265.0: 6}, {5281.0: 6}, {5282. 0: 6}, {5297.0: 6}, {5309.0: 6}, {5325.0: 6}, {5331.0: 6}, {5333.0: 6}, {5347.0: 6}, {5354.0: 6}, {5360.0: 6}, {5368.0: 6}, {5399.0: 6}, {5416.0: 6}, {5439.0: 6}, {5461. 0: 6}, {5479.0: 6}, {5484.0: 6}, {5496.0: 6}, {5521.0: 6}, {5523.0: 6}, {5651.0: 6}, {5813.0: 6}, {5982.0: 6}, {6557.0: 6}, {8359.0: 6}, {8463.0: 6}, {8576.0: 6}, {8745. 0: 6}, {8809.0: 6}, {9082.0: 6}, {9531.0: 6}, {9541.0: 6}, {9620.0: 6}, {9666.0: 6}, {9764.0: 6}, {9784.0: 6}, {9809.0: 6}, {9810.0: 6}, {9818.0: 6}, {9828.0: 6}, {9829. 0: 6}, {10121.0: 6}, {10122.0: 6}, {10128.0: 6}, {10646.0: 6}, {10650.0: 6}, {10675. 0: 6}, {10737.0: 6}, {10739.0: 6}, {10741.0: 6}, {10758.0: 6}, {10765.0: 6}, {10776. 0: 6}, {10800.0: 6}, {10912.0: 6}, {10955.0: 6}, {11741.0: 6}, {11763.0: 6}, {11794. 0: 6}, {11806.0: 6}, {11808.0: 6}, {11811.0: 6}, {11814.0: 6}, {11838.0: 6}, {11857. 0: 6}, {11948.0: 6}, {11963.0: 6}, {12978.0: 6}, {13088.0: 6}, {13108.0: 6}, {13200. 0: 6}, {13335.0: 7}, {13704.0: 7}, {13757.0: 7}, {13899.0: 7}, {13983.0: 7}, {14061. 0: 7}, {14118.0: 7}, {14485.0: 7}, {14849.0: 7}, {15814.0: 7}, {15837.0: 7}, {15893. 0: 7}, {15999.0: 7}, {16120.0: 7}, {16133.0: 7}, {16136.0: 7}, {16149.0: 7}, {16150. 0: 7}, {16262.0: 7}, {16415.0: 7}, {16475.0: 7}, {16508.0: 7}, {16615.0: 7}, {16624. 0: 7}, {16660.0: 7}, {16707.0: 7}, {16725.0: 7}, {16726.0: 7}, {16844.0: 7}, {16882. 0: 7}, {16942.0: 7}, {16960.0: 7}, {16963.0: 7}, {17023.0: 7}, {17083.0: 7}, {17094. 0: 7}, {17095.0: 7}, {17099.0: 7}, {17408.0: 7}, {17519.0: 7}, {17675.0: 7}, {17885. 0: 7}, {17891.0: 7}, {18169.0: 7}, {18232.0: 7}, {18529.0: 7}, {18543.0: 7}, {18553. 0: 7}, {18700.0: 7}, {18732.0: 7}, {18825.0: 7}, {18828.0: 7}, {18944.0: 7}, {18946. 0: 7}, {19016.0: 7}, {19305.0: 7}, {19582.0: 7}, {19610.0: 7}, {19676.0: 7}, {19804. 0: 7}, {19810.0: 7}, {19944.0: 7}, {20004.0: 7}, {20047.0: 7}, {20160.0: 7}, {20270. 0: 7}, {20565.0: 7}, {20577.0: 7}, {20686.0: 7}, {20710.0: 7}, {20963.0: 7}, {20976. 0: 7}, {21012.0: 7}] In [ ]: