More Related Content Similar to Python + Hive on AWS EMR で貧者のログサマリ (20) Python + Hive on AWS EMR で貧者のログサマリ 9. Quick Survey
Ø ؚٗⴓ匿חꟼגְ׃ׯ倯
Ø )BEPPQ⢪ג׃ׯ倯
Ø )JWF⢪ג׃ׯ倯
Ø .3⢪ג׃ׯ倯
14. Poorman's
Ø ➙֮植朐⯋ח
Ø 満ٔا٦أ ➂儗穗꿀
ד湡涸麦䧭ׅ㪦
Ø 湡涸麦䧭ׅأؾ٦س〳腉זꣲ♳־㪦
Ø 搀欽ז佄⳿鼘ֽ㪦
15. Kanmu Engineer Team
maki
CEO Engineer
_ideyuta
Designer
moqada
Engineer
_achiku
Engineer
爡ꞿ噟灇瑔涪
رؠ؎ٝؿٗٝز
أوم،فؚٔٗⴓ匿
ؿٗٝزغحؙؒٝس
؎ٝؿٓأوم،فٔ
غحؙؒٝس؎ٝؿٓ
ⴓ匿㛇湍ؚٗⴓ匿㼎ػ٦زش٦璞〡
17. 剢 ꬊ㖇簭
• ؟٦ؽأך䧭ꞿהⰟח㟓ִ鋅鴥
• 剢⽃⡘ד،سمحؙזؙؒٔ䫎־ְ
Ø 爡ⰻח㣐鋉垷ر٦ةⳢ椚ׅ濼鋅顕ְ
• չل٦أꂁⴓ׃♳דպ濼鋅顕
• 㢩鿇ח⳿׃חְֻإٝءذ؍ـזر٦ة㶷㖈
Ø 麊欽؝أزⴱ劍䫎项⡚ֻ䫇ְִ
20. AWS EMR
Ø 侧֮84؟٦ؽأךֲך♧א
Ø )BEPPQװ)BEPPQؒ؝ءأذيⰻך48ָر
ؿٕؓزדⵃ欽〳腉
Ø 1*ד饯⹛ծ+PCך㹋遤ծ⨡姺乼⡲〳腉
Ø ٌصةؚٔٝ瘝״׃זח㹋倵׃גֻ
Ø 4)%'4ך剏חⵃ欽〳腉
Ø ؙٓأةך〴侧㢌刿ָ㺁僒
21. Architecture
盖椚؟٦غ
ؙ٦هٝ
ꂁ⥋؟٦غ
ؙ٦هٝ
ꂁ⥋؟٦غ
• ꂁ⥋؟٦غ♳ך'MVFOUEדؚٗ꧊《
• VFOUETQMVHJOד ꧊׃ؚٗ
4♳ח⥂㶷
• .3♳ך)JWFדؚٗ⸇䊨ծ꧊鎘
• ꧊鎘⦼3%4ח⥂㶷׃ג〳鋔⻉
26. VFOUET
QMVHJOⵃ欽׃גؚٗ굲לׅ
Ø 굲לؚׅٗכِ٦ؠך،ؙءّٝ2VFSZ4USJOH
חろג굲לׅ
• 醱꧟ז+40/כ굲לׁ׆ծ2VFSZ4USJOHח䞔㜠鯹ׇ
• )JWFדך꧊鎘儗חⰋג+40/ח㢌䳔
• IUUQTFYBNQMFDPNCFBDPO TVCPCKDPVQPOBDUJPODMJDLDJE
Ø 'MVFOUE꧊秈؟٦غכⵃ欽׃זְ
• ٔ،ٕة؎ي꧊鎘ך䗳銲䚍כ植朐넝ֻזְ
• ⱔꞿ圓䧭罋ִילז׆醱꧟חז
• 4ך㸜㹀䠬חֶ⟣ׇ׃ְ
28. Store
Ø הִ֮׆4ח굲לׅ
Ø 4ךغ؛حزכ劤殢嗚鏾דⴓֽגֶֻ
• غ؛حز⽃⡘ד،ؙإأ؝ٝزٗ٦ٕ〳腉
• FYBNQMFDPNQSPEVDUJPOMPH
Ø ؟٦غ䕵ⶴⴽחؗ٦ⴓֽגֶֻ
• ⴽ؟٦غָ㟓ִג㸜䗰
• FYBNQMFDPNQSPEVDUJPOMPHBQJ
Ø 傈ⴽחؗ٦ⴓֽגֶֻ
• )JWFךػ٦ذ؍ءّٝⵃ欽ׅ捀
• FYBNQMFDPNQSPEVDUJPOMPHBQJEU
42. References
Ø AWS Amazon EMR Best Practices
• ؝ٖ铣ל荈ⴓ麦ך؝ٝذؙأزחさ.3圓䧭ַָկ
)BEPPQךⰅה׃ג葺ְךדכկ
Ø NJYJך鍑匿㛇湍הQBDIF)JWFדך+40/ػ٦؟
ך崞欽ך稱➜
• +40/ד顕ג7JFXדذ٦ـٕשֻ䪔ֲ،؎ر؍،顗կؚٗ
꧊鎘חꟼ➂麦ך؝ىُص؛٦ءّٝ؝أزծהְֲ嚊䙀顗կ
Ø Batch Processing and Stream Processing by SQL
• ֿךز٦ؙ耀ְגⴓ匿㛇湍ח.11禸ؒٝآٝⵃ欽ׅ✲寸䠐կ
*NQBMBה1SFTUP嫰鯰׃ծ4ח湫䱸ؙؒٔ䫎־1SFTUP㼪
Ⰵ׃կ *NQBMB如劍غ٦آّٝדכ4ח湫䱸ؙؒٔ䫎־׃
ְךדך儗חⱄ䏝嗚鏾✮㹀
45. use to do the following
awscli Execute
HiveQL
Execute
s3distcp
Config
Your EMR
Bootstrp
Presto
Create
Cluster
Metastr
Config
Python
Script
Create
Cluster
JobFlow
Mgmnt
from
Execute
HiveQL
EMR
46. use to do the following
awscli Execute
HiveQL
Execute
s3distcp
Config
Your EMR
Bootstrp
Presto
Create
Cluster
Metastr
Config
Python
Script
Create
Cluster
JobFlow
Mgmnt
from
Execute
HiveQL
EMR
47. awscli
Ø ٔٔ٦أך7FSַ.3堣腉ך1SFWJFX
أذ٦ةأָ《ծ兦ג㸜㹀׃1*ה׃גⵃ欽〳腉
Ø ➙תדرؿ؋ؙز3VCZךMBTUJD.BQ3FEVDFأؙ
ٔفزַ⛦䳔ִ
• QJQד知⽃ח؎ٝأز٦ٕדֹ
• ⟃ַBXTDMJ⢪גךדخ٦ٕ窟♧
• (JU)VC♳דך涪ָ崞涪ד13⳿ׇ
49. $ mkvirtualenv pycon-emr-dev
(pycon-emr-dev)$ pip install awscli
(pycon-emr-dev)$ mkdir ~/.awscli
(pycon-emr-dev)$ cat <<-EOF > ~/.awscli/config
[profile development]
aws_access_key_id=development_access_key
aws_secret_access_key=development_secret_key
region=ap-northeast-1
EOF
(pycon-emr-dev)$ cat <<-EOF >> $VIRTUAL_ENV/bin/activate
export AWS_CONFIG_FILE=~/.awscli/config
export AWS_DEFAULT_PROFILE=development
source aws_zsh_completer.sh
EOF
50. use to do the following
awscli Execute
HiveQL
Execute
s3distcp
Config
Your EMR
Bootstrp
Presto
Create
Cluster
Metastr
Config
Python
Script
Create
Cluster
JobFlow
Mgmnt
from
Execute
HiveQL
EMR
51. $ aws emr create-cluster --ami-version 3.1.1 \
    --name 'PyConJP 2014 (AMI 3.1.1 Hive)' \
    --tags Name=pycon-jp-emr environment=development \
    --ec2-attributes KeyName=yourkey \
    --log-uri 's3://yourbucket/jobflow_logs/' \
    --no-auto-terminate \
    --visible-to-all-users \
    --instance-groups file://./normal-instance-setup.json \
    --applications file://./app-hive.json
52. normal-instance-group.json
[
  {
    "Name": "emr-master",
    "InstanceGroupType": "MASTER",
    "InstanceCount": 1,
    "InstanceType": "m1.medium"
  },
  {
    "Name": "emr-core",
    "InstanceGroupType": "CORE",
    "InstanceCount": 2,
    "InstanceType": "m1.medium"
  }
]
app-hive.json
[
  {
    "Name": "HIVE"
  }
]
54. use to do the following
awscli Execute
HiveQL
Execute
s3distcp
Config
Your EMR
Bootstrp
Presto
Create
Cluster
Metastr
Config
Python
Script
Create
Cluster
JobFlow
Mgmnt
from
Execute
HiveQL
EMR
55. $ aws emr add-steps --cluster-id j-8xxxxxxxxx \
    --steps file://./hive-sample-step-1.json
56. hive-sample-step-1.json
[
  {
    "Args": [
      "-f", "s3n://yourbucket/hive-script/sample01.hql",
      "-d", "BUCKET_NAME=yourbucket",
      "-d", "TARGET_DATE=20140818"
    ],
    "ActionOnFailure": "CONTINUE",
    "Name": "Hive Sample Program 01",
    "Type": "HIVE"
  },
  {
    "Args": [
      "-f", "s3n://yourbucket/hive-script/sample02.hql",
      "-d", "BUCKET_NAME=yourbucket",
      "-d", "TARGET_DATE=20140818"
    ],
    "ActionOnFailure": "CONTINUE",
    "Name": "Hive Sample Program 02",
    "Type": "HIVE"
  }
]
57. use to do the following
awscli Execute
HiveQL
Execute
s3distcp
Config
Your EMR
Bootstrp
Presto
Create
Cluster
Metastr
Config
Python
Script
Create
Cluster
JobFlow
Mgmnt
from
Execute
HiveQL
EMR
58. $ aws emr add-steps --cluster-id j-8xxxxxxxxx \
    --steps file://./s3distcp-sample-step.json
59. s3distcp-sample-step.json
[
  {
    "Name": "s3distcp Sample",
    "ActionOnFailure": "CONTINUE",
    "Jar": "/home/hadoop/lib/emr-s3distcp-1.0.jar",
    "Type": "CUSTOM_JAR",
    "Args": [
      "--src", "s3n://yourbucket/access_log/dt=20140818",
      "--dest", "s3n://yourbucket/compressed_log/dt=20140818",
      "--groupBy", ".*(nginx_access_log-).*",
      "--targetSize", "100",
      "--outputCodec", "gzip"
    ]
  }
]
60. use to do the following
awscli Execute
HiveQL
Execute
s3distcp
Config
Your EMR
Bootstrp
Presto
Create
Cluster
Metastr
Config
Python
Script
Create
Cluster
JobFlow
Mgmnt
from
Execute
HiveQL
EMR
61. $ aws emr create-cluster --ami-version 3.1.1 \
    --name 'PyConJP 2014 (AMI 3.1.1 Hive)' \
    --tags Name=pycon-jp-emr environment=development \
    --ec2-attributes KeyName=yourkey \
    --log-uri 's3://yourbucket/jobflow_logs/' \
    --no-auto-terminate \
    --visible-to-all-users \
    --instance-groups file://./normal-instance-setup.json \
    --applications file://./app-hive-with-config.json
63. hive-site.xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <property>
    <name>hive.optimize.s3.query</name>
    <value>true</value>
    <description>Optimize query on S3</description>
  </property>
</configuration>
64. use to do the following
awscli Execute
HiveQL
Execute
s3distcp
Config
Your EMR
Bootstrp
Presto
Create
Cluster
Metastr
Config
Python
Script
Create
Cluster
JobFlow
Mgmnt
from
Execute
HiveQL
EMR
65. $ aws emr create-cluster --ami-version 3.1.1 \
    --name 'PyConJP 2014 (AMI 3.1.1 Hive + Presto)' \
    --tags Name=pycon-jp-emr environment=development \
    --ec2-attributes KeyName=yourkey \
    --log-uri 's3://yourbucket/jobflow_logs/' \
    --no-auto-terminate \
    --visible-to-all-users \
    --instance-groups file://./normal-instance-setup.json \
    --bootstrap-actions file://./bootstrap-presto.json \
    --applications file://./app-hive-with-config.json
66. [
  {
    "Name": "Install/Setup Presto",
    "Path": "s3://yourbucket/libs/setup-presto.rb",
    "Args": [
      "--task_memory", "1GB",
      "--log-level", "DEBUG",
      "--version", "0.75",
      "--presto-repo-url", "http://central.maven.org/maven2/com/facebook/presto/",
      "--sink-buffer-size", "1GB",
      "--query-max-age", "1h",
      "--jvm-config", "-server -Xmx2G -XX:+UseConcMarkSweepGC -XX:+ExplicitGCInvokesConcurrent -XX:+CMSClassUnloadingEnabled -XX:+AggressiveOpts -XX:+HeapDumpOnOutOfMemoryError -XX:OnOutOfMemoryError=kill -9 %p -XX:PermSize=150M -XX:MaxPermSize=150M -XX:ReservedCodeCacheSize=150M -Dhive.config.resources=/home/hadoop/conf/core-site.xml,/home/hadoop/conf/hdfs-site.xml"
    ]
  }
]
68. use to do the following
awscli Execute
HiveQL
Execute
s3distcp
Config
Your EMR
Bootstrp
Presto
Create
Cluster
Metastr
Config
Python
Script
Create
Cluster
JobFlow
Mgmnt
from
Execute
HiveQL
EMR
69. Ø .FUBTUPSFהכ)JWFךذ٦ـٕ㹀纏瘝ך䞔㜠⥂
㶷׃גֶֻ㜥䨽ךֿה
Ø 植㖈㢳ֻכ.Z42-ָⵃ欽ׁגְ
Ø ⡦鏣㹀׃זְה.3ך؎ٝأةٝأך.Z42-ח
⥂㶷ׁ
Ø .FUBTUPSF.3㢩鿇ך%#ח鏣㹀׃גֶֻֿהדծ
.3甧♳־ꥷח%%-ⱄ䏝崧ׁזֻג葺ֻ
ז
Ø %#⩎ך4FDVSJUZ(SPVQ⥜姻ׅ䗳銲֮
70. app-hive-with-config.json
<configuration>
  <property>
    <name>hive.optimize.s3.query</name>
    <value>true</value>
    <description>Optimize query on S3</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionURL</name>
    <value>jdbc:mysql://hostname:3306/hive?createDatabaseIfNotExist=true</value>
    <description>JDBC connect string for a JDBC metastore</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>com.mysql.jdbc.Driver</value>
    <description>Driver class name for a JDBC metastore</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionUserName</name>
    <value>username</value>
    <description>Username to use against metastore database</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionPassword</name>
    <value>password</value>
    <description>Password to use against metastore database</description>
  </property>
</configuration>
71. use to do the following
awscli Execute
HiveQL
Execute
s3distcp
Config
Your EMR
Bootstrp
Presto
Create
Cluster
Metastr
Config
Python
Script
Create
Cluster
JobFlow
Mgmnt
from
Execute
HiveQL
EMR
73. use to do the following
awscli Execute
HiveQL
Execute
s3distcp
Config
Your EMR
Bootstrp
Presto
Create
Cluster
Metastr
Config
Python
Script
Create
Cluster
JobFlow
Mgmnt
from
Execute
HiveQL
EMR
74. # -*- coding: utf-8 -*-
from datetime import datetime
from boto.emr import connect_to_region
from boto.emr.step import InstallHiveStep
def setup_emr():
    """Start an EMR job flow that installs Hive and return its identifier.

    Connects to the ``ap-northeast-1`` region, builds an ``InstallHiveStep``
    for Hive ``0.11.0.2`` and submits it through ``run_jobflow`` with
    ``num_instances=3`` of type ``m1.medium`` (AMI ``3.1.1``, Hadoop
    ``2.4.0``).  ``keep_alive=True`` is passed so further steps can be added
    to the same job flow afterwards.

    Returns:
        The value returned by ``conn.run_jobflow`` (the caller prints it as
        the job flow id).
    """
    # need to export AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
    # as environment variables.
    conn = connect_to_region('ap-northeast-1')
    install_step = InstallHiveStep(hive_versions='0.11.0.2')
    jobid = conn.run_jobflow(
        # The job flow name carries today's date as YYYYMMDD.
        name='Create EMR [{}]'.format(datetime.today().strftime('%Y%m%d')),
        log_uri='s3://yourbucket/jobflow_logs/',
        ec2_keyname='your_key',
        master_instance_type='m1.medium',
        slave_instance_type='m1.medium',
        num_instances=3,
        action_on_failure='TERMINATE_JOB_FLOW',
        keep_alive=True,
        enable_debugging=False,
        hadoop_version='2.4.0',
        steps=[install_step],
        bootstrap_actions=[],
        instance_groups=None,
        additional_info=None,
        ami_version='3.1.1',
        api_params=None,
        visible_to_all_users=True,
        job_flow_role=None)
    return jobid
if __name__ == '__main__':
    # Script entry point: launch the job flow and report its id.
    jobflow_id = setup_emr()
    # A parenthesised single-argument print is valid both as the Python 2
    # print statement and as the Python 3 print function.
    print("JobFlowID: {} started.".format(jobflow_id))
76. from to do the following
awscli Execute
HiveQL
Execute
s3distcp
Config
Your EMR
Bootstrp
Presto
Create
Cluster
Metastr
Config
Python
Script
Create
Cluster
JobFlow
Mgmnt
Execute
HiveQL
use
EMR
77. ꞿֻזג׃תךדꨜ㔲孡ֽ
jobid = conn.run_jobflow(
    name='Create EMR and Exec hiveql [{}]'.format(target_date),
    log_uri='s3://{}/jobflow_logs/'.format(bucket_name),
    ec2_keyname='your_key',
    master_instance_type='m1.medium',
    slave_instance_type='m1.medium',
    num_instances=3,
    action_on_failure='TERMINATE_JOB_FLOW',
    keep_alive=True,
    enable_debugging=False,
    hadoop_version='2.4.0',
    steps=[install_step],
    bootstrap_actions=[],
    instance_groups=None,
    additional_info=None,
    ami_version='3.1.1',
    api_params=None,
    visible_to_all_users=True,
    job_flow_role=None)
# HiveQL scripts to run, in order; each one becomes its own job flow step.
query_files = ['sample01.hql', 'sample02.hql']
hql_steps = []
for query_file in query_files:
    hql_step = HiveStep(
        name='Executing Query [{}]'.format(query_file),
        # Each script is read from the hive-script/ prefix of the bucket.
        hive_file='s3n://{0}/hive-script/{1}'.format(
            bucket_name, query_file),
        hive_versions=hive_version,
        # TARGET_DATE and BUCKET_NAME are handed to the script as -d
        # variable definitions.
        hive_args=['-dTARGET_DATE={0}'.format(target_date),
                   '-dBUCKET_NAME={0}'.format(bucket_name)])
    hql_steps.append(hql_step)
# Submit all steps in one call to the job flow identified by jobid.
conn.add_jobflow_steps(jobid, hql_steps)
78. use to do the following
awscli Execute
HiveQL
Execute
s3distcp
Config
Your EMR
Bootstrp
Presto
Create
Cluster
Metastr
Config
Python
Script
Create
Cluster
JobFlow
Mgmnt
from
Execute
HiveQL
EMR
80. • https://github.com/spotify/luigi
• 1ZUIPO醡ךػ؎فٓ؎ٝ盖椚ؿٖ٦يٙ٦ؙ
• )BEPPQ4USFBNJOHⵃ欽׃.BQ3FEVDFָ知⽃ח剅ֽ堣圓֮
• 1ZUIPOך؝٦سֽד⣛㶷䚍鍑寸
• ⣛㶷䚍〳鋔⻉ ⴽ؟٦ؽأה׃ג甧♳־
• ⣛㶷䚍〳鋔⻉خ٦ٕכ钠鏾瘝稢ְַ堣腉כ搀ְ
• )JWF2-ך㹋遤ח㼎䘔׃גְ
• 1JHך㹋遤ח㼎䘔׃גְ
• 4ך乼⡲ח㼎䘔׃ג
• 植朐הؔ٦غ٦ٕؗ
81. • 盖椚歗כ%KBOHPⵃ欽
• ず♧ך؟٦غדDFMFSZהDFMFSZCFBU饯⹛
• EKBOHPDFMFSZⵃ欽׃ג暴㹀ةأؙ暴㹀ך儗חُؗ٦חⰅ״
ֲח鏣㹀
• DFMFSZCFBUָُؗ٦חⰅةأؙ䭪ג㹋遤׃גֻ
• EKBOHPDFMFSZזֻגDFMFSZה%KBOHPכ鸬䵿דֹֽוծֿךأ؛
آُ٦ٕ堣腉ָ⤑ⵃזךדת⢪ג
89. Ø ⴱג䪮遭禸ך涪邌׃
Ø ➬✲דװגֹ✲תהְְ堣⠓
Ø ➭ך倯ָ➬✲׃ג儗ח罋ִגְ✲濼ְ
Ø ➭ך⠓爡ך圓䧭ָזך圓䧭הגְךַ濼ְ