# Install Embulk
curl --create-dirs -o ~/.embulk/bin/embulk -L "https://dl.embulk.org/embulk-latest.jar"
chmod +x ~/.embulk/bin/embulk
echo 'export PATH="$HOME/.embulk/bin:$PATH"' >> ~/.bashrc
source ~/.bashrc
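To confirm the install (embulk --version is the standard version check):
embulk --version
- prints the installed Embulk version, confirming the binary is on the PATH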



# Install Embulk plugins

embulk gem install embulk-input-mysql
embulk gem install embulk-output-bigquery
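To check that the plugins were installed (embulk gem delegates to RubyGems):
embulk gem list
- the list should include embulk-input-mysql and embulk-output-bigquery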



# Embulk commands

embulk example
- generates a sample CSV and a seed.yml under ./embulk-example (a sketch of the seed.yml follows this list)
embulk guess embulk-example/seed.yml
- reads seed.yml, guesses the missing parser settings, and prints the completed config
embulk guess embulk-example/seed.yml -o config.yml
- same, but writes the guessed config from seed.yml to config.yml
embulk preview config.yml
- shows a preview of the data config.yml would load, without writing anything
embulk run config.yml
- runs the bulk load job defined in config.yml
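For reference, the seed.yml that embulk example generates is only a skeleton along these lines (a sketch; the actual path_prefix depends on where the command was run), which guess then expands with the parser and column details:

in:
  type: file
  path_prefix: "./embulk-example/csv/sample_"
out:
  type: stdout
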
# config.yml
in:
  type: mysql
  host: localhost
  port: 3306
  user: root
  password: root
  database: database_name
  table: table_name
  select: "col1, col2, col3"
  where: "col4 != 'a'"
  order_by: "col1 DESC"
out:
  type: bigquery
  mode: replace
  auth_method: json_key
  json_keyfile: /path/to/json_keyfile.json
  project: my-project
  dataset: reservation
  table: reservation
  gcs_bucket: seongyun
  auto_create_gcs_bucket: true
  auto_create_table: true
  auto_create_dataset: true

embulk run config.yml
- loads the MySQL table into BigQuery, staging the data through the GCS bucket
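If the table should be synced repeatedly rather than fully re-dumped each run, embulk-input-mysql also supports incremental loading; a sketch, assuming col1 is a monotonically increasing key (option names are from the embulk-input-jdbc plugin family):

in:
  type: mysql
  # ... same connection settings as above ...
  incremental: true
  incremental_columns: [col1]

embulk run config.yml -c diff.yml
- saves the last loaded position to diff.yml and resumes from it on the next run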




# Install Airflow
pip3 install apache-airflow
airflow initdb
- initializes Airflow's metadata database
airflow webserver -p 8080
- starts the web UI; open localhost:8080 in a browser



airflow list_dags
- lists the DAGs Airflow has registered
- DAG definition files (*.py) go in the dags folder; Airflow scans it to register DAGs
airflow list_tasks test
- lists the tasks in the test DAG
airflow list_tasks test --tree
- shows the tasks of the test DAG as a dependency tree
airflow test [DAG id] [Task id] [date]
e.g.) airflow test test print_date 2017-10-01
- runs a single task of a DAG for the given date, without recording state
airflow scheduler
- once the tasks test cleanly, start the scheduler so DAGs run on their schedule
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from datetime import datetime, timedelta

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2017, 10, 1),
    'email': ['airflow@airflow.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',  # Celery-only option
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

# define the DAG: runs every day at 14:55
dag = DAG('test', description='First DAG',
          schedule_interval='55 14 * * *',
          default_args=default_args)

# BashOperator runs a shell command;
# task_id must be unique within the DAG,
# and bash_command is the command to execute (here: date)
t1 = BashOperator(
    task_id='print_date',
    bash_command='date',
    dag=dag)

t2 = BashOperator(
    task_id='sleep',
    bash_command='sleep 5',
    retries=3,
    dag=dag)

# example Jinja template used by t3 below; {{ ds }} is the execution date
templated_command = """
{% for i in range(5) %}
    echo "{{ ds }}"
    echo "{{ params.my_param }}"
{% endfor %}
"""

t3 = BashOperator(
    task_id='templated',
    bash_command=templated_command,
    params={'my_param': 'Parameter I passed in'},
    dag=dag)

# set_upstream makes t1 run before t2
t2.set_upstream(t1)
# equivalent: t1.set_downstream(t2)
# equivalent: t1 >> t2
t3.set_upstream(t1)
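To tie the two halves together, the Embulk job above can be scheduled from this same DAG with another BashOperator; a minimal sketch (the task_id, config path, and ordering are illustrative assumptions):

# hypothetical task: run the Embulk MySQL-to-BigQuery load from Airflow
embulk_load = BashOperator(
    task_id='embulk_mysql_to_bigquery',
    bash_command='embulk run /path/to/config.yml',
    dag=dag)

# run the load after the date is printed
embulk_load.set_upstream(t1)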
To let Airflow reach BigQuery, create a Google Cloud Platform connection in the web UI and fill in:
• Project Id: the GCP project ID
• Keyfile Path: path to the service account JSON keyfile
• Scopes: https://www.googleapis.com/auth/cloud-platform
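With that connection in place, queries can be scheduled directly from a DAG; a sketch using the contrib BigQueryOperator from Airflow 1.x (task_id, query, and destination table are placeholders; later Airflow versions rename the bql parameter to sql):

from airflow.contrib.operators.bigquery_operator import BigQueryOperator

# hypothetical task: materialize a daily summary of the loaded table
bq_summary = BigQueryOperator(
    task_id='bq_daily_summary',
    bql='SELECT col1, COUNT(*) AS cnt FROM reservation.reservation GROUP BY col1',
    destination_dataset_table='my-project.reservation.daily_summary',
    write_disposition='WRITE_TRUNCATE',
    bigquery_conn_id='bigquery_default',
    use_legacy_sql=False,
    dag=dag)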



