SlideShare a Scribd company logo
Apache Airflow
넷마블 데이터인프라팀 김장현
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
Q&A
def scheduler(args):
print(settings.HEADER)
job = jobs.SchedulerJob(
dag_id=args.dag_id,
subdir=process_subdir(args.subdir),
run_duration=args.run_duration,
num_runs=args.num_runs,
do_pickle=args.do_pickle)
if args.daemon:
pid, stdout, stderr, log_file = setup_locations("scheduler",
args.pid,
args.stdout,
args.stderr,
args.log_file)
handle = setup_logging(log_file)
stdout = open(stdout, 'w+')
stderr = open(stderr, 'w+')
ctx = daemon.DaemonContext(
pidfile=TimeoutPIDLockFile(pid, -1),
files_preserve=[handle],
stdout=stdout,
stderr=stderr,
)
with ctx:
job.run()
stdout.close()
stderr.close()
else:
signal.signal(signal.SIGINT, sigint_handler)
signal.signal(signal.SIGTERM, sigint_handler)
signal.signal(signal.SIGQUIT, sigquit_handler)
job.run()
class SchedulerJob(BaseJob):
"""
This SchedulerJob runs for a specific time interval and schedules the jobs
that are ready to run. It figures out the latest runs for each
task and sees if the dependencies for the next schedules are met.
If so, it creates appropriate TaskInstances and sends run commands to the
executor. It does this for each task in each DAG and repeats.
"""
def run(self):
Stats.incr(self.__class__.__name__.lower() + '_start', 1, 1)
# Adding an entry in the DB
with create_session() as session:
self.state = State.RUNNING
session.add(self)
session.commit()
id_ = self.id
make_transient(self)
self.id = id_
try:
self._execute()
# In case of max runs or max duration
self.state = State.SUCCESS
except SystemExit as e:
# In case of ^C or SIGTERM
self.state = State.SUCCESS
except Exception as e:
self.state = State.FAILED
raise
finally:
self.end_date = timezone.utcnow()
session.merge(self)
session.commit()
Stats.incr(self.__class__.__name__.lower() + '_end', 1, 1)
def _execute(self):
self.log.info("Starting the scheduler")
# DAGs can be pickled for easier remote execution by some executors
pickle_dags = False
if self.do_pickle and self.executor.__class__ not in 
(executors.LocalExecutor, executors.SequentialExecutor):
pickle_dags = True
# Use multiple processes to parse and generate tasks for the
# DAGs in parallel. By processing them in separate processes,
# we can get parallelism and isolation from potentially harmful
# user code.
self.log.info("Processing files using up to %s processes at a time",
self.max_threads)
self.log.info("Running execute loop for %s seconds", self.run_duration)
self.log.info("Processing each file at most %s times", self.num_runs)
self.log.info("Process each file at most once every %s seconds",
self.file_process_interval)
self.log.info("Wait until at least %s seconds have passed between file parsing "
"loops", self.min_file_parsing_loop_time)
self.log.info("Checking for new files in %s every %s seconds",
self.subdir, self.dag_dir_list_interval)
# Build up a list of Python files that could contain DAGs
self.log.info("Searching for files in %s", self.subdir)
known_file_paths = list_py_file_paths(self.subdir)
self.log.info("There are %s files in %s", len(known_file_paths), self.subdir)
def processor_factory(file_path):
return DagFileProcessor(file_path,
pickle_dags,
self.dag_ids)
processor_manager = DagFileProcessorManager(self.subdir,
known_file_paths,
self.max_threads,
self.file_process_interval,
self.min_file_parsing_loop_time,
self.num_runs,
processor_factory)
try:
self._execute_helper(processor_manager)
finally:
self.log.info("Exited execute loop")
# Kill all child processes on exit since we don't want to leave
# them as orphaned.
# 후략
# For the execute duration, parse and schedule DAGs
while (timezone.utcnow() - execute_start_time).total_seconds() < 
self.run_duration or self.run_duration < 0:
self.log.debug("Starting Loop...")
loop_start_time = time.time()
# Traverse the DAG directory for Python files containing DAGs
# periodically
elapsed_time_since_refresh = (timezone.utcnow() -
last_dag_dir_refresh_time).total_seconds()
# 중략
# Kick of new processes and collect results from finished ones
self.log.debug("Heartbeating the process manager")
simple_dags = processor_manager.heartbeat()
# Send tasks for execution if available
simple_dag_bag = SimpleDagBag(simple_dags)
if len(simple_dags) > 0:
# Handle cases where a DAG run state is set (perhaps manually) to
# a non-running state. Handle task instances that belong to
# DAG runs in those states
# If a task instance is up for retry but the corresponding DAG run
# isn't running, mark the task instance as FAILED so we don't try
# to re-run it.
self._change_state_for_tis_without_dagrun(simple_dag_bag,
[State.UP_FOR_RETRY],
State.FAILED)
# If a task instance is scheduled or queued, but the corresponding
# DAG run isn't running, set the state to NONE so we don't try to
# re-run it.
self._change_state_for_tis_without_dagrun(simple_dag_bag,
[State.QUEUED,
State.SCHEDULED],
State.NONE)
self._execute_task_instances(simple_dag_bag,
(State.SCHEDULED,))
# Call heartbeats
self.log.debug("Heartbeating the executor")
self.executor.heartbeat()
# Process events from the executor
self._process_executor_events(simple_dag_bag)
# Heartbeat the scheduler periodically
time_since_last_heartbeat = (timezone.utcnow() -
last_self_heartbeat_time).total_seconds()
if time_since_last_heartbeat > self.heartrate:
self.log.debug("Heartbeating the scheduler")
self.heartbeat()
last_self_heartbeat_time = timezone.utcnow()
# Occasionally print out stats about how fast the files are getting processed
if ((timezone.utcnow() - last_stat_print_time).total_seconds() >
self.print_stats_interval):
if len(known_file_paths) > 0:
self._log_file_processing_stats(known_file_paths,
processor_manager)
last_stat_print_time = timezone.utcnow()
loop_end_time = time.time()
self.log.debug("Ran scheduling loop in %.2f seconds",
loop_end_time - loop_start_time)
# Exit early for a test mode
if processor_manager.max_runs_reached():
self.log.info("Exiting loop as all files have been processed %s times",
self.num_runs)
break
def _execute_helper(self, processor_manager):
"""
:param processor_manager: manager to use
:type processor_manager: DagFileProcessorManager
:return: None
"""
self.executor.start()
# 중략
# For the execute duration, parse and schedule DAGs
while (timezone.utcnow() - execute_start_time).total_seconds() < 
self.run_duration or self.run_duration < 0:
self.log.debug("Starting Loop...")
loop_start_time = time.time()
# 스케줄러 프로세스 실행 동안 반복되는 루프
# Stop any processors
processor_manager.terminate()
# Verify that all files were processed, and if so, deactivate DAGs that
# haven't been touched by the scheduler as they likely have been
# deleted.
all_files_processed = True
for file_path in known_file_paths:
if processor_manager.get_last_finish_time(file_path) is None:
all_files_processed = False
break
if all_files_processed:
self.log.info(
"Deactivating DAGs that haven't been touched since %s",
execute_start_time.isoformat()
)
models.DAG.deactivate_stale_dags(execute_start_time)
self.executor.end()
settings.Session.remove()
def heartbeat(self):
"""
This should be periodically called by the scheduler. This method will
kick off new processes to process DAG definition files and read the
results from the finished processors.
:return: a list of SimpleDags that were produced by processors that
have finished since the last time this was called
:rtype: list[SimpleDag]
"""
finished_processors = {}
""":type : dict[unicode, AbstractDagFileProcessor]"""
running_processors = {}
""":type : dict[unicode, AbstractDagFileProcessor]"""
for file_path, processor in self._processors.items():
if processor.done:
self.log.info("Processor for %s finished", file_path)
now = timezone.utcnow()
finished_processors[file_path] = processor
self._last_runtime[file_path] = (now -
processor.start_time).total_seconds()
self._last_finish_time[file_path] = now
self._run_count[file_path] += 1
else:
running_processors[file_path] = processor
self._processors = running_processors
self.log.debug("%s/%s scheduler processes running",
len(self._processors), self._parallelism)
self.log.debug("%s file paths queued for processing",
len(self._file_path_queue))
# Collect all the DAGs that were found in the processed files
simple_dags = []
for file_path, processor in finished_processors.items():
if processor.result is None:
self.log.warning(
"Processor for %s exited with return code %s.",
processor.file_path, processor.exit_code
)
else:
for simple_dag in processor.result:
simple_dags.append(simple_dag)
# 중략
# Start more processors if we have enough slots and files to process
while (self._parallelism - len(self._processors) > 0 and
len(self._file_path_queue) > 0):
file_path = self._file_path_queue.pop(0)
processor = self._processor_factory(file_path)
processor.start()
self.log.info(
"Started a process (PID: %s) to generate tasks for %s",
processor.pid, file_path
)
self._processors[file_path] = processor
# Update scheduler heartbeat count.
self._run_count[self._heart_beat_key] += 1
return simple_dags
def helper():
# This helper runs in the newly created process
log = logging.getLogger("airflow.processor")
stdout = StreamLogWriter(log, logging.INFO)
stderr = StreamLogWriter(log, logging.WARN)
set_context(log, file_path)
try:
# redirect stdout/stderr to log
sys.stdout = stdout
sys.stderr = stderr
# Re-configure the ORM engine as there are issues with multiple processes
settings.configure_orm()
# Change the thread name to differentiate log lines. This is
# really a separate process, but changing the name of the
# process doesn't work, so changing the thread name instead.
threading.current_thread().name = thread_name
start_time = time.time()
log.info("Started process (PID=%s) to work on %s",
os.getpid(), file_path)
scheduler_job = SchedulerJob(dag_ids=dag_id_white_list, log=log)
result = scheduler_job.process_file(file_path,
pickle_dags)
result_queue.put(result)
end_time = time.time()
log.info(
"Processing %s took %.3f seconds", file_path, end_time - start_time
)
except:
# Log exceptions through the logging framework.
log.exception("Got an exception! Propagating...")
raise
finally:
sys.stdout = sys.__stdout__
sys.stderr = sys.__stderr__
# We re-initialized the ORM within this Process above so we need to
# tear it down manually here
settings.dispose_orm()
def _launch_process(result_queue,
file_path,
pickle_dags,
dag_id_white_list,
thread_name):
"""
Launch a process to process the given file.
:param result_queue: the queue to use for passing back the result
:type result_queue: multiprocessing.Queue
:param file_path: the file to process
:type file_path: unicode
:param pickle_dags: whether to pickle the DAGs found in the file and
save them to the DB
:type pickle_dags: bool
:param dag_id_white_list: if specified, only examine DAG ID's that are
in this list
:type dag_id_white_list: list[unicode]
:param thread_name: the name to use for the process that is launched
:type thread_name: unicode
:return: the process that was launched
:rtype: multiprocessing.Process
"""
p = multiprocessing.Process(target=helper,
args=(),
name="{}-Process".format(thread_name))
p.start()
return p
def start(self):
"""
Launch the process and start processing the DAG.
"""
self._process = DagFileProcessor._launch_process(
self._result_queue,
self.file_path,
self._pickle_dags,
self._dag_id_white_list,
"DagFileProcessor{}".format(self._instance_id))
self._start_time = timezone.utcnow()
def _process_dags(self, dagbag, dags, tis_out):
"""
Iterates over the dags and processes them. Processing includes:
1. Create appropriate DagRun(s) in the DB.
2. Create appropriate TaskInstance(s) in the DB.
3. Send emails for tasks that have missed SLAs.
:param dagbag: a collection of DAGs to process
:type dagbag: models.DagBag
:param dags: the DAGs from the DagBag to process
:type dags: DAG
:param tis_out: A queue to add generated TaskInstance objects
:type tis_out: multiprocessing.Queue[TaskInstance]
:return: None
"""
for dag in dags:
dag = dagbag.get_dag(dag.dag_id)
if dag.is_paused:
self.log.info("Not processing DAG %s since it's paused", dag.dag_id)
continue
if not dag:
self.log.error("DAG ID %s was not found in the DagBag", dag.dag_id)
continue
self.log.info("Processing %s", dag.dag_id)
dag_run = self.create_dag_run(dag)
if dag_run:
self.log.info("Created %s", dag_run)
self._process_task_instances(dag, tis_out)
self.manage_slas(dag)
models.DagStat.update([d.dag_id for d in dags])
def process_file(self, file_path, pickle_dags=False, session=None):
"""
Process a Python file containing Airflow DAGs.
This includes:
1. Execute the file and look for DAG objects in the namespace.
2. Pickle the DAG and save it to the DB (if necessary).
3. For each DAG, see what tasks should run and create appropriate task
instances in the DB.
4. Record any errors importing the file into ORM
5. Kill (in ORM) any task instances belonging to the DAGs that haven't
issued a heartbeat in a while.
Returns a list of SimpleDag objects that represent the DAGs found in
the file
:param file_path: the path to the Python file that should be executed
:type file_path: unicode
:param pickle_dags: whether serialize the DAGs found in the file and
save them to the db
:type pickle_dags: bool
:return: a list of SimpleDags made from the Dags found in the file
:rtype: list[SimpleDag]
"""
self.log.info("Processing file %s for tasks to queue", file_path)
# As DAGs are parsed from this file, they will be converted into SimpleDags
simple_dags = []
try:
dagbag = models.DagBag(file_path)
except Exception:
self.log.exception("Failed at reloading the DAG file %s", file_path)
Stats.incr('dag_file_refresh_error', 1, 1)
return []
# 중략
self._process_dags(dagbag, dags, ti_keys_to_schedule)
def _process_task_instances(self, dag, queue, session=None):
"""
This method schedules the tasks for a single DAG by looking at the
active DAG runs and adding task instances that should run to the
queue.
"""
# 중략
for run in active_dag_runs:
self.log.debug("Examining active DAG run: %s", run)
# this needs a fresh session sometimes tis get detached
tis = run.get_task_instances(state=(State.NONE,
State.UP_FOR_RETRY))
# this loop is quite slow as it uses are_dependencies_met for
# every task (in ti.is_runnable). This is also called in
# update_state above which has already checked these tasks
for ti in tis:
task = dag.get_task(ti.task_id)
# fixme: ti.task is transient but needs to be set
ti.task = task
# future: remove adhoc
if task.adhoc:
continue
if ti.are_dependencies_met(
dep_context=DepContext(flag_upstream_failed=True),
session=session):
self.log.debug('Queuing task: %s', ti)
queue.append(ti.key)
def process_file(self, file_path, pickle_dags=False, session=None):
"""
Process a Python file containing Airflow DAGs.
This includes:
1. Execute the file and look for DAG objects in the namespace.
2. Pickle the DAG and save it to the DB (if necessary).
3. For each DAG, see what tasks should run and create appropriate task
instances in the DB.
4. Record any errors importing the file into ORM
5. Kill (in ORM) any task instances belonging to the DAGs that haven't
issued a heartbeat in a while.
Returns a list of SimpleDag objects that represent the DAGs found in
the file
:param file_path: the path to the Python file that should be executed
:type file_path: unicode
:param pickle_dags: whether serialize the DAGs found in the file and
save them to the db
:type pickle_dags: bool
:return: a list of SimpleDags made from the Dags found in the file
:rtype: list[SimpleDag]
"""
self.log.info("Processing file %s for tasks to queue", file_path)
# As DAGs are parsed from this file, they will be converted into SimpleDags
simple_dags = []
try:
dagbag = models.DagBag(file_path)
except Exception:
self.log.exception("Failed at reloading the DAG file %s", file_path)
Stats.incr('dag_file_refresh_error', 1, 1)
return []
# 중략
self._process_dags(dagbag, dags, ti_keys_to_schedule)
def process_file(self, file_path, pickle_dags=False, session=None):
# 중략
ti_keys_to_schedule = []
self._process_dags(dagbag, dags, ti_keys_to_schedule)
for ti_key in ti_keys_to_schedule:
dag = dagbag.dags[ti_key[0]]
task = dag.get_task(ti_key[1])
ti = models.TaskInstance(task, ti_key[2])
ti.refresh_from_db(session=session, lock_for_update=True)
# We can defer checking the task dependency checks to the worker themselves
# since they can be expensive to run in the scheduler.
dep_context = DepContext(deps=QUEUE_DEPS, ignore_task_deps=True)
# Only schedule tasks that have their dependencies met, e.g. to avoid
# a task that recently got its state changed to RUNNING from somewhere
# other than the scheduler from getting its state overwritten.
# TODO(aoen): It's not great that we have to check all the task instance
# dependencies twice; once to get the task scheduled, and again to actually
# run the task. We should try to come up with a way to only check them once.
if ti.are_dependencies_met(
dep_context=dep_context,
session=session,
verbose=True):
# Task starts out in the scheduled state. All tasks in the
# scheduled state will be sent to the executor
ti.state = State.SCHEDULED
# Also save this task instance to the DB.
self.log.info("Creating / updating %s in ORM", ti)
session.merge(ti)
# commit batch
session.commit()
# 중략
return simple_dags
def _find_executable_task_instances(self, simple_dag_bag, states, session=None):
# 중략
states_to_count_as_running = [State.RUNNING]
executable_tis = []
# Get all the queued task instances from associated with scheduled
# DagRuns which are not backfilled, in the given states,
# and the dag is not paused
TI = models.TaskInstance
DR = models.DagRun
DM = models.DagModel
ti_query = (
session
.query(TI)
.filter(TI.dag_id.in_(simple_dag_bag.dag_ids))
.outerjoin(DR,
and_(DR.dag_id == TI.dag_id,
DR.execution_date == TI.execution_date))
.filter(or_(DR.run_id == None,
not_(DR.run_id.like(BackfillJob.ID_PREFIX + '%'))))
.outerjoin(DM, DM.dag_id==TI.dag_id)
.filter(or_(DM.dag_id == None,
not_(DM.is_paused)))
)
if None in states:
ti_query = ti_query.filter(or_(TI.state == None, TI.state.in_(states)))
else:
ti_query = ti_query.filter(TI.state.in_(states))
task_instances_to_examine = ti_query.all()
# 중략
# Get the pool settings
pools = {p.pool: p for p in session.query(models.Pool).all()}
pool_to_task_instances = defaultdict(list)
for task_instance in task_instances_to_examine:
pool_to_task_instances[task_instance.pool].append(task_instance)
# 중략
# Go through each pool, and queue up a task for execution if there are
# any open slots in the pool.
for pool, task_instances in pool_to_task_instances.items():
# 중략
priority_sorted_task_instances = sorted(
task_instances, key=lambda ti: (-ti.priority_weight, ti.execution_date))
# DAG IDs with running tasks that equal the concurrency limit of the dag
dag_id_to_possibly_running_task_count = {}
for task_instance in priority_sorted_task_instances:
if open_slots <= 0:
self.log.info(
"Not scheduling since there are %s open slots in pool %s",
open_slots, pool
)
# Can't schedule any more since there are no more open slots.
break
# Check to make sure that the task concurrency of the DAG hasn't been
# reached.
# 중략
task_instance_str = "nt".join(
["{}".format(x) for x in executable_tis])
self.log.info("Setting the follow tasks to queued state:nt%s", task_instance_str)
# so these dont expire on commit
for ti in executable_tis:
copy_dag_id = ti.dag_id
copy_execution_date = ti.execution_date
copy_task_id = ti.task_id
make_transient(ti)
ti.dag_id = copy_dag_id
ti.execution_date = copy_execution_date
ti.task_id = copy_task_id
return executable_tis
_execute_task_instances
class LocalExecutor(BaseExecutor):
"""
LocalExecutor executes tasks locally in parallel. It uses the
multiprocessing Python library and queues to parallelize the execution
of tasks.
"""
def start(self):
self.result_queue = multiprocessing.Queue()
self.queue = None
self.workers = []
self.workers_used = 0
self.workers_active = 0
self.impl = (LocalExecutor._UnlimitedParallelism(self) if self.parallelism == 0
else LocalExecutor._LimitedParallelism(self))
self.impl.start()
def __init__(
self,
executor=executors.GetDefaultExecutor(),
heartrate=conf.getfloat('scheduler', 'JOB_HEARTBEAT_SEC'),
*args, **kwargs):
self.hostname = get_hostname()
self.executor = executor
self.executor_class = executor.__class__.__name__
self.start_date = timezone.utcnow()
self.latest_heartbeat = timezone.utcnow()
self.heartrate = heartrate
self.unixname = getpass.getuser()
self.max_tis_per_query = conf.getint('scheduler', 'max_tis_per_query')
super(BaseJob, self).__init__(*args, **kwargs)
class BaseJob(Base, LoggingMixin):
"""
Abstract class to be derived for jobs. Jobs are processing items with state
and duration that aren't task instances. For instance a BackfillJob is
a collection of task instance runs, but should have its own state, start
and end time.
"""
def GetDefaultExecutor():
"""Creates a new instance of the configured executor if none exists and returns it"""
global DEFAULT_EXECUTOR
if DEFAULT_EXECUTOR is not None:
return DEFAULT_EXECUTOR
executor_name = configuration.conf.get('core', 'EXECUTOR')
DEFAULT_EXECUTOR = _get_executor(executor_name)
log = LoggingMixin().log
log.info("Using executor %s", executor_name)
return DEFAULT_EXECUTOR
def _get_executor(executor_name):
"""
Creates a new instance of the named executor.
In case the executor name is not know in airflow,
look for it in the plugins
"""
if executor_name == Executors.LocalExecutor:
return LocalExecutor()
elif executor_name == Executors.SequentialExecutor:
return SequentialExecutor()
elif executor_name == Executors.CeleryExecutor:
from airflow.executors.celery_executor import CeleryExecutor
return CeleryExecutor()
elif executor_name == Executors.DaskExecutor:
from airflow.executors.dask_executor import DaskExecutor
return DaskExecutor()
elif executor_name == Executors.MesosExecutor:
from airflow.contrib.executors.mesos_executor import MesosExecutor
return MesosExecutor()
elif executor_name == Executors.KubernetesExecutor:
from airflow.contrib.executors.kubernetes_executor import KubernetesExecutor
return KubernetesExecutor()
else:
# Loading plugins
_integrate_plugins()
executor_path = executor_name.split('.')
if len(executor_path) != 2:
raise AirflowException(
"Executor {0} not supported: "
"please specify in format plugin_module.executor".format(executor_name))
if executor_path[0] in globals():
return globals()[executor_path[0]].__dict__[executor_path[1]]()
else:
raise AirflowException("Executor {0} not supported.".format(executor_name))
def _execute_helper(self, processor_manager):
"""
:param processor_manager: manager to use
:type processor_manager: DagFileProcessorManager
:return: None
"""
self.executor.start()
# 후략
class SchedulerJob(BaseJob):
"""
This SchedulerJob runs for a specific time interval and schedules the jobs
that are ready to run. It figures out the latest runs for each
task and sees if the dependencies for the next schedules are met.
If so, it creates appropriate TaskInstances and sends run commands to the
executor. It does this for each task in each DAG and repeats.
"""
def _enqueue_task_instances_with_queued_state(self, simple_dag_bag, task_instances):
"""
Takes task_instances, which should have been set to queued, and enqueues them
with the executor.
:param task_instances: TaskInstances to enqueue
:type task_instances: List[TaskInstance]
:param simple_dag_bag: Should contains all of the task_instances' dags
:type simple_dag_bag: SimpleDagBag
"""
TI = models.TaskInstance
# actually enqueue them
for task_instance in task_instances:
simple_dag = simple_dag_bag.get_dag(task_instance.dag_id)
command = " ".join(TI.generate_command(
task_instance.dag_id,
task_instance.task_id,
task_instance.execution_date,
local=True,
mark_success=False,
ignore_all_deps=False,
ignore_depends_on_past=False,
ignore_task_deps=False,
ignore_ti_state=False,
pool=task_instance.pool,
file_path=simple_dag.full_filepath,
pickle_id=simple_dag.pickle_id))
priority = task_instance.priority_weight
queue = task_instance.queue
self.log.info(
"Sending %s to executor with priority %s and queue %s",
task_instance.key, priority, queue
)
# save attributes so sqlalchemy doesnt expire them
copy_dag_id = task_instance.dag_id
copy_task_id = task_instance.task_id
copy_execution_date = task_instance.execution_date
make_transient(task_instance)
task_instance.dag_id = copy_dag_id
task_instance.task_id = copy_task_id
task_instance.execution_date = copy_execution_date
self.executor.queue_command(
task_instance,
command,
priority=priority,
queue=queue)
def _execute_task_instances(self,
simple_dag_bag,
states,
session=None):
"""
Attempts to execute TaskInstances that should be executed by the scheduler.
There are three steps:
1. Pick TIs by priority with the constraint that they are in the expected states
and that we do exceed max_active_runs or pool limits.
2. Change the state for the TIs above atomically.
3. Enqueue the TIs in the executor.
:param simple_dag_bag: TaskInstances associated with DAGs in the
simple_dag_bag will be fetched from the DB and executed
:type simple_dag_bag: SimpleDagBag
:param states: Execute TaskInstances in these states
:type states: Tuple[State]
:return: None
"""
executable_tis = self._find_executable_task_instances(simple_dag_bag, states,
session=session)
def query(result, items):
tis_with_state_changed = self._change_state_for_executable_task_instances(
items,
states,
session=session)
self._enqueue_task_instances_with_queued_state(
simple_dag_bag,
tis_with_state_changed)
session.commit()
return result + len(tis_with_state_changed)
return helpers.reduce_in_chunks(query, executable_tis, 0, self.max_tis_per_query)
def _execute_helper(self, processor_manager):
"""
:param processor_manager: manager to use
:type processor_manager: DagFileProcessorManager
:return: None
"""
self.executor.start()
# 중략
# For the execute duration, parse and schedule DAGs
while (timezone.utcnow() - execute_start_time).total_seconds() < 
self.run_duration or self.run_duration < 0:
self.log.debug("Starting Loop...")
loop_start_time = time.time()
# 중략
# Kick of new processes and collect results from finished ones
self.log.debug("Heartbeating the process manager")
simple_dags = processor_manager.heartbeat()
# Send tasks for execution if available
simple_dag_bag = SimpleDagBag(simple_dags)
if len(simple_dags) > 0:
# 중략
self._execute_task_instances(simple_dag_bag,
(State.SCHEDULED,))
# Call heartbeats
self.log.debug("Heartbeating the executor")
self.executor.heartbeat()
# 중략
# Exit early for a test mode
if processor_manager.max_runs_reached():
self.log.info("Exiting loop as all files have been processed %s times",
self.num_runs)
break
# 후략
def queue_command(self, task_instance, command, priority=1, queue=None):
key = task_instance.key
if key not in self.queued_tasks and key not in self.running:
self.log.info("Adding to queue: %s", command)
self.queued_tasks[key] = (command, priority, queue, task_instance)
else:
self.log.info("could not queue task {}".format(key))
def heartbeat(self):
# Triggering new jobs
if not self.parallelism:
open_slots = len(self.queued_tasks)
else:
open_slots = self.parallelism - len(self.running)
self.log.debug("%s running task instances", len(self.running))
self.log.debug("%s in queue", len(self.queued_tasks))
self.log.debug("%s open slots", open_slots)
sorted_queue = sorted(
[(k, v) for k, v in self.queued_tasks.items()],
key=lambda x: x[1][1],
reverse=True)
for i in range(min((open_slots, len(self.queued_tasks)))):
key, (command, _, queue, ti) = sorted_queue.pop(0)
# TODO(jlowin) without a way to know what Job ran which tasks,
# there is a danger that another Job started running a task
# that was also queued to this executor. This is the last chance
# to check if that happened. The most probable way is that a
# Scheduler tried to run a task that was originally queued by a
# Backfill. This fix reduces the probability of a collision but
# does NOT eliminate it.
self.queued_tasks.pop(key)
ti.refresh_from_db()
if ti.state != State.RUNNING:
self.running[key] = command
self.execute_async(key=key,
command=command,
queue=queue,
executor_config=ti.executor_config)
else:
self.logger.info(
'Task is already running, not sending to '
'executor: {}'.format(key))
# Calling child class sync method
self.log.debug("Calling the %s sync method", self.__class__)
self.sync()
def execute_async(self, key, command):
"""
:param key: the key to identify the TI
:type key: Tuple(dag_id, task_id, execution_date)
:param command: the command to execute
:type command: string
"""
local_worker = LocalWorker(self.executor.result_queue)
local_worker.key = key
local_worker.command = command
self.executor.workers_used += 1
self.executor.workers_active += 1
local_worker.start()
class LocalWorker(multiprocessing.Process, LoggingMixin):
"""LocalWorker Process implementation to run airflow commands. Executes the given
command and puts the result into a result queue when done, terminating execution."""
def __init__(self, result_queue):
"""
:param result_queue: the queue to store result states tuples (key, State)
:type result_queue: multiprocessing.Queue
"""
super(LocalWorker, self).__init__()
self.daemon = True
self.result_queue = result_queue
self.key = None
self.command = None
def execute_work(self, key, command):
"""
Executes command received and stores result state in queue.
:param key: the key to identify the TI
:type key: Tuple(dag_id, task_id, execution_date)
:param command: the command to execute
:type command: string
"""
if key is None:
return
self.log.info("%s running %s", self.__class__.__name__, command)
command = "exec bash -c '{0}'".format(command)
try:
subprocess.check_call(command, shell=True, close_fds=True)
state = State.SUCCESS
except subprocess.CalledProcessError as e:
state = State.FAILED
self.log.error("Failed to execute task %s.", str(e))
# TODO: Why is this commented out?
# raise e
self.result_queue.put((key, state))
def run(self):
self.execute_work(self.key, self.command)
time.sleep(1)
def generate_command(dag_id,
task_id,
execution_date,
mark_success=False,
ignore_all_deps=False,
ignore_depends_on_past=False,
ignore_task_deps=False,
ignore_ti_state=False,
local=False,
pickle_id=None,
file_path=None,
raw=False,
job_id=None,
pool=None,
cfg_path=None
):
"""
Generates the shell command required to execute this task instance.
중략
"""
iso = execution_date.isoformat()
cmd = ["airflow", "run", str(dag_id), str(task_id), str(iso)]
cmd.extend(["--mark_success"]) if mark_success else None
cmd.extend(["--pickle", str(pickle_id)]) if pickle_id else None
cmd.extend(["--job_id", str(job_id)]) if job_id else None
cmd.extend(["-A"]) if ignore_all_deps else None
cmd.extend(["-i"]) if ignore_task_deps else None
cmd.extend(["-I"]) if ignore_depends_on_past else None
cmd.extend(["--force"]) if ignore_ti_state else None
cmd.extend(["--local"]) if local else None
cmd.extend(["--pool", pool]) if pool else None
cmd.extend(["--raw"]) if raw else None
cmd.extend(["-sd", file_path]) if file_path else None
cmd.extend(["--cfg_path", cfg_path]) if cfg_path else None
return cmd
def run(args, dag=None):
# 중략
task = dag.get_task(task_id=args.task_id)
ti = TaskInstance(task, args.execution_date)
ti.refresh_from_db()
ti.init_run_context(raw=args.raw)
hostname = get_hostname()
log.info("Running %s on host %s", ti, hostname)
if args.interactive:
_run(args, dag, ti)
else:
with redirect_stdout(ti.log, logging.INFO), redirect_stderr(ti.log, logging.WARN):
_run(args, dag, ti)
logging.shutdown()
def _run(args, dag, ti):
if args.local:
run_job = jobs.LocalTaskJob(
task_instance=ti,
mark_success=args.mark_success,
pickle_id=args.pickle,
ignore_all_deps=args.ignore_all_dependencies,
ignore_depends_on_past=args.ignore_depends_on_past,
ignore_task_deps=args.ignore_dependencies,
ignore_ti_state=args.force,
pool=args.pool)
run_job.run()
elif args.raw:
ti._run_raw_task(
mark_success=args.mark_success,
job_id=args.job_id,
pool=args.pool,
)
else:
# 후략
def _execute(self):
self.task_runner = get_task_runner(self)
# 중략
if not self.task_instance._check_and_change_state_before_execution(
mark_success=self.mark_success,
ignore_all_deps=self.ignore_all_deps,
ignore_depends_on_past=self.ignore_depends_on_past,
ignore_task_deps=self.ignore_task_deps,
ignore_ti_state=self.ignore_ti_state,
job_id=self.id,
pool=self.pool):
self.log.info("Task is not able to be run")
return
try:
self.task_runner.start()
last_heartbeat_time = time.time()
heartbeat_time_limit = conf.getint('scheduler',
'scheduler_zombie_task_threshold')
while True:
# Monitor the task to see if it's done
return_code = self.task_runner.return_code()
if return_code is not None:
self.log.info("Task exited with return code %s", return_code)
return
# 중략
finally:
self.on_kill() class BashTaskRunner(BaseTaskRunner):
"""
Runs the raw Airflow task by invoking through the Bash shell.
"""
def __init__(self, local_task_job):
super(BashTaskRunner, self).__init__(local_task_job)
def start(self):
self.process = self.run_command(['bash', '-c'], join_args=True)
def return_code(self):
return self.process.poll()
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•
•

More Related Content

What's hot

Drush. Secrets come out.
Drush. Secrets come out.Drush. Secrets come out.
Drush. Secrets come out.
Alex S
 
How Secure Are Docker Containers?
How Secure Are Docker Containers?How Secure Are Docker Containers?
How Secure Are Docker Containers?
Ben Hall
 
Deconstructing the Functional Web with Clojure
Deconstructing the Functional Web with ClojureDeconstructing the Functional Web with Clojure
Deconstructing the Functional Web with Clojure
Norman Richards
 
To infinity and beyond
To infinity and beyondTo infinity and beyond
To infinity and beyond
clintongormley
 
Hadoop
HadoopHadoop
Drupal 8 in action, the route to the method
Drupal 8 in action, the route to the methodDrupal 8 in action, the route to the method
Drupal 8 in action, the route to the method
juanolalla
 
Zabbix LLD from a C Module by Jan-Piet Mens
Zabbix LLD from a C Module by Jan-Piet MensZabbix LLD from a C Module by Jan-Piet Mens
Zabbix LLD from a C Module by Jan-Piet Mens
NETWAYS
 
Anatomy of distributed computing with Hadoop
Anatomy of distributed computing with HadoopAnatomy of distributed computing with Hadoop
Anatomy of distributed computing with Hadoop
Sergey Bushik
 
Best Practices in Handling Performance Issues
Best Practices in Handling Performance IssuesBest Practices in Handling Performance Issues
Best Practices in Handling Performance Issues
Odoo
 
How to create a secured multi tenancy for clustered ML with JupyterHub
How to create a secured multi tenancy for clustered ML with JupyterHubHow to create a secured multi tenancy for clustered ML with JupyterHub
How to create a secured multi tenancy for clustered ML with JupyterHub
Tiago Simões
 
Nouveau document texte
Nouveau document texteNouveau document texte
Nouveau document texte
Sai Ef
 
Yy
YyYy
Yy
yygh
 
Node Powered Mobile
Node Powered MobileNode Powered Mobile
Node Powered Mobile
Tim Caswell
 
Commands documentaion
Commands documentaionCommands documentaion
Commands documentaion
TejalNijai
 
How to go the extra mile on monitoring
How to go the extra mile on monitoringHow to go the extra mile on monitoring
How to go the extra mile on monitoring
Tiago Simões
 
Real Time Web with Node
Real Time Web with NodeReal Time Web with Node
Real Time Web with Node
Tim Caswell
 
Perl Web Client
Perl Web ClientPerl Web Client
Perl Web Client
Flavio Poletti
 
Nubilus Perl
Nubilus PerlNubilus Perl
Nubilus Perl
Flavio Poletti
 
Drush - use full power - DrupalCamp Donetsk 2014
Drush - use full power - DrupalCamp Donetsk 2014Drush - use full power - DrupalCamp Donetsk 2014
Drush - use full power - DrupalCamp Donetsk 2014
Alex S
 
Pry, the good parts
Pry, the good partsPry, the good parts
Pry, the good parts
Conrad Irwin
 

What's hot (20)

Drush. Secrets come out.
Drush. Secrets come out.Drush. Secrets come out.
Drush. Secrets come out.
 
How Secure Are Docker Containers?
How Secure Are Docker Containers?How Secure Are Docker Containers?
How Secure Are Docker Containers?
 
Deconstructing the Functional Web with Clojure
Deconstructing the Functional Web with ClojureDeconstructing the Functional Web with Clojure
Deconstructing the Functional Web with Clojure
 
To infinity and beyond
To infinity and beyondTo infinity and beyond
To infinity and beyond
 
Hadoop
HadoopHadoop
Hadoop
 
Drupal 8 in action, the route to the method
Drupal 8 in action, the route to the methodDrupal 8 in action, the route to the method
Drupal 8 in action, the route to the method
 
Zabbix LLD from a C Module by Jan-Piet Mens
Zabbix LLD from a C Module by Jan-Piet MensZabbix LLD from a C Module by Jan-Piet Mens
Zabbix LLD from a C Module by Jan-Piet Mens
 
Anatomy of distributed computing with Hadoop
Anatomy of distributed computing with HadoopAnatomy of distributed computing with Hadoop
Anatomy of distributed computing with Hadoop
 
Best Practices in Handling Performance Issues
Best Practices in Handling Performance IssuesBest Practices in Handling Performance Issues
Best Practices in Handling Performance Issues
 
How to create a secured multi tenancy for clustered ML with JupyterHub
How to create a secured multi tenancy for clustered ML with JupyterHubHow to create a secured multi tenancy for clustered ML with JupyterHub
How to create a secured multi tenancy for clustered ML with JupyterHub
 
Nouveau document texte
Nouveau document texteNouveau document texte
Nouveau document texte
 
Yy
YyYy
Yy
 
Node Powered Mobile
Node Powered MobileNode Powered Mobile
Node Powered Mobile
 
Commands documentaion
Commands documentaionCommands documentaion
Commands documentaion
 
How to go the extra mile on monitoring
How to go the extra mile on monitoringHow to go the extra mile on monitoring
How to go the extra mile on monitoring
 
Real Time Web with Node
Real Time Web with NodeReal Time Web with Node
Real Time Web with Node
 
Perl Web Client
Perl Web ClientPerl Web Client
Perl Web Client
 
Nubilus Perl
Nubilus PerlNubilus Perl
Nubilus Perl
 
Drush - use full power - DrupalCamp Donetsk 2014
Drush - use full power - DrupalCamp Donetsk 2014Drush - use full power - DrupalCamp Donetsk 2014
Drush - use full power - DrupalCamp Donetsk 2014
 
Pry, the good parts
Pry, the good partsPry, the good parts
Pry, the good parts
 

Similar to Apache Airflow

Magic of Ruby
Magic of RubyMagic of Ruby
Magic of Ruby
Gabriele Lana
 
CONFidence 2015: DTrace + OSX = Fun - Andrzej Dyjak
CONFidence 2015: DTrace + OSX = Fun - Andrzej Dyjak   CONFidence 2015: DTrace + OSX = Fun - Andrzej Dyjak
CONFidence 2015: DTrace + OSX = Fun - Andrzej Dyjak
PROIDEA
 
Ajuste (tuning) del rendimiento de SQL Server 2008
Ajuste (tuning) del rendimiento de SQL Server 2008Ajuste (tuning) del rendimiento de SQL Server 2008
Ajuste (tuning) del rendimiento de SQL Server 2008
Eduardo Castro
 
SQL Server Performance Analysis
SQL Server Performance AnalysisSQL Server Performance Analysis
SQL Server Performance Analysis
Eduardo Castro
 
Python advanced 3.the python std lib by example – system related modules
Python advanced 3.the python std lib by example – system related modulesPython advanced 3.the python std lib by example – system related modules
Python advanced 3.the python std lib by example – system related modules
John(Qiang) Zhang
 
Django Celery
Django Celery Django Celery
Django Celery
Mat Clayton
 
Scaling python webapps from 0 to 50 million users - A top-down approach
Scaling python webapps from 0 to 50 million users - A top-down approachScaling python webapps from 0 to 50 million users - A top-down approach
Scaling python webapps from 0 to 50 million users - A top-down approach
Jinal Jhaveri
 
Process monitoring in UNIX shell scripting
Process monitoring in UNIX shell scriptingProcess monitoring in UNIX shell scripting
Process monitoring in UNIX shell scripting
Dan Morrill
 
Python magicmethods
Python magicmethodsPython magicmethods
Python magicmethods
dreampuf
 
Counting on God
Counting on GodCounting on God
Counting on God
James Gray
 
Airflow tutorials hands_on
Airflow tutorials hands_onAirflow tutorials hands_on
Airflow tutorials hands_on
pko89403
 
Monitoring with Prometheus
Monitoring with PrometheusMonitoring with Prometheus
Monitoring with Prometheus
Shiao-An Yuan
 
Scheduling tasks the human way - Brad Wood - ITB2021
Scheduling tasks the human way -  Brad Wood - ITB2021Scheduling tasks the human way -  Brad Wood - ITB2021
Scheduling tasks the human way - Brad Wood - ITB2021
Ortus Solutions, Corp
 
Hacking ansible
Hacking ansibleHacking ansible
Hacking ansible
bcoca
 
Python Asíncrono - Async Python
Python Asíncrono - Async PythonPython Asíncrono - Async Python
Python Asíncrono - Async Python
Javier Abadía
 
Apache Spark in your likeness - low and high level customization
Apache Spark in your likeness - low and high level customizationApache Spark in your likeness - low and high level customization
Apache Spark in your likeness - low and high level customization
Bartosz Konieczny
 
Commit2015 kharchenko - python generators - ext
Commit2015   kharchenko - python generators - extCommit2015   kharchenko - python generators - ext
Commit2015 kharchenko - python generators - ext
Maxym Kharchenko
 
Go Web Development
Go Web DevelopmentGo Web Development
Go Web Development
Cheng-Yi Yu
 
PyCon 2010 SQLAlchemy tutorial
PyCon 2010 SQLAlchemy tutorialPyCon 2010 SQLAlchemy tutorial
PyCon 2010 SQLAlchemy tutorial
jbellis
 
Metaprogramovanie #1
Metaprogramovanie #1Metaprogramovanie #1
Metaprogramovanie #1
Jano Suchal
 

Similar to Apache Airflow (20)

Magic of Ruby
Magic of RubyMagic of Ruby
Magic of Ruby
 
CONFidence 2015: DTrace + OSX = Fun - Andrzej Dyjak
CONFidence 2015: DTrace + OSX = Fun - Andrzej Dyjak   CONFidence 2015: DTrace + OSX = Fun - Andrzej Dyjak
CONFidence 2015: DTrace + OSX = Fun - Andrzej Dyjak
 
Ajuste (tuning) del rendimiento de SQL Server 2008
Ajuste (tuning) del rendimiento de SQL Server 2008Ajuste (tuning) del rendimiento de SQL Server 2008
Ajuste (tuning) del rendimiento de SQL Server 2008
 
SQL Server Performance Analysis
SQL Server Performance AnalysisSQL Server Performance Analysis
SQL Server Performance Analysis
 
Python advanced 3.the python std lib by example – system related modules
Python advanced 3.the python std lib by example – system related modulesPython advanced 3.the python std lib by example – system related modules
Python advanced 3.the python std lib by example – system related modules
 
Django Celery
Django Celery Django Celery
Django Celery
 
Scaling python webapps from 0 to 50 million users - A top-down approach
Scaling python webapps from 0 to 50 million users - A top-down approachScaling python webapps from 0 to 50 million users - A top-down approach
Scaling python webapps from 0 to 50 million users - A top-down approach
 
Process monitoring in UNIX shell scripting
Process monitoring in UNIX shell scriptingProcess monitoring in UNIX shell scripting
Process monitoring in UNIX shell scripting
 
Python magicmethods
Python magicmethodsPython magicmethods
Python magicmethods
 
Counting on God
Counting on GodCounting on God
Counting on God
 
Airflow tutorials hands_on
Airflow tutorials hands_onAirflow tutorials hands_on
Airflow tutorials hands_on
 
Monitoring with Prometheus
Monitoring with PrometheusMonitoring with Prometheus
Monitoring with Prometheus
 
Scheduling tasks the human way - Brad Wood - ITB2021
Scheduling tasks the human way -  Brad Wood - ITB2021Scheduling tasks the human way -  Brad Wood - ITB2021
Scheduling tasks the human way - Brad Wood - ITB2021
 
Hacking ansible
Hacking ansibleHacking ansible
Hacking ansible
 
Python Asíncrono - Async Python
Python Asíncrono - Async PythonPython Asíncrono - Async Python
Python Asíncrono - Async Python
 
Apache Spark in your likeness - low and high level customization
Apache Spark in your likeness - low and high level customizationApache Spark in your likeness - low and high level customization
Apache Spark in your likeness - low and high level customization
 
Commit2015 kharchenko - python generators - ext
Commit2015   kharchenko - python generators - extCommit2015   kharchenko - python generators - ext
Commit2015 kharchenko - python generators - ext
 
Go Web Development
Go Web DevelopmentGo Web Development
Go Web Development
 
PyCon 2010 SQLAlchemy tutorial
PyCon 2010 SQLAlchemy tutorialPyCon 2010 SQLAlchemy tutorial
PyCon 2010 SQLAlchemy tutorial
 
Metaprogramovanie #1
Metaprogramovanie #1Metaprogramovanie #1
Metaprogramovanie #1
 

Recently uploaded

A presentation that explain the Power BI Licensing
A presentation that explain the Power BI LicensingA presentation that explain the Power BI Licensing
A presentation that explain the Power BI Licensing
AlessioFois2
 
一比一原版(GWU,GW文凭证书)乔治·华盛顿大学毕业证如何办理
一比一原版(GWU,GW文凭证书)乔治·华盛顿大学毕业证如何办理一比一原版(GWU,GW文凭证书)乔治·华盛顿大学毕业证如何办理
一比一原版(GWU,GW文凭证书)乔治·华盛顿大学毕业证如何办理
bopyb
 
一比一原版(UCSF文凭证书)旧金山分校毕业证如何办理
一比一原版(UCSF文凭证书)旧金山分校毕业证如何办理一比一原版(UCSF文凭证书)旧金山分校毕业证如何办理
一比一原版(UCSF文凭证书)旧金山分校毕业证如何办理
nuttdpt
 
Global Situational Awareness of A.I. and where its headed
Global Situational Awareness of A.I. and where its headedGlobal Situational Awareness of A.I. and where its headed
Global Situational Awareness of A.I. and where its headed
vikram sood
 
Analysis insight about a Flyball dog competition team's performance
Analysis insight about a Flyball dog competition team's performanceAnalysis insight about a Flyball dog competition team's performance
Analysis insight about a Flyball dog competition team's performance
roli9797
 
Learn SQL from basic queries to Advance queries
Learn SQL from basic queries to Advance queriesLearn SQL from basic queries to Advance queries
Learn SQL from basic queries to Advance queries
manishkhaire30
 
University of New South Wales degree offer diploma Transcript
University of New South Wales degree offer diploma TranscriptUniversity of New South Wales degree offer diploma Transcript
University of New South Wales degree offer diploma Transcript
soxrziqu
 
4th Modern Marketing Reckoner by MMA Global India & Group M: 60+ experts on W...
4th Modern Marketing Reckoner by MMA Global India & Group M: 60+ experts on W...4th Modern Marketing Reckoner by MMA Global India & Group M: 60+ experts on W...
4th Modern Marketing Reckoner by MMA Global India & Group M: 60+ experts on W...
Social Samosa
 
一比一原版(UMN文凭证书)明尼苏达大学毕业证如何办理
一比一原版(UMN文凭证书)明尼苏达大学毕业证如何办理一比一原版(UMN文凭证书)明尼苏达大学毕业证如何办理
一比一原版(UMN文凭证书)明尼苏达大学毕业证如何办理
nyfuhyz
 
Challenges of Nation Building-1.pptx with more important
Challenges of Nation Building-1.pptx with more importantChallenges of Nation Building-1.pptx with more important
Challenges of Nation Building-1.pptx with more important
Sm321
 
Udemy_2024_Global_Learning_Skills_Trends_Report (1).pdf
Udemy_2024_Global_Learning_Skills_Trends_Report (1).pdfUdemy_2024_Global_Learning_Skills_Trends_Report (1).pdf
Udemy_2024_Global_Learning_Skills_Trends_Report (1).pdf
Fernanda Palhano
 
The Building Blocks of QuestDB, a Time Series Database
The Building Blocks of QuestDB, a Time Series DatabaseThe Building Blocks of QuestDB, a Time Series Database
The Building Blocks of QuestDB, a Time Series Database
javier ramirez
 
06-12-2024-BudapestDataForum-BuildingReal-timePipelineswithFLaNK AIM
06-12-2024-BudapestDataForum-BuildingReal-timePipelineswithFLaNK AIM06-12-2024-BudapestDataForum-BuildingReal-timePipelineswithFLaNK AIM
06-12-2024-BudapestDataForum-BuildingReal-timePipelineswithFLaNK AIM
Timothy Spann
 
ViewShift: Hassle-free Dynamic Policy Enforcement for Every Data Lake
ViewShift: Hassle-free Dynamic Policy Enforcement for Every Data LakeViewShift: Hassle-free Dynamic Policy Enforcement for Every Data Lake
ViewShift: Hassle-free Dynamic Policy Enforcement for Every Data Lake
Walaa Eldin Moustafa
 
Beyond the Basics of A/B Tests: Highly Innovative Experimentation Tactics You...
Beyond the Basics of A/B Tests: Highly Innovative Experimentation Tactics You...Beyond the Basics of A/B Tests: Highly Innovative Experimentation Tactics You...
Beyond the Basics of A/B Tests: Highly Innovative Experimentation Tactics You...
Aggregage
 
STATATHON: Unleashing the Power of Statistics in a 48-Hour Knowledge Extravag...
STATATHON: Unleashing the Power of Statistics in a 48-Hour Knowledge Extravag...STATATHON: Unleashing the Power of Statistics in a 48-Hour Knowledge Extravag...
STATATHON: Unleashing the Power of Statistics in a 48-Hour Knowledge Extravag...
sameer shah
 
办(uts毕业证书)悉尼科技大学毕业证学历证书原版一模一样
办(uts毕业证书)悉尼科技大学毕业证学历证书原版一模一样办(uts毕业证书)悉尼科技大学毕业证学历证书原版一模一样
办(uts毕业证书)悉尼科技大学毕业证学历证书原版一模一样
apvysm8
 
Intelligence supported media monitoring in veterinary medicine
Intelligence supported media monitoring in veterinary medicineIntelligence supported media monitoring in veterinary medicine
Intelligence supported media monitoring in veterinary medicine
AndrzejJarynowski
 
The Ipsos - AI - Monitor 2024 Report.pdf
The  Ipsos - AI - Monitor 2024 Report.pdfThe  Ipsos - AI - Monitor 2024 Report.pdf
The Ipsos - AI - Monitor 2024 Report.pdf
Social Samosa
 
在线办理(英国UCA毕业证书)创意艺术大学毕业证在读证明一模一样
在线办理(英国UCA毕业证书)创意艺术大学毕业证在读证明一模一样在线办理(英国UCA毕业证书)创意艺术大学毕业证在读证明一模一样
在线办理(英国UCA毕业证书)创意艺术大学毕业证在读证明一模一样
v7oacc3l
 

Recently uploaded (20)

A presentation that explain the Power BI Licensing
A presentation that explain the Power BI LicensingA presentation that explain the Power BI Licensing
A presentation that explain the Power BI Licensing
 
一比一原版(GWU,GW文凭证书)乔治·华盛顿大学毕业证如何办理
一比一原版(GWU,GW文凭证书)乔治·华盛顿大学毕业证如何办理一比一原版(GWU,GW文凭证书)乔治·华盛顿大学毕业证如何办理
一比一原版(GWU,GW文凭证书)乔治·华盛顿大学毕业证如何办理
 
一比一原版(UCSF文凭证书)旧金山分校毕业证如何办理
一比一原版(UCSF文凭证书)旧金山分校毕业证如何办理一比一原版(UCSF文凭证书)旧金山分校毕业证如何办理
一比一原版(UCSF文凭证书)旧金山分校毕业证如何办理
 
Global Situational Awareness of A.I. and where its headed
Global Situational Awareness of A.I. and where its headedGlobal Situational Awareness of A.I. and where its headed
Global Situational Awareness of A.I. and where its headed
 
Analysis insight about a Flyball dog competition team's performance
Analysis insight about a Flyball dog competition team's performanceAnalysis insight about a Flyball dog competition team's performance
Analysis insight about a Flyball dog competition team's performance
 
Learn SQL from basic queries to Advance queries
Learn SQL from basic queries to Advance queriesLearn SQL from basic queries to Advance queries
Learn SQL from basic queries to Advance queries
 
University of New South Wales degree offer diploma Transcript
University of New South Wales degree offer diploma TranscriptUniversity of New South Wales degree offer diploma Transcript
University of New South Wales degree offer diploma Transcript
 
4th Modern Marketing Reckoner by MMA Global India & Group M: 60+ experts on W...
4th Modern Marketing Reckoner by MMA Global India & Group M: 60+ experts on W...4th Modern Marketing Reckoner by MMA Global India & Group M: 60+ experts on W...
4th Modern Marketing Reckoner by MMA Global India & Group M: 60+ experts on W...
 
一比一原版(UMN文凭证书)明尼苏达大学毕业证如何办理
一比一原版(UMN文凭证书)明尼苏达大学毕业证如何办理一比一原版(UMN文凭证书)明尼苏达大学毕业证如何办理
一比一原版(UMN文凭证书)明尼苏达大学毕业证如何办理
 
Challenges of Nation Building-1.pptx with more important
Challenges of Nation Building-1.pptx with more importantChallenges of Nation Building-1.pptx with more important
Challenges of Nation Building-1.pptx with more important
 
Udemy_2024_Global_Learning_Skills_Trends_Report (1).pdf
Udemy_2024_Global_Learning_Skills_Trends_Report (1).pdfUdemy_2024_Global_Learning_Skills_Trends_Report (1).pdf
Udemy_2024_Global_Learning_Skills_Trends_Report (1).pdf
 
The Building Blocks of QuestDB, a Time Series Database
The Building Blocks of QuestDB, a Time Series DatabaseThe Building Blocks of QuestDB, a Time Series Database
The Building Blocks of QuestDB, a Time Series Database
 
06-12-2024-BudapestDataForum-BuildingReal-timePipelineswithFLaNK AIM
06-12-2024-BudapestDataForum-BuildingReal-timePipelineswithFLaNK AIM06-12-2024-BudapestDataForum-BuildingReal-timePipelineswithFLaNK AIM
06-12-2024-BudapestDataForum-BuildingReal-timePipelineswithFLaNK AIM
 
ViewShift: Hassle-free Dynamic Policy Enforcement for Every Data Lake
ViewShift: Hassle-free Dynamic Policy Enforcement for Every Data LakeViewShift: Hassle-free Dynamic Policy Enforcement for Every Data Lake
ViewShift: Hassle-free Dynamic Policy Enforcement for Every Data Lake
 
Beyond the Basics of A/B Tests: Highly Innovative Experimentation Tactics You...
Beyond the Basics of A/B Tests: Highly Innovative Experimentation Tactics You...Beyond the Basics of A/B Tests: Highly Innovative Experimentation Tactics You...
Beyond the Basics of A/B Tests: Highly Innovative Experimentation Tactics You...
 
STATATHON: Unleashing the Power of Statistics in a 48-Hour Knowledge Extravag...
STATATHON: Unleashing the Power of Statistics in a 48-Hour Knowledge Extravag...STATATHON: Unleashing the Power of Statistics in a 48-Hour Knowledge Extravag...
STATATHON: Unleashing the Power of Statistics in a 48-Hour Knowledge Extravag...
 
办(uts毕业证书)悉尼科技大学毕业证学历证书原版一模一样
办(uts毕业证书)悉尼科技大学毕业证学历证书原版一模一样办(uts毕业证书)悉尼科技大学毕业证学历证书原版一模一样
办(uts毕业证书)悉尼科技大学毕业证学历证书原版一模一样
 
Intelligence supported media monitoring in veterinary medicine
Intelligence supported media monitoring in veterinary medicineIntelligence supported media monitoring in veterinary medicine
Intelligence supported media monitoring in veterinary medicine
 
The Ipsos - AI - Monitor 2024 Report.pdf
The  Ipsos - AI - Monitor 2024 Report.pdfThe  Ipsos - AI - Monitor 2024 Report.pdf
The Ipsos - AI - Monitor 2024 Report.pdf
 
在线办理(英国UCA毕业证书)创意艺术大学毕业证在读证明一模一样
在线办理(英国UCA毕业证书)创意艺术大学毕业证在读证明一模一样在线办理(英国UCA毕业证书)创意艺术大学毕业证在读证明一模一样
在线办理(英国UCA毕业证书)创意艺术大学毕业证在读证明一模一样
 

Apache Airflow

  • 4.
  • 5.
  • 6.
  • 7.
  • 8.
  • 9.
  • 11.
  • 12.
  • 24. Q&A
  • 25.
  • 26.
  • 27.
  • 28.
  • 29. def scheduler(args): print(settings.HEADER) job = jobs.SchedulerJob( dag_id=args.dag_id, subdir=process_subdir(args.subdir), run_duration=args.run_duration, num_runs=args.num_runs, do_pickle=args.do_pickle) if args.daemon: pid, stdout, stderr, log_file = setup_locations("scheduler", args.pid, args.stdout, args.stderr, args.log_file) handle = setup_logging(log_file) stdout = open(stdout, 'w+') stderr = open(stderr, 'w+') ctx = daemon.DaemonContext( pidfile=TimeoutPIDLockFile(pid, -1), files_preserve=[handle], stdout=stdout, stderr=stderr, ) with ctx: job.run() stdout.close() stderr.close() else: signal.signal(signal.SIGINT, sigint_handler) signal.signal(signal.SIGTERM, sigint_handler) signal.signal(signal.SIGQUIT, sigquit_handler) job.run() class SchedulerJob(BaseJob): """ This SchedulerJob runs for a specific time interval and schedules the jobs that are ready to run. It figures out the latest runs for each task and sees if the dependencies for the next schedules are met. If so, it creates appropriate TaskInstances and sends run commands to the executor. It does this for each task in each DAG and repeats. """ def run(self): Stats.incr(self.__class__.__name__.lower() + '_start', 1, 1) # Adding an entry in the DB with create_session() as session: self.state = State.RUNNING session.add(self) session.commit() id_ = self.id make_transient(self) self.id = id_ try: self._execute() # In case of max runs or max duration self.state = State.SUCCESS except SystemExit as e: # In case of ^C or SIGTERM self.state = State.SUCCESS except Exception as e: self.state = State.FAILED raise finally: self.end_date = timezone.utcnow() session.merge(self) session.commit() Stats.incr(self.__class__.__name__.lower() + '_end', 1, 1) def _execute(self): self.log.info("Starting the scheduler") # DAGs can be pickled for easier remote execution by some executors pickle_dags = False if self.do_pickle and self.executor.__class__ not in (executors.LocalExecutor, executors.SequentialExecutor): pickle_dags = True # Use multiple processes to parse and generate tasks for the # DAGs in parallel. By processing them in separate processes, # we can get parallelism and isolation from potentially harmful # user code. self.log.info("Processing files using up to %s processes at a time", self.max_threads) self.log.info("Running execute loop for %s seconds", self.run_duration) self.log.info("Processing each file at most %s times", self.num_runs) self.log.info("Process each file at most once every %s seconds", self.file_process_interval) self.log.info("Wait until at least %s seconds have passed between file parsing " "loops", self.min_file_parsing_loop_time) self.log.info("Checking for new files in %s every %s seconds", self.subdir, self.dag_dir_list_interval) # Build up a list of Python files that could contain DAGs self.log.info("Searching for files in %s", self.subdir) known_file_paths = list_py_file_paths(self.subdir) self.log.info("There are %s files in %s", len(known_file_paths), self.subdir) def processor_factory(file_path): return DagFileProcessor(file_path, pickle_dags, self.dag_ids) processor_manager = DagFileProcessorManager(self.subdir, known_file_paths, self.max_threads, self.file_process_interval, self.min_file_parsing_loop_time, self.num_runs, processor_factory) try: self._execute_helper(processor_manager) finally: self.log.info("Exited execute loop") # Kill all child processes on exit since we don't want to leave # them as orphaned. # 후략
  • 30. # For the execute duration, parse and schedule DAGs while (timezone.utcnow() - execute_start_time).total_seconds() < self.run_duration or self.run_duration < 0: self.log.debug("Starting Loop...") loop_start_time = time.time() # Traverse the DAG directory for Python files containing DAGs # periodically elapsed_time_since_refresh = (timezone.utcnow() - last_dag_dir_refresh_time).total_seconds() # 중략 # Kick of new processes and collect results from finished ones self.log.debug("Heartbeating the process manager") simple_dags = processor_manager.heartbeat() # Send tasks for execution if available simple_dag_bag = SimpleDagBag(simple_dags) if len(simple_dags) > 0: # Handle cases where a DAG run state is set (perhaps manually) to # a non-running state. Handle task instances that belong to # DAG runs in those states # If a task instance is up for retry but the corresponding DAG run # isn't running, mark the task instance as FAILED so we don't try # to re-run it. self._change_state_for_tis_without_dagrun(simple_dag_bag, [State.UP_FOR_RETRY], State.FAILED) # If a task instance is scheduled or queued, but the corresponding # DAG run isn't running, set the state to NONE so we don't try to # re-run it. self._change_state_for_tis_without_dagrun(simple_dag_bag, [State.QUEUED, State.SCHEDULED], State.NONE) self._execute_task_instances(simple_dag_bag, (State.SCHEDULED,)) # Call heartbeats self.log.debug("Heartbeating the executor") self.executor.heartbeat() # Process events from the executor self._process_executor_events(simple_dag_bag) # Heartbeat the scheduler periodically time_since_last_heartbeat = (timezone.utcnow() - last_self_heartbeat_time).total_seconds() if time_since_last_heartbeat > self.heartrate: self.log.debug("Heartbeating the scheduler") self.heartbeat() last_self_heartbeat_time = timezone.utcnow() # Occasionally print out stats about how fast the files are getting processed if ((timezone.utcnow() - last_stat_print_time).total_seconds() > self.print_stats_interval): if len(known_file_paths) > 0: self._log_file_processing_stats(known_file_paths, processor_manager) last_stat_print_time = timezone.utcnow() loop_end_time = time.time() self.log.debug("Ran scheduling loop in %.2f seconds", loop_end_time - loop_start_time) # Exit early for a test mode if processor_manager.max_runs_reached(): self.log.info("Exiting loop as all files have been processed %s times", self.num_runs) break def _execute_helper(self, processor_manager): """ :param processor_manager: manager to use :type processor_manager: DagFileProcessorManager :return: None """ self.executor.start() # 중략 # For the execute duration, parse and schedule DAGs while (timezone.utcnow() - execute_start_time).total_seconds() < self.run_duration or self.run_duration < 0: self.log.debug("Starting Loop...") loop_start_time = time.time() # 스케줄러 프로세스 실행 동안 반복되는 루프 # Stop any processors processor_manager.terminate() # Verify that all files were processed, and if so, deactivate DAGs that # haven't been touched by the scheduler as they likely have been # deleted. all_files_processed = True for file_path in known_file_paths: if processor_manager.get_last_finish_time(file_path) is None: all_files_processed = False break if all_files_processed: self.log.info( "Deactivating DAGs that haven't been touched since %s", execute_start_time.isoformat() ) models.DAG.deactivate_stale_dags(execute_start_time) self.executor.end() settings.Session.remove()
  • 31. def heartbeat(self): """ This should be periodically called by the scheduler. This method will kick off new processes to process DAG definition files and read the results from the finished processors. :return: a list of SimpleDags that were produced by processors that have finished since the last time this was called :rtype: list[SimpleDag] """ finished_processors = {} """:type : dict[unicode, AbstractDagFileProcessor]""" running_processors = {} """:type : dict[unicode, AbstractDagFileProcessor]""" for file_path, processor in self._processors.items(): if processor.done: self.log.info("Processor for %s finished", file_path) now = timezone.utcnow() finished_processors[file_path] = processor self._last_runtime[file_path] = (now - processor.start_time).total_seconds() self._last_finish_time[file_path] = now self._run_count[file_path] += 1 else: running_processors[file_path] = processor self._processors = running_processors self.log.debug("%s/%s scheduler processes running", len(self._processors), self._parallelism) self.log.debug("%s file paths queued for processing", len(self._file_path_queue)) # Collect all the DAGs that were found in the processed files simple_dags = [] for file_path, processor in finished_processors.items(): if processor.result is None: self.log.warning( "Processor for %s exited with return code %s.", processor.file_path, processor.exit_code ) else: for simple_dag in processor.result: simple_dags.append(simple_dag) # 중략 # Start more processors if we have enough slots and files to process while (self._parallelism - len(self._processors) > 0 and len(self._file_path_queue) > 0): file_path = self._file_path_queue.pop(0) processor = self._processor_factory(file_path) processor.start() self.log.info( "Started a process (PID: %s) to generate tasks for %s", processor.pid, file_path ) self._processors[file_path] = processor # Update scheduler heartbeat count. self._run_count[self._heart_beat_key] += 1 return simple_dags
  • 32. def helper(): # This helper runs in the newly created process log = logging.getLogger("airflow.processor") stdout = StreamLogWriter(log, logging.INFO) stderr = StreamLogWriter(log, logging.WARN) set_context(log, file_path) try: # redirect stdout/stderr to log sys.stdout = stdout sys.stderr = stderr # Re-configure the ORM engine as there are issues with multiple processes settings.configure_orm() # Change the thread name to differentiate log lines. This is # really a separate process, but changing the name of the # process doesn't work, so changing the thread name instead. threading.current_thread().name = thread_name start_time = time.time() log.info("Started process (PID=%s) to work on %s", os.getpid(), file_path) scheduler_job = SchedulerJob(dag_ids=dag_id_white_list, log=log) result = scheduler_job.process_file(file_path, pickle_dags) result_queue.put(result) end_time = time.time() log.info( "Processing %s took %.3f seconds", file_path, end_time - start_time ) except: # Log exceptions through the logging framework. log.exception("Got an exception! Propagating...") raise finally: sys.stdout = sys.__stdout__ sys.stderr = sys.__stderr__ # We re-initialized the ORM within this Process above so we need to # tear it down manually here settings.dispose_orm() def _launch_process(result_queue, file_path, pickle_dags, dag_id_white_list, thread_name): """ Launch a process to process the given file. :param result_queue: the queue to use for passing back the result :type result_queue: multiprocessing.Queue :param file_path: the file to process :type file_path: unicode :param pickle_dags: whether to pickle the DAGs found in the file and save them to the DB :type pickle_dags: bool :param dag_id_white_list: if specified, only examine DAG ID's that are in this list :type dag_id_white_list: list[unicode] :param thread_name: the name to use for the process that is launched :type thread_name: unicode :return: the process that was launched :rtype: multiprocessing.Process """ p = multiprocessing.Process(target=helper, args=(), name="{}-Process".format(thread_name)) p.start() return p def start(self): """ Launch the process and start processing the DAG. """ self._process = DagFileProcessor._launch_process( self._result_queue, self.file_path, self._pickle_dags, self._dag_id_white_list, "DagFileProcessor{}".format(self._instance_id)) self._start_time = timezone.utcnow()
  • 33. def _process_dags(self, dagbag, dags, tis_out): """ Iterates over the dags and processes them. Processing includes: 1. Create appropriate DagRun(s) in the DB. 2. Create appropriate TaskInstance(s) in the DB. 3. Send emails for tasks that have missed SLAs. :param dagbag: a collection of DAGs to process :type dagbag: models.DagBag :param dags: the DAGs from the DagBag to process :type dags: DAG :param tis_out: A queue to add generated TaskInstance objects :type tis_out: multiprocessing.Queue[TaskInstance] :return: None """ for dag in dags: dag = dagbag.get_dag(dag.dag_id) if dag.is_paused: self.log.info("Not processing DAG %s since it's paused", dag.dag_id) continue if not dag: self.log.error("DAG ID %s was not found in the DagBag", dag.dag_id) continue self.log.info("Processing %s", dag.dag_id) dag_run = self.create_dag_run(dag) if dag_run: self.log.info("Created %s", dag_run) self._process_task_instances(dag, tis_out) self.manage_slas(dag) models.DagStat.update([d.dag_id for d in dags]) def process_file(self, file_path, pickle_dags=False, session=None): """ Process a Python file containing Airflow DAGs. This includes: 1. Execute the file and look for DAG objects in the namespace. 2. Pickle the DAG and save it to the DB (if necessary). 3. For each DAG, see what tasks should run and create appropriate task instances in the DB. 4. Record any errors importing the file into ORM 5. Kill (in ORM) any task instances belonging to the DAGs that haven't issued a heartbeat in a while. Returns a list of SimpleDag objects that represent the DAGs found in the file :param file_path: the path to the Python file that should be executed :type file_path: unicode :param pickle_dags: whether serialize the DAGs found in the file and save them to the db :type pickle_dags: bool :return: a list of SimpleDags made from the Dags found in the file :rtype: list[SimpleDag] """ self.log.info("Processing file %s for tasks to queue", file_path) # As DAGs are parsed from this file, they will be converted into SimpleDags simple_dags = [] try: dagbag = models.DagBag(file_path) except Exception: self.log.exception("Failed at reloading the DAG file %s", file_path) Stats.incr('dag_file_refresh_error', 1, 1) return [] # 중략 self._process_dags(dagbag, dags, ti_keys_to_schedule) def _process_task_instances(self, dag, queue, session=None): """ This method schedules the tasks for a single DAG by looking at the active DAG runs and adding task instances that should run to the queue. """ # 중략 for run in active_dag_runs: self.log.debug("Examining active DAG run: %s", run) # this needs a fresh session sometimes tis get detached tis = run.get_task_instances(state=(State.NONE, State.UP_FOR_RETRY)) # this loop is quite slow as it uses are_dependencies_met for # every task (in ti.is_runnable). This is also called in # update_state above which has already checked these tasks for ti in tis: task = dag.get_task(ti.task_id) # fixme: ti.task is transient but needs to be set ti.task = task # future: remove adhoc if task.adhoc: continue if ti.are_dependencies_met( dep_context=DepContext(flag_upstream_failed=True), session=session): self.log.debug('Queuing task: %s', ti) queue.append(ti.key)
  • 34. def process_file(self, file_path, pickle_dags=False, session=None): """ Process a Python file containing Airflow DAGs. This includes: 1. Execute the file and look for DAG objects in the namespace. 2. Pickle the DAG and save it to the DB (if necessary). 3. For each DAG, see what tasks should run and create appropriate task instances in the DB. 4. Record any errors importing the file into ORM 5. Kill (in ORM) any task instances belonging to the DAGs that haven't issued a heartbeat in a while. Returns a list of SimpleDag objects that represent the DAGs found in the file :param file_path: the path to the Python file that should be executed :type file_path: unicode :param pickle_dags: whether serialize the DAGs found in the file and save them to the db :type pickle_dags: bool :return: a list of SimpleDags made from the Dags found in the file :rtype: list[SimpleDag] """ self.log.info("Processing file %s for tasks to queue", file_path) # As DAGs are parsed from this file, they will be converted into SimpleDags simple_dags = [] try: dagbag = models.DagBag(file_path) except Exception: self.log.exception("Failed at reloading the DAG file %s", file_path) Stats.incr('dag_file_refresh_error', 1, 1) return [] # 중략 self._process_dags(dagbag, dags, ti_keys_to_schedule) def process_file(self, file_path, pickle_dags=False, session=None): # 중략 ti_keys_to_schedule = [] self._process_dags(dagbag, dags, ti_keys_to_schedule) for ti_key in ti_keys_to_schedule: dag = dagbag.dags[ti_key[0]] task = dag.get_task(ti_key[1]) ti = models.TaskInstance(task, ti_key[2]) ti.refresh_from_db(session=session, lock_for_update=True) # We can defer checking the task dependency checks to the worker themselves # since they can be expensive to run in the scheduler. dep_context = DepContext(deps=QUEUE_DEPS, ignore_task_deps=True) # Only schedule tasks that have their dependencies met, e.g. to avoid # a task that recently got its state changed to RUNNING from somewhere # other than the scheduler from getting its state overwritten. # TODO(aoen): It's not great that we have to check all the task instance # dependencies twice; once to get the task scheduled, and again to actually # run the task. We should try to come up with a way to only check them once. if ti.are_dependencies_met( dep_context=dep_context, session=session, verbose=True): # Task starts out in the scheduled state. All tasks in the # scheduled state will be sent to the executor ti.state = State.SCHEDULED # Also save this task instance to the DB. self.log.info("Creating / updating %s in ORM", ti) session.merge(ti) # commit batch session.commit() # 중략 return simple_dags def _find_executable_task_instances(self, simple_dag_bag, states, session=None): # 중략 states_to_count_as_running = [State.RUNNING] executable_tis = [] # Get all the queued task instances from associated with scheduled # DagRuns which are not backfilled, in the given states, # and the dag is not paused TI = models.TaskInstance DR = models.DagRun DM = models.DagModel ti_query = ( session .query(TI) .filter(TI.dag_id.in_(simple_dag_bag.dag_ids)) .outerjoin(DR, and_(DR.dag_id == TI.dag_id, DR.execution_date == TI.execution_date)) .filter(or_(DR.run_id == None, not_(DR.run_id.like(BackfillJob.ID_PREFIX + '%')))) .outerjoin(DM, DM.dag_id==TI.dag_id) .filter(or_(DM.dag_id == None, not_(DM.is_paused))) ) if None in states: ti_query = ti_query.filter(or_(TI.state == None, TI.state.in_(states))) else: ti_query = ti_query.filter(TI.state.in_(states)) task_instances_to_examine = ti_query.all() # 중략 # Get the pool settings pools = {p.pool: p for p in session.query(models.Pool).all()} pool_to_task_instances = defaultdict(list) for task_instance in task_instances_to_examine: pool_to_task_instances[task_instance.pool].append(task_instance) # 중략 # Go through each pool, and queue up a task for execution if there are # any open slots in the pool. for pool, task_instances in pool_to_task_instances.items(): # 중략 priority_sorted_task_instances = sorted( task_instances, key=lambda ti: (-ti.priority_weight, ti.execution_date)) # DAG IDs with running tasks that equal the concurrency limit of the dag dag_id_to_possibly_running_task_count = {} for task_instance in priority_sorted_task_instances: if open_slots <= 0: self.log.info( "Not scheduling since there are %s open slots in pool %s", open_slots, pool ) # Can't schedule any more since there are no more open slots. break # Check to make sure that the task concurrency of the DAG hasn't been # reached. # 중략 task_instance_str = "nt".join( ["{}".format(x) for x in executable_tis]) self.log.info("Setting the follow tasks to queued state:nt%s", task_instance_str) # so these dont expire on commit for ti in executable_tis: copy_dag_id = ti.dag_id copy_execution_date = ti.execution_date copy_task_id = ti.task_id make_transient(ti) ti.dag_id = copy_dag_id ti.execution_date = copy_execution_date ti.task_id = copy_task_id return executable_tis _execute_task_instances
  • 35. class LocalExecutor(BaseExecutor): """ LocalExecutor executes tasks locally in parallel. It uses the multiprocessing Python library and queues to parallelize the execution of tasks. """ def start(self): self.result_queue = multiprocessing.Queue() self.queue = None self.workers = [] self.workers_used = 0 self.workers_active = 0 self.impl = (LocalExecutor._UnlimitedParallelism(self) if self.parallelism == 0 else LocalExecutor._LimitedParallelism(self)) self.impl.start() def __init__( self, executor=executors.GetDefaultExecutor(), heartrate=conf.getfloat('scheduler', 'JOB_HEARTBEAT_SEC'), *args, **kwargs): self.hostname = get_hostname() self.executor = executor self.executor_class = executor.__class__.__name__ self.start_date = timezone.utcnow() self.latest_heartbeat = timezone.utcnow() self.heartrate = heartrate self.unixname = getpass.getuser() self.max_tis_per_query = conf.getint('scheduler', 'max_tis_per_query') super(BaseJob, self).__init__(*args, **kwargs) class BaseJob(Base, LoggingMixin): """ Abstract class to be derived for jobs. Jobs are processing items with state and duration that aren't task instances. For instance a BackfillJob is a collection of task instance runs, but should have its own state, start and end time. """ def GetDefaultExecutor(): """Creates a new instance of the configured executor if none exists and returns it""" global DEFAULT_EXECUTOR if DEFAULT_EXECUTOR is not None: return DEFAULT_EXECUTOR executor_name = configuration.conf.get('core', 'EXECUTOR') DEFAULT_EXECUTOR = _get_executor(executor_name) log = LoggingMixin().log log.info("Using executor %s", executor_name) return DEFAULT_EXECUTOR def _get_executor(executor_name): """ Creates a new instance of the named executor. In case the executor name is not know in airflow, look for it in the plugins """ if executor_name == Executors.LocalExecutor: return LocalExecutor() elif executor_name == Executors.SequentialExecutor: return SequentialExecutor() elif executor_name == Executors.CeleryExecutor: from airflow.executors.celery_executor import CeleryExecutor return CeleryExecutor() elif executor_name == Executors.DaskExecutor: from airflow.executors.dask_executor import DaskExecutor return DaskExecutor() elif executor_name == Executors.MesosExecutor: from airflow.contrib.executors.mesos_executor import MesosExecutor return MesosExecutor() elif executor_name == Executors.KubernetesExecutor: from airflow.contrib.executors.kubernetes_executor import KubernetesExecutor return KubernetesExecutor() else: # Loading plugins _integrate_plugins() executor_path = executor_name.split('.') if len(executor_path) != 2: raise AirflowException( "Executor {0} not supported: " "please specify in format plugin_module.executor".format(executor_name)) if executor_path[0] in globals(): return globals()[executor_path[0]].__dict__[executor_path[1]]() else: raise AirflowException("Executor {0} not supported.".format(executor_name)) def _execute_helper(self, processor_manager): """ :param processor_manager: manager to use :type processor_manager: DagFileProcessorManager :return: None """ self.executor.start() # 후략 class SchedulerJob(BaseJob): """ This SchedulerJob runs for a specific time interval and schedules the jobs that are ready to run. It figures out the latest runs for each task and sees if the dependencies for the next schedules are met. If so, it creates appropriate TaskInstances and sends run commands to the executor. It does this for each task in each DAG and repeats. """
  • 36. def _enqueue_task_instances_with_queued_state(self, simple_dag_bag, task_instances): """ Takes task_instances, which should have been set to queued, and enqueues them with the executor. :param task_instances: TaskInstances to enqueue :type task_instances: List[TaskInstance] :param simple_dag_bag: Should contains all of the task_instances' dags :type simple_dag_bag: SimpleDagBag """ TI = models.TaskInstance # actually enqueue them for task_instance in task_instances: simple_dag = simple_dag_bag.get_dag(task_instance.dag_id) command = " ".join(TI.generate_command( task_instance.dag_id, task_instance.task_id, task_instance.execution_date, local=True, mark_success=False, ignore_all_deps=False, ignore_depends_on_past=False, ignore_task_deps=False, ignore_ti_state=False, pool=task_instance.pool, file_path=simple_dag.full_filepath, pickle_id=simple_dag.pickle_id)) priority = task_instance.priority_weight queue = task_instance.queue self.log.info( "Sending %s to executor with priority %s and queue %s", task_instance.key, priority, queue ) # save attributes so sqlalchemy doesnt expire them copy_dag_id = task_instance.dag_id copy_task_id = task_instance.task_id copy_execution_date = task_instance.execution_date make_transient(task_instance) task_instance.dag_id = copy_dag_id task_instance.task_id = copy_task_id task_instance.execution_date = copy_execution_date self.executor.queue_command( task_instance, command, priority=priority, queue=queue) def _execute_task_instances(self, simple_dag_bag, states, session=None): """ Attempts to execute TaskInstances that should be executed by the scheduler. There are three steps: 1. Pick TIs by priority with the constraint that they are in the expected states and that we do exceed max_active_runs or pool limits. 2. Change the state for the TIs above atomically. 3. Enqueue the TIs in the executor. :param simple_dag_bag: TaskInstances associated with DAGs in the simple_dag_bag will be fetched from the DB and executed :type simple_dag_bag: SimpleDagBag :param states: Execute TaskInstances in these states :type states: Tuple[State] :return: None """ executable_tis = self._find_executable_task_instances(simple_dag_bag, states, session=session) def query(result, items): tis_with_state_changed = self._change_state_for_executable_task_instances( items, states, session=session) self._enqueue_task_instances_with_queued_state( simple_dag_bag, tis_with_state_changed) session.commit() return result + len(tis_with_state_changed) return helpers.reduce_in_chunks(query, executable_tis, 0, self.max_tis_per_query) def _execute_helper(self, processor_manager): """ :param processor_manager: manager to use :type processor_manager: DagFileProcessorManager :return: None """ self.executor.start() # 중략 # For the execute duration, parse and schedule DAGs while (timezone.utcnow() - execute_start_time).total_seconds() < self.run_duration or self.run_duration < 0: self.log.debug("Starting Loop...") loop_start_time = time.time() # 중략 # Kick of new processes and collect results from finished ones self.log.debug("Heartbeating the process manager") simple_dags = processor_manager.heartbeat() # Send tasks for execution if available simple_dag_bag = SimpleDagBag(simple_dags) if len(simple_dags) > 0: # 중략 self._execute_task_instances(simple_dag_bag, (State.SCHEDULED,)) # Call heartbeats self.log.debug("Heartbeating the executor") self.executor.heartbeat() # 중략 # Exit early for a test mode if processor_manager.max_runs_reached(): self.log.info("Exiting loop as all files have been processed %s times", self.num_runs) break # 후략 def queue_command(self, task_instance, command, priority=1, queue=None): key = task_instance.key if key not in self.queued_tasks and key not in self.running: self.log.info("Adding to queue: %s", command) self.queued_tasks[key] = (command, priority, queue, task_instance) else: self.log.info("could not queue task {}".format(key))
  • 37. def heartbeat(self): # Triggering new jobs if not self.parallelism: open_slots = len(self.queued_tasks) else: open_slots = self.parallelism - len(self.running) self.log.debug("%s running task instances", len(self.running)) self.log.debug("%s in queue", len(self.queued_tasks)) self.log.debug("%s open slots", open_slots) sorted_queue = sorted( [(k, v) for k, v in self.queued_tasks.items()], key=lambda x: x[1][1], reverse=True) for i in range(min((open_slots, len(self.queued_tasks)))): key, (command, _, queue, ti) = sorted_queue.pop(0) # TODO(jlowin) without a way to know what Job ran which tasks, # there is a danger that another Job started running a task # that was also queued to this executor. This is the last chance # to check if that happened. The most probable way is that a # Scheduler tried to run a task that was originally queued by a # Backfill. This fix reduces the probability of a collision but # does NOT eliminate it. self.queued_tasks.pop(key) ti.refresh_from_db() if ti.state != State.RUNNING: self.running[key] = command self.execute_async(key=key, command=command, queue=queue, executor_config=ti.executor_config) else: self.logger.info( 'Task is already running, not sending to ' 'executor: {}'.format(key)) # Calling child class sync method self.log.debug("Calling the %s sync method", self.__class__) self.sync() def execute_async(self, key, command): """ :param key: the key to identify the TI :type key: Tuple(dag_id, task_id, execution_date) :param command: the command to execute :type command: string """ local_worker = LocalWorker(self.executor.result_queue) local_worker.key = key local_worker.command = command self.executor.workers_used += 1 self.executor.workers_active += 1 local_worker.start() class LocalWorker(multiprocessing.Process, LoggingMixin): """LocalWorker Process implementation to run airflow commands. Executes the given command and puts the result into a result queue when done, terminating execution.""" def __init__(self, result_queue): """ :param result_queue: the queue to store result states tuples (key, State) :type result_queue: multiprocessing.Queue """ super(LocalWorker, self).__init__() self.daemon = True self.result_queue = result_queue self.key = None self.command = None def execute_work(self, key, command): """ Executes command received and stores result state in queue. :param key: the key to identify the TI :type key: Tuple(dag_id, task_id, execution_date) :param command: the command to execute :type command: string """ if key is None: return self.log.info("%s running %s", self.__class__.__name__, command) command = "exec bash -c '{0}'".format(command) try: subprocess.check_call(command, shell=True, close_fds=True) state = State.SUCCESS except subprocess.CalledProcessError as e: state = State.FAILED self.log.error("Failed to execute task %s.", str(e)) # TODO: Why is this commented out? # raise e self.result_queue.put((key, state)) def run(self): self.execute_work(self.key, self.command) time.sleep(1)
  • 38. def generate_command(dag_id, task_id, execution_date, mark_success=False, ignore_all_deps=False, ignore_depends_on_past=False, ignore_task_deps=False, ignore_ti_state=False, local=False, pickle_id=None, file_path=None, raw=False, job_id=None, pool=None, cfg_path=None ): """ Generates the shell command required to execute this task instance. 중략 """ iso = execution_date.isoformat() cmd = ["airflow", "run", str(dag_id), str(task_id), str(iso)] cmd.extend(["--mark_success"]) if mark_success else None cmd.extend(["--pickle", str(pickle_id)]) if pickle_id else None cmd.extend(["--job_id", str(job_id)]) if job_id else None cmd.extend(["-A"]) if ignore_all_deps else None cmd.extend(["-i"]) if ignore_task_deps else None cmd.extend(["-I"]) if ignore_depends_on_past else None cmd.extend(["--force"]) if ignore_ti_state else None cmd.extend(["--local"]) if local else None cmd.extend(["--pool", pool]) if pool else None cmd.extend(["--raw"]) if raw else None cmd.extend(["-sd", file_path]) if file_path else None cmd.extend(["--cfg_path", cfg_path]) if cfg_path else None return cmd def run(args, dag=None): # 중략 task = dag.get_task(task_id=args.task_id) ti = TaskInstance(task, args.execution_date) ti.refresh_from_db() ti.init_run_context(raw=args.raw) hostname = get_hostname() log.info("Running %s on host %s", ti, hostname) if args.interactive: _run(args, dag, ti) else: with redirect_stdout(ti.log, logging.INFO), redirect_stderr(ti.log, logging.WARN): _run(args, dag, ti) logging.shutdown() def _run(args, dag, ti): if args.local: run_job = jobs.LocalTaskJob( task_instance=ti, mark_success=args.mark_success, pickle_id=args.pickle, ignore_all_deps=args.ignore_all_dependencies, ignore_depends_on_past=args.ignore_depends_on_past, ignore_task_deps=args.ignore_dependencies, ignore_ti_state=args.force, pool=args.pool) run_job.run() elif args.raw: ti._run_raw_task( mark_success=args.mark_success, job_id=args.job_id, pool=args.pool, ) else: # 후략 def _execute(self): self.task_runner = get_task_runner(self) # 중략 if not self.task_instance._check_and_change_state_before_execution( mark_success=self.mark_success, ignore_all_deps=self.ignore_all_deps, ignore_depends_on_past=self.ignore_depends_on_past, ignore_task_deps=self.ignore_task_deps, ignore_ti_state=self.ignore_ti_state, job_id=self.id, pool=self.pool): self.log.info("Task is not able to be run") return try: self.task_runner.start() last_heartbeat_time = time.time() heartbeat_time_limit = conf.getint('scheduler', 'scheduler_zombie_task_threshold') while True: # Monitor the task to see if it's done return_code = self.task_runner.return_code() if return_code is not None: self.log.info("Task exited with return code %s", return_code) return # 중략 finally: self.on_kill() class BashTaskRunner(BaseTaskRunner): """ Runs the raw Airflow task by invoking through the Bash shell. """ def __init__(self, local_task_job): super(BashTaskRunner, self).__init__(local_task_job) def start(self): self.process = self.run_command(['bash', '-c'], join_args=True) def return_code(self): return self.process.poll()