Apache Airflow

Apache Airflow
넷마블 데이터인프라팀 김장현

•
•
•
•
•
•
•
•

•
•
•
•
•
•
•
•
•
•
•
•
•
•
•

•
•
•
•
•
•
•
•
•
•
•
•
•
•

def scheduler(args):
print(settings.HEADER)
job = jobs.SchedulerJob(
dag_id=args.dag_id,
subdir=process_subdir(args.subdir),
run_duration=args.run_duration,
num_runs=args.num_runs,
do_pickle=args.do_pickle)
if args.daemon:
pid, stdout, stderr, log_file = setup_locations("scheduler",
args.pid,
args.stdout,
args.stderr,
args.log_file)
handle = setup_logging(log_file)
stdout = open(stdout, 'w+')
stderr = open(stderr, 'w+')
ctx = daemon.DaemonContext(
pidfile=TimeoutPIDLockFile(pid, -1),
files_preserve=[handle],
stdout=stdout,
stderr=stderr,
)
with ctx:
job.run()
stdout.close()
stderr.close()
else:
signal.signal(signal.SIGINT, sigint_handler)
signal.signal(signal.SIGTERM, sigint_handler)
signal.signal(signal.SIGQUIT, sigquit_handler)
job.run()
class SchedulerJob(BaseJob):
"""
This SchedulerJob runs for a specific time interval and schedules the jobs
that are ready to run. It figures out the latest runs for each
task and sees if the dependencies for the next schedules are met.
If so, it creates appropriate TaskInstances and sends run commands to the
executor. It does this for each task in each DAG and repeats.
"""
def run(self):
Stats.incr(self.__class__.__name__.lower() + '_start', 1, 1)
# Adding an entry in the DB
with create_session() as session:
self.state = State.RUNNING
session.add(self)
session.commit()
id_ = self.id
make_transient(self)
self.id = id_
try:
self._execute()
# In case of max runs or max duration
self.state = State.SUCCESS
except SystemExit as e:
# In case of ^C or SIGTERM
self.state = State.SUCCESS
except Exception as e:
self.state = State.FAILED
raise
finally:
self.end_date = timezone.utcnow()
session.merge(self)
session.commit()
Stats.incr(self.__class__.__name__.lower() + '_end', 1, 1)
def _execute(self):
self.log.info("Starting the scheduler")
# DAGs can be pickled for easier remote execution by some executors
pickle_dags = False
if self.do_pickle and self.executor.__class__ not in
(executors.LocalExecutor, executors.SequentialExecutor):
pickle_dags = True
# Use multiple processes to parse and generate tasks for the
# DAGs in parallel. By processing them in separate processes,
# we can get parallelism and isolation from potentially harmful
# user code.
self.log.info("Processing files using up to %s processes at a time",
self.max_threads)
self.log.info("Running execute loop for %s seconds", self.run_duration)
self.log.info("Processing each file at most %s times", self.num_runs)
self.log.info("Process each file at most once every %s seconds",
self.file_process_interval)
self.log.info("Wait until at least %s seconds have passed between file parsing "
"loops", self.min_file_parsing_loop_time)
self.log.info("Checking for new files in %s every %s seconds",
self.subdir, self.dag_dir_list_interval)
# Build up a list of Python files that could contain DAGs
self.log.info("Searching for files in %s", self.subdir)
known_file_paths = list_py_file_paths(self.subdir)
self.log.info("There are %s files in %s", len(known_file_paths), self.subdir)
def processor_factory(file_path):
return DagFileProcessor(file_path,
pickle_dags,
self.dag_ids)
processor_manager = DagFileProcessorManager(self.subdir,
known_file_paths,
self.max_threads,
self.file_process_interval,
self.min_file_parsing_loop_time,
self.num_runs,
processor_factory)
try:
self._execute_helper(processor_manager)
finally:
self.log.info("Exited execute loop")
# Kill all child processes on exit since we don't want to leave
# them as orphaned.
# 후략

# For the execute duration, parse and schedule DAGs
while (timezone.utcnow() - execute_start_time).total_seconds() <
self.run_duration or self.run_duration < 0:
self.log.debug("Starting Loop...")
loop_start_time = time.time()
# Traverse the DAG directory for Python files containing DAGs
# periodically
elapsed_time_since_refresh = (timezone.utcnow() -
last_dag_dir_refresh_time).total_seconds()
# 중략
# Kick of new processes and collect results from finished ones
self.log.debug("Heartbeating the process manager")
simple_dags = processor_manager.heartbeat()
# Send tasks for execution if available
simple_dag_bag = SimpleDagBag(simple_dags)
if len(simple_dags) > 0:
# Handle cases where a DAG run state is set (perhaps manually) to
# a non-running state. Handle task instances that belong to
# DAG runs in those states
# If a task instance is up for retry but the corresponding DAG run
# isn't running, mark the task instance as FAILED so we don't try
# to re-run it.
self._change_state_for_tis_without_dagrun(simple_dag_bag,
[State.UP_FOR_RETRY],
State.FAILED)
# If a task instance is scheduled or queued, but the corresponding
# DAG run isn't running, set the state to NONE so we don't try to
# re-run it.
self._change_state_for_tis_without_dagrun(simple_dag_bag,
[State.QUEUED,
State.SCHEDULED],
State.NONE)
self._execute_task_instances(simple_dag_bag,
(State.SCHEDULED,))
# Call heartbeats
self.log.debug("Heartbeating the executor")
self.executor.heartbeat()
# Process events from the executor
self._process_executor_events(simple_dag_bag)
# Heartbeat the scheduler periodically
time_since_last_heartbeat = (timezone.utcnow() -
last_self_heartbeat_time).total_seconds()
if time_since_last_heartbeat > self.heartrate:
self.log.debug("Heartbeating the scheduler")
self.heartbeat()
last_self_heartbeat_time = timezone.utcnow()
# Occasionally print out stats about how fast the files are getting processed
if ((timezone.utcnow() - last_stat_print_time).total_seconds() >
self.print_stats_interval):
if len(known_file_paths) > 0:
self._log_file_processing_stats(known_file_paths,
processor_manager)
last_stat_print_time = timezone.utcnow()
loop_end_time = time.time()
self.log.debug("Ran scheduling loop in %.2f seconds",
loop_end_time - loop_start_time)
# Exit early for a test mode
if processor_manager.max_runs_reached():
self.log.info("Exiting loop as all files have been processed %s times",
self.num_runs)
break
def _execute_helper(self, processor_manager):
"""
:param processor_manager: manager to use
:type processor_manager: DagFileProcessorManager
:return: None
"""
self.executor.start()
# 중략
# 스케줄러 프로세스 실행 동안 반복되는 루프
# Stop any processors
processor_manager.terminate()
# Verify that all files were processed, and if so, deactivate DAGs that
# haven't been touched by the scheduler as they likely have been
# deleted.
all_files_processed = True
for file_path in known_file_paths:
if processor_manager.get_last_finish_time(file_path) is None:
all_files_processed = False
break
if all_files_processed:
self.log.info(
"Deactivating DAGs that haven't been touched since %s",
execute_start_time.isoformat()
)
models.DAG.deactivate_stale_dags(execute_start_time)
self.executor.end()
settings.Session.remove()

def heartbeat(self):
"""
This should be periodically called by the scheduler. This method will
kick off new processes to process DAG definition files and read the
results from the finished processors.
:return: a list of SimpleDags that were produced by processors that
have finished since the last time this was called
:rtype: list[SimpleDag]
"""
finished_processors = {}
""":type : dict[unicode, AbstractDagFileProcessor]"""
running_processors = {}
""":type : dict[unicode, AbstractDagFileProcessor]"""
for file_path, processor in self._processors.items():
if processor.done:
self.log.info("Processor for %s finished", file_path)
now = timezone.utcnow()
finished_processors[file_path] = processor
self._last_runtime[file_path] = (now -
processor.start_time).total_seconds()
self._last_finish_time[file_path] = now
self._run_count[file_path] += 1
else:
running_processors[file_path] = processor
self._processors = running_processors
self.log.debug("%s/%s scheduler processes running",
len(self._processors), self._parallelism)
self.log.debug("%s file paths queued for processing",
len(self._file_path_queue))
# Collect all the DAGs that were found in the processed files
simple_dags = []
for file_path, processor in finished_processors.items():
if processor.result is None:
self.log.warning(
"Processor for %s exited with return code %s.",
processor.file_path, processor.exit_code
)
else:
for simple_dag in processor.result:
simple_dags.append(simple_dag)
# 중략
# Start more processors if we have enough slots and files to process
while (self._parallelism - len(self._processors) > 0 and
len(self._file_path_queue) > 0):
file_path = self._file_path_queue.pop(0)
processor = self._processor_factory(file_path)
processor.start()
self.log.info(
"Started a process (PID: %s) to generate tasks for %s",
processor.pid, file_path
)
self._processors[file_path] = processor
# Update scheduler heartbeat count.
self._run_count[self._heart_beat_key] += 1
return simple_dags

def helper():
# This helper runs in the newly created process
log = logging.getLogger("airflow.processor")
stdout = StreamLogWriter(log, logging.INFO)
stderr = StreamLogWriter(log, logging.WARN)
set_context(log, file_path)
try:
# redirect stdout/stderr to log
sys.stdout = stdout
sys.stderr = stderr
# Re-configure the ORM engine as there are issues with multiple processes
settings.configure_orm()
# Change the thread name to differentiate log lines. This is
# really a separate process, but changing the name of the
# process doesn't work, so changing the thread name instead.
threading.current_thread().name = thread_name
start_time = time.time()
log.info("Started process (PID=%s) to work on %s",
os.getpid(), file_path)
scheduler_job = SchedulerJob(dag_ids=dag_id_white_list, log=log)
result = scheduler_job.process_file(file_path,
pickle_dags)
result_queue.put(result)
end_time = time.time()
log.info(
"Processing %s took %.3f seconds", file_path, end_time - start_time
)
except:
# Log exceptions through the logging framework.
log.exception("Got an exception! Propagating...")
raise
finally:
sys.stdout = sys.__stdout__
sys.stderr = sys.__stderr__
# We re-initialized the ORM within this Process above so we need to
# tear it down manually here
settings.dispose_orm()
def _launch_process(result_queue,
file_path,
pickle_dags,
dag_id_white_list,
thread_name):
"""
Launch a process to process the given file.
:param result_queue: the queue to use for passing back the result
:type result_queue: multiprocessing.Queue
:param file_path: the file to process
:type file_path: unicode
:param pickle_dags: whether to pickle the DAGs found in the file and
save them to the DB
:type pickle_dags: bool
:param dag_id_white_list: if specified, only examine DAG ID's that are
in this list
:type dag_id_white_list: list[unicode]
:param thread_name: the name to use for the process that is launched
:type thread_name: unicode
:return: the process that was launched
:rtype: multiprocessing.Process
"""
p = multiprocessing.Process(target=helper,
args=(),
name="{}-Process".format(thread_name))
p.start()
return p
def start(self):
"""
Launch the process and start processing the DAG.
"""
self._process = DagFileProcessor._launch_process(
self._result_queue,
self.file_path,
self._pickle_dags,
self._dag_id_white_list,
"DagFileProcessor{}".format(self._instance_id))
self._start_time = timezone.utcnow()

def _process_dags(self, dagbag, dags, tis_out):
"""
Iterates over the dags and processes them. Processing includes:
1. Create appropriate DagRun(s) in the DB.
2. Create appropriate TaskInstance(s) in the DB.
3. Send emails for tasks that have missed SLAs.
:param dagbag: a collection of DAGs to process
:type dagbag: models.DagBag
:param dags: the DAGs from the DagBag to process
:type dags: DAG
:param tis_out: A queue to add generated TaskInstance objects
:type tis_out: multiprocessing.Queue[TaskInstance]
:return: None
"""
for dag in dags:
dag = dagbag.get_dag(dag.dag_id)
if dag.is_paused:
self.log.info("Not processing DAG %s since it's paused", dag.dag_id)
continue
if not dag:
self.log.error("DAG ID %s was not found in the DagBag", dag.dag_id)
continue
self.log.info("Processing %s", dag.dag_id)
dag_run = self.create_dag_run(dag)
if dag_run:
self.log.info("Created %s", dag_run)
self._process_task_instances(dag, tis_out)
self.manage_slas(dag)
models.DagStat.update([d.dag_id for d in dags])
def process_file(self, file_path, pickle_dags=False, session=None):
"""
Process a Python file containing Airflow DAGs.
This includes:
1. Execute the file and look for DAG objects in the namespace.
2. Pickle the DAG and save it to the DB (if necessary).
3. For each DAG, see what tasks should run and create appropriate task
instances in the DB.
4. Record any errors importing the file into ORM
5. Kill (in ORM) any task instances belonging to the DAGs that haven't
issued a heartbeat in a while.
Returns a list of SimpleDag objects that represent the DAGs found in
the file
:param file_path: the path to the Python file that should be executed
:param pickle_dags: whether serialize the DAGs found in the file and
save them to the db
:return: a list of SimpleDags made from the Dags found in the file
"""
self.log.info("Processing file %s for tasks to queue", file_path)
# As DAGs are parsed from this file, they will be converted into SimpleDags
simple_dags = []
try:
dagbag = models.DagBag(file_path)
except Exception:
self.log.exception("Failed at reloading the DAG file %s", file_path)
Stats.incr('dag_file_refresh_error', 1, 1)
return []
# 중략
self._process_dags(dagbag, dags, ti_keys_to_schedule)
def _process_task_instances(self, dag, queue, session=None):
"""
This method schedules the tasks for a single DAG by looking at the
active DAG runs and adding task instances that should run to the
queue.
"""
# 중략
for run in active_dag_runs:
self.log.debug("Examining active DAG run: %s", run)
# this needs a fresh session sometimes tis get detached
tis = run.get_task_instances(state=(State.NONE,
State.UP_FOR_RETRY))
# this loop is quite slow as it uses are_dependencies_met for
# every task (in ti.is_runnable). This is also called in
# update_state above which has already checked these tasks
for ti in tis:
task = dag.get_task(ti.task_id)
# fixme: ti.task is transient but needs to be set
ti.task = task
# future: remove adhoc
if task.adhoc:
continue
if ti.are_dependencies_met(
dep_context=DepContext(flag_upstream_failed=True),
session=session):
self.log.debug('Queuing task: %s', ti)
queue.append(ti.key)

"""
Process a Python file containing Airflow DAGs.
This includes:
1. Execute the file and look for DAG objects in the namespace.
2. Pickle the DAG and save it to the DB (if necessary).
3. For each DAG, see what tasks should run and create appropriate task
instances in the DB.
4. Record any errors importing the file into ORM
5. Kill (in ORM) any task instances belonging to the DAGs that haven't
issued a heartbeat in a while.
Returns a list of SimpleDag objects that represent the DAGs found in
the file
:param file_path: the path to the Python file that should be executed
:param pickle_dags: whether serialize the DAGs found in the file and
save them to the db
:return: a list of SimpleDags made from the Dags found in the file
"""
self.log.info("Processing file %s for tasks to queue", file_path)
# As DAGs are parsed from this file, they will be converted into SimpleDags
simple_dags = []
try:
dagbag = models.DagBag(file_path)
except Exception:
self.log.exception("Failed at reloading the DAG file %s", file_path)
Stats.incr('dag_file_refresh_error', 1, 1)
return []
# 중략
# 중략
ti_keys_to_schedule = []
for ti_key in ti_keys_to_schedule:
dag = dagbag.dags[ti_key[0]]
task = dag.get_task(ti_key[1])
ti = models.TaskInstance(task, ti_key[2])
ti.refresh_from_db(session=session, lock_for_update=True)
# We can defer checking the task dependency checks to the worker themselves
# since they can be expensive to run in the scheduler.
dep_context = DepContext(deps=QUEUE_DEPS, ignore_task_deps=True)
# Only schedule tasks that have their dependencies met, e.g. to avoid
# a task that recently got its state changed to RUNNING from somewhere
# other than the scheduler from getting its state overwritten.
# TODO(aoen): It's not great that we have to check all the task instance
# dependencies twice; once to get the task scheduled, and again to actually
# run the task. We should try to come up with a way to only check them once.
if ti.are_dependencies_met(
dep_context=dep_context,
session=session,
verbose=True):
# Task starts out in the scheduled state. All tasks in the
# scheduled state will be sent to the executor
ti.state = State.SCHEDULED
# Also save this task instance to the DB.
self.log.info("Creating / updating %s in ORM", ti)
session.merge(ti)
# commit batch
session.commit()
# 중략
return simple_dags
def _find_executable_task_instances(self, simple_dag_bag, states, session=None):
# 중략
states_to_count_as_running = [State.RUNNING]
executable_tis = []
# Get all the queued task instances from associated with scheduled
# DagRuns which are not backfilled, in the given states,
# and the dag is not paused
TI = models.TaskInstance
DR = models.DagRun
DM = models.DagModel
ti_query = (
session
.query(TI)
.filter(TI.dag_id.in_(simple_dag_bag.dag_ids))
.outerjoin(DR,
and_(DR.dag_id == TI.dag_id,
DR.execution_date == TI.execution_date))
.filter(or_(DR.run_id == None,
not_(DR.run_id.like(BackfillJob.ID_PREFIX + '%'))))
.outerjoin(DM, DM.dag_id==TI.dag_id)
.filter(or_(DM.dag_id == None,
not_(DM.is_paused)))
)
if None in states:
ti_query = ti_query.filter(or_(TI.state == None, TI.state.in_(states)))
else:
ti_query = ti_query.filter(TI.state.in_(states))
task_instances_to_examine = ti_query.all()
# 중략
# Get the pool settings
pools = {p.pool: p for p in session.query(models.Pool).all()}
pool_to_task_instances = defaultdict(list)
for task_instance in task_instances_to_examine:
pool_to_task_instances[task_instance.pool].append(task_instance)
# 중략
# Go through each pool, and queue up a task for execution if there are
# any open slots in the pool.
for pool, task_instances in pool_to_task_instances.items():
# 중략
priority_sorted_task_instances = sorted(
task_instances, key=lambda ti: (-ti.priority_weight, ti.execution_date))
# DAG IDs with running tasks that equal the concurrency limit of the dag
dag_id_to_possibly_running_task_count = {}
for task_instance in priority_sorted_task_instances:
if open_slots <= 0:
self.log.info(
"Not scheduling since there are %s open slots in pool %s",
open_slots, pool
)
# Can't schedule any more since there are no more open slots.
break
# Check to make sure that the task concurrency of the DAG hasn't been
# reached.
# 중략
task_instance_str = "nt".join(
["{}".format(x) for x in executable_tis])
self.log.info("Setting the follow tasks to queued state:nt%s", task_instance_str)
# so these dont expire on commit
for ti in executable_tis:
copy_dag_id = ti.dag_id
copy_execution_date = ti.execution_date
copy_task_id = ti.task_id
make_transient(ti)
ti.dag_id = copy_dag_id
ti.execution_date = copy_execution_date
ti.task_id = copy_task_id
return executable_tis
_execute_task_instances

class LocalExecutor(BaseExecutor):
"""
LocalExecutor executes tasks locally in parallel. It uses the
multiprocessing Python library and queues to parallelize the execution
of tasks.
"""
def start(self):
self.result_queue = multiprocessing.Queue()
self.queue = None
self.workers = []
self.workers_used = 0
self.workers_active = 0
self.impl = (LocalExecutor._UnlimitedParallelism(self) if self.parallelism == 0
else LocalExecutor._LimitedParallelism(self))
self.impl.start()
def __init__(
self,
executor=executors.GetDefaultExecutor(),
heartrate=conf.getfloat('scheduler', 'JOB_HEARTBEAT_SEC'),
*args, **kwargs):
self.hostname = get_hostname()
self.executor = executor
self.executor_class = executor.__class__.__name__
self.start_date = timezone.utcnow()
self.latest_heartbeat = timezone.utcnow()
self.heartrate = heartrate
self.unixname = getpass.getuser()
self.max_tis_per_query = conf.getint('scheduler', 'max_tis_per_query')
super(BaseJob, self).__init__(*args, **kwargs)
class BaseJob(Base, LoggingMixin):
"""
Abstract class to be derived for jobs. Jobs are processing items with state
and duration that aren't task instances. For instance a BackfillJob is
a collection of task instance runs, but should have its own state, start
and end time.
"""
def GetDefaultExecutor():
"""Creates a new instance of the configured executor if none exists and returns it"""
global DEFAULT_EXECUTOR
if DEFAULT_EXECUTOR is not None:
return DEFAULT_EXECUTOR
executor_name = configuration.conf.get('core', 'EXECUTOR')
DEFAULT_EXECUTOR = _get_executor(executor_name)
log = LoggingMixin().log
log.info("Using executor %s", executor_name)
return DEFAULT_EXECUTOR
def _get_executor(executor_name):
"""
Creates a new instance of the named executor.
In case the executor name is not know in airflow,
look for it in the plugins
"""
if executor_name == Executors.LocalExecutor:
return LocalExecutor()
elif executor_name == Executors.SequentialExecutor:
return SequentialExecutor()
elif executor_name == Executors.CeleryExecutor:
from airflow.executors.celery_executor import CeleryExecutor
return CeleryExecutor()
elif executor_name == Executors.DaskExecutor:
from airflow.executors.dask_executor import DaskExecutor
return DaskExecutor()
elif executor_name == Executors.MesosExecutor:
from airflow.contrib.executors.mesos_executor import MesosExecutor
return MesosExecutor()
elif executor_name == Executors.KubernetesExecutor:
from airflow.contrib.executors.kubernetes_executor import KubernetesExecutor
return KubernetesExecutor()
else:
# Loading plugins
_integrate_plugins()
executor_path = executor_name.split('.')
if len(executor_path) != 2:
raise AirflowException(
"Executor {0} not supported: "
"please specify in format plugin_module.executor".format(executor_name))
if executor_path[0] in globals():
return globals()[executor_path[0]].__dict__[executor_path[1]]()
else:
raise AirflowException("Executor {0} not supported.".format(executor_name))
"""
:return: None
"""
# 후략
class SchedulerJob(BaseJob):
"""
This SchedulerJob runs for a specific time interval and schedules the jobs
that are ready to run. It figures out the latest runs for each
task and sees if the dependencies for the next schedules are met.
If so, it creates appropriate TaskInstances and sends run commands to the
executor. It does this for each task in each DAG and repeats.
"""

def _enqueue_task_instances_with_queued_state(self, simple_dag_bag, task_instances):
"""
Takes task_instances, which should have been set to queued, and enqueues them
with the executor.
:param task_instances: TaskInstances to enqueue
:type task_instances: List[TaskInstance]
:param simple_dag_bag: Should contains all of the task_instances' dags
:type simple_dag_bag: SimpleDagBag
"""
TI = models.TaskInstance
# actually enqueue them
for task_instance in task_instances:
simple_dag = simple_dag_bag.get_dag(task_instance.dag_id)
command = " ".join(TI.generate_command(
task_instance.dag_id,
task_instance.task_id,
task_instance.execution_date,
local=True,
mark_success=False,
ignore_all_deps=False,
ignore_depends_on_past=False,
ignore_task_deps=False,
ignore_ti_state=False,
pool=task_instance.pool,
file_path=simple_dag.full_filepath,
pickle_id=simple_dag.pickle_id))
priority = task_instance.priority_weight
queue = task_instance.queue
self.log.info(
"Sending %s to executor with priority %s and queue %s",
task_instance.key, priority, queue
)
# save attributes so sqlalchemy doesnt expire them
copy_dag_id = task_instance.dag_id
copy_task_id = task_instance.task_id
copy_execution_date = task_instance.execution_date
make_transient(task_instance)
task_instance.dag_id = copy_dag_id
task_instance.task_id = copy_task_id
task_instance.execution_date = copy_execution_date
self.executor.queue_command(
task_instance,
command,
priority=priority,
queue=queue)
def _execute_task_instances(self,
simple_dag_bag,
states,
session=None):
"""
Attempts to execute TaskInstances that should be executed by the scheduler.
There are three steps:
1. Pick TIs by priority with the constraint that they are in the expected states
and that we do exceed max_active_runs or pool limits.
2. Change the state for the TIs above atomically.
3. Enqueue the TIs in the executor.
:param simple_dag_bag: TaskInstances associated with DAGs in the
simple_dag_bag will be fetched from the DB and executed
:type simple_dag_bag: SimpleDagBag
:param states: Execute TaskInstances in these states
:type states: Tuple[State]
:return: None
"""
executable_tis = self._find_executable_task_instances(simple_dag_bag, states,
session=session)
def query(result, items):
tis_with_state_changed = self._change_state_for_executable_task_instances(
items,
states,
session=session)
self._enqueue_task_instances_with_queued_state(
simple_dag_bag,
tis_with_state_changed)
session.commit()
return result + len(tis_with_state_changed)
return helpers.reduce_in_chunks(query, executable_tis, 0, self.max_tis_per_query)
"""
:return: None
"""
# 중략
# 중략
# Kick of new processes and collect results from finished ones
self.log.debug("Heartbeating the process manager")
simple_dags = processor_manager.heartbeat()
# Send tasks for execution if available
simple_dag_bag = SimpleDagBag(simple_dags)
if len(simple_dags) > 0:
# 중략
self._execute_task_instances(simple_dag_bag,
(State.SCHEDULED,))
# Call heartbeats
self.log.debug("Heartbeating the executor")
self.executor.heartbeat()
# 중략
# Exit early for a test mode
if processor_manager.max_runs_reached():
self.log.info("Exiting loop as all files have been processed %s times",
self.num_runs)
break
# 후략
def queue_command(self, task_instance, command, priority=1, queue=None):
key = task_instance.key
if key not in self.queued_tasks and key not in self.running:
self.log.info("Adding to queue: %s", command)
self.queued_tasks[key] = (command, priority, queue, task_instance)
else:
self.log.info("could not queue task {}".format(key))

def heartbeat(self):
# Triggering new jobs
if not self.parallelism:
open_slots = len(self.queued_tasks)
else:
open_slots = self.parallelism - len(self.running)
self.log.debug("%s running task instances", len(self.running))
self.log.debug("%s in queue", len(self.queued_tasks))
self.log.debug("%s open slots", open_slots)
sorted_queue = sorted(
[(k, v) for k, v in self.queued_tasks.items()],
key=lambda x: x[1][1],
reverse=True)
for i in range(min((open_slots, len(self.queued_tasks)))):
key, (command, _, queue, ti) = sorted_queue.pop(0)
# TODO(jlowin) without a way to know what Job ran which tasks,
# there is a danger that another Job started running a task
# that was also queued to this executor. This is the last chance
# to check if that happened. The most probable way is that a
# Scheduler tried to run a task that was originally queued by a
# Backfill. This fix reduces the probability of a collision but
# does NOT eliminate it.
self.queued_tasks.pop(key)
ti.refresh_from_db()
if ti.state != State.RUNNING:
self.running[key] = command
self.execute_async(key=key,
command=command,
queue=queue,
executor_config=ti.executor_config)
else:
self.logger.info(
'Task is already running, not sending to '
'executor: {}'.format(key))
# Calling child class sync method
self.log.debug("Calling the %s sync method", self.__class__)
self.sync()
def execute_async(self, key, command):
"""
:param key: the key to identify the TI
:type key: Tuple(dag_id, task_id, execution_date)
:param command: the command to execute
:type command: string
"""
local_worker = LocalWorker(self.executor.result_queue)
local_worker.key = key
local_worker.command = command
self.executor.workers_used += 1
self.executor.workers_active += 1
local_worker.start()
class LocalWorker(multiprocessing.Process, LoggingMixin):
"""LocalWorker Process implementation to run airflow commands. Executes the given
command and puts the result into a result queue when done, terminating execution."""
def __init__(self, result_queue):
"""
:param result_queue: the queue to store result states tuples (key, State)
:type result_queue: multiprocessing.Queue
"""
super(LocalWorker, self).__init__()
self.daemon = True
self.result_queue = result_queue
self.key = None
self.command = None
def execute_work(self, key, command):
"""
Executes command received and stores result state in queue.
:param key: the key to identify the TI
:type key: Tuple(dag_id, task_id, execution_date)
:param command: the command to execute
:type command: string
"""
if key is None:
return
self.log.info("%s running %s", self.__class__.__name__, command)
command = "exec bash -c '{0}'".format(command)
try:
subprocess.check_call(command, shell=True, close_fds=True)
state = State.SUCCESS
except subprocess.CalledProcessError as e:
state = State.FAILED
self.log.error("Failed to execute task %s.", str(e))
# TODO: Why is this commented out?
# raise e
self.result_queue.put((key, state))
def run(self):
self.execute_work(self.key, self.command)
time.sleep(1)

def generate_command(dag_id,
task_id,
execution_date,
mark_success=False,
ignore_all_deps=False,
ignore_depends_on_past=False,
ignore_task_deps=False,
ignore_ti_state=False,
local=False,
pickle_id=None,
file_path=None,
raw=False,
job_id=None,
pool=None,
cfg_path=None
):
"""
Generates the shell command required to execute this task instance.
중략
"""
iso = execution_date.isoformat()
cmd = ["airflow", "run", str(dag_id), str(task_id), str(iso)]
cmd.extend(["--mark_success"]) if mark_success else None
cmd.extend(["--pickle", str(pickle_id)]) if pickle_id else None
cmd.extend(["--job_id", str(job_id)]) if job_id else None
cmd.extend(["-A"]) if ignore_all_deps else None
cmd.extend(["-i"]) if ignore_task_deps else None
cmd.extend(["-I"]) if ignore_depends_on_past else None
cmd.extend(["--force"]) if ignore_ti_state else None
cmd.extend(["--local"]) if local else None
cmd.extend(["--pool", pool]) if pool else None
cmd.extend(["--raw"]) if raw else None
cmd.extend(["-sd", file_path]) if file_path else None
cmd.extend(["--cfg_path", cfg_path]) if cfg_path else None
return cmd
def run(args, dag=None):
# 중략
task = dag.get_task(task_id=args.task_id)
ti = TaskInstance(task, args.execution_date)
ti.refresh_from_db()
ti.init_run_context(raw=args.raw)
hostname = get_hostname()
log.info("Running %s on host %s", ti, hostname)
if args.interactive:
_run(args, dag, ti)
else:
with redirect_stdout(ti.log, logging.INFO), redirect_stderr(ti.log, logging.WARN):
_run(args, dag, ti)
logging.shutdown()
def _run(args, dag, ti):
if args.local:
run_job = jobs.LocalTaskJob(
task_instance=ti,
mark_success=args.mark_success,
pickle_id=args.pickle,
ignore_all_deps=args.ignore_all_dependencies,
ignore_depends_on_past=args.ignore_depends_on_past,
ignore_task_deps=args.ignore_dependencies,
ignore_ti_state=args.force,
pool=args.pool)
run_job.run()
elif args.raw:
ti._run_raw_task(
mark_success=args.mark_success,
job_id=args.job_id,
pool=args.pool,
)
else:
# 후략
def _execute(self):
self.task_runner = get_task_runner(self)
# 중략
if not self.task_instance._check_and_change_state_before_execution(
mark_success=self.mark_success,
ignore_all_deps=self.ignore_all_deps,
ignore_depends_on_past=self.ignore_depends_on_past,
ignore_task_deps=self.ignore_task_deps,
ignore_ti_state=self.ignore_ti_state,
job_id=self.id,
pool=self.pool):
self.log.info("Task is not able to be run")
return
try:
self.task_runner.start()
last_heartbeat_time = time.time()
heartbeat_time_limit = conf.getint('scheduler',
'scheduler_zombie_task_threshold')
while True:
# Monitor the task to see if it's done
return_code = self.task_runner.return_code()
if return_code is not None:
self.log.info("Task exited with return code %s", return_code)
return
# 중략
finally:
self.on_kill() class BashTaskRunner(BaseTaskRunner):
"""
Runs the raw Airflow task by invoking through the Bash shell.
"""
def __init__(self, local_task_job):
super(BashTaskRunner, self).__init__(local_task_job)
def start(self):
self.process = self.run_command(['bash', '-c'], join_args=True)
def return_code(self):
return self.process.poll()

Apache Airflow

Recommended

Recommended

More Related Content

What's hot

What's hot (20)

Similar to Apache Airflow

Similar to Apache Airflow (20)

Recently uploaded

Recently uploaded (20)

Apache Airflow