Join me as I walk through the evolution of a tiny library that solves a simple, yet recurring desire in your data processing scripts: displaying the progress of your processing.
From 2 lines of simplicity, to 30 lines of complexity, and back to 3 lines by making clever use of the tools that Python gives us to manage complexity. A "there and back again" story of software development that you might find useful in your everyday hacking.
2. A common problem
Often I am processing a lot of messages in a simple script
for message in messages:
process(message)
The processing might take several minutes and I want to know how close I am
to completion.
I want some indication of progress
3. First attempt
Print out every 100 records.
for index, message in enumerate(messages):
if index % 100 == 0:
print(f"Processed {index} messages")
process(message)
4. Second attempt: Add time taken
from datetime import datetime
start_time = datetime.utcnow()
for index, message in enumerate(messages):
if index % 100 == 0:
print(f"Processed {index} messages")
process(message)
end_time = datetime.utcnow()
print(f"Processing took {end_time - start_time}")
5. Third attempt: Add messages/second
start_time = datetime.utcnow()
for index, message in enumerate(messages):
if index % 100 == 0:
seconds_so_far = (datetime.utcnow() - start_time).total_seconds()
messages_per_second = (index / seconds_so_far) if seconds_so_far != 0 else None
print(f"Processed {index} messages ({messages_per_second}/s)")
process(message)
end_time = datetime.utcnow()
print(f"Processing took {end_time - start_time}")
6. Third attempt, continued: Add overall messages/second
start_time = datetime.utcnow()
for index, message in enumerate(messages):
if index % 100 == 0:
seconds_so_far = (datetime.utcnow() - start_time).total_seconds()
messages_per_second = (index / seconds_so_far) if seconds_so_far != 0 else None
print(f"Processed {index} messages ({messages_per_second}/s)")
process(message)
end_time = datetime.utcnow()
total_duration = end_time - start_time
total_seconds = total_duration.total_seconds()
messages_per_second = (index / total_seconds) if total_seconds != 0 else None
print(f"Processing took {total_duration}, ({messages_per_second}/s)")
7. Fourth attempt: Extract a print_progress function
def print_progress(messages_processed, start_time):
    """Print how many messages have been processed since *start_time* and the rate.

    messages_processed -- count of messages handled so far
    start_time -- naive UTC datetime taken when processing began
    """
    duration = datetime.utcnow() - start_time
    seconds_so_far = duration.total_seconds()
    # Guard against division by zero on the very first call (no time elapsed yet).
    messages_per_second = (messages_processed / seconds_so_far) if seconds_so_far != 0 else None
    # FIX: the closing quote was a typographic quote (U+201C) — a syntax error.
    print(f"Processed {messages_processed} messages in {duration} ({messages_per_second}/s)")
start_time = datetime.utcnow()
for index, message in enumerate(messages):
if index % 100 == 0:
print_progress(index, start_time)
process(message)
print_progress(len(messages), start_time)
9. Fifth attempt: Add time remaining
from datetime import datetime, timedelta
def print_progress(messages_processed, total_message_count, start_time):
    """Print progress so far: count, percent complete, rate, and estimated time remaining.

    messages_processed -- count of messages handled so far
    total_message_count -- total messages expected (must be non-zero)
    start_time -- naive UTC datetime taken when processing began
    """
    duration = datetime.utcnow() - start_time
    seconds_so_far = duration.total_seconds()
    # Guard the rate against division by zero when no time has elapsed yet.
    messages_per_second = (messages_processed / seconds_so_far) if seconds_so_far != 0 else None
    percent_complete = (messages_processed / total_message_count) * 100
    # ETA: scale the elapsed time by the remaining share of the work; undefined at 0%.
    estimated_time_remaining = timedelta(seconds=((100 - percent_complete) / percent_complete) * seconds_so_far) if percent_complete != 0 else None
    # FIX: the closing quote was a typographic quote (U+201D) — a syntax error.
    print(f"Processed {messages_processed} messages ({percent_complete}%) in {duration} ({messages_per_second}/s, ETA: {estimated_time_remaining})")
start_time = datetime.utcnow()
for index, message in enumerate(messages):
if index % 100 == 0:
print_progress(index, len(messages), start_time)
process(message)
print_progress(len(messages), len(messages), start_time)
10. Repeated parameters
We are passing in the same parameter values (total_message_count,
start_time) every time we call print progress:
start_time = datetime.utcnow()
for index, message in enumerate(messages):
if index % 100 == 0:
print_progress(index, len(messages), start_time)
process(message)
print_progress(len(messages), len(messages), start_time)
It would be nice if print_progress would remember these values.
Rework it as a class?
11. Sixth attempt: Refactor as class
class ProgressTracker(object):
    """Remembers the total message count and start time so callers don't have to
    pass them on every progress report."""

    def __init__(self, total_message_count, start_time):
        # total_message_count: total messages expected (non-zero)
        # start_time: naive UTC datetime taken when processing began
        self.total_message_count = total_message_count
        self.start_time = start_time

    def print_progress(self, messages_processed):
        """Print count, percent complete, elapsed time, rate, and ETA so far."""
        duration = datetime.utcnow() - self.start_time
        seconds_so_far = duration.total_seconds()
        if seconds_so_far != 0:
            messages_per_second = messages_processed / seconds_so_far
        else:
            # No time has elapsed yet; a rate would divide by zero.
            messages_per_second = None
        percent_complete = (messages_processed / self.total_message_count) * 100
        if percent_complete != 0:
            # Remaining time scales elapsed time by the unfinished share of the work.
            remaining_seconds = ((100 - percent_complete) / percent_complete) * seconds_so_far
            estimated_time_remaining = timedelta(seconds=remaining_seconds)
        else:
            estimated_time_remaining = None
        print(f"Processed {messages_processed} messages ({percent_complete}%) in {duration} ({messages_per_second}/s, ETA: {estimated_time_remaining})")
start_time = datetime.utcnow()
tracker = ProgressTracker(len(messages), start_time)
for index, message in enumerate(messages):
if index % 100 == 0:
tracker.print_progress(index)
process(message)
tracker.print_progress(len(messages))
12. Sixth attempt: Refactor as class
class ProgressTracker(object):
    """Tracks progress over a known number of messages.

    The clock starts when the tracker is constructed, so callers no longer
    need to capture and pass in a start time themselves.
    """

    def __init__(self, total_message_count):
        # total_message_count: total messages expected (non-zero)
        self.total_message_count = total_message_count
        self.start_time = datetime.utcnow()

    def print_progress(self, messages_processed):
        """Print count, percent complete, elapsed time, rate, and ETA so far."""
        duration = datetime.utcnow() - self.start_time
        seconds_so_far = duration.total_seconds()
        # Guard the rate against division by zero when no time has elapsed yet.
        messages_per_second = (messages_processed / seconds_so_far) if seconds_so_far != 0 else None
        percent_complete = (messages_processed / self.total_message_count) * 100
        # ETA: scale the elapsed time by the remaining share of the work; undefined at 0%.
        estimated_time_remaining = timedelta(seconds=((100 - percent_complete) / percent_complete) *
                                             seconds_so_far) if percent_complete != 0 else None
        # FIX: this f-string literal was hard-wrapped across two lines on the slide,
        # which is a syntax error; rejoined into a single literal here.
        print(f"Processed {messages_processed} messages ({percent_complete}%) in {duration} ({messages_per_second}/s, ETA: {estimated_time_remaining})")
tracker = ProgressTracker(len(messages))
for index, message in enumerate(messages):
if index % 100 == 0:
tracker.print_progress(index)
process(message)
tracker.print_progress(len(messages))
13. Results
So now we’ve gone from 2 lines to 28 lines
Not quite fair. If we move the Progress Tracker out into a different file, it’s
only 7 lines:
from progress_tracker import ProgressTracker
tracker = ProgressTracker(len(messages))
for index, message in enumerate(messages):
if index % 100 == 0:
tracker.print_progress(index)
process(message)
tracker.print_progress(len(messages))
14. Generators
Remember enumerate()?
for index, message in enumerate(messages):
It wraps the iteration of an iterable and does additional computation
We could do the same thing with ProgressTracker
15. Seventh attempt: Refactor as generator
class ProgressTracker(object):
    """Wraps an iterable and reports progress as it is consumed.

    Use it the way you would use enumerate():

        for message in ProgressTracker(messages):
            process(message)
    """

    def __init__(self, iterable):
        self.iterable = iterable
        self.total_message_count = len(iterable)
        # Set lazily in __iter__ so timing starts when consumption starts,
        # not when the tracker is constructed.
        self.start_time = None

    def print_progress(self, messages_processed):
        ...  # body elided on the slide (same as the previous version)

    def __iter__(self):
        if self.start_time is None:
            self.start_time = datetime.utcnow()
        # BUG FIX: the slide iterated the global `messages` here instead of
        # self.iterable, so the tracker only worked by accident in the demo.
        for index, message in enumerate(self.iterable):
            if index % 100 == 0:
                self.print_progress(index)
            yield message
        self.print_progress(self.total_message_count)
16. Results
Back down to 3 lines:
from progress_tracker import ProgressTracker
for message in ProgressTracker(messages):
process(message)
17. Limitations
Currently I have a hard-coded “output every 100 entries”
• This might be way too much output, especially if you are processing
millions of messages.
You might want to only output every 10%
But every 10% might be too long between reports
So you might also want to output every 30 seconds as well.
Or perhaps more complicated conditions.
ie. You want to be able to customize the conditions that will trigger output.
18. Unbounded message stream
What about infinite streams of messages?
You obviously can’t do percent complete or ETA
But it would be nice to use the same code for both bounded and unbounded
streams.
19. Final API
ProgressTracker(
iterable, # The iterable to iterate over
total=None, # Override for the total message count, defaults to len(iterable)
callback=print, # A function (f(string): None) that gets called each time a condition matches
format_string=None, # Custom format string, sensible defaults for both bounded and unbounded iterables
every_n_records=None, # Reports every n records
every_x_percent=None, # Reports after every x percent
every_n_seconds=None, # Reports every n seconds
every_n_seconds_idle=None, # Report every n seconds, but only if there hasn't been any progress. Useful for infinite streams
ignore_first_iteration=True, # Don’t report on the first iteration
last_iteration=False # Report after the last iteration
)
for message in ProgressTracker(messages, every_n_records=10000, every_x_percent=5):
process(message)
20. Final API
Make it more Pythonic:
def track_progress(iterable, **kwargs):
    """Function-style entry point for ProgressTracker.

    All keyword arguments are forwarded unchanged to ProgressTracker.
    """
    tracker = ProgressTracker(iterable, **kwargs)
    return tracker
Example:
for message in track_progress(messages, every_n_records=10000, every_x_percent=5):
process(message)