Make Sure Your Applications Crash




           Moshe Zadka
True story
Python doesn't crash




Memory-managed, no direct pointer arithmetic
...except it does




 C bugs, untrapped exceptions, infinite loops,
blocking calls, thread deadlocks, inconsistent
                 resident state
Recovery is important




"[S]ystem failure can usually be considered to
  be the result of two program errors[...] the
      second, in the recovery routine[...]"
Crashes and inconsistent data




A crash results in data from an arbitrary
            program state.
Avoid storage




Caches are better than master copies.
Databases




Transactions maintain consistency
    Databases can crash too!
Atomic operations




    File rename
Example: Counting
def update_counter():
    fp = file("counter.txt")
    s = fp.read()
    counter = int(s.strip())
    counter += 1
    # If there is a crash before this point,
    # no changes have been done.
    fp = file("counter.txt.tmp", 'w')
    print >>fp, counter
    fp.close()
    # If there is a crash before this point,
    # only a temp file has been modified
    # The following is an atomic operation
    os.rename("counter.txt.tmp", "counter.txt")
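The rename is what makes the update atomic on POSIX: readers see either the old counter or the new one, never a half-written file. A variant of the same idea in modern Python (a sketch, not from the talk: `os.replace` is the cross-platform atomic rename, and the `fsync` call is an extra durability measure the original omits):

```python
import os

def update_counter(path="counter.txt"):
    # Read the current value; a crash anywhere before
    # the final replace leaves the old file untouched.
    with open(path) as fp:
        counter = int(fp.read().strip())
    counter += 1
    tmp = path + ".tmp"
    with open(tmp, "w") as fp:
        fp.write("%d\n" % counter)
        fp.flush()
        os.fsync(fp.fileno())  # data actually on disk
    # Atomic: readers see the old or the new file, never a mix
    os.replace(tmp, path)
```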
Efficient caches, reliable masters




     Mark inconsistency of cache
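One way to mark the cache inconsistent (a sketch; the `dirty` flag file and helper names are illustrative, not from the talk): raise a flag before touching the cache, clear it after. A flag that survives a crash tells the next start-up the cache must be rebuilt from the master copy.

```python
import os

def update_cache(cache_dir, update):
    # Flag first: if we crash mid-update, the flag
    # survives and marks the cache as inconsistent.
    dirty = os.path.join(cache_dir, "dirty")
    open(dirty, "w").close()
    update()            # mutate the cache files
    os.remove(dirty)    # update completed cleanly

def cache_is_consistent(cache_dir):
    return not os.path.exists(os.path.join(cache_dir, "dirty"))
```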
No shutdown




Crash in testing
Availability




If data is consistent, just restart!
Improving availability




        Limit impact
       Fast detection
        Fast start-up
Vertical splitting




Different execution paths, different processes
Horizontal splitting




Different code bases, different processes
Watchdog




Monitor -> Flag -> Remediate
Watchdog principles




Keep it simple, keep it safe!
Watchdog: Heartbeats
## In a Twisted process
def beat():
    file('beats/my-name', 'a').close()
task.LoopingCall(beat).start(30)
Watchdog: Get time-outs
def getTimeout():
    # hearts/* holds the allowed interval (seconds)
    # for the matching heartbeat file in beats/*
    timeout = dict()
    now = time.time()
    for heart in glob.glob('hearts/*'):
        allowed = int(file(heart).read().strip())
        timeout[os.path.basename(heart)] = now-allowed
    return timeout
Watchdog: Mark problems
def markProblems():
    timeout = getTimeout()
    for beat in glob.glob('beats/*'):
        name = os.path.basename(beat)
        mtime = os.path.getmtime(beat)
        problem = 'problems/'+name
        if (mtime<timeout[name] and
           not os.path.isfile(problem)):
            fp = file(problem, 'w')
            fp.write('watchdog')
            fp.close()
Watchdog: check solutions
def checkSolutions():
    now = time.time()
    problemTimeout = now-30
    for problem in glob.glob('problems/*'):
        mtime = os.path.getmtime(problem)
        if mtime<problemTimeout:
            subprocess.call(['restart-system'])
Watchdog: Loop
## Watchdog
while True:
    markProblems()
    checkSolutions()
    time.sleep(1)
Watchdog: accuracy of




Custom checkers can manufacture problems
Watchdog: reliability of




   Use cron for main loop
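A sketch of what that looks like: instead of a long-lived `while True` loop, each invocation runs one `markProblems()`/`checkSolutions()` pass and exits, and cron re-runs it every minute (the script path below is hypothetical). If the watchdog itself dies, cron restarts it on the next minute.

```
# crontab entry: one watchdog pass per minute
* * * * *  /usr/local/bin/watchdog-pass
```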
Watchdog: reliability of




Use software/hardware watchdogs
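On Linux, the kernel watchdog is exposed as a device that reboots the machine unless it is written to ("fed") periodically. A sketch of a feeder (assumptions: the device path is a parameter so it can be exercised against an ordinary file; on a real system it would be `/dev/watchdog`, and the final `'V'` is the standard "magic close" that disarms the timer on clean shutdown):

```python
import time

def feed_watchdog(path, beats, interval=1):
    # Each write resets the watchdog timer; if this
    # process dies, the timer expires and the kernel
    # reboots the machine.
    with open(path, "w") as dev:
        for _ in range(beats):
            dev.write("\n")
            dev.flush()
            time.sleep(interval)
        dev.write("V")  # "magic close": disarm on clean exit
```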
Conclusions




Everything crashes -- plan for it
Questions?
Welcome to the back-up slides
         Extra! Extra!
Example: Counting on Windows
def update_counter():
    fp = file("counter.txt")
    s = fp.read()
    counter = int(s.strip())
    counter += 1
    # If there is a crash before this point,
    # no changes have been done.
    fp = file("counter.txt.tmp", 'w')
    print >>fp, counter
    fp.close()
    # If there is a crash before this point,
    # only a temp file has been modified
    os.remove("counter.txt")
    # At this point, the state is inconsistent*
    # The following is an atomic operation
    os.rename("counter.txt.tmp", "counter.txt")
Example: Counting on Windows
             (Recovery)
def recover():
    if not os.path.exists("counter.txt"):
        # The permanent file has been removed
        # Therefore, the temp file is valid
        os.rename("counter.txt.tmp",
                  "counter.txt")
Example: Counting with versions
def update_counter():
    files = [int(name.split('.')[-1])
               for name in os.listdir('.')
                 if name.startswith('counter.')]
    last = max(files)
    counter = int(file('counter.%s' % last
                      ).read().strip())
    counter += 1
    # If there is a crash before this point,
    # no changes have been done.
    fp = file("tmp.counter", 'w')
    print >>fp, counter
    fp.close()
    # If there is a crash before this point,
    # only a temp file has been modified
    os.rename('tmp.counter',
              'counter.%s' % (last+1))
    os.remove('counter.%s' % last)
Example: Counting with versions
             (cleanup)
# This is not a recovery routine, but a cleanup
# routine.
# Even in its absence, the state is consistent
def cleanup():
    files = [int(name.split('.')[-1])
                for name in os.listdir('.')
                  if name.startswith('counter.')]
    files.sort()
    files.pop()
    for n in files:
        os.remove('counter.%d' % n)
    if os.path.exists('tmp.counter'):
        os.remove('tmp.counter')
Correct ordering
def activate_due():
    scheduled = rs.smembers('scheduled')
    now = time.time()
    for el in scheduled:
        due = int(rs.get(el+':due'))
        if now<due:
            continue
        rs.sadd('activated', el)
        rs.delete(el+':due')
        rs.srem('scheduled', el)
Correct ordering (recovery)
def recover():
    inconsistent = rs.sinter('activated',
                             'scheduled')
    for el in inconsistent:
        rs.delete(el+':due') #*
        rs.srem('scheduled', el)
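Why the order matters can be shown without Redis at all. The simulation below (plain Python sets and a dict standing in for the Redis structures, names illustrative) crashes between every pair of steps and checks that recovery always restores a consistent state:

```python
def activate(state, el, crash_after):
    # The write order from activate_due(), with a
    # simulated crash after `crash_after` steps.
    steps = [
        lambda: state['activated'].add(el),
        lambda: state['due'].pop(el, None),
        lambda: state['scheduled'].discard(el),
    ]
    for i, step in enumerate(steps):
        if i == crash_after:
            return          # "crash": stop mid-update
        step()

def recover(state):
    # Anything in both sets crashed mid-activation;
    # finish the remaining steps.
    for el in state['activated'] & state['scheduled']:
        state['due'].pop(el, None)
        state['scheduled'].discard(el)

# Crash at every possible point; after recovery, 'el'
# is always either fully scheduled or fully activated.
for crash_after in range(4):
    state = {'scheduled': {'el'}, 'activated': set(),
             'due': {'el': 100}}
    activate(state, 'el', crash_after)
    recover(state)
    assert (state == {'scheduled': {'el'}, 'activated': set(),
                      'due': {'el': 100}} or
            state == {'scheduled': set(), 'activated': {'el'},
                      'due': {}})
```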
Example: Key/value stores
0.log:
  ['add', 'key-0', 'value-0']
  ['add', 'key-1', 'value-1']
  ['add', 'key-0', 'value-2']
  ['remove', 'key-1']
  .
  .
  .

1.log:
  .
  .
  .

2.log:
.
.
.
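The recovery property of this format is that the store can always be rebuilt by replaying the logs in order, and a half-written final line (from a crash mid-write) is simply dropped. A minimal replay sketch (Python 3, not the deck's code):

```python
import json

def replay(lines):
    # Rebuild key/value state from an append-only log;
    # a truncated last line fails to parse and is skipped.
    result = {}
    for line in lines:
        try:
            op = json.loads(line)
        except ValueError:
            continue        # crash left a partial write
        if op[0] == 'add':
            result[op[1]] = op[2]
        else:
            result.pop(op[1], None)
    return result

log = ['["add", "key-0", "value-0"]',
       '["add", "key-1", "value-1"]',
       '["add", "key-0", "value-2"]',
       '["remove", "key-1"]',
       '["add", "key-2"']  # last line truncated by a crash
assert replay(log) == {'key-0': 'value-2'}
```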
Example: Key/value stores (utility
             functions)
## Get the level of a file
def getLevel(s):
    return int(s.split('.')[0])

## Get all files of a given type
def getType(files, tp):
    return [(getLevel(s), s)
             for s in files if s.endswith(tp)]
Example: Key/value stores
             (classifying files)
## Get all relevant files
def relevant(d):
    files = os.listdir(d)
    mlevel, master = max(getType(files, '.master'))
    logs = getType(files, '.log')
    logs.sort()
    return [master]+[log for llevel, log in logs
                         if llevel>mlevel]
Example: Key/value stores (reading)
## Read in a single file
def update(result, fp):
    for line in fp:
        val = json.loads(line)
        if val[0] == 'add':
            result[val[1]] = val[2]
        else:
            del result[val[1]]

## Read in several files
def read(files):
    result = dict()
    for fname in files:
        try:
            update(result, file(fname))
        except ValueError:
            # A crash can leave a truncated last line
            pass
    return result
Example: Key/value stores (writer
               class)
class Writer(object):
    def __init__(self, level):
        self.level = level
        self.fp = None
        self._next()
    def _next(self):
        self.level += 1
        if self.fp:
            self.fp.close()
        name = '%03d.log' % self.level
        self.fp = file(name, 'w')
        self.rows = 0
    def write(self, value):
        print >>self.fp, json.dumps(value)
        self.fp.flush()
        self.rows += 1
        if self.rows > 200:
            self._next()
Example: Key/value stores (storage
               class)
## The actual data store abstraction.
class Store(object):
    def __init__(self, d):
        files = relevant(d)
        self.result = read(files)
        level = getLevel(files[-1])
        self.writer = Writer(level)
    def get(self, key):
        return self.result[key]
    def add(self, key, value):
        self.writer.write(['add', key, value])
    def remove(self, key):
        self.writer.write(['remove', key])
Example: Key/value stores
            (compression code)
## This should be run periodically
# from a different thread
def compress(d):
    files = relevant(d)[:-1]
    if len(files) < 2:
        return
    result = read(files)
    master = getLevel(files[-1]) + 1
    fp = file('%03d.master.tmp' % master, 'w')
    for key, value in result.iteritems():
        towrite = ['add', key, value]
        print >>fp, json.dumps(towrite)
    fp.close()
    # Atomic rename publishes the new master
    os.rename('%03d.master.tmp' % master,
              '%03d.master' % master)
Vertical splitting: Example
def forking_server():
    s = socket.socket()
    s.bind(('', 8080))
    s.listen(5)
    while True:
        client, addr = s.accept()
        newpid = os.fork()
        if newpid == 0:
            # Child process: serve, then exit
            f = client.makefile('w')
            f.write("Sunday, May 22, 1983 "
                    "18:45:59-PST")
            f.close()
            os._exit(0)
        client.close()
Horizontal splitting: front-end
## Process one
class SchedulerResource(resource.Resource):
    isLeaf = True
    def __init__(self, filepath):
        resource.Resource.__init__(self)
        self.filepath = filepath
    def render_PUT(self, request):
        uuid, = request.postpath
        content = request.content.read()
        child = self.filepath.child(uuid)
        child.setContent(content)
        return ''
fp = filepath.FilePath("things")
r = SchedulerResource(fp)
s = server.Site(r)
reactor.listenTCP(8080, s)
Horizontal splitting: scheduler
## Process two
rs = redis.Redis(host='localhost',
                  port=6379, db=9)
while True:
    for uuid in os.listdir("things"):
        fname = os.path.join("things", uuid)
        when = int(file(fname).read().strip())
        rs.set(uuid+':due', when)
        rs.sadd('scheduled', uuid)
        os.remove(fname)
    time.sleep(1)
Horizontal splitting: runner
## Process three
rs = redis.Redis(host='localhost',
                  port=6379, db=9)
recover()
while True:
    activate_due()
    time.sleep(1)
Horizontal splitting: message
           queues
     No direct dependencies
Horizontal splitting: message
            queues: sender
## Process four
rs = redis.Redis(host='localhost',
                 port=6379, db=9)
params = pika.ConnectionParameters('localhost')
conn = pika.BlockingConnection(params)
channel = conn.channel()
channel.queue_declare(queue='active')
while True:
    activated = rs.smembers('activated')
    finished = set(rs.smembers('finished'))
    for el in activated:
        if el in finished:
            continue
        channel.basic_publish(
            exchange='', routing_key='active',
            body=el)
        rs.sadd('finished', el)
    time.sleep(1)
Horizontal splitting: message
            queues: receiver
## Process five
# It is possible to get "dups" of bodies.
# Application logic should deal with that
params = pika.ConnectionParameters('localhost')
conn = pika.BlockingConnection(params)
channel = conn.channel()
channel.queue_declare(queue='active')
def callback(ch, method, properties, el):
    syslog.syslog('Activated %s' % el)
channel.basic_consume(callback, queue='active',
                      no_ack=True)
channel.start_consuming()
Horizontal splitting: point-to-point
      Use HTTP (preferably, REST)
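The reason HTTP (REST) fits here is that PUT is idempotent: a client that crashed before seeing the response can simply retry the same request. A toy sketch (a dict standing in for the real resource store; names illustrative):

```python
def handle_put(store, path, body):
    # PUT semantics: replace whatever is at `path`.
    # Applying the same request twice leaves the same
    # state, so retrying after a crash is always safe.
    store[path] = body
    return 200

store = {}
handle_put(store, '/things/abc', '1700000000')
handle_put(store, '/things/abc', '1700000000')  # crash-and-retry
assert store == {'/things/abc': '1700000000'}
```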
