The effective use of
Django ORM
Yaroslav Muravskyi
@myarik
What we’re going to talk about
- Django ORM Mistakes
- Performance Problems in the Django ORM
- Database concurrency in Django the right way
What is Django ORM?
ORM - Object-relational mapping
Django's ORM is just a Pythonic way to create
SQL to query and manipulate your database and get
results in a Pythonic fashion.
The first step to fixing a problem
is being able to identify it
Tools
Tools
• django.db.connection (Make sure your Django
DEBUG setting is set to True)
Tools
• django.db.connection (Make sure your Django
DEBUG setting is set to True)
>>>	from	django.db import	connection
>>>	Author.objects.all()
<QuerySet [<Author:	Michael	Armstrong>,	...,	<Author:	GEORGE	PACKER>]>
>>>	connection.queries
[{'sql':	'SELECT	"books_author"."id",	"books_author"."first_name",	
"books_author"."last_name",	"books_author"."email"	FROM	"books_author"	LIMIT	21',
'time':	'0.007'}]
Tools
• django.db.connection (Make sure your Django
DEBUG setting is set to True)
>>>	from	django.db import	connection
>>>	Author.objects.all()
<QuerySet [<Author:	Michael	Armstrong>,	...,	<Author:	GEORGE	PACKER>]>
>>>	connection.queries
[{'sql':	'SELECT	"books_author"."id",	"books_author"."first_name",	
"books_author"."last_name",	"books_author"."email"	FROM	"books_author"	LIMIT	21',
'time':	'0.007'}]
• sql -- The raw SQL statement
• time -- How long the statement took to execute, in seconds.
Tools
• shell_plus command with option --print-sql
from django-extensions project
Tools
• shell_plus command with option --print-sql
from django-extensions project
>>>	Author.objects.all()
SELECT	"books_author"."id",	"books_author"."first_name",	
"books_author"."last_name",	"books_author"."email"	FROM	"books_author"	LIMIT	
21
Execution	time:	0.001993s	[Database:	default]
Tools
• Django Debug Toolbar
Django ORM Mistakes
Mistake #1
>>>	queryset =	Author.objects.all()
>>>	[author	for	author	in	queryset if	author.email.endswith('@gmail.com')]
Mistake #1
Use the filter() and exclude() methods to do
filtering in the database.
>>>	Author.objects.filter(email__endswith='@gmail.com')
>>>	queryset =	Author.objects.all()
>>>	[author	for	author	in	queryset if	author.email.endswith('@gmail.com')]
Mistake #2
authors	=	Author.objects.filter(email__icontains='@gmail.com')
if	authors:
do_stuff()
if	len(authors)	>	0:
do_stuff()
Mistake #2
If you only want to know whether a record exists,
use the exists() method
Author.objects.filter(email__icontains='@gmail.com').exists()
authors	=	Author.objects.filter(email__icontains='@gmail.com')
if	authors:
do_stuff()
if	len(authors)	>	0:
do_stuff()
Mistake #3
authors	=	Author.objects.filter(email__icontains='@gmail.com')
if	len(authors)	>	5:
do_stuff()
Mistake #3
authors	=	Author.objects.filter(email__icontains='@gmail.com')
if	len(authors)	>	5:
do_stuff()
If you need only the size, use count()
count	=	Author.objects.filter(email__icontains='@gmail.com').count()
if	count	>	5:
do_stuff()
Mistake #4
book	=	Book.objects.get(id=5)
if	book.author:
do_stuff()
Mistake #4
book	=	Book.objects.get(id=5)
if	book.author:
do_stuff()
Use foreign key values directly
#	If	you	don't	need	the	author	object
if	book.author_id:
do_stuff()
Additional request to the database
Mistake #5
Don’t fetch data you don’t need
#	Retrieve	values	as	a	dictionary
Book.objects.all().values('name',	'price')
<	QuerySet[{'name':	'Rat	Of	The	Stockades',	'price':	Decimal('4.28')},	...]	>
#	Retrieve	values	as	a	tuple
Book.objects.all().values_list('name',	'price')
<	QuerySet[('Rat	Of	The	Stockades',	Decimal('4.28')),	...]	>
#	Use	QuerySet.defer()	and	only()
Book.objects.only('name',	'price').all()
Book.objects.defer('price').all()
Mistake #6
for	book	in	Book.objects.all():
do_stuff(book)
Mistake #6
for	book	in	Book.objects.all():
do_stuff(book)
When you have a lot of objects, the caching
behavior of the QuerySet can cause a large
amount of memory to be used.
#	One	book	object	in	memory	at	a	time
for	book	in	Book.objects.iterator():
do_stuff(book)
Mistake #7
def create():
with	transaction.atomic():
item	=	Author.objects.create(first_name='Simon',	last_name='Anderson',	
email='fake@mail.com')
set_email(item.id)
item.books_count=	3
item.save()
def set_email(id):
item	=	Author.objects.get(id=id)
item.email =	'simon@gmail.com'
item.save()
Mistake #7
def create():
with	transaction.atomic():
item	=	Author.objects.create(first_name='Simon',	last_name='Anderson',	
email='fake@mail.com')
set_email(item.id)
#	The	database	row	has	been	updated	with	email='simon@gmail.com',	but	this	
instance	still	has	email='fake@mail.com'	as	it	hasn't	been	reloaded
item.books_count=	3
item.save()
def set_email(id):
item	=	Author.objects.get(id=id)
item.email =	'simon@gmail.com'
item.save()
Mistake #7
def create():
with	transaction.atomic():
item	=	Author.objects.create(first_name='Simon',	last_name='Anderson',	
email='fake@mail.com')
set_email(item)
item.books_count=	3
item.save()
def set_email(item):
item.email =	'simon@gmail.com'
item.save()
Performance Problems in
the Django ORM
Avoid unnecessary queries when
accessing foreign keys
Use the select_related method to load
the foreign key values and cache those results
for each object
#	N	+	1	query
for	book	in	Book.objects.iterator():
print(book.name,	book.author.get_full_name())
#	1	query
for	book	in	Book.objects.select_related('author').iterator():
print(book.name,	book.author.get_full_name())
prefetch_related
#	Queries	2,	Execution	time:	0.002458s
def authors():
queryset =	Author.objects.prefetch_related('books')
authors	=	[]
for	author	in	queryset:
books	=	[book.name for	book	in	author.books.all()]
authors.append({
'name':	author.get_full_name(),
'books':	books
})
return	authors
Use prefetch_related for joining many-to-
many and many-to-one objects
Be careful with prefetch_related
#	Queries	18,	Execution	time:	0.032458s
def top_authors():
queryset =	Author.objects.prefetch_related('books')
authors	=	[]
for	author	in	queryset:
books	=	[book.name for	book	in	author.books.filter(average_rating__gt=3)]
authors.append({
'name':	author.get_full_name(),
'books':	books
})
return	authors
Using Prefetch with to_attr
from	django.db.models import	Prefetch
#	Queries	2,	Execution	time:	0.008458s
def top_authors():
queryset =	Author.objects.prefetch_related(
Prefetch('books',
queryset=Book.objects.filter(average_rating__gt=3),
to_attr='top_books')
)
authors	=	[]
for	author	in	queryset:
books	=	[book.name for	book	in	author.top_books]
authors.append({'name':	author.get_full_name(),	'books':	books})
return	authors
Subqueries and Annotations
Subqueries and annotations can speed up
your application by up to 50 times.
Task:
Display the list of author names and
the count of books for each author.
Subqueries and Annotations
Instead of concatenating the name in Python
class	Author(models.Model):
...
def get_full_name(self):
return	'%s	%s'	%	(self.first_name,	self.last_name)
We can concatenate the name in the database
from	django.db.models import	Value
from	django.db.models.functions import	Concat
Author.objects.annotate(
name=Concat('first_name',	Value('	'),	'last_name')
).values('name',	)
Subqueries and Annotations
from	django.db.models import	Count,	OuterRef,	Subquery,	Value,	IntegerField
from	django.db.models.functions import	Concat
def get_authors():
book_query=	Book.objects.filter(
author_id=OuterRef('id')
).values('author_id').order_by().annotate(
books=Count('*')).values('books')
return	Author.objects.annotate(
name=Concat('first_name',	Value('	'),	'last_name'),
count_books=Subquery(book_query,	output_field=IntegerField())
).values('name',	'count_books')
Case
A	Case()	expression	is	like	the	if …	elif …	else statement	in	Python
from	django.db.models import	CharField,	Case,	Value,	When
#	Get	the	discount	for	each	Book	based	on	the	rating	value
def get_authors():
Book.objects.annotate(
discount=Case(
When(average_rating__gte=4,	then=Value('10%')),
When(average_rating__gte=2,	then=Value('5%')),
default=Value('0%'),
output_field=CharField(),
)
)
Aggregation with Filter
from	django.db.models import	Count,	Case,	Value,	When,	Sum,	IntegerField
Book.objects.aggregate(
total_books=Count('id'),
total_platinum_discount=Sum(Case(
When(average_rating__gte=4,	then=Value(1)),
default=Value(0),
output_field=IntegerField(),
))
)
Conditional expressions
Aggregation with Filter
from	django.db.models import	Count,	Q
Book.objects.aggregate(
total_books=Count('id'),
total_platinum_discount=Count('id',	filter=Q(average_rating__gte=4))
)
In Django 2.0 a filter argument to aggregate
functions was added to make this a lot easier
Database concurrency in
Django the right way
Specifying which fields to save
def set_name(id,	value):
instance	=	Book.objects.get(id=id)
instance.name =	value
instance.save()
def set_rating(id,	value):
instance	=	Book.objects.get(id=id)
instance.average_rating =	value
instance.save()
What happens if set_name and set_rating
run simultaneously?
Specifying which fields to save
Specifying which fields to save
def set_name(id,	value):
instance	=	Book.objects.get(id=id)
instance.name =	value
instance.save(update_fields=['name'])
def set_rating(id,	value):
instance	=	Book.objects.get(id=id)
instance.average_rating =	value
instance.save(update_fields=['average_rating'])
One possible solution is to specify which
fields to update
Use an F() expression for simple
arithmetic tasks
book	=	Book.objects.get(pk=804)
book.count+=	1
book.save()
Use an F() expression for simple
arithmetic tasks
book	=	Book.objects.get(pk=804)
book.count=	F('count')	+	1
book.save()
Use an F() expression for simple
arithmetic tasks
book	=	Book.objects.get(pk=804)
book.count=	F('count')	+	1
book.save()
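But take care with this kind of assignment: the F() expression
stays on the instance and is applied again on every save()
until the object is reloaded (e.g. with refresh_from_db()).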
Avoiding race conditions using F()
book	=	Book.objects.get(pk=804)		#	count	=	10
book.count=	F('count')	+	1
book.save()		#	count	=	11
book.name =	'Avoiding	race	conditions'
book.save()		#	count	=	12
The code
def create_payment(collection_id):
with	transaction.atomic():
book_collection =	BookCollection.objects.select_related('user').get(id=collection_id)
amount	=	book_collection.book_set.all().aggregate(total=Sum('price'))['total']
if	book_collection.user.balance>=	amount:
book_collection.user.reduce_balance(amount)
Payment.objects.create(amount=amount,	book_collection=book_collection)
else:
raise	Exception('Insufficient	funds')
What happens if there are two
simultaneous requests?
Select for update
select_for_update() returns a queryset that
will lock rows until the end of the
transaction
Select for update
def create_payment(collection_id):
#	Wrap	in	a	database	transaction
with	transaction.atomic():
book_collection =	BookCollection.objects.get(id=collection_id)
amount	=	book_collection.book_set.all().aggregate(total=Sum('price'))['total']
#	Wait	for	a	lock
user	=	User.objects.select_for_update().get(id=book_collection.user_id)
if	user.balance >=	amount:
user.reduce_balance(amount)
Payment.objects.create(amount=amount,	book_collection=book_collection)
else:
raise	Exception('Insufficient	funds')
Select for update
with	transaction.atomic():
User.objects.select_for_update().filter(id__in=[804,	806])
...
Select for update – Querysets are
lazy!
In this case, the select_for_update query will never
be run. Wrap the queryset in bool() if
you don't evaluate it straight away.
bool(User.objects.select_for_update().filter(id__in=[804,	806]))
Select for update – Preventing
deadlocks
#	Worker	1
with	transaction.atomic():
ids	=	[804,	805]
bool(User.objects.select_for_update().filter(id__in=ids))
...
#	Worker	2
with	transaction.atomic():
ids	=	[805,	804]
bool(User.objects.select_for_update().filter(id__in=ids))
...
Waiting for each other
Select for update – Preventing
deadlocks
When using select_for_update on multiple
records, make sure you acquire the locks in a
consistent order.
#	Worker	1
with	transaction.atomic():
ids	=	[804,	805]
bool(User.objects.select_for_update().filter(id__in=ids).order_by('id'))
...
- Make your code clear and then work on optimizing
it
- Learn how to use the Django ORM properly to get
the most out of it
- Database concurrency is something you need to
think about
- ORMs can obscure bugs. Look at the SQL
Summary
Thanks!
@myarik
y@myarik.com

The effective use of Django ORM