Crowdsourcing with Django

Crowdsourcing
with Django
EuroPython, 30th June 2009
Simon Willison · http://simonwillison.net/ · @simonw

“Web development on
journalism deadlines”

November 2000
The Freedom of Information Act

Heather Brooke

• http://www.guardian.co.uk/politics/2009/may/08/mps-expenses-telegraph-checquebook-journalism

• http://www.guardian.co.uk/politics/2009/may/15/mps-expenses-heather-brooke-foi

July 2006
The FOI commissioner

May 2007
The FOI (Amendment) Bill

February 2008
The Information Tribunal

“Transparency will
damage democracy”

January 2009
The exemption law

“All of the receipts of 650-odd MPs,
redacted and unredacted, are for sale
at a price of £300,000, so I am told.
The price is going up because of the
interest in the subject.”
Sir Stuart Bell, MP
Newsnight, 30th March

8th May, 2009
The Daily Telegraph

April: “Expenses are due out
in a couple of months, is
there anything we can do?”

June: “Expenses have been
bumped forward, they’re out
next week!”

Thursday 11th June
The proof-of-concept

Monday 15th June
The tentative go-ahead

Tuesday 16th June
Designer + client-side engineer

Wednesday 17th June
Operations engineer

Thursday 18th June
Launch day!

$ convert Frank_Comm.pdf pages.png

class Party(models.Model):
    """A political party; MP rows point at it through a foreign key."""

    name = models.CharField(max_length=100)

class Constituency(models.Model):

class MP(models.Model):
    """A Member of Parliament, linked to one Party and one Constituency."""

    party = models.ForeignKey(Party)
    constituency = models.ForeignKey(Constituency)
    # Both URL fields are optional (blank=True) and stored as plain strings.
    # Presumably they point at the MP's profile and photo on the Guardian
    # site -- verify against the code that populates them.
    guardian_url = models.CharField(max_length=255, blank=True)
    guardian_image_url = models.CharField(max_length=255, blank=True)

class FinancialYear(models.Model):

class Document(models.Model):
    """A document belonging to one MP for one financial year."""

    # The title is optional; the filename is required.
    title = models.CharField(max_length=100, blank=True)
    filename = models.CharField(max_length=100)
    mp = models.ForeignKey(MP)
    financial_year = models.ForeignKey(FinancialYear)

class Page(models.Model):
    """One numbered page of a Document."""

    document = models.ForeignKey(Document)
    page_number = models.IntegerField()

class User(models.Model):
    """A site user; line items and votes each reference one."""

    # Set automatically when the row is first created.
    created = models.DateTimeField(auto_now_add=True)
    # NOTE(review): max_length on a TextField is not enforced at the database
    # level -- confirm whether a CharField was intended here.
    username = models.TextField(max_length=100)
    # May be blank.
    password_hash = models.CharField(max_length=128, blank=True)

class LineItemCategory(models.Model):
    """A named category that line items can be attached to."""

    # Presumably the display/sort position -- verify against the templates.
    order = models.IntegerField(default=0)
    name = models.CharField(max_length=32)

class LineItem(models.Model):
    """An expense entry recorded by a user against a page."""

    user = models.ForeignKey(User)
    page = models.ForeignKey(Page)
    # Restricted to 'claim' or 'proof'; indexed for lookups by type.
    type = models.CharField(
        max_length=16,
        choices=(
            ('claim', 'claim'),
            ('proof', 'proof'),
        ),
        db_index=True,
    )
    # The date is optional and may be NULL in the database.
    date = models.DateField(null=True, blank=True)
    amount = models.DecimalField(max_digits=20, decimal_places=2)
    description = models.CharField(max_length=255, blank=True)
    # Creation timestamp, set automatically and indexed.
    created = models.DateTimeField(auto_now_add=True, db_index=True)
    categories = models.ManyToManyField(LineItemCategory, blank=True)

class Vote(models.Model):
    """One user's vote on one page; base class for the specific vote kinds."""

    # Both relations are reachable in reverse as `.votes`.
    user = models.ForeignKey(User, related_name='votes')
    page = models.ForeignKey(Page, related_name='votes')
    obsolete = models.BooleanField(default=False)
    vote_type = models.CharField(max_length=32, blank=True)
    ip_address = models.CharField(max_length=32)
    # Set automatically when the vote is first saved.
    created = models.DateTimeField(auto_now_add=True)

class TypeVote(Vote):
    """A Vote that classifies a page as claim, proof, blank or other."""

    type = models.CharField(
        max_length=10,
        choices=(
            ('claim', 'Claim'),
            ('proof', 'Proof'),
            ('blank', 'Blank'),
            ('other', 'Other'),
        ),
    )

class InterestingVote(Vote):
    """A Vote that rates how interesting a page is."""

    status = models.CharField(
        max_length=10,
        choices=(
            ('no', 'Not interesting'),
            ('yes', 'Interesting'),
            ('known', 'Interesting but known'),
            ('very', 'Investigate this!'),
        ),
    )

page_filters = (
# Maps name of filter to dictionary of kwargs to doc.pages.filter()
('reviewed', {
'votes__isnull': False
}),
('unreviewed', {
'votes__isnull': True
}),
('with line items', {
'line_items__isnull': False
}),
('interesting', {
'votes__interestingvote__status': 'yes'
}),
('interesting but known', {
'votes__interestingvote__status': 'known'
...
)
page_filters_lookup = dict(page_filters)

# Start with every page of the document, then narrow by the requested filter.
pages = doc.pages.all()
if page_filter:
    kwargs = page_filters_lookup.get(page_filter)
    if kwargs is None:
        # Unknown filter name: answer with a 404 rather than an empty list.
        # The call form of raise is valid on both Python 2 and Python 3.
        raise Http404('Invalid page filter: %s' % page_filter)
    pages = pages.filter(**kwargs).distinct()

# Build the filters: one entry per known filter, with the number of
# distinct pages of this document that match it.
filters = []
for name, kwargs in page_filters:
    filters.append({
        'name': name,
        'count': doc.pages.filter(**kwargs).distinct().count(),
    })

http://github.com/simonw/datamatcher

def get_mp_pages():
    """Returns list of (mp-name, mp-page-url) tuples.

    Fetches INDEX_URL and keeps every link whose title attribute ends with
    "'s allowances"; the MP name is that title with the suffix removed.
    """
    suffix = "'s allowances"
    soup = Soup(urllib.urlopen(INDEX_URL))
    mp_links = []
    for link in soup.findAll('a'):
        title = link.get('title', '')
        if title.endswith(suffix):
            # Strip only the trailing suffix, so a name that happens to
            # contain the same text elsewhere is left intact.
            mp_links.append((title[:-len(suffix)], link['href']))
    return mp_links

def get_pdfs(mp_url):
    """Returns list of (description, years, pdf-url, size) tuples.

    Reads the table rows of the page at mp_url; each data row must hold
    exactly three cells (name, year, PDF link).
    """
    page = Soup(urllib.urlopen(mp_url))
    rows = page.findAll('tr')[1:]  # the first row is the table header
    results = []
    for row in rows:
        name_cell, year_cell, pdf_cell = row.findAll('td')
        description = name_cell.string
        years = year_cell.string
        anchor = pdf_cell.find('a')
        href = anchor['href']
        # The last child of the link holds the size wrapped in parentheses.
        size_text = anchor.contents[-1].replace('(', '').replace(')', '')
        results.append((description, years, href, size_text))
    return results

Photoshop + AppleScript
vs.
Java + IntelliJ

Images on our
docroot (S3 upload
was taking too long)

Crash #1: more
Apache children than
MySQL connections

# Count the distinct pages that have no vote rows attached.
unreviewed_count = Page.objects.filter(
    votes__isnull=True
).distinct().count()

-- Counts distinct pages that have no matching row in the vote table.
SELECT
    COUNT(DISTINCT `expenses_page`.`id`)
FROM
    `expenses_page` LEFT OUTER JOIN `expenses_vote` ON (
        `expenses_page`.`id` = `expenses_vote`.`page_id`
    ) WHERE `expenses_vote`.`id` IS NULL

# Serve the homepage count from the cache; hit the database only on a miss.
unreviewed_count = cache.get('homepage:unreviewed_count')
if unreviewed_count is None:
    unreviewed_count = Page.objects.filter(
        votes__isnull=True
    ).distinct().count()
    # Store under exactly the key read above, expiring after 60 seconds.
    cache.set('homepage:unreviewed_count', unreviewed_count, 60)

• With 70,000 pages and a LOT of votes...
• DB takes up 135% of CPU
• Cache the count in memcached...
• DB drops to 35% of CPU


reviewed_count = Page.objects.filter(
votes__isnull = False

is_reviewed = False
).count()

Migrating to InnoDB
on a separate server

# Stream a dump of the mp_expenses database from the mps-live host, rewrite
# the table engine (MyISAM -> InnoDB) and charset (latin1 -> utf8) strings in
# the dump text, and load the result into MySQL on the mysql-big host.
# NOTE(review): sed only rewrites the CHARSET= text in the dump; confirm the
# dumped row data is encoded as expected.
ssh mps-live "mysqldump mp_expenses" |
sed 's/ENGINE=MyISAM/ENGINE=InnoDB/g' |
sed 's/CHARSET=latin1/CHARSET=utf8/g' |
ssh mysql-big "mysql -u root mp_expenses"

def next_global(request):
    """Redirect to a randomly chosen unreviewed page from the whole site.

    Returns a plain response saying everything has been reviewed when no
    page with is_reviewed = False is left.
    """
    # Slice before evaluating so the query fetches a single row, instead of
    # loading every unreviewed page only to use the first one.
    candidates = list(
        Page.objects.filter(is_reviewed=False).order_by('?')[:1]
    )
    if candidates:
        return Redirect(candidates[0].get_absolute_url())
    return HttpResponse('All pages have been reviewed!')

import random

def next_global_from_cache(request):
    """Redirect to a random page picked from the cached unreviewed IDs.

    Hands over to next_global() when the 'unreviewed_page_ids' cache entry
    is missing or empty.
    """
    cached_ids = cache.get('unreviewed_page_ids')
    if not cached_ids:
        return next_global(request)
    chosen_id = random.choice(cached_ids)
    return Redirect('/page/%s/' % chosen_id)

from django.core.management.base import BaseCommand
from mp_expenses.expenses.models import Page
from django.core.cache import cache

class Command(BaseCommand):
    # Management command that stores up to 1000 primary keys of pages not
    # marked as reviewed under the 'unreviewed_page_ids' cache key.
    help = "\npopulate unreviewed_page_ids in memcached\n"
    requires_model_validation = True
    can_import_settings = True

    def handle(self, *args, **options):
        # exclude(is_reviewed=True) rather than filter(is_reviewed=False),
        # kept exactly as in the original query.
        ids = list(Page.objects.exclude(
            is_reviewed=True
        ).values_list('pk', flat=True)[:1000])
        cache.set('unreviewed_page_ids', ids)

Final thoughts

• High score tables help
• MP photographs really help
• Keeping up the interest is hard
• Next step: start releasing the data

Crowdsourcing with Django

Recommended

Recommended

More Related Content

What's hot

What's hot (20)

Similar to Crowdsourcing with Django

Similar to Crowdsourcing with Django (20)

More from Simon Willison

More from Simon Willison (20)

Recently uploaded

Recently uploaded (20)

Crowdsourcing with Django