SlideShare a Scribd company logo
1 of 83
Download to read offline
Crowdsourcing
        with Django
           EuroPython, 30th June 2009
Simon Willison · http://simonwillison.net/ · @simonw
“Web development on
journalism deadlines”
The back story...
November 2000
The Freedom of Information Act
Heather Brooke

• http://www.guardian.co.uk/politics/
  2009/may/08/mps-expenses-telegraph-
  checquebook-journalism

• http://www.guardian.co.uk/politics/
  2009/may/15/mps-expenses-heather-
  brooke-foi
2004
The request
January 2005
 The FOI request
July 2006
The FOI commissioner
May 2007
The FOI (Amendment) Bill
February 2008
The Information Tribunal
“Transparency will
damage democracy”
May 2008
The high court
January 2009
The exemption law
March 2009
  The mole
“All of the receipts of 650-odd MPs,
redacted and unredacted, are for sale
 at a price of £300,000, so I am told.
 The price is going up because of the
         interest in the subject.”
                           Sir Stuart Bell, MP
                        Newsnight, 30th March
8th May, 2009
The Daily Telegraph
At the Guardian...
April: “Expenses are due out
  in a couple of months, is
there anything we can do?”
June: “Expenses have been
bumped forward, they’re out
        next week!”
Thursday 11th June
  The proof-of-concept
Monday 15th June
The tentative go-ahead
Tuesday 16th June
Designer + client-side engineer
Wednesday 17th June
   Operations engineer
Thursday 18th June
    Launch day!
How we built it
$ convert Frank_Comm.pdf pages.png
Models
class Party(models.Model):
   """Party that an ``MP`` belongs to (target of ``MP.party``)."""
   name = models.CharField(max_length=100)

class Constituency(models.Model):
   """Constituency that an ``MP`` is linked to (target of ``MP.constituency``)."""
   name = models.CharField(max_length=100)

class MP(models.Model):
   """A named MP, linked to exactly one ``Party`` and one ``Constituency``."""
   name = models.CharField(max_length=100)
   party = models.ForeignKey(Party)
   constituency = models.ForeignKey(Constituency)
   # Both URLs are stored as plain strings (up to 255 characters) and may be
   # left blank.
   guardian_url = models.CharField(max_length=255,
      blank=True)
   guardian_image_url = models.CharField(max_length=255,
      blank=True)
class FinancialYear(models.Model):
   """Short label (at most 10 characters) that a ``Document`` is filed under."""
   name = models.CharField(max_length=10)

class Document(models.Model):
   """A file belonging to one ``MP`` for one ``FinancialYear``.

   ``title`` may be blank; ``filename`` may not.
   """
   title = models.CharField(max_length=100, blank=True)
   filename = models.CharField(max_length=100)
   mp = models.ForeignKey(MP)
   financial_year = models.ForeignKey(FinancialYear)

class Page(models.Model):
   """One numbered page of a ``Document``.

   ``Vote.page`` declares ``related_name='votes'``, so votes for a page are
   reachable as ``page.votes`` and through ``votes__...`` lookups.
   """
   # NOTE(review): other snippets in this file use ``doc.pages`` and filter on
   # ``is_reviewed``; neither a related_name nor that field is declared here,
   # so this listing is presumably abridged -- confirm against the real model.
   document = models.ForeignKey(Document)
   page_number = models.IntegerField()
class User(models.Model):
   """Application-defined user record with a username and an optional password hash."""
   # Set once, automatically, when the row is first created.
   created = models.DateTimeField(auto_now_add = True)
   # NOTE(review): max_length on a TextField is not enforced as a database
   # column limit -- confirm whether CharField was intended.
   username = models.TextField(max_length = 100)
   # blank=True allows a user to exist without a password hash.
   password_hash = models.CharField(max_length = 128, blank=True)

class LineItemCategory(models.Model):
   """Named category that can be attached to ``LineItem`` rows (many-to-many)."""
   # Integer defaulting to 0; presumably a display sort key -- TODO confirm.
   order = models.IntegerField(default = 0)
   name = models.CharField(max_length = 32)

class LineItem(models.Model):
   """An amount recorded by a ``User`` against a ``Page``.

   Each item is typed as either ``'claim'`` or ``'proof'`` and may carry an
   optional date, a description and any number of categories.
   """
   user = models.ForeignKey(User)
   # NOTE(review): the page_filters table elsewhere in this file uses the
   # lookup 'line_items__isnull', which needs related_name='line_items' on
   # this key; none is declared here -- confirm against the real model.
   page = models.ForeignKey(Page)
   # Indexed so items can be filtered by type cheaply.
   type = models.CharField(max_length = 16, choices = (
       ('claim', 'claim'), ('proof', 'proof'),
   ), db_index = True)
   # The date is optional both in the database (null) and in forms (blank).
   date = models.DateField(null = True, blank = True)
   # Fixed-point value: up to 20 digits in total, 2 of them after the point.
   amount = models.DecimalField(max_digits=20, decimal_places=2)
   description = models.CharField(max_length = 255, blank = True)
   # Set automatically on creation; indexed.
   created = models.DateTimeField(auto_now_add = True, db_index = True)
   categories = models.ManyToManyField(LineItemCategory, blank=True)
class Vote(models.Model):
   """A vote cast by a ``User`` on a ``Page``; base class of ``TypeVote`` and ``InterestingVote``.

   Both foreign keys use ``related_name='votes'``, so votes are reachable as
   ``user.votes`` and ``page.votes``.
   """
   user = models.ForeignKey(User, related_name = 'votes')
   page = models.ForeignKey(Page, related_name = 'votes')
   obsolete = models.BooleanField(default = False)
   # Presumably records which subclass the row belongs to -- TODO confirm.
   vote_type = models.CharField(max_length = 32, blank = True)
   # NOTE(review): 32 characters cannot hold a full-length textual IPv6
   # address (39 characters) -- confirm only IPv4 is expected.
   ip_address = models.CharField(max_length = 32)
   # Set once, automatically, when the row is first created.
   created = models.DateTimeField(auto_now_add = True)

class TypeVote(Vote):
   """A ``Vote`` that labels a page as one of: claim, proof, blank or other."""
   type = models.CharField(max_length = 10, choices = (
      ('claim', 'Claim'), ('proof', 'Proof'),
      ('blank', 'Blank'), ('other', 'Other')
   ))

class InterestingVote(Vote):
   """A ``Vote`` that rates how interesting a page is.

   Stored ``status`` values are ``'no'``, ``'yes'``, ``'known'`` and
   ``'very'``; the page_filters table queries them through
   ``votes__interestingvote__status``.
   """
   status = models.CharField(max_length = 10, choices = (
      ('no', 'Not interesting'), ('yes', 'Interesting'),
      ('known', 'Interesting but known'), ('very', 'Investigate this!'),
   ))
Frictionless
registration
Page filters
page_filters = (
    # Maps name of filter to dictionary of kwargs to doc.pages.filter()
    ('reviewed', {
        'votes__isnull': False
    }),
    ('unreviewed', {
        'votes__isnull': True
    }),
    ('with line items', {
        'line_items__isnull': False
    }),
    ('interesting', {
        'votes__interestingvote__status': 'yes'
    }),
    ('interesting but known', {
        'votes__interestingvote__status': 'known'
...
)
# Map each filter name to the kwargs it passes to doc.pages.filter().
page_filters_lookup = dict(page_filters)
pages = doc.pages.all()
if page_filter:
    kwargs = page_filters_lookup.get(page_filter)
    if kwargs is None:
        # Call form of raise: the old `raise Exc, message` statement is a
        # SyntaxError on Python 3, while this form behaves the same on both.
        raise Http404('Invalid page filter: %s' % page_filter)
    # distinct() collapses duplicate pages produced by joins on related rows.
    pages = pages.filter(**kwargs).distinct()

# Build the filters: one {'name', 'count'} entry per known filter, in the
# order they are declared. Each count is its own database query.
filters = []
for name, kwargs in page_filters:
    filters.append({
        'name': name,
        'count': doc.pages.filter(**kwargs).distinct().count(),
    })
Matching names
http://github.com/simonw/datamatcher
On the day
def get_mp_pages():
    """Scrape the index page for links to each MP's allowances page.

    Returns a list of ``(mp_name, href)`` tuples, one for every ``<a>`` whose
    ``title`` attribute ends with "'s allowances". ``mp_name`` is that title
    with the "'s allowances" text removed; ``href`` is taken from the link
    unchanged.
    """
    suffix = "'s allowances"
    index_soup = Soup(urllib.urlopen(INDEX_URL))
    return [
        (anchor['title'].replace(suffix, ''), anchor['href'])
        for anchor in index_soup.findAll('a')
        # .get() with a default so links without a title are simply skipped.
        if anchor.get('title', '').endswith(suffix)
    ]
def get_pdfs(mp_url):
    """Scrape one MP's page for the rows of its PDF table.

    Every ``<tr>`` after the first must contain exactly three ``<td>`` cells
    (name, year, PDF link); any other count makes the unpacking raise
    ``ValueError``.

    Returns a list of ``(name, year, pdf_url, size)`` tuples, where ``size``
    is the last child of the link with all parentheses removed.
    """
    page_soup = Soup(urllib.urlopen(mp_url))
    rows = []
    # [1:] skips the first row, which is the table header.
    for row in page_soup.findAll('tr')[1:]:
        name_cell, year_cell, link_cell = row.findAll('td')
        label = name_cell.string
        period = year_cell.string
        anchor = link_cell.find('a')
        href = anchor['href']
        size_text = anchor.contents[-1].replace('(', '').replace(')', '')
        rows.append((label, period, href, size_text))
    return rows
“Drop Everything”
Photoshop + AppleScript
           vs.
     Java + IntelliJ
Images on our
docroot (S3 upload
was taking too long)
Blitz QA
Launch! (on EC2)
Crash #1: more
Apache children than
MySQL connections
# Count the pages that have no related vote rows at all.
pages_without_votes = Page.objects.filter(votes__isnull=True)
unreviewed_count = pages_without_votes.distinct().count()
SELECT
  COUNT(DISTINCT `expenses_page`.`id`)
FROM
  `expenses_page` LEFT OUTER JOIN `expenses_vote` ON (
     `expenses_page`.`id` = `expenses_vote`.`page_id`
  ) WHERE `expenses_vote`.`id` IS NULL
# Serve the homepage count from the cache and recompute it at most once
# every 60 seconds.
unreviewed_count = cache.get('homepage:unreviewed_count')
if unreviewed_count is None:
    unreviewed_count = Page.objects.filter(
       votes__isnull = True
    ).distinct().count()
    # The key must be exactly the one read above. The previous version wrote
    # to 'homepage: unreviewed_count' (with a space), so the lookup never hit
    # and the query ran on every request; memcached keys also cannot contain
    # whitespace.
    cache.set('homepage:unreviewed_count', unreviewed_count, 60)
• With 70,000 pages and a LOT of votes...
 • DB takes up 135% of CPU
• Cache the count in memcached...
 • DB drops to 35% of CPU
# Pages with no related vote rows.
pages_without_votes = Page.objects.filter(votes__isnull=True)
unreviewed_count = pages_without_votes.distinct().count()

# Pages with at least one related vote row.
pages_with_votes = Page.objects.filter(votes__isnull=False)
reviewed_count = pages_with_votes.distinct().count()
# Count the pages whose is_reviewed flag is False.
# NOTE(review): is_reviewed is not among the Page fields listed earlier in
# this file -- confirm it exists on the real model.
unreviewed_count = Page.objects.filter(is_reviewed=False).count()
Migrating to InnoDB
on a separate server
ssh mps-live "mysqldump mp_expenses" |
sed 's/ENGINE=MyISAM/ENGINE=InnoDB/g' |
  sed 's/CHARSET=latin1/CHARSET=utf8/g' |
  ssh mysql-big "mysql -u root mp_expenses"
“next” button
def next_global(request):
    """Redirect to a randomly chosen unreviewed page from the whole site.

    Returns a redirect to the chosen page's ``get_absolute_url()``, or a
    plain ``HttpResponse`` saying everything has been reviewed when no
    page has ``is_reviewed = False``.
    """
    # Slice before evaluating so the database returns a single row. The
    # previous truth test on the unsliced queryset loaded every unreviewed
    # page, in random order, just to use the first one.
    candidates = list(
        Page.objects.filter(is_reviewed=False).order_by('?')[:1]
    )
    if candidates:
        return Redirect(candidates[0].get_absolute_url())
    return HttpResponse('All pages have been reviewed!')
import random

def next_global_from_cache(request):
    """Redirect to a random page drawn from the cached list of unreviewed ids.

    Reads the ``'unreviewed_page_ids'`` cache entry. When it is missing or
    empty the request is handed to ``next_global``; otherwise one id is
    picked at random and the user is redirected to ``/page/<id>/``.
    """
    cached_ids = cache.get('unreviewed_page_ids')
    if not cached_ids:
        # Nothing usable in the cache: fall back to the database-backed view.
        return next_global(request)
    chosen_id = random.choice(cached_ids)
    return Redirect('/page/%s/' % chosen_id)
from django.core.management.base import BaseCommand
from mp_expenses.expenses.models import Page
from django.core.cache import cache

class Command(BaseCommand):
   """Management command that stores unreviewed page ids in the cache."""
   help = """
   populate unreviewed_page_ids in memcached
   """
   requires_model_validation = True
   can_import_settings = True

   def handle(self, *args, **options):
       """Cache up to 1000 primary keys of pages not marked as reviewed.

       The list is written to the ``'unreviewed_page_ids'`` cache key with
       no explicit timeout argument.
       """
       not_reviewed = Page.objects.exclude(is_reviewed=True)
       # Take at most 1000 primary keys as a flat list of values.
       id_batch = list(not_reviewed.values_list('pk', flat=True)[:1000])
       cache.set('unreviewed_page_ids', id_batch)
The numbers
Final thoughts

• High score tables help
• MP photographs really help
• Keeping up the interest is hard
• Next step: start releasing the data

More Related Content

What's hot

Cheap tricks for startups
Cheap tricks for startupsCheap tricks for startups
Cheap tricks for startupsSimon Willison
 
Building Things Fast - and getting approval
Building Things Fast - and getting approvalBuilding Things Fast - and getting approval
Building Things Fast - and getting approvalSimon Willison
 
Learning jQuery in 30 minutes
Learning jQuery in 30 minutesLearning jQuery in 30 minutes
Learning jQuery in 30 minutesSimon Willison
 
Jqeury ajax plugins
Jqeury ajax pluginsJqeury ajax plugins
Jqeury ajax pluginsInbal Geffen
 
Week 4 - jQuery + Ajax
Week 4 - jQuery + AjaxWeek 4 - jQuery + Ajax
Week 4 - jQuery + Ajaxbaygross
 
A Short Introduction To jQuery
A Short Introduction To jQueryA Short Introduction To jQuery
A Short Introduction To jQuerySudar Muthu
 
Jquery In Rails
Jquery In RailsJquery In Rails
Jquery In Railsshen liu
 
Drupal Best Practices
Drupal Best PracticesDrupal Best Practices
Drupal Best Practicesmanugoel2003
 
jQuery from the very beginning
jQuery from the very beginningjQuery from the very beginning
jQuery from the very beginningAnis Ahmad
 
Jquery Best Practices
Jquery Best PracticesJquery Best Practices
Jquery Best Practicesbrinsknaps
 
HTML5 and CSS3 Refresher
HTML5 and CSS3 RefresherHTML5 and CSS3 Refresher
HTML5 and CSS3 RefresherIvano Malavolta
 
Datamapper @ Railsconf2010
Datamapper @ Railsconf2010Datamapper @ Railsconf2010
Datamapper @ Railsconf2010Dirkjan Bussink
 
Tips of CakePHP and MongoDB - Cakefest2011 ichikaway
Tips of CakePHP and MongoDB - Cakefest2011 ichikaway Tips of CakePHP and MongoDB - Cakefest2011 ichikaway
Tips of CakePHP and MongoDB - Cakefest2011 ichikaway ichikaway
 
Acceptance Testing with Webrat
Acceptance Testing with WebratAcceptance Testing with Webrat
Acceptance Testing with WebratLuismi Cavallé
 
DataMapper @ RubyEnRails2009
DataMapper @ RubyEnRails2009DataMapper @ RubyEnRails2009
DataMapper @ RubyEnRails2009Dirkjan Bussink
 
Php 102: Out with the Bad, In with the Good
Php 102: Out with the Bad, In with the GoodPhp 102: Out with the Bad, In with the Good
Php 102: Out with the Bad, In with the GoodJeremy Kendall
 
Maintainable JavaScript 2012
Maintainable JavaScript 2012Maintainable JavaScript 2012
Maintainable JavaScript 2012Nicholas Zakas
 
Leveraging the Power of Graph Databases in PHP
Leveraging the Power of Graph Databases in PHPLeveraging the Power of Graph Databases in PHP
Leveraging the Power of Graph Databases in PHPJeremy Kendall
 

What's hot (20)

Cheap tricks for startups
Cheap tricks for startupsCheap tricks for startups
Cheap tricks for startups
 
Building Things Fast - and getting approval
Building Things Fast - and getting approvalBuilding Things Fast - and getting approval
Building Things Fast - and getting approval
 
Learning jQuery in 30 minutes
Learning jQuery in 30 minutesLearning jQuery in 30 minutes
Learning jQuery in 30 minutes
 
Jqeury ajax plugins
Jqeury ajax pluginsJqeury ajax plugins
Jqeury ajax plugins
 
Week 4 - jQuery + Ajax
Week 4 - jQuery + AjaxWeek 4 - jQuery + Ajax
Week 4 - jQuery + Ajax
 
A Short Introduction To jQuery
A Short Introduction To jQueryA Short Introduction To jQuery
A Short Introduction To jQuery
 
Jquery In Rails
Jquery In RailsJquery In Rails
Jquery In Rails
 
jQuery quick tuts
jQuery quick tutsjQuery quick tuts
jQuery quick tuts
 
Drupal Best Practices
Drupal Best PracticesDrupal Best Practices
Drupal Best Practices
 
jQuery from the very beginning
jQuery from the very beginningjQuery from the very beginning
jQuery from the very beginning
 
Jquery Best Practices
Jquery Best PracticesJquery Best Practices
Jquery Best Practices
 
HTML5 and CSS3 Refresher
HTML5 and CSS3 RefresherHTML5 and CSS3 Refresher
HTML5 and CSS3 Refresher
 
Datamapper @ Railsconf2010
Datamapper @ Railsconf2010Datamapper @ Railsconf2010
Datamapper @ Railsconf2010
 
Tips of CakePHP and MongoDB - Cakefest2011 ichikaway
Tips of CakePHP and MongoDB - Cakefest2011 ichikaway Tips of CakePHP and MongoDB - Cakefest2011 ichikaway
Tips of CakePHP and MongoDB - Cakefest2011 ichikaway
 
Acceptance Testing with Webrat
Acceptance Testing with WebratAcceptance Testing with Webrat
Acceptance Testing with Webrat
 
Advanced Django
Advanced DjangoAdvanced Django
Advanced Django
 
DataMapper @ RubyEnRails2009
DataMapper @ RubyEnRails2009DataMapper @ RubyEnRails2009
DataMapper @ RubyEnRails2009
 
Php 102: Out with the Bad, In with the Good
Php 102: Out with the Bad, In with the GoodPhp 102: Out with the Bad, In with the Good
Php 102: Out with the Bad, In with the Good
 
Maintainable JavaScript 2012
Maintainable JavaScript 2012Maintainable JavaScript 2012
Maintainable JavaScript 2012
 
Leveraging the Power of Graph Databases in PHP
Leveraging the Power of Graph Databases in PHPLeveraging the Power of Graph Databases in PHP
Leveraging the Power of Graph Databases in PHP
 

Similar to Crowdsourcing with Django

A brief history of Django model syntax
A brief history of Django model syntaxA brief history of Django model syntax
A brief history of Django model syntaxJacob Kaplan-Moss
 
Python Development (MongoSF)
Python Development (MongoSF)Python Development (MongoSF)
Python Development (MongoSF)Mike Dirolf
 
Inside PyMongo - MongoNYC
Inside PyMongo - MongoNYCInside PyMongo - MongoNYC
Inside PyMongo - MongoNYCMike Dirolf
 
Jython: Python para la plataforma Java (EL2009)
Jython: Python para la plataforma Java (EL2009)Jython: Python para la plataforma Java (EL2009)
Jython: Python para la plataforma Java (EL2009)Leonardo Soto
 
Django workshop : let's make a blog
Django workshop : let's make a blogDjango workshop : let's make a blog
Django workshop : let's make a blogPierre Sudron
 
Jython: Python para la plataforma Java (JRSL 09)
Jython: Python para la plataforma Java (JRSL 09)Jython: Python para la plataforma Java (JRSL 09)
Jython: Python para la plataforma Java (JRSL 09)Leonardo Soto
 
RubyBarCamp “Полезные gems и plugins”
RubyBarCamp “Полезные gems и plugins”RubyBarCamp “Полезные gems и plugins”
RubyBarCamp “Полезные gems и plugins”apostlion
 
Type safe embedded domain-specific languages
Type safe embedded domain-specific languagesType safe embedded domain-specific languages
Type safe embedded domain-specific languagesArthur Xavier
 
Introduction to Django
Introduction to DjangoIntroduction to Django
Introduction to DjangoJoaquim Rocha
 
Django Forms: Best Practices, Tips, Tricks
Django Forms: Best Practices, Tips, TricksDjango Forms: Best Practices, Tips, Tricks
Django Forms: Best Practices, Tips, TricksShawn Rider
 
Django in the Office: Get Your Admin for Nothing and Your SQL for Free
Django in the Office: Get Your Admin for Nothing and Your SQL for FreeDjango in the Office: Get Your Admin for Nothing and Your SQL for Free
Django in the Office: Get Your Admin for Nothing and Your SQL for FreeHarvard Web Working Group
 
Mongoid in the real world
Mongoid in the real worldMongoid in the real world
Mongoid in the real worldKevin Faustino
 
A Basic Django Introduction
A Basic Django IntroductionA Basic Django Introduction
A Basic Django IntroductionGanga Ram
 

Similar to Crowdsourcing with Django (20)

A brief history of Django model syntax
A brief history of Django model syntaxA brief history of Django model syntax
A brief history of Django model syntax
 
Python Development (MongoSF)
Python Development (MongoSF)Python Development (MongoSF)
Python Development (MongoSF)
 
Django Search
Django SearchDjango Search
Django Search
 
JQuery Flot
JQuery FlotJQuery Flot
JQuery Flot
 
Inside PyMongo - MongoNYC
Inside PyMongo - MongoNYCInside PyMongo - MongoNYC
Inside PyMongo - MongoNYC
 
Capybara
CapybaraCapybara
Capybara
 
Epic South Disasters
Epic South DisastersEpic South Disasters
Epic South Disasters
 
Django - sql alchemy - jquery
Django - sql alchemy - jqueryDjango - sql alchemy - jquery
Django - sql alchemy - jquery
 
Jython: Python para la plataforma Java (EL2009)
Jython: Python para la plataforma Java (EL2009)Jython: Python para la plataforma Java (EL2009)
Jython: Python para la plataforma Java (EL2009)
 
Django workshop : let's make a blog
Django workshop : let's make a blogDjango workshop : let's make a blog
Django workshop : let's make a blog
 
Jython: Python para la plataforma Java (JRSL 09)
Jython: Python para la plataforma Java (JRSL 09)Jython: Python para la plataforma Java (JRSL 09)
Jython: Python para la plataforma Java (JRSL 09)
 
ORM in Django
ORM in DjangoORM in Django
ORM in Django
 
RubyBarCamp “Полезные gems и plugins”
RubyBarCamp “Полезные gems и plugins”RubyBarCamp “Полезные gems и plugins”
RubyBarCamp “Полезные gems и plugins”
 
Type safe embedded domain-specific languages
Type safe embedded domain-specific languagesType safe embedded domain-specific languages
Type safe embedded domain-specific languages
 
Build your own entity with Drupal
Build your own entity with DrupalBuild your own entity with Drupal
Build your own entity with Drupal
 
Introduction to Django
Introduction to DjangoIntroduction to Django
Introduction to Django
 
Django Forms: Best Practices, Tips, Tricks
Django Forms: Best Practices, Tips, TricksDjango Forms: Best Practices, Tips, Tricks
Django Forms: Best Practices, Tips, Tricks
 
Django in the Office: Get Your Admin for Nothing and Your SQL for Free
Django in the Office: Get Your Admin for Nothing and Your SQL for FreeDjango in the Office: Get Your Admin for Nothing and Your SQL for Free
Django in the Office: Get Your Admin for Nothing and Your SQL for Free
 
Mongoid in the real world
Mongoid in the real worldMongoid in the real world
Mongoid in the real world
 
A Basic Django Introduction
A Basic Django IntroductionA Basic Django Introduction
A Basic Django Introduction
 

More from Simon Willison

How we bootstrapped Lanyrd using Twitter's social graph
How we bootstrapped Lanyrd using Twitter's social graphHow we bootstrapped Lanyrd using Twitter's social graph
How we bootstrapped Lanyrd using Twitter's social graphSimon Willison
 
Web Services for Fun and Profit
Web Services for Fun and ProfitWeb Services for Fun and Profit
Web Services for Fun and ProfitSimon Willison
 
Tricks & challenges developing a large Django application
Tricks & challenges developing a large Django applicationTricks & challenges developing a large Django application
Tricks & challenges developing a large Django applicationSimon Willison
 
Advanced Aspects of the Django Ecosystem: Haystack, Celery & Fabric
Advanced Aspects of the Django Ecosystem: Haystack, Celery & FabricAdvanced Aspects of the Django Ecosystem: Haystack, Celery & Fabric
Advanced Aspects of the Django Ecosystem: Haystack, Celery & FabricSimon Willison
 
How Lanyrd uses Twitter
How Lanyrd uses TwitterHow Lanyrd uses Twitter
How Lanyrd uses TwitterSimon Willison
 
Building crowdsourcing applications
Building crowdsourcing applicationsBuilding crowdsourcing applications
Building crowdsourcing applicationsSimon Willison
 
Evented I/O based web servers, explained using bunnies
Evented I/O based web servers, explained using bunniesEvented I/O based web servers, explained using bunnies
Evented I/O based web servers, explained using bunniesSimon Willison
 
Class-based views with Django
Class-based views with DjangoClass-based views with Django
Class-based views with DjangoSimon Willison
 
Web App Security Horror Stories
Web App Security Horror StoriesWeb App Security Horror Stories
Web App Security Horror StoriesSimon Willison
 
Web Security Horror Stories
Web Security Horror StoriesWeb Security Horror Stories
Web Security Horror StoriesSimon Willison
 
When Zeppelins Ruled The Earth
When Zeppelins Ruled The EarthWhen Zeppelins Ruled The Earth
When Zeppelins Ruled The EarthSimon Willison
 
When Ajax Attacks! Web application security fundamentals
When Ajax Attacks! Web application security fundamentalsWhen Ajax Attacks! Web application security fundamentals
When Ajax Attacks! Web application security fundamentalsSimon Willison
 
I love Zeppelins, and you should too
I love Zeppelins, and you should tooI love Zeppelins, and you should too
I love Zeppelins, and you should tooSimon Willison
 
OpenID at Open Tech 2008
OpenID at Open Tech 2008OpenID at Open Tech 2008
OpenID at Open Tech 2008Simon Willison
 
Going Live! with Comet
Going Live! with CometGoing Live! with Comet
Going Live! with CometSimon Willison
 
URL-based identity with OpenID
URL-based identity with OpenIDURL-based identity with OpenID
URL-based identity with OpenIDSimon Willison
 

More from Simon Willison (20)

How Lanyrd does Geo
How Lanyrd does GeoHow Lanyrd does Geo
How Lanyrd does Geo
 
Building Lanyrd
Building LanyrdBuilding Lanyrd
Building Lanyrd
 
How we bootstrapped Lanyrd using Twitter's social graph
How we bootstrapped Lanyrd using Twitter's social graphHow we bootstrapped Lanyrd using Twitter's social graph
How we bootstrapped Lanyrd using Twitter's social graph
 
Web Services for Fun and Profit
Web Services for Fun and ProfitWeb Services for Fun and Profit
Web Services for Fun and Profit
 
Tricks & challenges developing a large Django application
Tricks & challenges developing a large Django applicationTricks & challenges developing a large Django application
Tricks & challenges developing a large Django application
 
Advanced Aspects of the Django Ecosystem: Haystack, Celery & Fabric
Advanced Aspects of the Django Ecosystem: Haystack, Celery & FabricAdvanced Aspects of the Django Ecosystem: Haystack, Celery & Fabric
Advanced Aspects of the Django Ecosystem: Haystack, Celery & Fabric
 
How Lanyrd uses Twitter
How Lanyrd uses TwitterHow Lanyrd uses Twitter
How Lanyrd uses Twitter
 
ScaleFail
ScaleFailScaleFail
ScaleFail
 
Building crowdsourcing applications
Building crowdsourcing applicationsBuilding crowdsourcing applications
Building crowdsourcing applications
 
Evented I/O based web servers, explained using bunnies
Evented I/O based web servers, explained using bunniesEvented I/O based web servers, explained using bunnies
Evented I/O based web servers, explained using bunnies
 
Django Heresies
Django HeresiesDjango Heresies
Django Heresies
 
Class-based views with Django
Class-based views with DjangoClass-based views with Django
Class-based views with Django
 
Web App Security Horror Stories
Web App Security Horror StoriesWeb App Security Horror Stories
Web App Security Horror Stories
 
Web Security Horror Stories
Web Security Horror StoriesWeb Security Horror Stories
Web Security Horror Stories
 
When Zeppelins Ruled The Earth
When Zeppelins Ruled The EarthWhen Zeppelins Ruled The Earth
When Zeppelins Ruled The Earth
 
When Ajax Attacks! Web application security fundamentals
When Ajax Attacks! Web application security fundamentalsWhen Ajax Attacks! Web application security fundamentals
When Ajax Attacks! Web application security fundamentals
 
I love Zeppelins, and you should too
I love Zeppelins, and you should tooI love Zeppelins, and you should too
I love Zeppelins, and you should too
 
OpenID at Open Tech 2008
OpenID at Open Tech 2008OpenID at Open Tech 2008
OpenID at Open Tech 2008
 
Going Live! with Comet
Going Live! with CometGoing Live! with Comet
Going Live! with Comet
 
URL-based identity with OpenID
URL-based identity with OpenIDURL-based identity with OpenID
URL-based identity with OpenID
 

Recently uploaded

Merck Moving Beyond Passwords: FIDO Paris Seminar.pptx
Merck Moving Beyond Passwords: FIDO Paris Seminar.pptxMerck Moving Beyond Passwords: FIDO Paris Seminar.pptx
Merck Moving Beyond Passwords: FIDO Paris Seminar.pptxLoriGlavin3
 
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek SchlawackFwdays
 
Anypoint Exchange: It’s Not Just a Repo!
Anypoint Exchange: It’s Not Just a Repo!Anypoint Exchange: It’s Not Just a Repo!
Anypoint Exchange: It’s Not Just a Repo!Manik S Magar
 
How to write a Business Continuity Plan
How to write a Business Continuity PlanHow to write a Business Continuity Plan
How to write a Business Continuity PlanDatabarracks
 
The Ultimate Guide to Choosing WordPress Pros and Cons
The Ultimate Guide to Choosing WordPress Pros and ConsThe Ultimate Guide to Choosing WordPress Pros and Cons
The Ultimate Guide to Choosing WordPress Pros and ConsPixlogix Infotech
 
New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024
New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024
New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024BookNet Canada
 
Ensuring Technical Readiness For Copilot in Microsoft 365
Ensuring Technical Readiness For Copilot in Microsoft 365Ensuring Technical Readiness For Copilot in Microsoft 365
Ensuring Technical Readiness For Copilot in Microsoft 3652toLead Limited
 
Passkey Providers and Enabling Portability: FIDO Paris Seminar.pptx
Passkey Providers and Enabling Portability: FIDO Paris Seminar.pptxPasskey Providers and Enabling Portability: FIDO Paris Seminar.pptx
Passkey Providers and Enabling Portability: FIDO Paris Seminar.pptxLoriGlavin3
 
Nell’iperspazio con Rocket: il Framework Web di Rust!
Nell’iperspazio con Rocket: il Framework Web di Rust!Nell’iperspazio con Rocket: il Framework Web di Rust!
Nell’iperspazio con Rocket: il Framework Web di Rust!Commit University
 
"Debugging python applications inside k8s environment", Andrii Soldatenko
"Debugging python applications inside k8s environment", Andrii Soldatenko"Debugging python applications inside k8s environment", Andrii Soldatenko
"Debugging python applications inside k8s environment", Andrii SoldatenkoFwdays
 
Time Series Foundation Models - current state and future directions
Time Series Foundation Models - current state and future directionsTime Series Foundation Models - current state and future directions
Time Series Foundation Models - current state and future directionsNathaniel Shimoni
 
From Family Reminiscence to Scholarly Archive .
From Family Reminiscence to Scholarly Archive .From Family Reminiscence to Scholarly Archive .
From Family Reminiscence to Scholarly Archive .Alan Dix
 
Digital Identity is Under Attack: FIDO Paris Seminar.pptx
Digital Identity is Under Attack: FIDO Paris Seminar.pptxDigital Identity is Under Attack: FIDO Paris Seminar.pptx
Digital Identity is Under Attack: FIDO Paris Seminar.pptxLoriGlavin3
 
DevEX - reference for building teams, processes, and platforms
DevEX - reference for building teams, processes, and platformsDevEX - reference for building teams, processes, and platforms
DevEX - reference for building teams, processes, and platformsSergiu Bodiu
 
A Deep Dive on Passkeys: FIDO Paris Seminar.pptx
A Deep Dive on Passkeys: FIDO Paris Seminar.pptxA Deep Dive on Passkeys: FIDO Paris Seminar.pptx
A Deep Dive on Passkeys: FIDO Paris Seminar.pptxLoriGlavin3
 
Dev Dives: Streamline document processing with UiPath Studio Web
Dev Dives: Streamline document processing with UiPath Studio WebDev Dives: Streamline document processing with UiPath Studio Web
Dev Dives: Streamline document processing with UiPath Studio WebUiPathCommunity
 
A Journey Into the Emotions of Software Developers
A Journey Into the Emotions of Software DevelopersA Journey Into the Emotions of Software Developers
A Journey Into the Emotions of Software DevelopersNicole Novielli
 
WordPress Websites for Engineers: Elevate Your Brand
WordPress Websites for Engineers: Elevate Your BrandWordPress Websites for Engineers: Elevate Your Brand
WordPress Websites for Engineers: Elevate Your Brandgvaughan
 
unit 4 immunoblotting technique complete.pptx
unit 4 immunoblotting technique complete.pptxunit 4 immunoblotting technique complete.pptx
unit 4 immunoblotting technique complete.pptxBkGupta21
 
(How to Program) Paul Deitel, Harvey Deitel-Java How to Program, Early Object...
(How to Program) Paul Deitel, Harvey Deitel-Java How to Program, Early Object...(How to Program) Paul Deitel, Harvey Deitel-Java How to Program, Early Object...
(How to Program) Paul Deitel, Harvey Deitel-Java How to Program, Early Object...AliaaTarek5
 

Recently uploaded (20)

Merck Moving Beyond Passwords: FIDO Paris Seminar.pptx
Merck Moving Beyond Passwords: FIDO Paris Seminar.pptxMerck Moving Beyond Passwords: FIDO Paris Seminar.pptx
Merck Moving Beyond Passwords: FIDO Paris Seminar.pptx
 
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
 
Anypoint Exchange: It’s Not Just a Repo!
Anypoint Exchange: It’s Not Just a Repo!Anypoint Exchange: It’s Not Just a Repo!
Anypoint Exchange: It’s Not Just a Repo!
 
How to write a Business Continuity Plan
How to write a Business Continuity PlanHow to write a Business Continuity Plan
How to write a Business Continuity Plan
 
The Ultimate Guide to Choosing WordPress Pros and Cons
The Ultimate Guide to Choosing WordPress Pros and ConsThe Ultimate Guide to Choosing WordPress Pros and Cons
The Ultimate Guide to Choosing WordPress Pros and Cons
 
New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024
New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024
New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024
 
Ensuring Technical Readiness For Copilot in Microsoft 365
Ensuring Technical Readiness For Copilot in Microsoft 365Ensuring Technical Readiness For Copilot in Microsoft 365
Ensuring Technical Readiness For Copilot in Microsoft 365
 
Passkey Providers and Enabling Portability: FIDO Paris Seminar.pptx
Passkey Providers and Enabling Portability: FIDO Paris Seminar.pptxPasskey Providers and Enabling Portability: FIDO Paris Seminar.pptx
Passkey Providers and Enabling Portability: FIDO Paris Seminar.pptx
 
Nell’iperspazio con Rocket: il Framework Web di Rust!
Nell’iperspazio con Rocket: il Framework Web di Rust!Nell’iperspazio con Rocket: il Framework Web di Rust!
Nell’iperspazio con Rocket: il Framework Web di Rust!
 
"Debugging python applications inside k8s environment", Andrii Soldatenko
"Debugging python applications inside k8s environment", Andrii Soldatenko"Debugging python applications inside k8s environment", Andrii Soldatenko
"Debugging python applications inside k8s environment", Andrii Soldatenko
 
Time Series Foundation Models - current state and future directions
Time Series Foundation Models - current state and future directionsTime Series Foundation Models - current state and future directions
Time Series Foundation Models - current state and future directions
 
From Family Reminiscence to Scholarly Archive .
From Family Reminiscence to Scholarly Archive .From Family Reminiscence to Scholarly Archive .
From Family Reminiscence to Scholarly Archive .
 
Digital Identity is Under Attack: FIDO Paris Seminar.pptx
Digital Identity is Under Attack: FIDO Paris Seminar.pptxDigital Identity is Under Attack: FIDO Paris Seminar.pptx
Digital Identity is Under Attack: FIDO Paris Seminar.pptx
 
DevEX - reference for building teams, processes, and platforms
DevEX - reference for building teams, processes, and platformsDevEX - reference for building teams, processes, and platforms
DevEX - reference for building teams, processes, and platforms
 
A Deep Dive on Passkeys: FIDO Paris Seminar.pptx
A Deep Dive on Passkeys: FIDO Paris Seminar.pptxA Deep Dive on Passkeys: FIDO Paris Seminar.pptx
A Deep Dive on Passkeys: FIDO Paris Seminar.pptx
 
Dev Dives: Streamline document processing with UiPath Studio Web
Dev Dives: Streamline document processing with UiPath Studio WebDev Dives: Streamline document processing with UiPath Studio Web
Dev Dives: Streamline document processing with UiPath Studio Web
 
A Journey Into the Emotions of Software Developers
A Journey Into the Emotions of Software DevelopersA Journey Into the Emotions of Software Developers
A Journey Into the Emotions of Software Developers
 
WordPress Websites for Engineers: Elevate Your Brand
WordPress Websites for Engineers: Elevate Your BrandWordPress Websites for Engineers: Elevate Your Brand
WordPress Websites for Engineers: Elevate Your Brand
 
unit 4 immunoblotting technique complete.pptx
unit 4 immunoblotting technique complete.pptxunit 4 immunoblotting technique complete.pptx
unit 4 immunoblotting technique complete.pptx
 
(How to Program) Paul Deitel, Harvey Deitel-Java How to Program, Early Object...
(How to Program) Paul Deitel, Harvey Deitel-Java How to Program, Early Object...(How to Program) Paul Deitel, Harvey Deitel-Java How to Program, Early Object...
(How to Program) Paul Deitel, Harvey Deitel-Java How to Program, Early Object...
 

Crowdsourcing with Django

  • 1. Crowdsourcing with Django EuroPython, 30th June 2009 Simon Willison · http://simonwillison.net/ · @simonw
  • 4. November 2000 The Freedom of Information Act
  • 5. Heather Brooke • http://www.guardian.co.uk/politics/2009/may/08/mps-expenses-telegraph-checquebook-journalism • http://www.guardian.co.uk/politics/2009/may/15/mps-expenses-heather-brooke-foi
  • 7. January 2005 The FOI request
  • 8. July 2006 The FOI commissioner
  • 9. May 2007 The FOI (Amendment) Bill
  • 14.
  • 15.
  • 16. March 2009 The mole
  • 17. “All of the receipts of 650-odd MPs, redacted and unredacted, are for sale at a price of £300,000, so I am told. The price is going up because of the interest in the subject.” Sir Stuart Bell, MP Newsnight, 30th March
  • 18. 8th May, 2009 The Daily Telegraph
  • 20. April: “Expenses are due out in a couple of months, is there anything we can do?”
  • 21. June: “Expenses have been bumped forward, they’re out next week!”
  • 22. Thursday 11th June The proof-of-concept
  • 23. Monday 15th June The tentative go-ahead
  • 24. Tuesday 16th June Designer + client-side engineer
  • 25. Wednesday 17th June Operations engineer
  • 26. Thursday 18th June Launch day!
  • 27.
  • 28.
  • 29.
  • 30.
  • 31.
  • 32.
  • 34.
  • 35.
  • 37.
  • 39. class Party(models.Model): name = models.CharField(max_length=100) class Constituency(models.Model): name = models.CharField(max_length=100) class MP(models.Model): name = models.CharField(max_length=100) party = models.ForeignKey(Party) constituency = models.ForeignKey(Constituency) guardian_url = models.CharField(max_length=255, blank=True) guardian_image_url = models.CharField(max_length=255, blank=True)
  • 40. class FinancialYear(models.Model): name = models.CharField(max_length=10) class Document(models.Model): title = models.CharField(max_length=100, blank=True) filename = models.CharField(max_length=100) mp = models.ForeignKey(MP) financial_year = models.ForeignKey(FinancialYear) class Page(models.Model): document = models.ForeignKey(Document) page_number = models.IntegerField()
  • 41. class User(models.Model): created = models.DateTimeField(auto_now_add = True) username = models.TextField(max_length = 100) password_hash = models.CharField(max_length = 128, blank=True) class LineItemCategory(models.Model): order = models.IntegerField(default = 0) name = models.CharField(max_length = 32) class LineItem(models.Model): user = models.ForeignKey(User) page = models.ForeignKey(Page) type = models.CharField(max_length = 16, choices = ( ('claim', 'claim'), ('proof', 'proof'), ), db_index = True) date = models.DateField(null = True, blank = True) amount = models.DecimalField(max_digits=20, decimal_places=2) description = models.CharField(max_length = 255, blank = True) created = models.DateTimeField(auto_now_add = True, db_index = True) categories = models.ManyToManyField(LineItemCategory, blank=True)
  • 42. class Vote(models.Model): user = models.ForeignKey(User, related_name = 'votes') page = models.ForeignKey(Page, related_name = 'votes') obsolete = models.BooleanField(default = False) vote_type = models.CharField(max_length = 32, blank = True) ip_address = models.CharField(max_length = 32) created = models.DateTimeField(auto_now_add = True) class TypeVote(Vote): type = models.CharField(max_length = 10, choices = ( ('claim', 'Claim'), ('proof', 'Proof'), ('blank', 'Blank'), ('other', 'Other') )) class InterestingVote(Vote): status = models.CharField(max_length = 10, choices = ( ('no', 'Not interesting'), ('yes', 'Interesting'), ('known', 'Interesting but known'), ('very', 'Investigate this!'), ))
  • 44.
  • 46.
  • 47. page_filters = ( # Maps name of filter to dictionary of kwargs to doc.pages.filter() ('reviewed', { 'votes__isnull': False }), ('unreviewed', { 'votes__isnull': True }), ('with line items', { 'line_items__isnull': False }), ('interesting', { 'votes__interestingvote__status': 'yes' }), ('interesting but known', { 'votes__interestingvote__status': 'known' ... ) page_filters_lookup = dict(page_filters)
  • 48. pages = doc.pages.all() if page_filter: kwargs = page_filters_lookup.get(page_filter) if kwargs is None: raise Http404, 'Invalid page filter: %s' % page_filter pages = pages.filter(**kwargs).distinct() # Build the filters filters = [] for name, kwargs in page_filters: filters.append({ 'name': name, 'count': doc.pages.filter(**kwargs).distinct().count(), })
  • 52.
  • 53.
  • 54.
  • 55. def get_mp_pages(): "Returns list of (mp-name, mp-page-url) tuples" soup = Soup(urllib.urlopen(INDEX_URL)) mp_links = [] for link in soup.findAll('a'): if link.get('title', '').endswith("'s allowances"): mp_links.append( (link['title'].replace("'s allowances", ''), link['href']) ) return mp_links
  • 56. def get_pdfs(mp_url): "Returns list of (description, years, pdf-url, size) tuples" soup = Soup(urllib.urlopen(mp_url)) pdfs = [] trs = soup.findAll('tr')[1:] # Skip the first, it's the table header for tr in trs: name_td, year_td, pdf_td = tr.findAll('td') name = name_td.string year = year_td.string pdf_url = pdf_td.find('a')['href'] size = pdf_td.find('a').contents[-1].replace('(', '').replace(')', '') pdfs.append( (name, year, pdf_url, size) ) return pdfs
  • 57.
  • 58.
  • 59.
  • 61. Photoshop + AppleScript vs. Java + IntelliJ
  • 62. Images on our docroot (S3 upload was taking too long)
  • 65.
  • 66. Crash #1: more Apache children than MySQL connections
  • 67.
  • 68.
  • 69. unreviewed_count = Page.objects.filter( votes__isnull = True ).distinct().count()
  • 70. SELECT COUNT(DISTINCT `expenses_page`.`id`) FROM `expenses_page` LEFT OUTER JOIN `expenses_vote` ON ( `expenses_page`.`id` = `expenses_vote`.`page_id` ) WHERE `expenses_vote`.`id` IS NULL
  • 71. unreviewed_count = cache.get('homepage:unreviewed_count') if unreviewed_count is None: unreviewed_count = Page.objects.filter( votes__isnull = True ).distinct().count() cache.set('homepage:unreviewed_count', unreviewed_count, 60)
  • 72. • With 70,000 pages and a LOT of votes... • DB takes up 135% of CPU • Cache the count in memcached... • DB drops to 35% of CPU
  • 73. unreviewed_count = Page.objects.filter( votes__isnull = True ).distinct().count() reviewed_count = Page.objects.filter( votes__isnull = False ).distinct().count()
  • 74. unreviewed_count = Page.objects.filter( is_reviewed = False ).count()
  • 75. Migrating to InnoDB on a separate server
  • 76. ssh mps-live "mysqldump mp_expenses" | sed 's/ENGINE=MyISAM/ENGINE=InnoDB/g' | sed 's/CHARSET=latin1/CHARSET=utf8/g' | ssh mysql-big "mysql -u root mp_expenses"
  • 78. def next_global(request): # Next unreviewed page from the whole site all_unreviewed_pages = Page.objects.filter( is_reviewed = False ).order_by('?') if all_unreviewed_pages: return Redirect( all_unreviewed_pages[0].get_absolute_url() ) else: return HttpResponse( 'All pages have been reviewed!' )
  • 79. import random def next_global_from_cache(request): page_ids = cache.get('unreviewed_page_ids') if page_ids: return Redirect( '/page/%s/' % random.choice(page_ids) ) else: return next_global(request)
  • 80. from django.core.management.base import BaseCommand from mp_expenses.expenses.models import Page from django.core.cache import cache class Command(BaseCommand): help = """ populate unreviewed_page_ids in memcached """ requires_model_validation = True can_import_settings = True def handle(self, *args, **options): ids = list(Page.objects.exclude( is_reviewed = True ).values_list('pk', flat=True)[:1000]) cache.set('unreviewed_page_ids', ids)
  • 82.
  • 83. Final thoughts • High score tables help • MP photographs really help • Keeping up the interest is hard • Next step: start releasing the data