SlideShare a Scribd company logo
1 of 83
Download to read offline
Crowdsourcing
        with Django
           EuroPython, 30th June 2009
Simon Willison · http://simonwillison.net/ · @simonw
“Web development on
journalism deadlines”
The back story...
November 2000
The Freedom of Information Act
Heather Brooke

• http://www.guardian.co.uk/politics/
  2009/may/08/mps-expenses-telegraph-
  checquebook-journalism

• http://www.guardian.co.uk/politics/
  2009/may/15/mps-expenses-heather-
  brooke-foi
2004
The request
January 2005
 The FOI request
July 2006
The FOI commissioner
May 2007
The FOI (Amendment) Bill
February 2008
The Information Tribunal
“Transparency will
damage democracy”
May 2008
The high court
January 2009
The exemption law
March 2009
  The mole
“All of the receipts of 650-odd MPs,
redacted and unredacted, are for sale
 at a price of £300,000, so I am told.
 The price is going up because of the
         interest in the subject.”
                           Sir Stuart Bell, MP
                        Newsnight, 30th March
8th May, 2009
The Daily Telegraph
At the Guardian...
April: “Expenses are due out
  in a couple of months, is
there anything we can do?”
June: “Expenses have been
bumped forward, they’re out
        next week!”
Thursday 11th June
  The proof-of-concept
Monday 15th June
The tentative go-ahead
Tuesday 16th June
Designer + client-side engineer
Wednesday 17th June
   Operations engineer
Thursday 18th June
    Launch day!
How we built it
$ convert Frank_Comm.pdf pages.png
Models
class Party(models.Model):
    """A political party; MPs reference it through a foreign key."""

    name = models.CharField(max_length=100)

class Constituency(models.Model):
    """A parliamentary constituency; MPs reference it through a foreign key."""

    name = models.CharField(max_length=100)

class MP(models.Model):
    """A Member of Parliament, linked to one party and one constituency."""

    name = models.CharField(max_length=100)
    party = models.ForeignKey(Party)
    constituency = models.ForeignKey(Constituency)
    # Both Guardian URLs are optional (blank values are accepted).
    guardian_url = models.CharField(blank=True, max_length=255)
    guardian_image_url = models.CharField(blank=True, max_length=255)
class FinancialYear(models.Model):
    """A financial year label (at most 10 characters) that documents point to."""

    name = models.CharField(max_length=10)

class Document(models.Model):
    """An expenses document belonging to one MP for one financial year.

    The field names ``filename`` and ``financial_year`` were corrupted by a
    mis-decoded "fi" ligature in the slide text; they are restored here so
    the class is valid Python.
    """

    # Optional human-readable title.
    title = models.CharField(max_length=100, blank=True)
    filename = models.CharField(max_length=100)
    mp = models.ForeignKey(MP)
    financial_year = models.ForeignKey(FinancialYear)

class Page(models.Model):
    """A single numbered page of a Document."""

    # NOTE(review): other snippets in this talk access `doc.pages`; that
    # reverse accessor would need related_name='pages' on this ForeignKey.
    # Confirm against the real model before relying on it.
    document = models.ForeignKey(Document)
    page_number = models.IntegerField()
class User(models.Model):
    """A site user: creation timestamp, username and optional password hash."""

    # Set once, when the row is first created.
    created = models.DateTimeField(auto_now_add=True)
    # NOTE(review): TextField with max_length - presumably CharField was
    # intended; confirm before changing, as it alters the schema.
    username = models.TextField(max_length=100)
    # May be blank.
    password_hash = models.CharField(blank=True, max_length=128)

class LineItemCategory(models.Model):
    """A named category that line items can be tagged with."""

    # Integer ordering value; defaults to 0.
    order = models.IntegerField(default=0)
    name = models.CharField(max_length=32)

class LineItem(models.Model):
    """One expense entry recorded by a user against a page."""

    user = models.ForeignKey(User)
    page = models.ForeignKey(Page)
    # Either a claim or a proof; indexed.
    type = models.CharField(
        max_length=16,
        choices=(
            ('claim', 'claim'),
            ('proof', 'proof'),
        ),
        db_index=True,
    )
    # Optional date of the item.
    date = models.DateField(blank=True, null=True)
    amount = models.DecimalField(decimal_places=2, max_digits=20)
    description = models.CharField(blank=True, max_length=255)
    # Set once at creation; indexed.
    created = models.DateTimeField(auto_now_add=True, db_index=True)
    categories = models.ManyToManyField(LineItemCategory, blank=True)
class Vote(models.Model):
    """A vote cast by a user on a page; base class for the typed votes.

    Both foreign keys expose the reverse accessor ``votes``.
    """

    user = models.ForeignKey(User, related_name='votes')
    page = models.ForeignKey(Page, related_name='votes')
    # False by default.
    obsolete = models.BooleanField(default=False)
    vote_type = models.CharField(blank=True, max_length=32)
    ip_address = models.CharField(max_length=32)
    # Set once, when the vote is first saved.
    created = models.DateTimeField(auto_now_add=True)

class TypeVote(Vote):
    """A Vote that classifies a page as claim, proof, blank or other."""

    type = models.CharField(
        max_length=10,
        choices=(
            ('claim', 'Claim'),
            ('proof', 'Proof'),
            ('blank', 'Blank'),
            ('other', 'Other'),
        ),
    )

class InterestingVote(Vote):
    """A Vote that records how interesting a page is."""

    status = models.CharField(
        max_length=10,
        choices=(
            ('no', 'Not interesting'),
            ('yes', 'Interesting'),
            ('known', 'Interesting but known'),
            ('very', 'Investigate this!'),
        ),
    )
Frictionless
registration
Page filters
# Maps the name of each filter to a dictionary of kwargs for
# doc.pages.filter(). Kept as a tuple of (name, kwargs) pairs;
# page_filters_lookup below gives dict-style access by name.
page_filters = (
    ('reviewed', {
        'votes__isnull': False
    }),
    ('unreviewed', {
        'votes__isnull': True
    }),
    ('with line items', {
        'line_items__isnull': False
    }),
    ('interesting', {
        'votes__interestingvote__status': 'yes'
    }),
    ('interesting but known', {
        'votes__interestingvote__status': 'known'
    }),
    # ... (the remaining filters were elided on the original slide)
)
page_filters_lookup = dict(page_filters)

# Start from every page of the document, then narrow by the requested filter.
pages = doc.pages.all()
if page_filter:
    kwargs = page_filters_lookup.get(page_filter)
    if kwargs is None:
        # An unknown filter name is reported as a 404. The call form of
        # raise works on both Python 2 and Python 3.
        raise Http404('Invalid page filter: %s' % page_filter)
    # distinct() collapses duplicate pages produced by the related-table joins.
    pages = pages.filter(**kwargs).distinct()

# Build the filters: one {'name', 'count'} entry per available filter,
# where count is the number of distinct pages of this document that match.
filters = [
    {
        'name': name,
        'count': doc.pages.filter(**filter_kwargs).distinct().count(),
    }
    for name, filter_kwargs in page_filters
]
Matching names
http://github.com/simonw/datamatcher
On the day
def get_mp_pages():
    """Return a list of (mp-name, mp-page-url) tuples scraped from INDEX_URL.

    Only anchors whose ``title`` attribute ends with "'s allowances" are
    kept. The MP name is that title with the trailing suffix removed, and
    the URL is the anchor's ``href``.
    """
    suffix = "'s allowances"
    soup = Soup(urllib.urlopen(INDEX_URL))
    mp_links = []
    for link in soup.findAll('a'):
        title = link.get('title', '')
        if title.endswith(suffix):
            # Slice off only the trailing suffix, so a name that happens to
            # contain the same text elsewhere is left intact.
            mp_links.append((title[:-len(suffix)], link['href']))
    return mp_links
def get_pdfs(mp_url):
    """Return a list of (description, years, pdf-url, size) tuples.

    Fetches ``mp_url`` and reads every table row after the first. Each row
    must contain exactly three cells (description, years, PDF link);
    otherwise the tuple unpacking raises ValueError.
    """
    soup = Soup(urllib.urlopen(mp_url))
    pdfs = []
    # Skip the first row, it's the table header.
    for tr in soup.findAll('tr')[1:]:
        name_td, year_td, pdf_td = tr.findAll('td')
        # Look the anchor up once; both the URL and the size come from it.
        pdf_link = pdf_td.find('a')
        # The last child of the anchor holds the size wrapped in parentheses.
        size = pdf_link.contents[-1].replace('(', '').replace(')', '')
        pdfs.append(
            (name_td.string, year_td.string, pdf_link['href'], size)
        )
    return pdfs
“Drop Everything”
Photoshop + AppleScript
           v.s.
     Java + IntelliJ
Images on our
docroot (S3 upload
was taking too long)
Blitz QA
Launch! (on EC2)
Crash #1: more
Apache children than
MySQL connections
# Count the distinct pages that have no votes at all.
unreviewed_count = Page.objects.filter(
    votes__isnull=True
).distinct().count()
SELECT
  COUNT(DISTINCT `expenses_page`.`id`)
FROM
  `expenses_page` LEFT OUTER JOIN `expenses_vote` ON (
     `expenses_page`.`id` = `expenses_vote`.`page_id`
  ) WHERE `expenses_vote`.`id` IS NULL
# Use one key for both the read and the write. The original wrote to
# 'homepage: unreviewed_count' (with a space) but read from
# 'homepage:unreviewed_count', so the cached value was never found.
cache_key = 'homepage:unreviewed_count'
unreviewed_count = cache.get(cache_key)
if unreviewed_count is None:
    # Cache miss: run the count query and keep the result for 60 seconds.
    unreviewed_count = Page.objects.filter(
        votes__isnull=True
    ).distinct().count()
    cache.set(cache_key, unreviewed_count, 60)
• With 70,000 pages and a LOT of votes...
 • DB takes up 135% of CPU
• Cache the count in memcached...
 • DB drops to 35% of CPU
# Distinct pages with no votes.
unreviewed_count = Page.objects.filter(
    votes__isnull=True
).distinct().count()

# Distinct pages with at least one vote.
reviewed_count = Page.objects.filter(
    votes__isnull=False
).distinct().count()
# Count pages by the is_reviewed flag on Page instead of joining to votes.
unreviewed_count = Page.objects.filter(
    is_reviewed=False
).count()
Migrating to InnoDB
on a separate server
# Dump the mp_expenses database from mps-live over ssh, rewrite the table
# engine (MyISAM -> InnoDB) and charset (latin1 -> utf8) declarations in the
# dump stream, and load the result into MySQL on mysql-big.
ssh mps-live "mysqldump mp_expenses" |
sed 's/ENGINE=MyISAM/ENGINE=InnoDB/g' |
  sed 's/CHARSET=latin1/CHARSET=utf8/g' |
  ssh mysql-big "mysql -u root mp_expenses"
“next” button
def next_global(request):
    """Redirect to a random unreviewed page from the whole site.

    Returns a redirect to the chosen page's absolute URL, or a plain
    HttpResponse saying that everything has been reviewed when no
    unreviewed page is left.
    """
    # Slice to a single row before evaluating. Testing the unsliced
    # queryset for truthiness would load every unreviewed page just to
    # use the first one.
    candidates = list(
        Page.objects.filter(
            is_reviewed=False
        ).order_by('?')[:1]
    )
    if candidates:
        return Redirect(
            candidates[0].get_absolute_url()
        )
    return HttpResponse(
        'All pages have been reviewed!'
    )
import random

def next_global_from_cache(request):
    """Redirect to a random page id taken from the cached unreviewed list.

    Reads the 'unreviewed_page_ids' cache entry; when it is missing or
    empty, defers to next_global(request) instead.
    """
    cached_ids = cache.get('unreviewed_page_ids')
    if not cached_ids:
        # Nothing usable in the cache: fall back to the database-backed view.
        return next_global(request)
    chosen_id = random.choice(cached_ids)
    return Redirect('/page/%s/' % chosen_id)
from django.core.management.base import BaseCommand
from mp_expenses.expenses.models import Page
from django.core.cache import cache

class Command(BaseCommand):
    """Management command that stores unreviewed page ids in the cache.

    The ids are written under the 'unreviewed_page_ids' cache key, which
    next_global_from_cache reads.
    """

    help = """
   populate unreviewed_page_ids in memcached
   """
    requires_model_validation = True
    can_import_settings = True

    def handle(self, *args, **options):
        """Cache the primary keys of up to 1000 pages not marked reviewed."""
        # flat=True yields bare primary keys rather than 1-tuples. The
        # keyword was corrupted by a mis-decoded "fl" ligature in the slide.
        ids = list(Page.objects.exclude(
            is_reviewed=True
        ).values_list('pk', flat=True)[:1000])
        cache.set('unreviewed_page_ids', ids)
The numbers
Final thoughts

• High score tables help
• MP photographs really help
• Keeping up the interest is hard
• Next step: start releasing the data

More Related Content

What's hot

Cheap tricks for startups
Cheap tricks for startupsCheap tricks for startups
Cheap tricks for startupsSimon Willison
Ā 
Building Things Fast - and getting approval
Building Things Fast - and getting approvalBuilding Things Fast - and getting approval
Building Things Fast - and getting approvalSimon Willison
Ā 
Learning jQuery in 30 minutes
Learning jQuery in 30 minutesLearning jQuery in 30 minutes
Learning jQuery in 30 minutesSimon Willison
Ā 
Jqeury ajax plugins
Jqeury ajax pluginsJqeury ajax plugins
Jqeury ajax pluginsInbal Geffen
Ā 
Week 4 - jQuery + Ajax
Week 4 - jQuery + AjaxWeek 4 - jQuery + Ajax
Week 4 - jQuery + Ajaxbaygross
Ā 
A Short Introduction To jQuery
A Short Introduction To jQueryA Short Introduction To jQuery
A Short Introduction To jQuerySudar Muthu
Ā 
Jquery In Rails
Jquery In RailsJquery In Rails
Jquery In Railsshen liu
Ā 
jQuery quick tuts
jQuery quick tutsjQuery quick tuts
jQuery quick tutsNasa Vietnam
Ā 
Drupal Best Practices
Drupal Best PracticesDrupal Best Practices
Drupal Best Practicesmanugoel2003
Ā 
jQuery from the very beginning
jQuery from the very beginningjQuery from the very beginning
jQuery from the very beginningAnis Ahmad
Ā 
Jquery Best Practices
Jquery Best PracticesJquery Best Practices
Jquery Best Practicesbrinsknaps
Ā 
HTML5 and CSS3 Refresher
HTML5 and CSS3 RefresherHTML5 and CSS3 Refresher
HTML5 and CSS3 RefresherIvano Malavolta
Ā 
Datamapper @ Railsconf2010
Datamapper @ Railsconf2010Datamapper @ Railsconf2010
Datamapper @ Railsconf2010Dirkjan Bussink
Ā 
Tips of CakePHP and MongoDB - Cakefest2011 ichikaway
Tips of CakePHP and MongoDB - Cakefest2011 ichikaway Tips of CakePHP and MongoDB - Cakefest2011 ichikaway
Tips of CakePHP and MongoDB - Cakefest2011 ichikaway ichikaway
Ā 
Acceptance Testing with Webrat
Acceptance Testing with WebratAcceptance Testing with Webrat
Acceptance Testing with WebratLuismi Cavallé
Ā 
DataMapper @ RubyEnRails2009
DataMapper @ RubyEnRails2009DataMapper @ RubyEnRails2009
DataMapper @ RubyEnRails2009Dirkjan Bussink
Ā 
Php 102: Out with the Bad, In with the Good
Php 102: Out with the Bad, In with the GoodPhp 102: Out with the Bad, In with the Good
Php 102: Out with the Bad, In with the GoodJeremy Kendall
Ā 
Maintainable JavaScript 2012
Maintainable JavaScript 2012Maintainable JavaScript 2012
Maintainable JavaScript 2012Nicholas Zakas
Ā 
Leveraging the Power of Graph Databases in PHP
Leveraging the Power of Graph Databases in PHPLeveraging the Power of Graph Databases in PHP
Leveraging the Power of Graph Databases in PHPJeremy Kendall
Ā 

What's hot (20)

Cheap tricks for startups
Cheap tricks for startupsCheap tricks for startups
Cheap tricks for startups
Ā 
Building Things Fast - and getting approval
Building Things Fast - and getting approvalBuilding Things Fast - and getting approval
Building Things Fast - and getting approval
Ā 
Learning jQuery in 30 minutes
Learning jQuery in 30 minutesLearning jQuery in 30 minutes
Learning jQuery in 30 minutes
Ā 
Jqeury ajax plugins
Jqeury ajax pluginsJqeury ajax plugins
Jqeury ajax plugins
Ā 
Week 4 - jQuery + Ajax
Week 4 - jQuery + AjaxWeek 4 - jQuery + Ajax
Week 4 - jQuery + Ajax
Ā 
A Short Introduction To jQuery
A Short Introduction To jQueryA Short Introduction To jQuery
A Short Introduction To jQuery
Ā 
Jquery In Rails
Jquery In RailsJquery In Rails
Jquery In Rails
Ā 
jQuery quick tuts
jQuery quick tutsjQuery quick tuts
jQuery quick tuts
Ā 
Drupal Best Practices
Drupal Best PracticesDrupal Best Practices
Drupal Best Practices
Ā 
jQuery from the very beginning
jQuery from the very beginningjQuery from the very beginning
jQuery from the very beginning
Ā 
Jquery Best Practices
Jquery Best PracticesJquery Best Practices
Jquery Best Practices
Ā 
HTML5 and CSS3 Refresher
HTML5 and CSS3 RefresherHTML5 and CSS3 Refresher
HTML5 and CSS3 Refresher
Ā 
Datamapper @ Railsconf2010
Datamapper @ Railsconf2010Datamapper @ Railsconf2010
Datamapper @ Railsconf2010
Ā 
Tips of CakePHP and MongoDB - Cakefest2011 ichikaway
Tips of CakePHP and MongoDB - Cakefest2011 ichikaway Tips of CakePHP and MongoDB - Cakefest2011 ichikaway
Tips of CakePHP and MongoDB - Cakefest2011 ichikaway
Ā 
Acceptance Testing with Webrat
Acceptance Testing with WebratAcceptance Testing with Webrat
Acceptance Testing with Webrat
Ā 
Advanced Django
Advanced DjangoAdvanced Django
Advanced Django
Ā 
DataMapper @ RubyEnRails2009
DataMapper @ RubyEnRails2009DataMapper @ RubyEnRails2009
DataMapper @ RubyEnRails2009
Ā 
Php 102: Out with the Bad, In with the Good
Php 102: Out with the Bad, In with the GoodPhp 102: Out with the Bad, In with the Good
Php 102: Out with the Bad, In with the Good
Ā 
Maintainable JavaScript 2012
Maintainable JavaScript 2012Maintainable JavaScript 2012
Maintainable JavaScript 2012
Ā 
Leveraging the Power of Graph Databases in PHP
Leveraging the Power of Graph Databases in PHPLeveraging the Power of Graph Databases in PHP
Leveraging the Power of Graph Databases in PHP
Ā 

Similar to Crowdsourcing with Django

A brief history of Django model syntax
A brief history of Django model syntaxA brief history of Django model syntax
A brief history of Django model syntaxJacob Kaplan-Moss
Ā 
Python Development (MongoSF)
Python Development (MongoSF)Python Development (MongoSF)
Python Development (MongoSF)Mike Dirolf
Ā 
Inside PyMongo - MongoNYC
Inside PyMongo - MongoNYCInside PyMongo - MongoNYC
Inside PyMongo - MongoNYCMike Dirolf
Ā 
Jython: Python para la plataforma Java (EL2009)
Jython: Python para la plataforma Java (EL2009)Jython: Python para la plataforma Java (EL2009)
Jython: Python para la plataforma Java (EL2009)Leonardo Soto
Ā 
Django workshop : let's make a blog
Django workshop : let's make a blogDjango workshop : let's make a blog
Django workshop : let's make a blogPierre Sudron
Ā 
Jython: Python para la plataforma Java (JRSL 09)
Jython: Python para la plataforma Java (JRSL 09)Jython: Python para la plataforma Java (JRSL 09)
Jython: Python para la plataforma Java (JRSL 09)Leonardo Soto
Ā 
ORM in Django
ORM in DjangoORM in Django
ORM in DjangoHoang Nguyen
Ā 
RubyBarCamp “Полезные gems и plugins”
RubyBarCamp “Полезные gems и plugins”RubyBarCamp “Полезные gems и plugins”
RubyBarCamp “Полезные gems и plugins”apostlion
Ā 
Type safe embedded domain-specific languages
Type safe embedded domain-specific languagesType safe embedded domain-specific languages
Type safe embedded domain-specific languagesArthur Xavier
Ā 
Build your own entity with Drupal
Build your own entity with DrupalBuild your own entity with Drupal
Build your own entity with DrupalMarco Vito Moscaritolo
Ā 
Introduction to Django
Introduction to DjangoIntroduction to Django
Introduction to DjangoJoaquim Rocha
Ā 
Django Forms: Best Practices, Tips, Tricks
Django Forms: Best Practices, Tips, TricksDjango Forms: Best Practices, Tips, Tricks
Django Forms: Best Practices, Tips, TricksShawn Rider
Ā 
Django in the Office: Get Your Admin for Nothing and Your SQL for Free
Django in the Office: Get Your Admin for Nothing and Your SQL for FreeDjango in the Office: Get Your Admin for Nothing and Your SQL for Free
Django in the Office: Get Your Admin for Nothing and Your SQL for FreeHarvard Web Working Group
Ā 
Mongoid in the real world
Mongoid in the real worldMongoid in the real world
Mongoid in the real worldKevin Faustino
Ā 
A Basic Django Introduction
A Basic Django IntroductionA Basic Django Introduction
A Basic Django IntroductionGanga Ram
Ā 

Similar to Crowdsourcing with Django (20)

A brief history of Django model syntax
A brief history of Django model syntaxA brief history of Django model syntax
A brief history of Django model syntax
Ā 
Python Development (MongoSF)
Python Development (MongoSF)Python Development (MongoSF)
Python Development (MongoSF)
Ā 
Django Search
Django SearchDjango Search
Django Search
Ā 
JQuery Flot
JQuery FlotJQuery Flot
JQuery Flot
Ā 
Inside PyMongo - MongoNYC
Inside PyMongo - MongoNYCInside PyMongo - MongoNYC
Inside PyMongo - MongoNYC
Ā 
Capybara
CapybaraCapybara
Capybara
Ā 
Epic South Disasters
Epic South DisastersEpic South Disasters
Epic South Disasters
Ā 
Django - sql alchemy - jquery
Django - sql alchemy - jqueryDjango - sql alchemy - jquery
Django - sql alchemy - jquery
Ā 
Jython: Python para la plataforma Java (EL2009)
Jython: Python para la plataforma Java (EL2009)Jython: Python para la plataforma Java (EL2009)
Jython: Python para la plataforma Java (EL2009)
Ā 
Django workshop : let's make a blog
Django workshop : let's make a blogDjango workshop : let's make a blog
Django workshop : let's make a blog
Ā 
Jython: Python para la plataforma Java (JRSL 09)
Jython: Python para la plataforma Java (JRSL 09)Jython: Python para la plataforma Java (JRSL 09)
Jython: Python para la plataforma Java (JRSL 09)
Ā 
ORM in Django
ORM in DjangoORM in Django
ORM in Django
Ā 
RubyBarCamp “Полезные gems и plugins”
RubyBarCamp “Полезные gems и plugins”RubyBarCamp “Полезные gems и plugins”
RubyBarCamp “Полезные gems и plugins”
Ā 
Type safe embedded domain-specific languages
Type safe embedded domain-specific languagesType safe embedded domain-specific languages
Type safe embedded domain-specific languages
Ā 
Build your own entity with Drupal
Build your own entity with DrupalBuild your own entity with Drupal
Build your own entity with Drupal
Ā 
Introduction to Django
Introduction to DjangoIntroduction to Django
Introduction to Django
Ā 
Django Forms: Best Practices, Tips, Tricks
Django Forms: Best Practices, Tips, TricksDjango Forms: Best Practices, Tips, Tricks
Django Forms: Best Practices, Tips, Tricks
Ā 
Django in the Office: Get Your Admin for Nothing and Your SQL for Free
Django in the Office: Get Your Admin for Nothing and Your SQL for FreeDjango in the Office: Get Your Admin for Nothing and Your SQL for Free
Django in the Office: Get Your Admin for Nothing and Your SQL for Free
Ā 
Mongoid in the real world
Mongoid in the real worldMongoid in the real world
Mongoid in the real world
Ā 
A Basic Django Introduction
A Basic Django IntroductionA Basic Django Introduction
A Basic Django Introduction
Ā 

More from Simon Willison

How Lanyrd does Geo
How Lanyrd does GeoHow Lanyrd does Geo
How Lanyrd does GeoSimon Willison
Ā 
How we bootstrapped Lanyrd using Twitter's social graph
How we bootstrapped Lanyrd using Twitter's social graphHow we bootstrapped Lanyrd using Twitter's social graph
How we bootstrapped Lanyrd using Twitter's social graphSimon Willison
Ā 
Web Services for Fun and Profit
Web Services for Fun and ProfitWeb Services for Fun and Profit
Web Services for Fun and ProfitSimon Willison
Ā 
Tricks & challenges developing a large Django application
Tricks & challenges developing a large Django applicationTricks & challenges developing a large Django application
Tricks & challenges developing a large Django applicationSimon Willison
Ā 
Advanced Aspects of the Django Ecosystem: Haystack, Celery & Fabric
Advanced Aspects of the Django Ecosystem: Haystack, Celery & FabricAdvanced Aspects of the Django Ecosystem: Haystack, Celery & Fabric
Advanced Aspects of the Django Ecosystem: Haystack, Celery & FabricSimon Willison
Ā 
How Lanyrd uses Twitter
How Lanyrd uses TwitterHow Lanyrd uses Twitter
How Lanyrd uses TwitterSimon Willison
Ā 
Building crowdsourcing applications
Building crowdsourcing applicationsBuilding crowdsourcing applications
Building crowdsourcing applicationsSimon Willison
Ā 
Evented I/O based web servers, explained using bunnies
Evented I/O based web servers, explained using bunniesEvented I/O based web servers, explained using bunnies
Evented I/O based web servers, explained using bunniesSimon Willison
Ā 
Class-based views with Django
Class-based views with DjangoClass-based views with Django
Class-based views with DjangoSimon Willison
Ā 
Web App Security Horror Stories
Web App Security Horror StoriesWeb App Security Horror Stories
Web App Security Horror StoriesSimon Willison
Ā 
Web Security Horror Stories
Web Security Horror StoriesWeb Security Horror Stories
Web Security Horror StoriesSimon Willison
Ā 
When Zeppelins Ruled The Earth
When Zeppelins Ruled The EarthWhen Zeppelins Ruled The Earth
When Zeppelins Ruled The EarthSimon Willison
Ā 
When Ajax Attacks! Web application security fundamentals
When Ajax Attacks! Web application security fundamentalsWhen Ajax Attacks! Web application security fundamentals
When Ajax Attacks! Web application security fundamentalsSimon Willison
Ā 
I love Zeppelins, and you should too
I love Zeppelins, and you should tooI love Zeppelins, and you should too
I love Zeppelins, and you should tooSimon Willison
Ā 
OpenID at Open Tech 2008
OpenID at Open Tech 2008OpenID at Open Tech 2008
OpenID at Open Tech 2008Simon Willison
Ā 
Going Live! with Comet
Going Live! with CometGoing Live! with Comet
Going Live! with CometSimon Willison
Ā 
URL-based identity with OpenID
URL-based identity with OpenIDURL-based identity with OpenID
URL-based identity with OpenIDSimon Willison
Ā 

More from Simon Willison (20)

How Lanyrd does Geo
How Lanyrd does GeoHow Lanyrd does Geo
How Lanyrd does Geo
Ā 
Building Lanyrd
Building LanyrdBuilding Lanyrd
Building Lanyrd
Ā 
How we bootstrapped Lanyrd using Twitter's social graph
How we bootstrapped Lanyrd using Twitter's social graphHow we bootstrapped Lanyrd using Twitter's social graph
How we bootstrapped Lanyrd using Twitter's social graph
Ā 
Web Services for Fun and Profit
Web Services for Fun and ProfitWeb Services for Fun and Profit
Web Services for Fun and Profit
Ā 
Tricks & challenges developing a large Django application
Tricks & challenges developing a large Django applicationTricks & challenges developing a large Django application
Tricks & challenges developing a large Django application
Ā 
Advanced Aspects of the Django Ecosystem: Haystack, Celery & Fabric
Advanced Aspects of the Django Ecosystem: Haystack, Celery & FabricAdvanced Aspects of the Django Ecosystem: Haystack, Celery & Fabric
Advanced Aspects of the Django Ecosystem: Haystack, Celery & Fabric
Ā 
How Lanyrd uses Twitter
How Lanyrd uses TwitterHow Lanyrd uses Twitter
How Lanyrd uses Twitter
Ā 
ScaleFail
ScaleFailScaleFail
ScaleFail
Ā 
Building crowdsourcing applications
Building crowdsourcing applicationsBuilding crowdsourcing applications
Building crowdsourcing applications
Ā 
Evented I/O based web servers, explained using bunnies
Evented I/O based web servers, explained using bunniesEvented I/O based web servers, explained using bunnies
Evented I/O based web servers, explained using bunnies
Ā 
Django Heresies
Django HeresiesDjango Heresies
Django Heresies
Ā 
Class-based views with Django
Class-based views with DjangoClass-based views with Django
Class-based views with Django
Ā 
Web App Security Horror Stories
Web App Security Horror StoriesWeb App Security Horror Stories
Web App Security Horror Stories
Ā 
Web Security Horror Stories
Web Security Horror StoriesWeb Security Horror Stories
Web Security Horror Stories
Ā 
When Zeppelins Ruled The Earth
When Zeppelins Ruled The EarthWhen Zeppelins Ruled The Earth
When Zeppelins Ruled The Earth
Ā 
When Ajax Attacks! Web application security fundamentals
When Ajax Attacks! Web application security fundamentalsWhen Ajax Attacks! Web application security fundamentals
When Ajax Attacks! Web application security fundamentals
Ā 
I love Zeppelins, and you should too
I love Zeppelins, and you should tooI love Zeppelins, and you should too
I love Zeppelins, and you should too
Ā 
OpenID at Open Tech 2008
OpenID at Open Tech 2008OpenID at Open Tech 2008
OpenID at Open Tech 2008
Ā 
Going Live! with Comet
Going Live! with CometGoing Live! with Comet
Going Live! with Comet
Ā 
URL-based identity with OpenID
URL-based identity with OpenIDURL-based identity with OpenID
URL-based identity with OpenID
Ā 

Recently uploaded

AWS Community Day CPH - Three problems of Terraform
AWS Community Day CPH - Three problems of TerraformAWS Community Day CPH - Three problems of Terraform
AWS Community Day CPH - Three problems of TerraformAndrey Devyatkin
Ā 
Ransomware_Q4_2023. The report. [EN].pdf
Ransomware_Q4_2023. The report. [EN].pdfRansomware_Q4_2023. The report. [EN].pdf
Ransomware_Q4_2023. The report. [EN].pdfOverkill Security
Ā 
Navigating the Deluge_ Dubai Floods and the Resilience of Dubai International...
Navigating the Deluge_ Dubai Floods and the Resilience of Dubai International...Navigating the Deluge_ Dubai Floods and the Resilience of Dubai International...
Navigating the Deluge_ Dubai Floods and the Resilience of Dubai International...Orbitshub
Ā 
Apidays New York 2024 - Scaling API-first by Ian Reasor and Radu Cotescu, Adobe
Apidays New York 2024 - Scaling API-first by Ian Reasor and Radu Cotescu, AdobeApidays New York 2024 - Scaling API-first by Ian Reasor and Radu Cotescu, Adobe
Apidays New York 2024 - Scaling API-first by Ian Reasor and Radu Cotescu, Adobeapidays
Ā 
Why Teams call analytics are critical to your entire business
Why Teams call analytics are critical to your entire businessWhy Teams call analytics are critical to your entire business
Why Teams call analytics are critical to your entire businesspanagenda
Ā 
[BuildWithAI] Introduction to Gemini.pdf
[BuildWithAI] Introduction to Gemini.pdf[BuildWithAI] Introduction to Gemini.pdf
[BuildWithAI] Introduction to Gemini.pdfSandro Moreira
Ā 
Boost Fertility New Invention Ups Success Rates.pdf
Boost Fertility New Invention Ups Success Rates.pdfBoost Fertility New Invention Ups Success Rates.pdf
Boost Fertility New Invention Ups Success Rates.pdfsudhanshuwaghmare1
Ā 
Axa Assurance Maroc - Insurer Innovation Award 2024
Axa Assurance Maroc - Insurer Innovation Award 2024Axa Assurance Maroc - Insurer Innovation Award 2024
Axa Assurance Maroc - Insurer Innovation Award 2024The Digital Insurer
Ā 
Connector Corner: Accelerate revenue generation using UiPath API-centric busi...
Connector Corner: Accelerate revenue generation using UiPath API-centric busi...Connector Corner: Accelerate revenue generation using UiPath API-centric busi...
Connector Corner: Accelerate revenue generation using UiPath API-centric busi...DianaGray10
Ā 
ProductAnonymous-April2024-WinProductDiscovery-MelissaKlemke
ProductAnonymous-April2024-WinProductDiscovery-MelissaKlemkeProductAnonymous-April2024-WinProductDiscovery-MelissaKlemke
ProductAnonymous-April2024-WinProductDiscovery-MelissaKlemkeProduct Anonymous
Ā 
Manulife - Insurer Transformation Award 2024
Manulife - Insurer Transformation Award 2024Manulife - Insurer Transformation Award 2024
Manulife - Insurer Transformation Award 2024The Digital Insurer
Ā 
"I see eyes in my soup": How Delivery Hero implemented the safety system for ...
"I see eyes in my soup": How Delivery Hero implemented the safety system for ..."I see eyes in my soup": How Delivery Hero implemented the safety system for ...
"I see eyes in my soup": How Delivery Hero implemented the safety system for ...Zilliz
Ā 
Apidays New York 2024 - APIs in 2030: The Risk of Technological Sleepwalk by ...
Apidays New York 2024 - APIs in 2030: The Risk of Technological Sleepwalk by ...Apidays New York 2024 - APIs in 2030: The Risk of Technological Sleepwalk by ...
Apidays New York 2024 - APIs in 2030: The Risk of Technological Sleepwalk by ...apidays
Ā 
Cloud Frontiers: A Deep Dive into Serverless Spatial Data and FME
Cloud Frontiers:  A Deep Dive into Serverless Spatial Data and FMECloud Frontiers:  A Deep Dive into Serverless Spatial Data and FME
Cloud Frontiers: A Deep Dive into Serverless Spatial Data and FMESafe Software
Ā 
Cyberprint. Dark Pink Apt Group [EN].pdf
Cyberprint. Dark Pink Apt Group [EN].pdfCyberprint. Dark Pink Apt Group [EN].pdf
Cyberprint. Dark Pink Apt Group [EN].pdfOverkill Security
Ā 
EMPOWERMENT TECHNOLOGY GRADE 11 QUARTER 2 REVIEWER
EMPOWERMENT TECHNOLOGY GRADE 11 QUARTER 2 REVIEWEREMPOWERMENT TECHNOLOGY GRADE 11 QUARTER 2 REVIEWER
EMPOWERMENT TECHNOLOGY GRADE 11 QUARTER 2 REVIEWERMadyBayot
Ā 
DEV meet-up UiPath Document Understanding May 7 2024 Amsterdam
DEV meet-up UiPath Document Understanding May 7 2024 AmsterdamDEV meet-up UiPath Document Understanding May 7 2024 Amsterdam
DEV meet-up UiPath Document Understanding May 7 2024 AmsterdamUiPathCommunity
Ā 
MS Copilot expands with MS Graph connectors
MS Copilot expands with MS Graph connectorsMS Copilot expands with MS Graph connectors
MS Copilot expands with MS Graph connectorsNanddeep Nachan
Ā 
Cloud Frontiers: A Deep Dive into Serverless Spatial Data and FME
Cloud Frontiers:  A Deep Dive into Serverless Spatial Data and FMECloud Frontiers:  A Deep Dive into Serverless Spatial Data and FME
Cloud Frontiers: A Deep Dive into Serverless Spatial Data and FMESafe Software
Ā 

Recently uploaded (20)

+971581248768>> SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHA...
+971581248768>> SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHA...+971581248768>> SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHA...
+971581248768>> SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHA...
Ā 
AWS Community Day CPH - Three problems of Terraform
AWS Community Day CPH - Three problems of TerraformAWS Community Day CPH - Three problems of Terraform
AWS Community Day CPH - Three problems of Terraform
Ā 
Ransomware_Q4_2023. The report. [EN].pdf
Ransomware_Q4_2023. The report. [EN].pdfRansomware_Q4_2023. The report. [EN].pdf
Ransomware_Q4_2023. The report. [EN].pdf
Ā 
Navigating the Deluge_ Dubai Floods and the Resilience of Dubai International...
Navigating the Deluge_ Dubai Floods and the Resilience of Dubai International...Navigating the Deluge_ Dubai Floods and the Resilience of Dubai International...
Navigating the Deluge_ Dubai Floods and the Resilience of Dubai International...
Ā 
Apidays New York 2024 - Scaling API-first by Ian Reasor and Radu Cotescu, Adobe
Apidays New York 2024 - Scaling API-first by Ian Reasor and Radu Cotescu, AdobeApidays New York 2024 - Scaling API-first by Ian Reasor and Radu Cotescu, Adobe
Apidays New York 2024 - Scaling API-first by Ian Reasor and Radu Cotescu, Adobe
Ā 
Why Teams call analytics are critical to your entire business
Why Teams call analytics are critical to your entire businessWhy Teams call analytics are critical to your entire business
Why Teams call analytics are critical to your entire business
Ā 
[BuildWithAI] Introduction to Gemini.pdf
[BuildWithAI] Introduction to Gemini.pdf[BuildWithAI] Introduction to Gemini.pdf
[BuildWithAI] Introduction to Gemini.pdf
Ā 
Boost Fertility New Invention Ups Success Rates.pdf
Boost Fertility New Invention Ups Success Rates.pdfBoost Fertility New Invention Ups Success Rates.pdf
Boost Fertility New Invention Ups Success Rates.pdf
Ā 
Axa Assurance Maroc - Insurer Innovation Award 2024
Axa Assurance Maroc - Insurer Innovation Award 2024Axa Assurance Maroc - Insurer Innovation Award 2024
Axa Assurance Maroc - Insurer Innovation Award 2024
Ā 
Connector Corner: Accelerate revenue generation using UiPath API-centric busi...
Connector Corner: Accelerate revenue generation using UiPath API-centric busi...Connector Corner: Accelerate revenue generation using UiPath API-centric busi...
Connector Corner: Accelerate revenue generation using UiPath API-centric busi...
Ā 
ProductAnonymous-April2024-WinProductDiscovery-MelissaKlemke
ProductAnonymous-April2024-WinProductDiscovery-MelissaKlemkeProductAnonymous-April2024-WinProductDiscovery-MelissaKlemke
ProductAnonymous-April2024-WinProductDiscovery-MelissaKlemke
Ā 
Manulife - Insurer Transformation Award 2024
Manulife - Insurer Transformation Award 2024Manulife - Insurer Transformation Award 2024
Manulife - Insurer Transformation Award 2024
Ā 
"I see eyes in my soup": How Delivery Hero implemented the safety system for ...
"I see eyes in my soup": How Delivery Hero implemented the safety system for ..."I see eyes in my soup": How Delivery Hero implemented the safety system for ...
"I see eyes in my soup": How Delivery Hero implemented the safety system for ...
Ā 
Apidays New York 2024 - APIs in 2030: The Risk of Technological Sleepwalk by ...
Apidays New York 2024 - APIs in 2030: The Risk of Technological Sleepwalk by ...Apidays New York 2024 - APIs in 2030: The Risk of Technological Sleepwalk by ...
Apidays New York 2024 - APIs in 2030: The Risk of Technological Sleepwalk by ...
Ā 
Cloud Frontiers: A Deep Dive into Serverless Spatial Data and FME
Cloud Frontiers:  A Deep Dive into Serverless Spatial Data and FMECloud Frontiers:  A Deep Dive into Serverless Spatial Data and FME
Cloud Frontiers: A Deep Dive into Serverless Spatial Data and FME
Ā 
Cyberprint. Dark Pink Apt Group [EN].pdf
Cyberprint. Dark Pink Apt Group [EN].pdfCyberprint. Dark Pink Apt Group [EN].pdf
Cyberprint. Dark Pink Apt Group [EN].pdf
Ā 
EMPOWERMENT TECHNOLOGY GRADE 11 QUARTER 2 REVIEWER
EMPOWERMENT TECHNOLOGY GRADE 11 QUARTER 2 REVIEWEREMPOWERMENT TECHNOLOGY GRADE 11 QUARTER 2 REVIEWER
EMPOWERMENT TECHNOLOGY GRADE 11 QUARTER 2 REVIEWER
Ā 
DEV meet-up UiPath Document Understanding May 7 2024 Amsterdam
DEV meet-up UiPath Document Understanding May 7 2024 AmsterdamDEV meet-up UiPath Document Understanding May 7 2024 Amsterdam
DEV meet-up UiPath Document Understanding May 7 2024 Amsterdam
Ā 
MS Copilot expands with MS Graph connectors
MS Copilot expands with MS Graph connectorsMS Copilot expands with MS Graph connectors
MS Copilot expands with MS Graph connectors
Ā 
Cloud Frontiers: A Deep Dive into Serverless Spatial Data and FME
Cloud Frontiers:  A Deep Dive into Serverless Spatial Data and FMECloud Frontiers:  A Deep Dive into Serverless Spatial Data and FME
Cloud Frontiers: A Deep Dive into Serverless Spatial Data and FME
Ā 

Crowdsourcing with Django

  • 1. Crowdsourcing with Django EuroPython, 30th June 2009 Simon Willison · http://simonwillison.net/ · @simonw
  • 4. November 2000 The Freedom of Information Act
  • 5. Heather Brooke • http://www.guardian.co.uk/politics/2009/may/08/mps-expenses-telegraph-checquebook-journalism • http://www.guardian.co.uk/politics/2009/may/15/mps-expenses-heather-brooke-foi
  • 7. January 2005 The FOI request
  • 8. July 2006 The FOI commissioner
  • 9. May 2007 The FOI (Amendment) Bill
  • 14.
  • 15.
  • 16. March 2009 The mole
  • 17. “All of the receipts of 650-odd MPs, redacted and unredacted, are for sale at a price of £300,000, so I am told. The price is going up because of the interest in the subject.” Sir Stuart Bell, MP Newsnight, 30th March
  • 18. 8th May, 2009 The Daily Telegraph
  • 20. April: “Expenses are due out in a couple of months, is there anything we can do?”
  • 21. June: “Expenses have been bumped forward, they’re out next week!”
  • 22. Thursday 11th June The proof-of-concept
  • 23. Monday 15th June The tentative go-ahead
  • 24. Tuesday 16th June Designer + client-side engineer
  • 25. Wednesday 17th June Operations engineer
  • 26. Thursday 18th June Launch day!
  • 27.
  • 28.
  • 29.
  • 30.
  • 31.
  • 32.
  • 34.
  • 35.
  • 37.
  • 39. class Party(models.Model): name = models.CharField(max_length=100) class Constituency(models.Model): name = models.CharField(max_length=100) class MP(models.Model): name = models.CharField(max_length=100) party = models.ForeignKey(Party) constituency = models.ForeignKey(Constituency) guardian_url = models.CharField(max_length=255, blank=True) guardian_image_url = models.CharField(max_length=255, blank=True)
  • 40. class FinancialYear(models.Model): name = models.CharField(max_length=10) class Document(models.Model): title = models.CharField(max_length=100, blank=True) filename = models.CharField(max_length=100) mp = models.ForeignKey(MP) financial_year = models.ForeignKey(FinancialYear) class Page(models.Model): document = models.ForeignKey(Document) page_number = models.IntegerField()
  • 41. class User(models.Model): created = models.DateTimeField(auto_now_add = True) username = models.TextField(max_length = 100) password_hash = models.CharField(max_length = 128, blank=True) class LineItemCategory(models.Model): order = models.IntegerField(default = 0) name = models.CharField(max_length = 32) class LineItem(models.Model): user = models.ForeignKey(User) page = models.ForeignKey(Page) type = models.CharField(max_length = 16, choices = ( ('claim', 'claim'), ('proof', 'proof'), ), db_index = True) date = models.DateField(null = True, blank = True) amount = models.DecimalField(max_digits=20, decimal_places=2) description = models.CharField(max_length = 255, blank = True) created = models.DateTimeField(auto_now_add = True, db_index = True) categories = models.ManyToManyField(LineItemCategory, blank=True)
  • 42. class Vote(models.Model): user = models.ForeignKey(User, related_name = 'votes') page = models.ForeignKey(Page, related_name = 'votes') obsolete = models.BooleanField(default = False) vote_type = models.CharField(max_length = 32, blank = True) ip_address = models.CharField(max_length = 32) created = models.DateTimeField(auto_now_add = True) class TypeVote(Vote): type = models.CharField(max_length = 10, choices = ( ('claim', 'Claim'), ('proof', 'Proof'), ('blank', 'Blank'), ('other', 'Other') )) class InterestingVote(Vote): status = models.CharField(max_length = 10, choices = ( ('no', 'Not interesting'), ('yes', 'Interesting'), ('known', 'Interesting but known'), ('very', 'Investigate this!'), ))
  • 44.
  • 46.
  • 47. page_filters = ( # Maps name of filter to dictionary of kwargs to doc.pages.filter() ('reviewed', { 'votes__isnull': False }), ('unreviewed', { 'votes__isnull': True }), ('with line items', { 'line_items__isnull': False }), ('interesting', { 'votes__interestingvote__status': 'yes' }), ('interesting but known', { 'votes__interestingvote__status': 'known' ... ) page_filters_lookup = dict(page_filters)
  • 48. pages = doc.pages.all() if page_filter: kwargs = page_filters_lookup.get(page_filter) if kwargs is None: raise Http404, 'Invalid page filter: %s' % page_filter pages = pages.filter(**kwargs).distinct() # Build the filters filters = [] for name, kwargs in page_filters: filters.append({ 'name': name, 'count': doc.pages.filter(**kwargs).distinct().count(), })
  • 52.
  • 53.
  • 54.
  • 55. def get_mp_pages(): "Returns list of (mp-name, mp-page-url) tuples" soup = Soup(urllib.urlopen(INDEX_URL)) mp_links = [] for link in soup.findAll('a'): if link.get('title', '').endswith("'s allowances"): mp_links.append( (link['title'].replace("'s allowances", ''), link['href']) ) return mp_links
  • 56. def get_pdfs(mp_url): "Returns list of (description, years, pdf-url, size) tuples" soup = Soup(urllib.urlopen(mp_url)) pdfs = [] trs = soup.findAll('tr')[1:] # Skip the first, it's the table header for tr in trs: name_td, year_td, pdf_td = tr.findAll('td') name = name_td.string year = year_td.string pdf_url = pdf_td.find('a')['href'] size = pdf_td.find('a').contents[-1].replace('(', '').replace(')', '') pdfs.append( (name, year, pdf_url, size) ) return pdfs
  • 57.
  • 58.
  • 59.
  • 61. Photoshop + AppleScript vs. Java + IntelliJ
  • 62. Images on our docroot (S3 upload was taking too long)
  • 65.
  • 66. Crash #1: more Apache children than MySQL connections
  • 67.
  • 68.
  • 69. unreviewed_count = Page.objects.filter( votes__isnull = True ).distinct().count()
  • 70. SELECT COUNT(DISTINCT `expenses_page`.`id`) FROM `expenses_page` LEFT OUTER JOIN `expenses_vote` ON ( `expenses_page`.`id` = `expenses_vote`.`page_id` ) WHERE `expenses_vote`.`id` IS NULL
  • 71. unreviewed_count = cache.get('homepage:unreviewed_count') if unreviewed_count is None: unreviewed_count = Page.objects.filter( votes__isnull = True ).distinct().count() cache.set('homepage:unreviewed_count', unreviewed_count, 60)
  • 72. • With 70,000 pages and a LOT of votes... • DB takes up 135% of CPU • Cache the count in memcached... • DB drops to 35% of CPU
  • 73. unreviewed_count = Page.objects.filter( votes__isnull = True ).distinct().count() reviewed_count = Page.objects.filter( votes__isnull = False ).distinct().count()
  • 74. unreviewed_count = Page.objects.filter( is_reviewed = False ).count()
  • 75. Migrating to InnoDB on a separate server
  • 76. ssh mps-live "mysqldump mp_expenses" | sed 's/ENGINE=MyISAM/ENGINE=InnoDB/g' | sed 's/CHARSET=latin1/CHARSET=utf8/g' | ssh mysql-big "mysql -u root mp_expenses"
  • 78. def next_global(request): # Next unreviewed page from the whole site all_unreviewed_pages = Page.objects.filter( is_reviewed = False ).order_by('?') if all_unreviewed_pages: return Redirect( all_unreviewed_pages[0].get_absolute_url() ) else: return HttpResponse( 'All pages have been reviewed!' )
  • 79. import random def next_global_from_cache(request): page_ids = cache.get('unreviewed_page_ids') if page_ids: return Redirect( '/page/%s/' % random.choice(page_ids) ) else: return next_global(request)
  • 80. from django.core.management.base import BaseCommand from mp_expenses.expenses.models import Page from django.core.cache import cache class Command(BaseCommand): help = """ populate unreviewed_page_ids in memcached """ requires_model_validation = True can_import_settings = True def handle(self, *args, **options): ids = list(Page.objects.exclude( is_reviewed = True ).values_list('pk', flat=True)[:1000]) cache.set('unreviewed_page_ids', ids)
  • 82.
  • 83. Final thoughts • High score tables help • MP photographs really help • Keeping up the interest is hard • Next step: start releasing the data