TEACHING YOUR
MACHINE
TO FIND
FRAUDSTERS

Ian Barber
ianb@php.net
phpir.com
twitter.com/ianbarber
http://joind.in/3429




https://github.com/ianbarber/FindingFraudsters-Talk
5%
           3%
SOME      .1%
SMALL
NUMBERS    8%
99%
ACCURACY
REALLY     REALLY
             LEGITIMATE   FRAUD


EVALUATED
                989         0
LEGITIMATE


EVALUATED
                 10         1
  FRAUD
REALLY     REALLY
             LEGITIMATE   FRAUD



      90%
EVALUATED
LEGITIMATE
          WRONG   989         0



EVALUATED
                 10         1
  FRAUD
ANOMALY DETECTION
30




         22.5
Clicks




          15




          7.5




           0
                Date
SOFTWARE
ARCHITECTURE
                           Alarm

               Detector

                          No Alarm
                Buffer


User Clicks    Landing
    Ad          Page
DETECTOR
              statistics

 Expected
  Clicks
              Threshold    Data Buffer
Sensitivity



               Alarm
average.php
/**
 * Moving-average click detector (average.php).
 *
 * Reads fraudclicks.csv from the current directory, using column 1 of each
 * row as the click count. Once more than seven earlier rows are buffered,
 * the oldest is dropped and the current row raises an alarm when its count
 * exceeds mean + $sen * standard deviation of the seven buffered values.
 *
 * In the slide deck this listing is split in two by a distribution chart
 * (axis ticks 0 to 0.2 and 1 to 20); the two halves are joined here.
 *
 * @param float $sen Number of standard deviations above the window mean
 *                   that a row must exceed to raise an alarm.
 * @return array array(alarm count minus one, number of non-alarm rows seen
 *               after row 201). Presumably row 201 is where the fraud
 *               starts in the data set, and the "minus one" discounts the
 *               alarm that ends the scan; confirm against the data file.
 * @throws RuntimeException when fraudclicks.csv cannot be opened.
 */
function detect($sen) {
  $windowSize = 7;
  $fraudStart = 201;
  $window = array();
  $i = 0;
  $alarmCount = 0;
  $dtd = 0;

  $fraud = fopen("fraudclicks.csv", 'r');
  if ($fraud === false) {
    throw new RuntimeException("Unable to open fraudclicks.csv");
  }

  while (($d = fgetcsv($fraud)) !== false) {
    $i++;
    // A blank line comes back as array(null): count the row but skip it.
    if (!isset($d[1])) {
      continue;
    }

    if (count($window) > $windowSize) {
      array_shift($window);
      $avg = array_sum($window) / $windowSize;

      // Start the sum of squared deviations from zero for every row; the
      // previous row's standard deviation must not leak into this one.
      $sumSquares = 0;
      foreach ($window as $val) {
        $sumSquares += pow($val - $avg, 2);
      }
      $stddev = sqrt($sumSquares / $windowSize);

      if ($d[1] > ($avg + ($sen * $stddev))) {
        $alarmCount++;
        if ($i > $fraudStart) {
          break; // first alarm past row 201 ends the scan
        }
      } else {
        if ($i > $fraudStart) {
          $dtd++;
        }
      }
    }
    array_push($window, $d[1]);
  }

  fclose($fraud);
  return array($alarmCount - 1, $dtd);
}
1.6 SENSITIVITY
          30
                18 False Alarms          1 Day To Detect

         22.5
Clicks




          15




          7.5




           0
                                  Date
2.7 SENSITIVITY
          30
                1 False Alarm      18 Days To Detect

         22.5
Clicks




          15




          7.5




           0
                                Date
SICKNESS
AVAILABILITY
function detect($sens) {          sickavail.php
  $i = 0; $alarms = 0; $dtd = 0;
  $window = array(); $avail = array();
  $fraud = fopen("fraudclicks.csv", 'r');
  while($dat = fgetcsv($fraud)) {
    $dow = date("w", strtotime($dat[0]));
    if( count($window) >= 7
        && isset($avail[$dow]) ) {

      $sick = 0;
      foreach($window as $day => $value) {
        $dowavg = array_sum($avail[$day]) /
                  count($avail[$day]);
        $sick += $value / $dowavg;
      }
      $sick /= count($window);
$avlblty = array_sum($avail[$dow]) /
           count($avail[$dow]);
  $est = $sick * $avlblty;

  $fac = fac($dat[1]);
  $p = exp(-$est) * pow($est,$dat[1])
       / $fac; // poisson calc

  if($p < $sens && $dat[1] > $est) {
    $alarms++;
    if($i > 201) { break; }
  } else {
    if($i > 201) { $dtd++; }
  }

} // end if
0.2




0.15




 0.1




0.05




  0
       1   2   3   4   5   6   7   8   9   10
0.011 SENSITIVITY
          30
                1 False Alarm          1 Day To Detect

         22.5
Clicks




          15




          7.5




           0
                                Date
SUPERVISED CLASSIFIERS
classification model
SOFTWARE
ARCHITECTURE
                               Fraud

            Classifier

                             Not Fraud
  User     Transaction
Purchase    Processor


           Transaction
                              Learner
            Database
EVALUATING THE CLASSIFIER

Training Data   Learner      Model




 Test Data
                            Prediction
                Classifier   Accuracy
   Model
20




15




10




5




0
     0   5   10   15   20
20




15




10




5
             ?
0
     0   5       10   15   20
20




15




10




5
             ?
0
     0   5       10   15   20
// Labelled example transactions used as the training set: a fraud flag,
// a price, a short description and a shipping country code.
$docs = array(
  array('fraud' => false, 'price' => 1699,  'desc' => 'toy ninja', 'ship' => 'US'),
  array('fraud' => false, 'price' => 20000, 'desc' => 'TV',        'ship' => 'US'),
  array('fraud' => false, 'price' => 2500,  'desc' => 'cds',       'ship' => 'US'),
  array('fraud' => true,  'price' => 20000, 'desc' => 'console',   'ship' => 'CN'),
  array('fraud' => true,  'price' => 5000,  'desc' => 'books',     'ship' => 'US'),
  array('fraud' => true,  'price' => 15000, 'desc' => 'ipod',      'ship' => 'CN'),
);

// Open (or create) the Xapian index and set up an English-stemming term generator.
$db = new XapianWritableDatabase("index", Xapian::DB_CREATE_OR_OPEN);
$stem = new XapianStem("english");
$idx = new XapianTermGenerator();
$idx->set_stemmer($stem);

// Index each transaction as one document: the text is "price desc ship",
// and the stored document data is the class label ("fraud" or "clean").
foreach ($docs as $key => $doc) {
    $label = "clean";
    if ($doc['fraud']) {
        $label = "fraud";
    }
    $text = implode(' ', array($doc['price'], $doc['desc'], $doc['ship']));

    $xdoc = new XapianDocument();
    $xdoc->set_data($label);
    $idx->set_document($xdoc);
    $idx->index_text($text);
    $db->add_document($xdoc, $key);
}

// Drop the database reference (presumably this flushes and closes the index).
$db = null;
                               fraudknn.php
// testknn.php
// Classifies one unseen transaction by nearest neighbours: the transaction
// is indexed as a temporary document, its terms are used to query the
// index, and the stored label of each other matching document is printed.
$test = array(
   'price' => 10000, 'desc' => 'TV',
   'ship' => 'CN'
);

$db   = new XapianWritableDatabase("index",
         Xapian::DB_CREATE_OR_OPEN);
$idx = new XapianTermGenerator();
$stem = new XapianStem("english");
$idx->set_stemmer($stem);

// Index the transaction the same way the training documents were indexed.
$xdoc = new XapianDocument();
$idx->set_document($xdoc);
$idx->index_text($test['price'] . ' ' .
      $test['desc'] . ' ' . $test['ship']);
$id = $db->add_document($xdoc);

try {
  // Build a relevance set holding only the temporary document and ask
  // for up to 10 expansion terms from it.
  $enq = new XapianEnquire($db);
  $rset = new XapianRSet();
  $rset->add_document($id);
  $eset = $enq->get_eset(10, $rset);
  $terms = array();
  $i = $eset->begin();
  while ( !$i->equals($eset->end()) ) {
    $terms[] = $i->get_term();
    $i->next();
  }

  // OR the terms together and fetch up to 4 matches. The temporary
  // document can match itself, so it is skipped when printing labels.
  $q = new XapianQuery(
           XapianQuery::OP_OR, $terms);
  $enq->set_query($q);
  $matches = $enq->get_mset(0, 4, $rset);
  $i = $matches->begin();
  while (!$i->equals($matches->end())) {
    if($i->get_document()->get_docid() != $id)
    {
      $class = $i->get_document()->get_data();
      var_dump($class);
    }
    $i->next();
  }
} finally {
  // Always remove the temporary document, even if the query throws, so
  // the test transaction never stays in the training index.
  $db->delete_document($id);
}


$ php testknn.php
string(5) "clean"
string(5) "fraud"
string(5) "fraud"
TRANSACTION
PARAMETERS
/**
 * Scores how closely an e-mail address resembles a customer name (email.php).
 *
 * Both inputs are lower-cased. The address is split at "@" into a local
 * part (with "." and "+" turned into spaces) and a domain (cut at its
 * first dot). Similarity is measured with similar_text() percentages.
 *
 * @param string $name  Customer name; its space-separated parts are also
 *                      compared individually.
 * @param string $email E-mail address. Without an "@" the domain is
 *                      treated as empty.
 * @return float 1.0 when the local part matches the whole name (> 80%);
 *               0.8 when the domain matches the whole name (> 80%);
 *               otherwise 1.7 * (best / 100) - 1, where best is the highest
 *               part-vs-local or part-vs-domain match above 50%, giving
 *               -1.0 when nothing matches.
 */
function compareEmailToName($name, $email) {
  $name = strtolower($name);
  $email = strtolower($email);
  $parts = explode(" ", $name);
  $pcnt = 0;

  // Pad so an address without "@" yields an empty domain rather than an
  // undefined variable.
  list($user, $dom) = array_pad(explode("@", $email), 2, "");
  $user = str_replace(
              array(".", "+"), " ", $user);
  // Strip everything from the first dot onwards. The dot must be escaped:
  // an unescaped "." matches any character and would erase the whole domain.
  $dom = preg_replace('/\..*/', "", $dom);

  similar_text($name, $user, $pcnt);
  if($pcnt > 80) { return 1.0; }
  similar_text($name, $dom, $pcnt);
  if($pcnt > 80) { return 0.8; }

  // explode() always returns at least one element, so the per-part
  // comparison always runs; no separate fallback return is needed.
  $highest = 0;
  foreach($parts as $part) {
    similar_text($user, $part, $pcnt);
    if($pcnt > 50 && $pcnt > $highest) {
      $highest = $pcnt;
    }
    similar_text($dom, $part, $pcnt);
    if($pcnt > 50 && $pcnt > $highest) {
      $highest = $pcnt;
    }
  }
  return (1.7 * ($highest/100)) - 1;
}
// Example transaction record, keyed by feature name.
// Presumably 'fraud' is the known label for the record and the other
// entries are classifier inputs; confirm against the data loader.
$data = [
  'purchase_value'             => 20993,
  'geo_country'                => 'DE',
  'previous_orders'            => 1,
  'time'                       => 6,
  'timegap'                    => 146632,
  'product_category'           => 'small_item',
  'delivery_matches_card'      => 0,
  'geo_ip_matches_card'        => 1,
  'difference_from_last_trans' => 8755,
  'free_shipping'              => 0,
  'email_like_name'            => 0,
  'free_email_provider'        => 0,
  'disposable_email_provider'  => 0,
  'quantity'                   => 2,
  'fraud'                      => 0,
];
SUPPORT
VECTOR MACHINES
20




15




10




5




0
     0   5   10   15   20
20




15




10




5




0
     0   5   10   15   20
20




15




10




5




0
     0   5   10   15   20
20




15




10




5




0
     0   5   10   15   20
20




15




10




5




0
     0   5   10   15   20
$ apt-get install libsvm-dev
$ apt-get install libsvm-tools

$ yum install libsvm-devel

$ pecl install svm-beta
$ echo extension=svm.so > /etc/php.d/svm.ini
$ php -r '$s = new svm(); $m = $s->train
(array(array(-1, -1), array(1, 1))); echo
$m->predict(array(0, -1));'
-1
// learn.php
// Builds SVM training rows from paydata.csv, trains and saves a model,
// then counts how the model classifies its own training rows.
$fh = fopen('paydata.csv', 'r');
if ($fh === false) {
  throw new RuntimeException('Unable to open paydata.csv');
}
$output = array();

while (($data = fgetcsv($fh)) !== false) {
  // A blank line comes back as array(null); skip rows missing column 14.
  if (!isset($data[14])) {
    continue;
  }
  // Element 0 is the class label: -1 when column 14 is 1, otherwise 1.
  // The remaining keys are feature indices, each scaled into roughly [-1, 1].
  // Feature keys 7 to 11 do not appear in the slide listing.
  $output[] = array(
     $data[14] == 1 ? -1 : 1,
     1 => ($data[0]/20000.00) - 1.0, // price
     2 => $data[1] == 'CN' ? 1.0:-1.0,
     3 => $data[1] == 'US' ? 1.0:-1.0,
     4 => $data[5] == 'digital' ? 1.0:-1.0,
     5 => $data[7] == 1 ? 1.0:-1.0, //geo
     6 => $data[6] == 1 ? 1.0:-1.0, // deliv
     12 => $data[9] == 1 ? 1.0:-1.0, // ship
     13 => ($data[13] / 1.5) - 1.0, // qty
  );
}
fclose($fh);

// The second argument gives a weight per class label.
$svm = new svm();
$model = $svm->train($output,
               array(-1 => 0.65, 1 => 0.5));
$model->save('learn.model');

// "Positive" here means label 1, i.e. rows whose column 14 was not 1.
// NOTE(review): each row passed to predict() still carries its label at
// key 0; confirm the extension ignores that key when predicting.
$fp = $tp = $fn = $tn = 0;
foreach($output as $test) {
  $res = $model->predict($test);
  if($test[0] > 0) {
    if($res > 0) { $tp++; }
    else { $fn++; }
  } else {
    if($res > 0) { $fp++; }
    else { $tn++; }
  }
}
// ...snip.. loading test data from
// paytest.csv

// Load the model saved by the training script and tally how its
// predictions compare with the label stored at index 0 of each row.
$model = new SVMModel('learn.model');

$tp = 0;
$tn = 0;
$fp = 0;
$fn = 0;
foreach ($output as $test) {
  $res = $model->predict($test);
  $actualPositive = $test[0] > 0;
  $predictedPositive = $res > 0;

  if ($actualPositive && $predictedPositive) {
    $tp++;
  } elseif ($actualPositive) {
    $fn++;
  } elseif ($predictedPositive) {
    $fp++;
  } else {
    $tn++;
  }
}
                                   test.php
// Print the four confusion-matrix counts, then the overall accuracy
// (correct predictions divided by all predictions).
$counts = array(
  "True Positive "  => $tp,
  "True Negative "  => $tn,
  "False Positive " => $fp,
  "False Negative " => $fn,
);
foreach ($counts as $label => $count) {
  var_dump($label . $count);
}
var_dump("Accuracy " .
        (($tp+$tn)/($tp+$tn+$fp+$fn)));
$ php learn.php
string(18) "True Positive 8316"
string(18) "True Negative 1682"
string(16) "False Positive 2"
string(16) "False Negative 0"
string(15) "Accuracy 0.9998"

$ php test.php
string(17) "True Positive 844"
string(17) "True Negative 155"
string(16) "False Positive 0"
string(16) "False Negative 1"
string(14) "Accuracy 0.999"
training data


  Test         Verify       Update



Automated     Manual        Manual
Time Series           Class Based



   Sensitivity             Model



 False    Days To    False        False
Alarms    Detect    Positives   Negatives
(shogun)
TEACHING YOUR
MACHINE
TO FIND
FRAUDSTERS

http://joind.in/3429

Ian Barber
ianb@php.net
Title Slide - CSI
http://www.flickr.com/photos/39matt/5241862082
Sickness Availability - Chicago Fire Department
http://www.flickr.com/photos/mike_miley/3929146730/
Model Buildings - Ah Ain’t Long For This Whorl
http://www.flickr.com/photos/chadmiller/98014022/
Repeat Customer - McDonald’s Loyalty Card
http://www.flickr.com/photos/fsse-info/3658873057/
Shipping - FedEx Truck
http://www.flickr.com/photos/moto_club4ag/4852235145/
Velocity - Chevrolet Chevelle Dragster
http://www.flickr.com/photos/jns001/2958999006/
GeoIP - Earth Asia Terminator View
http://www.flickr.com/photos/flyingsinger/86898564/
Multiple Items - Boxes
http://www.flickr.com/photos/skrewtape/851672959/

Teaching Your Machine To Find Fraudsters

  • 1.
    TEACHING YOUR MACHINE TO FIND FRAUDSTERS Ian Barber ianb@php.net phpir.com twitter.com/ianbarber
  • 2.
  • 3.
    5% 3% SOME .1% SMALL NUMBERS 8%
  • 4.
  • 5.
    REALLY REALLY LEGITIMATE FRAUD EVALUATED 989 0 LEGITIMATE EVALUATED 10 1 FRAUD
  • 6.
    REALLY REALLY LEGITIMATE FRAUD 90% EVALUATED LEGITIMATE WRONG 989 0 EVALUATED 10 1 FRAUD
  • 7.
  • 8.
    30 22.5 Clicks 15 7.5 0 Date
  • 9.
    SOFTWARE ARCHITECTURE Alarm Detector No Alarm Buffer User Clicks Landing Ad Page
  • 10.
    DETECTOR statistics Expected Clicks Threshold Data Buffer Sensitivity Alarm
  • 11.
    average.php function detect($sen) { $window = array(); $i = 0; $alarmCount = 0; $dtd = 0; $avg = $stddev = 0; $fraud = fopen("fraudclicks.csv", 'r'); while($d = fgetcsv($fraud)) { $i++; if(count($window) > 7) { array_shift($window); $avg = array_sum($window) / 7; foreach($window as $val) { $stddev += pow($val - $average, 2); } $stddev = sqrt($stddev/7);
  • 12.
    0.2 0.15 0.1 0.05 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
  • 13.
    if($d[1] > ($avg+ ($sen * $stddev))){ $alarmCount++; if($i > 201) { break; } } else { if($i > 201) { $dtd++; } } } array_push($window, $d[1]); } return array($alarmCount-1, $dtd); }
  • 14.
    1.6 SENSITIVITY 30 18 False Alarms 1 Day To Detect 22.5 Clicks 15 7.5 0 Date
  • 15.
    2.7 SENSITIVITY 30 1 False Alarm 18 Days To Detect 22.5 Clicks 15 7.5 0 Date
  • 16.
  • 17.
    function detect($sens) { sickavail.php $i = 0; $alarms = 0; $dtd = 0; $window = array(); $avail = array(); $fraud = fopen("fraudclicks.csv", 'r'); while($dat = fgetcsv($fraud)) { $dow = date("w", strtotime($dat[0])); if( count($window) >= 7 && isset($avail[$dow]) ) { $sick = 0; foreach($window as $day => $value) { $dowavg = array_sum($avail[$day]) / count($avail[$day]); $sick += $value / $dowavg; } $sick /= count($window);
  • 18.
    $avlblty = array_sum($avail[$dow])/ count($avail[$dow]); $est = $sick * $avlblty; $fac = fac($dat[1]); $p = exp(-$est) * pow($est,$dat[1]) / $fac; // poisson calc if($p < $sens && $dat[1] > $est) { $alarms++; if($i > 201) { break; } } else { if($i > 201) { $dtd++; } } } // end if
  • 19.
    0.2 0.15 0.1 0.05 0 1 2 3 4 5 6 7 8 9 10
  • 20.
    0.011 SENSITIVITY 30 1 False Alarm 1 Day To Detect 22.5 Clicks 15 7.5 0 Date
  • 21.
  • 22.
    classification model SOFTWARE ARCHITECTURE Fraud Classifier Not Fraud User Transaction Purchase Processor Transaction Learner Database
  • 23.
    EVALUATING THE CLASSIFIER Training Data Learner Model Test Data Prediction Classifier Accuracy Model
  • 24.
    20 15 10 5 0 0 5 10 15 20
  • 25.
    20 15 10 5 ? 0 0 5 10 15 20
  • 26.
    20 15 10 5 ? 0 0 5 10 15 20
  • 27.
    $docs = array( array('fraud' => false, 'price' => 1699, 'desc'=>'toy ninja', 'ship' => 'US'), array('fraud' => false, 'price' => 20000, 'desc' => 'TV','ship' => 'US'), array('fraud' => false, 'price' => 2500, 'desc' => 'cds', 'ship' => 'US'), array('fraud' => true, 'price' => 20000, 'desc' => 'console', 'ship' => 'CN'), array('fraud' => true, 'price' => 5000, 'desc' => 'books', 'ship' => 'US'), array('fraud' => true, 'price' => 15000, 'desc' => 'ipod', 'ship' => 'CN'), );
  • 28.
    $db = new XapianWritableDatabase("index", Xapian::DB_CREATE_OR_OPEN); $idx = new XapianTermGenerator(); $stem = new XapianStem("english"); $idx->set_stemmer($stem); foreach($docs as $key => $doc) { $xdoc = new XapianDocument(); $xdoc->set_data($doc['fraud'] ? "fraud" : "clean"); $idx->set_document($xdoc); $idx->index_text($doc['price'] . ' ' . $doc['desc'] . ' ' . $doc['ship']); $db->add_document($xdoc, $key); } $db = null; fraudknn.php
  • 29.
    $test = array( testknn.php 'price' => 10000, 'desc' => 'TV', 'ship' => 'CN' ); $db = new XapianWritableDatabase("index", Xapian::DB_CREATE_OR_OPEN); $idx = new XapianTermGenerator(); $stem = new XapianStem("english"); $idx->set_stemmer($stem); $xdoc = new XapianDocument(); $idx->set_document($xdoc); $idx->index_text($test['price'] . ' ' . $test['desc'] . ' ' . $test['ship']); $id = $db->add_document($xdoc);
  • 30.
    $enq = new XapianEnquire($db); $rset = new XapianRSet(); $rset->add_document($id); $eset = $enq->get_eset(10, $rset); $terms = array(); $i = $eset->begin(); while ( !$i->equals($eset->end()) ) { $terms[] = $i->get_term(); $i->next(); } $q = new XapianQuery( XapianQuery::OP_OR, $terms); $enq->set_query($q); $matches = $enq->get_mset(0, 4, $rset);
  • 31.
    $i = $matches->begin(); while(!$i->equals($matches->end())) { if($i->get_document()->get_docid() != $id) { $class = $i->get_document()->get_data(); var_dump($class); } $i->next(); } $db->delete_document($id); $ php testknn.php string(5) "clean" string(5) "fraud" string(5) "fraud"
  • 32.
  • 35.
    function compareEmailToName($name, $email){ $name = strtolower($name); $email = strtolower($email); $parts = explode(" ", $name); $pcnt = 0; list($user, $dom) = explode("@", $email); $user = str_replace( array(".", "+"), " ", $user); $dom = preg_replace("/..*/", "", $dom); similar_text($name, $user, $pcnt); if($pcnt > 80) { return 1.0; } similar_text($name, $dom, $pcnt); if($pcnt > 80) { return 0.8; } email.php
  • 36.
    if(count($parts)) { $highest = 0; foreach($parts as $part) { similar_text($user, $part, $pcnt); if($pcnt > 50 && $pcnt > $highest) { $highest = $percent; } similar_text($dom, $part, $pcnt); if($pcnt > 50 && $pcnt > $highest) { $highest = $percent; } } return (1.7 * ($highest/100)) - 1; } return -1; }
  • 40.
    $data = array( 'purchase_value' => 20993, 'geo_country' => 'DE', 'previous_orders' => 1, 'time' => 6, 'timegap' => 146632, 'product_category' => 'small_item', 'delivery_matches_card' => 0, 'geo_ip_matches_card' => 1, 'difference_from_last_trans' => 8755, 'free_shipping' => 0, 'email_like_name' => 0, 'free_email_provider' => 0, 'disposable_email_provider' => 0, 'quantity' => 2, 'fraud' => 0);
  • 41.
  • 42.
    20 15 10 5 0 0 5 10 15 20
  • 43.
    20 15 10 5 0 0 5 10 15 20
  • 44.
    20 15 10 5 0 0 5 10 15 20
  • 45.
    20 15 10 5 0 0 5 10 15 20
  • 46.
    20 15 10 5 0 0 5 10 15 20
  • 47.
    $ apt-get install libsvm-dev $ apt-get install libsvm-tools $ yum install libsvm-devel $ pecl install svm-beta $ echo extension=svm.so > /etc/php.d/svm.ini $ php -r '$s = new svm(); $m = $s->train (array(array(-1, -1), array(1, 1))); echo $m->predict(array(0, -1));' -1
  • 48.
    $fh = fopen('paydata.csv','r'); $output = array(); while($data = fgetcsv($fh)) { $output[] = array( $data[14] == 1 ? -1 : 1, 1 => ($data[0]/20000.00) - 1.0, // price 2 => $data[1] == 'CN' ? 1.0:-1.0, 3 => $data[1] == 'US' ? 1.0:-1.0, 4 => $data[5] == 'digital' ? 1.0:-1.0, 5 => $data[7] == 1 ? 1.0:-1.0, //geo 6 => $data[6] == 1 ? 1.0:-1.0, // deliv 12 => $data[9] == 1 ? 1.0:-1.0, // ship 13 => ($data[13] / 1.5) - 1.0, // qty ); } learn.php
  • 49.
    $svm = new svm(); $model = $svm->train($output, array(-1 => 0.65, 1 => 0.5)); $model->save('learn.model'); $fp = $tp = $fn = $tn = 0; foreach($output as $test) { $res = $model->predict($test); if($test[0] > 0) { if($res > 0) { $tp++; } else { $fn++; } } else { if($res > 0) { $fp++; } else { $tn++; } } }
  • 50.
    // ...snip.. loading test data from // paytest.csv $model = new SVMModel('learn.model'); $fp = $tp = $fn = $tn = 0; foreach($output as $test) { $res = $model->predict($test); if($test[0] > 0) { if($res > 0) { $tp++; } else { $fn++; } } else { if($res > 0) { $fp++; } else { $tn++; } } } test.php
  • 51.
    var_dump("True Positive ". $tp); var_dump("True Negative " . $tn); var_dump("False Positive " . $fp); var_dump("False Negative " . $fn); var_dump("Accuracy " . (($tp+$tn)/($tp+$tn+$fp+$fn)));
  • 52.
    $ php learn.php string(18)"True Positive 8316" string(18) "True Negative 1682" string(16) "False Positive 2" string(16) "False Negative 0" string(15) "Accuracy 0.9998" $ php test.php string(17) "True Positive 844" string(17) "True Negative 155" string(16) "False Positive 0" string(16) "False Negative 1" string(14) "Accuracy 0.999"
  • 53.
    training data Test Verify Update Automated Manual Manual
  • 54.
    Time Series Class Based Sensitivity Model False Days To False False Alarms Detect Positives Negatives
  • 55.
  • 56.
  • 57.
    Title Slide - CSI http://www.flickr.com/photos/39matt/5241862082 Sickness Availability - Chicago Fire Department http://www.flickr.com/photos/mike_miley/3929146730/ Model Buildings - Ah Ain’t Long For This Whorl http://www.flickr.com/photos/chadmiller/98014022/ Repeat Customer - McDonald’s Loyalty Card http://www.flickr.com/photos/fsse-info/3658873057/ Shipping - FedEx Truck http://www.flickr.com/photos/moto_club4ag/4852235145/ Velocity - Chevrolet Chevelle Dragster http://www.flickr.com/photos/jns001/2958999006/ GeoIP - Earth Asia Terminator View http://www.flickr.com/photos/flyingsinger/86898564/ Multiple Items - Boxes http://www.flickr.com/photos/skrewtape/851672959/