Teaching Your Machine To Find Fraudsters

1,911 views
1,821 views

Published on

The slides from my talk at PHP Tek 11.

When dealing with money online, fraud is an ongoing problem for both
consumers and sellers. Researchers have been developing statistical
and machine learning techniques to detect shady sellers on auction
sites, spot fraudulent payments on e-commerce systems and catch click
fraud on adverts. While there is no silver bullet, you will learn to
flag suspicious activity and help protect your site from scammers
using PHP and a little help from some other technologies.

Published in: Technology
0 Comments
0 Likes
Statistics
Notes
  • Be the first to comment

  • Be the first to like this

No Downloads
Views
Total views
1,911
On SlideShare
0
From Embeds
0
Number of Embeds
4
Actions
Shares
0
Downloads
17
Comments
0
Likes
0
Embeds 0
No embeds

No notes for slide

Teaching Your Machine To Find Fraudsters

  1. 1. TEACHING YOUR MACHINE TO FIND FRAUDSTERS — Ian Barber — ianb@php.net — phpir.com — twitter.com/ianbarber
  2. 2. http://joind.in/3429 — https://github.com/ianbarber/FindingFraudsters-Talk
  3. 3. SOME SMALL NUMBERS: 5%, 3%, .1%, 8%
  4. 4. 99% ACCURACY
  5. 5. Confusion matrix (columns: REALLY LEGITIMATE | REALLY FRAUD) — EVALUATED LEGITIMATE: 989 | 0; EVALUATED FRAUD: 10 | 1
  6. 6. Confusion matrix (columns: REALLY LEGITIMATE | REALLY FRAUD) — EVALUATED LEGITIMATE: 989 | 0; EVALUATED FRAUD: 10 | 1 (90% WRONG)
  7. 7. ANOMALY DETECTION
  8. 8. 30 22.5Clicks 15 7.5 0 Date
  9. 9. SOFTWARE ARCHITECTURE: Alarm, Detector, No Alarm, Buffer, User, Clicks, Landing Page, Ad
  10. 10. DETECTOR statistics Expected Clicks Threshold Data BufferSensitivity Alarm
  11. 11. average.phpfunction detect($sen) { $window = array(); $i = 0; $alarmCount = 0; $dtd = 0; $avg = $stddev = 0; $fraud = fopen("fraudclicks.csv", r); while($d = fgetcsv($fraud)) { $i++; if(count($window) > 7) { array_shift($window); $avg = array_sum($window) / 7; foreach($window as $val) { $stddev += pow($val - $average, 2); } $stddev = sqrt($stddev/7);
  12. 12. 0.20.15 0.10.05 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
  13. 13. if($d[1] > ($avg + ($sen * $stddev))){ $alarmCount++; if($i > 201) { break; } } else { if($i > 201) { $dtd++; } } } array_push($window, $d[1]); } return array($alarmCount-1, $dtd);}
  14. 14. 1.6 SENSITIVITY 30 18 False Alarms 1 Day To Detect 22.5Clicks 15 7.5 0 Date
  15. 15. 2.7 SENSITIVITY 30 1 False Alarm 18 Days To Detect 22.5Clicks 15 7.5 0 Date
  16. 16. SICKNESS AVAILABILITY
  17. 17. function detect($sens) { sickavail.php $i = 0; $alarms = 0; $dtd = 0; $window = array(); $avail = array(); $fraud = fopen("fraudclicks.csv", r); while($dat = fgetcsv($fraud)) { $dow = date("w", strtotime($dat[0])); if( count($window) >= 7 && isset($avail[$dow]) ) { $sick = 0; foreach($window as $day => $value) { $dowavg = array_sum($avail[$day]) / count($avail[$day]); $sick += $value / $dowavg; } $sick /= count($window);
  18. 18. $avlblty = array_sum($avail[$dow]) / count($avail[$dow]); $est = $sick * $avlblty; $fac = fac($dat[1]); $p = exp(-$est) * pow($est,$dat[1]) / $fac; // poisson calc if($p < $sens && $dat[1] > $est) { $alarms++; if($i > 201) { break; } } else { if($i > 201) { $dtd++; } }} // end if
  19. 19. 0.20.15 0.10.05 0 1 2 3 4 5 6 7 8 9 10
  20. 20. 0.011 SENSITIVITY 30 1 False Alarm 1 Day To Detect 22.5Clicks 15 7.5 0 Date
  21. 21. SUPERVISED CLASSIFIERS
  22. 22. SOFTWARE ARCHITECTURE (classification model): Fraud, Classifier, Not Fraud, User, Purchase, Transaction Processor, Transaction Database, Learner
  23. 23. EVALUATING THE CLASSIFIER: Training Data, Learner, Model, Test Data, Prediction, Classifier, Accuracy, Model
  24. 24. 20151050 0 5 10 15 20
  25. 25. 2015105 ?0 0 5 10 15 20
  26. 26. 2015105 ?0 0 5 10 15 20
  27. 27. $docs = array( array(fraud => false, price => 1699, desc=>toy ninja, ship => US), array(fraud => false, price => 20000, desc => TV,ship => US), array(fraud => false, price => 2500, desc => cds, ship => US), array(fraud => true, price => 20000, desc => console, ship => CN), array(fraud => true, price => 5000, desc => books, ship => US), array(fraud => true, price => 15000, desc => ipod, ship => CN),);
  28. 28. $db = new XapianWritableDatabase("index", Xapian::DB_CREATE_OR_OPEN);$idx = new XapianTermGenerator();$stem = new XapianStem("english");$idx->set_stemmer($stem);foreach($docs as $key => $doc) { $xdoc = new XapianDocument(); $xdoc->set_data($doc[fraud] ? "fraud" : "clean"); $idx->set_document($xdoc); $idx->index_text($doc[price] . . $doc[desc] . . $doc[ship]); $db->add_document($xdoc, $key);}$db = null; frau dknn.php
  29. 29. $test = array( testknn.ph p price => 10000, desc => TV, ship => CN);$db = new XapianWritableDatabase("index", Xapian::DB_CREATE_OR_OPEN);$idx = new XapianTermGenerator();$stem = new XapianStem("english");$idx->set_stemmer($stem);$xdoc = new XapianDocument();$idx->set_document($xdoc);$idx->index_text($test[price] . . $test[desc] . . $test[ship]);$id = $db->add_document($xdoc);
  30. 30. $enq = new XapianEnquire($db);$rset = new XapianRSet();$rset->add_document($id);$eset = $enq->get_eset(10, $rset);$terms = array();$i = $eset->begin();while ( !$i->equals($eset->end()) ) { $terms[] = $i->get_term(); $i->next();}$q = new XapianQuery( XapianQuery::OP_OR, $terms);$enq->set_query($q);$matches = $enq->get_mset(0, 4, $rset);
  31. 31. $i = $matches->begin();while (!$i->equals($matches->end())) { if($i->get_document()->get_docid() != $id) { $class = $i->get_document()->get_data(); var_dump($class); } $i->next();}$db->delete_document($id);$ php testknn.phpstring(5) "clean"string(5) "fraud"string(5) "fraud"
  32. 32. TRANSACTION PARAMETERS
  33. 33. function compareEmailToName($name, $email) { $name = strtolower($name); $email = strtolower($email); $parts = explode(" ", $name); $pcnt = 0; list($user, $dom) = explode("@", $email); $user = str_replace( array(".", "+"), " ", $user); $dom = preg_replace("/..*/", "", $dom); similar_text($name, $user, $pcnt); if($pcnt > 80) { return 1.0; } similar_text($name, $dom, $pcnt); if($pcnt > 80) { return 0.8; } email.php
  34. 34. if(count($parts)) { $highest = 0; foreach($parts as $part) { similar_text($user, $part, $pcnt); if($pcnt > 50 && $pcnt > $highest) { $highest = $percent; } similar_text($dom, $part, $pcnt); if($pcnt > 50 && $pcnt > $highest) { $highest = $percent; } } return (1.7 * ($highest/100)) - 1; } return -1;}
  35. 35. $data = array( purchase_value => 20993, geo_country => DE, previous_orders => 1, time => 6, timegap => 146632, product_category => small_item, delivery_matches_card => 0, geo_ip_matches_card => 1, difference_from_last_trans => 8755, free_shipping => 0, email_like_name => 0, free_email_provider => 0, disposable_email_provider => 0, quantity => 2, fraud => 0);
  36. 36. SUPPORT VECTOR MACHINES
  37. 37. 20151050 0 5 10 15 20
  38. 38. 20151050 0 5 10 15 20
  39. 39. 20151050 0 5 10 15 20
  40. 40. 20151050 0 5 10 15 20
  41. 41. 20151050 0 5 10 15 20
  42. 42. $ apt-get install libsvm-dev$ apt-get install libsvm-tools$ yum install libsvm-devel$ pecl install svm-beta$ echo extension=svm.so > /etc/php.d/svm.ini$ php -r $s = new svm(); $m = $s->train(array(array(-1, -1), array(1, 1))); echo$m->predict(array(0, -1));-1
  43. 43. $fh = fopen(paydata.csv, r);$output = array();while($data = fgetcsv($fh)) { $output[] = array( $data[14] == 1 ? -1 : 1, 1 => ($data[0]/20000.00) - 1.0, // price 2 => $data[1] == CN ? 1.0:-1.0, 3 => $data[1] == US ? 1.0:-1.0, 4 => $data[5] == digital ? 1.0:-1.0, 5 => $data[7] == 1 ? 1.0:-1.0, //geo 6 => $data[6] == 1 ? 1.0:-1.0, // deliv 12 => $data[9] == 1 ? 1.0:-1.0, // ship 13 => ($data[13] / 1.5) - 1.0, // qty );} learn.php
  44. 44. $svm = new svm();$model = $svm->train($output, array(-1 => 0.65, 1 => 0.5));$model->save(learn.model);$fp = $tp = $fn = $tn = 0;foreach($output as $test) { $res = $model->predict($test); if($test[0] > 0) { if($res > 0) { $tp++; } else { $fn++; } } else { if($res > 0) { $fp++; } else { $tn++; } }}
  45. 45. // ...snip.. loading test data from// paytest.csv$model = new SVMModel(learn.model);$fp = $tp = $fn = $tn = 0;foreach($output as $test) { $res = $model->predict($test); if($test[0] > 0) { if($res > 0) { $tp++; } else { $fn++; } } else { if($res > 0) { $fp++; } else { $tn++; } }} test.php
  46. 46. var_dump("True Positive " . $tp);var_dump("True Negative " . $tn);var_dump("False Positive " . $fp);var_dump("False Negative " . $fn);var_dump("Accuracy " . (($tp+$tn)/($tp+$tn+$fp+$fn)));
  47. 47. $ php learn.phpstring(18) "True Positive 8316"string(18) "True Negative 1682"string(16) "False Positive 2"string(16) "False Negative 0"string(15) "Accuracy 0.9998"$ php test.phpstring(17) "True Positive 844"string(17) "True Negative 155"string(16) "False Positive 0"string(16) "False Negative 1"string(14) "Accuracy 0.999"
  48. 48. Training data: Test (Automated), Verify (Manual), Update (Manual)
  49. 49. Time Series: Sensitivity, False Alarms, Days To Detect | Class Based: Model, False Positives, False Negatives
  50. 50. (shogun)
  51. 51. TEACHING YOUR MACHINE TO FIND FRAUDSTERS — http://joind.in/3429 — Ian Barber — ianb@php.net
  52. 52. Image credits: Title Slide - CSI: http://www.flickr.com/photos/39matt/5241862082; Sickness Availability - Chicago Fire Department: http://www.flickr.com/photos/mike_miley/3929146730/; Model Buildings - Ah Ain’t Long For This Whorl: http://www.flickr.com/photos/chadmiller/98014022/; Repeat Customer - McDonald’s Loyalty Card: http://www.flickr.com/photos/fsse-info/3658873057/; Shipping - FedEx Truck: http://www.flickr.com/photos/moto_club4ag/4852235145/; Velocity - Chevrolet Chevelle Dragster: http://www.flickr.com/photos/jns001/2958999006/; GeoIP - Earth Asia Terminator View: http://www.flickr.com/photos/flyingsinger/86898564/; Multiple Items - Boxes: http://www.flickr.com/photos/skrewtape/851672959/

×