More Related Content Similar to London XQuery Meetup: Querying the World (Web Scraping) (20) London XQuery Meetup: Querying the World (Web Scraping) 1. XQuery: Querying the World (formerly known as Web Scraping) Dennis Knochenwefel <dennis.knochenwefel@28msec.com> 3. PHP (2007) $url = "http://www.nfl.com/teams/sandiegochargers/roster?team=SD"; $raw = file_get_contents($url); $newlines = array("\t","\n","\r","\x20\x20","\0","\x0B"); $content = str_replace($newlines, "", html_entity_decode($raw)); $start = strpos($content,'<table cellpadding="2" class="standard_table"'); $end = strpos($content,'</table>',$start) + 8; $table = substr($content,$start,$end-$start); preg_match_all("|<tr(.*)</tr>|U",$table,$rows); foreach ($rows[0] as $row){ if ((strpos($row,'<th')===false)){ preg_match_all("|<td(.*)</td>|U",$row,$cells); $number = strip_tags($cells[0][0]); $name = strip_tags($cells[0][1]); $position = strip_tags($cells[0][2]); echo "{$position} - {$name} - Number {$number} <br>"; } } $url = "http://www.nfl.com/teams/sandiegochargers/roster?team=SD"; $raw = file_get_contents($url); $newlines = array("\t","\n","\r","\x20\x20","\0","\x0B"); $content = str_replace($newlines, "", html_entity_decode($raw)); $start = strpos($content,'<table cellpadding="2" class="standard_table"'); $end = strpos($content,'</table>',$start) + 8; $table = substr($content,$start,$end-$start); preg_match_all("|<tr(.*)</tr>|U",$table,$rows); foreach ($rows[0] as $row){ if ((strpos($row,'<th')===false)){ preg_match_all("|<td(.*)</td>|U",$row,$cells); $number = strip_tags($cells[0][0]); $name = strip_tags($cells[0][1]); $position = strip_tags($cells[0][2]); echo "{$position} - {$name} - Number {$number} <br>"; } } source: http://www.bradino.com/php/screen-scraping/ 4. 
PHP (June 2011) $url="http://www.rtu.ac.in/results/reformat.php"; $post="rollnumber=08epccs060&filename=fetchmodulesem_4_btech410m.php&button=Submit"; $ch=curl_init(); curl_setopt($ch,CURLOPT_URL,$url); curl_setopt($ch,CURLOPT_POST,1); curl_setopt($ch,CURLOPT_POSTFIELDS,$post); curl_setopt($ch,CURLOPT_FOLLOWLOCATION,1); curl_setopt($ch,CURLOPT_RETURNTRANSFER,1); $content=curl_exec($ch); curl_close($ch); $totalPath="html/body/table[4]/tbody/tr[3]/td[4]"; $page=new DOMDocument(); $xpath=new DOMXPath($page); $page->loadHTML($content); $page->saveHTML(); // this shows the page contents $total=$xpath->query($totalPath); echo $total->length; //shows 0 echo $total->item(0)->nodeValue; //shows nothing $url="http://www.rtu.ac.in/results/reformat.php"; $post="rollnumber=08epccs060&filename=fetchmodulesem_4_btech410m.php&button=Submit"; $ch=curl_init(); curl_setopt($ch,CURLOPT_URL,$url); curl_setopt($ch,CURLOPT_POST,1); curl_setopt($ch,CURLOPT_POSTFIELDS,$post); curl_setopt($ch,CURLOPT_FOLLOWLOCATION,1); curl_setopt($ch,CURLOPT_RETURNTRANSFER,1); $content=curl_exec($ch); curl_close($ch); $totalPath="html/body/table[4]/tbody/tr[3]/td[4]"; $page=new DOMDocument(); $xpath=new DOMXPath($page); $page->loadHTML($content); $page->saveHTML(); // this shows the page contents $total=$xpath->query($totalPath); echo $total->length; //shows 0 echo $total->item(0)->nodeValue; //shows nothing ! ! source: http://stackoverflow.com/questions/6283361/unable-to-get-table-data-from-a-html-page 11. JSON ? XML ? CSV ! HTML ! XLS ! Zip ! App Website 13. Stateless REST API ? JSON ? XML ? CSV ! HTML ! XLS ! Zip ! Session! App Website Customize with URL Params HTML Forms 14. Stateless REST API ? JSON ? XML ? CSV ! HTML ! XLS ! Zip ! Session! App Website Customize with URL Params HTML Forms 15. CSV ! HTML ! XLS ! Zip ! HTML ! Session! Session! App Website XQuery ! 
HTML Forms HTML Forms Editor's Notes http://stackoverflow.com/questions/6283361/unable-to-get-table-data-from-a-html-page http://stackoverflow.com/questions/6283361/unable-to-get-table-data-from-a-html-page http://stackoverflow.com/questions/6283361/unable-to-get-table-data-from-a-html-page