Location: PHPKode > projects > Astanda Open Directory Project > adp-1.3b/admin/spider.php
<?php
//-------------------------------------------------//
//                                                 //
//        ASTANDA DIRECTORY PROJECT  [ADP]         //
//                  Version 1.3b                   //
//               License: SHAREWARE                //
//             http://www.astanda.com              //
//      © Copyright 2005, 2006, Pavel Golovko      //
//                                                 //
//-------------------------------------------------//
//                                                 //
// Any attempt to remove or alter "Powered by ADP" //
// recip link will automatically waive your rights //
// to any use of ADP software!                     //
//                                                 //
// Reverse engineering and redistribution of       //
// ADP code or any part of its code is strictly    //
// prohibited!                                     //
//                                                 //
//-------------------------------------------------//
//                                                 //
// ADP is distributed in the hope that it will be  //
// useful, but WITHOUT ANY WARRANTY; without even  //
// the implied warranty of MERCHANTABILITY or      //
// FITNESS FOR A PARTICULAR PURPOSE.               //
//                                                 //
//-------------------------------------------------//

$demo = "0"; require_once("database.php"); require_once("functions.php"); include("pagerank.php"); set_time_limit(cfg("admin_set_time_limit")); if(isset($_REQUEST['url'])){  indexURL($_REQUEST['url']); } elseif(isset($_REQUEST['full'])){ if(!$_REQUEST['num']){ $num = 0; } else{ $num = $_REQUEST['num']; } $result = mysql_query("SELECT COUNT(`id`) FROM `links`"); print mysql_error(); if(mysql_num_rows($result)){  while($row = mysql_fetch_array($result)){  $total_num = $row[0]; }  } $result = mysql_query("SELECT `url` FROM `links` LIMIT $num, 1"); print mysql_error(); if(mysql_num_rows($result)){  while($row=mysql_fetch_array($result)){  $url = $row['url']; }  } $next = $num + 1; if ($total_num > $next) {  print "<P><center><a href=\"spider.php?full=1&num=".$next."\">Next >>></a></center>"; print "<meta http-equiv=\"refresh\" content=\"3;url=spider.php?full=1&num=".$next."\"><hr>"; } indexURL($url); if ($total_num > $next) {  print "<hr><center><a href=\"spider.php?full=1&num=".$next."\">Next >>></a></center>"; }  else{ print"<hr><h2>FINISHED FULL CRAWL !!!</h2><p>&nbsp;<p>"; }  } elseif(isset($_REQUEST['date'])){ if($_REQUEST['date'] == date("Y-m-d")){ exit("Todays indexes are not available for reindexing, because the spider will fall into an endless loop recrawling the same pages over and over."); }  $link_id = "0"; $result = mysql_query("SELECT `link_id` FROM `pages` WHERE `date` = '".$_REQUEST['date']."' LIMIT 1"); print mysql_error(); if(mysql_num_rows($result)){  while ($row = mysql_fetch_array($result)){  $link_id = $row['0']; $total_num++; }  }  if($link_id <> "0"){  $result = mysql_query("SELECT `url` FROM `links` WHERE `id` = '".$link_id."'"); print mysql_error(); if(mysql_num_rows($result)){  while ($row = mysql_fetch_array($result)){  $url = $row['0']; }  }  }  if($link_id <> "0"){  print "<P><center><a href=\"spider.php?date=".$_REQUEST['date']."\">Next >>></a></center>"; print "<meta http-equiv=\"refresh\" 
content=\"3;url=spider.php?date=".$_REQUEST['date']."\"><hr>"; indexURL($url); }  else{ print"<hr><h2>FINISHED DATE CRAWL !!!</h2><p>&nbsp;<p>"; }  } elseif(isset($_REQUEST['cat_id'])){ if(!isset($_REQUEST['num'])){ $num = 0; } else{ $num = $_REQUEST['num']; } $result = mysql_query("SELECT COUNT(`id`) FROM `links` WHERE `category_id` = '".$_REQUEST['cat_id']."'"); print mysql_error(); if(mysql_num_rows($result)){  while($row = mysql_fetch_array($result)){  $total_num = $row[0]; }  } $result = mysql_query("SELECT `url` FROM `links` WHERE `category_id` = '".$_REQUEST['cat_id']."' LIMIT $num, 1"); print mysql_error(); if(mysql_num_rows($result)){  while($row=mysql_fetch_array($result)){  $url = $row[0]; }  } $next = $num + 1; if ($total_num > $next) {  print "<P><center><a href=\"spider.php?cat_id=".$_REQUEST['cat_id']."&num=".$next."\">Next >>></a></center>"; print "<meta http-equiv=\"refresh\" content=\"3;url=spider.php?cat_id=".$_REQUEST['cat_id']."&num=".$next."\"><hr>"; if($demo == "0"){  mysql_query("UPDATE `categories` SET `date` = '".date("Y-m-d")."' WHERE `categories`.`id` = '".$_REQUEST['cat_id']."' LIMIT 1"); print mysql_error(); }  } if($url <> ""){  indexURL($url); } if ($total_num > $next) {  print "<hr><center><a href=\"spider.php?cat_id=".$_REQUEST['cat_id']."&num=".$next."\">Next >>></a></center>"; }  else{ print"<hr><h2>FINISHED SINGLE CATEGORY CRAWL !!!</h2><p>&nbsp;<p>"; }  } else{  print "No input was given."; } function indexURL($url){ global $link_id,$demo; class myCrawler extends ADPcrawler{  function handlePageData($page_data){ global $link_id,$demo,$url; $today = date("Y-m-d"); print "URL: <a target=\"_blank\" href=\"".$page_data["url"]."\">".$page_data["url"]."</a><br>"; print "Status: ".strtok($page_data["header"], "\n")."<br>"; print "Referer-URL: <a target=\"_blank\" href=\"".$page_data["referer_url"]."\">".$page_data["referer_url"]."</a><br>"; if(($page_data['received'] == true) && ($page_data['http_status_code'] == 200)){  print 
"Content received: ".$page_data["bytes_received"]." bytes"; $warning = ""; $Bresult = mysql_query("SELECT `domain` FROM `banned`"); print mysql_error(); if(mysql_num_rows($Bresult)){  while($Brow = mysql_fetch_array($Bresult)){  if(strpos($page_data["url"],$Brow['domain'])){ $warning .= "<p><font color=red>Warning: banned URL match!</font><p>"; }  }  }  $Bresult = mysql_query("SELECT `keyword` FROM `bad_keywords`"); print mysql_error(); if(mysql_num_rows($Bresult)){  while ($Brow = mysql_fetch_array($Bresult)){  if(strpos($page_data['source'],$Brow['keyword'])){ $warning .= "<p><font color=red>Warning: banned Keyword match in content!</font><p>"; }  }  } if($warning == ""){  preg_match( "'<title[^>]*?>(.*)</title>'siU", $page_data['source'], $match ); $title_len = 0; if($match[1]){  $title = mysql_real_escape_string(substr(trim($match[1]),0,cfg("spider_TitleMax"))); $title_len = strlen($match[1]); }  else{ $title = cfg("spider_Untitled"); }  $document = trim($page_data['source']); $search = array('@<script[^>]*?>.*?</script>@si', '@<[\/\!]*?[^<>]*?>@si', '@<![\s\S]*?--[ \t\n\r]*>@' ); $document = preg_replace("'<style[^>]*>.*</style>'siU",'',$document); $document = preg_replace($search, '', $document); $document = strip_tags($document); $document = preg_replace('/&nbsp;/', ' ',$document); $document = preg_replace('/\s+/', ' ',$document); $document = trim($document); $text = mysql_real_escape_string(trim(substr($document,$title_len,cfg("spider_DescriptionMax")))); $recip = eregi(cfg("recip_regex"),$page_data['source'])?1:0; $oPR=new ADPPageRank(); $gpr = $oPR->getRank($page_data["url"]); if($gpr == "-1"){ $gpr = "0"; } print "<p><a href=\"".$page_data["url"]."\">$title</a><br><font color=green>".$page_data["url"]."</font> (PR".$gpr.", Recip: "; if($recip == 1){ print "<font color=green><b>YES</b></font>"; }  else { print "<font color=red>NO</font>"; }  print ")<br>"; if(cfg("spider_DiplayContent") != "0"){ print $text; } if($demo == "1"){ print "<p>DISABLED writing 
to database IN DEMO MODE<p>"; } else { if(!isset($link_id) || $link_id == "0"){ if(!isset($url)){ $url = $_REQUEST['url']; } $result = mysql_query("SELECT `id` FROM `links` WHERE `url` = '".$url."'"); print mysql_error(); if(mysql_num_rows($result)){ while($row=mysql_fetch_array($result)){ $link_id = $row['0']; } } } if(mysql_num_rows(mysql_query("SELECT `url` FROM `pages` WHERE `url`='".$page_data["url"]."'")) == 0){ mysql_query("INSERT INTO `pages` ( `id` , `link_id` , `url` , `title` , `description` , `gpr` , `recip` , `date`, `size` ) VALUES ( NULL , '$link_id' , '".$page_data["url"]."', '$title', '$text', '$gpr', '$recip', '$today', '".$page_data["bytes_received"]."' );"); print mysql_error(); }  else{  print "<ul>Page already indexed. Updating ....</ul>"; mysql_query("DELETE FROM `pages` WHERE `url` = '".$page_data["url"]."'"); mysql_query("INSERT INTO `pages` ( `id` , `link_id` , `url` , `title` , `description` , `gpr` , `recip` , `date`, `size` ) VALUES ( NULL , '$link_id' , '".$page_data["url"]."', '$title', '$text', '$gpr', '$recip', '$today', '".$page_data["bytes_received"]."' );"); print mysql_error(); }  }  }  else{  print $warning."This URL will not be added!"; }  print "<hr>"; }  else{  print "Content not received<hr>"; }  sleep(cfg("spider_pause")); flush(); }  } print "<b>Crawling:</b> <a href=\"$url\">$url</a><hr>"; if(cfg("spider_delete") == "1"){  print "Removing old entries .... 
"; if($demo == "1"){ print "<p>DISABLED IN DEMO MODE<p>"; } else {  $result = mysql_query("SELECT `id` FROM `links` WHERE `url` = '$url'"); print mysql_error(); if(mysql_num_rows($result)){  while($row = mysql_fetch_array($result)){  $link_id = $row['id']; }  mysql_query("DELETE FROM `pages` WHERE `link_id` = '$link_id'"); print mysql_error(); }  }  print " Done!<hr>"; } $crawler = &new MyCrawler(); $crawler->setURL($url); if(cfg("spider_FollowMode")){ $crawler->setFollowMode(cfg("spider_FollowMode")); }  if(cfg("spider_FollowMatch")){ $crawler->addFollowMatch(cfg("spider_FollowMatch")); }  if(cfg("spider_NonFollowMatch")){ $crawler->addNonFollowMatch(cfg("spider_NonFollowMatch")); }  if(cfg("spider_FollowRedirects") == "0"){ $FollowRedirects = "false"; } else{ $FollowRedirects = "true"; }  $crawler->setFollowRedirects($FollowRedirects); if(cfg("spider_FollowRedirectsTillContent") == "0"){ $FollowRedirectsTillContent = "false"; } else{ $FollowRedirectsTillContent = "true"; }  $crawler->setFollowRedirectsTillContent($FollowRedirectsTillContent); if(cfg("spider_ReceiveContentType")){ $crawler->addReceiveContentType(cfg("spider_ReceiveContentType")); }  if(cfg("spider_PageLimit")){ $crawler->setPageLimit(cfg("spider_PageLimit")); }  if(cfg("spider_TrafficLimit")){ $crawler->setTrafficLimit(cfg("spider_TrafficLimit") * 1024); }  if(cfg("spider_ContentSizeLimit")){ $crawler->setContentSizeLimit(cfg("spider_ContentSizeLimit") * 1024); }  if(cfg("spider_ConnectionTimeout")){ $crawler->setConnectionTimeout(cfg("spider_ConnectionTimeout")); }  if(cfg("spider_StreamTimeout")){ $crawler->setStreamTimeout(cfg("spider_StreamTimeout")); }  if(cfg("spider_CookieHandling") == "0"){ $CookieHandling = "false"; } else{ $CookieHandling = "true"; }  $crawler->setCookieHandling($CookieHandling); $crawler->go(); $report = $crawler->getReport(); print "<p>Report:<br>"; if($report["traffic_limit_reached"]==true){ print "Traffic-limit reached <br>"; }  
if($report["file_limit_reached"]==true){ print "File-limit reached <br>"; }  print "Links followed: ".$report["links_followed"]."<br>"; print "Files received: ".$report["files_received"]."<br>"; print "Bytes received: ".$report["bytes_received"]."<br>"; print "<hr>Finished!"; }
?>
Return current item: Astanda Open Directory Project