<?php
//-------------------------------------------------//
// //
// ASTANDA DIRECTORY PROJECT [ADP] //
// Version 1.3b //
// License: SHAREWARE //
// http://www.astanda.com //
// © Copyright 2005, 2006, Pavel Golovko //
// //
//-------------------------------------------------//
// //
// Any attempt to remove or alter "Powered by ADP" //
// recip link will automatically waive your rights //
// to any use of ADP software! //
// //
// Reverse engineering and redistribution of //
// ADP code or any part of its code is strictly //
// prohibited! //
// //
//-------------------------------------------------//
// //
// ADP is distributed in the hope that it will be //
// useful, but WITHOUT ANY WARRANTY; without even //
// the implied warranty of MERCHANTABILITY or //
// FITNESS FOR A PARTICULAR PURPOSE. //
// //
//-------------------------------------------------//
// ---------------------------------------------------------------------------
// Spider entry point.
//
// Dispatches on the request parameters:
//   url=<address>        crawl a single URL
//   full=1[&num=<n>]     crawl every row of `links`, one link per request
//   date=<Y-m-d>         re-crawl links that still have pages stamped <date>
//   cat_id=<id>[&num=n]  crawl the links of one category, one per request
//
// The multi-request modes emit a "Next" link plus a meta refresh so the
// browser walks through the links one request at a time.
//
// Every request value is escaped before it reaches SQL
// (mysql_real_escape_string / integer cast) or HTML (htmlspecialchars /
// urlencode).
// ---------------------------------------------------------------------------

// "1" switches off all database writes (demo mode).
$demo = "0";

require_once("database.php");
require_once("functions.php");
include("pagerank.php");

set_time_limit(cfg("admin_set_time_limit"));

if (isset($_REQUEST['url'])) {

    indexURL($_REQUEST['url']);

} elseif (isset($_REQUEST['full'])) {

    // Offset of the link to crawl; forced to a non-negative integer because it
    // is interpolated into the LIMIT clause.
    $num = isset($_REQUEST['num']) ? max(0, (int) $_REQUEST['num']) : 0;
    $total_num = 0;
    $url = "";

    $result = mysql_query("SELECT COUNT(`id`) FROM `links`");
    print mysql_error();
    if ($result && ($row = mysql_fetch_array($result))) {
        $total_num = (int) $row[0];
    }

    $result = mysql_query("SELECT `url` FROM `links` LIMIT $num, 1");
    print mysql_error();
    if ($result && ($row = mysql_fetch_array($result))) {
        $url = $row['url'];
    }

    $next = $num + 1;
    if ($total_num > $next) {
        print "<P><center><a href=\"spider.php?full=1&num=".$next."\">Next >>></a></center>";
        print "<meta http-equiv=\"refresh\" content=\"3;url=spider.php?full=1&num=".$next."\"><hr>";
    }

    // Nothing to crawl when the offset is past the last link.
    if ($url != "") {
        indexURL($url);
    }

    if ($total_num > $next) {
        print "<hr><center><a href=\"spider.php?full=1&num=".$next."\">Next >>></a></center>";
    } else {
        print"<hr><h2>FINISHED FULL CRAWL !!!</h2><p> <p>";
    }

} elseif (isset($_REQUEST['date'])) {

    if ($_REQUEST['date'] == date("Y-m-d")) {
        exit("Todays indexes are not available for reindexing, because the spider will fall into an endless loop recrawling the same pages over and over.");
    }

    // $link_id and $url are read as globals by indexURL() / handlePageData().
    $link_id = "0";
    $url = "";
    $dateSql = mysql_real_escape_string($_REQUEST['date']);
    $dateParam = urlencode($_REQUEST['date']);

    // Pick any one page still carrying the requested date.
    $result = mysql_query("SELECT `link_id` FROM `pages` WHERE `date` = '".$dateSql."' LIMIT 1");
    print mysql_error();
    if ($result && ($row = mysql_fetch_array($result))) {
        $link_id = $row[0];
    }

    if ($link_id <> "0") {
        $result = mysql_query("SELECT `url` FROM `links` WHERE `id` = '".mysql_real_escape_string($link_id)."'");
        print mysql_error();
        if ($result && ($row = mysql_fetch_array($result))) {
            $url = $row[0];
        }
    }

    if ($link_id <> "0" && $url != "") {
        print "<P><center><a href=\"spider.php?date=".$dateParam."\">Next >>></a></center>";
        print "<meta http-equiv=\"refresh\" content=\"3;url=spider.php?date=".$dateParam."\"><hr>";
        indexURL($url);
    } elseif ($link_id <> "0") {
        // The page row references a link that no longer exists. Crawling an
        // empty URL would never re-date that page, so the auto-refresh would
        // loop forever; stop here instead.
        print "<hr>No link found for link_id ".htmlspecialchars($link_id, ENT_QUOTES)."; date crawl stopped.<p>";
    } else {
        print"<hr><h2>FINISHED DATE CRAWL !!!</h2><p> <p>";
    }

} elseif (isset($_REQUEST['cat_id'])) {

    $num = isset($_REQUEST['num']) ? max(0, (int) $_REQUEST['num']) : 0;
    $total_num = 0;
    $url = "";
    $catSql = mysql_real_escape_string($_REQUEST['cat_id']);
    $catParam = urlencode($_REQUEST['cat_id']);

    $result = mysql_query("SELECT COUNT(`id`) FROM `links` WHERE `category_id` = '".$catSql."'");
    print mysql_error();
    if ($result && ($row = mysql_fetch_array($result))) {
        $total_num = (int) $row[0];
    }

    $result = mysql_query("SELECT `url` FROM `links` WHERE `category_id` = '".$catSql."' LIMIT $num, 1");
    print mysql_error();
    if ($result && ($row = mysql_fetch_array($result))) {
        $url = $row[0];
    }

    $next = $num + 1;
    if ($total_num > $next) {
        print "<P><center><a href=\"spider.php?cat_id=".$catParam."&num=".$next."\">Next >>></a></center>";
        print "<meta http-equiv=\"refresh\" content=\"3;url=spider.php?cat_id=".$catParam."&num=".$next."\"><hr>";
        if ($demo == "0") {
            // Stamp the category with today's crawl date.
            mysql_query("UPDATE `categories` SET `date` = '".date("Y-m-d")."' WHERE `categories`.`id` = '".$catSql."' LIMIT 1");
            print mysql_error();
        }
    }

    if ($url <> "") {
        indexURL($url);
    }

    if ($total_num > $next) {
        print "<hr><center><a href=\"spider.php?cat_id=".$catParam."&num=".$next."\">Next >>></a></center>";
    } else {
        print"<hr><h2>FINISHED SINGLE CATEGORY CRAWL !!!</h2><p> <p>";
    }

} else {
    print "No input was given.";
}

/**
 * Crawls one link and (re)indexes every page the crawler receives.
 *
 * Prints progress as HTML while running. When cfg("spider_delete") is "1" the
 * pages previously stored for the link are removed first. Crawler limits and
 * modes are taken from the cfg("spider_*") settings. Nothing is written to
 * the database while the global $demo is "1".
 *
 * Uses the globals $link_id and $demo; the nested crawler class also reads
 * the global $url to find the owning link of a page.
 *
 * @param string $url Start URL of the crawl.
 * @return void
 */
function indexURL($url){
    global $link_id, $demo;

    // The class is declared at call time because its parent comes from an
    // included file; the guard keeps a second indexURL() call from failing
    // with a class redeclaration error.
    if (!class_exists('myCrawler')) {
        class myCrawler extends ADPcrawler {

            /**
             * Called by the crawler for every requested page: prints the
             * page status, screens it against the `banned` and
             * `bad_keywords` tables and stores title, description, PageRank
             * and recip-link flag in `pages`.
             *
             * @param array $page_data Page information; the keys read here
             *                         are url, referer_url, header, received,
             *                         http_status_code, bytes_received and
             *                         source.
             */
            function handlePageData($page_data){
                global $link_id, $demo, $url;

                $today = date("Y-m-d");
                $pageUrl = $page_data["url"];
                $pageUrlHtml = htmlspecialchars($pageUrl, ENT_QUOTES);
                $refererHtml = htmlspecialchars($page_data["referer_url"], ENT_QUOTES);

                print "URL: <a target=\"_blank\" href=\"".$pageUrlHtml."\">".$pageUrlHtml."</a><br>";
                print "Status: ".htmlspecialchars(strtok($page_data["header"], "\n"), ENT_QUOTES)."<br>";
                print "Referer-URL: <a target=\"_blank\" href=\"".$refererHtml."\">".$refererHtml."</a><br>";

                if (($page_data['received'] == true) && ($page_data['http_status_code'] == 200)) {
                    print "Content received: ".$page_data["bytes_received"]." \nbytes";

                    $warning = "";

                    // Banned domains. strpos() returns 0 for a match at the
                    // very start, so compare against false explicitly; empty
                    // needles are skipped so they never match everything.
                    $Bresult = mysql_query("SELECT `domain` FROM `banned`");
                    print mysql_error();
                    if ($Bresult) {
                        while ($Brow = mysql_fetch_array($Bresult)) {
                            if ($Brow['domain'] != "" && strpos($pageUrl, $Brow['domain']) !== false) {
                                $warning .= "<p><font color=red>Warning: banned URL match!</font><p>";
                            }
                        }
                    }

                    // Banned keywords anywhere in the page source.
                    $Bresult = mysql_query("SELECT `keyword` FROM `bad_keywords`");
                    print mysql_error();
                    if ($Bresult) {
                        while ($Brow = mysql_fetch_array($Bresult)) {
                            if ($Brow['keyword'] != "" && strpos($page_data['source'], $Brow['keyword']) !== false) {
                                $warning .= "<p><font color=red>Warning: banned Keyword match in content!</font><p>";
                            }
                        }
                    }

                    if ($warning == "") {
                        // Title: first <title> element, truncated to the configured length.
                        $match = array();
                        preg_match("'<title[^>]*?>(.*)</title>'siU", $page_data['source'], $match);
                        $title_len = 0;
                        if (!empty($match[1])) {
                            $title = mysql_real_escape_string(substr(trim($match[1]), 0, cfg("spider_TitleMax")));
                            $title_len = strlen($match[1]);
                        } else {
                            $title = mysql_real_escape_string(cfg("spider_Untitled"));
                        }

                        // Description: page text with styles, scripts, tags
                        // and comments removed and whitespace collapsed.
                        $document = trim($page_data['source']);
                        $search = array(
                            '@<script[^>]*?>.*?</script>@si',
                            '@<[\/\!]*?[^<>]*?>@si',
                            '@<![\s\S]*?--[ \t\n\r]*>@'
                        );
                        $document = preg_replace("'<style[^>]*>.*</style>'siU", '', $document);
                        $document = preg_replace($search, '', $document);
                        $document = strip_tags($document);
                        // NOTE(review): as written this replaces a space with a
                        // space; the pattern may originally have been an
                        // entity such as &nbsp; - confirm against upstream.
                        $document = preg_replace('/ /', ' ', $document);
                        $document = preg_replace('/\s+/', ' ', $document);
                        $document = trim($document);
                        // The excerpt starts $title_len bytes into the text.
                        $text = mysql_real_escape_string(trim(substr($document, $title_len, cfg("spider_DescriptionMax"))));

                        // NOTE(review): eregi() no longer exists in PHP 7+;
                        // kept because this file targets the legacy mysql_*
                        // runtime and the configured pattern is POSIX syntax.
                        $recip = eregi(cfg("recip_regex"), $page_data['source']) ? 1 : 0;

                        $oPR = new ADPPageRank();
                        $gpr = $oPR->getRank($pageUrl);
                        if ($gpr == "-1") {
                            $gpr = "0";
                        }

                        // NOTE(review): $title and $text come from the crawled
                        // page and are printed without HTML escaping, as before.
                        print "<p><a href=\"".$pageUrlHtml."\">$title</a><br><font color=green>".$pageUrlHtml."</font> (PR".$gpr.", Recip: ";
                        if ($recip == 1) {
                            print "<font color=green><b>YES</b></font>";
                        } else {
                            print "<font color=red>NO</font>";
                        }
                        print ")<br>";

                        if (cfg("spider_DiplayContent") != "0") {
                            print $text;
                        }

                        if ($demo == "1") {
                            print "<p>DISABLED writing to database IN DEMO MODE<p>";
                        } else {
                            // Resolve the owning link once; later pages reuse
                            // the global $link_id.
                            if (!isset($link_id) || $link_id == "0") {
                                if (!isset($url)) {
                                    $url = $_REQUEST['url'];
                                }
                                $result = mysql_query("SELECT `id` FROM `links` WHERE `url` = '".mysql_real_escape_string($url)."'");
                                print mysql_error();
                                if ($result) {
                                    while ($row = mysql_fetch_array($result)) {
                                        $link_id = $row[0];
                                    }
                                }
                            }
                            if (!isset($link_id)) {
                                $link_id = "0";
                            }

                            $pageUrlSql = mysql_real_escape_string($pageUrl);
                            $insertSql = "INSERT INTO `pages` ( `id` , `link_id` , `url` , `title` , `description` , `gpr` , `recip` , `date`, `size` ) VALUES ( NULL , '"
                                .mysql_real_escape_string($link_id)."' , '".$pageUrlSql."', '$title', '$text', '"
                                .mysql_real_escape_string($gpr)."', '$recip', '$today', '".(int) $page_data["bytes_received"]."' );";

                            // Replace an existing row for this URL, otherwise just insert.
                            $existing = mysql_query("SELECT `url` FROM `pages` WHERE `url`='".$pageUrlSql."'");
                            if ($existing && mysql_num_rows($existing) > 0) {
                                print "<ul>Page already indexed. Updating ....</ul>";
                                mysql_query("DELETE FROM `pages` WHERE `url` = '".$pageUrlSql."'");
                            }
                            mysql_query($insertSql);
                            print mysql_error();
                        }
                    } else {
                        print $warning."This URL will not be added!";
                    }
                    print "<hr>";
                } else {
                    print "Content not received<hr>";
                }

                // Pause between pages, then push the output to the browser.
                sleep(cfg("spider_pause"));
                flush();
            }
        }
    }

    $urlHtml = htmlspecialchars($url, ENT_QUOTES);
    print "<b>Crawling:</b> <a href=\"".$urlHtml."\">".$urlHtml."</a><hr>";

    if (cfg("spider_delete") == "1") {
        print "Removing old entries .... \n";
        if ($demo == "1") {
            print "<p>DISABLED IN DEMO MODE<p>";
        } else {
            $result = mysql_query("SELECT `id` FROM `links` WHERE `url` = '".mysql_real_escape_string($url)."'");
            print mysql_error();
            if ($result && mysql_num_rows($result)) {
                while ($row = mysql_fetch_array($result)) {
                    $link_id = $row['id'];
                }
                mysql_query("DELETE FROM `pages` WHERE `link_id` = '".mysql_real_escape_string($link_id)."'");
                print mysql_error();
            }
        }
        print " Done!<hr>";
    }

    $crawler = new myCrawler();
    $crawler->setURL($url);

    if (cfg("spider_FollowMode")) {
        $crawler->setFollowMode(cfg("spider_FollowMode"));
    }
    if (cfg("spider_FollowMatch")) {
        $crawler->addFollowMatch(cfg("spider_FollowMatch"));
    }
    if (cfg("spider_NonFollowMatch")) {
        $crawler->addNonFollowMatch(cfg("spider_NonFollowMatch"));
    }

    // Pass real booleans: the former strings "true"/"false" are both truthy
    // in PHP, so a setting of "0" could never switch a feature off.
    // NOTE(review): assumes the ADPcrawler setters take booleans - confirm.
    $crawler->setFollowRedirects(cfg("spider_FollowRedirects") != "0");
    $crawler->setFollowRedirectsTillContent(cfg("spider_FollowRedirectsTillContent") != "0");

    if (cfg("spider_ReceiveContentType")) {
        $crawler->addReceiveContentType(cfg("spider_ReceiveContentType"));
    }
    if (cfg("spider_PageLimit")) {
        $crawler->setPageLimit(cfg("spider_PageLimit"));
    }
    // The two size settings are multiplied by 1024 before being handed over.
    if (cfg("spider_TrafficLimit")) {
        $crawler->setTrafficLimit(cfg("spider_TrafficLimit") * 1024);
    }
    if (cfg("spider_ContentSizeLimit")) {
        $crawler->setContentSizeLimit(cfg("spider_ContentSizeLimit") * 1024);
    }
    if (cfg("spider_ConnectionTimeout")) {
        $crawler->setConnectionTimeout(cfg("spider_ConnectionTimeout"));
    }
    if (cfg("spider_StreamTimeout")) {
        $crawler->setStreamTimeout(cfg("spider_StreamTimeout"));
    }

    $crawler->setCookieHandling(cfg("spider_CookieHandling") != "0");

    $crawler->go();

    $report = $crawler->getReport();
    print "<p>Report:<br>";
    if ($report["traffic_limit_reached"] == true) {
        print "Traffic-limit reached <br>";
    }
    if ($report["file_limit_reached"] == true) {
        print "File-limit reached <br>";
    }
    print "Links followed: ".$report["links_followed"]."<br>";
    print "Files received: ".$report["files_received"]."<br>";
    print "Bytes received: ".$report["bytes_received"]."<br>";
    print "<hr>Finished!";
}
?>