Location: PHPKode > scripts > Spy Links > spylinks/spylin.php
<?php
//
// Scriptol Spy Links 1.1
// (c) 2009 Denis Sureau - Scriptol.com
//
// Free under the GNU GPL 3 License.
// Requires the PHP interpreter.
// Sources are compiled with the Scriptol PHP compiler 7.0
//
// The program checks the social aspect of a website through external links.
// Read the manual for details of use at:
// http://www.scriptol.com/scripts/spy-links.php.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with this program.  If not, see <http://www.gnu.org/licenses/>.
//
include_once("path.php");
include_once("dom.php");
include_once("url.php");
// ---- Option flags -------------------------------------------------------
// $SPECIFICFLAG and $BROKEN are switched on by the -s and -c options in
// processCommand(); $PROCESSDEFAULT is read by httpCheck().
$RECURSE=$SPECIFICFLAG=$BROKEN=false;
$PROCESSDEFAULT=true;

// ---- Limits -------------------------------------------------------------
$LIMIT=-1;      // maximal number of pages to scan; a negative value means "all"
$TOPLIMIT=10;   // number of top linked sites shown in the final report

// ---- Counters updated while links are checked -----------------------------
$linkcount=$brocount=$elinks=$ilinks=0;

// ---- Current crawl context ------------------------------------------------
$hostsite=$specific=$currentPage="";

// ---- Working tables -------------------------------------------------------
$toplinks=array();
$params=array();      // receives the "server" entry in processCommand()
$checked=array();     // link => status of every link already checked
$scanned=array();     // page => 200 for every page already scanned
$domains=array();     // external domain => number of links pointing to it
$retrieved=array();   // pages found to link to the site given with -s
/**
 * Prints the command-line help text, then terminates the script with
 * exit status 0. This function never returns to its caller.
 */
function usage()
{
   $help=array(
      "",
      "Spy Links - (c) 2009 Scriptol.com - Freeware",
      "--------------------------------------------",
      "Syntax:",
      "  solp spylin [options] url",
      "Options:",
      "  -{n}       maximal number of pages, default all.",
      "  -t{n}      change number of top site to display.",
      "  -s{domain} search for links to a specific website.",
      "  -c         check links, slower.",
      "  -v         verbose, display more infos.",
      "  -q         quiet, display nothing.",
      "Arguments:",
      "  url: http address of a page, usually the home page.",
      "Logs stored into 'links.log'.",
      "More info at: http://www.scriptol.com/scripts/",
   );
   foreach($help as $line)
   {
      echo $line, "\n";
   }
   exit(0);
}

/**
 * Splits an absolute URL into its site part and the path that follows it.
 *
 * The first "/" found at or after offset 8 (i.e. past "http://" or
 * "https://") marks the end of the site part.
 *
 * When there is no such slash, the URL is accepted as a bare site only if
 * its extension (as reported by Path::getExtension) is NOT listed in the
 * global $extensions table; otherwise the script is aborted with die().
 * NOTE(review): $extensions is defined outside this file; presumably it is
 * the list of page extensions (".html", ".php", ...) - confirm.
 *
 * @param string $url  Absolute URL, protocol included.
 * @return array       array(site, filename); filename is "" for a bare site.
 */
function splitSite($url)
{
   global $extensions;

   // strpos() rejects an offset beyond the end of the string (ValueError on
   // PHP 8, warning before), so only search when the URL is long enough.
   $pos=(strlen($url)>8) ? strpos($url,'/',8) : false;
   if($pos===false)
   {
      $ext=Path::getExtension($url);
      if(!is_array($extensions)||!in_array($ext,$extensions))
      {
         return array($url,"");
      }
      die("$url not a valid url");
   }
   $site=substr($url,0,$pos);
   $filename=substr($url,$pos+1);
   return array($site,$filename);
}

/**
 * Returns the registrable-looking domain of a URL: the lower-cased host
 * reduced to its last two dot-separated labels
 * ("http://www.Example.com/x" gives "example.com").
 *
 * Hosts of four characters or fewer are returned unchanged, and a URL
 * without a host (for instance a relative link) yields "".
 *
 * The previous implementation compared the two dot-suffixes as strings,
 * i.e. lexicographically, so a host such as "www.apple.com" was never
 * shortened (".apple.com" sorts before ".com"). The labels are now counted.
 *
 * Limitation kept from the original design: multi-part suffixes are not
 * recognised, so "www.example.co.uk" is reduced to "co.uk".
 *
 * @param string $url  URL to analyse.
 * @return string      Lower-cased domain, or "" when the URL has no host.
 */
function getDomain($url)
{
   // parse_url() gives null (or false) when there is no host component.
   $site=strtolower((string)parse_url($url,PHP_URL_HOST));

   if(strlen($site)>4)
   {
      $labels=explode(".",$site);
      if(count($labels)>2)
      {
         $site=implode(".",array_slice($labels,-2));
      }
   }
   return $site;
}

/**
 * Tells whether a URL belongs to the site being crawled.
 *
 * The URL is lower-cased and must start with the global $website (which
 * main() stores in lower case). In addition the match has to end on a host
 * boundary: the end of the URL, or one of "/", "?", "#", ":". Without that
 * check "http://example.com.evil.org/" was reported as internal to
 * "http://example.com" and the crawler would have followed it.
 *
 * An unset or empty $website matches nothing.
 *
 * @param string $url  Absolute URL of the link.
 * @return bool        true when the link points into the crawled site.
 */
function isInternal($url)
{
   global $website;
   if(!is_string($website)||$website==="")
   {
      return false;
   }
   $url=strtolower($url);
   $l=strlen($website);

   if(substr($url,0,$l)!==$website)
   {
      return false;
   }
   // Exact match, or the base itself already ends on a path separator.
   if(strlen($url)===$l||substr($website,-1)==="/")
   {
      return true;
   }
   // The character right after the base must close the host part.
   return strpos("/?#:",$url[$l])!==false;
}

/**
 * Checks one link and updates the global statistics.
 *
 * Steps, in order:
 *  - $linkcount is incremented for every call, cached or not;
 *  - a link already present in $checked returns its stored status;
 *  - with the -c option ($BROKEN) the status comes from sockAccess(),
 *    otherwise the link is assumed to be reachable (200);
 *    NOTE(review): sockAccess() is defined outside this file; presumably it
 *    returns the HTTP status code as an integer - confirm;
 *  - a status other than 200 increments $brocount (and prints "Broken" for
 *    a 404 unless quiet) and stops there;
 *  - a link whose domain equals $hostsite counts as internal ($ilinks);
 *  - any other link counts as external ($elinks) and its domain counter in
 *    $domains is incremented;
 *  - when a specific site is searched (-s) and the domain matches, the
 *    current page is recorded once in $retrieved.
 *
 * @param string $url  Absolute URL of the link.
 * @return int         Status of the link (200 when considered valid).
 */
function checkLink($url)
{
   global $linkcount,$brocount,$ilinks,$elinks;
   global $DEBUG,$QUIET,$VERBOSE,$BROKEN,$FORCERETRY;
   global $checked,$domains,$hostsite;
   global $SPECIFICFLAG,$specific,$currentPage,$retrieved;

   $d=getDomain($url);
   $linkcount+=1;
   if($DEBUG)
   {
      echo "Checking $url", "\n";
   }
   if(is_array($checked)&&array_key_exists($url,$checked))
   {
      return intVal($checked[$url]);
   }
   $status=$BROKEN ? sockAccess($url,$FORCERETRY) : 200;
   $checked[$url]=$status;

   // Progress dots are only shown in the default (neither -q nor -v) mode.
   if(!$QUIET&&!$VERBOSE)
   {
      echo ".";
   }
   if($status!=200)
   {
      $brocount+=1;
      if((($status===404)&&!$QUIET))
      {
         echo "Broken", " ", $url, "\n";
      }
      return $status;
   }
   if($d===$hostsite)
   {
      $ilinks+=1;
      return $status;
   }
   $elinks+=1;

   // isset() avoids reading an undefined key the first time a domain is
   // seen (that read raised a notice, a warning since PHP 8).
   $n=isset($domains[$d]) ? intVal($domains[$d]) : 0;
   $domains[$d]=$n+1;

   if($SPECIFICFLAG===true&&$d===$specific)
   {
      if(!in_array($currentPage,$retrieved,true))
      {
         array_push($retrieved,$currentPage);
      }
      if($VERBOSE)
      {
         echo "\n", " ", $specific, " ", "linked in", " ", $currentPage, "\n";
      }
   }
   return $status;
}

/**
 * Loads an HTML page and collects the href value of every <a> element
 * that has one.
 *
 * The page becomes the global $currentPage. When the document cannot be
 * loaded, $brocount is incremented and an empty list is returned; in
 * verbose mode the failure is reported together with the referring page.
 *
 * @param string $fname   Path or URL of the page to load.
 * @param string $caller  Page (or origin) that referenced $fname; only used
 *                        in the error message.
 * @return array          List of href strings, in document order.
 */
function pageScan($fname,$caller)
{
   global $VERBOSE;
   if($VERBOSE)
   {
      echo "Scanning", " ", $fname, "\n";
   }
   global $currentPage;
   $currentPage=$fname;

   $doc=new DOMDocument();
   // Parser warnings on malformed HTML are deliberately silenced.
   $loaded=@$doc->loadHTMLFile($fname);
   if($loaded===false)
   {
      if($VERBOSE)
      {
         echo "Error \"$fname\" not found in $caller", "\n";
      }
      global $brocount;
      $brocount+=1;
      return array();
   }

   $hrefs=array();
   foreach($doc->getElementsByTagName("a") as $anchor)
   {
      if(!$anchor->hasAttribute("href"))
      {
         continue;
      }
      $hrefs[]=$anchor->getAttribute("href");
   }
   return $hrefs;
}

/**
 * Scans one page, checks every link found in it, then recurses into the
 * internal links that were checked with status 200.
 *
 * Nothing is done when the page budget ($LIMIT) is exhausted, when $page is
 * blank, starts with "." or was already scanned, or when its extension is
 * not listed in the global $extensions table.
 *
 * Changes compared with the original:
 *  - string and array offsets use [] instead of {} (the curly form is a
 *    parse error since PHP 8.0);
 *  - an empty href no longer reads the uninitialised offset 0;
 *  - the pending internal links are walked with foreach, so an empty list
 *    is no longer probed with a `false` key.
 *
 * Helpers hasProtocol(), findDefault(), display() and Path::merge() are
 * defined outside this file.
 *
 * @param string $page    URL of the page to scan.
 * @param string $caller  Page that linked to $page (used in messages).
 * @return void
 */
function httpCheck($page,$caller)
{
   global $LIMIT,$scanned,$checked,$DEBUG,$VERBOSE;
   global $differed,$DIFFEREDFLAG,$extensions,$PROCESSDEFAULT;

   // Page budget set with the -{n} option.
   if($LIMIT>=0&&count($scanned)>=$LIMIT)
   {
      return;
   }
   if(trim($page) ==false)
   {
      return;
   }
   if($page[0]===".")
   {
      return;
   }
   if(array_key_exists($page,$scanned))
   {
      return;
   }
   $scanned[$page]=200;
   $checked[$page]=200;

   if($DEBUG)
   {
      echo "Entering $page ", "\n";
   }
   // Page heading kept in globals; presumably printed later by display().
   $differed="\n$page\n".str_repeat("-",strlen($page));
   $DIFFEREDFLAG=true;

   if(substr($page,-1,1)==="/")
   {
      // A directory URL is its own base for relative links.
      $reldir=$page;
      $src="";
   }
   else
   {
      $infos=pathinfo($page);
      $reldir=isset($infos['dirname']) ? strtolower($infos['dirname']) : "";
      $src=isset($infos['filename']) ? strtolower($infos['filename']) : "";
      $ext=isset($infos['extension']) ? strtolower($infos['extension']) : "";
      if($ext!=false)
      {
         $ext=".".$ext;
         if(!in_array($ext,$extensions))
         {
            return;
         }
         $src.=$ext;
      }
   }
   if($DEBUG)
   {
      echo "Processing  $reldir/$src", "\n";
   }

   $links=pageScan($page,$caller);
   if(count($links)===0)
   {
      return;
   }

   $todo=array();
   foreach($links as $link)
   {
      // Pure fragment links stay on the same page.
      if(substr($link,0,1)==="#")
      {
         continue;
      }
      // Drop a trailing fragment.
      $p=strpos($link,"#");
      if($p!==false)
      {
         $link=substr($link,0,$p);
      }
      if(!hasProtocol($link))
      {
         if(strlen($link)>6)
         {
            if(substr($link,0,3)==="../")
            {
               if($VERBOSE)
               {
                  echo "Should be absolute: $link in $page", "\n";
               }
               continue;
            }
            if(substr($link,0,7)==="mailto:")
            {
               if($DEBUG)
               {
                  echo "Skipped mailto.", "\n";
               }
               continue;
            }
         }
         $link=Path::merge($reldir,$link);
      }
      if(trim($link) ==false)
      {
         continue;
      }
      if(array_key_exists($link,$checked))
      {
         display($checked[$link],$link,false);
         continue;
      }
      if(isInternal($link))
      {
         if($PROCESSDEFAULT&&substr($link,-1)==="/")
         {
            // Mark the default page of a directory as already checked.
            $home=findDefault($link);
            if(@array_key_exists($home,$checked)===false)
            {
               $checked[$home]=200;
            }
         }
         array_push($todo,$link);
      }
      checkLink($link);
   }

   // Recurse into internal pages that are valid and not scanned yet.
   foreach($todo as $link)
   {
      if(array_key_exists($link,$scanned))
      {
         continue;
      }
      if(!array_key_exists($link,$checked))
      {
         continue;
      }
      if($checked[$link]===200)
      {
         httpCheck($link,$page);
      }
   }

   return;
}

/**
 * Starts the crawl from the page given on the command line.
 * A URL ending with "/" is first replaced by the value findDefault()
 * returns for it.
 *
 * @param string $page  Start URL.
 * @return void
 */
function httpProcess($page)
{
   $start=(substr($page,-1)==="/") ? findDefault($page) : $page;
   httpCheck($start,"command line");
}

/**
 * Parses the command-line arguments and stores the result in globals.
 *
 * Recognised options (matched on their first two characters):
 *   -v verbose, -q quiet, -u debug, -c check links, -y force retry,
 *   -s{domain} search links to a specific site, -t{n} size of the top list,
 *   -{n} maximal number of pages (n > 0).
 * An argument starting with "http:" always becomes the start URL; any other
 * non-option argument becomes the start URL only when none is set yet.
 * The start URL ends up in $server and in $params["server"].
 *
 * usage() (which exits) is called for a missing argument, an argument
 * shorter than two characters, an empty -s value or an unknown argument.
 * The script dies when no URL was given.
 *
 * Fixes: -t{n} now stores the parsed number in $TOPLIMIT (it used to store
 * 0 whatever the value), and string offsets use [] instead of {} (a parse
 * error since PHP 8.0).
 *
 * @param int   $argnum     Argument count as received by the script ($argc).
 * @param array $arguments  Arguments without the script name.
 * @return void
 */
function processCommand($argnum,$arguments)
{
   global $VERBOSE,$QUIET,$DEBUG,$BROKEN,$FORCERETRY;
   global $SPECIFICFLAG,$specific,$TOPLIMIT,$LIMIT,$server,$params;

   if($argnum<2)
   {
      usage();
   }
   foreach($arguments as $param)
   {
      if(strlen($param)<2)
      {
         usage();
      }
      $opt=substr($param,0,2);

      if($opt==="-v")
      {
         $VERBOSE=true;
         continue;
      }
      if($opt==="-q")
      {
         $QUIET=true;
         continue;
      }
      if($opt==="-u")
      {
         $DEBUG=true;
         continue;
      }
      if($opt==="-c")
      {
         $BROKEN=true;
         continue;
      }
      if($opt==="-y")
      {
         $FORCERETRY=true;
         continue;
      }
      if($opt==="-s")
      {
         $SPECIFICFLAG=true;
         $specific=substr($param,2);
         if($specific==="")
         {
            usage();
         }
         continue;
      }
      if($opt==="-t")
      {
         $TOPLIMIT=intval(substr($param,2));
         continue;
      }

      if($param[0]==="-")
      {
         // "-{n}" with n > 0 is the page limit; anything else is unknown.
         $x=intval(substr($param,1));
         if($x>0)
         {
            $LIMIT=$x;
            continue;
         }
         echo "Unknown command $param", "\n";
         usage();
      }
      if(substr($param,0,5)==="http:")
      {
         $server=$param;
         continue;
      }
      if($server ==false)
      {
         $server=$param;
         continue;
      }
      echo "Unknown command $param", "\n";
      usage();
   }

   if($server ==false)
   {
      die("You must provide a URL.");
   }
   $params["server"]=$server;

   return;
}

/**
 * Rates the site from the number of external links (global $elinks)
 * relative to the number of scanned pages.
 *
 * The thresholds are tested in this order:
 *   fewer than pages/5  -> "black hole"
 *   fewer than pages/2  -> "egocentric"
 *   at least pages*5    -> "very friendly"
 *   at least pages*2    -> "friendly"
 *   anything else       -> "honest"
 *
 * @param int $pages  Number of pages scanned.
 * @return string     The rating label.
 */
function evaluate($pages)
{
   global $elinks;

   if($elinks<($pages/5))
   {
      $verdict="black hole";
   }
   elseif($elinks<($pages/2))
   {
      $verdict="egocentric";
   }
   elseif($elinks>=($pages*5))
   {
      $verdict="very friendly";
   }
   elseif($elinks>=($pages*2))
   {
      $verdict="friendly";
   }
   else
   {
      $verdict="honest";
   }
   return $verdict;
}

/**
 * Program entry point: parses the command line, crawls the site, writes a
 * summary to 'links.log' and prints the report.
 *
 * Fixes compared with the original:
 *  - $linkcount and $brocount were interpolated into the log before their
 *    `global` declarations had run, so the log always showed empty values;
 *    every global is now declared at the top of the function;
 *  - the three log records are terminated by a newline instead of being
 *    written as one run-together line;
 *  - a failure to open 'links.log' is reported and the log is skipped
 *    instead of calling fwrite() on false;
 *  - "N broken links" is printed with a space after the number;
 *  - the top list is read with array_slice(), which keeps the keys intact.
 *
 * NOTE(review): the original called substr($domain,0) when $domain ended
 * with "/", which is a no-op (presumably meant to strip the slash). The
 * dead statement was removed and the behaviour left unchanged - confirm.
 * NOTE(review): $domain is computed with substr($website,7), which assumes
 * an "http://" prefix - confirm how "https://" start URLs should be shown.
 *
 * The script exits with status 0 from inside this function when a specific
 * site is searched (-s).
 *
 * @param int   $argc  Number of command-line arguments.
 * @param array $argv  Command-line arguments, script name first.
 * @return int         0 when the report completed.
 */
function main($argc,$argv)
{
   global $website,$server,$params,$currentPage,$domain,$baseLength,$hostsite;
   global $VERBOSE,$DEBUG,$QUIET,$LIMIT,$TOPLIMIT,$BROKEN,$SPECIFICFLAG,$specific;
   global $log,$scanned,$domains,$retrieved;
   global $linkcount,$brocount,$elinks,$ilinks;

   processCommand($argc,array_slice($argv,1));
   $server=$params["server"];

   if(!hasProtocol($server))
   {
      $server="http://".$server;
   }
   $currentPage=$server;

   // Only the site part is needed here; the file part is ignored.
   $parts=splitSite($server);
   $website=strtolower(reset($parts));

   $domain=substr($website,7);
   $baseLength=intVal(strlen($domain)+7);

   $hostsite=getDomain($server);

   if($VERBOSE===true)
   {
      echo "Verbose mode enabled", "\n";
   }
   if($DEBUG===true)
   {
      echo "Debug mode enabled", "\n";
   }
   echo "Checking ";
   if($LIMIT>-1)
   {
      echo $LIMIT;
   }
   else
   {
      echo "all";
   }
   echo " pages on", " ", $domain, "\n";
   echo "Starting from $server", "\n";
   if($SPECIFICFLAG===true)
   {
      echo "Searching links to", " ", $specific, "\n";
   }

   $log=fopen("links.log","w");
   if($log===false&&!$QUIET)
   {
      echo "Warning: cannot open links.log for writing.", "\n";
   }
   httpProcess($server);

   $sp=count($scanned);
   $ranking=evaluate($sp);
   if($log!==false)
   {
      fwrite($log,"Site: ".strtoupper($ranking)."\n");
      fwrite($log,"$linkcount links checked in $sp pages.\n");
      fwrite($log,"$brocount broken or redirected links, ignored.\n");
      fclose($log);
   }
   if($QUIET)
   {
      return 0;
   }

   echo "\n";
   echo "This site is : ",strtoupper($ranking),".\n";
   echo $linkcount, " ", "links checked in $sp pages.", "\n";
   echo $elinks, " ", "external links found and", " ", count($domains), " ", "domains.", "\n";
   echo $ilinks, " ", "internal links.", "\n";
   if($BROKEN===true)
   {
      echo $brocount," broken links";
      if($brocount>($linkcount/50))
      {
         echo ", not seriously maintained.";
      }
      echo "\n";
   }

   if($SPECIFICFLAG===true)
   {
      $nt=count($retrieved);
      if($nt===0)
      {
         echo "No link found in $hostsite to $specific.", "\n";
         exit(0);
      }
      echo $specific, " ", "is linked $nt time, from:", "\n";
      foreach($retrieved as $source)
      {
         echo "-", " ", $source, "\n";
      }
      exit(0);
   }

   // Most linked domains first.
   arsort($domains);
   $top=intVal(min($TOPLIMIT,count($domains)));
   echo "Top", " ", $top, " ", "sites:", "\n";
   $rank=1;
   foreach(array_slice($domains,0,max($top,0),true) as $name=>$hits)
   {
      echo str_pad($rank,3," ",STR_PAD_LEFT),") ",$name," : ",$hits,"\n";
      $rank+=1;
   }
   return 0;
}

// Script entry point: run main() with the argument count and argument list
// of the current invocation.
main(intVal($argc),$argv);

?>
Return current item: Spy Links