<?php
//
// Scriptol Spy Links 1.1
// (c) 2009 Denis Sureau - Scriptol.com
//
// Free under the GNU GPL 3 License.
// Requires the PHP interpreter.
// Sources are compiled with the Scriptol PHP compiler 7.0
//
// The program checks the social aspect of a website through external links.
// Read the manual for details of use at:
// http://www.scriptol.com/scripts/spy-links.php.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//
include_once("path.php");
include_once("dom.php");
include_once("url.php");
// ---- Option flags (defaults; processCommand() may override some of them) ----
// $RECURSE is not read anywhere in this file.
// $SPECIFICFLAG is switched on by -s, $BROKEN (real link checking) by -c.
$RECURSE = $SPECIFICFLAG = $BROKEN = false;
// When true, httpCheck() resolves directory links ("…/") through findDefault().
$PROCESSDEFAULT = true;
// Maximum number of pages to scan; a negative value means no limit.
$LIMIT = -1;
// How many entries the final "Top sites" list shows.
$TOPLIMIT = 10;

// ---- Counters updated while crawling ----
// links seen, broken links/pages, external links, internal links
$linkcount = $brocount = $elinks = $ilinks = 0;

// ---- Crawl context ----
// domain of the start URL, domain searched with -s, page currently scanned
$hostsite = $specific = $currentPage = "";

// ---- Lookup tables ----
// $toplinks : not used in this file
// $params   : parsed command line ("server" => start URL)
// $checked  : url  => status recorded for the link
// $scanned  : page => 200 for every page already entered
// $domains  : external domain => number of links pointing to it
// $retrieved: pages that link to $specific
$toplinks = $params = $checked = $scanned = $domains = $retrieved = array();
/**
 * Prints the command-line help text and terminates the script with
 * exit status 0. This function never returns to its caller.
 */
function usage()
{
    // One entry per output line; the leading empty entry produces the
    // blank line that precedes the banner.
    $help = array(
        "",
        "Spy Links - (c) 2009 Scriptol.com - Freeware",
        "--------------------------------------------",
        "Syntax:",
        " solp spylin [options] url",
        "Options:",
        " -{n} maximal number of pages, default all.",
        " -t{n} change number of top site to display.",
        " -s{domain} search for links to a specific website.",
        " -c check links, slower.",
        " -v verbose, display more infos.",
        " -q quiet, display nothing.",
        "Arguments:",
        " url: http address of a page, usually the home page.",
        "Logs stored into 'links.log'.",
        "More info at: http://www.scriptol.com/scripts/",
    );
    echo implode("\n", $help), "\n";
    exit(0);
}
/**
 * Splits an absolute URL into its site part and the path that follows it.
 *
 * The first "/" found at byte offset 8 or later is taken as the separator
 * (presumably so that the two slashes of "http://" / "https://" are skipped).
 *
 * @param string $url Absolute URL, protocol included.
 * @return array array($site, $filename); $filename is "" when the URL has
 *               no path separator.
 *
 * When no separator exists and the extension reported by Path::getExtension()
 * is listed in the global $extensions, the script dies: the URL then looks
 * like a bare file name rather than a site.
 */
function splitSite($url)
{
    // strpos() rejects an offset beyond the end of the string (warning on
    // PHP 7.1+, ValueError on PHP 8), so only search when there is
    // something after offset 8.
    $pos = (strlen($url) > 8) ? strpos($url, '/', 8) : false;

    if ($pos === false) {
        global $extensions;
        $ext = Path::getExtension($url);
        if (!in_array($ext, $extensions)) {
            return array($url, "");
        }
        die("$url not a valid url");
    }

    return array(substr($url, 0, $pos), substr($url, $pos + 1));
}
/**
 * Returns the lower-cased host of a URL reduced to its last two labels,
 * e.g. "http://www.abc.com/x" gives "abc.com".
 *
 * Hosts of four characters or fewer, and hosts with at most one dot, are
 * returned unchanged. A URL without a host yields "".
 *
 * @param string $url Any URL.
 * @return string Reduced host name, or "" when the URL has no host.
 */
function getDomain($url)
{
    // parse_url() gives null/false when there is no host; normalise to "".
    $site = strtolower((string) parse_url($url, PHP_URL_HOST));

    if (strlen($site) > 4) {
        $first = strstr($site, ".");   // suffix starting at the first dot
        $last = strrchr($site, ".");   // suffix starting at the last dot

        // Strip leading labels until the first dot IS the last dot.
        // The suffixes must be compared by length: comparing them as
        // strings is lexicographic, which left hosts such as
        // "www.abc.com" unreduced because ".abc.com" sorts before ".com".
        while ($first !== false && strlen($first) > strlen($last)) {
            $site = substr($first, 1);
            $first = strstr($site, ".");
        }
    }

    return $site;
}
/**
 * Tells whether a URL belongs to the crawled site, i.e. whether its
 * lower-cased form starts with the global $website prefix.
 *
 * @param string $url URL to classify.
 * @return bool true when $url begins with $website, false otherwise.
 */
function isInternal($url)
{
    global $website;

    // Take as many leading bytes of the URL as the site prefix is long
    // and compare them strictly with the prefix.
    $head = substr(strtolower($url), 0, strlen($website));

    return $website === $head;
}
/**
 * Records one link: counts it, determines its status, classifies it as
 * internal or external and updates the per-domain statistics.
 *
 * Side effects (all on globals): $linkcount, $brocount, $ilinks, $elinks,
 * $checked, $domains and, when -s is active, $retrieved. Prints a progress
 * dot unless quiet or verbose mode is on.
 *
 * @param string $url Absolute URL of the link.
 * @return int|mixed Status stored for the link. 200 is assumed when link
 *                   checking (-c) is off; otherwise the value comes from
 *                   sockAccess(), defined outside this file (presumably an
 *                   HTTP status code; verify against url.php).
 */
function checkLink($url)
{
    global $linkcount, $brocount, $ilinks, $elinks;
    global $checked, $domains, $retrieved;
    global $hostsite, $specific, $currentPage;
    global $DEBUG, $BROKEN, $FORCERETRY, $QUIET, $VERBOSE, $SPECIFICFLAG;

    $d = getDomain($url);

    // Counted before the cache lookup, so repeated links are counted too.
    $linkcount += 1;
    if ($DEBUG) {
        echo "Checking $url", "\n";
    }

    // Already seen: reuse the recorded status, no statistics update.
    if (array_key_exists($url, $checked)) {
        return intval($checked[$url]);
    }

    $status = $BROKEN ? sockAccess($url, $FORCERETRY) : 200;
    $checked[$url] = $status;

    if (!$QUIET && !$VERBOSE) {
        echo ".";
    }

    if ($status != 200) {
        $brocount += 1;
        if ($status === 404 && !$QUIET) {
            echo "Broken", " ", $url, "\n";
        }
        return $status;
    }

    // Same reduced domain as the start URL: internal link.
    if ($d === $hostsite) {
        $ilinks += 1;
        return $status;
    }

    $elinks += 1;

    // isset() avoids reading an undefined array key the first time a
    // domain is met (a warning on PHP 8 that would pollute the output).
    $domains[$d] = isset($domains[$d]) ? intval($domains[$d]) + 1 : 1;

    if ($SPECIFICFLAG === true && $d === $specific) {
        // Remember each page linking to the searched domain only once.
        if (!in_array($currentPage, $retrieved)) {
            array_push($retrieved, $currentPage);
        }
        if ($VERBOSE) {
            echo "\n", " ", $specific, " ", "linked in", " ", $currentPage, "\n";
        }
    }

    return $status;
}
/**
 * Loads an HTML page and collects the href value of every <a> element.
 *
 * Sets the global $currentPage to $fname. When the page cannot be loaded
 * the global $brocount is incremented and an empty array is returned.
 *
 * @param string $fname  Path or URL of the page to load.
 * @param string $caller Page that referenced $fname; only used in the
 *                       verbose error message.
 * @return array List of raw href strings, in document order.
 */
function pageScan($fname, $caller)
{
    global $VERBOSE;
    if ($VERBOSE) {
        echo "Scanning", " ", $fname, "\n";
    }

    global $currentPage;
    $currentPage = $fname;

    $doc = new DOMDocument();
    // '@' silences the parser warnings loadHTMLFile() emits for missing
    // files and malformed markup; failure is detected through the result.
    $loaded = @$doc->loadHTMLFile($fname);
    if ($loaded === false) {
        if ($VERBOSE) {
            echo "Error \"$fname\" not found in $caller", "\n";
        }
        global $brocount;
        $brocount += 1;
        return array();
    }

    $hrefs = array();
    foreach ($doc->getElementsByTagName("a") as $anchor) {
        // Anchors without href (e.g. <a name="...">) are not links.
        if ($anchor->hasAttribute("href")) {
            $hrefs[] = $anchor->getAttribute("href");
        }
    }

    return $hrefs;
}
/**
 * Scans one page, records every link found on it with checkLink(), then
 * recursively scans the internal pages it links to.
 *
 * The function returns early, without doing anything, when the page limit
 * ($LIMIT) is reached, when $page is blank or starts with ".", when the
 * page was already scanned, or when its extension is not listed in the
 * global $extensions.
 *
 * Side effects on globals: $scanned and $checked get an entry for $page
 * (and possibly for default pages of directory links); $differed and
 * $DIFFEREDFLAG are set (they are not read in this file; presumably they
 * are consumed by display(); confirm in url.php).
 *
 * @param string $page   Absolute URL of the page to scan.
 * @param string $caller Page (or "command line") that led to $page.
 * @return void
 */
function httpCheck($page, $caller)
{
    global $LIMIT, $scanned, $checked, $extensions;
    global $DEBUG, $VERBOSE, $PROCESSDEFAULT;
    global $differed, $DIFFEREDFLAG;

    if ($LIMIT >= 0 && count($scanned) >= $LIMIT) {
        return;
    }
    if (trim($page) == false) {
        return;
    }
    // Square brackets: the curly-brace offset syntax is a parse error
    // since PHP 8.0.
    if ($page[0] === ".") {
        return;
    }
    if (array_key_exists($page, $scanned)) {
        return;
    }

    $scanned[$page] = 200;
    $checked[$page] = 200;
    if ($DEBUG) {
        echo "Entering $page ", "\n";
    }

    $differed = "\n$page\n" . str_repeat("-", strlen($page));
    $DIFFEREDFLAG = true;

    // Work out the directory that relative links are resolved against.
    if (substr($page, -1, 1) === "/") {
        $reldir = $page;
        $src = "";
    } else {
        $infos = pathinfo($page);
        // pathinfo() omits keys that do not apply (e.g. no extension).
        $reldir = isset($infos['dirname']) ? strtolower($infos['dirname']) : "";
        $src = isset($infos['filename']) ? strtolower($infos['filename']) : "";
        $ext = isset($infos['extension']) ? strtolower($infos['extension']) : "";
        if ($ext != false) {
            $ext = "." . $ext;
            if (!in_array($ext, $extensions)) {
                return;
            }
            $src .= $ext;
        }
    }
    if ($DEBUG) {
        echo "Processing $reldir/$src", "\n";
    }

    $links = pageScan($page, $caller);
    if (count($links) === 0) {
        return;
    }

    // First pass: normalise and record every link of the page; internal
    // ones are queued for the recursive second pass.
    $todo = array();
    foreach ($links as $link) {
        // Pure fragment links point into the same page. The emptiness
        // test avoids an uninitialized-offset notice on href="".
        if ($link !== "" && $link[0] === "#") {
            continue;
        }
        $p = strpos($link, "#", 0);
        if ($p !== false && $p > 0) {
            $link = substr($link, 0, $p);   // drop the fragment
        }

        if (!hasProtocol($link)) {
            if (strlen($link) > 6) {
                if (substr($link, 0, 3) === "../") {
                    if ($VERBOSE) {
                        echo "Should be absolute: $link in $page", "\n";
                    }
                    continue;
                }
                if (substr($link, 0, 7) === "mailto:") {
                    if ($DEBUG) {
                        echo "Skipped mailto.", "\n";
                    }
                    continue;
                }
            }
            $link = Path::merge($reldir, $link);
        }

        if (trim($link) == false) {
            continue;
        }
        if (array_key_exists($link, $checked)) {
            display($checked[$link], $link, false);
            continue;
        }

        if (isInternal($link)) {
            if ($PROCESSDEFAULT && substr($link, -1) === "/") {
                // Mark the directory's default page as known so it is not
                // reported as a separate link later.
                $home = findDefault($link);
                if (!array_key_exists($home, $checked)) {
                    $checked[$home] = 200;
                }
            }
            array_push($todo, $link);
        }
        checkLink($link);
    }

    // Second pass: descend into internal pages that answered 200 and have
    // not been scanned yet. $scanned/$checked are re-read on every turn
    // because the recursive calls keep updating them.
    foreach ($todo as $link) {
        if (array_key_exists($link, $scanned)) {
            continue;
        }
        if (!array_key_exists($link, $checked)) {
            continue;
        }
        if ($checked[$link] === 200) {
            httpCheck($link, $page);
        }
    }
}
/**
 * Starts the crawl from the URL given on the command line.
 *
 * A URL ending with "/" is first replaced by the result of findDefault();
 * the crawl itself is done by httpCheck() with "command line" as caller.
 *
 * @param string $page Start URL.
 * @return void
 */
function httpProcess($page)
{
    $isDirectory = (substr($page, -1) === "/");
    $start = $isDirectory ? findDefault($page) : $page;

    httpCheck($start, "command line");
}
/**
 * Parses the command-line arguments into the global option variables and
 * stores the start URL in $params["server"].
 *
 * Recognised options: -v (verbose), -q (quiet), -u (debug), -c (check
 * links), -y (force retry), -s{domain} (search links to a domain),
 * -t{n} (size of the top list), -{n} (page limit). Any other argument is
 * taken as the URL; a second URL or an unknown option prints a message and
 * calls usage(), which terminates the script.
 *
 * @param int   $argnum    Argument count as received by the script
 *                         (script name included).
 * @param array $arguments Arguments without the script name.
 * @return void
 */
function processCommand($argnum, $arguments)
{
    global $VERBOSE, $QUIET, $DEBUG, $BROKEN, $FORCERETRY;
    global $SPECIFICFLAG, $specific, $TOPLIMIT, $LIMIT;
    global $server, $params;

    if ($argnum < 2 || count($arguments) === 0) {
        usage();
    }

    foreach ($arguments as $param) {
        // Every valid argument has at least two characters.
        if (strlen($param) < 2) {
            usage();
        }
        $opt = substr($param, 0, 2);

        if ($opt === "-v") {
            $VERBOSE = true;
            continue;
        }
        if ($opt === "-q") {
            $QUIET = true;
            continue;
        }
        if ($opt === "-u") {
            $DEBUG = true;
            continue;
        }
        if ($opt === "-c") {
            $BROKEN = true;
            continue;
        }
        if ($opt === "-y") {
            $FORCERETRY = true;
            continue;
        }
        if ($opt === "-s") {
            $SPECIFICFLAG = true;
            $specific = substr($param, 2);
            if ($specific === "") {
                usage();
            }
            continue;
        }
        if ($opt === "-t") {
            // Use the number given by the user; it was previously parsed
            // and then discarded, leaving the top list always empty.
            $TOPLIMIT = max(0, intval(substr($param, 2)));
            continue;
        }

        // Square brackets: the curly-brace offset syntax is a parse error
        // since PHP 8.0.
        if ($param[0] === "-") {
            // "-{n}": positive page limit; anything else is unknown.
            $x = intval(substr($param, 1));
            if ($x > 0) {
                $LIMIT = $x;
                continue;
            }
            echo "Unknown command $param", "\n";
            usage();
        }

        // An explicit http: URL always replaces the current one.
        if (substr($param, 0, 5) === "http:") {
            $server = $param;
            continue;
        }
        // Any other bare word is the URL, but only the first one.
        if ($server == false) {
            $server = $param;
            continue;
        }
        echo "Unknown command $param", "\n";
        usage();
    }

    if ($server == false) {
        die("You must provide a URL.");
    }
    $params["server"] = $server;
}
/**
 * Rates the site from the ratio between the global number of external
 * links ($elinks) and the number of scanned pages.
 *
 * Tests are applied in this order, first match wins:
 *   fewer than pages/5 links -> "black hole"
 *   fewer than pages/2 links -> "egocentric"
 *   at least pages*5 links   -> "very friendly"
 *   at least pages*2 links   -> "friendly"
 *   anything else            -> "honest"
 *
 * @param int $pages Number of pages scanned.
 * @return string One of the five labels above.
 */
function evaluate($pages)
{
    global $elinks;

    if ($elinks < ($pages / 5)) {
        $verdict = "black hole";
    } elseif ($elinks < ($pages / 2)) {
        $verdict = "egocentric";
    } elseif ($elinks >= ($pages * 5)) {
        $verdict = "very friendly";
    } elseif ($elinks >= ($pages * 2)) {
        $verdict = "friendly";
    } else {
        $verdict = "honest";
    }

    return $verdict;
}
/**
 * Program entry point: parses the command line, crawls the site starting
 * at the given URL, writes a summary to "links.log" and prints the report.
 *
 * The script exits directly (status 0) after the -s report; otherwise the
 * function returns 0.
 *
 * @param int   $argc Number of command-line arguments (script name included).
 * @param array $argv Command-line arguments (script name first).
 * @return int Always 0.
 */
function main($argc, $argv)
{
    // All globals are bound up front: a `global` statement only takes
    // effect once executed, and the counters were previously read (for the
    // log) before their `global` line, which logged empty values.
    global $website, $server, $params, $currentPage, $domain, $baseLength, $hostsite;
    global $VERBOSE, $DEBUG, $QUIET, $BROKEN, $SPECIFICFLAG, $LIMIT, $TOPLIMIT;
    global $specific, $log, $scanned, $retrieved, $domains;
    global $linkcount, $brocount, $elinks, $ilinks;

    processCommand($argc, array_slice($argv, 1));
    $server = $params["server"];
    if (!hasProtocol($server)) {
        $server = "http://" . $server;
    }
    $currentPage = $server;

    // Only the site part is needed here; the file part is ignored.
    $parts = splitSite($server);
    $website = strtolower(reset($parts));

    // NOTE(review): the fixed offset 7 matches "http://" only; with an
    // "https://" URL $domain keeps a leading "/". Left unchanged because
    // $domain and $baseLength are globals that code outside this file may
    // rely on.
    $domain = substr($website, 7);
    if (substr($domain, -1, 1) === "/") {
        // NOTE(review): substr($domain, 0) is a no-op; stripping the
        // trailing slash (substr($domain, 0, -1)) was probably intended.
        $domain = substr($domain, 0);
    }
    $baseLength = intval(strlen($domain) + 7);
    $hostsite = getDomain($server);

    if ($VERBOSE === true) {
        echo "Verbose mode enabled", "\n";
    }
    if ($DEBUG === true) {
        echo "Debug mode enabled", "\n";
    }
    echo "Checking ";
    echo ($LIMIT > -1) ? $LIMIT : "all";
    echo " pages on", " ", $domain, "\n";
    echo "Starting from $server", "\n";
    if ($SPECIFICFLAG === true) {
        echo "Searching links to", " ", $specific, "\n";
    }

    // The log is opened before the crawl because $log is a global that the
    // crawl helpers may write to (not visible in this file).
    $log = fopen("links.log", "w");
    httpProcess($server);

    $sp = count($scanned);
    $ranking = evaluate($sp);

    // Skip the summary when the log could not be opened instead of calling
    // fwrite()/fclose() on false. Each entry now ends with a newline; the
    // three sentences were previously run together on one line.
    if ($log !== false) {
        fwrite($log, "Site: " . strtoupper($ranking) . "\n");
        fwrite($log, "$linkcount links checked in $sp pages.\n");
        fwrite($log, "$brocount broken or redirected links, ignored.\n");
        fclose($log);
    }

    if ($QUIET) {
        return 0;
    }

    echo "\n";
    echo "This site is : ", strtoupper($ranking), ".\n";
    echo $linkcount, " ", "links checked in $sp pages.", "\n";
    echo $elinks, " ", "external links found and", " ", count($domains), " ", "domains.", "\n";
    echo $ilinks, " ", "internal links.", "\n";

    if ($BROKEN === true) {
        // A separator was missing here ("3broken links").
        echo $brocount, " ", "broken links";
        if ($brocount > ($linkcount / 50)) {
            echo ", not seriously maintained.";
        }
        echo "\n";
    }

    // -s mode: list the pages linking to the searched domain, then stop.
    if ($SPECIFICFLAG === true) {
        $nt = count($retrieved);
        if ($nt === 0) {
            echo "No link found in $hostsite to $specific.", "\n";
            exit(0);
        }
        echo $specific, " ", "is linked $nt time, from:", "\n";
        foreach ($retrieved as $from) {
            echo "-", " ", $from, "\n";
        }
        exit(0);
    }

    // Top list: most linked external domains first. array_slice() with
    // preserved keys replaces the key()/array_shift() loop, which emptied
    // the global $domains while printing.
    arsort($domains);
    $top = intval(min($TOPLIMIT, count($domains)));
    echo "Top", " ", $top, " ", "sites:", "\n";

    $rank = 1;
    foreach (array_slice($domains, 0, max(0, $top), true) as $name => $hits) {
        echo str_pad($rank, 3, " ", STR_PAD_LEFT), ") ", $name, " : ", $hits, "\n";
        $rank += 1;
    }

    return 0;
}
// Script entry point: hand the command-line argument count and values to main().
main(intVal($argc),$argv);
?>