Location: PHPKode > scripts > GoogleCrawler > googlecrawler/googleCrawler.class.php
<?php
/*
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * (C) Jonathan Schmidt-Dominé 2008-2009 < hide@address.com >
 */
/*
This is the Google crawler class.
Its constructor takes the search keywords and, optionally, the result page number,
the number of results per page and the Google base URL. It then crawls the Google
search results for links, titles and descriptions.
You can use this crawler on your site as a search engine (not a real one: it
simply searches through Google).
*/
// Markup the crawler appends to a result description wherever it meets a <br> tag.
// The guard lets an including script define `brreplacement` first to supply its
// own replacement without triggering an "already defined" warning here.
if (!defined('brreplacement')) {
    define('brreplacement', "<br/>\n");
}
/**
 * Fetches one Google result page and extracts link, title and description of
 * every hit with a small byte-wise state machine.
 *
 * Usage: construct the object (this downloads the page), call parseGoogle(),
 * then read the public $results array and $maxnum.
 *
 * The parser is tied to one specific result markup: a list opened by
 * `<div><ol><li`, one `<li…>` per hit, the link in the first `<a href="…">`,
 * the description in the following `<div…>` and terminated by `<cite>`.
 */
class googleCrawler
{
    /** Slice of the downloaded page that parseGoogle() walks through. */
    private $content;
    /** Not referenced anywhere in this class. */
    private $divs;
    /** List of hits; each entry is array('description' => ..., 'title' => ..., 'link' => ...). */
    public $results = array();
    /**
     * Text found directly in front of the last `<t…` tag of the page, or null
     * when no such tag exists. Presumably Google's total hit count -- verify
     * against the markup actually served.
     */
    public $maxnum;
    /** Name of the handler method parseGoogle() invokes for the current byte. */
    private $mode = 'plain';
    /** True while the next `<a ` tag is expected to carry the hit's link. */
    private $linksuche = false;
    /** True while text is being collected into the hit's title. */
    private $titelsuche = false;
    /** True between recognising a `<li…` tag and reaching its closing `>`. */
    private $limode = false;
    /** True while text is being collected into the hit's description. */
    private $beschreibungssuche = false;
    /** Current byte offset into $content during parseGoogle(). */
    private $i;
    /** Not referenced anywhere in this class. */
    private $linkpart;

    /**
     * Downloads the result page and narrows it down to the result list.
     *
     * On a failed download, or when the page does not contain the expected
     * `<div><ol><li` marker, the object is left with empty content, so
     * parseGoogle() simply yields no results.
     *
     * @param string $keywordsGot Search terms (URL-encoded here).
     * @param int    $pageNum     1-based result page; 0 omits the `start` parameter.
     * @param int    $number      Hits per page, sent as `num`.
     * @param string $googleurl   Base URL the `/search?...` path is appended to.
     */
    public function __construct($keywordsGot, $pageNum = 1, $number = 10, $googleurl = 'http://www.google.de')
    {
        $url = $googleurl . "/search?q=" . urlencode($keywordsGot);
        if ($pageNum != 0) {
            $url .= "&start=" . (($pageNum - 1) * $number);
        }
        $url .= "&num=" . $number;

        $this->content = '';

        $page = file_get_contents($url);
        if ($page === false) {
            // file_get_contents() already raised a warning; keep the object usable.
            return;
        }
        $page = mb_convert_encoding($page, 'UTF-8');

        $listStart = strpos($page, '<div><ol><li');
        if ($listStart === false) {
            // Not a result list in the expected format: nothing to parse.
            return;
        }
        // Skip the 9 bytes of "<div><ol>" so the content starts at the first "<li".
        $page = substr($page, $listStart + 9);

        $this->content = $this->cutResultList($page);
    }

    /**
     * Scans $page backwards to (a) fill $maxnum and (b) cut the page off at
     * the last `<ci` (the final `<cite…` tag).
     *
     * While no `<t` tag has been seen, both `<t` and `<ci` are searched for in
     * the same pass. When `<t` comes first, the text in front of it (tags
     * skipped) becomes $maxnum and the scan goes on looking for `<ci` only.
     *
     * @param string $page Page text starting at the first `<li`.
     * @return string Everything before the last `<ci`, or '' when none is found.
     */
    private function cutResultList($page)
    {
        $countTag = '<t';
        $countIdx = 1;
        $citeTag = '<ci';
        $citeIdx = 2;
        $seekingCount = true;

        for ($pos = strlen($page) - 1; $pos >= 0; --$pos) {
            if ($seekingCount) {
                if ($page[$pos] != $countTag[$countIdx]) {
                    $countIdx = 1;
                } elseif (--$countIdx < 0) {
                    // $pos sits on the '<' of "<t": collect the text in front of it.
                    $insideTag = false;
                    for (--$pos; $pos >= 0; --$pos) {
                        $char = $page[$pos];
                        if ($insideTag) {
                            if ($char == '<') {
                                $insideTag = false;
                            }
                        } elseif ($char == '>') {
                            if (empty($this->maxnum)) {
                                // Nothing collected yet: this '>' closes a tag to skip.
                                $insideTag = true;
                            } else {
                                break;
                            }
                        } else {
                            $this->maxnum = $char . $this->maxnum;
                        }
                    }
                    $citeIdx = 2;
                    $seekingCount = false;
                    if ($pos < 0) {
                        // Ran off the start of the page while collecting.
                        break;
                    }
                }
            }
            if ($page[$pos] != $citeTag[$citeIdx]) {
                $citeIdx = 2;
            } elseif (--$citeIdx < 0) {
                return substr($page, 0, $pos);
            }
        }

        return '';
    }

    /** True when the bytes at the current offset equal $prefix (safe at end of content). */
    private function tagStartsWith($prefix)
    {
        return substr($this->content, $this->i, strlen($prefix)) === $prefix;
    }

    /** Appends $text to $field of the most recent hit; does nothing while no hit exists. */
    private function appendToCurrent($field, $text)
    {
        $last = count($this->results) - 1;
        if ($last >= 0) {
            $this->results[$last][$field] .= $text;
        }
    }

    /** Handler for text outside any collected region: ignored. */
    private function plain() {}

    /**
     * Handler for text between a `<li…>` and the next tag: ignored.
     * endtag() selects this mode by name, so the method has to exist.
     */
    private function linksuche() {}

    /** Handler for '<': the next byte starts a tag. */
    private function tag() { $this->mode = 'intag'; }

    /**
     * Handler for '>': decides how the text following the tag is treated.
     * A just-closed `<li…>` opens a new, empty hit and starts the link search.
     */
    private function endtag()
    {
        if ($this->limode) {
            array_push($this->results, array('description' => '', 'title' => '', 'link' => ''));
            $this->limode = false;
            $this->mode = 'linksuche';
            $this->linksuche = true;
        } elseif ($this->titelsuche) {
            $this->mode = 'titelsuche';
        } elseif ($this->beschreibungssuche) {
            $this->mode = 'beschreibungssuche';
        } else {
            $this->mode = 'plain';
        }
    }

    /**
     * Handler for the first byte after '<': classifies the tag according to
     * the current search state, then moves the offset to the byte in front of
     * the tag's closing '>' so that parseGoogle() reaches endtag() next.
     *
     * All look-ahead is done by peeking, so the closing '>' of short tags
     * such as `<b>` is never skipped.
     */
    private function intag()
    {
        $length = strlen($this->content);

        if ($this->linksuche) {
            if ($this->tagStartsWith('a ')) {
                // The tag is assumed to read `a href="…"`: the link starts 8 bytes in
                // and runs up to the next double quote (or the end of the content).
                $from = min($this->i + 8, $length);
                $to = strpos($this->content, '"', $from);
                if ($to === false) {
                    $to = $length;
                }
                $this->appendToCurrent('link', (string) substr($this->content, $from, $to - $from));
                $this->i = $to;
                $this->mode = 'titelsuche';
                $this->linksuche = false;
                $this->titelsuche = true;
            }
        } elseif ($this->titelsuche) {
            if ($this->tagStartsWith('/a>')) {
                // `</a>` ends the title. Stop on its '>' so endtag() is not run for it.
                $this->i += 2;
                $this->mode = 'plain';
                $this->titelsuche = false;
                return;
            }
        } elseif ($this->beschreibungssuche) {
            if ($this->tagStartsWith('br')) {
                $this->appendToCurrent('description', brreplacement);
            } elseif ($this->tagStartsWith('cite>')) {
                // `<cite>` ends the description.
                $this->beschreibungssuche = false;
            }
        } elseif ($this->tagStartsWith('li')) {
            $this->limode = true;
        } elseif ($this->tagStartsWith('div')) {
            $this->beschreibungssuche = true;
        }

        $close = ($this->i < $length) ? strpos($this->content, '>', $this->i) : false;
        // Without a closing '>' the rest of the content is one unfinished tag.
        $this->i = ($close === false) ? $length : $close - 1;
    }

    /** Handler for title text: appends the current byte to the hit's title. */
    private function titelsuche()
    {
        $this->appendToCurrent('title', $this->content[$this->i]);
    }

    /** Handler for description text: appends the current byte to the hit's description. */
    private function beschreibungssuche()
    {
        $this->appendToCurrent('description', $this->content[$this->i]);
    }

    /**
     * Walks the content byte by byte and fills $results.
     *
     * '<' and '>' switch the mode; every byte (including those two) is then
     * passed to the handler method named by the current mode.
     */
    public function parseGoogle()
    {
        $length = strlen($this->content);
        for ($this->i = 0; $this->i < $length; ++$this->i) {
            switch ($this->content[$this->i]) {
                case '<':
                    $this->mode = 'tag';
                    break;
                case '>':
                    $this->mode = 'endtag';
                    break;
            }
            $this->{$this->mode}();
        }
    }
}
?>
Return current item: GoogleCrawler