<?php
// ----------------------------------------------------------------------
// Copyright (C) 2006 by Khaled Al-Shamaa.
// http://www.al-shamaa.com/
// ----------------------------------------------------------------------
// LICENSE
// This program is open source product; you can redistribute it and/or
// modify it under the terms of the GNU General Public License (GPL)
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// To read the license please visit http://www.gnu.org/copyleft/gpl.html
// ----------------------------------------------------------------------
// Class Name: Arabic Auto Summarize Class
// Filename: arAutoSummarize.class.php
// Original Author(s): Khaled Al-Sham'aa <hide@address.com>
// Purpose: Automatic keyphrase extraction to provide a quick mini-summary for a long Arabic document.
// ----------------------------------------------------------------------
/**
 * Extractive summarizer and keyword extractor for Arabic documents.
 *
 * The class scores every word by how often it occurs in the (normalized,
 * roughly stemmed) document, scores every sentence by the average score of
 * its words multiplied by a positional/"important word" weight, and then
 * either returns or highlights the best sentences.
 *
 * Two word lists are read from the current working directory on first use and
 * cached for the lifetime of the instance:
 *  - common.inc.txt    : one common word per line, removed before ranking
 *  - important.inc.txt : one cue word per line, each hit raises a sentence weight
 * Entries are inserted into a regular expression verbatim (they are not
 * escaped), exactly as before.
 *
 * NOTE(review): the non-ASCII literals in the patterns below are intended to be
 * matched byte-wise against windows-1256 input, so this file presumably has to
 * be stored in that encoding as well - confirm before re-encoding the file.
 *
 * @author Khaled Al-Shamaa
 */
class ArAutoSummarize {
    /**
     * Sentence splitter: one non-delimiter character, the shortest possible
     * run of further characters, then one delimiter (full stop, newline, the
     * two extra punctuation bytes, or "|", which is part of the character
     * class as written). Text after the last delimiter is not matched.
     */
    const SENTENCE_PATTERN = "/[^|\.|\n|\¡|\º](.+?)[\.|\n|\¡|\º]/";

    /** Punctuation and brackets that are turned into spaces before words are counted. */
    const PUNCTUATION_PATTERN = '/\.|\n|\¡|\º|\(|\[|\{|\)|\]|\}/';

    /** Rank forced onto caller-supplied keywords so sentences containing them win. */
    const KEYWORD_RANK = 1000;

    /** @var array|null Cached entries of common.inc.txt (null until first use). */
    private $commonWords = null;

    /** @var array|null Cached entries of important.inc.txt (null until first use). */
    private $importantWords = null;

    /**
     * Summarize a document into (at most) a given number of sentences.
     *
     * Sentences are returned in document order. Sentences whose rank ties with
     * the lowest accepted rank are all included, so the result can exceed $int
     * when ranks are equal. A request for fewer than one sentence still yields
     * the top-ranked sentence.
     *
     * @param String  $str      Input Arabic document (windows-1256); HTML tags are stripped
     * @param Integer $int      Number of sentences wanted in the summary
     * @param String  $keywords Optional space-separated keywords; sentences containing them are favoured
     * @return String Concatenation of the selected sentences
     */
    public function doSummarize($str, $int, $keywords = ''){
        list($sentences, $ranks) = $this->rankDocument($str, $keywords);
        $minRank = $this->minAcceptedRank($ranks, $int);

        $summary = '';
        foreach ($sentences as $i => $sentence) {
            if ($ranks[$i] >= $minRank) {
                $summary .= $sentence;
            }
        }
        return $summary;
    }

    /**
     * Summarize a document down to a percentage of its sentences.
     *
     * @param String  $str      Input Arabic document (windows-1256)
     * @param Number  $rate     Percentage (0-100) of the document sentences to keep
     * @param String  $keywords Optional space-separated keywords to favour
     * @return String Concatenation of the selected sentences
     */
    public function doRateSummarize($str, $rate, $keywords = ''){
        return $this->doSummarize($str, $this->rateToCount($str, $rate), $keywords);
    }

    /**
     * Return the whole (tag-stripped) document with its key sentences wrapped
     * in a highlighting <span> and newlines converted to <br />.
     *
     * Only text matched by the sentence splitter is emitted; see
     * SENTENCE_PATTERN for what that excludes.
     *
     * @param String  $str      Input Arabic document (windows-1256); HTML tags are stripped
     * @param Integer $int      Number of sentences to highlight
     * @param String  $keywords Optional space-separated keywords to favour
     * @return String HTML fragment
     */
    public function highlightSummary($str, $int, $keywords = ''){
        list($sentences, $ranks) = $this->rankDocument($str, $keywords);
        $minRank = $this->minAcceptedRank($ranks, $int);

        $highlighted = '';
        foreach ($sentences as $i => $sentence) {
            if ($ranks[$i] >= $minRank) {
                $highlighted .= '<span style="BACKGROUND-COLOR: #EEEE80">'.$sentence.'</span>';
            } else {
                $highlighted .= $sentence;
            }
        }
        return str_replace("\n", '<br />', $highlighted);
    }

    /**
     * Highlight a percentage of the document sentences.
     *
     * @param String  $str      Input Arabic document (windows-1256)
     * @param Number  $rate     Percentage (0-100) of the document sentences to highlight
     * @param String  $keywords Optional space-separated keywords to favour
     * @return String HTML fragment
     */
    public function highlightRateSummary($str, $rate, $keywords = ''){
        return $this->highlightSummary($str, $this->rateToCount($str, $rate), $keywords);
    }

    /**
     * Extract the most frequent words of a document as a keyword list.
     *
     * @param String  $str Input Arabic document (windows-1256)
     * @param Integer $int Maximum number of keywords to return
     * @return String Keywords joined by ", " (no trailing separator); empty string when none qualify
     */
    public function getMetaKeywords($str, $int){
        $str = preg_replace(self::PUNCTUATION_PATTERN, ' ', $str);
        $str = $this->cleanCommon($this->doNormalize($str));

        // Affixes are stripped one after another, in this exact order, each
        // pass working on the result of the previous one.
        $prefixes = array('Çá', 'æÇá');
        $suffixes = array(
            'åãÇ', 'ßãÇ', 'Êíä', 'åã', 'åä', 'åÇ', 'äÇ', 'äí', 'ßã', 'Êã',
            'ßä', 'ÇÊ', 'íä', 'Êä', 'æä', 'Çä', 'ÊÇ', 'æÇ', 'É',
        );
        $affixPatterns = array();
        foreach ($prefixes as $prefix) {
            $affixPatterns[] = '/(\W)'.$prefix.'(\w{3,})/';
        }
        foreach ($suffixes as $suffix) {
            $affixPatterns[] = '/(\w{3,})'.$suffix.'(\W)/';
        }
        $str = preg_replace($affixPatterns, '\\1\\2', $str);

        // Drop words of one to three characters that stand between two non-word characters.
        $stemStr = preg_replace('/(\W)\w{1,3}(\W)/', '\\2', $str);

        $wordRanks = $this->rankWords($stemStr);
        arsort($wordRanks, SORT_NUMERIC);

        $selected = array();
        foreach ($wordRanks as $word => $rank) {
            if (count($selected) >= $int) {
                break;
            }
            if ($this->acceptedWord($word)) {
                $selected[] = $word;
            }
        }
        return implode(', ', $selected);
    }

    /**
     * Shared front half of doSummarize()/highlightSummary(): strip tags, rank
     * the words of the document, boost the caller keywords, rank the sentences.
     *
     * @param String $str      Raw input document
     * @param String $keywords Space-separated keywords (may be empty)
     * @return Array array(sentences, ranks) as produced by rankSentences()
     */
    private function rankDocument($str, $keywords){
        $str = strip_tags($str);
        $stemStr = $this->draftStem($this->cleanCommon($this->doNormalize($str)));
        $wordRanks = $this->rankWords($stemStr);

        if ($keywords) {
            $keywords = $this->draftStem($this->doNormalize($keywords));
            foreach (explode(' ', $keywords) as $word) {
                // Repeated/trailing spaces (or a keyword made only of stripped
                // letters) produce empty tokens; boosting '' would inflate every
                // sentence that tokenizes with a leading/trailing empty string.
                if ($word === '') {
                    continue;
                }
                $wordRanks[$word] = self::KEYWORD_RANK;
            }
        }
        return $this->rankSentences($str, $wordRanks);
    }

    /**
     * Convert a percentage into a sentence count for the given document.
     * Sentences are counted on the tag-stripped text, i.e. the same text that
     * is ranked afterwards.
     *
     * @param String $str  Raw input document
     * @param Number $rate Percentage of sentences wanted
     * @return Integer Rounded number of sentences
     */
    private function rateToCount($str, $rate){
        preg_match_all(self::SENTENCE_PATTERN, strip_tags($str), $sentences);
        $totalSentences = count($sentences[0]);
        return (int) round($rate * $totalSentences / 100);
    }

    /**
     * Normalize a document: delete one fixed set of characters and fold a
     * second set of characters into a single one.
     *
     * @param String $str Input document (windows-1256)
     * @return String Normalized document
     */
    private function doNormalize($str){
        return preg_replace(
            array('/ø|ó|ð|õ|ñ|ö|ò|ú/', '/Ç|Ã|Å|Â/'),
            array('', 'Ç'),
            $str
        );
    }

    /**
     * Remove common words (roughly) from a normalized document. A word is only
     * removed when it is surrounded by whitespace; the whitespace after it is kept.
     *
     * @param String $str Normalized document
     * @return String Document without the listed common words
     */
    private function cleanCommon($str){
        if ($this->commonWords === null) {
            $this->commonWords = $this->loadWordList('common.inc.txt');
        }
        if (!$this->commonWords) {
            // Nothing to remove; an empty alternation would match bare whitespace.
            return $str;
        }
        return preg_replace('/\s+æ?('.implode('|', $this->commonWords).')(\s+)/', '\\2', $str);
    }

    /**
     * Read a one-entry-per-line word list, trimming entries and ignoring blank
     * lines. An unreadable file yields an empty list (file() itself already
     * raises a warning in that case).
     *
     * @param String $file Path of the list, resolved like any relative path passed to file()
     * @return Array List of non-empty entries
     */
    private function loadWordList($file){
        $lines = file($file);
        if ($lines === false) {
            return array();
        }
        $words = array();
        foreach ($lines as $line) {
            $line = trim($line);
            if ($line !== '') {
                $words[] = $line;
            }
        }
        return $words;
    }

    /**
     * Crude stemmer: delete a fixed set of letters so that inflected forms of a
     * word collapse to the same token. The result is not human readable.
     *
     * @param String $str Input document (windows-1256)
     * @return String Stemmed document
     */
    private function draftStem($str){
        return preg_replace('/Ó|Ç|á|Ê|ã|æ|ä|í|å|É/', '', $str);
    }

    /**
     * Count how often each word occurs in a document.
     *
     * Words are split on whitespace and commas after punctuation has been
     * replaced by spaces; empty tokens at the edges are counted like any other
     * token. A word that starts with the one-letter prefix below is merged into
     * its unprefixed form when that form also occurs in the document.
     *
     * @param String $str Input document (windows-1256)
     * @return Array Map of word => number of occurrences
     */
    private function rankWords($str){
        $str = preg_replace(self::PUNCTUATION_PATTERN, ' ', $str);

        $wordsRanks = array();
        foreach (preg_split('/[\s,]+/', $str) as $word) {
            if (isset($wordsRanks[$word])) {
                $wordsRanks[$word]++;
            } else {
                $wordsRanks[$word] = 1;
            }
        }

        // Measure the prefix instead of assuming one byte, so the comparison
        // holds however this source file happens to be encoded.
        $prefix = 'æ';
        $prefixLength = strlen($prefix);
        foreach ($wordsRanks as $word => $total) {
            $word = (string) $word;
            if (strncmp($word, $prefix, $prefixLength) !== 0) {
                continue;
            }
            $bareWord = substr($word, $prefixLength);
            if ($bareWord === false || $bareWord === '') {
                // The prefix on its own has no base word to merge into.
                continue;
            }
            if (isset($wordsRanks[$bareWord])) {
                unset($wordsRanks[$word]);
                $wordsRanks[$bareWord] += $total;
            }
        }
        return $wordsRanks;
    }

    /**
     * Split a document into sentences and rank each of them.
     *
     * rank = (weight of first char + weight of last char + important-word hits)
     *        * (sum of the word ranks of the sentence / number of its tokens)
     *
     * @param String $str Tag-free input document (windows-1256)
     * @param Array  $arr Word ranks (word => rank), see rankWords()
     * @return Array array(list of sentences, list of ranks), index-aligned and in document order
     */
    private function rankSentences($str, $arr){
        $sentenceArr = array();
        $rankArr = array();

        // Loop invariant: build the cue-word pattern once, not once per sentence.
        if ($this->importantWords === null) {
            $this->importantWords = $this->loadWordList('important.inc.txt');
        }
        $importantPattern = null;
        if ($this->importantWords) {
            $importantPattern = '/('.implode('|', $this->importantWords).')/';
        }

        preg_match_all(self::SENTENCE_PATTERN, $str, $sentences);
        foreach ($sentences[0] as $sentence) {
            $first = $sentence[0];
            $last = substr($sentence, -1);

            // With the splitter as written the first character is never a
            // delimiter, so it always contributes the neutral weight.
            $w = $this->boundaryWeight($first) + $this->boundaryWeight($last);

            if ($importantPattern !== null) {
                preg_match_all($importantPattern, $sentence, $important);
                $w += count($important[0]);
            }

            // Rank the sentence without its closing delimiter (and without a
            // leading delimiter, should the splitter ever capture one).
            $body = substr($sentence, 1, -1);
            if ($this->boundaryWeight($first) === 1) {
                $body = $first.$body;
            }

            $stemStr = $this->draftStem($this->cleanCommon($this->doNormalize($body)));
            $words = preg_split('/[\s,]+/', $stemStr);
            $totalWords = count($words);
            if ($totalWords > 0) {
                $totalWordsRank = 0;
                foreach ($words as $word) {
                    // Tokens that never made it into the document ranks count as 0.
                    if (isset($arr[$word])) {
                        $totalWordsRank += $arr[$word];
                    }
                }
                $sentenceArr[] = $body.$last;
                $rankArr[] = $w * ($totalWordsRank / $totalWords);
            }
        }
        return array($sentenceArr, $rankArr);
    }

    /**
     * Weight of a sentence boundary character: 3 for a newline, 2 for a full
     * stop or one of the two extra punctuation characters, 1 for anything else.
     *
     * @param String $char Single boundary character
     * @return Integer Weight (1, 2 or 3)
     */
    private function boundaryWeight($char){
        if ($char === "\n") {
            return 3;
        }
        if (in_array($char, array('.', '¡', 'º'), true)) {
            return 2;
        }
        return 1;
    }

    /**
     * Lowest rank a sentence may have and still be part of an $int-sentence
     * selection made with ">=".
     *
     * The ranks are sorted in descending order and the $int-th best one is
     * returned (index $int - 1). $int is clamped to 1..count($arr), so asking
     * for more sentences than exist accepts all of them and asking for fewer
     * than one still accepts the best one.
     *
     * @param Array   $arr Sentence ranks
     * @param Integer $int Number of sentences wanted
     * @return Number Minimum accepted rank (0 when there are no sentences)
     */
    private function minAcceptedRank($arr, $int){
        if (!$arr) {
            return 0;
        }
        rsort($arr, SORT_NUMERIC);
        $wanted = min(max((int) $int, 1), count($arr));
        return $arr[$wanted - 1];
    }

    /**
     * Tell whether a token is long enough to be reported as a keyword.
     *
     * @param String $word Token to check
     * @return Boolean True when the token is at least three bytes long
     */
    private function acceptedWord($word){
        return strlen((string) $word) >= 3;
    }
}