Location: PHPKode > scripts > EPub > Grandt-PHPePub-877312c/EPubChapterSplitter.php
<?php
/**
 * Split an HTML file into smaller html files, retaining the formatting and structure for the individual parts.
 * The splitter uses DOM to try to retain any formatting in the file, including rebuilding the DOM tree for subsequent parts.
 * Split size is considered max target size. The actual size is the result of an even split across the resulting files.
 *
 * @author A. Grandt <hide@address.com>
 * @copyright 2009-2012 A. Grandt
 * @license GNU LGPL, Attribution required for commercial implementations, requested for everything else.
 * @link http://www.phpclasses.org/package/6115 
 * @link https://github.com/Grandt/PHPePub
 * @version 2.02
 */
class EPubChapterSplitter {
	const VERSION = 2.02;

	/** Smallest target size, in bytes, that setSplitSize() will store. */
	const MIN_SPLIT_SIZE = 10240;

	private $splitDefaultSize = 250000;

	/**
	 * Set default chapter target size.
	 * Default is 250000 bytes, and minimum is 10240 bytes.
	 *
	 * The value is cast to int before the range check, so non-numeric input
	 * falls back to the minimum instead of being stored as 0.
	 *
	 * @param $size segment size in bytes
	 * @return void
	 */
	function setSplitSize($size) {
		$size = (int)$size;
		// Making the file smaller than 10k is not a good idea.
		$this->splitDefaultSize = $size < self::MIN_SPLIT_SIZE ? self::MIN_SPLIT_SIZE : $size;
	}

	/**
	 * Get the chapter target size.
	 *
	 * @return int target size in bytes
	 */
	function getSplitSize() {
		return $this->splitDefaultSize;
	}

	/**
	 * Split $chapter into multiple parts.
	 *
	 * The search string can either be a regular string or a PCRE pattern as defined here: http://www.php.net/manual/en/pcre.pattern.php
	 * If the search string is a regular string, it is matched against serialised element nodes that start with a tag
	 * immediately followed by the given string.
	 *
	 * When not splitting on a search string, a chapter no longer than the target size is returned unchanged as a
	 * single-element array. The chapter is also returned unchanged when it cannot be split at all: no <html ...> tag,
	 * no body content, or an <html ...> tag that is not well-formed XML.
	 *
	 * Result keys: when splitting by size the keys are 0..n-1. When splitting on the search string, each part that
	 * starts at a matching node is keyed by the trimmed markup of that node; content preceding the first match keeps
	 * its integer index.
	 * NOTE(review): two matching nodes with identical markup produce the same key, so the later part replaces the
	 * earlier one in the returned array.
	 *
	 * @param String $chapter XHTML file
	 * @param Bool   $splitOnSearchString Split on chapter boundaries, Splitting on search strings disables the split size check.
	 * @param String $searchString Chapter string to search for can be fixed text, or a regular expression pattern.
	 *
	 * @return array with 1 or more parts
	 */
	function splitChapter($chapter, $splitOnSearchString = false, $searchString = '/^Chapter\\ /i') {
		$chapterData = array();
		$isSearchRegexp = $splitOnSearchString && (preg_match('#^(\D|\S|\W).+\1[imsxeADSUXJu]*$#m', $searchString) == 1);
		if ($splitOnSearchString && !$isSearchRegexp) {
			$searchString = '#^<.+?>' . preg_quote($searchString, '#') . "#";
		}

		if (!$splitOnSearchString && strlen($chapter) <= $this->splitDefaultSize) {
			return array($chapter);
		}

		// Everything up to and including the opening <html ...> tag is reused as the header of every part.
		// Without that tag there is nothing to build the parts from, so hand the chapter back unsplit.
		$htmlPos = stripos($chapter, "<html");
		$htmlEndPos = ($htmlPos === false) ? false : stripos($chapter, ">", $htmlPos);
		if ($htmlEndPos === false) {
			return array($chapter);
		}

		$newXML = substr($chapter, 0, $htmlEndPos+1) . "\n</html>";
		if (strpos(trim($newXML), "<?xml ") === FALSE) {
			$newXML = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" . $newXML;
		}
		$headerLength = strlen($newXML);

		$xmlDoc = new DOMDocument();
		// Parser warnings are deliberately silenced: the input is XHTML fed to the lenient HTML parser.
		@$xmlDoc->loadHTML($chapter);

		$headNode = $xmlDoc->getElementsByTagName("head")->item(0); // null when the chapter has no <head>
		$bodyNode = $xmlDoc->getElementsByTagName("body")->item(0);
		if ($bodyNode === null || $bodyNode->firstChild === null) {
			return array($chapter); // nothing to distribute across parts
		}

		$files = array();         // one DocumentFragment per output part
		$fileNames = array();     // parallel to $files: chapter name of the part, or null
		$domDepth = 0;
		$domPath = array();       // original ancestors currently descended into
		$domClonedPath = array(); // shallow clones of those ancestors, re-created in each new part

		$curFile = $xmlDoc->createDocumentFragment();
		$files[] = $curFile;
		$fileNames[] = null;
		$curParent = $curFile;
		$curSize = 0;

		$bodyLen = strlen($xmlDoc->saveXML($bodyNode));
		$headLen = ($headNode === null ? 0 : strlen($xmlDoc->saveXML($headNode))) + $headerLength;

		$partSize = $this->splitDefaultSize - $headLen;

		// Even out the parts. The $partSize > 0 test only protects the division; a non-positive
		// size already means "every node is too large" in the comparisons below.
		if ($partSize > 0 && $bodyLen > $partSize) {
			$parts = ceil($bodyLen / $partSize);
			$partSize = ($bodyLen / $parts)  - $headLen;
		}

		$node = $bodyNode->firstChild;

		do {
			$nodeData = $xmlDoc->saveXML($node);
			$nodeLen = strlen($nodeData);

			if ($nodeLen > $partSize && $node->hasChildNodes()) {
				// Too large for one part: descend one level and distribute the children instead.
				// NOTE(review): $nodeData/$nodeLen still describe the parent for the rest of this
				// iteration, so the first child is matched and counted using the parent's
				// serialisation. Kept as-is to avoid changing where existing documents split.
				// NOTE(review): the ancestor is only re-created after the next split; until then
				// the children are appended directly to $curParent.
				$domPath[] = $node;
				$domClonedPath[] = $node->cloneNode(false);
				$domDepth++;

				$node = $node->firstChild;
			}

			$node2 = $node->nextSibling;

			if ($node->nodeName != "#text") {
				$doSplit = false;
				$chapterName = null;
				if ($splitOnSearchString) {
					$doSplit = preg_match($searchString, $nodeData) == 1;
					if ($doSplit) {
						$chapterName = trim($nodeData);
					}
				}

				if ($curSize > 0 && ($doSplit || (!$splitOnSearchString && $curSize + $nodeLen > $partSize))) {
					$curFile = $xmlDoc->createDocumentFragment();
					$files[] = $curFile;
					$fileNames[] = $chapterName;
					$curParent = $curFile;
					// Re-create the chain of open ancestors so the new part keeps the same nesting.
					foreach ($domClonedPath as $ancestor) {
						$newParent = $ancestor->cloneNode(false);
						$curParent->appendChild($newParent);
						$curParent = $newParent;
					}
					$curSize = strlen($xmlDoc->saveXML($curFile));
				} elseif ($doSplit) {
					// Match before anything was emitted: it names the current (first) part.
					$fileNames[count($fileNames) - 1] = $chapterName;
				}
				$curParent->appendChild($node->cloneNode(true));
				$curSize += $nodeLen;
			}

			$node = $node2;
			while ($node === null && $domDepth > 0) {
				// Ran out of siblings: climb back up and continue after the ancestor.
				$domDepth--;
				$node = array_pop($domPath)->nextSibling;
				array_pop($domClonedPath);
				// A fragment has no parent; stay on it rather than dropping to null.
				if ($curParent->parentNode !== null) {
					$curParent = $curParent->parentNode;
				}
			}
		} while ($node !== null);

		$encoding = (string)$xmlDoc->xmlEncoding;

		$xml = new DOMDocument('1.0', $encoding);
		$xml->preserveWhiteSpace = false;
		$xml->formatOutput = true;

		foreach ($files as $idx => $fragment) {
			$xml2Doc = new DOMDocument('1.0', $encoding);
			if (!$xml2Doc->loadXML($newXML)) {
				return array($chapter); // header is not well-formed XML, parts cannot be built
			}
			$html = $xml2Doc->getElementsByTagName("html")->item(0);
			if ($html === null) {
				return array($chapter);
			}
			if ($headNode !== null) {
				$html->appendChild($xml2Doc->importNode($headNode, true));
			}
			$body = $xml2Doc->createElement("body");
			$html->appendChild($body);
			if ($fragment->hasChildNodes()) {
				$body->appendChild($xml2Doc->importNode($fragment, true));
			}

			// force pretty printing and correct formatting, should not be needed, but it is.
			$xml->loadXML($xml2Doc->saveXML());

			$key = ($fileNames[$idx] !== null) ? $fileNames[$idx] : $idx;
			$chapterData[$key] = $xml->saveXML();
		}

		return $chapterData;
	}
}
?>
Return current item: EPub