Location: PHPKode > scripts > TablePress > tablepress/libraries/csv-parser.class.php
<?php
/**
 * CSV Parsing class for TablePress, used for import of CSV files
 *
 * @package TablePress
 * @subpackage Import
 * @author Tobias Bäthge
 * @since 1.0.0
 */

// Stop immediately if the ABSPATH constant is not defined, i.e. when the file is requested directly.
if ( ! defined( 'ABSPATH' ) ) {
	die( 'No direct script access allowed!' );
}

/**
 * CSV Parsing class
 * @package TablePress
 * @subpackage Import
 * @author Tobias Bäthge
 * @since 1.0.0
 */
class CSV_Parser {

	/**
	 * Enclosure character (double quote) that wraps cells which contain
	 * delimiters, line breaks, or (doubled) enclosure characters.
	 *
	 * @var string
	 */
	protected $enclosure = '"';

	/**
	 * Number of rows to analyze when attempting to auto-detect the delimiter.
	 *
	 * @var int
	 */
	protected $delimiter_search_max_lines = 15;

	/**
	 * Characters to ignore when attempting to auto-detect the delimiter.
	 * This string is inserted into a regular expression character class.
	 *
	 * @var string
	 */
	protected $non_delimiter_chars = "a-zA-Z0-9\n\r";

	/**
	 * Preferred delimiter characters, in order of preference. Only relevant when
	 * the filtering leaves more than one possible delimiter (happens very rarely).
	 *
	 * @var string
	 */
	protected $preferred_delimiter_chars = ";,\t";

	/**
	 * Data to import, always terminated by a line break (see load_data()).
	 *
	 * @var string
	 */
	protected $import_data;

	/**
	 * Most severe error that was found while parsing the input data.
	 *	0 = No errors found. Everything should be fine :)
	 *	1 = Hopefully correctable syntax error was found.
	 *	2 = Enclosure character (double quote by default) was found in non-enclosed field.
	 *		This means the file is either corrupt, or does not use standard CSV formatting.
	 *		Please validate the parsed data yourself.
	 *
	 * @var int
	 */
	public $error = 0;

	/**
	 * Detailed error info, keyed by "<line>-<column>" of the affected cell.
	 * Each entry is an array with the keys 'type', 'info', 'line', and 'column'.
	 *
	 * @var array
	 */
	public $error_info = array();

	/**
	 * Constructor
	 *
	 * @since 1.0.0
	 */
	public function __construct() {
		// intentionally left blank
	}

	/**
	 * Load data that shall be parsed
	 *
	 * A trailing line break is appended if it is missing, because the parser
	 * only completes a row when it reaches a line break.
	 *
	 * @since 1.0.0
	 *
	 * @param string $data Data to be parsed
	 */
	public function load_data( $data ) {
		// check for mandatory trailing line break
		if ( "\n" !== substr( $data, -1 ) ) {
			$data .= "\n";
		}
		$this->import_data = $data;
	}

	/**
	 * Detect the CSV delimiter, by analyzing some rows to determine most probable delimiter character
	 *
	 * Every non-enclosed character that is not in $this->non_delimiter_chars is counted per line.
	 * A character is a candidate if it appears in every analyzed line with a (nearly) constant count.
	 *
	 * @since 1.0.0
	 *
	 * @return string|null Most probable delimiter character, or null if no candidate was found
	 */
	public function find_delimiter() {
		$data = (string) $this->import_data;

		$delimiter_count = array();
		$enclosed = false;
		$current_line = 0;
		// the pattern does not change while scanning, so it is built only once
		$non_delimiter_pattern = '#[' . $this->non_delimiter_chars . ']#i';

		// walk through each character in the CSV string (up to $this->delimiter_search_max_lines)
		// and search potential delimiter characters
		$data_length = strlen( $data );
		for ( $i = 0; $i < $data_length; $i++ ) {
			$prev_char = ( $i > 0 ) ? $data[ $i - 1 ] : '';
			$curr_char = $data[ $i ];
			$next_char = ( $i + 1 < $data_length ) ? $data[ $i + 1 ] : '';

			if ( $curr_char === $this->enclosure ) {
				// open and closing quotes
				if ( ! $enclosed || $next_char !== $this->enclosure ) {
					$enclosed = ! $enclosed; // flip bool
				} else {
					$i++; // escaped quote ("") inside an enclosed cell: skip next character
				}
			} elseif ( ! $enclosed && ( ( "\n" === $curr_char && "\r" !== $prev_char ) || "\r" === $curr_char ) ) {
				// reached end of a line (the \n of a \r\n pair is not counted a second time)
				$current_line++;
				if ( $current_line >= $this->delimiter_search_max_lines ) {
					break;
				}
			} elseif ( ! $enclosed ) {
				// at this point $curr_char seems to be used as a delimiter, as it is not enclosed
				// count $curr_char if it is not in the non_delimiter_chars list
				if ( 0 === preg_match( $non_delimiter_pattern, $curr_char ) ) {
					if ( ! isset( $delimiter_count[ $curr_char ][ $current_line ] ) ) {
						$delimiter_count[ $curr_char ][ $current_line ] = 0; // init empty
					}
					$delimiter_count[ $curr_char ][ $current_line ]++;
				}
			}
		}

		// find most probable delimiter, by sorting their sort keys
		$potential_delimiters = array();
		foreach ( $delimiter_count as $char => $line_counts ) {
			$sort_key = $this->_check_delimiter_count( $char, $line_counts, $current_line );
			if ( false !== $sort_key ) {
				$potential_delimiters[ $sort_key ] = $char;
			}
		}
		// the keys are fixed-width strings, so they have to be compared as strings (and not as numbers)
		ksort( $potential_delimiters, SORT_STRING );
		// return first array element, as that has the best sort key (preferred character, highest count)
		return array_shift( $potential_delimiters );
	}

	/**
	 * Check if passed character can be a delimiter, by checking counts in each line
	 *
	 * @since 1.0.0
	 *
	 * @param string|char $char Character to check
	 * @param array $line_counts Number of occurrences of the character, per line index
	 * @param int $number_lines Number of lines that were analyzed
	 * @return bool|string False if delimiter is not possible, string to be used as a sort key if character could be a delimiter
	 */
	protected function _check_delimiter_count( $char, $line_counts, $number_lines ) {
		// was potential delimiter found in every line?
		if ( count( $line_counts ) != $number_lines ) {
			return false;
		}

		// check if count in every line is the same (or one higher for "almost")
		$first = null;
		$equal = null;
		$almost = false;
		foreach ( $line_counts as $count ) {
			if ( null === $first ) {
				$first = $count;
			} elseif ( $count == $first && false !== $equal ) {
				$equal = true;
			} elseif ( $count == $first + 1 && false !== $equal ) {
				$equal = true;
				$almost = true;
			} else {
				$equal = false;
			}
		}
		// check equality only if more than one row
		if ( $number_lines > 1 && ! $equal ) {
			return false;
		}

		// at this point, count is equal in all lines, determine a string to sort priority:
		// <3 digits position in preferred list><1 digit exact/almost match>.<5 digits inverted count>
		$match = ( $almost ) ? 2 : 1;
		$pref = strpos( $this->preferred_delimiter_chars, (string) $char );
		$pref = ( false !== $pref ) ? str_pad( (string) $pref, 3, '0', STR_PAD_LEFT ) : '999';
		// the inverted count has to be padded AFTER the subtraction, to get a fixed-width key
		$inverted_count = str_pad( (string) max( 0, 99999 - (int) $first ), 5, '0', STR_PAD_LEFT );
		return $pref . $match . '.' . $inverted_count;
	}

	/**
	 * Store information about a syntax error for a cell and raise the overall error level
	 *
	 * Only the first error of a cell is stored in $this->error_info.
	 *
	 * @param int $type Error type (1 or 2, see $this->error)
	 * @param int $line Line number (1-based) of the cell
	 * @param int $column Column number (1-based) of the cell
	 * @param string $message Description of the error, appended to the generic message
	 */
	protected function _record_error( $type, $line, $column, $message ) {
		if ( $this->error < $type ) {
			$this->error = $type;
		}
		$key = $line . '-' . $column;
		if ( ! isset( $this->error_info[ $key ] ) ) {
			$this->error_info[ $key ] = array(
				'type' => $type,
				'info' => "Syntax error found in line {$line}. " . $message,
				'line' => $line,
				'column' => $column
			);
		}
	}

	/**
	 * Parse CSV string into 2D array
	 *
	 * Syntax errors do not abort the parsing, but are reported through
	 * $this->error and $this->error_info.
	 *
	 * @since 1.0.0
	 *
	 * @param string $delimiter Delimiter character for the CSV parsing
	 * @return array 2D array with the data from the CSV string
	 */
	public function parse( $delimiter ) {
		$data = (string) $this->import_data;
		// find_delimiter() can return null, which is treated like an empty delimiter
		$delimiter = (string) $delimiter;

		$white_spaces = str_replace( $delimiter, '', " \t\x0B\0" ); // filter delimiter from the list, if it is a white-space character

		$rows = array(); // complete rows
		$row = array(); // row that is currently built
		$column = 0; // current column index
		$cell_content = ''; // content of the currently processed cell
		$enclosed = false;
		$was_enclosed = false; // enclosed cells are not trimmed of white-space, non-enclosed cells are

		// walk through each character in the CSV string
		$data_length = strlen( $data );
		for ( $i = 0; $i < $data_length; $i++ ) {
			$curr_char = $data[ $i ];
			$next_char = ( $i + 1 < $data_length ) ? $data[ $i + 1 ] : '';

			if ( $curr_char === $this->enclosure ) {
				// open/close quotes, and inline quotes
				if ( ! $enclosed ) {
					if ( '' === ltrim( $cell_content, $white_spaces ) ) {
						// only white-space so far, so this is the opening quote of the cell
						$enclosed = true;
						$was_enclosed = true;
					} else {
						$this->_record_error( 2, count( $rows ) + 1, $column + 1, 'Non-enclosed fields can not contain double-quotes.' );
						$cell_content .= $curr_char;
					}
				} elseif ( $next_char === $this->enclosure ) {
					// enclosure character within enclosed cell (" encoded as "")
					$cell_content .= $curr_char;
					$i++; // skip next character
				} elseif ( $next_char !== $delimiter && "\r" !== $next_char && "\n" !== $next_char ) {
					// skip white-space after the quote, to find out what really follows it
					$x = $i + 1;
					while ( $x < $data_length && '' === ltrim( $data[ $x ], $white_spaces ) ) {
						$x++;
					}
					$char_after_space = ( $x < $data_length ) ? $data[ $x ] : '';
					$ends_cell = ( '' !== $char_after_space )
						&& ( $char_after_space === $delimiter || "\r" === $char_after_space || "\n" === $char_after_space );
					if ( $ends_cell ) {
						// the " was the closing one, followed by white-space that can be ignored
						$enclosed = false;
						// continue right before the delimiter/line break, so that the next
						// iteration of the loop processes it and finishes the cell
						$i = $x - 1;
					} else {
						$this->_record_error( 1, count( $rows ) + 1, $column + 1, 'A single double-quote was found within an enclosed string. Enclosed double-quotes must be escaped with a second double-quote.' );
						$cell_content .= $curr_char;
						$enclosed = false;
					}
				} else {
					// the " was the closing one for the cell
					$enclosed = false;
				}
			} elseif ( ! $enclosed && ( $curr_char === $delimiter || "\n" === $curr_char || "\r" === $curr_char ) ) {
				// end of cell (by $delimiter), or end of line (by line break, and not enclosed!)

				$row[ $column ] = ( $was_enclosed ) ? $cell_content : trim( $cell_content );
				$cell_content = '';
				$was_enclosed = false;
				$column++;

				// end of line
				if ( "\n" === $curr_char || "\r" === $curr_char ) {
					// append completed row
					$rows[] = $row;
					$row = array();
					$column = 0;
					if ( "\r" === $curr_char && "\n" === $next_char ) {
						$i++; // skip next character in \r\n line breaks
					}
				}
			} else {
				// append character to current cell
				$cell_content .= $curr_char;
			}
		}

		return $rows;
	}

} // class CSV_Parser
Return current item: TablePress