Location: PHPKode > scripts > RFC 822 Email Address Parser > rfc822.php
<?php

	#
	# RFC 822/2822/5322 Email Parser
	#
	# By Cal Henderson <hide@address.com>
	#
	# This code is dual licensed:
	# CC Attribution-ShareAlike 2.5 - http://creativecommons.org/licenses/by-sa/2.5/
	# GPLv3 - http://www.gnu.org/copyleft/gpl.html
	#
	# $Revision$
	#

	##################################################################################

	function is_valid_email_address($email, $options=array()){

		#
		# you can pass a few different named options as a second argument,
		# but the defaults are usually a good choice.
		#

		$defaults = array(
			'allow_comments'	=> true,
			'public_internet'	=> true, # turn this off for 'strict' mode
		);

		$opts = array();
		foreach ($defaults as $k => $v) $opts[$k] = isset($options[$k]) ? $options[$k] : $v;
		$options = $opts;
		


		####################################################################################
		#
		# NO-WS-CTL       =       %d1-8 /         ; US-ASCII control characters
		#                         %d11 /          ;  that do not include the
		#                         %d12 /          ;  carriage return, line feed,
		#                         %d14-31 /       ;  and white space characters
		#                         %d127
		# ALPHA          =  %x41-5A / %x61-7A   ; A-Z / a-z
		# DIGIT          =  %x30-39

		$no_ws_ctl	= "[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x7f]";
		$alpha		= "[\\x41-\\x5a\\x61-\\x7a]";
		$digit		= "[\\x30-\\x39]";
		$cr		= "\\x0d";
		$lf		= "\\x0a";
		$crlf		= "(?:$cr$lf)";


		####################################################################################
		#
		# obs-char        =       %d0-9 / %d11 /          ; %d0-127 except CR and
		#                         %d12 / %d14-127         ;  LF
		# obs-text        =       *LF *CR *(obs-char *LF *CR)
		# text            =       %d1-9 /         ; Characters excluding CR and LF
		#                         %d11 /
		#                         %d12 /
		#                         %d14-127 /
		#                         obs-text
		# obs-qp          =       "\" (%d0-127)
		# quoted-pair     =       ("\" text) / obs-qp

		$obs_char	= "[\\x00-\\x09\\x0b\\x0c\\x0e-\\x7f]";
		$obs_text	= "(?:$lf*$cr*(?:$obs_char$lf*$cr*)*)";
		$text		= "(?:[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f]|$obs_text)";

		#
		# there's an issue with the definition of 'text', since 'obs_text' can
		# be blank and that allows qp's with no character after the slash. we're
		# treating that as bad, so this just checks we have at least one
		# (non-CRLF) character
		#

		$text		= "(?:$lf*$cr*$obs_char$lf*$cr*)";
		$obs_qp		= "(?:\\x5c[\\x00-\\x7f])";
		$quoted_pair	= "(?:\\x5c$text|$obs_qp)";


		####################################################################################
		#
		# obs-FWS         =       1*WSP *(CRLF 1*WSP)
		# FWS             =       ([*WSP CRLF] 1*WSP) /   ; Folding white space
		#                         obs-FWS
		# ctext           =       NO-WS-CTL /     ; Non white space controls
		#                         %d33-39 /       ; The rest of the US-ASCII
		#                         %d42-91 /       ;  characters not including "(",
		#                         %d93-126        ;  ")", or "\"
		# ccontent        =       ctext / quoted-pair / comment
		# comment         =       "(" *([FWS] ccontent) [FWS] ")"
		# CFWS            =       *([FWS] comment) (([FWS] comment) / FWS)

		#
		# note: we translate ccontent only partially to avoid an infinite loop
		# instead, we'll recursively strip *nested* comments before processing
		# the input. that will leave 'plain old comments' to be matched during
		# the main parse.
		#

		$wsp		= "[\\x20\\x09]";
		$obs_fws	= "(?:$wsp+(?:$crlf$wsp+)*)";
		$fws		= "(?:(?:(?:$wsp*$crlf)?$wsp+)|$obs_fws)";
		$ctext		= "(?:$no_ws_ctl|[\\x21-\\x27\\x2A-\\x5b\\x5d-\\x7e])";
		$ccontent	= "(?:$ctext|$quoted_pair)";
		$comment	= "(?:\\x28(?:$fws?$ccontent)*$fws?\\x29)";
		$cfws		= "(?:(?:$fws?$comment)*(?:$fws?$comment|$fws))";


		#
		# these are the rules for removing *nested* comments. we'll just detect
		# outer comment and replace it with an empty comment, and recurse until
		# we stop.
		#

		$outer_ccontent_dull	= "(?:$fws?$ctext|$quoted_pair)";
		$outer_ccontent_nest	= "(?:$fws?$comment)";
		$outer_comment		= "(?:\\x28$outer_ccontent_dull*(?:$outer_ccontent_nest$outer_ccontent_dull*)+$fws?\\x29)";


		####################################################################################
		#
		# atext           =       ALPHA / DIGIT / ; Any character except controls,
		#                         "!" / "#" /     ;  SP, and specials.
		#                         "$" / "%" /     ;  Used for atoms
		#                         "&" / "'" /
		#                         "*" / "+" /
		#                         "-" / "/" /
		#                         "=" / "?" /
		#                         "^" / "_" /
		#                         "`" / "{" /
		#                         "|" / "}" /
		#                         "~"
		# atom            =       [CFWS] 1*atext [CFWS]

		$atext		= "(?:$alpha|$digit|[\\x21\\x23-\\x27\\x2a\\x2b\\x2d\\x2f\\x3d\\x3f\\x5e\\x5f\\x60\\x7b-\\x7e])";
		$atom		= "(?:$cfws?(?:$atext)+$cfws?)";


		####################################################################################
		#
		# qtext           =       NO-WS-CTL /     ; Non white space controls
		#                         %d33 /          ; The rest of the US-ASCII
		#                         %d35-91 /       ;  characters not including "\"
		#                         %d93-126        ;  or the quote character
		# qcontent        =       qtext / quoted-pair
		# quoted-string   =       [CFWS]
		#                         DQUOTE *([FWS] qcontent) [FWS] DQUOTE
		#                         [CFWS]
		# word            =       atom / quoted-string

		$qtext		= "(?:$no_ws_ctl|[\\x21\\x23-\\x5b\\x5d-\\x7e])";
		$qcontent	= "(?:$qtext|$quoted_pair)";
		$quoted_string	= "(?:$cfws?\\x22(?:$fws?$qcontent)*$fws?\\x22$cfws?)";

		#
		# changed the '*' to a '+' to require that quoted strings are not empty
		#

		$quoted_string	= "(?:$cfws?\\x22(?:$fws?$qcontent)+$fws?\\x22$cfws?)";
		$word		= "(?:$atom|$quoted_string)";


		####################################################################################
		#
		# obs-local-part  =       word *("." word)
		# obs-domain      =       atom *("." atom)

		$obs_local_part	= "(?:$word(?:\\x2e$word)*)";
		$obs_domain	= "(?:$atom(?:\\x2e$atom)*)";


		####################################################################################
		#
		# dot-atom-text   =       1*atext *("." 1*atext)
		# dot-atom        =       [CFWS] dot-atom-text [CFWS]

		$dot_atom_text	= "(?:$atext+(?:\\x2e$atext+)*)";
		$dot_atom	= "(?:$cfws?$dot_atom_text$cfws?)";


		####################################################################################
		#
		# domain-literal  =       [CFWS] "[" *([FWS] dcontent) [FWS] "]" [CFWS]
		# dcontent        =       dtext / quoted-pair
		# dtext           =       NO-WS-CTL /     ; Non white space controls
		# 
		#                         %d33-90 /       ; The rest of the US-ASCII
		#                         %d94-126        ;  characters not including "[",
		#                                         ;  "]", or "\"

		$dtext		= "(?:$no_ws_ctl|[\\x21-\\x5a\\x5e-\\x7e])";
		$dcontent	= "(?:$dtext|$quoted_pair)";
		$domain_literal	= "(?:$cfws?\\x5b(?:$fws?$dcontent)*$fws?\\x5d$cfws?)";


		####################################################################################
		#
		# local-part      =       dot-atom / quoted-string / obs-local-part
		# domain          =       dot-atom / domain-literal / obs-domain
		# addr-spec       =       local-part "@" domain

		$local_part	= "(($dot_atom)|($quoted_string)|($obs_local_part))";
		$domain		= "(($dot_atom)|($domain_literal)|($obs_domain))";
		$addr_spec	= "$local_part\\x40$domain";



		#
		# this was previously 256 based on RFC3696, but dominic's errata was accepted.
		#

		if (strlen($email) > 254) return 0;


		#
		# we need to strip nested comments first - we replace them with a simple comment
		#

		if ($options['allow_comments']){

			$email = email_strip_comments($outer_comment, $email, "(x)");
		}


		#
		# now match what's left
		#

		if (!preg_match("!^$addr_spec$!", $email, $m)){

			return 0;
		}

		$bits = array(
			'local'			=> isset($m[1]) ? $m[1] : '',
			'local-atom'		=> isset($m[2]) ? $m[2] : '',
			'local-quoted'		=> isset($m[3]) ? $m[3] : '',
			'local-obs'		=> isset($m[4]) ? $m[4] : '',
			'domain'		=> isset($m[5]) ? $m[5] : '',
			'domain-atom'		=> isset($m[6]) ? $m[6] : '',
			'domain-literal'	=> isset($m[7]) ? $m[7] : '',
			'domain-obs'		=> isset($m[8]) ? $m[8] : '',
		);


		#
		# we need to now strip comments from $bits[local] and $bits[domain],
		# since we know they're in the right place and we want them out of the
		# way for checking IPs, label sizes, etc
		#

		if ($options['allow_comments']){
			$bits['local']	= email_strip_comments($comment, $bits['local']);
			$bits['domain']	= email_strip_comments($comment, $bits['domain']);
		}


		#
		# length limits on segments
		#

		if (strlen($bits['local']) > 64) return 0;
		if (strlen($bits['domain']) > 255) return 0;


		#
		# restrictions on domain-literals from RFC2821 section 4.1.3
		#
		# RFC4291 changed the meaning of :: in IPv6 addresses - i can mean one or
		# more zero groups (updated from 2 or more).
		#

		if (strlen($bits['domain-literal'])){

			$Snum			= "(\d{1,3})";
			$IPv4_address_literal	= "$Snum\.$Snum\.$Snum\.$Snum";

			$IPv6_hex		= "(?:[0-9a-fA-F]{1,4})";

			$IPv6_full		= "IPv6\:$IPv6_hex(?:\:$IPv6_hex){7}";

			$IPv6_comp_part		= "(?:$IPv6_hex(?:\:$IPv6_hex){0,7})?";
			$IPv6_comp		= "IPv6\:($IPv6_comp_part\:\:$IPv6_comp_part)";

			$IPv6v4_full		= "IPv6\:$IPv6_hex(?:\:$IPv6_hex){5}\:$IPv4_address_literal";

			$IPv6v4_comp_part	= "$IPv6_hex(?:\:$IPv6_hex){0,5}";
			$IPv6v4_comp		= "IPv6\:((?:$IPv6v4_comp_part)?\:\:(?:$IPv6v4_comp_part\:)?)$IPv4_address_literal";


			#
			# IPv4 is simple
			#

			if (preg_match("!^\[$IPv4_address_literal\]$!", $bits['domain'], $m)){

				if (intval($m[1]) > 255) return 0;
				if (intval($m[2]) > 255) return 0;
				if (intval($m[3]) > 255) return 0;
				if (intval($m[4]) > 255) return 0;

			}else{

				#
				# this should be IPv6 - a bunch of tests are needed here :)
				#

				while (1){

					if (preg_match("!^\[$IPv6_full\]$!", $bits['domain'])){
						break;
					}

					if (preg_match("!^\[$IPv6_comp\]$!", $bits['domain'], $m)){
						list($a, $b) = explode('::', $m[1]);
						$folded = (strlen($a) && strlen($b)) ? "$a:$b" : "$a$b";
						$groups = explode(':', $folded);
						if (count($groups) > 7) return 0;
						break;
					}

					if (preg_match("!^\[$IPv6v4_full\]$!", $bits['domain'], $m)){

						if (intval($m[1]) > 255) return 0;
						if (intval($m[2]) > 255) return 0;
						if (intval($m[3]) > 255) return 0;
						if (intval($m[4]) > 255) return 0;
						break;
					}

					if (preg_match("!^\[$IPv6v4_comp\]$!", $bits['domain'], $m)){
						list($a, $b) = explode('::', $m[1]);
						$b = substr($b, 0, -1); # remove the trailing colon before the IPv4 address
						$folded = (strlen($a) && strlen($b)) ? "$a:$b" : "$a$b";
						$groups = explode(':', $folded);
						if (count($groups) > 5) return 0;
						break;
					}

					return 0;
				}
			}			
		}else{

			#
			# the domain is either dot-atom or obs-domain - either way, it's
			# made up of simple labels and we split on dots
			#

			$labels = explode('.', $bits['domain']);


			#
			# this is allowed by both dot-atom and obs-domain, but is un-routeable on the
			# public internet, so we'll fail it (e.g. hide@address.com)
			#

			if ($options['public_internet']){
				if (count($labels) == 1) return 0;
			}


			#
			# checks on each label
			#

			foreach ($labels as $label){

				if (strlen($label) > 63) return 0;
				if (substr($label, 0, 1) == '-') return 0;
				if (substr($label, -1) == '-') return 0;
			}


			#
			# last label can't be all numeric
			#

			if ($options['public_internet']){
				if (preg_match('!^[0-9]+$!', array_pop($labels))) return 0;
			}
		}


		return 1;
	}

	##################################################################################

	function email_strip_comments($comment, $email, $replace=''){

		while (1){
			$new = preg_replace("!$comment!", $replace, $email);
			if (strlen($new) == strlen($email)){
				return $email;
			}
			$email = $new;
		}
	}

	##################################################################################
?>
Return current item: RFC 822 Email Address Parser