RFC-compliant email address validator

Latest source code will always be here: http://code.google.com/p/isemail/source/browse/trunk

<?php
/*
Copyright 2009 Dominic Sayers
	dominic_sayers@hotmail.com
	http://www.dominicsayers.com

Version 1.7

This source file is subject to the Common Public Attribution License Version 1.0 (CPAL) license.
The license terms are available through the world-wide-web at http://www.opensource.org/licenses/cpal_1.0
*/

//	PHPLint modules
/*.
	require_module 'standard';
	require_module 'pcre';
.*/
/**
 * Check that an email address is syntactically valid.
 *
 * The address is split at its last at-sign. The local part is validated as a
 * dot-separated sequence of atoms and/or quoted strings (comments and folding
 * white space at the edges of each element are tolerated). The domain is
 * validated either as an address literal in square brackets (IPv4, IPv6 or
 * mixed IPv6 + IPv4) or as a dot-separated host name with at least two labels.
 *
 * @param string  $email    The address to validate
 * @param boolean $checkDNS If true (and checkdnsrr() is available), a host-name
 *                          domain must also have an A or MX record. Address
 *                          literals are never looked up.
 * @return boolean          true if the address passes every check, false otherwise
 */
/*.boolean.*/ function is_email (/*.string.*/ $email, $checkDNS = false) {
	//	Check that $email is a valid address. Read the following RFCs to understand the constraints:
	//		(http://tools.ietf.org/html/rfc5322)
	//		(http://tools.ietf.org/html/rfc3696)
	//		(http://tools.ietf.org/html/rfc5321)
	//		(http://tools.ietf.org/html/rfc4291#section-2.2)
	//		(http://tools.ietf.org/html/rfc1123#section-2.1)

	//	the upper limit on address lengths should normally be considered to be 256
	//		(http://www.rfc-editor.org/errata_search.php?rfc=3696)
	//		NB I think John Klensin is misreading RFC 5321 and the limit should actually be 254
	//		However, I will stick to the published number until it is changed.
	//
	//	The maximum total length of a reverse-path or forward-path is 256
	//	characters (including the punctuation and element separators)
	//		(http://tools.ietf.org/html/rfc5321#section-4.5.3.1.3)
	$emailLength = strlen($email);
	if ($emailLength > 256)	return false;	//	Too long

	//	Contemporary email addresses consist of a "local part" separated from
	//	a "domain part" (a fully-qualified domain name) by an at-sign ("@").
	//		(http://tools.ietf.org/html/rfc3696#section-3)
	$atIndex		= strrpos($email,'@');

	if ($atIndex === false)					return false;	//	No at-sign
	if ($atIndex === 0)						return false;	//	No local part
	//	strrpos() returns a zero-based offset, so an at-sign in the final
	//	position is at $emailLength - 1 (not $emailLength)
	if ($atIndex === $emailLength - 1)		return false;	//	No domain part

	//	Sanitize comments
	//	- remove nested comments, quotes and dots in comments
	//	- remove parentheses and dots from quoted strings
	//	"Remove" here means overwriting the character with 'x' so that the
	//	string length (and therefore $atIndex) is unaffected.
	$braceDepth		= 0;
	$inQuote		= false;
	$escapeThisChar	= false;

	for ($i = 0; $i < $emailLength; ++$i) {
		$char = $email[$i];
		$replaceChar = false;

		if ($char === '\\') {
			$escapeThisChar = !$escapeThisChar;	//	Escape the next character?
		} else {
			switch ($char) {
			case '(':
				if ($escapeThisChar) {
					$replaceChar = true;
				} else {
					if ($inQuote) {
						$replaceChar = true;
					} else {
						if ($braceDepth++ > 0) $replaceChar = true;	//	Increment brace depth
					}
				}

				break;
			case ')':
				if ($escapeThisChar) {
					$replaceChar = true;
				} else {
					if ($inQuote) {
						$replaceChar = true;
					} else {
						if (--$braceDepth > 0) $replaceChar = true;	//	Decrement brace depth
						if ($braceDepth < 0) $braceDepth = 0;
					}
				}

				break;
			case '"':
				if ($escapeThisChar) {
					$replaceChar = true;
				} else {
					if ($braceDepth === 0) {
						$inQuote = !$inQuote;	//	Are we inside a quoted string?
					} else {
						$replaceChar = true;
					}
				}

				break;
			case '.':	//	Dots don't help us either
				if ($escapeThisChar) {
					$replaceChar = true;
				} else {
					if ($braceDepth > 0) $replaceChar = true;
				}

				break;
			}

			$escapeThisChar = false;
			if ($replaceChar) $email[$i] = 'x';	//	Replace the offending character with something harmless
		}
	}

	$localPart		= substr($email, 0, $atIndex);
	$domain			= substr($email, $atIndex + 1);
	$FWS			= "(?:(?:(?:[ \\t]*(?:\\r\\n))?[ \\t]+)|(?:[ \\t]+(?:(?:\\r\\n)[ \\t]+)*))";	//	Folding white space
	//	Let's check the local part for RFC compliance...
	//
	//	local-part      =       dot-atom / quoted-string / obs-local-part
	//	obs-local-part  =       word *("." word)
	//		(http://tools.ietf.org/html/rfc5322#section-3.4.1)
	//
	//	Problem: need to distinguish between "first.last" and "first"."last"
	//	(i.e. one element or two). The split below only breaks on dots that
	//	are followed by an even number of double quotes, i.e. dots that are
	//	outside a quoted string.
	$dotArray	= /*. (array[int]string) .*/ preg_split('/\\.(?=(?:[^\\"]*\\"[^\\"]*\\")*(?![^\\"]*\\"))/m', $localPart);
	$partLength = 0;

	foreach ($dotArray as $element) {
		//	Remove any leading or trailing FWS
		$element = preg_replace("/^$FWS|$FWS\$/", '', $element);

		//	Then we need to remove all valid comments (i.e. those at the start or end of the element)
		$elementLength = strlen($element);

		//	The length guards prevent reading a string offset on an empty element
		//	(e.g. two consecutive dots), which would raise a warning
		if ($elementLength > 0 && $element[0] === '(') {
			$indexBrace = strpos($element, ')');
			if ($indexBrace !== false) {
				if (preg_match('/(?<!\\\\)[\\(\\)]/', substr($element, 1, $indexBrace - 1)) > 0) {
					return false;	//	Illegal characters in comment
				}
				$element = substr($element, $indexBrace + 1, $elementLength - $indexBrace - 1);
				$elementLength = strlen($element);
			}
		}

		if ($elementLength > 0 && $element[$elementLength - 1] === ')') {
			$indexBrace = strrpos($element, '(');
			if ($indexBrace !== false) {
				if (preg_match('/(?<!\\\\)(?:[\\(\\)])/', substr($element, $indexBrace + 1, $elementLength - $indexBrace - 2)) > 0) {
					return false;	//	Illegal characters in comment
				}
				$element = substr($element, 0, $indexBrace);
				$elementLength = strlen($element);
			}
		}

		//	Remove any leading or trailing FWS around the element (inside any comments)
		$element = preg_replace("/^$FWS|$FWS\$/", '', $element);

		//	What's left counts towards the maximum length for this part
		if ($partLength > 0) $partLength++;	//	for the dot
		$partLength += strlen($element);

		//	Each dot-delimited component can be an atom or a quoted string
		//	(because of the obs-local-part provision)
		if (preg_match('/^"(?:.)*"$/s', $element) > 0) {
			//	Quoted-string tests:
			//
			//	Remove any FWS
			$element = preg_replace("/(?<!\\\\)$FWS/", '', $element);
			//	Distinguishing between \" \\" \\\" \\\\" etc. in one regex is awkward,
			//	so remove all \\ from the string first...
			$element = preg_replace('/\\\\\\\\/', ' ', $element);
			if (preg_match('/(?<!\\\\|^)["\\r\\n\\x00](?!$)|\\\\"$|""/', $element) > 0)	return false;	//	", CR, LF and NUL must be escaped, "" is too short
		} else {
			//	Unquoted string tests:
			//
			//	Period (".") may...appear, but may not be used to start or end the
			//	local part, nor may two or more consecutive periods appear.
			//		(http://tools.ietf.org/html/rfc3696#section-3)
			//
			//	A zero-length element implies a period at the beginning or end of the
			//	local part, or two periods together. Either way it's not allowed.
			if ($element === '')														return false;	//	Dots in wrong place

			//	Any ASCII graphic (printing) character other than the
			//	at-sign ("@"), backslash, double quote, comma, or square brackets may
			//	appear without quoting.  If any of that list of excluded characters
			//	are to appear, they must be quoted
			//		(http://tools.ietf.org/html/rfc3696#section-3)
			//
			//	Any excluded characters? i.e. 0x00-0x20, (, ), <, >, [, ], :, ;, @, \, comma, period, "
			if (preg_match('/[\\x00-\\x20\\(\\)<>\\[\\]:;@\\\\,\\."]/', $element) > 0)	return false;	//	These characters must be in a quoted string
		}
	}

	if ($partLength > 64) return false;	// Local part must be 64 characters or less

	//	Now let's check the domain part...

	//	The domain name can also be replaced by an IP address in square brackets
	//		(http://tools.ietf.org/html/rfc3696#section-3)
	//		(http://tools.ietf.org/html/rfc5321#section-4.1.3)
	//		(http://tools.ietf.org/html/rfc4291#section-2.2)
	if (preg_match('/^\\[(.)+]$/', $domain) === 1) {
		//	It's an address-literal
		$addressLiteral = substr($domain, 1, strlen($domain) - 2);
		$matchesIP		= array();

		//	Extract IPv4 part from the end of the address-literal (if there is one)
		if (preg_match('/\\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$/', $addressLiteral, $matchesIP) > 0) {
			$index = strrpos($addressLiteral, $matchesIP[0]);

			if ($index === 0) {
				//	Nothing there except a valid IPv4 address, so...
				return true;
			} else {
				//	Assume it's an attempt at a mixed address (IPv6 + IPv4)
				if ($addressLiteral[$index - 1] !== ':')		return false;	//	Character preceding IPv4 address must be ':'
				if (substr($addressLiteral, 0, 5) !== 'IPv6:')	return false;	//	RFC5321 section 4.1.3

				//	Take the IPv6 groups between the 'IPv6:' tag and the IPv4 tail.
				//	When the IPv4 part starts at offset 7 the IPv6 part is just '::'.
				$IPv6 = substr($addressLiteral, 5, ($index ===7) ? 2 : $index - 6);
				$groupMax = 6;	//	The IPv4 tail stands in for the last two of the eight groups
			}
		} else {
			//	It must be an attempt at pure IPv6
			if (substr($addressLiteral, 0, 5) !== 'IPv6:')		return false;	//	RFC5321 section 4.1.3
			$IPv6 = substr($addressLiteral, 5);
			$groupMax = 8;
		}

		//	Each match is one group; anything that is not a hex group or a
		//	colon-prefixed hex group is captured in sub-pattern 1
		$groupCount	= preg_match_all('/^[0-9a-fA-F]{0,4}|\\:[0-9a-fA-F]{0,4}|(.)/', $IPv6, $matchesIP);
		$index		= strpos($IPv6,'::');

		if ($index === false) {
			//	We need exactly the right number of groups
			if ($groupCount !== $groupMax)						return false;	//	RFC5321 section 4.1.3
		} else {
			if ($index !== strrpos($IPv6,'::'))					return false;	//	More than one '::'
			$groupMax = ($index === 0 || $index === (strlen($IPv6) - 2)) ? $groupMax : $groupMax - 1;
			if ($groupCount > $groupMax)						return false;	//	Too many IPv6 groups in address
		}

		//	Check for unmatched characters: sorting descending brings any
		//	non-empty capture of sub-pattern 1 to the front of the array
		array_multisort($matchesIP[1], SORT_DESC);
		if ($matchesIP[1][0] !== '')							return false;	//	Illegal characters in address

		//	It's a valid IPv6 address, so...
		return true;
	} else {
		//	It's a domain name...

		//	The syntax of a legal Internet host name was specified in RFC-952
		//	One aspect of host name syntax is hereby changed: the
		//	restriction on the first character is relaxed to allow either a
		//	letter or a digit.
		//		(http://tools.ietf.org/html/rfc1123#section-2.1)
		//
		//	NB RFC 1123 updates RFC 1035, but this is not currently apparent from reading RFC 1035.
		//
		//	Most common applications, including email and the Web, will generally not
		//	permit...escaped strings
		//		(http://tools.ietf.org/html/rfc3696#section-2)
		//
		//	the better strategy has now become to make the "at least one period" test,
		//	to verify LDH conformance (including verification that the apparent TLD name
		//	is not all-numeric)
		//		(http://tools.ietf.org/html/rfc3696#section-2)
		//
		//	Characters outside the set of alphabetic characters, digits, and hyphen MUST NOT appear in domain name
		//	labels for SMTP clients or servers
		//		(http://tools.ietf.org/html/rfc5321#section-4.1.2)
		//
		//	RFC5321 precludes the use of a trailing dot in a domain name for SMTP purposes
		//		(http://tools.ietf.org/html/rfc5321#section-4.1.2)
		$dotArray	= /*. (array[int]string) .*/ preg_split('/\\.(?=(?:[^\\"]*\\"[^\\"]*\\")*(?![^\\"]*\\"))/m', $domain);
		$partLength = 0;

		if (count($dotArray) === 1)					return false;	//	Mail host can't be a TLD

		foreach ($dotArray as $element) {
			//	Remove any leading or trailing FWS
			$element = preg_replace("/^$FWS|$FWS\$/", '', $element);

			//	Then we need to remove all valid comments (i.e. those at the start or end of the element)
			$elementLength = strlen($element);

			//	The length guards prevent reading a string offset on an empty label
			if ($elementLength > 0 && $element[0] === '(') {
				$indexBrace = strpos($element, ')');
				if ($indexBrace !== false) {
					if (preg_match('/(?<!\\\\)[\\(\\)]/', substr($element, 1, $indexBrace - 1)) > 0) {
													return false;	//	Illegal characters in comment
					}
					$element = substr($element, $indexBrace + 1, $elementLength - $indexBrace - 1);
					$elementLength = strlen($element);
				}
			}

			if ($elementLength > 0 && $element[$elementLength - 1] === ')') {
				$indexBrace = strrpos($element, '(');
				if ($indexBrace !== false) {
					if (preg_match('/(?<!\\\\)(?:[\\(\\)])/', substr($element, $indexBrace + 1, $elementLength - $indexBrace - 2)) > 0) {
													return false;	//	Illegal characters in comment
					}
					$element = substr($element, 0, $indexBrace);
					$elementLength = strlen($element);
				}
			}

			//	Remove any leading or trailing FWS around the element (inside any comments)
			$element = preg_replace("/^$FWS|$FWS\$/", '', $element);

			//	Re-measure after the final strip: the length checks below must see
			//	the label itself, not white space that sat between two comments
			//	(otherwise a label such as "(c) (d)" would count as non-empty)
			$elementLength = strlen($element);

			//	What's left counts towards the maximum length for this part
			if ($partLength > 0) $partLength++;	//	for the dot
			$partLength += $elementLength;

			//	The DNS defines domain name syntax very generally -- a
			//	string of labels each containing up to 63 8-bit octets,
			//	separated by dots, and with a maximum total of 255
			//	octets.
			//		(http://tools.ietf.org/html/rfc1123#section-6.1.3.5)
			if ($elementLength > 63)				return false;	//	Label must be 63 characters or less

			//	Each dot-delimited component must be atext
			//	A zero-length element implies a period at the beginning or end of the
			//	domain, or two periods together. Either way it's not allowed.
			if ($elementLength === 0)				return false;	//	Dots in wrong place

			//	Any ASCII graphic (printing) character other than the
			//	at-sign ("@"), backslash, double quote, comma, or square brackets may
			//	appear without quoting.  If any of that list of excluded characters
			//	are to appear, they must be quoted
			//		(http://tools.ietf.org/html/rfc3696#section-3)
			//
			//	If the hyphen is used, it is not permitted to appear at
			//	either the beginning or end of a label.
			//		(http://tools.ietf.org/html/rfc3696#section-2)
			//
			//	Any excluded characters? i.e. 0x00-0x20, (, ), <, >, [, ], :, ;, @, \, comma, period, "
			if (preg_match('/[\\x00-\\x20\\(\\)<>\\[\\]:;@\\\\,\\."]|^-|-$/', $element) > 0) {
													return false;
			}
		}

		if ($partLength > 255) 						return false;	//	Domain must be 255 characters or less

		//	$element still holds the last label processed by the loop above, i.e. the TLD
		if (preg_match('/^[0-9]+$/', $element) > 0)	return false;	//	TLD can't be all-numeric

		//	Check DNS?
		if ($checkDNS && function_exists('checkdnsrr')) {
			if (!(checkdnsrr($domain, 'A') || checkdnsrr($domain, 'MX'))) {
													return false;	//	Domain doesn't actually exist
			}
		}
	}

	//	Eliminate all other factors, and the one which remains must be the truth.
	//		(Sherlock Holmes, The Sign of Four)
	return true;
}
?>

Usage

I've also uploaded this function to Google Code, where you can be sure of getting the latest version: <a href="http://code.google.com/p/isemail/source/browse/trunk" target="_blank">http://code.google.com/p/isemail/source/browse/trunk</a>


Comments

Add your comment