Latest source code will always be here: http://code.google.com/p/isemail/source/browse/trunk
<?php
/*
Copyright 2009 Dominic Sayers
    dominic_sayers@hotmail.com
    http://www.dominicsayers.com

Version 1.7

This source file is subject to the Common Public Attribution License Version 1.0 (CPAL) license.
The license terms are available through the world-wide-web at http://www.opensource.org/licenses/cpal_1.0
*/

// PHPLint modules
/*.
require_module 'standard';
require_module 'pcre';
.*/

/**
 * Check that an email address is syntactically valid.
 *
 * The address is split at its last at-sign. The local part is validated as a
 * sequence of dot-separated atoms or quoted strings (comments and folding
 * white space at the edges of each element are tolerated). The domain part is
 * validated either as a bracketed address literal (IPv4, IPv6 or mixed) or as
 * a dot-separated host name with at least two labels.
 *
 * Relevant specifications:
 *   http://tools.ietf.org/html/rfc5322
 *   http://tools.ietf.org/html/rfc3696
 *   http://tools.ietf.org/html/rfc5321
 *   http://tools.ietf.org/html/rfc4291#section-2.2
 *   http://tools.ietf.org/html/rfc1123#section-2.1
 *
 * @param string $email    The address to validate.
 * @param bool   $checkDNS When true (and checkdnsrr() is available), also
 *                         require an A or MX record for a host-name domain.
 *                         Address literals are never looked up.
 * @return bool True if the address passes every check, false otherwise.
 */
/*.boolean.*/ function is_email (/*.string.*/ $email, $checkDNS = false) {
    // The upper limit on address lengths should normally be considered to be 256
    //     (http://www.rfc-editor.org/errata_search.php?rfc=3696)
    //     NB I think John Klensin is misreading RFC 5321 and the limit should actually be 254
    //     However, I will stick to the published number until it is changed.
    //
    // The maximum total length of a reverse-path or forward-path is 256
    // characters (including the punctuation and element separators)
    //     (http://tools.ietf.org/html/rfc5321#section-4.5.3.1.3)
    $emailLength = strlen($email);
    if ($emailLength > 256) return false; // Too long

    // Contemporary email addresses consist of a "local part" separated from
    // a "domain part" (a fully-qualified domain name) by an at-sign ("@").
    //     (http://tools.ietf.org/html/rfc3696#section-3)
    $atIndex = strrpos($email, '@');

    if ($atIndex === false) return false; // No at-sign
    if ($atIndex === 0) return false; // No local part
    // strrpos() returns a zero-based offset, so an at-sign in the final
    // position is at $emailLength - 1 (never at $emailLength).
    if ($atIndex === $emailLength - 1) return false; // No domain part

    // Sanitize comments
    // - remove nested comments, quotes and dots in comments
    // - remove parentheses and dots from quoted strings
    // Offending characters are overwritten in place, so the string length and
    // the position of the at-sign are unchanged.
    $braceDepth = 0;
    $inQuote = false;
    $escapeThisChar = false;

    for ($i = 0; $i < $emailLength; ++$i) {
        $char = $email[$i];
        $replaceChar = false;

        if ($char === '\\') {
            $escapeThisChar = !$escapeThisChar; // Escape the next character?
        } else {
            switch ($char) {
            case '(':
                if ($escapeThisChar) {
                    $replaceChar = true;
                } elseif ($inQuote) {
                    $replaceChar = true;
                } else {
                    if ($braceDepth++ > 0) $replaceChar = true; // Increment brace depth
                }
                break;
            case ')':
                if ($escapeThisChar) {
                    $replaceChar = true;
                } elseif ($inQuote) {
                    $replaceChar = true;
                } else {
                    if (--$braceDepth > 0) $replaceChar = true; // Decrement brace depth
                    if ($braceDepth < 0) $braceDepth = 0;
                }
                break;
            case '"':
                if ($escapeThisChar) {
                    $replaceChar = true;
                } elseif ($braceDepth === 0) {
                    $inQuote = !$inQuote; // Are we inside a quoted string?
                } else {
                    $replaceChar = true;
                }
                break;
            case '.': // Dots don't help us either
                if ($escapeThisChar) {
                    $replaceChar = true;
                } elseif ($braceDepth > 0) {
                    $replaceChar = true;
                }
                break;
            }

            $escapeThisChar = false;
            if ($replaceChar) $email[$i] = 'x'; // Replace the offending character with something harmless
        }
    }

    $localPart = substr($email, 0, $atIndex);
    $domain = substr($email, $atIndex + 1);
    $FWS = "(?:(?:(?:[ \\t]*(?:\\r\\n))?[ \\t]+)|(?:[ \\t]+(?:(?:\\r\\n)[ \\t]+)*))"; // Folding white space
    $trimFWS = "/^$FWS|$FWS\$/"; // Leading or trailing folding white space

    // Let's check the local part for RFC compliance...
    //
    // local-part      =       dot-atom / quoted-string / obs-local-part
    // obs-local-part  =       word *("." word)
    //     (http://tools.ietf.org/html/rfc5322#section-3.4.1)
    //
    // Problem: need to distinguish between "first.last" and "first"."last"
    // (i.e. one element or two), so only split on dots outside quoted strings.
    $dotArray = /*. (array[int]string) .*/ preg_split('/\\.(?=(?:[^\\"]*\\"[^\\"]*\\")*(?![^\\"]*\\"))/m', $localPart);
    $partLength = 0;

    foreach ($dotArray as $element) {
        // Remove any leading or trailing FWS
        $element = preg_replace($trimFWS, '', $element);

        // Then we need to remove all valid comments (i.e. those at the start or end of the element)
        $elementLength = strlen($element);

        // Guard on length: reading offset 0 of an empty string raises a notice/warning
        if ($elementLength > 0 && $element[0] === '(') {
            $indexBrace = strpos($element, ')');
            if ($indexBrace !== false) {
                if (preg_match('/(?<!\\\\)[\\(\\)]/', substr($element, 1, $indexBrace - 1)) > 0) {
                    return false; // Illegal characters in comment
                }
                // Cast because substr() returns false (not '') on older PHP when nothing follows the comment
                $element = (string) substr($element, $indexBrace + 1);
                $elementLength = strlen($element);
            }
        }

        if ($elementLength > 0 && $element[$elementLength - 1] === ')') {
            $indexBrace = strrpos($element, '(');
            if ($indexBrace !== false) {
                if (preg_match('/(?<!\\\\)(?:[\\(\\)])/', substr($element, $indexBrace + 1, $elementLength - $indexBrace - 2)) > 0) {
                    return false; // Illegal characters in comment
                }
                $element = (string) substr($element, 0, $indexBrace);
                $elementLength = strlen($element);
            }
        }

        // Remove any leading or trailing FWS around the element (inside any comments)
        $element = preg_replace($trimFWS, '', $element);

        // What's left counts towards the maximum length for this part
        if ($partLength > 0) $partLength++; // for the dot
        $partLength += strlen($element);

        // Each dot-delimited component can be an atom or a quoted string
        // (because of the obs-local-part provision)
        if (preg_match('/^"(?:.)*"$/s', $element) > 0) {
            // Quoted-string tests:
            //
            // Remove any FWS
            $element = preg_replace("/(?<!\\\\)$FWS/", '', $element);
            // Telling \" from \\" from \\\" in a single regex is awkward,
            // so neutralise every escaped backslash (\\) first...
            $element = preg_replace('/\\\\\\\\/', ' ', $element);
            if (preg_match('/(?<!\\\\|^)["\\r\\n\\x00](?!$)|\\\\"$|""/', $element) > 0) return false; // ", CR, LF and NUL must be escaped, "" is too short
        } else {
            // Unquoted string tests:
            //
            // Period (".") may...appear, but may not be used to start or end the
            // local part, nor may two or more consecutive periods appear.
            //     (http://tools.ietf.org/html/rfc3696#section-3)
            //
            // A zero-length element implies a period at the beginning or end of the
            // local part, or two periods together. Either way it's not allowed.
            if ($element === '') return false; // Dots in wrong place

            // Any ASCII graphic (printing) character other than the
            // at-sign ("@"), backslash, double quote, comma, or square brackets may
            // appear without quoting. If any of that list of excluded characters
            // are to appear, they must be quoted
            //     (http://tools.ietf.org/html/rfc3696#section-3)
            //
            // Any excluded characters? i.e. 0x00-0x20, (, ), <, >, [, ], :, ;, @, \, comma, period, "
            if (preg_match('/[\\x00-\\x20\\(\\)<>\\[\\]:;@\\\\,\\."]/', $element) > 0) return false; // These characters must be in a quoted string
        }
    }

    if ($partLength > 64) return false; // Local part must be 64 characters or less

    // Now let's check the domain part...

    // The domain name can also be replaced by an IP address in square brackets
    //     (http://tools.ietf.org/html/rfc3696#section-3)
    //     (http://tools.ietf.org/html/rfc5321#section-4.1.3)
    //     (http://tools.ietf.org/html/rfc4291#section-2.2)
    if (preg_match('/^\\[(.)+]$/', $domain) === 1) {
        // It's an address-literal
        $addressLiteral = substr($domain, 1, strlen($domain) - 2);
        $matchesIP = array();

        // Extract IPv4 part from the end of the address-literal (if there is one)
        if (preg_match('/\\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$/', $addressLiteral, $matchesIP) > 0) {
            $index = strrpos($addressLiteral, $matchesIP[0]);

            if ($index === 0) {
                // Nothing there except a valid IPv4 address, so...
                return true;
            } else {
                // Assume it's an attempt at a mixed address (IPv6 + IPv4)
                if ($addressLiteral[$index - 1] !== ':') return false; // Character preceding IPv4 address must be ':'
                if (substr($addressLiteral, 0, 5) !== 'IPv6:') return false; // RFC5321 section 4.1.3

                // Keep the IPv6 groups between the "IPv6:" tag and the colon before the IPv4 part;
                // when the prefix is just "::" keep both colons.
                $IPv6 = substr($addressLiteral, 5, ($index === 7) ? 2 : $index - 6);
                $groupMax = 6;
            }
        } else {
            // It must be an attempt at pure IPv6
            if (substr($addressLiteral, 0, 5) !== 'IPv6:') return false; // RFC5321 section 4.1.3
            $IPv6 = substr($addressLiteral, 5);
            $groupMax = 8;
        }

        // Each match is one group; any character that fits no group is captured in sub-pattern 1
        $groupCount = preg_match_all('/^[0-9a-fA-F]{0,4}|\\:[0-9a-fA-F]{0,4}|(.)/', $IPv6, $matchesIP);
        $index = strpos($IPv6, '::');

        if ($index === false) {
            // We need exactly the right number of groups
            if ($groupCount !== $groupMax) return false; // RFC5321 section 4.1.3
        } else {
            if ($index !== strrpos($IPv6, '::')) return false; // More than one '::'
            $groupMax = ($index === 0 || $index === (strlen($IPv6) - 2)) ? $groupMax : $groupMax - 1;
            if ($groupCount > $groupMax) return false; // Too many IPv6 groups in address
        }

        // Check for unmatched characters: sorting descending brings any non-empty capture to the front
        array_multisort($matchesIP[1], SORT_DESC);
        if ($matchesIP[1][0] !== '') return false; // Illegal characters in address

        // It's a valid IPv6 address, so...
        return true;
    } else {
        // It's a domain name...

        // The syntax of a legal Internet host name was specified in RFC-952
        // One aspect of host name syntax is hereby changed: the
        // restriction on the first character is relaxed to allow either a
        // letter or a digit.
        //     (http://tools.ietf.org/html/rfc1123#section-2.1)
        //
        // NB RFC 1123 updates RFC 1035, but this is not currently apparent from reading RFC 1035.
        //
        // Most common applications, including email and the Web, will generally not
        // permit...escaped strings
        //     (http://tools.ietf.org/html/rfc3696#section-2)
        //
        // the better strategy has now become to make the "at least one period" test,
        // to verify LDH conformance (including verification that the apparent TLD name
        // is not all-numeric)
        //     (http://tools.ietf.org/html/rfc3696#section-2)
        //
        // Characters outside the set of alphabetic characters, digits, and hyphen MUST NOT appear in domain name
        // labels for SMTP clients or servers
        //     (http://tools.ietf.org/html/rfc5321#section-4.1.2)
        //
        // RFC5321 precludes the use of a trailing dot in a domain name for SMTP purposes
        //     (http://tools.ietf.org/html/rfc5321#section-4.1.2)
        $dotArray = /*. (array[int]string) .*/ preg_split('/\\.(?=(?:[^\\"]*\\"[^\\"]*\\")*(?![^\\"]*\\"))/m', $domain);
        $partLength = 0;
        $lastElement = ''; // The final label after cleaning, i.e. the apparent TLD

        if (count($dotArray) === 1) return false; // Mail host can't be a TLD

        foreach ($dotArray as $element) {
            // Remove any leading or trailing FWS
            $element = preg_replace($trimFWS, '', $element);

            // Then we need to remove all valid comments (i.e. those at the start or end of the element)
            $elementLength = strlen($element);

            // Guard on length: reading offset 0 of an empty string raises a notice/warning
            if ($elementLength > 0 && $element[0] === '(') {
                $indexBrace = strpos($element, ')');
                if ($indexBrace !== false) {
                    if (preg_match('/(?<!\\\\)[\\(\\)]/', substr($element, 1, $indexBrace - 1)) > 0) {
                        return false; // Illegal characters in comment
                    }
                    // Cast because substr() returns false (not '') on older PHP when nothing follows the comment
                    $element = (string) substr($element, $indexBrace + 1);
                    $elementLength = strlen($element);
                }
            }

            if ($elementLength > 0 && $element[$elementLength - 1] === ')') {
                $indexBrace = strrpos($element, '(');
                if ($indexBrace !== false) {
                    if (preg_match('/(?<!\\\\)(?:[\\(\\)])/', substr($element, $indexBrace + 1, $elementLength - $indexBrace - 2)) > 0) {
                        return false; // Illegal characters in comment
                    }
                    $element = (string) substr($element, 0, $indexBrace);
                    $elementLength = strlen($element);
                }
            }

            // Remove any leading or trailing FWS around the element (inside any comments)
            $element = preg_replace($trimFWS, '', $element);
            // Re-measure: the trim above may have shortened (or emptied) the label,
            // and the length checks below must see what is actually left.
            $elementLength = strlen($element);

            // What's left counts towards the maximum length for this part
            if ($partLength > 0) $partLength++; // for the dot
            $partLength += $elementLength;

            // The DNS defines domain name syntax very generally -- a
            // string of labels each containing up to 63 8-bit octets,
            // separated by dots, and with a maximum total of 255
            // octets.
            //     (http://tools.ietf.org/html/rfc1123#section-6.1.3.5)
            if ($elementLength > 63) return false; // Label must be 63 characters or less

            // Each dot-delimited component must be atext
            // A zero-length element implies a period at the beginning or end of the
            // domain part, or two periods together. Either way it's not allowed.
            if ($elementLength === 0) return false; // Dots in wrong place

            // Any ASCII graphic (printing) character other than the
            // at-sign ("@"), backslash, double quote, comma, or square brackets may
            // appear without quoting. If any of that list of excluded characters
            // are to appear, they must be quoted
            //     (http://tools.ietf.org/html/rfc3696#section-3)
            //
            // If the hyphen is used, it is not permitted to appear at
            // either the beginning or end of a label.
            //     (http://tools.ietf.org/html/rfc3696#section-2)
            //
            // Any excluded characters? i.e. 0x00-0x20, (, ), <, >, [, ], :, ;, @, \, comma, period, "
            if (preg_match('/[\\x00-\\x20\\(\\)<>\\[\\]:;@\\\\,\\."]|^-|-$/', $element) > 0) {
                return false;
            }

            $lastElement = $element;
        }

        if ($partLength > 255) return false; // Domain part must be 255 characters or less

        if (preg_match('/^[0-9]+$/', $lastElement) > 0) return false; // TLD can't be all-numeric

        // Check DNS?
        if ($checkDNS && function_exists('checkdnsrr')) {
            if (!(checkdnsrr($domain, 'A') || checkdnsrr($domain, 'MX'))) {
                return false; // Domain doesn't actually exist
            }
        }
    }

    // Eliminate all other factors, and the one which remains must be the truth.
    //     (Sherlock Holmes, The Sign of Four)
    return true;
}
?>
I've also uploaded this function to Google Code, where you can be sure of getting the latest version: <a href="http://code.google.com/p/isemail/source/browse/trunk" target="_blank">http://code.google.com/p/isemail/source/browse/trunk</a>
Comments
Add your comment