Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- <?php
- #
- # RFC 822/2822/5322 Email Parser
- #
- # By Cal Henderson <[email protected]>
- #
- # This code is dual licensed:
- # CC Attribution-ShareAlike 2.5 - http://creativecommons.org/licenses/by-sa/2.5/
- # GPLv3 - http://www.gnu.org/copyleft/gpl.html
- #
- # $Revision$
- #
- ##################################################################################
#
# Checks whether $email is a syntactically valid addr-spec according to the
# RFC 822/2822/5322 grammar, plus the length and domain restrictions below.
#
# $email   - the address to check (string)
# $options - optional array of named flags:
#              'allow_comments'  (default true)  accept RFC comments "(...)".
#                                When false, any comment makes the address
#                                invalid.
#              'public_internet' (default true)  additionally require a
#                                domain with more than one label whose last
#                                label is not all-numeric.
#
# Returns 1 when the address is valid, 0 otherwise.
#
function is_valid_email_address($email, $options=array()){

    #
    # you can pass a few different named options as a second argument,
    # but the defaults are usually a good choice.
    #

    $defaults = array(
        'allow_comments'  => true,
        'public_internet' => true, # turn this off for 'strict' mode
    );

    $opts = array();
    foreach ($defaults as $k => $v) $opts[$k] = isset($options[$k]) ? $options[$k] : $v;
    $options = $opts;

    ####################################################################################
    #
    # NO-WS-CTL = %d1-8 /     ; US-ASCII control characters
    #             %d11 /      ; that do not include the
    #             %d12 /      ; carriage return, line feed,
    #             %d14-31 /   ; and white space characters
    #             %d127
    # ALPHA     = %x41-5A / %x61-7A ; A-Z / a-z
    # DIGIT     = %x30-39

    $no_ws_ctl = "[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x7f]";
    $alpha     = "[\\x41-\\x5a\\x61-\\x7a]";
    $digit     = "[\\x30-\\x39]";
    $cr        = "\\x0d";
    $lf        = "\\x0a";
    $crlf      = "(?:$cr$lf)";

    ####################################################################################
    #
    # obs-char    = %d0-9 / %d11 /        ; %d0-127 except CR and
    #               %d12 / %d14-127       ; LF
    # obs-text    = *LF *CR *(obs-char *LF *CR)
    # text        = %d1-9 /               ; Characters excluding CR and LF
    #               %d11 /
    #               %d12 /
    #               %d14-127 /
    #               obs-text
    # obs-qp      = "\" (%d0-127)
    # quoted-pair = ("\" text) / obs-qp
    #
    # there's an issue with the definition of 'text', since 'obs_text' can
    # be blank and that allows qp's with no character after the slash. we're
    # treating that as bad, so 'text' here requires at least one (non-CRLF)
    # character instead of being a literal translation of the grammar.
    #

    $obs_char    = "[\\x00-\\x09\\x0b\\x0c\\x0e-\\x7f]";
    $text        = "(?:$lf*$cr*$obs_char$lf*$cr*)";
    $obs_qp      = "(?:\\x5c[\\x00-\\x7f])";
    $quoted_pair = "(?:\\x5c$text|$obs_qp)";

    ####################################################################################
    #
    # obs-FWS  = 1*WSP *(CRLF 1*WSP)
    # FWS      = ([*WSP CRLF] 1*WSP) /   ; Folding white space
    #            obs-FWS
    # ctext    = NO-WS-CTL /             ; Non white space controls
    #            %d33-39 /               ; The rest of the US-ASCII
    #            %d42-91 /               ; characters not including "(",
    #            %d93-126                ; ")", or "\"
    # ccontent = ctext / quoted-pair / comment
    # comment  = "(" *([FWS] ccontent) [FWS] ")"
    # CFWS     = *([FWS] comment) (([FWS] comment) / FWS)
    #
    # note: we translate ccontent only partially to avoid an infinite loop
    # instead, we'll recursively strip *nested* comments before processing
    # the input. that will leave 'plain old comments' to be matched during
    # the main parse.
    #

    $wsp      = "[\\x20\\x09]";
    $obs_fws  = "(?:$wsp+(?:$crlf$wsp+)*)";
    $fws      = "(?:(?:(?:$wsp*$crlf)?$wsp+)|$obs_fws)";
    $ctext    = "(?:$no_ws_ctl|[\\x21-\\x27\\x2A-\\x5b\\x5d-\\x7e])";
    $ccontent = "(?:$ctext|$quoted_pair)";
    $comment  = "(?:\\x28(?:$fws?$ccontent)*$fws?\\x29)";

    #
    # when comments are not allowed, CFWS degrades to plain folding white
    # space, so any "(...)" in the input makes the main match fail.
    #

    if ($options['allow_comments']){
        $cfws = "(?:(?:$fws?$comment)*(?:$fws?$comment|$fws))";
    }else{
        $cfws = $fws;
    }

    #
    # these are the rules for removing *nested* comments. we'll just detect
    # outer comment and replace it with an empty comment, and recurse until
    # we stop.
    #

    $outer_ccontent_dull = "(?:$fws?$ctext|$quoted_pair)";
    $outer_ccontent_nest = "(?:$fws?$comment)";
    $outer_comment       = "(?:\\x28$outer_ccontent_dull*(?:$outer_ccontent_nest$outer_ccontent_dull*)+$fws?\\x29)";

    ####################################################################################
    #
    # atext = ALPHA / DIGIT /   ; Any character except controls,
    #         "!" / "#" /       ; SP, and specials.
    #         "$" / "%" /       ; Used for atoms
    #         "&" / "'" /
    #         "*" / "+" /
    #         "-" / "/" /
    #         "=" / "?" /
    #         "^" / "_" /
    #         "`" / "{" /
    #         "|" / "}" /
    #         "~"
    # atom  = [CFWS] 1*atext [CFWS]

    $atext = "(?:$alpha|$digit|[\\x21\\x23-\\x27\\x2a\\x2b\\x2d\\x2f\\x3d\\x3f\\x5e\\x5f\\x60\\x7b-\\x7e])";
    $atom  = "(?:$cfws?(?:$atext)+$cfws?)";

    ####################################################################################
    #
    # qtext         = NO-WS-CTL /   ; Non white space controls
    #                 %d33 /        ; The rest of the US-ASCII
    #                 %d35-91 /     ; characters not including "\"
    #                 %d93-126      ; or the quote character
    # qcontent      = qtext / quoted-pair
    # quoted-string = [CFWS]
    #                 DQUOTE *([FWS] qcontent) [FWS] DQUOTE
    #                 [CFWS]
    # word          = atom / quoted-string
    #
    # the grammar's '*' is a '+' here, to require that quoted strings are
    # not empty
    #

    $qtext         = "(?:$no_ws_ctl|[\\x21\\x23-\\x5b\\x5d-\\x7e])";
    $qcontent      = "(?:$qtext|$quoted_pair)";
    $quoted_string = "(?:$cfws?\\x22(?:$fws?$qcontent)+$fws?\\x22$cfws?)";
    $word          = "(?:$atom|$quoted_string)";

    ####################################################################################
    #
    # obs-local-part = word *("." word)
    # obs-domain     = atom *("." atom)

    $obs_local_part = "(?:$word(?:\\x2e$word)*)";
    $obs_domain     = "(?:$atom(?:\\x2e$atom)*)";

    ####################################################################################
    #
    # dot-atom-text = 1*atext *("." 1*atext)
    # dot-atom      = [CFWS] dot-atom-text [CFWS]

    $dot_atom_text = "(?:$atext+(?:\\x2e$atext+)*)";
    $dot_atom      = "(?:$cfws?$dot_atom_text$cfws?)";

    ####################################################################################
    #
    # domain-literal = [CFWS] "[" *([FWS] dcontent) [FWS] "]" [CFWS]
    # dcontent       = dtext / quoted-pair
    # dtext          = NO-WS-CTL /   ; Non white space controls
    #
    #                  %d33-90 /     ; The rest of the US-ASCII
    #                  %d94-126      ; characters not including "[",
    #                                ; "]", or "\"

    $dtext          = "(?:$no_ws_ctl|[\\x21-\\x5a\\x5e-\\x7e])";
    $dcontent       = "(?:$dtext|$quoted_pair)";
    $domain_literal = "(?:$cfws?\\x5b(?:$fws?$dcontent)*$fws?\\x5d$cfws?)";

    ####################################################################################
    #
    # local-part = dot-atom / quoted-string / obs-local-part
    # domain     = dot-atom / domain-literal / obs-domain
    # addr-spec  = local-part "@" domain
    #
    # the capture groups here are what populate $bits further down:
    # 1-4 for the local part and its alternatives, 5-8 for the domain.
    #

    $local_part = "(($dot_atom)|($quoted_string)|($obs_local_part))";
    $domain     = "(($dot_atom)|($domain_literal)|($obs_domain))";
    $addr_spec  = "$local_part\\x40$domain";

    #
    # this was previously 256 based on RFC3696, but dominic's errata was accepted.
    #

    if (strlen($email) > 254) return 0;

    #
    # we need to strip nested comments first - we replace them with a simple comment
    #

    if ($options['allow_comments']){
        $email = email_strip_comments($outer_comment, $email, "(x)");
    }

    #
    # now match what's left. the D modifier makes '$' match only at the very
    # end of the subject; without it, an address followed by a single
    # trailing newline would be accepted.
    #

    if (!preg_match("!^$addr_spec\$!D", $email, $m)){
        return 0;
    }

    # groups that did not take part in the match may be missing from $m
    $bits = array(
        'local'          => isset($m[1]) ? $m[1] : '',
        'local-atom'     => isset($m[2]) ? $m[2] : '',
        'local-quoted'   => isset($m[3]) ? $m[3] : '',
        'local-obs'      => isset($m[4]) ? $m[4] : '',
        'domain'         => isset($m[5]) ? $m[5] : '',
        'domain-atom'    => isset($m[6]) ? $m[6] : '',
        'domain-literal' => isset($m[7]) ? $m[7] : '',
        'domain-obs'     => isset($m[8]) ? $m[8] : '',
    );

    #
    # we need to now strip comments from $bits[local] and $bits[domain],
    # since we know they're in the right place and we want them out of the
    # way for checking IPs, label sizes, etc
    #

    if ($options['allow_comments']){
        $bits['local']  = email_strip_comments($comment, $bits['local']);
        $bits['domain'] = email_strip_comments($comment, $bits['domain']);
    }

    #
    # length limits on segments
    #

    if (strlen($bits['local']) > 64) return 0;
    if (strlen($bits['domain']) > 255) return 0;

    if (strlen($bits['domain-literal'])){

        #
        # restrictions on domain-literals from RFC2821 section 4.1.3
        #
        # RFC4291 changed the meaning of :: in IPv6 addresses - it can mean one or
        # more zero groups (updated from 2 or more).
        #

        $Snum                 = "(\\d{1,3})";
        $IPv4_address_literal = "$Snum\\.$Snum\\.$Snum\\.$Snum";

        $IPv6_hex         = "(?:[0-9a-fA-F]{1,4})";
        $IPv6_full        = "IPv6\\:$IPv6_hex(?:\\:$IPv6_hex){7}";
        $IPv6_comp_part   = "(?:$IPv6_hex(?:\\:$IPv6_hex){0,7})?";
        $IPv6_comp        = "IPv6\\:($IPv6_comp_part\\:\\:$IPv6_comp_part)";
        $IPv6v4_full      = "IPv6\\:$IPv6_hex(?:\\:$IPv6_hex){5}\\:$IPv4_address_literal";
        $IPv6v4_comp_part = "$IPv6_hex(?:\\:$IPv6_hex){0,5}";
        $IPv6v4_comp      = "IPv6\\:((?:$IPv6v4_comp_part)?\\:\\:(?:$IPv6v4_comp_part\\:)?)$IPv4_address_literal";

        $literal = $bits['domain'];

        if (preg_match("!^\\[$IPv4_address_literal\\]\$!D", $literal, $ip)){

            #
            # IPv4 is simple - each of the four octets must fit in a byte
            #

            for ($i = 1; $i <= 4; $i++){
                if (intval($ip[$i]) > 255) return 0;
            }

        }elseif (preg_match("!^\\[$IPv6_full\\]\$!D", $literal)){

            # all eight groups spelled out - nothing more to check

        }elseif (preg_match("!^\\[$IPv6_comp\\]\$!D", $literal, $ip)){

            #
            # '::' stands in for at least one group, so at most 7 explicit
            # groups may surround it
            #

            list($a, $b) = explode('::', $ip[1]);
            $folded = (strlen($a) && strlen($b)) ? "$a:$b" : "$a$b";
            $groups = explode(':', $folded);
            if (count($groups) > 7) return 0;

        }elseif (preg_match("!^\\[$IPv6v4_full\\]\$!D", $literal, $ip)){

            # six hex groups followed by an IPv4 tail in captures 1-4
            for ($i = 1; $i <= 4; $i++){
                if (intval($ip[$i]) > 255) return 0;
            }

        }elseif (preg_match("!^\\[$IPv6v4_comp\\]\$!D", $literal, $ip)){

            #
            # capture 1 is the compressed hex part, captures 2-5 are the
            # octets of the IPv4 tail
            #

            list($a, $b) = explode('::', $ip[1]);
            $b = substr($b, 0, -1); # remove the trailing colon before the IPv4 address
            $folded = (strlen($a) && strlen($b)) ? "$a:$b" : "$a$b";
            $groups = explode(':', $folded);
            if (count($groups) > 5) return 0;

            # the IPv4 tail gets the same octet range check as the other forms
            for ($i = 2; $i <= 5; $i++){
                if (intval($ip[$i]) > 255) return 0;
            }

        }else{

            # not a recognised address-literal form
            return 0;
        }

    }else{

        #
        # the domain is either dot-atom or obs-domain - either way, it's
        # made up of simple labels and we split on dots
        #

        $labels = explode('.', $bits['domain']);

        #
        # this is allowed by both dot-atom and obs-domain, but is un-routeable on the
        # public internet, so we'll fail it (e.g. user@localhost)
        #

        if ($options['public_internet']){
            if (count($labels) == 1) return 0;
        }

        #
        # checks on each label
        #

        foreach ($labels as $label){
            if (strlen($label) > 63) return 0;
            if (substr($label, 0, 1) == '-') return 0;
            if (substr($label, -1) == '-') return 0;
        }

        #
        # last label can't be all numeric
        #

        if ($options['public_internet']){
            if (preg_match('!^[0-9]+$!', array_pop($labels))) return 0;
        }
    }

    return 1;
}
- ##################################################################################
#
# Repeatedly replaces every match of the regex fragment $comment in $email
# with $replace, until a pass leaves the string unchanged.
#
# $comment - regex fragment (no delimiters); it is wrapped in '!' delimiters,
#            so it must not contain an unescaped '!'
# $email   - the string to process
# $replace - replacement text for each match (default: remove the match)
#
# Returns the processed string. If PCRE reports an error, the last
# successfully processed string is returned instead of null.
#
function email_strip_comments($comment, $email, $replace=''){

    while (1){

        $new = preg_replace("!$comment!", $replace, $email);

        # preg_replace() returns null on failure (e.g. backtrack limit hit)
        if ($new === null) return $email;

        # compare content rather than length, so a replacement of the same
        # length as the match is still recognised as a change
        if ($new === $email) return $email;

        $email = $new;
    }
}
- ##################################################################################
- ?>
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement