• Main Page
  • Related Pages
  • Namespaces
  • Classes
  • Files
  • Examples
  • File List

system/classes/inputfilter.php

00001 <?php
00007 namespace Habari;
/**
 * Whitelist-based filter for untrusted HTML input.
 *
 * The filter repeatedly strips NUL bytes, removes or normalizes character
 * entity references, and re-serializes the markup keeping only whitelisted
 * elements, attributes and URI schemes. All whitelists can be altered by
 * plugins through the filter hooks applied in __static().
 */
class InputFilter
{
  /**
   * Names of the elements that may pass through the filter.
   */
  private static $whitelist_elements = array(
    // http://www.w3.org/TR/html4/struct/global.html#h-7.5.4
    'div', 'span',
    // http://www.w3.org/TR/html4/struct/links.html#h-12.2
    'a',
    // http://www.w3.org/TR/html4/struct/text.html#h-9.2.1
    'strong', 'em', 'code', 'kbd', 'dfn', 'samp', 'var', 'cite', 'abbr', 'acronym',
    // http://www.w3.org/TR/html4/struct/text.html#h-9.2.2
    'blockquote', 'q',
    // http://www.w3.org/TR/html4/struct/text.html#h-9.2.3
    'sub', 'sup',
    // http://www.w3.org/TR/html4/struct/text.html#h-9.3.1
    'p',
    // http://www.w3.org/TR/html4/struct/text.html#h-9.3.2.1
    'br',
    // http://www.w3.org/TR/html4/struct/text.html#h-9.3.4
    'pre',
    // http://www.w3.org/TR/html4/struct/text.html#h-9.4
    'ins', 'del',
    // http://www.w3.org/TR/html4/struct/lists.html#h-10.2
    'ol', 'ul', 'li',
    // http://www.w3.org/TR/html4/struct/lists.html#h-10.3
    'dl', 'dt', 'dd',
    // http://www.w3.org/TR/html4/present/graphics.html#h-15.2.1
    'b', 'i', 'u', 's', 'tt', 'big', 'small',
    // http://www.w3.org/TR/html4/struct/global.html#h-7.5.5
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    // http://www.w3.org/TR/html4/struct/global.html#h-7.5.6
    'address',
    // http://www.w3.org/TR/html4/struct/dirlang.html#h-8.2.4
    'bdo',
    // http://www.w3.org/TR/html4/struct/tables.html#h-11.2.1
    'table',
    // http://www.w3.org/TR/html4/struct/tables.html#h-11.2.2
    'caption',
    // http://www.w3.org/TR/html4/struct/tables.html#h-11.2.3
    'thead', 'tfoot', 'tbody',
    // http://www.w3.org/TR/html4/struct/tables.html#h-11.2.4
    'colgroup', 'col',
    // http://www.w3.org/TR/html4/struct/tables.html#h-11.2.5
    'tr',
    // http://www.w3.org/TR/html4/struct/tables.html#h-11.2.6
    'th', 'td',
    // http://www.w3.org/TR/html4/struct/objects.html#h-13.2
    'img',
    // http://www.w3.org/TR/html4/struct/objects.html#h-13.6.1
    'map', 'area',
    // http://www.w3.org/TR/html4/present/graphics.html#h-15.3
    'hr',
    // http://www.w3.org/TR/html4/present/frames.html#h-16.2.1
    'frameset',
    // http://www.w3.org/TR/html4/present/frames.html#h-16.2.2
    'frame',
    // http://www.w3.org/TR/html4/present/frames.html#h-16.4.1
    'noframes',
    // http://www.w3.org/TR/html4/present/frames.html#h-16.5
    'iframe',
  );

  /**
   * Allowed attributes, keyed by element name ('*' applies to every element).
   * Each attribute maps either to a data type name understood by
   * check_attr_value() or to an array of the exact values that are allowed.
   */
  private static $whitelist_attributes = array(
    // attributes that are valid for ALL elements (a subset of coreattrs)
    // elements that only take coreattrs don't need to be listed separately
    '*' => array(
      'lang' => 'language-code',
      'xml:lang' => 'language-code', // this is our xhtml support... all of it
      'dir' => array( 'ltr', 'rtl', ),
      'title' => 'text',
    ),
    // http://www.w3.org/TR/html4/struct/links.html#h-12.2
    'a' => array( 'href' => 'uri', ),
    // http://www.w3.org/TR/html4/struct/text.html#h-9.4
    'ins' => array( 'cite' => 'uri', 'datetime' => 'datetime', ),
    'del' => array( 'cite' => 'uri', 'datetime' => 'datetime', ),
    // http://www.w3.org/TR/html4/struct/text.html#h-9.2.2
    'blockquote' => array( 'cite' => 'uri', ),
    'q' => array( 'cite' => 'uri', ),
    'img' => array( 'src' => 'uri', 'alt' => 'text' ),
  );

  /**
   * Elements that have no content and no closing tag.
   * Not consulted by the filter itself; kept so the
   * 'inputfilter_elements_empty' hook keeps receiving it.
   */
  private static $elements_empty = array(
    'img',
  );

  /**
   * URI schemes that are accepted in attributes of type 'uri'.
   */
  private static $whitelist_protocols = array(
    'http', 'https', 'ftp', 'mailto', 'irc', 'news', 'nntp', 'callto', 'rtsp', 'mms', 'svn',
  );

  /**
   * Names of the character entities that are accepted (case-sensitive).
   */
  private static $character_entities = array(
    'nbsp', 'iexcl', 'cent', 'pound', 'curren', 'yen', 'brvbar', 'sect', 'uml',
    'copy', 'ordf', 'laquo', 'not', 'shy', 'reg', 'macr', 'deg', 'plusmn',
    'sup2', 'sup3', 'acute', 'micro', 'para', 'middot', 'cedil', 'sup1', 'ordm',
    'raquo', 'frac14', 'frac12', 'frac34', 'iquest', 'Agrave', 'Aacute', 'Acirc',
    'Atilde', 'Auml', 'Aring', 'AElig', 'Ccedil', 'Egrave', 'Eacute', 'Ecirc',
    'Euml', 'Igrave', 'Iacute', 'Icirc', 'Iuml', 'ETH', 'Ntilde', 'Ograve',
    'Oacute', 'Ocirc', 'Otilde', 'Ouml', 'times', 'Oslash', 'Ugrave', 'Uacute',
    'Ucirc', 'Uuml', 'Yacute', 'THORN', 'szlig', 'agrave', 'aacute', 'acirc',
    'atilde', 'auml', 'aring', 'aelig', 'ccedil', 'egrave', 'eacute', 'ecirc',
    'euml', 'igrave', 'iacute', 'icirc', 'iuml', 'eth', 'ntilde', 'ograve',
    'oacute', 'ocirc', 'otilde', 'ouml', 'divide', 'oslash', 'ugrave', 'uacute',
    'ucirc', 'uuml', 'yacute', 'thorn', 'yuml', 'fnof', 'Alpha', 'Beta', 'Gamma',
    'Delta', 'Epsilon', 'Zeta', 'Eta', 'Theta', 'Iota', 'Kappa', 'Lambda', 'Mu',
    'Nu', 'Xi', 'Omicron', 'Pi', 'Rho', 'Sigma', 'Tau', 'Upsilon', 'Phi', 'Chi',
    'Psi', 'Omega', 'alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'eta',
    'theta', 'iota', 'kappa', 'lambda', 'mu', 'nu', 'xi', 'omicron', 'pi', 'rho',
    'sigmaf', 'sigma', 'tau', 'upsilon', 'phi', 'chi', 'psi', 'omega',
    'thetasym', 'upsih', 'piv', 'bull', 'hellip', 'prime', 'Prime', 'oline',
    'frasl', 'weierp', 'image', 'real', 'trade', 'alefsym', 'larr', 'uarr',
    'rarr', 'darr', 'harr', 'crarr', 'lArr', 'uArr', 'rArr', 'dArr', 'hArr',
    'forall', 'part', 'exist', 'empty', 'nabla', 'isin', 'notin', 'ni', 'prod',
    'sum', 'minus', 'lowast', 'radic', 'prop', 'infin', 'ang', 'and', 'or',
    'cap', 'cup', 'int', 'there4', 'sim', 'cong', 'asymp', 'ne', 'equiv', 'le',
    'ge', 'sub', 'sup', 'nsub', 'sube', 'supe', 'oplus', 'otimes', 'perp',
    'sdot', 'lceil', 'rceil', 'lfloor', 'rfloor', 'lang', 'rang', 'loz',
    'spades', 'clubs', 'hearts', 'diams', 'quot', 'amp', 'lt', 'gt', 'OElig',
    'oelig', 'Scaron', 'scaron', 'Yuml', 'circ', 'tilde', 'ensp', 'emsp',
    'thinsp', 'zwnj', 'zwj', 'lrm', 'rlm', 'ndash', 'mdash', 'lsquo', 'rsquo',
    'sbquo', 'ldquo', 'rdquo', 'bdquo', 'dagger', 'Dagger', 'permil', 'lsaquo',
    'rsaquo', 'euro',
  );

  /**
   * Regular expression used to recognize a named entity. Left empty, it is
   * built lazily from $character_entities by _validate_entity(); a plugin may
   * supply its own expression, whose first capture group is taken as the
   * normalized entity name.
   */
  private static $character_entities_re = '';

  /**
   * Give plugins the chance to alter every whitelist used by the filter.
   * NOTE(review): presumably invoked once by the class loader -- confirm.
   */
  public static function __static ( ) {
    self::$whitelist_elements = Plugins::filter( 'inputfilter_whitelist_elements', self::$whitelist_elements );
    self::$whitelist_attributes = Plugins::filter( 'inputfilter_whitelist_attributes', self::$whitelist_attributes );
    self::$elements_empty = Plugins::filter( 'inputfilter_elements_empty', self::$elements_empty );
    self::$whitelist_protocols = Plugins::filter( 'inputfilter_whitelist_protocols', self::$whitelist_protocols );
    self::$character_entities = Plugins::filter( 'inputfilter_character_entities', self::$character_entities );
    self::$character_entities_re = Plugins::filter( 'inputfilter_character_entities_re', self::$character_entities_re );
  }

  /**
   * Perform all filtering, return the sanitized string.
   *
   * The individual filters are applied over and over until a pass no longer
   * changes the string, so that one filter cannot expose input that another
   * filter should have caught.
   *
   * @param string $str Input string.
   * @return string Filtered output string; '' if the input is not valid data
   *   according to MultiByte::valid_data().
   */
  public static function filter( $str )
  {
    if ( !MultiByte::valid_data( $str ) ) {
      return '';
    }

    do {
      $previous = $str;
      $str = self::strip_nulls( $str );
      $str = self::strip_illegal_entities( $str );
      $str = self::filter_html_elements( $str );
      // strict comparison: loosely compared, two different numeric-looking
      // strings could be considered equal and end the loop too early
    } while ( $str !== $previous );

    return $str;
  }

  /**
   * Remove NUL bytes from a string.
   *
   * @param string $str Input string.
   * @return string The input with every NUL byte removed.
   */
  public static function strip_nulls( $str )
  {
    // the double quotes are essential: '\0' in single quotes is the two
    // characters backslash and zero, not a NUL byte
    return str_replace( "\0", '', $str );
  }

  /**
   * Callback for strip_illegal_entities(); do not use directly.
   *
   * @param array $m Match: [1] is the text between '&' and the terminator,
   *   [2] is the terminator (';', '<', CR or LF).
   * @return string The normalized entity reference followed by the terminator
   *   (unless that was ';'), or '' if the reference is not acceptable.
   */
  public static function _validate_entity( $m )
  {
    $e = trim( $m[1] );
    // a ';' is re-added by the normalization below, any other terminator
    // is kept after the entity
    $r = ( $m[2] == ';' ) ? '' : $m[2];

    if ( $e === '' ) {
      return '';
    }

    $is_valid = false;

    if ( $e[0] == '#' ) {
      $e = strtolower( $e );
      if ( preg_match( '/^#x([0-9a-f]+)$/D', $e, $digits ) ) {
        $code = hexdec( $digits[1] );
      }
      elseif ( preg_match( '/^#([0-9]+)$/D', $e, $digits ) ) {
        $code = intval( $digits[1] );
      }
      else {
        // not a well-formed numeric character reference
        return '';
      }

      // numeric character references may only have values in the range 0-65535 (16 bit)
      // null and the control characters up to backspace are refused as well
      $is_valid = ( $code > 8 && $code <= 65535 );

      if ( $is_valid ) {
        // normalize to decimal form
        $e = '#' . intval( $code ) . ';';
      }
    }
    else {
      if ( self::$character_entities_re == '' ) {
        $names = array();
        foreach ( self::$character_entities as $name ) {
          $names[] = preg_quote( $name, ';' );
        }
        // anchored, so that only a complete, known name is accepted; an
        // unanchored alternation would turn &notin; into &not; and so on
        self::$character_entities_re = ';^(' . implode( '|', $names ) . ')$;D';
      }

      // named entities must be known
      $is_valid = ( preg_match( self::$character_entities_re, $e, $matches ) === 1 );

      // XXX should we map named entities to their numeric equivalents?

      if ( $is_valid ) {
        // normalize to name and nothing but the name
        $e = ( isset( $matches[1] ) ? $matches[1] : $e ) . ';';
      }
    }

    return $is_valid ? '&' . $e . $r : '';
  }

  /**
   * Remove entity references that are not known or not well-formed and
   * normalize the remaining ones.
   *
   * @param string $str Input string.
   * @return string The input with every '&...' sequence either normalized or removed.
   */
  public static function strip_illegal_entities( $str )
  {
    return preg_replace_callback( "/&([^;<\n\r]+)([;<\n\r])/", array( __CLASS__, '_validate_entity' ), $str );
  }

  /**
   * The URL description that parse_url() returns and glue_url() accepts,
   * with every component empty and every flag off.
   *
   * @return array
   */
  private static function url_parts_template()
  {
    return array(
      'scheme' => '',
      'host' => '',
      'port' => '',
      'user' => '',
      'pass' => '',
      'path' => '',
      'query' => '',
      'fragment' => '',
      //
      'is_relative' => false,
      'is_pseudo' => false,
      'is_error' => false,
      //
      'pseudo_args' => '',
    );
  }

  /**
   * Split a URL into its components, similar to PHP's parse_url().
   *
   * Every component is always present in the result. Additional keys:
   * 'is_error' (the URL could not be parsed), 'is_pseudo' (the scheme is
   * neither http nor https nor absent), 'is_relative' (no host and not
   * pseudo) and 'pseudo_args' (what follows the scheme of a pseudo URL).
   *
   * @param string $url The URL to split.
   * @return array The description of the URL.
   */
  public static function parse_url( $url )
  {
    // result array
    $r = self::url_parts_template();

    // the scheme is taken from a copy of the URL that has all entities
    // decoded and everything but URL-safe ASCII removed, so that it cannot
    // be disguised
    $sanitized = html_entity_decode( $url, ENT_QUOTES, 'UTF-8' );
    $sanitized = filter_var( $sanitized, FILTER_SANITIZE_URL );
    $sanitized_scheme = parse_url( $sanitized, PHP_URL_SCHEME );

    // Use PHP's parse_url to get the basics
    $parsed = parse_url( $url );
    if ( $parsed === false ) {
      $r['is_error'] = true;
      return $r;
    }
    $r = array_merge( $r, $parsed );

    // replace the scheme with the one we got from the fully-sanitized string;
    // schemes are case-insensitive, so compare and report them in lower case
    $r['scheme'] = is_string( $sanitized_scheme ) ? strtolower( $sanitized_scheme ) : '';

    $r['is_pseudo'] = !in_array( $r['scheme'], array( 'http', 'https', '' ), true );
    $r['is_relative'] = ( $r['host'] == '' && !$r['is_pseudo'] );

    if ( $r['is_pseudo'] ) {
      $r['pseudo_args'] = $r['path'];
      $r['path'] = '';
    }

    return $r;
  }

  /**
   * Restore a URL from the description produced by parse_url().
   *
   * @param array $parsed_url The description of the URL; missing keys are
   *   treated as empty.
   * @return string|false The URL, or false if $parsed_url is not an array.
   */
  public static function glue_url( $parsed_url )
  {
    if ( ! is_array( $parsed_url ) ) {
      return false;
    }

    // tolerate incomplete descriptions
    $parsed_url = array_merge( self::url_parts_template(), $parsed_url );
    $scheme = (string) $parsed_url['scheme'];

    $res = $scheme;
    if ( $parsed_url['is_pseudo'] || in_array( strtolower( $scheme ), array( 'mailto', 'callto' ), true ) ) {
      $res .= ':';
    }
    elseif ( ! $parsed_url['is_relative'] ) {
      $res .= '://';
    }

    if ( $parsed_url['is_pseudo'] ) {
      return $res . $parsed_url['pseudo_args'];
    }

    // user[:pass]@
    if ( $parsed_url['user'] ) {
      $res .= $parsed_url['user'];
      if ( $parsed_url['pass'] ) {
        $res .= ':' . $parsed_url['pass'];
      }
      $res .= '@';
    }
    $res .= $parsed_url['host'];
    if ( !empty( $parsed_url['port'] ) ) {
      $is_default_port = array_key_exists( $scheme, Utils::scheme_ports() ) && Utils::scheme_ports( $scheme ) == $parsed_url['port'];
      // the default port of a scheme is left out
      if ( ! $is_default_port ) {
        $res .= ':' . $parsed_url['port'];
      }
    }
    $res .= !empty( $parsed_url['path'] ) ? $parsed_url['path'] : '/';
    if ( $parsed_url['query'] ) {
      $res .= '?' . $parsed_url['query'];
    }
    if ( $parsed_url['fragment'] ) {
      $res .= '#' . $parsed_url['fragment'];
    }

    return $res;
  }

  /**
   * Check whether an attribute value is acceptable for its declared type.
   *
   * @param string $k The (lower-cased) attribute name; not used by the checks.
   * @param mixed $v The attribute value.
   * @param string|array $type One of 'uri', 'language-code', 'text',
   *   'datetime', or an array of the exact values that are allowed.
   * @return boolean true if the value is acceptable. An unknown type is
   *   reported through Error::raise() and refused.
   */
  private static function check_attr_value( $k, $v, $type )
  {
    if ( is_array( $type ) ) {
      // array of allowed values, exact matches only
      return in_array( $v, $type, true );
    }

    // data type
    switch ( $type ) {
      case 'uri':
        // RfC 2396 <http://www.ietf.org/rfc/rfc2396.txt>
        $bits = self::parse_url( $v );
        return $bits['is_relative'] || in_array( $bits['scheme'], self::$whitelist_protocols, true );
      case 'language-code':
        // RfC 1766 <http://www.ietf.org/rfc/rfc1766.txt>
        //    Language-Tag = Primary-tag *( "-" Subtag )
        //    Primary-tag = 1*8ALPHA
        //    Subtag = 1*8ALPHA
        // D: '$' must not match in front of a trailing newline
        return is_string( $v ) && preg_match( '/^[a-zA-Z]{1,8}(?:-[a-zA-Z]{1,8})*$/D', $v ) === 1;
      case 'text':
        // XXX is this sufficient?
        return is_string( $v );
      case 'datetime':
        // <http://www.w3.org/TR/1998/NOTE-datetime-19980827>
        // <http://www.w3.org/TR/html4/types.html#h-6.11>
        //    YYYY-MM-DDThh:mm:ssTZD
        return is_string( $v ) && preg_match( '/^[0-9]{4}-[0-1][0-9]-[0-3][0-9]T[0-2][0-9]:[0-5][0-9]:[0-5][0-9](?:Z|[\+-][0-2][0-9]:[0-5][0-9])$/D', $v ) === 1;
      default:
        Error::raise( _t( 'Unknown attribute type "%s" in %s', array( $type, __CLASS__ ) ) );
        return false;
    }
  }

  /**
   * Check whether an attribute with the given value may stay on an element:
   * it has to be listed for all elements ('*') or for this element, and its
   * value has to pass check_attr_value() for the listed type.
   *
   * @param string $element The lower-cased element name.
   * @param string $attr The lower-cased attribute name.
   * @param mixed $value The attribute value.
   * @return boolean
   */
  private static function is_attr_allowed( $element, $attr, $value )
  {
    foreach ( array( '*', $element ) as $scope ) {
      if ( isset( self::$whitelist_attributes[$scope][$attr] ) && self::check_attr_value( $attr, $value, self::$whitelist_attributes[$scope][$attr] ) ) {
        return true;
      }
    }

    return false;
  }

  /**
   * Turn an element token into a text token that holds the element's
   * markup as produced by HTMLTokenSet::token_to_string().
   *
   * @param array $node The element token.
   * @return array The text token.
   */
  private static function node_to_text( $node )
  {
    return array(
      'type' => HTMLTokenizer::NODE_TYPE_TEXT,
      'name' => '#text',
      'value' => HTMLTokenSet::token_to_string( $node ),
      'attrs' => array(),
    );
  }

  /**
   * Filter the markup in a string against the element and attribute whitelists.
   *
   * Text is kept with its entities decoded, elements that are not
   * whitelisted are turned into text, attributes that are not whitelisted or
   * fail their value check are dropped, and processing instructions,
   * comments, CDATA sections and statements are removed. Elements that end
   * up without content are removed from the result.
   *
   * @param string $str The string to filter.
   * @return string The filtered markup.
   */
  public static function filter_html_elements( $str )
  {
    $tokenizer = new HTMLTokenizer( $str );
    $tokens = $tokenizer->parse();

    // filter token stream
    $filtered = new HTMLTokenSet;
    foreach ( $tokens as $node ) {
      switch ( $node['type'] ) {
        case HTMLTokenizer::NODE_TYPE_TEXT:
          $node['value'] = html_entity_decode( $node['value'], ENT_QUOTES, MultiByte::hab_encoding() );
          break;
        case HTMLTokenizer::NODE_TYPE_ELEMENT_OPEN:
        case HTMLTokenizer::NODE_TYPE_ELEMENT_EMPTY:
          $element = strtolower( $node['name'] );
          if ( ! in_array( $element, self::$whitelist_elements, true ) ) {
            // the element is not allowed at all
            $node = self::node_to_text( $node );
          }
          else {
            // remove every attribute that is not whitelisted or fails its check
            foreach ( $node['attrs'] as $k => $v ) {
              if ( ! self::is_attr_allowed( $element, strtolower( $k ), $v ) ) {
                unset( $node['attrs'][$k] );
              }
            }
          }
          break;
        case HTMLTokenizer::NODE_TYPE_ELEMENT_CLOSE:
          if ( ! in_array( strtolower( $node['name'] ), self::$whitelist_elements, true ) ) {
            $node = self::node_to_text( $node );
          }
          break;
        case HTMLTokenizer::NODE_TYPE_PI:
        case HTMLTokenizer::NODE_TYPE_COMMENT:
        case HTMLTokenizer::NODE_TYPE_CDATA_SECTION:
        case HTMLTokenizer::NODE_TYPE_STATEMENT:
        default:
          $node = null;
          break;
      }

      if ( $node != null ) {
        $filtered[] = $node;
      }
    }

    // rebuild our output string, without the elements that have no content
    $html = (string) $filtered;
    $result = preg_replace( '#<([^>\s]+)(?:\s+[^>]+)?></\1>#u', '', $html );

    // preg_replace() gives null on a PCRE error (e.g. invalid UTF-8)
    return ( $result === null ) ? $html : $result;
  }
}
00508 
00509 ?>

Generated on Sun Aug 4 2013 12:51:43 for Habari by  doxygen 1.7.1