00001 <?php
00007 namespace Habari;
/**
 * Whitelist-based filter for untrusted HTML input.
 *
 * The filter strips NUL bytes, drops character entities that are not known
 * to be safe, and then tokenizes the markup so that only whitelisted
 * elements, attributes and URI schemes survive. Everything is exposed as
 * static methods; the whitelists can be adjusted by plugins through
 * {@see InputFilter::__static()}.
 */
class InputFilter
{
	/**
	 * Names of the elements that are passed through as markup. Any other
	 * element is converted to a text node.
	 */
	private static $whitelist_elements = array(
		// generic containers
		'div', 'span',
		// anchors
		'a',
		// phrase elements
		'strong', 'em', 'code', 'kbd', 'dfn', 'samp', 'var', 'cite', 'abbr', 'acronym',
		// quotations
		'blockquote', 'q',
		// subscript / superscript
		'sub', 'sup',
		// paragraphs, line breaks, preformatted text
		'p',
		'br',
		'pre',
		// document edits
		'ins', 'del',
		// lists
		'ol', 'ul', 'li',
		'dl', 'dt', 'dd',
		// presentational text styles
		'b', 'i', 'u', 's', 'tt',
		// headings
		'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
		'address',
		'bdo',
		// tables
		'table',
		'caption',
		'thead', 'tfoot', 'tbody',
		'colgroup', 'col',
		'tr',
		'th', 'td',
		// images and image maps
		'img',
		'map', 'area',
		// font size styles ('tt', 'i' and 'b' are already listed above)
		'big', 'small',
		'hr',
		// frames
		'frameset',
		'frame',
		'noframes',
		'iframe',
	);

	/**
	 * Allowed attributes, keyed by element name ('*' applies to every
	 * element). Each attribute maps to a value type understood by
	 * check_attr_value(): either an array of literal values or one of
	 * 'uri', 'language-code', 'text', 'datetime'.
	 */
	private static $whitelist_attributes = array(
		'*' => array(
			'lang' => 'language-code',
			'xml:lang' => 'language-code',
			'dir' => array( 'ltr', 'rtl', ),
			'title' => 'text',
		),
		'a' => array( 'href' => 'uri', ),
		'ins' => array( 'cite' => 'uri', 'datetime' => 'datetime', ),
		'del' => array( 'cite' => 'uri', 'datetime' => 'datetime', ),
		'blockquote' => array( 'cite' => 'uri', ),
		'q' => array( 'cite' => 'uri', ),
		'img' => array( 'src' => 'uri', 'alt' => 'text' ),
	);

	/**
	 * Elements that never have a closing tag.
	 */
	private static $elements_empty = array(
		'img',
	);

	/**
	 * URI schemes accepted in attributes of type 'uri'.
	 */
	private static $whitelist_protocols = array(
		'http', 'https', 'ftp', 'mailto', 'irc', 'news', 'nntp', 'callto', 'rtsp', 'mms', 'svn',
	);

	/**
	 * Names of the character entities that are allowed through.
	 */
	private static $character_entities = array(
		'nbsp', 'iexcl', 'cent', 'pound', 'curren', 'yen', 'brvbar', 'sect', 'uml',
		'copy', 'ordf', 'laquo', 'not', 'shy', 'reg', 'macr', 'deg', 'plusmn',
		'sup2', 'sup3', 'acute', 'micro', 'para', 'middot', 'cedil', 'sup1', 'ordm',
		'raquo', 'frac14', 'frac12', 'frac34', 'iquest', 'Agrave', 'Aacute', 'Acirc',
		'Atilde', 'Auml', 'Aring', 'AElig', 'Ccedil', 'Egrave', 'Eacute', 'Ecirc',
		'Euml', 'Igrave', 'Iacute', 'Icirc', 'Iuml', 'ETH', 'Ntilde', 'Ograve',
		'Oacute', 'Ocirc', 'Otilde', 'Ouml', 'times', 'Oslash', 'Ugrave', 'Uacute',
		'Ucirc', 'Uuml', 'Yacute', 'THORN', 'szlig', 'agrave', 'aacute', 'acirc',
		'atilde', 'auml', 'aring', 'aelig', 'ccedil', 'egrave', 'eacute', 'ecirc',
		'euml', 'igrave', 'iacute', 'icirc', 'iuml', 'eth', 'ntilde', 'ograve',
		'oacute', 'ocirc', 'otilde', 'ouml', 'divide', 'oslash', 'ugrave', 'uacute',
		'ucirc', 'uuml', 'yacute', 'thorn', 'yuml', 'fnof', 'Alpha', 'Beta', 'Gamma',
		'Delta', 'Epsilon', 'Zeta', 'Eta', 'Theta', 'Iota', 'Kappa', 'Lambda', 'Mu',
		'Nu', 'Xi', 'Omicron', 'Pi', 'Rho', 'Sigma', 'Tau', 'Upsilon', 'Phi', 'Chi',
		'Psi', 'Omega', 'alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'eta',
		'theta', 'iota', 'kappa', 'lambda', 'mu', 'nu', 'xi', 'omicron', 'pi', 'rho',
		'sigmaf', 'sigma', 'tau', 'upsilon', 'phi', 'chi', 'psi', 'omega',
		'thetasym', 'upsih', 'piv', 'bull', 'hellip', 'prime', 'Prime', 'oline',
		'frasl', 'weierp', 'image', 'real', 'trade', 'alefsym', 'larr', 'uarr',
		'rarr', 'darr', 'harr', 'crarr', 'lArr', 'uArr', 'rArr', 'dArr', 'hArr',
		'forall', 'part', 'exist', 'empty', 'nabla', 'isin', 'notin', 'ni', 'prod',
		'sum', 'minus', 'lowast', 'radic', 'prop', 'infin', 'ang', 'and', 'or',
		'cap', 'cup', 'int', 'there4', 'sim', 'cong', 'asymp', 'ne', 'equiv', 'le',
		'ge', 'sub', 'sup', 'nsub', 'sube', 'supe', 'oplus', 'otimes', 'perp',
		'sdot', 'lceil', 'rceil', 'lfloor', 'rfloor', 'lang', 'rang', 'loz',
		'spades', 'clubs', 'hearts', 'diams', 'quot', 'amp', 'lt', 'gt', 'OElig',
		'oelig', 'Scaron', 'scaron', 'Yuml', 'circ', 'tilde', 'ensp', 'emsp',
		'thinsp', 'zwnj', 'zwj', 'lrm', 'rlm', 'ndash', 'mdash', 'lsquo', 'rsquo',
		'sbquo', 'ldquo', 'rdquo', 'bdquo', 'dagger', 'Dagger', 'permil', 'lsaquo',
		'rsaquo', 'euro',
	);

	/**
	 * Regular expression matching an allowed entity name, with the name in
	 * capture group 1. Built lazily from $character_entities unless a plugin
	 * supplies its own expression.
	 */
	private static $character_entities_re = '';

	/**
	 * Gives plugins a chance to alter every whitelist of this class.
	 */
	public static function __static ( ) {
		self::$whitelist_elements = Plugins::filter( 'inputfilter_whitelist_elements', self::$whitelist_elements );
		self::$whitelist_attributes = Plugins::filter( 'inputfilter_whitelist_attributes', self::$whitelist_attributes );
		self::$elements_empty = Plugins::filter( 'inputfilter_elements_empty', self::$elements_empty );
		self::$whitelist_protocols = Plugins::filter( 'inputfilter_whitelist_protocols', self::$whitelist_protocols );
		self::$character_entities = Plugins::filter( 'inputfilter_character_entities', self::$character_entities );
		self::$character_entities_re = Plugins::filter( 'inputfilter_character_entities_re', self::$character_entities_re );
	}

	/**
	 * Filters a string of untrusted markup.
	 *
	 * The individual filters are applied repeatedly until the string stops
	 * changing, so that content exposed by one pass is filtered by the next.
	 *
	 * @param string $str The string to filter.
	 * @return string The filtered string, or '' when MultiByte::valid_data() rejects the input.
	 */
	public static function filter( $str )
	{
		if ( !MultiByte::valid_data( $str ) ) {
			return '';
		}

		do {
			$_str = $str;
			$str = self::strip_nulls( $str );
			$str = self::strip_illegal_entities( $str );
			$str = self::filter_html_elements( $str );
			// Strict comparison: loose string comparison treats numeric
			// strings such as '1e3' and '1000' as equal.
		} while ( $str !== $_str );

		return $str;
	}

	/**
	 * Removes NUL bytes from a string.
	 *
	 * @param string $str The string to clean.
	 * @return string The string without NUL bytes.
	 */
	public static function strip_nulls( $str )
	{
		// Double quotes are required: '\0' in single quotes is the two
		// characters backslash and zero, not a NUL byte.
		return str_replace( "\0", '', $str );
	}

	/**
	 * Callback for strip_illegal_entities(): validates and normalises a
	 * single entity candidate.
	 *
	 * Numeric references are accepted when they consist of digits only and
	 * denote a code point in the range 9..65535; they are rewritten in
	 * decimal. Named references are accepted when the whole name is an
	 * allowed entity name.
	 *
	 * @param array $m Match array: [1] is the text between '&' and the terminator, [2] the terminator.
	 * @return string The normalised entity followed by any non-';' terminator, or '' if the entity is not allowed.
	 */
	public static function _validate_entity( $m )
	{
		$e = trim( $m[1] );
		// The ';' is re-added by the normalisation below; any other
		// terminator was consumed by the pattern and has to be put back.
		$r = ( $m[2] === ';' ) ? '' : $m[2];

		if ( $e === '' ) {
			return '';
		}

		if ( $e[0] === '#' ) {
			$e = strtolower( $e );
			if ( preg_match( '/^#x([0-9a-f]+)$/D', $e, $digits ) === 1 ) {
				$code = hexdec( $digits[1] );
			}
			elseif ( preg_match( '/^#([0-9]+)$/D', $e, $digits ) === 1 ) {
				// Compared as a float so that absurdly long digit strings
				// cannot overflow an integer conversion.
				$code = (float) $digits[1];
			}
			else {
				return '';
			}

			if ( $code <= 8 || $code > 65535 ) {
				return '';
			}

			return '&#' . (int) $code . ';' . $r;
		}

		if ( self::$character_entities_re == '' ) {
			// Anchored on both ends so that only an exact name matches and
			// the order of the alternatives does not matter ('notin' must
			// not be shortened to 'not').
			$names = array();
			foreach ( self::$character_entities as $name ) {
				$names[] = preg_quote( $name, '/' );
			}
			self::$character_entities_re = '/^(' . implode( '|', $names ) . ')$/D';
		}

		if ( preg_match( self::$character_entities_re, $e, $matches ) !== 1 || !isset( $matches[1] ) ) {
			return '';
		}

		return '&' . $matches[1] . ';' . $r;
	}

	/**
	 * Removes every character entity that _validate_entity() does not accept.
	 *
	 * @param string $str The string to clean.
	 * @return string The string with unknown or malformed entities removed.
	 */
	public static function strip_illegal_entities( $str )
	{
		return preg_replace_callback( "/&([^;<\n\r]+)([;<\n\r])/", array( __CLASS__, '_validate_entity' ), $str );
	}

	/**
	 * Splits a URL into its components.
	 *
	 * The scheme is taken from an entity-decoded, sanitized copy of the URL
	 * so that an encoded scheme separator cannot hide the scheme; all other
	 * components come from the URL as given.
	 *
	 * NOTE(review): every scheme other than http/https is classed as
	 * "pseudo", including host-based ones such as ftp, whose host is then
	 * not carried into 'pseudo_args' — confirm whether callers rely on this.
	 *
	 * @param string $url The URL to parse.
	 * @return array The keys 'scheme', 'host', 'port', 'user', 'pass', 'path',
	 *   'query', 'fragment', 'is_relative', 'is_pseudo', 'is_error' and
	 *   'pseudo_args'. When 'is_error' is true the remaining values are the defaults.
	 */
	public static function parse_url( $url )
	{
		$r = array(
			'scheme' => '',
			'host' => '',
			'port' => '',
			'user' => '',
			'pass' => '',
			'path' => '',
			'query' => '',
			'fragment' => '',

			'is_relative' => false,
			'is_pseudo' => false,
			'is_error' => false,

			'pseudo_args' => '',
		);

		// ENT_NOQUOTES (0) is what the previously passed null was coerced to.
		$sanitized = html_entity_decode( (string) $url, ENT_NOQUOTES, 'UTF-8' );
		$sanitized = filter_var( $sanitized, FILTER_SANITIZE_URL );

		$sanitized_scheme = parse_url( $sanitized, PHP_URL_SCHEME );
		if ( $sanitized_scheme === false ) {
			// The decoded form is malformed; do not guess what it means.
			$r['is_error'] = true;
			return $r;
		}

		$parsed = parse_url( $url );
		if ( $parsed === false ) {
			$r['is_error'] = true;
			return $r;
		}
		$r = array_merge( $r, $parsed );

		// Schemes are case-insensitive; normalise so that 'HTTP' is
		// recognised. A missing scheme (null) becomes ''.
		$r['scheme'] = strtolower( (string) $sanitized_scheme );

		$r['is_pseudo'] = !in_array( $r['scheme'], array( 'http', 'https', '' ), true );
		$r['is_relative'] = ( $r['host'] == '' && !$r['is_pseudo'] );

		if ( $r['is_pseudo'] ) {
			$r['pseudo_args'] = $r['path'];
			$r['path'] = '';
		}

		return $r;
	}

	/**
	 * Rebuilds a URL from the array produced by parse_url().
	 *
	 * @param array $parsed_url The URL components.
	 * @return string|bool The URL, or false when $parsed_url is not an array.
	 */
	public static function glue_url( $parsed_url )
	{
		if ( ! is_array( $parsed_url ) ) {
			return false;
		}

		$scheme = isset( $parsed_url['scheme'] ) ? (string) $parsed_url['scheme'] : '';
		$is_pseudo = !empty( $parsed_url['is_pseudo'] );
		$is_relative = !empty( $parsed_url['is_relative'] );

		$res = $scheme;
		if ( $is_pseudo || in_array( strtolower( $scheme ), array( 'mailto', 'callto' ), true ) ) {
			$res .= ':';
		}
		elseif ( !$is_relative ) {
			$res .= '://';
		}

		if ( $is_pseudo ) {
			return $res . ( isset( $parsed_url['pseudo_args'] ) ? $parsed_url['pseudo_args'] : '' );
		}

		// The optional parts are tested against '' rather than for
		// truthiness so that a component equal to "0" is not dropped.
		if ( self::has_part( $parsed_url, 'user' ) ) {
			$res .= $parsed_url['user'];
			if ( self::has_part( $parsed_url, 'pass' ) ) {
				$res .= ':' . $parsed_url['pass'];
			}
			$res .= '@';
		}

		$res .= isset( $parsed_url['host'] ) ? $parsed_url['host'] : '';

		if ( !empty( $parsed_url['port'] ) ) {
			// The port is left out when it is the default port of the scheme.
			$is_default_port = array_key_exists( $scheme, Utils::scheme_ports() )
				&& Utils::scheme_ports( $scheme ) == $parsed_url['port'];
			if ( !$is_default_port ) {
				$res .= ':' . $parsed_url['port'];
			}
		}

		$res .= self::has_part( $parsed_url, 'path' ) ? $parsed_url['path'] : '/';

		if ( self::has_part( $parsed_url, 'query' ) ) {
			$res .= '?' . $parsed_url['query'];
		}
		if ( self::has_part( $parsed_url, 'fragment' ) ) {
			$res .= '#' . $parsed_url['fragment'];
		}

		return $res;
	}

	/**
	 * Tells whether a URL component is present and not the empty string.
	 *
	 * @param array $parts The URL components.
	 * @param string $key The component to test.
	 * @return bool
	 */
	private static function has_part( $parts, $key )
	{
		return isset( $parts[$key] ) && (string) $parts[$key] !== '';
	}

	/**
	 * Checks an attribute value against the type declared for it in the
	 * attribute whitelist.
	 *
	 * @param string $k The attribute name (not used by the checks).
	 * @param string $v The attribute value.
	 * @param string|array $type An array of allowed literal values, or one of
	 *   'uri', 'language-code', 'text', 'datetime'.
	 * @return bool Whether the value is acceptable. An unknown type raises an error and yields false.
	 */
	private static function check_attr_value( $k, $v, $type )
	{
		if ( is_array( $type ) ) {
			// enumerated value
			return in_array( $v, $type, true );
		}

		switch ( $type ) {
			case 'uri':
				// relative URLs are fine, absolute ones need an allowed scheme
				$bits = self::parse_url( $v );
				if ( $bits['is_error'] ) {
					return false;
				}
				$protocols = array_map( 'strtolower', self::$whitelist_protocols );
				return $bits['is_relative'] || in_array( strtolower( (string) $bits['scheme'] ), $protocols, true );
			case 'language-code':
				// letter-only subtags of 1 to 8 characters separated by '-';
				// the D modifier stops '$' from accepting a trailing newline
				return preg_match( '/^[a-zA-Z]{1,8}(?:-[a-zA-Z]{1,8})*$/iD', $v ) === 1;
			case 'text':
				return is_string( $v );
			case 'datetime':
				// YYYY-MM-DDThh:mm:ss followed by 'Z' or a +hh:mm / -hh:mm offset
				return preg_match( '/^[0-9]{4}-[0-1][0-9]-[0-3][0-9]T[0-2][0-9]:[0-5][0-9]:[0-5][0-9](?:Z|[\+-][0-2][0-9]:[0-5][0-9])$/D', $v ) === 1;
			default:
				Error::raise( _t( 'Unknown attribute type "%s" in %s', array( $type, __CLASS__ ) ) );
				return false;
		}
	}

	/**
	 * Tells whether an attribute is whitelisted for an element (or for '*')
	 * and carries an acceptable value.
	 *
	 * @param string $element Lower-case element name, or '*'.
	 * @param string $attr Lower-case attribute name.
	 * @param string $value The attribute value.
	 * @return bool
	 */
	private static function attr_allowed( $element, $attr, $value )
	{
		if ( !isset( self::$whitelist_attributes[$element] ) || !is_array( self::$whitelist_attributes[$element] ) ) {
			return false;
		}
		if ( !array_key_exists( $attr, self::$whitelist_attributes[$element] ) ) {
			return false;
		}

		return (bool) self::check_attr_value( $attr, $value, self::$whitelist_attributes[$element][$attr] );
	}

	/**
	 * Converts a tag token into a text token holding the tag's string form.
	 *
	 * @param array $node The tag token.
	 * @return array The replacement text token.
	 */
	private static function node_as_text( $node )
	{
		return array(
			'type' => HTMLTokenizer::NODE_TYPE_TEXT,
			'name' => '#text',
			'value' => HTMLTokenSet::token_to_string( $node ),
			'attrs' => array(),
		);
	}

	/**
	 * Filters the markup in a string down to the whitelisted elements and
	 * attributes.
	 *
	 * Tags of elements that are not whitelisted are turned into text nodes,
	 * attributes that are not whitelisted or carry an unacceptable value are
	 * removed, and processing instructions, comments, CDATA sections and
	 * statements are dropped. Finally, elements without any content are
	 * removed from the result.
	 *
	 * @param string $str The markup to filter.
	 * @return string The filtered markup.
	 */
	public static function filter_html_elements( $str )
	{
		$tokenizer = new HTMLTokenizer( $str );
		$tokens = $tokenizer->parse();

		$filtered = new HTMLTokenSet;
		// names of the non-whitelisted elements that are currently open
		$stack = array();

		foreach ( $tokens as $node ) {
			switch ( $node['type'] ) {
				case HTMLTokenizer::NODE_TYPE_TEXT:
					$node['value'] = html_entity_decode( $node['value'], ENT_QUOTES, MultiByte::hab_encoding() );
					break;

				case HTMLTokenizer::NODE_TYPE_ELEMENT_OPEN:
				case HTMLTokenizer::NODE_TYPE_ELEMENT_EMPTY:
					$name = strtolower( $node['name'] );

					if ( !in_array( $name, self::$whitelist_elements, true ) ) {
						if ( !in_array( $name, self::$elements_empty, true ) ) {
							array_push( $stack, $node['name'] );
						}
						$node = self::node_as_text( $node );
						break;
					}

					// whitelisted element: keep only the attributes allowed
					// globally or for this element
					foreach ( $node['attrs'] as $k => $v ) {
						$attr = strtolower( $k );
						if ( !self::attr_allowed( '*', $attr, $v ) && !self::attr_allowed( $name, $attr, $v ) ) {
							unset( $node['attrs'][$k] );
						}
					}
					break;

				case HTMLTokenizer::NODE_TYPE_ELEMENT_CLOSE:
					$name = strtolower( $node['name'] );

					if ( !in_array( $name, self::$whitelist_elements, true ) ) {
						$temp = array_pop( $stack );
						// Put the name back if it belongs to another element;
						// an empty stack yields null, which must not be pushed.
						if ( $temp !== null && strtolower( $temp ) !== $name ) {
							array_push( $stack, $temp );
						}
						$node = self::node_as_text( $node );
					}
					break;

				case HTMLTokenizer::NODE_TYPE_PI:
				case HTMLTokenizer::NODE_TYPE_COMMENT:
				case HTMLTokenizer::NODE_TYPE_CDATA_SECTION:
				case HTMLTokenizer::NODE_TYPE_STATEMENT:
				default:
					$node = null;
					break;
			}

			if ( $node !== null ) {
				$filtered[] = $node;
			}
		}

		// drop elements that have no content, e.g. <a href="..."></a>
		$html = (string) $filtered;
		$cleaned = preg_replace( '#<([^>\s]+)(?:\s+[^>]+)?></\1>#u', '', $html );

		// preg_replace() yields null on failure (for instance on invalid
		// UTF-8 with the u modifier); keep the filtered markup in that case.
		return ( $cleaned === null ) ? $html : $cleaned;
	}
}
00508
00509 ?>