[BUGFIX] Use callback in preg_replace in RemoveXSS
[Packages/TYPO3.CMS.git] / typo3 / contrib / RemoveXSS / RemoveXSS.php
1 <?php
2 /**
3 * Usage: Run *every* variable passed in through it.
4 * The goal of this function is to be a generic function that can be used to
5 * parse almost any input and render it XSS safe. For more information on
6 * actual XSS attacks, check out http://ha.ckers.org/xss.html. Another
7 * excellent site is the XSS Database which details each attack and how it
8 * works.
9 *
10 * Used with permission by the author.
11 * URL: http://quickwired.com/smallprojects/php_xss_filter_function.php
12 *
13 * Check XSS attacks on http://ha.ckers.org/xss.html
14 *
15 * License:
16 * This code is public domain, you are free to do whatever you want with it,
17 * including adding it to your own project which can be under any license.
18 *
19 * @author Travis Puderbaugh <kallahar@quickwired.com>
20 * @author Jigal van Hemert <jigal@xs4all.nl>
21 * @package RemoveXSS
22 */
class RemoveXSS {

    /**
     * Removes potential XSS code from an input string.
     *
     * Based on an external class by Travis Puderbaugh <kallahar@quickwired.com>
     *
     * The filter works in four steps:
     *  1. strip a range of control characters,
     *  2. decode numeric character references of ordinary ASCII characters,
     *  3. pre-screen the input for a fixed list of dangerous keywords
     *     (protocols, tag names, attribute names),
     *  4. break every keyword that occurs in a dangerous position by inserting
     *     $replaceString after its second character (e.g. "sc<x>ript").
     *
     * @param string $val Input string
     * @param string $replaceString String inserted into matched keywords (which destroys the tags)
     * @return string Input string with potential XSS code removed
     */
    public static function process($val, $replaceString = '<x>') {
        // Don't use an empty $replaceString because then no XSS removal would be done
        if ($replaceString == '') {
            $replaceString = '<x>';
        }

        // Remove control characters 0x00-0x08, 0x0b-0x0c and 0x0e-0x19.
        // TAB (0x09), LF (0x0a) and CR (0x0d) are kept because they are allowed in some inputs;
        // splits with \t, \n and \r are handled further down. Bytes 0x1a-0x1f are not touched
        // by this pattern.
        // This prevents some character re-spacing such as <java\0script>
        $val = preg_replace('/([\x00-\x08]|[\x0b-\x0c]|[\x0e-\x19])/', '', $val);

        // Straight replacements, the user should never need these since they're normal characters.
        // This prevents like <IMG SRC=&#X40&#X61&#X76&#X61&#X73&#X63&#X72&#X69&#X70&#X74&#X3A&#X61&#X6C&#X65&#X72&#X74&#X28&#X27&#X58&#X53&#X53&#X27&#X29>
        $searchHexEncodings = '/&#[xX]0{0,8}(21|22|23|24|25|26|27|28|29|2a|2b|2d|2f|30|31|32|33|34|35|36|37|38|39|3a|3b|3d|3f|40|41|42|43|44|45|46|47|48|49|4a|4b|4c|4d|4e|4f|50|51|52|53|54|55|56|57|58|59|5a|5b|5c|5d|5e|5f|60|61|62|63|64|65|66|67|68|69|6a|6b|6c|6d|6e|6f|70|71|72|73|74|75|76|77|78|79|7a|7b|7c|7d|7e);?/i';
        $searchUnicodeEncodings = '/&#0{0,8}(33|34|35|36|37|38|39|40|41|42|43|45|47|48|49|50|51|52|53|54|55|56|57|58|59|61|63|64|65|66|67|68|69|70|71|72|73|74|75|76|77|78|79|80|81|82|83|84|85|86|87|88|89|90|91|92|93|94|95|96|97|98|99|100|101|102|103|104|105|106|107|108|109|110|111|112|113|114|115|116|117|118|119|120|121|122|123|124|125|126);?/i';
        // Decoding may itself produce a new character reference (e.g. "&#38;#106;"),
        // so repeat until neither pattern matches any more.
        while (preg_match($searchHexEncodings, $val) || preg_match($searchUnicodeEncodings, $val)) {
            $val = preg_replace_callback(
                $searchHexEncodings,
                function ($matches) {
                    return chr(hexdec($matches[1]));
                },
                $val
            );
            $val = preg_replace_callback(
                $searchUnicodeEncodings,
                function ($matches) {
                    return chr((int) $matches[1]);
                },
                $val
            );
        }

        // Now the only remaining whitespace attacks are \t, \n, and \r
        $ra_protocol = array('javascript', 'vbscript', 'expression');
        $ra_tag = array('applet', 'meta', 'xml', 'blink', 'link', 'style', 'script', 'embed', 'object', 'iframe', 'frame',
            'frameset', 'ilayer', 'layer', 'bgsound', 'title', 'base', 'video', 'audio', 'track', 'canvas');
        $ra_attribute = array('style', 'onabort', 'onactivate', 'onafterprint', 'onafterupdate', 'onbeforeactivate',
            'onbeforecopy', 'onbeforecut', 'onbeforedeactivate', 'onbeforeeditfocus', 'onbeforepaste', 'onbeforeprint',
            'onbeforeunload', 'onbeforeupdate', 'onblur', 'onbounce', 'oncanplay', 'oncanplaythrough', 'oncellchange', 'onchange',
            'onclick', 'oncontextmenu', 'oncontrolselect', 'oncopy', 'oncuechange', 'oncut', 'ondataavailable', 'ondatasetchanged',
            'ondatasetcomplete', 'ondblclick', 'ondeactivate', 'ondrag', 'ondragend', 'ondragenter', 'ondragleave', 'ondragover',
            'ondragstart', 'ondrop', 'ondurationchange', 'onemptied', 'onended', 'onerror', 'onerrorupdate', 'onfilterchange',
            'onfinish', 'onfocus', 'onfocusin', 'onfocusout', 'onhashchange', 'onhelp', 'oninput', 'oninvalid', 'onkeydown',
            'onkeypress', 'onkeyup', 'onlayoutcomplete', 'onload', 'onloadeddata', 'onloadedmetadata', 'onloadstart',
            'onlosecapture', 'onmessage', 'onmousedown', 'onmouseenter', 'onmouseleave', 'onmousemove', 'onmouseout',
            'onmouseover', 'onmouseup', 'onmousewheel', 'onmove', 'onmoveend', 'onmovestart', 'onoffline', 'ononline',
            'onpagehide', 'onpageshow', 'onpaste', 'onpause', 'onplay', 'onplaying', 'onpopstate', 'onprogress',
            'onpropertychange', 'onratechange', 'onreadystatechange', 'onredo', 'onreset', 'onresize', 'onresizeend',
            'onresizestart', 'onrowenter', 'onrowexit', 'onrowsdelete', 'onrowsinserted', 'onscroll', 'onseeked', 'onseeking',
            'onselect', 'onselectionchange', 'onselectstart', 'onshow', 'onstalled', 'onstart', 'onstop', 'onstorage', 'onsubmit',
            'onsuspend', 'ontimeupdate', 'onundo', 'onunload', 'onvolumechange', 'onwaiting');

        // The screening list is derived from the typed lists instead of being maintained by hand,
        // so a keyword can never be present in a typed list but missing from the screening.
        // array_unique() keeps the first occurrence, i.e. protocols, then tags, then attributes.
        $keywords = array_unique(array_merge($ra_protocol, $ra_tag, $ra_attribute));

        // Remove the potential &#xxx; whitespace references (TAB, LF, VT, CR) and plain whitespace
        // for testing
        $val2 = preg_replace('/(&#[xX]?0{0,8}(9|10|13|a|b|d);?)*\s*/i', '', $val);

        // Collect one (pattern, replacement) pair per keyword/type combination that might occur
        $rules = array();
        foreach ($keywords as $keyword) {
            // Stripos is faster than the regular expressions used later and because the words we're
            // looking for only have chars < 0x80 we can use the non-multibyte safe version.
            if (stripos($val2, $keyword) === FALSE) {
                continue;
            }
            // Add in the replace string after the second character to nerf the keyword
            $replacement = substr_replace($keyword, $replaceString, 2, 0);
            // Some keywords appear in more than one list.
            // These get multiple rules, each with the appropriate type.
            if (in_array($keyword, $ra_protocol, TRUE)) {
                $rules[] = array(self::buildPattern($keyword, 'ra_protocol'), $replacement);
            }
            if (in_array($keyword, $ra_tag, TRUE)) {
                $rules[] = array(self::buildPattern($keyword, 'ra_tag'), $replacement);
            }
            if (in_array($keyword, $ra_attribute, TRUE)) {
                $rules[] = array(self::buildPattern($keyword, 'ra_attribute'), $replacement);
            }
        }

        // Only process potential words
        if (count($rules) > 0) {
            // Keep replacing as long as the previous complete round replaced something
            do {
                $val_before = $val;
                foreach ($rules as $rule) {
                    $val = preg_replace($rule[0], $rule[1], $val);
                }
            } while ($val !== $val_before);
        }
        return $val;
    }

    /**
     * Builds the case-insensitive regular expression that finds one keyword in a dangerous position.
     *
     * Between any two letters of the keyword an arbitrary run of whitespace is tolerated, either
     * literal or written as a numeric character reference for TAB, LF, VT or CR.
     *
     * @param string $keyword Keyword to look for (plain ASCII letters)
     * @param string $type One of 'ra_protocol', 'ra_tag' or 'ra_attribute'
     * @return string Complete PCRE pattern including delimiters and modifiers
     */
    private static function buildPattern($keyword, $type) {
        // Hex references: 9 = TAB, a = LF, b = VT, d = CR; decimal references: 9, 10, 13
        $gap = '((&#[xX]0{0,8}([9abd]);?)|(&#0{0,8}(9|10|13);?)|\s)*';
        $pattern = implode($gap, str_split($keyword));
        // Handle each type a little different (extra conditions to prevent false positives a bit better)
        switch ($type) {
            case 'ra_protocol':
                // These take the form of e.g. 'javascript:'
                $pattern .= $gap . '(?=:)';
                break;
            case 'ra_tag':
                // These take the form of e.g. '<SCRIPT[^\da-z] ....';
                $pattern = '(?<=<)' . $pattern . $gap . '(?=[^\da-z])';
                break;
            case 'ra_attribute':
                // These take the form of e.g. 'onload=' Beware that a lot of characters are allowed
                // between the attribute and the equal sign!
                $pattern .= '[\s\!\#\$\%\&\(\)\*\~\+\-\_\.\,\:\;\?\@\[\/\|\\\\\]\^\`]*(?==)';
                break;
        }
        return '/' . $pattern . '/i';
    }
}