001    /**
002     * Copyright (c) 2000-2012 Liferay, Inc. All rights reserved.
003     *
004     * This library is free software; you can redistribute it and/or modify it under
005     * the terms of the GNU Lesser General Public License as published by the Free
006     * Software Foundation; either version 2.1 of the License, or (at your option)
007     * any later version.
008     *
009     * This library is distributed in the hope that it will be useful, but WITHOUT
010     * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
011     * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
012     * details.
013     */
014    
015    package com.liferay.portal.util;
016    
017    import com.liferay.portal.kernel.util.CharPool;
018    import com.liferay.portal.kernel.util.Html;
019    import com.liferay.portal.kernel.util.HttpUtil;
020    import com.liferay.portal.kernel.util.StringBundler;
021    import com.liferay.portal.kernel.util.StringPool;
022    import com.liferay.portal.kernel.util.StringUtil;
023    
024    import java.util.regex.Matcher;
025    import java.util.regex.Pattern;
026    
027    import net.htmlparser.jericho.Source;
028    import net.htmlparser.jericho.TextExtractor;
029    
030    /**
031     * @author Brian Wing Shun Chan
032     * @author Clarence Shen
033     * @author Harry Mark
034     * @author Samuel Kong
035     * @author Connor McKay
036     * @author Shuyang Zhou
037     */
038    public class HtmlImpl implements Html {
039    
040            public static final int ESCAPE_MODE_ATTRIBUTE = 1;
041    
042            public static final int ESCAPE_MODE_CSS = 2;
043    
044            public static final int ESCAPE_MODE_JS = 3;
045    
046            public static final int ESCAPE_MODE_TEXT = 4;
047    
048            public static final int ESCAPE_MODE_URL = 5;
049    
050            public String escape(String text) {
051                    if (text == null) {
052                            return null;
053                    }
054    
055                    if (text.length() == 0) {
056                            return StringPool.BLANK;
057                    }
058    
059                    // Escape using XSS recommendations from
060                    // http://www.owasp.org/index.php/Cross_Site_Scripting
061                    // #How_to_Protect_Yourself
062    
063                    StringBundler sb = null;
064    
065                    int lastReplacementIndex = 0;
066    
067                    for (int i = 0; i < text.length(); i++) {
068                            char c = text.charAt(i);
069    
070                            String replacement = null;
071    
072                            switch (c) {
073                                    case '<':
074                                            replacement = "&lt;";
075    
076                                            break;
077    
078                                    case '>':
079                                            replacement = "&gt;";
080    
081                                            break;
082    
083                                    case '&':
084                                            replacement = "&amp;";
085    
086                                            break;
087    
088                                    case '"':
089                                            replacement = "&#034;";
090    
091                                            break;
092    
093                                    case '\'':
094                                            replacement = "&#039;";
095    
096                                            break;
097    
098                                    case '\u00bb': // '�'
099                                            replacement = "&#187;";
100    
101                                            break;
102    
103                                    case '\u2013':
104                                            replacement = "&#x2013;";
105    
106                                            break;
107    
108                                    case '\u2014':
109                                            replacement = "&#x2014;";
110    
111                                            break;
112                            }
113    
114                            if (replacement != null) {
115                                    if (sb == null) {
116                                            sb = new StringBundler();
117                                    }
118    
119                                    if (i > lastReplacementIndex) {
120                                            sb.append(text.substring(lastReplacementIndex, i));
121                                    }
122    
123                                    sb.append(replacement);
124    
125                                    lastReplacementIndex = i + 1;
126                            }
127                    }
128    
129                    if (sb == null) {
130                            return text;
131                    }
132                    else {
133                            if (lastReplacementIndex < text.length()) {
134                                    sb.append(text.substring(lastReplacementIndex));
135                            }
136    
137                            return sb.toString();
138                    }
139            }
140    
141            public String escape(String text, int type) {
142                    if (text == null) {
143                            return null;
144                    }
145    
146                    if (text.length() == 0) {
147                            return StringPool.BLANK;
148                    }
149    
150                    String prefix = StringPool.BLANK;
151                    String postfix = StringPool.BLANK;
152    
153                    if (type == ESCAPE_MODE_ATTRIBUTE) {
154                            prefix = "&#x";
155                            postfix = StringPool.SEMICOLON;
156                    }
157                    else if (type == ESCAPE_MODE_CSS) {
158                            prefix = StringPool.BACK_SLASH;
159                    }
160                    else if (type == ESCAPE_MODE_JS) {
161                            prefix = "\\x";
162                    }
163                    else if (type == ESCAPE_MODE_URL) {
164                            return HttpUtil.encodeURL(text, true);
165                    }
166                    else {
167                            return escape(text);
168                    }
169    
170                    StringBuilder sb = new StringBuilder();
171    
172                    for (int i = 0; i < text.length(); i++) {
173                            char c = text.charAt(i);
174    
175                            if ((Character.isLetterOrDigit(c)) ||
176                                    (c == CharPool.DASH) || (c == CharPool.UNDERLINE)) {
177    
178                                    sb.append(c);
179                            }
180                            else {
181                                    sb.append(prefix);
182    
183                                    String hexString = StringUtil.toHexString(c);
184    
185                                    if (hexString.length() == 1) {
186                                            sb.append(StringPool.ASCII_TABLE[48]);
187                                    }
188    
189                                    sb.append(hexString);
190                                    sb.append(postfix);
191                            }
192                    }
193    
194                    if (sb.length() == text.length()) {
195                            return text;
196                    }
197                    else {
198                            return sb.toString();
199                    }
200            }
201    
202            public String escapeAttribute(String attribute) {
203                    return escape(attribute, ESCAPE_MODE_ATTRIBUTE);
204            }
205    
206            public String escapeCSS(String css) {
207                    return escape(css, ESCAPE_MODE_CSS);
208            }
209    
210            public String escapeHREF(String href) {
211                    if (href == null) {
212                            return null;
213                    }
214    
215                    if (href.length() == 0) {
216                            return StringPool.BLANK;
217                    }
218    
219                    if (href.indexOf(StringPool.COLON) == 10) {
220                            String protocol = href.substring(0, 10).toLowerCase();
221    
222                            if (protocol.equals("javascript")) {
223                                    return StringUtil.replaceFirst(href, StringPool.COLON, "%3a");
224                            }
225                    }
226    
227                    return href;
228            }
229    
230            public String escapeJS(String js) {
231                    return escape(js, ESCAPE_MODE_JS);
232            }
233    
234            public String escapeURL(String url) {
235                    return escape(url, ESCAPE_MODE_URL);
236            }
237    
238            public String extractText(String html) {
239                    if (html == null) {
240                            return null;
241                    }
242    
243                    Source source = new Source(html);
244    
245                    TextExtractor textExtractor = source.getTextExtractor();
246    
247                    return textExtractor.toString();
248            }
249    
250            public String fromInputSafe(String text) {
251                    return StringUtil.replace(text, "&amp;", "&");
252            }
253    
254            public String replaceMsWordCharacters(String text) {
255                    return StringUtil.replace(text, _MS_WORD_UNICODE, _MS_WORD_HTML);
256            }
257    
258            public String stripBetween(String text, String tag) {
259                    return StringUtil.stripBetween(text, "<" + tag, "</" + tag + ">");
260            }
261    
262            public String stripComments(String text) {
263                    return StringUtil.stripBetween(text, "<!--", "-->");
264            }
265    
266            public String stripHtml(String text) {
267                    if (text == null) {
268                            return null;
269                    }
270    
271                    text = stripComments(text);
272    
273                    StringBuilder sb = new StringBuilder(text.length());
274    
275                    int x = 0;
276                    int y = text.indexOf("<");
277    
278                    while (y != -1) {
279                            sb.append(text.substring(x, y));
280                            sb.append(StringPool.SPACE);
281    
282                            // Look for text enclosed by <script></script>
283    
284                            boolean scriptFound = isScriptTag(text, y + 1);
285    
286                            if (scriptFound) {
287                                    int pos = y + _TAG_SCRIPT.length;
288    
289                                    // Find end of the tag
290    
291                                    pos = text.indexOf(">", pos);
292    
293                                    if (pos >= 0) {
294    
295                                            // Check if preceding character is / (i.e. is this instance
296                                            // of <script/>)
297    
298                                            if (text.charAt(pos-1) != '/') {
299    
300                                                    // Search for the ending </script> tag
301    
302                                                    for (;;) {
303                                                            pos = text.indexOf("</", pos);
304    
305                                                            if (pos >= 0) {
306                                                                    if (isScriptTag(text, pos + 2)) {
307                                                                            y = pos;
308    
309                                                                            break;
310                                                                    }
311                                                                    else {
312    
313                                                                            // Skip past "</"
314    
315                                                                            pos += 2;
316                                                                    }
317                                                            }
318                                                            else {
319                                                                    break;
320                                                            }
321                                                    }
322                                            }
323                                    }
324                            }
325    
326                            x = text.indexOf(">", y);
327    
328                            if (x == -1) {
329                                    break;
330                            }
331    
332                            x++;
333    
334                            if (x < y) {
335    
336                                    // <b>Hello</b
337    
338                                    break;
339                            }
340    
341                            y = text.indexOf("<", x);
342                    }
343    
344                    if (y == -1) {
345                            sb.append(text.substring(x, text.length()));
346                    }
347    
348                    return sb.toString();
349            }
350    
351            public String toInputSafe(String text) {
352                    return StringUtil.replace(
353                            text,
354                            new String[] {"&", "\""},
355                            new String[] {"&amp;", "&quot;"});
356            }
357    
358            public String unescape(String text) {
359                    if (text == null) {
360                            return null;
361                    }
362    
363                    if (text.length() == 0) {
364                            return StringPool.BLANK;
365                    }
366    
367                    // Optimize this
368    
369                    text = StringUtil.replace(text, "&lt;", "<");
370                    text = StringUtil.replace(text, "&gt;", ">");
371                    text = StringUtil.replace(text, "&amp;", "&");
372                    text = StringUtil.replace(text, "&#034;", "\"");
373                    text = StringUtil.replace(text, "&#039;", "'");
374                    text = StringUtil.replace(text, "&#040;", "(");
375                    text = StringUtil.replace(text, "&#041;", ")");
376                    text = StringUtil.replace(text, "&#044;", ",");
377                    text = StringUtil.replace(text, "&#035;", "#");
378                    text = StringUtil.replace(text, "&#037;", "%");
379                    text = StringUtil.replace(text, "&#059;", ";");
380                    text = StringUtil.replace(text, "&#061;", "=");
381                    text = StringUtil.replace(text, "&#043;", "+");
382                    text = StringUtil.replace(text, "&#045;", "-");
383    
384                    return text;
385            }
386    
387            public String unescapeCDATA(String text) {
388                    if (text == null) {
389                            return null;
390                    }
391    
392                    if (text.length() == 0) {
393                            return StringPool.BLANK;
394                    }
395    
396                    text = StringUtil.replace(text, "&lt;![CDATA[", "<![CDATA[");
397                    text = StringUtil.replace(text, "]]&gt;", "]]>");
398    
399                    return text;
400            }
401    
402            public String wordBreak(String text, int columns) {
403                    StringBundler sb = new StringBundler();
404    
405                    int length = 0;
406                    int lastWrite = 0;
407                    int pos = 0;
408    
409                    Pattern pattern = Pattern.compile("([\\s<&]|$)");
410    
411                    Matcher matcher = pattern.matcher(text);
412    
413                    while (matcher.find()) {
414                            if (matcher.start() < pos) {
415                                    continue;
416                            }
417    
418                            while ((length + matcher.start() - pos) >= columns) {
419                                    pos += columns - length;
420    
421                                    sb.append(text.substring(lastWrite, pos));
422                                    sb.append("<wbr/>&shy;");
423    
424                                    length = 0;
425                                    lastWrite = pos;
426                            }
427    
428                            length += matcher.start() - pos;
429    
430                            String group = matcher.group();
431    
432                            if (group.equals(StringPool.AMPERSAND)) {
433                                    int x = text.indexOf(StringPool.SEMICOLON, matcher.start());
434    
435                                    if (x != -1) {
436                                            length++;
437                                            pos = x + 1;
438                                    }
439    
440                                    continue;
441                            }
442    
443                            if (group.equals(StringPool.LESS_THAN)) {
444                                    int x = text.indexOf(StringPool.GREATER_THAN, matcher.start());
445    
446                                    if (x != -1) {
447                                            pos = x + 1;
448                                    }
449    
450                                    continue;
451                            }
452    
453                            if (group.equals(StringPool.SPACE) ||
454                                    group.equals(StringPool.NEW_LINE)) {
455    
456                                    length = 0;
457                                    pos = matcher.start() + 1;
458                            }
459                    }
460    
461                    sb.append(text.substring(lastWrite));
462    
463                    return sb.toString();
464            }
465    
466            protected boolean isScriptTag(String text, int pos) {
467                    if (pos + _TAG_SCRIPT.length + 1 <= text.length()) {
468                            char item;
469    
470                            for (int i = 0; i < _TAG_SCRIPT.length; i++) {
471                                    item = text.charAt(pos++);
472    
473                                    if (Character.toLowerCase(item) != _TAG_SCRIPT[i]) {
474                                            return false;
475                                    }
476                            }
477    
478                            item = text.charAt(pos);
479    
480                            // Check that char after "script" is not a letter (i.e. another tag)
481    
482                            return !Character.isLetter(item);
483                    }
484                    else {
485                            return false;
486                    }
487            }
488    
489            private static final String[] _MS_WORD_HTML = new String[] {
490                    "&reg;", StringPool.APOSTROPHE, StringPool.QUOTE, StringPool.QUOTE
491            };
492    
493            private static final String[] _MS_WORD_UNICODE = new String[] {
494                    "\u00ae", "\u2019", "\u201c", "\u201d"
495            };
496    
497            private static final char[] _TAG_SCRIPT = {'s', 'c', 'r', 'i', 'p', 't'};
498    
499    }