src/com/google/streamhtmlparser/util/HtmlUtils.java - platform/external/jsilver - Git at Google

 /*
  * Copyright (C) 2010 Google Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package com.google.streamhtmlparser.util;

 import com.google.common.collect.ImmutableSortedSet;

 import java.util.Set;
 import java.util.regex.Pattern;
 import java.util.regex.Matcher;

 /**
  * Utility functions for HTML and Javascript that are most likely
  * not interesting to users outside this package.
  *
  * <p>The <code>HtmlParser</code> will be open-sourced hence we took the
  * decision to keep these utilities in this package as well as not to
  * leverage others that may exist in the <code>google3</code> code base.
  *
  * <p>The functionality exposed is designed to be 100% compatible with
  * the corresponding logic in the C-version of the HtmlParser as such
  * we are particularly concerned with cross-language compatibility.
  *
  * <p>Note: The words {@code Javascript} and {@code ECMAScript} are used
  * interchangeably unless otherwise noted.
  */
 public final class HtmlUtils {

   /**
    * static utility class
    */
   private HtmlUtils() {
   }  // COV_NF_LINE

   /**
    * Indicates the type of content contained in the {@code content} HTML
    * attribute of the {@code meta} HTML tag. Used by
    * {@link HtmlUtils#parseContentAttributeForUrl(String)}.
    * <p>The values are:
    * <ul>
    * <li>{@code NONE} if it does not contain a URL in the expected format.
    * <li>{@code URL_START} if it contains a URL but hasn't seen any of
    * its contents.
    * <li>{@code URL} if it contains a URL and has seen at least some of
    * its contents.
    * </ul>
    */
   public enum META_REDIRECT_TYPE {
     NONE,
     URL_START,
     URL
   }

   /**
    * A regular expression matching the format of a {@code content} attribute
    * that contains a URL. Used by {@link #parseContentAttributeForUrl}.
    */
   private static final String META_REDIRECT_REGEX =
       "^\\s*\\d*\\s*;\\s*URL\\s*=\\s*[\'\"]?";

   // Safe for use by concurrent threads so we compile once.
   private static final Pattern META_REDIRECT_PATTERN =
       Pattern.compile(META_REDIRECT_REGEX, Pattern.CASE_INSENSITIVE);

   /**
    * Set of keywords that can precede a regular expression literal. Taken from:
    * <a href="http://www.mozilla.org/js/language/js20-2000-07/rationale/syntax.html">
    * Language Syntax</a>
    *
    * <p>The token {@code void} was added to the list. Several keywords are
    * defined in Ecmascript 4 not Ecmascript 3. However, to keep the logic
    * simple we do not differentiate on the version and bundle them all together.
    */
   private static final Set<String> REGEXP_TOKEN_PREFIXS =
       ImmutableSortedSet.of(
           "abstract",
           "break",
           "case",
           "catch",
           "class",
           "const",
           "continue",
           "debugger",
           "default",
           "delete",
           "do",
           "else",
           "enum",
           "eval",
           "export",
           "extends",
           "field",
           "final",
           "finally",
           "for",
           "function",
           "goto",
           "if",
           "implements",
           "import",
           "in",
           "instanceof",
           "native",
           "new",
           "package",
           "private",
           "protected",
           "public",
           "return",
           "static",
           "switch",
           "synchronized",
           "throw",
           "throws",
           "transient",
           "try",
           "typeof",
           "var",
           "void",
           "volatile",
           "while",
           "with");

   /**
    * Set of all HTML attributes which expect a URI (as the value).
    * <a href="http://www.w3.org/TR/html4/index/attributes.html">Index of Attributes</a>
    */
   private static final Set<String> ATTRIBUTE_EXPECTS_URI =
       ImmutableSortedSet.of(
           "action",
           "archive",
           "background",
           "cite",
           "classid",
           "codebase",
           "data",
           "dynsrc",
           "href",
           "longdesc",
           "src",
           "usemap");

   /**
    * Set of {@code Character}s considered whitespace in Javascript.
    * See {@link #isJavascriptWhitespace(char)}
    */
   private static final Set<Character> JAVASCRIPT_WHITESPACE =
       ImmutableSortedSet.of(
             '\u0009',         /* Tab \t */
             '\n',             /* Line-Feed 0x0A */
             '\u000B',         /* Vertical Tab 0x0B */
             '\u000C',         /* Form Feed \f */
             '\r',             /* Carriage Return 0x0D */
             ' ',              /* Space 0x20 */
             '\u00A0',         /* Non-breaking space 0xA0 */
             '\u2028',         /* Line separator */
             '\u2029');        /* Paragraph separator */

   /**
   * Set of {@code Character}s considered whitespace in HTML.
   * See {@link #isHtmlSpace(char)}
   */
  private static final Set<Character> HTML_WHITESPACE =
       ImmutableSortedSet.of(
           ' ',
           '\t',
           '\n',
           '\r',
           '\u200B');


   /**
    * Determines if the HTML attribute specified expects javascript
    * for its value. Such is the case for example with the {@code onclick}
    * attribute.
    *
    * <p>Currently returns {@code true} for any attribute name that starts
    * with "on" which is not exactly correct but we trust a developer to
    * not use non-spec compliant attribute names (e.g. onbogus).
    *
    * @param attribute the name of an HTML attribute
    * @return {@code false} if the input is null or is not an attribute
    *         that expects javascript code; {@code true}
    */
   public static boolean isAttributeJavascript(String attribute) {
     return ((attribute != null) && attribute.startsWith("on"));
   }

   /**
    * Determines if the HTML attribute specified expects a {@code style}
    * for its value. Currently this is only true for the {@code style}
    * HTML attribute.
    *
    * @param attribute the name of an HTML attribute
    * @return {@code true} iff the attribute name is one that expects a
    *     style for a value; otherwise {@code false}
    */
   public static boolean isAttributeStyle(String attribute) {
     return "style".equals(attribute);
   }

   /**
    * Determines if the HTML attribute specified expects a {@code URI}
    * for its value. For example, both {@code href} and {@code src}
    * expect a {@code URI} but {@code style} does not. Returns
    * {@code false} if the attribute given was {@code null}.
    *
    * @param attribute the name of an HTML attribute
    * @return {@code true} if the attribute name is one that expects
    *         a URI for a value; otherwise {@code null}
    *
    * @see #ATTRIBUTE_EXPECTS_URI
    */
   public static boolean isAttributeUri(String attribute) {
     return ATTRIBUTE_EXPECTS_URI.contains(attribute);
   }

   /**
    * Determines if the specified character is an HTML whitespace character.
    * A character is an HTML whitespace character if and only if it is one
    * of the characters below.
    * <ul>
    * <li>A <code>Space</code> character
    * <li>A <code>Tab</code> character
    * <li>A <code>Line feed</code> character
    * <li>A <code>Carriage Return</code> character
    * <li>A <code>Zero-Width Space</code> character
    * </ul>
    *
    * Note: The list includes the zero-width space (<code>&amp;#x200B;</code>)
    * which is not included in the C version.
    *
    * @param chr the {@code char} to check
    * @return {@code true} if the character is an HTML whitespace character
    *
    * <a href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">White space</a>
    */
   public static boolean isHtmlSpace(char chr) {
     return HTML_WHITESPACE.contains(chr);
   }

   /**
    * Determines if the specified character is an ECMAScript whitespace or line
    * terminator character. A character is a whitespace or line terminator if
    * and only if it is one of the characters below:
    * <ul>
    * <li>A white-space character (<code>Tab</code>, <code>Vertical Tab</code>,
    *     <code>Form Feed</code>, <code>Space</code>,
    *     <code>No-break space</code>)
    * <li>A line terminator character (<code>Line Feed</code>,
    *     <code>Carriage Return</code>, <code>Line separator</code>,
    *     <code>Paragraph Separator</code>).
    * </ul>
    *
    * <p>Encompasses the characters in sections 7.2 and 7.3 of ECMAScript 3, in
    * particular, this list is quite different from that in
    * <code>Character.isWhitespace</code>.
    * <a href="http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-262.pdf">
    * ECMAScript Language Specification</a>
    *
    * @param chr the {@code char} to check
    * @return {@code true} or {@code false}
    *
    */
   public static boolean isJavascriptWhitespace(char chr) {
     return JAVASCRIPT_WHITESPACE.contains(chr);
   }

   /**
    * Determines if the specified character is a valid character in an
    * ECMAScript identifier. This determination is currently not exact,
    * in particular:
    * <ul>
    * <li>It does not accept Unicode letters, only ASCII ones.
    * <li>It does not distinguish between the first character of an identifier
    *     (which cannot contain numbers) and subsequent characters.
    * </li>
    * </ul>
    *
    * We are considering leveraging <code>Character.isJavaIdentifierStart</code>
    * and <code>Character.isJavaIdentifierPart</code> given that Java
    * and Javascript follow similar identifier naming rules but we lose
    * compatibility with the C-version.
    *
    * @param chr {@code char} to check
    * @return {@code true} if the {@code chr} is a Javascript whitespace
    *         character; otherwise {@code false}
    */
   public static boolean isJavascriptIdentifier(char chr) {
     return ((chr >= 'a' && chr <= 'z')
         || (chr >= 'A' && chr <= 'Z')
         || (chr >= '0' && chr <= '9')
         || chr == '_' || chr == '$');
   }

   /**
    * Determines if the input token provided is a valid token prefix to a
    * javascript regular expression.  The token argument is compared against
    * a {@code Set} of identifiers that can precede a regular expression in the
    * javascript grammar, and returns {@code true} if the provided
    * {@code String} is in that {@code Set}.
    *
    * @param input the {@code String} token to check
    * @return {@code true} iff the token is a valid prefix of a regexp
    */
   public static boolean isJavascriptRegexpPrefix(String input) {
     return REGEXP_TOKEN_PREFIXS.contains(input);
   }

   /**
    * Encodes the specified character using Ascii for convenient insertion into
    * a single-quote enclosed {@code String}. Printable characters
    * are returned as-is. Carriage Return, Line Feed, Horizontal Tab,
    * back-slash and single quote are all backslash-escaped. All other characters
    * are returned hex-encoded.
    *
    * @param chr {@code char} to encode
    * @return an Ascii-friendly encoding of the given {@code char}
    */
   public static String encodeCharForAscii(char chr) {
     if (chr == '\'') {
       return "\\'";
     } else if (chr == '\\') {
       return "\\\\";
     } else if (chr >= 32 && chr <= 126) {
       return String.format("%c", chr);
     } else if (chr == '\n') {
       return "\\n";
     } else if (chr == '\r') {
       return "\\r";
     } else if (chr == '\t') {
       return "\\t";
     } else {
       // Cannot apply a precision specifier for integral types. Specifying
       // 0-padded hex-encoding with minimum width of two.
       return String.format("\\u%04x", (int)chr);
     }
   }

   /**
    * Parses the given {@code String} to determine if it contains a URL in the
    * format followed by the {@code content} attribute of the {@code meta}
    * HTML tag.
    *
    * <p>This function expects to receive the value of the {@code content} HTML
    * attribute. This attribute takes on different meanings depending on the
    * value of the {@code http-equiv} HTML attribute of the same {@code meta}
    * tag. Since we may not have access to the {@code http-equiv} attribute,
    * we instead rely on parsing the given value to determine if it contains
    * a URL.
    *
    * The specification of the {@code meta} HTML tag can be found in:
    *   http://dev.w3.org/html5/spec/Overview.html#attr-meta-http-equiv-refresh
    *
    * <p>We return {@link HtmlUtils.META_REDIRECT_TYPE} indicating whether the
    * value contains a URL and whether we are at the start of the URL or past
    * the start. We are at the start of the URL if and only if one of the two
    * conditions below is true:
    * <ul>
    * <li>The given input does not contain any characters from the URL proper.
    * Example "5; URL=".
    * <li>The given input only contains the optional leading single or double
    * quote leading the URL. Example "5; URL='".
    * </li>
    * </ul>
    *
    * <p>Examples:
    * <ul>
    * <li> Example of a complete {@code meta} tag where the {@code content}
    * attribute contains a URL [we are not at the start of the URL]:
    * <pre>
    * &lt;meta http-equiv="refresh" content="5; URL=http://www.google.com"&gt;
    * </pre>
    * <li> Example of a complete {@code meta} tag where the {@code content}
    * attribute contains a URL [we are at the start of the URL]:
    * <pre>
    * &lt;meta http-equiv="refresh" content="5; URL="&gt;
    * </pre>
    * <li>Example of a complete {@code meta} tag where the {@code content}
    * attribute does not contain a URL:
    * <pre>
    * &lt;meta http-equiv="content-type" content="text/html"&gt;
    * </pre>
    * </ul>
    *
    * @param value {@code String} to parse
    * @return {@link HtmlUtils.META_REDIRECT_TYPE} indicating the presence
    * of a URL in the given value
    */
   public static META_REDIRECT_TYPE parseContentAttributeForUrl(String value) {
     if (value == null)
       return META_REDIRECT_TYPE.NONE;

     Matcher matcher = META_REDIRECT_PATTERN.matcher(value);
     if (!matcher.find())
       return META_REDIRECT_TYPE.NONE;

     // We have more content.
     if (value.length() > matcher.end())
       return META_REDIRECT_TYPE.URL;

     return META_REDIRECT_TYPE.URL_START;
   }
 }
	/*
	* Copyright (C) 2010 Google Inc.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package com.google.streamhtmlparser.util;

	import com.google.common.collect.ImmutableSortedSet;

	import java.util.Set;
	import java.util.regex.Pattern;
	import java.util.regex.Matcher;

	/**
	* Utility functions for HTML and Javascript that are most likely
	* not interesting to users outside this package.
	*
	* <p>The <code>HtmlParser</code> will be open-sourced hence we took the
	* decision to keep these utilities in this package as well as not to
	* leverage others that may exist in the <code>google3</code> code base.
	*
	* <p>The functionality exposed is designed to be 100% compatible with
	* the corresponding logic in the C-version of the HtmlParser as such
	* we are particularly concerned with cross-language compatibility.
	*
	* <p>Note: The words {@code Javascript} and {@code ECMAScript} are used
	* interchangeably unless otherwise noted.
	*/
	public final class HtmlUtils {

	/**
	* static utility class
	*/
	private HtmlUtils() {
	} // COV_NF_LINE

	/**
	* Indicates the type of content contained in the {@code content} HTML
	* attribute of the {@code meta} HTML tag. Used by
	* {@link HtmlUtils#parseContentAttributeForUrl(String)}.
	* <p>The values are:
	* <ul>
	* <li>{@code NONE} if it does not contain a URL in the expected format.
	* <li>{@code URL_START} if it contains a URL but hasn't seen any of
	* its contents.
	* <li>{@code URL} if it contains a URL and has seen at least some of
	* its contents.
	* </ul>
	*/
	public enum META_REDIRECT_TYPE {
	NONE,
	URL_START,
	URL
	}

	/**
	* A regular expression matching the format of a {@code content} attribute
	* that contains a URL. Used by {@link #parseContentAttributeForUrl}.
	*/
	private static final String META_REDIRECT_REGEX =
	"^\\s\\d\\s;\\sURL\\s=\\s[\'\"]?";

	// Safe for use by concurrent threads so we compile once.
	private static final Pattern META_REDIRECT_PATTERN =
	Pattern.compile(META_REDIRECT_REGEX, Pattern.CASE_INSENSITIVE);

	/**
	* Set of keywords that can precede a regular expression literal. Taken from:
	* <a href="http://www.mozilla.org/js/language/js20-2000-07/rationale/syntax.html">
	* Language Syntax</a>
	*
	* <p>The token {@code void} was added to the list. Several keywords are
	* defined in Ecmascript 4 not Ecmascript 3. However, to keep the logic
	* simple we do not differentiate on the version and bundle them all together.
	*/
	private static final Set<String> REGEXP_TOKEN_PREFIXS =
	ImmutableSortedSet.of(
	"abstract",
	"break",
	"case",
	"catch",
	"class",
	"const",
	"continue",
	"debugger",
	"default",
	"delete",
	"do",
	"else",
	"enum",
	"eval",
	"export",
	"extends",
	"field",
	"final",
	"finally",
	"for",
	"function",
	"goto",
	"if",
	"implements",
	"import",
	"in",
	"instanceof",
	"native",
	"new",
	"package",
	"private",
	"protected",
	"public",
	"return",
	"static",
	"switch",
	"synchronized",
	"throw",
	"throws",
	"transient",
	"try",
	"typeof",
	"var",
	"void",
	"volatile",
	"while",
	"with");

	/**
	* Set of all HTML attributes which expect a URI (as the value).
	* <a href="http://www.w3.org/TR/html4/index/attributes.html">Index of Attributes</a>
	*/
	private static final Set<String> ATTRIBUTE_EXPECTS_URI =
	ImmutableSortedSet.of(
	"action",
	"archive",
	"background",
	"cite",
	"classid",
	"codebase",
	"data",
	"dynsrc",
	"href",
	"longdesc",
	"src",
	"usemap");

	/**
	* Set of {@code Character}s considered whitespace in Javascript.
	* See {@link #isJavascriptWhitespace(char)}
	*/
	private static final Set<Character> JAVASCRIPT_WHITESPACE =
	ImmutableSortedSet.of(
	'\u0009', /* Tab \t */
	'\n', /* Line-Feed 0x0A */
	'\u000B', /* Vertical Tab 0x0B */
	'\u000C', /* Form Feed \f */
	'\r', /* Carriage Return 0x0D */
	' ', /* Space 0x20 */
	'\u00A0', /* Non-breaking space 0xA0 */
	'\u2028', /* Line separator */
	'\u2029'); /* Paragraph separator */

	/**
	* Set of {@code Character}s considered whitespace in HTML.
	* See {@link #isHtmlSpace(char)}
	*/
	private static final Set<Character> HTML_WHITESPACE =
	ImmutableSortedSet.of(
	' ',
	'\t',
	'\n',
	'\r',
	'\u200B');


	/**
	* Determines if the HTML attribute specified expects javascript
	* for its value. Such is the case for example with the {@code onclick}
	* attribute.
	*
	* <p>Currently returns {@code true} for any attribute name that starts
	* with "on" which is not exactly correct but we trust a developer to
	* not use non-spec compliant attribute names (e.g. onbogus).
	*
	* @param attribute the name of an HTML attribute
	* @return {@code false} if the input is null or is not an attribute
	* that expects javascript code; {@code true}
	*/
	public static boolean isAttributeJavascript(String attribute) {
	return ((attribute != null) && attribute.startsWith("on"));
	}

	/**
	* Determines if the HTML attribute specified expects a {@code style}
	* for its value. Currently this is only true for the {@code style}
	* HTML attribute.
	*
	* @param attribute the name of an HTML attribute
	* @return {@code true} iff the attribute name is one that expects a
	* style for a value; otherwise {@code false}
	*/
	public static boolean isAttributeStyle(String attribute) {
	return "style".equals(attribute);
	}

	/**
	* Determines if the HTML attribute specified expects a {@code URI}
	* for its value. For example, both {@code href} and {@code src}
	* expect a {@code URI} but {@code style} does not. Returns
	* {@code false} if the attribute given was {@code null}.
	*
	* @param attribute the name of an HTML attribute
	* @return {@code true} if the attribute name is one that expects
	* a URI for a value; otherwise {@code null}
	*
	* @see #ATTRIBUTE_EXPECTS_URI
	*/
	public static boolean isAttributeUri(String attribute) {
	return ATTRIBUTE_EXPECTS_URI.contains(attribute);
	}

	/**
	* Determines if the specified character is an HTML whitespace character.
	* A character is an HTML whitespace character if and only if it is one
	* of the characters below.
	* <ul>
	* <li>A <code>Space</code> character
	* <li>A <code>Tab</code> character
	* <li>A <code>Line feed</code> character
	* <li>A <code>Carriage Return</code> character
	* <li>A <code>Zero-Width Space</code> character
	* </ul>
	*
	* Note: The list includes the zero-width space (<code>&#x200B;</code>)
	* which is not included in the C version.
	*
	* @param chr the {@code char} to check
	* @return {@code true} if the character is an HTML whitespace character
	*
	* <a href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">White space</a>
	*/
	public static boolean isHtmlSpace(char chr) {
	return HTML_WHITESPACE.contains(chr);
	}

	/**
	* Determines if the specified character is an ECMAScript whitespace or line
	* terminator character. A character is a whitespace or line terminator if
	* and only if it is one of the characters below:
	* <ul>
	* <li>A white-space character (<code>Tab</code>, <code>Vertical Tab</code>,
	* <code>Form Feed</code>, <code>Space</code>,
	* <code>No-break space</code>)
	* <li>A line terminator character (<code>Line Feed</code>,
	* <code>Carriage Return</code>, <code>Line separator</code>,
	* <code>Paragraph Separator</code>).
	* </ul>
	*
	* <p>Encompasses the characters in sections 7.2 and 7.3 of ECMAScript 3, in
	* particular, this list is quite different from that in
	* <code>Character.isWhitespace</code>.
	* <a href="http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-262.pdf">
	* ECMAScript Language Specification</a>
	*
	* @param chr the {@code char} to check
	* @return {@code true} or {@code false}
	*
	*/
	public static boolean isJavascriptWhitespace(char chr) {
	return JAVASCRIPT_WHITESPACE.contains(chr);
	}

	/**
	* Determines if the specified character is a valid character in an
	* ECMAScript identifier. This determination is currently not exact,
	* in particular:
	* <ul>
	* <li>It does not accept Unicode letters, only ASCII ones.
	* <li>It does not distinguish between the first character of an identifier
	* (which cannot contain numbers) and subsequent characters.
	* </li>
	* </ul>
	*
	* We are considering leveraging <code>Character.isJavaIdentifierStart</code>
	* and <code>Character.isJavaIdentifierPart</code> given that Java
	* and Javascript follow similar identifier naming rules but we lose
	* compatibility with the C-version.
	*
	* @param chr {@code char} to check
	* @return {@code true} if the {@code chr} is a Javascript whitespace
	* character; otherwise {@code false}
	*/
	public static boolean isJavascriptIdentifier(char chr) {
	return ((chr >= 'a' && chr <= 'z')
	\|\| (chr >= 'A' && chr <= 'Z')
	\|\| (chr >= '0' && chr <= '9')
	\|\| chr == '_' \|\| chr == '$');
	}

	/**
	* Determines if the input token provided is a valid token prefix to a
	* javascript regular expression. The token argument is compared against
	* a {@code Set} of identifiers that can precede a regular expression in the
	* javascript grammar, and returns {@code true} if the provided
	* {@code String} is in that {@code Set}.
	*
	* @param input the {@code String} token to check
	* @return {@code true} iff the token is a valid prefix of a regexp
	*/
	public static boolean isJavascriptRegexpPrefix(String input) {
	return REGEXP_TOKEN_PREFIXS.contains(input);
	}

	/**
	* Encodes the specified character using Ascii for convenient insertion into
	* a single-quote enclosed {@code String}. Printable characters
	* are returned as-is. Carriage Return, Line Feed, Horizontal Tab,
	* back-slash and single quote are all backslash-escaped. All other characters
	* are returned hex-encoded.
	*
	* @param chr {@code char} to encode
	* @return an Ascii-friendly encoding of the given {@code char}
	*/
	public static String encodeCharForAscii(char chr) {
	if (chr == '\'') {
	return "\\'";
	} else if (chr == '\\') {
	return "\\\\";
	} else if (chr >= 32 && chr <= 126) {
	return String.format("%c", chr);
	} else if (chr == '\n') {
	return "\\n";
	} else if (chr == '\r') {
	return "\\r";
	} else if (chr == '\t') {
	return "\\t";
	} else {
	// Cannot apply a precision specifier for integral types. Specifying
	// 0-padded hex-encoding with minimum width of two.
	return String.format("\\u%04x", (int)chr);
	}
	}

	/**
	* Parses the given {@code String} to determine if it contains a URL in the
	* format followed by the {@code content} attribute of the {@code meta}
	* HTML tag.
	*
	* <p>This function expects to receive the value of the {@code content} HTML
	* attribute. This attribute takes on different meanings depending on the
	* value of the {@code http-equiv} HTML attribute of the same {@code meta}
	* tag. Since we may not have access to the {@code http-equiv} attribute,
	* we instead rely on parsing the given value to determine if it contains
	* a URL.
	*
	* The specification of the {@code meta} HTML tag can be found in:
	* http://dev.w3.org/html5/spec/Overview.html#attr-meta-http-equiv-refresh
	*
	* <p>We return {@link HtmlUtils.META_REDIRECT_TYPE} indicating whether the
	* value contains a URL and whether we are at the start of the URL or past
	* the start. We are at the start of the URL if and only if one of the two
	* conditions below is true:
	* <ul>
	* <li>The given input does not contain any characters from the URL proper.
	* Example "5; URL=".
	* <li>The given input only contains the optional leading single or double
	* quote leading the URL. Example "5; URL='".
	* </li>
	* </ul>
	*
	* <p>Examples:
	* <ul>
	* <li> Example of a complete {@code meta} tag where the {@code content}
	* attribute contains a URL [we are not at the start of the URL]:
	* <pre>
	* <meta http-equiv="refresh" content="5; URL=http://www.google.com">
	* </pre>
	* <li> Example of a complete {@code meta} tag where the {@code content}
	* attribute contains a URL [we are at the start of the URL]:
	* <pre>
	* <meta http-equiv="refresh" content="5; URL=">
	* </pre>
	* <li>Example of a complete {@code meta} tag where the {@code content}
	* attribute does not contain a URL:
	* <pre>
	* <meta http-equiv="content-type" content="text/html">
	* </pre>
	* </ul>
	*
	* @param value {@code String} to parse
	* @return {@link HtmlUtils.META_REDIRECT_TYPE} indicating the presence
	* of a URL in the given value
	*/
	public static META_REDIRECT_TYPE parseContentAttributeForUrl(String value) {
	if (value == null)
	return META_REDIRECT_TYPE.NONE;

	Matcher matcher = META_REDIRECT_PATTERN.matcher(value);
	if (!matcher.find())
	return META_REDIRECT_TYPE.NONE;

	// We have more content.
	if (value.length() > matcher.end())
	return META_REDIRECT_TYPE.URL;

	return META_REDIRECT_TYPE.URL_START;
	}
	}