/* Woodstox Lite ("wool") XML processor
*
* Copyright (c) 2006- Tatu Saloranta, tatu.saloranta@iki.fi
*
* Licensed under the License specified in the file LICENSE which is
* included with the source code.
* You may not use this file except in compliance with the License.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.fasterxml.aalto.in;
import com.fasterxml.aalto.util.
XmlCharTypes;
/**
 * Holder for the byte-classification tables used when decoding byte-based
 * input. One shared {@link XmlCharTypes} instance exists per supported
 * encoding family (UTF-8, ASCII, Latin-1); the static helpers in this class
 * populate the per-context tables of such an instance.
 */
public final class InputCharTypes
    extends XmlCharTypes
{
    /*
    /**********************************************************************
    /* Name-table type codes
    /**********************************************************************
     */

    /* Most type codes are shared with the base class, but name handling
     * differs enough to warrant a partially separate value space.
     */

    /**
     * Name-table type code; not assigned by any code in this class.
     *<p>
     * Important: must not overlap with the base constants.
     * Last constant (CT_NAME_ANY) currently has value 3.
     */
    public static final int CT_INPUT_NAME_MB_N = 4;

    /**
     * Name-table type code stored for bytes that start a 2-byte UTF-8
     * sequence (0xC0 - 0xDF).
     */
    public static final int CT_INPUT_NAME_MB_2 = 5;

    /**
     * Name-table type code stored for bytes that start a 3-byte UTF-8
     * sequence (0xE0 - 0xEF).
     */
    public static final int CT_INPUT_NAME_MB_3 = 6;

    /**
     * Name-table type code stored for bytes that start a 4-byte UTF-8
     * sequence (0xF0 - 0xF7).
     */
    public static final int CT_INPUT_NAME_MB_4 = 7;

    /*
    /**********************************************************************
    /* Shared instances
    /**********************************************************************
     */

    /* The non-UTF-8 tables are built on first request: there is a good
     * chance they are never needed, so deferring them may reduce memory
     * footprint and startup time.
     */
    private static XmlCharTypes sAsciiCharTypes;

    private static XmlCharTypes sLatin1CharTypes;

    /* UTF-8 is expected to be the common case, so unlike the others it is
     * built eagerly, during class initialization.
     */
    private static final XmlCharTypes sUtf8CharTypes = buildUtf8CharTypes();

    /**
     * Creates a fresh {@link XmlCharTypes} and populates all five of its
     * tables via {@link #fillInUtf8Chars}.
     */
    private static XmlCharTypes buildUtf8CharTypes()
    {
        final XmlCharTypes types = new XmlCharTypes();
        fillInUtf8Chars(types.TEXT_CHARS, types.ATTR_CHARS, types.NAME_CHARS,
                types.DTD_CHARS, types.OTHER_CHARS);
        return types;
    }

    /**
     * @return the shared UTF-8 table set, built when this class is
     *   initialized; every call returns the same instance
     */
    public static final XmlCharTypes getUtf8CharTypes()
    {
        return sUtf8CharTypes;
    }

    /**
     * Returns the shared ASCII table set, building it on the first call.
     * The tables start out with the Latin-1 classification, after which
     * entries 128 - 255 of every table are overwritten with
     * {@code CT_INVALID}.
     *<p>
     * The method is synchronized on this class, so construction happens
     * at most once.
     *
     * @return the shared ASCII table set; every call returns the same instance
     */
    public static final synchronized XmlCharTypes getAsciiCharTypes()
    {
        XmlCharTypes types = sAsciiCharTypes;
        if (types != null) {
            return types;
        }
        types = new XmlCharTypes();
        // The field is assigned before the tables are populated
        sAsciiCharTypes = types;
        fillInLatin1Chars(types.TEXT_CHARS, types.ATTR_CHARS, types.NAME_CHARS,
                types.DTD_CHARS, types.OTHER_CHARS);
        // ... but everything in the high-bit range needs to be wiped out:
        final int[][] tables = {
                types.TEXT_CHARS,
                types.ATTR_CHARS,
                types.NAME_CHARS,
                types.DTD_CHARS,
                types.OTHER_CHARS
        };
        for (int[] table : tables) {
            fillInIllegalAsciiRange(table);
        }
        return types;
    }

    /**
     * Returns the shared Latin-1 table set, building it on the first call
     * by passing the five tables of a new {@link XmlCharTypes} to
     * {@code fillInLatin1Chars} (declared outside this file).
     *<p>
     * The method is synchronized on this class, so construction happens
     * at most once.
     *
     * @return the shared Latin-1 table set; every call returns the same instance
     */
    public static final synchronized XmlCharTypes getLatin1CharTypes()
    {
        XmlCharTypes types = sLatin1CharTypes;
        if (types != null) {
            return types;
        }
        types = new XmlCharTypes();
        // The field is assigned before the tables are populated
        sLatin1CharTypes = types;
        fillInLatin1Chars(types.TEXT_CHARS, types.ATTR_CHARS, types.NAME_CHARS,
                types.DTD_CHARS, types.OTHER_CHARS);
        return types;
    }

    /*
    /**********************************************************************
    /* Table population
    /**********************************************************************
     */

    /**
     * Populates the five classification tables for UTF-8 encoded input.
     * Each table is first given its 8-bit classification by the matching
     * {@code fillIn8Bit...Range} helper (declared outside this file), and
     * then entries 128 - 255 are set to multi-byte markers based on the
     * UTF-8 lead-byte pattern. Every array is written at indices up to 255.
     *
     * @param textChars table for text content
     * @param attrChars table for attribute values
     * @param nameChars table for names; its high-bit entries get the
     *   {@code CT_INPUT_NAME_MB_*} codes instead of {@code CT_MULTIBYTE_*}
     * @param dtdChars table for DTD content
     * @param otherChars table for comments, CDATA sections and processing
     *   instructions
     */
    public static void fillInUtf8Chars(int[] textChars, int[] attrChars,
            int[] nameChars, int[] dtdChars, int[] otherChars)
    {
        // Text content
        fillIn8BitTextRange(textChars);
        fillInMultiByteTextRange(textChars);

        // Attribute values
        fillIn8BitAttrRange(attrChars);
        fillInMultiByteTextRange(attrChars);

        // Names: the high-bit range gets per-length markers just like text
        // content does, but using the name-specific codes
        fillIn8BitNameRange(nameChars);
        fillInMultiByteNameRange(nameChars);

        // DTD
        fillIn8BitDtdRange(dtdChars);
        fillInMultiByteTextRange(dtdChars);
        // ... lotsa matching to do here

        // Comment, CDATA, PI
        fillInUtf8OtherChars(otherChars);
    }

    /**
     * Sets up the table used inside comments, CDATA sections and
     * processing instructions, as the final step of
     * {@link #fillInUtf8Chars}.
     */
    private static void fillInUtf8OtherChars(int[] otherChars)
    {
        /* 25-Jan-2011, tatu: Can't remember why LBRACKET would be needed,
         *   so '[' is not marked with CT_LBRACKET here.
         * NOTE(review): these two entries are stored before the text-range
         * fill below; whether that fill overwrites them depends on
         * fillIn8BitTextRange, which is not visible in this file.
         */
        otherChars[']'] = CT_RBRACKET;
        otherChars['>'] = CT_GT;

        // Basic text classification is the starting point:
        fillIn8BitTextRange(otherChars);
        fillInMultiByteTextRange(otherChars);

        /* Ampersand and lt are not special in any of these events, so they
         * are reset; ']', '?' and '-' are marked instead, since each may
         * start the end marker of one of the events.
         */
        otherChars['&'] = CT_OK;
        otherChars['<'] = CT_OK;
        otherChars[']'] = CT_RBRACKET; // CDATA
        otherChars['?'] = CT_QMARK; // PI
        otherChars['-'] = CT_HYPHEN; // Comment
    }

    /**
     * Marks entries 128 - 255 of a text-like table with the
     * {@code CT_MULTIBYTE_*} code matching the UTF-8 sequence length, or
     * {@code CT_INVALID} for bytes that can not start a sequence.
     */
    private static void fillInMultiByteTextRange(int[] arr)
    {
        fillInMultiByteRange(arr, CT_MULTIBYTE_2, CT_MULTIBYTE_3, CT_MULTIBYTE_4);
    }

    /**
     * Marks entries 128 - 255 of a name table with the
     * {@code CT_INPUT_NAME_MB_*} code matching the UTF-8 sequence length,
     * or {@code CT_INVALID} for bytes that can not start a sequence.
     */
    private static void fillInMultiByteNameRange(int[] arr)
    {
        fillInMultiByteRange(arr, CT_INPUT_NAME_MB_2, CT_INPUT_NAME_MB_3,
                CT_INPUT_NAME_MB_4);
    }

    /**
     * Shared implementation for the two multi-byte fillers: walks byte
     * values 128 - 255 in ascending order and stores the given code for
     * each class of UTF-8 lead byte.
     *
     * @param arr table to modify
     * @param twoByteCode value for 0xC0 - 0xDF (2 bytes; 0x0080 - 0x07FF)
     * @param threeByteCode value for 0xE0 - 0xEF (3 bytes; 0x0800 - 0xFFFF)
     * @param fourByteCode value for 0xF0 - 0xF7 (4 bytes; surrogate pairs)
     */
    private static void fillInMultiByteRange(int[] arr, int twoByteCode,
            int threeByteCode, int fourByteCode)
    {
        for (int lead = 0x80; lead <= 0xFF; lead++) {
            final int type;
            if (lead < 0xC0) { // 0x80 - 0xBF: continuation bytes
                type = CT_INVALID;
            } else if (lead < 0xE0) {
                type = twoByteCode;
            } else if (lead < 0xF0) {
                type = threeByteCode;
            } else if (lead < 0xF8) {
                type = fourByteCode;
            } else { // 0xF8 - 0xFF: not a legal lead byte
                type = CT_INVALID;
            }
            arr[lead] = type;
        }
    }

    /**
     * Overwrites entries 128 - 255 of the given table with
     * {@code CT_INVALID}; lower entries are left untouched.
     *
     * @param arr table to modify; written at indices up to 255
     */
    protected static void fillInIllegalAsciiRange(int[] arr)
    {
        for (int ix = 0x80; ix < 0x100; ix++) {
            arr[ix] = CT_INVALID;
        }
    }
}