ByteBasedScanner.java - 源码阅读网

aalto-xml-1.1.0.jar

|

com.fasterxml:aalto-xml:1.1.0

META-INF

com

fasterxml

aalto

io

AsyncInputFeeder.java

package-info.java

async

AsyncXMLStreamReader.java

util

evt

stax

in

NsDeclaration.java

ByteBasedPNameFactory.java

PName.java

NsBinding.java

FixedNsContext.java

XmlScanner.java

Utf32Reader.java

InputCharTypes.java

ByteBasedScanner.java

ByteSourceBootstrapper.java

PNameC.java

ElementScope.java

PName3.java

CharSourceBootstrapper.java

AttributeCollector.java

CharBasedPNameTable.java

PNameN.java

ByteBasedPName.java

PName1.java

EntityNames.java

MergedStream.java

ReaderScanner.java

PName2.java

InputBootstrapper.java

ByteBasedPNameTable.java

Utf8Scanner.java

ReaderConfig.java

StreamScanner.java

AsyncByteArrayFeeder.java

UncheckedStreamException.java

sax

WFCException.java

AsyncXMLInputFactory.java

impl

dom

AsyncByteBufferFeeder.java

ValidationException.java

out

annotations

test

ByteBasedScanner.java

清空

类结构

/* Aalto XML processor * * Copyright (c) 2006- Tatu Saloranta, tatu.saloranta@iki.fi * * Licensed under the License specified in the file LICENSE which is * included with the source code. * You may not use this file except in compliance with the License. * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.fasterxml.aalto.in; import java.io.IOException; import javax.xml.stream.XMLStreamException; import org.codehaus.stax2.XMLStreamLocation2; import com.fasterxml.aalto.impl.LocationImpl; import com.fasterxml.aalto.util.DataUtil; import com.fasterxml.aalto.util.XmlCharTypes; import com.fasterxml.aalto.util.XmlChars; /** * Intermediate base class used by different byte-backed scanners. * Specifically, used as a base by both blocking (stream) and * non-blocking (async) byte-based scanners (as opposed to Reader-backed, * character-based scanners) */ public abstract class ByteBasedScanner extends XmlScanner { /* /********************************************************************** /* Byte constants /********************************************************************** */ // White-space: final protected static byte BYTE_NULL = (byte) 0; final protected static byte BYTE_SPACE = (byte) ' '; final protected static byte BYTE_LF = (byte) '\n'; final protected static byte BYTE_CR = (byte) '\r'; final protected static byte BYTE_TAB = (byte) 9; final protected static byte BYTE_LT = (byte) '<'; final protected static byte BYTE_GT = (byte) '>'; final protected static byte BYTE_AMP = (byte) '&'; final protected static byte BYTE_HASH = (byte) '#'; final protected static byte BYTE_EXCL = (byte) '!'; final protected static byte BYTE_HYPHEN = (byte) '-'; final protected static byte BYTE_QMARK = (byte) '?'; final protected static byte BYTE_SLASH = (byte) '/'; final protected static byte BYTE_EQ = (byte) '='; final protected static byte BYTE_QUOT = (byte) '"'; final protected static byte BYTE_APOS = (byte) '\''; final protected static byte BYTE_LBRACKET = (byte) '['; final protected static byte BYTE_RBRACKET = (byte) ']'; final protected static byte BYTE_SEMICOLON = (byte) ';'; final protected static byte BYTE_a = (byte) 'a'; final protected static byte BYTE_g = (byte) 'g'; final protected static byte BYTE_l = (byte) 'l'; final protected static byte BYTE_m = (byte) 'm'; final protected static byte BYTE_o = (byte) 'o'; final protected static byte BYTE_p = (byte) 'p'; final protected static byte BYTE_q = (byte) 'q'; final protected static byte BYTE_s = (byte) 's'; final protected static byte BYTE_t = (byte) 't'; final protected static byte BYTE_u = (byte) 'u'; final protected static byte BYTE_x = (byte) 'x'; final protected static byte BYTE_A = (byte) 'A'; final protected static byte BYTE_C = (byte) 'C'; final protected static byte BYTE_D = (byte) 'D'; final protected static byte BYTE_P = (byte) 'P'; final protected static byte BYTE_S = (byte) 'S'; final protected static byte BYTE_T = (byte) 'T'; /* /********************************************************************** /* Input buffering /********************************************************************** */ /** * Pointer to the next unread byte in the input buffer. */ protected int _inputPtr; /** * Pointer to the first byte <b>after</b> the end of valid content. * This may point beyond of the physical buffer array. */ protected int _inputEnd; /* /********************************************************************** /* Symbol and character handling /********************************************************************** */ /** * This buffer is used for name parsing. Will be expanded if/as * needed; 32 ints can hold names 128 ascii chars long. */ protected int[] _quadBuffer = new int[32]; /** * For now, symbol table contains prefixed names. In future it is * possible that they may be split into prefixes and local names? */ protected final ByteBasedPNameTable _symbols; /** * This is a simple container object that is used to access the * decoding tables for characters. Indirection is needed since * we actually support multiple utf-8 compatible encodings, not * just utf-8 itself. */ protected final XmlCharTypes _charTypes; /* /********************************************************************** /* Parsing state /********************************************************************** */ /** * Storage location for a single character that can not be easily * pushed back (for example, multi-byte char; or char entity * expansion). Negative, if from entity expansion; positive if * a singular char. */ protected int _tmpChar = INT_NULL; /* /********************************************************************** /* Life-cycle /********************************************************************** */ protected ByteBasedScanner(ReaderConfig cfg) { super(cfg); _symbols = cfg.getBBSymbols(); _charTypes = cfg.getCharTypes(); _pastBytesOrChars = 0; // should it be passed by caller? _rowStartOffset = 0; // should probably be passed by caller... } @Override protected void _releaseBuffers() { super._releaseBuffers(); if (_symbols.maybeDirty()) { _config.updateBBSymbols(_symbols); } } @Override protected abstract void _closeSource() throws IOException; /* /********************************************************************** /* Location handling /********************************************************************** */ @Override public XMLStreamLocation2 getCurrentLocation() { return LocationImpl.fromZeroBased(_config.getPublicId(), _config.getSystemId(), _pastBytesOrChars + _inputPtr, _currRow, _inputPtr - _rowStartOffset); } @Override public int getCurrentColumnNr() { return _inputPtr - _rowStartOffset; } @Override public long getStartingByteOffset() { return _startRawOffset; } @Override public long getStartingCharOffset() { // N/A for this type return -1L; } @Override public long getEndingByteOffset() throws XMLStreamException { // Have to complete the token to know the ending location... if (_tokenIncomplete) { finishToken(); } return _pastBytesOrChars + _inputPtr; } @Override public long getEndingCharOffset() throws XMLStreamException { // N/A for this type return -1L; } protected final void markLF(int offset) { _rowStartOffset = offset; ++_currRow; } protected final void markLF() { _rowStartOffset = _inputPtr; ++_currRow; } protected final void setStartLocation() { _startRawOffset = _pastBytesOrChars + _inputPtr; _startRow = _currRow; _startColumn = _inputPtr - _rowStartOffset; } /* /********************************************************************** /* Abstract methods for sub-classes to implement /********************************************************************** */ /** * Method called by methods when encountering a byte that * can not be part of a valid character in the current context. * Should return the actual decoded character for error reporting * purposes. */ protected abstract int decodeCharForError(byte b) throws XMLStreamException; protected final PName addPName(int hash, int[] quads, int qlen, int lastQuadBytes) throws XMLStreamException { return addUtfPName(_charTypes, hash, quads, qlen, lastQuadBytes); } /* /********************************************************************** /* And then shared functionality for sub-classes /********************************************************************** */ /** * Conceptually, this method really does NOT belong here. However, * currently it is quite hard to refactor it, so it'll have to * stay here until better place is found */ protected final PName addUtfPName(XmlCharTypes charTypes, int hash, int[] quads, int qlen, int lastQuadBytes) throws XMLStreamException { // 4 bytes per quad, except last one maybe less int byteLen = (qlen << 2) - 4 + lastQuadBytes; /* And last one is not correctly aligned (leading zero bytes instead * need to shift a bit, instead of trailing). Only need to shift it * for UTF-8 decoding; need revert for storage (since key will not * be aligned, to optimize lookup speed) */ int lastQuad; if (lastQuadBytes < 4) { lastQuad = quads[qlen-1]; // 8/16/24 bit left shift quads[qlen-1] = (lastQuad << ((4 - lastQuadBytes) << 3)); } else { lastQuad = 0; } // Let's handle first char separately (different validation): int ch = (quads[0] >>> 24); boolean ok; int ix = 1; char[] cbuf = _nameBuffer; int cix = 0; final int[] TYPES = charTypes.NAME_CHARS; switch (TYPES[ch]) { case XmlCharTypes.CT_NAME_NONE: case XmlCharTypes.CT_NAME_COLON: // not ok as first case XmlCharTypes.CT_NAME_NONFIRST: case InputCharTypes.CT_INPUT_NAME_MB_N: ok = false; break; case XmlCharTypes.CT_NAME_ANY: ok = true; break; default: // multi-byte (UTF-8) chars: { int needed; if ((ch & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF) ch &= 0x1F; needed = 1; } else if ((ch & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF) ch &= 0x0F; needed = 2; } else if ((ch & 0xF8) == 0xF0) { // 4 bytes; double-char with surrogates and all... ch &= 0x07; needed = 3; } else { // 5- and 6-byte chars not valid xml chars reportInvalidInitial(ch); needed = ch = 1; // never really gets this far } if ((ix + needed) > byteLen) { reportEofInName(cbuf, 0); } ix += needed; int q = quads[0]; // Always need at least one more right away: int ch2 = (q >> 16) & 0xFF; if ((ch2 & 0xC0) != 0x080) { reportInvalidOther(ch2); } ch = (ch << 6) | (ch2 & 0x3F); /* And then may need more. Note: here we do not do all the * checks that UTF-8 text decoder might do. Reason is that * name validity checking methods handle most of such checks */ if (needed > 1) { ch2 = (q >> 8) & 0xFF; if ((ch2 & 0xC0) != 0x080) { reportInvalidOther(ch2); } ch = (ch << 6) | (ch2 & 0x3F); if (needed > 2) { // 4 bytes? (need surrogates on output) ch2 = q & 0xFF; if ((ch2 & 0xC0) != 0x080) { reportInvalidOther(ch2 & 0xFF); } ch = (ch << 6) | (ch2 & 0x3F); } } ok = XmlChars.is10NameStartChar(ch); if (needed > 2) { // outside of basic 16-bit range? need surrogates /* so, let's first output first char (high surrogate), * let second be output by later code */ ch -= 0x10000; // to normalize it starting with 0x0 cbuf[cix++] = (char) (0xD800 + (ch >> 10)); ch = (0xDC00 | (ch & 0x03FF)); } } } if (!ok) { // 0 to indicate it's first char, even with surrogates reportInvalidNameChar(ch, 0); } cbuf[cix++] = (char) ch; // the only char, or second (low) surrogate /* Whoa! Tons of code for just the start char. But now we get to * decode the name proper, at last! */ int last_colon = -1; for (; ix < byteLen; ) { ch = quads[ix >> 2]; // current quad, need to shift+mask int byteIx = (ix & 3); ch = (ch >> ((3 - byteIx) << 3)) & 0xFF; ++ix; // Ascii? switch (TYPES[ch]) { case XmlCharTypes.CT_NAME_NONE: case XmlCharTypes.CT_MULTIBYTE_N: ok = false; break; case XmlCharTypes.CT_NAME_COLON: // not ok as first if (last_colon >= 0) { reportMultipleColonsInName(); } last_colon = cix; ok = true; break; case XmlCharTypes.CT_NAME_NONFIRST: case XmlCharTypes.CT_NAME_ANY: ok = true; break; default: { int needed; if ((ch & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF) ch &= 0x1F; needed = 1; } else if ((ch & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF) ch &= 0x0F; needed = 2; } else if ((ch & 0xF8) == 0xF0) { // 4 bytes; double-char with surrogates and all... ch &= 0x07; needed = 3; } else { // 5- and 6-byte chars not valid xml chars reportInvalidInitial(ch); needed = ch = 1; // never really gets this far } if ((ix + needed) > byteLen) { reportEofInName(cbuf, cix); } // Ok, always need at least one more: int ch2 = quads[ix >> 2]; // current quad, need to shift+mask byteIx = (ix & 3); ch2 = (ch2 >> ((3 - byteIx) << 3)); ++ix; if ((ch2 & 0xC0) != 0x080) { reportInvalidOther(ch2); } ch = (ch << 6) | (ch2 & 0x3F); // Once again, some of validation deferred to name char validator if (needed > 1) { ch2 = quads[ix >> 2]; byteIx = (ix & 3); ch2 = (ch2 >> ((3 - byteIx) << 3)); ++ix; if ((ch2 & 0xC0) != 0x080) { reportInvalidOther(ch2); } ch = (ch << 6) | (ch2 & 0x3F); if (needed > 2) { // 4 bytes? (need surrogates on output) ch2 = quads[ix >> 2]; byteIx = (ix & 3); ch2 = (ch2 >> ((3 - byteIx) << 3)); ++ix; if ((ch2 & 0xC0) != 0x080) { reportInvalidOther(ch2 & 0xFF); } ch = (ch << 6) | (ch2 & 0x3F); } } ok = XmlChars.is10NameChar(ch); if (needed > 2) { // surrogate pair? once again, let's output one here, one later on ch -= 0x10000; // to normalize it starting with 0x0 if (cix >= cbuf.length) { _nameBuffer = cbuf = DataUtil.growArrayBy(cbuf, cbuf.length); } cbuf[cix++] = (char) (0xD800 + (ch >> 10)); ch = 0xDC00 | (ch & 0x03FF); } } } if (!ok) { reportInvalidNameChar(ch, cix); } if (cix >= cbuf.length) { _nameBuffer = cbuf = DataUtil.growArrayBy(cbuf, cbuf.length); } cbuf[cix++] = (char) ch; } /* Ok. Now we have the character array, and can construct the * String (as well as check proper composition of semicolons * for ns-aware mode...) */ String baseName = new String(cbuf, 0, cix); // And finally, unalign if necessary if (lastQuadBytes < 4) { quads[qlen-1] = lastQuad; } return _symbols.addSymbol(hash, baseName, last_colon, quads, qlen); } /* /********************************************************************** /* Error reporting /********************************************************************** */ protected void reportInvalidInitial(int mask) throws XMLStreamException { reportInputProblem("Invalid UTF-8 start byte 0x"+Integer.toHexString(mask)); } protected void reportInvalidOther(int mask) throws XMLStreamException { reportInputProblem("Invalid UTF-8 middle byte 0x"+Integer.toHexString(mask)); } }

查找资源

输入类名或文件名

类结构窗口