/* Woodstox Lite ("wool") XML processor
*
* Copyright (c) 2006- Tatu Saloranta, tatu.saloranta@iki.fi
*
* Licensed under the License specified in the file LICENSE which is
* included with the source code.
* You may not use this file except in compliance with the License.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.fasterxml.aalto.in;
import java.io.*;
import javax.xml.stream.
Location;
import javax.xml.stream.
XMLStreamException;
import com.fasterxml.aalto.impl.
IoStreamException;
import com.fasterxml.aalto.impl.
LocationImpl;
import com.fasterxml.aalto.util.
CharsetNames;
/**
* Class that takes care of bootstrapping main document input from
* a byte-oriented input source: usually either an <code>InputStream</code>,
* or a block source like byte array.
*/
public final class
ByteSourceBootstrapper
extends
InputBootstrapper
{
// Raw byte values that the single-byte scanning methods compare input against.

// Zero byte; reported as an error (via reportNull()) when seen in input
private final static byte
BYTE_NULL = (byte) 0;
// Carriage return: first half of a CR+LF pair, or a linefeed on its own
private final static byte
BYTE_CR = (byte) '\r';
// Line feed
private final static byte
BYTE_LF = (byte) '\n';
/*
/**********************************************************************
/* Configuration
/**********************************************************************
*/
/**
* Underlying InputStream to use for reading content. Set to
* <code>null</code> when bootstrapping from a caller-supplied byte array
* (block source); in that case no further input can be loaded beyond
* what the array already holds.
*/
final
InputStream _in;
/*
/**********************************************************************
/* Input buffering
/**********************************************************************
*/
// Buffer that holds the bytes being examined: either obtained from the
// reader configuration (stream source) or handed in by the caller
// (block source).
final byte[]
_inputBuffer;
// Index within the buffer of the next byte to read
private int
_inputPtr;
// Index one past the last valid byte in the buffer; reads are only
// served directly from the buffer while _inputPtr is below this
private int
_inputLen;
/*
/**********************************************************************
/* Data gathered
/**********************************************************************
*/
// Byte order to use when assembling multi-byte characters; irrelevant
// for single-byte encodings
boolean
mBigEndian = true;
// Width of one character in bytes (1, 2 or 4) once determined
int
mBytesPerChar = 0; // 0 means "dunno yet"
// Set by determineStreamEncoding(): true if bytes were skipped as a
// byte-order mark
boolean
mHadBOM = false;
// Set by determineStreamEncoding(): true if the character width was
// actually detected from input, false if it was merely defaulted
boolean
mByteSizeFound = false;
/*
/**********************************************************************
/* Life-cycle
/**********************************************************************
*/
/**
 * Creates a bootstrapper that pulls its bytes from a stream, using a
 * buffer obtained from the reader configuration.
 *
 * @param cfg Reader configuration; also provides the input buffer
 * @param in Stream to read the document from
 */
private ByteSourceBootstrapper(ReaderConfig cfg, InputStream in)
{
    super(cfg);
    _in = in;
    _inputBuffer = cfg.allocFullBBuffer(4000);
    // Nothing has been read yet, so the buffer starts out empty
    _inputPtr = 0;
    _inputLen = 0;
}
private
ByteSourceBootstrapper(
ReaderConfig cfg, byte[]
inputBuffer, int
inputStart, int
inputLen)
{
super(
cfg);
_in = null;
_inputBuffer =
inputBuffer;
_inputPtr =
inputStart;
_inputLen = (
inputStart +
inputLen);
// Need to offset this, to keep location correct
_inputProcessed = -
inputStart;
}
/**
 * Factory method for bootstrapping from a stream.
 *
 * @param cfg Reader configuration to use
 * @param in Stream to read the document from
 * @return Bootstrapper ready to have {@link #bootstrap} called on it
 */
public static ByteSourceBootstrapper construct(ReaderConfig cfg, InputStream in)
    throws XMLStreamException
{
    final ByteSourceBootstrapper bootstrapper = new ByteSourceBootstrapper(cfg, in);
    return bootstrapper;
}
/**
 * Factory method for bootstrapping from a region of a byte array.
 *
 * @param cfg Reader configuration to use
 * @param inputBuffer Array that contains the document
 * @param inputStart Index of the first document byte within the array
 * @param inputLen Number of document bytes, counted from inputStart
 * @return Bootstrapper ready to have {@link #bootstrap} called on it
 */
public static ByteSourceBootstrapper construct(ReaderConfig cfg,
        byte[] inputBuffer, int inputStart, int inputLen)
    throws XMLStreamException
{
    final ByteSourceBootstrapper bootstrapper =
            new ByteSourceBootstrapper(cfg, inputBuffer, inputStart, inputLen);
    return bootstrapper;
}
/**
 * Runs the bootstrapping process and hands back the scanner to use for
 * the rest of the document. I/O failures are converted to
 * {@link IoStreamException}; the keyword buffer is handed back to the
 * configuration whether bootstrapping succeeds or not.
 */
@Override
public final XmlScanner bootstrap()
    throws XMLStreamException
{
    try {
        final XmlScanner scanner = doBootstrap();
        return scanner;
    } catch (IOException ioe) {
        throw new IoStreamException(ioe);
    } finally {
        _config.freeSmallCBuffer(mKeyword);
    }
}
/**
 * Does the actual bootstrapping work: detects the physical encoding,
 * processes the xml declaration if there is one, records the results in
 * the configuration and constructs a scanner suitable for the encoding.
 *
 * @return Scanner positioned right after the BOM and xml declaration
 *    (if any were present)
 *
 * @throws IOException if reading input fails, or the byte pattern at the
 *    start indicates an unsupported encoding
 * @throws XMLStreamException if the xml declaration is malformed,
 *    conflicts with the detected physical encoding, or names an encoding
 *    the JDK can not decode
 */
public XmlScanner doBootstrap()
    throws IOException, XMLStreamException
{
    String normEnc = null;

    determineStreamEncoding();
    if (hasXmlDeclaration()) { // yup, has xml decl:
        readXmlDeclaration();
        if (mFoundEncoding != null) {
            normEnc = verifyXmlEncoding(mFoundEncoding);
        }
    }

    // Now, have we figured out the encoding?
    if (normEnc == null) { // not via xml declaration
        if (mBytesPerChar == 2) { // UTF-16, BE/LE
            normEnc = mBigEndian ? CharsetNames.CS_UTF16BE : CharsetNames.CS_UTF16LE;
        } else if (mBytesPerChar == 4) { // UCS-4... ?
            /* 22-Mar-2005, TSa: JDK apparently has no way of dealing
             * with these encodings... not sure if and how it should
             * be dealt with, really. Name could be UCS-4xx... or
             * perhaps UTF-32xx
             */
            normEnc = mBigEndian ? CharsetNames.CS_UTF32BE : CharsetNames.CS_UTF32LE;
        } else {
            // Ok, default has to be UTF-8, as per XML specs
            normEnc = CharsetNames.CS_UTF8;
        }
    }

    _config.setActualEncoding(normEnc);
    _config.setXmlDeclInfo(mDeclaredXmlVersion, mFoundEncoding, mStandalone);

    /* Names are normalized at this point, so identity comparison against
     * the CharsetNames constants is used from here on.
     */

    // UTF-8 compatible (loosely speaking) ones can use same scanner
    if (normEnc == CharsetNames.CS_UTF8
            || normEnc == CharsetNames.CS_ISO_LATIN1
            || normEnc == CharsetNames.CS_US_ASCII) {
        return new Utf8Scanner(_config, _in, _inputBuffer, _inputPtr, _inputLen);
    } else if (normEnc.startsWith(CharsetNames.CS_UTF32)) {
        /* Since this is such a rare encoding, we'll just create
         * a Reader, and dispatch it to reader scanner?
         */
        // let's augment with actual endianness info
        // NOTE(review): the augmented name is not read again below; the
        // reader is told the byte order directly -- confirm whether the
        // configuration was meant to see it
        if (normEnc == CharsetNames.CS_UTF32) {
            normEnc = mBigEndian ? CharsetNames.CS_UTF32BE : CharsetNames.CS_UTF32LE;
        }
        Reader r = new Utf32Reader(_config, _in, _inputBuffer, _inputPtr, _inputLen,
                mBigEndian);
        return new ReaderScanner(_config, r);
    }

    /* And finally, if all else fails, we'll also fall back to
     * using JDK-provided decoders and ReaderScanner:
     */
    InputStream in = _in;
    // Bytes already buffered but not yet consumed must be replayed first
    if (_inputPtr < _inputLen) {
        in = new MergedStream(_config, in, _inputBuffer, _inputPtr, _inputLen);
    }
    // Generic UTF-16 gets resolved to the byte order that was detected
    if (normEnc == CharsetNames.CS_UTF16) {
        normEnc = mBigEndian ? CharsetNames.CS_UTF16BE : CharsetNames.CS_UTF16LE;
    }
    try {
        Reader r = new InputStreamReader(in, normEnc);
        return new ReaderScanner(_config, r);
    } catch (UnsupportedEncodingException usex) {
        /* Same exception type and message as before, but with the
         * original failure attached as the cause so that its stack
         * trace is not lost.
         */
        IoStreamException wrapped =
                new IoStreamException("Unsupported encoding: "+usex.getMessage());
        wrapped.initCause(usex);
        throw wrapped;
    }
}
/*
/**********************************************************************
/* Internal methods, main xml decl processing
/**********************************************************************
*/
/**
 * Method called to figure out what the physical encoding of the
 * file appears to be. The first four bytes are checked against known
 * byte-order marks (which get skipped when found) and then against the
 * byte patterns that the beginning of an xml declaration would have in
 * the recognized encodings (which are left unconsumed).
 *<p>
 * Results are stored in <code>mBytesPerChar</code>,
 * <code>mBigEndian</code>, <code>mHadBOM</code> and
 * <code>mByteSizeFound</code>; if nothing is recognized, a single-byte
 * encoding is assumed.
 *
 * @throws IOException if reading fails, or the pattern indicates an
 *   unsupported encoding (unusual UCS-4 byte orders, EBCDIC)
 */
private void determineStreamEncoding()
    throws IOException
{
    /* Four bytes are enough to recognize any of the BOMs, as well as
     * the first char(s) of a likely xml declaration:
     */
    if (ensureLoaded(4)) {
        final int startPtr = _inputPtr;
        final int quartet = (_inputBuffer[startPtr] << 24)
            | ((_inputBuffer[startPtr + 1] & 0xFF) << 16)
            | ((_inputBuffer[startPtr + 2] & 0xFF) << 8)
            | (_inputBuffer[startPtr + 3] & 0xFF);

        detection: { // BOM/auto-detection block
            /* Handling of (usually) optional BOM (required for
             * multi-byte formats); first 32-bit charsets:
             */
            switch (quartet) {
            case 0x0000FEFF: // UCS-4, BE
                _inputPtr = startPtr + 4;
                mBytesPerChar = 4;
                mBigEndian = true;
                break detection;
            case 0xFFFE0000: // UCS-4, LE
                _inputPtr = startPtr + 4;
                mBytesPerChar = 4;
                mBigEndian = false;
                break detection;
            case 0x0000FFFE: // UCS-4, unusual byte order
                reportWeirdUCS4("2143");
                break detection;
            case 0xFEFF0000: // UCS-4, unusual byte order
                reportWeirdUCS4("3412");
                break detection;
            }

            // Not a 32-bit BOM; how about a 16-bit one in the first two bytes?
            final int msw = quartet >>> 16;
            if (msw == 0xFEFF || msw == 0xFFFE) {
                _inputPtr = startPtr + 2;
                mBytesPerChar = 2;
                mBigEndian = (msw == 0xFEFF); // FE FF is BE, FF FE is LE
                break detection;
            }

            // And if not, then UTF-8 BOM in the first three bytes?
            if ((quartet >>> 8) == 0xEFBBBF) {
                _inputPtr = startPtr + 3;
                mBytesPerChar = 1;
                mBigEndian = true; // doesn't really matter
                break detection;
            }

            /* No BOM then; how about auto-detection based on '<?xm'
             * (or subset of it, for multi-byte encodings) marker?
             * None of these cases consume any bytes.
             */
            switch (quartet) {
            case 0x0000003c: // UCS-4, BE?
                mBytesPerChar = 4;
                mBigEndian = true;
                break detection;
            case 0x3c000000: // UCS-4, LE?
                mBytesPerChar = 4;
                mBigEndian = false;
                break detection;
            case 0x00003c00: // UCS-4, unusual byte order
                reportWeirdUCS4("2143");
                break detection;
            case 0x003c0000: // UCS-4, unusual byte order
                reportWeirdUCS4("3412");
                break detection;
            case 0x003c003f: // UTF-16, BE
                mBytesPerChar = 2;
                mBigEndian = true;
                break detection;
            case 0x3c003f00: // UTF-16, LE
                mBytesPerChar = 2;
                mBigEndian = false;
                break detection;
            case 0x3c3f786d: // UTF-8, Ascii, ISO-Latin
                mBytesPerChar = 1;
                mBigEndian = true; // doesn't really matter
                break detection;
            case 0x4c6fa794: // EBCDIC, not (yet?) supported...
                reportEBCDIC();
                break detection;
            }
            /* Otherwise it's either single-byte doc without xml
             * declaration, or corrupt input...
             */
        }

        // Pointer only moves above when a BOM was recognized and skipped
        mHadBOM = (_inputPtr > startPtr);
        /* Let's update location markers to ignore BOM when calculating
         * column positions (but not from raw byte offsets)
         */
        _inputRowStart = _inputPtr;
    }

    /* Hmmh. If we haven't figured it out, let's just assume
     * UTF-8 as per XML specs:
     */
    mByteSizeFound = (mBytesPerChar > 0);
    if (!mByteSizeFound) {
        mBytesPerChar = 1;
        mBigEndian = true; // doesn't matter
    }
}
/**
 * Checks whether input continues with the start of an xml declaration:
 * the five characters <code>&lt;?xml</code> followed by a character at
 * or below space. If so, those six characters are consumed; if not, the
 * input pointer is left where it was.
 *
 * @return True if the start of an xml declaration was found (and skipped)
 */
protected boolean hasXmlDeclaration()
    throws IOException, XMLStreamException
{
    // First the common case, 1-byte encoding (Ascii/ISO-Latin/UTF-8):
    if (mBytesPerChar == 1) {
        // Need 6 chars to determine for sure...
        if (!ensureLoaded(6)) {
            return false;
        }
        final byte[] buf = _inputBuffer;
        final int base = _inputPtr;
        final boolean matches = buf[base] == '<'
                && buf[base + 1] == '?'
                && buf[base + 2] == 'x'
                && buf[base + 3] == 'm'
                && buf[base + 4] == 'l'
                && ((buf[base + 5] & 0xFF) <= CHAR_SPACE);
        if (!matches) {
            return false;
        }
        // Let's skip stuff so far:
        _inputPtr = base + 6;
        return true;
    }

    // ... and then for slower fixed-multibyte encodings; 6 chars as well
    if (!ensureLoaded(6 * mBytesPerChar)) {
        return false;
    }
    final int mark = _inputPtr; // to be able to 'unread' chars
    if (nextMultiByte() == '<'
            && nextMultiByte() == '?'
            && nextMultiByte() == 'x'
            && nextMultiByte() == 'm'
            && nextMultiByte() == 'l'
            && nextMultiByte() <= CHAR_SPACE) {
        return true;
    }
    _inputPtr = mark; // no match: push data back
    return false;
}
/**
 * Normalizes the encoding name found in the xml declaration and, for
 * the encodings known here, checks that it agrees with what was
 * detected from the physical byte stream. Names not known here are
 * passed through without checks.
 *
 * @param enc Encoding name as found in the xml declaration
 *
 * @return Normalized encoding name
 */
protected String verifyXmlEncoding(String enc)
    throws XMLStreamException
{
    final String norm = CharsetNames.normalize(enc);

    // Normalized names are compared by identity against the constants
    if (norm == CharsetNames.CS_UTF8
            || norm == CharsetNames.CS_ISO_LATIN1
            || norm == CharsetNames.CS_US_ASCII) {
        // All single-byte from the xml declaration's point of view
        verifyEncoding(norm, 1);
    } else if (norm == CharsetNames.CS_UTF16) {
        // A BOM could arguably be required for knowing the byte order;
        // not enforced for now.
        verifyEncoding(norm, 2);
    } else if (norm == CharsetNames.CS_UTF16LE) {
        verifyEncoding(norm, 2, false);
    } else if (norm == CharsetNames.CS_UTF16BE) {
        verifyEncoding(norm, 2, true);
    } else if (norm == CharsetNames.CS_UTF32) {
        // Likewise, BOM not required here; we can live without it
        verifyEncoding(norm, 4);
    } else if (norm == CharsetNames.CS_UTF32LE) {
        verifyEncoding(norm, 4, false);
    } else if (norm == CharsetNames.CS_UTF32BE) {
        verifyEncoding(norm, 4, true);
    }
    return norm;
}
/*
/**********************************************************************
/* Internal methods, loading input data
/**********************************************************************
*/
/**
 * Makes sure that at least the given number of unread bytes are
 * available in the input buffer, reading more from the stream (appending
 * after existing content) if necessary.
 *
 * @param minimum Number of unread bytes needed
 *
 * @return True if that many bytes are available; false if input ran out
 *   (or there is no stream to read from) before reaching the amount
 */
protected boolean ensureLoaded(int minimum)
    throws IOException
{
    /* Let's assume here buffer has enough room -- this will always
     * be true for the limited use this method gets
     */
    while ((_inputLen - _inputPtr) < minimum) {
        if (_in == null) { // block source, nothing more to get
            return false;
        }
        final int got = _in.read(_inputBuffer, _inputLen,
                _inputBuffer.length - _inputLen);
        if (got < 1) {
            return false;
        }
        _inputLen += got;
    }
    return true;
}
/**
 * Replaces the contents of the input buffer with the next chunk from
 * the stream, adjusting location bookkeeping for the bytes that get
 * discarded. Running out of input (always the case for a block source)
 * is reported through <code>reportEof()</code>.
 */
protected void loadMore()
    throws IOException, XMLStreamException
{
    // Account for the buffer being dropped before refilling from index 0
    final int consumed = _inputLen;
    _inputProcessed += consumed;
    _inputRowStart -= consumed;
    _inputPtr = 0;

    // A block source has no stream behind it: treat as end-of-input
    _inputLen = (_in == null)
            ? -1
            : _in.read(_inputBuffer, 0, _inputBuffer.length);
    if (_inputLen < 1) {
        reportEof();
    }
}
/*
/**********************************************************************
/* Implementations of abstract parsing methods
/**********************************************************************
*/
/**
 * Un-reads the most recently read character by moving the input
 * pointer back by one character's worth of bytes.
 */
// NOTE(review): presumably relies on the whole character being in the
// current buffer contents -- confirm for multi-byte input split by a reload
@Override
protected void pushback() {
    _inputPtr = _inputPtr - mBytesPerChar;
}
/**
 * Reads the next character from input, using the detected character
 * width.
 *
 * @return Character code; for single-byte encodings the unsigned value
 *   of the byte
 */
@Override
protected int getNext()
    throws IOException, XMLStreamException
{
    if (mBytesPerChar > 1) {
        return nextMultiByte();
    }
    // Fast path when the byte is already buffered
    if (_inputPtr < _inputLen) {
        return _inputBuffer[_inputPtr++] & 0xFF;
    }
    return nextByte() & 0xFF;
}
/**
 * Skips any white space and returns the first character following it.
 *
 * @param reqWs If true, at least one white space character must be
 *   present; a problem is reported if there is none
 *
 * @return First character after the skipped white space
 */
@Override
protected int getNextAfterWs(boolean reqWs)
    throws IOException, XMLStreamException
{
    final int skipped = (mBytesPerChar > 1) ? skipMbWs() : skipSbWs();

    if (reqWs && skipped == 0) {
        reportUnexpectedChar(getNext(), ERR_XMLDECL_EXP_SPACE);
    }
    return getNext();
}
/**
 * Matches input against the rest of the given keyword, delegating to
 * the single- or multi-byte variant depending on the character width.
 *
 * @param exp Keyword expected
 *
 * @return First character that does not match expected, if any;
 *    CHAR_NULL if match succeeded
 */
@Override
protected int checkKeyword(String exp)
    throws IOException, XMLStreamException
{
    return (mBytesPerChar > 1) ? checkMbKeyword(exp) : checkSbKeyword(exp);
}
/**
 * Reads characters into the given buffer until the closing quote
 * character is seen. Linefeeds (CR, LF, or CR+LF) are each stored as a
 * single LF.
 *
 * @param kw Buffer to store the characters of the value in
 * @param quoteChar Character that ends the value
 *
 * @return Number of characters stored, if the closing quote was seen
 *   before the buffer filled up; -1 otherwise
 */
@Override
protected int readQuotedValue(char[] kw, int quoteChar)
    throws IOException, XMLStreamException
{
    final int capacity = kw.length;
    final boolean multiByte = (mBytesPerChar > 1);

    for (int filled = 0; filled < capacity; ++filled) {
        int c;

        if (multiByte) {
            c = nextMultiByte();
            if (c == CHAR_CR || c == CHAR_LF) {
                skipMbLF(c);
                c = CHAR_LF;
            }
        } else {
            byte b = (_inputPtr < _inputLen) ? _inputBuffer[_inputPtr++] : nextByte();
            if (b == BYTE_NULL) {
                reportNull();
            }
            if (b == BYTE_CR || b == BYTE_LF) {
                skipSbLF(b);
                b = BYTE_LF;
            }
            c = (b & 0xFF);
        }

        if (c == quoteChar) {
            return filled;
        }
        kw[filled] = (char) c;
    }
    /* If we end up this far, we ran out of buffer space... let's let
     * caller figure that out, though
     */
    return -1;
}
/**
 * Constructs a location object for the current input position, from
 * zero-based offset, row and column values.
 */
@Override
protected Location getLocation()
{
    int charOffset = _inputProcessed + _inputPtr;
    int column = _inputPtr - _inputRowStart;

    /* Positions above are in bytes; for fixed-size multi-byte encodings
     * they need to be divided to get character positions. For
     * variable-length encodings the good thing is that xml declaration
     * only uses shortest codepoints, ie. char count == byte count.
     */
    final int width = mBytesPerChar;
    if (width > 1) {
        charOffset /= width;
        column /= width;
    }
    return LocationImpl.fromZeroBased(_config.getPublicId(), _config.getSystemId(),
            charOffset, _inputRow, column);
}
/*
/**********************************************************************
/* Internal methods, single-byte access methods
/**********************************************************************
*/
/**
 * Returns the next byte of input, loading more input first if the
 * buffer has been exhausted.
 */
protected byte nextByte()
    throws IOException, XMLStreamException
{
    if (_inputPtr >= _inputLen) {
        loadMore();
    }
    final byte next = _inputBuffer[_inputPtr];
    ++_inputPtr;
    return next;
}
/**
 * Skips over bytes with an unsigned value at or below space (single-byte
 * encodings), updating row information for linefeeds and reporting
 * null bytes. The first byte above space is left unread.
 *
 * @return Number of white space units skipped; a CR+LF pair counts as one
 */
protected int skipSbWs()
    throws IOException, XMLStreamException
{
    int skipped = 0;

    for (;;) {
        final byte b = (_inputPtr < _inputLen) ? _inputBuffer[_inputPtr++] : nextByte();
        if ((b & 0xFF) > CHAR_SPACE) {
            // Not white space: un-read it and stop
            --_inputPtr;
            return skipped;
        }
        switch (b) {
        case BYTE_CR:
        case BYTE_LF:
            skipSbLF(b);
            break;
        case BYTE_NULL:
            reportNull();
            break;
        default:
            break;
        }
        ++skipped;
    }
}
/**
 * Completes handling of a linefeed in single-byte input: if the byte
 * was a CR, a directly following LF is consumed as part of the same
 * linefeed; row number and row start position are then updated.
 *
 * @param lfByte Byte (CR or LF) that was just read
 */
protected void skipSbLF(byte lfByte)
    throws IOException, XMLStreamException
{
    if (lfByte == BYTE_CR) {
        final byte following =
                (_inputPtr < _inputLen) ? _inputBuffer[_inputPtr++] : nextByte();
        if (following != BYTE_LF) {
            --_inputPtr; // pushback if not 2-char/byte lf
        }
    }
    _inputRow = _inputRow + 1;
    _inputRowStart = _inputPtr;
}
/**
 * Matches single-byte input against the given keyword, starting from
 * the keyword's second character (index 1); the character at index 0
 * is not compared here. Null bytes are reported.
 *
 * @param expected Keyword to match
 *
 * @return First character that does not match expected, if any;
 *    CHAR_NULL if match succeeded
 */
protected int checkSbKeyword(String expected)
    throws IOException, XMLStreamException
{
    final int len = expected.length();
    int ix = 1;

    while (ix < len) {
        final byte b = (_inputPtr < _inputLen) ? _inputBuffer[_inputPtr++] : nextByte();
        if (b == BYTE_NULL) {
            reportNull();
        }
        final int ch = (b & 0xFF);
        if (ch != expected.charAt(ix)) {
            return ch;
        }
        ++ix;
    }
    return CHAR_NULL;
}
/*
/**********************************************************************
/* Internal methods, multi-byte access/checks
/**********************************************************************
*/
/**
 * Reads the next character of a fixed-width multi-byte encoding: two
 * bytes if the character width is 2, four bytes otherwise, combined
 * using the detected byte order. A resulting value of zero is reported
 * as a null character.
 *
 * @return Character code assembled from the bytes read
 */
protected int nextMultiByte()
    throws IOException, XMLStreamException
{
    // Bytes are masked to unsigned values right away, in input order
    final int first =
            ((_inputPtr < _inputLen) ? _inputBuffer[_inputPtr++] : nextByte()) & 0xFF;
    final int second =
            ((_inputPtr < _inputLen) ? _inputBuffer[_inputPtr++] : nextByte()) & 0xFF;
    final int c;

    if (mBytesPerChar == 2) {
        c = mBigEndian
                ? ((first << 8) | second)
                : (first | (second << 8));
    } else {
        // Has to be 4 bytes
        final int third =
                ((_inputPtr < _inputLen) ? _inputBuffer[_inputPtr++] : nextByte()) & 0xFF;
        final int fourth =
                ((_inputPtr < _inputLen) ? _inputBuffer[_inputPtr++] : nextByte()) & 0xFF;
        c = mBigEndian
                ? ((first << 24) | (second << 16) | (third << 8) | fourth)
                : ((fourth << 24) | (third << 16) | (second << 8) | first);
    }

    // Let's catch null chars early
    if (c == 0) {
        reportNull();
    }
    return c;
}
/**
 * Skips over characters at or below space in multi-byte input, updating
 * row information for linefeeds. The first character above space is
 * left unread.
 *
 * @return Number of white space units skipped; a CR+LF pair counts as one
 */
protected int skipMbWs()
    throws IOException, XMLStreamException
{
    int skipped = 0;
    int c;

    while ((c = nextMultiByte()) <= CHAR_SPACE) {
        if (c == CHAR_CR || c == CHAR_LF) {
            skipMbLF(c);
        } else if (c == CHAR_NULL) {
            reportNull();
        }
        ++skipped;
    }
    // Loop ended on a non-space character: un-read it
    _inputPtr -= mBytesPerChar;
    return skipped;
}
/**
 * Completes handling of a linefeed in multi-byte input: if the
 * character was a CR, a directly following LF is consumed as part of
 * the same linefeed; row number and row start position are then updated.
 *
 * @param lf Character (CR or LF) that was just read
 */
protected void skipMbLF(int lf)
    throws IOException, XMLStreamException
{
    if (lf == CHAR_CR) {
        final int following = nextMultiByte();
        if (following != CHAR_LF) {
            // Not a 2-char linefeed: un-read the character
            _inputPtr -= mBytesPerChar;
        }
    }
    _inputRow = _inputRow + 1;
    _inputRowStart = _inputPtr;
}
/**
 * Matches multi-byte input against the given keyword, starting from
 * the keyword's second character (index 1); the character at index 0
 * is not compared here.
 *
 * @param expected Keyword to match
 *
 * @return First character that does not match expected, if any;
 *    CHAR_NULL if match succeeded
 */
protected int checkMbKeyword(String expected)
    throws IOException, XMLStreamException
{
    final int len = expected.length();
    int ix = 1;

    while (ix < len) {
        final int c = nextMultiByte();
        // Zero check mirrors the single-byte variant
        if (c == BYTE_NULL) {
            reportNull();
        }
        if (c != expected.charAt(ix)) {
            return c;
        }
        ++ix;
    }
    return CHAR_NULL;
}
/*
/**********************************************************************
/* Other private methods:
/**********************************************************************
*/
/**
 * Checks that the character width of the declared encoding matches the
 * width detected from the byte stream; nothing is checked if no width
 * was actually detected.
 *
 * @param id Name of the declared encoding (used in the error message)
 * @param bpc Bytes per character the declared encoding uses
 */
private void verifyEncoding(String id, int bpc)
    throws XMLStreamException
{
    /* Let's verify that if we matched an encoding, it's the same
     * as what was declared...
     */
    if (mByteSizeFound && bpc != mBytesPerChar) {
        final String msg = "Declared encoding '"+id+"' uses "+bpc
            +" bytes per character; but physical encoding appeared to use "+mBytesPerChar+"; cannot decode";
        reportXmlProblem(msg);
    }
}
/**
 * Checks that both the character width and the byte order of the
 * declared encoding match what was detected from the byte stream;
 * nothing is checked if no width was actually detected.
 *
 * @param id Name of the declared encoding (used in error messages)
 * @param bpc Bytes per character the declared encoding uses
 * @param bigEndian Whether the declared encoding is big-endian
 */
private void verifyEncoding(String id, int bpc, boolean bigEndian)
    throws XMLStreamException
{
    if (!mByteSizeFound) {
        return;
    }
    verifyEncoding(id, bpc);

    if (bigEndian == mBigEndian) {
        return;
    }
    final String bigStr = bigEndian ? "big" : "little";
    final String msg = "Declared encoding '"+id+"' has different endianness ("
        +bigStr+" endian) than what physical ordering appeared to be; cannot decode";
    reportXmlProblem(msg);
}
/**
 * Reports a 4-byte byte order other than plain big- or little-endian.
 *
 * @param type Byte order detected, as a sequence of byte positions
 *
 * @throws CharConversionException always
 */
private void reportWeirdUCS4(String type)
    throws IOException
{
    final String msg = "Unsupported UCS-4 endianness ("+type+") detected";
    throw new CharConversionException(msg);
}
/**
 * Reports that input appears to be EBCDIC-encoded, which is not supported.
 *
 * @throws CharConversionException always
 */
private void reportEBCDIC()
    throws IOException
{
    final String msg = "Unsupported encoding (EBCDIC)";
    throw new CharConversionException(msg);
}
}