/*
* Copyright (c) 2003, PostgreSQL Global Development Group
* See the LICENSE file in the project root for more information.
*/
package org.postgresql.core;
import java.io.
IOException;
import java.io.
InputStream;
import java.io.
InputStreamReader;
import java.io.
OutputStream;
import java.io.
OutputStreamWriter;
import java.io.
Reader;
import java.io.
Writer;
import java.nio.charset.
Charset;
import java.util.
HashMap;
import java.util.logging.
Level;
import java.util.logging.
Logger;
/**
 * Representation of a particular character encoding.
 *
 * <p>An instance wraps the name of a JVM charset and uses it for converting between
 * {@code String}s and byte arrays, and for creating decoding readers / encoding writers.
 * Instances are obtained through {@link #getJVMEncoding(String)},
 * {@link #getDatabaseEncoding(String)} or {@link #defaultEncoding()}.</p>
 */
public class Encoding {

  // NOTE: the order of the static fields matters. The Encoding constructor logs through LOGGER,
  // so LOGGER has to be initialized before the Encoding constants below are created.
  private static final Logger LOGGER = Logger.getLogger(Encoding.class.getName());

  private static final Encoding DEFAULT_ENCODING = new Encoding();
  private static final Encoding UTF8_ENCODING = new Encoding("UTF-8");

  /*
   * Preferred JVM encodings for backend encodings. Each backend encoding name maps to the JVM
   * charset names to try, in order of preference; an empty array means no candidate is known.
   */
  private static final HashMap<String, String[]> encodings = new HashMap<String, String[]>();

  static {
    //Note: this list should match the set of supported server
    // encodings found in backend/util/mb/encnames.c
    encodings.put("SQL_ASCII", new String[]{"ASCII", "US-ASCII"});
    encodings.put("UNICODE", new String[]{"UTF-8", "UTF8"});
    encodings.put("UTF8", new String[]{"UTF-8", "UTF8"});
    encodings.put("LATIN1", new String[]{"ISO8859_1"});
    encodings.put("LATIN2", new String[]{"ISO8859_2"});
    encodings.put("LATIN3", new String[]{"ISO8859_3"});
    encodings.put("LATIN4", new String[]{"ISO8859_4"});
    encodings.put("ISO_8859_5", new String[]{"ISO8859_5"});
    encodings.put("ISO_8859_6", new String[]{"ISO8859_6"});
    encodings.put("ISO_8859_7", new String[]{"ISO8859_7"});
    encodings.put("ISO_8859_8", new String[]{"ISO8859_8"});
    encodings.put("LATIN5", new String[]{"ISO8859_9"});
    encodings.put("LATIN7", new String[]{"ISO8859_13"});
    encodings.put("LATIN9", new String[]{"ISO8859_15_FDIS"});
    encodings.put("EUC_JP", new String[]{"EUC_JP"});
    encodings.put("EUC_CN", new String[]{"EUC_CN"});
    encodings.put("EUC_KR", new String[]{"EUC_KR"});
    encodings.put("JOHAB", new String[]{"Johab"});
    encodings.put("EUC_TW", new String[]{"EUC_TW"});
    encodings.put("SJIS", new String[]{"MS932", "SJIS"});
    encodings.put("BIG5", new String[]{"Big5", "MS950", "Cp950"});
    encodings.put("GBK", new String[]{"GBK", "MS936"});
    encodings.put("UHC", new String[]{"MS949", "Cp949", "Cp949C"});
    encodings.put("TCVN", new String[]{"Cp1258"});
    encodings.put("WIN1256", new String[]{"Cp1256"});
    encodings.put("WIN1250", new String[]{"Cp1250"});
    encodings.put("WIN874", new String[]{"MS874", "Cp874"});
    encodings.put("WIN", new String[]{"Cp1251"});
    encodings.put("ALT", new String[]{"Cp866"});
    // We prefer KOI8-U, since it is a superset of KOI8-R.
    encodings.put("KOI8", new String[]{"KOI8_U", "KOI8_R"});
    // If the database isn't encoding-aware then we can't have
    // any preferred encodings.
    encodings.put("UNKNOWN", new String[0]);
    // The following encodings do not have a java equivalent
    encodings.put("MULE_INTERNAL", new String[0]);
    encodings.put("LATIN6", new String[0]);
    encodings.put("LATIN8", new String[0]);
    encodings.put("LATIN10", new String[0]);
  }

  /** Name of the JVM charset used by this instance; never null. */
  private final String encoding;

  /** Result of the ASCII-compatibility probe performed once at construction time. */
  private final boolean fastASCIINumbers;

  /**
   * Uses the default charset of the JVM.
   */
  private Encoding() {
    this(Charset.defaultCharset().name());
  }

  /**
   * Use the charset passed as parameter.
   *
   * @param encoding charset name to use
   * @throws NullPointerException if {@code encoding} is null
   */
  protected Encoding(String encoding) {
    if (encoding == null) {
      throw new NullPointerException("Null encoding charset not supported");
    }
    this.encoding = encoding;
    fastASCIINumbers = testAsciiNumbers();
    if (LOGGER.isLoggable(Level.FINEST)) {
      LOGGER.log(Level.FINEST, "Creating new Encoding {0} with fastASCIINumbers {1}",
          new Object[]{encoding, fastASCIINumbers});
    }
  }

  /**
   * Returns true if this encoding has characters '-' and '0'..'9' in exactly same position as
   * ascii.
   *
   * @return true if the bytes can be scanned directly for ascii numbers.
   */
  public boolean hasAsciiNumbers() {
    return fastASCIINumbers;
  }

  /**
   * Construct an Encoding for a given JVM encoding.
   *
   * <p>The exact name {@code "UTF-8"} yields a new {@code UTF8Encoding}. Any other name yields a
   * plain {@code Encoding} when the JVM supports the charset. A null, syntactically illegal or
   * unsupported name yields the default JVM encoding instead of an exception.</p>
   *
   * @param jvmEncoding the name of the JVM encoding
   * @return an Encoding instance for the specified encoding, or an Encoding instance for the
   *         default JVM encoding if the specified encoding is unavailable.
   */
  public static Encoding getJVMEncoding(String jvmEncoding) {
    if ("UTF-8".equals(jvmEncoding)) {
      return new UTF8Encoding(jvmEncoding);
    }
    if (isCharsetSupported(jvmEncoding)) {
      return new Encoding(jvmEncoding);
    }
    return DEFAULT_ENCODING;
  }

  /**
   * Construct an Encoding for a given database encoding.
   *
   * <p>A null or syntactically illegal name is treated like an unknown encoding: the default JVM
   * encoding is returned rather than an exception being thrown.</p>
   *
   * @param databaseEncoding the name of the database encoding
   * @return an Encoding instance for the specified encoding, or an Encoding instance for the
   *         default JVM encoding if the specified encoding is unavailable.
   */
  public static Encoding getDatabaseEncoding(String databaseEncoding) {
    if ("UTF8".equals(databaseEncoding)) {
      return UTF8_ENCODING;
    }

    // If the backend encoding is known and there is a suitable
    // encoding in the JVM we use that. Otherwise we fall back
    // to the default encoding of the JVM.
    String[] candidates = encodings.get(databaseEncoding);
    if (candidates != null) {
      for (String candidate : candidates) {
        LOGGER.log(Level.FINEST, "Search encoding candidate {0}", candidate);
        if (isCharsetSupported(candidate)) {
          return new Encoding(candidate);
        }
      }
    }

    // Try the encoding name directly -- maybe the charset has been
    // provided by the user.
    if (isCharsetSupported(databaseEncoding)) {
      return new Encoding(databaseEncoding);
    }

    // Fall back to default JVM encoding.
    LOGGER.log(Level.FINEST, "{0} encoding not found, returning default encoding",
        databaseEncoding);
    return DEFAULT_ENCODING;
  }

  /**
   * Get the name of the (JVM) encoding used.
   *
   * @return the canonical charset name if the JVM supports the charset, otherwise the name this
   *         instance was created with.
   */
  public String name() {
    return isCharsetSupported(encoding) ? Charset.forName(encoding).name() : encoding;
  }

  /**
   * Encode a string to an array of bytes.
   *
   * @param s the string to encode
   * @return a bytearray containing the encoded string, or null if {@code s} is null
   * @throws IOException if something goes wrong
   */
  public byte[] encode(String s) throws IOException {
    if (s == null) {
      return null;
    }
    return s.getBytes(encoding);
  }

  /**
   * Decode an array of bytes into a string.
   *
   * @param encodedString a byte array containing the string to decode
   * @param offset the offset in <code>encodedString</code> of the first byte of the encoded
   *        representation
   * @param length the length, in bytes, of the encoded representation
   * @return the decoded string
   * @throws IOException if something goes wrong
   */
  public String decode(byte[] encodedString, int offset, int length) throws IOException {
    return new String(encodedString, offset, length, encoding);
  }

  /**
   * Decode an array of bytes into a string.
   *
   * @param encodedString a byte array containing the string to decode; must not be null
   * @return the decoded string
   * @throws IOException if something goes wrong
   */
  public String decode(byte[] encodedString) throws IOException {
    return decode(encodedString, 0, encodedString.length);
  }

  /**
   * Get a Reader that decodes the given InputStream using this encoding.
   *
   * @param in the underlying stream to decode from
   * @return a non-null Reader implementation.
   * @throws IOException if something goes wrong
   */
  public Reader getDecodingReader(InputStream in) throws IOException {
    return new InputStreamReader(in, encoding);
  }

  /**
   * Get a Writer that encodes to the given OutputStream using this encoding.
   *
   * @param out the underlying stream to encode to
   * @return a non-null Writer implementation.
   * @throws IOException if something goes wrong
   */
  public Writer getEncodingWriter(OutputStream out) throws IOException {
    return new OutputStreamWriter(out, encoding);
  }

  /**
   * Get an Encoding using the default encoding for the JVM.
   *
   * @return an Encoding instance
   */
  public static Encoding defaultEncoding() {
    return DEFAULT_ENCODING;
  }

  /**
   * Returns the charset name this instance was created with.
   *
   * @return the raw (non-canonicalized) charset name
   */
  @Override
  public String toString() {
    return encoding;
  }

  /**
   * Checks whether this encoding is compatible with ASCII for the number characters '-' and
   * '0'..'9'. Where compatible means that they are encoded with exactly same values.
   *
   * <p>This runs from the constructor, so it converts through {@link String#getBytes(String)}
   * directly instead of the overridable {@link #encode(String)}; a subclass override would
   * otherwise be invoked before the subclass has been initialized.</p>
   *
   * @return If faster ASCII number parsing can be used with this encoding.
   */
  private boolean testAsciiNumbers() {
    // TODO: test all postgres supported encoding to see if there are
    // any which do _not_ have ascii numbers in same location
    // at least all the encoding listed in the encodings hashmap have
    // working ascii numbers
    try {
      String test = "-0123456789";
      byte[] bytes = test.getBytes(encoding);
      String res = new String(bytes, "US-ASCII");
      return test.equals(res);
    } catch (java.io.UnsupportedEncodingException e) {
      // The charset cannot be used at all, so the fast path is certainly not available.
      return false;
    }
  }

  /**
   * Null- and syntax-safe variant of {@link Charset#isSupported(String)}.
   *
   * @param charsetName the charset name to probe, may be null
   * @return true only if the name is non-null, legal, and supported by this JVM
   */
  private static boolean isCharsetSupported(String charsetName) {
    if (charsetName == null) {
      return false;
    }
    try {
      return Charset.isSupported(charsetName);
    } catch (IllegalArgumentException e) {
      // Charset.isSupported throws IllegalCharsetNameException (an IllegalArgumentException)
      // for syntactically illegal names such as "". Such a name is simply "not supported".
      return false;
    }
  }
}