/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package groovy.util;
import java.io.
BufferedReader;
import java.io.
File;
import java.io.
FileInputStream;
import java.io.
FileNotFoundException;
import java.io.
IOException;
import java.io.
InputStream;
import java.io.
InputStreamReader;
import java.io.
LineNumberReader;
import java.nio.charset.
Charset;
import java.util.
Collection;
/**
* Utility class to guess the encoding of a given text file.
* <p>
* Unicode files encoded in UTF-16 (low or big endian) or UTF-8 files
* with a Byte Order Marker are correctly discovered. For UTF-8 files with no BOM, if the buffer
* is wide enough, the charset should also be discovered.
* <p>
* A byte buffer of 4KB is used to be able to guess the encoding.
* <p>
* Usage:
* <pre>
* CharsetToolkit toolkit = new CharsetToolkit(file);
*
* // guess the encoding
* Charset guessedCharset = toolkit.getCharset();
*
* // create a reader with the correct charset
* BufferedReader reader = toolkit.getReader();
*
* // read the file content
* String line;
* while ((line = br.readLine())!= null)
* {
* System.out.println(line);
* }
* </pre>
*
* @author Guillaume Laforge
*/
public class
CharsetToolkit {
private final byte[]
buffer;
private
Charset defaultCharset;
private
Charset charset;
private boolean
enforce8Bit = true;
private final
File file;
private static final byte[]
EMPTY_BYTE_ARRAY = new byte[0];
/**
* Constructor of the <code>CharsetToolkit</code> utility class.
*
* @param file of which we want to know the encoding.
*/
public
CharsetToolkit(
File file) throws
IOException {
this.
file =
file;
this.
defaultCharset =
getDefaultSystemCharset();
this.
charset = null;
InputStream input = new
FileInputStream(
file);
try {
byte[]
bytes = new byte[4096];
int
bytesRead =
input.
read(
bytes);
if (
bytesRead == -1) {
this.
buffer =
EMPTY_BYTE_ARRAY;
}
else if (
bytesRead < 4096) {
byte[]
bytesToGuess = new byte[
bytesRead];
System.
arraycopy(
bytes, 0,
bytesToGuess, 0,
bytesRead);
this.
buffer =
bytesToGuess;
}
else {
this.
buffer =
bytes;
}
} finally {
try {
input.
close();} catch (
IOException e){
// IGNORE
}
}
}
/**
* Defines the default <code>Charset</code> used in case the buffer represents
* an 8-bit <code>Charset</code>.
*
* @param defaultCharset the default <code>Charset</code> to be returned
* if an 8-bit <code>Charset</code> is encountered.
*/
public void
setDefaultCharset(
Charset defaultCharset) {
if (
defaultCharset != null)
this.
defaultCharset =
defaultCharset;
else
this.
defaultCharset =
getDefaultSystemCharset();
}
public
Charset getCharset() {
if (this.
charset == null)
this.
charset =
guessEncoding();
return
charset;
}
/**
* If US-ASCII is recognized, enforce to return the default encoding, rather than US-ASCII.
* It might be a file without any special character in the range 128-255, but that may be or become
* a file encoded with the default <code>charset</code> rather than US-ASCII.
*
* @param enforce a boolean specifying the use or not of US-ASCII.
*/
public void
setEnforce8Bit(boolean
enforce) {
this.
enforce8Bit =
enforce;
}
/**
* Gets the enforce8Bit flag, in case we do not want to ever get a US-ASCII encoding.
*
* @return a boolean representing the flag of use of US-ASCII.
*/
public boolean
getEnforce8Bit() {
return this.
enforce8Bit;
}
/**
* Retrieves the default Charset
*/
public
Charset getDefaultCharset() {
return
defaultCharset;
}
/**
* Guess the encoding of the provided buffer.
* If Byte Order Markers are encountered at the beginning of the buffer, we immediately
* return the charset implied by this BOM. Otherwise, the file would not be a human
* readable text file.
* <p>
* If there is no BOM, this method tries to discern whether the file is UTF-8 or not.
* If it is not UTF-8, we assume the encoding is the default system encoding
* (of course, it might be any 8-bit charset, but usually, an 8-bit charset is the default one).
* <p>
* It is possible to discern UTF-8 thanks to the pattern of characters with a multi-byte sequence.
* <pre>
* UCS-4 range (hex.) UTF-8 octet sequence (binary)
* 0000 0000-0000 007F 0xxxxxxx
* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
* 0001 0000-001F FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
* 0020 0000-03FF FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
* 0400 0000-7FFF FFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
* </pre>
* With UTF-8, 0xFE and 0xFF never appear.
*
* @return the Charset recognized.
*/
private
Charset guessEncoding() {
// if the file has a Byte Order Marker, we can assume the file is in UTF-xx
// otherwise, the file would not be human readable
if (
hasUTF8Bom())
return
Charset.
forName("UTF-8");
if (
hasUTF16LEBom())
return
Charset.
forName("UTF-16LE");
if (
hasUTF16BEBom())
return
Charset.
forName("UTF-16BE");
// if a byte has its most significant bit set, the file is in UTF-8 or in the default encoding
// otherwise, the file is in US-ASCII
boolean
highOrderBit = false;
// if the file is in UTF-8, high order bytes must have a certain value, in order to be valid
// if it's not the case, we can assume the encoding is the default encoding of the system
boolean
validU8Char = true;
// TODO the buffer is not read up to the end, but up to length - 6
int
length =
buffer.length;
int
i = 0;
while (
i <
length - 6) {
byte
b0 =
buffer[
i];
byte
b1 =
buffer[
i + 1];
byte
b2 =
buffer[
i + 2];
byte
b3 =
buffer[
i + 3];
byte
b4 =
buffer[
i + 4];
byte
b5 =
buffer[
i + 5];
if (
b0 < 0) {
// a high order bit was encountered, thus the encoding is not US-ASCII
// it may be either an 8-bit encoding or UTF-8
highOrderBit = true;
// a two-bytes sequence was encountered
if (
isTwoBytesSequence(
b0)) {
// there must be one continuation byte of the form 10xxxxxx,
// otherwise the following character is is not a valid UTF-8 construct
if (!
isContinuationChar(
b1))
validU8Char = false;
else
i++;
}
// a three-bytes sequence was encountered
else if (
isThreeBytesSequence(
b0)) {
// there must be two continuation bytes of the form 10xxxxxx,
// otherwise the following character is is not a valid UTF-8 construct
if (!(
isContinuationChar(
b1) &&
isContinuationChar(
b2)))
validU8Char = false;
else
i += 2;
}
// a four-bytes sequence was encountered
else if (
isFourBytesSequence(
b0)) {
// there must be three continuation bytes of the form 10xxxxxx,
// otherwise the following character is is not a valid UTF-8 construct
if (!(
isContinuationChar(
b1) &&
isContinuationChar(
b2) &&
isContinuationChar(
b3)))
validU8Char = false;
else
i += 3;
}
// a five-bytes sequence was encountered
else if (
isFiveBytesSequence(
b0)) {
// there must be four continuation bytes of the form 10xxxxxx,
// otherwise the following character is is not a valid UTF-8 construct
if (!(
isContinuationChar(
b1)
&&
isContinuationChar(
b2)
&&
isContinuationChar(
b3)
&&
isContinuationChar(
b4)))
validU8Char = false;
else
i += 4;
}
// a six-bytes sequence was encountered
else if (
isSixBytesSequence(
b0)) {
// there must be five continuation bytes of the form 10xxxxxx,
// otherwise the following character is is not a valid UTF-8 construct
if (!(
isContinuationChar(
b1)
&&
isContinuationChar(
b2)
&&
isContinuationChar(
b3)
&&
isContinuationChar(
b4)
&&
isContinuationChar(
b5)))
validU8Char = false;
else
i += 5;
}
else
validU8Char = false;
}
if (!
validU8Char)
break;
i++;
}
// if no byte with an high order bit set, the encoding is US-ASCII
// (it might have been UTF-7, but this encoding is usually internally used only by mail systems)
if (!
highOrderBit) {
// returns the default charset rather than US-ASCII if the enforce8Bit flag is set.
if (this.
enforce8Bit)
return this.
defaultCharset;
else
return
Charset.
forName("US-ASCII");
}
// if no invalid UTF-8 were encountered, we can assume the encoding is UTF-8,
// otherwise the file would not be human readable
if (
validU8Char)
return
Charset.
forName("UTF-8");
// finally, if it's not UTF-8 nor US-ASCII, let's assume the encoding is the default encoding
return this.
defaultCharset;
}
/**
* If the byte has the form 10xxxxx, then it's a continuation byte of a multiple byte character;
*
* @param b a byte.
* @return true if it's a continuation char.
*/
private static boolean
isContinuationChar(byte
b) {
return -128 <=
b &&
b <= -65;
}
/**
* If the byte has the form 110xxxx, then it's the first byte of a two-bytes sequence character.
*
* @param b a byte.
* @return true if it's the first byte of a two-bytes sequence.
*/
private static boolean
isTwoBytesSequence(byte
b) {
return -64 <=
b &&
b <= -33;
}
/**
* If the byte has the form 1110xxx, then it's the first byte of a three-bytes sequence character.
*
* @param b a byte.
* @return true if it's the first byte of a three-bytes sequence.
*/
private static boolean
isThreeBytesSequence(byte
b) {
return -32 <=
b &&
b <= -17;
}
/**
* If the byte has the form 11110xx, then it's the first byte of a four-bytes sequence character.
*
* @param b a byte.
* @return true if it's the first byte of a four-bytes sequence.
*/
private static boolean
isFourBytesSequence(byte
b) {
return -16 <=
b &&
b <= -9;
}
/**
* If the byte has the form 11110xx, then it's the first byte of a five-bytes sequence character.
*
* @param b a byte.
* @return true if it's the first byte of a five-bytes sequence.
*/
private static boolean
isFiveBytesSequence(byte
b) {
return -8 <=
b &&
b <= -5;
}
/**
* If the byte has the form 1110xxx, then it's the first byte of a six-bytes sequence character.
*
* @param b a byte.
* @return true if it's the first byte of a six-bytes sequence.
*/
private static boolean
isSixBytesSequence(byte
b) {
return -4 <=
b &&
b <= -3;
}
/**
* Retrieve the default charset of the system.
*
* @return the default <code>Charset</code>.
*/
public static
Charset getDefaultSystemCharset() {
return
Charset.
forName(
System.
getProperty("file.encoding"));
}
/**
* Has a Byte Order Marker for UTF-8 (Used by Microsoft's Notepad and other editors).
*
* @return true if the buffer has a BOM for UTF8.
*/
public boolean
hasUTF8Bom() {
if (
buffer.length >= 3)
return (
buffer[0] == -17 &&
buffer[1] == -69 &&
buffer[2] == -65);
else
return false;
}
/**
* Has a Byte Order Marker for UTF-16 Low Endian
* (ucs-2le, ucs-4le, and ucs-16le).
*
* @return true if the buffer has a BOM for UTF-16 Low Endian.
*/
public boolean
hasUTF16LEBom() {
if (
buffer.length >= 2)
return (
buffer[0] == -1 &&
buffer[1] == -2);
else
return false;
}
/**
* Has a Byte Order Marker for UTF-16 Big Endian
* (utf-16 and ucs-2).
*
* @return true if the buffer has a BOM for UTF-16 Big Endian.
*/
public boolean
hasUTF16BEBom() {
if (
buffer.length >= 2)
return (
buffer[0] == -2 &&
buffer[1] == -1);
else
return false;
}
/**
* Gets a <code>BufferedReader</code> (indeed a <code>LineNumberReader</code>) from the <code>File</code>
* specified in the constructor of <code>CharsetToolkit</code> using the charset discovered or the default
* charset if an 8-bit <code>Charset</code> is encountered.
*
* @return a <code>BufferedReader</code>
* @throws FileNotFoundException if the file is not found.
*/
public
BufferedReader getReader() throws
FileNotFoundException {
LineNumberReader reader = new
LineNumberReader(new
InputStreamReader(new
FileInputStream(
file),
getCharset()));
if (
hasUTF8Bom() ||
hasUTF16LEBom() ||
hasUTF16BEBom()) {
try {
reader.
read();
}
catch (
IOException e) {
// should never happen, as a file with no content
// but with a BOM has at least one char
}
}
return
reader;
}
/**
* Retrieves all the available <code>Charset</code>s on the platform,
* among which the default <code>charset</code>.
*
* @return an array of <code>Charset</code>s.
*/
public static
Charset[]
getAvailableCharsets() {
Collection collection =
Charset.
availableCharsets().
values();
return (
Charset[])
collection.
toArray(new
Charset[0]);
}
}