CharsetToolkit.java - 源码阅读网

groovy-2.5.2.jar

|

org.codehaus.groovy:groovy:2.5.2

META-INF

org

groovy

io

security

cli

time

transform

util

FileNameByRegexFinder.groovy

FileTreeBuilder.groovy

NodePrinter.java

ObservableMap.java

IndentPrinter.java

GroovyScriptEngine.java

CharsetToolkit.java

ResourceConnector.java

ConfigObject.java

ObjectGraphBuilder.java

package.html

ProxyGenerator.java

IFileNameFinder.java

FactoryBuilderSupport.java

ScriptException.java

NodeList.java

Eval.java

NodeBuilder.java

Node.java

Proxy.java

ResourceException.java

DelegatingScript.java

PermutationGenerator.java

OrderBy.java

GroovyCollections.java

BufferedIterator.java

Expando.java

ObservableSet.java

BuilderSupport.java

ClosureComparator.java

AbstractFactory.java

Factory.java

logging

ObservableList.java

MapEntry.java

ConfigSlurper.groovy

grape

beans

ui

lang

xml

inspect

overview.html

overviewj.html

CharsetToolkit.java

清空

类结构

/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package groovy.util; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.LineNumberReader; import java.nio.charset.Charset; import java.util.Collection; /** * Utility class to guess the encoding of a given text file. * <p> * Unicode files encoded in UTF-16 (low or big endian) or UTF-8 files * with a Byte Order Marker are correctly discovered. For UTF-8 files with no BOM, if the buffer * is wide enough, the charset should also be discovered. * <p> * A byte buffer of 4KB is used to be able to guess the encoding. * <p> * Usage: * <pre> * CharsetToolkit toolkit = new CharsetToolkit(file); * * // guess the encoding * Charset guessedCharset = toolkit.getCharset(); * * // create a reader with the correct charset * BufferedReader reader = toolkit.getReader(); * * // read the file content * String line; * while ((line = br.readLine())!= null) * { * System.out.println(line); * } * </pre> * * @author Guillaume Laforge */ public class CharsetToolkit { private final byte[] buffer; private Charset defaultCharset; private Charset charset; private boolean enforce8Bit = true; private final File file; private static final byte[] EMPTY_BYTE_ARRAY = new byte[0]; /** * Constructor of the <code>CharsetToolkit</code> utility class. * * @param file of which we want to know the encoding. */ public CharsetToolkit(File file) throws IOException { this.file = file; this.defaultCharset = getDefaultSystemCharset(); this.charset = null; InputStream input = new FileInputStream(file); try { byte[] bytes = new byte[4096]; int bytesRead = input.read(bytes); if (bytesRead == -1) { this.buffer = EMPTY_BYTE_ARRAY; } else if (bytesRead < 4096) { byte[] bytesToGuess = new byte[bytesRead]; System.arraycopy(bytes, 0, bytesToGuess, 0, bytesRead); this.buffer = bytesToGuess; } else { this.buffer = bytes; } } finally { try {input.close();} catch (IOException e){ // IGNORE } } } /** * Defines the default <code>Charset</code> used in case the buffer represents * an 8-bit <code>Charset</code>. * * @param defaultCharset the default <code>Charset</code> to be returned * if an 8-bit <code>Charset</code> is encountered. */ public void setDefaultCharset(Charset defaultCharset) { if (defaultCharset != null) this.defaultCharset = defaultCharset; else this.defaultCharset = getDefaultSystemCharset(); } public Charset getCharset() { if (this.charset == null) this.charset = guessEncoding(); return charset; } /** * If US-ASCII is recognized, enforce to return the default encoding, rather than US-ASCII. * It might be a file without any special character in the range 128-255, but that may be or become * a file encoded with the default <code>charset</code> rather than US-ASCII. * * @param enforce a boolean specifying the use or not of US-ASCII. */ public void setEnforce8Bit(boolean enforce) { this.enforce8Bit = enforce; } /** * Gets the enforce8Bit flag, in case we do not want to ever get a US-ASCII encoding. * * @return a boolean representing the flag of use of US-ASCII. */ public boolean getEnforce8Bit() { return this.enforce8Bit; } /** * Retrieves the default Charset */ public Charset getDefaultCharset() { return defaultCharset; } /** * Guess the encoding of the provided buffer. * If Byte Order Markers are encountered at the beginning of the buffer, we immediately * return the charset implied by this BOM. Otherwise, the file would not be a human * readable text file. * <p> * If there is no BOM, this method tries to discern whether the file is UTF-8 or not. * If it is not UTF-8, we assume the encoding is the default system encoding * (of course, it might be any 8-bit charset, but usually, an 8-bit charset is the default one). * <p> * It is possible to discern UTF-8 thanks to the pattern of characters with a multi-byte sequence. * <pre> * UCS-4 range (hex.) UTF-8 octet sequence (binary) * 0000 0000-0000 007F 0xxxxxxx * 0000 0080-0000 07FF 110xxxxx 10xxxxxx * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx * 0001 0000-001F FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx * 0020 0000-03FF FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx * 0400 0000-7FFF FFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx * </pre> * With UTF-8, 0xFE and 0xFF never appear. * * @return the Charset recognized. */ private Charset guessEncoding() { // if the file has a Byte Order Marker, we can assume the file is in UTF-xx // otherwise, the file would not be human readable if (hasUTF8Bom()) return Charset.forName("UTF-8"); if (hasUTF16LEBom()) return Charset.forName("UTF-16LE"); if (hasUTF16BEBom()) return Charset.forName("UTF-16BE"); // if a byte has its most significant bit set, the file is in UTF-8 or in the default encoding // otherwise, the file is in US-ASCII boolean highOrderBit = false; // if the file is in UTF-8, high order bytes must have a certain value, in order to be valid // if it's not the case, we can assume the encoding is the default encoding of the system boolean validU8Char = true; // TODO the buffer is not read up to the end, but up to length - 6 int length = buffer.length; int i = 0; while (i < length - 6) { byte b0 = buffer[i]; byte b1 = buffer[i + 1]; byte b2 = buffer[i + 2]; byte b3 = buffer[i + 3]; byte b4 = buffer[i + 4]; byte b5 = buffer[i + 5]; if (b0 < 0) { // a high order bit was encountered, thus the encoding is not US-ASCII // it may be either an 8-bit encoding or UTF-8 highOrderBit = true; // a two-bytes sequence was encountered if (isTwoBytesSequence(b0)) { // there must be one continuation byte of the form 10xxxxxx, // otherwise the following character is is not a valid UTF-8 construct if (!isContinuationChar(b1)) validU8Char = false; else i++; } // a three-bytes sequence was encountered else if (isThreeBytesSequence(b0)) { // there must be two continuation bytes of the form 10xxxxxx, // otherwise the following character is is not a valid UTF-8 construct if (!(isContinuationChar(b1) && isContinuationChar(b2))) validU8Char = false; else i += 2; } // a four-bytes sequence was encountered else if (isFourBytesSequence(b0)) { // there must be three continuation bytes of the form 10xxxxxx, // otherwise the following character is is not a valid UTF-8 construct if (!(isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3))) validU8Char = false; else i += 3; } // a five-bytes sequence was encountered else if (isFiveBytesSequence(b0)) { // there must be four continuation bytes of the form 10xxxxxx, // otherwise the following character is is not a valid UTF-8 construct if (!(isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3) && isContinuationChar(b4))) validU8Char = false; else i += 4; } // a six-bytes sequence was encountered else if (isSixBytesSequence(b0)) { // there must be five continuation bytes of the form 10xxxxxx, // otherwise the following character is is not a valid UTF-8 construct if (!(isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3) && isContinuationChar(b4) && isContinuationChar(b5))) validU8Char = false; else i += 5; } else validU8Char = false; } if (!validU8Char) break; i++; } // if no byte with an high order bit set, the encoding is US-ASCII // (it might have been UTF-7, but this encoding is usually internally used only by mail systems) if (!highOrderBit) { // returns the default charset rather than US-ASCII if the enforce8Bit flag is set. if (this.enforce8Bit) return this.defaultCharset; else return Charset.forName("US-ASCII"); } // if no invalid UTF-8 were encountered, we can assume the encoding is UTF-8, // otherwise the file would not be human readable if (validU8Char) return Charset.forName("UTF-8"); // finally, if it's not UTF-8 nor US-ASCII, let's assume the encoding is the default encoding return this.defaultCharset; } /** * If the byte has the form 10xxxxx, then it's a continuation byte of a multiple byte character; * * @param b a byte. * @return true if it's a continuation char. */ private static boolean isContinuationChar(byte b) { return -128 <= b && b <= -65; } /** * If the byte has the form 110xxxx, then it's the first byte of a two-bytes sequence character. * * @param b a byte. * @return true if it's the first byte of a two-bytes sequence. */ private static boolean isTwoBytesSequence(byte b) { return -64 <= b && b <= -33; } /** * If the byte has the form 1110xxx, then it's the first byte of a three-bytes sequence character. * * @param b a byte. * @return true if it's the first byte of a three-bytes sequence. */ private static boolean isThreeBytesSequence(byte b) { return -32 <= b && b <= -17; } /** * If the byte has the form 11110xx, then it's the first byte of a four-bytes sequence character. * * @param b a byte. * @return true if it's the first byte of a four-bytes sequence. */ private static boolean isFourBytesSequence(byte b) { return -16 <= b && b <= -9; } /** * If the byte has the form 11110xx, then it's the first byte of a five-bytes sequence character. * * @param b a byte. * @return true if it's the first byte of a five-bytes sequence. */ private static boolean isFiveBytesSequence(byte b) { return -8 <= b && b <= -5; } /** * If the byte has the form 1110xxx, then it's the first byte of a six-bytes sequence character. * * @param b a byte. * @return true if it's the first byte of a six-bytes sequence. */ private static boolean isSixBytesSequence(byte b) { return -4 <= b && b <= -3; } /** * Retrieve the default charset of the system. * * @return the default <code>Charset</code>. */ public static Charset getDefaultSystemCharset() { return Charset.forName(System.getProperty("file.encoding")); } /** * Has a Byte Order Marker for UTF-8 (Used by Microsoft's Notepad and other editors). * * @return true if the buffer has a BOM for UTF8. */ public boolean hasUTF8Bom() { if (buffer.length >= 3) return (buffer[0] == -17 && buffer[1] == -69 && buffer[2] == -65); else return false; } /** * Has a Byte Order Marker for UTF-16 Low Endian * (ucs-2le, ucs-4le, and ucs-16le). * * @return true if the buffer has a BOM for UTF-16 Low Endian. */ public boolean hasUTF16LEBom() { if (buffer.length >= 2) return (buffer[0] == -1 && buffer[1] == -2); else return false; } /** * Has a Byte Order Marker for UTF-16 Big Endian * (utf-16 and ucs-2). * * @return true if the buffer has a BOM for UTF-16 Big Endian. */ public boolean hasUTF16BEBom() { if (buffer.length >= 2) return (buffer[0] == -2 && buffer[1] == -1); else return false; } /** * Gets a <code>BufferedReader</code> (indeed a <code>LineNumberReader</code>) from the <code>File</code> * specified in the constructor of <code>CharsetToolkit</code> using the charset discovered or the default * charset if an 8-bit <code>Charset</code> is encountered. * * @return a <code>BufferedReader</code> * @throws FileNotFoundException if the file is not found. */ public BufferedReader getReader() throws FileNotFoundException { LineNumberReader reader = new LineNumberReader(new InputStreamReader(new FileInputStream(file), getCharset())); if (hasUTF8Bom() || hasUTF16LEBom() || hasUTF16BEBom()) { try { reader.read(); } catch (IOException e) { // should never happen, as a file with no content // but with a BOM has at least one char } } return reader; } /** * Retrieves all the available <code>Charset</code>s on the platform, * among which the default <code>charset</code>. * * @return an array of <code>Charset</code>s. */ public static Charset[] getAvailableCharsets() { Collection collection = Charset.availableCharsets().values(); return (Charset[]) collection.toArray(new Charset[0]); } }

查找资源

输入类名或文件名

类结构窗口