/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Kohei TAKETA <k-tak@void.in> (Java port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
package org.mozilla.universalchardet;
import org.mozilla.universalchardet.prober.
CharsetProber;
import org.mozilla.universalchardet.prober.
MBCSGroupProber;
import org.mozilla.universalchardet.prober.
SBCSGroupProber;
import org.mozilla.universalchardet.prober.
EscCharsetProber;
import org.mozilla.universalchardet.prober.
Latin1Prober;
import org.mozilla.universalchardet.
Constants;
/**
 * Incremental character-encoding detector.
 *
 * <p>Typical use: feed the raw bytes in one or more calls to
 * {@link #handleData(byte[], int, int)}, optionally stopping early once
 * {@link #isDone()} returns {@code true}; call {@link #dataEnd()} when no
 * more data is available; then read the result with
 * {@link #getDetectedCharset()} (or receive it through the
 * {@link CharsetListener} passed to the constructor). Call {@link #reset()}
 * before reusing the instance for another input.
 */
public class UniversalDetector
{
    ////////////////////////////////////////////////////////////////
    // constants
    ////////////////////////////////////////////////////////////////

    /** Public confidence constant; it is not referenced inside this class. */
    public static final float SHORTCUT_THRESHOLD = 0.95f;

    /**
     * Confidence that the best prober must exceed in {@link #dataEnd()}
     * for its charset to be accepted as the detection result.
     */
    public static final float MINIMUM_THRESHOLD = 0.20f;


    ////////////////////////////////////////////////////////////////
    // inner types
    ////////////////////////////////////////////////////////////////

    /** Classification of the bytes seen so far by {@code handleData}. */
    public enum InputState
    {
        /** No high byte and no escape sequence has been seen yet. */
        PURE_ASCII,
        /** An ESC byte (0x1B) or the sequence "~{" was seen, but no high byte. */
        ESC_ASCII,
        /** At least one byte with the top bit set (other than 0xA0) was seen. */
        HIGHBYTE
    }


    ////////////////////////////////////////////////////////////////
    // fields
    ////////////////////////////////////////////////////////////////
    private InputState inputState;
    private boolean done;
    /** True until the first non-empty chunk has been checked for a BOM. */
    private boolean start;
    private boolean gotData;
    /** Most recent byte that did not count as a high byte. */
    private byte lastChar;
    private String detectedCharset;

    /** Slots for the high-byte probers; filled lazily on the first high byte. */
    private final CharsetProber[] probers;
    /** Prober for escape-sequence encodings; created lazily, dropped on a high byte. */
    private CharsetProber escCharsetProber;

    private CharsetListener listener;


    ////////////////////////////////////////////////////////////////
    // methods
    ////////////////////////////////////////////////////////////////

    /**
     * @param listener a listener object that is notified of
     * the detected encoding. Can be null.
     */
    public UniversalDetector(CharsetListener listener)
    {
        this.listener = listener;
        // Array elements and escCharsetProber start out null by default.
        this.probers = new CharsetProber[3];

        reset();
    }

    /**
     * @return {@code true} once a charset has been determined and further
     * calls to {@link #handleData(byte[], int, int)} would be ignored.
     */
    public boolean isDone()
    {
        return this.done;
    }

    /**
     * @return The detected encoding is returned. If the detector couldn't
     * determine what encoding was used, null is returned.
     */
    public String getDetectedCharset()
    {
        return this.detectedCharset;
    }

    /**
     * @param listener the listener to notify from {@link #dataEnd()};
     * may be null to disable notification.
     */
    public void setListener(CharsetListener listener)
    {
        this.listener = listener;
    }

    /**
     * @return the currently registered listener, or null if there is none.
     */
    public CharsetListener getListener()
    {
        return this.listener;
    }

    /**
     * Feeds a chunk of input to the detector.
     *
     * <p>The first non-empty chunk is checked for a byte-order mark. The
     * UTF-8 BOM is recognised when that chunk holds at least three bytes;
     * the UTF-16, UTF-32 and UCS-4 marks need at least four bytes because
     * the third and fourth byte are used to tell them apart. If that first
     * chunk is shorter, no BOM check is performed on later chunks.
     *
     * <p>Chunks with {@code length <= 0} are ignored. Calls made after the
     * detector is done are ignored as well.
     *
     * @param buf    buffer holding the data
     * @param offset index of the first byte to examine
     * @param length number of bytes to examine
     */
    public void handleData(final byte[] buf, int offset, int length)
    {
        if (this.done) {
            return;
        }
        if (length <= 0) {
            // Nothing to inspect. Returning here keeps the one-time BOM check
            // armed for the first chunk that actually carries data.
            return;
        }

        this.gotData = true;

        if (this.start) {
            this.start = false;
            final String bomCharset = detectCharsetFromBom(buf, offset, length);
            if (bomCharset != null) {
                this.detectedCharset = bomCharset;
                this.done = true;
                return;
            }
        }

        // Classify the chunk: does it contain high bytes or escape sequences?
        final int end = offset + length;
        for (int i = offset; i < end; ++i) {
            final int c = buf[i] & 0xFF;
            // 0xA0 alone does not switch the detector into HIGHBYTE state.
            final boolean highByte = (c & 0x80) != 0 && c != 0xA0;
            if (highByte) {
                if (this.inputState != InputState.HIGHBYTE) {
                    enterHighByteState();
                }
            } else {
                // ESC (0x1B), or '{' (0x7B) directly preceded by '~' (0x7E).
                final boolean escapeSeen =
                        c == 0x1B || (c == 0x7B && this.lastChar == 0x7E);
                if (this.inputState == InputState.PURE_ASCII && escapeSeen) {
                    this.inputState = InputState.ESC_ASCII;
                }
                this.lastChar = buf[i];
            }
        }

        // Hand the whole chunk to the prober(s) matching the current state.
        if (this.inputState == InputState.ESC_ASCII) {
            if (this.escCharsetProber == null) {
                this.escCharsetProber = new EscCharsetProber();
            }
            final CharsetProber.ProbingState st =
                    this.escCharsetProber.handleData(buf, offset, length);
            if (st == CharsetProber.ProbingState.FOUND_IT) {
                this.done = true;
                this.detectedCharset = this.escCharsetProber.getCharSetName();
            }
        } else if (this.inputState == InputState.HIGHBYTE) {
            for (CharsetProber prober : this.probers) {
                final CharsetProber.ProbingState st =
                        prober.handleData(buf, offset, length);
                if (st == CharsetProber.ProbingState.FOUND_IT) {
                    this.done = true;
                    this.detectedCharset = prober.getCharSetName();
                    return;
                }
            }
        }
        // PURE_ASCII: nothing to probe.
    }

    /**
     * Looks for a byte-order mark at {@code buf[offset]}.
     *
     * @return the charset name implied by the BOM, or null when the bytes do
     * not form a recognised BOM or too few bytes are available to decide.
     */
    private static String detectCharsetFromBom(final byte[] buf, int offset, int length)
    {
        if (length < 3) {
            return null;
        }

        final int b1 = buf[offset] & 0xFF;
        final int b2 = buf[offset + 1] & 0xFF;
        final int b3 = buf[offset + 2] & 0xFF;

        // The UTF-8 mark is three bytes long and cannot be confused with the others.
        if (b1 == 0xEF && b2 == 0xBB && b3 == 0xBF) {
            return Constants.CHARSET_UTF_8;
        }

        // The remaining marks are told apart by bytes three and four.
        if (length < 4) {
            return null;
        }
        final int b4 = buf[offset + 3] & 0xFF;

        switch (b1) {
        case 0xFE:
            if (b2 == 0xFF) {
                return (b3 == 0x00 && b4 == 0x00)
                        ? Constants.CHARSET_X_ISO_10646_UCS_4_3412
                        : Constants.CHARSET_UTF_16BE;
            }
            break;
        case 0x00:
            if (b2 == 0x00 && b3 == 0xFE && b4 == 0xFF) {
                return Constants.CHARSET_UTF_32BE;
            }
            if (b2 == 0x00 && b3 == 0xFF && b4 == 0xFE) {
                return Constants.CHARSET_X_ISO_10646_UCS_4_2143;
            }
            break;
        case 0xFF:
            if (b2 == 0xFE) {
                return (b3 == 0x00 && b4 == 0x00)
                        ? Constants.CHARSET_UTF_32LE
                        : Constants.CHARSET_UTF_16LE;
            }
            break;
        default:
            break;
        } // switch end

        return null;
    }

    /**
     * Switches to {@link InputState#HIGHBYTE}: drops the escape prober and
     * creates any high-byte prober that does not exist yet.
     */
    private void enterHighByteState()
    {
        this.inputState = InputState.HIGHBYTE;
        this.escCharsetProber = null;

        if (this.probers[0] == null) {
            this.probers[0] = new MBCSGroupProber();
        }
        if (this.probers[1] == null) {
            this.probers[1] = new SBCSGroupProber();
        }
        if (this.probers[2] == null) {
            this.probers[2] = new Latin1Prober();
        }
    }

    /**
     * Signals that no more data will be supplied and finalises the result.
     *
     * <p>Does nothing if no data was ever received. If a charset was already
     * determined it is reported to the listener. Otherwise, when high bytes
     * were seen, the prober with the highest confidence wins provided that
     * confidence exceeds {@link #MINIMUM_THRESHOLD}; the result is stored and
     * reported. In every other case the detected charset stays null and the
     * listener is not called.
     */
    public void dataEnd()
    {
        if (!this.gotData) {
            return;
        }

        if (this.detectedCharset != null) {
            this.done = true;
            notifyListener();
            return;
        }

        if (this.inputState != InputState.HIGHBYTE) {
            // ESC_ASCII without a match, or pure ASCII: nothing to report.
            return;
        }

        float bestConfidence = 0.0f;
        int bestIndex = 0;
        for (int i = 0; i < this.probers.length; ++i) {
            final float confidence = this.probers[i].getConfidence();
            if (confidence > bestConfidence) {
                bestConfidence = confidence;
                bestIndex = i;
            }
        }

        if (bestConfidence > MINIMUM_THRESHOLD) {
            this.detectedCharset = this.probers[bestIndex].getCharSetName();
            notifyListener();
        }
    }

    /** Reports the current detection result to the listener, if one is set. */
    private void notifyListener()
    {
        if (this.listener != null) {
            this.listener.report(this.detectedCharset);
        }
    }

    /**
     * Returns the detector to its initial state so it can process a new
     * input. Existing probers are reset rather than discarded; the listener
     * is kept.
     */
    public void reset()
    {
        this.done = false;
        this.start = true;
        this.detectedCharset = null;
        this.gotData = false;
        this.inputState = InputState.PURE_ASCII;
        this.lastChar = 0;

        if (this.escCharsetProber != null) {
            this.escCharsetProber.reset();
        }

        for (CharsetProber prober : this.probers) {
            if (prober != null) {
                prober.reset();
            }
        }
    }


    ////////////////////////////////////////////////////////////////
    // testing
    ////////////////////////////////////////////////////////////////

    /**
     * Command-line driver: runs detection on the file named by the single
     * argument and prints the detected charset, if any, to standard output.
     *
     * @param args exactly one element, the path of the file to examine
     * @throws Exception if the file cannot be opened or read
     */
    public static void main(String[] args) throws Exception
    {
        if (args.length != 1) {
            System.out.println("USAGE: java UniversalDetector filename");
            return;
        }

        UniversalDetector detector = new UniversalDetector(
                new CharsetListener() {
                    public void report(String name)
                    {
                        System.out.println("charset = " + name);
                    }
                }
        );

        byte[] buf = new byte[4096];
        java.io.FileInputStream fis = new java.io.FileInputStream(args[0]);
        try {
            int nread;
            while ((nread = fis.read(buf)) > 0 && !detector.isDone()) {
                detector.handleData(buf, 0, nread);
            }
        } finally {
            // Release the file handle even if reading or detection throws.
            fis.close();
        }
        detector.dataEnd();
    }
}