/*
* Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
* ORACLE PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*/
package java.lang;
import java.text.
BreakIterator;
import java.util.
HashSet;
import java.util.
Hashtable;
import java.util.
Iterator;
import java.util.
Locale;
import sun.text.
Normalizer;
/**
* This is a utility class for <code>String.toLowerCase()</code> and
* <code>String.toUpperCase()</code>, that handles special casing with
* conditions. In other words, it handles the mappings with conditions
* that are defined in
* <a href="http://www.unicode.org/Public/UNIDATA/SpecialCasing.txt">Special
* Casing Properties</a> file.
* <p>
* Note that the unconditional case mappings (including 1:M mappings)
* are handled in <code>Character.toLower/UpperCase()</code>.
*/
final class
ConditionalSpecialCasing {
// context conditions.
final static int
FINAL_CASED = 1;
final static int
AFTER_SOFT_DOTTED = 2;
final static int
MORE_ABOVE = 3;
final static int
AFTER_I = 4;
final static int
NOT_BEFORE_DOT = 5;
// combining class definitions
final static int
COMBINING_CLASS_ABOVE = 230;
// Special case mapping entries
static
Entry[]
entry = {
//# ================================================================================
//# Conditional mappings
//# ================================================================================
new
Entry(0x03A3, new char[]{0x03C2}, new char[]{0x03A3}, null,
FINAL_CASED), // # GREEK CAPITAL LETTER SIGMA
new
Entry(0x0130, new char[]{0x0069, 0x0307}, new char[]{0x0130}, null, 0), // # LATIN CAPITAL LETTER I WITH DOT ABOVE
//# ================================================================================
//# Locale-sensitive mappings
//# ================================================================================
//# Lithuanian
new
Entry(0x0307, new char[]{0x0307}, new char[]{}, "lt",
AFTER_SOFT_DOTTED), // # COMBINING DOT ABOVE
new
Entry(0x0049, new char[]{0x0069, 0x0307}, new char[]{0x0049}, "lt",
MORE_ABOVE), // # LATIN CAPITAL LETTER I
new
Entry(0x004A, new char[]{0x006A, 0x0307}, new char[]{0x004A}, "lt",
MORE_ABOVE), // # LATIN CAPITAL LETTER J
new
Entry(0x012E, new char[]{0x012F, 0x0307}, new char[]{0x012E}, "lt",
MORE_ABOVE), // # LATIN CAPITAL LETTER I WITH OGONEK
new
Entry(0x00CC, new char[]{0x0069, 0x0307, 0x0300}, new char[]{0x00CC}, "lt", 0), // # LATIN CAPITAL LETTER I WITH GRAVE
new
Entry(0x00CD, new char[]{0x0069, 0x0307, 0x0301}, new char[]{0x00CD}, "lt", 0), // # LATIN CAPITAL LETTER I WITH ACUTE
new
Entry(0x0128, new char[]{0x0069, 0x0307, 0x0303}, new char[]{0x0128}, "lt", 0), // # LATIN CAPITAL LETTER I WITH TILDE
//# ================================================================================
//# Turkish and Azeri
new
Entry(0x0130, new char[]{0x0069}, new char[]{0x0130}, "tr", 0), // # LATIN CAPITAL LETTER I WITH DOT ABOVE
new
Entry(0x0130, new char[]{0x0069}, new char[]{0x0130}, "az", 0), // # LATIN CAPITAL LETTER I WITH DOT ABOVE
new
Entry(0x0307, new char[]{}, new char[]{0x0307}, "tr",
AFTER_I), // # COMBINING DOT ABOVE
new
Entry(0x0307, new char[]{}, new char[]{0x0307}, "az",
AFTER_I), // # COMBINING DOT ABOVE
new
Entry(0x0049, new char[]{0x0131}, new char[]{0x0049}, "tr",
NOT_BEFORE_DOT), // # LATIN CAPITAL LETTER I
new
Entry(0x0049, new char[]{0x0131}, new char[]{0x0049}, "az",
NOT_BEFORE_DOT), // # LATIN CAPITAL LETTER I
new
Entry(0x0069, new char[]{0x0069}, new char[]{0x0130}, "tr", 0), // # LATIN SMALL LETTER I
new
Entry(0x0069, new char[]{0x0069}, new char[]{0x0130}, "az", 0) // # LATIN SMALL LETTER I
};
// A hash table that contains the above entries
static
Hashtable<
Integer,
HashSet<
Entry>>
entryTable = new
Hashtable<>();
static {
// create hashtable from the entry
for (int
i = 0;
i <
entry.length;
i ++) {
Entry cur =
entry[
i];
Integer cp = new
Integer(
cur.
getCodePoint());
HashSet<
Entry>
set =
entryTable.
get(
cp);
if (
set == null) {
set = new
HashSet<
Entry>();
}
set.
add(
cur);
entryTable.
put(
cp,
set);
}
}
static int
toLowerCaseEx(
String src, int
index,
Locale locale) {
char[]
result =
lookUpTable(
src,
index,
locale, true);
if (
result != null) {
if (
result.length == 1) {
return
result[0];
} else {
return
Character.
ERROR;
}
} else {
// default to Character class' one
return
Character.
toLowerCase(
src.
codePointAt(
index));
}
}
static int
toUpperCaseEx(
String src, int
index,
Locale locale) {
char[]
result =
lookUpTable(
src,
index,
locale, false);
if (
result != null) {
if (
result.length == 1) {
return
result[0];
} else {
return
Character.
ERROR;
}
} else {
// default to Character class' one
return
Character.
toUpperCaseEx(
src.
codePointAt(
index));
}
}
static char[]
toLowerCaseCharArray(
String src, int
index,
Locale locale) {
return
lookUpTable(
src,
index,
locale, true);
}
static char[]
toUpperCaseCharArray(
String src, int
index,
Locale locale) {
char[]
result =
lookUpTable(
src,
index,
locale, false);
if (
result != null) {
return
result;
} else {
return
Character.
toUpperCaseCharArray(
src.
codePointAt(
index));
}
}
private static char[]
lookUpTable(
String src, int
index,
Locale locale, boolean
bLowerCasing) {
HashSet<
Entry>
set =
entryTable.
get(new
Integer(
src.
codePointAt(
index)));
char[]
ret = null;
if (
set != null) {
Iterator<
Entry>
iter =
set.
iterator();
String currentLang =
locale.
getLanguage();
while (
iter.
hasNext()) {
Entry entry =
iter.
next();
String conditionLang =
entry.
getLanguage();
if (((
conditionLang == null) || (
conditionLang.
equals(
currentLang))) &&
isConditionMet(
src,
index,
locale,
entry.
getCondition())) {
ret =
bLowerCasing ?
entry.
getLowerCase() :
entry.
getUpperCase();
if (
conditionLang != null) {
break;
}
}
}
}
return
ret;
}
private static boolean
isConditionMet(
String src, int
index,
Locale locale, int
condition) {
switch (
condition) {
case
FINAL_CASED:
return
isFinalCased(
src,
index,
locale);
case
AFTER_SOFT_DOTTED:
return
isAfterSoftDotted(
src,
index);
case
MORE_ABOVE:
return
isMoreAbove(
src,
index);
case
AFTER_I:
return
isAfterI(
src,
index);
case
NOT_BEFORE_DOT:
return !
isBeforeDot(
src,
index);
default:
return true;
}
}
/**
* Implements the "Final_Cased" condition
*
* Specification: Within the closest word boundaries containing C, there is a cased
* letter before C, and there is no cased letter after C.
*
* Regular Expression:
* Before C: [{cased==true}][{wordBoundary!=true}]*
* After C: !([{wordBoundary!=true}]*[{cased}])
*/
private static boolean
isFinalCased(
String src, int
index,
Locale locale) {
BreakIterator wordBoundary =
BreakIterator.
getWordInstance(
locale);
wordBoundary.
setText(
src);
int
ch;
// Look for a preceding 'cased' letter
for (int
i =
index; (
i >= 0) && !
wordBoundary.
isBoundary(
i);
i -=
Character.
charCount(
ch)) {
ch =
src.
codePointBefore(
i);
if (
isCased(
ch)) {
int
len =
src.
length();
// Check that there is no 'cased' letter after the index
for (
i =
index +
Character.
charCount(
src.
codePointAt(
index));
(
i <
len) && !
wordBoundary.
isBoundary(
i);
i +=
Character.
charCount(
ch)) {
ch =
src.
codePointAt(
i);
if (
isCased(
ch)) {
return false;
}
}
return true;
}
}
return false;
}
/**
* Implements the "After_I" condition
*
* Specification: The last preceding base character was an uppercase I,
* and there is no intervening combining character class 230 (ABOVE).
*
* Regular Expression:
* Before C: [I]([{cc!=230}&{cc!=0}])*
*/
private static boolean
isAfterI(
String src, int
index) {
int
ch;
int
cc;
// Look for the last preceding base character
for (int
i =
index;
i > 0;
i -=
Character.
charCount(
ch)) {
ch =
src.
codePointBefore(
i);
if (
ch == 'I') {
return true;
} else {
cc =
Normalizer.
getCombiningClass(
ch);
if ((
cc == 0) || (
cc ==
COMBINING_CLASS_ABOVE)) {
return false;
}
}
}
return false;
}
/**
* Implements the "After_Soft_Dotted" condition
*
* Specification: The last preceding character with combining class
* of zero before C was Soft_Dotted, and there is no intervening
* combining character class 230 (ABOVE).
*
* Regular Expression:
* Before C: [{Soft_Dotted==true}]([{cc!=230}&{cc!=0}])*
*/
private static boolean
isAfterSoftDotted(
String src, int
index) {
int
ch;
int
cc;
// Look for the last preceding character
for (int
i =
index;
i > 0;
i -=
Character.
charCount(
ch)) {
ch =
src.
codePointBefore(
i);
if (
isSoftDotted(
ch)) {
return true;
} else {
cc =
Normalizer.
getCombiningClass(
ch);
if ((
cc == 0) || (
cc ==
COMBINING_CLASS_ABOVE)) {
return false;
}
}
}
return false;
}
/**
* Implements the "More_Above" condition
*
* Specification: C is followed by one or more characters of combining
* class 230 (ABOVE) in the combining character sequence.
*
* Regular Expression:
* After C: [{cc!=0}]*[{cc==230}]
*/
private static boolean
isMoreAbove(
String src, int
index) {
int
ch;
int
cc;
int
len =
src.
length();
// Look for a following ABOVE combining class character
for (int
i =
index +
Character.
charCount(
src.
codePointAt(
index));
i <
len;
i +=
Character.
charCount(
ch)) {
ch =
src.
codePointAt(
i);
cc =
Normalizer.
getCombiningClass(
ch);
if (
cc ==
COMBINING_CLASS_ABOVE) {
return true;
} else if (
cc == 0) {
return false;
}
}
return false;
}
/**
* Implements the "Before_Dot" condition
*
* Specification: C is followed by <code>U+0307 COMBINING DOT ABOVE</code>.
* Any sequence of characters with a combining class that is
* neither 0 nor 230 may intervene between the current character
* and the combining dot above.
*
* Regular Expression:
* After C: ([{cc!=230}&{cc!=0}])*[\u0307]
*/
private static boolean
isBeforeDot(
String src, int
index) {
int
ch;
int
cc;
int
len =
src.
length();
// Look for a following COMBINING DOT ABOVE
for (int
i =
index +
Character.
charCount(
src.
codePointAt(
index));
i <
len;
i +=
Character.
charCount(
ch)) {
ch =
src.
codePointAt(
i);
if (
ch == '\u0307') {
return true;
} else {
cc =
Normalizer.
getCombiningClass(
ch);
if ((
cc == 0) || (
cc ==
COMBINING_CLASS_ABOVE)) {
return false;
}
}
}
return false;
}
/**
* Examines whether a character is 'cased'.
*
* A character C is defined to be 'cased' if and only if at least one of
* following are true for C: uppercase==true, or lowercase==true, or
* general_category==titlecase_letter.
*
* The uppercase and lowercase property values are specified in the data
* file DerivedCoreProperties.txt in the Unicode Character Database.
*/
private static boolean
isCased(int
ch) {
int
type =
Character.
getType(
ch);
if (
type ==
Character.
LOWERCASE_LETTER ||
type ==
Character.
UPPERCASE_LETTER ||
type ==
Character.
TITLECASE_LETTER) {
return true;
} else {
// Check for Other_Lowercase and Other_Uppercase
//
if ((
ch >= 0x02B0) && (
ch <= 0x02B8)) {
// MODIFIER LETTER SMALL H..MODIFIER LETTER SMALL Y
return true;
} else if ((
ch >= 0x02C0) && (
ch <= 0x02C1)) {
// MODIFIER LETTER GLOTTAL STOP..MODIFIER LETTER REVERSED GLOTTAL STOP
return true;
} else if ((
ch >= 0x02E0) && (
ch <= 0x02E4)) {
// MODIFIER LETTER SMALL GAMMA..MODIFIER LETTER SMALL REVERSED GLOTTAL STOP
return true;
} else if (
ch == 0x0345) {
// COMBINING GREEK YPOGEGRAMMENI
return true;
} else if (
ch == 0x037A) {
// GREEK YPOGEGRAMMENI
return true;
} else if ((
ch >= 0x1D2C) && (
ch <= 0x1D61)) {
// MODIFIER LETTER CAPITAL A..MODIFIER LETTER SMALL CHI
return true;
} else if ((
ch >= 0x2160) && (
ch <= 0x217F)) {
// ROMAN NUMERAL ONE..ROMAN NUMERAL ONE THOUSAND
// SMALL ROMAN NUMERAL ONE..SMALL ROMAN NUMERAL ONE THOUSAND
return true;
} else if ((
ch >= 0x24B6) && (
ch <= 0x24E9)) {
// CIRCLED LATIN CAPITAL LETTER A..CIRCLED LATIN CAPITAL LETTER Z
// CIRCLED LATIN SMALL LETTER A..CIRCLED LATIN SMALL LETTER Z
return true;
} else {
return false;
}
}
}
private static boolean
isSoftDotted(int
ch) {
switch (
ch) {
case 0x0069: // Soft_Dotted # L& LATIN SMALL LETTER I
case 0x006A: // Soft_Dotted # L& LATIN SMALL LETTER J
case 0x012F: // Soft_Dotted # L& LATIN SMALL LETTER I WITH OGONEK
case 0x0268: // Soft_Dotted # L& LATIN SMALL LETTER I WITH STROKE
case 0x0456: // Soft_Dotted # L& CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
case 0x0458: // Soft_Dotted # L& CYRILLIC SMALL LETTER JE
case 0x1D62: // Soft_Dotted # L& LATIN SUBSCRIPT SMALL LETTER I
case 0x1E2D: // Soft_Dotted # L& LATIN SMALL LETTER I WITH TILDE BELOW
case 0x1ECB: // Soft_Dotted # L& LATIN SMALL LETTER I WITH DOT BELOW
case 0x2071: // Soft_Dotted # L& SUPERSCRIPT LATIN SMALL LETTER I
return true;
default:
return false;
}
}
/**
* An internal class that represents an entry in the Special Casing Properties.
*/
static class
Entry {
int
ch;
char []
lower;
char []
upper;
String lang;
int
condition;
Entry(int
ch, char[]
lower, char[]
upper,
String lang, int
condition) {
this.
ch =
ch;
this.
lower =
lower;
this.
upper =
upper;
this.
lang =
lang;
this.
condition =
condition;
}
int
getCodePoint() {
return
ch;
}
char[]
getLowerCase() {
return
lower;
}
char[]
getUpperCase() {
return
upper;
}
String getLanguage() {
return
lang;
}
int
getCondition() {
return
condition;
}
}
}