/*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
*
* Copyright (c) 1997-2018 Oracle and/or its affiliates. All rights reserved.
*
* The contents of this file are subject to the terms of either the GNU
* General Public License Version 2 only ("GPL") or the Common Development
* and Distribution License("CDDL") (collectively, the "License"). You
* may not use this file except in compliance with the License. You can
* obtain a copy of the License at
* https://oss.oracle.com/licenses/CDDL+GPL-1.1
* or LICENSE.txt. See the License for the specific
* language governing permissions and limitations under the License.
*
* When distributing the software, include this License Header Notice in each
* file and include the License file at LICENSE.txt.
*
* GPL Classpath Exception:
* Oracle designates this particular file as subject to the "Classpath"
* exception as provided by Oracle in the GPL Version 2 section of the License
* file that accompanied this code.
*
* Modifications:
* If applicable, add the following below the License Header, with the fields
* enclosed by brackets [] replaced by your own identifying information:
* "Portions Copyright [year] [name of copyright owner]"
*
* Contributor(s):
* If you wish your version of this file to be governed by only the CDDL or
* only the GPL Version 2, indicate your decision by adding "[Contributor]
* elects to include this software in this distribution under the [CDDL or GPL
* Version 2] license." If you don't indicate a single choice of license, a
* recipient has the option to distribute your version of this file under
* either the CDDL, the GPL Version 2 or to extend the choice of license to
* its licensees as provided above. However, if you add GPL Version 2 code
* and therefore, elected the GPL Version 2 license, then the option applies
* only if the new code is made subject to such option by the copyright
* holder.
*/
package javax.mail.internet;
import javax.mail.
MessagingException;
import javax.mail.
EncodingAware;
import javax.activation.*;
import java.util.*;
import java.io.*;
import java.nio.charset.
Charset;
import com.sun.mail.util.
PropUtil;
import com.sun.mail.util.
ASCIIUtility;
import com.sun.mail.util.
BASE64DecoderStream;
import com.sun.mail.util.
BASE64EncoderStream;
import com.sun.mail.util.
BEncoderStream;
import com.sun.mail.util.
LineInputStream;
import com.sun.mail.util.
LineOutputStream;
import com.sun.mail.util.
LogOutputStream;
import com.sun.mail.util.
QDecoderStream;
import com.sun.mail.util.
QEncoderStream;
import com.sun.mail.util.
QPDecoderStream;
import com.sun.mail.util.
QPEncoderStream;
import com.sun.mail.util.
UUDecoderStream;
import com.sun.mail.util.
UUEncoderStream;
/**
* This is a utility class that provides various MIME related
* functionality. <p>
*
* There are a set of methods to encode and decode MIME headers as
* per RFC 2047. Note that, in general, these methods are
* <strong>not</strong> needed when using methods such as
* <code>setSubject</code> and <code>setRecipients</code>; JavaMail
* will automatically encode and decode data when using these "higher
* level" methods. The methods below are only needed when manipulating
* raw MIME headers using <code>setHeader</code> and <code>getHeader</code>
* methods. A brief description on handling such headers is given below: <p>
*
* RFC 822 mail headers <strong>must</strong> contain only US-ASCII
* characters. Headers that contain non US-ASCII characters must be
* encoded so that they contain only US-ASCII characters. Basically,
* this process involves using either BASE64 or QP to encode certain
* characters. RFC 2047 describes this in detail. <p>
*
* In Java, Strings contain (16 bit) Unicode characters. ASCII is a
* subset of Unicode (and occupies the range 0 - 127). A String
* that contains only ASCII characters is already mail-safe. If the
* String contains non US-ASCII characters, it must be encoded. An
* additional complexity in this step is that since Unicode is not
* yet a widely used charset, one might want to first charset-encode
* the String into another charset and then do the transfer-encoding.
* <p>
* Note that to get the actual bytes of a mail-safe String (say,
* for sending over SMTP), one must do
* <blockquote><pre>
*
* byte[] bytes = string.getBytes("iso-8859-1");
*
* </pre></blockquote><p>
*
* The <code>setHeader</code> and <code>addHeader</code> methods
* on MimeMessage and MimeBodyPart assume that the given header values
* are Unicode strings that contain only US-ASCII characters. Hence
* the callers of those methods must ensure that the values they pass
* do not contain non US-ASCII characters. The methods in this class
* help do this. <p>
*
* The <code>getHeader</code> family of methods on MimeMessage and
* MimeBodyPart return the raw header value. These might be encoded
* as per RFC 2047, and if so, must be decoded into Unicode Strings.
* The methods in this class help to do this. <p>
*
* Several System properties control strict conformance to the MIME
* spec. Note that these are not session properties but must be set
* globally as System properties. <p>
*
* The <code>mail.mime.decodetext.strict</code> property controls
* decoding of MIME encoded words. The MIME spec requires that encoded
* words start at the beginning of a whitespace separated word. Some
* mailers incorrectly include encoded words in the middle of a word.
* If the <code>mail.mime.decodetext.strict</code> System property is
* set to <code>"false"</code>, an attempt will be made to decode these
* illegal encoded words. The default is true. <p>
*
* The <code>mail.mime.encodeeol.strict</code> property controls the
* choice of Content-Transfer-Encoding for MIME parts that are not of
* type "text". Often such parts will contain textual data for which
* an encoding that allows normal end of line conventions is appropriate.
* In rare cases, such a part will appear to contain entirely textual
* data, but will require an encoding that preserves CR and LF characters
* without change. If the <code>mail.mime.encodeeol.strict</code>
* System property is set to <code>"true"</code>, such an encoding will
* be used when necessary. The default is false. <p>
*
* In addition, the <code>mail.mime.charset</code> System property can
* be used to specify the default MIME charset to use for encoded words
* and text parts that don't otherwise specify a charset. Normally, the
* default MIME charset is derived from the default Java charset, as
* specified in the <code>file.encoding</code> System property. Most
* applications will have no need to explicitly set the default MIME
* charset. In cases where the default MIME charset to be used for
* mail messages is different than the charset used for files stored on
* the system, this property should be set. <p>
*
* The current implementation also supports the following System property.
* <p>
* The <code>mail.mime.ignoreunknownencoding</code> property controls
* whether unknown values in the <code>Content-Transfer-Encoding</code>
* header, as passed to the <code>decode</code> method, cause an exception.
* If set to <code>"true"</code>, unknown values are ignored and 8bit
* encoding is assumed. Otherwise, unknown values cause a MessagingException
* to be thrown.
*
* @author John Mani
* @author Bill Shannon
*/
public class
MimeUtility {
// Static utility holder: the private no-argument constructor below
// prevents this class from being instantiated by other classes.
private
MimeUtility() { }
// Passed by getEncoding(DataSource) as the second argument of checkAscii.
// NOTE(review): presumably means "examine every byte of the stream" --
// confirm against checkAscii, which is not visible in this part of the file.
public static final int
ALL = -1;
// Cache used by nonAsciiCharset(): maps a lower-cased charset name to
// TRUE when the charset is NOT compatible with ASCII and to FALSE when
// it is. nonAsciiCharset() reads and writes this map only while
// synchronized on the map object itself.
private static final
Map<
String,
Boolean>
nonAsciiCharsetMap
= new
HashMap<>();
// The flags below are read once, when this class is initialized, from
// System properties (see the class documentation).

// mail.mime.decodetext.strict (default true): when false, decodeText and
// decodeWord also try to decode encoded words embedded inside a word.
private static final boolean
decodeStrict =
PropUtil.
getBooleanSystemProperty("mail.mime.decodetext.strict", true);
// mail.mime.encodeeol.strict (default false): handed to the
// AsciiOutputStream that getEncoding(DataHandler) uses for non-text content.
private static final boolean
encodeEolStrict =
PropUtil.
getBooleanSystemProperty("mail.mime.encodeeol.strict", false);
// mail.mime.ignoreunknownencoding (default false): when true, decode()
// returns the stream unchanged for an unknown encoding instead of throwing
// a MessagingException.
private static final boolean
ignoreUnknownEncoding =
PropUtil.
getBooleanSystemProperty(
"mail.mime.ignoreunknownencoding", false);
// mail.mime.allowutf8; not referenced in the part of the file shown here.
// NOTE(review): the second argument is presumably the default (false) --
// confirm against PropUtil.
private static final boolean
allowUtf8 =
PropUtil.
getBooleanSystemProperty("mail.mime.allowutf8", false);
/*
* The following two properties allow disabling the fold()
* and unfold() methods and reverting to the previous behavior.
* They should never need to be changed and are kept only out of
* caution about backward compatibility.
*/
// mail.mime.foldencodedwords: when true, doEncode() separates consecutive
// encoded words with "\r\n " (a continuation line) instead of a single space.
private static final boolean
foldEncodedWords =
PropUtil.
getBooleanSystemProperty("mail.mime.foldencodedwords", false);
// mail.mime.foldtext: not referenced in the part of the file shown here.
// NOTE(review): presumably gates fold()/unfold() as described above -- confirm.
private static final boolean
foldText =
PropUtil.
getBooleanSystemProperty("mail.mime.foldtext", true);
/**
* Get the Content-Transfer-Encoding that should be applied
* to the input stream of this DataSource, to make it mail-safe. <p>
*
* The algorithm used here is: <br>
* <ul>
* <li>
* If the DataSource implements {@link EncodingAware}, ask it
* what encoding to use. If it returns non-null, return that value.
* <li>
* If the primary type of this datasource is "text" and if all
* the bytes in its input stream are US-ASCII, then the encoding
* is "7bit". If more than half of the bytes are non-US-ASCII, then
* the encoding is "base64". If less than half of the bytes are
* non-US-ASCII, then the encoding is "quoted-printable".
* <li>
* If the primary type of this datasource is not "text", then if
* all the bytes of its input stream are US-ASCII, the encoding
* is "7bit". If there is even one non-US-ASCII character, the
* encoding is "base64".
* </ul>
*
* @param ds the DataSource
* @return the encoding. This is either "7bit",
* "quoted-printable" or "base64"
*/
public static String getEncoding(DataSource ds) {
    // A DataSource that knows its own encoding gets the first say.
    if (ds instanceof EncodingAware) {
        String preferred = ((EncodingAware) ds).getEncoding();
        if (preferred != null) {
            return preferred;
        }
    }

    InputStream in = null;
    try {
        ContentType type = new ContentType(ds.getContentType());
        in = ds.getInputStream();

        boolean text = type.match("text/*");
        // For non-text content the scan may stop at the first
        // non-ASCII byte, hence the negated flag.
        int verdict = checkAscii(in, ALL, !text);

        if (verdict == ALL_ASCII) {
            return "7bit";                      // all ASCII
        }
        if (verdict == MOSTLY_ASCII) {
            // Text in a charset that is not ASCII compatible must not
            // be sent as quoted-printable.
            return (text && nonAsciiCharset(type))
                    ? "base64"
                    : "quoted-printable";
        }
        return "base64";                        // mostly binary
    } catch (Exception ex) {
        // Anything going wrong while inspecting the data: fall back
        // to the encoding that is always safe.
        return "base64";
    } finally {
        // Always release the stream; a failure to close is ignored.
        if (in != null) {
            try {
                in.close();
            } catch (IOException ignored) {
                // nothing useful can be done here
            }
        }
    }
}
/**
* Determine whether the charset in the Content-Type is compatible
* with ASCII or not. A charset is compatible with ASCII if the
* encoded byte stream representing the Unicode string "\r\n" is
* the ASCII characters CR and LF. For example, the utf-16be
* charset is not compatible with ASCII.
*
* For performance, we keep a static map that caches the results.
*/
private static boolean nonAsciiCharset(ContentType ct) {
    String name = ct.getParameter("charset");
    if (name == null) {
        return false;
    }
    String key = name.toLowerCase(Locale.ENGLISH);

    // Fast path: answer from the cache when this charset was seen before.
    Boolean cached;
    synchronized (nonAsciiCharsetMap) {
        cached = nonAsciiCharsetMap.get(key);
    }
    if (cached != null) {
        return cached.booleanValue();
    }

    // Slow path: a charset is ASCII compatible only if "\r\n" encodes
    // to exactly the two bytes CR, LF.
    Boolean incompatible;
    try {
        byte[] crlf = "\r\n".getBytes(key);
        boolean asciiCrlf =
                crlf.length == 2 && crlf[0] == '\r' && crlf[1] == '\n';
        incompatible = Boolean.valueOf(!asciiCrlf);
    } catch (UnsupportedEncodingException uex) {
        incompatible = Boolean.FALSE;   // unknown charset: a guess
    } catch (RuntimeException ex) {
        incompatible = Boolean.TRUE;    // one of the weird ones?
    }

    synchronized (nonAsciiCharsetMap) {
        nonAsciiCharsetMap.put(key, incompatible);
    }
    return incompatible.booleanValue();
}
/**
* Same as <code>getEncoding(DataSource)</code> except that instead
* of reading the data from an <code>InputStream</code> it uses the
* <code>writeTo</code> method to examine the data. This is more
* efficient in the common case of a <code>DataHandler</code>
* created with an object and a MIME type (for example, a
* "text/plain" String) because all the I/O is done in this
* thread. In the case requiring an <code>InputStream</code> the
* <code>DataHandler</code> uses a thread, a pair of pipe streams,
* and the <code>writeTo</code> method to produce the data. <p>
*
* @param dh the DataHandler
* @return the Content-Transfer-Encoding
* @since JavaMail 1.2
*/
public static String getEncoding(DataHandler dh) {
    /*
     * Heuristic: a DataHandler built from a DataSource usually has a
     * non-null name, one built from an object always has a null name.
     * For the DataSource case the stream-based variant is typically
     * faster, so delegate to it.
     */
    if (dh.getName() != null) {
        return getEncoding(dh.getDataSource());
    }

    ContentType type;
    try {
        type = new ContentType(dh.getContentType());
    } catch (Exception ex) {
        return "base64";    // unparsable content type: play it safe
    }

    // Text: examine all bytes.  Non-text: break out at the first
    // non-US-ASCII character, honouring mail.mime.encodeeol.strict.
    boolean text = type.match("text/*");
    AsciiOutputStream probe = text
            ? new AsciiOutputStream(false, false)
            : new AsciiOutputStream(true, encodeEolStrict);
    try {
        dh.writeTo(probe);
    } catch (IOException ex) {
        // deliberately ignored; the probe's verdict is used regardless
    }

    int verdict = probe.getAscii();
    if (verdict == ALL_ASCII) {
        return "7bit";                  // all ASCII
    }
    if (text && verdict == MOSTLY_ASCII) {
        return "quoted-printable";      // mostly ASCII text
    }
    // mostly binary text, or non-text with at least one non-ASCII byte
    return "base64";
}
/**
* Decode the given input stream. The Input stream returned is
* the decoded input stream. All the encodings defined in RFC 2045
* are supported here. They include "base64", "quoted-printable",
* "7bit", "8bit", and "binary". In addition, "uuencode" is also
* supported. <p>
*
* In the current implementation, if the
* <code>mail.mime.ignoreunknownencoding</code> system property is set to
* <code>"true"</code>, unknown encoding values are ignored and the
* original InputStream is returned.
*
* @param is input stream
* @param encoding the encoding of the stream.
* @return decoded input stream.
* @exception MessagingException if the encoding is unknown
*/
public static InputStream decode(InputStream is, String encoding)
        throws MessagingException {
    // A missing encoding means there is nothing to undo (RFC 2045 treats
    // an absent Content-Transfer-Encoding as "7bit").  Return the stream
    // unchanged, as encode() does for a null encoding, rather than
    // failing with a NullPointerException.
    if (encoding == null) {
        return is;
    }

    if (encoding.equalsIgnoreCase("base64")) {
        return new BASE64DecoderStream(is);
    }
    if (encoding.equalsIgnoreCase("quoted-printable")) {
        return new QPDecoderStream(is);
    }
    if (encoding.equalsIgnoreCase("uuencode")
            || encoding.equalsIgnoreCase("x-uuencode")
            || encoding.equalsIgnoreCase("x-uue")) {
        return new UUDecoderStream(is);
    }
    // Identity encodings: the bytes are already in their final form.
    if (encoding.equalsIgnoreCase("binary")
            || encoding.equalsIgnoreCase("7bit")
            || encoding.equalsIgnoreCase("8bit")) {
        return is;
    }

    // Unknown encoding: an error unless the
    // mail.mime.ignoreunknownencoding System property asked us to
    // tolerate it, in which case the original stream is returned.
    if (!ignoreUnknownEncoding) {
        throw new MessagingException("Unknown encoding: " + encoding);
    }
    return is;
}
/**
* Wrap an encoder around the given output stream.
* All the encodings defined in RFC 2045 are supported here.
* They include "base64", "quoted-printable", "7bit", "8bit" and
* "binary". In addition, "uuencode" is also supported.
*
* @param os output stream
* @param encoding the encoding of the stream.
* @return output stream that applies the
* specified encoding.
* @exception MessagingException if the encoding is unknown
*/
public static OutputStream encode(OutputStream os, String encoding)
        throws MessagingException {
    // No encoding requested: hand the stream back untouched.
    if (encoding == null) {
        return os;
    }
    if (encoding.equalsIgnoreCase("base64")) {
        return new BASE64EncoderStream(os);
    }
    if (encoding.equalsIgnoreCase("quoted-printable")) {
        return new QPEncoderStream(os);
    }

    boolean uuencoded = encoding.equalsIgnoreCase("uuencode")
            || encoding.equalsIgnoreCase("x-uuencode")
            || encoding.equalsIgnoreCase("x-uue");
    if (uuencoded) {
        return new UUEncoderStream(os);
    }

    // The identity encodings need no wrapper; anything else is an error.
    boolean identity = encoding.equalsIgnoreCase("binary")
            || encoding.equalsIgnoreCase("7bit")
            || encoding.equalsIgnoreCase("8bit");
    if (!identity) {
        throw new MessagingException("Unknown encoding: " + encoding);
    }
    return os;
}
/**
* Wrap an encoder around the given output stream.
* All the encodings defined in RFC 2045 are supported here.
* They include "base64", "quoted-printable", "7bit", "8bit" and
* "binary". In addition, "uuencode" is also supported.
* The <code>filename</code> parameter is used with the "uuencode"
* encoding and is included in the encoded output.
*
* @param os output stream
* @param encoding the encoding of the stream.
* @param filename name for the file being encoded (only used
* with uuencode)
* @return output stream that applies the
* specified encoding.
* @exception MessagingException for unknown encodings
* @since JavaMail 1.2
*/
public static OutputStream encode(OutputStream os, String encoding,
        String filename) throws MessagingException {
    // No encoding requested: hand the stream back untouched.
    if (encoding == null) {
        return os;
    }
    if (encoding.equalsIgnoreCase("base64")) {
        return new BASE64EncoderStream(os);
    }
    if (encoding.equalsIgnoreCase("quoted-printable")) {
        return new QPEncoderStream(os);
    }

    // uuencode is the only encoding that makes use of the file name.
    boolean uuencoded = encoding.equalsIgnoreCase("uuencode")
            || encoding.equalsIgnoreCase("x-uuencode")
            || encoding.equalsIgnoreCase("x-uue");
    if (uuencoded) {
        return new UUEncoderStream(os, filename);
    }

    // The identity encodings need no wrapper; anything else is an error.
    boolean identity = encoding.equalsIgnoreCase("binary")
            || encoding.equalsIgnoreCase("7bit")
            || encoding.equalsIgnoreCase("8bit");
    if (!identity) {
        throw new MessagingException("Unknown encoding: " + encoding);
    }
    return os;
}
/**
* Encode a RFC 822 "text" token into mail-safe form as per
* RFC 2047. <p>
*
* The given Unicode string is examined for non US-ASCII
* characters. If the string contains only US-ASCII characters,
* it is returned as-is. If the string contains non US-ASCII
* characters, it is first character-encoded using the platform's
* default charset, then transfer-encoded using either the B or
* Q encoding. The resulting bytes are then returned as a Unicode
* string containing only ASCII characters. <p>
*
* Note that this method should be used to encode only
* "unstructured" RFC 822 headers. <p>
*
* Example of usage:
* <blockquote><pre>
*
* MimePart part = ...
* String rawvalue = "FooBar Mailer, Japanese version 1.1";
* try {
* // If we know for sure that rawvalue contains only US-ASCII
* // characters, we can skip the encoding part
* part.setHeader("X-mailer", MimeUtility.encodeText(rawvalue));
* } catch (UnsupportedEncodingException e) {
* // encoding failure
* } catch (MessagingException me) {
* // setHeader() failure
* }
*
* </pre></blockquote><p>
*
* @param text Unicode string
* @return Unicode string containing only US-ASCII characters
* @exception UnsupportedEncodingException if the encoding fails
*/
public static String encodeText(String text)
        throws UnsupportedEncodingException {
    // Neither a charset nor a transfer encoding is specified here; the
    // three-argument overload handles the null values.
    final String unspecifiedCharset = null;
    final String unspecifiedEncoding = null;
    return encodeText(text, unspecifiedCharset, unspecifiedEncoding);
}
/**
* Encode a RFC 822 "text" token into mail-safe form as per
* RFC 2047. <p>
*
* The given Unicode string is examined for non US-ASCII
* characters. If the string contains only US-ASCII characters,
* it is returned as-is. If the string contains non US-ASCII
* characters, it is first character-encoded using the specified
* charset, then transfer-encoded using either the B or Q encoding.
* The resulting bytes are then returned as a Unicode string
* containing only ASCII characters. <p>
*
* Note that this method should be used to encode only
* "unstructured" RFC 822 headers.
*
* @param text the header value
* @param charset the charset. If this parameter is null, the
* platform's default charset is used.
* @param encoding the encoding to be used. Currently supported
* values are "B" and "Q". If this parameter is null, then
* the "Q" encoding is used if most of characters to be
* encoded are in the ASCII charset, otherwise "B" encoding
* is used.
* @return Unicode string containing only US-ASCII characters
* @exception UnsupportedEncodingException if the charset
* conversion failed.
*/
public static String encodeText(String text, String charset,
        String encoding) throws UnsupportedEncodingException {
    // An RFC 822 "text" token is being encoded, not a "word" token, so
    // the shared implementation is told that it is not encoding a word.
    final boolean encodingWord = false;
    return encodeWord(text, charset, encoding, encodingWord);
}
/**
* Decode "unstructured" headers, that is, headers that are defined
* as '*text' as per RFC 822. <p>
*
* The string is decoded using the algorithm specified in
* RFC 2047, Section 6.1. If the charset-conversion fails
* for any sequence, an UnsupportedEncodingException is thrown.
* If the String is not an RFC 2047 style encoded header, it is
* returned as-is. <p>
*
* Example of usage:
* <blockquote><pre>
*
* MimePart part = ...
* String rawvalue = null;
* String value = null;
* try {
* if ((rawvalue = part.getHeader("X-mailer")[0]) != null)
* value = MimeUtility.decodeText(rawvalue);
* } catch (UnsupportedEncodingException e) {
* // Don't care
* value = rawvalue;
* } catch (MessagingException me) { }
*
* return value;
*
* </pre></blockquote><p>
*
* @param etext the possibly encoded value
* @return the decoded text
* @exception UnsupportedEncodingException if the charset
* conversion failed.
*/
public static String decodeText(String etext)
        throws UnsupportedEncodingException {
    /*
     * Quick exit for the common case: without a "=?" anywhere in the
     * string there cannot be an encoded-word, so the input is returned
     * as-is and none of the decoding machinery runs.
     */
    if (etext.indexOf("=?") == -1) {
        return etext;
    }

    /*
     * Encoded-words are separated by linear white space (RFC 2047,
     * Section 6.1): SPACE, HT, CR or NL.  The tokenizer hands back the
     * delimiters too, one character per token.
     */
    final String lwsp = " \t\n\r";
    StringTokenizer tokens = new StringTokenizer(etext, lwsp, true);

    StringBuilder decoded = new StringBuilder();    // decode buffer
    StringBuilder pendingWs = new StringBuilder();  // white space buffer
    boolean prevWasEncoded = false;

    while (tokens.hasMoreTokens()) {
        String token = tokens.nextToken();
        char lead = token.charAt(0);

        // White space is only collected; whether it is emitted depends
        // on the words on either side of it.
        if (lwsp.indexOf(lead) >= 0) {
            pendingWs.append(lead);
            continue;
        }

        String word;
        boolean keepWhitespace;
        try {
            word = decodeWord(token);
            // A genuine encoded-word: white space between two adjacent
            // encoded-words is dropped, otherwise it is kept.
            keepWhitespace = !prevWasEncoded;
            prevWasEncoded = true;
        } catch (ParseException pex) {
            // Not an encoded-word.  In non-strict mode, look for
            // encoded-words buried inside the token.
            String inner = decodeStrict ? token : decodeInnerWords(token);
            if (inner != token) {
                // A different String object means something was decoded.
                // Drop the white space only for "encoded followed by
                // encoded".
                keepWhitespace =
                        !(prevWasEncoded && token.startsWith("=?"));
                // did the original end with an encoded-word?
                prevWasEncoded = token.endsWith("?=");
            } else {
                keepWhitespace = true;
                prevWasEncoded = false;
            }
            word = inner;
        }

        if (keepWhitespace) {
            decoded.append(pendingWs);  // no-op when nothing was collected
        }
        decoded.append(word);
        pendingWs.setLength(0);         // start collecting afresh
    }

    decoded.append(pendingWs);          // trailing white space
    return decoded.toString();
}
/**
* Encode a RFC 822 "word" token into mail-safe form as per
* RFC 2047. <p>
*
* The given Unicode string is examined for non US-ASCII
* characters. If the string contains only US-ASCII characters,
* it is returned as-is. If the string contains non US-ASCII
* characters, it is first character-encoded using the platform's
* default charset, then transfer-encoded using either the B or
* Q encoding. The resulting bytes are then returned as a Unicode
* string containing only ASCII characters. <p>
*
* This method is meant to be used when creating RFC 822 "phrases".
* The InternetAddress class, for example, uses this to encode
* its 'phrase' component.
*
* @param word Unicode string
* @return Unicode string containing only US-ASCII
* characters.
* @exception UnsupportedEncodingException if the encoding fails
*/
public static String encodeWord(String word)
        throws UnsupportedEncodingException {
    // Neither a charset nor a transfer encoding is specified here; the
    // three-argument overload handles the null values.
    final String unspecifiedCharset = null;
    final String unspecifiedEncoding = null;
    return encodeWord(word, unspecifiedCharset, unspecifiedEncoding);
}
/**
* Encode a RFC 822 "word" token into mail-safe form as per
* RFC 2047. <p>
*
* The given Unicode string is examined for non US-ASCII
* characters. If the string contains only US-ASCII characters,
* it is returned as-is. If the string contains non US-ASCII
* characters, it is first character-encoded using the specified
* charset, then transfer-encoded using either the B or Q encoding.
* The resulting bytes are then returned as a Unicode string
* containing only ASCII characters. <p>
*
* @param word Unicode string
* @param charset the MIME charset
* @param encoding the encoding to be used. Currently supported
* values are "B" and "Q". If this parameter is null, then
* the "Q" encoding is used if most of characters to be
* encoded are in the ASCII charset, otherwise "B" encoding
* is used.
* @return Unicode string containing only US-ASCII characters
* @exception UnsupportedEncodingException if the encoding fails
*/
public static String encodeWord(String word, String charset,
        String encoding) throws UnsupportedEncodingException {
    // An RFC 822 "word" token is being encoded, so the shared
    // implementation is told to apply the stricter "word" rules.
    final boolean encodingWord = true;
    return encodeWord(word, charset, encoding, encodingWord);
}
/*
* Encode the given string. The parameter 'encodingWord' should
* be true if a RFC 822 "word" token is being encoded and false if a
* RFC 822 "text" token is being encoded. This is because the
* "Q" encoding defined in RFC 2047 has more restrictions when
* encoding "word" tokens. (Sigh)
*/
private static String encodeWord(String string, String charset,
        String encoding, boolean encodingWord)
        throws UnsupportedEncodingException {
    // Pure US-ASCII input is already mail-safe.
    int asciiClass = checkAscii(string);
    if (asciiClass == ALL_ASCII) {
        return string;
    }

    // Work out both names of the charset: the Java name used for the
    // byte conversion and the MIME name written into the encoded-word.
    String javaName;
    String mimeName;
    if (charset == null) {
        javaName = getDefaultJavaCharset();
        mimeName = getDefaultMIMECharset();
    } else {
        javaName = javaCharset(charset);
        mimeName = charset;
    }

    // Without an explicit transfer encoding, prefer "Q" unless the
    // text is mostly non-ASCII.
    String transfer = encoding;
    if (transfer == null) {
        transfer = (asciiClass != MOSTLY_NONASCII) ? "Q" : "B";
    }

    boolean useBase64 = transfer.equalsIgnoreCase("B");
    if (!useBase64 && !transfer.equalsIgnoreCase("Q")) {
        throw new UnsupportedEncodingException(
                "Unknown transfer encoding: " + transfer);
    }

    // As per RFC 2047 an encoded-word should not exceed 75 bytes.
    // 7 = size of "=?", '?', 'B'/'Q', '?', "?="
    int available = 75 - 7 - mimeName.length();
    String prefix = "=?" + mimeName + "?" + transfer + "?";

    StringBuilder out = new StringBuilder();
    doEncode(string, useBase64, javaName, available, prefix,
            true, encodingWord, out);
    return out.toString();
}
private static void doEncode(String string, boolean b64,
        String jcharset, int avail, String prefix,
        boolean first, boolean encodingWord, StringBuilder buf)
        throws UnsupportedEncodingException {
    // Charset-convert up front so the encoded length can be measured.
    byte[] raw = string.getBytes(jcharset);
    int encodedLen = b64
            ? BEncoderStream.encodedLength(raw)                 // "B"
            : QEncoderStream.encodedLength(raw, encodingWord);  // "Q"

    int size = string.length();
    if (encodedLen > avail && size > 1) {
        // Too long for a single encoded-word: cut the input in half and
        // encode each half separately, taking care not to separate the
        // two chars of a Unicode surrogate pair.
        int mid = size / 2;
        if (Character.isHighSurrogate(string.charAt(mid - 1))) {
            mid--;
        }
        if (mid > 0) {
            doEncode(string.substring(0, mid), b64, jcharset,
                    avail, prefix, first, encodingWord, buf);
        }
        doEncode(string.substring(mid), b64, jcharset,
                avail, prefix, false, encodingWord, buf);
        return;
    }

    // Fits (or cannot be split further): run the bytes through the
    // chosen encoder into an in-memory sink.
    ByteArrayOutputStream sink = new ByteArrayOutputStream();
    OutputStream encoder;
    if (b64) {
        encoder = new BEncoderStream(sink);
    } else {
        encoder = new QEncoderStream(sink, encodingWord);
    }
    try {
        encoder.write(raw);
        encoder.close();
    } catch (IOException ioex) {
        // ignored, as in every other in-memory encode in this class
    }

    // Every encoded-word after the first is set off from its
    // predecessor: by a continuation line if folding is done here,
    // otherwise by a single space (the line is folded later).
    if (!first) {
        buf.append(foldEncodedWords ? "\r\n " : " ");
    }
    buf.append(prefix);
    // The encoder output is copied byte by byte, each byte cast to a char.
    for (byte b : sink.toByteArray()) {
        buf.append((char) b);
    }
    buf.append("?=");   // terminate the current sequence
}
/**
* The string is parsed using the rules in RFC 2047 and RFC 2231 for
* parsing an "encoded-word". If the parse fails, a ParseException is
* thrown. Otherwise, it is transfer-decoded, and then
* charset-converted into Unicode. If the charset-conversion
* fails, an UnsupportedEncodingException is thrown.<p>
*
* @param eword the encoded value
* @return the decoded word
* @exception ParseException if the string is not an
* encoded-word as per RFC 2047 and RFC 2231.
* @exception UnsupportedEncodingException if the charset
* conversion failed.
*/
public static String decodeWord(String eword)
        throws ParseException, UnsupportedEncodingException {
    if (!eword.startsWith("=?")) {  // not an encoded word
        throw new ParseException(
                "encoded word does not start with \"=?\": " + eword);
    }

    // charset: everything between the leading "=?" and the next '?'
    int start = 2;
    int pos = eword.indexOf('?', start);
    if (pos == -1) {
        throw new ParseException(
                "encoded word does not include charset: " + eword);
    }
    String charset = eword.substring(start, pos);
    int lpos = charset.indexOf('*');    // RFC 2231 language specified?
    if (lpos >= 0) {                    // yes, throw it away
        charset = charset.substring(0, lpos);
    }
    charset = javaCharset(charset);

    // encoding: the single token between the second and third '?'
    start = pos + 1;
    pos = eword.indexOf('?', start);
    if (pos == -1) {
        throw new ParseException(
                "encoded word does not include encoding: " + eword);
    }
    String encoding = eword.substring(start, pos);

    // encoded-sequence: everything up to the terminating "?="
    start = pos + 1;
    pos = eword.indexOf("?=", start);
    if (pos == -1) {
        throw new ParseException(
                "encoded word does not end with \"?=\": " + eword);
    }
    /*
     * XXX - in strict mode the "?=" arguably should be required to be the
     * very end of eword; that check is deliberately left out for
     * compatibility.  Text following the "?=" is handled below.
     */
    String word = eword.substring(start, pos);

    try {
        String decodedWord;
        if (word.length() > 0) {
            // Extract the bytes from word
            ByteArrayInputStream bis =
                    new ByteArrayInputStream(ASCIIUtility.getBytes(word));

            // Get the appropriate decoder
            InputStream is;
            if (encoding.equalsIgnoreCase("B")) {
                is = new BASE64DecoderStream(bis);
            } else if (encoding.equalsIgnoreCase("Q")) {
                is = new QDecoderStream(bis);
            } else {
                throw new UnsupportedEncodingException(
                        "unknown encoding: " + encoding);
            }

            // For b64 & q, size of decoded word <= size of word, so a
            // buffer as large as the encoded input always suffices and
            // no growing output stream is needed.
            int count = bis.available();
            byte[] bytes = new byte[count];
            // count is set to the actual number of decoded bytes
            count = is.read(bytes, 0, count);

            // Finally, convert the decoded bytes into a String using
            // the specified charset
            decodedWord = count <= 0 ? ""
                    : new String(bytes, 0, count, charset);
        } else {
            // no characters to decode, return empty string
            decodedWord = "";
        }

        if (pos + 2 < eword.length()) {
            // there's still more text in the string
            String rest = eword.substring(pos + 2);
            if (!decodeStrict) {
                rest = decodeInnerWords(rest);
            }
            decodedWord += rest;
        }
        return decodedWord;
    } catch (UnsupportedEncodingException uex) {
        // explicitly catch and rethrow this exception, otherwise
        // the below IOException catch will swallow this up!
        throw uex;
    } catch (IOException ioex) {
        // Shouldn't happen.
        throw new ParseException(ioex.toString());
    } catch (IllegalArgumentException iex) {
        /*
         * Some charset names make the JDK throw an
         * IllegalArgumentException instead of the expected
         * UnsupportedEncodingException.  Translate it to the documented
         * exception type, keeping the original exception as the cause so
         * the underlying failure is not lost.
         */
        UnsupportedEncodingException uee =
                new UnsupportedEncodingException(charset);
        uee.initCause(iex);
        throw uee;
    }
}
/**
 * Look for encoded words within a word.  The MIME spec doesn't
 * allow this, but many broken mailers, especially Japanese mailers,
 * produce such incorrect encodings. <p>
 *
 * Each complete <code>=?charset?encoding?text?=</code> sequence found in
 * <code>word</code> is handed to <code>decodeWord</code>; if that fails
 * with a <code>ParseException</code> the sequence is kept verbatim.
 * Text between and after the sequences is copied through unchanged.
 * If no complete sequence is found, <code>word</code> itself is returned.
 *
 * @param	word	the text that may contain embedded encoded words
 * @return		the text with every embedded encoded word decoded
 * @exception	UnsupportedEncodingException if <code>decodeWord</code>
 *			reports an unsupported charset or encoding
 */
private static String decodeInnerWords(String word)
        throws UnsupportedEncodingException {
    int start = 0, i;
    StringBuilder buf = new StringBuilder();
    while ((i = word.indexOf("=?", start)) >= 0) {
        // find first '?' after opening '=?' - end of charset
        int end = word.indexOf('?', i + 2);
        if (end < 0)
            break;
        // find next '?' after that - end of encoding
        end = word.indexOf('?', end + 1);
        if (end < 0)
            break;
        // find terminating '?='
        end = word.indexOf("?=", end + 1);
        if (end < 0)
            break;
        // Only now that the encoded word is known to be complete do we
        // copy the plain text that precedes it.  Copying it before the
        // checks above (as was done previously) duplicated that text when
        // a later, unterminated "=?" made the loop bail out, because the
        // tail append below starts again from 'start'.
        buf.append(word.substring(start, i));
        String s = word.substring(i, end + 2);
        try {
            s = decodeWord(s);
        } catch (ParseException pex) {
            // ignore it, just use the original string
        }
        buf.append(s);
        start = end + 2;
    }
    if (start == 0)
        return word;	// nothing was decoded, hand back the input as is
    if (start < word.length())
        buf.append(word.substring(start));
    return buf.toString();
}
/**
 * Quote a word if it contains any character from the given
 * 'specials' list, or any other character that forces quoting.<p>
 *
 * The <code>HeaderTokenizer</code> class defines two special
 * sets of delimiters - MIME and RFC 822. <p>
 *
 * A null or empty word yields an empty quoted string.  If the word
 * contains a double quote, backslash, CR or LF, those characters are
 * backslash-escaped (the LF of a CRLF pair is not escaped separately)
 * and the whole word is wrapped in double quotes.  Otherwise the word
 * is wrapped in double quotes only if it contains a control character,
 * a character at or above DEL (unless UTF-8 is allowed), or one of the
 * specials. <p>
 *
 * This method is typically used during the generation of
 * RFC 822 and MIME header fields.
 *
 * @param	word	word to be quoted
 * @param	specials the set of special characters
 * @return		the possibly quoted word
 * @see	javax.mail.internet.HeaderTokenizer#MIME
 * @see	javax.mail.internet.HeaderTokenizer#RFC822
 */
public static String quote(String word, String specials) {
    final int len = (word == null) ? 0 : word.length();
    if (len == 0) {
        return "\"\"";	// an empty string is handled specially
    }

    boolean mustQuote = false;
    for (int idx = 0; idx < len; idx++) {
        final char ch = word.charAt(idx);
        if (ch == '"' || ch == '\\' || ch == '\r' || ch == '\n') {
            // From here on every character may need escaping, so build
            // the escaped form of the remainder and return right away.
            StringBuilder escaped = new StringBuilder(len + 3);
            escaped.append('"');
            escaped.append(word.substring(0, idx));
            int prev = 0;
            for (int k = idx; k < len; k++) {
                final char cur = word.charAt(k);
                switch (cur) {
                case '"':
                case '\\':
                case '\r':
                    escaped.append('\\');
                    break;
                case '\n':
                    // the LF of a CRLF is covered by the escaped CR
                    if (prev != '\r') {
                        escaped.append('\\');
                    }
                    break;
                default:
                    break;
                }
                escaped.append(cur);
                prev = cur;
            }
            escaped.append('"');
            return escaped.toString();
        }
        if (ch < 040 || (ch >= 0177 && !allowUtf8)
                || specials.indexOf(ch) >= 0) {
            // these characters cause the string to be quoted
            mustQuote = true;
        }
    }

    if (!mustQuote) {
        return word;
    }
    StringBuilder quoted = new StringBuilder(len + 2);
    quoted.append('"');
    quoted.append(word);
    quoted.append('"');
    return quoted.toString();
}
/**
 * Fold a string at linear whitespace so that each line is no longer
 * than 76 characters, if possible.  If there are more than 76
 * non-whitespace characters consecutively, the string is folded at
 * the first whitespace after that sequence.  The parameter
 * <code>used</code> indicates how many characters have been used in
 * the current line; it is usually the length of the header name. <p>
 *
 * Trailing spaces, tabs, CRs and LFs are stripped first.  When folding
 * is disabled the string is returned untouched. <p>
 *
 * Note that line breaks in the string aren't escaped; they probably
 * should be.
 *
 * @param	used	characters used in line so far
 * @param	s	the string to fold
 * @return		the folded string
 * @since		JavaMail 1.4
 */
public static String fold(int used, String s) {
    if (!foldText) {
        return s;
    }

    // Strip trailing spaces and newlines
    int last = s.length() - 1;
    while (last >= 0) {
        char tail = s.charAt(last);
        if (tail != ' ' && tail != '\t' && tail != '\r' && tail != '\n') {
            break;
        }
        last--;
    }
    if (last != s.length() - 1) {
        s = s.substring(0, last + 1);
    }

    // if the string fits now, just return it
    if (used + s.length() <= 76) {
        return makesafe(s);
    }

    // have to actually fold the string
    StringBuilder folded = new StringBuilder(s.length() + 4);
    char prev = 0;
    while (used + s.length() > 76) {
        // Find the last start of a whitespace run that still fits on the
        // line, or failing that the first one after the limit.
        int breakAt = -1;
        final int n = s.length();
        for (int k = 0; k < n && !(breakAt != -1 && used + k > 76); k++) {
            char cur = s.charAt(k);
            boolean curIsWs = (cur == ' ' || cur == '\t');
            boolean prevIsWs = (prev == ' ' || prev == '\t');
            if (curIsWs && !prevIsWs) {
                breakAt = k;
            }
            prev = cur;
        }
        if (breakAt == -1) {
            // no space, use the whole thing
            folded.append(s);
            s = "";
            used = 0;
            break;
        }
        // emit the line, then start the next one with the whitespace
        // character we folded at
        prev = s.charAt(breakAt);
        folded.append(s.substring(0, breakAt));
        folded.append("\r\n");
        folded.append(prev);
        s = s.substring(breakAt + 1);
        used = 1;
    }
    folded.append(s);
    return makesafe(folded);
}
/**
 * If the String or StringBuilder has any embedded newlines,
 * make sure they're followed by whitespace, to prevent header
 * injection errors.
 *
 * Input without any CR or LF is returned as is.  Otherwise the input
 * is re-read line by line: blank lines are dropped, the remaining lines
 * are joined with CRLF, and every line after the first that does not
 * already begin with a space or tab gets a single space prepended.
 */
private static String makesafe(CharSequence s) {
    // Fast path: look for the first CR or LF.
    final int total = s.length();
    int firstBreak = 0;
    while (firstBreak < total) {
        char ch = s.charAt(firstBreak);
        if (ch == '\r' || ch == '\n') {
            break;
        }
        firstBreak++;
    }
    if (firstBreak == s.length()) {
        // went through whole string with no CR or LF
        return s.toString();
    }

    // read the lines in the string and reassemble them,
    // eliminating blank lines and inserting whitespace as necessary
    StringBuilder out = new StringBuilder(s.length() + 1);
    BufferedReader lines = new BufferedReader(new StringReader(s.toString()));
    try {
        for (String line = lines.readLine(); line != null;
                line = lines.readLine()) {
            if (line.trim().length() == 0) {
                continue;	// ignore empty lines
            }
            if (out.length() > 0) {
                out.append("\r\n");
                assert line.length() > 0;	// proven above
                char first = line.charAt(0);
                if (first != ' ' && first != '\t') {
                    out.append(' ');
                }
            }
            out.append(line);
        }
    } catch (IOException ex) {
        // XXX - should never happen when reading from a string
        return s.toString();
    }
    return out.toString();
}
/**
 * Unfold a folded header.  Any line breaks that aren't escaped and
 * are followed by whitespace are removed.
 *
 * A line break (CR, LF or CRLF) preceded by a backslash is kept and the
 * backslash is dropped.  A line break at the very end of the string is
 * removed as well.  Any other line break is left in place.  When folding
 * is disabled the string is returned untouched.
 *
 * @param	s	the string to unfold
 * @return		the unfolded string
 * @since		JavaMail 1.4
 */
public static String unfold(String s) {
    if (!foldText) {
        return s;
    }

    StringBuilder out = null;
    int brk;
    while ((brk = indexOfAny(s, "\r\n")) >= 0) {
        final int total = s.length();
        // 'next' is the index just past the line break (CR, LF or CRLF)
        int next = brk + 1;
        if (next < total && s.charAt(brk) == '\r' && s.charAt(next) == '\n') {
            next++;
        }
        if (out == null) {
            out = new StringBuilder(total);
        }

        if (brk > 0 && s.charAt(brk - 1) == '\\') {
            // there's a backslash before the line break
            // strip it out, but leave in the line break
            out.append(s.substring(0, brk - 1));
            out.append(s.substring(brk, next));
        } else if (next >= total
                || s.charAt(next) == ' ' || s.charAt(next) == '\t') {
            // if next line starts with whitespace,
            // or at the end of the string, remove the line break
            // XXX - next line should always start with whitespace
            out.append(s.substring(0, brk));
        } else {
            // it's not a continuation line, just leave in the newline
            out.append(s.substring(0, next));
        }
        // keep scanning in whatever follows the line break
        s = s.substring(next);
    }

    if (out == null) {
        return s;
    }
    out.append(s);
    return out.toString();
}
/**
 * Return the first index of any of the characters in "any" in "s",
 * or -1 if none are found.  The search covers the whole string.
 *
 * This should be a method on String.
 */
private static int indexOfAny(String s, String any) {
    final int fromBeginning = 0;
    return indexOfAny(s, any, fromBeginning);
}
/**
 * Return the first index at or after "start" of any of the characters
 * in "any" in "s", or -1 if none are found.  An out-of-range index
 * (for example a negative "start") also yields -1.
 */
private static int indexOfAny(String s, String any, int start) {
    try {
        final int limit = s.length();
        int pos = start;
        while (pos < limit) {
            char candidate = s.charAt(pos);
            if (any.indexOf(candidate) >= 0) {
                return pos;
            }
            pos++;
        }
        return -1;
    } catch (StringIndexOutOfBoundsException e) {
        return -1;
    }
}
/**
 * Convert a MIME charset name into a valid Java charset name. <p>
 *
 * The name is looked up case-insensitively in the MIME-to-Java table.
 * A mapped name is only used if <code>Charset.forName</code> accepts it.
 *
 * @param charset	the MIME charset name
 * @return  the Java charset equivalent. If a suitable mapping is
 *		not available, the passed in charset is itself returned.
 */
public static String javaCharset(String charset) {
    if (mime2java == null || charset == null) {
        // no mapping table, or charset parameter is null
        return charset;
    }

    String mapped = mime2java.get(charset.toLowerCase(Locale.ENGLISH));
    if (mapped == null) {
        return charset;
    }

    // verify that the mapped name is valid before trying to use it
    try {
        Charset.forName(mapped);
        return mapped;
    } catch (Exception ex) {
        // charset alias not valid, use original name
        return charset;
    }
}
/**
 * Convert a java charset into its MIME charset name. <p>
 *
 * The name is looked up case-insensitively in the Java-to-MIME table. <p>
 *
 * Note that a future version of JDK (post 1.2) might provide
 * this functionality, in which case, we may deprecate this
 * method then.
 *
 * @param   charset    the JDK charset
 * @return      	the MIME/IANA equivalent. If a mapping
 *			is not possible, the passed in charset itself
 *			is returned.
 * @since		JavaMail 1.1
 */
public static String mimeCharset(String charset) {
    if (charset == null || java2mime == null) {
        // no mapping table or charset param is null
        return charset;
    }

    String mapped = java2mime.get(charset.toLowerCase(Locale.ENGLISH));
    if (mapped != null) {
        return mapped;
    }
    return charset;
}
// Cached result of getDefaultJavaCharset(); null until that method
// has computed it for the first time.
private static
String defaultJavaCharset;
// Cached result of getDefaultMIMECharset(); null until that method
// has computed it for the first time.
private static
String defaultMIMECharset;
/**
 * Get the default charset corresponding to the system's current
 * default locale.  If the System property <code>mail.mime.charset</code>
 * is set, a system charset corresponding to this MIME charset will be
 * returned. <p>
 *
 * The value is computed on first use and cached.  Without
 * <code>mail.mime.charset</code>, the <code>file.encoding</code> System
 * property is used; if that can't be read, the encoding reported by a
 * default <code>InputStreamReader</code> is used, with
 * <code>8859_1</code> as the last resort.
 *
 * @return	the default charset of the system's default locale,
 * 		as a Java charset. (NOT a MIME charset)
 * @since	JavaMail 1.1
 */
public static String getDefaultJavaCharset() {
    if (defaultJavaCharset != null) {
        return defaultJavaCharset;
    }

    /*
     * If mail.mime.charset is set, it controls the default
     * Java charset as well.
     */
    String mimecs = null;
    try {
        mimecs = System.getProperty("mail.mime.charset");
    } catch (SecurityException ex) {
        // ignore it
    }
    if (mimecs != null && mimecs.length() > 0) {
        defaultJavaCharset = javaCharset(mimecs);
        return defaultJavaCharset;
    }

    try {
        defaultJavaCharset = System.getProperty("file.encoding", "8859_1");
    } catch (SecurityException sex) {
        // Not allowed to read the property; ask a reader built with the
        // platform default what encoding it uses.  The stream is never
        // actually read from.
        InputStreamReader probe = new InputStreamReader(new InputStream() {
            @Override
            public int read() {
                return 0;
            }
        });
        String probed = probe.getEncoding();
        defaultJavaCharset = (probed == null) ? "8859_1" : probed;
    }
    return defaultJavaCharset;
}
/*
 * Get the default MIME charset for this locale.
 *
 * The value of the mail.mime.charset System property is used if it can
 * be read and is set; otherwise the MIME name of the default Java
 * charset is used.  The result is cached after the first call.
 */
static String getDefaultMIMECharset() {
    if (defaultMIMECharset == null) {
        String configured = null;
        try {
            configured = System.getProperty("mail.mime.charset");
        } catch (SecurityException ex) {
            // ignore it
        }
        defaultMIMECharset = (configured != null)
            ? configured
            : mimeCharset(getDefaultJavaCharset());
    }
    return defaultMIMECharset;
}
// Tables to map MIME charset names to Java names and vice versa.
// XXX - Should eventually use J2SE 1.4 java.nio.charset.Charset
// Both tables are created and filled by the static initializer below;
// keys are stored in lower case.
// mime2java: MIME charset name -> Java charset name (read by javaCharset)
private static
Map<
String,
String>
mime2java;
// java2mime: Java charset name -> MIME charset name (read by mimeCharset)
private static
Map<
String,
String>
java2mime;
// Populate the charset mapping tables: first try the
// /META-INF/javamail.charset.map resource, then fall back to the
// built-in defaults for any table that is still empty.
static {
    java2mime = new HashMap<>(40);
    mime2java = new HashMap<>(14);

    try {
        // Use this class's classloader to load the mapping file
        // XXX - we should use SecuritySupport, but it's in another package
        InputStream in = javax.mail.internet.MimeUtility.class
            .getResourceAsStream("/META-INF/javamail.charset.map");
        if (in != null) {
            try {
                LineInputStream lines = new LineInputStream(in);
                in = lines;	// so that the wrapper is what gets closed

                // Load the JDK-to-MIME charset mapping table
                loadMappings(lines, java2mime);

                // Load the MIME-to-JDK charset mapping table
                loadMappings(lines, mime2java);
            } finally {
                try {
                    in.close();
                } catch (Exception cex) {
                    // ignore
                }
            }
        }
    } catch (Exception ex) {
        // best effort only; the defaults below take over
    }

    // If we didn't load the tables, e.g., because we didn't have
    // permission, load them manually.  The entries here should be
    // the same as the default javamail.charset.map.
    if (java2mime.isEmpty()) {
        // ISO-8859-1 through ISO-8859-9, three Java spellings each
        for (int part = 1; part <= 9; part++) {
            String mimeName = "ISO-8859-" + part;
            java2mime.put("8859_" + part, mimeName);
            java2mime.put("iso8859_" + part, mimeName);
            java2mime.put("iso8859-" + part, mimeName);
        }

        String[][] javaToMime = {
            { "sjis", "Shift_JIS" },
            { "jis", "ISO-2022-JP" },
            { "iso2022jp", "ISO-2022-JP" },
            { "euc_jp", "euc-jp" },
            { "koi8_r", "koi8-r" },
            { "euc_cn", "euc-cn" },
            { "euc_tw", "euc-tw" },
            { "euc_kr", "euc-kr" },
        };
        for (String[] entry : javaToMime) {
            java2mime.put(entry[0], entry[1]);
        }
    }
    if (mime2java.isEmpty()) {
        String[][] mimeToJava = {
            { "iso-2022-cn", "ISO2022CN" },
            { "iso-2022-kr", "ISO2022KR" },
            { "utf-8", "UTF8" },
            { "utf8", "UTF8" },
            { "ja_jp.iso2022-7", "ISO2022JP" },
            { "ja_jp.eucjp", "EUCJIS" },
            { "euc-kr", "KSC5601" },
            { "euckr", "KSC5601" },
            { "us-ascii", "ISO-8859-1" },
            { "x-us-ascii", "ISO-8859-1" },
            { "gb2312", "GB18030" },
            { "cp936", "GB18030" },
            { "ms936", "GB18030" },
            { "gbk", "GB18030" },
        };
        for (String[] entry : mimeToJava) {
            mime2java.put(entry[0], entry[1]);
        }
    }
}
/**
 * Read one mapping table from the stream into <code>table</code>.
 *
 * Reading stops at end of file, on a read error, or at a line that both
 * starts and ends with "--" (the end-of-table marker).  Blank lines and
 * lines starting with "#" are skipped.  Every other line is split on
 * spaces and tabs; its first token, lower-cased, becomes the key and its
 * second token the value.  Lines with fewer than two tokens are ignored.
 */
private static void loadMappings(LineInputStream is,
        Map<String, String> table) {
    for (;;) {
        String line;
        try {
            line = is.readLine();
        } catch (IOException ioex) {
            return;	// error in reading, stop
        }

        if (line == null) {
            return;	// end of file, stop
        }
        if (line.startsWith("--") && line.endsWith("--")) {
            return;	// end of this table
        }

        // ignore empty lines and comments
        boolean blank = line.trim().length() == 0;
        if (blank || line.startsWith("#")) {
            continue;
        }

        // A valid entry is of the form <key><separator><value>
        // where, <separator> := SPACE | HT. Parse this
        StringTokenizer fields = new StringTokenizer(line, " \t");
        if (fields.countTokens() < 2) {
            continue;	// malformed entry, skip it
        }
        String key = fields.nextToken();
        String value = fields.nextToken();
        table.put(key.toLowerCase(Locale.ENGLISH), value);
    }
}
// Result codes returned by the checkAscii methods.
// Every character examined was US-ASCII.
static final int
ALL_ASCII = 1;
// Non-ASCII characters were seen, but ASCII characters outnumber them.
// checkAscii(InputStream, ...) also returns this for all-ASCII data
// containing a line longer than 998 characters.
static final int
MOSTLY_ASCII = 2;
// ASCII characters do not outnumber the non-ASCII ones.
static final int
MOSTLY_NONASCII = 3;
/**
 * Check if the given string contains non US-ASCII characters.
 * @param	s	string
 * @return		ALL_ASCII if all characters in the string
 *			belong to the US-ASCII charset. MOSTLY_ASCII
 *			if more than half of the available characters
 *			are US-ASCII characters. Else MOSTLY_NONASCII.
 */
static int checkAscii(String s) {
    final int total = s.length();
    int nonAsciiCount = 0;
    for (int pos = 0; pos < total; pos++) {
        if (nonascii(s.charAt(pos))) {
            nonAsciiCount++;
        }
    }

    if (nonAsciiCount == 0) {
        return ALL_ASCII;
    }
    // everything that isn't non-ASCII is ASCII
    final int asciiCount = total - nonAsciiCount;
    return (asciiCount > nonAsciiCount) ? MOSTLY_ASCII : MOSTLY_NONASCII;
}
/**
 * Check if the given byte array contains non US-ASCII characters.
 * @param	b	byte array
 * @return		ALL_ASCII if all characters in the string
 *			belong to the US-ASCII charset. MOSTLY_ASCII
 *			if more than half of the available characters
 *			are US-ASCII characters. Else MOSTLY_NONASCII.
 *
 * XXX - this method is no longer used
 */
static int checkAscii(byte[] b) {
    int nonAsciiCount = 0;
    for (byte raw : b) {
        // Mask to the low eight bits so that the byte is treated as an
        // unsigned value rather than a negative integer.
        int unsigned = raw & 0xff;
        if (nonascii(unsigned)) {
            nonAsciiCount++;
        }
    }

    if (nonAsciiCount == 0) {
        return ALL_ASCII;
    }
    // everything that isn't non-ASCII is ASCII
    final int asciiCount = b.length - nonAsciiCount;
    return (asciiCount > nonAsciiCount) ? MOSTLY_ASCII : MOSTLY_NONASCII;
}
/**
 * Check if the given input stream contains non US-ASCII characters.
 * Up to <code>max</code> bytes are checked. If <code>max</code> is
 * set to <code>ALL</code>, then all the bytes available in this
 * input stream are checked. If <code>breakOnNonAscii</code> is true
 * the check terminates when the first non-US-ASCII character is
 * found and MOSTLY_NONASCII is returned. Else, the check continues
 * till <code>max</code> bytes or till the end of stream. <p>
 *
 * An I/O error while reading ends the check as if the end of the
 * stream had been reached.
 *
 * @param	is	the input stream
 * @param	max	maximum bytes to check for. The special value
 *			ALL indicates that all the bytes in this input
 *			stream must be checked.
 * @param	breakOnNonAscii	if <code>true</code>, then terminate the
 *			the check when the first non-US-ASCII character
 *			is found.
 * @return		ALL_ASCII if all characters in the string
 *			belong to the US-ASCII charset. MOSTLY_ASCII
 *			if more than half of the available characters
 *			are US-ASCII characters. Else MOSTLY_NONASCII.
 */
static int checkAscii(InputStream is, int max, boolean breakOnNonAscii) {
    int ascii = 0, non_ascii = 0;
    int len;
    int block = 4096;
    int linelen = 0;
    boolean longLine = false, badEOL = false;
    boolean checkEOL = encodeEolStrict && breakOnNonAscii;
    byte buf[] = null;
    if (max != 0) {
        block = (max == ALL) ? 4096 : Math.min(max, 4096);
        buf = new byte[block];
    }
    // The previous byte must survive from one read() to the next.
    // Resetting it per read (as was done previously) flagged a CRLF pair
    // split across two reads as a bad EOL and missed a bare CR that
    // happened to be the last byte of a read.
    int lastb = 0;
    while (max != 0) {
        try {
            // Never ask for more than what is left of the budget.
            // Always requesting a full block could drive 'max' negative,
            // after which the "max != 0" loop ignored the limit and ran
            // to the end of the stream.
            int want = (max == ALL) ? block : Math.min(block, max);
            if ((len = is.read(buf, 0, want)) == -1)
                break;
            for (int i = 0; i < len; i++) {
                // The '&' operator automatically causes b[i] to
                // be promoted to an int, and we mask out the higher
                // bytes in the int so that the resulting value is
                // not a negative integer.
                int b = buf[i] & 0xff;
                if (checkEOL &&
                        ((lastb == '\r' && b != '\n') ||
                        (lastb != '\r' && b == '\n')))
                    badEOL = true;
                if (b == '\r' || b == '\n')
                    linelen = 0;
                else {
                    linelen++;
                    if (linelen > 998)	// 1000 - CRLF
                        longLine = true;
                }
                if (nonascii(b)) {	// non-ascii
                    if (breakOnNonAscii) // we are done
                        return MOSTLY_NONASCII;
                    else
                        non_ascii++;
                } else
                    ascii++;
                lastb = b;
            }
        } catch (IOException ioex) {
            break;
        }
        if (max != ALL)
            max -= len;
    }

    if (max == 0 && breakOnNonAscii)
        // We have been told to break on the first non-ascii character.
        // We haven't got any non-ascii character yet, but then we
        // have not checked all of the available bytes either. So we
        // cannot say for sure that this input stream is ALL_ASCII,
        // and hence we must play safe and return MOSTLY_NONASCII
        return MOSTLY_NONASCII;

    if (non_ascii == 0) { // no non-us-ascii characters so far
        // If we're looking at non-text data, and we saw CR without LF
        // or vice versa, consider this mostly non-ASCII so that it
        // will be base64 encoded (since the quoted-printable encoder
        // doesn't encode this case properly).
        if (badEOL)
            return MOSTLY_NONASCII;
        // if we've seen a long line, we degrade to mostly ascii
        else if (longLine)
            return MOSTLY_ASCII;
        else
            return ALL_ASCII;
    }
    if (ascii > non_ascii) // mostly ascii
        return MOSTLY_ASCII;
    return MOSTLY_NONASCII;
}
/**
 * Tell whether the given character value counts as non-ASCII for the
 * checkAscii methods: DEL (0177) and everything above it, and every
 * value below space (040) other than CR, LF and TAB.
 */
static final boolean nonascii(int b) {
    if (b >= 0x7f) {
        return true;	// DEL and everything above it
    }
    if (b >= 0x20) {
        return false;	// printable US-ASCII
    }
    // below space: only TAB, LF and CR are acceptable
    return !(b == '\t' || b == '\n' || b == '\r');
}
}
/**
 * An OutputStream that determines whether the data written to
 * it is all ASCII, mostly ASCII, or mostly non-ASCII.
 *
 * Nothing is stored; every byte written is only classified and counted.
 * The verdict is available from <code>getAscii</code>.
 */
class AsciiOutputStream extends OutputStream {
    // stop (by throwing EOFException) at the first non-ASCII byte?
    private boolean stopAtFirstNonAscii;
    // watch for CR without LF and LF without CR?
    private boolean strictEol = false;

    private int asciiCount = 0;
    private int nonAsciiCount = 0;
    // characters seen since the last CR or LF
    private int currentLineLength = 0;
    private boolean sawLongLine = false;
    private boolean sawBadEol = false;
    private int previousByte = 0;
    // non-zero once the answer has been decided early
    private int forcedResult = 0;

    /**
     * @param breakOnNonAscii	if true, writing a non-ASCII byte ends the
     *				check by throwing an EOFException
     * @param encodeEolStrict	if true (and breakOnNonAscii is also true),
     *				unpaired CR or LF characters are detected
     */
    public AsciiOutputStream(boolean breakOnNonAscii, boolean encodeEolStrict) {
        this.stopAtFirstNonAscii = breakOnNonAscii;
        this.strictEol = encodeEolStrict && breakOnNonAscii;
    }

    @Override
    public void write(int b) throws IOException {
        check(b);
    }

    @Override
    public void write(byte b[]) throws IOException {
        write(b, 0, b.length);
    }

    @Override
    public void write(byte b[], int off, int len) throws IOException {
        final int limit = off + len;
        int i = off;
        while (i < limit) {
            check(b[i]);
            i++;
        }
    }

    /**
     * Classify a single byte and update the counters.
     *
     * @exception EOFException if the byte is non-ASCII and this stream
     *			was told to break on non-ASCII data
     */
    private void check(int b) throws IOException {
        final int value = b & 0xff;

        if (strictEol) {
            // bad EOL: CR not followed by LF, or LF not preceded by CR
            boolean afterCr = (previousByte == '\r');
            boolean isLf = (value == '\n');
            if (afterCr != isLf) {
                sawBadEol = true;
            }
        }

        if (value == '\r' || value == '\n') {
            currentLineLength = 0;
        } else {
            currentLineLength++;
            if (currentLineLength > 998) {	// 1000 - CRLF
                sawLongLine = true;
            }
        }

        if (!MimeUtility.nonascii(value)) {
            asciiCount++;
        } else {
            nonAsciiCount++;
            if (stopAtFirstNonAscii) {	// we are done
                forcedResult = MimeUtility.MOSTLY_NONASCII;
                throw new EOFException();
            }
        }
        previousByte = value;
    }

    /**
     * Return ASCII-ness of data stream.
     */
    public int getAscii() {
        if (forcedResult != 0) {
            return forcedResult;
        }

        // If we're looking at non-text data, and we saw CR without LF
        // or vice versa, consider this mostly non-ASCII so that it
        // will be base64 encoded (since the quoted-printable encoder
        // doesn't encode this case properly).
        if (sawBadEol) {
            return MimeUtility.MOSTLY_NONASCII;
        }

        if (nonAsciiCount == 0) {	// no non-us-ascii characters so far
            // if we've seen a long line, we degrade to mostly ascii
            return sawLongLine
                ? MimeUtility.MOSTLY_ASCII
                : MimeUtility.ALL_ASCII;
        }

        return (asciiCount > nonAsciiCount)
            ? MimeUtility.MOSTLY_ASCII
            : MimeUtility.MOSTLY_NONASCII;
    }
}