/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.extractor;
import static org.apache.poi.hssf.model.InternalWorkbook.OLD_WORKBOOK_DIR_ENTRY_NAME;
import static org.apache.poi.hssf.model.InternalWorkbook.WORKBOOK_DIR_ENTRY_NAMES;

import java.io.IOException;
import java.io.InputStream;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.poi.EncryptedDocumentException;
import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.POITextExtractor;
import org.apache.poi.hssf.OldExcelFormatException;
import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
import org.apache.poi.poifs.crypt.Decryptor;
import org.apache.poi.poifs.crypt.EncryptionInfo;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.IOUtils;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
/**
* Figures out the correct POIOLE2TextExtractor for your supplied
* document, and returns it.
*
* <p>Note 1 - will fail for many file formats if the POI Scratchpad jar is
* not present on the runtime classpath</p>
* <p>Note 2 - for text extractor creation across all formats, use
* {@link org.apache.poi.extractor.ExtractorFactory} contained within
* the OOXML jar.</p>
* <p>Note 3 - rather than using this, for most cases you would be better
* off switching to <a href="http://tika.apache.org">Apache Tika</a> instead!</p>
*/
@
SuppressWarnings("WeakerAccess")
public class
OLE2ExtractorFactory {
/** Logger used to report missing optional POI jars and embedded resources that had to be skipped. */
private static final POILogger LOGGER = POILogFactory.getLogger(OLE2ExtractorFactory.class);
/**
 * Per-thread preference: should the current thread favour event based
 * extractors over usermodel based ones? Every thread starts out with
 * {@link Boolean#FALSE}.
 */
private static final ThreadLocal<Boolean> threadPreferEventExtractors =
    new ThreadLocal<Boolean>() {
        @Override
        protected Boolean initialValue() {
            return Boolean.FALSE;
        }
    };
/**
 * Should all threads prefer event based over usermodel based extractors?
 * {@code null} means "no global preference", in which case the per-thread
 * setting is consulted instead.
 *
 * <p>Declared {@code volatile} because it is a process-wide setting: it is
 * written by {@link #setAllThreadsPreferEventExtractors(Boolean)} and read
 * from whichever thread creates an extractor, so a plain static field would
 * give no guarantee that an update ever becomes visible to other threads.</p>
 */
private static volatile Boolean allPreferEventExtractors;
/**
 * Reports the calling thread's own preference between event based and
 * usermodel based extractors.
 * (usermodel extractors tend to be more accurate, but use more memory)
 *
 * @return {@code true} if this thread prefers event based extractors;
 *         {@code false} (the default) otherwise
 */
public static boolean getThreadPrefersEventExtractors() {
    final Boolean threadSetting = threadPreferEventExtractors.get();
    return threadSetting.booleanValue();
}
/**
 * Reports the preference that applies to all threads, if one has been set.
 * (usermodel extractors tend to be more accurate, but use more memory)
 *
 * @return the all-threads preference, or {@code null} when none is set and
 *         the thread level setting (which defaults to false) applies
 */
public static Boolean getAllThreadsPreferEventExtractors() {
    return OLE2ExtractorFactory.allPreferEventExtractors;
}
/**
 * Records whether the calling thread prefers event based over usermodel
 * based extractors. The value is only consulted while the All Threads
 * setting is null.
 *
 * @param preferEventExtractors {@code true} to prefer event based extractors
 *        on this thread
 */
public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
    final Boolean boxed = Boolean.valueOf(preferEventExtractors);
    threadPreferEventExtractors.set(boxed);
}
/**
 * Records the preference that applies to all threads. A non-null value
 * takes precedence over the Thread level setting.
 *
 * @param preferEventExtractors the all-threads preference, or {@code null}
 *        to fall back to the thread level setting
 */
public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
    OLE2ExtractorFactory.allPreferEventExtractors = preferEventExtractors;
}
/**
 * Should this thread use event based extractors if available?
 * Checks the all-threads one first, then thread specific.
 *
 * @return the all-threads setting when it is non-null, otherwise the
 *         calling thread's own setting
 */
protected static boolean getPreferEventExtractor() {
    // Read the shared field exactly once. Checking it for null and then
    // reading it again for the return would allow another thread to reset
    // it to null in between, making the unboxing throw a NullPointerException.
    final Boolean allThreads = allPreferEventExtractors;
    if (allThreads != null) {
        return allThreads;
    }
    return threadPreferEventExtractors.get();
}
/**
 * Builds the extractor for the root directory of the given file system
 * and returns it as a {@link POIOLE2TextExtractor}.
 *
 * @param fs the file system whose root directory is inspected
 * @return the extractor created for the root directory
 * @throws IOException if creating the extractor fails with an I/O error
 */
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException {
    // Starting from the root of the FS, the result is only ever an OLE2 one
    final POITextExtractor rootExtractor = createExtractor(fs.getRoot());
    return (POIOLE2TextExtractor) rootExtractor;
}
/**
 * Builds the extractor for the root directory of the given file system
 * and returns it as a {@link POIOLE2TextExtractor}.
 *
 * @param fs the file system whose root directory is inspected
 * @return the extractor created for the root directory
 * @throws IOException if creating the extractor fails with an I/O error
 */
public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException {
    // Starting from the root of the FS, the result is only ever an OLE2 one
    final POITextExtractor rootExtractor = createExtractor(fs.getRoot());
    return (POIOLE2TextExtractor) rootExtractor;
}
/**
 * Builds the extractor for the root directory of the given file system
 * and returns it as a {@link POIOLE2TextExtractor}.
 *
 * @param fs the file system whose root directory is inspected
 * @return the extractor created for the root directory
 * @throws IOException if creating the extractor fails with an I/O error
 */
public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException {
    // Starting from the root of the FS, the result is only ever an OLE2 one
    final POITextExtractor rootExtractor = createExtractor(fs.getRoot());
    return (POIOLE2TextExtractor) rootExtractor;
}
/**
 * Creates a text extractor for the document held in the supplied stream.
 *
 * <p>If {@code org.apache.poi.extractor.ExtractorFactory} can be loaded, the
 * work is delegated to its {@code createExtractor(InputStream)} method via
 * reflection. Otherwise the stream is wrapped in a {@link NPOIFSFileSystem}
 * and handled by this class.</p>
 *
 * @param input the stream holding the document
 * @return the extractor created for the stream
 * @throws IOException if the non-reflective fallback path fails with an I/O error
 * @throws IllegalArgumentException if the delegate rejects the document, or
 *         if the reflective call fails for any other reason (the original
 *         failure is attached as the cause)
 */
public static POITextExtractor createExtractor(InputStream input) throws IOException {
    Class<?> cls = getOOXMLClass();
    if (cls != null) {
        // Use Reflection to get us the full OOXML-enabled version
        try {
            Method m = cls.getDeclaredMethod("createExtractor", InputStream.class);
            return (POITextExtractor) m.invoke(null, input);
        } catch (InvocationTargetException ite) {
            // Method.invoke wraps whatever the delegate throws, so its
            // IllegalArgumentExceptions never reach the plain catch below.
            // Unwrap them here so callers see the delegate's own exception
            // rather than a generic wrapper around it.
            Throwable cause = ite.getCause();
            if (cause instanceof IllegalArgumentException) {
                throw (IllegalArgumentException) cause;
            }
            throw new IllegalArgumentException("Error creating Extractor for InputStream", ite);
        } catch (IllegalArgumentException iae) {
            throw iae;
        } catch (Exception e) {
            throw new IllegalArgumentException("Error creating Extractor for InputStream", e);
        }
    } else {
        // Best hope it's OLE2....
        return createExtractor(new NPOIFSFileSystem(input));
    }
}
/**
 * Tries to load {@code org.apache.poi.extractor.ExtractorFactory} through
 * the class loader that loaded this class.
 *
 * @return the loaded class, or {@code null} (after logging a warning) when
 *         it cannot be found
 */
private static Class<?> getOOXMLClass() {
    final ClassLoader loader = OLE2ExtractorFactory.class.getClassLoader();
    try {
        return loader.loadClass("org.apache.poi.extractor.ExtractorFactory");
    } catch (ClassNotFoundException e) {
        // Missing class is not fatal here: callers treat null as "not available"
        LOGGER.log(POILogger.WARN, "POI OOXML jar missing");
        return null;
    }
}
/**
 * Loads {@code org.apache.poi.extractor.OLE2ScratchpadExtractorFactory}
 * through the class loader that loaded this class.
 *
 * @return the loaded class, never {@code null}
 * @throws IllegalStateException if the class cannot be found; the
 *         underlying {@link ClassNotFoundException} is attached as the cause
 */
private static Class<?> getScratchpadClass() {
    try {
        return OLE2ExtractorFactory.class.getClassLoader().loadClass(
                "org.apache.poi.extractor.OLE2ScratchpadExtractorFactory"
        );
    } catch (ClassNotFoundException e) {
        LOGGER.log(POILogger.ERROR, "POI Scratchpad jar missing");
        // Keep the original exception as the cause so the failing class
        // lookup stays visible in the stack trace.
        throw new IllegalStateException("POI Scratchpad jar missing, required for ExtractorFactory", e);
    }
}
/**
 * Create the Extractor, if possible. Generally needs the Scratchpad jar.
 * Note that this won't check for embedded OOXML resources either, use
 * {@link org.apache.poi.extractor.ExtractorFactory} for that.
 *
 * <p>Directories containing one of the known workbook entries are handled
 * here with an {@link EventBasedExcelExtractor} or {@link ExcelExtractor},
 * depending on {@link #getPreferEventExtractor()}. Everything else is passed
 * to the Scratchpad factory's {@code createExtractor(DirectoryNode)} via
 * reflection.</p>
 *
 * @param poifsDir the directory to inspect
 * @return the extractor created for the directory, never {@code null}
 * @throws IOException if creating an Excel extractor fails with an I/O error
 * @throws OldExcelFormatException if the directory holds the old workbook entry
 * @throws IllegalStateException if the Scratchpad factory class cannot be loaded
 * @throws IllegalArgumentException if the Scratchpad factory rejects the
 *         document, returns no extractor, or the reflective call fails for
 *         any other reason (the original failure is attached as the cause)
 */
public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException {
    // Look for certain entries in the stream, to figure it
    // out from
    for (String workbookName : WORKBOOK_DIR_ENTRY_NAMES) {
        if (poifsDir.hasEntry(workbookName)) {
            if (getPreferEventExtractor()) {
                return new EventBasedExcelExtractor(poifsDir);
            }
            return new ExcelExtractor(poifsDir);
        }
    }
    if (poifsDir.hasEntry(OLD_WORKBOOK_DIR_ENTRY_NAME)) {
        throw new OldExcelFormatException("Old Excel Spreadsheet format (1-95) "
                + "found. Please call OldExcelExtractor directly for basic text extraction");
    }

    // Ask Scratchpad, or fail trying
    Class<?> cls = getScratchpadClass();
    try {
        Method m = cls.getDeclaredMethod("createExtractor", DirectoryNode.class);
        POITextExtractor ext = (POITextExtractor) m.invoke(null, poifsDir);
        if (ext != null) {
            return ext;
        }
    } catch (InvocationTargetException ite) {
        // Method.invoke wraps whatever the Scratchpad factory throws, so its
        // IllegalArgumentExceptions never reach the plain catch below.
        // Unwrap them here so callers see the factory's own exception
        // rather than a generic wrapper around it.
        Throwable cause = ite.getCause();
        if (cause instanceof IllegalArgumentException) {
            throw (IllegalArgumentException) cause;
        }
        throw new IllegalArgumentException("Error creating Scratchpad Extractor", ite);
    } catch (IllegalArgumentException iae) {
        throw iae;
    } catch (Exception e) {
        throw new IllegalArgumentException("Error creating Scratchpad Extractor", e);
    }

    // The Scratchpad factory ran without error but produced no extractor
    throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
}
/**
 * Builds one text extractor per document embedded in the file the given
 * extractor was created from.
 *
 * <p>For an {@link ExcelExtractor} the embedded documents are the root
 * entries whose name starts with {@code MBD}. For any other extractor the
 * Scratchpad factory's {@code identifyEmbeddedResources} method is asked,
 * via reflection, to fill in the embedded directories and streams.</p>
 *
 * @param ext the extractor whose source file is searched
 * @return one open {@link POITextExtractor} per embedded document that could
 *         be handled; an empty array if there are no embedded documents
 * @throws IOException if creating an extractor for an embedded directory
 *         fails with an I/O error
 * @throws IllegalStateException if the extractor has no root directory, or
 *         the Scratchpad factory class cannot be loaded
 * @throws IllegalArgumentException if the Scratchpad lookup fails
 */
public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext)
        throws IOException {
    // Without a root directory there is nothing to search
    final DirectoryEntry rootDir = ext.getRoot();
    if (rootDir == null) {
        throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
    }

    // Embedded documents held as POIFS directories
    final List<Entry> embeddedDirs = new ArrayList<Entry>();
    // Embedded documents held in any other form
    final List<InputStream> otherStreams = new ArrayList<InputStream>();

    if (!(ext instanceof ExcelExtractor)) {
        // Ask Scratchpad, or fail trying
        final Class<?> scratchpad = getScratchpadClass();
        try {
            final Method identify = scratchpad.getDeclaredMethod(
                    "identifyEmbeddedResources",
                    POIOLE2TextExtractor.class, List.class, List.class);
            identify.invoke(null, ext, embeddedDirs, otherStreams);
        } catch (Exception e) {
            throw new IllegalArgumentException("Error checking for Scratchpad embedded resources", e);
        }
    } else {
        // For Excel these are the MBD... entries directly under the root
        for (Iterator<Entry> entries = rootDir.getEntries(); entries.hasNext();) {
            final Entry candidate = entries.next();
            if (candidate.getName().startsWith("MBD")) {
                embeddedDirs.add(candidate);
            }
        }
    }

    if (embeddedDirs.isEmpty() && otherStreams.isEmpty()) {
        return new POITextExtractor[0];
    }

    // Directories first, then the remaining streams
    final List<POITextExtractor> extractors = new ArrayList<POITextExtractor>();
    for (Entry embeddedDir : embeddedDirs) {
        final DirectoryNode node = (DirectoryNode) embeddedDir;
        extractors.add(createExtractor(node));
    }
    for (InputStream stream : otherStreams) {
        try {
            extractors.add(createExtractor(stream));
        } catch (Exception skipped) {
            // Ignore and move on: either the stream didn't contain a format
            // we support as yet, or its content was invalid
            LOGGER.log(POILogger.WARN, skipped);
        }
    }
    return extractors.toArray(new POITextExtractor[extractors.size()]);
}
/**
 * Decrypts the document stored in the given directory and creates an
 * extractor from the decrypted stream.
 *
 * <p>The password comes from {@link Biff8EncryptionKey#getCurrentUserPassword()},
 * falling back to {@link Decryptor#DEFAULT_PASSWORD} when none is set. The
 * decrypted stream is closed before this method returns, whether or not an
 * extractor was created.</p>
 *
 * <p>NOTE(review): the "Invalid password" {@link EncryptedDocumentException}
 * is raised inside the try block, so unless that type is itself an
 * {@link IOException} it reaches the caller wrapped in one - confirm this is
 * intended. This method is not referenced elsewhere in this class.</p>
 *
 * @param poifsDir the directory holding the encrypted document
 * @return the extractor created from the decrypted stream
 * @throws IOException if decryption or extractor creation fails; any
 *         non-I/O exception is wrapped and attached as the cause
 */
private static POITextExtractor createEncyptedOOXMLExtractor(DirectoryNode poifsDir)
        throws IOException {
    final String userPassword = Biff8EncryptionKey.getCurrentUserPassword();
    final String password = (userPassword != null) ? userPassword : Decryptor.DEFAULT_PASSWORD;

    final EncryptionInfo encryptionInfo = new EncryptionInfo(poifsDir);
    final Decryptor decryptor = encryptionInfo.getDecryptor();

    InputStream decrypted = null;
    try {
        final boolean passwordOk = decryptor.verifyPassword(password);
        if (!passwordOk) {
            throw new EncryptedDocumentException("Invalid password specified - use Biff8EncryptionKey.setCurrentUserPassword() before calling extractor");
        }
        decrypted = decryptor.getDataStream(poifsDir);
        return createExtractor(decrypted);
    } catch (IOException ioe) {
        // I/O failures pass through untouched
        throw ioe;
    } catch (Exception other) {
        // Everything else is reported as an I/O failure with the cause attached
        throw new IOException(other);
    } finally {
        IOUtils.closeQuietly(decrypted);
    }
}
}