diff --git a/tika-core/src/main/java/org/apache/tika/Tika.java b/tika-core/src/main/java/org/apache/tika/Tika.java
index c0cf281e8b..1c7bf70611 100644
--- a/tika-core/src/main/java/org/apache/tika/Tika.java
+++ b/tika-core/src/main/java/org/apache/tika/Tika.java
@@ -527,7 +527,7 @@ public String parseToString(InputStream stream, Metadata metadata)
parser.parse(
stream, new BodyContentHandler(handler), metadata, context);
} catch (SAXException e) {
- if (!handler.isWriteLimitReached(e)) {
+ if (!handler.isWriteLimitReached(e) && !handler.isMaxParseTimeReached(e)) {
// This should never happen with BodyContentHandler...
throw new TikaException("Unexpected SAX processing failure", e);
}
@@ -568,7 +568,7 @@ public String parseToString(InputStream stream, Metadata metadata, int maxLength
parser.parse(
stream, new BodyContentHandler(handler), metadata, context);
} catch (SAXException e) {
- if (!handler.isWriteLimitReached(e)) {
+ if (!handler.isWriteLimitReached(e) && !handler.isMaxParseTimeReached(e)) {
// This should never happen with BodyContentHandler...
throw new TikaException("Unexpected SAX processing failure", e);
}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index e9de9bab37..c175448785 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -233,11 +233,13 @@ public void parse(InputStream stream, ContentHandler recursiveParserWrapperHandl
context.set(RecursivelySecureContentHandler.class, secureContentHandler);
getWrappedParser().parse(tis, secureContentHandler, metadata, context);
} catch (SAXException e) {
- boolean wlr = isWriteLimitReached(e);
- if (wlr == false) {
+ if (isWriteLimitReached(e)) {
+ metadata.set(RecursiveParserWrapperHandler.WRITE_LIMIT_REACHED, "true");
+ } else if (isMaxParseTimeReached(e)) {
+ metadata.set(RecursiveParserWrapperHandler.MAX_PARSE_TIME_REACHED, "true");
+ } else {
throw e;
}
- metadata.set(RecursiveParserWrapperHandler.WRITE_LIMIT_REACHED, "true");
} catch (Throwable e) {
//try our best to record the problem in the metadata object
//then rethrow
@@ -323,6 +325,22 @@ private boolean isWriteLimitReached(Throwable t) {
}
}
+    /**
+     * Checks whether the given throwable, or any throwable in its cause chain,
+     * signals that the max parse time was reached.
+     * <p>
+     * Copied/modified from WriteOutContentHandler. Couldn't make that
+     * static, and we need to have something that will work
+     * with exceptions thrown from both BodyContentHandler and WriteOutContentHandler.
+     * Detection is by message prefix, so the prefix below must stay in sync with
+     * the message built where WriteOutContentHandler throws its max-parse-time signal.
+     *
+     * @param t The exception to check.
+     * @return True if this throwable, or one of its causes, is a max-parse-time-reached signal.
+     */
+    private boolean isMaxParseTimeReached(Throwable t) {
+        String message = t.getMessage();
+        // startsWith stops at the first mismatch, whereas indexOf(..) == 0 searched the whole message
+        if (message != null && message.startsWith("Your document took more than ")) {
+            return true;
+        }
+        Throwable cause = t.getCause();
+        return cause != null && isMaxParseTimeReached(cause);
+    }
+
private String getResourceName(Metadata metadata, ParserState state) {
String objectName = "";
if (metadata.get(Metadata.RESOURCE_NAME_KEY) != null) {
@@ -387,15 +405,14 @@ public void parse(InputStream stream, ContentHandler ignore,
try {
super.parse(stream, secureContentHandler, metadata, context);
} catch (SAXException e) {
- boolean wlr = isWriteLimitReached(e);
- if (wlr == true) {
+ if (isWriteLimitReached(e)) {
metadata.add(WRITE_LIMIT_REACHED, "true");
+ } else if (isMaxParseTimeReached(e)) {
+ metadata.set(RecursiveParserWrapperHandler.MAX_PARSE_TIME_REACHED, "true");
+ } else if (catchEmbeddedExceptions) {
+ ParserUtils.recordParserFailure(this, e, metadata);
} else {
- if (catchEmbeddedExceptions) {
- ParserUtils.recordParserFailure(this, e, metadata);
- } else {
- throw e;
- }
+ throw e;
}
} catch(CorruptedFileException e) {
throw e;
diff --git a/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java b/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java
index 8515f09c4c..6a3373dfd1 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java
@@ -43,6 +43,8 @@ public abstract class AbstractRecursiveParserWrapperHandler extends DefaultHandl
public final static Property PARSE_TIME_MILLIS = Property.internalText(TikaCoreProperties.TIKA_META_PREFIX + "parse_time_millis");
public final static Property WRITE_LIMIT_REACHED =
Property.internalBoolean(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "write_limit_reached");
+ public final static Property MAX_PARSE_TIME_REACHED =
+ Property.internalBoolean(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "max_parse_time_reached");
public final static Property EMBEDDED_RESOURCE_LIMIT_REACHED =
Property.internalBoolean(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_resource_limit_reached");
diff --git a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
index 899994e053..361c23da52 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
@@ -70,6 +70,7 @@ public enum HANDLER_TYPE {
private final HANDLER_TYPE type;
private final int writeLimit;
+ private final long maxParseTime;
/**
*
@@ -79,6 +80,20 @@ public enum HANDLER_TYPE {
public BasicContentHandlerFactory(HANDLER_TYPE type, int writeLimit) {
this.type = type;
this.writeLimit = writeLimit;
+ this.maxParseTime = -1L;
+ }
+
+ /**
+ *
+ * @param type basic type of handler
+ * @param writeLimit max number of characters to store; if < 0, the handler will store all characters
+ * @param maxParseTime if > -1, stop writing characters to store once this many milliseconds have elapsed since
+ * the first characters were written.
+ */
+ public BasicContentHandlerFactory(HANDLER_TYPE type, int writeLimit, long maxParseTime) {
+ this.type = type;
+ this.writeLimit = writeLimit;
+ this.maxParseTime = maxParseTime;
}
@Override
@@ -89,25 +104,23 @@ public ContentHandler getNewContentHandler() {
} else if (type == HANDLER_TYPE.IGNORE) {
return new DefaultHandler();
}
- if (writeLimit > -1) {
+ if (writeLimit > -1 || maxParseTime > -1L) {
switch(type) {
- case TEXT:
- return new WriteOutContentHandler(new ToTextContentHandler(), writeLimit);
case HTML:
- return new WriteOutContentHandler(new ToHTMLContentHandler(), writeLimit);
+ return new WriteOutContentHandler(new ToHTMLContentHandler(), writeLimit, maxParseTime);
case XML:
- return new WriteOutContentHandler(new ToXMLContentHandler(), writeLimit);
+ return new WriteOutContentHandler(new ToXMLContentHandler(), writeLimit, maxParseTime);
default:
- return new WriteOutContentHandler(new ToTextContentHandler(), writeLimit);
+ case TEXT:
+ return new WriteOutContentHandler(new ToTextContentHandler(), writeLimit, maxParseTime);
}
} else {
switch (type) {
- case TEXT:
- return new ToTextContentHandler();
case HTML:
return new ToHTMLContentHandler();
case XML:
return new ToXMLContentHandler();
+ case TEXT:
default:
return new ToTextContentHandler();
@@ -127,31 +140,29 @@ public ContentHandler getNewContentHandler(OutputStream os, Charset charset) {
return new DefaultHandler();
}
try {
- if (writeLimit > -1) {
+ if (writeLimit > -1 || maxParseTime > -1L) {
switch (type) {
case BODY:
return new WriteOutContentHandler(
new BodyContentHandler(
- new OutputStreamWriter(os, charset)), writeLimit);
- case TEXT:
- return new WriteOutContentHandler(new ToTextContentHandler(os, charset.name()), writeLimit);
+ new OutputStreamWriter(os, charset)), writeLimit, maxParseTime);
case HTML:
- return new WriteOutContentHandler(new ToHTMLContentHandler(os, charset.name()), writeLimit);
+ return new WriteOutContentHandler(new ToHTMLContentHandler(os, charset.name()), writeLimit, maxParseTime);
case XML:
- return new WriteOutContentHandler(new ToXMLContentHandler(os, charset.name()), writeLimit);
+ return new WriteOutContentHandler(new ToXMLContentHandler(os, charset.name()), writeLimit, maxParseTime);
+ case TEXT:
default:
- return new WriteOutContentHandler(new ToTextContentHandler(os, charset.name()), writeLimit);
+ return new WriteOutContentHandler(new ToTextContentHandler(os, charset.name()), writeLimit, maxParseTime);
}
} else {
switch (type) {
case BODY:
return new BodyContentHandler(new OutputStreamWriter(os, charset));
- case TEXT:
- return new ToTextContentHandler(os, charset.name());
case HTML:
return new ToHTMLContentHandler(os, charset.name());
case XML:
return new ToXMLContentHandler(os, charset.name());
+ case TEXT:
default:
return new ToTextContentHandler(os, charset.name());
diff --git a/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java
index 90b98ace0c..099dee5e7c 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java
@@ -44,11 +44,22 @@ public class WriteOutContentHandler extends ContentHandlerDecorator {
*/
private final int writeLimit;
+ /**
+ * The maximum amount of time that can be spent parsing in milliseconds.
+ * Set to -1 for no limit.
+ */
+ private final long maxParseTime;
+
/**
* Number of characters written so far.
*/
private int writeCount = 0;
+ /**
+ * Time, in epoch milliseconds, at which the first characters were received; 0 until then.
+ */
+ private long parseStartedMs = 0;
+
/**
* Creates a content handler that writes content up to the given
* write limit to the given content handler.
@@ -60,6 +71,36 @@ public class WriteOutContentHandler extends ContentHandlerDecorator {
public WriteOutContentHandler(ContentHandler handler, int writeLimit) {
super(handler);
this.writeLimit = writeLimit;
+ this.maxParseTime = -1L;
+ }
+
+ /**
+ * Creates a content handler that writes content up to the given
+ * write limit to the given character stream.
+ *
+ * @since Apache Tika 1.25
+ * @param writer character stream
+ * @param writeLimit If > -1, stop writing chars to content handler if exceeded this value.
+ * @param maxParseTime The max time in milliseconds that can be spent parsing this file before we stop parsing.
+ */
+ public WriteOutContentHandler(Writer writer, int writeLimit, long maxParseTime) {
+ this(new ToTextContentHandler(writer), writeLimit, maxParseTime);
+ }
+
+ /**
+ * Creates a content handler that writes content up to the given
+ * write limit to the given content handler.
+ *
+ * @since Apache Tika 1.25
+ * @param handler content handler to be decorated
+ * @param writeLimit write limit
+ * @param maxParseTime if > -1, stop writing characters to handler once this many milliseconds have elapsed since
+ * the first characters were written.
+ */
+ public WriteOutContentHandler(ContentHandler handler, int writeLimit, long maxParseTime) {
+ super(handler);
+ this.writeLimit = writeLimit;
+ this.maxParseTime = maxParseTime;
}
/**
@@ -71,7 +112,7 @@ public WriteOutContentHandler(ContentHandler handler, int writeLimit) {
* @param writeLimit write limit
*/
public WriteOutContentHandler(Writer writer, int writeLimit) {
- this(new ToTextContentHandler(writer), writeLimit);
+ this(new ToTextContentHandler(writer), writeLimit, -1L);
}
/**
@@ -81,7 +122,7 @@ public WriteOutContentHandler(Writer writer, int writeLimit) {
* @param writer writer
*/
public WriteOutContentHandler(Writer writer) {
- this(writer, -1);
+ this(writer, -1, -1L);
}
/**
@@ -109,7 +150,7 @@ public WriteOutContentHandler(OutputStream stream) {
* or -1 to disable the write limit
*/
public WriteOutContentHandler(int writeLimit) {
- this(new StringWriter(), writeLimit);
+ this(new StringWriter(), writeLimit, -1L);
}
/**
@@ -132,9 +173,19 @@ public WriteOutContentHandler() {
@Override
public void characters(char[] ch, int start, int length)
throws SAXException {
+ if (parseStartedMs == 0) {
+ parseStartedMs = System.currentTimeMillis();
+ }
if (writeLimit == -1 || writeCount + length <= writeLimit) {
super.characters(ch, start, length);
writeCount += length;
+ if (maxParseTime > -1 && System.currentTimeMillis() - parseStartedMs > maxParseTime) {
+ throw new MaxParseTimeReachedException(
+ "Your document took more than " + maxParseTime
+ + " ms to parse. To receive the full text of the document,"
+ + " increase your max parse limit. (Text up to the limit is"
+ + " however available).", tag);
+ }
} else {
super.characters(ch, start, writeLimit - writeCount);
writeCount = writeLimit;
@@ -182,6 +233,23 @@ public boolean isWriteLimitReached(Throwable t) {
}
}
+ /**
+ * Checks whether the given exception (or any of its root causes) was
+ * thrown by this handler as a signal of reaching the max parse time.
+ * Only an exception whose tag equals this handler's tag is recognised;
+ * once a MaxParseTimeReachedException is found, its own causes are not
+ * searched any further.
+ *
+ * @since Apache Tika 1.25
+ * @param t throwable
+ * @return <code>true</code> if the max parse time was reached,
+ * <code>false</code> otherwise
+ */
+ public boolean isMaxParseTimeReached(Throwable t) {
+ if (t instanceof MaxParseTimeReachedException) {
+ return tag.equals(((MaxParseTimeReachedException) t).tag);
+ } else {
+ return t.getCause() != null && isMaxParseTimeReached(t.getCause());
+ }
+ }
+
/**
* The exception used as a signal when the write limit has been reached.
*/
@@ -200,4 +268,22 @@ public WriteLimitReachedException(String message, Serializable tag) {
}
+ /**
+ * The exception used as a signal when the max parse time has been reached.
+ * It is thrown from characters(char[], int, int) once more than
+ * maxParseTime milliseconds have elapsed since the first characters call,
+ * and it carries the throwing handler's tag so that
+ * isMaxParseTimeReached(Throwable) can compare it with its own tag.
+ */
+ private static class MaxParseTimeReachedException extends SAXException {
+
+ /** Serial version UID */
+ private static final long serialVersionUID = -5860681246457428931L;
+
+ /** Serializable tag of the handler that caused this exception */
+ private final Serializable tag;
+
+ public MaxParseTimeReachedException(String message, Serializable tag) {
+ super(message);
+ this.tag = tag;
+ }
+
+ }
+
}
diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
index 71e71809d2..7261206202 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
@@ -31,7 +31,6 @@
import java.io.InputStream;
import org.apache.cxf.jaxrs.ext.multipart.Attachment;
-import org.apache.tika.language.detect.LanguageHandler;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
@@ -139,20 +138,25 @@ private MetadataList parseMetadata(InputStream is, Metadata metadata,
TikaResource.fillParseContext(context, httpHeaders, null);
TikaResource.logRequest(LOG, info, metadata);
- int writeLimit = -1;
- if (httpHeaders.containsKey("writeLimit")) {
- writeLimit = Integer.parseInt(httpHeaders.getFirst("writeLimit"));
- }
+ int writeLimit = -1;
+ if (httpHeaders.containsKey("writeLimit")) {
+ writeLimit = Integer.parseInt(httpHeaders.getFirst("writeLimit"));
+ }
- int maxEmbeddedResources = -1;
- if (httpHeaders.containsKey("maxEmbeddedResources")) {
- maxEmbeddedResources = Integer.parseInt(httpHeaders.getFirst("maxEmbeddedResources"));
- }
+ int maxEmbeddedResources = -1;
+ if (httpHeaders.containsKey("maxEmbeddedResources")) {
+ maxEmbeddedResources = Integer.parseInt(httpHeaders.getFirst("maxEmbeddedResources"));
+ }
+
+ long maxParseTime = -1L;
+ if (httpHeaders.containsKey("maxParseTime")) {
+ maxParseTime = Long.parseLong(httpHeaders.getFirst("maxParseTime"));
+ }
- BasicContentHandlerFactory.HANDLER_TYPE type =
+ BasicContentHandlerFactory.HANDLER_TYPE type =
BasicContentHandlerFactory.parseHandlerType(handlerTypeName, DEFAULT_HANDLER_TYPE);
RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
- new BasicContentHandlerFactory(type, writeLimit), maxEmbeddedResources,
+ new BasicContentHandlerFactory(type, writeLimit, maxParseTime), maxEmbeddedResources,
TikaResource.getConfig().getMetadataFilter());
try {
TikaResource.parse(wrapper, LOG, info.getPath(), is, handler, metadata, context);
diff --git a/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
index 544a6021ea..d58d260f21 100644
--- a/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
@@ -376,4 +376,37 @@ public void testWriteLimit() throws Exception {
metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
}
+
+    /**
+     * Verifies that the "maxParseTime" request header is honoured: a tiny budget
+     * flags the container document's metadata, and -1 leaves the flag unset.
+     */
+    @Test
+    public void testMaxParseTimeLimit() throws Exception {
+        // NOTE(review): relies on the parse needing more than 1 ms after the first
+        // characters are written; could be timing-sensitive on a very fast machine.
+        List<Metadata> metadataList = putWithMaxParseTime(1L);
+        assertEquals("true", metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.MAX_PARSE_TIME_REACHED));
+
+        // now test with -1L, which disables the limit
+        metadataList = putWithMaxParseTime(-1L);
+        assertNull(metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.MAX_PARSE_TIME_REACHED));
+    }
+
+    /**
+     * PUTs the recursive test document with the given "maxParseTime" header,
+     * asserts a 200 response and returns the parsed metadata list.
+     */
+    private List<Metadata> putWithMaxParseTime(long maxParseTime) throws Exception {
+        Response response = WebClient
+                .create(endPoint + META_PATH)
+                .accept("application/json")
+                .header("maxParseTime", Long.toString(maxParseTime))
+                .put(ClassLoader
+                        .getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+
+        assertEquals(200, response.getStatus());
+        // Close the reader (and with it the response entity stream) once the JSON is read
+        try (Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8)) {
+            return JsonMetadataList.fromJson(reader);
+        }
+    }
}