diff --git a/tika-core/src/main/java/org/apache/tika/Tika.java b/tika-core/src/main/java/org/apache/tika/Tika.java index c0cf281e8b..1c7bf70611 100644 --- a/tika-core/src/main/java/org/apache/tika/Tika.java +++ b/tika-core/src/main/java/org/apache/tika/Tika.java @@ -527,7 +527,7 @@ public String parseToString(InputStream stream, Metadata metadata) parser.parse( stream, new BodyContentHandler(handler), metadata, context); } catch (SAXException e) { - if (!handler.isWriteLimitReached(e)) { + if (!handler.isWriteLimitReached(e) && !handler.isMaxParseTimeReached(e)) { // This should never happen with BodyContentHandler... throw new TikaException("Unexpected SAX processing failure", e); } @@ -568,7 +568,7 @@ public String parseToString(InputStream stream, Metadata metadata, int maxLength parser.parse( stream, new BodyContentHandler(handler), metadata, context); } catch (SAXException e) { - if (!handler.isWriteLimitReached(e)) { + if (!handler.isWriteLimitReached(e) && !handler.isMaxParseTimeReached(e)) { // This should never happen with BodyContentHandler... 
throw new TikaException("Unexpected SAX processing failure", e); } diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java index e9de9bab37..c175448785 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java +++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java @@ -233,11 +233,13 @@ public void parse(InputStream stream, ContentHandler recursiveParserWrapperHandl context.set(RecursivelySecureContentHandler.class, secureContentHandler); getWrappedParser().parse(tis, secureContentHandler, metadata, context); } catch (SAXException e) { - boolean wlr = isWriteLimitReached(e); - if (wlr == false) { + if (isWriteLimitReached(e)) { + metadata.set(RecursiveParserWrapperHandler.WRITE_LIMIT_REACHED, "true"); + } else if (isMaxParseTimeReached(e)) { + metadata.set(RecursiveParserWrapperHandler.MAX_PARSE_TIME_REACHED, "true"); + } else { throw e; } - metadata.set(RecursiveParserWrapperHandler.WRITE_LIMIT_REACHED, "true"); } catch (Throwable e) { //try our best to record the problem in the metadata object //then rethrow @@ -323,6 +325,22 @@ private boolean isWriteLimitReached(Throwable t) { } } + /** + * Copied/modified from WriteOutContentHandler. Couldn't make that + * static, and we need to have something that will work + * with exceptions thrown from both BodyContentHandler and WriteOutContentHandler + * @param t The exception to check. + * @return True if this is a MaxParseTimeReachedException exception. 
+ */ + private boolean isMaxParseTimeReached(Throwable t) { + if (t.getMessage() != null && + t.getMessage().indexOf("Your document took more than ") == 0) { + return true; + } else { + return t.getCause() != null && isMaxParseTimeReached(t.getCause()); + } + } + private String getResourceName(Metadata metadata, ParserState state) { String objectName = ""; if (metadata.get(Metadata.RESOURCE_NAME_KEY) != null) { @@ -387,15 +405,14 @@ public void parse(InputStream stream, ContentHandler ignore, try { super.parse(stream, secureContentHandler, metadata, context); } catch (SAXException e) { - boolean wlr = isWriteLimitReached(e); - if (wlr == true) { + if (isWriteLimitReached(e)) { metadata.add(WRITE_LIMIT_REACHED, "true"); + } else if (isMaxParseTimeReached(e)) { + metadata.set(RecursiveParserWrapperHandler.MAX_PARSE_TIME_REACHED, "true"); + } else if (catchEmbeddedExceptions) { + ParserUtils.recordParserFailure(this, e, metadata); } else { - if (catchEmbeddedExceptions) { - ParserUtils.recordParserFailure(this, e, metadata); - } else { - throw e; - } + throw e; } } catch(CorruptedFileException e) { throw e; diff --git a/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java b/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java index 8515f09c4c..6a3373dfd1 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java @@ -43,6 +43,8 @@ public abstract class AbstractRecursiveParserWrapperHandler extends DefaultHandl public final static Property PARSE_TIME_MILLIS = Property.internalText(TikaCoreProperties.TIKA_META_PREFIX + "parse_time_millis"); public final static Property WRITE_LIMIT_REACHED = Property.internalBoolean(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "write_limit_reached"); + public final static Property MAX_PARSE_TIME_REACHED = + 
Property.internalBoolean(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "max_parse_time_reached"); public final static Property EMBEDDED_RESOURCE_LIMIT_REACHED = Property.internalBoolean(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_resource_limit_reached"); diff --git a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java index 899994e053..361c23da52 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java +++ b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java @@ -70,6 +70,7 @@ public enum HANDLER_TYPE { private final HANDLER_TYPE type; private final int writeLimit; + private final long maxParseTime; /** * @@ -79,6 +80,20 @@ public enum HANDLER_TYPE { public BasicContentHandlerFactory(HANDLER_TYPE type, int writeLimit) { this.type = type; this.writeLimit = writeLimit; + this.maxParseTime = -1L; + } + + /** + * + * @param type basic type of handler + * @param writeLimit max number of characters to store; if < 0, the handler will store all characters + * @param maxParseTime if > -1, stop writing characters to store if this many milliseconds has elapsed since + * first characters were written. 
+ */ + public BasicContentHandlerFactory(HANDLER_TYPE type, int writeLimit, long maxParseTime) { + this.type = type; + this.writeLimit = writeLimit; + this.maxParseTime = maxParseTime; } @Override @@ -89,25 +104,23 @@ public ContentHandler getNewContentHandler() { } else if (type == HANDLER_TYPE.IGNORE) { return new DefaultHandler(); } - if (writeLimit > -1) { + if (writeLimit > -1 || maxParseTime > -1L) { switch(type) { - case TEXT: - return new WriteOutContentHandler(new ToTextContentHandler(), writeLimit); case HTML: - return new WriteOutContentHandler(new ToHTMLContentHandler(), writeLimit); + return new WriteOutContentHandler(new ToHTMLContentHandler(), writeLimit, maxParseTime); case XML: - return new WriteOutContentHandler(new ToXMLContentHandler(), writeLimit); + return new WriteOutContentHandler(new ToXMLContentHandler(), writeLimit, maxParseTime); default: - return new WriteOutContentHandler(new ToTextContentHandler(), writeLimit); + case TEXT: + return new WriteOutContentHandler(new ToTextContentHandler(), writeLimit, maxParseTime); } } else { switch (type) { - case TEXT: - return new ToTextContentHandler(); case HTML: return new ToHTMLContentHandler(); case XML: return new ToXMLContentHandler(); + case TEXT: default: return new ToTextContentHandler(); @@ -127,31 +140,29 @@ public ContentHandler getNewContentHandler(OutputStream os, Charset charset) { return new DefaultHandler(); } try { - if (writeLimit > -1) { + if (writeLimit > -1 || maxParseTime > -1L) { switch (type) { case BODY: return new WriteOutContentHandler( new BodyContentHandler( - new OutputStreamWriter(os, charset)), writeLimit); - case TEXT: - return new WriteOutContentHandler(new ToTextContentHandler(os, charset.name()), writeLimit); + new OutputStreamWriter(os, charset)), writeLimit, maxParseTime); case HTML: - return new WriteOutContentHandler(new ToHTMLContentHandler(os, charset.name()), writeLimit); + return new WriteOutContentHandler(new ToHTMLContentHandler(os, charset.name()), 
writeLimit, maxParseTime); case XML: - return new WriteOutContentHandler(new ToXMLContentHandler(os, charset.name()), writeLimit); + return new WriteOutContentHandler(new ToXMLContentHandler(os, charset.name()), writeLimit, maxParseTime); + case TEXT: default: - return new WriteOutContentHandler(new ToTextContentHandler(os, charset.name()), writeLimit); + return new WriteOutContentHandler(new ToTextContentHandler(os, charset.name()), writeLimit, maxParseTime); } } else { switch (type) { case BODY: return new BodyContentHandler(new OutputStreamWriter(os, charset)); - case TEXT: - return new ToTextContentHandler(os, charset.name()); case HTML: return new ToHTMLContentHandler(os, charset.name()); case XML: return new ToXMLContentHandler(os, charset.name()); + case TEXT: default: return new ToTextContentHandler(os, charset.name()); diff --git a/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java index 90b98ace0c..099dee5e7c 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java @@ -44,11 +44,22 @@ public class WriteOutContentHandler extends ContentHandlerDecorator { */ private final int writeLimit; + /** + * The maximum amount of time that can be spent parsing in milliseconds. + * Set to -1 for no limit. + */ + private final long maxParseTime; + /** * Number of characters written so far. */ private int writeCount = 0; + /** + * Time the parse started. + */ + private long parseStartedMs = 0; + /** * Creates a content handler that writes content up to the given * write limit to the given content handler. 
@@ -60,6 +71,36 @@ public class WriteOutContentHandler extends ContentHandlerDecorator { public WriteOutContentHandler(ContentHandler handler, int writeLimit) { super(handler); this.writeLimit = writeLimit; + this.maxParseTime = -1L; + } + + /** + * Creates a content handler that writes content up to the given + * write limit to the given character stream. + * + * @since Apache Tika 1.25 + * @param writer character stream + * @param writeLimit If > -1, stop writing chars to content handler if exceeded this value. + * @param maxParseTime The max time in milliseconds that can be spent parsing this file before we stop parsing. + */ + public WriteOutContentHandler(Writer writer, int writeLimit, long maxParseTime) { + this(new ToTextContentHandler(writer), writeLimit, maxParseTime); + } + + /** + * Creates a content handler that writes content up to the given + * write limit to the given content handler. + * + * @since Apache Tika 1.25 + * @param handler content handler to be decorated + * @param writeLimit write limit + * @param maxParseTime if > -1, stop writing characters to handler if this many milliseconds has elapsed since + * first characters were written. 
+ */ + public WriteOutContentHandler(ContentHandler handler, int writeLimit, long maxParseTime) { + super(handler); + this.writeLimit = writeLimit; + this.maxParseTime = maxParseTime; } /** @@ -71,7 +112,7 @@ public WriteOutContentHandler(ContentHandler handler, int writeLimit) { * @param writeLimit write limit */ public WriteOutContentHandler(Writer writer, int writeLimit) { - this(new ToTextContentHandler(writer), writeLimit); + this(new ToTextContentHandler(writer), writeLimit, -1L); } /** @@ -81,7 +122,7 @@ public WriteOutContentHandler(Writer writer, int writeLimit) { * @param writer writer */ public WriteOutContentHandler(Writer writer) { - this(writer, -1); + this(writer, -1, -1L); } /** @@ -109,7 +150,7 @@ public WriteOutContentHandler(OutputStream stream) { * or -1 to disable the write limit */ public WriteOutContentHandler(int writeLimit) { - this(new StringWriter(), writeLimit); + this(new StringWriter(), writeLimit, -1L); } /** @@ -132,9 +173,19 @@ public WriteOutContentHandler() { @Override public void characters(char[] ch, int start, int length) throws SAXException { + if (parseStartedMs == 0) { + parseStartedMs = System.currentTimeMillis(); + } if (writeLimit == -1 || writeCount + length <= writeLimit) { super.characters(ch, start, length); writeCount += length; + if (maxParseTime > -1 && System.currentTimeMillis() - parseStartedMs > maxParseTime) { + throw new MaxParseTimeReachedException( + "Your document took more than " + maxParseTime + + " ms to parse. To receive the full text of the document," + + " increase your max parse limit. (Text up to the limit is" + + " however available).", tag); + } } else { super.characters(ch, start, writeLimit - writeCount); writeCount = writeLimit; @@ -182,6 +233,23 @@ public boolean isWriteLimitReached(Throwable t) { } } + /** + * Checks whether the given exception (or any of it's root causes) was + * thrown by this handler as a signal of reaching the max parse time. 
+ * + * @since Apache Tika 1.25 + * @param t throwable + * @return true if the max parse time was reached, + * false otherwise + */ + public boolean isMaxParseTimeReached(Throwable t) { + if (t instanceof MaxParseTimeReachedException) { + return tag.equals(((MaxParseTimeReachedException) t).tag); + } else { + return t.getCause() != null && isMaxParseTimeReached(t.getCause()); + } + } + /** * The exception used as a signal when the write limit has been reached. */ @@ -200,4 +268,22 @@ public WriteLimitReachedException(String message, Serializable tag) { } + /** + * The exception used as a signal when the max parse time has been reached. + */ + private static class MaxParseTimeReachedException extends SAXException { + + /** Serial version UID */ + private static final long serialVersionUID = -5860681246457428931L; + + /** Serializable tag of the handler that caused this exception */ + private final Serializable tag; + + public MaxParseTimeReachedException(String message, Serializable tag) { + super(message); + this.tag = tag; + } + + } + } diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java index 71e71809d2..7261206202 100644 --- a/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java +++ b/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java @@ -31,7 +31,6 @@ import java.io.InputStream; import org.apache.cxf.jaxrs.ext.multipart.Attachment; -import org.apache.tika.language.detect.LanguageHandler; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; @@ -139,20 +138,25 @@ private MetadataList parseMetadata(InputStream is, Metadata metadata, TikaResource.fillParseContext(context, httpHeaders, null); TikaResource.logRequest(LOG, info, metadata); - int writeLimit = -1; - if 
(httpHeaders.containsKey("writeLimit")) { - writeLimit = Integer.parseInt(httpHeaders.getFirst("writeLimit")); - } + int writeLimit = -1; + if (httpHeaders.containsKey("writeLimit")) { + writeLimit = Integer.parseInt(httpHeaders.getFirst("writeLimit")); + } - int maxEmbeddedResources = -1; - if (httpHeaders.containsKey("maxEmbeddedResources")) { - maxEmbeddedResources = Integer.parseInt(httpHeaders.getFirst("maxEmbeddedResources")); - } + int maxEmbeddedResources = -1; + if (httpHeaders.containsKey("maxEmbeddedResources")) { + maxEmbeddedResources = Integer.parseInt(httpHeaders.getFirst("maxEmbeddedResources")); + } + + long maxParseTime = -1L; + if (httpHeaders.containsKey("maxParseTime")) { + maxParseTime = Long.parseLong(httpHeaders.getFirst("maxParseTime")); + } - BasicContentHandlerFactory.HANDLER_TYPE type = + BasicContentHandlerFactory.HANDLER_TYPE type = BasicContentHandlerFactory.parseHandlerType(handlerTypeName, DEFAULT_HANDLER_TYPE); RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( - new BasicContentHandlerFactory(type, writeLimit), maxEmbeddedResources, + new BasicContentHandlerFactory(type, writeLimit, maxParseTime), maxEmbeddedResources, TikaResource.getConfig().getMetadataFilter()); try { TikaResource.parse(wrapper, LOG, info.getPath(), is, handler, metadata, context); diff --git a/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java index 544a6021ea..d58d260f21 100644 --- a/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java +++ b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java @@ -376,4 +376,37 @@ public void testWriteLimit() throws Exception { metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); } + + @Test + public void testMaxParseTimeLimit() throws Exception { + long maxParseTime = 1L; + Response response = 
WebClient + .create(endPoint + META_PATH) + .accept("application/json") + .header("maxParseTime", Long.toString(maxParseTime)) + .put(ClassLoader + .getSystemResourceAsStream(TEST_RECURSIVE_DOC)); + + assertEquals(200, response.getStatus()); + // Check results + Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8); + List<Metadata> metadataList = JsonMetadataList.fromJson(reader); + assertEquals("true", metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.MAX_PARSE_TIME_REACHED)); + + // now test with -1L + maxParseTime = -1L; + response = WebClient + .create(endPoint + META_PATH) + .accept("application/json") + .header("maxParseTime", Long.toString(maxParseTime)) + .put(ClassLoader + .getSystemResourceAsStream(TEST_RECURSIVE_DOC)); + + assertEquals(200, response.getStatus()); + // Check results + reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8); + metadataList = JsonMetadataList.fromJson(reader); + assertNull(metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.MAX_PARSE_TIME_REACHED)); + + } }