Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a new maxParseTime parameter to the WriteOutContentHandler #374

Open
wants to merge 6 commits into
base: branch_1x
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions tika-core/src/main/java/org/apache/tika/Tika.java
Original file line number Diff line number Diff line change
Expand Up @@ -527,7 +527,7 @@ public String parseToString(InputStream stream, Metadata metadata)
parser.parse(
stream, new BodyContentHandler(handler), metadata, context);
} catch (SAXException e) {
if (!handler.isWriteLimitReached(e)) {
if (!handler.isWriteLimitReached(e) && !handler.isMaxParseTimeReached(e)) {
// This should never happen with BodyContentHandler...
throw new TikaException("Unexpected SAX processing failure", e);
}
Expand Down Expand Up @@ -568,7 +568,7 @@ public String parseToString(InputStream stream, Metadata metadata, int maxLength
parser.parse(
stream, new BodyContentHandler(handler), metadata, context);
} catch (SAXException e) {
if (!handler.isWriteLimitReached(e)) {
if (!handler.isWriteLimitReached(e) && !handler.isMaxParseTimeReached(e)) {
// This should never happen with BodyContentHandler...
throw new TikaException("Unexpected SAX processing failure", e);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -233,11 +233,13 @@ public void parse(InputStream stream, ContentHandler recursiveParserWrapperHandl
context.set(RecursivelySecureContentHandler.class, secureContentHandler);
getWrappedParser().parse(tis, secureContentHandler, metadata, context);
} catch (SAXException e) {
boolean wlr = isWriteLimitReached(e);
if (wlr == false) {
if (isWriteLimitReached(e)) {
metadata.set(RecursiveParserWrapperHandler.WRITE_LIMIT_REACHED, "true");
} else if (isMaxParseTimeReached(e)) {
metadata.set(RecursiveParserWrapperHandler.MAX_PARSE_TIME_REACHED, "true");
} else {
throw e;
}
metadata.set(RecursiveParserWrapperHandler.WRITE_LIMIT_REACHED, "true");
} catch (Throwable e) {
//try our best to record the problem in the metadata object
//then rethrow
Expand Down Expand Up @@ -323,6 +325,22 @@ private boolean isWriteLimitReached(Throwable t) {
}
}

/**
 * Message-based check for the "max parse time reached" signal.
 * <p>
 * Copied/modified from WriteOutContentHandler, whose own check could not be
 * made static; this variant has to recognise exceptions thrown from both
 * BodyContentHandler and WriteOutContentHandler. It therefore matches on the
 * leading text of the exception message, which must stay in sync with the
 * message built by WriteOutContentHandler, and walks the cause chain.
 *
 * @param t the exception to check
 * @return {@code true} if {@code t} or any of its causes carries the
 *         max-parse-time message, {@code false} otherwise
 */
private boolean isMaxParseTimeReached(Throwable t) {
    String message = t.getMessage();
    if (message != null && message.startsWith("Your document took more than ")) {
        return true;
    }
    // Not this throwable: keep looking further down the cause chain.
    Throwable cause = t.getCause();
    return cause != null && isMaxParseTimeReached(cause);
}

private String getResourceName(Metadata metadata, ParserState state) {
String objectName = "";
if (metadata.get(Metadata.RESOURCE_NAME_KEY) != null) {
Expand Down Expand Up @@ -387,15 +405,14 @@ public void parse(InputStream stream, ContentHandler ignore,
try {
super.parse(stream, secureContentHandler, metadata, context);
} catch (SAXException e) {
boolean wlr = isWriteLimitReached(e);
if (wlr == true) {
if (isWriteLimitReached(e)) {
metadata.add(WRITE_LIMIT_REACHED, "true");
} else if (isMaxParseTimeReached(e)) {
metadata.set(RecursiveParserWrapperHandler.MAX_PARSE_TIME_REACHED, "true");
} else if (catchEmbeddedExceptions) {
ParserUtils.recordParserFailure(this, e, metadata);
} else {
if (catchEmbeddedExceptions) {
ParserUtils.recordParserFailure(this, e, metadata);
} else {
throw e;
}
throw e;
}
} catch(CorruptedFileException e) {
throw e;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ public abstract class AbstractRecursiveParserWrapperHandler extends DefaultHandl
public final static Property PARSE_TIME_MILLIS = Property.internalText(TikaCoreProperties.TIKA_META_PREFIX + "parse_time_millis");
public final static Property WRITE_LIMIT_REACHED =
Property.internalBoolean(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "write_limit_reached");
public final static Property MAX_PARSE_TIME_REACHED =
Property.internalBoolean(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "max_parse_time_reached");
public final static Property EMBEDDED_RESOURCE_LIMIT_REACHED =
Property.internalBoolean(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_resource_limit_reached");

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ public enum HANDLER_TYPE {

private final HANDLER_TYPE type;
private final int writeLimit;
private final long maxParseTime;

/**
*
Expand All @@ -79,6 +80,20 @@ public enum HANDLER_TYPE {
public BasicContentHandlerFactory(HANDLER_TYPE type, int writeLimit) {
    // -1 leaves the parse-time limit switched off
    this(type, writeLimit, -1L);
}

/**
 * Creates a factory whose handlers can be bounded both by the number of
 * characters written and by elapsed time.
 *
 * @param type basic type of handler
 * @param writeLimit max number of characters to store; if {@code < 0}, the handler will store all characters
 * @param maxParseTime if {@code > -1}, stop writing characters to store if this many milliseconds have
 *                     elapsed since the first characters were written; {@code -1} means no time limit
 */
public BasicContentHandlerFactory(HANDLER_TYPE type, int writeLimit, long maxParseTime) {
this.type = type;
this.writeLimit = writeLimit;
this.maxParseTime = maxParseTime;
}

@Override
Expand All @@ -89,25 +104,23 @@ public ContentHandler getNewContentHandler() {
} else if (type == HANDLER_TYPE.IGNORE) {
return new DefaultHandler();
}
if (writeLimit > -1) {
if (writeLimit > -1 || maxParseTime > -1L) {
switch(type) {
case TEXT:
return new WriteOutContentHandler(new ToTextContentHandler(), writeLimit);
case HTML:
return new WriteOutContentHandler(new ToHTMLContentHandler(), writeLimit);
return new WriteOutContentHandler(new ToHTMLContentHandler(), writeLimit, maxParseTime);
case XML:
return new WriteOutContentHandler(new ToXMLContentHandler(), writeLimit);
return new WriteOutContentHandler(new ToXMLContentHandler(), writeLimit, maxParseTime);
default:
return new WriteOutContentHandler(new ToTextContentHandler(), writeLimit);
case TEXT:
return new WriteOutContentHandler(new ToTextContentHandler(), writeLimit, maxParseTime);
}
} else {
switch (type) {
case TEXT:
return new ToTextContentHandler();
case HTML:
return new ToHTMLContentHandler();
case XML:
return new ToXMLContentHandler();
case TEXT:
default:
return new ToTextContentHandler();

Expand All @@ -127,31 +140,29 @@ public ContentHandler getNewContentHandler(OutputStream os, Charset charset) {
return new DefaultHandler();
}
try {
if (writeLimit > -1) {
if (writeLimit > -1 || maxParseTime > -1L) {
switch (type) {
case BODY:
return new WriteOutContentHandler(
new BodyContentHandler(
new OutputStreamWriter(os, charset)), writeLimit);
case TEXT:
return new WriteOutContentHandler(new ToTextContentHandler(os, charset.name()), writeLimit);
new OutputStreamWriter(os, charset)), writeLimit, maxParseTime);
case HTML:
return new WriteOutContentHandler(new ToHTMLContentHandler(os, charset.name()), writeLimit);
return new WriteOutContentHandler(new ToHTMLContentHandler(os, charset.name()), writeLimit, maxParseTime);
case XML:
return new WriteOutContentHandler(new ToXMLContentHandler(os, charset.name()), writeLimit);
return new WriteOutContentHandler(new ToXMLContentHandler(os, charset.name()), writeLimit, maxParseTime);
case TEXT:
default:
return new WriteOutContentHandler(new ToTextContentHandler(os, charset.name()), writeLimit);
return new WriteOutContentHandler(new ToTextContentHandler(os, charset.name()), writeLimit, maxParseTime);
}
} else {
switch (type) {
case BODY:
return new BodyContentHandler(new OutputStreamWriter(os, charset));
case TEXT:
return new ToTextContentHandler(os, charset.name());
case HTML:
return new ToHTMLContentHandler(os, charset.name());
case XML:
return new ToXMLContentHandler(os, charset.name());
case TEXT:
default:
return new ToTextContentHandler(os, charset.name());

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,22 @@ public class WriteOutContentHandler extends ContentHandlerDecorator {
*/
private final int writeLimit;

/**
* The maximum amount of time that can be spent parsing in milliseconds.
* Set to -1 for no limit.
*/
private final long maxParseTime;

/**
* Number of characters written so far.
*/
private int writeCount = 0;

/**
 * Wall-clock time in milliseconds, taken from
 * {@link System#currentTimeMillis()} on the first call to
 * {@link #characters(char[], int, int)}; 0 until then. Elapsed time is
 * measured against this value when checking the max parse time.
 */
private long parseStartedMs = 0;

/**
* Creates a content handler that writes content up to the given
* write limit to the given content handler.
Expand All @@ -60,6 +71,36 @@ public class WriteOutContentHandler extends ContentHandlerDecorator {
public WriteOutContentHandler(ContentHandler handler, int writeLimit) {
    // -1 means no max parse time
    this(handler, writeLimit, -1L);
}

/**
 * Creates a content handler that writes content up to the given
 * write limit to the given character stream. The writer is wrapped in a
 * {@link ToTextContentHandler} before being decorated.
 *
 * @since Apache Tika 1.25
 * @param writer character stream
 * @param writeLimit If {@code > -1}, stop writing chars to content handler if exceeded this value.
 * @param maxParseTime The max time in milliseconds that can be spent parsing this file before we stop parsing;
 *                     {@code -1} for no limit.
 */
public WriteOutContentHandler(Writer writer, int writeLimit, long maxParseTime) {
this(new ToTextContentHandler(writer), writeLimit, maxParseTime);
}

/**
 * Creates a content handler that writes content up to the given
 * write limit to the given content handler, optionally bounded by a
 * maximum parse time as well.
 *
 * @since Apache Tika 1.25
 * @param handler content handler to be decorated
 * @param writeLimit write limit, or {@code -1} to disable the write limit
 * @param maxParseTime if {@code > -1}, {@link #characters(char[], int, int)} signals a
 *                     {@link SAXException} once more than this many milliseconds have elapsed
 *                     since the first characters were written; {@code -1} for no limit
 */
public WriteOutContentHandler(ContentHandler handler, int writeLimit, long maxParseTime) {
super(handler);
this.writeLimit = writeLimit;
this.maxParseTime = maxParseTime;
}

/**
Expand All @@ -71,7 +112,7 @@ public WriteOutContentHandler(ContentHandler handler, int writeLimit) {
* @param writeLimit write limit
*/
public WriteOutContentHandler(Writer writer, int writeLimit) {
this(new ToTextContentHandler(writer), writeLimit);
this(new ToTextContentHandler(writer), writeLimit, -1L);
}

/**
Expand All @@ -81,7 +122,7 @@ public WriteOutContentHandler(Writer writer, int writeLimit) {
* @param writer writer
*/
public WriteOutContentHandler(Writer writer) {
this(writer, -1);
this(writer, -1, -1L);
}

/**
Expand Down Expand Up @@ -109,7 +150,7 @@ public WriteOutContentHandler(OutputStream stream) {
* or -1 to disable the write limit
*/
public WriteOutContentHandler(int writeLimit) {
this(new StringWriter(), writeLimit);
this(new StringWriter(), writeLimit, -1L);
}

/**
Expand All @@ -132,9 +173,19 @@ public WriteOutContentHandler() {
@Override
public void characters(char[] ch, int start, int length)
throws SAXException {
if (parseStartedMs == 0) {
parseStartedMs = System.currentTimeMillis();
}
if (writeLimit == -1 || writeCount + length <= writeLimit) {
super.characters(ch, start, length);
writeCount += length;
if (maxParseTime > -1 && System.currentTimeMillis() - parseStartedMs > maxParseTime) {
throw new MaxParseTimeReachedException(
"Your document took more than " + maxParseTime
+ " ms to parse. To receive the full text of the document,"
+ " increase your max parse limit. (Text up to the limit is"
+ " however available).", tag);
}
} else {
super.characters(ch, start, writeLimit - writeCount);
writeCount = writeLimit;
Expand Down Expand Up @@ -182,6 +233,23 @@ public boolean isWriteLimitReached(Throwable t) {
}
}

/**
 * Checks whether the given exception (or any of its root causes) was
 * thrown by this handler as a signal of reaching the max parse time.
 * The match is made on the handler tag carried by the exception, so a
 * signal raised by a different handler instance is not reported.
 *
 * @since Apache Tika 1.25
 * @param t throwable
 * @return <code>true</code> if the max parse time was reached,
 *         <code>false</code> otherwise
 */
public boolean isMaxParseTimeReached(Throwable t) {
    if (!(t instanceof MaxParseTimeReachedException)) {
        // Some other exception type: look further down the cause chain.
        Throwable cause = t.getCause();
        return cause != null && isMaxParseTimeReached(cause);
    }
    MaxParseTimeReachedException signal = (MaxParseTimeReachedException) t;
    return tag.equals(signal.tag);
}

/**
* The exception used as a signal when the write limit has been reached.
*/
Expand All @@ -200,4 +268,22 @@ public WriteLimitReachedException(String message, Serializable tag) {

}

/**
 * The exception used as a signal when the max parse time has been reached.
 * It carries the tag of the handler that raised it, which
 * {@code isMaxParseTimeReached(Throwable)} compares against the handler's
 * own tag.
 */
private static class MaxParseTimeReachedException extends SAXException {

/** Serial version UID */
private static final long serialVersionUID = -5860681246457428931L;

/** Serializable tag of the handler that caused this exception */
private final Serializable tag;

/**
 * @param message human-readable description of the exceeded time limit
 * @param tag tag of the handler raising the signal
 */
public MaxParseTimeReachedException(String message, Serializable tag) {
super(message);
this.tag = tag;
}

}

}
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
import java.io.InputStream;

import org.apache.cxf.jaxrs.ext.multipart.Attachment;
import org.apache.tika.language.detect.LanguageHandler;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
Expand Down Expand Up @@ -139,20 +138,25 @@ private MetadataList parseMetadata(InputStream is, Metadata metadata,
TikaResource.fillParseContext(context, httpHeaders, null);
TikaResource.logRequest(LOG, info, metadata);

int writeLimit = -1;
if (httpHeaders.containsKey("writeLimit")) {
writeLimit = Integer.parseInt(httpHeaders.getFirst("writeLimit"));
}
int writeLimit = -1;
if (httpHeaders.containsKey("writeLimit")) {
writeLimit = Integer.parseInt(httpHeaders.getFirst("writeLimit"));
}

int maxEmbeddedResources = -1;
if (httpHeaders.containsKey("maxEmbeddedResources")) {
maxEmbeddedResources = Integer.parseInt(httpHeaders.getFirst("maxEmbeddedResources"));
}
int maxEmbeddedResources = -1;
if (httpHeaders.containsKey("maxEmbeddedResources")) {
maxEmbeddedResources = Integer.parseInt(httpHeaders.getFirst("maxEmbeddedResources"));
}

long maxParseTime = -1L;
if (httpHeaders.containsKey("maxParseTime")) {
maxParseTime = Long.parseLong(httpHeaders.getFirst("maxParseTime"));
}

BasicContentHandlerFactory.HANDLER_TYPE type =
BasicContentHandlerFactory.HANDLER_TYPE type =
BasicContentHandlerFactory.parseHandlerType(handlerTypeName, DEFAULT_HANDLER_TYPE);
RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
new BasicContentHandlerFactory(type, writeLimit), maxEmbeddedResources,
new BasicContentHandlerFactory(type, writeLimit, maxParseTime), maxEmbeddedResources,
TikaResource.getConfig().getMetadataFilter());
try {
TikaResource.parse(wrapper, LOG, info.getPath(), is, handler, metadata, context);
Expand Down
Loading