diff --git a/src/dist/edu/umd/cloud9/collection/wikipedia/WikipediaPage.java b/src/dist/edu/umd/cloud9/collection/wikipedia/WikipediaPage.java index 497e767c8..3ba500db3 100644 --- a/src/dist/edu/umd/cloud9/collection/wikipedia/WikipediaPage.java +++ b/src/dist/edu/umd/cloud9/collection/wikipedia/WikipediaPage.java @@ -84,8 +84,9 @@ public abstract class WikipediaPage extends Indexable { /** * Start delimiter of the text, which is <text xml:space=\"preserve\">. + * Note: No close bracket because text element can have multiple attributes. */ - protected static final String XML_START_TAG_TEXT = ""; + protected static final String XML_START_TAG_TEXT = "/text>. @@ -172,9 +173,11 @@ public String getLanguage() { */ public String getContent() { String s = getWikiMarkup(); - + if(s == null) return null; // Bliki doesn't seem to properly handle inter-language links, so remove manually. - s = LANG_LINKS.matcher(s).replaceAll(" "); + if(LANG_LINKS.matcher(s).matches()){ + s = LANG_LINKS.matcher(s).replaceAll(" "); + } wikiModel.setUp(); s = getTitle() + "\n" + wikiModel.render(textConverter, s); @@ -222,7 +225,7 @@ public String getRawXML() { * Returns the text of this page. */ public String getWikiMarkup() { - if (textStart == -1) + if (textStart == -1 || textStart + 27 > textEnd) return null; return page.substring(textStart + 27, textEnd); diff --git a/src/dist/edu/umd/cloud9/collection/wikipedia/language/EnglishWikipediaPage.java b/src/dist/edu/umd/cloud9/collection/wikipedia/language/EnglishWikipediaPage.java index e8f9c3b8e..a8b1a9d72 100644 --- a/src/dist/edu/umd/cloud9/collection/wikipedia/language/EnglishWikipediaPage.java +++ b/src/dist/edu/umd/cloud9/collection/wikipedia/language/EnglishWikipediaPage.java @@ -53,18 +53,22 @@ protected void processPage(String s) { // parse out title int start = s.indexOf(XML_START_TAG_TITLE); int end = s.indexOf(XML_END_TAG_TITLE, start); + if(start < 0 || end < 0){ + textStart = -1; + return; + } this.title = StringEscapeUtils.unescapeHtml(s.substring(start + 7, end)); // determine if article belongs to the article namespace start = s.indexOf(XML_START_TAG_NAMESPACE); end = s.indexOf(XML_END_TAG_NAMESPACE); - this.isArticle = start == -1 ? true : s.substring(start + 4, end).trim().equals("0"); + this.isArticle = (start == -1 || end == -1 || start > end) ? false : s.substring(start + 4, end).trim().equals("0"); // add check because namespace tag not present in older dumps // parse out the document id start = s.indexOf(XML_START_TAG_ID); end = s.indexOf(XML_END_TAG_ID); - this.mId = s.substring(start + 4, end); + this.mId = (start == -1 || end == -1 || start > end) ? "0" : s.substring(start + 4, end); // parse out actual text of article this.textStart = s.indexOf(XML_START_TAG_TEXT);