diff --git a/src/dist/edu/umd/cloud9/collection/wikipedia/WikipediaPage.java b/src/dist/edu/umd/cloud9/collection/wikipedia/WikipediaPage.java
index 497e767c8..3ba500db3 100644
--- a/src/dist/edu/umd/cloud9/collection/wikipedia/WikipediaPage.java
+++ b/src/dist/edu/umd/cloud9/collection/wikipedia/WikipediaPage.java
@@ -84,8 +84,9 @@ public abstract class WikipediaPage extends Indexable {
/**
* Start delimiter of the text, which is <text xml:space=\"preserve\">.
+ * Note: No close bracket because text element can have multiple attributes.
*/
- protected static final String XML_START_TAG_TEXT = "";
+ protected static final String XML_START_TAG_TEXT = "/text>.
@@ -172,9 +173,11 @@ public String getLanguage() {
*/
public String getContent() {
String s = getWikiMarkup();
-
+ if(s == null) return null;
// Bliki doesn't seem to properly handle inter-language links, so remove manually.
- s = LANG_LINKS.matcher(s).replaceAll(" ");
+ // Use find(), not matches(): matches() succeeds only when the ENTIRE markup string
+ // matches the pattern, so links embedded in a normal page would never be removed.
+ if(LANG_LINKS.matcher(s).find()){
+ s = LANG_LINKS.matcher(s).replaceAll(" ");
+ }
wikiModel.setUp();
s = getTitle() + "\n" + wikiModel.render(textConverter, s);
@@ -222,7 +225,7 @@ public String getRawXML() {
* Returns the text of this page.
*/
public String getWikiMarkup() {
if (textStart == -1)
return null;
+ // The <text ...> start tag may carry additional attributes, so find its closing
+ // bracket instead of assuming the tag is exactly 27 characters long.
+ int contentStart = page.indexOf('>', textStart) + 1;
+ // contentStart == 0 means no closing bracket was found; contentStart > textEnd
+ // means there is no text between the start tag and the end offset.
+ if (contentStart == 0 || contentStart > textEnd)
+ return null;
- return page.substring(textStart + 27, textEnd);
+ return page.substring(contentStart, textEnd);
diff --git a/src/dist/edu/umd/cloud9/collection/wikipedia/language/EnglishWikipediaPage.java b/src/dist/edu/umd/cloud9/collection/wikipedia/language/EnglishWikipediaPage.java
index e8f9c3b8e..a8b1a9d72 100644
--- a/src/dist/edu/umd/cloud9/collection/wikipedia/language/EnglishWikipediaPage.java
+++ b/src/dist/edu/umd/cloud9/collection/wikipedia/language/EnglishWikipediaPage.java
@@ -53,18 +53,22 @@ protected void processPage(String s) {
// parse out title
int start = s.indexOf(XML_START_TAG_TITLE);
int end = s.indexOf(XML_END_TAG_TITLE, start);
+ if(start < 0 || end < 0){
+ textStart = -1;
+ return;
+ }
this.title = StringEscapeUtils.unescapeHtml(s.substring(start + 7, end));
// determine if article belongs to the article namespace
start = s.indexOf(XML_START_TAG_NAMESPACE);
end = s.indexOf(XML_END_TAG_NAMESPACE);
- this.isArticle = start == -1 ? true : s.substring(start + 4, end).trim().equals("0");
+ // A missing namespace start tag keeps the previous behavior (page counts as an article).
+ // A start tag without a usable end tag (absent, or located before it) is malformed
+ // and is classified as a non-article instead of throwing from substring().
+ this.isArticle = (start == -1) ? true : (end > start) && s.substring(start + 4, end).trim().equals("0");
// add check because namespace tag not present in older dumps
// parse out the document id
start = s.indexOf(XML_START_TAG_ID);
end = s.indexOf(XML_END_TAG_ID);
- this.mId = s.substring(start + 4, end);
+ this.mId = (start == -1 || end == -1 || start > end) ? "0" : s.substring(start + 4, end);
// parse out actual text of article
this.textStart = s.indexOf(XML_START_TAG_TEXT);