lintool · seweissman · Jun 11, 2013 · Jun 11, 2013
diff --git a/src/dist/edu/umd/cloud9/collection/wikipedia/WikipediaPage.java b/src/dist/edu/umd/cloud9/collection/wikipedia/WikipediaPage.java
@@ -84,8 +84,9 @@ public abstract class WikipediaPage extends Indexable {
 
   /**
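-   * Start delimiter of the text, which is &lt;<code>text xml:space=\"preserve\"</code>&gt;.
+   * Start delimiter of the text: the opening of the &lt;<code>text xml:space="preserve"</code> tag.
+   * The closing bracket is deliberately omitted because the text element can have multiple attributes.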
    */
-  protected static final String XML_START_TAG_TEXT = "<text xml:space=\"preserve\">";
+  protected static final String XML_START_TAG_TEXT = "<text xml:space=\"preserve\"";
 
   /**
    * End delimiter of the text, which is &lt;<code>/text</code>&gt;.
@@ -172,9 +173,11 @@ public String getLanguage() {
    */
   public String getContent() {
     String s = getWikiMarkup();
-
+    if (s == null) { // getWikiMarkup() found no usable text section, so there is nothing to render
+      return null;
+    }
     // Bliki doesn't seem to properly handle inter-language links, so remove manually.
     s = LANG_LINKS.matcher(s).replaceAll(" ");
 
     wikiModel.setUp();
     s = getTitle() + "\n" + wikiModel.render(textConverter, s);
@@ -222,7 +225,7 @@ public String getRawXML() {
    * Returns the text of this page.
    */
   public String getWikiMarkup() {
-    if (textStart == -1)
+    if (textStart == -1 || textStart + 27 > textEnd) // NOTE(review): 27 == length of <text xml:space="preserve">; presumably wrong when the tag carries extra attributes - confirm
       return null;
 
     return page.substring(textStart + 27, textEnd);

diff --git a/src/dist/edu/umd/cloud9/collection/wikipedia/language/EnglishWikipediaPage.java b/src/dist/edu/umd/cloud9/collection/wikipedia/language/EnglishWikipediaPage.java
@@ -53,18 +53,22 @@ protected void processPage(String s) {
     // parse out title
     int start = s.indexOf(XML_START_TAG_TITLE);
     int end = s.indexOf(XML_END_TAG_TITLE, start);
+    if(start < 0 || end < 0){
+        textStart = -1;
+        return;
+    }
     this.title = StringEscapeUtils.unescapeHtml(s.substring(start + 7, end));
 
     // determine if article belongs to the article namespace
     start = s.indexOf(XML_START_TAG_NAMESPACE);
     end = s.indexOf(XML_END_TAG_NAMESPACE);
-    this.isArticle = start == -1 ? true : s.substring(start + 4, end).trim().equals("0");
+    this.isArticle = start == -1 || (end >= start + 4 && s.substring(start + 4, end).trim().equals("0"));
     // add check because namespace tag not present in older dumps
 
     // parse out the document id
     start = s.indexOf(XML_START_TAG_ID);
     end = s.indexOf(XML_END_TAG_ID);
-    this.mId = s.substring(start + 4, end);
+    this.mId = (start == -1 || end == -1 || start > end) ? "0" : s.substring(start + 4, end);
 
     // parse out actual text of article
     this.textStart = s.indexOf(XML_START_TAG_TEXT);