Skip to content
This repository has been archived by the owner on May 6, 2018. It is now read-only.

Error checks for parsing wikipedia dumps #52

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions src/dist/edu/umd/cloud9/collection/wikipedia/WikipediaPage.java
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,9 @@ public abstract class WikipediaPage extends Indexable {

/**
* Start delimiter of the text, which is &lt;<code>text xml:space=\"preserve\"</code>&gt;.
* Note: the closing bracket is omitted because the text element can have multiple attributes.
*/
protected static final String XML_START_TAG_TEXT = "<text xml:space=\"preserve\">";
protected static final String XML_START_TAG_TEXT = "<text xml:space=\"preserve\"";

/**
* End delimiter of the text, which is &lt;<code>/text</code>&gt;.
Expand Down Expand Up @@ -172,9 +173,11 @@ public String getLanguage() {
*/
public String getContent() {
String s = getWikiMarkup();

if(s == null) return null;
// Bliki doesn't seem to properly handle inter-language links, so remove manually.
s = LANG_LINKS.matcher(s).replaceAll(" ");
if(LANG_LINKS.matcher(s).matches()){
s = LANG_LINKS.matcher(s).replaceAll(" ");
}

wikiModel.setUp();
s = getTitle() + "\n" + wikiModel.render(textConverter, s);
Expand Down Expand Up @@ -222,7 +225,7 @@ public String getRawXML() {
* Returns the text of this page.
*/
public String getWikiMarkup() {
if (textStart == -1)
if (textStart == -1 || textStart + 27 > textEnd)
return null;

return page.substring(textStart + 27, textEnd);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,18 +53,22 @@ protected void processPage(String s) {
// parse out title
int start = s.indexOf(XML_START_TAG_TITLE);
int end = s.indexOf(XML_END_TAG_TITLE, start);
if(start < 0 || end < 0){
textStart = -1;
return;
}
this.title = StringEscapeUtils.unescapeHtml(s.substring(start + 7, end));

// determine if article belongs to the article namespace
start = s.indexOf(XML_START_TAG_NAMESPACE);
end = s.indexOf(XML_END_TAG_NAMESPACE);
this.isArticle = start == -1 ? true : s.substring(start + 4, end).trim().equals("0");
this.isArticle = (start == -1 || end == -1 || start > end) ? false : s.substring(start + 4, end).trim().equals("0");
// The check above is needed because the namespace tag is not present in older dumps.

// parse out the document id
start = s.indexOf(XML_START_TAG_ID);
end = s.indexOf(XML_END_TAG_ID);
this.mId = s.substring(start + 4, end);
this.mId = (start == -1 || end == -1 || start > end) ? "0" : s.substring(start + 4, end);

// parse out actual text of article
this.textStart = s.indexOf(XML_START_TAG_TEXT);
Expand Down