Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Iterator for directories containing text files. #35

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 112 additions & 0 deletions src/cc/mallet/pipe/iterator/TextFileIterator.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
package cc.mallet.pipe.iterator;

import cc.mallet.types.Instance;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FilenameFilter;
import java.util.Arrays;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.Scanner;
/**
 * An iterator that walks the text files of a single directory and returns
 * one {@link Instance} per file.
 * <p>
 * Only regular files whose name ends with <code>filter + ".txt"</code> are
 * visited, in sorted path order. File names are interpreted as
 * <code>&lt;uri&gt;_&lt;target&gt;.txt</code>: the part before the first
 * underscore becomes the instance name and the part between that underscore
 * and the following dot becomes the instance target. The instance data is the
 * file content, decoded as UTF-8, with every line trimmed and lines of two
 * characters or fewer dropped.
 */

public class TextFileIterator implements Iterator<Instance>
{
    /** Files to iterate over; assigned once by the constructor. */
    File[] fileList;

    /** Index into {@link #fileList} of the file the next call to {@link #next()} returns. */
    int currentFile;

    /**
     * Creates an iterator over the matching text files of a directory.
     *
     * @param path   the path of the directory containing the files
     * @param filter required ending of the file name in front of the
     *               <code>".txt"</code> extension; <code>null</code> is
     *               treated as the empty string (every <code>.txt</code> file)
     * @throws IllegalStateException if <code>path</code> is not a directory or
     *                               its contents cannot be listed
     */
    public TextFileIterator(String path, String filter) {
        fileList = collectFiles(path, filter);
        currentFile = 0;
    }

    /**
     * Builds the sorted array of regular files in a directory whose name ends
     * with <code>suffix + ".txt"</code>.
     *
     * @param path   the directory path
     * @param suffix file-name ending in front of <code>".txt"</code>, may be null
     * @return the matching files, sorted by path; never null
     * @throws IllegalStateException if the path is not a directory or the
     *                               directory cannot be listed
     */
    private static File[] collectFiles(String path, String suffix) {
        // A null suffix means "no restriction beyond the .txt extension".
        final String ending = (suffix == null ? "" : suffix) + ".txt";

        File dir = new File(path);
        if (!dir.isDirectory()) {
            throw new IllegalStateException("The given path is not a directory.");
        }

        File[] files = dir.listFiles(new FilenameFilter() {
            @Override
            public boolean accept(File parent, String name) {
                // Skip sub-directories that happen to carry a matching name;
                // they cannot be read as text.
                return name.endsWith(ending) && new File(parent, name).isFile();
            }
        });
        if (files == null) {
            // listFiles signals an I/O error with null instead of an exception.
            throw new IllegalStateException("Could not list the files of directory: " + path);
        }

        // The order returned by listFiles is unspecified; sort so that
        // iteration is reproducible across runs and platforms.
        Arrays.sort(files);
        return files;
    }

    /**
     * Reads a file as UTF-8 text. Every line is trimmed, lines of two
     * characters or fewer are skipped, and the remaining lines are joined
     * with a newline so that words at line boundaries are not fused together.
     *
     * @param file the file to read
     * @return the filtered content of the file, possibly empty
     * @throws FileNotFoundException if the file cannot be opened
     */
    private static String readFile(File file) throws FileNotFoundException {
        Scanner scanner = new Scanner(file, "UTF-8");
        try {
            StringBuilder text = new StringBuilder();
            while (scanner.hasNextLine()) {
                String line = scanner.nextLine().trim();
                if (line.length() > 2) {
                    if (text.length() > 0) {
                        text.append('\n');
                    }
                    text.append(line);
                }
            }
            return text.toString();
        } finally {
            // Release the file handle even if reading fails part-way.
            scanner.close();
        }
    }

    /**
     * Returns the instance built from the next file and advances the iterator.
     * The iterator advances even when the file turns out to be unreadable, so
     * a caller may catch the exception and continue with the following file.
     *
     * @return the next instance: data is the file text, target and name are
     *         derived from the file name, source is null
     * @throws NoSuchElementException if there are no more files
     * @throws IllegalStateException  if the file cannot be read
     */
    @Override
    public Instance next() {
        if (!hasNext()) {
            throw new NoSuchElementException("No more files to iterate over.");
        }
        File file = fileList[currentFile++];

        String data;
        try {
            data = readFile(file);
        } catch (FileNotFoundException e) {
            throw new IllegalStateException("Could not read file: " + file.getPath(), e);
        }

        String name = file.getName();
        int underscore = name.indexOf('_');
        // Without an underscore the target starts at the beginning of the name.
        int targetStart = underscore + 1;
        // Look for the dot only after the underscore, so that a dot in the
        // uri part cannot produce an inverted substring range.
        int targetEnd = name.indexOf('.', targetStart);
        if (targetEnd < 0) {
            targetEnd = name.length();
        }
        String target = name.substring(targetStart, targetEnd);
        String uri = underscore < 0 ? "" : name.substring(0, underscore);

        return new Instance(data, target, uri, null);
    }

    /**
     * @return true if there are more files in the iterator.
     */
    @Override
    public boolean hasNext() {
        return fileList != null && currentFile < fileList.length;
    }

    /**
     * Not supported by this iterator.
     *
     * @throws IllegalStateException always
     */
    @Override
    public void remove() {
        throw new IllegalStateException ("This Iterator<Instance> does not support remove().");
    }

}