Skip to content

Commit

Permalink
Added wordbag serialization to Clusterer
Browse files Browse the repository at this point in the history
  • Loading branch information
MarceloLaser committed Jun 6, 2022
1 parent 1c9bbd9 commit f5d7db1
Show file tree
Hide file tree
Showing 6 changed files with 62 additions and 16 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ public class Architecture extends TreeMap<String, Cluster> {
/**
* The path to where this data structure should be serialized.
*/
private final String projectPath;
private final SimMeasure.SimMeasureType simMeasure;
public final String projectPath;
public final SimMeasure.SimMeasureType simMeasure;
/**
* The total number of features that can exist in any clusters of this
* architecture. For architectures constructed from structural data such as
Expand Down Expand Up @@ -330,8 +330,7 @@ public void writeToRsf(String path) throws FileNotFoundException {

Map<Integer, String> architectureIndex = computeArchitectureIndex();

try (PrintWriter out = new PrintWriter(
new OutputStreamWriter(
try (PrintWriter out = new PrintWriter(new OutputStreamWriter(
new FileOutputStream(rsfFile), StandardCharsets.UTF_8))) {
for (Map.Entry<Integer, String> cluster : architectureIndex.entrySet()) {
Integer clusterIndex = cluster.getKey();
Expand All @@ -351,7 +350,7 @@ public void writeToRsf(String path) throws FileNotFoundException {
* a unique identifier. The names of the entities are broken down into
* separate entries by the serialization method.
*/
private Map<Integer, String> computeArchitectureIndex() {
protected Map<Integer, String> computeArchitectureIndex() {
List<String> orderedClusterNames = this.values().stream()
.map(Cluster::getName).sorted().collect(Collectors.toList());

Expand Down
5 changes: 5 additions & 0 deletions src/main/java/edu/usc/softarch/arcade/clustering/Cluster.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@
import java.util.BitSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;

import edu.usc.softarch.arcade.topics.Concern;
import edu.usc.softarch.arcade.topics.DocTopicItem;
import edu.usc.softarch.arcade.topics.UnmatchingDocTopicItemsException;

Expand Down Expand Up @@ -211,6 +213,9 @@ public double computeStructuralCentroid(int numFeatures) {

return centroidAvg / getNumEntities();
}

/**
 * Computes the concern of this cluster by delegating to its {@code dti}
 * field.
 *
 * @param wordBags Word lists keyed by integer, handed through unchanged to
 *                 {@code dti.computeConcern}. Presumably keyed by topic
 *                 number -- TODO confirm against the caller.
 * @return Whatever {@code dti.computeConcern} returns for the given bags.
 */
public Concern computeConcern(Map<Integer, List<String>> wordBags) {
  Concern clusterConcern = this.dti.computeConcern(wordBags);
  return clusterConcern;
}
//endregion

//region OBJECT METHODS
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,10 @@ public static Architecture run(ClusteringAlgorithmType algorithm,
// Compute the clustering algorithm and return the resulting architecture
runner.computeArchitecture(stopCrit, stoppingCriterionName, simMeasure);
// Compute DTI word bags if concern-based technique is used
if (runner.architecture instanceof ConcernArchitecture)
if (runner.architecture instanceof ConcernArchitecture) {
((ConcernArchitecture) runner.architecture).computeConcernWordBags();
((ConcernArchitecture) runner.architecture).serializeBagOfWords();
}

return runner.architecture;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,18 @@
import com.fasterxml.jackson.annotation.JsonIgnore;
import edu.usc.softarch.arcade.clustering.simmeasures.SimMeasure;
import edu.usc.softarch.arcade.topics.Concern;
import edu.usc.softarch.arcade.topics.DocTopicItem;
import edu.usc.softarch.arcade.topics.DocTopics;
import edu.usc.softarch.arcade.topics.UnmatchingDocTopicItemsException;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
Expand Down Expand Up @@ -127,10 +134,40 @@ private void removeClassesWithoutDTI(Map<Cluster, Cluster> parentClassMap) {
super.removeAll(excessClusters);
super.removeAll(excessInners);
}
//endregion

//region PROCESSING
public List<Concern> computeConcernWordBags() {
return this.docTopics.getConcerns(); }
List<Concern> concernList = new ArrayList<>();
for (Cluster cluster : this.values())
concernList.add(cluster.computeConcern(this.docTopics.getTopicWordLists()));

return concernList;
}
//endregion

//region SERIALIZATION
/**
 * Writes the concern of every indexed cluster of this architecture to a
 * UTF-8 text file.
 *
 * The file is located at
 * {@code <projectPath><separator><projectName>_<simMeasure>_concerns.txt};
 * missing parent directories are created first. For each entry of the
 * architecture index one line is emitted: the integer index immediately
 * followed by the string form of the cluster's concern.
 *
 * NOTE(review): no separator is written between the index and the concern;
 * presumably the string form of Concern supplies one -- confirm.
 * NOTE(review): presumably the concerns must already have been computed
 * before this is called -- verify against the caller.
 *
 * @throws FileNotFoundException If the output file cannot be opened for
 *                               writing.
 */
public void serializeBagOfWords() throws FileNotFoundException {
  File concernsFile = new File(this.projectPath + File.separator
    + this.projectName + "_" + this.simMeasure + "_concerns.txt");
  concernsFile.getParentFile().mkdirs();

  Map<Integer, String> indexToName = computeArchitectureIndex();

  try (PrintWriter writer = new PrintWriter(new OutputStreamWriter(
      new FileOutputStream(concernsFile), StandardCharsets.UTF_8))) {
    // The whole content is assembled in memory and handed to the writer in
    // a single print call once every cluster has been visited.
    StringBuilder content = new StringBuilder();

    for (Map.Entry<Integer, String> entry : indexToName.entrySet()) {
      DocTopicItem item = this.get(entry.getValue()).getDocTopicItem();
      Concern concern = item.getConcern();
      content.append(entry.getKey())
        .append(concern)
        .append(System.lineSeparator());
    }

    writer.print(content);
  }
}
//endregion
}
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,14 @@ public double getJsDivergence(DocTopicItem toCompare)

return Maths.jensenShannonDivergence(sortedP, sortedQ);
}

//TODO make this less horrible
/**
 * Returns a new {@link Concern} constructed from this item's stored
 * {@code concern} field.
 *
 * @return A freshly constructed Concern wrapping the stored value.
 * @throws IllegalStateException If the {@code concern} field is still
 *                               {@code null}, i.e. it was requested before
 *                               being computed.
 */
public Concern getConcern() {
  if (this.concern != null) {
    return new Concern(this.concern);
  }

  throw new IllegalStateException("Attempted to get concern before " +
    "it was computed");
}
//endregion

//region PROCESSING
Expand Down
9 changes: 2 additions & 7 deletions src/main/java/edu/usc/softarch/arcade/topics/DocTopics.java
Original file line number Diff line number Diff line change
Expand Up @@ -153,13 +153,8 @@ else if (dti.source.endsWith(".S")) {
return null;
}

/**
 * Computes one concern per entry of {@code dtItemList}, in iteration order,
 * using this object's {@code topicWordLists}.
 *
 * @return A new list holding the result of {@code computeConcern} for each
 *         item.
 */
public List<Concern> getConcerns() {
  List<Concern> concerns = new ArrayList<>();

  for (DocTopicItem item : dtItemList) {
    Concern itemConcern = item.computeConcern(this.topicWordLists);
    concerns.add(itemConcern);
  }

  return concerns;
}
/**
 * Returns the {@code topicWordLists} field.
 *
 * The field itself is returned, not a copy, so changes made through the
 * returned reference are visible to this object.
 *
 * @return The map of word lists keyed by integer. Presumably the key is a
 *         topic number -- TODO confirm.
 */
public Map<Integer, List<String>> getTopicWordLists() {
  return this.topicWordLists;
}
// #endregion ACCESSORS ------------------------------------------------------

// #region PROCESSING --------------------------------------------------------
Expand Down

0 comments on commit f5d7db1

Please sign in to comment.