
Commit 96f7da9: external
PrathamSoni committed Oct 21, 2021
1 parent 2c9c65e
Showing 8 changed files with 229 additions and 157 deletions.
45 changes: 41 additions & 4 deletions models/data.py
@@ -9,13 +9,15 @@
os.chdir(os.path.dirname(os.path.abspath(__file__)))
sys.path.append("../utils")
from features import Mel, get_vggish_embedding, preprocess
import labels as la
import librosa as lb
from file import get_location
import augment as au
import torchvision.transforms as transforms
import time
import numpy as np
import h5py as h5
import copy
from glob import glob

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
@@ -79,7 +81,7 @@ def __getitem__(self, idx):

# Positive pairs are from the same locations in the lung. Negative pairs are from different patients in the same
# or different locations.
try :
try:
if self.exp == 0:
cycles = self.labels[self.labels['ID'] == id].drop(columns=['level_0'])
cycles = cycles.reset_index()
@@ -526,6 +528,9 @@ def get_class_val(self, row):
else:
return 1

else:
return row["diagnosis"]

def get_split(self, df, split_file_path, train_prop=1.0):
# Takes in a DataFrame and a path to a file of only Ints denoting Patient ID.
# Returns a DataFrame of only samples with Patient IDs contained in the split.
@@ -579,7 +584,7 @@ def __len__(self):
def __getitem__(self, idx):
row = self.labels.iloc[idx]
X = self.data[idx]
#Resize function here possibly (check X dim)
# Resize function here possibly (check X dim)

# Get label
y = self.get_class_val(row)
@@ -681,6 +686,36 @@ def get_split(self, df, split_file_path, train_prop=1.0):
return df[df.ID.isin(IDs)]


class ExternalDataset(Dataset):
def __init__(self, path):
self.files = glob(os.path.join(os.getcwd(), path, "processed/audio/*.wav"))
self.max_len = 0
self.mel_transform = Mel()
data_list = []

for file in self.files:
data, _ = lb.load(file)
data_list.append(data)
self.max_len = max(data.shape[0], self.max_len)

self.data = np.zeros((len(data_list), self.max_len))
for i, datum in enumerate(data_list):
length = datum.shape[0]
self.data[i][(self.max_len - length) // 2:(self.max_len - length) // 2 + length] = datum

self.norm_func = transforms.Normalize([self.data.mean()], [self.data.std()])


def __len__(self):
return len(self.files)

def __getitem__(self, idx):
path = self.files[idx]
diagnosis = path.split("_")[1].split(",")[0]
X, y = process_data("test", self.mel_transform, self.data[idx], la.external_one_hot(diagnosis), self.norm_func)
return idx, X, y


def get_transform(augment=None):
if augment is None:
mel = Mel()
@@ -740,7 +775,7 @@ def process_data(mode, augment, X, y, norm_func):
def get_dataset(task, label_file, base_dir, split="train", train_prop=1.0, df=None, transform=None, data=None,
exp=None):
dataset = []
if task == "crackle" or task == "disease" or task == "wheeze":
if task == "crackle" or task == "disease" or task == "wheeze" or task == "lung":
dataset = LungDataset(label_file, base_dir, task, split=split, transform=transform, train_prop=train_prop,
df=df, data=data)
elif task == "heart":
@@ -757,6 +792,8 @@ def get_dataset(task, label_file, base_dir, split="train", train_prop=1.0, df=None,
dataset = LungDataset(label_file, base_dir, "disease", split=split, transform=transform,
train_prop=train_prop,
df=df, data=data)
elif task == "external":
dataset = ExternalDataset(base_dir)
return dataset


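For context on the new ExternalDataset above: every recording is zero-padded to the length of the longest clip, with the audio centered in the buffer. Below is a minimal standalone sketch of that padding scheme, assuming toy arrays in place of lb.load() output (the clip sizes are illustrative, not from the commit):

import numpy as np

# Stand-ins for the waveforms lb.load() returns.
clips = [np.ones(3), np.ones(5), np.ones(4)]
max_len = max(clip.shape[0] for clip in clips)

# Zero-pad every clip to max_len with the audio centered,
# mirroring the loop in ExternalDataset.__init__.
data = np.zeros((len(clips), max_len))
for i, clip in enumerate(clips):
    length = clip.shape[0]
    start = (max_len - length) // 2  # left offset that centers the clip
    data[i, start:start + length] = clip

print(data)
# [[0. 1. 1. 1. 0.]
#  [1. 1. 1. 1. 1.]
#  [1. 1. 1. 1. 0.]]

Centering rather than left-aligning keeps short recordings symmetric inside the fixed-length buffer before the dataset-wide Normalize transform is applied.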
62 changes: 35 additions & 27 deletions models/intervals.py
@@ -75,7 +75,11 @@ def single_replicate_all_models(gt, task, models, metric_str, replicate_num):

for model in models:
y_score = pd.Series(model.scores)[sample_ids].to_list()
performance = sklearn.metrics.roc_auc_score(y_true, y_score)

if metric_str == 'auc':
performance = sklearn.metrics.roc_auc_score(y_true, y_score)
elif metric_str == 'f1':
performance = sklearn.metrics.f1_score(y_true, np.round(y_score))
performances[model.name] = performance
return performances

@@ -192,7 +196,7 @@ def run_stage3(metric, read_path, save_path):


def main(working_dir, dataset, num_replicates):
metrics = 'auc'
metrics = ['f1', 'auc']
directories = glob.glob(os.path.join(working_dir, '*'))

base_dir = os.path.split(working_dir)[0][:-5]
@@ -211,35 +215,39 @@

scikit = id, X, y = get_scikit_loader(device, dataset, label_file, base_dir, split="test",
encoder=encoder)

stage_1_save_path = f'{direct}/results/raw_{metrics}'
run_stage1(dataset,
num_replicates=num_replicates,
metric_str=metrics,
read_path=f'{direct}',
save_path=stage_1_save_path,
encoder=encoder,
gt=y,
data=data,
scikit=scikit
)

stage_2_save_path = f'{direct}/results/processed_{metrics}'
run_stage2(dataset,
read_path=stage_1_save_path,
save_path=stage_2_save_path,
confidence_level=0.05,
)

stage_3_save_path = f'{direct}/results/'
run_stage3(metrics,
read_path=stage_2_save_path,
save_path=stage_3_save_path
)
for metric in metrics:
stage_1_save_path = f'{direct}/results/raw_{metric}'
run_stage1(dataset,
num_replicates=num_replicates,
metric_str=metric,
read_path=f'{direct}',
save_path=stage_1_save_path,
encoder=encoder,
gt=y,
data=data,
scikit=scikit
)

stage_2_save_path = f'{direct}/results/processed_{metric}'
run_stage2(dataset,
read_path=stage_1_save_path,
save_path=stage_2_save_path,
confidence_level=0.05,
)

stage_3_save_path = f'{direct}/results/'
run_stage3(metric,
read_path=stage_2_save_path,
save_path=stage_3_save_path
)


if __name__ == "__main__":
num_replicates = 1000
device = 'cuda' if torch.cuda.is_available() else 'cpu'
main('../data/logs/3_27', 'disease', num_replicates)
main('../heart/logs/3_27', 'heart', num_replicates)
main('../external/logs/heart', 'external', num_replicates)
main('../external/logs/lung', 'external', num_replicates)
main('../external/logs/extra', 'external', num_replicates)
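A self-contained sketch of the metric dispatch this commit adds to single_replicate_all_models; the replicate_metric helper and the toy labels are illustrative, not part of the commit, and the f1 branch relies on np.round thresholding scores at 0.5:

import numpy as np
import sklearn.metrics

def replicate_metric(y_true, y_score, metric_str):
    # AUC consumes raw scores; F1 needs hard predictions, hence np.round.
    if metric_str == 'auc':
        return sklearn.metrics.roc_auc_score(y_true, y_score)
    elif metric_str == 'f1':
        return sklearn.metrics.f1_score(y_true, np.round(y_score))
    raise ValueError(f"unsupported metric: {metric_str}")

y_true = [0, 1, 1, 0, 1]
y_score = [0.2, 0.8, 0.6, 0.4, 0.9]
print(replicate_metric(y_true, y_score, 'auc'))  # 1.0
print(replicate_metric(y_true, y_score, 'f1'))   # 1.0

With metrics now a list, main runs the three bootstrap stages once per metric, producing raw_f1/processed_f1 alongside raw_auc/processed_auc under each run's results directory.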

29 changes: 15 additions & 14 deletions models/visualize.py
@@ -40,21 +40,22 @@ def visualize_embeddings(task, base_dir, log_dir, model_file):

pca_50 = PCA(n_components=50)
pca_result_50 = pca_50.fit_transform(data_subset)
tsne = TSNE(n_components=2, verbose=0, perplexity=40, n_iter=300)
tsne_pca_results = tsne.fit_transform(pca_result_50)
for perplexity in [5, 25, 50]:
tsne = TSNE(n_components=2, verbose=0, perplexity=perplexity, n_iter=300)
tsne_pca_results = tsne.fit_transform(pca_result_50)

df['tsne-pca50-one'] = tsne_pca_results[:, 0]
df['tsne-pca50-two'] = tsne_pca_results[:, 1]
plt.figure(figsize=(5, 4))
sns.scatterplot(
x="tsne-pca50-one", y="tsne-pca50-two",
hue="y",
palette=sns.color_palette("hls", 2),
data=df,
legend="full",
alpha=0.3,
)
plt.savefig(os.path.join(log_dir, model_file[:-4] + '_viz.png'))
df['tsne-pca50-one'] = tsne_pca_results[:, 0]
df['tsne-pca50-two'] = tsne_pca_results[:, 1]
plt.figure(figsize=(5, 4))
sns.scatterplot(
x="tsne-pca50-one", y="tsne-pca50-two",
hue="y",
palette=sns.color_palette("hls", len(pd.unique(df['y']))),
data=df,
legend="full",
alpha=0.3,
)
plt.savefig(os.path.join(log_dir, model_file[:-4] + f'_{perplexity}_viz.png'))


if __name__ == '__main__':
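A toy version of the PCA-then-t-SNE perplexity sweep introduced above, assuming random vectors in place of the model embeddings; the [5, 25, 50] grid and n_iter=300 follow the diff (n_iter matches the scikit-learn API current when this commit landed and is named max_iter in recent releases):

import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

rng = np.random.default_rng(0)
data_subset = rng.normal(size=(200, 128))  # stand-in for embeddings

# Reduce once with PCA, then fit one t-SNE per perplexity setting,
# as visualize_embeddings now does.
pca_result_50 = PCA(n_components=50).fit_transform(data_subset)
for perplexity in [5, 25, 50]:
    tsne = TSNE(n_components=2, verbose=0, perplexity=perplexity, n_iter=300)
    tsne_pca_results = tsne.fit_transform(pca_result_50)
    print(perplexity, tsne_pca_results.shape)  # (200, 2) for each setting

Sweeping perplexity yields three scatter plots per model instead of one, and sizing the palette by len(pd.unique(df['y'])) lets the same plotting code serve tasks with more than two classes.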