Skip to content

Commit

Permalink
🎨 🔧 Log2, scaling and linting
Browse files Browse the repository at this point in the history
- Optional log2 transform of continuous data
- Black formatting of training_loop.py
  • Loading branch information
mpielies committed Aug 8, 2024
1 parent 30bee60 commit feefc76
Show file tree
Hide file tree
Showing 6 changed files with 23 additions and 12 deletions.
1 change: 1 addition & 0 deletions src/move/conf/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ class InputConfig:
@dataclass
class ContinuousInputConfig(InputConfig):
scale: bool = True
log2: bool = False


@dataclass
Expand Down
6 changes: 4 additions & 2 deletions src/move/data/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def one_hot_encode_single(mapping: dict[str, int], value: Optional[str]) -> IntA
return encoded_value


def scale(x: np.ndarray) -> tuple[FloatArray, BoolArray]:
def scale(x: np.ndarray, log2: bool = False) -> tuple[FloatArray, BoolArray]:
"""Center to mean and scale to unit variance. Convert NaN values to 0.
Args:
Expand All @@ -74,7 +74,9 @@ def scale(x: np.ndarray) -> tuple[FloatArray, BoolArray]:
Tuple containing (1) scaled output and (2) a 1D mask marking columns
(i.e., features) without zero variance
"""
logx = np.log2(x + 1)
logx = x
if log2:
logx = np.log2(x + 1)
mask_1d = ~np.isclose(np.nanstd(logx, axis=0), 0.0)
scaled_x = standardize(logx[:, mask_1d], axis=0)
scaled_x[np.isnan(scaled_x)] = 0
Expand Down
19 changes: 10 additions & 9 deletions src/move/tasks/encode_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,18 +55,19 @@ def encode_data(config: DataConfig):
# before preprocessing:
fig = plot_value_distributions(values)
fig_path = str(
output_path / "Value_distribution_{}_unprocessed.png".format(dataset_name)
output_path / f"Value_distribution_{dataset_name}_unprocessed.png"
)
fig.savefig(fig_path)

# Plotting the value distribution for all continuous datasets:
fig = plot_value_distributions(values)
fig_path = str(output_path / f"Value_distribution_{dataset_name}.png")
fig.savefig(fig_path)

if scale:
values, mask_1d = preprocessing.scale(values)
logger.debug(f"Scaling dataset: {dataset_name}, log2 transform: {input_config.log2}")
values, mask_1d = preprocessing.scale(values, input_config.log2)
names = names[mask_1d]
logger.debug(f"Columns with zero variance: {np.sum(~mask_1d)}")
io.dump_names(interim_data_path / f"{input_config.name}.txt", names)
np.save(interim_data_path / f"{input_config.name}.npy", values)
# Plotting the value distribution for all continuous datasets:
fig = plot_value_distributions(values)
fig_path = str(output_path / f"Value_distribution_{dataset_name}.png")
fig.savefig(fig_path)

io.dump_names(interim_data_path / f"{dataset_name}.txt", names)
np.save(interim_data_path / f"{dataset_name}.npy", values)
1 change: 0 additions & 1 deletion src/move/training/training_loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,6 @@ def training_loop(

kld_weight = 0.0


for epoch in range(1, num_epochs + 1):
if epoch in kld_warmup_steps:
kld_weight += 1 / len(kld_warmup_steps)
Expand Down
4 changes: 4 additions & 0 deletions tutorial/config/data/random_continuous.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,8 @@ categorical_inputs: [] # no categorical inputs

continuous_inputs: # a list of continuous datasets
- name: random.continuous.proteomics
log2: true
scale: true
- name: random.continuous.metagenomics
log2: true
scale: true
4 changes: 4 additions & 0 deletions tutorial/config/data/random_small.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,8 @@ categorical_inputs: # a list of categorical datasets

continuous_inputs: # a list of continuous datasets
- name: random.small.proteomics
log2: true # apply log2 before scaling
scale: true # scale data (z-score normalize)
- name: random.small.metagenomics
log2: true
scale: true

0 comments on commit feefc76

Please sign in to comment.