open-mmlab · HighPriest · Feb 16, 2024 · Feb 16, 2024
diff --git a/bins/calc_metrics.py b/bins/calc_metrics.py
@@ -86,14 +86,16 @@ def calc_metric(
             continue
 
         audios_ref = []
+        import os
+
         audios_deg = []
 
-        files = glob(ref_dir + "/*.wav")
+        files = glob(os.path.join(ref_dir, "*.wav"))
 
         for file in files:
             audios_ref.append(file)
-            uid = file.split("/")[-1].split(".wav")[0]
-            file_gt = deg_dir + "/{}.wav".format(uid)
+            uid = os.path.splitext(os.path.basename(file))[0]
+            file_gt = os.path.join(deg_dir, f"{uid}.wav")
             audios_deg.append(file_gt)
 
         if metric in ["v_uv_f1"]:

diff --git a/bins/svc/inference.py b/bins/svc/inference.py
@@ -227,14 +227,14 @@ def main():
         audio_list = []
         for suffix in ["wav", "flac", "mp3"]:
             audio_list += glob.glob(
-                os.path.join(source_audio_dir, "**/*.{}".format(suffix)), recursive=True
+                os.path.join(source_audio_dir, "**", "*.{}".format(suffix)), recursive=True
             )
         print("There are {} source audios: ".format(len(audio_list)))
 
         # Infer for every file as dataset
         output_root_path = args.output_dir
         for audio_path in tqdm(audio_list):
-            audio_name = audio_path.split("/")[-1].split(".")[0]
+            audio_name = os.path.splitext(os.path.basename(audio_path))[0]
             args.output_dir = os.path.join(output_root_path, audio_name)
             print("\n{}\nConversion for {}...\n".format("*" * 10, audio_name))
 

diff --git a/egs/tts/VALLE/README.md b/egs/tts/VALLE/README.md
@@ -54,10 +54,10 @@ Specify the `processed_dir` and the `log_dir` and for saving the processed data
 Run the `run.sh` as the preproces stage (set  `--stage 1`):
 
 ```bash
-sh egs/tts/VALLE/run.sh --stage 1
+python egs/tts/VALLE/run.py --stage 1
 ```
 
-> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "1"`.
+> **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.py` by passing, for example, `--gpu "1"`.
 
 
 ## 3. Training
@@ -74,31 +74,31 @@ We provide the default hyparameters in the `exp_config.json`. They can work on s
 
 ### Run
 
-Run the `run.sh` as the training stage (set  `--stage 2`). Specify a experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/tts/[YourExptName]`.
+Run `run.py` for the training stage (set `--stage 2`). Specify an experiment name when running the following commands. The TensorBoard logs and checkpoints will be saved in `Amphion/ckpts/tts/[YourExptName]`.
 
 Specifically, VALL-E need to train a autoregressive (AR) model and then a non-autoregressive (NAR) model. So, you can set `--model_train_stage 1` to train AR model, and set `--model_train_stage 2` to train NAR model, where `--ar_model_ckpt_dir` should be set as the ckeckpoint path to the trained AR model.
 
 
 Train a AR moel, just run:
 
 ```bash
-sh egs/tts/VALLE/run.sh --stage 2 --model_train_stage 1 --name [YourExptName]
+python egs/tts/VALLE/run.py --stage 2 --model_train_stage 1 --name [YourExptName]
 ```
 
 Train a NAR model, just run:
 ```bash
-sh egs/tts/VALLE/run.sh --stage 2 --model_train_stage 2 --ar_model_ckpt_dir [ARModelPath] --name [YourExptName]
+python egs/tts/VALLE/run.py --stage 2 --model_train_stage 2 --ar_model_ckpt_dir [ARModelPath] --name [YourExptName]
 ```
 <!-- > **NOTE:** To train a NAR model, `--checkpoint_path` should be set as the ckeckpoint path to the trained AR model. -->
 
-> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "0,1,2,3"`.
+> **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.py` by passing, for example, `--gpu "0,1,2,3"`.
 
 
 ## 4. Inference
 
 ### Configuration
 
-For inference, you need to specify the following configurations when running `run.sh`:
+For inference, you need to specify the following configurations when running `run.py`:
 
 
 
@@ -117,7 +117,7 @@ For inference, you need to specify the following configurations when running `ru
 For example, if you want to generate a single clip of speech, just run:
 
 ```bash
-sh egs/tts/VALLE/run.sh --stage 3 --gpu "0" \
+python egs/tts/VALLE/run.py --stage 3 --gpu "0" \
     --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \
     --infer_output_dir Amphion/ckpts/tts/[YourExptName]/result \
     --infer_mode "single" \

diff --git a/egs/tts/VALLE/run.py b/egs/tts/VALLE/run.py
@@ -0,0 +1,163 @@
+"""Cross-platform launcher for the VALL-E recipe.
+
+Usage: python egs/tts/VALLE/run.py --stage {1,2,3} [options]
+
+Stage 1 runs bins/tts/preprocess.py, stage 2 runs bins/tts/train.py and
+stage 3 runs bins/tts/inference.py. The selected script is started as a child
+process from the repository root, using the interpreter that runs this file.
+"""
+import os
+import sys
+import subprocess
+from argparse import ArgumentParser
+
+
+def fail(message):
+    """Print *message* and terminate the launcher with exit status 1."""
+    print(message)
+    sys.exit(1)
+
+
+def optional_args(pairs):
+    """Flatten (flag, value) pairs into argv items, skipping None values.
+
+    subprocess.run raises TypeError when the argument list contains None, so
+    options the user did not set are left out of the child command line.
+    NOTE(review): this assumes the bins/tts scripts treat these flags as
+    optional -- confirm against their argument parsers.
+    """
+    argv = []
+    for flag, value in pairs:
+        if value is not None:
+            argv += [flag, value]
+    return argv
+
+
+# Set up directories
+exp_dir = os.path.dirname(os.path.abspath(__file__))
+work_dir = os.path.abspath(os.path.join(exp_dir, '..', '..', '..'))
+bins_dir = os.path.join(work_dir, 'bins', 'tts')
+print(f"Work Directory: {work_dir}")
+
+os.environ['WORK_DIR'] = work_dir
+os.environ['PYTHONPATH'] = work_dir
+os.environ['PYTHONIOENCODING'] = 'UTF-8'
+
+# Parse parameters first so that --help and usage errors do not trigger a build
+parser = ArgumentParser()
+parser.add_argument('-c', '--config', help='Experimental Configuration File')
+parser.add_argument('-n', '--name', help='Experimental Name')
+parser.add_argument('-s', '--stage', type=int, help='Running Stage')
+parser.add_argument('--gpu', type=str, help='Visible GPU machines')
+parser.add_argument('--model_train_stage', type=str, help='Model Training Stage')
+parser.add_argument('--ar_model_ckpt_dir', type=str, help='The stage1 ckpt dir')
+parser.add_argument('--infer_expt_dir', type=str, help='The experiment dir')
+parser.add_argument('--infer_output_dir', type=str, help='The output dir to save inferred audios')
+parser.add_argument('--infer_mode', type=str, help='The inference mode')
+parser.add_argument('--infer_test_list_file', type=str, help='The inference test list file')
+parser.add_argument('--infer_text', type=str, help='The text to be synthesized from')
+parser.add_argument('--infer_text_prompt', type=str, help='The inference text prompt')
+parser.add_argument('--infer_audio_prompt', type=str, help='The inference audio prompt')
+args = parser.parse_args()
+
+# Check required parameters
+if args.stage is None:
+    fail("Error: Please specify the running stage")
+
+if args.config is None:
+    # Prefer an exp_config.json next to this script (presumably where the recipe
+    # ships it -- TODO confirm); otherwise keep the repository-root default.
+    args.config = os.path.join(exp_dir, 'exp_config.json')
+    if not os.path.isfile(args.config):
+        args.config = os.path.join(work_dir, 'exp_config.json')
+print(f"Experimental Configuration File: {args.config}")
+
+if args.gpu is None:
+    args.gpu = '0'
+# Exported for every stage, so that --gpu is honoured by inference as well
+os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
+
+# Run child scripts with the current interpreter rather than whatever "python" is on PATH
+python = sys.executable or 'python'
+
+if args.stage == 1:
+    # Features Extraction
+    cmd = [
+        python, os.path.join(bins_dir, 'preprocess.py'),
+        '--config', args.config,
+        '--num_workers', '4',
+    ]
+elif args.stage == 2:
+    # Training
+    if args.name is None:
+        fail("Error: Please specify the experiments name")
+
+    if args.model_train_stage == '2' and args.ar_model_ckpt_dir is None:
+        fail("Error: Please specify the checkpoint path to the trained model in stage1.")
+
+    if args.model_train_stage == '1':
+        # No AR checkpoint is forwarded when training stage 1 is selected
+        args.ar_model_ckpt_dir = None
+
+    print(f"Experimental Name: {args.name}")
+
+    cmd = [
+        python, os.path.join(bins_dir, 'train.py'),
+        '--config', args.config,
+        '--exp_name', args.name,
+        '--log_level', 'debug',
+    ]
+    cmd += optional_args([
+        ('--train_stage', args.model_train_stage),
+        ('--checkpoint_path', args.ar_model_ckpt_dir),
+    ])
+elif args.stage == 3:
+    # Inference
+    if args.infer_expt_dir is None:
+        fail("Error: Please specify the experimental directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]")
+
+    if args.infer_output_dir is None:
+        args.infer_output_dir = os.path.join(args.infer_expt_dir, 'result')
+
+    if args.infer_mode is None:
+        fail("Error: Please specify the inference mode, e.g., \"batch\", \"single\"")
+
+    if args.infer_mode == 'batch' and args.infer_test_list_file is None:
+        fail("Error: Please specify the test list file used in inference when the inference mode is batch")
+
+    if args.infer_mode == 'single' and args.infer_text is None:
+        fail("Error: Please specify the text to be synthesized when the inference mode is single")
+
+    if args.infer_mode == 'single':
+        print(f'Text: {args.infer_text}')
+        args.infer_test_list_file = None
+    elif args.infer_mode == 'batch':
+        args.infer_text = ""
+        args.infer_text_prompt = ""
+        args.infer_audio_prompt = ""
+
+    cmd = [
+        python, os.path.join(bins_dir, 'inference.py'),
+        '--config', args.config,
+        '--log_level', 'debug',
+        '--acoustics_dir', args.infer_expt_dir,
+        '--output_dir', args.infer_output_dir,
+        '--mode', args.infer_mode,
+    ]
+    cmd += optional_args([
+        ('--text', args.infer_text),
+        ('--text_prompt', args.infer_text_prompt),
+        ('--audio_prompt', args.infer_audio_prompt),
+        ('--test_list_file', args.infer_test_list_file),
+    ])
+else:
+    fail(f"Error: Unknown running stage {args.stage}; expected 1, 2 or 3")
+
+# Build Monotonic Align Module
+align_dir = os.path.join(work_dir, 'modules', 'monotonic_align')
+os.makedirs(os.path.join(align_dir, 'monotonic_align'), exist_ok=True)
+subprocess.run([python, 'setup.py', 'build_ext', '--inplace'], check=True, cwd=align_dir)
+
+# Launch the selected stage from the repository root
+os.chdir(work_dir)
+subprocess.run(cmd, check=True, cwd=work_dir)
diff --git a/egs/tts/VALLE/run.sh b/egs/tts/VALLE/run.sh
diff --git a/evaluation/metrics/similarity/resemblyzer_similarity.py b/evaluation/metrics/similarity/resemblyzer_similarity.py
@@ -60,7 +60,7 @@ def extract_resemblyzer_similarity(target_path, reference_path, dump_dir):
 
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-    filename = target_path.split("/")[-1]
+    filename = os.path.basename(target_path)
     csv_file_name = f"similarity_results_{filename}.csv"
     dump_dir = dump_dir + "/" + csv_file_name
 

diff --git a/models/base/base_inference.py b/models/base/base_inference.py
@@ -85,8 +85,9 @@ def get_vocoder_info(self):
             os.path.dirname(self.checkpoint_dir_vocoder), "args.json"
         )
         self.cfg.vocoder = load_config(self.vocoder_cfg, lowercase=True)
-        self.vocoder_tag = self.checkpoint_dir_vocoder.split("/")[-2].split(":")[-1]
-        self.vocoder_steps = self.checkpoint_dir_vocoder.split("/")[-1].split(".")[0]
+        # os.path.split() returns (head, tail); the tag is the parent directory's *name*, not the whole head path
+        self.vocoder_tag = os.path.basename(os.path.dirname(self.checkpoint_dir_vocoder)).split(":")[-1]
+        self.vocoder_steps = os.path.splitext(os.path.basename(self.checkpoint_dir_vocoder))[0]
 
     def build_test_utt_data(self):
         raise NotImplementedError