From e787d0182f3651c23515b2afff5ce892f7197510 Mon Sep 17 00:00:00 2001
From: Lengyue
Date: Sun, 5 May 2024 04:41:22 -0400
Subject: [PATCH] Use SFT medium as base model

---
 docs/en/finetune.md                     |  4 ++--
 docs/en/inference.md                    |  8 +++----
 docs/zh/finetune.md                     |  6 ++---
 docs/zh/inference.md                    | 10 ++++----
 .../configs/text2semantic_finetune.yaml |  2 +-
 fish_speech/webui/manage.py             |  2 +-
 pyproject.toml                          |  1 -
 tools/api.py                            |  2 +-
 tools/llama/merge_lora.py               |  2 +-
 tools/webui.py                          | 23 +------------------
 10 files changed, 19 insertions(+), 41 deletions(-)

diff --git a/docs/en/finetune.md b/docs/en/finetune.md
index fa72ccd0..694c39b2 100644
--- a/docs/en/finetune.md
+++ b/docs/en/finetune.md
@@ -148,7 +148,7 @@ After the command finishes executing, you should see the `quantized-dataset-ft.p
 Similarly, make sure you have downloaded the `LLAMA` weights. If not, run the following command:
 
 ```bash
-huggingface-cli download fishaudio/fish-speech-1 text2semantic-sft-large-v1-4k.pth --local-dir checkpoints
+huggingface-cli download fishaudio/fish-speech-1 text2semantic-sft-medium-v1-4k.pth --local-dir checkpoints
 ```
 
 Finally, you can start the fine-tuning by running the following command:
@@ -182,7 +182,7 @@ After training, you need to convert the LoRA weights to regular weights before p
 python tools/llama/merge_lora.py \
     --llama-config dual_ar_2_codebook_large \
     --lora-config r_8_alpha_16 \
-    --llama-weight checkpoints/text2semantic-sft-large-v1-4k.pth \
+    --llama-weight checkpoints/text2semantic-sft-medium-v1-4k.pth \
     --lora-weight results/text2semantic-finetune-medium-lora/checkpoints/step_000000200.ckpt \
     --output checkpoints/merged.ckpt
 ```
diff --git a/docs/en/inference.md b/docs/en/inference.md
index bb050155..4befc16f 100644
--- a/docs/en/inference.md
+++ b/docs/en/inference.md
@@ -16,7 +16,7 @@ Download the required `vqgan` and `text2semantic` models from our Hugging Face r
 
 ```bash
 huggingface-cli download fishaudio/fish-speech-1 vq-gan-group-fsq-2x1024.pth --local-dir checkpoints
-huggingface-cli download fishaudio/fish-speech-1 text2semantic-sft-large-v1-4k.pth --local-dir checkpoints
+huggingface-cli download fishaudio/fish-speech-1 text2semantic-sft-medium-v1-4k.pth --local-dir checkpoints
 ```
 
 ### 1. Generate prompt from voice:
@@ -38,7 +38,7 @@ python tools/llama/generate.py \
     --prompt-text "Your reference text" \
     --prompt-tokens "fake.npy" \
     --config-name dual_ar_2_codebook_large \
-    --checkpoint-path "checkpoints/text2semantic-sft-large-v1-4k.pth" \
+    --checkpoint-path "checkpoints/text2semantic-sft-medium-v1-4k.pth" \
     --num-samples 2 \
     --compile
 ```
@@ -69,7 +69,7 @@ We provide a HTTP API for inference. You can use the following command to start
 ```bash
 python -m tools.api \
     --listen 0.0.0.0:8000 \
-    --llama-checkpoint-path "checkpoints/text2semantic-sft-large-v1-4k.pth" \
+    --llama-checkpoint-path "checkpoints/text2semantic-sft-medium-v1-4k.pth" \
     --llama-config-name dual_ar_2_codebook_large \
     --vqgan-checkpoint-path "checkpoints/vq-gan-group-fsq-2x1024.pth"
 ```
@@ -82,7 +82,7 @@ You can start the WebUI using the following command:
 
 ```bash
 python -m tools.webui \
-    --llama-checkpoint-path "checkpoints/text2semantic-sft-large-v1-4k.pth" \
+    --llama-checkpoint-path "checkpoints/text2semantic-sft-medium-v1-4k.pth" \
     --llama-config-name dual_ar_2_codebook_large \
     --vqgan-checkpoint-path "checkpoints/vq-gan-group-fsq-2x1024.pth"
 ```
diff --git a/docs/zh/finetune.md b/docs/zh/finetune.md
index f01f4303..d4fde2f2 100644
--- a/docs/zh/finetune.md
+++ b/docs/zh/finetune.md
@@ -152,13 +152,13 @@ python tools/llama/build_dataset.py \
 同样的, 请确保你已经下载了 `LLAMA` 权重, 如果没有, 请运行以下命令:
 
 ```bash
-huggingface-cli download fishaudio/fish-speech-1 text2semantic-sft-large-v1-4k.pth --local-dir checkpoints
+huggingface-cli download fishaudio/fish-speech-1 text2semantic-sft-medium-v1-4k.pth --local-dir checkpoints
 ```
 
 对于中国大陆用户, 可使用 mirror 下载.
 
 ```bash
-HF_ENDPOINT=https://hf-mirror.com huggingface-cli download fishaudio/fish-speech-1 text2semantic-sft-large-v1-4k.pth --local-dir checkpoints
+HF_ENDPOINT=https://hf-mirror.com huggingface-cli download fishaudio/fish-speech-1 text2semantic-sft-medium-v1-4k.pth --local-dir checkpoints
 ```
 
 最后, 你可以运行以下命令来启动微调:
@@ -192,7 +192,7 @@ python fish_speech/train.py --config-name text2semantic_finetune \
 python tools/llama/merge_lora.py \
     --llama-config dual_ar_2_codebook_large \
     --lora-config r_8_alpha_16 \
-    --llama-weight checkpoints/text2semantic-sft-large-v1-4k.pth \
+    --llama-weight checkpoints/text2semantic-sft-medium-v1-4k.pth \
     --lora-weight results/text2semantic-finetune-medium-lora/checkpoints/step_000000200.ckpt \
     --output checkpoints/merged.ckpt
 ```
diff --git a/docs/zh/inference.md b/docs/zh/inference.md
index 35a0ad89..232088c6 100644
--- a/docs/zh/inference.md
+++ b/docs/zh/inference.md
@@ -16,12 +16,12 @@
 ```bash
 huggingface-cli download fishaudio/fish-speech-1 vq-gan-group-fsq-2x1024.pth --local-dir checkpoints
-huggingface-cli download fishaudio/fish-speech-1 text2semantic-sft-large-v1-4k.pth --local-dir checkpoints
+huggingface-cli download fishaudio/fish-speech-1 text2semantic-sft-medium-v1-4k.pth --local-dir checkpoints
 ```
 
 对于中国大陆用户，可使用mirror下载。
 ```bash
 HF_ENDPOINT=https://hf-mirror.com huggingface-cli download fishaudio/fish-speech-1 vq-gan-group-fsq-2x1024.pth --local-dir checkpoints
-HF_ENDPOINT=https://hf-mirror.com huggingface-cli download fishaudio/fish-speech-1 text2semantic-sft-large-v1-4k.pth --local-dir checkpoints
+HF_ENDPOINT=https://hf-mirror.com huggingface-cli download fishaudio/fish-speech-1 text2semantic-sft-medium-v1-4k.pth --local-dir checkpoints
 ```
 
 ### 1. 从语音生成 prompt:
@@ -43,7 +43,7 @@ python tools/llama/generate.py \
     --prompt-text "你的参考文本" \
     --prompt-tokens "fake.npy" \
     --config-name dual_ar_2_codebook_large \
-    --checkpoint-path "checkpoints/text2semantic-sft-large-v1-4k.pth" \
+    --checkpoint-path "checkpoints/text2semantic-sft-medium-v1-4k.pth" \
     --num-samples 2 \
     --compile
 ```
@@ -74,7 +74,7 @@ python tools/vqgan/inference.py \
 ```bash
 python -m tools.api \
     --listen 0.0.0.0:8000 \
-    --llama-checkpoint-path "checkpoints/text2semantic-sft-large-v1-4k.pth" \
+    --llama-checkpoint-path "checkpoints/text2semantic-sft-medium-v1-4k.pth" \
     --llama-config-name dual_ar_2_codebook_large \
     --vqgan-checkpoint-path "checkpoints/vq-gan-group-fsq-2x1024.pth"
 
@@ -90,7 +90,7 @@ HF_ENDPOINT=https://hf-mirror.com python -m ...
 
 ```bash
 python -m tools.webui \
-    --llama-checkpoint-path "checkpoints/text2semantic-sft-large-v1-4k.pth" \
+    --llama-checkpoint-path "checkpoints/text2semantic-sft-medium-v1-4k.pth" \
     --llama-config-name dual_ar_2_codebook_large \
     --vqgan-checkpoint-path "checkpoints/vq-gan-group-fsq-2x1024.pth"
 ```
diff --git a/fish_speech/configs/text2semantic_finetune.yaml b/fish_speech/configs/text2semantic_finetune.yaml
index 179df1b8..f347f58a 100644
--- a/fish_speech/configs/text2semantic_finetune.yaml
+++ b/fish_speech/configs/text2semantic_finetune.yaml
@@ -5,7 +5,7 @@ defaults:
 
 project: text2semantic_finetune_dual_ar
 max_length: 2048
-ckpt_path: checkpoints/text2semantic-sft-large-v1-4k.pth
+ckpt_path: checkpoints/text2semantic-sft-medium-v1-4k.pth
 resume_weights_only: true
 
 # Lightning Trainer
diff --git a/fish_speech/webui/manage.py b/fish_speech/webui/manage.py
index 05523d66..8e25a7de 100644
--- a/fish_speech/webui/manage.py
+++ b/fish_speech/webui/manage.py
@@ -470,7 +470,7 @@ def generate_folder_name():
     ckpt_path = (
         "text2semantic-pretrain-medium-2k-v1.pth"
         if llama_base_config == "dual_ar_2_codebook_medium"
-        else "text2semantic-sft-large-v1-4k.pth"
+        else "text2semantic-sft-medium-v1-4k.pth"
     )
 
     latest = list(
diff --git a/pyproject.toml b/pyproject.toml
index daca42dd..0989b3b5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,7 +34,6 @@ dependencies = [
     "vector_quantize_pytorch>=1.14.7",
     "samplerate>=0.2.1",
     "resampy>=0.4.3",
-    "spaces>=0.26.1",
     "einx[torch]==0.2.2"
 ]
 
diff --git a/tools/api.py b/tools/api.py
index 9fd619d5..13c817a1 100644
--- a/tools/api.py
+++ b/tools/api.py
@@ -225,7 +225,7 @@ def parse_args():
     parser.add_argument(
         "--llama-checkpoint-path",
         type=str,
-        default="checkpoints/text2semantic-sft-large-v1-4k.pth",
+        default="checkpoints/text2semantic-sft-medium-v1-4k.pth",
     )
     parser.add_argument(
         "--llama-config-name", type=str, default="dual_ar_2_codebook_large"
diff --git a/tools/llama/merge_lora.py b/tools/llama/merge_lora.py
index 89e60ece..d125d129 100644
--- a/tools/llama/merge_lora.py
+++ b/tools/llama/merge_lora.py
@@ -15,7 +15,7 @@
 @click.option("--llama-config", type=str, default="dual_ar_2_codebook_large")
 @click.option("--lora-config", type=str, default="r_8_alpha_16")
 @click.option(
-    "--llama-weight", type=str, default="checkpoints/text2semantic-sft-large-v1-4k.pth"
+    "--llama-weight", type=str, default="checkpoints/text2semantic-sft-medium-v1-4k.pth"
 )
 @click.option("--lora-weight", type=str, required=True)
 @click.option("--output", type=str, required=True)
diff --git a/tools/webui.py b/tools/webui.py
index 2c66a3e7..ee6fbb42 100644
--- a/tools/webui.py
+++ b/tools/webui.py
@@ -40,21 +40,6 @@
 TEXTBOX_PLACEHOLDER = i18n("Put your text here.")
 SPACE_IMPORTED = False
 
-try:
-    import spaces
-
-    GPU_DECORATOR = spaces.GPU
-    SPACE_IMPORTED = True
-except ImportError:
-
-    def GPU_DECORATOR(func):
-        @wraps(func)
-        def wrapper(*args, **kwargs):
-            return func(*args, **kwargs)
-
-        wrapper.original = func  # ref
-        return wrapper
-
 
 def build_html_error_message(error):
     return f"""
@@ -65,7 +50,6 @@ def build_html_error_message(error):
     """
 
 
-@GPU_DECORATOR
 @torch.inference_mode()
 def inference(
     text,
@@ -173,11 +157,6 @@ def inference(
 inference_stream = partial(inference, streaming=True)
 
 
-if not SPACE_IMPORTED:
-    logger.info("‘spaces’ not imported, use original")
-    inference = inference.original
-    inference_stream = partial(inference, streaming=True)
-
 
 def wav_chunk_header(sample_rate=44100, bit_depth=16, channels=1):
     buffer = io.BytesIO()
@@ -343,7 +322,7 @@ def parse_args():
     parser.add_argument(
         "--llama-checkpoint-path",
         type=Path,
-        default="checkpoints/text2semantic-sft-large-v1-4k.pth",
+        default="checkpoints/text2semantic-sft-medium-v1-4k.pth",
     )
     parser.add_argument(
         "--llama-config-name", type=str, default="dual_ar_2_codebook_large"
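
Since the above is a standard `git format-patch` mail, it can be applied to a checkout of the repository as a commit rather than by editing each file by hand. A minimal sketch, assuming the mail is saved as `0001-use-sft-medium.patch` (a hypothetical filename) at the root of a fish-speech checkout:

```bash
# Apply the patch as a commit, preserving the original author and subject line.
git am 0001-use-sft-medium.patch

# If it does not apply cleanly against your tree, retry with a three-way merge
# and resolve any conflicts manually.
git am --abort
git am -3 0001-use-sft-medium.patch
```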
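
Every default touched by this patch now points at `checkpoints/text2semantic-sft-medium-v1-4k.pth`, so that file must exist before the finetune config, the API server, or the WebUI will start with their defaults. A quick pre-flight check, reusing the two download commands from the updated docs; the trailing `ls` is added here only for illustration and assumes the commands are run from the repository root with `huggingface-cli` installed:

```bash
# Fetch the checkpoints the new defaults expect.
huggingface-cli download fishaudio/fish-speech-1 vq-gan-group-fsq-2x1024.pth --local-dir checkpoints
huggingface-cli download fishaudio/fish-speech-1 text2semantic-sft-medium-v1-4k.pth --local-dir checkpoints

# Confirm the renamed medium checkpoint landed where the defaults look for it.
ls -lh checkpoints/text2semantic-sft-medium-v1-4k.pth checkpoints/vq-gan-group-fsq-2x1024.pth
```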