diff --git a/.gitignore b/.gitignore
index bfb60ec6..88341de1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,3 +26,4 @@ asr-label*
/demo-audios
ref_data*
/example
+/faster_whisper
diff --git a/README.md b/README.md
index e12fd111..8f555970 100644
--- a/README.md
+++ b/README.md
@@ -17,35 +17,43 @@
-This codebase and all models are released under CC-BY-NC-SA-4.0 License. Please refer to [LICENSE](LICENSE) for more details.
+This codebase and all models are released under CC-BY-NC-SA-4.0 License. Please refer to [LICENSE](LICENSE) for more details.
此代码库及模型根据 CC-BY-NC-SA-4.0 许可证发布。请参阅 [LICENSE](LICENSE) 了解更多细节.
## Disclaimer / 免责声明
+
We do not hold any responsibility for any illegal usage of the codebase. Please refer to your local laws about DMCA and other related laws.
我们不对代码库的任何非法使用承担任何责任. 请参阅您当地关于 DMCA (数字千年法案) 和其他相关法律法规.
## Online Demo
-[Fish Audio](https://fish.audio)
+
+[Fish Audio](https://fish.audio)
## Quick Start
+
[inference.ipynb](https://nbviewer.org/github/AnyaCoder/fish-speech/blob/main/inference.ipynb)
## Videos
+
#### Demo Video: https://www.bilibili.com/video/BV1wz421B71D
+
#### Tech Slides Video: https://www.bilibili.com/video/BV1zJ4m1K7cj
## Documents / 文档
+
- [English](https://speech.fish.audio/en/)
- [中文](https://speech.fish.audio/)
-- [日本語](https://speech.fish.audio/)
+- [日本語](https://speech.fish.audio/ja/)
## Samples / 例子
+
- [English](https://speech.fish.audio/en/samples/)
- [中文](https://speech.fish.audio/samples/)
- [日本語](https://speech.fish.audio/ja/samples/)
## Credits / 鸣谢
+
- [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2)
- [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2)
- [GPT VITS](https://github.com/innnky/gpt-vits)
diff --git a/docs/zh/index.md b/docs/zh/index.md
index 7502e1ee..f4f62489 100644
--- a/docs/zh/index.md
+++ b/docs/zh/index.md
@@ -67,7 +67,7 @@ Windows 非专业用户可考虑以下为免 Linux 环境的基础运行方法
- インストール CUDA Toolkit 12
+ 安装 CUDA Toolkit 12
双击 start.bat, 进入 Fish-Speech 训练推理配置 WebUI 页面。
diff --git a/fish_speech/webui/manage.py b/fish_speech/webui/manage.py
index 9e84f25f..e43d4b69 100644
--- a/fish_speech/webui/manage.py
+++ b/fish_speech/webui/manage.py
@@ -727,12 +727,10 @@ def llama_quantify(llama_weight, quantify_mode):
)
label_model = gr.Dropdown(
label=i18n("Whisper Model"),
- info=i18n(
- "Use large for 10G+ GPU, medium for 5G, small for 2G"
- ),
- choices=["large", "medium", "small"],
- value="small",
- interactive=True,
+    info=i18n("Faster Whisper, uses up to 5 GB of GPU memory"),
+ choices=["large-v3"],
+ value="large-v3",
+ interactive=False,
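+    # locked: large-v3 is the only size wired to the faster-whisper backend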
)
label_radio = gr.Dropdown(
label=i18n("Optional Label Language"),
@@ -744,6 +742,7 @@ def llama_quantify(llama_weight, quantify_mode):
(i18n("English"), "EN"),
(i18n("Japanese"), "JA"),
(i18n("Disabled"), "IGNORE"),
+    (i18n("Auto"), "auto"),
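+    # "auto" defers to faster-whisper's built-in language detection (see tools/whisper_asr.py)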
],
value="IGNORE",
interactive=True,
diff --git a/pyproject.toml b/pyproject.toml
index e40f1691..5b012295 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,7 +36,9 @@ dependencies = [
"samplerate>=0.2.1",
"resampy>=0.4.3",
"einx[torch]==0.2.2",
- "zstandard>=0.22.0"
+ "zstandard>=0.22.0",
+ "pydub",
+ "faster_whisper",
]
[project.optional-dependencies]
diff --git a/start.bat b/start.bat
index 903a224d..f3b58a6a 100644
--- a/start.bat
+++ b/start.bat
@@ -5,7 +5,7 @@ set USE_MIRROR=true
set PYTHONPATH=%~dp0
set PYTHON_CMD=%cd%\fishenv\env\python
set API_FLAG_PATH=%~dp0API_FLAGS.txt
-
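+rem allow duplicate OpenMP runtimes (torch and ctranslate2 each bundle one) to avoid "OMP: Error #15" aborts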
+set KMP_DUPLICATE_LIB_OK=TRUE
setlocal enabledelayedexpansion
diff --git a/tools/whisper_asr.py b/tools/whisper_asr.py
index 08aa1c8e..47b273c3 100644
--- a/tools/whisper_asr.py
+++ b/tools/whisper_asr.py
@@ -22,48 +22,59 @@
Note: Be aware of your audio sample rate, which defaults to 44.1kHz.
"""
+import re
from pathlib import Path
import click
-import librosa
-import soundfile as sf
-import whisper
+from faster_whisper import WhisperModel
from loguru import logger
-from merge_asr_files import merge_and_delete_files
+from pydub import AudioSegment
from tqdm import tqdm
from fish_speech.utils.file import AUDIO_EXTENSIONS, list_files
@click.command()
-@click.option("--model-size", default="large", help="Size of the Whisper model")
+@click.option("--model-size", default="large-v3", help="Size of the Whisper model")
+@click.option(
+ "--compute-type",
+ default="float16",
+ help="Computation Precision of the Whisper model [float16 / int8_float16 / int8]",
+)
@click.option("--audio-dir", required=True, help="Directory containing audio files")
@click.option(
"--save-dir", required=True, help="Directory to save processed audio files"
)
@click.option(
"--sample-rate",
- default=None,
+ default=44100,
type=int,
help="Output sample rate, default to input sample rate",
)
-@click.option("--device", default="cuda", help="Device to use")
-@click.option("--language", default="ZH", help="Language of the transcription")
-def main(model_size, audio_dir, save_dir, sample_rate, device, language):
- logger.info("Loading / Downloading OpenAI Whisper model...")
- model = whisper.load_model(
- name=model_size,
+@click.option("--device", default="cuda", help="Device to use [cuda / cpu]")
+@click.option("--language", default="auto", help="Language of the transcription")
+def main(model_size, compute_type, audio_dir, save_dir, sample_rate, device, language):
+ logger.info("Loading / Downloading Faster Whisper model...")
+
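+    # rough guide: float16 needs about 5 GB of VRAM for large-v3, while
+    # int8_float16 / int8 trade some accuracy for a smaller footprint (int8 also runs on CPU)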
+ model = WhisperModel(
+ model_size,
device=device,
- download_root=str(Path(".cache/whisper").resolve()),
+ compute_type=compute_type,
+ download_root="faster_whisper",
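+        # weights are cached under ./faster_whisper (covered by the new .gitignore entry)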
)
+
logger.info("Model loaded.")
save_path = Path(save_dir)
save_path.mkdir(parents=True, exist_ok=True)
- original_files = []
+
audio_files = list_files(
path=audio_dir, extensions=AUDIO_EXTENSIONS, recursive=True
)
+
+ numbered_suffix_pattern = re.compile(r"-\d{3}$")
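+    # matches the "-NNN" suffix given to exported segments (e.g. "clip-001")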
+
for file_path in tqdm(audio_files, desc="Processing audio file"):
file_stem = file_path.stem
file_suffix = file_path.suffix
@@ -71,44 +82,97 @@ def main(model_size, audio_dir, save_dir, sample_rate, device, language):
rel_path = Path(file_path).relative_to(audio_dir)
(save_path / rel_path.parent).mkdir(parents=True, exist_ok=True)
- if (save_path / rel_path.parent / f"{rel_path.stem}.wav").exists() and (
- save_path / rel_path.parent / f"{rel_path.stem}.lab"
- ).exists():
+    # Skip a segment file ("-NNN" suffix) whose .lab transcript already exists;
+    # delete an original whose "-001" segment transcript is already on disk
+ numbered_suffix = numbered_suffix_pattern.search(file_stem)
+ lab_file = file_path.with_suffix(".lab")
+
+ if numbered_suffix and lab_file.exists():
+ continue
+
+ if not numbered_suffix and lab_file.with_stem(lab_file.stem + "-001").exists():
+ if file_path.exists():
+ file_path.unlink()
continue
- audio, sr = librosa.load(file_path, sr=sample_rate, mono=False)
- transcription = model.transcribe(str(file_path), language=language)
+    audio = AudioSegment.from_file(file_path)
+    if audio.frame_rate != sample_rate:
+        # resample so segments are exported at the requested --sample-rate
+        audio = audio.set_frame_rate(sample_rate)
- for segment in transcription.get("segments", []):
- id, text, start, end = (
- segment["id"],
- segment["text"],
- segment["start"],
- segment["end"],
- )
+    segments, info = model.transcribe(
+        str(file_path), beam_size=5, language=None if language == "auto" else language
+    )
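+    # segments is a lazy generator: decoding happens as the loop below consumes it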
- extract = audio[..., int(start * sr) : int(end * sr)]
- audio_save_path = (
- save_path / rel_path.parent / f"{file_stem}-{id}{file_suffix}"
+    logger.info(
+        f"Detected language '{info.language}' with probability {info.language_probability:f}"
+    )
+    logger.info(f"Total duration: {len(audio)} ms")
+
+ for segment in segments:
+        seg_id, start, end, text = (
+            segment.id,
+            segment.start,
+            segment.end,
+            segment.text,
)
- sf.write(
- audio_save_path,
- extract,
- samplerate=sr,
+ print("Segment %03d [%.2fs -> %.2fs] %s" % (id, start, end, text))
+ start_ms = int(start * 1000)
+ end_ms = int(end * 1000) + 200 # add 0.2s avoid truncating
+ segment_audio = audio[start_ms:end_ms]
+ audio_save_path = (
+ save_path / rel_path.parent / f"{file_stem}-{id:03d}{file_suffix}"
)
- original_files.append(audio_save_path)
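+        # file_suffix[1:] drops the leading dot; pydub hands non-wav formats to ffmpeg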
+ segment_audio.export(audio_save_path, format=file_suffix[1:])
+ print(f"Exported {audio_save_path}")
- transcript_save_path = save_path / rel_path.parent / f"{file_stem}-{id}.lab"
+ transcript_save_path = (
+            save_path / rel_path.parent / f"{file_stem}-{seg_id:03d}.lab"
+ )
with open(
transcript_save_path,
"w",
encoding="utf-8",
) as f:
- f.write(text)
- original_files.append(transcript_save_path)
+            f.write(text)
- merge_and_delete_files(save_dir, original_files)
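+    # delete the source file once all its segments and .lab transcripts are written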
+ file_path.unlink()
if __name__ == "__main__":
main()