diff --git a/.gitignore b/.gitignore
index bfb60ec6..88341de1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,3 +26,4 @@ asr-label*
 /demo-audios
 ref_data*
 /example
+/faster_whisper
diff --git a/README.md b/README.md
index e12fd111..8f555970 100644
--- a/README.md
+++ b/README.md
@@ -17,35 +17,43 @@
 
-This codebase and all models are released under CC-BY-NC-SA-4.0 License. Please refer to [LICENSE](LICENSE) for more details.
+This codebase and all models are released under CC-BY-NC-SA-4.0 License. Please refer to [LICENSE](LICENSE) for more details. 此代码库及模型根据 CC-BY-NC-SA-4.0 许可证发布。请参阅 [LICENSE](LICENSE) 了解更多细节.
 
 ## Disclaimer / 免责声明
+
 We do not hold any responsibility for any illegal usage of the codebase. Please refer to your local laws about DMCA and other related laws.
 我们不对代码库的任何非法使用承担任何责任. 请参阅您当地关于 DMCA (数字千年法案) 和其他相关法律法规.
 
 ## Online Demo
-[Fish Audio](https://fish.audio)
+
+[Fish Audio](https://fish.audio)
 
 ## Quick Start
+
 [inference.ipynb](https://nbviewer.org/github/AnyaCoder/fish-speech/blob/main/inference.ipynb)
 
 ## Videos
+
 #### Demo Video: https://www.bilibili.com/video/BV1wz421B71D
+
 #### Tech slides Video: https://www.bilibili.com/video/BV1zJ4m1K7cj
 
 ## Documents / 文档
+
 - [English](https://speech.fish.audio/en/)
 - [中文](https://speech.fish.audio/)
-- [日本語](https://speech.fish.audio/)
+- [日本語](https://speech.fish.audio/ja/)
 
 ## Samples / 例子
+
 - [English](https://speech.fish.audio/en/samples/)
 - [中文](https://speech.fish.audio/samples/)
 - [日本語](https://speech.fish.audio/ja/samples/)
 
 ## Credits / 鸣谢
+
 - [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2)
 - [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2)
 - [GPT VITS](https://github.com/innnky/gpt-vits)
diff --git a/docs/zh/index.md b/docs/zh/index.md
index 7502e1ee..f4f62489 100644
--- a/docs/zh/index.md
+++ b/docs/zh/index.md
@@ -67,7 +67,7 @@ Windows 非专业用户可考虑以下为免 Linux 环境的基础运行方法
-    インストール CUDA Toolkit 12
+    安装 CUDA Toolkit 12
     双击 start.bat, 进入 Fish-Speech 训练推理配置 WebUI 页面。
diff --git a/fish_speech/webui/manage.py b/fish_speech/webui/manage.py
index 9e84f25f..e43d4b69 100644
--- a/fish_speech/webui/manage.py
+++ b/fish_speech/webui/manage.py
@@ -727,12 +727,10 @@ def llama_quantify(llama_weight, quantify_mode):
                 )
                 label_model = gr.Dropdown(
                     label=i18n("Whisper Model"),
-                    info=i18n(
-                        "Use large for 10G+ GPU, medium for 5G, small for 2G"
-                    ),
-                    choices=["large", "medium", "small"],
-                    value="small",
-                    interactive=True,
+                    info=i18n("Faster Whisper, Up to 5g GPU memory usage"),
+                    choices=["large-v3"],
+                    value="large-v3",
+                    interactive=False,
                 )
                 label_radio = gr.Dropdown(
                     label=i18n("Optional Label Language"),
@@ -744,6 +742,7 @@ def llama_quantify(llama_weight, quantify_mode):
                         (i18n("English"), "EN"),
                         (i18n("Japanese"), "JA"),
                         (i18n("Disabled"), "IGNORE"),
+                        (i18n("auto"), "auto"),
                     ],
                     value="IGNORE",
                     interactive=True,
diff --git a/pyproject.toml b/pyproject.toml
index e40f1691..5b012295 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,7 +36,9 @@ dependencies = [
     "samplerate>=0.2.1",
     "resampy>=0.4.3",
     "einx[torch]==0.2.2",
-    "zstandard>=0.22.0"
+    "zstandard>=0.22.0",
+    "pydub",
+    "faster_whisper",
 ]
 
 [project.optional-dependencies]
diff --git a/start.bat b/start.bat
index 903a224d..f3b58a6a 100644
--- a/start.bat
+++ b/start.bat
@@ -5,7 +5,7 @@ set USE_MIRROR=true
 set PYTHONPATH=%~dp0
 set PYTHON_CMD=%cd%\fishenv\env\python
 set API_FLAG_PATH=%~dp0API_FLAGS.txt
-
+set KMP_DUPLICATE_LIB_OK=TRUE
 
 setlocal enabledelayedexpansion
diff --git a/tools/whisper_asr.py b/tools/whisper_asr.py
index 08aa1c8e..47b273c3 100644
--- a/tools/whisper_asr.py
+++ b/tools/whisper_asr.py
@@ -22,48 +22,59 @@ Note: Be aware of your audio sample rate, which defaults to 44.1kHz.
 """
 
+import re
 from pathlib import Path
 
 import click
-import librosa
 import soundfile as sf
-import whisper
+from faster_whisper import WhisperModel
 from loguru import logger
-from merge_asr_files import merge_and_delete_files
+from pydub import AudioSegment
 from tqdm import tqdm
 
 from fish_speech.utils.file import AUDIO_EXTENSIONS, list_files
 
 
 @click.command()
-@click.option("--model-size", default="large", help="Size of the Whisper model")
+@click.option("--model-size", default="large-v3", help="Size of the Whisper model")
+@click.option(
+    "--compute-type",
+    default="float16",
+    help="Computation Precision of the Whisper model [float16 / int8_float16 / int8]",
+)
 @click.option("--audio-dir", required=True, help="Directory containing audio files")
 @click.option(
     "--save-dir", required=True, help="Directory to save processed audio files"
 )
 @click.option(
     "--sample-rate",
-    default=None,
+    default=44100,
    type=int,
     help="Output sample rate, default to input sample rate",
 )
-@click.option("--device", default="cuda", help="Device to use")
-@click.option("--language", default="ZH", help="Language of the transcription")
-def main(model_size, audio_dir, save_dir, sample_rate, device, language):
-    logger.info("Loading / Downloading OpenAI Whisper model...")
-    model = whisper.load_model(
-        name=model_size,
+@click.option("--device", default="cuda", help="Device to use [cuda / cpu]")
+@click.option("--language", default="auto", help="Language of the transcription")
+def main(model_size, compute_type, audio_dir, save_dir, sample_rate, device, language):
+    logger.info("Loading / Downloading Faster Whisper model...")
+
+    model = WhisperModel(
+        model_size,
         device=device,
-        download_root=str(Path(".cache/whisper").resolve()),
+        compute_type=compute_type,
+        download_root="faster_whisper",
     )
+    logger.info("Model loaded.")
 
     save_path = Path(save_dir)
     save_path.mkdir(parents=True, exist_ok=True)
-    original_files = []
+
     audio_files = list_files(
         path=audio_dir, extensions=AUDIO_EXTENSIONS, recursive=True
     )
+
+    numbered_suffix_pattern = re.compile(r"-\d{3}$")
+
     for file_path in tqdm(audio_files, desc="Processing audio file"):
         file_stem = file_path.stem
         file_suffix = file_path.suffix
@@ -71,44 +82,97 @@ def main(model_size, audio_dir, save_dir, sample_rate, device, language):
         rel_path = Path(file_path).relative_to(audio_dir)
         (save_path / rel_path.parent).mkdir(parents=True, exist_ok=True)
 
-        if (save_path / rel_path.parent / f"{rel_path.stem}.wav").exists() and (
-            save_path / rel_path.parent / f"{rel_path.stem}.lab"
-        ).exists():
+        # Skip files that already have a .lab file or a -{3-digit number} suffix
+        numbered_suffix = numbered_suffix_pattern.search(file_stem)
+        lab_file = file_path.with_suffix(".lab")
+
+        if numbered_suffix and lab_file.exists():
+            continue
+
+        if not numbered_suffix and lab_file.with_stem(lab_file.stem + "-001").exists():
+            if file_path.exists():
+                file_path.unlink()
             continue
 
-        audio, sr = librosa.load(file_path, sr=sample_rate, mono=False)
-        transcription = model.transcribe(str(file_path), language=language)
+        audio = AudioSegment.from_file(file_path)
 
-        for segment in transcription.get("segments", []):
-            id, text, start, end = (
-                segment["id"],
-                segment["text"],
-                segment["start"],
-                segment["end"],
-            )
+        segments, info = model.transcribe(
+            file_path, beam_size=5, language=None if language == "auto" else language
+        )
 
-            extract = audio[..., int(start * sr) : int(end * sr)]
-            audio_save_path = (
-                save_path / rel_path.parent / f"{file_stem}-{id}{file_suffix}"
+        print(
+            "Detected language '%s' with probability %f"
+            % (info.language, info.language_probability)
+        )
+        print("Total len(ms): ", len(audio))
+
+        for segment in segments:
+            id, start, end, text = (
+                segment.id,
+                segment.start,
+                segment.end,
+                segment.text,
             )
-            sf.write(
-                audio_save_path,
-                extract,
-                samplerate=sr,
+            print("Segment %03d [%.2fs -> %.2fs] %s" % (id, start, end, text))
+            start_ms = int(start * 1000)
+            end_ms = int(end * 1000) + 200  # add 0.2s avoid truncating
+            segment_audio = audio[start_ms:end_ms]
+            audio_save_path = (
+                save_path / rel_path.parent / f"{file_stem}-{id:03d}{file_suffix}"
             )
-            original_files.append(audio_save_path)
+            segment_audio.export(audio_save_path, format=file_suffix[1:])
+            print(f"Exported {audio_save_path}")
 
-            transcript_save_path = save_path / rel_path.parent / f"{file_stem}-{id}.lab"
+            transcript_save_path = (
+                save_path / rel_path.parent / f"{file_stem}-{id:03d}.lab"
+            )
             with open(
                 transcript_save_path,
                 "w",
                 encoding="utf-8",
             ) as f:
-                f.write(text)
-            original_files.append(transcript_save_path)
+                f.write(segment.text)
 
-    merge_and_delete_files(save_dir, original_files)
+        file_path.unlink()
 
 
 if __name__ == "__main__":
     main()
+    exit(0)
+
+    audio = AudioSegment.from_wav(
+        r"D:\PythonProject\原神语音中文\胡桃\vo_hutao_draw_appear.wav"
+    )
+
+    model_size = "large-v3"
+
+    model = WhisperModel(
+        model_size,
+        device="cuda",
+        compute_type="float16",
+        download_root="faster_whisper",
+    )
+
+    segments, info = model.transcribe(
+        r"D:\PythonProject\原神语音中文\胡桃\vo_hutao_draw_appear.wav",
+        beam_size=5,
+    )
+
+    print(
+        "Detected language '%s' with probability %f"
+        % (info.language, info.language_probability)
+    )
+    print("Total len(ms): ", len(audio))
+
+    for i, segment in enumerate(segments):
+        print(
+            "Segment %03d [%.2fs -> %.2fs] %s"
+            % (i, segment.start, segment.end, segment.text)
+        )
+        start_ms = int(segment.start * 1000)
+        end_ms = int(segment.end * 1000)
+        segment_audio = audio[start_ms:end_ms]
+        segment_audio.export(f"segment_{i:03d}.wav", format="wav")
+        print(f"Exported segment_{i:03d}.wav")
+
+    print("All segments have been exported.")
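
For reference, the per-file flow that the rewritten tools/whisper_asr.py follows reduces to roughly the sketch below. It assumes faster-whisper and pydub are installed (as added to pyproject.toml); "input.wav" and the "out" directory are placeholder names, not part of the patch:

    from pathlib import Path

    from faster_whisper import WhisperModel
    from pydub import AudioSegment

    # Load a CTranslate2 Whisper checkpoint; compute_type trades precision for GPU memory.
    model = WhisperModel("large-v3", device="cuda", compute_type="float16")

    # pydub indexes audio in milliseconds, which is why segment times are converted below.
    audio = AudioSegment.from_file("input.wav")

    # language=None lets faster-whisper auto-detect, matching the new "--language auto" default.
    segments, info = model.transcribe("input.wav", beam_size=5, language=None)
    print(f"Detected language {info.language} (p={info.language_probability:.2f})")

    out_dir = Path("out")
    out_dir.mkdir(exist_ok=True)

    for segment in segments:
        start_ms = int(segment.start * 1000)
        end_ms = int(segment.end * 1000) + 200  # same 0.2 s padding the patch uses to avoid truncation
        clip = audio[start_ms:end_ms]
        clip.export(str(out_dir / f"clip-{segment.id:03d}.wav"), format="wav")
        (out_dir / f"clip-{segment.id:03d}.lab").write_text(segment.text, encoding="utf-8")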
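
Given the click options defined in the diff, a typical invocation of the updated script would look roughly like this (the directory names are placeholders):

    python tools/whisper_asr.py --audio-dir data/raw --save-dir data/clean --compute-type float16 --language auto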