[Feature] Add Faster Whisper (#365)
* Add quick start ipynb

* Remove redundant output

* Fix docs

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [Feature] Add Faster Whisper

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Different suffix

* Different audio format

* Fix README.md for ja docs

* Fix ZH docs

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
AnyaCoder and pre-commit-ci[bot] committed Jul 11, 2024
1 parent 5dd593f commit 06c5d10
Showing 7 changed files with 123 additions and 49 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -26,3 +26,4 @@ asr-label*
/demo-audios
ref_data*
/example
/faster_whisper
14 changes: 11 additions & 3 deletions README.md
@@ -17,35 +17,43 @@
</a>
</div>

This codebase and all models are released under CC-BY-NC-SA-4.0 License. Please refer to [LICENSE](LICENSE) for more details.

此代码库及模型根据 CC-BY-NC-SA-4.0 许可证发布。请参阅 [LICENSE](LICENSE) 了解更多细节.

## Disclaimer / 免责声明

We do not hold any responsibility for any illegal usage of the codebase. Please refer to your local laws about DMCA and other related laws.
我们不对代码库的任何非法使用承担任何责任. 请参阅您当地关于 DMCA (数字千年法案) 和其他相关法律法规.

## Online Demo

[Fish Audio](https://fish.audio)

## Quick Start

[inference.ipynb](https://nbviewer.org/github/AnyaCoder/fish-speech/blob/main/inference.ipynb)

## Videos

#### Demo Video: https://www.bilibili.com/video/BV1wz421B71D

#### Tech slides Video: https://www.bilibili.com/video/BV1zJ4m1K7cj

## Documents / 文档

- [English](https://speech.fish.audio/en/)
- [中文](https://speech.fish.audio/)
- [日本語](https://speech.fish.audio/)
- [日本語](https://speech.fish.audio/ja/)

## Samples / 例子

- [English](https://speech.fish.audio/en/samples/)
- [中文](https://speech.fish.audio/samples/)
- [日本語](https://speech.fish.audio/ja/samples/)

## Credits / 鸣谢

- [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2)
- [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2)
- [GPT VITS](https://github.com/innnky/gpt-vits)
2 changes: 1 addition & 1 deletion docs/zh/index.md
@@ -67,7 +67,7 @@ Windows 非专业用户可考虑以下为免 Linux 环境的基础运行方法
</p>
</ul>
</li>
<li>インストール <a href="https://developer.nvidia.com/cuda-12-1-0-download-archive?target_os=Windows&target_arch=x86_64">CUDA Toolkit 12</a></li>
<li>安装 <a href="https://developer.nvidia.com/cuda-12-1-0-download-archive?target_os=Windows&target_arch=x86_64">CUDA Toolkit 12</a></li>
</ol>
</li>
<li>双击 start.bat, 进入 Fish-Speech 训练推理配置 WebUI 页面。
11 changes: 5 additions & 6 deletions fish_speech/webui/manage.py
@@ -727,12 +727,10 @@ def llama_quantify(llama_weight, quantify_mode):
)
label_model = gr.Dropdown(
label=i18n("Whisper Model"),
info=i18n(
"Use large for 10G+ GPU, medium for 5G, small for 2G"
),
choices=["large", "medium", "small"],
value="small",
interactive=True,
info=i18n("Faster Whisper, Up to 5g GPU memory usage"),
choices=["large-v3"],
value="large-v3",
interactive=False,
)
label_radio = gr.Dropdown(
label=i18n("Optional Label Language"),
@@ -744,6 +742,7 @@ def llama_quantify(llama_weight, quantify_mode):
(i18n("English"), "EN"),
(i18n("Japanese"), "JA"),
(i18n("Disabled"), "IGNORE"),
(i18n("auto"), "auto"),
],
value="IGNORE",
interactive=True,
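
The dropdown now exposes only the Faster Whisper large-v3 checkpoint, and the label-language list gains an auto entry. Downstream, auto maps to automatic language detection: the tools/whisper_asr.py change further below passes language=None to faster-whisper in that case. A minimal sketch of that mapping (the model size, device, and audio path are illustrative assumptions):

```python
from faster_whisper import WhisperModel

def transcribe_with_ui_language(audio_path: str, ui_language: str = "auto"):
    # "auto" from the dropdown becomes language=None, letting faster-whisper
    # detect the language; any explicit code (e.g. "ZH") is passed through.
    model = WhisperModel("large-v3", device="cuda", compute_type="float16")
    segments, info = model.transcribe(
        audio_path, language=None if ui_language == "auto" else ui_language
    )
    return list(segments), info
```
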
4 changes: 3 additions & 1 deletion pyproject.toml
@@ -36,7 +36,9 @@ dependencies = [
"samplerate>=0.2.1",
"resampy>=0.4.3",
"einx[torch]==0.2.2",
"zstandard>=0.22.0"
"zstandard>=0.22.0",
"pydub",
"faster_whisper",
]

[project.optional-dependencies]
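
The two new runtime dependencies back the reworked labeling tool: faster_whisper provides the CTranslate2-based Whisper backend, and pydub handles slicing and exporting audio segments (it expects ffmpeg to be available for non-wav formats). A minimal pydub sketch of the millisecond-based slicing the tool relies on; the file names are illustrative assumptions:

```python
from pydub import AudioSegment

audio = AudioSegment.from_file("example.wav")  # hypothetical input file
clip = audio[0:1200]                           # pydub slices in milliseconds
clip.export("example-000.wav", format="wav")
print(f"Clip length: {len(clip)} ms")
```
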
2 changes: 1 addition & 1 deletion start.bat
@@ -5,7 +5,7 @@ set USE_MIRROR=true
set PYTHONPATH=%~dp0
set PYTHON_CMD=%cd%\fishenv\env\python
set API_FLAG_PATH=%~dp0API_FLAGS.txt

set KMP_DUPLICATE_LIB_OK=TRUE

setlocal enabledelayedexpansion

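
Setting KMP_DUPLICATE_LIB_OK=TRUE is a common workaround for the OpenMP "Error #15" abort that can occur on Windows when two libraries (for example PyTorch and CTranslate2, which backs faster-whisper) each load their own copy of the Intel OpenMP runtime; that appears to be the intent here. If the launcher did not set the variable, the equivalent in Python would look roughly like this (a sketch, not part of the commit):

```python
import os

# Tolerate duplicate OpenMP runtimes instead of aborting; must be set
# before importing the libraries that bundle their own libiomp.
os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE")

from faster_whisper import WhisperModel  # noqa: E402
```
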
138 changes: 101 additions & 37 deletions tools/whisper_asr.py
@@ -22,93 +22,157 @@
Note: Be aware of your audio sample rate, which defaults to 44.1kHz.
"""

import re
from pathlib import Path

import click
import librosa
import soundfile as sf
import whisper
from faster_whisper import WhisperModel
from loguru import logger
from merge_asr_files import merge_and_delete_files
from pydub import AudioSegment
from tqdm import tqdm

from fish_speech.utils.file import AUDIO_EXTENSIONS, list_files


@click.command()
@click.option("--model-size", default="large", help="Size of the Whisper model")
@click.option("--model-size", default="large-v3", help="Size of the Whisper model")
@click.option(
"--compute-type",
default="float16",
help="Computation Precision of the Whisper model [float16 / int8_float16 / int8]",
)
@click.option("--audio-dir", required=True, help="Directory containing audio files")
@click.option(
"--save-dir", required=True, help="Directory to save processed audio files"
)
@click.option(
"--sample-rate",
default=None,
default=44100,
type=int,
help="Output sample rate, default to input sample rate",
)
@click.option("--device", default="cuda", help="Device to use")
@click.option("--language", default="ZH", help="Language of the transcription")
def main(model_size, audio_dir, save_dir, sample_rate, device, language):
logger.info("Loading / Downloading OpenAI Whisper model...")
model = whisper.load_model(
name=model_size,
@click.option("--device", default="cuda", help="Device to use [cuda / cpu]")
@click.option("--language", default="auto", help="Language of the transcription")
def main(model_size, compute_type, audio_dir, save_dir, sample_rate, device, language):
logger.info("Loading / Downloading Faster Whisper model...")

model = WhisperModel(
model_size,
device=device,
download_root=str(Path(".cache/whisper").resolve()),
compute_type=compute_type,
download_root="faster_whisper",
)

logger.info("Model loaded.")

save_path = Path(save_dir)
save_path.mkdir(parents=True, exist_ok=True)
original_files = []

audio_files = list_files(
path=audio_dir, extensions=AUDIO_EXTENSIONS, recursive=True
)

numbered_suffix_pattern = re.compile(r"-\d{3}$")

for file_path in tqdm(audio_files, desc="Processing audio file"):
file_stem = file_path.stem
file_suffix = file_path.suffix

rel_path = Path(file_path).relative_to(audio_dir)
(save_path / rel_path.parent).mkdir(parents=True, exist_ok=True)

if (save_path / rel_path.parent / f"{rel_path.stem}.wav").exists() and (
save_path / rel_path.parent / f"{rel_path.stem}.lab"
).exists():
# Skip files that already have a .lab file or a -{3-digit number} suffix
numbered_suffix = numbered_suffix_pattern.search(file_stem)
lab_file = file_path.with_suffix(".lab")

if numbered_suffix and lab_file.exists():
continue

if not numbered_suffix and lab_file.with_stem(lab_file.stem + "-001").exists():
if file_path.exists():
file_path.unlink()
continue

audio, sr = librosa.load(file_path, sr=sample_rate, mono=False)
transcription = model.transcribe(str(file_path), language=language)
audio = AudioSegment.from_file(file_path)

for segment in transcription.get("segments", []):
id, text, start, end = (
segment["id"],
segment["text"],
segment["start"],
segment["end"],
)
segments, info = model.transcribe(
file_path, beam_size=5, language=None if language == "auto" else language
)

extract = audio[..., int(start * sr) : int(end * sr)]
audio_save_path = (
save_path / rel_path.parent / f"{file_stem}-{id}{file_suffix}"
print(
"Detected language '%s' with probability %f"
% (info.language, info.language_probability)
)
print("Total len(ms): ", len(audio))

for segment in segments:
id, start, end, text = (
segment.id,
segment.start,
segment.end,
segment.text,
)
sf.write(
audio_save_path,
extract,
samplerate=sr,
print("Segment %03d [%.2fs -> %.2fs] %s" % (id, start, end, text))
start_ms = int(start * 1000)
end_ms = int(end * 1000) + 200 # add 0.2s avoid truncating
segment_audio = audio[start_ms:end_ms]
audio_save_path = (
save_path / rel_path.parent / f"{file_stem}-{id:03d}{file_suffix}"
)
original_files.append(audio_save_path)
segment_audio.export(audio_save_path, format=file_suffix[1:])
print(f"Exported {audio_save_path}")

transcript_save_path = save_path / rel_path.parent / f"{file_stem}-{id}.lab"
transcript_save_path = (
save_path / rel_path.parent / f"{file_stem}-{id:03d}.lab"
)
with open(
transcript_save_path,
"w",
encoding="utf-8",
) as f:
f.write(text)
original_files.append(transcript_save_path)
f.write(segment.text)

merge_and_delete_files(save_dir, original_files)
file_path.unlink()


if __name__ == "__main__":
main()
exit(0)

audio = AudioSegment.from_wav(
r"D:\PythonProject\原神语音中文\胡桃\vo_hutao_draw_appear.wav"
)

model_size = "large-v3"

model = WhisperModel(
model_size,
device="cuda",
compute_type="float16",
download_root="faster_whisper",
)

segments, info = model.transcribe(
r"D:\PythonProject\原神语音中文\胡桃\vo_hutao_draw_appear.wav",
beam_size=5,
)

print(
"Detected language '%s' with probability %f"
% (info.language, info.language_probability)
)
print("Total len(ms): ", len(audio))

for i, segment in enumerate(segments):
print(
"Segment %03d [%.2fs -> %.2fs] %s"
% (i, segment.start, segment.end, segment.text)
)
start_ms = int(segment.start * 1000)
end_ms = int(segment.end * 1000)
segment_audio = audio[start_ms:end_ms]
segment_audio.export(f"segment_{i:03d}.wav", format="wav")
print(f"Exported segment_{i:03d}.wav")

print("All segments have been exported.")
