Adiciona opção batch_size

matheusbach · Sep 23, 2023 · ad6594f · ad6594f
1 parent 3143c88
commit ad6594f
Show file tree

Hide file tree

Showing 3 changed files with 7 additions and 6 deletions.
diff --git a/legen.ipynb b/legen.ipynb
@@ -138,6 +138,7 @@
         "transcription_device = 'auto' #@param [\"auto\", \"cpu\", \"cuda\"]\n",
         "transcription_model = 'large' #@param [\"tiny\", \"small\", \"medium\", \"large\"]\n",
         "compute_type = 'default' # @param [\"int8\", \"int16\", \"float16\", \"float32\"]\n",
+        "batch_size = 12 # @param {type: \"number\"}\n",
         "transcription_input_lang = 'auto detect' #@param [\"auto detect\", \"af\", \"am\", \"ar\", \"as\", \"az\", \"ba\", \"be\", \"bg\", \"bn\", \"bo\", \"br\", \"bs\", \"ca\", \"cs\", \"cy\", \"da\", \"de\", \"el\", \"en\", \"es\", \"et\", \"eu\", \"fa\", \"fi\", \"fo\", \"fr\", \"gl\", \"gu\", \"ha\", \"haw\", \"he\", \"hi\", \"hr\", \"ht\", \"hu\", \"hy\", \"id\", \"is\", \"it\", \"ja\", \"jw\", \"ka\", \"kk\", \"km\", \"kn\", \"ko\", \"la\", \"lb\", \"ln\", \"lo\", \"lt\", \"lv\", \"mg\", \"mi\", \"mk\", \"ml\", \"mn\", \"mr\", \"ms\", \"mt\", \"my\", \"ne\", \"nl\", \"nn\", \"no\", \"oc\", \"pa\", \"pl\", \"ps\", \"pt\", \"ro\", \"ru\", \"sa\", \"sd\", \"si\", \"sk\", \"sl\", \"sn\", \"so\", \"sq\", \"sr\", \"su\", \"sv\", \"sw\", \"ta\", \"te\", \"tg\", \"th\", \"tk\", \"tl\", \"tr\", \"tt\", \"uk\", \"ur\", \"uz\", \"vi\", \"yi\", \"yo\", \"zh\"]\n",
         "\n",
         "#@markdown ---\n",
@@ -193,6 +194,7 @@
         "query += f\" --dev {transcription_device}\"\n",
         "query += f\" --model {transcription_model}\"\n",
         "query += f\" --compute_type {compute_type}\"\n",
+        "query += f\" --batch_size {batch_size}\"\n",
         "query += f\" --input_lang {transcription_input_lang}\" if transcription_input_lang != \"auto detect\" else \"\"\n",
         "query += f\" --lang {target_language_code}\"\n",
         "query += f\" -c:v {video_codec}\" + (\"\" if video_hardware_api == \"none\" else f\"_{video_hardware_api}\" if video_hardware_api != \"auto\" else \"_nvenc\" if torch.cuda.is_available() else \"\")\n",

diff --git a/legen.py b/legen.py
@@ -9,7 +9,7 @@
 import file_utils
 import translate_utils
 
-version = "v0.14"
+version = "v0.14.2"
 
 # Terminal colors
 default = "\033[1;0m"
@@ -50,6 +50,8 @@
                     help="Dispositivo para rodar a transcrição pelo Whisper. [cpu, cuda, auto]. (default: auto)")
 parser.add_argument("--compute_type", type=str, default="default",
                     help="Quantization for the neural network. Ex: float32, float16, int8, ...")
+parser.add_argument("--batch_size", type=int, default=4,
+                    help="The higher the value, the faster the processing will be. If you have low RAM or have buggy subtitles, reduce this value. Works only using whisperX. (default: 4)")
 parser.add_argument("--lang", type=str, default="pt",
                     help="Idioma para o qual as legendas devem ser traduzidas. Language equals to source video skip translation (default: pt)")
 parser.add_argument("--input_lang", type=str, default="auto",
@@ -200,7 +202,7 @@
                     print(
                         f"{wblue}Transcribing{default} with {gray}WhisperX{default}")
                     whisperx_utils.transcribe_audio(
-                        whisper_model, audio_extracted.getpath(), transcribed_srt_temp.getpath(), audio_language, device=torch_device)
+                        whisper_model, audio_extracted.getpath(), transcribed_srt_temp.getpath(), audio_language, device=torch_device, batch_size=args.batch_size)
                 else:
                     print(
                         f"{wblue}Transcribing{default} with {gray}Whisper{default}")

diff --git a/whisperx_utils.py b/whisperx_utils.py
@@ -9,10 +9,7 @@
 #import faster_whisper
 #import numpy as np
 
-batch_size = 4  # reduce if low on GPU mem
-
-
-def transcribe_audio(model: whisperx.asr.WhisperModel, audio_path: Path, srt_path: Path, lang: str = None, disable_fp16: bool = False, device: str = "cpu"):
+def transcribe_audio(model: whisperx.asr.WhisperModel, audio_path: Path, srt_path: Path, lang: str = None, device: str = "cpu", batch_size: int = 4):
     audio = whisperx.load_audio(file=audio_path.as_posix())
 
     # Transcribe