-
Notifications
You must be signed in to change notification settings - Fork 2
/
audioTransciption.py
200 lines (166 loc) · 6.18 KB
/
audioTransciption.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
#!/usr/bin/env python2
'''
This Python 2 script transcribes the audio of one audio source with the help of Bing's Speech to Text API.
This is a multi-threaded Python script, so it might be hard to kill!
'''
import asyncore
import io
import json
import sys
import threading
import time
import requests
import speech_recognition as sr
'''
Array of Bing Voice Recognition API keys (32-character lowercase hexadecimal strings)
Recommended number of keys: at least 4/DELTA
'''
BING_KEY = ["INSERT API KEY HERE"]
'''
The following parameters describe the configuration of your sliding windows.
DELTA: Minimal distance between each window in seconds
NUMBER_OF_THREADS: Number of sliding windows
WAIT_CYCLE: Number of DELTA intervals to wait until each window's recording is restarted (WAIT_CYCLE*DELTA = time to wait). This should be bigger than the response time of Bing.
The wait that is actually used is max(responseTime, DELTA*WAIT_CYCLE).
=> The resulting length of each sliding window is at least (NUMBER_OF_THREADS - WAIT_CYCLE) * DELTA
'''
DELTA = 3
NUMBER_OF_THREADS = 7
WAIT_CYCLE = 2
'''
Audio device index as defined in PyAudio
'''
AUDIO_DEVICE_INDEX = 0
'''
The address (URL) of the node.js server endpoint that exposes the API.
If you use two computers, ngrok might be your rescue.
'''
IP = "http://localhost:3000/speech"
# API Header
HEADER = {'Content-Type': 'application/json'}
'''
This method tries to call the Bing API with a given audio frame and sends the resulting transcription to the node.js server.
@param audio Audio frame of multiple chunks
@param r Audio recognizer
@param i The unique number of the sliding window is used to select the proper api key.
'''
def callBingApi(r, audio, i):
    '''
    Transcribe one audio frame with Bing and forward the text to the server.

    On success the transcription is printed and passed to sendToServer.
    If the audio cannot be understood or the request fails, a message is
    printed and nothing is sent.
    @param r Audio recognizer that provides recognize_bing
    @param audio Audio frame to transcribe
    @param i Number of the sliding window; selects the API key as i modulo the number of keys
    '''
    try:
        # Rotate through the configured keys based on the window number.
        apiKey = BING_KEY[i % len(BING_KEY)]
        transcript = r.recognize_bing(audio, key=apiKey)
        print(transcript)
        sendToServer(transcript)
    except sr.UnknownValueError:
        notUnderstood = "Microsoft Bing Voice Recognition could not understand audio"
        print(notUnderstood)
    except sr.RequestError as err:
        requestFailed = "Could not request results from Microsoft Bing Voice Recognition service; {0}"
        print(requestFailed.format(err))
'''
This method builds a dictionary of threading events that is used to coordinate the different sliding windows.
It will build start, stop and done threading events for each thread.
'''
def buildEventDict():
    '''
    Create the set of coordination events for one sliding window.

    @return A dictionary with the keys 'start', 'stop' and 'done', each
            mapped to its own freshly created (unset) threading.Event
    '''
    return {name: threading.Event() for name in ('start', 'stop', 'done')}
'''
This method will send a string to the node.js backend.
It uses the exposed API of the server to transmit a JSON object with the following structure:
{
'text': string
}
@param msg The string that should be transmitted to the server
'''
def sendToServer(msg):
    '''
    Send a transcription string to the node.js backend.

    The string is POSTed to the URL in IP as the JSON object
    {'text': msg} using the headers in HEADER. A failed request is
    reported on stdout and otherwise ignored, so one lost message does
    not stop the calling sliding-window thread.
    @param msg The string that should be transmitted to the server
    '''
    payload = json.dumps({'text': msg})
    try:
        requests.post(IP, data=payload, headers=HEADER)
    except requests.RequestException as e:
        # The original "except e:" referenced an undefined name, so a failed
        # request raised NameError instead of printing this message.
        print("Request Failed " + str(e))
'''
This function records the audio for one sliding window of an audio source until the stopEvent is set.
@param source Audio source that should be recorded
@param stopEvent The threading event that should stop recording
@return The audio frame containing the recorded audio
'''
def recordWindow(source, stopEvent):
    '''
    Record audio from an entered audio source until stopEvent is set.

    Chunks of source.CHUNK frames are read from source.stream in a loop;
    the stop event is checked before each read, so recording ends after
    the chunk that is in progress when the event gets set.
    @param source Audio source that should be recorded (must already be entered, i.e. source.stream is not None)
    @param stopEvent The threading event that should stop recording
    @return The audio frame (sr.AudioData) containing the recorded audio
    '''
    assert source.stream is not None, "Audio source must be entered before recording, see documentation for ``AudioSource``; are you using ``source`` outside of a ``with`` statement?"
    chunks = []
    while not stopEvent.is_set():
        chunks.append(source.stream.read(source.CHUNK))
    # Concatenate once at the end instead of writing into an in-memory file.
    frameData = b"".join(chunks)
    return sr.AudioData(frameData, source.SAMPLE_RATE, source.SAMPLE_WIDTH)
'''
This function describes one sliding window and can be used as thread.
It will listen to threading events and set them according to its state, to make the sliding windows work.
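@param eList Dictionary with the 'start', 'stop' and 'done' threading events of this window
@param countingVar Number of this sliding window, passed on to the Bing API call for key selection
@param recognizer Audio recognizer used for the Bing API call
@param sourceMic Microphone that is entered for each recording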
'''
def slidingWindowThread(eList, countingVar, recognizer, sourceMic):
    '''
    Run one sliding window forever; intended as a thread target.

    Each cycle waits for the 'start' event, records from the microphone
    until the 'stop' event is set, sends the recording to Bing via
    callBingApi, and then sets the 'done' event. Timing information for
    the recording and the request is printed after every cycle.
    @param eList Dictionary with the 'start', 'stop' and 'done' threading events of this window
    @param countingVar Number of this window; increased by NUMBER_OF_THREADS after every cycle
    @param recognizer Audio recognizer passed to callBingApi
    @param sourceMic Microphone that is entered for each recording
    '''
    startEvent = eList['start']
    stopEvent = eList['stop']
    doneEvent = eList['done']
    while True:
        # Block until this window is told to start recording.
        startEvent.wait()
        startEvent.clear()
        startTime = time.time()
        # NOTE(review): the microphone is released as soon as the recording
        # ends; confirm this matches the intended extent of the with-block.
        with sourceMic as source:
            audio = recordWindow(source, stopEvent)
        stopEvent.clear()
        requestTime = time.time()
        callBingApi(recognizer, audio, countingVar)
        stopTime = time.time()
        # Signal that this window is free to be started again.
        doneEvent.set()
        print(str(countingVar) + "<---->" + str(requestTime - startTime) +
              " , " + str(stopTime - requestTime))
        countingVar += NUMBER_OF_THREADS
'''
Main of the audio transcription script
'''
if __name__ == '__main__':
    # One set of events, one counter, one recognizer and one microphone
    # handle per sliding window.
    eList = [buildEventDict() for _ in range(NUMBER_OF_THREADS)]
    countingVarList = list(range(NUMBER_OF_THREADS))
    rList = [sr.Recognizer() for _ in range(NUMBER_OF_THREADS)]
    sourceList = [sr.Microphone(device_index=AUDIO_DEVICE_INDEX, sample_rate=48000)
                  for _ in range(NUMBER_OF_THREADS)]
    threadList = []
    for i in range(NUMBER_OF_THREADS):
        worker = threading.Thread(name=str(i) + '-SlidingWindowThread',
                                  target=slidingWindowThread,
                                  args=(eList[i], countingVarList[i], rList[i], sourceList[i]))
        # The workers loop forever; as daemon threads they no longer keep the
        # process alive once the main thread is interrupted.
        worker.daemon = True
        threadList.append(worker)
        worker.start()
    # Initial start: the first windows begin recording right away ...
    for i in range(NUMBER_OF_THREADS - WAIT_CYCLE):
        eList[i]['start'].set()
    # ... and the last WAIT_CYCLE windows are marked as done, so the
    # scheduler below can start them without waiting.
    for i in range(NUMBER_OF_THREADS - WAIT_CYCLE, NUMBER_OF_THREADS):
        eList[i]['done'].set()
    try:
        while True:
            for i in range(NUMBER_OF_THREADS):
                # All active windows record simultaneously for DELTA seconds.
                time.sleep(DELTA)
                # Stop the i-th recording.
                eList[i]['stop'].set()
                # Restart the window that was stopped WAIT_CYCLE steps ago,
                # once it has finished its request.
                restartEvents = eList[(i - WAIT_CYCLE) % NUMBER_OF_THREADS]
                restartEvents['done'].wait()
                restartEvents['done'].clear()
                restartEvents['start'].set()
    except KeyboardInterrupt:
        print("Stopping audio transcription")