Skip to content

Commit

Permalink
Disable resampling for sample rates that evenly divide the gset sample rate
Browse files Browse the repository at this point in the history
Sampling rates of 11025 Hz and 22050 Hz evenly divide the gset sample rate, so they are safe and do not need resampling.
This simple fix gets rid of many spurious data points that resampling introduced and that were not present
in the original data annotations.

related to: #7
  • Loading branch information
anthonio9 committed Feb 3, 2024
1 parent 2cf80fd commit a204523
Showing 1 changed file with 38 additions and 36 deletions.
74 changes: 38 additions & 36 deletions penn/data/preprocess/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,46 +272,50 @@ def gset():
unvoiced = pitch == 0
voiced = ~unvoiced

pitch_list = np.vsplit(pitch, pitch.shape[0])
pitch_list_final = []
# FOR sampling rates like 11025, 22050, 44100, resampling isn't necessary
if GSET_SAMPLE_RATE / penn.SAMPLE_RATE % 1 != 0:
printf("Resampling to penn.SAMPLE_RATE")

voiced_list = np.vsplit(voiced, voiced.shape[0])
voiced_list_final = []
pitch_list = np.vsplit(pitch, pitch.shape[0])
pitch_list_final = []

for pitch_arr, voiced_arr in zip(pitch_list, voiced_list):
# Get target number of frames
frames = penn.convert.samples_to_frames(audio.shape[-1])
voiced_list = np.vsplit(voiced, voiced.shape[0])
voiced_list_final = []

pitch_arr = pitch_arr[0, :]
voiced_arr = voiced_arr[0, :]
for pitch_arr, voiced_arr in zip(pitch_list, voiced_list):
# Get target number of frames
frames = penn.convert.samples_to_frames(audio.shape[-1])

# Linearly interpolate to target number of frames
new_times = penn.HOPSIZE_SECONDS * np.arange(0, frames)
new_times += penn.HOPSIZE_SECONDS / 2.
pitch_arr = 2. ** np.interp(new_times, times, np.log2(pitch_arr))
pitch_arr = pitch_arr[0, :]
voiced_arr = voiced_arr[0, :]

# Linearly interpolate voiced_arr/unvoiced_arr tokens
voiced_arr = np.interp(new_times, times, voiced_arr) > .5
# Linearly interpolate to target number of frames
new_times = penn.HOPSIZE_SECONDS * np.arange(0, frames)
new_times += penn.HOPSIZE_SECONDS / 2.
pitch_arr = 2. ** np.interp(new_times, times, np.log2(pitch_arr))

# Check shapes
assert (
penn.convert.samples_to_frames(audio.shape[-1]) ==
pitch_arr.shape[-1] ==
voiced_arr.shape[-1])
# Linearly interpolate voiced_arr/unvoiced_arr tokens
voiced_arr = np.interp(new_times, times, voiced_arr) > .5

assert np.logical_not(pitch_arr[voiced_arr] == 0).all()
# Check shapes
assert (
penn.convert.samples_to_frames(audio.shape[-1]) ==
pitch_arr.shape[-1] ==
voiced_arr.shape[-1])

pitch_list_final.append(pitch_arr)
voiced_list_final.append(voiced_arr)
assert np.logical_not(pitch_arr[voiced_arr] == 0).all()

pitch = np.vstack(pitch_list_final)
voiced = np.vstack(voiced_list_final)
pitch_list_final.append(pitch_arr)
voiced_list_final.append(voiced_arr)

if pitch.shape[0] == 1:
pitch = pitch[0, :]
pitch = np.vstack(pitch_list_final)
voiced = np.vstack(voiced_list_final)

if voiced.shape[0] == 1:
voiced = voiced[0, :]
if pitch.shape[0] == 1:
pitch = pitch[0, :]

if voiced.shape[0] == 1:
voiced = voiced[0, :]

# Save to cache
np.save(output_directory / f'{stem}-pitch.npy', pitch)
Expand Down Expand Up @@ -559,13 +563,11 @@ def extract_pitch_array_jams(jam: jams.JAMS, track, uniform=True) -> Tuple[np.nd
freq = np.array([pitch.value['frequency']])

# Don't keep track of zero or unvoiced frequencies
if np.sum(freq) == 0 or not pitch.value['voiced']:
freq = np.zeros(1)

# Append the observation time
entry_times = np.append(entry_times, pitch.time)
# Append the frequency
slice_pitch_list.append(freq)
if np.sum(freq) != 0 and pitch.value['voiced']:
# Append the observation time
entry_times = np.append(entry_times, pitch.time)
# Append the frequency
slice_pitch_list.append(freq)

# Sort the pitch list before resampling just in case it is not already sorted
entry_times, slice_pitch_array = sort_pitch_list(entry_times, slice_pitch_list)
Expand Down

0 comments on commit a204523

Please sign in to comment.