Skip to content

Commit 6ab5e13

Browse files
vincentqb and Isaac Seessel authored
Add support for 24-bit signed LPCM wav in sox_io backend (#1389) (#1398)
Co-authored-by: Isaac Seessel <iseessel@oberlin.edu>
1 parent 099d788 commit 6ab5e13

File tree

3 files changed

+63
-8
lines changed

3 files changed

+63
-8
lines changed

test/torchaudio_unittest/backend/sox_io/load_test.py

Lines changed: 54 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,49 @@ def assert_wav(self, dtype, sample_rate, num_channels, normalize, duration):
4242
assert sr == sample_rate
4343
self.assertEqual(data, expected)
4444

45+
def assert_24bit_wav(self, sample_rate, num_channels, normalize, duration):
46+
""" `sox_io_backend.load` can load 24-bit signed PCM wav format. Since torch does not support the ``int24`` dtype,
47+
we implicitly cast the resulting tensor to the ``int32`` dtype.
48+
49+
It is not possible to use #assert_wav method above, as #get_wav_data does not support
50+
the 'int24' dtype. This is because torch does not support the ``int24`` dtype.
51+
Hence, we must use the following workaround.
52+
53+
x
54+
|
55+
| 1. Generate 24-bit wav with Sox.
56+
|
57+
v 2. Convert 24-bit wav to 32-bit wav with Sox.
58+
wav(24-bit) ----------------------> wav(32-bit)
59+
| |
60+
| 3. Load 24-bit wav with torchaudio| 4. Load 32-bit wav with scipy
61+
| |
62+
v v
63+
tensor ----------> x <----------- tensor
64+
5. Compare
65+
66+
# Underlying assumptions are:
67+
# i. Sox properly converts from 24-bit to 32-bit
68+
# ii. Loading 32-bit wav file with scipy is correct.
69+
"""
70+
path = self.get_temp_path('1.original.wav')
71+
ref_path = self.get_temp_path('2.reference.wav')
72+
73+
# 1. Generate 24-bit signed wav with Sox
74+
sox_utils.gen_audio_file(
75+
path, sample_rate, num_channels,
76+
bit_depth=24, duration=duration)
77+
78+
# 2. Convert from 24-bit wav to 32-bit wav with sox
79+
sox_utils.convert_audio_file(path, ref_path, bit_depth=32)
80+
# 3. Load 24-bit wav with torchaudio
81+
data, sr = sox_io_backend.load(path, normalize=normalize)
82+
# 4. Load 32-bit wav with scipy
83+
data_ref = load_wav(ref_path, normalize=normalize)[0]
84+
# 5. Compare
85+
assert sr == sample_rate
86+
self.assertEqual(data, data_ref, atol=3e-03, rtol=1.3e-06)
87+
4588
def assert_mp3(self, sample_rate, num_channels, bit_rate, duration):
4689
"""`sox_io_backend.load` can load mp3 format.
4790
@@ -50,7 +93,7 @@ def assert_mp3(self, sample_rate, num_channels, bit_rate, duration):
5093
5194
x
5295
|
53-
| 1. Generate mp3 with Sox
96+
| 1. Generate mp3 with Sox
5497
|
5598
v 2. Convert to wav with Sox
5699
mp3 ------------------------------> wav
@@ -61,7 +104,7 @@ def assert_mp3(self, sample_rate, num_channels, bit_rate, duration):
61104
tensor ----------> x <----------- tensor
62105
5. Compare
63106
64-
Underlying assumptions are;
107+
Underlying assumptions are:
65108
i. Conversion of mp3 to wav with Sox preserves data.
66109
ii. Loading wav file with scipy is correct.
67110
@@ -213,6 +256,15 @@ def test_wav(self, dtype, sample_rate, num_channels, normalize):
213256
"""`sox_io_backend.load` can load wav format correctly."""
214257
self.assert_wav(dtype, sample_rate, num_channels, normalize, duration=1)
215258

259+
@parameterized.expand(list(itertools.product(
260+
[8000, 16000],
261+
[1, 2],
262+
[False, True],
263+
)), name_func=name_func)
264+
def test_24bit_wav(self, sample_rate, num_channels, normalize):
265+
"""`sox_io_backend.load` can load 24bit wav format correctly. Correctly casts it to ``int32`` tensor dtype."""
266+
self.assert_24bit_wav(sample_rate, num_channels, normalize, duration=1)
267+
216268
@parameterized.expand(list(itertools.product(
217269
['int16'],
218270
[16000],

torchaudio/backend/sox_io_backend.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ def load(
7373
7474
* 32-bit floating-point
7575
* 32-bit signed integer
76+
* 24-bit signed integer
7677
* 16-bit signed integer
7778
* 8-bit unsigned integer (WAV only)
7879
@@ -92,10 +93,11 @@ def load(
9293
The samples are normalized to fit in the range of ``[-1.0, 1.0]``.
9394
9495
When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit
95-
signed integer and 8-bit unsigned integer (24-bit signed integer is not supported),
96-
by providing ``normalize=False``, this function can return integer Tensor, where the samples
97-
are expressed within the whole range of the corresponding dtype, that is, ``int32`` tensor
98-
for 32-bit signed PCM, ``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM.
96+
signed integer, 24-bit signed integer, and 8-bit unsigned integer, by providing ``normalize=False``,
97+
this function can return integer Tensor, where the samples are expressed within the whole range
98+
of the corresponding dtype, that is, ``int32`` tensor for 32-bit signed PCM,
99+
``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM. Since torch does not
100+
support ``int24`` dtype, 24-bit signed PCM samples are converted to ``int32`` tensors.
99101
100102
``normalize`` parameter has no effect on 32-bit floating-point WAV and other formats, such as
101103
``flac`` and ``mp3``.

torchaudio/csrc/sox/utils.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,15 +118,16 @@ caffe2::TypeMeta get_dtype(
118118
switch (encoding) {
119119
case SOX_ENCODING_UNSIGNED: // 8-bit PCM WAV
120120
return torch::kUInt8;
121-
case SOX_ENCODING_SIGN2: // 16-bit or 32-bit PCM WAV
121+
case SOX_ENCODING_SIGN2: // 16-bit, 24-bit, or 32-bit PCM WAV
122122
switch (precision) {
123123
case 16:
124124
return torch::kInt16;
125+
case 24: // Cast 24-bit to 32-bit.
125126
case 32:
126127
return torch::kInt32;
127128
default:
128129
throw std::runtime_error(
129-
"Only 16 and 32 bits are supported for signed PCM.");
130+
"Only 16, 24, and 32 bits are supported for signed PCM.");
130131
}
131132
default:
132133
// default to float32 for the other formats, including

0 commit comments

Comments
 (0)