1# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#     http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# ==============================================================================
15"""Mel-Frequency Cepstral Coefficients (MFCCs) ops."""
16
17from __future__ import absolute_import
18from __future__ import division
19from __future__ import print_function
20
21from tensorflow.python.framework import dtypes
22from tensorflow.python.framework import ops
23from tensorflow.python.ops import array_ops
24from tensorflow.python.ops import math_ops
25from tensorflow.python.ops.signal import dct_ops
26from tensorflow.python.util.tf_export import tf_export
27
28
@tf_export('signal.mfccs_from_log_mel_spectrograms')
def mfccs_from_log_mel_spectrograms(log_mel_spectrograms, name=None):
  """Converts `log_mel_spectrograms` into [MFCCs][mfcc].

  Built from GPU-compatible ops, and gradients are supported.

  A [Mel-Frequency Cepstral Coefficient (MFCC)][mfcc] representation is
  obtained by applying the DCT-II to a log-magnitude mel-scale spectrogram.
  The scaling used here follows the [HTK][htk] convention, whose DCT-II
  normalization is close to, but not exactly, orthogonal.

  The result contains all `num_mel_bins` coefficients; choosing which ones to
  keep is left to the caller. In speech recognition, for instance, it is common
  to keep only the first few, which yields an approximately pitch-invariant
  description of the signal.

  For example:

  ```python
  sample_rate = 16000.0
  # A Tensor of [batch_size, num_samples] mono PCM samples in the range [-1, 1].
  pcm = tf.placeholder(tf.float32, [None, None])

  # A 1024-point STFT with frames of 64 ms and 75% overlap.
  stfts = tf.signal.stft(pcm, frame_length=1024, frame_step=256,
                         fft_length=1024)
  spectrograms = tf.abs(stfts)

  # Warp the linear scale spectrograms into the mel-scale.
  num_spectrogram_bins = stfts.shape[-1].value
  lower_edge_hertz, upper_edge_hertz, num_mel_bins = 80.0, 7600.0, 80
  linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
    num_mel_bins, num_spectrogram_bins, sample_rate, lower_edge_hertz,
    upper_edge_hertz)
  mel_spectrograms = tf.tensordot(
    spectrograms, linear_to_mel_weight_matrix, 1)
  mel_spectrograms.set_shape(spectrograms.shape[:-1].concatenate(
    linear_to_mel_weight_matrix.shape[-1:]))

  # Compute a stabilized log to get log-magnitude mel-scale spectrograms.
  log_mel_spectrograms = tf.log(mel_spectrograms + 1e-6)

  # Compute MFCCs from log_mel_spectrograms and take the first 13.
  mfccs = tf.signal.mfccs_from_log_mel_spectrograms(
    log_mel_spectrograms)[..., :13]
  ```

  Args:
    log_mel_spectrograms: A `float32` `Tensor` (or a value convertible to one)
      of shape `[..., num_mel_bins]` holding log-magnitude mel-scale
      spectrograms.
    name: An optional name for the operation.

  Returns:
    A `float32` `Tensor` of shape `[..., num_mel_bins]` containing the MFCCs of
    `log_mel_spectrograms`.

  Raises:
    ValueError: If the static shape of `log_mel_spectrograms` shows that
      `num_mel_bins` (the innermost dimension) is zero.

  [mfcc]: https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
  [htk]: https://en.wikipedia.org/wiki/HTK_(software)
  """
  with ops.name_scope(name, 'mfccs_from_log_mel_spectrograms',
                      [log_mel_spectrograms]):
    log_mels = ops.convert_to_tensor(log_mel_spectrograms,
                                     dtype=dtypes.float32)

    # Look up the innermost dimension in the static shape. It is only
    # available when the rank is known and non-zero; `None` means the size has
    # to be read from the dynamic shape instead.
    static_shape = log_mels.shape
    static_bins = static_shape.dims[-1].value if static_shape.ndims else None

    if static_bins is None:
      num_mel_bins = array_ops.shape(log_mels)[-1]
    elif static_bins == 0:
      raise ValueError('num_mel_bins must be positive. Got: %s' % log_mels)
    else:
      num_mel_bins = static_bins

    # HTK scales every DCT-II basis vector by sqrt(2/N). That matches the
    # "orthogonal" DCT-II everywhere *except* the 0th bin, where the true
    # orthogonal DCT (as implemented by scipy) uses sqrt(1/N). Orthogonal
    # normalization is therefore not requested from the DCT op; instead its
    # output is multiplied by rsqrt(2 * N), i.e. `0.5 * sqrt(2/N)`.
    unscaled_dct = dct_ops.dct(log_mels, type=2)
    htk_scale = math_ops.rsqrt(
        math_ops.cast(num_mel_bins, dtypes.float32) * 2.0)
    return unscaled_dct * htk_scale
112