Commit d10fa634 authored and committed by Adrien Béraud

speex: use AVFrame instead of AudioBuffer

Change-Id: I680b550f6412925e0fac0e732f40e7e319f01cc8
parent d09a15e1
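In short: rather than hand-deinterleaving into an AudioBuffer, the processed frame is now converted between interleaved AV_SAMPLE_FMT_S16 and planar AV_SAMPLE_FMT_S16P AVFrames using Jami's Resampler, a wrapper around libswresample; in the planar layout each channel occupies its own contiguous data[] plane. A minimal standalone sketch of that deinterleave step with raw libswresample (the helper name and error handling are illustrative, not Jami code):

extern "C" {
#include <libavutil/frame.h>
#include <libswresample/swresample.h>
}

// Sketch: convert an interleaved S16 frame to planar S16P so each channel
// becomes one contiguous int16_t plane in out->data[].
static AVFrame*
deinterleaveS16(const AVFrame* in)
{
    AVFrame* out = av_frame_alloc();
    if (!out)
        return nullptr;
    out->format = AV_SAMPLE_FMT_S16P;         // planar variant of the input format
    out->sample_rate = in->sample_rate;       // rate and channels stay the same
    out->channel_layout = in->channel_layout;

    SwrContext* swr = swr_alloc();
    // swr_convert_frame() initializes the context from the two frames'
    // parameters and allocates the output buffers if needed
    int err = swr_convert_frame(swr, out, in);
    swr_free(&swr);
    if (err < 0) {
        av_frame_free(&out);
        return nullptr;
    }
    return out;
}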
@@ -34,19 +34,25 @@ extern "C" {
 namespace jami {

-inline AudioFormat
-audioFormatToSampleFormat(AudioFormat format)
-{
-    return {format.sample_rate, format.nb_channels, AV_SAMPLE_FMT_S16};
-}
-
 SpeexAudioProcessor::SpeexAudioProcessor(AudioFormat format, unsigned frameSize)
-    : AudioProcessor(format, frameSize)
+    : AudioProcessor(format.withSampleFormat(AV_SAMPLE_FMT_S16), frameSize)
     , echoState(speex_echo_state_init_mc((int) frameSize,
                                          (int) frameSize * 16,
-                                         (int) format.nb_channels,
-                                         (int) format.nb_channels),
+                                         (int) format_.nb_channels,
+                                         (int) format_.nb_channels),
                 &speex_echo_state_destroy)
-    , iProcBuffer(frameSize_, format)
+    , procBuffer(std::make_unique<AudioFrame>(format.withSampleFormat(AV_SAMPLE_FMT_S16P), frameSize_))
 {
     JAMI_DBG("[speex-dsp] SpeexAudioProcessor, frame size = %d (=%d ms), channels = %d",
              frameSize,
              frameDurationMs_,
-             format.nb_channels);
+             format_.nb_channels);

     // set up speex echo state
     speex_echo_ctl(echoState.get(), SPEEX_ECHO_SET_SAMPLING_RATE, &format_.sample_rate);
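For context on the echo-canceller arguments kept above: speex_echo_state_init_mc() takes the frame size in samples, the filter length (how long an echo tail the canceller can model; frameSize * 16 means a 16-frame tail), and the microphone and speaker channel counts. A standalone sketch against speexdsp, with illustrative values:

#include <speex/speex_echo.h>

// Sketch of a multichannel echo-canceller lifecycle; the numbers are
// illustrative, not Jami's actual configuration.
void echoCancelSketch()
{
    int frameSize = 480; // e.g. 10 ms at 48 kHz
    int channels = 2;
    SpeexEchoState* echo = speex_echo_state_init_mc(frameSize,
                                                    frameSize * 16, // filter length: 16-frame tail
                                                    channels,       // microphone channels
                                                    channels);      // speaker channels
    int rate = 48000;
    speex_echo_ctl(echo, SPEEX_ECHO_SET_SAMPLING_RATE, &rate);
    // per frame: speex_echo_cancellation(echo, recorded, played, out) on interleaved S16
    speex_echo_state_destroy(echo);
}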
@@ -66,10 +72,10 @@ SpeexAudioProcessor::SpeexAudioProcessor(AudioFormat format, unsigned frameSize)
     // set up speex preprocess states, one for each channel
     // note that they are not enabled here, but rather in the enable* functions
-    for (unsigned int i = 0; i < format.nb_channels; i++) {
+    for (unsigned int i = 0; i < format_.nb_channels; i++) {
         auto channelPreprocessorState
             = SpeexPreprocessStatePtr(speex_preprocess_state_init((int) frameSize,
-                                                                  (int) format.sample_rate),
+                                                                  (int) format_.sample_rate),
                                       &speex_preprocess_state_destroy);

         // set max noise suppression level
@@ -184,11 +190,11 @@ SpeexAudioProcessor::getProcessed()
         return {};
     }

-    auto processed = std::make_shared<AudioFrame>(record->getFormat(), record->getFrameSize());
+    std::shared_ptr<AudioFrame> processed;
     if (shouldAEC) {
         // we want to echo cancel
         // multichannel, output into processed
+        processed = std::make_shared<AudioFrame>(record->getFormat(), record->getFrameSize());
         speex_echo_cancellation(echoState.get(),
                                 (int16_t*) record->pointer()->data[0],
                                 (int16_t*) playback->pointer()->data[0],
@@ -198,16 +204,7 @@
         processed = record;
     }

-    // deinterleave processed into channels
-    std::vector<int16_t*> procData {format_.nb_channels};
-    iProcBuffer.deinterleave((const AudioSample*) processed->pointer()->data[0],
-                             frameSize_,
-                             format_.nb_channels);
-
-    // point procData to correct channels
-    for (unsigned int channel = 0; channel < format_.nb_channels; channel++) {
-        procData[channel] = iProcBuffer.getChannel(channel)->data();
-    }
+    deinterleaveResampler.resample(processed->pointer(), procBuffer->pointer());

     // overall voice activity
     bool overallVad = false;
@@ -218,7 +215,7 @@
     int channel = 0;
     for (auto& channelPreprocessorState : preprocessorStates) {
         // preprocesses in place, returns voice activity boolean
-        channelVad = speex_preprocess_run(channelPreprocessorState.get(), procData[channel]);
+        channelVad = speex_preprocess_run(channelPreprocessorState.get(), (int16_t*) procBuffer->pointer()->data[channel]);
         // boolean OR
         overallVad |= channelVad;
@@ -226,12 +223,10 @@
         channel += 1;
     }

     // reinterleave into processed
-    iProcBuffer.interleave((AudioSample*) processed->pointer()->data[0]);
+    interleaveResampler.resample(procBuffer->pointer(), processed->pointer());

     // add stabilized voice activity to the AudioFrame
     processed->has_voice = shouldDetectVoice && getStabilizedVoiceActivity(overallVad);

     return processed;
 }
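Since procBuffer is planar, each channel is one contiguous run of int16_t samples in AVFrame::data[channel], so speex_preprocess_run() can be pointed straight at a plane; this is what makes the old procData pointer table unnecessary. A sketch of the per-channel loop under the same assumptions (function and parameter names are illustrative):

extern "C" {
#include <libavutil/frame.h>
}
#include <speex/speex_preprocess.h>
#include <cstdint>
#include <vector>

// Sketch: run one preprocessor state per channel over the planes of a
// planar S16 frame, OR-ing the per-channel voice-activity results.
bool
preprocessPlanar(AVFrame* frame, std::vector<SpeexPreprocessState*>& states)
{
    bool anyVoice = false;
    for (size_t channel = 0; channel < states.size(); ++channel) {
        // data[channel] holds frame->nb_samples contiguous int16_t samples
        int vad = speex_preprocess_run(states[channel],
                                       (int16_t*) frame->data[channel]);
        anyVoice |= (vad != 0);
    }
    return anyVoice;
}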
......
@@ -21,7 +21,6 @@
 #pragma once

 #include "audio_processor.h"
-#include "media/audio/audiobuffer.h"

 // typedef speex C structs
 extern "C" {
@@ -57,7 +56,9 @@ private:
     // one for each channel
     std::vector<SpeexPreprocessStatePtr> preprocessorStates;

-    AudioBuffer iProcBuffer;
+    std::unique_ptr<AudioFrame> procBuffer {};
+    Resampler deinterleaveResampler;
+    Resampler interleaveResampler;

     // if we should do echo cancellation
     bool shouldAEC {false};
......
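A note on the pair of Resampler members: a libswresample context is configured for one specific input/output conversion, so keeping a dedicated instance per direction plausibly avoids reconfiguring a single context twice per frame. A hypothetical round-trip helper mirroring getProcessed() (the include path is assumed; the resample(input, output) call shape matches the diff):

#include "resampler.h" // Jami's wrapper around SwrContext; path assumed

// Hypothetical round trip: deinterleave, process, reinterleave, with one
// Resampler per direction as in SpeexAudioProcessor.
void
roundTrip(jami::Resampler& deinterleaver,
          jami::Resampler& interleaver,
          AVFrame* interleaved, // AV_SAMPLE_FMT_S16
          AVFrame* planar)      // AV_SAMPLE_FMT_S16P
{
    deinterleaver.resample(interleaved, planar); // S16  -> S16P
    // ... per-channel DSP on planar->data[channel] ...
    interleaver.resample(planar, interleaved);   // S16P -> S16
}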
@@ -23,12 +23,6 @@
 namespace jami {

-inline AudioFormat
-audioFormatToFloatPlanar(AudioFormat format)
-{
-    return {format.sample_rate, format.nb_channels, AV_SAMPLE_FMT_FLTP};
-}
-
 inline size_t
 webrtcFrameSize(AudioFormat format)
 {
@@ -38,19 +32,19 @@ webrtcFrameSize(AudioFormat format)
 constexpr int webrtcNoError = webrtc::AudioProcessing::kNoError;

 WebRTCAudioProcessor::WebRTCAudioProcessor(AudioFormat format, unsigned /* frameSize */)
-    : AudioProcessor(audioFormatToFloatPlanar(format), webrtcFrameSize(format))
+    : AudioProcessor(format.withSampleFormat(AV_SAMPLE_FMT_FLTP), webrtcFrameSize(format))
 {
     JAMI_LOG("[webrtc-ap] WebRTCAudioProcessor, frame size = {:d} (={:d} ms), channels = {:d}",
              frameSize_,
              frameDurationMs_,
-             format.nb_channels);
+             format_.nb_channels);

     webrtc::Config config;
     config.Set<webrtc::ExtendedFilter>(new webrtc::ExtendedFilter(true));
     config.Set<webrtc::DelayAgnostic>(new webrtc::DelayAgnostic(true));
     apm.reset(webrtc::AudioProcessing::Create(config));

-    webrtc::StreamConfig streamConfig((int) format.sample_rate, (int) format.nb_channels);
+    webrtc::StreamConfig streamConfig((int) format_.sample_rate, (int) format_.nb_channels);
     webrtc::ProcessingConfig pconfig = {
         streamConfig, /* input stream */
         streamConfig, /* output stream */
......
@@ -59,6 +59,12 @@ struct AudioFormat
         return fmt::format("{{{}, {} channels, {}Hz}}", av_get_sample_fmt_name(sampleFormat), nb_channels, sample_rate);
     }

+    inline AudioFormat withSampleFormat(AVSampleFormat format)
+    {
+        return {sample_rate, nb_channels, format};
+    }
+
     /**
      * Returns bytes necessary to hold one frame of audio data.
      */
......
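The new helper both processors use above: it swaps the sample layout while preserving rate and channel count. A brief usage sketch with illustrative values:

// Usage sketch for AudioFormat::withSampleFormat(); values illustrative.
AudioFormat capture {48000, 2, AV_SAMPLE_FMT_S16};                 // 48 kHz stereo, interleaved
AudioFormat planar = capture.withSampleFormat(AV_SAMPLE_FMT_S16P); // same rate/channels, planar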