Skip to content
Snippets Groups Projects
Commit 6ff875cb authored by Tobias Hildebrandt's avatar Tobias Hildebrandt Committed by Adrien Béraud
Browse files

audio: add voice activity detection

* rename EchoCanceller to AudioProcessor
* enable echo cancellation and voice activity detection
* add minimum active duration and trigger time to voice activation

Gitlab: #741
Change-Id: I98662462c17539fca1d042482e97fdb3eff86130
parent 7e74098c
No related branches found
No related tags found
No related merge requests found
Showing
with 841 additions and 101 deletions
......@@ -22,7 +22,7 @@ source_group("Source Files\\jamidht\\eth\\libdevcore" FILES ${Source_Files__jami
source_group("Source Files\\jamidht\\eth\\libdevcrypto" FILES ${Source_Files__jamidht__eth__libdevcrypto})
source_group("Source Files\\media" FILES ${Source_Files__media})
source_group("Source Files\\media\\audio" FILES ${Source_Files__media__audio})
source_group("Source Files\\media\\audio\\echo-cancel" FILES ${Source_Files__media__audio__echo_cancel})
source_group("Source Files\\media\\audio\\audio-processing" FILES ${Source_Files__media__audio__audio_processing})
source_group("Source Files\\media\\audio\\sound" FILES ${Source_Files__media__audio__sound})
source_group("Source Files\\media\\video" FILES ${Source_Files__media__video})
source_group("Source Files\\plugin" FILES ${Source_Files__plugin})
......@@ -55,7 +55,7 @@ list (APPEND ALL_FILES
${Source_Files__media}
${Source_Files__media__audio}
${Source_Files__media__audio__sound}
${Source_Files__media__audio__echo_cancel}
${Source_Files__media__audio__audio_processing}
${Source_Files__media__video}
${Source_Files__security}
${Source_Files__sip}
......
......@@ -29,7 +29,7 @@ deplibavformat = dependency('libavformat', version: '>= 56.40.101')
deplibswscale = dependency('libswscale', version: '>= 3.1.101')
deplibswresample = dependency('libswresample', version: '>= 1.2.101')
deplibavutil = dependency('libavutil', version: '>= 55.75.100')
depspeexdsp = dependency('speexdsp')
depfmt = dependency('fmt', version: '>= 5.3')
depyamlcpp = dependency('yaml-cpp', version: '>= 0.5.1', required: false)
......@@ -113,6 +113,9 @@ conf.set10('HAVE_RINGNS', depopenssl.found())
depwebrtcap = dependency('webrtc-audio-processing', required: get_option('aec'))
conf.set10('HAVE_WEBRTC_AP', depwebrtcap.found())
depspeexdsp = dependency('speexdsp')
conf.set10('HAVE_SPEEXDSP', depspeexdsp.found())
if get_option('video')
conf.set('ENABLE_VIDEO', true)
if host_machine.system() == 'linux' and meson.get_compiler('cpp').get_define('__ANDROID__') != '1'
......
......@@ -105,6 +105,7 @@ public:
float calcRMS() const;
jami::AudioFormat getFormat() const;
size_t getFrameSize() const;
bool has_voice {false};
private:
void setFormat(const jami::AudioFormat& format);
......
......@@ -50,5 +50,5 @@ endif()
add_subdirectory(sound)
set (Source_Files__media__audio__sound ${Source_Files__media__audio__sound} PARENT_SCOPE)
add_subdirectory(echo-cancel)
set (Source_Files__media__audio__echo_cancel ${Source_Files__media__audio__echo_cancel} PARENT_SCOPE)
\ No newline at end of file
add_subdirectory(audio-processing)
set (Source_Files__media__audio__audio_processing ${Source_Files__media__audio__audio_processing} PARENT_SCOPE)
......@@ -53,7 +53,7 @@ noinst_HEADERS += $(RING_SPEEXDSP_HEAD) \
include ./media/audio/sound/Makefile.am
include ./media/audio/echo-cancel/Makefile.am
include ./media/audio/audio-processing/Makefile.am
if BUILD_OPENSL
include ./media/audio/opensl/Makefile.am
......
################################################################################
# Source groups - audio-processing
################################################################################
list (APPEND Source_Files__media__audio__audio_processing
"${CMAKE_CURRENT_SOURCE_DIR}/audio_processor.h"
"${CMAKE_CURRENT_SOURCE_DIR}/null_audio_processor.h"
"${CMAKE_CURRENT_SOURCE_DIR}/null_audio_processor.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/speex.h"
"${CMAKE_CURRENT_SOURCE_DIR}/speex.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/webrtc.h"
"${CMAKE_CURRENT_SOURCE_DIR}/webrtc.cpp"
)
set (Source_Files__media__audio__audio_processing ${Source_Files__media__audio__audio_processing} PARENT_SCOPE)
noinst_LTLIBRARIES += libaudioprocessing.la
EC_SRC = ./media/audio/audio-processing/null_audio_processor.cpp
EC_HDR = ./media/audio/audio-processing/null_audio_processor.h
if BUILD_SPEEXDSP
EC_SRC += ./media/audio/audio-processing/speex.cpp
EC_HDR += ./media/audio/audio-processing/speex.h
endif
if HAVE_WEBRTC_AP
EC_SRC += ./media/audio/audio-processing/webrtc.cpp
EC_HDR += ./media/audio/audio-processing/webrtc.h
libaudioprocessing_la_CXXFLAGS = @WEBRTC_CFLAGS@ $(AM_CXXFLAGS)
endif
libaudioprocessing_la_SOURCES = \
$(EC_SRC)
noinst_HEADERS += \
./media/audio/audio-processing/audio_processor.h \
$(EC_HDR)
libaudio_la_LIBADD += libaudioprocessing.la
/*
* Copyright (C) 2021-2022 Savoir-faire Linux Inc.
*
* Author: Andreas Traczyk <andreas.traczyk@savoirfairelinux.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
......@@ -31,20 +29,21 @@
namespace jami {
class EchoCanceller
class AudioProcessor
{
private:
NON_COPYABLE(EchoCanceller);
NON_COPYABLE(AudioProcessor);
public:
EchoCanceller(AudioFormat format, unsigned frameSize)
: playbackQueue_(format, frameSize)
, recordQueue_(format, frameSize)
AudioProcessor(AudioFormat format, unsigned frameSize)
: playbackQueue_(format, (int) frameSize)
, recordQueue_(format, (int) frameSize)
, resampler_(new Resampler)
, format_(format)
, frameSize_(frameSize)
, frameDurationMs_((unsigned int) (frameSize_ * (1.0 / format_.sample_rate) * 1000))
{}
virtual ~EchoCanceller() = default;
virtual ~AudioProcessor() = default;
virtual void putRecorded(std::shared_ptr<AudioFrame>&& buf)
{
......@@ -61,8 +60,27 @@ public:
auto copy = buf;
enqueue(playbackQueue_, std::move(copy));
};
/**
* @brief Process and return a single AudioFrame
*/
virtual std::shared_ptr<AudioFrame> getProcessed() = 0;
virtual void done() = 0;
/**
* @brief Set the status of echo cancellation
*/
virtual void enableEchoCancel(bool enabled) = 0;
/**
* @brief Set the status of noise suppression
* includes de-reverb, de-noise, high pass filter, etc
*/
virtual void enableNoiseSuppression(bool enabled) = 0;
/**
* @brief Set the status of automatic gain control
*/
virtual void enableAutomaticGainControl(bool enabled) = 0;
protected:
AudioFrameResizer playbackQueue_;
......@@ -71,7 +89,82 @@ protected:
std::atomic_bool playbackStarted_;
std::atomic_bool recordStarted_;
AudioFormat format_;
unsigned frameSize_;
unsigned int frameSize_;
unsigned int frameDurationMs_;
// artificially extend voice activity by this long
unsigned int forceMinimumVoiceActivityMs {1000};
// current number of frames to force the voice activity to be true
unsigned int forceVoiceActiveFramesLeft {0};
// voice activity must be active for this long _before_ it is considered legitimate
unsigned int minimumConsequtiveDurationMs {200};
// current number of frames that the voice activity has been true
unsigned int consecutiveActiveFrames {0};
/**
* @brief Helper method for audio processors, should be called at start of getProcessed()
* Pops frames from audio queues if there's overflow
* @returns True if there is underflow, false otherwise. An AudioProcessor should
* return a blank AudioFrame if there is underflow.
*/
bool tidyQueues()
{
    // Cap each queue at ten frames of backlog, discarding the oldest data
    // so latency cannot grow without bound.
    const auto recordCap = recordQueue_.frameSize() * 10;
    while (recordQueue_.samples() > recordCap) {
        JAMI_DBG("record overflow %d / %d", recordQueue_.samples(), frameSize_);
        recordQueue_.dequeue();
    }
    const auto playbackCap = playbackQueue_.frameSize() * 10;
    while (playbackQueue_.samples() > playbackCap) {
        JAMI_DBG("playback overflow %d / %d", playbackQueue_.samples(), frameSize_);
        playbackQueue_.dequeue();
    }
    // Underflow: unless both queues hold at least one full frame,
    // there is nothing to process this round.
    return recordQueue_.samples() < recordQueue_.frameSize()
           || playbackQueue_.samples() < playbackQueue_.frameSize();
}
/**
* @brief Stabilizes voice activity
* @param voiceStatus the voice status that was detected by the audio processor
* for the current frame
* @returns The voice activity status that should be set on the current frame
*/
bool getStabilizedVoiceActivity(bool voiceStatus)
{
bool newVoice = false;
if (voiceStatus) {
// we detected activity
consecutiveActiveFrames += 1;
// make sure that we have been active for necessary time
if (consecutiveActiveFrames > minimumConsequtiveDurationMs / frameDurationMs_) {
newVoice = true;
// set number of frames that will be forced positive
forceVoiceActiveFramesLeft = (int) forceMinimumVoiceActivityMs / frameDurationMs_;
}
} else if (forceVoiceActiveFramesLeft > 0) {
// if we didn't detect voice, but we haven't elapsed the minimum duration,
// force voice to be true
newVoice = true;
forceVoiceActiveFramesLeft -= 1;
consecutiveActiveFrames += 1;
} else {
// else no voice and no need to force
newVoice = false;
consecutiveActiveFrames = 0;
}
return newVoice;
}
private:
void enqueue(AudioFrameResizer& frameResizer, std::shared_ptr<AudioFrame>&& buf)
......
/*
* Copyright (C) 2021-2022 Savoir-faire Linux Inc.
*
* Author: Andreas Traczyk <andreas.traczyk@savoirfairelinux.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
......@@ -18,56 +16,30 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include "null_echo_canceller.h"
#include "null_audio_processor.h"
#include <cassert>
namespace jami {
NullEchoCanceller::NullEchoCanceller(AudioFormat format, unsigned frameSize)
: EchoCanceller(format, frameSize)
{}
void
NullEchoCanceller::putRecorded(std::shared_ptr<AudioFrame>&& buf)
{
EchoCanceller::putRecorded(std::move(buf));
};
void
NullEchoCanceller::putPlayback(const std::shared_ptr<AudioFrame>& buf)
NullAudioProcessor::NullAudioProcessor(AudioFormat format, unsigned frameSize)
: AudioProcessor(format, frameSize)
{
EchoCanceller::putPlayback(buf);
};
JAMI_DBG("[null_audio] NullAudioProcessor, frame size = %d (=%d ms), channels = %d",
frameSize,
frameDurationMs_,
format.nb_channels);
}
std::shared_ptr<AudioFrame>
NullEchoCanceller::getProcessed()
NullAudioProcessor::getProcessed()
{
while (recordQueue_.samples() > recordQueue_.frameSize() * 10) {
JAMI_DBG("record overflow %d / %d", recordQueue_.samples(), frameSize_);
recordQueue_.dequeue();
}
while (playbackQueue_.samples() > playbackQueue_.frameSize() * 10) {
JAMI_DBG("playback overflow %d / %d", playbackQueue_.samples(), frameSize_);
playbackQueue_.dequeue();
}
if (recordQueue_.samples() < recordQueue_.frameSize()
|| playbackQueue_.samples() < playbackQueue_.frameSize()) {
JAMI_DBG("underflow rec: %d, play: %d fs: %d",
recordQueue_.samples(),
playbackQueue_.samples(),
frameSize_);
if (tidyQueues()) {
return {};
}
JAMI_WARN("Processing %d samples, rec: %d, play: %d ",
frameSize_,
recordQueue_.samples(),
playbackQueue_.samples());
playbackQueue_.dequeue();
return recordQueue_.dequeue();
};
void NullEchoCanceller::done() {};
} // namespace jami
/*
* Copyright (C) 2021-2022 Savoir-faire Linux Inc.
*
* Author: Andreas Traczyk <andreas.traczyk@savoirfairelinux.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
......@@ -20,20 +18,23 @@
#pragma once
#include "echo_canceller.h"
#include "audio_processor.h"
namespace jami {
class NullEchoCanceller final : public EchoCanceller
class NullAudioProcessor final : public AudioProcessor
{
public:
NullEchoCanceller(AudioFormat format, unsigned frameSize);
~NullEchoCanceller() = default;
NullAudioProcessor(AudioFormat format, unsigned frameSize);
~NullAudioProcessor() = default;
void putRecorded(std::shared_ptr<AudioFrame>&& buf) override;
void putPlayback(const std::shared_ptr<AudioFrame>& buf) override;
std::shared_ptr<AudioFrame> getProcessed() override;
void done() override;
void enableEchoCancel(bool) override {};
void enableNoiseSuppression(bool) override {};
void enableAutomaticGainControl(bool) override {};
};
} // namespace jami
/*
* Copyright (C) 2021-2022 Savoir-faire Linux Inc.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include "speex.h"
#include "audio/audiolayer.h"
#include <cstdint>
#include <memory>
#include <speex/speex_config_types.h>
#include <vector>
extern "C" {
#include <speex/speex_echo.h>
#include <speex/speex_preprocess.h>
}
namespace jami {
SpeexAudioProcessor::SpeexAudioProcessor(AudioFormat format, unsigned frameSize)
    : AudioProcessor(format, frameSize)
    , echoState(speex_echo_state_init_mc((int) frameSize,
                                         (int) frameSize * 16,
                                         (int) format.nb_channels,
                                         (int) format.nb_channels),
                &speex_echo_state_destroy)
    , iProcBuffer(frameSize_, format)
{
    JAMI_DBG("[speex-dsp] SpeexAudioProcessor, frame size = %d (=%d ms), channels = %d",
             frameSize,
             frameDurationMs_,
             format.nb_channels);

    // tell the (multichannel) echo canceller the stream's sample rate
    speex_echo_ctl(echoState.get(), SPEEX_ECHO_SET_SAMPLING_RATE, &format_.sample_rate);

    // speex toggles features through a pointer to an int; 1 == enabled
    spx_int32_t enabledFlag = 1;
    // VAD fires when speex's raw probability (in percent) exceeds this value
    spx_int32_t vadStartProbability = 99;
    // lower threshold applied while the previous frame already had voice
    spx_int32_t vadContinueProbability = 90;
    // strongest allowed noise attenuation, in (negative) dB
    spx_int32_t noiseSuppressDb = -50;

    // one preprocessor state per channel; the features themselves are
    // switched on/off later through the enable* methods
    for (unsigned int channelIdx = 0; channelIdx < format.nb_channels; channelIdx++) {
        SpeexPreprocessStatePtr state(speex_preprocess_state_init((int) frameSize,
                                                                  (int) format.sample_rate),
                                      &speex_preprocess_state_destroy);
        // cap how aggressively noise may be suppressed
        speex_preprocess_ctl(state.get(), SPEEX_PREPROCESS_SET_NOISE_SUPPRESS, &noiseSuppressDb);
        // configure voice activity detection thresholds
        speex_preprocess_ctl(state.get(), SPEEX_PREPROCESS_SET_VAD, &enabledFlag);
        speex_preprocess_ctl(state.get(), SPEEX_PREPROCESS_SET_PROB_START, &vadStartProbability);
        speex_preprocess_ctl(state.get(),
                             SPEEX_PREPROCESS_SET_PROB_CONTINUE,
                             &vadContinueProbability);
        preprocessorStates.push_back(std::move(state));
    }

    JAMI_INFO("[speex-dsp] Done initializing");
}
void
SpeexAudioProcessor::enableEchoCancel(bool enabled)
{
JAMI_DBG("[speex-dsp] enableEchoCancel %d", enabled);
// need to set member variable so we know to do it in getProcessed
shouldAEC = enabled;
if (enabled) {
// reset the echo canceller
speex_echo_state_reset(echoState.get());
for (auto& channelPreprocessorState : preprocessorStates) {
// attach our already-created echo canceller
speex_preprocess_ctl(channelPreprocessorState.get(),
SPEEX_PREPROCESS_SET_ECHO_STATE,
echoState.get());
}
} else {
for (auto& channelPreprocessorState : preprocessorStates) {
// detach echo canceller (set it to NULL)
// don't destroy it though, we will reset it when necessary
speex_preprocess_ctl(channelPreprocessorState.get(),
SPEEX_PREPROCESS_SET_ECHO_STATE,
NULL);
}
}
}
void
SpeexAudioProcessor::enableNoiseSuppression(bool enabled)
{
JAMI_DBG("[speex-dsp] enableNoiseSuppression %d", enabled);
spx_int32_t speexSetValue = (spx_int32_t) enabled;
// for each preprocessor
for (auto& channelPreprocessorState : preprocessorStates) {
// set denoise status
speex_preprocess_ctl(channelPreprocessorState.get(),
SPEEX_PREPROCESS_SET_DENOISE,
&speexSetValue);
// set de-reverb status
speex_preprocess_ctl(channelPreprocessorState.get(),
SPEEX_PREPROCESS_SET_DEREVERB,
&speexSetValue);
}
}
void
SpeexAudioProcessor::enableAutomaticGainControl(bool enabled)
{
JAMI_DBG("[speex-dsp] enableAutomaticGainControl %d", enabled);
spx_int32_t speexSetValue = (spx_int32_t) enabled;
// for each preprocessor
for (auto& channelPreprocessorState : preprocessorStates) {
// set AGC status
speex_preprocess_ctl(channelPreprocessorState.get(),
SPEEX_PREPROCESS_SET_AGC,
&speexSetValue);
}
}
/**
 * @brief Dequeues one playback and one record frame, optionally echo-cancels,
 * runs the speex preprocessor on each channel, and returns the processed
 * frame with its stabilized voice-activity flag set.
 * Returns an empty pointer on queue underflow or if a dequeue fails.
 * NOTE: the deinterleave -> preprocess -> interleave sequence below is
 * order-critical and operates on shared buffers; do not reorder.
 */
std::shared_ptr<AudioFrame>
SpeexAudioProcessor::getProcessed()
{
    // drop overflow and bail out on underflow (see AudioProcessor::tidyQueues)
    if (tidyQueues()) {
        return {};
    }
    auto playback = playbackQueue_.dequeue();
    auto record = recordQueue_.dequeue();
    if (!playback || !record) {
        return {};
    }
    auto processed = std::make_shared<AudioFrame>(record->getFormat(), record->getFrameSize());
    if (shouldAEC) {
        // we want to echo cancel
        // multichannel, output into processed
        speex_echo_cancellation(echoState.get(),
                                (int16_t*) record->pointer()->data[0],
                                (int16_t*) playback->pointer()->data[0],
                                (int16_t*) processed->pointer()->data[0]);
    } else {
        // don't want to echo cancel, so just use record frame instead
        // (processed now aliases record; the freshly allocated frame is dropped)
        processed = record;
    }
    // deinterleave processed into channels
    // (copies processed's samples into iProcBuffer's per-channel storage)
    std::vector<int16_t*> procData {format_.nb_channels};
    iProcBuffer.deinterleave((const AudioSample*) processed->pointer()->data[0],
                             frameSize_,
                             format_.nb_channels);
    // point procData to correct channels
    for (unsigned int channel = 0; channel < format_.nb_channels; channel++) {
        procData[channel] = iProcBuffer.getChannel(channel)->data();
    }
    // overall voice activity
    bool overallVad = false;
    // current channel voice activity
    int channelVad;
    // run preprocess on each channel
    // (preprocessorStates holds one state per channel, same order as procData)
    int channel = 0;
    for (auto& channelPreprocessorState : preprocessorStates) {
        // preprocesses in place, returns voice activity boolean
        channelVad = speex_preprocess_run(channelPreprocessorState.get(), procData[channel]);
        // boolean OR: voice on any channel counts as overall voice
        overallVad |= channelVad;
        channel += 1;
    }
    // reinterleave into processed
    // (writes the preprocessed samples back over processed's buffer)
    iProcBuffer.interleave((AudioSample*) processed->pointer()->data[0]);
    // add stabilized voice activity to the AudioFrame
    processed->has_voice = getStabilizedVoiceActivity(overallVad);
    return processed;
}
} // namespace jami
......@@ -20,32 +20,44 @@
#pragma once
#include "audio/echo-cancel/echo_canceller.h"
#include "audio/audio_frame_resizer.h"
#include "audio_processor.h"
// typedef speex C structs
extern "C" {
struct SpeexEchoState_;
typedef struct SpeexEchoState_ SpeexEchoState;
struct SpeexPreprocessState_;
typedef struct SpeexPreprocessState_ SpeexPreprocessState;
}
#include <memory>
namespace jami {
class SpeexEchoCanceller final : public EchoCanceller
class SpeexAudioProcessor final : public AudioProcessor
{
public:
SpeexEchoCanceller(AudioFormat format, unsigned frameSize);
~SpeexEchoCanceller() = default;
SpeexAudioProcessor(AudioFormat format, unsigned frameSize);
~SpeexAudioProcessor() = default;
// Inherited via AudioProcessor
void putRecorded(std::shared_ptr<AudioFrame>&& buf) override;
void putPlayback(const std::shared_ptr<AudioFrame>& buf) override;
std::shared_ptr<AudioFrame> getProcessed() override;
void done() override;
void enableEchoCancel(bool enabled) override;
void enableNoiseSuppression(bool enabled) override;
void enableAutomaticGainControl(bool enabled) override;
private:
struct SpeexEchoStateImpl;
std::unique_ptr<SpeexEchoStateImpl> pimpl_;
using SpeexEchoStatePtr = std::unique_ptr<SpeexEchoState, void (*)(SpeexEchoState*)>;
using SpeexPreprocessStatePtr
= std::unique_ptr<SpeexPreprocessState, void (*)(SpeexPreprocessState*)>;
// multichannel, one for the entire audio processor
SpeexEchoStatePtr echoState;
// one for each channel
std::vector<SpeexPreprocessStatePtr> preprocessorStates;
AudioBuffer iProcBuffer;
// if we should do echo cancellation
bool shouldAEC {false};
};
} // namespace jami
/*
* Copyright (C) 2021-2022 Savoir-faire Linux Inc.
*
* Author: Andreas Traczyk <andreas.traczyk@savoirfairelinux.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
......@@ -18,97 +16,112 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include "webrtc_echo_canceller.h"
#include "webrtc.h"
#include <webrtc/modules/audio_processing/include/audio_processing.h>
namespace jami {
WebRTCEchoCanceller::WebRTCEchoCanceller(AudioFormat format, unsigned frameSize)
: EchoCanceller(format, frameSize)
, pimpl_(std::make_unique<WebRTCAPMImpl>(format, frameSize))
constexpr int webrtcNoError = webrtc::AudioProcessing::kNoError;
WebRTCAudioProcessor::WebRTCAudioProcessor(AudioFormat format, unsigned frameSize)
: AudioProcessor(format, frameSize)
, fRecordBuffer_(format.nb_channels, std::vector<float>(frameSize_, 0))
, fPlaybackBuffer_(format.nb_channels, std::vector<float>(frameSize_, 0))
, iRecordBuffer_(frameSize_, format)
, iPlaybackBuffer_(frameSize_, format)
{}
struct WebRTCEchoCanceller::WebRTCAPMImpl
{
using APMPtr = std::unique_ptr<webrtc::AudioProcessing>;
APMPtr apm;
webrtc::StreamConfig streamConfig;
WebRTCAPMImpl(AudioFormat format, unsigned)
: streamConfig(format.sample_rate, format.nb_channels)
{
webrtc::ProcessingConfig pconfig;
JAMI_DBG("[webrtc-ap] WebRTCAudioProcessor, frame size = %d (=%d ms), channels = %d",
frameSize,
frameDurationMs_,
format.nb_channels);
webrtc::Config config;
config.Set<webrtc::ExtendedFilter>(new webrtc::ExtendedFilter(true));
config.Set<webrtc::DelayAgnostic>(new webrtc::DelayAgnostic(true));
apm.reset(webrtc::AudioProcessing::Create(config));
pconfig = {
webrtc::StreamConfig streamConfig((int) format.sample_rate, (int) format.nb_channels);
webrtc::ProcessingConfig pconfig = {
streamConfig, /* input stream */
streamConfig, /* output stream */
streamConfig, /* reverse input stream */
streamConfig, /* reverse output stream */
};
if (apm->Initialize(pconfig) != webrtc::AudioProcessing::kNoError) {
if (apm->Initialize(pconfig) != webrtcNoError) {
JAMI_ERR("[webrtc-ap] Error initialising audio processing module");
}
// aec
apm->echo_cancellation()->set_suppression_level(
webrtc::EchoCancellation::SuppressionLevel::kModerateSuppression);
apm->echo_cancellation()->enable_drift_compensation(true);
apm->echo_cancellation()->Enable(true);
// hpf
apm->high_pass_filter()->Enable(true);
// voice activity
if (apm->voice_detection()->Enable(true) != webrtcNoError) {
JAMI_ERR("[webrtc-ap] Error enabling voice detection");
}
// TODO: change likelihood?
if (apm->voice_detection()->set_likelihood(webrtc::VoiceDetection::kVeryLowLikelihood)
!= webrtcNoError) {
JAMI_ERR("[webrtc-ap] Error setting voice detection likelihood");
}
// asserted to be 10 in voice_detection_impl.cc
if (apm->voice_detection()->set_frame_size_ms(10) != webrtcNoError) {
JAMI_ERR("[webrtc-ap] Error setting voice detection frame size");
}
// ns
apm->noise_suppression()->set_level(webrtc::NoiseSuppression::kHigh);
apm->noise_suppression()->Enable(true);
JAMI_INFO("[webrtc-ap] Done initializing");
}
// agc
apm->gain_control()->set_analog_level_limits(0, 255);
apm->gain_control()->set_mode(webrtc::GainControl::kAdaptiveAnalog);
apm->gain_control()->Enable(true);
void
WebRTCAudioProcessor::enableNoiseSuppression(bool enabled)
{
    JAMI_DBG("[webrtc-ap] enableNoiseSuppression %d", enabled);
    auto* noiseSuppression = apm->noise_suppression();
    if (noiseSuppression->Enable(enabled) != webrtcNoError) {
        JAMI_ERR("[webrtc-ap] Error enabling noise suppression");
    }
    if (noiseSuppression->set_level(webrtc::NoiseSuppression::kVeryHigh) != webrtcNoError) {
        JAMI_ERR("[webrtc-ap] Error setting noise suppression level");
    }
    // the high pass filter is toggled together with noise suppression
    if (apm->high_pass_filter()->Enable(enabled) != webrtcNoError) {
        JAMI_ERR("[webrtc-ap] Error enabling high pass filter");
    }
}
};
void
WebRTCEchoCanceller::putRecorded(std::shared_ptr<AudioFrame>&& buf)
WebRTCAudioProcessor::enableAutomaticGainControl(bool enabled)
{
EchoCanceller::putRecorded(std::move(buf));
JAMI_DBG("[webrtc-ap] enableAutomaticGainControl %d", enabled);
if (apm->gain_control()->Enable(enabled) != webrtcNoError) {
JAMI_ERR("[webrtc-ap] Error enabling automatic gain control");
}
if (apm->gain_control()->set_analog_level_limits(0, 255) != webrtcNoError) {
JAMI_ERR("[webrtc-ap] Error setting automatic gain control analog level limits");
}
if (apm->gain_control()->set_mode(webrtc::GainControl::kAdaptiveAnalog) != webrtcNoError) {
JAMI_ERR("[webrtc-ap] Error setting automatic gain control mode");
}
}
void
WebRTCEchoCanceller::putPlayback(const std::shared_ptr<AudioFrame>& buf)
WebRTCAudioProcessor::enableEchoCancel(bool enabled)
{
EchoCanceller::putPlayback(buf);
JAMI_DBG("[webrtc-ap] enableEchoCancel %d", enabled);
if (apm->echo_cancellation()->Enable(enabled) != webrtcNoError) {
JAMI_ERR("[webrtc-ap] Error enabling echo cancellation");
}
if (apm->echo_cancellation()->set_suppression_level(
webrtc::EchoCancellation::SuppressionLevel::kHighSuppression)
!= webrtcNoError) {
JAMI_ERR("[webrtc-ap] Error setting echo cancellation level");
}
if (apm->echo_cancellation()->enable_drift_compensation(true) != webrtcNoError) {
JAMI_ERR("[webrtc-ap] Error enabling echo cancellation drift compensation");
}
}
std::shared_ptr<AudioFrame>
WebRTCEchoCanceller::getProcessed()
WebRTCAudioProcessor::getProcessed()
{
while (recordQueue_.samples() > recordQueue_.frameSize() * 10) {
JAMI_DBG("record overflow %d / %d", recordQueue_.samples(), frameSize_);
recordQueue_.dequeue();
}
while (playbackQueue_.samples() > playbackQueue_.frameSize() * 10) {
JAMI_DBG("playback overflow %d / %d", playbackQueue_.samples(), frameSize_);
playbackQueue_.dequeue();
}
if (recordQueue_.samples() < recordQueue_.frameSize()
|| playbackQueue_.samples() < playbackQueue_.frameSize()) {
// If there are not enough samples in either queue, we can't
// process anything.
// JAMI_DBG("underrun p:%d / r:%d", playbackQueue_.samples(), recordQueue_.samples());
if (tidyQueues()) {
return {};
}
......@@ -116,55 +129,73 @@ WebRTCEchoCanceller::getProcessed()
auto playback = playbackQueue_.dequeue();
auto record = recordQueue_.dequeue();
if (!playback || !record)
if (!playback || !record) {
return {};
}
auto processed = std::make_shared<AudioFrame>(format_, frameSize_);
webrtc::StreamConfig& sc = pimpl_->streamConfig;
// webrtc::StreamConfig& sc = streamConfig;
webrtc::StreamConfig sc((int) format_.sample_rate, (int) format_.nb_channels);
// analyze deinterleaved float playback data
iPlaybackBuffer_.deinterleave((const AudioSample*) playback->pointer()->data[0],
frameSize_,
format_.nb_channels);
std::vector<float*> playData {format_.nb_channels};
for (unsigned c = 0; c < format_.nb_channels; ++c) {
playData[c] = fPlaybackBuffer_[c].data();
iPlaybackBuffer_.channelToFloat(playData[c], c);
for (unsigned channel = 0; channel < format_.nb_channels; ++channel) {
// point playData channel to appropriate data location
playData[channel] = fPlaybackBuffer_[channel].data();
// write playback to playData channel
iPlaybackBuffer_.channelToFloat(playData[channel], (int) channel);
}
if (pimpl_->apm->ProcessReverseStream(playData.data(), sc, sc, playData.data())
!= webrtc::AudioProcessing::kNoError)
// process reverse in place
if (apm->ProcessReverseStream(playData.data(), sc, sc, playData.data()) != webrtcNoError) {
JAMI_ERR("[webrtc-ap] ProcessReverseStream failed");
}
// process deinterleaved float recorded data
iRecordBuffer_.deinterleave((const AudioSample*) record->pointer()->data[0],
frameSize_,
format_.nb_channels);
std::vector<float*> recData {format_.nb_channels};
for (unsigned c = 0; c < format_.nb_channels; ++c) {
recData[c] = fRecordBuffer_[c].data();
iRecordBuffer_.channelToFloat(recData[c], c);
for (unsigned int channel = 0; channel < format_.nb_channels; ++channel) {
// point recData channel to appropriate data location
recData[channel] = fRecordBuffer_[channel].data();
// write data to recData channel
iRecordBuffer_.channelToFloat(recData[channel], (int) channel);
}
// TODO: implement this correctly (it MUST be called prior to ProcessStream)
// TODO: maybe implement this to see if it's better than automatic drift compensation
// (it MUST be called prior to ProcessStream)
// delay = (t_render - t_analyze) + (t_process - t_capture)
pimpl_->apm->set_stream_delay_ms(0);
pimpl_->apm->gain_control()->set_stream_analog_level(analogLevel_);
pimpl_->apm->echo_cancellation()->set_stream_drift_samples(driftSamples);
if (pimpl_->apm->ProcessStream(recData.data(), sc, sc, recData.data())
!= webrtc::AudioProcessing::kNoError)
if (apm->set_stream_delay_ms(0) != webrtcNoError) {
JAMI_ERR("[webrtc-ap] set_stream_delay_ms failed");
}
if (apm->gain_control()->set_stream_analog_level(analogLevel_) != webrtcNoError) {
JAMI_ERR("[webrtc-ap] set_stream_analog_level failed");
}
apm->echo_cancellation()->set_stream_drift_samples(driftSamples);
// process in place
if (apm->ProcessStream(recData.data(), sc, sc, recData.data()) != webrtcNoError) {
JAMI_ERR("[webrtc-ap] ProcessStream failed");
analogLevel_ = pimpl_->apm->gain_control()->stream_analog_level();
}
analogLevel_ = apm->gain_control()->stream_analog_level();
// return interleaved s16 data
iRecordBuffer_.convertFloatPlanarToSigned16((uint8_t**) recData.data(),
frameSize_,
format_.nb_channels);
iRecordBuffer_.interleave((AudioSample*) processed->pointer()->data[0]);
processed->has_voice = getStabilizedVoiceActivity(apm->voice_detection()->stream_has_voice());
return processed;
}
void
WebRTCEchoCanceller::done()
{}
} // namespace jami
/*
* Copyright (C) 2021-2022 Savoir-faire Linux Inc.
*
* Author: Andreas Traczyk <andreas.traczyk@savoirfairelinux.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
......@@ -20,28 +18,29 @@
#pragma once
#include "audio/echo-cancel/echo_canceller.h"
#include "audio/audio_frame_resizer.h"
#include "audio_processor.h"
#include <memory>
namespace webrtc {
class AudioProcessing;
}
namespace jami {
class WebRTCEchoCanceller final : public EchoCanceller
class WebRTCAudioProcessor final : public AudioProcessor
{
public:
WebRTCEchoCanceller(AudioFormat format, unsigned frameSize);
~WebRTCEchoCanceller() = default;
WebRTCAudioProcessor(AudioFormat format, unsigned frameSize);
~WebRTCAudioProcessor() = default;
// Inherited via EchoCanceller
void putRecorded(std::shared_ptr<AudioFrame>&& buf) override;
void putPlayback(const std::shared_ptr<AudioFrame>& buf) override;
// Inherited via AudioProcessor
std::shared_ptr<AudioFrame> getProcessed() override;
void done() override;
void enableEchoCancel(bool enabled) override;
void enableNoiseSuppression(bool enabled) override;
void enableAutomaticGainControl(bool enabled) override;
private:
struct WebRTCAPMImpl;
std::unique_ptr<WebRTCAPMImpl> pimpl_;
std::unique_ptr<webrtc::AudioProcessing> apm;
using fChannelBuffer = std::vector<std::vector<float>>;
fChannelBuffer fRecordBuffer_;
......
......@@ -109,6 +109,9 @@ AudioFrameResizer::enqueue(std::shared_ptr<AudioFrame>&& frame)
return; // return if frame was just passed through
}
// voice activity
hasVoice_ = frame->has_voice;
// queue reallocates itself if need be
if ((ret = av_audio_fifo_write(queue_, reinterpret_cast<void**>(f->data), f->nb_samples)) < 0) {
JAMI_ERR() << "Audio resizer error: " << libav_utils::getError(ret);
......@@ -139,6 +142,7 @@ AudioFrameResizer::dequeue()
return {};
}
frame->pointer()->pts = nextOutputPts_;
frame->has_voice = hasVoice_;
nextOutputPts_ += frameSize_;
return frame;
}
......
......@@ -104,6 +104,7 @@ private:
*/
AVAudioFifo* queue_;
int64_t nextOutputPts_ {0};
bool hasVoice_ {false};
};
} // namespace jami
......@@ -122,18 +122,20 @@ AudioInput::readFromDevice()
std::this_thread::sleep_until(wakeUp_);
wakeUp_ += MS_PER_PACKET;
auto& mainBuffer = Manager::instance().getRingBufferPool();
auto samples = mainBuffer.getData(id_);
if (not samples)
auto& bufferPool = Manager::instance().getRingBufferPool();
auto audioFrame = bufferPool.getData(id_);
if (not audioFrame)
return;
if (muteState_)
libav_utils::fillWithSilence(samples->pointer());
if (muteState_) {
libav_utils::fillWithSilence(audioFrame->pointer());
audioFrame->has_voice = false; // force no voice activity when muted
}
std::lock_guard<std::mutex> lk(fmtMutex_);
if (mainBuffer.getInternalAudioFormat() != format_)
samples = resampler_->resample(std::move(samples), format_);
resizer_->enqueue(std::move(samples));
if (bufferPool.getInternalAudioFormat() != format_)
audioFrame = resampler_->resample(std::move(audioFrame), format_);
resizer_->enqueue(std::move(audioFrame));
}
void
......
......@@ -67,7 +67,9 @@ public:
void setSeekTime(int64_t time);
void setSuccessfulSetupCb(const std::function<void(MediaType, bool)>& cb)
{ onSuccessfulSetup_ = cb; }
{
onSuccessfulSetup_ = cb;
}
private:
void readFromDevice();
......
......@@ -51,13 +51,7 @@ struct AudioFormat
unsigned nb_channels;
AVSampleFormat sampleFormat;
constexpr AudioFormat(unsigned sr, unsigned c)
: sample_rate(sr)
, nb_channels(c)
, sampleFormat(AV_SAMPLE_FMT_S16)
{}
constexpr AudioFormat(unsigned sr, unsigned c, AVSampleFormat f)
constexpr AudioFormat(unsigned sr, unsigned c, AVSampleFormat f = AV_SAMPLE_FMT_S16)
: sample_rate(sr)
, nb_channels(c)
, sampleFormat(f)
......
......@@ -28,11 +28,13 @@
#include "tonecontrol.h"
#include "client/ring_signal.h"
// aec
// TODO: decide which library to use/how to decide (compile time? runtime?)
#if HAVE_WEBRTC_AP
#include "echo-cancel/webrtc_echo_canceller.h"
#include "audio-processing/webrtc.h"
#elif HAVE_SPEEXDSP
#include "audio-processing/speex.h"
#else
#include "echo-cancel/null_echo_canceller.h"
#include "audio-processing/null_audio_processor.h"
#endif
#include <ctime>
......@@ -102,55 +104,89 @@ void
AudioLayer::playbackChanged(bool started)
{
playbackStarted_ = started;
checkAEC();
}
// Called when audio capture starts or stops. The software audio processor
// only exists while recording is active: it is created on start and torn
// down on stop.
void
AudioLayer::recordChanged(bool started)
{
    // serialize access to audioProcessor with the playback/record paths
    std::lock_guard<std::mutex> lock(audioProcessorMutex);
    if (started) {
        // create audio processor
        createAudioProcessor();
    } else {
        // destroy audio processor
        destroyAudioProcessor();
    }
    recordStarted_ = started;
    // NOTE(review): checkAEC() looks like a leftover of the pre-AudioProcessor
    // AEC lifecycle that create/destroyAudioProcessor() replaces — confirm it
    // is still needed.
    checkAEC();
}
// Record whether the platform provides its own (native) acoustic echo
// cancellation. When native AEC is present, the software processor's own
// echo cancellation is disabled so the signal is not processed twice.
void
AudioLayer::setHasNativeAEC(bool hasEAC)
{
    std::lock_guard<std::mutex> lock(audioProcessorMutex);
    hasNativeAEC_ = hasEAC;
    // NOTE(review): checkAEC() appears to be a stale remnant of the old AEC
    // start/stop path — confirm against createAudioProcessor/destroyAudioProcessor.
    checkAEC();
    // if we have a current audio processor, tell it to enable/disable its own AEC
    if (audioProcessor) {
        audioProcessor->enableEchoCancel(!hasEAC);
    }
}
// must acquire lock beforehand
void
AudioLayer::checkAEC()
AudioLayer::createAudioProcessor()
{
std::lock_guard<std::mutex> lk(ecMutex_);
bool shouldSoftAEC = not hasNativeAEC_ and playbackStarted_ and recordStarted_;
if (not echoCanceller_ and shouldSoftAEC) {
auto nb_channels = std::min(audioFormat_.nb_channels, audioInputFormat_.nb_channels);
auto sample_rate = std::min(audioFormat_.sample_rate, audioInputFormat_.sample_rate);
// TODO: explain/rework this math??
if (sample_rate % 16000u != 0)
sample_rate = 16000u * ((sample_rate / 16000u) + 1u);
sample_rate = std::clamp(sample_rate, 16000u, 96000u);
AudioFormat format {sample_rate, nb_channels};
AudioFormat formatForProcessor {sample_rate, nb_channels};
#if HAVE_SPEEXDSP && !HAVE_WEBRTC_AP
// we are using speex
// TODO: maybe force this to be equivalent to 20ms? as expected by speex
auto frame_size = sample_rate / 50u;
#else
// we are using either webrtc-ap or null
auto frame_size = sample_rate / 100u;
#endif
JAMI_WARN("Input {%d Hz, %d channels}",
audioInputFormat_.sample_rate,
audioInputFormat_.nb_channels);
JAMI_WARN("Output {%d Hz, %d channels}", audioFormat_.sample_rate, audioFormat_.nb_channels);
JAMI_WARN("Starting AEC {%d Hz, %d channels, %d samples/frame}",
JAMI_WARN("Starting audio processor with: {%d Hz, %d channels, %d samples/frame}",
sample_rate,
nb_channels,
frame_size);
#if HAVE_WEBRTC_AP
echoCanceller_.reset(new WebRTCEchoCanceller(format, frame_size));
JAMI_INFO("[audiolayer] using webrtc audio processor");
audioProcessor.reset(new WebRTCAudioProcessor(formatForProcessor, frame_size));
#elif HAVE_SPEEXDSP
JAMI_INFO("[audiolayer] using speex audio processor");
audioProcessor.reset(new SpeexAudioProcessor(formatForProcessor, frame_size));
#else
echoCanceller_.reset(new NullEchoCanceller(format, frame_size));
JAMI_INFO("[audiolayer] using null audio processor");
audioProcessor.reset(new NullAudioProcessor(formatForProcessor, frame_size));
#endif
} else if (echoCanceller_ and not shouldSoftAEC and not playbackStarted_
and not recordStarted_) {
JAMI_WARN("Stopping AEC");
echoCanceller_.reset();
audioProcessor->enableNoiseSuppression(true);
// TODO: enable AGC?
audioProcessor->enableAutomaticGainControl(false);
// can also be updated after creation via setHasNativeAEC
audioProcessor->enableEchoCancel(!hasNativeAEC_);
}
// Tear down the software audio processor.
// Precondition: the caller already holds audioProcessorMutex.
void
AudioLayer::destroyAudioProcessor()
{
    // releasing the owning smart pointer destroys the processor instance
    audioProcessor = nullptr;
}
void
......@@ -228,19 +264,19 @@ AudioLayer::getToPlay(AudioFormat format, size_t writableSamples)
} else if (auto buf = bufferPool.getData(RingBufferPool::DEFAULT_ID)) {
resampled = resampler_->resample(std::move(buf), format);
} else {
if (echoCanceller_) {
std::lock_guard<std::mutex> lock(audioProcessorMutex);
if (audioProcessor) {
auto silence = std::make_shared<AudioFrame>(format, writableSamples);
libav_utils::fillWithSilence(silence->pointer());
std::lock_guard<std::mutex> lk(ecMutex_);
echoCanceller_->putPlayback(silence);
audioProcessor->putPlayback(silence);
}
break;
}
if (resampled) {
if (echoCanceller_) {
std::lock_guard<std::mutex> lk(ecMutex_);
echoCanceller_->putPlayback(resampled);
std::lock_guard<std::mutex> lock(audioProcessorMutex);
if (audioProcessor) {
audioProcessor->putPlayback(resampled);
}
playbackQueue_->enqueue(std::move(resampled));
} else
......@@ -253,12 +289,13 @@ AudioLayer::getToPlay(AudioFormat format, size_t writableSamples)
void
AudioLayer::putRecorded(std::shared_ptr<AudioFrame>&& frame)
{
if (echoCanceller_) {
std::lock_guard<std::mutex> lk(ecMutex_);
echoCanceller_->putRecorded(std::move(frame));
while (auto rec = echoCanceller_->getProcessed()) {
std::lock_guard<std::mutex> lock(audioProcessorMutex);
if (audioProcessor && playbackStarted_ && recordStarted_) {
audioProcessor->putRecorded(std::move(frame));
while (auto rec = audioProcessor->getProcessed()) {
mainRingBuffer_->put(std::move(rec));
}
} else {
mainRingBuffer_->put(std::move(frame));
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment