Skip to content
Snippets Groups Projects
Commit 6ff875cb authored by Tobias Hildebrandt's avatar Tobias Hildebrandt Committed by Adrien Béraud
Browse files

audio: add voice activity detection

* rename EchoCanceller to AudioProcessor
* enable echo cancellation and voice activity detection
* add minimum active duration and trigger time to voice activation

Gitlab: #741
Change-Id: I98662462c17539fca1d042482e97fdb3eff86130
parent 7e74098c
No related branches found
No related tags found
No related merge requests found
Showing
with 841 additions and 101 deletions
......@@ -22,7 +22,7 @@ source_group("Source Files\\jamidht\\eth\\libdevcore" FILES ${Source_Files__jami
source_group("Source Files\\jamidht\\eth\\libdevcrypto" FILES ${Source_Files__jamidht__eth__libdevcrypto})
source_group("Source Files\\media" FILES ${Source_Files__media})
source_group("Source Files\\media\\audio" FILES ${Source_Files__media__audio})
source_group("Source Files\\media\\audio\\echo-cancel" FILES ${Source_Files__media__audio__echo_cancel})
source_group("Source Files\\media\\audio\\audio-processing" FILES ${Source_Files__media__audio__audio_processing})
source_group("Source Files\\media\\audio\\sound" FILES ${Source_Files__media__audio__sound})
source_group("Source Files\\media\\video" FILES ${Source_Files__media__video})
source_group("Source Files\\plugin" FILES ${Source_Files__plugin})
......@@ -55,7 +55,7 @@ list (APPEND ALL_FILES
${Source_Files__media}
${Source_Files__media__audio}
${Source_Files__media__audio__sound}
${Source_Files__media__audio__echo_cancel}
${Source_Files__media__audio__audio_processing}
${Source_Files__media__video}
${Source_Files__security}
${Source_Files__sip}
......
......@@ -29,7 +29,7 @@ deplibavformat = dependency('libavformat', version: '>= 56.40.101')
deplibswscale = dependency('libswscale', version: '>= 3.1.101')
deplibswresample = dependency('libswresample', version: '>= 1.2.101')
deplibavutil = dependency('libavutil', version: '>= 55.75.100')
depspeexdsp = dependency('speexdsp')
depfmt = dependency('fmt', version: '>= 5.3')
depyamlcpp = dependency('yaml-cpp', version: '>= 0.5.1', required: false)
......@@ -113,6 +113,9 @@ conf.set10('HAVE_RINGNS', depopenssl.found())
depwebrtcap = dependency('webrtc-audio-processing', required: get_option('aec'))
conf.set10('HAVE_WEBRTC_AP', depwebrtcap.found())
depspeexdsp = dependency('speexdsp')
conf.set10('HAVE_SPEEXDSP', depspeexdsp.found())
if get_option('video')
conf.set('ENABLE_VIDEO', true)
if host_machine.system() == 'linux' and meson.get_compiler('cpp').get_define('__ANDROID__') != '1'
......
......@@ -105,6 +105,7 @@ public:
float calcRMS() const;
jami::AudioFormat getFormat() const;
size_t getFrameSize() const;
bool has_voice {false};
private:
void setFormat(const jami::AudioFormat& format);
......
......@@ -50,5 +50,5 @@ endif()
add_subdirectory(sound)
set (Source_Files__media__audio__sound ${Source_Files__media__audio__sound} PARENT_SCOPE)
add_subdirectory(echo-cancel)
set (Source_Files__media__audio__echo_cancel ${Source_Files__media__audio__echo_cancel} PARENT_SCOPE)
\ No newline at end of file
add_subdirectory(audio-processing)
set (Source_Files__media__audio__audio_processing ${Source_Files__media__audio__audio_processing} PARENT_SCOPE)
......@@ -53,7 +53,7 @@ noinst_HEADERS += $(RING_SPEEXDSP_HEAD) \
include ./media/audio/sound/Makefile.am
include ./media/audio/echo-cancel/Makefile.am
include ./media/audio/audio-processing/Makefile.am
if BUILD_OPENSL
include ./media/audio/opensl/Makefile.am
......
################################################################################
# Source groups - audio-processing
################################################################################
list (APPEND Source_Files__media__audio__audio_processing
"${CMAKE_CURRENT_SOURCE_DIR}/audio_processor.h"
"${CMAKE_CURRENT_SOURCE_DIR}/null_audio_processor.h"
"${CMAKE_CURRENT_SOURCE_DIR}/null_audio_processor.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/speex.h"
"${CMAKE_CURRENT_SOURCE_DIR}/speex.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/webrtc.h"
"${CMAKE_CURRENT_SOURCE_DIR}/webrtc.cpp"
)
set (Source_Files__media__audio__audio_processing ${Source_Files__media__audio__audio_processing} PARENT_SCOPE)
noinst_LTLIBRARIES += libaudioprocessing.la
EC_SRC = ./media/audio/audio-processing/null_audio_processor.cpp
EC_HDR = ./media/audio/audio-processing/null_audio_processor.h
if BUILD_SPEEXDSP
EC_SRC += ./media/audio/audio-processing/speex.cpp
EC_HDR += ./media/audio/audio-processing/speex.h
endif
if HAVE_WEBRTC_AP
EC_SRC += ./media/audio/audio-processing/webrtc.cpp
EC_HDR += ./media/audio/audio-processing/webrtc.h
libaudioprocessing_la_CXXFLAGS = @WEBRTC_CFLAGS@ $(AM_CXXFLAGS)
endif
libaudioprocessing_la_SOURCES = \
$(EC_SRC)
noinst_HEADERS += \
./media/audio/audio-processing/audio_processor.h \
$(EC_HDR)
libaudio_la_LIBADD += libaudioprocessing.la
/*
* Copyright (C) 2021-2022 Savoir-faire Linux Inc.
*
* Author: Andreas Traczyk <andreas.traczyk@savoirfairelinux.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
......@@ -31,20 +29,21 @@
namespace jami {
class EchoCanceller
class AudioProcessor
{
private:
NON_COPYABLE(EchoCanceller);
NON_COPYABLE(AudioProcessor);
public:
EchoCanceller(AudioFormat format, unsigned frameSize)
: playbackQueue_(format, frameSize)
, recordQueue_(format, frameSize)
AudioProcessor(AudioFormat format, unsigned frameSize)
: playbackQueue_(format, (int) frameSize)
, recordQueue_(format, (int) frameSize)
, resampler_(new Resampler)
, format_(format)
, frameSize_(frameSize)
, frameDurationMs_((unsigned int) (frameSize_ * (1.0 / format_.sample_rate) * 1000))
{}
virtual ~EchoCanceller() = default;
virtual ~AudioProcessor() = default;
virtual void putRecorded(std::shared_ptr<AudioFrame>&& buf)
{
......@@ -61,8 +60,27 @@ public:
auto copy = buf;
enqueue(playbackQueue_, std::move(copy));
};
/**
* @brief Process and return a single AudioFrame
*/
virtual std::shared_ptr<AudioFrame> getProcessed() = 0;
virtual void done() = 0;
/**
* @brief Set the status of echo cancellation
*/
virtual void enableEchoCancel(bool enabled) = 0;
/**
* @brief Set the status of noise suppression
* includes de-reverb, de-noise, high pass filter, etc
*/
virtual void enableNoiseSuppression(bool enabled) = 0;
/**
* @brief Set the status of automatic gain control
*/
virtual void enableAutomaticGainControl(bool enabled) = 0;
protected:
AudioFrameResizer playbackQueue_;
......@@ -71,7 +89,82 @@ protected:
std::atomic_bool playbackStarted_;
std::atomic_bool recordStarted_;
AudioFormat format_;
unsigned frameSize_;
unsigned int frameSize_;
unsigned int frameDurationMs_;
// artificially extend voice activity by this long
unsigned int forceMinimumVoiceActivityMs {1000};
// current number of frames to force the voice activity to be true
unsigned int forceVoiceActiveFramesLeft {0};
// voice activity must be active for this long _before_ it is considered legitimate
unsigned int minimumConsequtiveDurationMs {200};
// current number of frames that the voice activity has been true
unsigned int consecutiveActiveFrames {0};
/**
* @brief Helper method for audio processors, should be called at start of getProcessed()
* Pops frames from audio queues if there's overflow
* @returns True if there is underflow, false otherwise. An AudioProcessor should
* return a blank AudioFrame if there is underflow.
*/
bool tidyQueues()
{
    // Cap each queue at ten frames of backlog, discarding the oldest data
    // so latency cannot grow without bound.
    const auto recordCap = recordQueue_.frameSize() * 10;
    while (recordQueue_.samples() > recordCap) {
        JAMI_DBG("record overflow %d / %d", recordQueue_.samples(), frameSize_);
        recordQueue_.dequeue();
    }
    const auto playbackCap = playbackQueue_.frameSize() * 10;
    while (playbackQueue_.samples() > playbackCap) {
        JAMI_DBG("playback overflow %d / %d", playbackQueue_.samples(), frameSize_);
        playbackQueue_.dequeue();
    }
    // Underflow: unless both queues hold at least one full frame,
    // there is nothing to process this round.
    return recordQueue_.samples() < recordQueue_.frameSize()
           || playbackQueue_.samples() < playbackQueue_.frameSize();
}
/**
* @brief Stabilizes voice activity
* @param voiceStatus the voice status that was detected by the audio processor
* for the current frame
* @returns The voice activity status that should be set on the current frame
*/
bool getStabilizedVoiceActivity(bool voiceStatus)
{
bool newVoice = false;
if (voiceStatus) {
// we detected activity
consecutiveActiveFrames += 1;
// make sure that we have been active for necessary time
if (consecutiveActiveFrames > minimumConsequtiveDurationMs / frameDurationMs_) {
newVoice = true;
// set number of frames that will be forced positive
forceVoiceActiveFramesLeft = (int) forceMinimumVoiceActivityMs / frameDurationMs_;
}
} else if (forceVoiceActiveFramesLeft > 0) {
// if we didn't detect voice, but we haven't elapsed the minimum duration,
// force voice to be true
newVoice = true;
forceVoiceActiveFramesLeft -= 1;
consecutiveActiveFrames += 1;
} else {
// else no voice and no need to force
newVoice = false;
consecutiveActiveFrames = 0;
}
return newVoice;
}
private:
void enqueue(AudioFrameResizer& frameResizer, std::shared_ptr<AudioFrame>&& buf)
......
/*
* Copyright (C) 2021-2022 Savoir-faire Linux Inc.
*
* Author: Andreas Traczyk <andreas.traczyk@savoirfairelinux.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
......@@ -18,56 +16,30 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include "null_echo_canceller.h"
#include "null_audio_processor.h"
#include <cassert>
namespace jami {
NullEchoCanceller::NullEchoCanceller(AudioFormat format, unsigned frameSize)
: EchoCanceller(format, frameSize)
{}
void
NullEchoCanceller::putRecorded(std::shared_ptr<AudioFrame>&& buf)
{
EchoCanceller::putRecorded(std::move(buf));
};
void
NullEchoCanceller::putPlayback(const std::shared_ptr<AudioFrame>& buf)
NullAudioProcessor::NullAudioProcessor(AudioFormat format, unsigned frameSize)
: AudioProcessor(format, frameSize)
{
EchoCanceller::putPlayback(buf);
};
JAMI_DBG("[null_audio] NullAudioProcessor, frame size = %d (=%d ms), channels = %d",
frameSize,
frameDurationMs_,
format.nb_channels);
}
std::shared_ptr<AudioFrame>
NullEchoCanceller::getProcessed()
NullAudioProcessor::getProcessed()
{
while (recordQueue_.samples() > recordQueue_.frameSize() * 10) {
JAMI_DBG("record overflow %d / %d", recordQueue_.samples(), frameSize_);
recordQueue_.dequeue();
}
while (playbackQueue_.samples() > playbackQueue_.frameSize() * 10) {
JAMI_DBG("playback overflow %d / %d", playbackQueue_.samples(), frameSize_);
playbackQueue_.dequeue();
}
if (recordQueue_.samples() < recordQueue_.frameSize()
|| playbackQueue_.samples() < playbackQueue_.frameSize()) {
JAMI_DBG("underflow rec: %d, play: %d fs: %d",
recordQueue_.samples(),
playbackQueue_.samples(),
frameSize_);
if (tidyQueues()) {
return {};
}
JAMI_WARN("Processing %d samples, rec: %d, play: %d ",
frameSize_,
recordQueue_.samples(),
playbackQueue_.samples());
playbackQueue_.dequeue();
return recordQueue_.dequeue();
};
void NullEchoCanceller::done() {};
} // namespace jami
/*
* Copyright (C) 2021-2022 Savoir-faire Linux Inc.
*
* Author: Andreas Traczyk <andreas.traczyk@savoirfairelinux.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
......@@ -20,20 +18,23 @@
#pragma once
#include "echo_canceller.h"
#include "audio_processor.h"
namespace jami {
class NullEchoCanceller final : public EchoCanceller
class NullAudioProcessor final : public AudioProcessor
{
public:
NullEchoCanceller(AudioFormat format, unsigned frameSize);
~NullEchoCanceller() = default;
NullAudioProcessor(AudioFormat format, unsigned frameSize);
~NullAudioProcessor() = default;
void putRecorded(std::shared_ptr<AudioFrame>&& buf) override;
void putPlayback(const std::shared_ptr<AudioFrame>& buf) override;
std::shared_ptr<AudioFrame> getProcessed() override;
void done() override;
void enableEchoCancel(bool) override {};
void enableNoiseSuppression(bool) override {};
void enableAutomaticGainControl(bool) override {};
};
} // namespace jami
/*
* Copyright (C) 2021-2022 Savoir-faire Linux Inc.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include "speex.h"
#include "audio/audiolayer.h"
#include <cstdint>
#include <memory>
#include <speex/speex_config_types.h>
#include <vector>
extern "C" {
#include <speex/speex_echo.h>
#include <speex/speex_preprocess.h>
}
namespace jami {
SpeexAudioProcessor::SpeexAudioProcessor(AudioFormat format, unsigned frameSize)
    : AudioProcessor(format, frameSize)
    , echoState(speex_echo_state_init_mc((int) frameSize,
                                         (int) frameSize * 16,
                                         (int) format.nb_channels,
                                         (int) format.nb_channels),
                &speex_echo_state_destroy)
    , iProcBuffer(frameSize_, format)
{
    JAMI_DBG("[speex-dsp] SpeexAudioProcessor, frame size = %d (=%d ms), channels = %d",
             frameSize,
             frameDurationMs_,
             format.nb_channels);

    // tell the (multichannel) echo canceller the stream's sample rate
    speex_echo_ctl(echoState.get(), SPEEX_ECHO_SET_SAMPLING_RATE, &format_.sample_rate);

    // speex toggles features through a pointer to an int; 1 == enabled
    spx_int32_t enabledFlag = 1;
    // VAD fires when speex's raw probability (in percent) exceeds this value
    spx_int32_t vadStartProbability = 99;
    // lower threshold applied while the previous frame already had voice
    spx_int32_t vadContinueProbability = 90;
    // strongest allowed noise attenuation, in (negative) dB
    spx_int32_t noiseSuppressDb = -50;

    // one preprocessor state per channel; the features themselves are
    // switched on/off later through the enable* methods
    for (unsigned int channelIdx = 0; channelIdx < format.nb_channels; channelIdx++) {
        SpeexPreprocessStatePtr state(speex_preprocess_state_init((int) frameSize,
                                                                  (int) format.sample_rate),
                                      &speex_preprocess_state_destroy);
        // cap how aggressively noise may be suppressed
        speex_preprocess_ctl(state.get(), SPEEX_PREPROCESS_SET_NOISE_SUPPRESS, &noiseSuppressDb);
        // configure voice activity detection thresholds
        speex_preprocess_ctl(state.get(), SPEEX_PREPROCESS_SET_VAD, &enabledFlag);
        speex_preprocess_ctl(state.get(), SPEEX_PREPROCESS_SET_PROB_START, &vadStartProbability);
        speex_preprocess_ctl(state.get(),
                             SPEEX_PREPROCESS_SET_PROB_CONTINUE,
                             &vadContinueProbability);
        preprocessorStates.push_back(std::move(state));
    }

    JAMI_INFO("[speex-dsp] Done initializing");
}
void
SpeexAudioProcessor::enableEchoCancel(bool enabled)
{
JAMI_DBG("[speex-dsp] enableEchoCancel %d", enabled);
// need to set member variable so we know to do it in getProcessed
shouldAEC = enabled;
if (enabled) {
// reset the echo canceller
speex_echo_state_reset(echoState.get());
for (auto& channelPreprocessorState : preprocessorStates) {
// attach our already-created echo canceller
speex_preprocess_ctl(channelPreprocessorState.get(),
SPEEX_PREPROCESS_SET_ECHO_STATE,
echoState.get());
}
} else {
for (auto& channelPreprocessorState : preprocessorStates) {
// detach echo canceller (set it to NULL)
// don't destroy it though, we will reset it when necessary
speex_preprocess_ctl(channelPreprocessorState.get(),
SPEEX_PREPROCESS_SET_ECHO_STATE,
NULL);
}
}
}
void
SpeexAudioProcessor::enableNoiseSuppression(bool enabled)
{
JAMI_DBG("[speex-dsp] enableNoiseSuppression %d", enabled);
spx_int32_t speexSetValue = (spx_int32_t) enabled;
// for each preprocessor
for (auto& channelPreprocessorState : preprocessorStates) {
// set denoise status
speex_preprocess_ctl(channelPreprocessorState.get(),
SPEEX_PREPROCESS_SET_DENOISE,
&speexSetValue);
// set de-reverb status
speex_preprocess_ctl(channelPreprocessorState.get(),
SPEEX_PREPROCESS_SET_DEREVERB,
&speexSetValue);
}
}
void
SpeexAudioProcessor::enableAutomaticGainControl(bool enabled)
{
JAMI_DBG("[speex-dsp] enableAutomaticGainControl %d", enabled);
spx_int32_t speexSetValue = (spx_int32_t) enabled;
// for each preprocessor
for (auto& channelPreprocessorState : preprocessorStates) {
// set AGC status
speex_preprocess_ctl(channelPreprocessorState.get(),
SPEEX_PREPROCESS_SET_AGC,
&speexSetValue);
}
}
/**
 * @brief Dequeues one playback and one record frame, optionally echo-cancels,
 * runs the speex preprocessor on each channel, and returns the processed
 * frame with its stabilized voice-activity flag set.
 * Returns an empty pointer on queue underflow or if a dequeue fails.
 * NOTE: the deinterleave -> preprocess -> interleave sequence below is
 * order-critical and operates on shared buffers; do not reorder.
 */
std::shared_ptr<AudioFrame>
SpeexAudioProcessor::getProcessed()
{
    // drop overflow and bail out on underflow (see AudioProcessor::tidyQueues)
    if (tidyQueues()) {
        return {};
    }
    auto playback = playbackQueue_.dequeue();
    auto record = recordQueue_.dequeue();
    if (!playback || !record) {
        return {};
    }
    auto processed = std::make_shared<AudioFrame>(record->getFormat(), record->getFrameSize());
    if (shouldAEC) {
        // we want to echo cancel
        // multichannel, output into processed
        speex_echo_cancellation(echoState.get(),
                                (int16_t*) record->pointer()->data[0],
                                (int16_t*) playback->pointer()->data[0],
                                (int16_t*) processed->pointer()->data[0]);
    } else {
        // don't want to echo cancel, so just use record frame instead
        // (processed now aliases record; the freshly allocated frame is dropped)
        processed = record;
    }
    // deinterleave processed into channels
    // (copies processed's samples into iProcBuffer's per-channel storage)
    std::vector<int16_t*> procData {format_.nb_channels};
    iProcBuffer.deinterleave((const AudioSample*) processed->pointer()->data[0],
                             frameSize_,
                             format_.nb_channels);
    // point procData to correct channels
    for (unsigned int channel = 0; channel < format_.nb_channels; channel++) {
        procData[channel] = iProcBuffer.getChannel(channel)->data();
    }
    // overall voice activity
    bool overallVad = false;
    // current channel voice activity
    int channelVad;
    // run preprocess on each channel
    // (preprocessorStates holds one state per channel, same order as procData)
    int channel = 0;
    for (auto& channelPreprocessorState : preprocessorStates) {
        // preprocesses in place, returns voice activity boolean
        channelVad = speex_preprocess_run(channelPreprocessorState.get(), procData[channel]);
        // boolean OR: voice on any channel counts as overall voice
        overallVad |= channelVad;
        channel += 1;
    }
    // reinterleave into processed
    // (writes the preprocessed samples back over processed's buffer)
    iProcBuffer.interleave((AudioSample*) processed->pointer()->data[0]);
    // add stabilized voice activity to the AudioFrame
    processed->has_voice = getStabilizedVoiceActivity(overallVad);
    return processed;
}
} // namespace jami
......@@ -20,32 +20,44 @@
#pragma once
#include "audio/echo-cancel/echo_canceller.h"
#include "audio/audio_frame_resizer.h"
#include "audio_processor.h"
// typedef speex C structs
extern "C" {
struct SpeexEchoState_;
typedef struct SpeexEchoState_ SpeexEchoState;
struct SpeexPreprocessState_;
typedef struct SpeexPreprocessState_ SpeexPreprocessState;
}
#include <memory>
namespace jami {
class SpeexEchoCanceller final : public EchoCanceller
class SpeexAudioProcessor final : public AudioProcessor
{
public:
SpeexEchoCanceller(AudioFormat format, unsigned frameSize);
~SpeexEchoCanceller() = default;
SpeexAudioProcessor(AudioFormat format, unsigned frameSize);
~SpeexAudioProcessor() = default;
// Inherited via AudioProcessor
void putRecorded(std::shared_ptr<AudioFrame>&& buf) override;
void putPlayback(const std::shared_ptr<AudioFrame>& buf) override;
std::shared_ptr<AudioFrame> getProcessed() override;
void done() override;
void enableEchoCancel(bool enabled) override;
void enableNoiseSuppression(bool enabled) override;
void enableAutomaticGainControl(bool enabled) override;
private:
struct SpeexEchoStateImpl;
std::unique_ptr<SpeexEchoStateImpl> pimpl_;
using SpeexEchoStatePtr = std::unique_ptr<SpeexEchoState, void (*)(SpeexEchoState*)>;
using SpeexPreprocessStatePtr
= std::unique_ptr<SpeexPreprocessState, void (*)(SpeexPreprocessState*)>;
// multichannel, one for the entire audio processor
SpeexEchoStatePtr echoState;
// one for each channel
std::vector<SpeexPreprocessStatePtr> preprocessorStates;
AudioBuffer iProcBuffer;
// if we should do echo cancellation
bool shouldAEC {false};
};
} // namespace jami
/*
* Copyright (C) 2021-2022 Savoir-faire Linux Inc.
*
* Author: Andreas Traczyk <andreas.traczyk@savoirfairelinux.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
......@@ -18,97 +16,112 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include "webrtc_echo_canceller.h"
#include "webrtc.h"
#include <webrtc/modules/audio_processing/include/audio_processing.h>
namespace jami {
WebRTCEchoCanceller::WebRTCEchoCanceller(AudioFormat format, unsigned frameSize)
: EchoCanceller(format, frameSize)
, pimpl_(std::make_unique<WebRTCAPMImpl>(format, frameSize))
constexpr int webrtcNoError = webrtc::AudioProcessing::kNoError;
WebRTCAudioProcessor::WebRTCAudioProcessor(AudioFormat format, unsigned frameSize)
: AudioProcessor(format, frameSize)
, fRecordBuffer_(format.nb_channels, std::vector<float>(frameSize_, 0))
, fPlaybackBuffer_(format.nb_channels, std::vector<float>(frameSize_, 0))
, iRecordBuffer_(frameSize_, format)
, iPlaybackBuffer_(frameSize_, format)
{}
struct WebRTCEchoCanceller::WebRTCAPMImpl
{
using APMPtr = std::unique_ptr<webrtc::AudioProcessing>;
APMPtr apm;
webrtc::StreamConfig streamConfig;
WebRTCAPMImpl(AudioFormat format, unsigned)
: streamConfig(format.sample_rate, format.nb_channels)
{
webrtc::ProcessingConfig pconfig;
JAMI_DBG("[webrtc-ap] WebRTCAudioProcessor, frame size = %d (=%d ms), channels = %d",
frameSize,
frameDurationMs_,
format.nb_channels);
webrtc::Config config;
config.Set<webrtc::ExtendedFilter>(new webrtc::ExtendedFilter(true));
config.Set<webrtc::DelayAgnostic>(new webrtc::DelayAgnostic(true));
apm.reset(webrtc::AudioProcessing::Create(config));
pconfig = {
webrtc::StreamConfig streamConfig((int) format.sample_rate, (int) format.nb_channels);
webrtc::ProcessingConfig pconfig = {
streamConfig, /* input stream */
streamConfig, /* output stream */
streamConfig, /* reverse input stream */
streamConfig, /* reverse output stream */
};
if (apm->Initialize(pconfig) != webrtc::AudioProcessing::kNoError) {
if (apm->Initialize(pconfig) != webrtcNoError) {
JAMI_ERR("[webrtc-ap] Error initialising audio processing module");
}
// aec
apm->echo_cancellation()->set_suppression_level(
webrtc::EchoCancellation::SuppressionLevel::kModerateSuppression);
apm->echo_cancellation()->enable_drift_compensation(true);
apm->echo_cancellation()->Enable(true);
// hpf
apm->high_pass_filter()->Enable(true);
// voice activity
if (apm->voice_detection()->Enable(true) != webrtcNoError) {
JAMI_ERR("[webrtc-ap] Error enabling voice detection");
}
// TODO: change likelihood?
if (apm->voice_detection()->set_likelihood(webrtc::VoiceDetection::kVeryLowLikelihood)
!= webrtcNoError) {
JAMI_ERR("[webrtc-ap] Error setting voice detection likelihood");
}
// asserted to be 10 in voice_detection_impl.cc
if (apm->voice_detection()->set_frame_size_ms(10) != webrtcNoError) {
JAMI_ERR("[webrtc-ap] Error setting voice detection frame size");
}
// ns
apm->noise_suppression()->set_level(webrtc::NoiseSuppression::kHigh);
apm->noise_suppression()->Enable(true);
JAMI_INFO("[webrtc-ap] Done initializing");
}
// agc
apm->gain_control()->set_analog_level_limits(0, 255);
apm->gain_control()->set_mode(webrtc::GainControl::kAdaptiveAnalog);
apm->gain_control()->Enable(true);
void
WebRTCAudioProcessor::enableNoiseSuppression(bool enabled)
{
    JAMI_DBG("[webrtc-ap] enableNoiseSuppression %d", enabled);
    auto* noiseSuppression = apm->noise_suppression();
    if (noiseSuppression->Enable(enabled) != webrtcNoError) {
        JAMI_ERR("[webrtc-ap] Error enabling noise suppression");
    }
    if (noiseSuppression->set_level(webrtc::NoiseSuppression::kVeryHigh) != webrtcNoError) {
        JAMI_ERR("[webrtc-ap] Error setting noise suppression level");
    }
    // the high pass filter is toggled together with noise suppression
    if (apm->high_pass_filter()->Enable(enabled) != webrtcNoError) {
        JAMI_ERR("[webrtc-ap] Error enabling high pass filter");
    }
}
};
void
WebRTCEchoCanceller::putRecorded(std::shared_ptr<AudioFrame>&& buf)
WebRTCAudioProcessor::enableAutomaticGainControl(bool enabled)
{
EchoCanceller::putRecorded(std::move(buf));
JAMI_DBG("[webrtc-ap] enableAutomaticGainControl %d", enabled);
if (apm->gain_control()->Enable(enabled) != webrtcNoError) {
JAMI_ERR("[webrtc-ap] Error enabling automatic gain control");
}
if (apm->gain_control()->set_analog_level_limits(0, 255) != webrtcNoError) {
JAMI_ERR("[webrtc-ap] Error setting automatic gain control analog level limits");
}
if (apm->gain_control()->set_mode(webrtc::GainControl::kAdaptiveAnalog) != webrtcNoError) {
JAMI_ERR("[webrtc-ap] Error setting automatic gain control mode");
}
}
void
WebRTCEchoCanceller::putPlayback(const std::shared_ptr<AudioFrame>& buf)
WebRTCAudioProcessor::enableEchoCancel(bool enabled)
{
EchoCanceller::putPlayback(buf);
JAMI_DBG("[webrtc-ap] enableEchoCancel %d", enabled);
if (apm->echo_cancellation()->Enable(enabled) != webrtcNoError) {
JAMI_ERR("[webrtc-ap] Error enabling echo cancellation");
}
if (apm->echo_cancellation()->set_suppression_level(
webrtc::EchoCancellation::SuppressionLevel::kHighSuppression)
!= webrtcNoError) {
JAMI_ERR("[webrtc-ap] Error setting echo cancellation level");
}
if (apm->echo_cancellation()->enable_drift_compensation(true) != webrtcNoError) {
JAMI_ERR("[webrtc-ap] Error enabling echo cancellation drift compensation");
}
}
std::shared_ptr<AudioFrame>
WebRTCEchoCanceller::getProcessed()
WebRTCAudioProcessor::getProcessed()
{
while (recordQueue_.samples() > recordQueue_.frameSize() * 10) {
JAMI_DBG("record overflow %d / %d", recordQueue_.samples(), frameSize_);
recordQueue_.dequeue();
}
while (playbackQueue_.samples() > playbackQueue_.frameSize() * 10) {
JAMI_DBG("playback overflow %d / %d", playbackQueue_.samples(), frameSize_);
playbackQueue_.dequeue();
}
if (recordQueue_.samples() < recordQueue_.frameSize()
|| playbackQueue_.samples() < playbackQueue_.frameSize()) {
// If there are not enough samples in either queue, we can't
// process anything.
// JAMI_DBG("underrun p:%d / r:%d", playbackQueue_.samples(), recordQueue_.samples());
if (tidyQueues()) {
return {};
}
......@@ -116,55 +129,73 @@ WebRTCEchoCanceller::getProcessed()
auto playback = playbackQueue_.dequeue();
auto record = recordQueue_.dequeue();
if (!playback || !record)
if (!playback || !record) {
return {};
}
auto processed = std::make_shared<AudioFrame>(format_, frameSize_);
webrtc::StreamConfig& sc = pimpl_->streamConfig;
// webrtc::StreamConfig& sc = streamConfig;
webrtc::StreamConfig sc((int) format_.sample_rate, (int) format_.nb_channels);
// analyze deinterleaved float playback data
iPlaybackBuffer_.deinterleave((const AudioSample*) playback->pointer()->data[0],
frameSize_,
format_.nb_channels);
std::vector<float*> playData {format_.nb_channels};
for (unsigned c = 0; c < format_.nb_channels; ++c) {
playData[c] = fPlaybackBuffer_[c].data();
iPlaybackBuffer_.channelToFloat(playData[c], c);
for (unsigned channel = 0; channel < format_.nb_channels; ++channel) {
// point playData channel to appropriate data location
playData[channel] = fPlaybackBuffer_[channel].data();
// write playback to playData channel
iPlaybackBuffer_.channelToFloat(playData[channel], (int) channel);
}
if (pimpl_->apm->ProcessReverseStream(playData.data(), sc, sc, playData.data())
!= webrtc::AudioProcessing::kNoError)
// process reverse in place
if (apm->ProcessReverseStream(playData.data(), sc, sc, playData.data()) != webrtcNoError) {
JAMI_ERR("[webrtc-ap] ProcessReverseStream failed");
}
// process deinterleaved float recorded data
iRecordBuffer_.deinterleave((const AudioSample*) record->pointer()->data[0],
frameSize_,
format_.nb_channels);
std::vector<float*> recData {format_.nb_channels};
for (unsigned c = 0; c < format_.nb_channels; ++c) {
recData[c] = fRecordBuffer_[c].data();
iRecordBuffer_.channelToFloat(recData[c], c);
for (unsigned int channel = 0; channel < format_.nb_channels; ++channel) {
// point recData channel to appropriate data location
recData[channel] = fRecordBuffer_[channel].data();
// write data to recData channel
iRecordBuffer_.channelToFloat(recData[channel], (int) channel);
}
// TODO: implement this correctly (it MUST be called prior to ProcessStream)
// TODO: maybe implement this to see if it's better than automatic drift compensation
// (it MUST be called prior to ProcessStream)
// delay = (t_render - t_analyze) + (t_process - t_capture)
pimpl_->apm->set_stream_delay_ms(0);
pimpl_->apm->gain_control()->set_stream_analog_level(analogLevel_);
pimpl_->apm->echo_cancellation()->set_stream_drift_samples(driftSamples);
if (pimpl_->apm->ProcessStream(recData.data(), sc, sc, recData.data())
!= webrtc::AudioProcessing::kNoError)
if (apm->set_stream_delay_ms(0) != webrtcNoError) {
JAMI_ERR("[webrtc-ap] set_stream_delay_ms failed");
}
if (apm->gain_control()->set_stream_analog_level(analogLevel_) != webrtcNoError) {
JAMI_ERR("[webrtc-ap] set_stream_analog_level failed");
}
apm->echo_cancellation()->set_stream_drift_samples(driftSamples);
// process in place
if (apm->ProcessStream(recData.data(), sc, sc, recData.data()) != webrtcNoError) {
JAMI_ERR("[webrtc-ap] ProcessStream failed");
analogLevel_ = pimpl_->apm->gain_control()->stream_analog_level();
}
analogLevel_ = apm->gain_control()->stream_analog_level();
// return interleaved s16 data
iRecordBuffer_.convertFloatPlanarToSigned16((uint8_t**) recData.data(),
frameSize_,
format_.nb_channels);
iRecordBuffer_.interleave((AudioSample*) processed->pointer()->data[0]);
processed->has_voice = getStabilizedVoiceActivity(apm->voice_detection()->stream_has_voice());
return processed;
}
void
WebRTCEchoCanceller::done()
{}
} // namespace jami
/*
* Copyright (C) 2021-2022 Savoir-faire Linux Inc.
*
* Author: Andreas Traczyk <andreas.traczyk@savoirfairelinux.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
......@@ -20,28 +18,29 @@
#pragma once
#include "audio/echo-cancel/echo_canceller.h"
#include "audio/audio_frame_resizer.h"
#include "audio_processor.h"
#include <memory>
namespace webrtc {
class AudioProcessing;
}
namespace jami {
class WebRTCEchoCanceller final : public EchoCanceller
class WebRTCAudioProcessor final : public AudioProcessor
{
public:
WebRTCEchoCanceller(AudioFormat format, unsigned frameSize);
~WebRTCEchoCanceller() = default;
WebRTCAudioProcessor(AudioFormat format, unsigned frameSize);
~WebRTCAudioProcessor() = default;
// Inherited via EchoCanceller
void putRecorded(std::shared_ptr<AudioFrame>&& buf) override;
void putPlayback(const std::shared_ptr<AudioFrame>& buf) override;
// Inherited via AudioProcessor
std::shared_ptr<AudioFrame> getProcessed() override;
void done() override;
void enableEchoCancel(bool enabled) override;
void enableNoiseSuppression(bool enabled) override;
void enableAutomaticGainControl(bool enabled) override;
private:
struct WebRTCAPMImpl;
std::unique_ptr<WebRTCAPMImpl> pimpl_;
std::unique_ptr<webrtc::AudioProcessing> apm;
using fChannelBuffer = std::vector<std::vector<float>>;
fChannelBuffer fRecordBuffer_;
......
......@@ -109,6 +109,9 @@ AudioFrameResizer::enqueue(std::shared_ptr<AudioFrame>&& frame)
return; // return if frame was just passed through
}
// voice activity
hasVoice_ = frame->has_voice;
// queue reallocates itself if need be
if ((ret = av_audio_fifo_write(queue_, reinterpret_cast<void**>(f->data), f->nb_samples)) < 0) {
JAMI_ERR() << "Audio resizer error: " << libav_utils::getError(ret);
......@@ -139,6 +142,7 @@ AudioFrameResizer::dequeue()
return {};
}
frame->pointer()->pts = nextOutputPts_;
frame->has_voice = hasVoice_;
nextOutputPts_ += frameSize_;
return frame;
}
......
......@@ -104,6 +104,7 @@ private:
*/
AVAudioFifo* queue_;
int64_t nextOutputPts_ {0};
bool hasVoice_ {false};
};
} // namespace jami
......@@ -122,18 +122,20 @@ AudioInput::readFromDevice()
std::this_thread::sleep_until(wakeUp_);
wakeUp_ += MS_PER_PACKET;
auto& mainBuffer = Manager::instance().getRingBufferPool();
auto samples = mainBuffer.getData(id_);
if (not samples)
auto& bufferPool = Manager::instance().getRingBufferPool();
auto audioFrame = bufferPool.getData(id_);
if (not audioFrame)
return;
if (muteState_)
libav_utils::fillWithSilence(samples->pointer());
if (muteState_) {
libav_utils::fillWithSilence(audioFrame->pointer());
audioFrame->has_voice = false; // force no voice activity when muted
}
std::lock_guard<std::mutex> lk(fmtMutex_);
if (mainBuffer.getInternalAudioFormat() != format_)
samples = resampler_->resample(std::move(samples), format_);
resizer_->enqueue(std::move(samples));
if (bufferPool.getInternalAudioFormat() != format_)
audioFrame = resampler_->resample(std::move(audioFrame), format_);
resizer_->enqueue(std::move(audioFrame));
}
void
......
......@@ -67,7 +67,9 @@ public:
void setSeekTime(int64_t time);
void setSuccessfulSetupCb(const std::function<void(MediaType, bool)>& cb)
{ onSuccessfulSetup_ = cb; }
{
onSuccessfulSetup_ = cb;
}
private:
void readFromDevice();
......
......@@ -51,13 +51,7 @@ struct AudioFormat
unsigned nb_channels;
AVSampleFormat sampleFormat;
constexpr AudioFormat(unsigned sr, unsigned c)
: sample_rate(sr)
, nb_channels(c)
, sampleFormat(AV_SAMPLE_FMT_S16)
{}
constexpr AudioFormat(unsigned sr, unsigned c, AVSampleFormat f)
constexpr AudioFormat(unsigned sr, unsigned c, AVSampleFormat f = AV_SAMPLE_FMT_S16)
: sample_rate(sr)
, nb_channels(c)
, sampleFormat(f)
......
......@@ -28,11 +28,13 @@
#include "tonecontrol.h"
#include "client/ring_signal.h"
// aec
// TODO: decide which library to use/how to decide (compile time? runtime?)
#if HAVE_WEBRTC_AP
#include "echo-cancel/webrtc_echo_canceller.h"
#include "audio-processing/webrtc.h"
#elif HAVE_SPEEXDSP
#include "audio-processing/speex.h"
#else
#include "echo-cancel/null_echo_canceller.h"
#include "audio-processing/null_audio_processor.h"
#endif
#include <ctime>
......@@ -102,55 +104,89 @@ void
AudioLayer::playbackChanged(bool started)
{
playbackStarted_ = started;
checkAEC();
}
// Called when audio capture starts or stops. The software audio processor
// only exists while recording is active: it is created on start and torn
// down on stop.
void
AudioLayer::recordChanged(bool started)
{
    // serialize access to audioProcessor with the playback/record paths
    std::lock_guard<std::mutex> lock(audioProcessorMutex);
    if (started) {
        // create audio processor
        createAudioProcessor();
    } else {
        // destroy audio processor
        destroyAudioProcessor();
    }
    recordStarted_ = started;
    // NOTE(review): checkAEC() looks like a leftover of the pre-AudioProcessor
    // AEC lifecycle that create/destroyAudioProcessor() replaces — confirm it
    // is still needed.
    checkAEC();
}
// Record whether the platform provides its own (native) acoustic echo
// cancellation. When native AEC is present, the software processor's own
// echo cancellation is disabled so the signal is not processed twice.
void
AudioLayer::setHasNativeAEC(bool hasEAC)
{
    std::lock_guard<std::mutex> lock(audioProcessorMutex);
    hasNativeAEC_ = hasEAC;
    // NOTE(review): checkAEC() appears to be a stale remnant of the old AEC
    // start/stop path — confirm against createAudioProcessor/destroyAudioProcessor.
    checkAEC();
    // if we have a current audio processor, tell it to enable/disable its own AEC
    if (audioProcessor) {
        audioProcessor->enableEchoCancel(!hasEAC);
    }
}
// must acquire lock beforehand
void
AudioLayer::checkAEC()
AudioLayer::createAudioProcessor()
{
std::lock_guard<std::mutex> lk(ecMutex_);
bool shouldSoftAEC = not hasNativeAEC_ and playbackStarted_ and recordStarted_;
if (not echoCanceller_ and shouldSoftAEC) {
auto nb_channels = std::min(audioFormat_.nb_channels, audioInputFormat_.nb_channels);
auto sample_rate = std::min(audioFormat_.sample_rate, audioInputFormat_.sample_rate);
// TODO: explain/rework this math??
if (sample_rate % 16000u != 0)
sample_rate = 16000u * ((sample_rate / 16000u) + 1u);
sample_rate = std::clamp(sample_rate, 16000u, 96000u);
AudioFormat format {sample_rate, nb_channels};
AudioFormat formatForProcessor {sample_rate, nb_channels};
#if HAVE_SPEEXDSP && !HAVE_WEBRTC_AP
// we are using speex
// TODO: maybe force this to be equivalent to 20ms? as expected by speex
auto frame_size = sample_rate / 50u;
#else
// we are using either webrtc-ap or null
auto frame_size = sample_rate / 100u;
#endif
JAMI_WARN("Input {%d Hz, %d channels}",
audioInputFormat_.sample_rate,
audioInputFormat_.nb_channels);
JAMI_WARN("Output {%d Hz, %d channels}", audioFormat_.sample_rate, audioFormat_.nb_channels);
JAMI_WARN("Starting AEC {%d Hz, %d channels, %d samples/frame}",
JAMI_WARN("Starting audio processor with: {%d Hz, %d channels, %d samples/frame}",
sample_rate,
nb_channels,
frame_size);
#if HAVE_WEBRTC_AP
echoCanceller_.reset(new WebRTCEchoCanceller(format, frame_size));
JAMI_INFO("[audiolayer] using webrtc audio processor");
audioProcessor.reset(new WebRTCAudioProcessor(formatForProcessor, frame_size));
#elif HAVE_SPEEXDSP
JAMI_INFO("[audiolayer] using speex audio processor");
audioProcessor.reset(new SpeexAudioProcessor(formatForProcessor, frame_size));
#else
echoCanceller_.reset(new NullEchoCanceller(format, frame_size));
JAMI_INFO("[audiolayer] using null audio processor");
audioProcessor.reset(new NullAudioProcessor(formatForProcessor, frame_size));
#endif
} else if (echoCanceller_ and not shouldSoftAEC and not playbackStarted_
and not recordStarted_) {
JAMI_WARN("Stopping AEC");
echoCanceller_.reset();
audioProcessor->enableNoiseSuppression(true);
// TODO: enable AGC?
audioProcessor->enableAutomaticGainControl(false);
// can also be updated after creation via setHasNativeAEC
audioProcessor->enableEchoCancel(!hasNativeAEC_);
}
// Tear down the software audio processor.
// Precondition: the caller already holds audioProcessorMutex.
void
AudioLayer::destroyAudioProcessor()
{
    // releasing the owning smart pointer destroys the processor instance
    audioProcessor = nullptr;
}
void
......@@ -228,19 +264,19 @@ AudioLayer::getToPlay(AudioFormat format, size_t writableSamples)
} else if (auto buf = bufferPool.getData(RingBufferPool::DEFAULT_ID)) {
resampled = resampler_->resample(std::move(buf), format);
} else {
if (echoCanceller_) {
std::lock_guard<std::mutex> lock(audioProcessorMutex);
if (audioProcessor) {
auto silence = std::make_shared<AudioFrame>(format, writableSamples);
libav_utils::fillWithSilence(silence->pointer());
std::lock_guard<std::mutex> lk(ecMutex_);
echoCanceller_->putPlayback(silence);
audioProcessor->putPlayback(silence);
}
break;
}
if (resampled) {
if (echoCanceller_) {
std::lock_guard<std::mutex> lk(ecMutex_);
echoCanceller_->putPlayback(resampled);
std::lock_guard<std::mutex> lock(audioProcessorMutex);
if (audioProcessor) {
audioProcessor->putPlayback(resampled);
}
playbackQueue_->enqueue(std::move(resampled));
} else
......@@ -253,12 +289,13 @@ AudioLayer::getToPlay(AudioFormat format, size_t writableSamples)
void
AudioLayer::putRecorded(std::shared_ptr<AudioFrame>&& frame)
{
if (echoCanceller_) {
std::lock_guard<std::mutex> lk(ecMutex_);
echoCanceller_->putRecorded(std::move(frame));
while (auto rec = echoCanceller_->getProcessed()) {
std::lock_guard<std::mutex> lock(audioProcessorMutex);
if (audioProcessor && playbackStarted_ && recordStarted_) {
audioProcessor->putRecorded(std::move(frame));
while (auto rec = audioProcessor->getProcessed()) {
mainRingBuffer_->put(std::move(rec));
}
} else {
mainRingBuffer_->put(std::move(frame));
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment