diff --git a/src/media/audio/audio-processing/speex.cpp b/src/media/audio/audio-processing/speex.cpp
index cc3ce74462b9ec0fa9b9e7b561154329a4e79a42..f875eedeeaed83c81374a25f0450fdad6bc42921 100644
--- a/src/media/audio/audio-processing/speex.cpp
+++ b/src/media/audio/audio-processing/speex.cpp
@@ -34,19 +34,19 @@ extern "C" {
 
 namespace jami {
 
 SpeexAudioProcessor::SpeexAudioProcessor(AudioFormat format, unsigned frameSize)
-    : AudioProcessor(format, frameSize)
+    : AudioProcessor(format.withSampleFormat(AV_SAMPLE_FMT_S16), frameSize)
     , echoState(speex_echo_state_init_mc((int) frameSize,
                                          (int) frameSize * 16,
-                                         (int) format.nb_channels,
-                                         (int) format.nb_channels),
+                                         (int) format_.nb_channels,
+                                         (int) format_.nb_channels),
                 &speex_echo_state_destroy)
-    , iProcBuffer(frameSize_, format)
+    , procBuffer(std::make_unique<AudioFrame>(format.withSampleFormat(AV_SAMPLE_FMT_S16P), frameSize_))
 {
     JAMI_DBG("[speex-dsp] SpeexAudioProcessor, frame size = %d (=%d ms), channels = %d",
              frameSize,
              frameDurationMs_,
-             format.nb_channels);
+             format_.nb_channels);
 
     // set up speex echo state
     speex_echo_ctl(echoState.get(), SPEEX_ECHO_SET_SAMPLING_RATE, &format_.sample_rate);
@@ -66,10 +66,10 @@ SpeexAudioProcessor::SpeexAudioProcessor(AudioFormat format, unsigned frameSize)
 
     // set up speex preprocess states, one for each channel
     // note that they are not enabled here, but rather in the enable* functions
-    for (unsigned int i = 0; i < format.nb_channels; i++) {
+    for (unsigned int i = 0; i < format_.nb_channels; i++) {
         auto channelPreprocessorState
             = SpeexPreprocessStatePtr(speex_preprocess_state_init((int) frameSize,
-                                                                  (int) format.sample_rate),
+                                                                  (int) format_.sample_rate),
                                       &speex_preprocess_state_destroy);
 
         // set max noise suppression level
@@ -184,11 +184,11 @@ SpeexAudioProcessor::getProcessed()
         return {};
     }
 
-    auto processed = std::make_shared<AudioFrame>(record->getFormat(), record->getFrameSize());
-
+    std::shared_ptr<AudioFrame> processed;
     if (shouldAEC) {
         // we want to echo cancel
         // multichannel, output into processed
+        processed = std::make_shared<AudioFrame>(record->getFormat(), record->getFrameSize());
         speex_echo_cancellation(echoState.get(),
                                 (int16_t*) record->pointer()->data[0],
                                 (int16_t*) playback->pointer()->data[0],
@@ -198,16 +198,7 @@ SpeexAudioProcessor::getProcessed()
         processed = record;
     }
 
-    // deinterleave processed into channels
-    std::vector<int16_t*> procData {format_.nb_channels};
-    iProcBuffer.deinterleave((const AudioSample*) processed->pointer()->data[0],
-                             frameSize_,
-                             format_.nb_channels);
-
-    // point procData to correct channels
-    for (unsigned int channel = 0; channel < format_.nb_channels; channel++) {
-        procData[channel] = iProcBuffer.getChannel(channel)->data();
-    }
+    deinterleaveResampler.resample(processed->pointer(), procBuffer->pointer());
 
     // overall voice activity
     bool overallVad = false;
@@ -218,7 +209,9 @@ SpeexAudioProcessor::getProcessed()
     int channel = 0;
     for (auto& channelPreprocessorState : preprocessorStates) {
         // preprocesses in place, returns voice activity boolean
-        channelVad = speex_preprocess_run(channelPreprocessorState.get(), procData[channel]);
+        // procBuffer is planar (S16P): extended_data exposes every channel plane
+        channelVad = speex_preprocess_run(channelPreprocessorState.get(),
+                                          (int16_t*) procBuffer->pointer()->extended_data[channel]);
 
         // boolean OR
         overallVad |= channelVad;
@@ -226,12 +219,10 @@ SpeexAudioProcessor::getProcessed()
         channel += 1;
     }
 
-    // reinterleave into processed
-    iProcBuffer.interleave((AudioSample*) processed->pointer()->data[0]);
+    interleaveResampler.resample(procBuffer->pointer(), processed->pointer());
 
     // add stabilized voice activity to the AudioFrame
     processed->has_voice = shouldDetectVoice && getStabilizedVoiceActivity(overallVad);
-
     return processed;
 }
 
diff --git a/src/media/audio/audio-processing/speex.h b/src/media/audio/audio-processing/speex.h
index 018dbb9e27f7bc44132f73d3b5b82ce75397cad4..5a6323e74b02c900405b8bb1da2f4b28344cbfdd 100644
--- a/src/media/audio/audio-processing/speex.h
+++ b/src/media/audio/audio-processing/speex.h
@@ -21,7 +21,6 @@
 #pragma once
 
 #include "audio_processor.h"
-#include "media/audio/audiobuffer.h"
 
 // typedef speex C structs
 extern "C" {
@@ -57,7 +56,9 @@ private:
     // one for each channel
     std::vector<SpeexPreprocessStatePtr> preprocessorStates;
 
-    AudioBuffer iProcBuffer;
+    std::unique_ptr<AudioFrame> procBuffer {};
+    Resampler deinterleaveResampler;
+    Resampler interleaveResampler;
 
     // if we should do echo cancellation
     bool shouldAEC {false};
diff --git a/src/media/audio/audio-processing/webrtc.cpp b/src/media/audio/audio-processing/webrtc.cpp
index ea6c48fadd5d2ff264c970e7c1f129634dd87833..cb768a9fbdac1e5b113068388ca465e177b93477 100644
--- a/src/media/audio/audio-processing/webrtc.cpp
+++ b/src/media/audio/audio-processing/webrtc.cpp
@@ -23,12 +23,6 @@
 
 namespace jami {
 
-inline AudioFormat
-audioFormatToFloatPlanar(AudioFormat format)
-{
-    return {format.sample_rate, format.nb_channels, AV_SAMPLE_FMT_FLTP};
-}
-
 inline size_t
 webrtcFrameSize(AudioFormat format)
 {
@@ -38,19 +32,19 @@ webrtcFrameSize(AudioFormat format)
 constexpr int webrtcNoError = webrtc::AudioProcessing::kNoError;
 
 WebRTCAudioProcessor::WebRTCAudioProcessor(AudioFormat format, unsigned /* frameSize */)
-    : AudioProcessor(audioFormatToFloatPlanar(format), webrtcFrameSize(format))
+    : AudioProcessor(format.withSampleFormat(AV_SAMPLE_FMT_FLTP), webrtcFrameSize(format))
 {
     JAMI_LOG("[webrtc-ap] WebRTCAudioProcessor, frame size = {:d} (={:d} ms), channels = {:d}",
              frameSize_,
              frameDurationMs_,
-             format.nb_channels);
+             format_.nb_channels);
 
     webrtc::Config config;
     config.Set<webrtc::ExtendedFilter>(new webrtc::ExtendedFilter(true));
     config.Set<webrtc::DelayAgnostic>(new webrtc::DelayAgnostic(true));
     apm.reset(webrtc::AudioProcessing::Create(config));
 
-    webrtc::StreamConfig streamConfig((int) format.sample_rate, (int) format.nb_channels);
+    webrtc::StreamConfig streamConfig((int) format_.sample_rate, (int) format_.nb_channels);
     webrtc::ProcessingConfig pconfig = {
         streamConfig, /* input stream */
         streamConfig, /* output stream */
diff --git a/src/media/audio/audio_format.h b/src/media/audio/audio_format.h
index 8c1b9e1c9355b1f73ff96f0691c111cfa65002f7..fb5253877ee0274337ac257d647caee1fed1a730 100644
--- a/src/media/audio/audio_format.h
+++ b/src/media/audio/audio_format.h
@@ -59,6 +59,15 @@ struct AudioFormat
         return fmt::format("{{{}, {} channels, {}Hz}}", av_get_sample_fmt_name(sampleFormat), nb_channels, sample_rate);
     }
 
+    /**
+     * Returns a copy of this format with its sample format replaced by `format`.
+     * The sample rate and channel count are carried over unchanged.
+     */
+    AudioFormat withSampleFormat(AVSampleFormat format) const
+    {
+        return {sample_rate, nb_channels, format};
+    }
+
     /**
      * Returns bytes necessary to hold one frame of audio data.
      */