diff --git a/src/client/videomanager.cpp b/src/client/videomanager.cpp
index 3f723675d39c42a6c49ab1f9e28f0b78c53e5e98..3c646806ee69d10a10f2f6f4e72c8b8d91b4cc8c 100644
--- a/src/client/videomanager.cpp
+++ b/src/client/videomanager.cpp
@@ -67,6 +67,43 @@ MediaFrame::reset() noexcept
     av_frame_unref(frame_.get());
 }
 
+AudioFrame::AudioFrame(const ring::AudioFormat& format, size_t nb_samples)
+ : MediaFrame()
+{
+    setFormat(format);
+    // reserve() does nothing for a zero sample count, so no extra check is needed here.
+    reserve(nb_samples);
+}
+
+void
+AudioFrame::setFormat(const ring::AudioFormat& format)
+{
+    auto* const avFrame = pointer(); // frame whose audio parameters are filled in below
+    avFrame->format = format.sampleFormat;
+    avFrame->sample_rate = format.sample_rate;
+    avFrame->channels = format.nb_channels;
+    avFrame->channel_layout = av_get_default_channel_layout(format.nb_channels); // layout derived from the channel count
+}
+
+void
+AudioFrame::reserve(size_t nb_samples)
+{
+    // Nothing to allocate when no samples are requested.
+    if (nb_samples == 0)
+        return;
+    auto* const avFrame = pointer();
+    avFrame->nb_samples = nb_samples;
+    // A negative return from av_frame_get_buffer is reported as an allocation failure.
+    if (av_frame_get_buffer(avFrame, 0) < 0)
+        throw std::bad_alloc();
+}
+
+void
+AudioFrame::mix(const AudioFrame&)
+{
+    RING_ERR("AudioFrame::mix not implemented yet"); // placeholder: the argument is ignored and only this message is emitted
+}
+
 VideoFrame::~VideoFrame()
 {
     if (releaseBufferCb_)
diff --git a/src/dring/videomanager_interface.h b/src/dring/videomanager_interface.h
index d849be2c46ae6ee3faddb1a8f1f54656ec078a6e..876df5fb456f2fb5795e2018409aa9b560312211 100644
--- a/src/dring/videomanager_interface.h
+++ b/src/dring/videomanager_interface.h
@@ -35,14 +35,18 @@ struct AVPacket;
 #include <map>
 #include <string>
 #include <functional>
+#include <chrono>
 #include <cstdint>
 #include <cstdlib>
 
-
 #if __APPLE__
 #import "TargetConditionals.h"
 #endif
 
+namespace ring {
+struct AudioFormat;
+}
+
 namespace DRing {
 
 [[deprecated("Replaced by registerSignalHandlers")]] DRING_PUBLIC
@@ -88,7 +92,17 @@ protected:
     std::unique_ptr<AVFrame, void(*)(AVFrame*)> frame_;
 };
 
-class DRING_PUBLIC AudioFrame : public MediaFrame {};
+class DRING_PUBLIC AudioFrame : public MediaFrame { // MediaFrame specialization for audio
+public:
+    AudioFrame() : MediaFrame() {} // does not set a format or reserve sample buffers itself
+    AudioFrame(const ring::AudioFormat& format, size_t nb_samples = 0); // applies format; allocates buffers only when nb_samples != 0 (throws std::bad_alloc on failure)
+    ~AudioFrame() {};
+    void mix(const AudioFrame& o); // not implemented yet: the definition only reports an error
+
+private:
+    void setFormat(const ring::AudioFormat& format); // copies channel count, default channel layout, sample rate and sample format into the frame
+    void reserve(size_t nb_samples = 0); // allocates buffers for nb_samples samples; no-op when 0, throws std::bad_alloc on failure
+};
 
 class DRING_PUBLIC VideoFrame : public MediaFrame {
 public:
diff --git a/src/media/audio/audio_frame_resizer.cpp b/src/media/audio/audio_frame_resizer.cpp
index 6699ef1f26c38cb5ccbd69c98b87c483bed96919..311860de95195c1a7136b8919bf2254a56327d09 100644
--- a/src/media/audio/audio_frame_resizer.cpp
+++ b/src/media/audio/audio_frame_resizer.cpp
@@ -30,15 +30,14 @@ extern "C" {
 
 namespace ring {
 
+// NOTE 160 samples should be the minimum that will be provided (20 ms @ 8kHz),
+// barring files that for some obscure reason have smaller packets
 AudioFrameResizer::AudioFrameResizer(const AudioFormat& format, int frameSize, std::function<void(std::unique_ptr<AudioFrame>&&)> cb)
     : format_(format)
     , frameSize_(frameSize)
     , cb_(cb)
-{
-    // NOTE 160 samples should the minimum that will be provided (20 ms @ 8kHz),
-    // barring files that for some obscure reason have smaller packets
-    queue_ = av_audio_fifo_alloc(format.sampleFormat, format.nb_channels, 160);
-}
+    , queue_(av_audio_fifo_alloc(format.sampleFormat, format.nb_channels, 160)) // reads the constructor argument, not format_, so it does not depend on member initialization order
+{}
 
 AudioFrameResizer::~AudioFrameResizer()
 {
@@ -94,24 +93,12 @@ AudioFrameResizer::dequeue()
     if (samples() < frameSize_)
         return {};
 
+    auto frame = std::make_unique<AudioFrame>(format_, frameSize_);
     int ret;
-    auto frame = std::make_unique<AudioFrame>();
-    auto f = frame->pointer();
-    f->format = (int)format_.sampleFormat;
-    f->channels = format_.nb_channels;
-    f->channel_layout = av_get_default_channel_layout(format_.nb_channels);
-    f->sample_rate = format_.sample_rate;
-    f->nb_samples = frameSize_;
-    if ((ret = av_frame_get_buffer(f, 0)) < 0) {
-        RING_ERR() << "Failed to allocate audio buffers: " << libav_utils::getError(ret);
-        return {};
-    }
-
-    if ((ret = av_audio_fifo_read(queue_, reinterpret_cast<void**>(f->data), frameSize_)) < 0) {
+    if ((ret = av_audio_fifo_read(queue_, reinterpret_cast<void**>(frame->pointer()->data), frameSize_)) < 0) {
         RING_ERR() << "Could not read samples from queue: " << libav_utils::getError(ret);
         return {};
     }
-
     return frame;
 }
 
diff --git a/src/media/audio/audiobuffer.cpp b/src/media/audio/audiobuffer.cpp
index 2c9acffab17143f877dc08b81cc1a307db1d469c..16135aef655a0cb4b32971e4737480837e967424 100644
--- a/src/media/audio/audiobuffer.cpp
+++ b/src/media/audio/audiobuffer.cpp
@@ -301,22 +301,8 @@ size_t AudioBuffer::copy(AudioSample* in, size_t sample_num, size_t pos_out /* =
 std::unique_ptr<AudioFrame>
 AudioBuffer::toAVFrame() const
 {
-    auto audioFrame = std::make_unique<AudioFrame>();
-    auto frame = audioFrame->pointer();
-    frame->format = AV_SAMPLE_FMT_S16;
-    frame->nb_samples = frames();
-    frame->channel_layout = av_get_default_channel_layout(channels());
-    frame->channels = channels();
-    frame->sample_rate = getSampleRate();
-
-    if (av_frame_get_buffer(frame, 0) < 0) {
-        RING_ERR() << "Failed to allocate audio frame";
-        av_frame_free(&frame);
-        return nullptr;
-    }
-
-    interleave(reinterpret_cast<AudioSample*>(frame->data[0]));
-
+    auto audioFrame = std::make_unique<AudioFrame>(getFormat(), frames()); // NOTE(review): the replaced code forced AV_SAMPLE_FMT_S16 and returned nullptr on allocation failure; confirm getFormat() reports the same sample format and that callers tolerate std::bad_alloc
+    interleave(reinterpret_cast<AudioSample*>(audioFrame->pointer()->data[0])); // writes the interleaved samples into the frame's first data plane
     return audioFrame;
 }