diff --git a/src/media/audio/audio_input.cpp b/src/media/audio/audio_input.cpp
index 7bf5d4d7b1969fc194b5742532e43982ed89f719..0ff144c40f2468bd2c7f62f71eb446576ddca99f 100644
--- a/src/media/audio/audio_input.cpp
+++ b/src/media/audio/audio_input.cpp
@@ -95,7 +95,8 @@ AudioInput::process()
         resampled = micData_;
     }
 
-    AVFrame* frame = resampled.toAVFrame();
+    auto audioFrame = resampled.toAVFrame();
+    auto frame = audioFrame->pointer();
     auto ms = MediaStream("a:local", format_);
     frame->pts = getNextTimestamp(sent_samples, ms.sampleRate, static_cast<rational<int64_t>>(ms.timeBase));
     sent_samples += frame->nb_samples;
diff --git a/src/media/audio/audio_rtp_session.cpp b/src/media/audio/audio_rtp_session.cpp
index bfb0fce5a75f4582a4e63b669c7ac6ccbd526d0d..0e18573bfa503768a5e0eb2af7b0e95fbd54f469 100644
--- a/src/media/audio/audio_rtp_session.cpp
+++ b/src/media/audio/audio_rtp_session.cpp
@@ -209,7 +209,8 @@ AudioSender::process()
     if (muteState_) // audio is muted, set samples to 0
         buffer.reset();
 
-    AVFrame* frame = buffer.toAVFrame();
+    auto audioFrame = buffer.toAVFrame();
+    auto frame = audioFrame->pointer();
     auto ms = MediaStream("a:local", buffer.getFormat());
     frame->pts = getNextTimestamp(sent_samples, ms.sampleRate, static_cast<rational<int64_t>>(ms.timeBase));
     ms.firstTimestamp = frame->pts;
@@ -221,7 +222,7 @@ AudioSender::process()
             rec->recordData(frame, ms);
     }
 
-    if (audioEncoder_->encodeAudio(frame) < 0)
+    if (audioEncoder_->encodeAudio(*audioFrame) < 0)
         RING_ERR("encoding failed");
 }
 
diff --git a/src/media/audio/audiobuffer.cpp b/src/media/audio/audiobuffer.cpp
index 0f0c443aa9b1ed8b3a8494db808ab282ecb3f034..289161c591a34abec63f553fb3ae44d2861badd2 100644
--- a/src/media/audio/audiobuffer.cpp
+++ b/src/media/audio/audiobuffer.cpp
@@ -290,15 +290,11 @@ size_t AudioBuffer::copy(AudioSample* in, size_t sample_num, size_t pos_out /* =
     return sample_num;
 }
 
-AVFrame*
+std::unique_ptr<AudioFrame>
 AudioBuffer::toAVFrame() const
 {
-    AVFrame* frame = av_frame_alloc();
-    if (!frame) {
-        RING_ERR() << "Failed to allocate audio frame";
-        return nullptr;
-    }
-
+    auto audioFrame = std::make_unique<AudioFrame>();
+    auto frame = audioFrame->pointer();
     frame->format = AV_SAMPLE_FMT_S16;
     frame->nb_samples = frames();
     frame->channel_layout = av_get_default_channel_layout(channels());
@@ -313,12 +309,13 @@ AudioBuffer::toAVFrame() const
 
     interleave(reinterpret_cast<AudioSample*>(frame->data[0]));
 
-    return frame;
+    return audioFrame;
 }
 
 int
-AudioBuffer::append(AVFrame* frame)
+AudioBuffer::append(const AudioFrame& audioFrame)
 {
+    auto frame = audioFrame.pointer();
     // FIXME we assume frame is s16 interleaved
     if (channels() != static_cast<unsigned>(frame->channels)
         || getSampleRate() != frame->sample_rate) {
diff --git a/src/media/audio/audiobuffer.h b/src/media/audio/audiobuffer.h
index b0ceb12f871b2be094642d9f729aa22b166e4957..47bf07843b5fc8c46c4525b40878192815f9a61b 100644
--- a/src/media/audio/audiobuffer.h
+++ b/src/media/audio/audiobuffer.h
@@ -35,6 +35,7 @@ extern "C" {
 }
 
 #include "ring_types.h"
+#include "media_buffer.h"
 
 #include <ciso646> // fix windows compiler bug
 
@@ -357,9 +358,9 @@ class AudioBuffer {
          */
         size_t copy(AudioSample* in, size_t sample_num, size_t pos_out = 0);
 
-        AVFrame* toAVFrame() const;
+        std::unique_ptr<AudioFrame> toAVFrame() const;
 
-        int append(AVFrame* frame);
+        int append(const AudioFrame& frame);
 
     private:
         int sampleRate_;
diff --git a/src/media/audio/resampler.cpp b/src/media/audio/resampler.cpp
index abdf639cbe5696b6de44d20f6ff1360968ef7f28..3424d5e7085fd9db266f9cc6ea1c920e0c8ff6ca 100644
--- a/src/media/audio/resampler.cpp
+++ b/src/media/audio/resampler.cpp
@@ -78,22 +78,20 @@ Resampler::resample(const AVFrame* input, AVFrame* output)
 void
 Resampler::resample(const AudioBuffer& dataIn, AudioBuffer& dataOut)
 {
-    auto input = dataIn.toAVFrame();
+    auto inputFrame = dataIn.toAVFrame();
+    auto input = inputFrame->pointer();
     AudioFrame resampled;
     auto output = resampled.pointer();
     output->sample_rate = dataOut.getSampleRate();
     output->channel_layout = av_get_default_channel_layout(dataOut.channels());
     output->format = AV_SAMPLE_FMT_S16;
 
-    if (resample(input, output) < 0) {
-        av_frame_free(&input);
+    if (resample(input, output) < 0)
         return;
-    }
 
     dataOut.resize(output->nb_samples);
     dataOut.deinterleave(reinterpret_cast<const AudioSample*>(output->extended_data[0]),
         output->nb_samples, output->channels);
-    av_frame_free(&input);
 }
 
 } // namespace ring
diff --git a/src/media/audio/sound/audiofile.cpp b/src/media/audio/sound/audiofile.cpp
index 195b83b21950156acae5cab345cd2007a2df61d8..214d665b58255f025737b865b6254ab63d50dfc1 100644
--- a/src/media/audio/sound/audiofile.cpp
+++ b/src/media/audio/sound/audiofile.cpp
@@ -83,7 +83,7 @@ AudioFile::AudioFile(const std::string &fileName, unsigned int sampleRate) :
             resampled->format = AV_SAMPLE_FMT_S16;
             if (resampler->resample(input.pointer(), resampled) < 0)
                 throw AudioFileException("Frame could not be resampled");
-            if (buf->append(resampled) < 0)
+            if (buf->append(output) < 0)
                 throw AudioFileException("Error while decoding: " + fileName);
             break;
         case MediaDecoder::Status::DecodeError:
diff --git a/src/media/media_encoder.cpp b/src/media/media_encoder.cpp
index ddcda822ef13a34c57caca8cbedb73c842986f28..99b063fe0b6bab2645e6be6dbd8da060a4cb2ded 100644
--- a/src/media/media_encoder.cpp
+++ b/src/media/media_encoder.cpp
@@ -380,13 +380,12 @@ MediaEncoder::encode(VideoFrame& input, bool is_keyframe,
 }
 #endif // RING_VIDEO
 
-int MediaEncoder::encodeAudio(AVFrame* frame)
+int MediaEncoder::encodeAudio(AudioFrame& frame)
 {
     auto enc = encoders_[currentStreamIdx_];
-    frame->pts = getNextTimestamp(sent_samples, enc->sample_rate, enc->time_base);
-    sent_samples += frame->nb_samples;
-    encode(frame, currentStreamIdx_);
-    av_frame_free(&frame);
+    frame.pointer()->pts = getNextTimestamp(sent_samples, enc->sample_rate, enc->time_base);
+    sent_samples += frame.pointer()->nb_samples;
+    encode(frame.pointer(), currentStreamIdx_);
     return 0;
 }
 
diff --git a/src/media/media_encoder.h b/src/media/media_encoder.h
index 0a432e0d3dd5b3f190350edc535e19052f8ac74e..80c28c17125faba4ac852ee14b44fb534bc4d2a8 100644
--- a/src/media/media_encoder.h
+++ b/src/media/media_encoder.h
@@ -75,7 +75,7 @@ public:
     int encode(VideoFrame &input, bool is_keyframe, int64_t frame_number);
 #endif // RING_VIDEO
 
-    int encodeAudio(AVFrame* frame);
+    int encodeAudio(AudioFrame& frame);
 
     // frame should be ready to be sent to the encoder at this point
     int encode(AVFrame* frame, int streamIdx);