diff --git a/WhisperTranscript/.gitignore b/WhisperTranscript/.gitignore index a55df1d1b5f260945928fc6bf5b33f8911c58276..f31f07ce5fe22d444fe5b9a88e9749c3268e4a87 100644 --- a/WhisperTranscript/.gitignore +++ b/WhisperTranscript/.gitignore @@ -1,6 +1,5 @@ *.mp3 /WhisperTranscript* -libonnxruntime.so* /libonnxruntime.dylib /processed.mp4 *.so diff --git a/WhisperTranscript/CMakeLists.txt b/WhisperTranscript/CMakeLists.txt index a55f5c4857b013c8869a24dce17da555c9ca7eb6..d1e06ff42f3539f9d343fb8dd92ab592e0423798 100644 --- a/WhisperTranscript/CMakeLists.txt +++ b/WhisperTranscript/CMakeLists.txt @@ -58,11 +58,11 @@ set(plugin_SRC main.cpp PluginPreferenceHandler.cpp TranscriptAudioSubscriber.cpp TranscriptVideoSubscriber.cpp - Preprocess.cpp - ModelProcessor.cpp + stt_whisper.cpp ./../lib/accel.cpp ./../lib/frameUtils.cpp ./../lib/frameFilter.cpp + ./../lib/resampler.cpp ./../lib/common.cpp ) @@ -70,8 +70,7 @@ set(plugin_HDR TranscriptAudioSubscriber.h TranscriptVideoSubscriber.h TranscriptMediaHandler.h PluginPreferenceHandler.h - Preprocess.h - ModelProcessor.h + stt_whisper.h ./../lib/pluglog.h ./../lib/mediaStream.h ./../lib/audioFormat.h @@ -98,6 +97,7 @@ target_include_directories(${ProjectName} PUBLIC ${PROJECT_BINARY_DIR} ${ONNX_DIR}/../include/session ${ONNX_DIR}/../include/providers/cuda ${CONTRIB_PATH}/build/yaml-cpp/include + ${CONTRIB_PATH}/build/whispercpp ) target_link_directories(${ProjectName} PUBLIC ${CONTRIB_PATH} ${CONTRIB_PATH}/build/fmt/msvc/Release @@ -110,7 +110,7 @@ target_link_directories(${ProjectName} PUBLIC ${CONTRIB_PATH} target_link_libraries(${ProjectName} PUBLIC libyaml-cppmd libavfilter libswscale libswresample libavformat libavcodec libavutil libvpx libx264 libopus - libmfx fmt libzlib freetype ws2_32 Bcrypt Secur32 onnxruntime msvcrt) + libmfx fmt libzlib freetype whisper ws2_32 Bcrypt Secur32 msvcrt) add_custom_command( TARGET ${ProjectName} @@ -120,6 +120,8 @@ add_custom_command( COMMAND ${CMAKE_COMMAND} -E copy_directory ${PROJECT_SOURCE_DIR}/../contrib/yaml-cpp ${CONTRIB_PATH}/src/yaml-cpp COMMAND ${CMAKE_COMMAND} -E copy_directory ${PROJECT_SOURCE_DIR}/../contrib/freetype ${CONTRIB_PATH}/src/freetype COMMAND ${CMAKE_COMMAND} -E copy_directory ${PROJECT_SOURCE_DIR}/ffmpeg/ ${CONTRIB_PATH}/src/ffmpeg + COMMAND ${CMAKE_COMMAND} -E copy_directory ${PROJECT_SOURCE_DIR}/whispercpp/ ${CONTRIB_PATH}/src/whispercpp + COMMAND python ${DAEMON}/compat/msvc/winmake.py -fb whispercpp COMMAND python ${DAEMON}/compat/msvc/winmake.py -fb fmt COMMAND python ${DAEMON}/compat/msvc/winmake.py -fb yaml-cpp COMMAND python ${DAEMON}/compat/msvc/winmake.py -fb zlib @@ -136,34 +138,17 @@ if(TESTPROCESS) PRE_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/testPreferences.yml ${PROJECT_BINARY_DIR}/ COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/jfk.wav ${PROJECT_BINARY_DIR}/ - COMMAND ${CMAKE_COMMAND} -E copy ${ONNX_DIR}/onnxruntime.lib ${PROJECT_BINARY_DIR}/Debug - COMMAND ${CMAKE_COMMAND} -E copy ${ONNX_DIR}/onnxruntime.dll ${PROJECT_BINARY_DIR}/Debug - COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/modelSRC/mModelEncoder.onnx ${PROJECT_SOURCE_DIR}/data/assets - COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/modelSRC/mModelDecoder.onnx ${PROJECT_SOURCE_DIR}/data/assets - COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/modelSRC/mLogSoftMax.onnx ${PROJECT_SOURCE_DIR}/data/assets + COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/sample.mp4 ${PROJECT_BINARY_DIR}/ + COMMAND ${CMAKE_COMMAND} -E copy 
${CONTRIB_PATH}/build/whispercpp/ggml-base.bin ${PROJECT_SOURCE_DIR}/data/assets/ ) else() add_custom_command( TARGET ${ProjectName} PRE_BUILD COMMAND python ${PROJECT_SOURCE_DIR}/../SDK/jplManipulation.py --preassemble --plugin=${ProjectName} - COMMAND ${CMAKE_COMMAND} -E copy ${ONNX_DIR}/onnxruntime.lib ${JPL_DIRECTORY}/lib/${CONTRIB_PLATFORM} - COMMAND ${CMAKE_COMMAND} -E copy ${ONNX_DIR}/onnxruntime.dll ${JPL_DIRECTORY}/lib/${CONTRIB_PLATFORM} - COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/modelSRC/mModelEncoder.onnx ${JPL_DIRECTORY}/data/assets - COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/modelSRC/mModelDecoder.onnx ${JPL_DIRECTORY}/data/assets - COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/modelSRC/mLogSoftMax.onnx ${JPL_DIRECTORY}/data/assets + COMMAND ${CMAKE_COMMAND} -E copy ${CONTRIB_PATH}/build/whispercpp/ggml-base.bin ${JPL_DIRECTORY}/data/assets/ COMMENT "Assembling Plugin files" ) - if(NVIDIA) - add_custom_command( - TARGET ${ProjectName} - PRE_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${ONNX_DIR}/onnxruntime_providers_shared.lib ${JPL_DIRECTORY}/lib/${CONTRIB_PLATFORM} - COMMAND ${CMAKE_COMMAND} -E copy ${ONNX_DIR}/onnxruntime_providers_shared.dll ${JPL_DIRECTORY}/lib/${CONTRIB_PLATFORM} - COMMAND ${CMAKE_COMMAND} -E copy ${ONNX_DIR}/onnxruntime_providers_cuda.lib ${JPL_DIRECTORY}/lib/${CONTRIB_PLATFORM} - COMMAND ${CMAKE_COMMAND} -E copy ${ONNX_DIR}/onnxruntime_providers_cuda.dll ${JPL_DIRECTORY}/lib/${CONTRIB_PLATFORM} - ) - endif() add_custom_command( TARGET ${ProjectName} POST_BUILD diff --git a/WhisperTranscript/ModelProcessor.cpp b/WhisperTranscript/ModelProcessor.cpp deleted file mode 100644 index 6c8660103e7a89787e25a8b31196c4bc73a38750..0000000000000000000000000000000000000000 --- a/WhisperTranscript/ModelProcessor.cpp +++ /dev/null @@ -1,441 +0,0 @@ -/** - * Copyright (C) 2022 Savoir-faire Linux Inc. - * - * Author: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 - * USA. 
- */ - -#include "ModelProcessor.h" - -#include <pluglog.h> -#include <common.h> -#include <limits.h> - -const char sep = separator(); - -const std::string TAG = "Transcript"; - -namespace jami { - -ModelProcessor::ModelProcessor(const std::string& path, bool acc) -{ - loadTokens(path + "/assets/tokenizer.bin", vocab_); - -#ifdef __ANDROID__ - initModels(path + "/assets/mModelEncoder.ort", path + "/assets/mModelDecoder.ort", path + "/assets/mLogSoftMax.ort", acc); -#else - initModels(path + "/assets/mModelEncoder.onnx", path + "/assets/mModelDecoder.onnx", path + "/assets/mLogSoftMax.onnx", acc); -#endif -} - -ModelProcessor::~ModelProcessor() -{ - endModels(); - Plog::log(Plog::LogPriority::INFO, TAG, "~ModelProcessor"); -} - -void -ModelProcessor::endModels() -{ - if (encoderSession_) { - delete encoderSession_; - encoderSession_ = nullptr; - } - if (decoderSession_) { - delete decoderSession_; - decoderSession_ = nullptr; - } - if (logSoftMaxSession_) { - delete logSoftMaxSession_; - logSoftMaxSession_ = nullptr; - } -#ifdef NVIDIA - if (cudaOptions_) { - ortApi.ReleaseCUDAProviderOptions(cudaOptions_); - cudaOptions_ = nullptr; - } -#endif - if (env_) { - env_.release(); - env_ = NULL; - } -} - -void -ModelProcessor::initModels(const std::string& encoderModelPath, const std::string& decoderModelPath, const std::string& logSoftMaxModelPath, bool activateAcc) -{ - try { - sessOpt_ = Ort::SessionOptions(); - - try { - if (activateAcc) { -#ifdef NVIDIA - Ort::ThrowOnError(ortApi.CreateCUDAProviderOptions(&cudaOptions_)); - - // std::vector<const char*> keys{"device_id"}; - // std::vector<const char*> values{"0"}; - - // Ort::ThrowOnError(ortApi.UpdateCUDAProviderOptions(cudaOptions_, keys.data(), values.data(), keys.size())); - Ort::ThrowOnError(ortApi.SessionOptionsAppendExecutionProvider_CUDA_V2(sessOpt_, cudaOptions_)); -#endif -#ifdef __ANDROID__ - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_Nnapi(sessOpt_, 0)); -#endif - } - } catch (std::exception& accelException) { - Plog::log(Plog::LogPriority::ERR, TAG, accelException.what()); - Plog::log(Plog::LogPriority::ERR, TAG, "Acceleration not available, loading models for CPU."); - } - - sessOpt_.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL); -#ifdef WIN32 - encoderSession_ = new Ort::Session(env_, string_utils::to_wstring(encoderModelPath).c_str(), sessOpt_); - decoderSession_ = new Ort::Session(env_, string_utils::to_wstring(decoderModelPath).c_str(), sessOpt_); - logSoftMaxSession_ = new Ort::Session(env_, string_utils::to_wstring(logSoftMaxModelPath).c_str(), sessOpt_); -#else - encoderSession_ = new Ort::Session(env_, encoderModelPath.c_str(), sessOpt_); - decoderSession_ = new Ort::Session(env_, decoderModelPath.c_str(), sessOpt_); - logSoftMaxSession_ = new Ort::Session(env_, logSoftMaxModelPath.c_str(), sessOpt_); -#endif - isAllocated_ = true; - Plog::log(Plog::LogPriority::INFO, TAG, "Model is allocated"); - } catch (std::exception& e) { - Plog::log(Plog::LogPriority::ERR, TAG, e.what()); - } -} - -/* from whisper.cpp */ -// the most basic sampling scheme - select the top token -whisperTokenData -ModelProcessor::whisper_sample_best(const float * probs) -{ - whisperTokenData result = { - 0, 0, 0.0f, 0.0f, 0.0f, -1, -1, 0.0f, - }; - - int n_logits = vocab_.id_to_token.size(); - - std::vector<std::pair<double, int64_t>> probs_id; - probs_id.reserve(n_logits); - - for (int i = 0; i < n_logits; i++) { - probs_id.emplace_back(std::make_pair(probs[i], i)); - } - - { - double sum_ts = 0.0; - double 
max_ts = -1.0; - double max_tx = -1.0; - - for (int i = 0; i < vocab_.token_beg; i++) { - max_tx = std::max(max_tx, probs_id[i].first); - } - - for (int i = vocab_.token_beg; i < n_logits; i++) { - sum_ts += probs_id[i].first; - if (probs_id[i].first > max_ts) { - max_ts = probs_id[i].first; - result.tid = probs_id[i].second; - } - } - - // if the probability sum of all timestamp tokens is higher than the max probability of the text tokens - sample a - // timestamp token - if (sum_ts > max_tx) { - // ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L430-L438 - for (int i = 0; i < vocab_.token_beg; i++) { - probs_id[i].first = -INT_MAX; - } - } - - result.pt = max_ts/(sum_ts + 1e-10); - result.ptsum = sum_ts; - } - - // find the top K tokens - const int top_k = 4; - - std::partial_sort( - probs_id.begin(), - probs_id.begin() + top_k, probs_id.end(), - [](const std::pair<double, int64_t> & a, const std::pair<double, int64_t> & b) { - return a.first > b.first; - }); - - probs_id.resize(top_k); - - int res = 0; - while ((probs_id[res].second == vocab_.token_sot || - probs_id[res].second == vocab_.token_solm || - probs_id[res].second == vocab_.token_beg) && - res < (int) probs_id.size() - 1) { - res++; - } - - result.id = probs_id[res].second; - result.p = probs_id[res].first; - - return result; -} - -void -ModelProcessor::filterLogits(std::vector<float>& logits, int offset) -{ - // Remove all no speech tokens - for (const auto idx : vocab_.noSpeechTokens) { - logits[idx] = (float)-INT_MAX; - } -} - -void -ModelProcessor::filterLanguageLogits(std::vector<float>& logits) -{ - // Leave only the language tokens - for (size_t i = 0; i < logits.size(); i++) { - if (vocab_.languageId2Tokens[i].empty()) - logits[i] = (float)(-INT_MAX); - } -} - -whisperTokenData -ModelProcessor::getToken(std::vector<float>& logits) -{ - Ort::RunOptions runOption; - std::vector<Ort::Value> logSoftMaxInputs; - logSoftMaxInputs.emplace_back(Ort::Value::CreateTensor<float>(allocatorInfo_, - logits.data(), - logits.size(), - logitsShape_.data(), - logitsShape_.size())); - - auto softmaxOutputs = logSoftMaxSession_->Run(runOption, - logSoftMaxInputNames.data(), - logSoftMaxInputs.data(), - logSoftMaxInputNames.size(), - logSoftMaxOutputNames.data(), - logSoftMaxOutputNames.size()); - - float* probs = softmaxOutputs[1].GetTensorMutableData<float>(); - return whisper_sample_best(probs); -} - -std::string -ModelProcessor::feedInput(std::vector<float>& melInput, const std::string& preferenceLanguage) -{ - std::lock_guard<std::mutex> l(mtx_); - if (!isAllocated_ || !logSoftMaxSession_ || !encoderSession_ || !decoderSession_) - return ""; - Ort::RunOptions runOption; - try { - Ort::Value melInputTensor = Ort::Value::CreateTensor<float>(allocatorInfo_, - melInput.data(), - melInput.size(), - melInputShape_.data(), - melInputShape_.size()); - audioFeaturesTensor_ = Ort::Value::CreateTensor<float>(allocatorInfo_, - audioFeatures_.data(), - audioFeatures_.size(), - audioFeaturesShape_.data(), - audioFeaturesShape_.size()); - // Run the encoder graph - encoderSession_->Run(runOption, - encoderInputNames, - &melInputTensor, - 1, - encoderOutputNames, - &audioFeaturesTensor_, - 1); - } catch(Ort::Exception e) { - Plog::log(Plog::LogPriority::ERR, TAG, e.what()); - return ""; - } catch (...) 
{ return ""; } - std::vector<float> currentTokensP {}; - - try { - auto isMultilingual = vocab_.is_multilingual(); - std::vector<int64_t> currentTokens {}; - currentTokens.emplace_back(vocab_.token_sot); - currentTokensP.emplace_back(1); - - std::array<int64_t, 1> offsetShape {1}; - - if (isMultilingual) { - if (preferenceLanguage == "auto" - || vocab_.languageTokens2Id.find(preferenceLanguage) == vocab_.languageTokens2Id.end()) { - std::vector<float> currentKVCache(MODELKVCACHESHAPE * 1 * currentTokens.size() * MODELFEATURESHAPE, 0.0f); - std::array<int64_t, 2> tokenShape {1, 1}; - int64_t offset = 0; - std::array<int64_t, 4> kvCacheShape { MODELKVCACHESHAPE, 1, 1, MODELFEATURESHAPE }; - - std::vector<int64_t> token = { currentTokens.back() }; - - // Run the decoder graph - std::vector<Ort::Value> inputsVector; // {audioFeaturesTensor_, tokensTensor_, kvCacheTensor_, offsetTensor_}; - inputsVector.emplace_back(Ort::Value::CreateTensor<float>(allocatorInfo_, - audioFeatures_.data(), - audioFeatures_.size(), - audioFeaturesShape_.data(), - audioFeaturesShape_.size())); - - inputsVector.emplace_back(Ort::Value::CreateTensor<int64_t>(allocatorInfo_, - token.data(), - token.size(), - tokenShape.data(), - tokenShape.size())); - - inputsVector.emplace_back(Ort::Value::CreateTensor<float>(allocatorInfo_, - currentKVCache.data(), - currentKVCache.size(), - kvCacheShape.data(), - kvCacheShape.size())); - - inputsVector.emplace_back(Ort::Value::CreateTensor<int64_t>(allocatorInfo_, - &offset, - 1, - offsetShape.data(), - 0)); - - auto outputs = decoderSession_->Run(runOption, - decoderInputNames.data(), - inputsVector.data(), - decoderInputNames.size(), - decoderOutputNames.data(), - decoderOutputNames.size()); - - auto logitsTensorInfo = outputs[0].GetTensorTypeAndShapeInfo(); - auto logitsData = outputs[0].GetTensorMutableData<float>(); - - { - std::vector<float>logits(logitsData, logitsData + logitsTensorInfo.GetElementCount()); - filterLanguageLogits(logits); - auto it = std::max_element(logits.begin(), logits.end()); - currentTokens.emplace_back(std::distance(logits.begin(), it)); - } - } else - currentTokens.emplace_back(vocab_.languageTokens2Id[preferenceLanguage]); - currentTokens.emplace_back(vocab_.token_transcribe); - currentTokensP.emplace_back(1); - currentTokensP.emplace_back(1); - } - - std::vector<float> currentKVCache(MODELKVCACHESHAPE * 1 * currentTokens.size() * MODELFEATURESHAPE, 0.0f); - std::array<int64_t, 2> tokenShape {1, (long)currentTokens.size()}; - - for (auto i = 0; i < sampleLen; i++) { - int64_t offset = isMultilingual ? ( i == 0 ? 
0 : i + 2 ) : i; - std::array<int64_t, 4> kvCacheShape { MODELKVCACHESHAPE, 1, static_cast<int64_t>(currentTokens.size()), MODELFEATURESHAPE }; - - std::vector<int64_t> token = { currentTokens.back() }; - if (i == 0) { - token = currentTokens; - tokenShape[1] = currentTokens.size(); - } else { - tokenShape[1] = 1; - } - - // Run the decoder graph - std::vector<Ort::Value> inputsVector; // {audioFeaturesTensor_, tokensTensor_, kvCacheTensor_, offsetTensor_}; - inputsVector.emplace_back(Ort::Value::CreateTensor<float>(allocatorInfo_, - audioFeatures_.data(), - audioFeatures_.size(), - audioFeaturesShape_.data(), - audioFeaturesShape_.size())); - - inputsVector.emplace_back(Ort::Value::CreateTensor<int64_t>(allocatorInfo_, - token.data(), - token.size(), - tokenShape.data(), - tokenShape.size())); - - inputsVector.emplace_back(Ort::Value::CreateTensor<float>(allocatorInfo_, - currentKVCache.data(), - currentKVCache.size(), - kvCacheShape.data(), - kvCacheShape.size())); - - inputsVector.emplace_back(Ort::Value::CreateTensor<int64_t>(allocatorInfo_, - &offset, - 1, - offsetShape.data(), - 0)); - - auto outputs = decoderSession_->Run(runOption, - decoderInputNames.data(), - inputsVector.data(), - decoderInputNames.size(), - decoderOutputNames.data(), - decoderOutputNames.size()); - - auto logitsTensorInfo = outputs[0].GetTensorTypeAndShapeInfo(); - auto logitsData = outputs[0].GetTensorMutableData<float>(); - - { - std::vector<float>logits(logitsData, logitsData + logitsTensorInfo.GetElementCount()); - if (isMultilingual && logits.size() > vocab_.n_vocab) { - std::vector<float>lastLogits; - lastLogits = std::vector<float>(logits.begin() + 2 * vocab_.n_vocab, logits.end()); - std::swap(lastLogits, logits); - } - - filterLogits(logits, offset); - - auto tokenData = getToken(logits); - currentTokens.emplace_back(tokenData.id); - currentTokensP.emplace_back(tokenData.p); - } - - // Grab kvCache for next iteration - auto kvCacheTensorInfo = outputs[1].GetTensorTypeAndShapeInfo(); - auto nextKVCacheData = outputs[1].GetTensorMutableData<float>(); - - std::vector<float> nextKVCache; - std::vector<float> zeros(MODELFEATURESHAPE, 0.0f); - int delta = (currentTokens.size() - 1) * MODELFEATURESHAPE; - for (int currentKVIdx = 0; currentKVIdx < MODELKVCACHESHAPE; currentKVIdx++) { - nextKVCache.insert(nextKVCache.end(), - nextKVCacheData + (currentKVIdx * delta), - nextKVCacheData + ((currentKVIdx + 1) * delta)); - nextKVCache.insert(nextKVCache.end(), zeros.begin(), zeros.end()); - } - std::swap(currentKVCache, nextKVCache); - - if (currentTokens.back() == vocab_.token_eot) - break; - } - - std::swap(currentTokens, tokensOutput_); - } catch(Ort::Exception e) { - Plog::log(Plog::LogPriority::ERR, TAG, e.what()); - return ""; - } catch (...) {} - - std::ostringstream oss; - std::ostringstream tokensStr; - auto idx = -1; - for (const auto& token : tokensOutput_) { - idx ++; - tokensStr << token << " " << currentTokensP[idx] << " "; - if (token >= vocab_.token_eot) - continue; - if (currentTokensP[idx] > -1.8) - oss << vocab_.id_to_token[token]; - } - - tokensOutput_.clear(); - return oss.str(); -} -} // namespace jami diff --git a/WhisperTranscript/ModelProcessor.h b/WhisperTranscript/ModelProcessor.h deleted file mode 100644 index e640c191ea3b011ed27057f5785d41e3bddc525b..0000000000000000000000000000000000000000 --- a/WhisperTranscript/ModelProcessor.h +++ /dev/null @@ -1,141 +0,0 @@ -/** - * Copyright (C) 2022 Savoir-faire Linux Inc. 
- * - * Author: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 - * USA. - */ - -#pragma once - -#include <map> -#include <vector> -#include <algorithm> -#include <set> -#include <mutex> - -#include <onnxruntime_cxx_api.h> -// #ifdef NVIDIA -// #include <cuda_provider_options.h> -// #endif -#ifdef __ANDROID__ -#include <nnapi_provider_factory.h> -#endif - -#include <functional> - -#include "Preprocess.h" - -namespace jami { - -// Use script getonnxio.py to grab model inputs and outputs -// names and shapes. -// Note: None is a open shape. If in the input, it will be defined by -// the data we want to use as input. As for open output, it is recommended -// to not try to pre allocate the tensor and use the model.run return. - -static const char* encoderInputNames[4] = {"mel"}; -static const char* encoderOutputNames[4] = {"617"}; - -#define MODELFEATURESHAPE 384 -#define MODELKVCACHESHAPE 8 - -#define MODELLOGITSHAPE 51865 // 51864 for english models - -static const std::vector<const char*> decoderInputNames = {"audio_features", "tokens", "kv_cache", "offset"}; -static const std::vector<const char*> decoderOutputNames = {"logits", "output_kv_cache"}; - -static const std::vector<const char *> logSoftMaxInputNames = {"logits"}; -static const std::vector<const char *> logSoftMaxOutputNames = {"token_ids", "probs"}; - -typedef struct whisperTokenData { - int64_t id; // token id - int64_t tid; // forced timestamp token id - - float p; // probability of the token - float pt; // probability of the timestamp token - float ptsum; // sum of probabilities of all timestamp tokens - - // token-level timestamp data - // do not use if you haven't computed token-level timestamps - int64_t t0; // start time of the token - int64_t t1; // end time of the token - - float vlen; // voice length of the token -} whisperTokenData; - -class ModelProcessor -{ -public: - ModelProcessor(const std::string& path, bool acc); - ~ModelProcessor(); - - void initModels(const std::string& encoderModelPath, const std::string& decoderModelPath, const std::string& logSoftMaxModelPath, bool activateAcc); - void endModels(); - - whisperTokenData whisper_sample_best(const float * probs); - - /** - * @brief feedInput - * Takes a input and feeds it to the model storage for predictions - * @param input - * @param preferenceLanguage - */ - std::string feedInput(std::vector<float>& input, const std::string& preferenceLanguage = "auto"); - - bool isAllocated() { return isAllocated_; } - -private: - // Tokens - whisperVocab vocab_; - - whisperTokenData getToken(std::vector<float>& logits); - void filterLogits(std::vector<float>& logits, int offset); - void filterLanguageLogits(std::vector<float>& logits); - - // onnx related - Ort::MemoryInfo allocatorInfo_ = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, 
OrtMemTypeDefault); - bool isAllocated_ {false}; - Ort::Env env_ {ORT_LOGGING_LEVEL_WARNING, "whisperTest"}; - Ort::Session* encoderSession_ {nullptr}; - Ort::Session* decoderSession_ {nullptr}; - Ort::Session* logSoftMaxSession_ {nullptr}; - Ort::SessionOptions sessOpt_; -#ifdef NVIDIA - const OrtApi& ortApi = Ort::GetApi(); - OrtCUDAProviderOptionsV2* cudaOptions_ = nullptr; -#endif - - // Encoder tensors. 1 input and 1 output - std::vector<int64_t> melInputShape_ {1, 80, 3000}; // Input Data Type: 1 (float), Input Shape: [1, 80, 3000] - Ort::Value audioFeaturesTensor_ {nullptr}; - std::vector<int64_t> audioFeaturesShape_ {1, 1500, MODELFEATURESHAPE}; // Output Data Type: 1 (float), Output Shape: [1, 1500, MODELFEATURESHAPE] - std::array<float, 1500 * MODELFEATURESHAPE> audioFeatures_ {}; - - std::vector<float> output_; - - // Decoder tensors. 4 inputs and 2 outputs - std::vector<int64_t> tokensOutput_ { }; - - // LogProb check - std::array<int64_t, 3> logitsShape_ {1, 1, MODELLOGITSHAPE}; - - int sampleLen = 100; - - std::mutex mtx_; - -}; -} // namespace jami diff --git a/WhisperTranscript/PluginPreferenceHandler.cpp b/WhisperTranscript/PluginPreferenceHandler.cpp index b1a63d0722fa70b933eadd9f405938e937d65167..e4d6ec06e215730ab85ce65dbe8597ffb604f9db 100644 --- a/WhisperTranscript/PluginPreferenceHandler.cpp +++ b/WhisperTranscript/PluginPreferenceHandler.cpp @@ -89,9 +89,7 @@ PluginPreferenceHandler::preferenceMapHasKey(const std::string& key) return (key == "background" || key == "position" || key == "fontsize" - || key == "language" - || key == "chunksize" - || key == "stepsize"); + || key == "language"); } std::string @@ -110,8 +108,7 @@ std::map<std::string, std::string> PluginPreferenceHandler::getPreferences(const std::string& accountId) { std::lock_guard<std::mutex> lk(mtx_); - auto preferences = preferences_.emplace(accountId, preferences_["default"]).first->second; - return preferences; + return preferences_.emplace(accountId, preferences_["default"]).first->second; } PluginPreferenceHandler::~PluginPreferenceHandler() diff --git a/WhisperTranscript/PluginPreferenceHandler.h b/WhisperTranscript/PluginPreferenceHandler.h index 1f2c774fc8edc53369ff1d60fb3f2b08fdf1723e..62d8b9482cbaff0afac2672d277f7a328af896f8 100644 --- a/WhisperTranscript/PluginPreferenceHandler.h +++ b/WhisperTranscript/PluginPreferenceHandler.h @@ -35,9 +35,7 @@ enum Parameter { POSITION, BACKGROUND, FONTSIZE, - LANGUAGE, - CHUNK, - STEP + LANGUAGE }; class TranscriptMediaHandler; diff --git a/WhisperTranscript/Preprocess.cpp b/WhisperTranscript/Preprocess.cpp deleted file mode 100644 index 70a29aaf6904e3476c0bc62aee70e9b10ae94238..0000000000000000000000000000000000000000 --- a/WhisperTranscript/Preprocess.cpp +++ /dev/null @@ -1,347 +0,0 @@ -/** - * Copyright (C) 2022 Savoir-faire Linux Inc. - * - * Author: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - */ - -#include "Preprocess.h" - -#ifdef WIN32 -#define _USE_MATH_DEFINES -#endif - -#include <thread> -#include <math.h> -#include <fstream> -#include <iostream> - -// ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L92-L124 -bool logMelSpectrogram( - const float *samples, - const int n_samples, - const int n_threads, - const whisperFilters &filters, - whisperMel &mel) { - - // const int sample_rate = WHISPER_SAMPLE_RATE; - const int fft_size = WHISPER_N_FFT; - const int fft_step = WHISPER_HOP_LENGTH; - const int n_mel = WHISPER_N_MEL; - - // Hanning window - std::vector<float> hann; - hann.resize(fft_size); - for (int i = 0; i < fft_size; i++) { - hann[i] = 0.5*(1.0 - cos((2.0*M_PI*i)/(fft_size))); - } - - mel.n_mel = n_mel; - mel.n_len = (n_samples)/fft_step; - mel.data.resize(mel.n_mel*mel.n_len); - - const int n_fft = 1 + fft_size/2; - - std::vector<std::thread> workers(n_threads); - for (int iw = 0; iw < n_threads; ++iw) { - workers[iw] = std::thread([&](int ith) { - std::vector<float> fft_in; - fft_in.resize(fft_size); - for (int i = 0; i < fft_size; i++) { - fft_in[i] = 0.0; - } - - std::vector<float> fft_out; - fft_out.resize(2*fft_size); - - for (int i = ith; i < mel.n_len; i += n_threads) { - const int offset = i*fft_step; - - // apply Hanning window - for (int j = 0; j < fft_size; j++) { - if (offset + j < n_samples) { - fft_in[j] = hann[j]*samples[offset + j]; - } else { - fft_in[j] = 0.0; - } - } - - // FFT -> mag^2 - fft(fft_in, fft_out); - - for (int j = 0; j < fft_size; j++) { - fft_out[j] = (fft_out[2*j + 0]*fft_out[2*j + 0] + fft_out[2*j + 1]*fft_out[2*j + 1]); - } - for (int j = 1; j < fft_size/2; j++) { - fft_out[j] += fft_out[fft_size - j]; - } - - // mel spectrogram - for (int j = 0; j < mel.n_mel; j++) { - double sum = 0.0; - - for (int k = 0; k < n_fft; k++) { - sum += fft_out[k]*filters.data[j*n_fft + k]; - } - if (sum < 1e-10) { - sum = 1e-10; - } - - sum = log10(sum); - - mel.data[j*mel.n_len + i] = sum; - } - } - }, iw); - } - - for (int iw = 0; iw < n_threads; ++iw) { - workers[iw].join(); - } - - // clamping and normalization - double mmax = -1e20; - for (int i = 0; i < mel.n_mel*mel.n_len; i++) { - if (mel.data[i] > mmax) { - mmax = mel.data[i]; - } - } - - mmax -= 8.0; - - for (int i = 0; i < mel.n_mel*mel.n_len; i++) { - if (mel.data[i] < mmax) { - mel.data[i] = mmax; - } - - mel.data[i] = (mel.data[i] + 4.0)/4.0; - } - - return true; -} - -// Cooley-Tukey FFT -// poor man's implementation - use something better -// input is real-valued -// output is complex-valued -void fft(const std::vector<float> & in, std::vector<float> & out) { - out.resize(in.size()*2); - - int N = in.size(); - - if (N == 1) { - out[0] = in[0]; - out[1] = 0; - return; - } - - if (N%2 == 1) { - dft(in, out); - return; - } - - std::vector<float> even; - std::vector<float> odd; - - for (int i = 0; i < N; i++) { - if (i % 2 == 0) { - even.emplace_back(in[i]); - } else { - odd.emplace_back(in[i]); - } - } - - std::vector<float> even_fft; - std::vector<float> odd_fft; - - fft(even, even_fft); - fft(odd, odd_fft); - - for (int k = 0; k < N/2; k++) { - float theta = 2*M_PI*k/N; - - float re = cos(theta); - float im = -sin(theta); - - float re_odd = odd_fft[2*k + 0]; - float im_odd = odd_fft[2*k + 1]; - - out[2*k + 0] = even_fft[2*k + 0] + re*re_odd - 
im*im_odd; - out[2*k + 1] = even_fft[2*k + 1] + re*im_odd + im*re_odd; - - out[2*(k + N/2) + 0] = even_fft[2*k + 0] - re*re_odd + im*im_odd; - out[2*(k + N/2) + 1] = even_fft[2*k + 1] - re*im_odd - im*re_odd; - } -} - -// naive Discrete Fourier Transform -// input is real-valued -// output is complex-valued -void dft(const std::vector<float> & in, std::vector<float> & out) { - int N = in.size(); - - out.resize(N*2); - - for (int k = 0; k < N; k++) { - float re = 0; - float im = 0; - - for (int n = 0; n < N; n++) { - float angle = 2*M_PI*k*n/N; - re += in[n]*cos(angle); - im -= in[n]*sin(angle); - } - - out[k*2 + 0] = re; - out[k*2 + 1] = im; - } -} - - -void loadMelFilters(const std::string& fileName, whisperFilters& filters) { - auto fin = std::ifstream(fileName, std::ios::binary); - if (!fin) { - fprintf(stderr, "%s: failed to open '%s'\n", __func__, fileName.c_str()); - return; - } - - fin.read((char *) &filters.n_mel, sizeof(filters.n_mel)); - fin.read((char *) &filters.n_fft, sizeof(filters.n_fft)); - - filters.data.resize(filters.n_mel * filters.n_fft); - fin.read((char *) filters.data.data(), filters.data.size() * sizeof(float)); -} - -void loadTokens(const std::string& fileName, whisperVocab& vocab) { - auto fin = std::ifstream(fileName, std::ios::binary); - if (!fin) { - fprintf(stderr, "%s: failed to open '%s'\n", __func__, fileName.c_str()); - return; - } - - int32_t modelNVocab = 0; - fin.read((char *) &modelNVocab, sizeof(modelNVocab)); - - int32_t tokensNVocab = 0; - fin.read((char *) &tokensNVocab, sizeof(tokensNVocab)); - - std::string word; - for (int i = 0; i < tokensNVocab; i++) { - uint32_t len; - fin.read((char *) &len, sizeof(len)); - - word.resize(len); - fin.read((char *) word.data(), len); - - vocab.token_to_id[word] = i; - vocab.id_to_token[i] = word; - } - - vocab.n_vocab = modelNVocab; - if (vocab.is_multilingual()) { - vocab.token_eot++; - vocab.token_sot++; - vocab.token_prev++; - vocab.token_solm++; - vocab.token_not++; - vocab.token_beg++; - } - - if (tokensNVocab < modelNVocab) { - // Read language tokens - { - int32_t languageTokensLen = 0; - fin.read((char *) &languageTokensLen, sizeof(languageTokensLen)); - - std::string word; - for (int i = 0; i < languageTokensLen; i++) { - int32_t id = 0; - fin.read((char *) &id, sizeof(id)); - uint32_t len; - fin.read((char *) &len, sizeof(len)); - - word.resize(len); - fin.read((char *) word.data(), len); - - vocab.token_to_id[word] = id; - vocab.id_to_token[id] = word; - vocab.languageId2Tokens.insert({id, word}); - vocab.languageTokens2Id.insert({word, id}); - } - } - - fprintf(stderr, "%s: adding %d extra tokens\n", __func__, modelNVocab - tokensNVocab); - for (int i = tokensNVocab; i < modelNVocab; i++) { - if (!vocab.id_to_token[i].empty()) - continue; - if (i > vocab.token_beg) { - word = "[_TT_" + std::to_string(i - vocab.token_beg) + "]"; - } else if (i == vocab.token_eot) { - word = "[_EOT_]"; - } else if (i == vocab.token_sot) { - word = "[_SOT_]"; - } else if (i == vocab.token_prev) { - word = "[_PREV_]"; - } else if (i == vocab.token_not) { - word = "[_NOT_]"; - } else if (i == vocab.token_beg) { - word = "[_BEG_]"; - } else { - word = "[_extra_token_" + std::to_string(i) + "]"; - } - vocab.token_to_id[word] = i; - vocab.id_to_token[i] = word; - } - } - - // Read no speech tokens - { - int32_t noSpeechTokensLen = 0; - fin.read((char *) &noSpeechTokensLen, sizeof(noSpeechTokensLen)); - - for (int i = 0; i < noSpeechTokensLen; i++) { - uint32_t id; - fin.read((char *) &id, sizeof(id)); - - 
vocab.noSpeechTokens.insert(id); - } - } -} - -void -inputPadTrim(whisperMel &mel) -{ - if (mel.n_len == ENCODER_INPUT_LEN) - return; - std::vector<float> data; - std::vector<float> partialData; - int seek = 0; - auto dataLimit = std::min(mel.n_len, ENCODER_INPUT_LEN); - for (auto j = 0; j < mel.n_mel; j++) { - seek = j * mel.n_len; - for (auto i = seek; i < (j + 1) * dataLimit; i++) { - partialData.emplace_back(mel.data[i]); - } - if (mel.n_len < ENCODER_INPUT_LEN) { - for (auto i = mel.n_len; i < ENCODER_INPUT_LEN; i++) { - partialData.emplace_back(0.0f); - } - } - data.insert(data.end(), partialData.begin(), partialData.end()); - partialData.clear(); - } - std::swap(mel.data, data); -} diff --git a/WhisperTranscript/Preprocess.h b/WhisperTranscript/Preprocess.h deleted file mode 100644 index 5138321a12210605d14d971e3a26d29fa44f2adb..0000000000000000000000000000000000000000 --- a/WhisperTranscript/Preprocess.h +++ /dev/null @@ -1,94 +0,0 @@ -/** - * Copyright (C) 2022 Savoir-faire Linux Inc. - * - * Author: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
- */ - -#pragma once - -#include <vector> -#include <cstdint> -#include <string> -#include <map> -#include <set> - - -// Those are model defined -// Check paper page 3 (https://cdn.openai.com/papers/whisper.pdf) -#define WHISPER_SAMPLE_RATE 16000 -#define WHISPER_N_FFT 400 -#define WHISPER_N_MEL 80 -#define WHISPER_HOP_LENGTH 160 -#define WHISPER_CHUNK_SIZE 30 -#define ENCODER_INPUT_LEN 3000 - -struct whisperMel { - int n_len; - int n_mel; - - std::vector<float> data; -}; - -struct whisperFilters { - int32_t n_mel; - int32_t n_fft; - - std::vector<float> data; -}; - -struct whisperVocab { - size_t n_vocab = 51864; - - std::map<std::string, int32_t> token_to_id; - std::map<int32_t, std::string> id_to_token; - - int32_t token_eot = 50256; - int32_t token_sot = 50257; - int32_t token_prev = 50360; - int32_t token_solm = 50361; // no speech - int32_t token_not = 50362; // no timestamps - int32_t token_beg = 50363; // timestamp begin - - // available tasks - const int32_t token_translate = 50358; - const int32_t token_transcribe = 50359; - - bool is_multilingual() const { - return n_vocab == 51865; - } - - std::map<std::string, int32_t> languageTokens2Id; - std::map<int32_t, std::string> languageId2Tokens; - std::set<int32_t> noSpeechTokens; -}; - -bool logMelSpectrogram( - const float * samples, - const int n_samples, - const int n_threads, - const whisperFilters & filters, - whisperMel &mel); - -void fft(const std::vector<float> & in, std::vector<float> & out); - -void dft(const std::vector<float> & in, std::vector<float> & out); - -void loadMelFilters(const std::string& fileName, whisperFilters& filters); - -void loadTokens(const std::string& fileName, whisperVocab& vocab); - -void inputPadTrim(whisperMel &mel); diff --git a/WhisperTranscript/TranscriptAudioSubscriber.cpp b/WhisperTranscript/TranscriptAudioSubscriber.cpp index cd866b4a69870c8366b1ce112b3c917c76a79e4d..11a5adf54b18139ca57317168a7656f05e96ad11 100644 --- a/WhisperTranscript/TranscriptAudioSubscriber.cpp +++ b/WhisperTranscript/TranscriptAudioSubscriber.cpp @@ -24,27 +24,27 @@ #include <frameUtils.h> #include <bitset> #include <iostream> +#include <fmt/core.h> +#include <fmt/format.h> -const std::string TAG = "Transcript"; +#include "stt_whisper.h" + +const std::string TAG = "TranscriptAudio"; const char sep = separator(); namespace jami { -TranscriptAudioSubscriber::TranscriptAudioSubscriber(const std::string& dataPath, TranscriptVideoSubscriber* videoSubscriber, bool acc) +TranscriptAudioSubscriber::TranscriptAudioSubscriber(const std::string& dataPath, + TranscriptVideoSubscriber* videoSubscriber) : path_ {dataPath} - , modelProcessor_ {dataPath, acc} , mVS_ {videoSubscriber} { - loadMelFilters(path_ + "/assets/mel_filters.bin", modelFilters_); + Plog::log(Plog::LogPriority::INFO, TAG, fmt::format("TranscriptAudioSubscriber {}", fmt::ptr(this))); } TranscriptAudioSubscriber::~TranscriptAudioSubscriber() { - modelProcessor_.endModels(); - formatFilter_.clean(); - stop(); - processFrameThread.join(); - Plog::log(Plog::LogPriority::INFO, TAG, "~TranscriptMediaProcessor"); + Plog::log(Plog::LogPriority::INFO, TAG, fmt::format("~TranscriptAudioSubscriber {}", fmt::ptr(this))); } /** @@ -53,83 +53,84 @@ TranscriptAudioSubscriber::~TranscriptAudioSubscriber() void TranscriptAudioSubscriber::processFrame() { + if (!whisper_) { + whisper_ = std::make_unique<RealtimeSttWhisper>(path_ + "/assets/ggml-base.bin"); + whisper_->setLanguage(language_); + } + while (running) { - auto data = modelInput_[modelIdx_]; - if (data.size() <= 
WHISPER_STREAM_SAMPLES_CHUNK - WHISPER_STREAM_SAMPLES_CHUNK_STEP) { - std::this_thread::sleep_for(std::chrono::milliseconds(waitingPoint_)); - continue; - } - if (!running) - break; - melSpectrogram_.data.clear(); - melSpectrogram_.n_len = 0; - melSpectrogram_.n_mel = 0; - logMelSpectrogram(data.data(), - data.size(), - 8, - modelFilters_, - melSpectrogram_); - inputPadTrim(melSpectrogram_); - - auto text = modelProcessor_.feedInput(melSpectrogram_.data, language_); - if (text.empty()) { + decltype(frames_) frames; + { std::unique_lock<std::mutex> l(inputLock); - modelInput_[0].clear(); - modelInput_[1].clear(); - modelIdx_ = 0; + cv_.wait(l, [&]{ + return !running || !frames_.empty(); + }); + if (!running) + return; + frames = std::move(frames_); + } + + for (auto& f : frames) { + uniqueFramePtr filteredFrame = getUniqueFrame(); + filteredFrame->sample_rate = WHISPER_SAMPLE_RATE; + filteredFrame->format = AV_SAMPLE_FMT_FLT; + av_channel_layout_from_mask(&filteredFrame->ch_layout , AV_CH_LAYOUT_MONO); + try { + if (resampler_.resample(f.get(), filteredFrame.get()) == 0) { + whisper_->AddAudioData((float*) filteredFrame->buf[0]->data, + filteredFrame->nb_samples); + } + } catch (...) { + } + } + + auto result = whisper_->GetTranscribed(); + if (not result.empty()) { + std::string txt; + for (const auto& t : result) { + if (not t.is_partial) + txt += t.text; + } + if (!txt.empty()) + mVS_->setText(txt); } - mVS_->setText(text); } + whisper_.reset(); } void TranscriptAudioSubscriber::stop() { - running = false; + Plog::log(Plog::LogPriority::INFO, TAG, "stop()"); + { + std::unique_lock<std::mutex> l(inputLock); + running = false; + cv_.notify_all(); + } if (processFrameThread.joinable()) { processFrameThread.join(); } - std::string str = ""; - mVS_->setText(str); + mVS_->setText(""); } void TranscriptAudioSubscriber::start() { + Plog::log(Plog::LogPriority::INFO, TAG, "start()"); running = true; - processFrameThread = std::thread([this] { processFrame(); }); + processFrameThread = std::thread([this](){ processFrame(); }); + mVS_->setText(""); } void -TranscriptAudioSubscriber::setParameter(std::string& parameter, Parameter type) +TranscriptAudioSubscriber::setParameter(const std::string& parameter, Parameter type) { std::unique_lock<std::mutex> l(inputLock); - std::string str = ""; switch (type) { case (Parameter::LANGUAGE): language_ = parameter; - modelInput_[0].clear(); - modelInput_[1].clear(); - modelIdx_ = 0; - mVS_->setText(str); - break; - case (Parameter::CHUNK): - WHISPER_STREAM_SAMPLES_CHUNK = 16000 * std::stoi(parameter); - modelInput_[0].resize(0); - modelInput_[1].resize(0); - modelInput_[0].reserve(WHISPER_STREAM_SAMPLES_CHUNK); - modelInput_[1].reserve(WHISPER_STREAM_SAMPLES_CHUNK); - waitingPoint_ = (std::stoi(parameter) * 1000 - (WHISPER_STREAM_SAMPLES_CHUNK_STEP / 16)) / 3; - modelIdx_ = 0; - mVS_->setText(str); - break; - case (Parameter::STEP): - modelInput_[0].clear(); - modelInput_[1].clear(); - WHISPER_STREAM_SAMPLES_CHUNK_STEP = 16000 * std::stoi(parameter); - waitingPoint_ = ((WHISPER_STREAM_SAMPLES_CHUNK / 16) - std::stoi(parameter) * 1000) / 3; - modelIdx_ = 0; - mVS_->setText(str); + if (whisper_) + whisper_->setLanguage(parameter); break; default: return; @@ -140,77 +141,30 @@ void TranscriptAudioSubscriber::update(jami::Observable<AVFrame*>* obs, AVFrame* const& pluginFrame) { std::unique_lock<std::mutex> l(inputLock); - if (!pluginFrame || modelFilters_.data.empty() || obs != observable_) + if (!pluginFrame || obs != observable_) return; - if (firstRun) { - 
samplesCount_ = 0; - currentModelInput_.clear(); - futureModelInput_.clear(); - formatFilter_.clean(); - AudioFormat afmt = AudioFormat(pluginFrame->sample_rate, - pluginFrame->channels, - static_cast<AVSampleFormat>(pluginFrame->format)); - MediaStream ms = MediaStream("input", afmt); - formatFilter_.initialize(filterDescription_, {ms}); - firstRun = false; - } - - if (!formatFilter_.initialized_) - return; - - if (formatFilter_.feedInput(pluginFrame, "input") == 0) { - uniqueFramePtr filteredFrame = {formatFilter_.readOutput(), frameFree}; - if (filteredFrame) { - for (size_t i = 0; i < filteredFrame->buf[0]->size; i += 2) { -#ifdef __DEBUG__ - std::lock_guard<std::mutex> l(inputLock); -#endif - int16_t rawValue = (filteredFrame->buf[0]->data[i+1] << 8) | filteredFrame->buf[0]->data[i]; - - // If not a positive value, perform the 2's complement math on the value - if ((rawValue & 0x8000) != 0) { - rawValue = (~(rawValue - 0x0001)) * -1; - } - if (futureModelInput_.size() == WHISPER_STREAM_SAMPLES_CHUNK) - futureModelInput_.erase(futureModelInput_.begin()); - futureModelInput_.emplace_back(float(rawValue)/32768.0f); - samplesCount_++; - - auto value = float(rawValue) / 32768.0f; - if (modelInput_[modelIdx_].size() >= WHISPER_STREAM_SAMPLES_CHUNK) { - modelInput_[modelIdx_].clear(); - modelIdx_ = modelIdx_ ? 0 : 1; - } - modelInput_[modelIdx_].emplace_back(value); - if (modelInput_[modelIdx_].size() - >= WHISPER_STREAM_SAMPLES_CHUNK - WHISPER_STREAM_SAMPLES_CHUNK_STEP) { - modelInput_[modelIdx_ ? 0 : 1].emplace_back(value); - } - } - } - } + frames_.emplace_back(uniqueFramePtr(av_frame_clone(pluginFrame), frameFree)); + cv_.notify_all(); // audio returns as is } void TranscriptAudioSubscriber::attached(jami::Observable<AVFrame*>* observable) { - Plog::log(Plog::LogPriority::INFO, TAG, "::Attached ! "); + std::unique_lock<std::mutex> l(inputLock); + Plog::log(Plog::LogPriority::INFO, TAG, fmt::format("::Attached ! {} for {}", fmt::ptr(this), fmt::ptr(observable))); observable_ = observable; start(); } void -TranscriptAudioSubscriber::detached(jami::Observable<AVFrame*>*) +TranscriptAudioSubscriber::detached(jami::Observable<AVFrame*>* observable) { firstRun = true; observable_ = nullptr; stop(); - modelInput_[0].clear(); - modelInput_[1].clear(); - modelIdx_ = 0; - Plog::log(Plog::LogPriority::INFO, TAG, "::Detached()"); + Plog::log(Plog::LogPriority::INFO, TAG, fmt::format("::Detached ! 
{} for {}", fmt::ptr(this), fmt::ptr(observable))); } void @@ -218,7 +172,6 @@ TranscriptAudioSubscriber::detach() { if (observable_) { firstRun = true; - std::ostringstream oss; Plog::log(Plog::LogPriority::INFO, TAG, "::Calling detach()"); observable_->detach(this); } diff --git a/WhisperTranscript/TranscriptAudioSubscriber.h b/WhisperTranscript/TranscriptAudioSubscriber.h index d32cb0059bb2ec249a69ddb9f68fe101f12a9190..3e970e646fee37b4ca5cf1bc169b4d7551303f4b 100644 --- a/WhisperTranscript/TranscriptAudioSubscriber.h +++ b/WhisperTranscript/TranscriptAudioSubscriber.h @@ -26,21 +26,25 @@ extern "C" { #include <observer.h> #include <frameFilter.h> - -#include "Preprocess.h" -#include "ModelProcessor.h" +#include <frameUtils.h> #include "TranscriptVideoSubscriber.h" #include "PluginPreferenceHandler.h" +#include "resampler.h" #include <thread> #include <condition_variable> +#include <deque> +#include <atomic> + +class RealtimeSttWhisper; namespace jami { class TranscriptAudioSubscriber : public Observer<AVFrame*> { public: - TranscriptAudioSubscriber(const std::string& dataPath, TranscriptVideoSubscriber* videoSubscriber, bool acc = false); + TranscriptAudioSubscriber(const std::string& dataPath, + TranscriptVideoSubscriber* videoSubscriber); ~TranscriptAudioSubscriber(); virtual void update(Observable<AVFrame*>*, AVFrame* const&) override; @@ -49,22 +53,11 @@ public: void detach(); - void setParameter(std::string& parameter, Parameter type); + void setParameter(const std::string& parameter, Parameter type); private: - // Mel spectrogram filters - whisperFilters modelFilters_; - whisperMel melSpectrogram_; - // Observer pattern - Observable<AVFrame*>* observable_{}; - - // Filter for audio formatting - const std::string filterDescription_ = "[input]aresample=16000,aformat=sample_fmts=s16:channel_layouts=mono"; - FrameFilter formatFilter_; - std::array<std::vector<float>, 2> modelInput_ {}; - int modelIdx_ {0}; - int waitingPoint_ {1000}; + Observable<AVFrame*>* observable_ {}; std::string language_ {"auto"}; // Data @@ -72,12 +65,15 @@ private: // Status variables of the processing bool firstRun {true}; - bool running {false}; + std::atomic_bool running {false}; std::mutex inputLock; + std::condition_variable cv_; // Model - ModelProcessor modelProcessor_; + std::unique_ptr<RealtimeSttWhisper> whisper_; + Resampler resampler_; + std::vector<uniqueFramePtr> frames_; // Threading std::thread processFrameThread; @@ -87,9 +83,5 @@ private: // Video processor TranscriptVideoSubscriber* mVS_ {}; - - - size_t WHISPER_STREAM_SAMPLES_CHUNK = 16000 * 15; // 16 KHz * 15 seconds - size_t WHISPER_STREAM_SAMPLES_CHUNK_STEP = 16000 * 3; // 16 KHz * 3 seconds }; } // namespace jami diff --git a/WhisperTranscript/TranscriptMediaHandler.cpp b/WhisperTranscript/TranscriptMediaHandler.cpp index e33134da85e813aa3894f617791d4ccc2330f710..7457bb5ea149a275861270f8b8529593d8f3d3f0 100644 --- a/WhisperTranscript/TranscriptMediaHandler.cpp +++ b/WhisperTranscript/TranscriptMediaHandler.cpp @@ -36,13 +36,11 @@ TranscriptMediaHandler::TranscriptMediaHandler(std::string&& datapath, PluginPre aph_ = prefHandler; setId(datapath_); auto preferences = aph_->getPreferences("default"); - auto it = preferences.find("acceleration"); - auto useAcceleration = it == preferences.end() ? 
false : it->second == "1"; videoSubscriber_ = std::make_shared<TranscriptVideoSubscriber>(datapath_); - audioSubscriber_ = std::make_shared<TranscriptAudioSubscriber>(datapath_, videoSubscriber_.get(), useAcceleration); + audioSubscriber_ = std::make_shared<TranscriptAudioSubscriber>(datapath_, videoSubscriber_.get()); setParameters("default"); #ifdef __DEBUG__ - it = preferences.find("subtitle"); + auto it = preferences.find("subtitle"); if (it != preferences.end()) videoSubscriber_->setText(it->second); #endif @@ -102,8 +100,6 @@ TranscriptMediaHandler::setParameters(const std::string& accountId) videoSubscriber_->setParameter(preferences["background"], Parameter::BACKGROUND); videoSubscriber_->setParameter(preferences["position"], Parameter::POSITION); audioSubscriber_->setParameter(preferences["language"], Parameter::LANGUAGE); - audioSubscriber_->setParameter(preferences["chunksize"], Parameter::CHUNK); - audioSubscriber_->setParameter(preferences["stepsize"], Parameter::STEP); } catch (std::exception& e) { Plog::log(Plog::LogPriority::ERR, TAG, e.what()); } @@ -129,9 +125,7 @@ TranscriptMediaHandler::detach() TranscriptMediaHandler::~TranscriptMediaHandler() { - std::ostringstream oss; - oss << " ~TranscriptMediaHandler from WhisperTranscript Plugin" << std::endl; - Plog::log(Plog::LogPriority::INFO, TAG, oss.str()); + Plog::log(Plog::LogPriority::INFO, TAG, "~TranscriptMediaHandler from WhisperTranscript Plugin"); detach(); } } // namespace jami diff --git a/WhisperTranscript/TranscriptVideoSubscriber.cpp b/WhisperTranscript/TranscriptVideoSubscriber.cpp index 353519071a2e223986dabcc0a7d90a095d0137d9..57635d802e637382b54aff0057c9bf59a471d6db 100644 --- a/WhisperTranscript/TranscriptVideoSubscriber.cpp +++ b/WhisperTranscript/TranscriptVideoSubscriber.cpp @@ -35,8 +35,11 @@ extern "C" { #include <fmt/format.h> #include <bitset> +#include <string_view> -const std::string TAG = "Transcript"; +using namespace std::literals; + +const std::string TAG = "TranscriptVideo"; const char sep = separator(); namespace jami { @@ -54,9 +57,10 @@ TranscriptVideoSubscriber::~TranscriptVideoSubscriber() } void -TranscriptVideoSubscriber::setText(std::string& text) +TranscriptVideoSubscriber::setText(const std::string& t) { - text = string_utils::ffmpegScapeString(text); + Plog::log(Plog::LogPriority::INFO, TAG, "setText " + t); + auto text = string_utils::ffmpegScapeString(t); std::vector<std::string> textWords = string_utils::getWords(text, " "); subtitle_ = ""; @@ -101,9 +105,28 @@ TranscriptVideoSubscriber::setParameter(std::string& parameter, Parameter type) firstRun = true; } +std::string_view getTransposeDescr(int rotation) +{ + switch (rotation) { + case 90: + case -270: + return "transpose=2,"sv; + case 180: + case -180: + return "transpose=1, transpose=1,"sv; + case 270: + case -90: + return "transpose=1,"sv; + default: + return {}; + } + return {}; +} + void TranscriptVideoSubscriber::setFilterDescription() { + Plog::log(Plog::LogPriority::INFO, TAG, "setFilterDescription() " + subtitle_); if (pluginFrameSize_.first == 0 || pluginFrameSize_.second == 0) return; @@ -119,35 +142,26 @@ TranscriptVideoSubscriber::setFilterDescription() point_ = {pluginFrameSize_.first - margin, pluginFrameSize_.second - margin}; } - std::string rotateSides = ""; - - if (std::abs(angle_) == 90) - rotateSides = ":out_w=ih:out_h=iw"; - auto baseInfosDescription - = fmt::format("[input]rotate={}{}" - ",drawtext=fontcolor={}:fontsize={}:fontfile=\\'{}\\':expansion=none:text='{}" + = fmt::format("[input]{}" + 
"drawtext=fontcolor={}:fontsize={}:fontfile=\\'{}\\':expansion=none:text='{}" "':line_spacing=5:box=1:boxcolor={}:boxborderw=5:x=", - rotation[angle_], rotateSides, + getTransposeDescr(angle_), fontColor_, fontSize_, fontFile_, subtitle_, fontBackground_); - auto position = "{}-text_w:y={}"; + auto position = "{}-text_w:y={}"sv; if (position_ == "2") - position = "{}:y={}"; + position = "{}:y={}"sv; else if (position_ == "3") - position = "{}:y={}-text_h"; + position = "{}:y={}-text_h"sv; else if (position_ == "4") - position = "{}-text_w:y={}-text_h"; - baseInfosDescription = baseInfosDescription + position + ",rotate={}{},format=yuv420p"; - filterDescription_ = fmt::format(baseInfosDescription, - std::to_string(point_.first), - std::to_string(point_.second), - rotation[-angle_], - rotateSides); + position = "{}-text_w:y={}-text_h"sv; + filterDescription_ = baseInfosDescription + fmt::format(std::string(position) + ",{}format=yuv420p"s, + point_.first, + point_.second, + getTransposeDescr(-angle_)); -#ifdef __DEBUG__ Plog::log(Plog::LogPriority::INFO, TAG, filterDescription_); -#endif } void @@ -156,9 +170,8 @@ TranscriptVideoSubscriber::update(jami::Observable<AVFrame*>*, AVFrame* const& p if (!observable_ || !pluginFrame || subtitle_.empty()) return; - AVFrameSideData* side_data = av_frame_get_side_data(pluginFrame, AV_FRAME_DATA_DISPLAYMATRIX); int newAngle {0}; - if (side_data) { + if (AVFrameSideData* side_data = av_frame_get_side_data(pluginFrame, AV_FRAME_DATA_DISPLAYMATRIX)) { auto matrix_rotation = reinterpret_cast<int32_t*>(side_data->data); newAngle = static_cast<int>(av_display_rotation_get(matrix_rotation)); } @@ -170,12 +183,17 @@ TranscriptVideoSubscriber::update(jami::Observable<AVFrame*>*, AVFrame* const& p //====================================================================================== // GET RAW FRAME uniqueFramePtr rgbFrame = {transferToMainMemory(pluginFrame, AV_PIX_FMT_NV12), frameFree}; - rgbFrame.reset(FrameScaler::convertFormat(rgbFrame.get(), AV_PIX_FMT_YUV420P)); + if (!rgbFrame.get()) + return; + if ((AVPixelFormat)rgbFrame->format != AV_PIX_FMT_YUV420P) + rgbFrame.reset(FrameScaler::convertFormat(rgbFrame.get(), AV_PIX_FMT_YUV420P)); if (!rgbFrame.get()) return; if (sourceTimeBase_.num != pluginFrame->time_base.num || sourceTimeBase_.den != pluginFrame->time_base.den) firstRun = true; + if (rgbFrame->width != pluginFrameSize_.first || rgbFrame->height != pluginFrameSize_.second) + firstRun = true; rgbFrame->pts = pluginFrame->pts; rgbFrame->time_base = pluginFrame->time_base; @@ -184,8 +202,6 @@ TranscriptVideoSubscriber::update(jami::Observable<AVFrame*>*, AVFrame* const& p if (firstRun) { filter_.clean(); pluginFrameSize_ = {rgbFrame->width, rgbFrame->height}; - if (std::abs(angle_) == 90) - pluginFrameSize_ = {rgbFrame->height, rgbFrame->width}; setFilterDescription(); rational<int> fr(sourceTimeBase_.den, sourceTimeBase_.num); diff --git a/WhisperTranscript/TranscriptVideoSubscriber.h b/WhisperTranscript/TranscriptVideoSubscriber.h index b809f99748f20f165af6237ab6d2919e995d0375..499568e8291fe3787b40a85a221e648f0bcad384 100644 --- a/WhisperTranscript/TranscriptVideoSubscriber.h +++ b/WhisperTranscript/TranscriptVideoSubscriber.h @@ -43,7 +43,7 @@ public: void detach(); - void setText(std::string& text); + void setText(const std::string& text); void setFilterDescription(); void setParameter(std::string& parameter, Parameter type); diff --git a/WhisperTranscript/build.sh b/WhisperTranscript/build.sh index 
32f1155b12f49668e4efe78a077238c34ad1cf77..2ca33b2380337ab565736aceaa8090f75015fa5d 100755 --- a/WhisperTranscript/build.sh +++ b/WhisperTranscript/build.sh @@ -12,18 +12,21 @@ EXTRAPATH='' # -d: debug program. if [ -z "${DAEMON}" ]; then - DAEMON="./../../daemon" - echo "DAEMON not provided, building with ${DAEMON}" + echo "DAEMON not provided, building with ./../../daemon" fi +DAEMON=${DAEMON:="./../../daemon"} +CONTRIB_PATH=${CONTRIB_PATH:="${DAEMON}/contrib"} +CONTRIB_BUILD_DIR=${CONTRIB_BUILD_DIR:="native"} + PLUGIN_NAME="WhisperTranscript" JPL_FILE_NAME="${PLUGIN_NAME}.jpl" SO_FILE_NAME="lib${PLUGIN_NAME}.so" DAEMON_SRC="${DAEMON}/src" -CONTRIB_PATH="${DAEMON}/contrib" PLUGINS_LIB="../lib" LIBS_DIR="./../contrib/Libs" PLATFORM=$(uname) +CONTRIB_BUILD_PATH="${CONTRIB_PATH}/${CONTRIB_BUILD_DIR}" if [ "${PLATFORM}" = "Linux" ]; then PLATFORM="linux-gnu" @@ -41,19 +44,19 @@ fi while getopts t:c:p:d OPT; do case "$OPT" in d) - DEBUG=true - export __DEBUG__=true + DEBUG=true + export __DEBUG__=true ;; t) - PLATFORM="${OPTARG}" + PLATFORM="${OPTARG}" ;; c) - PROCESSOR="${OPTARG}" + PROCESSOR="${OPTARG}" ;; p) ;; \?) - exit 1 + exit 1 ;; esac done @@ -74,39 +77,57 @@ fi echo $PROCESSOR cp -r ffmpeg ${CONTRIB_PATH}/src/ +cp -r whispercpp ${CONTRIB_PATH}/src/ cp -r ../contrib/rav1e ${CONTRIB_PATH}/src/ +if [ ! -f "./data/assets/ggml-base.bin" ]; then + if [ -x "$(command -v wget)" ]; then + wget --quiet --show-progress -O ./data/assets/ggml-base.bin https://ggml.ggerganov.com/ggml-model-whisper-base.bin + elif [ -x "$(command -v curl)" ]; then + curl --output ./data/assets/ggml-base.bin https://ggml.ggerganov.com/ggml-model-whisper-base.bin + else + printf "Either wget or curl is required to download models.\n" + exit 1 + fi +fi + +if [ ! -f "./data/assets/ggml-base.bin" ]; then + printf "Model is required to build the plugin. 
Aborting.\n" + exit 1 +fi + if [ "${PLATFORM}" = "linux-gnu" ] || [ "${PLATFORM}" = "redhat-linux" ] then - if [ -f "${CONTRIB_PATH}/native/.ffmpeg" ]; then - rm "${CONTRIB_PATH}/native/.ffmpeg" + if [ -f "${CONTRIB_BUILD_PATH}/.ffmpeg" ]; then + rm "${CONTRIB_BUILD_PATH}/.ffmpeg" + rm -rf "${CONTRIB_BUILD_PATH}/ffmpeg" + fi + if [ -f "${CONTRIB_BUILD_PATH}/.whispercpp" ]; then + rm "${CONTRIB_BUILD_PATH}/.whispercpp" + rm -rf "${CONTRIB_BUILD_PATH}/whispercpp" fi WORKPATH=$(pwd) - cd "${CONTRIB_PATH}/native/" - make .ffmpeg -j$(nproc) + cd "${CONTRIB_BUILD_PATH}/" + make .ffmpeg -j$(nproc) install + make .whispercpp -j$(nproc) install + rm .whispercpp rm .ffmpeg cd ${WORKPATH} CONTRIB_PLATFORM=${CONTRIB_PLATFORM_CURT}-${PLATFORM} - ONNX_PATH=${EXTRALIBS_PATH} - if [ -z "${EXTRALIBS_PATH}" ] - then - ONNX_PATH="${CONTRIB_PATH}/${CONTRIB_PLATFORM}" - fi - if [ ${DEBUG} ]; then - OUTPUT="${PLUGIN_NAME}" - CLANG_OPTS="-g -fsanitize=address" - EXTRA_DEBUG_LIBRARIES="-lyaml-cpp -lvdpau -lX11 -lva-drm -lva-x11 -lrav1e" - EXTRA_DEFINES="-D__DEBUG__" + OUTPUT="${PLUGIN_NAME}" + CLANG_OPTS="-O0 -g -fsanitize=address" + EXTRA_DEBUG_LIBRARIES="-lyaml-cpp -lvdpau -lX11 -lva-drm -lva-x11 -lrav1e" + EXTRA_DEFINES="-D__DEBUG__" else - python3 ./../SDK/jplManipulation.py --preassemble --plugin=${PLUGIN_NAME} - CLANG_OPTS="-O3 -shared" - OUTPUT="build-local/jpl/lib/${CONTRIB_PLATFORM}/${SO_FILE_NAME}" + python3 ./../SDK/jplManipulation.py --preassemble --plugin=${PLUGIN_NAME} + CLANG_OPTS="-O3 -g -shared" + OUTPUT="build-local/jpl/lib/${CONTRIB_PLATFORM}/${SO_FILE_NAME}" fi # Compile - clang++ -std=c++17 -g -O0 -fPIC ${CLANG_OPTS} \ + clang++ -std=c++17 -fPIC ${CLANG_OPTS} \ -Wl,-Bsymbolic,-rpath,"\${ORIGIN}" \ -Wall -Wextra \ -Wno-unused-parameter \ @@ -115,30 +136,26 @@ then -I"." 
\ -I"${DAEMON_SRC}" \ -I"${CONTRIB_PATH}/${CONTRIB_PLATFORM}/include" \ - -I"${ONNX_PATH}/include/onnxruntime/session" \ - -I"${ONNX_PATH}/include/onnxruntime/providers/cuda" \ - -I"${CONTRIB_PATH}/native/onnx/onnxruntime" \ -I"${PLUGINS_LIB}" \ ./../lib/common.cpp \ ./../lib/accel.cpp \ ./../lib/frameFilter.cpp \ ./../lib/frameUtils.cpp \ + ./../lib/resampler.cpp \ main.cpp \ TranscriptMediaHandler.cpp \ TranscriptAudioSubscriber.cpp \ TranscriptVideoSubscriber.cpp \ PluginPreferenceHandler.cpp \ - Preprocess.cpp \ - ModelProcessor.cpp \ - -L"${CONTRIB_PATH}/${CONTRIB_PLATFORM}/lib/" \ - -L"${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}" \ - -L"${CUDA_HOME}/lib64/" \ + stt_whisper.cpp \ + -L"${CONTRIB_PATH}/${CONTRIB_PLATFORM}/lib" \ -l:libavfilter.a \ -l:libswscale.a \ -l:libswresample.a \ -l:libavformat.a \ -l:libavcodec.a \ -l:libavutil.a \ + -l:libwhisper.a \ -lfreetype \ -lvpx \ -lx264 \ @@ -147,57 +164,36 @@ then -lz \ -lva \ -lfmt \ - -lonnxruntime \ ${EXTRA_DEBUG_LIBRARIES} \ -o "${OUTPUT}" - if [ ${DEBUG} ]; then - cp "./modelSRC/mModelEncoder.onnx" "./data/assets/mModelEncoder.onnx" - cp "./modelSRC/mModelDecoder.onnx" "./data/assets/mModelDecoder.onnx" - cp "./modelSRC/mLogSoftMax.onnx" "./data/assets/mLogSoftMax.onnx" - cp "${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}/libonnxruntime.so" "libonnxruntime.so.1.12.0" - else - cp "./modelSRC/mModelEncoder.onnx" "./build-local/jpl/data/assets/mModelEncoder.onnx" - cp "./modelSRC/mModelDecoder.onnx" "./build-local/jpl/data/assets/mModelDecoder.onnx" - cp "./modelSRC/mLogSoftMax.onnx" "./build-local/jpl/data/assets/mLogSoftMax.onnx" - cp "${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}/libonnxruntime.so" "build-local/jpl/lib/$CONTRIB_PLATFORM/libonnxruntime.so.1.12.0" - fi - if [ "${PROCESSOR}" = "NVIDIA" ]; then - if [ ${DEBUG} ]; then - cp "${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}/libonnxruntime_providers_shared.so" "libonnxruntime_providers_shared.so" - cp "${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}/libonnxruntime_providers_cuda.so" "libonnxruntime_providers_cuda.so" - else - cp "${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}/libonnxruntime_providers_shared.so" "build-local/jpl/lib/$CONTRIB_PLATFORM/libonnxruntime_providers_shared.so" - cp "${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}/libonnxruntime_providers_cuda.so" "build-local/jpl/lib/$CONTRIB_PLATFORM/libonnxruntime_providers_cuda.so" - fi - fi - elif [ "${PLATFORM}" = "darwin" ] then - if [ -f "${CONTRIB_PATH}/native/.ffmpeg" ]; then - rm "${CONTRIB_PATH}/native/.ffmpeg" + if [ -f "${CONTRIB_BUILD_PATH}/.ffmpeg" ]; then + rm "${CONTRIB_BUILD_PATH}/.ffmpeg" + rm -rf "${CONTRIB_BUILD_PATH}/ffmpeg" + fi + if [ -f "${CONTRIB_BUILD_PATH}/.whispercpp" ]; then + rm "${CONTRIB_BUILD_PATH}/.whispercpp" + rm -rf "${CONTRIB_BUILD_PATH}/whispercpp" fi WORKPATH=$(pwd) - cd "${CONTRIB_PATH}/native/" - make .ffmpeg -j$(nproc) + cd "${CONTRIB_BUILD_PATH}/" + make .whispercpp + make .ffmpeg + rm .whispercpp rm .ffmpeg cd ${WORKPATH} CONTRIB_PLATFORM=${CONTRIB_PLATFORM_CURT}-${PLATFORM} - ONNX_PATH=${EXTRALIBS_PATH} - if [ -z "${EXTRALIBS_PATH}" ] - then - ONNX_PATH="${CONTRIB_PATH}/${CONTRIB_PLATFORM}${CONTRIB_PLATFORM_EXTRA}" - fi - if [ ${DEBUG} ]; then OUTPUT="${PLUGIN_NAME}" - CLANG_OPTS="-g -fsanitize=address" + CLANG_OPTS="-O0 -g -fsanitize=address" EXTRA_DEBUG_LIBRARIES="-lyaml-cpp -lrav1e" EXTRA_DEFINES="-D__DEBUG__" else python3 ./../SDK/jplManipulation.py --preassemble --plugin=${PLUGIN_NAME} - CLANG_OPTS="-O3 -shared" + CLANG_OPTS="-O3 -g -shared" 
OUTPUT="build-local/jpl/lib/${CONTRIB_PLATFORM}/${SO_FILE_NAME}" fi @@ -215,21 +211,19 @@ then -I"." \ -I"${DAEMON_SRC}" \ -I"${CONTRIB_PATH}/${CONTRIB_PLATFORM}${CONTRIB_PLATFORM_EXTRA}/include" \ - -I"${ONNX_PATH}/include/onnxruntime/session" \ -I"${PLUGINS_LIB}" \ ./../lib/common.cpp \ ./../lib/accel.cpp \ ./../lib/frameFilter.cpp \ ./../lib/frameUtils.cpp \ + ./../lib/resampler.cpp \ main.cpp \ TranscriptMediaHandler.cpp \ TranscriptAudioSubscriber.cpp \ TranscriptVideoSubscriber.cpp \ PluginPreferenceHandler.cpp \ - Preprocess.cpp \ - ModelProcessor.cpp \ + stt_whisper.cpp \ -L"${CONTRIB_PATH}/${CONTRIB_PLATFORM}${CONTRIB_PLATFORM_EXTRA}/lib/" \ - -L"${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}" \ -lavfilter \ -lswscale \ -lswresample \ @@ -237,44 +231,22 @@ then -lavcodec \ -lavutil \ -lvpx -lx264 -lbz2 -liconv -lz \ - -lonnxruntime \ - -lspeex \ - -lopus \ + "${CONTRIB_PATH}/${CONTRIB_PLATFORM}${CONTRIB_PLATFORM_EXTRA}/lib/libspeex.a" \ + "${CONTRIB_PATH}/${CONTRIB_PLATFORM}${CONTRIB_PLATFORM_EXTRA}/lib/libopus.a" \ -lfmt \ + -lwhisper \ "/usr/local/opt/libpng/lib/libpng.a" \ "/usr/local/opt/freetype/lib/libfreetype.a" \ ${EXTRA_DEBUG_LIBRARIES} \ -o "${OUTPUT}" - if [ ${DEBUG} ]; then - cp "${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}/libonnxruntime.dylib" "libonnxruntime.dylib" - cp "./modelSRC/mModelEncoder.onnx" "./data/assets/mModelEncoder.onnx" - cp "./modelSRC/mModelDecoder.onnx" "./data/assets/mModelDecoder.onnx" - cp "./modelSRC/mLogSoftMax.onnx" "./data/assets/mLogSoftMax.onnx" - install_name_tool -id "@loader_path/libonnxruntime.1.12.0.dylib" "libonnxruntime.dylib" - install_name_tool -id "@loader_path/${PLUGIN_NAME}" "${OUTPUT}" - else - cp "./modelSRC/mModelEncoder.onnx" "./build-local/jpl/data/assets/mModelEncoder.onnx" - cp "./modelSRC/mModelDecoder.onnx" "./build-local/jpl/data/assets/mModelDecoder.onnx" - cp "./modelSRC/mLogSoftMax.onnx" "./build-local/jpl/data/assets/mLogSoftMax.onnx" - cp "${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}/libonnxruntime.dylib" "build-local/jpl/lib/${CONTRIB_PLATFORM}/libonnxruntime.dylib" - install_name_tool -id "@loader_path/libonnxruntime.1.12.0.dylib" "build-local/jpl/lib/${CONTRIB_PLATFORM}/libonnxruntime.dylib" - install_name_tool -id "@loader_path/${SO_FILE_NAME}" "${OUTPUT}" - fi - install_name_tool -change "@rpath/libonnxruntime.1.12.0.dylib" "@loader_path/libonnxruntime.dylib" "${OUTPUT}" - if [ -n "${APPLE_SIGN_CERTIFICATE}" ]; then - codesign --force --verify --timestamp -o runtime --sign "${APPLE_SIGN_CERTIFICATE}" "build-local/jpl/lib/${CONTRIB_PLATFORM}/libonnxruntime.dylib" codesign --force --verify --timestamp -o runtime --sign "${APPLE_SIGN_CERTIFICATE}" "build-local/jpl/lib/${CONTRIB_PLATFORM}/${SO_FILE_NAME}" - ditto -c -k --rsrc "build-local/jpl/lib/${CONTRIB_PLATFORM}/libonnxruntime.dylib" "build-local/libonnxruntime.dylib.zip" - LIBRARYNAME=libonnxruntime.dylib sh ./../notarize.sh - ditto -x -k "build-local/libonnxruntime.dylib.zip" "build-local/notarized0" - cp "build-local/notarized0/libonnxruntime.dylib" "build-local/jpl/lib/${CONTRIB_PLATFORM}/libonnxruntime.dylib" ditto -c -k --rsrc "build-local/jpl/lib/${CONTRIB_PLATFORM}/${SO_FILE_NAME}" "build-local/${SO_FILE_NAME}.zip" LIBRARYNAME=${SO_FILE_NAME} sh ./../notarize.sh - ditto -x -k "build-local/${SO_FILE_NAME}.zip" "build-local/notarized1" - cp "build-local/notarized1/${SO_FILE_NAME}" "build-local/jpl/lib/${CONTRIB_PLATFORM}/${SO_FILE_NAME}" + ditto -x -k "build-local/${SO_FILE_NAME}.zip" "build-local/notarized" + cp "build-local/notarized/${SO_FILE_NAME}" 
"build-local/jpl/lib/${CONTRIB_PLATFORM}/${SO_FILE_NAME}" fi elif [ "${PLATFORM}" = "android" ] @@ -346,13 +318,15 @@ then CONTRIB_PLATFORM=x86_64-linux-android fi - if [ -f "${CONTRIB_PATH}/native-${CONTRIB_PLATFORM}/.ffmpeg" ]; then - rm "${CONTRIB_PATH}/native-${CONTRIB_PLATFORM}/.ffmpeg" + if [ -f "${CONTRIB_BUILD_PATH}-${CONTRIB_PLATFORM}/.ffmpeg" ]; then + rm "${CONTRIB_BUILD_PATH}-${CONTRIB_PLATFORM}/.ffmpeg" fi WORKPATH=$(pwd) - cd "${CONTRIB_PATH}/native-${CONTRIB_PLATFORM}/" + cd "${CONTRIB_BUILD_PATH}-${CONTRIB_PLATFORM}/" make .ffmpeg -j$(nproc) + make .whispercpp -j$(nproc) + rm .whispercpp rm .ffmpeg cd ${WORKPATH} @@ -360,12 +334,6 @@ then # Compile the plugin #========================================================= - ONNX_PATH="${EXTRALIBS_PATH}/${CURRENT_ABI}" - if [ -z ${EXTRALIBS_PATH} ] - then - ONNX_PATH="${CONTRIB_PATH}/${CONTRIB_PLATFORM}" - fi - # Create so destination folder $CXX --std=c++17 -O3 -fPIC \ -Wl,-Bsymbolic,-rpath,"\${ORIGIN}" \ @@ -376,10 +344,6 @@ then -I"." \ -I"${DAEMON_SRC}" \ -I"${CONTRIB_PATH}/${CONTRIB_PLATFORM}/include" \ - -I"${ONNX_PATH}/include/onnxruntime/session" \ - -I"${ONNX_PATH}/include/onnxruntime/providers/nnapi" \ - -I"${ONNX_PATH}/../include/onnxruntime/session" \ - -I"${ONNX_PATH}/../include/onnxruntime/providers/nnapi" \ -I"${PLUGINS_LIB}" \ ./../lib/common.cpp \ ./../lib/accel.cpp \ @@ -390,10 +354,9 @@ then TranscriptAudioSubscriber.cpp \ TranscriptVideoSubscriber.cpp \ PluginPreferenceHandler.cpp \ - Preprocess.cpp \ - ModelProcessor.cpp \ + stt_whisper.cpp \ + ./../lib/resampler.cpp \ -L"${CONTRIB_PATH}/${CONTRIB_PLATFORM}/lib/" \ - -L"${ONNX_PATH}/lib/" \ -lavfilter \ -lswscale \ -lswresample \ @@ -405,13 +368,11 @@ then -lspeex \ -lopus \ -lfmt \ + -lwhisper \ -l:libfreetype.a \ -llog -lz \ - -lonnxruntime \ --sysroot=$ANDROID_SYSROOT \ -o "build-local/jpl/lib/$CURRENT_ABI/${SO_FILE_NAME}" - - cp "${ONNX_PATH}/lib/libonnxruntime.so" "build-local/jpl/lib/${CURRENT_ABI}/libonnxruntime.so" } # Build the so @@ -419,14 +380,10 @@ then CURRENT_ABI=$i buildlib done - - cp "./modelSRC/mModelEncoder.ort" "./build-local/jpl/data/assets/mModelEncoder.ort" - cp "./modelSRC/mModelDecoder.ort" "./build-local/jpl/data/assets/mModelDecoder.ort" - cp "./modelSRC/mLogSoftMax.ort" "./build-local/jpl/data/assets/mLogSoftMax.ort" fi if [ ! 
${DEBUG} ]; then - python3 ./../SDK/jplManipulation.py --assemble --plugin=${PLUGIN_NAME} --distribution=${PLATFORM} --extraPath=${EXTRAPATH} +python3 ./../SDK/jplManipulation.py --assemble --plugin=${PLUGIN_NAME} --distribution=${PLATFORM} --extraPath=${EXTRAPATH} fi cd ${CONTRIB_PATH}/src/ffmpeg/ diff --git a/WhisperTranscript/data/accountpreferences.json b/WhisperTranscript/data/accountpreferences.json index 04b18e459a33742a90a72db05830f5d9535213a8..a7bbcded57ed9c0e293bdb8c7433199aeac26643 100644 --- a/WhisperTranscript/data/accountpreferences.json +++ b/WhisperTranscript/data/accountpreferences.json @@ -3,7 +3,7 @@ "type": "List", "key": "language", "title": "{{language_title}}", - "defaultValue": "en", + "defaultValue": "auto", "scope": "plugin,Transcript", "entryValues": [ "auto", @@ -210,6 +210,84 @@ "{{language_yo}}" ] }, + { + "type": "List", + "key": "background", + "title": "{{background_title}}", + "summary": "{{background_summary}}", + "defaultValue": "black", + "scope": "plugin,Transcript", + "entryValues": [ + "black", + "white" + ], + "entries": [ + "{{background_entries_1}}", + "{{background_entries_2}}" + ] + }, + { + "type": "List", + "key": "position", + "title": "{{position_title}}", + "defaultValue": "2", + "scope": "plugin,Transcript", + "entryValues": [ + "1", + "2", + "3", + "4" + ], + "entries": [ + "{{position_entries_1}}", + "{{position_entries_2}}", + "{{position_entries_3}}", + "{{position_entries_4}}" + ] + }, + { + "type": "List", + "key": "fontsize", + "title": "{{fontsize_title}}", + "defaultValue": "14", + "scope": "plugin,Transcript", + "entryValues": [ + "10", + "12", + "14", + "16", + "18", + "24", + "36", + "72" + ], + "entries": [ + "10", + "12", + "14", + "16", + "18", + "24", + "36", + "72" + ] + }, + { + "type": "List", + "key": "avstream", + "title": "{{avstream_title}}", + "summary": "{{avstream_summary}}", + "defaultValue": "in", + "scope": "plugin", + "entryValues": [ + "out", + "in" + ], + "entries": [ + "{{avstream_entries_1}}", + "{{avstream_entries_2}}" + ] + }, { "type": "Switch", "key": "TranscriptAlways", diff --git a/WhisperTranscript/data/assets/.gitignore b/WhisperTranscript/data/assets/.gitignore index e1a699ac37f449b6c2b99d720554c9122433b2de..42cbf4c9c0bb524a484e6fa5d936cb14d5582535 100644 --- a/WhisperTranscript/data/assets/.gitignore +++ b/WhisperTranscript/data/assets/.gitignore @@ -1 +1,2 @@ *.onnx +*.bin diff --git a/WhisperTranscript/data/assets/mel_filters.bin b/WhisperTranscript/data/assets/mel_filters.bin deleted file mode 100644 index 9e3c32b7b856f37c60392d2023be21d7e4d76022..0000000000000000000000000000000000000000 Binary files a/WhisperTranscript/data/assets/mel_filters.bin and /dev/null differ diff --git a/WhisperTranscript/data/assets/tokenizer.bin b/WhisperTranscript/data/assets/tokenizer.bin deleted file mode 100644 index aa1c457305d25d603161d77b0ad348255f5664e1..0000000000000000000000000000000000000000 Binary files a/WhisperTranscript/data/assets/tokenizer.bin and /dev/null differ diff --git a/WhisperTranscript/data/locale/WhisperTranscript_en.json b/WhisperTranscript/data/locale/WhisperTranscript_en.json index 69bc1cfda143e0f22ed55638253e72cd5e4f1e67..4577ef665470bc0036ff57a4ec7843c7d3136044 100644 --- a/WhisperTranscript/data/locale/WhisperTranscript_en.json +++ b/WhisperTranscript/data/locale/WhisperTranscript_en.json @@ -5,11 +5,10 @@ "avstream_entries_2": "Received", "TranscriptAlways_title": "Automatically activate transcription", "TranscriptAlways_summary": "Activate transcription when a call starts.", - 
"background_title": "Add background color", - "background_summary": "Add a partial transparency to the subtitle background if it isn't visible enough", - "background_entries_1": "None", - "background_entries_2": "Black", - "background_entries_3": "White", + "background_title": "Background color", + "background_summary": "Defines the subtitle background color", + "background_entries_1": "Black", + "background_entries_2": "White", "position_title": "Transcription position", "position_entries_1": "Top right", "position_entries_2": "Top left", @@ -116,9 +115,5 @@ "language_vi": "Vietnamese", "language_cy": "Welsh", "language_yi": "Yiddish", - "language_yo": "Yoruba", - "acceleration_title": "Use hardware acceleration", - "acceleration_summary": "Use CUDA or NNAPI where applicable", - "chunk_title": "Chunk size in seconds", - "step_title": "Step size in seconds" + "language_yo": "Yoruba" } \ No newline at end of file diff --git a/WhisperTranscript/data/preferences.json b/WhisperTranscript/data/preferences.json index e803e211495a979010569a72d653f0016c890301..0637a088a01e8ddab3bf3fa98dbe804cbde1a0dc 100644 --- a/WhisperTranscript/data/preferences.json +++ b/WhisperTranscript/data/preferences.json @@ -1,224 +1 @@ -[ - { - "type": "List", - "key": "background", - "title": "{{background_title}}", - "summary": "{{background_summary}}", - "defaultValue": "black@0.0", - "scope": "plugin,Transcript", - "entryValues": [ - "black@0.0", - "black@0.5", - "white@0.5" - ], - "entries": [ - "{{background_entries_1}}", - "{{background_entries_2}}", - "{{background_entries_3}}" - ] - }, - { - "type": "List", - "key": "position", - "title": "{{position_title}}", - "defaultValue": "2", - "scope": "plugin,Transcript", - "entryValues": [ - "1", - "2", - "3", - "4" - ], - "entries": [ - "{{position_entries_1}}", - "{{position_entries_2}}", - "{{position_entries_3}}", - "{{position_entries_4}}" - ] - }, - { - "type": "List", - "key": "fontsize", - "title": "{{fontsize_title}}", - "defaultValue": "14", - "scope": "plugin,Transcript", - "entryValues": [ - "10", - "12", - "14", - "16", - "18", - "24", - "36", - "72" - ], - "entries": [ - "10", - "12", - "14", - "16", - "18", - "24", - "36", - "72" - ] - }, - { - "type": "List", - "key": "avstream", - "title": "{{avstream_title}}", - "summary": "{{avstream_summary}}", - "defaultValue": "in", - "scope": "plugin", - "entryValues": [ - "out", - "in" - ], - "entries": [ - "{{avstream_entries_1}}", - "{{avstream_entries_2}}" - ] - }, - { - "type": "List", - "key": "chunksize", - "title": "{{chunk_title}}", - "defaultValue": "15", - "scope": "plugin,Transcript", - "entryValues": [ - "5", - "6", - "7", - "8", - "9", - "10", - "11", - "12", - "13", - "14", - "15", - "16", - "17", - "18", - "19", - "20", - "21", - "22", - "23", - "24", - "25", - "26", - "27", - "28", - "29", - "30" - ], - "entries": [ - "5", - "6", - "7", - "8", - "9", - "10", - "11", - "12", - "13", - "14", - "15", - "16", - "17", - "18", - "19", - "20", - "21", - "22", - "23", - "24", - "25", - "26", - "27", - "28", - "29", - "30" - ] - }, - { - "type": "List", - "key": "stepsize", - "title": "{{step_title}}", - "defaultValue": "3", - "scope": "plugin,Transcript", - "entryValues": [ - "1", - "2", - "3", - "4", - "5", - "6", - "7", - "8", - "9", - "10", - "11", - "12", - "13", - "14", - "15", - "16", - "17", - "18", - "19", - "20", - "21", - "22", - "23", - "24", - "25", - "26", - "27", - "28", - "29", - "30" - ], - "entries": [ - "1", - "2", - "3", - "4", - "5", - "6", - "7", - "8", - "9", - "10", - "11", - 
"12", - "13", - "14", - "15", - "16", - "17", - "18", - "19", - "20", - "21", - "22", - "23", - "24", - "25", - "26", - "27", - "28", - "29", - "30" - ] - }, - { - "type": "Switch", - "key": "acceleration", - "title": "{{acceleration_title}}", - "summary": "{{acceleration_summary}}", - "defaultValue": "1", - "scope": "plugin" - } -] \ No newline at end of file +[] \ No newline at end of file diff --git a/WhisperTranscript/ffmpeg/package.json b/WhisperTranscript/ffmpeg/package.json index fa3d2779f7527d102cf56d9846b3b8c8993200b9..428d0e7489517aa934badd98a60dce69a161fdc3 100644 --- a/WhisperTranscript/ffmpeg/package.json +++ b/WhisperTranscript/ffmpeg/package.json @@ -1,6 +1,6 @@ { "name": "ffmpeg", - "version": "n5.0", + "version": "n6.0", "url": "https://git.ffmpeg.org/gitweb/ffmpeg.git/snapshot/__VERSION__.tar.gz", "deps": [ "freetype", @@ -16,13 +16,12 @@ "libopusenc-reload-packet-loss-at-encode.patch", "libopusdec-enable-FEC.patch", "windows-configure.patch", - "windows-configure-ffnvcodec.patch", - "windows-configure-libmfx.patch" + "windows-configure-ffnvcodec.patch" ], "win_patches": [ ], "project_paths": [], - "with_env" : "10.0.16299.0", + "with_env" : "", "custom_scripts": { "pre_build": [], "build": [ diff --git a/WhisperTranscript/ffmpeg/rules.mak b/WhisperTranscript/ffmpeg/rules.mak index 7b5c55447f9757018014a913e3d25328b8577f46..24296dfdf3722d5a32885f0889cdbb152accef13 100644 --- a/WhisperTranscript/ffmpeg/rules.mak +++ b/WhisperTranscript/ffmpeg/rules.mak @@ -1,4 +1,4 @@ -FFMPEG_HASH := n5.0 +FFMPEG_HASH := n6.0 FFMPEG_URL := https://git.ffmpeg.org/gitweb/ffmpeg.git/snapshot/$(FFMPEG_HASH).tar.gz PKGS+=ffmpeg @@ -86,16 +86,10 @@ FFMPEGCONF += \ --enable-parser=mpeg4video \ --enable-parser=vp8 \ --enable-parser=vp9 \ - --enable-parser=opus \ - --enable-parser=w64 \ - --enable-parser=wav + --enable-parser=opus #encoders/decoders FFMPEGCONF += \ - --enable-encoder=w64 \ - --enable-encoder=wav \ - --enable-decoder=w64 \ - --enable-decoder=wav \ --enable-encoder=adpcm_g722 \ --enable-decoder=adpcm_g722 \ --enable-encoder=rawvideo \ @@ -344,7 +338,7 @@ $(TARBALLS)/ffmpeg-$(FFMPEG_HASH).tar.gz: ffmpeg: ffmpeg-$(FFMPEG_HASH).tar.gz rm -Rf $@ $@-$(FFMPEG_HASH) mkdir -p $@-$(FFMPEG_HASH) - (cd $@-$(FFMPEG_HASH) && tar x $(if ${BATCH_MODE},,-v) --strip-components=1 -f ../$<) + (cd $@-$(FFMPEG_HASH) && tar x $(if ${BATCH_MODE},,-v) --strip-components=1 -f $<) $(APPLY) $(SRC)/ffmpeg/remove-mjpeg-log.patch $(APPLY) $(SRC)/ffmpeg/change-RTCP-ratio.patch $(APPLY) $(SRC)/ffmpeg/rtp_ext_abs_send_time.patch @@ -352,6 +346,7 @@ ffmpeg: ffmpeg-$(FFMPEG_HASH).tar.gz $(APPLY) $(SRC)/ffmpeg/libopusenc-reload-packet-loss-at-encode.patch $(APPLY) $(SRC)/ffmpeg/ios-disable-b-frames.patch $(APPLY) $(SRC)/ffmpeg/screen-sharing-x11-fix.patch + $(APPLY) $(SRC)/ffmpeg/nvenc-fix-reorderqueueflush-crash.patch $(UPDATE_AUTOCONFIG) $(MOVE) diff --git a/WhisperTranscript/package.json b/WhisperTranscript/package.json index e0e95adbe6eeeb700fb94b3b1d772ad4299f1a81..71ac7733340637d5af44a7289becbe39270b48a1 100644 --- a/WhisperTranscript/package.json +++ b/WhisperTranscript/package.json @@ -4,7 +4,6 @@ "extractLibs": false, "deps": [], "defines": [ - "NVIDIA=False", "TESTPROCESS=False" ], "custom_scripts": { diff --git a/WhisperTranscript/stt_whisper.cpp b/WhisperTranscript/stt_whisper.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3df602afce51ae8edbe1a2ae2a578353d7428ef7 --- /dev/null +++ b/WhisperTranscript/stt_whisper.cpp @@ -0,0 +1,266 @@ +#include "stt_whisper.h" +#include "whisper.h" + 
+#ifdef WIN32
+#define _USE_MATH_DEFINES
+#include <math.h>
+#endif
+
+#include <atomic>
+#include <cmath>
+#include <mutex>
+#include <string>
+#include <thread>
+#include <vector>
+
+void print_array(const std::vector<float>& data)
+{
+    fprintf(stdout, "print array: [");
+    for (int i = 0; i < std::min((int)data.size(), 10); i++) {
+        fprintf(stdout, " %.8f,", data[i]);
+    }
+    fprintf(stdout, " ]\n");
+}
+
+void high_pass_filter(std::vector<float>& data, float cutoff, float sample_rate)
+{
+    const float rc = 1.0f / (2.0f * M_PI * cutoff);
+    const float dt = 1.0f / sample_rate;
+    const float alpha = dt / (rc + dt);
+
+    float y = data[0];
+
+    for (size_t i = 1; i < data.size(); i++) {
+        y = alpha * (y + data[i] - data[i - 1]);
+        data[i] = y;
+    }
+}
+
+/** Check if speech is ending. */
+bool vad_simple(std::vector<float>& pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose)
+{
+    const int n_samples = pcmf32.size();
+    const int n_samples_last = (sample_rate * last_ms) / 1000;
+
+    if (n_samples_last >= n_samples) {
+        // not enough samples - assume no speech
+        return false;
+    }
+
+    if (freq_thold > 0.0f) {
+        high_pass_filter(pcmf32, freq_thold, sample_rate);
+    }
+
+    float energy_all = 0.0f;
+    float energy_last = 0.0f;
+
+    for (int i = 0; i < n_samples; i++) {
+        energy_all += fabsf(pcmf32[i]);
+
+        if (i >= n_samples - n_samples_last) {
+            energy_last += fabsf(pcmf32[i]);
+        }
+    }
+
+    energy_all /= n_samples;
+    energy_last /= n_samples_last;
+
+    if (verbose) {
+        fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
+    }
+
+    if ((energy_all < 0.0001f && energy_last < 0.0001f) || energy_last > vad_thold * energy_all) {
+        return false;
+    }
+
+    return true;
+}
+
+RealtimeSttWhisper::RealtimeSttWhisper(const std::string& path_model)
+{
+    ctx = whisper_init_from_file(path_model.c_str());
+    is_running = true;
+    worker = std::thread(&RealtimeSttWhisper::Run, this);
+    t_last_iter = std::chrono::high_resolution_clock::now();
+}
+
+RealtimeSttWhisper::~RealtimeSttWhisper()
+{
+    is_running = false;
+    if (worker.joinable())
+        worker.join();
+    whisper_free(ctx);
+}
+
+/** Add audio data in PCM f32 format. */
+void RealtimeSttWhisper::AddAudioData(const float* data, size_t n_samples)
+{
+    std::lock_guard<std::mutex> lock(s_mutex);
+    // printf("AddAudioData: remaining: %d, new: %d\n", (int)s_queued_pcmf32.size(), (int)n_samples);
+    s_queued_pcmf32.insert(s_queued_pcmf32.end(), data, data + n_samples);
+}
+
+/** Get newly transcribed text. */
+std::vector<transcribed_msg> RealtimeSttWhisper::GetTranscribed()
+{
+    std::vector<transcribed_msg> transcribed;
+    std::lock_guard<std::mutex> lock(s_mutex);
+    transcribed = std::move(s_transcribed_msgs);
+    s_transcribed_msgs.clear();
+    return transcribed;
+}
+
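+// Illustrative numbers for vad_simple() as used by Run() below: with
+// vad_thold = 0.3, if the mean absolute sample over the whole 3s window is
+// 0.010 but only 0.002 over the last 500ms, then 0.002 <= 0.3 * 0.010, so
+// the function returns true and the current iteration is finalized.
+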
+/** Run Whisper in its own thread so it does not block the main thread. */
+void RealtimeSttWhisper::Run()
+{
+    struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);
+
+    // See here for example https://github.com/ggerganov/whisper.cpp/blob/master/examples/stream/stream.cpp#L302
+    wparams.n_threads = 4;
+    wparams.no_context = true;
+    wparams.single_segment = true;
+    wparams.print_progress = false;
+    wparams.print_realtime = false;
+    wparams.print_special = false;
+    wparams.print_timestamps = false;
+    wparams.max_tokens = 64;
+    wparams.translate = false;
+
+    /**
+     * Experimental optimization: reduce audio_ctx to 15s (half of the 30s
+     * chunk size Whisper is designed for) to speed up roughly 2x.
+     * https://github.com/ggerganov/whisper.cpp/issues/137#issuecomment-1318412267
+     */
+    wparams.audio_ctx = 768;
+
+    /* When more than this amount of audio is received, run an iteration. */
+    const int trigger_ms = 400;
+    const int n_samples_trigger = (trigger_ms / 1000.0) * WHISPER_SAMPLE_RATE;
+    /**
+     * When more than this amount of audio accumulates in the audio buffer,
+     * force-finalize the current audio context and clear the buffer. Note
+     * that VAD may finalize an iteration earlier.
+     */
+    // This is recommended to be smaller than the time wparams.audio_ctx
+    // represents so an iteration can fit in one chunk.
+    const int iter_threshold_ms = trigger_ms * 35;
+    const int n_samples_iter_threshold = (iter_threshold_ms / 1000.0) * WHISPER_SAMPLE_RATE;
+
+    /**
+     * ### Reminders
+     *
+     * - Whisper is designed to process audio in 30-second chunks, and the
+     *   execution time of processing smaller chunks may not be shorter.
+     * - The design of trigger and threshold allows inputting audio data at
+     *   arbitrary rates with zero config. Inspired by Assembly.ai's
+     *   real-time transcription API
+     *   (https://github.com/misraturp/Real-time-transcription-from-microphone/blob/main/speech_recognition.py)
+     */
+
+    /* VAD parameters */
+    // The most recent 3s.
+    const int vad_window_s = 3;
+    const int n_samples_vad_window = WHISPER_SAMPLE_RATE * vad_window_s;
+    // In VAD, compare the energy of the last 500ms to that of the total 3s.
+    const int vad_last_ms = 500;
+    // Keep the last 0.5s of an iteration to the next one for better
+    // transcription at begin/end.
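+    // (at WHISPER_SAMPLE_RATE = 16000, this carries 8000 samples over)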
+ const int n_samples_keep_iter = WHISPER_SAMPLE_RATE * 0.5; + const float vad_thold = 0.3f; + const float freq_thold = 200.0f; + + /* Audio buffer */ + std::vector<float> pcmf32; + + /* Processing loop */ + while (is_running) { + { + std::unique_lock<std::mutex> lock(s_mutex); + + if (s_queued_pcmf32.size() < n_samples_trigger) { + lock.unlock(); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + continue; + } + } + + { + std::lock_guard<std::mutex> lock(s_mutex); + + if (s_queued_pcmf32.size() > 2 * n_samples_iter_threshold) { + fprintf(stderr, "\n\n%s: WARNING: too much audio is going to be processed, result may not come out in real time\n\n", __func__); + } + } + + { + std::lock_guard<std::mutex> lock(s_mutex); + + pcmf32.insert(pcmf32.end(), s_queued_pcmf32.begin(), s_queued_pcmf32.end()); + + // printf("existing: %d, new: %d, will process: %d, threshold: %d\n", + // n_samples_old, n_samples_new, (int)pcmf32.size(), n_samples_iter_threshold); + + // print_array(pcmf32); + + s_queued_pcmf32.clear(); + wparams.language = lang_.c_str(); + } + + { + int ret = whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()); + if (ret != 0) { + fprintf(stderr, "Failed to process audio, returned %d\n", ret); + continue; + } + } + + { + transcribed_msg msg; + + const int n_segments = whisper_full_n_segments(ctx); + for (int i = 0; i < n_segments; ++i) { + const char* text = whisper_full_get_segment_text(ctx, i); + msg.text += text; + } + + /** + * Simple VAD from the "stream" example in whisper.cpp + * https://github.com/ggerganov/whisper.cpp/blob/231bebca7deaf32d268a8b207d15aa859e52dbbe/examples/stream/stream.cpp#L378 + */ + bool speech_has_end = false; + + /* Need enough accumulated audio to do VAD. */ + if ((int)pcmf32.size() >= n_samples_vad_window) { + std::vector<float> pcmf32_window(pcmf32.end() - n_samples_vad_window, pcmf32.end()); + speech_has_end = vad_simple(pcmf32_window, WHISPER_SAMPLE_RATE, vad_last_ms, + vad_thold, freq_thold, false); + if (speech_has_end) + printf("speech end detected\n"); + } + + /** + * Clear audio buffer when the size exceeds iteration threshold or + * speech end is detected. + */ + if (pcmf32.size() > n_samples_iter_threshold || speech_has_end) { + const auto t_now = std::chrono::high_resolution_clock::now(); + const auto t_diff = std::chrono::duration_cast<std::chrono::milliseconds>(t_now - t_last_iter).count(); + printf("iter took: %lldms\n", t_diff); + t_last_iter = t_now; + + msg.is_partial = false; + /** + * Keep the last few samples in the audio buffer, so the next + * iteration has a smoother start. 
+                */
+                std::vector<float> last(pcmf32.end() - n_samples_keep_iter, pcmf32.end());
+                pcmf32 = std::move(last);
+            } else {
+                msg.is_partial = true;
+            }
+
+            std::lock_guard<std::mutex> lock(s_mutex);
+            s_transcribed_msgs.insert(s_transcribed_msgs.end(), std::move(msg));
+        }
+    }
+}
\ No newline at end of file
diff --git a/WhisperTranscript/stt_whisper.h b/WhisperTranscript/stt_whisper.h
new file mode 100644
index 0000000000000000000000000000000000000000..4766faffc2b2448c3088678421c3d6cbb88913ac
--- /dev/null
+++ b/WhisperTranscript/stt_whisper.h
@@ -0,0 +1,44 @@
+#ifndef STT_WHISPER_H_
+#define STT_WHISPER_H_
+
+#include <atomic>
+#include <chrono>
+#include <mutex>
+#include <string>
+#include <thread>
+#include <vector>
+
+#define WHISPER_SAMPLE_RATE 16000
+
+struct transcribed_msg {
+    std::string text;
+    bool is_partial;
+};
+
+class WavWriter;
+
+class RealtimeSttWhisper
+{
+  public:
+    RealtimeSttWhisper(const std::string& path_model);
+    ~RealtimeSttWhisper();
+    void AddAudioData(const float* data, size_t n_samples);
+    std::vector<transcribed_msg> GetTranscribed();
+    void setLanguage(const std::string& lang) {
+        std::lock_guard<std::mutex> lock(s_mutex);
+        lang_ = lang;
+    }
+
+  private:
+    struct whisper_context* ctx;
+    std::string lang_;
+    std::atomic<bool> is_running;
+    std::vector<float> s_queued_pcmf32;
+    std::vector<transcribed_msg> s_transcribed_msgs;
+    std::mutex s_mutex; // for accessing shared variables from both main thread and worker thread
+    std::thread worker;
+    void Run();
+    std::chrono::time_point<std::chrono::high_resolution_clock> t_last_iter;
+};
+
+#endif  // STT_WHISPER_H_
diff --git a/WhisperTranscript/whispercpp/package.json b/WhisperTranscript/whispercpp/package.json
new file mode 100644
index 0000000000000000000000000000000000000000..20eb8b4727c481f22c14e1699b76a6cc85be2af6
--- /dev/null
+++ b/WhisperTranscript/whispercpp/package.json
@@ -0,0 +1,17 @@
+{
+    "name": "whispercpp",
+    "version": "v1.2.1",
+    "url": "https://github.com/ggerganov/whisper.cpp/archive/refs/tags/__VERSION__.tar.gz",
+    "deps": [],
+    "patches": ["project.patch"],
+    "win_patches": [],
+    "project_paths": ["whisper.vcxproj"],
+    "with_env" : "",
+    "custom_scripts": {
+        "pre_build": [
+            "wget --no-check-certificate --quiet --show-progress -O ggml-base.bin https://ggml.ggerganov.com/ggml-model-whisper-base.bin"
+        ],
+        "build": [],
+        "post_build": []
+    }
+}
diff --git a/WhisperTranscript/whispercpp/project.patch b/WhisperTranscript/whispercpp/project.patch
new file mode 100644
index 0000000000000000000000000000000000000000..f3130e9ac52a6421ae6abf5584f518c3e23bcfe2
--- /dev/null
+++ b/WhisperTranscript/whispercpp/project.patch
@@ -0,0 +1,97 @@
+---
+ whisper.vcxproj | 83 +++++++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 83 insertions(+)
+ create mode 100644 whisper.vcxproj
+
+diff --git a/whisper.vcxproj b/whisper.vcxproj
+new file mode 100644
+index 0000000..9cbfdb7
+--- /dev/null
++++ b/whisper.vcxproj
+@@ -0,0 +1,83 @@
++<?xml version="1.0" encoding="utf-8"?>
++<Project DefaultTargets="Build" ToolsVersion="17.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
++  <PropertyGroup>
++    <PreferredToolArchitecture>x64</PreferredToolArchitecture>
++  </PropertyGroup>
++  <ItemGroup Label="ProjectConfigurations">
++    <ProjectConfiguration Include="Release|x64">
++      <Configuration>Release</Configuration>
++      <Platform>x64</Platform>
++    </ProjectConfiguration>
++  </ItemGroup>
++  <PropertyGroup Label="Globals">
++    <ProjectGuid>{47B512DE-EE88-3A32-A01F-DF4317B53175}</ProjectGuid>
++    
<Keyword>Win32Proj</Keyword> ++ <WindowsTargetPlatformVersion>10.0.18362.0</WindowsTargetPlatformVersion> ++ <Platform>x64</Platform> ++ <ProjectName>whisper</ProjectName> ++ <VCProjectUpgraderObjectName>NoUpgrade</VCProjectUpgraderObjectName> ++ </PropertyGroup> ++ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> ++ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> ++ <ConfigurationType>StaticLibrary</ConfigurationType> ++ <CharacterSet>MultiByte</CharacterSet> ++ <PlatformToolset>v143</PlatformToolset> ++ </PropertyGroup> ++ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> ++ <ImportGroup Label="ExtensionSettings"> ++ </ImportGroup> ++ <ImportGroup Label="PropertySheets"> ++ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> ++ </ImportGroup> ++ <PropertyGroup Label="UserMacros" /> ++ <PropertyGroup> ++ <_ProjectFileVersion>10.0.18362.0</_ProjectFileVersion> ++ <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(ProjectDir)..\..\msvc\lib\$(Platform)\</OutDir> ++ <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|x64'">whisper.dir\Release\</IntDir> ++ <TargetName Condition="'$(Configuration)|$(Platform)'=='Release|x64'">whisper</TargetName> ++ <TargetExt Condition="'$(Configuration)|$(Platform)'=='Release|x64'">.lib</TargetExt> ++ </PropertyGroup> ++ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> ++ <ClCompile> ++ <AdditionalIncludeDirectories>$(ProjectDir).;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> ++ <AssemblerListingLocation>$(IntDir)</AssemblerListingLocation> ++ <EnableEnhancedInstructionSet>AdvancedVectorExtensions2</EnableEnhancedInstructionSet> ++ <ExceptionHandling>Sync</ExceptionHandling> ++ <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion> ++ <Optimization>MaxSpeed</Optimization> ++ <PrecompiledHeader>NotUsing</PrecompiledHeader> ++ <RuntimeLibrary>MultiThreaded</RuntimeLibrary> ++ <RuntimeTypeInfo>true</RuntimeTypeInfo> ++ <UseFullPaths>false</UseFullPaths> ++ <WarningLevel>Level3</WarningLevel> ++ <PreprocessorDefinitions>%(PreprocessorDefinitions);WIN32;_WINDOWS;NDEBUG;_CRT_SECURE_NO_WARNINGS;CMAKE_INTDIR="Release"</PreprocessorDefinitions> ++ <ObjectFileName>$(IntDir)</ObjectFileName> ++ <DebugInformationFormat> ++ </DebugInformationFormat> ++ </ClCompile> ++ <ResourceCompile> ++ <PreprocessorDefinitions>%(PreprocessorDefinitions);WIN32;_WINDOWS;NDEBUG;_CRT_SECURE_NO_WARNINGS;CMAKE_INTDIR=\"Release\"</PreprocessorDefinitions> ++ <AdditionalIncludeDirectories>$(ProjectDir).;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> ++ </ResourceCompile> ++ <Midl> ++ <AdditionalIncludeDirectories>$(ProjectDir).;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> ++ <OutputDirectory>$(ProjectDir)/$(IntDir)</OutputDirectory> ++ <HeaderFileName>%(Filename).h</HeaderFileName> ++ <TypeLibraryName>%(Filename).tlb</TypeLibraryName> ++ <InterfaceIdentifierFileName>%(Filename)_i.c</InterfaceIdentifierFileName> ++ <ProxyFileName>%(Filename)_p.c</ProxyFileName> ++ </Midl> ++ <Lib> ++ <AdditionalOptions>%(AdditionalOptions) /machine:x64</AdditionalOptions> ++ </Lib> ++ </ItemDefinitionGroup> ++ <ItemGroup> ++ <ClInclude Include="ggml.h" /> ++ <ClCompile Include="ggml.c" /> ++ <ClInclude Include="whisper.h" /> ++ <ClCompile Include="whisper.cpp" /> ++ </ItemGroup> ++ <Import 
Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> ++ <ImportGroup Label="ExtensionTargets"> ++ </ImportGroup> ++</Project> +-- +2.37.1.windows.1 + diff --git a/WhisperTranscript/whispercpp/rules.mak b/WhisperTranscript/whispercpp/rules.mak new file mode 100644 index 0000000000000000000000000000000000000000..22791824945b80fb32054176c4c5843fd550d5e6 --- /dev/null +++ b/WhisperTranscript/whispercpp/rules.mak @@ -0,0 +1,27 @@ +# whispercpp +WHISPERCPP_HASH := v1.2.1 +WHISPERCPP_GITURL := https://github.com/ggerganov/whisper.cpp.git + +WCONFIG := -DBUILD_SHARED_LIBS=OFF \ + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + +$(TARBALLS)/whispercpp-$(WHISPERCPP_HASH).tar.xz: + $(call download_git,$(WHISPERCPP_GITURL),master,$(WHISPERCPP_HASH)) + +.sum-whispercpp: whispercpp-$(WHISPERCPP_HASH).tar.xz + $(warning $@ not implemented) + touch $@ + +whispercpp: whispercpp-$(WHISPERCPP_HASH).tar.xz .sum-whispercpp + rm -Rf $@-$(WHISPERCPP_HASH) + mkdir -p $@-$(WHISPERCPP_HASH) + (cd $@-$(WHISPERCPP_HASH) && tar x $(if ${BATCH_MODE},,-v) --strip-components=1 -f $<) + $(UPDATE_AUTOCONFIG) + $(MOVE) + +.whispercpp: whispercpp + cd $< && cmake . $(WCONFIG) + cd $< && $(MAKE) + cd $< && cp libwhisper.a $(PREFIX)/lib + cd $< && cp whisper.h $(PREFIX)/include + touch $@ diff --git a/lib/accel.cpp b/lib/accel.cpp index 785a092c9591f815569c8b696977f935b31f0083..36e764d0e0a2bd307283ed0d581be06d86dee733 100644 --- a/lib/accel.cpp +++ b/lib/accel.cpp @@ -38,15 +38,12 @@ av_frame_new_side_data_from_buf(AVFrame* frame, enum AVFrameSideDataType type, A AVFrame* transferToMainMemory(const AVFrame* framePtr, AVPixelFormat desiredFormat) { - AVFrame* out = av_frame_alloc(); auto desc = av_pix_fmt_desc_get(static_cast<AVPixelFormat>(framePtr->format)); - if (desc && !(desc->flags & AV_PIX_FMT_FLAG_HWACCEL)) { - av_frame_unref(out); - av_frame_free(&out); return av_frame_clone(framePtr); } + AVFrame* out = av_frame_alloc(); out->format = desiredFormat; if (av_hwframe_transfer_data(out, framePtr, 0) < 0) { av_frame_unref(out); diff --git a/lib/common.cpp b/lib/common.cpp index c9534b13107a49f455640958aa159cd94b8a2087..0eeff59b68ff6d239a68052c3396fd28def86e88 100644 --- a/lib/common.cpp +++ b/lib/common.cpp @@ -59,6 +59,7 @@ void ffmpegFormatStringInline(std::string& str) void ffmpegScapeStringInline(std::string& str) { std::string newStr; + newStr.reserve(str.size()); for (size_t i = 0; i < str.size(); i ++) { switch (str[i]) { case '\'': diff --git a/lib/frameUtils.cpp b/lib/frameUtils.cpp index 72890269db8f2eb957ec82727fa4fbf2c7089f95..37a0666a1da266c42ba5b96c4f12741da97c15a3 100644 --- a/lib/frameUtils.cpp +++ b/lib/frameUtils.cpp @@ -28,7 +28,6 @@ void moveFrom(AVFrame* dst, AVFrame* src) { if (dst && src) { - av_frame_copy_props(src, dst); av_frame_unref(dst); av_frame_move_ref(dst, src); } @@ -37,7 +36,8 @@ moveFrom(AVFrame* dst, AVFrame* src) void frameFree(AVFrame* frame) { - av_frame_unref(frame); + if (frame) + av_frame_unref(frame); av_frame_free(&frame); } diff --git a/lib/resampler.cpp b/lib/resampler.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9d2a06d44e8784dfb6fba341d932734c05aa91b0 --- /dev/null +++ b/lib/resampler.cpp @@ -0,0 +1,140 @@ +/* + * Copyright (C) 2004-2023 Savoir-faire Linux Inc. 
*
+ * Author: Emmanuel Milou <emmanuel.milou@savoirfairelinux.com>
+ * Author: Alexandre Savard <alexandre.savard@savoirfairelinux.com>
+ * Author: Philippe Gorley <philippe.gorley@savoirfairelinux.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "resampler.h"
+
+extern "C" {
+#include <libswresample/swresample.h>
+#include <libavutil/opt.h>
+}
+
+#include <new>
+#include <stdexcept>
+#include <iostream>
+
+namespace jami {
+
+Resampler::Resampler()
+    : swrCtx_(swr_alloc())
+    , initCount_(0)
+{}
+
+Resampler::~Resampler()
+{
+    swr_free(&swrCtx_);
+}
+
+void
+Resampler::reinit(const AVFrame* in, const AVFrame* out)
+{
+    // NOTE swr_set_matrix should be called on an uninitialized context
+    auto swrCtx = swr_alloc();
+    if (!swrCtx) {
+        throw std::bad_alloc();
+    }
+
+    av_opt_set_chlayout(swrCtx, "ichl", &in->ch_layout, 0);
+    av_opt_set_int(swrCtx, "isr", in->sample_rate, 0);
+    av_opt_set_sample_fmt(swrCtx, "isf", static_cast<AVSampleFormat>(in->format), 0);
+
+    av_opt_set_chlayout(swrCtx, "ochl", &out->ch_layout, 0);
+    av_opt_set_int(swrCtx, "osr", out->sample_rate, 0);
+    av_opt_set_sample_fmt(swrCtx, "osf", static_cast<AVSampleFormat>(out->format), 0);
+
+    /**
+     * Downmixing from 5.1 requires extra setup, since libswresample can't do it automatically
+     * (not yet implemented).
+     *
+     * Source: https://www.atsc.org/wp-content/uploads/2015/03/A52-201212-17.pdf
+     * Section 7.8.2 for the algorithm
+     * Tables 5.9 and 5.10 for the coefficients clev and slev
+     *
+     * LFE downmixing is optional, so any coefficient can be used; we use +6dB for mono and
+     * +0dB in each channel for stereo.
+ */ + if (in->ch_layout.u.mask == AV_CH_LAYOUT_5POINT1 + || in->ch_layout.u.mask == AV_CH_LAYOUT_5POINT1_BACK) { + // NOTE MSVC can't allocate dynamic size arrays on the stack + if (out->ch_layout.nb_channels == 2) { + double matrix[2][6]; + // L = 1.0*FL + 0.707*FC + 0.707*BL + 1.0*LFE + matrix[0][0] = 1; + matrix[0][1] = 0; + matrix[0][2] = 0.707; + matrix[0][3] = 1; + matrix[0][4] = 0.707; + matrix[0][5] = 0; + // R = 1.0*FR + 0.707*FC + 0.707*BR + 1.0*LFE + matrix[1][0] = 0; + matrix[1][1] = 1; + matrix[1][2] = 0.707; + matrix[1][3] = 1; + matrix[1][4] = 0; + matrix[1][5] = 0.707; + swr_set_matrix(swrCtx, matrix[0], 6); + } else { + double matrix[1][6]; + // M = 1.0*FL + 1.414*FC + 1.0*FR + 0.707*BL + 0.707*BR + 2.0*LFE + matrix[0][0] = 1; + matrix[0][1] = 1; + matrix[0][2] = 1.414; + matrix[0][3] = 2; + matrix[0][4] = 0.707; + matrix[0][5] = 0.707; + swr_set_matrix(swrCtx, matrix[0], 6); + } + } + + if (swr_init(swrCtx) >= 0) { + std::swap(swrCtx_, swrCtx); + swr_free(&swrCtx); + ++initCount_; + } else { + throw std::runtime_error("Failed to initialize resampler context"); + } +} + +int +Resampler::resample(const AVFrame* input, AVFrame* output) +{ + if (!initCount_) + reinit(input, output); + + int ret = swr_convert_frame(swrCtx_, output, input); + if (ret & AVERROR_INPUT_CHANGED || ret & AVERROR_OUTPUT_CHANGED) { + // Under certain conditions, the resampler reinits itself in an infinite loop. This is + // indicative of an underlying problem in the code. This check is so the backtrace + // doesn't get mangled with a bunch of calls to Resampler::resample + if (initCount_ > 1) { + throw std::runtime_error("Infinite loop detected in audio resampler"); + } + reinit(input, output); + return resample(input, output); + } else if (ret < 0) { + return -1; + } + + // Resampling worked, reset count to 1 so reinit isn't called again + initCount_ = 1; + return 0; +} +} // namespace jami diff --git a/lib/resampler.h b/lib/resampler.h new file mode 100644 index 0000000000000000000000000000000000000000..cbf61799ea955b016b04bd20d124ed06929f4bcc --- /dev/null +++ b/lib/resampler.h @@ -0,0 +1,74 @@ +/* + * Copyright (C) 2004-2023 Savoir-faire Linux Inc. + * + * Author: Emmanuel Milou <emmanuel.milou@savoirfairelinux.com> + * Author: Alexandre Savard <alexandre.savard@savoirfairelinux.com> + * Author: Philippe Gorley <philippe.gorley@savoirfairelinux.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#pragma once + +extern "C" { +struct AVFrame; +struct SwrContext; +} + +namespace jami { + +/** + * @brief Wrapper class for libswresample + */ +class Resampler +{ +public: + Resampler(); + ~Resampler(); + + /** + * @brief Resample a frame. + * + * Resample from @input format to @output format. 
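+     * For example, a 44100 Hz stereo S16 input frame can be converted into
+     * the 16000 Hz mono float frame that the Whisper transcriber expects.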
+     *
+     * NOTE: sample_rate, ch_layout, and format should be set on @output
+     */
+    int resample(const AVFrame* input, AVFrame* output);
+private:
+    /**
+     * @brief Reinitializes the filter according to the new format.
+     *
+     * Reinitializes the resampler when new settings are detected. As long as both input and
+     * output formats don't change, this will only be called once.
+     */
+    void reinit(const AVFrame* in, const AVFrame* out);
+
+    /**
+     * @brief Libswresample resampler context.
+     *
+     * NOTE SwrContext is an incomplete type and cannot be stored in a smart pointer.
+     */
+    SwrContext* swrCtx_;
+
+    /**
+     * @brief Number of times @swrCtx_ has been initialized with no successful audio resampling.
+     *
+     * 0: Uninitialized
+     * 1: Initialized
+     * >1: Invalid frames or formats, reinit is going to be called in an infinite loop
+     */
+    unsigned initCount_;
+};
+} // namespace jami
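
For reference, below is a minimal sketch of how a host program could drive the RealtimeSttWhisper class added by this patch. The model path, the silent stand-in audio, and the 500 ms polling cadence are illustrative assumptions, not part of the patch; a real caller would feed audio resampled to 16 kHz mono float, which is what the lib/resampler.h added above provides.

    // sketch.cpp - hypothetical standalone driver; link against libwhisper
    // and the plugin sources added in this patch.
    #include "stt_whisper.h"

    #include <chrono>
    #include <cstdio>
    #include <thread>
    #include <vector>

    int main()
    {
        // Model path is an assumption; the build copies ggml-base.bin into data/assets.
        RealtimeSttWhisper stt("data/assets/ggml-base.bin");
        stt.setLanguage("en");

        // 0.5 s of silence per push as a stand-in for real 16 kHz mono f32 PCM.
        std::vector<float> pcm(WHISPER_SAMPLE_RATE / 2, 0.0f);

        for (int i = 0; i < 20; ++i) {
            stt.AddAudioData(pcm.data(), pcm.size());
            for (const auto& msg : stt.GetTranscribed())
                std::printf("%s %s\n", msg.is_partial ? "[partial]" : "[final]", msg.text.c_str());
            std::this_thread::sleep_for(std::chrono::milliseconds(500));
        }
        return 0; // ~RealtimeSttWhisper() joins the worker thread and frees the context
    }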