Commit 087d5597 authored by Adrien Béraud, committed by Sébastien Blin

whisper: use whispercpp to avoid glitches

Change-Id: I3def6db3eed39b1e9a5feb2d8f4b664de60bd1d2
parent 440e8127
Showing changed files with 310 additions and 1591 deletions
*.mp3
/WhisperTranscript*
libonnxruntime.so*
/libonnxruntime.dylib
/processed.mp4
*.so
......@@ -58,11 +58,11 @@ set(plugin_SRC main.cpp
PluginPreferenceHandler.cpp
TranscriptAudioSubscriber.cpp
TranscriptVideoSubscriber.cpp
Preprocess.cpp
ModelProcessor.cpp
stt_whisper.cpp
./../lib/accel.cpp
./../lib/frameUtils.cpp
./../lib/frameFilter.cpp
./../lib/resampler.cpp
./../lib/common.cpp
)
......@@ -70,8 +70,7 @@ set(plugin_HDR TranscriptAudioSubscriber.h
TranscriptVideoSubscriber.h
TranscriptMediaHandler.h
PluginPreferenceHandler.h
Preprocess.h
ModelProcessor.h
stt_whisper.h
./../lib/pluglog.h
./../lib/mediaStream.h
./../lib/audioFormat.h
......@@ -98,6 +97,7 @@ target_include_directories(${ProjectName} PUBLIC ${PROJECT_BINARY_DIR}
${ONNX_DIR}/../include/session
${ONNX_DIR}/../include/providers/cuda
${CONTRIB_PATH}/build/yaml-cpp/include
${CONTRIB_PATH}/build/whispercpp
)
target_link_directories(${ProjectName} PUBLIC ${CONTRIB_PATH}
${CONTRIB_PATH}/build/fmt/msvc/Release
......@@ -110,7 +110,7 @@ target_link_directories(${ProjectName} PUBLIC ${CONTRIB_PATH}
target_link_libraries(${ProjectName} PUBLIC libyaml-cppmd libavfilter libswscale libswresample
libavformat libavcodec libavutil libvpx libx264 libopus
libmfx fmt libzlib freetype ws2_32 Bcrypt Secur32 onnxruntime msvcrt)
libmfx fmt libzlib freetype whisper ws2_32 Bcrypt Secur32 msvcrt)
add_custom_command(
TARGET ${ProjectName}
......@@ -120,6 +120,8 @@ add_custom_command(
COMMAND ${CMAKE_COMMAND} -E copy_directory ${PROJECT_SOURCE_DIR}/../contrib/yaml-cpp ${CONTRIB_PATH}/src/yaml-cpp
COMMAND ${CMAKE_COMMAND} -E copy_directory ${PROJECT_SOURCE_DIR}/../contrib/freetype ${CONTRIB_PATH}/src/freetype
COMMAND ${CMAKE_COMMAND} -E copy_directory ${PROJECT_SOURCE_DIR}/ffmpeg/ ${CONTRIB_PATH}/src/ffmpeg
COMMAND ${CMAKE_COMMAND} -E copy_directory ${PROJECT_SOURCE_DIR}/whispercpp/ ${CONTRIB_PATH}/src/whispercpp
COMMAND python ${DAEMON}/compat/msvc/winmake.py -fb whispercpp
COMMAND python ${DAEMON}/compat/msvc/winmake.py -fb fmt
COMMAND python ${DAEMON}/compat/msvc/winmake.py -fb yaml-cpp
COMMAND python ${DAEMON}/compat/msvc/winmake.py -fb zlib
......@@ -136,34 +138,17 @@ if(TESTPROCESS)
PRE_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/testPreferences.yml ${PROJECT_BINARY_DIR}/
COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/jfk.wav ${PROJECT_BINARY_DIR}/
COMMAND ${CMAKE_COMMAND} -E copy ${ONNX_DIR}/onnxruntime.lib ${PROJECT_BINARY_DIR}/Debug
COMMAND ${CMAKE_COMMAND} -E copy ${ONNX_DIR}/onnxruntime.dll ${PROJECT_BINARY_DIR}/Debug
COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/modelSRC/mModelEncoder.onnx ${PROJECT_SOURCE_DIR}/data/assets
COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/modelSRC/mModelDecoder.onnx ${PROJECT_SOURCE_DIR}/data/assets
COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/modelSRC/mLogSoftMax.onnx ${PROJECT_SOURCE_DIR}/data/assets
COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/sample.mp4 ${PROJECT_BINARY_DIR}/
COMMAND ${CMAKE_COMMAND} -E copy ${CONTRIB_PATH}/build/whispercpp/ggml-base.bin ${PROJECT_SOURCE_DIR}/data/assets/
)
else()
add_custom_command(
TARGET ${ProjectName}
PRE_BUILD
COMMAND python ${PROJECT_SOURCE_DIR}/../SDK/jplManipulation.py --preassemble --plugin=${ProjectName}
COMMAND ${CMAKE_COMMAND} -E copy ${ONNX_DIR}/onnxruntime.lib ${JPL_DIRECTORY}/lib/${CONTRIB_PLATFORM}
COMMAND ${CMAKE_COMMAND} -E copy ${ONNX_DIR}/onnxruntime.dll ${JPL_DIRECTORY}/lib/${CONTRIB_PLATFORM}
COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/modelSRC/mModelEncoder.onnx ${JPL_DIRECTORY}/data/assets
COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/modelSRC/mModelDecoder.onnx ${JPL_DIRECTORY}/data/assets
COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/modelSRC/mLogSoftMax.onnx ${JPL_DIRECTORY}/data/assets
COMMAND ${CMAKE_COMMAND} -E copy ${CONTRIB_PATH}/build/whispercpp/ggml-base.bin ${JPL_DIRECTORY}/data/assets/
COMMENT "Assembling Plugin files"
)
if(NVIDIA)
add_custom_command(
TARGET ${ProjectName}
PRE_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${ONNX_DIR}/onnxruntime_providers_shared.lib ${JPL_DIRECTORY}/lib/${CONTRIB_PLATFORM}
COMMAND ${CMAKE_COMMAND} -E copy ${ONNX_DIR}/onnxruntime_providers_shared.dll ${JPL_DIRECTORY}/lib/${CONTRIB_PLATFORM}
COMMAND ${CMAKE_COMMAND} -E copy ${ONNX_DIR}/onnxruntime_providers_cuda.lib ${JPL_DIRECTORY}/lib/${CONTRIB_PLATFORM}
COMMAND ${CMAKE_COMMAND} -E copy ${ONNX_DIR}/onnxruntime_providers_cuda.dll ${JPL_DIRECTORY}/lib/${CONTRIB_PLATFORM}
)
endif()
add_custom_command(
TARGET ${ProjectName}
POST_BUILD
......
/**
* Copyright (C) 2022 Savoir-faire Linux Inc.
*
* Author: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
* USA.
*/
#include "ModelProcessor.h"
#include <pluglog.h>
#include <common.h>
#include <limits.h>
const char sep = separator();
const std::string TAG = "Transcript";
namespace jami {
ModelProcessor::ModelProcessor(const std::string& path, bool acc)
{
loadTokens(path + "/assets/tokenizer.bin", vocab_);
#ifdef __ANDROID__
initModels(path + "/assets/mModelEncoder.ort", path + "/assets/mModelDecoder.ort", path + "/assets/mLogSoftMax.ort", acc);
#else
initModels(path + "/assets/mModelEncoder.onnx", path + "/assets/mModelDecoder.onnx", path + "/assets/mLogSoftMax.onnx", acc);
#endif
}
ModelProcessor::~ModelProcessor()
{
endModels();
Plog::log(Plog::LogPriority::INFO, TAG, "~ModelProcessor");
}
void
ModelProcessor::endModels()
{
if (encoderSession_) {
delete encoderSession_;
encoderSession_ = nullptr;
}
if (decoderSession_) {
delete decoderSession_;
decoderSession_ = nullptr;
}
if (logSoftMaxSession_) {
delete logSoftMaxSession_;
logSoftMaxSession_ = nullptr;
}
#ifdef NVIDIA
if (cudaOptions_) {
ortApi.ReleaseCUDAProviderOptions(cudaOptions_);
cudaOptions_ = nullptr;
}
#endif
if (env_) {
env_.release();
env_ = NULL;
}
}
void
ModelProcessor::initModels(const std::string& encoderModelPath, const std::string& decoderModelPath, const std::string& logSoftMaxModelPath, bool activateAcc)
{
try {
sessOpt_ = Ort::SessionOptions();
try {
if (activateAcc) {
#ifdef NVIDIA
Ort::ThrowOnError(ortApi.CreateCUDAProviderOptions(&cudaOptions_));
// std::vector<const char*> keys{"device_id"};
// std::vector<const char*> values{"0"};
// Ort::ThrowOnError(ortApi.UpdateCUDAProviderOptions(cudaOptions_, keys.data(), values.data(), keys.size()));
Ort::ThrowOnError(ortApi.SessionOptionsAppendExecutionProvider_CUDA_V2(sessOpt_, cudaOptions_));
#endif
#ifdef __ANDROID__
Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_Nnapi(sessOpt_, 0));
#endif
}
} catch (std::exception& accelException) {
Plog::log(Plog::LogPriority::ERR, TAG, accelException.what());
Plog::log(Plog::LogPriority::ERR, TAG, "Acceleration not available, loading models for CPU.");
}
sessOpt_.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
#ifdef WIN32
encoderSession_ = new Ort::Session(env_, string_utils::to_wstring(encoderModelPath).c_str(), sessOpt_);
decoderSession_ = new Ort::Session(env_, string_utils::to_wstring(decoderModelPath).c_str(), sessOpt_);
logSoftMaxSession_ = new Ort::Session(env_, string_utils::to_wstring(logSoftMaxModelPath).c_str(), sessOpt_);
#else
encoderSession_ = new Ort::Session(env_, encoderModelPath.c_str(), sessOpt_);
decoderSession_ = new Ort::Session(env_, decoderModelPath.c_str(), sessOpt_);
logSoftMaxSession_ = new Ort::Session(env_, logSoftMaxModelPath.c_str(), sessOpt_);
#endif
isAllocated_ = true;
Plog::log(Plog::LogPriority::INFO, TAG, "Model is allocated");
} catch (std::exception& e) {
Plog::log(Plog::LogPriority::ERR, TAG, e.what());
}
}
/* from whisper.cpp */
// the most basic sampling scheme - select the top token
whisperTokenData
ModelProcessor::whisper_sample_best(const float * probs)
{
whisperTokenData result = {
0, 0, 0.0f, 0.0f, 0.0f, -1, -1, 0.0f,
};
int n_logits = vocab_.id_to_token.size();
std::vector<std::pair<double, int64_t>> probs_id;
probs_id.reserve(n_logits);
for (int i = 0; i < n_logits; i++) {
probs_id.emplace_back(std::make_pair(probs[i], i));
}
{
double sum_ts = 0.0;
double max_ts = -1.0;
double max_tx = -1.0;
for (int i = 0; i < vocab_.token_beg; i++) {
max_tx = std::max(max_tx, probs_id[i].first);
}
for (int i = vocab_.token_beg; i < n_logits; i++) {
sum_ts += probs_id[i].first;
if (probs_id[i].first > max_ts) {
max_ts = probs_id[i].first;
result.tid = probs_id[i].second;
}
}
// if the probability sum of all timestamp tokens is higher than the max probability of the text tokens - sample a
// timestamp token
if (sum_ts > max_tx) {
// ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L430-L438
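// e.g. if the timestamp tokens sum to 0.6 while the best text token only reaches 0.35,
// every text token is masked to -INT_MAX so the top-k selection below can only pick a
// timestamp token.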
for (int i = 0; i < vocab_.token_beg; i++) {
probs_id[i].first = -INT_MAX;
}
}
result.pt = max_ts/(sum_ts + 1e-10);
result.ptsum = sum_ts;
}
// find the top K tokens
const int top_k = 4;
std::partial_sort(
probs_id.begin(),
probs_id.begin() + top_k, probs_id.end(),
[](const std::pair<double, int64_t> & a, const std::pair<double, int64_t> & b) {
return a.first > b.first;
});
probs_id.resize(top_k);
int res = 0;
while ((probs_id[res].second == vocab_.token_sot ||
probs_id[res].second == vocab_.token_solm ||
probs_id[res].second == vocab_.token_beg) &&
res < (int) probs_id.size() - 1) {
res++;
}
result.id = probs_id[res].second;
result.p = probs_id[res].first;
return result;
}
void
ModelProcessor::filterLogits(std::vector<float>& logits, int offset)
{
// Remove all no speech tokens
for (const auto idx : vocab_.noSpeechTokens) {
logits[idx] = (float)-INT_MAX;
}
}
void
ModelProcessor::filterLanguageLogits(std::vector<float>& logits)
{
// Leave only the language tokens
for (size_t i = 0; i < logits.size(); i++) {
if (vocab_.languageId2Tokens[i].empty())
logits[i] = (float)(-INT_MAX);
}
}
whisperTokenData
ModelProcessor::getToken(std::vector<float>& logits)
{
Ort::RunOptions runOption;
std::vector<Ort::Value> logSoftMaxInputs;
logSoftMaxInputs.emplace_back(Ort::Value::CreateTensor<float>(allocatorInfo_,
logits.data(),
logits.size(),
logitsShape_.data(),
logitsShape_.size()));
auto softmaxOutputs = logSoftMaxSession_->Run(runOption,
logSoftMaxInputNames.data(),
logSoftMaxInputs.data(),
logSoftMaxInputNames.size(),
logSoftMaxOutputNames.data(),
logSoftMaxOutputNames.size());
float* probs = softmaxOutputs[1].GetTensorMutableData<float>();
return whisper_sample_best(probs);
}
std::string
ModelProcessor::feedInput(std::vector<float>& melInput, const std::string& preferenceLanguage)
{
std::lock_guard<std::mutex> l(mtx_);
if (!isAllocated_ || !logSoftMaxSession_ || !encoderSession_ || !decoderSession_)
return "";
Ort::RunOptions runOption;
try {
Ort::Value melInputTensor = Ort::Value::CreateTensor<float>(allocatorInfo_,
melInput.data(),
melInput.size(),
melInputShape_.data(),
melInputShape_.size());
audioFeaturesTensor_ = Ort::Value::CreateTensor<float>(allocatorInfo_,
audioFeatures_.data(),
audioFeatures_.size(),
audioFeaturesShape_.data(),
audioFeaturesShape_.size());
// Run the encoder graph
encoderSession_->Run(runOption,
encoderInputNames,
&melInputTensor,
1,
encoderOutputNames,
&audioFeaturesTensor_,
1);
} catch(Ort::Exception e) {
Plog::log(Plog::LogPriority::ERR, TAG, e.what());
return "";
} catch (...) { return ""; }
std::vector<float> currentTokensP {};
try {
auto isMultilingual = vocab_.is_multilingual();
std::vector<int64_t> currentTokens {};
currentTokens.emplace_back(vocab_.token_sot);
currentTokensP.emplace_back(1);
std::array<int64_t, 1> offsetShape {1};
if (isMultilingual) {
if (preferenceLanguage == "auto"
|| vocab_.languageTokens2Id.find(preferenceLanguage) == vocab_.languageTokens2Id.end()) {
std::vector<float> currentKVCache(MODELKVCACHESHAPE * 1 * currentTokens.size() * MODELFEATURESHAPE, 0.0f);
std::array<int64_t, 2> tokenShape {1, 1};
int64_t offset = 0;
std::array<int64_t, 4> kvCacheShape { MODELKVCACHESHAPE, 1, 1, MODELFEATURESHAPE };
std::vector<int64_t> token = { currentTokens.back() };
// Run the decoder graph
std::vector<Ort::Value> inputsVector; // {audioFeaturesTensor_, tokensTensor_, kvCacheTensor_, offsetTensor_};
inputsVector.emplace_back(Ort::Value::CreateTensor<float>(allocatorInfo_,
audioFeatures_.data(),
audioFeatures_.size(),
audioFeaturesShape_.data(),
audioFeaturesShape_.size()));
inputsVector.emplace_back(Ort::Value::CreateTensor<int64_t>(allocatorInfo_,
token.data(),
token.size(),
tokenShape.data(),
tokenShape.size()));
inputsVector.emplace_back(Ort::Value::CreateTensor<float>(allocatorInfo_,
currentKVCache.data(),
currentKVCache.size(),
kvCacheShape.data(),
kvCacheShape.size()));
inputsVector.emplace_back(Ort::Value::CreateTensor<int64_t>(allocatorInfo_,
&offset,
1,
offsetShape.data(),
0));
auto outputs = decoderSession_->Run(runOption,
decoderInputNames.data(),
inputsVector.data(),
decoderInputNames.size(),
decoderOutputNames.data(),
decoderOutputNames.size());
auto logitsTensorInfo = outputs[0].GetTensorTypeAndShapeInfo();
auto logitsData = outputs[0].GetTensorMutableData<float>();
{
std::vector<float>logits(logitsData, logitsData + logitsTensorInfo.GetElementCount());
filterLanguageLogits(logits);
auto it = std::max_element(logits.begin(), logits.end());
currentTokens.emplace_back(std::distance(logits.begin(), it));
}
} else
currentTokens.emplace_back(vocab_.languageTokens2Id[preferenceLanguage]);
currentTokens.emplace_back(vocab_.token_transcribe);
currentTokensP.emplace_back(1);
currentTokensP.emplace_back(1);
}
std::vector<float> currentKVCache(MODELKVCACHESHAPE * 1 * currentTokens.size() * MODELFEATURESHAPE, 0.0f);
std::array<int64_t, 2> tokenShape {1, (long)currentTokens.size()};
for (auto i = 0; i < sampleLen; i++) {
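// For multilingual models the first decoder step consumes the whole 3-token prompt
// (SOT, language, task), so later offsets jump ahead by two extra positions;
// English-only models feed a single SOT token and the offset simply tracks the step index.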
int64_t offset = isMultilingual ? ( i == 0 ? 0 : i + 2 ) : i;
std::array<int64_t, 4> kvCacheShape { MODELKVCACHESHAPE, 1, static_cast<int64_t>(currentTokens.size()), MODELFEATURESHAPE };
std::vector<int64_t> token = { currentTokens.back() };
if (i == 0) {
token = currentTokens;
tokenShape[1] = currentTokens.size();
} else {
tokenShape[1] = 1;
}
// Run the decoder graph
std::vector<Ort::Value> inputsVector; // {audioFeaturesTensor_, tokensTensor_, kvCacheTensor_, offsetTensor_};
inputsVector.emplace_back(Ort::Value::CreateTensor<float>(allocatorInfo_,
audioFeatures_.data(),
audioFeatures_.size(),
audioFeaturesShape_.data(),
audioFeaturesShape_.size()));
inputsVector.emplace_back(Ort::Value::CreateTensor<int64_t>(allocatorInfo_,
token.data(),
token.size(),
tokenShape.data(),
tokenShape.size()));
inputsVector.emplace_back(Ort::Value::CreateTensor<float>(allocatorInfo_,
currentKVCache.data(),
currentKVCache.size(),
kvCacheShape.data(),
kvCacheShape.size()));
inputsVector.emplace_back(Ort::Value::CreateTensor<int64_t>(allocatorInfo_,
&offset,
1,
offsetShape.data(),
0));
auto outputs = decoderSession_->Run(runOption,
decoderInputNames.data(),
inputsVector.data(),
decoderInputNames.size(),
decoderOutputNames.data(),
decoderOutputNames.size());
auto logitsTensorInfo = outputs[0].GetTensorTypeAndShapeInfo();
auto logitsData = outputs[0].GetTensorMutableData<float>();
{
std::vector<float>logits(logitsData, logitsData + logitsTensorInfo.GetElementCount());
if (isMultilingual && logits.size() > vocab_.n_vocab) {
std::vector<float>lastLogits;
lastLogits = std::vector<float>(logits.begin() + 2 * vocab_.n_vocab, logits.end());
std::swap(lastLogits, logits);
}
filterLogits(logits, offset);
auto tokenData = getToken(logits);
currentTokens.emplace_back(tokenData.id);
currentTokensP.emplace_back(tokenData.p);
}
// Grab kvCache for next iteration
auto kvCacheTensorInfo = outputs[1].GetTensorTypeAndShapeInfo();
auto nextKVCacheData = outputs[1].GetTensorMutableData<float>();
std::vector<float> nextKVCache;
std::vector<float> zeros(MODELFEATURESHAPE, 0.0f);
int delta = (currentTokens.size() - 1) * MODELFEATURESHAPE;
for (int currentKVIdx = 0; currentKVIdx < MODELKVCACHESHAPE; currentKVIdx++) {
nextKVCache.insert(nextKVCache.end(),
nextKVCacheData + (currentKVIdx * delta),
nextKVCacheData + ((currentKVIdx + 1) * delta));
nextKVCache.insert(nextKVCache.end(), zeros.begin(), zeros.end());
}
std::swap(currentKVCache, nextKVCache);
if (currentTokens.back() == vocab_.token_eot)
break;
}
std::swap(currentTokens, tokensOutput_);
} catch(Ort::Exception e) {
Plog::log(Plog::LogPriority::ERR, TAG, e.what());
return "";
} catch (...) {}
std::ostringstream oss;
std::ostringstream tokensStr;
auto idx = -1;
for (const auto& token : tokensOutput_) {
idx ++;
tokensStr << token << " " << currentTokensP[idx] << " ";
if (token >= vocab_.token_eot)
continue;
if (currentTokensP[idx] > -1.8)
oss << vocab_.id_to_token[token];
}
tokensOutput_.clear();
return oss.str();
}
} // namespace jami
/**
* Copyright (C) 2022 Savoir-faire Linux Inc.
*
* Author: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
* USA.
*/
#pragma once
#include <map>
#include <vector>
#include <algorithm>
#include <set>
#include <mutex>
#include <onnxruntime_cxx_api.h>
// #ifdef NVIDIA
// #include <cuda_provider_options.h>
// #endif
#ifdef __ANDROID__
#include <nnapi_provider_factory.h>
#endif
#include <functional>
#include "Preprocess.h"
namespace jami {
// Use the getonnxio.py script to grab the model input and output
// names and shapes.
// Note: "None" is an open shape. For an input, it is defined by the data
// we feed in; for an open output, it is recommended not to pre-allocate
// the tensor and to use the value returned by model.run instead.
static const char* encoderInputNames[4] = {"mel"};
static const char* encoderOutputNames[4] = {"617"};
#define MODELFEATURESHAPE 384
#define MODELKVCACHESHAPE 8
#define MODELLOGITSHAPE 51865 // 51864 for english models
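// These defaults appear to match the multilingual Whisper "tiny" model:
// d_model = 384, 2 * n_layers = 2 * 4 = 8 cached key/value tensors, and a
// 51865-token multilingual vocabulary (51864 for English-only models).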
static const std::vector<const char*> decoderInputNames = {"audio_features", "tokens", "kv_cache", "offset"};
static const std::vector<const char*> decoderOutputNames = {"logits", "output_kv_cache"};
static const std::vector<const char *> logSoftMaxInputNames = {"logits"};
static const std::vector<const char *> logSoftMaxOutputNames = {"token_ids", "probs"};
typedef struct whisperTokenData {
int64_t id; // token id
int64_t tid; // forced timestamp token id
float p; // probability of the token
float pt; // probability of the timestamp token
float ptsum; // sum of probabilities of all timestamp tokens
// token-level timestamp data
// do not use if you haven't computed token-level timestamps
int64_t t0; // start time of the token
int64_t t1; // end time of the token
float vlen; // voice length of the token
} whisperTokenData;
class ModelProcessor
{
public:
ModelProcessor(const std::string& path, bool acc);
~ModelProcessor();
void initModels(const std::string& encoderModelPath, const std::string& decoderModelPath, const std::string& logSoftMaxModelPath, bool activateAcc);
void endModels();
whisperTokenData whisper_sample_best(const float * probs);
/**
* @brief feedInput
* Takes an input and feeds it to the models to produce a transcription
* @param input
* @param preferenceLanguage
*/
std::string feedInput(std::vector<float>& input, const std::string& preferenceLanguage = "auto");
bool isAllocated() { return isAllocated_; }
private:
// Tokens
whisperVocab vocab_;
whisperTokenData getToken(std::vector<float>& logits);
void filterLogits(std::vector<float>& logits, int offset);
void filterLanguageLogits(std::vector<float>& logits);
// onnx related
Ort::MemoryInfo allocatorInfo_ = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
bool isAllocated_ {false};
Ort::Env env_ {ORT_LOGGING_LEVEL_WARNING, "whisperTest"};
Ort::Session* encoderSession_ {nullptr};
Ort::Session* decoderSession_ {nullptr};
Ort::Session* logSoftMaxSession_ {nullptr};
Ort::SessionOptions sessOpt_;
#ifdef NVIDIA
const OrtApi& ortApi = Ort::GetApi();
OrtCUDAProviderOptionsV2* cudaOptions_ = nullptr;
#endif
// Encoder tensors. 1 input and 1 output
std::vector<int64_t> melInputShape_ {1, 80, 3000}; // Input Data Type: 1 (float), Input Shape: [1, 80, 3000]
Ort::Value audioFeaturesTensor_ {nullptr};
std::vector<int64_t> audioFeaturesShape_ {1, 1500, MODELFEATURESHAPE}; // Output Data Type: 1 (float), Output Shape: [1, 1500, MODELFEATURESHAPE]
std::array<float, 1500 * MODELFEATURESHAPE> audioFeatures_ {};
std::vector<float> output_;
// Decoder tensors. 4 inputs and 2 outputs
std::vector<int64_t> tokensOutput_ { };
// LogProb check
std::array<int64_t, 3> logitsShape_ {1, 1, MODELLOGITSHAPE};
int sampleLen = 100;
std::mutex mtx_;
};
} // namespace jami
......@@ -89,9 +89,7 @@ PluginPreferenceHandler::preferenceMapHasKey(const std::string& key)
return (key == "background"
|| key == "position"
|| key == "fontsize"
|| key == "language"
|| key == "chunksize"
|| key == "stepsize");
|| key == "language");
}
std::string
......@@ -110,8 +108,7 @@ std::map<std::string, std::string>
PluginPreferenceHandler::getPreferences(const std::string& accountId)
{
std::lock_guard<std::mutex> lk(mtx_);
auto preferences = preferences_.emplace(accountId, preferences_["default"]).first->second;
return preferences;
return preferences_.emplace(accountId, preferences_["default"]).first->second;
}
PluginPreferenceHandler::~PluginPreferenceHandler()
......
......@@ -35,9 +35,7 @@ enum Parameter {
POSITION,
BACKGROUND,
FONTSIZE,
LANGUAGE,
CHUNK,
STEP
LANGUAGE
};
class TranscriptMediaHandler;
......
/**
* Copyright (C) 2022 Savoir-faire Linux Inc.
*
* Author: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include "Preprocess.h"
#ifdef WIN32
#define _USE_MATH_DEFINES
#endif
#include <thread>
#include <math.h>
#include <fstream>
#include <iostream>
// ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L92-L124
bool logMelSpectrogram(
const float *samples,
const int n_samples,
const int n_threads,
const whisperFilters &filters,
whisperMel &mel) {
// const int sample_rate = WHISPER_SAMPLE_RATE;
const int fft_size = WHISPER_N_FFT;
const int fft_step = WHISPER_HOP_LENGTH;
const int n_mel = WHISPER_N_MEL;
// Hanning window
std::vector<float> hann;
hann.resize(fft_size);
for (int i = 0; i < fft_size; i++) {
hann[i] = 0.5*(1.0 - cos((2.0*M_PI*i)/(fft_size)));
}
mel.n_mel = n_mel;
mel.n_len = (n_samples)/fft_step;
mel.data.resize(mel.n_mel*mel.n_len);
const int n_fft = 1 + fft_size/2;
std::vector<std::thread> workers(n_threads);
for (int iw = 0; iw < n_threads; ++iw) {
workers[iw] = std::thread([&](int ith) {
std::vector<float> fft_in;
fft_in.resize(fft_size);
for (int i = 0; i < fft_size; i++) {
fft_in[i] = 0.0;
}
std::vector<float> fft_out;
fft_out.resize(2*fft_size);
for (int i = ith; i < mel.n_len; i += n_threads) {
const int offset = i*fft_step;
// apply Hanning window
for (int j = 0; j < fft_size; j++) {
if (offset + j < n_samples) {
fft_in[j] = hann[j]*samples[offset + j];
} else {
fft_in[j] = 0.0;
}
}
// FFT -> mag^2
fft(fft_in, fft_out);
for (int j = 0; j < fft_size; j++) {
fft_out[j] = (fft_out[2*j + 0]*fft_out[2*j + 0] + fft_out[2*j + 1]*fft_out[2*j + 1]);
}
for (int j = 1; j < fft_size/2; j++) {
fft_out[j] += fft_out[fft_size - j];
}
// mel spectrogram
for (int j = 0; j < mel.n_mel; j++) {
double sum = 0.0;
for (int k = 0; k < n_fft; k++) {
sum += fft_out[k]*filters.data[j*n_fft + k];
}
if (sum < 1e-10) {
sum = 1e-10;
}
sum = log10(sum);
mel.data[j*mel.n_len + i] = sum;
}
}
}, iw);
}
for (int iw = 0; iw < n_threads; ++iw) {
workers[iw].join();
}
// clamping and normalization
double mmax = -1e20;
for (int i = 0; i < mel.n_mel*mel.n_len; i++) {
if (mel.data[i] > mmax) {
mmax = mel.data[i];
}
}
mmax -= 8.0;
for (int i = 0; i < mel.n_mel*mel.n_len; i++) {
if (mel.data[i] < mmax) {
mel.data[i] = mmax;
}
mel.data[i] = (mel.data[i] + 4.0)/4.0;
}
return true;
}
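// logMelSpectrogram usage (illustrative sketch; the path follows the plugin's assets layout):
//   whisperFilters filters;
//   loadMelFilters(dataPath + "/assets/mel_filters.bin", filters);
//   whisperMel mel;
//   logMelSpectrogram(samples.data(), samples.size(), 8, filters, mel);
//   inputPadTrim(mel); // pad/trim to ENCODER_INPUT_LEN frames before the encoder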
// Cooley-Tukey FFT
// poor man's implementation - use something better
// input is real-valued
// output is complex-valued
void fft(const std::vector<float> & in, std::vector<float> & out) {
out.resize(in.size()*2);
int N = in.size();
if (N == 1) {
out[0] = in[0];
out[1] = 0;
return;
}
if (N%2 == 1) {
dft(in, out);
return;
}
std::vector<float> even;
std::vector<float> odd;
for (int i = 0; i < N; i++) {
if (i % 2 == 0) {
even.emplace_back(in[i]);
} else {
odd.emplace_back(in[i]);
}
}
std::vector<float> even_fft;
std::vector<float> odd_fft;
fft(even, even_fft);
fft(odd, odd_fft);
for (int k = 0; k < N/2; k++) {
float theta = 2*M_PI*k/N;
float re = cos(theta);
float im = -sin(theta);
float re_odd = odd_fft[2*k + 0];
float im_odd = odd_fft[2*k + 1];
out[2*k + 0] = even_fft[2*k + 0] + re*re_odd - im*im_odd;
out[2*k + 1] = even_fft[2*k + 1] + re*im_odd + im*re_odd;
out[2*(k + N/2) + 0] = even_fft[2*k + 0] - re*re_odd + im*im_odd;
out[2*(k + N/2) + 1] = even_fft[2*k + 1] - re*im_odd - im*re_odd;
}
}
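// fft() (and dft() below) interleave real and imaginary parts: out[2k] is Re(X_k) and
// out[2k + 1] is Im(X_k), so out.size() == 2 * in.size(). For example, the transform of
// the impulse {1, 0, 0, 0} is four bins of (1 + 0i).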
// naive Discrete Fourier Transform
// input is real-valued
// output is complex-valued
void dft(const std::vector<float> & in, std::vector<float> & out) {
int N = in.size();
out.resize(N*2);
for (int k = 0; k < N; k++) {
float re = 0;
float im = 0;
for (int n = 0; n < N; n++) {
float angle = 2*M_PI*k*n/N;
re += in[n]*cos(angle);
im -= in[n]*sin(angle);
}
out[k*2 + 0] = re;
out[k*2 + 1] = im;
}
}
void loadMelFilters(const std::string& fileName, whisperFilters& filters) {
auto fin = std::ifstream(fileName, std::ios::binary);
if (!fin) {
fprintf(stderr, "%s: failed to open '%s'\n", __func__, fileName.c_str());
return;
}
fin.read((char *) &filters.n_mel, sizeof(filters.n_mel));
fin.read((char *) &filters.n_fft, sizeof(filters.n_fft));
filters.data.resize(filters.n_mel * filters.n_fft);
fin.read((char *) filters.data.data(), filters.data.size() * sizeof(float));
}
void loadTokens(const std::string& fileName, whisperVocab& vocab) {
auto fin = std::ifstream(fileName, std::ios::binary);
if (!fin) {
fprintf(stderr, "%s: failed to open '%s'\n", __func__, fileName.c_str());
return;
}
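// Binary layout read below: int32 model vocab size, int32 token count, then
// <token count> entries of (uint32 length, bytes). If the model vocab is larger,
// a language-token table follows (int32 count, then (int32 id, uint32 length, bytes)
// entries) and synthetic names are generated for the remaining special tokens.
// The file ends with the list of "no speech" token ids (int32 count, then uint32 ids).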
int32_t modelNVocab = 0;
fin.read((char *) &modelNVocab, sizeof(modelNVocab));
int32_t tokensNVocab = 0;
fin.read((char *) &tokensNVocab, sizeof(tokensNVocab));
std::string word;
for (int i = 0; i < tokensNVocab; i++) {
uint32_t len;
fin.read((char *) &len, sizeof(len));
word.resize(len);
fin.read((char *) word.data(), len);
vocab.token_to_id[word] = i;
vocab.id_to_token[i] = word;
}
vocab.n_vocab = modelNVocab;
if (vocab.is_multilingual()) {
vocab.token_eot++;
vocab.token_sot++;
vocab.token_prev++;
vocab.token_solm++;
vocab.token_not++;
vocab.token_beg++;
}
if (tokensNVocab < modelNVocab) {
// Read language tokens
{
int32_t languageTokensLen = 0;
fin.read((char *) &languageTokensLen, sizeof(languageTokensLen));
std::string word;
for (int i = 0; i < languageTokensLen; i++) {
int32_t id = 0;
fin.read((char *) &id, sizeof(id));
uint32_t len;
fin.read((char *) &len, sizeof(len));
word.resize(len);
fin.read((char *) word.data(), len);
vocab.token_to_id[word] = id;
vocab.id_to_token[id] = word;
vocab.languageId2Tokens.insert({id, word});
vocab.languageTokens2Id.insert({word, id});
}
}
fprintf(stderr, "%s: adding %d extra tokens\n", __func__, modelNVocab - tokensNVocab);
for (int i = tokensNVocab; i < modelNVocab; i++) {
if (!vocab.id_to_token[i].empty())
continue;
if (i > vocab.token_beg) {
word = "[_TT_" + std::to_string(i - vocab.token_beg) + "]";
} else if (i == vocab.token_eot) {
word = "[_EOT_]";
} else if (i == vocab.token_sot) {
word = "[_SOT_]";
} else if (i == vocab.token_prev) {
word = "[_PREV_]";
} else if (i == vocab.token_not) {
word = "[_NOT_]";
} else if (i == vocab.token_beg) {
word = "[_BEG_]";
} else {
word = "[_extra_token_" + std::to_string(i) + "]";
}
vocab.token_to_id[word] = i;
vocab.id_to_token[i] = word;
}
}
// Read no speech tokens
{
int32_t noSpeechTokensLen = 0;
fin.read((char *) &noSpeechTokensLen, sizeof(noSpeechTokensLen));
for (int i = 0; i < noSpeechTokensLen; i++) {
uint32_t id;
fin.read((char *) &id, sizeof(id));
vocab.noSpeechTokens.insert(id);
}
}
}
void
inputPadTrim(whisperMel &mel)
{
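// The encoder expects exactly ENCODER_INPUT_LEN (3000) frames per mel bin:
// longer spectrograms are truncated and shorter ones are zero-padded.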
if (mel.n_len == ENCODER_INPUT_LEN)
return;
std::vector<float> data;
std::vector<float> partialData;
int seek = 0;
auto dataLimit = std::min(mel.n_len, ENCODER_INPUT_LEN);
for (auto j = 0; j < mel.n_mel; j++) {
seek = j * mel.n_len;
for (auto i = seek; i < (j + 1) * dataLimit; i++) {
partialData.emplace_back(mel.data[i]);
}
if (mel.n_len < ENCODER_INPUT_LEN) {
for (auto i = mel.n_len; i < ENCODER_INPUT_LEN; i++) {
partialData.emplace_back(0.0f);
}
}
data.insert(data.end(), partialData.begin(), partialData.end());
partialData.clear();
}
std::swap(mel.data, data);
}
/**
* Copyright (C) 2022 Savoir-faire Linux Inc.
*
* Author: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#pragma once
#include <vector>
#include <cstdint>
#include <string>
#include <map>
#include <set>
// These values are defined by the model;
// see page 3 of the Whisper paper (https://cdn.openai.com/papers/whisper.pdf)
#define WHISPER_SAMPLE_RATE 16000
#define WHISPER_N_FFT 400
#define WHISPER_N_MEL 80
#define WHISPER_HOP_LENGTH 160
#define WHISPER_CHUNK_SIZE 30
#define ENCODER_INPUT_LEN 3000
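// Sanity check on how these constants relate:
// ENCODER_INPUT_LEN = WHISPER_CHUNK_SIZE * WHISPER_SAMPLE_RATE / WHISPER_HOP_LENGTH
//                   = 30 * 16000 / 160 = 3000 mel frames per 30-second window.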
struct whisperMel {
int n_len;
int n_mel;
std::vector<float> data;
};
struct whisperFilters {
int32_t n_mel;
int32_t n_fft;
std::vector<float> data;
};
struct whisperVocab {
size_t n_vocab = 51864;
std::map<std::string, int32_t> token_to_id;
std::map<int32_t, std::string> id_to_token;
int32_t token_eot = 50256;
int32_t token_sot = 50257;
int32_t token_prev = 50360;
int32_t token_solm = 50361; // no speech
int32_t token_not = 50362; // no timestamps
int32_t token_beg = 50363; // timestamp begin
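// The ids above are the English-only (51864-token) defaults; loadTokens()
// shifts each of them up by one when a multilingual (51865-token) vocab is loaded.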
// available tasks
const int32_t token_translate = 50358;
const int32_t token_transcribe = 50359;
bool is_multilingual() const {
return n_vocab == 51865;
}
std::map<std::string, int32_t> languageTokens2Id;
std::map<int32_t, std::string> languageId2Tokens;
std::set<int32_t> noSpeechTokens;
};
bool logMelSpectrogram(
const float * samples,
const int n_samples,
const int n_threads,
const whisperFilters & filters,
whisperMel &mel);
void fft(const std::vector<float> & in, std::vector<float> & out);
void dft(const std::vector<float> & in, std::vector<float> & out);
void loadMelFilters(const std::string& fileName, whisperFilters& filters);
void loadTokens(const std::string& fileName, whisperVocab& vocab);
void inputPadTrim(whisperMel &mel);
......@@ -24,27 +24,27 @@
#include <frameUtils.h>
#include <bitset>
#include <iostream>
#include <fmt/core.h>
#include <fmt/format.h>
const std::string TAG = "Transcript";
#include "stt_whisper.h"
const std::string TAG = "TranscriptAudio";
const char sep = separator();
namespace jami {
TranscriptAudioSubscriber::TranscriptAudioSubscriber(const std::string& dataPath, TranscriptVideoSubscriber* videoSubscriber, bool acc)
TranscriptAudioSubscriber::TranscriptAudioSubscriber(const std::string& dataPath,
TranscriptVideoSubscriber* videoSubscriber)
: path_ {dataPath}
, modelProcessor_ {dataPath, acc}
, mVS_ {videoSubscriber}
{
loadMelFilters(path_ + "/assets/mel_filters.bin", modelFilters_);
Plog::log(Plog::LogPriority::INFO, TAG, fmt::format("TranscriptAudioSubscriber {}", fmt::ptr(this)));
}
TranscriptAudioSubscriber::~TranscriptAudioSubscriber()
{
modelProcessor_.endModels();
formatFilter_.clean();
stop();
processFrameThread.join();
Plog::log(Plog::LogPriority::INFO, TAG, "~TranscriptMediaProcessor");
Plog::log(Plog::LogPriority::INFO, TAG, fmt::format("~TranscriptAudioSubscriber {}", fmt::ptr(this)));
}
/**
......@@ -53,83 +53,84 @@ TranscriptAudioSubscriber::~TranscriptAudioSubscriber()
void
TranscriptAudioSubscriber::processFrame()
{
while (running) {
auto data = modelInput_[modelIdx_];
if (data.size() <= WHISPER_STREAM_SAMPLES_CHUNK - WHISPER_STREAM_SAMPLES_CHUNK_STEP) {
std::this_thread::sleep_for(std::chrono::milliseconds(waitingPoint_));
continue;
if (!whisper_) {
whisper_ = std::make_unique<RealtimeSttWhisper>(path_ + "/assets/ggml-base.bin");
whisper_->setLanguage(language_);
}
if (!running)
break;
melSpectrogram_.data.clear();
melSpectrogram_.n_len = 0;
melSpectrogram_.n_mel = 0;
logMelSpectrogram(data.data(),
data.size(),
8,
modelFilters_,
melSpectrogram_);
inputPadTrim(melSpectrogram_);
auto text = modelProcessor_.feedInput(melSpectrogram_.data, language_);
if (text.empty()) {
while (running) {
decltype(frames_) frames;
{
std::unique_lock<std::mutex> l(inputLock);
modelInput_[0].clear();
modelInput_[1].clear();
modelIdx_ = 0;
cv_.wait(l, [&]{
return !running || !frames_.empty();
});
if (!running)
return;
frames = std::move(frames_);
}
for (auto& f : frames) {
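// whisper.cpp expects mono float samples at WHISPER_SAMPLE_RATE (16 kHz),
// so every captured frame is resampled to that format before being queued.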
uniqueFramePtr filteredFrame = getUniqueFrame();
filteredFrame->sample_rate = WHISPER_SAMPLE_RATE;
filteredFrame->format = AV_SAMPLE_FMT_FLT;
av_channel_layout_from_mask(&filteredFrame->ch_layout , AV_CH_LAYOUT_MONO);
try {
if (resampler_.resample(f.get(), filteredFrame.get()) == 0) {
whisper_->AddAudioData((float*) filteredFrame->buf[0]->data,
filteredFrame->nb_samples);
}
} catch (...) {
}
}
auto result = whisper_->GetTranscribed();
if (not result.empty()) {
std::string txt;
for (const auto& t : result) {
if (not t.is_partial)
txt += t.text;
}
if (!txt.empty())
mVS_->setText(txt);
}
mVS_->setText(text);
}
whisper_.reset();
}
void
TranscriptAudioSubscriber::stop()
{
Plog::log(Plog::LogPriority::INFO, TAG, "stop()");
{
std::unique_lock<std::mutex> l(inputLock);
running = false;
cv_.notify_all();
}
if (processFrameThread.joinable()) {
processFrameThread.join();
}
std::string str = "";
mVS_->setText(str);
mVS_->setText("");
}
void
TranscriptAudioSubscriber::start()
{
Plog::log(Plog::LogPriority::INFO, TAG, "start()");
running = true;
processFrameThread = std::thread([this] { processFrame(); });
processFrameThread = std::thread([this](){ processFrame(); });
mVS_->setText("");
}
void
TranscriptAudioSubscriber::setParameter(std::string& parameter, Parameter type)
TranscriptAudioSubscriber::setParameter(const std::string& parameter, Parameter type)
{
std::unique_lock<std::mutex> l(inputLock);
std::string str = "";
switch (type) {
case (Parameter::LANGUAGE):
language_ = parameter;
modelInput_[0].clear();
modelInput_[1].clear();
modelIdx_ = 0;
mVS_->setText(str);
break;
case (Parameter::CHUNK):
WHISPER_STREAM_SAMPLES_CHUNK = 16000 * std::stoi(parameter);
modelInput_[0].resize(0);
modelInput_[1].resize(0);
modelInput_[0].reserve(WHISPER_STREAM_SAMPLES_CHUNK);
modelInput_[1].reserve(WHISPER_STREAM_SAMPLES_CHUNK);
waitingPoint_ = (std::stoi(parameter) * 1000 - (WHISPER_STREAM_SAMPLES_CHUNK_STEP / 16)) / 3;
modelIdx_ = 0;
mVS_->setText(str);
break;
case (Parameter::STEP):
modelInput_[0].clear();
modelInput_[1].clear();
WHISPER_STREAM_SAMPLES_CHUNK_STEP = 16000 * std::stoi(parameter);
waitingPoint_ = ((WHISPER_STREAM_SAMPLES_CHUNK / 16) - std::stoi(parameter) * 1000) / 3;
modelIdx_ = 0;
mVS_->setText(str);
if (whisper_)
whisper_->setLanguage(parameter);
break;
default:
return;
......@@ -140,77 +141,30 @@ void
TranscriptAudioSubscriber::update(jami::Observable<AVFrame*>* obs, AVFrame* const& pluginFrame)
{
std::unique_lock<std::mutex> l(inputLock);
if (!pluginFrame || modelFilters_.data.empty() || obs != observable_)
return;
if (firstRun) {
samplesCount_ = 0;
currentModelInput_.clear();
futureModelInput_.clear();
formatFilter_.clean();
AudioFormat afmt = AudioFormat(pluginFrame->sample_rate,
pluginFrame->channels,
static_cast<AVSampleFormat>(pluginFrame->format));
MediaStream ms = MediaStream("input", afmt);
formatFilter_.initialize(filterDescription_, {ms});
firstRun = false;
}
if (!formatFilter_.initialized_)
if (!pluginFrame || obs != observable_)
return;
if (formatFilter_.feedInput(pluginFrame, "input") == 0) {
uniqueFramePtr filteredFrame = {formatFilter_.readOutput(), frameFree};
if (filteredFrame) {
for (size_t i = 0; i < filteredFrame->buf[0]->size; i += 2) {
#ifdef __DEBUG__
std::lock_guard<std::mutex> l(inputLock);
#endif
int16_t rawValue = (filteredFrame->buf[0]->data[i+1] << 8) | filteredFrame->buf[0]->data[i];
// If not a positive value, perform the 2's complement math on the value
if ((rawValue & 0x8000) != 0) {
rawValue = (~(rawValue - 0x0001)) * -1;
}
if (futureModelInput_.size() == WHISPER_STREAM_SAMPLES_CHUNK)
futureModelInput_.erase(futureModelInput_.begin());
futureModelInput_.emplace_back(float(rawValue)/32768.0f);
samplesCount_++;
auto value = float(rawValue) / 32768.0f;
if (modelInput_[modelIdx_].size() >= WHISPER_STREAM_SAMPLES_CHUNK) {
modelInput_[modelIdx_].clear();
modelIdx_ = modelIdx_ ? 0 : 1;
}
modelInput_[modelIdx_].emplace_back(value);
if (modelInput_[modelIdx_].size()
>= WHISPER_STREAM_SAMPLES_CHUNK - WHISPER_STREAM_SAMPLES_CHUNK_STEP) {
modelInput_[modelIdx_ ? 0 : 1].emplace_back(value);
}
}
}
}
frames_.emplace_back(uniqueFramePtr(av_frame_clone(pluginFrame), frameFree));
cv_.notify_all();
// audio returns as is
}
void
TranscriptAudioSubscriber::attached(jami::Observable<AVFrame*>* observable)
{
Plog::log(Plog::LogPriority::INFO, TAG, "::Attached ! ");
std::unique_lock<std::mutex> l(inputLock);
Plog::log(Plog::LogPriority::INFO, TAG, fmt::format("::Attached ! {} for {}", fmt::ptr(this), fmt::ptr(observable)));
observable_ = observable;
start();
}
void
TranscriptAudioSubscriber::detached(jami::Observable<AVFrame*>*)
TranscriptAudioSubscriber::detached(jami::Observable<AVFrame*>* observable)
{
firstRun = true;
observable_ = nullptr;
stop();
modelInput_[0].clear();
modelInput_[1].clear();
modelIdx_ = 0;
Plog::log(Plog::LogPriority::INFO, TAG, "::Detached()");
Plog::log(Plog::LogPriority::INFO, TAG, fmt::format("::Detached ! {} for {}", fmt::ptr(this), fmt::ptr(observable)));
}
void
......@@ -218,7 +172,6 @@ TranscriptAudioSubscriber::detach()
{
if (observable_) {
firstRun = true;
std::ostringstream oss;
Plog::log(Plog::LogPriority::INFO, TAG, "::Calling detach()");
observable_->detach(this);
}
......
......@@ -26,21 +26,25 @@ extern "C" {
#include <observer.h>
#include <frameFilter.h>
#include "Preprocess.h"
#include "ModelProcessor.h"
#include <frameUtils.h>
#include "TranscriptVideoSubscriber.h"
#include "PluginPreferenceHandler.h"
#include "resampler.h"
#include <thread>
#include <condition_variable>
#include <deque>
#include <atomic>
class RealtimeSttWhisper;
namespace jami {
class TranscriptAudioSubscriber : public Observer<AVFrame*>
{
public:
TranscriptAudioSubscriber(const std::string& dataPath, TranscriptVideoSubscriber* videoSubscriber, bool acc = false);
TranscriptAudioSubscriber(const std::string& dataPath,
TranscriptVideoSubscriber* videoSubscriber);
~TranscriptAudioSubscriber();
virtual void update(Observable<AVFrame*>*, AVFrame* const&) override;
......@@ -49,22 +53,11 @@ public:
void detach();
void setParameter(std::string& parameter, Parameter type);
void setParameter(const std::string& parameter, Parameter type);
private:
// Mel spectrogram filters
whisperFilters modelFilters_;
whisperMel melSpectrogram_;
// Observer pattern
Observable<AVFrame*>* observable_ {};
// Filter for audio formatting
const std::string filterDescription_ = "[input]aresample=16000,aformat=sample_fmts=s16:channel_layouts=mono";
FrameFilter formatFilter_;
std::array<std::vector<float>, 2> modelInput_ {};
int modelIdx_ {0};
int waitingPoint_ {1000};
std::string language_ {"auto"};
// Data
......@@ -72,12 +65,15 @@ private:
// Status variables of the processing
bool firstRun {true};
bool running {false};
std::atomic_bool running {false};
std::mutex inputLock;
std::condition_variable cv_;
// Model
ModelProcessor modelProcessor_;
std::unique_ptr<RealtimeSttWhisper> whisper_;
Resampler resampler_;
std::vector<uniqueFramePtr> frames_;
// Threading
std::thread processFrameThread;
......@@ -87,9 +83,5 @@ private:
// Video processor
TranscriptVideoSubscriber* mVS_ {};
size_t WHISPER_STREAM_SAMPLES_CHUNK = 16000 * 15; // 16 KHz * 15 seconds
size_t WHISPER_STREAM_SAMPLES_CHUNK_STEP = 16000 * 3; // 16 KHz * 3 seconds
};
} // namespace jami
......@@ -36,13 +36,11 @@ TranscriptMediaHandler::TranscriptMediaHandler(std::string&& datapath, PluginPre
aph_ = prefHandler;
setId(datapath_);
auto preferences = aph_->getPreferences("default");
auto it = preferences.find("acceleration");
auto useAcceleration = it == preferences.end() ? false : it->second == "1";
videoSubscriber_ = std::make_shared<TranscriptVideoSubscriber>(datapath_);
audioSubscriber_ = std::make_shared<TranscriptAudioSubscriber>(datapath_, videoSubscriber_.get(), useAcceleration);
audioSubscriber_ = std::make_shared<TranscriptAudioSubscriber>(datapath_, videoSubscriber_.get());
setParameters("default");
#ifdef __DEBUG__
it = preferences.find("subtitle");
auto it = preferences.find("subtitle");
if (it != preferences.end())
videoSubscriber_->setText(it->second);
#endif
......@@ -102,8 +100,6 @@ TranscriptMediaHandler::setParameters(const std::string& accountId)
videoSubscriber_->setParameter(preferences["background"], Parameter::BACKGROUND);
videoSubscriber_->setParameter(preferences["position"], Parameter::POSITION);
audioSubscriber_->setParameter(preferences["language"], Parameter::LANGUAGE);
audioSubscriber_->setParameter(preferences["chunksize"], Parameter::CHUNK);
audioSubscriber_->setParameter(preferences["stepsize"], Parameter::STEP);
} catch (std::exception& e) {
Plog::log(Plog::LogPriority::ERR, TAG, e.what());
}
......@@ -129,9 +125,7 @@ TranscriptMediaHandler::detach()
TranscriptMediaHandler::~TranscriptMediaHandler()
{
std::ostringstream oss;
oss << " ~TranscriptMediaHandler from WhisperTranscript Plugin" << std::endl;
Plog::log(Plog::LogPriority::INFO, TAG, oss.str());
Plog::log(Plog::LogPriority::INFO, TAG, "~TranscriptMediaHandler from WhisperTranscript Plugin");
detach();
}
} // namespace jami
......@@ -35,8 +35,11 @@ extern "C" {
#include <fmt/format.h>
#include <bitset>
#include <string_view>
const std::string TAG = "Transcript";
using namespace std::literals;
const std::string TAG = "TranscriptVideo";
const char sep = separator();
namespace jami {
......@@ -54,9 +57,10 @@ TranscriptVideoSubscriber::~TranscriptVideoSubscriber()
}
void
TranscriptVideoSubscriber::setText(std::string& text)
TranscriptVideoSubscriber::setText(const std::string& t)
{
text = string_utils::ffmpegScapeString(text);
Plog::log(Plog::LogPriority::INFO, TAG, "setText " + t);
auto text = string_utils::ffmpegScapeString(t);
std::vector<std::string> textWords = string_utils::getWords(text, " ");
subtitle_ = "";
......@@ -101,9 +105,28 @@ TranscriptVideoSubscriber::setParameter(std::string& parameter, Parameter type)
firstRun = true;
}
std::string_view getTransposeDescr(int rotation)
{
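// ffmpeg's transpose filter rotates 90° clockwise with transpose=1 and 90°
// counter-clockwise with transpose=2; chaining two transpose=1 gives 180°.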
switch (rotation) {
case 90:
case -270:
return "transpose=2,"sv;
case 180:
case -180:
return "transpose=1, transpose=1,"sv;
case 270:
case -90:
return "transpose=1,"sv;
default:
return {};
}
return {};
}
void
TranscriptVideoSubscriber::setFilterDescription()
{
Plog::log(Plog::LogPriority::INFO, TAG, "setFilterDescription() " + subtitle_);
if (pluginFrameSize_.first == 0 || pluginFrameSize_.second == 0)
return;
......@@ -119,35 +142,26 @@ TranscriptVideoSubscriber::setFilterDescription()
point_ = {pluginFrameSize_.first - margin, pluginFrameSize_.second - margin};
}
std::string rotateSides = "";
if (std::abs(angle_) == 90)
rotateSides = ":out_w=ih:out_h=iw";
auto baseInfosDescription
= fmt::format("[input]rotate={}{}"
",drawtext=fontcolor={}:fontsize={}:fontfile=\\'{}\\':expansion=none:text='{}"
= fmt::format("[input]{}"
"drawtext=fontcolor={}:fontsize={}:fontfile=\\'{}\\':expansion=none:text='{}"
"':line_spacing=5:box=1:boxcolor={}:boxborderw=5:x=",
rotation[angle_], rotateSides,
getTransposeDescr(angle_),
fontColor_, fontSize_, fontFile_, subtitle_, fontBackground_);
auto position = "{}-text_w:y={}";
auto position = "{}-text_w:y={}"sv;
if (position_ == "2")
position = "{}:y={}";
position = "{}:y={}"sv;
else if (position_ == "3")
position = "{}:y={}-text_h";
position = "{}:y={}-text_h"sv;
else if (position_ == "4")
position = "{}-text_w:y={}-text_h";
baseInfosDescription = baseInfosDescription + position + ",rotate={}{},format=yuv420p";
filterDescription_ = fmt::format(baseInfosDescription,
std::to_string(point_.first),
std::to_string(point_.second),
rotation[-angle_],
rotateSides);
position = "{}-text_w:y={}-text_h"sv;
filterDescription_ = baseInfosDescription + fmt::format(std::string(position) + ",{}format=yuv420p"s,
point_.first,
point_.second,
getTransposeDescr(-angle_));
#ifdef __DEBUG__
Plog::log(Plog::LogPriority::INFO, TAG, filterDescription_);
#endif
}
void
......@@ -156,9 +170,8 @@ TranscriptVideoSubscriber::update(jami::Observable<AVFrame*>*, AVFrame* const& p
if (!observable_ || !pluginFrame || subtitle_.empty())
return;
AVFrameSideData* side_data = av_frame_get_side_data(pluginFrame, AV_FRAME_DATA_DISPLAYMATRIX);
int newAngle {0};
if (side_data) {
if (AVFrameSideData* side_data = av_frame_get_side_data(pluginFrame, AV_FRAME_DATA_DISPLAYMATRIX)) {
auto matrix_rotation = reinterpret_cast<int32_t*>(side_data->data);
newAngle = static_cast<int>(av_display_rotation_get(matrix_rotation));
}
......@@ -170,12 +183,17 @@ TranscriptVideoSubscriber::update(jami::Observable<AVFrame*>*, AVFrame* const& p
//======================================================================================
// GET RAW FRAME
uniqueFramePtr rgbFrame = {transferToMainMemory(pluginFrame, AV_PIX_FMT_NV12), frameFree};
if (!rgbFrame.get())
return;
if ((AVPixelFormat)rgbFrame->format != AV_PIX_FMT_YUV420P)
rgbFrame.reset(FrameScaler::convertFormat(rgbFrame.get(), AV_PIX_FMT_YUV420P));
if (!rgbFrame.get())
return;
if (sourceTimeBase_.num != pluginFrame->time_base.num || sourceTimeBase_.den != pluginFrame->time_base.den)
firstRun = true;
if (rgbFrame->width != pluginFrameSize_.first || rgbFrame->height != pluginFrameSize_.second)
firstRun = true;
rgbFrame->pts = pluginFrame->pts;
rgbFrame->time_base = pluginFrame->time_base;
......@@ -184,8 +202,6 @@ TranscriptVideoSubscriber::update(jami::Observable<AVFrame*>*, AVFrame* const& p
if (firstRun) {
filter_.clean();
pluginFrameSize_ = {rgbFrame->width, rgbFrame->height};
if (std::abs(angle_) == 90)
pluginFrameSize_ = {rgbFrame->height, rgbFrame->width};
setFilterDescription();
rational<int> fr(sourceTimeBase_.den, sourceTimeBase_.num);
......
......@@ -43,7 +43,7 @@ public:
void detach();
void setText(std::string& text);
void setText(const std::string& text);
void setFilterDescription();
void setParameter(std::string& parameter, Parameter type);
......
......@@ -12,18 +12,21 @@ EXTRAPATH=''
# -d: debug program.
if [ -z "${DAEMON}" ]; then
DAEMON="./../../daemon"
echo "DAEMON not provided, building with ${DAEMON}"
echo "DAEMON not provided, building with ./../../daemon"
fi
DAEMON=${DAEMON:="./../../daemon"}
CONTRIB_PATH=${CONTRIB_PATH:="${DAEMON}/contrib"}
CONTRIB_BUILD_DIR=${CONTRIB_BUILD_DIR:="native"}
PLUGIN_NAME="WhisperTranscript"
JPL_FILE_NAME="${PLUGIN_NAME}.jpl"
SO_FILE_NAME="lib${PLUGIN_NAME}.so"
DAEMON_SRC="${DAEMON}/src"
CONTRIB_PATH="${DAEMON}/contrib"
PLUGINS_LIB="../lib"
LIBS_DIR="./../contrib/Libs"
PLATFORM=$(uname)
CONTRIB_BUILD_PATH="${CONTRIB_PATH}/${CONTRIB_BUILD_DIR}"
if [ "${PLATFORM}" = "Linux" ]; then
PLATFORM="linux-gnu"
......@@ -74,39 +77,57 @@ fi
echo $PROCESSOR
cp -r ffmpeg ${CONTRIB_PATH}/src/
cp -r whispercpp ${CONTRIB_PATH}/src/
cp -r ../contrib/rav1e ${CONTRIB_PATH}/src/
if [ ! -f "./data/assets/ggml-base.bin" ]; then
if [ -x "$(command -v wget)" ]; then
wget --quiet --show-progress -O ./data/assets/ggml-base.bin https://ggml.ggerganov.com/ggml-model-whisper-base.bin
elif [ -x "$(command -v curl)" ]; then
curl --output ./data/assets/ggml-base.bin https://ggml.ggerganov.com/ggml-model-whisper-base.bin
else
printf "Either wget or curl is required to download models.\n"
exit 1
fi
fi
if [ ! -f "./data/assets/ggml-base.bin" ]; then
printf "Model is required to build the plugin. Aborting.\n"
exit 1
fi
if [ "${PLATFORM}" = "linux-gnu" ] || [ "${PLATFORM}" = "redhat-linux" ]
then
if [ -f "${CONTRIB_PATH}/native/.ffmpeg" ]; then
rm "${CONTRIB_PATH}/native/.ffmpeg"
if [ -f "${CONTRIB_BUILD_PATH}/.ffmpeg" ]; then
rm "${CONTRIB_BUILD_PATH}/.ffmpeg"
rm -rf "${CONTRIB_BUILD_PATH}/ffmpeg"
fi
if [ -f "${CONTRIB_BUILD_PATH}/.whispercpp" ]; then
rm "${CONTRIB_BUILD_PATH}/.whispercpp"
rm -rf "${CONTRIB_BUILD_PATH}/whispercpp"
fi
WORKPATH=$(pwd)
cd "${CONTRIB_PATH}/native/"
make .ffmpeg -j$(nproc)
cd "${CONTRIB_BUILD_PATH}/"
make .ffmpeg -j$(nproc) install
make .whispercpp -j$(nproc) install
rm .whispercpp
rm .ffmpeg
cd ${WORKPATH}
CONTRIB_PLATFORM=${CONTRIB_PLATFORM_CURT}-${PLATFORM}
ONNX_PATH=${EXTRALIBS_PATH}
if [ -z "${EXTRALIBS_PATH}" ]
then
ONNX_PATH="${CONTRIB_PATH}/${CONTRIB_PLATFORM}"
fi
if [ ${DEBUG} ]; then
OUTPUT="${PLUGIN_NAME}"
CLANG_OPTS="-g -fsanitize=address"
CLANG_OPTS="-O0 -g -fsanitize=address"
EXTRA_DEBUG_LIBRARIES="-lyaml-cpp -lvdpau -lX11 -lva-drm -lva-x11 -lrav1e"
EXTRA_DEFINES="-D__DEBUG__"
else
python3 ./../SDK/jplManipulation.py --preassemble --plugin=${PLUGIN_NAME}
CLANG_OPTS="-O3 -shared"
CLANG_OPTS="-O3 -g -shared"
OUTPUT="build-local/jpl/lib/${CONTRIB_PLATFORM}/${SO_FILE_NAME}"
fi
# Compile
clang++ -std=c++17 -g -O0 -fPIC ${CLANG_OPTS} \
clang++ -std=c++17 -fPIC ${CLANG_OPTS} \
-Wl,-Bsymbolic,-rpath,"\${ORIGIN}" \
-Wall -Wextra \
-Wno-unused-parameter \
......@@ -115,30 +136,26 @@ then
-I"." \
-I"${DAEMON_SRC}" \
-I"${CONTRIB_PATH}/${CONTRIB_PLATFORM}/include" \
-I"${ONNX_PATH}/include/onnxruntime/session" \
-I"${ONNX_PATH}/include/onnxruntime/providers/cuda" \
-I"${CONTRIB_PATH}/native/onnx/onnxruntime" \
-I"${PLUGINS_LIB}" \
./../lib/common.cpp \
./../lib/accel.cpp \
./../lib/frameFilter.cpp \
./../lib/frameUtils.cpp \
./../lib/resampler.cpp \
main.cpp \
TranscriptMediaHandler.cpp \
TranscriptAudioSubscriber.cpp \
TranscriptVideoSubscriber.cpp \
PluginPreferenceHandler.cpp \
Preprocess.cpp \
ModelProcessor.cpp \
-L"${CONTRIB_PATH}/${CONTRIB_PLATFORM}/lib/" \
-L"${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}" \
-L"${CUDA_HOME}/lib64/" \
stt_whisper.cpp \
-L"${CONTRIB_PATH}/${CONTRIB_PLATFORM}/lib" \
-l:libavfilter.a \
-l:libswscale.a \
-l:libswresample.a \
-l:libavformat.a \
-l:libavcodec.a \
-l:libavutil.a \
-l:libwhisper.a \
-lfreetype \
-lvpx \
-lx264 \
......@@ -147,57 +164,36 @@ then
-lz \
-lva \
-lfmt \
-lonnxruntime \
${EXTRA_DEBUG_LIBRARIES} \
-o "${OUTPUT}"
if [ ${DEBUG} ]; then
cp "./modelSRC/mModelEncoder.onnx" "./data/assets/mModelEncoder.onnx"
cp "./modelSRC/mModelDecoder.onnx" "./data/assets/mModelDecoder.onnx"
cp "./modelSRC/mLogSoftMax.onnx" "./data/assets/mLogSoftMax.onnx"
cp "${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}/libonnxruntime.so" "libonnxruntime.so.1.12.0"
else
cp "./modelSRC/mModelEncoder.onnx" "./build-local/jpl/data/assets/mModelEncoder.onnx"
cp "./modelSRC/mModelDecoder.onnx" "./build-local/jpl/data/assets/mModelDecoder.onnx"
cp "./modelSRC/mLogSoftMax.onnx" "./build-local/jpl/data/assets/mLogSoftMax.onnx"
cp "${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}/libonnxruntime.so" "build-local/jpl/lib/$CONTRIB_PLATFORM/libonnxruntime.so.1.12.0"
fi
if [ "${PROCESSOR}" = "NVIDIA" ]; then
if [ ${DEBUG} ]; then
cp "${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}/libonnxruntime_providers_shared.so" "libonnxruntime_providers_shared.so"
cp "${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}/libonnxruntime_providers_cuda.so" "libonnxruntime_providers_cuda.so"
else
cp "${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}/libonnxruntime_providers_shared.so" "build-local/jpl/lib/$CONTRIB_PLATFORM/libonnxruntime_providers_shared.so"
cp "${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}/libonnxruntime_providers_cuda.so" "build-local/jpl/lib/$CONTRIB_PLATFORM/libonnxruntime_providers_cuda.so"
fi
fi
elif [ "${PLATFORM}" = "darwin" ]
then
if [ -f "${CONTRIB_PATH}/native/.ffmpeg" ]; then
rm "${CONTRIB_PATH}/native/.ffmpeg"
if [ -f "${CONTRIB_BUILD_PATH}/.ffmpeg" ]; then
rm "${CONTRIB_BUILD_PATH}/.ffmpeg"
rm -rf "${CONTRIB_BUILD_PATH}/ffmpeg"
fi
if [ -f "${CONTRIB_BUILD_PATH}/.whispercpp" ]; then
rm "${CONTRIB_BUILD_PATH}/.whispercpp"
rm -rf "${CONTRIB_BUILD_PATH}/whispercpp"
fi
WORKPATH=$(pwd)
cd "${CONTRIB_PATH}/native/"
make .ffmpeg -j$(nproc)
cd "${CONTRIB_BUILD_PATH}/"
make .whispercpp
make .ffmpeg
rm .whispercpp
rm .ffmpeg
cd ${WORKPATH}
CONTRIB_PLATFORM=${CONTRIB_PLATFORM_CURT}-${PLATFORM}
ONNX_PATH=${EXTRALIBS_PATH}
if [ -z "${EXTRALIBS_PATH}" ]
then
ONNX_PATH="${CONTRIB_PATH}/${CONTRIB_PLATFORM}${CONTRIB_PLATFORM_EXTRA}"
fi
if [ ${DEBUG} ]; then
OUTPUT="${PLUGIN_NAME}"
CLANG_OPTS="-g -fsanitize=address"
CLANG_OPTS="-O0 -g -fsanitize=address"
EXTRA_DEBUG_LIBRARIES="-lyaml-cpp -lrav1e"
EXTRA_DEFINES="-D__DEBUG__"
else
python3 ./../SDK/jplManipulation.py --preassemble --plugin=${PLUGIN_NAME}
CLANG_OPTS="-O3 -shared"
CLANG_OPTS="-O3 -g -shared"
OUTPUT="build-local/jpl/lib/${CONTRIB_PLATFORM}/${SO_FILE_NAME}"
fi
......@@ -215,21 +211,19 @@ then
-I"." \
-I"${DAEMON_SRC}" \
-I"${CONTRIB_PATH}/${CONTRIB_PLATFORM}${CONTRIB_PLATFORM_EXTRA}/include" \
-I"${ONNX_PATH}/include/onnxruntime/session" \
-I"${PLUGINS_LIB}" \
./../lib/common.cpp \
./../lib/accel.cpp \
./../lib/frameFilter.cpp \
./../lib/frameUtils.cpp \
./../lib/resampler.cpp \
main.cpp \
TranscriptMediaHandler.cpp \
TranscriptAudioSubscriber.cpp \
TranscriptVideoSubscriber.cpp \
PluginPreferenceHandler.cpp \
Preprocess.cpp \
ModelProcessor.cpp \
stt_whisper.cpp \
-L"${CONTRIB_PATH}/${CONTRIB_PLATFORM}${CONTRIB_PLATFORM_EXTRA}/lib/" \
-L"${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}" \
-lavfilter \
-lswscale \
-lswresample \
......@@ -237,44 +231,22 @@ then
-lavcodec \
-lavutil \
-lvpx -lx264 -lbz2 -liconv -lz \
-lonnxruntime \
-lspeex \
-lopus \
"${CONTRIB_PATH}/${CONTRIB_PLATFORM}${CONTRIB_PLATFORM_EXTRA}/lib/libspeex.a" \
"${CONTRIB_PATH}/${CONTRIB_PLATFORM}${CONTRIB_PLATFORM_EXTRA}/lib/libopus.a" \
-lfmt \
-lwhisper \
"/usr/local/opt/libpng/lib/libpng.a" \
"/usr/local/opt/freetype/lib/libfreetype.a" \
${EXTRA_DEBUG_LIBRARIES} \
-o "${OUTPUT}"
if [ ${DEBUG} ]; then
cp "${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}/libonnxruntime.dylib" "libonnxruntime.dylib"
cp "./modelSRC/mModelEncoder.onnx" "./data/assets/mModelEncoder.onnx"
cp "./modelSRC/mModelDecoder.onnx" "./data/assets/mModelDecoder.onnx"
cp "./modelSRC/mLogSoftMax.onnx" "./data/assets/mLogSoftMax.onnx"
install_name_tool -id "@loader_path/libonnxruntime.1.12.0.dylib" "libonnxruntime.dylib"
install_name_tool -id "@loader_path/${PLUGIN_NAME}" "${OUTPUT}"
else
cp "./modelSRC/mModelEncoder.onnx" "./build-local/jpl/data/assets/mModelEncoder.onnx"
cp "./modelSRC/mModelDecoder.onnx" "./build-local/jpl/data/assets/mModelDecoder.onnx"
cp "./modelSRC/mLogSoftMax.onnx" "./build-local/jpl/data/assets/mLogSoftMax.onnx"
cp "${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}/libonnxruntime.dylib" "build-local/jpl/lib/${CONTRIB_PLATFORM}/libonnxruntime.dylib"
install_name_tool -id "@loader_path/libonnxruntime.1.12.0.dylib" "build-local/jpl/lib/${CONTRIB_PLATFORM}/libonnxruntime.dylib"
install_name_tool -id "@loader_path/${SO_FILE_NAME}" "${OUTPUT}"
fi
install_name_tool -change "@rpath/libonnxruntime.1.12.0.dylib" "@loader_path/libonnxruntime.dylib" "${OUTPUT}"
if [ -n "${APPLE_SIGN_CERTIFICATE}" ]; then
codesign --force --verify --timestamp -o runtime --sign "${APPLE_SIGN_CERTIFICATE}" "build-local/jpl/lib/${CONTRIB_PLATFORM}/libonnxruntime.dylib"
codesign --force --verify --timestamp -o runtime --sign "${APPLE_SIGN_CERTIFICATE}" "build-local/jpl/lib/${CONTRIB_PLATFORM}/${SO_FILE_NAME}"
ditto -c -k --rsrc "build-local/jpl/lib/${CONTRIB_PLATFORM}/libonnxruntime.dylib" "build-local/libonnxruntime.dylib.zip"
LIBRARYNAME=libonnxruntime.dylib sh ./../notarize.sh
ditto -x -k "build-local/libonnxruntime.dylib.zip" "build-local/notarized0"
cp "build-local/notarized0/libonnxruntime.dylib" "build-local/jpl/lib/${CONTRIB_PLATFORM}/libonnxruntime.dylib"
ditto -c -k --rsrc "build-local/jpl/lib/${CONTRIB_PLATFORM}/${SO_FILE_NAME}" "build-local/${SO_FILE_NAME}.zip"
LIBRARYNAME=${SO_FILE_NAME} sh ./../notarize.sh
ditto -x -k "build-local/${SO_FILE_NAME}.zip" "build-local/notarized1"
cp "build-local/notarized1/${SO_FILE_NAME}" "build-local/jpl/lib/${CONTRIB_PLATFORM}/${SO_FILE_NAME}"
ditto -x -k "build-local/${SO_FILE_NAME}.zip" "build-local/notarized"
cp "build-local/notarized/${SO_FILE_NAME}" "build-local/jpl/lib/${CONTRIB_PLATFORM}/${SO_FILE_NAME}"
fi
elif [ "${PLATFORM}" = "android" ]
@@ -346,13 +318,15 @@ then
CONTRIB_PLATFORM=x86_64-linux-android
fi
if [ -f "${CONTRIB_PATH}/native-${CONTRIB_PLATFORM}/.ffmpeg" ]; then
rm "${CONTRIB_PATH}/native-${CONTRIB_PLATFORM}/.ffmpeg"
if [ -f "${CONTRIB_BUILD_PATH}-${CONTRIB_PLATFORM}/.ffmpeg" ]; then
rm "${CONTRIB_BUILD_PATH}-${CONTRIB_PLATFORM}/.ffmpeg"
fi
WORKPATH=$(pwd)
cd "${CONTRIB_PATH}/native-${CONTRIB_PLATFORM}/"
cd "${CONTRIB_BUILD_PATH}-${CONTRIB_PLATFORM}/"
make .ffmpeg -j$(nproc)
make .whispercpp -j$(nproc)
rm .whispercpp
rm .ffmpeg
cd ${WORKPATH}
@@ -360,12 +334,6 @@ then
# Compile the plugin
#=========================================================
ONNX_PATH="${EXTRALIBS_PATH}/${CURRENT_ABI}"
if [ -z ${EXTRALIBS_PATH} ]
then
ONNX_PATH="${CONTRIB_PATH}/${CONTRIB_PLATFORM}"
fi
# Create so destination folder
$CXX --std=c++17 -O3 -fPIC \
-Wl,-Bsymbolic,-rpath,"\${ORIGIN}" \
@@ -376,10 +344,6 @@ then
-I"." \
-I"${DAEMON_SRC}" \
-I"${CONTRIB_PATH}/${CONTRIB_PLATFORM}/include" \
-I"${ONNX_PATH}/include/onnxruntime/session" \
-I"${ONNX_PATH}/include/onnxruntime/providers/nnapi" \
-I"${ONNX_PATH}/../include/onnxruntime/session" \
-I"${ONNX_PATH}/../include/onnxruntime/providers/nnapi" \
-I"${PLUGINS_LIB}" \
./../lib/common.cpp \
./../lib/accel.cpp \
@@ -390,10 +354,9 @@ then
TranscriptAudioSubscriber.cpp \
TranscriptVideoSubscriber.cpp \
PluginPreferenceHandler.cpp \
Preprocess.cpp \
ModelProcessor.cpp \
stt_whisper.cpp \
./../lib/resampler.cpp \
-L"${CONTRIB_PATH}/${CONTRIB_PLATFORM}/lib/" \
-L"${ONNX_PATH}/lib/" \
-lavfilter \
-lswscale \
-lswresample \
@@ -405,13 +368,11 @@ then
-lspeex \
-lopus \
-lfmt \
-lwhisper \
-l:libfreetype.a \
-llog -lz \
-lonnxruntime \
--sysroot=$ANDROID_SYSROOT \
-o "build-local/jpl/lib/$CURRENT_ABI/${SO_FILE_NAME}"
cp "${ONNX_PATH}/lib/libonnxruntime.so" "build-local/jpl/lib/${CURRENT_ABI}/libonnxruntime.so"
}
# Build the so
@@ -419,10 +380,6 @@ then
CURRENT_ABI=$i
buildlib
done
cp "./modelSRC/mModelEncoder.ort" "./build-local/jpl/data/assets/mModelEncoder.ort"
cp "./modelSRC/mModelDecoder.ort" "./build-local/jpl/data/assets/mModelDecoder.ort"
cp "./modelSRC/mLogSoftMax.ort" "./build-local/jpl/data/assets/mLogSoftMax.ort"
fi
if [ ! ${DEBUG} ]; then
......
@@ -3,7 +3,7 @@
"type": "List",
"key": "language",
"title": "{{language_title}}",
"defaultValue": "en",
"defaultValue": "auto",
"scope": "plugin,Transcript",
"entryValues": [
"auto",
@@ -210,6 +210,84 @@
"{{language_yo}}"
]
},
{
"type": "List",
"key": "background",
"title": "{{background_title}}",
"summary": "{{background_summary}}",
"defaultValue": "black",
"scope": "plugin,Transcript",
"entryValues": [
"black",
"white"
],
"entries": [
"{{background_entries_1}}",
"{{background_entries_2}}"
]
},
{
"type": "List",
"key": "position",
"title": "{{position_title}}",
"defaultValue": "2",
"scope": "plugin,Transcript",
"entryValues": [
"1",
"2",
"3",
"4"
],
"entries": [
"{{position_entries_1}}",
"{{position_entries_2}}",
"{{position_entries_3}}",
"{{position_entries_4}}"
]
},
{
"type": "List",
"key": "fontsize",
"title": "{{fontsize_title}}",
"defaultValue": "14",
"scope": "plugin,Transcript",
"entryValues": [
"10",
"12",
"14",
"16",
"18",
"24",
"36",
"72"
],
"entries": [
"10",
"12",
"14",
"16",
"18",
"24",
"36",
"72"
]
},
{
"type": "List",
"key": "avstream",
"title": "{{avstream_title}}",
"summary": "{{avstream_summary}}",
"defaultValue": "in",
"scope": "plugin",
"entryValues": [
"out",
"in"
],
"entries": [
"{{avstream_entries_1}}",
"{{avstream_entries_2}}"
]
},
{
"type": "Switch",
"key": "TranscriptAlways",
......
*.onnx
*.bin
File deleted
File deleted
@@ -5,11 +5,10 @@
"avstream_entries_2": "Received",
"TranscriptAlways_title": "Automatically activate transcription",
"TranscriptAlways_summary": "Activate transcription when a call starts.",
"background_title": "Add background color",
"background_summary": "Add a partial transparency to the subtitle background if it isn't visible enough",
"background_entries_1": "None",
"background_entries_2": "Black",
"background_entries_3": "White",
"background_title": "Background color",
"background_summary": "Defines the subtitle background color",
"background_entries_1": "Black",
"background_entries_2": "White",
"position_title": "Transcription position",
"position_entries_1": "Top right",
"position_entries_2": "Top left",
@@ -116,9 +115,5 @@
"language_vi": "Vietnamese",
"language_cy": "Welsh",
"language_yi": "Yiddish",
"language_yo": "Yoruba",
"acceleration_title": "Use hardware acceleration",
"acceleration_summary": "Use CUDA or NNAPI where applicable",
"chunk_title": "Chunk size in seconds",
"step_title": "Step size in seconds"
"language_yo": "Yoruba"
}
\ No newline at end of file
[
{
"type": "List",
"key": "background",
"title": "{{background_title}}",
"summary": "{{background_summary}}",
"defaultValue": "black@0.0",
"scope": "plugin,Transcript",
"entryValues": [
"black@0.0",
"black@0.5",
"white@0.5"
],
"entries": [
"{{background_entries_1}}",
"{{background_entries_2}}",
"{{background_entries_3}}"
]
},
{
"type": "List",
"key": "position",
"title": "{{position_title}}",
"defaultValue": "2",
"scope": "plugin,Transcript",
"entryValues": [
"1",
"2",
"3",
"4"
],
"entries": [
"{{position_entries_1}}",
"{{position_entries_2}}",
"{{position_entries_3}}",
"{{position_entries_4}}"
]
},
{
"type": "List",
"key": "fontsize",
"title": "{{fontsize_title}}",
"defaultValue": "14",
"scope": "plugin,Transcript",
"entryValues": [
"10",
"12",
"14",
"16",
"18",
"24",
"36",
"72"
],
"entries": [
"10",
"12",
"14",
"16",
"18",
"24",
"36",
"72"
]
},
{
"type": "List",
"key": "avstream",
"title": "{{avstream_title}}",
"summary": "{{avstream_summary}}",
"defaultValue": "in",
"scope": "plugin",
"entryValues": [
"out",
"in"
],
"entries": [
"{{avstream_entries_1}}",
"{{avstream_entries_2}}"
]
},
{
"type": "List",
"key": "chunksize",
"title": "{{chunk_title}}",
"defaultValue": "15",
"scope": "plugin,Transcript",
"entryValues": [
"5",
"6",
"7",
"8",
"9",
"10",
"11",
"12",
"13",
"14",
"15",
"16",
"17",
"18",
"19",
"20",
"21",
"22",
"23",
"24",
"25",
"26",
"27",
"28",
"29",
"30"
],
"entries": [
"5",
"6",
"7",
"8",
"9",
"10",
"11",
"12",
"13",
"14",
"15",
"16",
"17",
"18",
"19",
"20",
"21",
"22",
"23",
"24",
"25",
"26",
"27",
"28",
"29",
"30"
]
},
{
"type": "List",
"key": "stepsize",
"title": "{{step_title}}",
"defaultValue": "3",
"scope": "plugin,Transcript",
"entryValues": [
"1",
"2",
"3",
"4",
"5",
"6",
"7",
"8",
"9",
"10",
"11",
"12",
"13",
"14",
"15",
"16",
"17",
"18",
"19",
"20",
"21",
"22",
"23",
"24",
"25",
"26",
"27",
"28",
"29",
"30"
],
"entries": [
"1",
"2",
"3",
"4",
"5",
"6",
"7",
"8",
"9",
"10",
"11",
"12",
"13",
"14",
"15",
"16",
"17",
"18",
"19",
"20",
"21",
"22",
"23",
"24",
"25",
"26",
"27",
"28",
"29",
"30"
]
},
{
"type": "Switch",
"key": "acceleration",
"title": "{{acceleration_title}}",
"summary": "{{acceleration_summary}}",
"defaultValue": "1",
"scope": "plugin"
}
]
\ No newline at end of file
[]
\ No newline at end of file