diff --git a/WhisperTranscript/.gitignore b/WhisperTranscript/.gitignore index a55df1d1b5f260945928fc6bf5b33f8911c58276..f31f07ce5fe22d444fe5b9a88e9749c3268e4a87 100644 --- a/WhisperTranscript/.gitignore +++ b/WhisperTranscript/.gitignore @@ -1,6 +1,5 @@ *.mp3 /WhisperTranscript* -libonnxruntime.so* /libonnxruntime.dylib /processed.mp4 *.so diff --git a/WhisperTranscript/CMakeLists.txt b/WhisperTranscript/CMakeLists.txt index a55f5c4857b013c8869a24dce17da555c9ca7eb6..d1e06ff42f3539f9d343fb8dd92ab592e0423798 100644 --- a/WhisperTranscript/CMakeLists.txt +++ b/WhisperTranscript/CMakeLists.txt @@ -58,11 +58,11 @@ set(plugin_SRC main.cpp PluginPreferenceHandler.cpp TranscriptAudioSubscriber.cpp TranscriptVideoSubscriber.cpp - Preprocess.cpp - ModelProcessor.cpp + stt_whisper.cpp ./../lib/accel.cpp ./../lib/frameUtils.cpp ./../lib/frameFilter.cpp + ./../lib/resampler.cpp ./../lib/common.cpp ) @@ -70,8 +70,7 @@ set(plugin_HDR TranscriptAudioSubscriber.h TranscriptVideoSubscriber.h TranscriptMediaHandler.h PluginPreferenceHandler.h - Preprocess.h - ModelProcessor.h + stt_whisper.h ./../lib/pluglog.h ./../lib/mediaStream.h ./../lib/audioFormat.h @@ -98,6 +97,7 @@ target_include_directories(${ProjectName} PUBLIC ${PROJECT_BINARY_DIR} ${ONNX_DIR}/../include/session ${ONNX_DIR}/../include/providers/cuda ${CONTRIB_PATH}/build/yaml-cpp/include + ${CONTRIB_PATH}/build/whispercpp ) target_link_directories(${ProjectName} PUBLIC ${CONTRIB_PATH} ${CONTRIB_PATH}/build/fmt/msvc/Release @@ -110,7 +110,7 @@ target_link_directories(${ProjectName} PUBLIC ${CONTRIB_PATH} target_link_libraries(${ProjectName} PUBLIC libyaml-cppmd libavfilter libswscale libswresample libavformat libavcodec libavutil libvpx libx264 libopus - libmfx fmt libzlib freetype ws2_32 Bcrypt Secur32 onnxruntime msvcrt) + libmfx fmt libzlib freetype whisper ws2_32 Bcrypt Secur32 msvcrt) add_custom_command( TARGET ${ProjectName} @@ -120,6 +120,8 @@ add_custom_command( COMMAND ${CMAKE_COMMAND} -E copy_directory ${PROJECT_SOURCE_DIR}/../contrib/yaml-cpp ${CONTRIB_PATH}/src/yaml-cpp COMMAND ${CMAKE_COMMAND} -E copy_directory ${PROJECT_SOURCE_DIR}/../contrib/freetype ${CONTRIB_PATH}/src/freetype COMMAND ${CMAKE_COMMAND} -E copy_directory ${PROJECT_SOURCE_DIR}/ffmpeg/ ${CONTRIB_PATH}/src/ffmpeg + COMMAND ${CMAKE_COMMAND} -E copy_directory ${PROJECT_SOURCE_DIR}/whispercpp/ ${CONTRIB_PATH}/src/whispercpp + COMMAND python ${DAEMON}/compat/msvc/winmake.py -fb whispercpp COMMAND python ${DAEMON}/compat/msvc/winmake.py -fb fmt COMMAND python ${DAEMON}/compat/msvc/winmake.py -fb yaml-cpp COMMAND python ${DAEMON}/compat/msvc/winmake.py -fb zlib @@ -136,34 +138,17 @@ if(TESTPROCESS) PRE_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/testPreferences.yml ${PROJECT_BINARY_DIR}/ COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/jfk.wav ${PROJECT_BINARY_DIR}/ - COMMAND ${CMAKE_COMMAND} -E copy ${ONNX_DIR}/onnxruntime.lib ${PROJECT_BINARY_DIR}/Debug - COMMAND ${CMAKE_COMMAND} -E copy ${ONNX_DIR}/onnxruntime.dll ${PROJECT_BINARY_DIR}/Debug - COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/modelSRC/mModelEncoder.onnx ${PROJECT_SOURCE_DIR}/data/assets - COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/modelSRC/mModelDecoder.onnx ${PROJECT_SOURCE_DIR}/data/assets - COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/modelSRC/mLogSoftMax.onnx ${PROJECT_SOURCE_DIR}/data/assets + COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/sample.mp4 ${PROJECT_BINARY_DIR}/ + COMMAND ${CMAKE_COMMAND} -E copy 
${CONTRIB_PATH}/build/whispercpp/ggml-base.bin ${PROJECT_SOURCE_DIR}/data/assets/ ) else() add_custom_command( TARGET ${ProjectName} PRE_BUILD COMMAND python ${PROJECT_SOURCE_DIR}/../SDK/jplManipulation.py --preassemble --plugin=${ProjectName} - COMMAND ${CMAKE_COMMAND} -E copy ${ONNX_DIR}/onnxruntime.lib ${JPL_DIRECTORY}/lib/${CONTRIB_PLATFORM} - COMMAND ${CMAKE_COMMAND} -E copy ${ONNX_DIR}/onnxruntime.dll ${JPL_DIRECTORY}/lib/${CONTRIB_PLATFORM} - COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/modelSRC/mModelEncoder.onnx ${JPL_DIRECTORY}/data/assets - COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/modelSRC/mModelDecoder.onnx ${JPL_DIRECTORY}/data/assets - COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/modelSRC/mLogSoftMax.onnx ${JPL_DIRECTORY}/data/assets + COMMAND ${CMAKE_COMMAND} -E copy ${CONTRIB_PATH}/build/whispercpp/ggml-base.bin ${JPL_DIRECTORY}/data/assets/ COMMENT "Assembling Plugin files" ) - if(NVIDIA) - add_custom_command( - TARGET ${ProjectName} - PRE_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${ONNX_DIR}/onnxruntime_providers_shared.lib ${JPL_DIRECTORY}/lib/${CONTRIB_PLATFORM} - COMMAND ${CMAKE_COMMAND} -E copy ${ONNX_DIR}/onnxruntime_providers_shared.dll ${JPL_DIRECTORY}/lib/${CONTRIB_PLATFORM} - COMMAND ${CMAKE_COMMAND} -E copy ${ONNX_DIR}/onnxruntime_providers_cuda.lib ${JPL_DIRECTORY}/lib/${CONTRIB_PLATFORM} - COMMAND ${CMAKE_COMMAND} -E copy ${ONNX_DIR}/onnxruntime_providers_cuda.dll ${JPL_DIRECTORY}/lib/${CONTRIB_PLATFORM} - ) - endif() add_custom_command( TARGET ${ProjectName} POST_BUILD diff --git a/WhisperTranscript/ModelProcessor.cpp b/WhisperTranscript/ModelProcessor.cpp deleted file mode 100644 index 6c8660103e7a89787e25a8b31196c4bc73a38750..0000000000000000000000000000000000000000 --- a/WhisperTranscript/ModelProcessor.cpp +++ /dev/null @@ -1,441 +0,0 @@ -/** - * Copyright (C) 2022 Savoir-faire Linux Inc. - * - * Author: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 - * USA. 
- */ - -#include "ModelProcessor.h" - -#include <pluglog.h> -#include <common.h> -#include <limits.h> - -const char sep = separator(); - -const std::string TAG = "Transcript"; - -namespace jami { - -ModelProcessor::ModelProcessor(const std::string& path, bool acc) -{ - loadTokens(path + "/assets/tokenizer.bin", vocab_); - -#ifdef __ANDROID__ - initModels(path + "/assets/mModelEncoder.ort", path + "/assets/mModelDecoder.ort", path + "/assets/mLogSoftMax.ort", acc); -#else - initModels(path + "/assets/mModelEncoder.onnx", path + "/assets/mModelDecoder.onnx", path + "/assets/mLogSoftMax.onnx", acc); -#endif -} - -ModelProcessor::~ModelProcessor() -{ - endModels(); - Plog::log(Plog::LogPriority::INFO, TAG, "~ModelProcessor"); -} - -void -ModelProcessor::endModels() -{ - if (encoderSession_) { - delete encoderSession_; - encoderSession_ = nullptr; - } - if (decoderSession_) { - delete decoderSession_; - decoderSession_ = nullptr; - } - if (logSoftMaxSession_) { - delete logSoftMaxSession_; - logSoftMaxSession_ = nullptr; - } -#ifdef NVIDIA - if (cudaOptions_) { - ortApi.ReleaseCUDAProviderOptions(cudaOptions_); - cudaOptions_ = nullptr; - } -#endif - if (env_) { - env_.release(); - env_ = NULL; - } -} - -void -ModelProcessor::initModels(const std::string& encoderModelPath, const std::string& decoderModelPath, const std::string& logSoftMaxModelPath, bool activateAcc) -{ - try { - sessOpt_ = Ort::SessionOptions(); - - try { - if (activateAcc) { -#ifdef NVIDIA - Ort::ThrowOnError(ortApi.CreateCUDAProviderOptions(&cudaOptions_)); - - // std::vector<const char*> keys{"device_id"}; - // std::vector<const char*> values{"0"}; - - // Ort::ThrowOnError(ortApi.UpdateCUDAProviderOptions(cudaOptions_, keys.data(), values.data(), keys.size())); - Ort::ThrowOnError(ortApi.SessionOptionsAppendExecutionProvider_CUDA_V2(sessOpt_, cudaOptions_)); -#endif -#ifdef __ANDROID__ - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_Nnapi(sessOpt_, 0)); -#endif - } - } catch (std::exception& accelException) { - Plog::log(Plog::LogPriority::ERR, TAG, accelException.what()); - Plog::log(Plog::LogPriority::ERR, TAG, "Acceleration not available, loading models for CPU."); - } - - sessOpt_.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL); -#ifdef WIN32 - encoderSession_ = new Ort::Session(env_, string_utils::to_wstring(encoderModelPath).c_str(), sessOpt_); - decoderSession_ = new Ort::Session(env_, string_utils::to_wstring(decoderModelPath).c_str(), sessOpt_); - logSoftMaxSession_ = new Ort::Session(env_, string_utils::to_wstring(logSoftMaxModelPath).c_str(), sessOpt_); -#else - encoderSession_ = new Ort::Session(env_, encoderModelPath.c_str(), sessOpt_); - decoderSession_ = new Ort::Session(env_, decoderModelPath.c_str(), sessOpt_); - logSoftMaxSession_ = new Ort::Session(env_, logSoftMaxModelPath.c_str(), sessOpt_); -#endif - isAllocated_ = true; - Plog::log(Plog::LogPriority::INFO, TAG, "Model is allocated"); - } catch (std::exception& e) { - Plog::log(Plog::LogPriority::ERR, TAG, e.what()); - } -} - -/* from whisper.cpp */ -// the most basic sampling scheme - select the top token -whisperTokenData -ModelProcessor::whisper_sample_best(const float * probs) -{ - whisperTokenData result = { - 0, 0, 0.0f, 0.0f, 0.0f, -1, -1, 0.0f, - }; - - int n_logits = vocab_.id_to_token.size(); - - std::vector<std::pair<double, int64_t>> probs_id; - probs_id.reserve(n_logits); - - for (int i = 0; i < n_logits; i++) { - probs_id.emplace_back(std::make_pair(probs[i], i)); - } - - { - double sum_ts = 0.0; - double 
max_ts = -1.0; - double max_tx = -1.0; - - for (int i = 0; i < vocab_.token_beg; i++) { - max_tx = std::max(max_tx, probs_id[i].first); - } - - for (int i = vocab_.token_beg; i < n_logits; i++) { - sum_ts += probs_id[i].first; - if (probs_id[i].first > max_ts) { - max_ts = probs_id[i].first; - result.tid = probs_id[i].second; - } - } - - // if the probability sum of all timestamp tokens is higher than the max probability of the text tokens - sample a - // timestamp token - if (sum_ts > max_tx) { - // ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L430-L438 - for (int i = 0; i < vocab_.token_beg; i++) { - probs_id[i].first = -INT_MAX; - } - } - - result.pt = max_ts/(sum_ts + 1e-10); - result.ptsum = sum_ts; - } - - // find the top K tokens - const int top_k = 4; - - std::partial_sort( - probs_id.begin(), - probs_id.begin() + top_k, probs_id.end(), - [](const std::pair<double, int64_t> & a, const std::pair<double, int64_t> & b) { - return a.first > b.first; - }); - - probs_id.resize(top_k); - - int res = 0; - while ((probs_id[res].second == vocab_.token_sot || - probs_id[res].second == vocab_.token_solm || - probs_id[res].second == vocab_.token_beg) && - res < (int) probs_id.size() - 1) { - res++; - } - - result.id = probs_id[res].second; - result.p = probs_id[res].first; - - return result; -} - -void -ModelProcessor::filterLogits(std::vector<float>& logits, int offset) -{ - // Remove all no speech tokens - for (const auto idx : vocab_.noSpeechTokens) { - logits[idx] = (float)-INT_MAX; - } -} - -void -ModelProcessor::filterLanguageLogits(std::vector<float>& logits) -{ - // Leave only the language tokens - for (size_t i = 0; i < logits.size(); i++) { - if (vocab_.languageId2Tokens[i].empty()) - logits[i] = (float)(-INT_MAX); - } -} - -whisperTokenData -ModelProcessor::getToken(std::vector<float>& logits) -{ - Ort::RunOptions runOption; - std::vector<Ort::Value> logSoftMaxInputs; - logSoftMaxInputs.emplace_back(Ort::Value::CreateTensor<float>(allocatorInfo_, - logits.data(), - logits.size(), - logitsShape_.data(), - logitsShape_.size())); - - auto softmaxOutputs = logSoftMaxSession_->Run(runOption, - logSoftMaxInputNames.data(), - logSoftMaxInputs.data(), - logSoftMaxInputNames.size(), - logSoftMaxOutputNames.data(), - logSoftMaxOutputNames.size()); - - float* probs = softmaxOutputs[1].GetTensorMutableData<float>(); - return whisper_sample_best(probs); -} - -std::string -ModelProcessor::feedInput(std::vector<float>& melInput, const std::string& preferenceLanguage) -{ - std::lock_guard<std::mutex> l(mtx_); - if (!isAllocated_ || !logSoftMaxSession_ || !encoderSession_ || !decoderSession_) - return ""; - Ort::RunOptions runOption; - try { - Ort::Value melInputTensor = Ort::Value::CreateTensor<float>(allocatorInfo_, - melInput.data(), - melInput.size(), - melInputShape_.data(), - melInputShape_.size()); - audioFeaturesTensor_ = Ort::Value::CreateTensor<float>(allocatorInfo_, - audioFeatures_.data(), - audioFeatures_.size(), - audioFeaturesShape_.data(), - audioFeaturesShape_.size()); - // Run the encoder graph - encoderSession_->Run(runOption, - encoderInputNames, - &melInputTensor, - 1, - encoderOutputNames, - &audioFeaturesTensor_, - 1); - } catch(Ort::Exception e) { - Plog::log(Plog::LogPriority::ERR, TAG, e.what()); - return ""; - } catch (...) 
{ return ""; } - std::vector<float> currentTokensP {}; - - try { - auto isMultilingual = vocab_.is_multilingual(); - std::vector<int64_t> currentTokens {}; - currentTokens.emplace_back(vocab_.token_sot); - currentTokensP.emplace_back(1); - - std::array<int64_t, 1> offsetShape {1}; - - if (isMultilingual) { - if (preferenceLanguage == "auto" - || vocab_.languageTokens2Id.find(preferenceLanguage) == vocab_.languageTokens2Id.end()) { - std::vector<float> currentKVCache(MODELKVCACHESHAPE * 1 * currentTokens.size() * MODELFEATURESHAPE, 0.0f); - std::array<int64_t, 2> tokenShape {1, 1}; - int64_t offset = 0; - std::array<int64_t, 4> kvCacheShape { MODELKVCACHESHAPE, 1, 1, MODELFEATURESHAPE }; - - std::vector<int64_t> token = { currentTokens.back() }; - - // Run the decoder graph - std::vector<Ort::Value> inputsVector; // {audioFeaturesTensor_, tokensTensor_, kvCacheTensor_, offsetTensor_}; - inputsVector.emplace_back(Ort::Value::CreateTensor<float>(allocatorInfo_, - audioFeatures_.data(), - audioFeatures_.size(), - audioFeaturesShape_.data(), - audioFeaturesShape_.size())); - - inputsVector.emplace_back(Ort::Value::CreateTensor<int64_t>(allocatorInfo_, - token.data(), - token.size(), - tokenShape.data(), - tokenShape.size())); - - inputsVector.emplace_back(Ort::Value::CreateTensor<float>(allocatorInfo_, - currentKVCache.data(), - currentKVCache.size(), - kvCacheShape.data(), - kvCacheShape.size())); - - inputsVector.emplace_back(Ort::Value::CreateTensor<int64_t>(allocatorInfo_, - &offset, - 1, - offsetShape.data(), - 0)); - - auto outputs = decoderSession_->Run(runOption, - decoderInputNames.data(), - inputsVector.data(), - decoderInputNames.size(), - decoderOutputNames.data(), - decoderOutputNames.size()); - - auto logitsTensorInfo = outputs[0].GetTensorTypeAndShapeInfo(); - auto logitsData = outputs[0].GetTensorMutableData<float>(); - - { - std::vector<float>logits(logitsData, logitsData + logitsTensorInfo.GetElementCount()); - filterLanguageLogits(logits); - auto it = std::max_element(logits.begin(), logits.end()); - currentTokens.emplace_back(std::distance(logits.begin(), it)); - } - } else - currentTokens.emplace_back(vocab_.languageTokens2Id[preferenceLanguage]); - currentTokens.emplace_back(vocab_.token_transcribe); - currentTokensP.emplace_back(1); - currentTokensP.emplace_back(1); - } - - std::vector<float> currentKVCache(MODELKVCACHESHAPE * 1 * currentTokens.size() * MODELFEATURESHAPE, 0.0f); - std::array<int64_t, 2> tokenShape {1, (long)currentTokens.size()}; - - for (auto i = 0; i < sampleLen; i++) { - int64_t offset = isMultilingual ? ( i == 0 ? 
0 : i + 2 ) : i; - std::array<int64_t, 4> kvCacheShape { MODELKVCACHESHAPE, 1, static_cast<int64_t>(currentTokens.size()), MODELFEATURESHAPE }; - - std::vector<int64_t> token = { currentTokens.back() }; - if (i == 0) { - token = currentTokens; - tokenShape[1] = currentTokens.size(); - } else { - tokenShape[1] = 1; - } - - // Run the decoder graph - std::vector<Ort::Value> inputsVector; // {audioFeaturesTensor_, tokensTensor_, kvCacheTensor_, offsetTensor_}; - inputsVector.emplace_back(Ort::Value::CreateTensor<float>(allocatorInfo_, - audioFeatures_.data(), - audioFeatures_.size(), - audioFeaturesShape_.data(), - audioFeaturesShape_.size())); - - inputsVector.emplace_back(Ort::Value::CreateTensor<int64_t>(allocatorInfo_, - token.data(), - token.size(), - tokenShape.data(), - tokenShape.size())); - - inputsVector.emplace_back(Ort::Value::CreateTensor<float>(allocatorInfo_, - currentKVCache.data(), - currentKVCache.size(), - kvCacheShape.data(), - kvCacheShape.size())); - - inputsVector.emplace_back(Ort::Value::CreateTensor<int64_t>(allocatorInfo_, - &offset, - 1, - offsetShape.data(), - 0)); - - auto outputs = decoderSession_->Run(runOption, - decoderInputNames.data(), - inputsVector.data(), - decoderInputNames.size(), - decoderOutputNames.data(), - decoderOutputNames.size()); - - auto logitsTensorInfo = outputs[0].GetTensorTypeAndShapeInfo(); - auto logitsData = outputs[0].GetTensorMutableData<float>(); - - { - std::vector<float>logits(logitsData, logitsData + logitsTensorInfo.GetElementCount()); - if (isMultilingual && logits.size() > vocab_.n_vocab) { - std::vector<float>lastLogits; - lastLogits = std::vector<float>(logits.begin() + 2 * vocab_.n_vocab, logits.end()); - std::swap(lastLogits, logits); - } - - filterLogits(logits, offset); - - auto tokenData = getToken(logits); - currentTokens.emplace_back(tokenData.id); - currentTokensP.emplace_back(tokenData.p); - } - - // Grab kvCache for next iteration - auto kvCacheTensorInfo = outputs[1].GetTensorTypeAndShapeInfo(); - auto nextKVCacheData = outputs[1].GetTensorMutableData<float>(); - - std::vector<float> nextKVCache; - std::vector<float> zeros(MODELFEATURESHAPE, 0.0f); - int delta = (currentTokens.size() - 1) * MODELFEATURESHAPE; - for (int currentKVIdx = 0; currentKVIdx < MODELKVCACHESHAPE; currentKVIdx++) { - nextKVCache.insert(nextKVCache.end(), - nextKVCacheData + (currentKVIdx * delta), - nextKVCacheData + ((currentKVIdx + 1) * delta)); - nextKVCache.insert(nextKVCache.end(), zeros.begin(), zeros.end()); - } - std::swap(currentKVCache, nextKVCache); - - if (currentTokens.back() == vocab_.token_eot) - break; - } - - std::swap(currentTokens, tokensOutput_); - } catch(Ort::Exception e) { - Plog::log(Plog::LogPriority::ERR, TAG, e.what()); - return ""; - } catch (...) {} - - std::ostringstream oss; - std::ostringstream tokensStr; - auto idx = -1; - for (const auto& token : tokensOutput_) { - idx ++; - tokensStr << token << " " << currentTokensP[idx] << " "; - if (token >= vocab_.token_eot) - continue; - if (currentTokensP[idx] > -1.8) - oss << vocab_.id_to_token[token]; - } - - tokensOutput_.clear(); - return oss.str(); -} -} // namespace jami diff --git a/WhisperTranscript/ModelProcessor.h b/WhisperTranscript/ModelProcessor.h deleted file mode 100644 index e640c191ea3b011ed27057f5785d41e3bddc525b..0000000000000000000000000000000000000000 --- a/WhisperTranscript/ModelProcessor.h +++ /dev/null @@ -1,141 +0,0 @@ -/** - * Copyright (C) 2022 Savoir-faire Linux Inc. 
- * - * Author: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 - * USA. - */ - -#pragma once - -#include <map> -#include <vector> -#include <algorithm> -#include <set> -#include <mutex> - -#include <onnxruntime_cxx_api.h> -// #ifdef NVIDIA -// #include <cuda_provider_options.h> -// #endif -#ifdef __ANDROID__ -#include <nnapi_provider_factory.h> -#endif - -#include <functional> - -#include "Preprocess.h" - -namespace jami { - -// Use script getonnxio.py to grab model inputs and outputs -// names and shapes. -// Note: None is a open shape. If in the input, it will be defined by -// the data we want to use as input. As for open output, it is recommended -// to not try to pre allocate the tensor and use the model.run return. - -static const char* encoderInputNames[4] = {"mel"}; -static const char* encoderOutputNames[4] = {"617"}; - -#define MODELFEATURESHAPE 384 -#define MODELKVCACHESHAPE 8 - -#define MODELLOGITSHAPE 51865 // 51864 for english models - -static const std::vector<const char*> decoderInputNames = {"audio_features", "tokens", "kv_cache", "offset"}; -static const std::vector<const char*> decoderOutputNames = {"logits", "output_kv_cache"}; - -static const std::vector<const char *> logSoftMaxInputNames = {"logits"}; -static const std::vector<const char *> logSoftMaxOutputNames = {"token_ids", "probs"}; - -typedef struct whisperTokenData { - int64_t id; // token id - int64_t tid; // forced timestamp token id - - float p; // probability of the token - float pt; // probability of the timestamp token - float ptsum; // sum of probabilities of all timestamp tokens - - // token-level timestamp data - // do not use if you haven't computed token-level timestamps - int64_t t0; // start time of the token - int64_t t1; // end time of the token - - float vlen; // voice length of the token -} whisperTokenData; - -class ModelProcessor -{ -public: - ModelProcessor(const std::string& path, bool acc); - ~ModelProcessor(); - - void initModels(const std::string& encoderModelPath, const std::string& decoderModelPath, const std::string& logSoftMaxModelPath, bool activateAcc); - void endModels(); - - whisperTokenData whisper_sample_best(const float * probs); - - /** - * @brief feedInput - * Takes a input and feeds it to the model storage for predictions - * @param input - * @param preferenceLanguage - */ - std::string feedInput(std::vector<float>& input, const std::string& preferenceLanguage = "auto"); - - bool isAllocated() { return isAllocated_; } - -private: - // Tokens - whisperVocab vocab_; - - whisperTokenData getToken(std::vector<float>& logits); - void filterLogits(std::vector<float>& logits, int offset); - void filterLanguageLogits(std::vector<float>& logits); - - // onnx related - Ort::MemoryInfo allocatorInfo_ = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, 
OrtMemTypeDefault); - bool isAllocated_ {false}; - Ort::Env env_ {ORT_LOGGING_LEVEL_WARNING, "whisperTest"}; - Ort::Session* encoderSession_ {nullptr}; - Ort::Session* decoderSession_ {nullptr}; - Ort::Session* logSoftMaxSession_ {nullptr}; - Ort::SessionOptions sessOpt_; -#ifdef NVIDIA - const OrtApi& ortApi = Ort::GetApi(); - OrtCUDAProviderOptionsV2* cudaOptions_ = nullptr; -#endif - - // Encoder tensors. 1 input and 1 output - std::vector<int64_t> melInputShape_ {1, 80, 3000}; // Input Data Type: 1 (float), Input Shape: [1, 80, 3000] - Ort::Value audioFeaturesTensor_ {nullptr}; - std::vector<int64_t> audioFeaturesShape_ {1, 1500, MODELFEATURESHAPE}; // Output Data Type: 1 (float), Output Shape: [1, 1500, MODELFEATURESHAPE] - std::array<float, 1500 * MODELFEATURESHAPE> audioFeatures_ {}; - - std::vector<float> output_; - - // Decoder tensors. 4 inputs and 2 outputs - std::vector<int64_t> tokensOutput_ { }; - - // LogProb check - std::array<int64_t, 3> logitsShape_ {1, 1, MODELLOGITSHAPE}; - - int sampleLen = 100; - - std::mutex mtx_; - -}; -} // namespace jami diff --git a/WhisperTranscript/PluginPreferenceHandler.cpp b/WhisperTranscript/PluginPreferenceHandler.cpp index b1a63d0722fa70b933eadd9f405938e937d65167..e4d6ec06e215730ab85ce65dbe8597ffb604f9db 100644 --- a/WhisperTranscript/PluginPreferenceHandler.cpp +++ b/WhisperTranscript/PluginPreferenceHandler.cpp @@ -89,9 +89,7 @@ PluginPreferenceHandler::preferenceMapHasKey(const std::string& key) return (key == "background" || key == "position" || key == "fontsize" - || key == "language" - || key == "chunksize" - || key == "stepsize"); + || key == "language"); } std::string @@ -110,8 +108,7 @@ std::map<std::string, std::string> PluginPreferenceHandler::getPreferences(const std::string& accountId) { std::lock_guard<std::mutex> lk(mtx_); - auto preferences = preferences_.emplace(accountId, preferences_["default"]).first->second; - return preferences; + return preferences_.emplace(accountId, preferences_["default"]).first->second; } PluginPreferenceHandler::~PluginPreferenceHandler() diff --git a/WhisperTranscript/PluginPreferenceHandler.h b/WhisperTranscript/PluginPreferenceHandler.h index 1f2c774fc8edc53369ff1d60fb3f2b08fdf1723e..62d8b9482cbaff0afac2672d277f7a328af896f8 100644 --- a/WhisperTranscript/PluginPreferenceHandler.h +++ b/WhisperTranscript/PluginPreferenceHandler.h @@ -35,9 +35,7 @@ enum Parameter { POSITION, BACKGROUND, FONTSIZE, - LANGUAGE, - CHUNK, - STEP + LANGUAGE }; class TranscriptMediaHandler; diff --git a/WhisperTranscript/Preprocess.cpp b/WhisperTranscript/Preprocess.cpp deleted file mode 100644 index 70a29aaf6904e3476c0bc62aee70e9b10ae94238..0000000000000000000000000000000000000000 --- a/WhisperTranscript/Preprocess.cpp +++ /dev/null @@ -1,347 +0,0 @@ -/** - * Copyright (C) 2022 Savoir-faire Linux Inc. - * - * Author: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - */ - -#include "Preprocess.h" - -#ifdef WIN32 -#define _USE_MATH_DEFINES -#endif - -#include <thread> -#include <math.h> -#include <fstream> -#include <iostream> - -// ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L92-L124 -bool logMelSpectrogram( - const float *samples, - const int n_samples, - const int n_threads, - const whisperFilters &filters, - whisperMel &mel) { - - // const int sample_rate = WHISPER_SAMPLE_RATE; - const int fft_size = WHISPER_N_FFT; - const int fft_step = WHISPER_HOP_LENGTH; - const int n_mel = WHISPER_N_MEL; - - // Hanning window - std::vector<float> hann; - hann.resize(fft_size); - for (int i = 0; i < fft_size; i++) { - hann[i] = 0.5*(1.0 - cos((2.0*M_PI*i)/(fft_size))); - } - - mel.n_mel = n_mel; - mel.n_len = (n_samples)/fft_step; - mel.data.resize(mel.n_mel*mel.n_len); - - const int n_fft = 1 + fft_size/2; - - std::vector<std::thread> workers(n_threads); - for (int iw = 0; iw < n_threads; ++iw) { - workers[iw] = std::thread([&](int ith) { - std::vector<float> fft_in; - fft_in.resize(fft_size); - for (int i = 0; i < fft_size; i++) { - fft_in[i] = 0.0; - } - - std::vector<float> fft_out; - fft_out.resize(2*fft_size); - - for (int i = ith; i < mel.n_len; i += n_threads) { - const int offset = i*fft_step; - - // apply Hanning window - for (int j = 0; j < fft_size; j++) { - if (offset + j < n_samples) { - fft_in[j] = hann[j]*samples[offset + j]; - } else { - fft_in[j] = 0.0; - } - } - - // FFT -> mag^2 - fft(fft_in, fft_out); - - for (int j = 0; j < fft_size; j++) { - fft_out[j] = (fft_out[2*j + 0]*fft_out[2*j + 0] + fft_out[2*j + 1]*fft_out[2*j + 1]); - } - for (int j = 1; j < fft_size/2; j++) { - fft_out[j] += fft_out[fft_size - j]; - } - - // mel spectrogram - for (int j = 0; j < mel.n_mel; j++) { - double sum = 0.0; - - for (int k = 0; k < n_fft; k++) { - sum += fft_out[k]*filters.data[j*n_fft + k]; - } - if (sum < 1e-10) { - sum = 1e-10; - } - - sum = log10(sum); - - mel.data[j*mel.n_len + i] = sum; - } - } - }, iw); - } - - for (int iw = 0; iw < n_threads; ++iw) { - workers[iw].join(); - } - - // clamping and normalization - double mmax = -1e20; - for (int i = 0; i < mel.n_mel*mel.n_len; i++) { - if (mel.data[i] > mmax) { - mmax = mel.data[i]; - } - } - - mmax -= 8.0; - - for (int i = 0; i < mel.n_mel*mel.n_len; i++) { - if (mel.data[i] < mmax) { - mel.data[i] = mmax; - } - - mel.data[i] = (mel.data[i] + 4.0)/4.0; - } - - return true; -} - -// Cooley-Tukey FFT -// poor man's implementation - use something better -// input is real-valued -// output is complex-valued -void fft(const std::vector<float> & in, std::vector<float> & out) { - out.resize(in.size()*2); - - int N = in.size(); - - if (N == 1) { - out[0] = in[0]; - out[1] = 0; - return; - } - - if (N%2 == 1) { - dft(in, out); - return; - } - - std::vector<float> even; - std::vector<float> odd; - - for (int i = 0; i < N; i++) { - if (i % 2 == 0) { - even.emplace_back(in[i]); - } else { - odd.emplace_back(in[i]); - } - } - - std::vector<float> even_fft; - std::vector<float> odd_fft; - - fft(even, even_fft); - fft(odd, odd_fft); - - for (int k = 0; k < N/2; k++) { - float theta = 2*M_PI*k/N; - - float re = cos(theta); - float im = -sin(theta); - - float re_odd = odd_fft[2*k + 0]; - float im_odd = odd_fft[2*k + 1]; - - out[2*k + 0] = even_fft[2*k + 0] + re*re_odd - 
im*im_odd; - out[2*k + 1] = even_fft[2*k + 1] + re*im_odd + im*re_odd; - - out[2*(k + N/2) + 0] = even_fft[2*k + 0] - re*re_odd + im*im_odd; - out[2*(k + N/2) + 1] = even_fft[2*k + 1] - re*im_odd - im*re_odd; - } -} - -// naive Discrete Fourier Transform -// input is real-valued -// output is complex-valued -void dft(const std::vector<float> & in, std::vector<float> & out) { - int N = in.size(); - - out.resize(N*2); - - for (int k = 0; k < N; k++) { - float re = 0; - float im = 0; - - for (int n = 0; n < N; n++) { - float angle = 2*M_PI*k*n/N; - re += in[n]*cos(angle); - im -= in[n]*sin(angle); - } - - out[k*2 + 0] = re; - out[k*2 + 1] = im; - } -} - - -void loadMelFilters(const std::string& fileName, whisperFilters& filters) { - auto fin = std::ifstream(fileName, std::ios::binary); - if (!fin) { - fprintf(stderr, "%s: failed to open '%s'\n", __func__, fileName.c_str()); - return; - } - - fin.read((char *) &filters.n_mel, sizeof(filters.n_mel)); - fin.read((char *) &filters.n_fft, sizeof(filters.n_fft)); - - filters.data.resize(filters.n_mel * filters.n_fft); - fin.read((char *) filters.data.data(), filters.data.size() * sizeof(float)); -} - -void loadTokens(const std::string& fileName, whisperVocab& vocab) { - auto fin = std::ifstream(fileName, std::ios::binary); - if (!fin) { - fprintf(stderr, "%s: failed to open '%s'\n", __func__, fileName.c_str()); - return; - } - - int32_t modelNVocab = 0; - fin.read((char *) &modelNVocab, sizeof(modelNVocab)); - - int32_t tokensNVocab = 0; - fin.read((char *) &tokensNVocab, sizeof(tokensNVocab)); - - std::string word; - for (int i = 0; i < tokensNVocab; i++) { - uint32_t len; - fin.read((char *) &len, sizeof(len)); - - word.resize(len); - fin.read((char *) word.data(), len); - - vocab.token_to_id[word] = i; - vocab.id_to_token[i] = word; - } - - vocab.n_vocab = modelNVocab; - if (vocab.is_multilingual()) { - vocab.token_eot++; - vocab.token_sot++; - vocab.token_prev++; - vocab.token_solm++; - vocab.token_not++; - vocab.token_beg++; - } - - if (tokensNVocab < modelNVocab) { - // Read language tokens - { - int32_t languageTokensLen = 0; - fin.read((char *) &languageTokensLen, sizeof(languageTokensLen)); - - std::string word; - for (int i = 0; i < languageTokensLen; i++) { - int32_t id = 0; - fin.read((char *) &id, sizeof(id)); - uint32_t len; - fin.read((char *) &len, sizeof(len)); - - word.resize(len); - fin.read((char *) word.data(), len); - - vocab.token_to_id[word] = id; - vocab.id_to_token[id] = word; - vocab.languageId2Tokens.insert({id, word}); - vocab.languageTokens2Id.insert({word, id}); - } - } - - fprintf(stderr, "%s: adding %d extra tokens\n", __func__, modelNVocab - tokensNVocab); - for (int i = tokensNVocab; i < modelNVocab; i++) { - if (!vocab.id_to_token[i].empty()) - continue; - if (i > vocab.token_beg) { - word = "[_TT_" + std::to_string(i - vocab.token_beg) + "]"; - } else if (i == vocab.token_eot) { - word = "[_EOT_]"; - } else if (i == vocab.token_sot) { - word = "[_SOT_]"; - } else if (i == vocab.token_prev) { - word = "[_PREV_]"; - } else if (i == vocab.token_not) { - word = "[_NOT_]"; - } else if (i == vocab.token_beg) { - word = "[_BEG_]"; - } else { - word = "[_extra_token_" + std::to_string(i) + "]"; - } - vocab.token_to_id[word] = i; - vocab.id_to_token[i] = word; - } - } - - // Read no speech tokens - { - int32_t noSpeechTokensLen = 0; - fin.read((char *) &noSpeechTokensLen, sizeof(noSpeechTokensLen)); - - for (int i = 0; i < noSpeechTokensLen; i++) { - uint32_t id; - fin.read((char *) &id, sizeof(id)); - - 
vocab.noSpeechTokens.insert(id); - } - } -} - -void -inputPadTrim(whisperMel &mel) -{ - if (mel.n_len == ENCODER_INPUT_LEN) - return; - std::vector<float> data; - std::vector<float> partialData; - int seek = 0; - auto dataLimit = std::min(mel.n_len, ENCODER_INPUT_LEN); - for (auto j = 0; j < mel.n_mel; j++) { - seek = j * mel.n_len; - for (auto i = seek; i < (j + 1) * dataLimit; i++) { - partialData.emplace_back(mel.data[i]); - } - if (mel.n_len < ENCODER_INPUT_LEN) { - for (auto i = mel.n_len; i < ENCODER_INPUT_LEN; i++) { - partialData.emplace_back(0.0f); - } - } - data.insert(data.end(), partialData.begin(), partialData.end()); - partialData.clear(); - } - std::swap(mel.data, data); -} diff --git a/WhisperTranscript/Preprocess.h b/WhisperTranscript/Preprocess.h deleted file mode 100644 index 5138321a12210605d14d971e3a26d29fa44f2adb..0000000000000000000000000000000000000000 --- a/WhisperTranscript/Preprocess.h +++ /dev/null @@ -1,94 +0,0 @@ -/** - * Copyright (C) 2022 Savoir-faire Linux Inc. - * - * Author: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
- */ - -#pragma once - -#include <vector> -#include <cstdint> -#include <string> -#include <map> -#include <set> - - -// Those are model defined -// Check paper page 3 (https://cdn.openai.com/papers/whisper.pdf) -#define WHISPER_SAMPLE_RATE 16000 -#define WHISPER_N_FFT 400 -#define WHISPER_N_MEL 80 -#define WHISPER_HOP_LENGTH 160 -#define WHISPER_CHUNK_SIZE 30 -#define ENCODER_INPUT_LEN 3000 - -struct whisperMel { - int n_len; - int n_mel; - - std::vector<float> data; -}; - -struct whisperFilters { - int32_t n_mel; - int32_t n_fft; - - std::vector<float> data; -}; - -struct whisperVocab { - size_t n_vocab = 51864; - - std::map<std::string, int32_t> token_to_id; - std::map<int32_t, std::string> id_to_token; - - int32_t token_eot = 50256; - int32_t token_sot = 50257; - int32_t token_prev = 50360; - int32_t token_solm = 50361; // no speech - int32_t token_not = 50362; // no timestamps - int32_t token_beg = 50363; // timestamp begin - - // available tasks - const int32_t token_translate = 50358; - const int32_t token_transcribe = 50359; - - bool is_multilingual() const { - return n_vocab == 51865; - } - - std::map<std::string, int32_t> languageTokens2Id; - std::map<int32_t, std::string> languageId2Tokens; - std::set<int32_t> noSpeechTokens; -}; - -bool logMelSpectrogram( - const float * samples, - const int n_samples, - const int n_threads, - const whisperFilters & filters, - whisperMel &mel); - -void fft(const std::vector<float> & in, std::vector<float> & out); - -void dft(const std::vector<float> & in, std::vector<float> & out); - -void loadMelFilters(const std::string& fileName, whisperFilters& filters); - -void loadTokens(const std::string& fileName, whisperVocab& vocab); - -void inputPadTrim(whisperMel &mel); diff --git a/WhisperTranscript/TranscriptAudioSubscriber.cpp b/WhisperTranscript/TranscriptAudioSubscriber.cpp index cd866b4a69870c8366b1ce112b3c917c76a79e4d..11a5adf54b18139ca57317168a7656f05e96ad11 100644 --- a/WhisperTranscript/TranscriptAudioSubscriber.cpp +++ b/WhisperTranscript/TranscriptAudioSubscriber.cpp @@ -24,27 +24,27 @@ #include <frameUtils.h> #include <bitset> #include <iostream> +#include <fmt/core.h> +#include <fmt/format.h> -const std::string TAG = "Transcript"; +#include "stt_whisper.h" + +const std::string TAG = "TranscriptAudio"; const char sep = separator(); namespace jami { -TranscriptAudioSubscriber::TranscriptAudioSubscriber(const std::string& dataPath, TranscriptVideoSubscriber* videoSubscriber, bool acc) +TranscriptAudioSubscriber::TranscriptAudioSubscriber(const std::string& dataPath, + TranscriptVideoSubscriber* videoSubscriber) : path_ {dataPath} - , modelProcessor_ {dataPath, acc} , mVS_ {videoSubscriber} { - loadMelFilters(path_ + "/assets/mel_filters.bin", modelFilters_); + Plog::log(Plog::LogPriority::INFO, TAG, fmt::format("TranscriptAudioSubscriber {}", fmt::ptr(this))); } TranscriptAudioSubscriber::~TranscriptAudioSubscriber() { - modelProcessor_.endModels(); - formatFilter_.clean(); - stop(); - processFrameThread.join(); - Plog::log(Plog::LogPriority::INFO, TAG, "~TranscriptMediaProcessor"); + Plog::log(Plog::LogPriority::INFO, TAG, fmt::format("~TranscriptAudioSubscriber {}", fmt::ptr(this))); } /** @@ -53,83 +53,84 @@ TranscriptAudioSubscriber::~TranscriptAudioSubscriber() void TranscriptAudioSubscriber::processFrame() { + if (!whisper_) { + whisper_ = std::make_unique<RealtimeSttWhisper>(path_ + "/assets/ggml-base.bin"); + whisper_->setLanguage(language_); + } + while (running) { - auto data = modelInput_[modelIdx_]; - if (data.size() <= 
WHISPER_STREAM_SAMPLES_CHUNK - WHISPER_STREAM_SAMPLES_CHUNK_STEP) { - std::this_thread::sleep_for(std::chrono::milliseconds(waitingPoint_)); - continue; - } - if (!running) - break; - melSpectrogram_.data.clear(); - melSpectrogram_.n_len = 0; - melSpectrogram_.n_mel = 0; - logMelSpectrogram(data.data(), - data.size(), - 8, - modelFilters_, - melSpectrogram_); - inputPadTrim(melSpectrogram_); - - auto text = modelProcessor_.feedInput(melSpectrogram_.data, language_); - if (text.empty()) { + decltype(frames_) frames; + { std::unique_lock<std::mutex> l(inputLock); - modelInput_[0].clear(); - modelInput_[1].clear(); - modelIdx_ = 0; + cv_.wait(l, [&]{ + return !running || !frames_.empty(); + }); + if (!running) + return; + frames = std::move(frames_); + } + + for (auto& f : frames) { + uniqueFramePtr filteredFrame = getUniqueFrame(); + filteredFrame->sample_rate = WHISPER_SAMPLE_RATE; + filteredFrame->format = AV_SAMPLE_FMT_FLT; + av_channel_layout_from_mask(&filteredFrame->ch_layout , AV_CH_LAYOUT_MONO); + try { + if (resampler_.resample(f.get(), filteredFrame.get()) == 0) { + whisper_->AddAudioData((float*) filteredFrame->buf[0]->data, + filteredFrame->nb_samples); + } + } catch (...) { + } + } + + auto result = whisper_->GetTranscribed(); + if (not result.empty()) { + std::string txt; + for (const auto& t : result) { + if (not t.is_partial) + txt += t.text; + } + if (!txt.empty()) + mVS_->setText(txt); } - mVS_->setText(text); } + whisper_.reset(); } void TranscriptAudioSubscriber::stop() { - running = false; + Plog::log(Plog::LogPriority::INFO, TAG, "stop()"); + { + std::unique_lock<std::mutex> l(inputLock); + running = false; + cv_.notify_all(); + } if (processFrameThread.joinable()) { processFrameThread.join(); } - std::string str = ""; - mVS_->setText(str); + mVS_->setText(""); } void TranscriptAudioSubscriber::start() { + Plog::log(Plog::LogPriority::INFO, TAG, "start()"); running = true; - processFrameThread = std::thread([this] { processFrame(); }); + processFrameThread = std::thread([this](){ processFrame(); }); + mVS_->setText(""); } void -TranscriptAudioSubscriber::setParameter(std::string& parameter, Parameter type) +TranscriptAudioSubscriber::setParameter(const std::string& parameter, Parameter type) { std::unique_lock<std::mutex> l(inputLock); - std::string str = ""; switch (type) { case (Parameter::LANGUAGE): language_ = parameter; - modelInput_[0].clear(); - modelInput_[1].clear(); - modelIdx_ = 0; - mVS_->setText(str); - break; - case (Parameter::CHUNK): - WHISPER_STREAM_SAMPLES_CHUNK = 16000 * std::stoi(parameter); - modelInput_[0].resize(0); - modelInput_[1].resize(0); - modelInput_[0].reserve(WHISPER_STREAM_SAMPLES_CHUNK); - modelInput_[1].reserve(WHISPER_STREAM_SAMPLES_CHUNK); - waitingPoint_ = (std::stoi(parameter) * 1000 - (WHISPER_STREAM_SAMPLES_CHUNK_STEP / 16)) / 3; - modelIdx_ = 0; - mVS_->setText(str); - break; - case (Parameter::STEP): - modelInput_[0].clear(); - modelInput_[1].clear(); - WHISPER_STREAM_SAMPLES_CHUNK_STEP = 16000 * std::stoi(parameter); - waitingPoint_ = ((WHISPER_STREAM_SAMPLES_CHUNK / 16) - std::stoi(parameter) * 1000) / 3; - modelIdx_ = 0; - mVS_->setText(str); + if (whisper_) + whisper_->setLanguage(parameter); break; default: return; @@ -140,77 +141,30 @@ void TranscriptAudioSubscriber::update(jami::Observable<AVFrame*>* obs, AVFrame* const& pluginFrame) { std::unique_lock<std::mutex> l(inputLock); - if (!pluginFrame || modelFilters_.data.empty() || obs != observable_) + if (!pluginFrame || obs != observable_) return; - if (firstRun) { - 
samplesCount_ = 0; - currentModelInput_.clear(); - futureModelInput_.clear(); - formatFilter_.clean(); - AudioFormat afmt = AudioFormat(pluginFrame->sample_rate, - pluginFrame->channels, - static_cast<AVSampleFormat>(pluginFrame->format)); - MediaStream ms = MediaStream("input", afmt); - formatFilter_.initialize(filterDescription_, {ms}); - firstRun = false; - } - - if (!formatFilter_.initialized_) - return; - - if (formatFilter_.feedInput(pluginFrame, "input") == 0) { - uniqueFramePtr filteredFrame = {formatFilter_.readOutput(), frameFree}; - if (filteredFrame) { - for (size_t i = 0; i < filteredFrame->buf[0]->size; i += 2) { -#ifdef __DEBUG__ - std::lock_guard<std::mutex> l(inputLock); -#endif - int16_t rawValue = (filteredFrame->buf[0]->data[i+1] << 8) | filteredFrame->buf[0]->data[i]; - - // If not a positive value, perform the 2's complement math on the value - if ((rawValue & 0x8000) != 0) { - rawValue = (~(rawValue - 0x0001)) * -1; - } - if (futureModelInput_.size() == WHISPER_STREAM_SAMPLES_CHUNK) - futureModelInput_.erase(futureModelInput_.begin()); - futureModelInput_.emplace_back(float(rawValue)/32768.0f); - samplesCount_++; - - auto value = float(rawValue) / 32768.0f; - if (modelInput_[modelIdx_].size() >= WHISPER_STREAM_SAMPLES_CHUNK) { - modelInput_[modelIdx_].clear(); - modelIdx_ = modelIdx_ ? 0 : 1; - } - modelInput_[modelIdx_].emplace_back(value); - if (modelInput_[modelIdx_].size() - >= WHISPER_STREAM_SAMPLES_CHUNK - WHISPER_STREAM_SAMPLES_CHUNK_STEP) { - modelInput_[modelIdx_ ? 0 : 1].emplace_back(value); - } - } - } - } + frames_.emplace_back(uniqueFramePtr(av_frame_clone(pluginFrame), frameFree)); + cv_.notify_all(); // audio returns as is } void TranscriptAudioSubscriber::attached(jami::Observable<AVFrame*>* observable) { - Plog::log(Plog::LogPriority::INFO, TAG, "::Attached ! "); + std::unique_lock<std::mutex> l(inputLock); + Plog::log(Plog::LogPriority::INFO, TAG, fmt::format("::Attached ! {} for {}", fmt::ptr(this), fmt::ptr(observable))); observable_ = observable; start(); } void -TranscriptAudioSubscriber::detached(jami::Observable<AVFrame*>*) +TranscriptAudioSubscriber::detached(jami::Observable<AVFrame*>* observable) { firstRun = true; observable_ = nullptr; stop(); - modelInput_[0].clear(); - modelInput_[1].clear(); - modelIdx_ = 0; - Plog::log(Plog::LogPriority::INFO, TAG, "::Detached()"); + Plog::log(Plog::LogPriority::INFO, TAG, fmt::format("::Detached ! 
{} for {}", fmt::ptr(this), fmt::ptr(observable))); } void @@ -218,7 +172,6 @@ TranscriptAudioSubscriber::detach() { if (observable_) { firstRun = true; - std::ostringstream oss; Plog::log(Plog::LogPriority::INFO, TAG, "::Calling detach()"); observable_->detach(this); } diff --git a/WhisperTranscript/TranscriptAudioSubscriber.h b/WhisperTranscript/TranscriptAudioSubscriber.h index d32cb0059bb2ec249a69ddb9f68fe101f12a9190..3e970e646fee37b4ca5cf1bc169b4d7551303f4b 100644 --- a/WhisperTranscript/TranscriptAudioSubscriber.h +++ b/WhisperTranscript/TranscriptAudioSubscriber.h @@ -26,21 +26,25 @@ extern "C" { #include <observer.h> #include <frameFilter.h> - -#include "Preprocess.h" -#include "ModelProcessor.h" +#include <frameUtils.h> #include "TranscriptVideoSubscriber.h" #include "PluginPreferenceHandler.h" +#include "resampler.h" #include <thread> #include <condition_variable> +#include <deque> +#include <atomic> + +class RealtimeSttWhisper; namespace jami { class TranscriptAudioSubscriber : public Observer<AVFrame*> { public: - TranscriptAudioSubscriber(const std::string& dataPath, TranscriptVideoSubscriber* videoSubscriber, bool acc = false); + TranscriptAudioSubscriber(const std::string& dataPath, + TranscriptVideoSubscriber* videoSubscriber); ~TranscriptAudioSubscriber(); virtual void update(Observable<AVFrame*>*, AVFrame* const&) override; @@ -49,22 +53,11 @@ public: void detach(); - void setParameter(std::string& parameter, Parameter type); + void setParameter(const std::string& parameter, Parameter type); private: - // Mel spectrogram filters - whisperFilters modelFilters_; - whisperMel melSpectrogram_; - // Observer pattern - Observable<AVFrame*>* observable_{}; - - // Filter for audio formatting - const std::string filterDescription_ = "[input]aresample=16000,aformat=sample_fmts=s16:channel_layouts=mono"; - FrameFilter formatFilter_; - std::array<std::vector<float>, 2> modelInput_ {}; - int modelIdx_ {0}; - int waitingPoint_ {1000}; + Observable<AVFrame*>* observable_ {}; std::string language_ {"auto"}; // Data @@ -72,12 +65,15 @@ private: // Status variables of the processing bool firstRun {true}; - bool running {false}; + std::atomic_bool running {false}; std::mutex inputLock; + std::condition_variable cv_; // Model - ModelProcessor modelProcessor_; + std::unique_ptr<RealtimeSttWhisper> whisper_; + Resampler resampler_; + std::vector<uniqueFramePtr> frames_; // Threading std::thread processFrameThread; @@ -87,9 +83,5 @@ private: // Video processor TranscriptVideoSubscriber* mVS_ {}; - - - size_t WHISPER_STREAM_SAMPLES_CHUNK = 16000 * 15; // 16 KHz * 15 seconds - size_t WHISPER_STREAM_SAMPLES_CHUNK_STEP = 16000 * 3; // 16 KHz * 3 seconds }; } // namespace jami diff --git a/WhisperTranscript/TranscriptMediaHandler.cpp b/WhisperTranscript/TranscriptMediaHandler.cpp index e33134da85e813aa3894f617791d4ccc2330f710..7457bb5ea149a275861270f8b8529593d8f3d3f0 100644 --- a/WhisperTranscript/TranscriptMediaHandler.cpp +++ b/WhisperTranscript/TranscriptMediaHandler.cpp @@ -36,13 +36,11 @@ TranscriptMediaHandler::TranscriptMediaHandler(std::string&& datapath, PluginPre aph_ = prefHandler; setId(datapath_); auto preferences = aph_->getPreferences("default"); - auto it = preferences.find("acceleration"); - auto useAcceleration = it == preferences.end() ? 
false : it->second == "1"; videoSubscriber_ = std::make_shared<TranscriptVideoSubscriber>(datapath_); - audioSubscriber_ = std::make_shared<TranscriptAudioSubscriber>(datapath_, videoSubscriber_.get(), useAcceleration); + audioSubscriber_ = std::make_shared<TranscriptAudioSubscriber>(datapath_, videoSubscriber_.get()); setParameters("default"); #ifdef __DEBUG__ - it = preferences.find("subtitle"); + auto it = preferences.find("subtitle"); if (it != preferences.end()) videoSubscriber_->setText(it->second); #endif @@ -102,8 +100,6 @@ TranscriptMediaHandler::setParameters(const std::string& accountId) videoSubscriber_->setParameter(preferences["background"], Parameter::BACKGROUND); videoSubscriber_->setParameter(preferences["position"], Parameter::POSITION); audioSubscriber_->setParameter(preferences["language"], Parameter::LANGUAGE); - audioSubscriber_->setParameter(preferences["chunksize"], Parameter::CHUNK); - audioSubscriber_->setParameter(preferences["stepsize"], Parameter::STEP); } catch (std::exception& e) { Plog::log(Plog::LogPriority::ERR, TAG, e.what()); } @@ -129,9 +125,7 @@ TranscriptMediaHandler::detach() TranscriptMediaHandler::~TranscriptMediaHandler() { - std::ostringstream oss; - oss << " ~TranscriptMediaHandler from WhisperTranscript Plugin" << std::endl; - Plog::log(Plog::LogPriority::INFO, TAG, oss.str()); + Plog::log(Plog::LogPriority::INFO, TAG, "~TranscriptMediaHandler from WhisperTranscript Plugin"); detach(); } } // namespace jami diff --git a/WhisperTranscript/TranscriptVideoSubscriber.cpp b/WhisperTranscript/TranscriptVideoSubscriber.cpp index 353519071a2e223986dabcc0a7d90a095d0137d9..57635d802e637382b54aff0057c9bf59a471d6db 100644 --- a/WhisperTranscript/TranscriptVideoSubscriber.cpp +++ b/WhisperTranscript/TranscriptVideoSubscriber.cpp @@ -35,8 +35,11 @@ extern "C" { #include <fmt/format.h> #include <bitset> +#include <string_view> -const std::string TAG = "Transcript"; +using namespace std::literals; + +const std::string TAG = "TranscriptVideo"; const char sep = separator(); namespace jami { @@ -54,9 +57,10 @@ TranscriptVideoSubscriber::~TranscriptVideoSubscriber() } void -TranscriptVideoSubscriber::setText(std::string& text) +TranscriptVideoSubscriber::setText(const std::string& t) { - text = string_utils::ffmpegScapeString(text); + Plog::log(Plog::LogPriority::INFO, TAG, "setText " + t); + auto text = string_utils::ffmpegScapeString(t); std::vector<std::string> textWords = string_utils::getWords(text, " "); subtitle_ = ""; @@ -101,9 +105,28 @@ TranscriptVideoSubscriber::setParameter(std::string& parameter, Parameter type) firstRun = true; } +std::string_view getTransposeDescr(int rotation) +{ + switch (rotation) { + case 90: + case -270: + return "transpose=2,"sv; + case 180: + case -180: + return "transpose=1, transpose=1,"sv; + case 270: + case -90: + return "transpose=1,"sv; + default: + return {}; + } + return {}; +} + void TranscriptVideoSubscriber::setFilterDescription() { + Plog::log(Plog::LogPriority::INFO, TAG, "setFilterDescription() " + subtitle_); if (pluginFrameSize_.first == 0 || pluginFrameSize_.second == 0) return; @@ -119,35 +142,26 @@ TranscriptVideoSubscriber::setFilterDescription() point_ = {pluginFrameSize_.first - margin, pluginFrameSize_.second - margin}; } - std::string rotateSides = ""; - - if (std::abs(angle_) == 90) - rotateSides = ":out_w=ih:out_h=iw"; - auto baseInfosDescription - = fmt::format("[input]rotate={}{}" - ",drawtext=fontcolor={}:fontsize={}:fontfile=\\'{}\\':expansion=none:text='{}" + = fmt::format("[input]{}" + 
"drawtext=fontcolor={}:fontsize={}:fontfile=\\'{}\\':expansion=none:text='{}" "':line_spacing=5:box=1:boxcolor={}:boxborderw=5:x=", - rotation[angle_], rotateSides, + getTransposeDescr(angle_), fontColor_, fontSize_, fontFile_, subtitle_, fontBackground_); - auto position = "{}-text_w:y={}"; + auto position = "{}-text_w:y={}"sv; if (position_ == "2") - position = "{}:y={}"; + position = "{}:y={}"sv; else if (position_ == "3") - position = "{}:y={}-text_h"; + position = "{}:y={}-text_h"sv; else if (position_ == "4") - position = "{}-text_w:y={}-text_h"; - baseInfosDescription = baseInfosDescription + position + ",rotate={}{},format=yuv420p"; - filterDescription_ = fmt::format(baseInfosDescription, - std::to_string(point_.first), - std::to_string(point_.second), - rotation[-angle_], - rotateSides); + position = "{}-text_w:y={}-text_h"sv; + filterDescription_ = baseInfosDescription + fmt::format(std::string(position) + ",{}format=yuv420p"s, + point_.first, + point_.second, + getTransposeDescr(-angle_)); -#ifdef __DEBUG__ Plog::log(Plog::LogPriority::INFO, TAG, filterDescription_); -#endif } void @@ -156,9 +170,8 @@ TranscriptVideoSubscriber::update(jami::Observable<AVFrame*>*, AVFrame* const& p if (!observable_ || !pluginFrame || subtitle_.empty()) return; - AVFrameSideData* side_data = av_frame_get_side_data(pluginFrame, AV_FRAME_DATA_DISPLAYMATRIX); int newAngle {0}; - if (side_data) { + if (AVFrameSideData* side_data = av_frame_get_side_data(pluginFrame, AV_FRAME_DATA_DISPLAYMATRIX)) { auto matrix_rotation = reinterpret_cast<int32_t*>(side_data->data); newAngle = static_cast<int>(av_display_rotation_get(matrix_rotation)); } @@ -170,12 +183,17 @@ TranscriptVideoSubscriber::update(jami::Observable<AVFrame*>*, AVFrame* const& p //====================================================================================== // GET RAW FRAME uniqueFramePtr rgbFrame = {transferToMainMemory(pluginFrame, AV_PIX_FMT_NV12), frameFree}; - rgbFrame.reset(FrameScaler::convertFormat(rgbFrame.get(), AV_PIX_FMT_YUV420P)); + if (!rgbFrame.get()) + return; + if ((AVPixelFormat)rgbFrame->format != AV_PIX_FMT_YUV420P) + rgbFrame.reset(FrameScaler::convertFormat(rgbFrame.get(), AV_PIX_FMT_YUV420P)); if (!rgbFrame.get()) return; if (sourceTimeBase_.num != pluginFrame->time_base.num || sourceTimeBase_.den != pluginFrame->time_base.den) firstRun = true; + if (rgbFrame->width != pluginFrameSize_.first || rgbFrame->height != pluginFrameSize_.second) + firstRun = true; rgbFrame->pts = pluginFrame->pts; rgbFrame->time_base = pluginFrame->time_base; @@ -184,8 +202,6 @@ TranscriptVideoSubscriber::update(jami::Observable<AVFrame*>*, AVFrame* const& p if (firstRun) { filter_.clean(); pluginFrameSize_ = {rgbFrame->width, rgbFrame->height}; - if (std::abs(angle_) == 90) - pluginFrameSize_ = {rgbFrame->height, rgbFrame->width}; setFilterDescription(); rational<int> fr(sourceTimeBase_.den, sourceTimeBase_.num); diff --git a/WhisperTranscript/TranscriptVideoSubscriber.h b/WhisperTranscript/TranscriptVideoSubscriber.h index b809f99748f20f165af6237ab6d2919e995d0375..499568e8291fe3787b40a85a221e648f0bcad384 100644 --- a/WhisperTranscript/TranscriptVideoSubscriber.h +++ b/WhisperTranscript/TranscriptVideoSubscriber.h @@ -43,7 +43,7 @@ public: void detach(); - void setText(std::string& text); + void setText(const std::string& text); void setFilterDescription(); void setParameter(std::string& parameter, Parameter type); diff --git a/WhisperTranscript/build.sh b/WhisperTranscript/build.sh index 
32f1155b12f49668e4efe78a077238c34ad1cf77..2ca33b2380337ab565736aceaa8090f75015fa5d 100755 --- a/WhisperTranscript/build.sh +++ b/WhisperTranscript/build.sh @@ -12,18 +12,21 @@ EXTRAPATH='' # -d: debug program. if [ -z "${DAEMON}" ]; then - DAEMON="./../../daemon" - echo "DAEMON not provided, building with ${DAEMON}" + echo "DAEMON not provided, building with ./../../daemon" fi +DAEMON=${DAEMON:="./../../daemon"} +CONTRIB_PATH=${CONTRIB_PATH:="${DAEMON}/contrib"} +CONTRIB_BUILD_DIR=${CONTRIB_BUILD_DIR:="native"} + PLUGIN_NAME="WhisperTranscript" JPL_FILE_NAME="${PLUGIN_NAME}.jpl" SO_FILE_NAME="lib${PLUGIN_NAME}.so" DAEMON_SRC="${DAEMON}/src" -CONTRIB_PATH="${DAEMON}/contrib" PLUGINS_LIB="../lib" LIBS_DIR="./../contrib/Libs" PLATFORM=$(uname) +CONTRIB_BUILD_PATH="${CONTRIB_PATH}/${CONTRIB_BUILD_DIR}" if [ "${PLATFORM}" = "Linux" ]; then PLATFORM="linux-gnu" @@ -41,19 +44,19 @@ fi while getopts t:c:p:d OPT; do case "$OPT" in d) - DEBUG=true - export __DEBUG__=true + DEBUG=true + export __DEBUG__=true ;; t) - PLATFORM="${OPTARG}" + PLATFORM="${OPTARG}" ;; c) - PROCESSOR="${OPTARG}" + PROCESSOR="${OPTARG}" ;; p) ;; \?) - exit 1 + exit 1 ;; esac done @@ -74,39 +77,57 @@ fi echo $PROCESSOR cp -r ffmpeg ${CONTRIB_PATH}/src/ +cp -r whispercpp ${CONTRIB_PATH}/src/ cp -r ../contrib/rav1e ${CONTRIB_PATH}/src/ +if [ ! -f "./data/assets/ggml-base.bin" ]; then + if [ -x "$(command -v wget)" ]; then + wget --quiet --show-progress -O ./data/assets/ggml-base.bin https://ggml.ggerganov.com/ggml-model-whisper-base.bin + elif [ -x "$(command -v curl)" ]; then + curl --output ./data/assets/ggml-base.bin https://ggml.ggerganov.com/ggml-model-whisper-base.bin + else + printf "Either wget or curl is required to download models.\n" + exit 1 + fi +fi + +if [ ! -f "./data/assets/ggml-base.bin" ]; then + printf "Model is required to build the plugin. 
Aborting.\n" + exit 1 +fi + if [ "${PLATFORM}" = "linux-gnu" ] || [ "${PLATFORM}" = "redhat-linux" ] then - if [ -f "${CONTRIB_PATH}/native/.ffmpeg" ]; then - rm "${CONTRIB_PATH}/native/.ffmpeg" + if [ -f "${CONTRIB_BUILD_PATH}/.ffmpeg" ]; then + rm "${CONTRIB_BUILD_PATH}/.ffmpeg" + rm -rf "${CONTRIB_BUILD_PATH}/ffmpeg" + fi + if [ -f "${CONTRIB_BUILD_PATH}/.whispercpp" ]; then + rm "${CONTRIB_BUILD_PATH}/.whispercpp" + rm -rf "${CONTRIB_BUILD_PATH}/whispercpp" fi WORKPATH=$(pwd) - cd "${CONTRIB_PATH}/native/" - make .ffmpeg -j$(nproc) + cd "${CONTRIB_BUILD_PATH}/" + make .ffmpeg -j$(nproc) install + make .whispercpp -j$(nproc) install + rm .whispercpp rm .ffmpeg cd ${WORKPATH} CONTRIB_PLATFORM=${CONTRIB_PLATFORM_CURT}-${PLATFORM} - ONNX_PATH=${EXTRALIBS_PATH} - if [ -z "${EXTRALIBS_PATH}" ] - then - ONNX_PATH="${CONTRIB_PATH}/${CONTRIB_PLATFORM}" - fi - if [ ${DEBUG} ]; then - OUTPUT="${PLUGIN_NAME}" - CLANG_OPTS="-g -fsanitize=address" - EXTRA_DEBUG_LIBRARIES="-lyaml-cpp -lvdpau -lX11 -lva-drm -lva-x11 -lrav1e" - EXTRA_DEFINES="-D__DEBUG__" + OUTPUT="${PLUGIN_NAME}" + CLANG_OPTS="-O0 -g -fsanitize=address" + EXTRA_DEBUG_LIBRARIES="-lyaml-cpp -lvdpau -lX11 -lva-drm -lva-x11 -lrav1e" + EXTRA_DEFINES="-D__DEBUG__" else - python3 ./../SDK/jplManipulation.py --preassemble --plugin=${PLUGIN_NAME} - CLANG_OPTS="-O3 -shared" - OUTPUT="build-local/jpl/lib/${CONTRIB_PLATFORM}/${SO_FILE_NAME}" + python3 ./../SDK/jplManipulation.py --preassemble --plugin=${PLUGIN_NAME} + CLANG_OPTS="-O3 -g -shared" + OUTPUT="build-local/jpl/lib/${CONTRIB_PLATFORM}/${SO_FILE_NAME}" fi # Compile - clang++ -std=c++17 -g -O0 -fPIC ${CLANG_OPTS} \ + clang++ -std=c++17 -fPIC ${CLANG_OPTS} \ -Wl,-Bsymbolic,-rpath,"\${ORIGIN}" \ -Wall -Wextra \ -Wno-unused-parameter \ @@ -115,30 +136,26 @@ then -I"." 
\ -I"${DAEMON_SRC}" \ -I"${CONTRIB_PATH}/${CONTRIB_PLATFORM}/include" \ - -I"${ONNX_PATH}/include/onnxruntime/session" \ - -I"${ONNX_PATH}/include/onnxruntime/providers/cuda" \ - -I"${CONTRIB_PATH}/native/onnx/onnxruntime" \ -I"${PLUGINS_LIB}" \ ./../lib/common.cpp \ ./../lib/accel.cpp \ ./../lib/frameFilter.cpp \ ./../lib/frameUtils.cpp \ + ./../lib/resampler.cpp \ main.cpp \ TranscriptMediaHandler.cpp \ TranscriptAudioSubscriber.cpp \ TranscriptVideoSubscriber.cpp \ PluginPreferenceHandler.cpp \ - Preprocess.cpp \ - ModelProcessor.cpp \ - -L"${CONTRIB_PATH}/${CONTRIB_PLATFORM}/lib/" \ - -L"${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}" \ - -L"${CUDA_HOME}/lib64/" \ + stt_whisper.cpp \ + -L"${CONTRIB_PATH}/${CONTRIB_PLATFORM}/lib" \ -l:libavfilter.a \ -l:libswscale.a \ -l:libswresample.a \ -l:libavformat.a \ -l:libavcodec.a \ -l:libavutil.a \ + -l:libwhisper.a \ -lfreetype \ -lvpx \ -lx264 \ @@ -147,57 +164,36 @@ then -lz \ -lva \ -lfmt \ - -lonnxruntime \ ${EXTRA_DEBUG_LIBRARIES} \ -o "${OUTPUT}" - if [ ${DEBUG} ]; then - cp "./modelSRC/mModelEncoder.onnx" "./data/assets/mModelEncoder.onnx" - cp "./modelSRC/mModelDecoder.onnx" "./data/assets/mModelDecoder.onnx" - cp "./modelSRC/mLogSoftMax.onnx" "./data/assets/mLogSoftMax.onnx" - cp "${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}/libonnxruntime.so" "libonnxruntime.so.1.12.0" - else - cp "./modelSRC/mModelEncoder.onnx" "./build-local/jpl/data/assets/mModelEncoder.onnx" - cp "./modelSRC/mModelDecoder.onnx" "./build-local/jpl/data/assets/mModelDecoder.onnx" - cp "./modelSRC/mLogSoftMax.onnx" "./build-local/jpl/data/assets/mLogSoftMax.onnx" - cp "${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}/libonnxruntime.so" "build-local/jpl/lib/$CONTRIB_PLATFORM/libonnxruntime.so.1.12.0" - fi - if [ "${PROCESSOR}" = "NVIDIA" ]; then - if [ ${DEBUG} ]; then - cp "${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}/libonnxruntime_providers_shared.so" "libonnxruntime_providers_shared.so" - cp "${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}/libonnxruntime_providers_cuda.so" "libonnxruntime_providers_cuda.so" - else - cp "${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}/libonnxruntime_providers_shared.so" "build-local/jpl/lib/$CONTRIB_PLATFORM/libonnxruntime_providers_shared.so" - cp "${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}/libonnxruntime_providers_cuda.so" "build-local/jpl/lib/$CONTRIB_PLATFORM/libonnxruntime_providers_cuda.so" - fi - fi - elif [ "${PLATFORM}" = "darwin" ] then - if [ -f "${CONTRIB_PATH}/native/.ffmpeg" ]; then - rm "${CONTRIB_PATH}/native/.ffmpeg" + if [ -f "${CONTRIB_BUILD_PATH}/.ffmpeg" ]; then + rm "${CONTRIB_BUILD_PATH}/.ffmpeg" + rm -rf "${CONTRIB_BUILD_PATH}/ffmpeg" + fi + if [ -f "${CONTRIB_BUILD_PATH}/.whispercpp" ]; then + rm "${CONTRIB_BUILD_PATH}/.whispercpp" + rm -rf "${CONTRIB_BUILD_PATH}/whispercpp" fi WORKPATH=$(pwd) - cd "${CONTRIB_PATH}/native/" - make .ffmpeg -j$(nproc) + cd "${CONTRIB_BUILD_PATH}/" + make .whispercpp + make .ffmpeg + rm .whispercpp rm .ffmpeg cd ${WORKPATH} CONTRIB_PLATFORM=${CONTRIB_PLATFORM_CURT}-${PLATFORM} - ONNX_PATH=${EXTRALIBS_PATH} - if [ -z "${EXTRALIBS_PATH}" ] - then - ONNX_PATH="${CONTRIB_PATH}/${CONTRIB_PLATFORM}${CONTRIB_PLATFORM_EXTRA}" - fi - if [ ${DEBUG} ]; then OUTPUT="${PLUGIN_NAME}" - CLANG_OPTS="-g -fsanitize=address" + CLANG_OPTS="-O0 -g -fsanitize=address" EXTRA_DEBUG_LIBRARIES="-lyaml-cpp -lrav1e" EXTRA_DEFINES="-D__DEBUG__" else python3 ./../SDK/jplManipulation.py --preassemble --plugin=${PLUGIN_NAME} - CLANG_OPTS="-O3 -shared" + CLANG_OPTS="-O3 -g -shared" 
OUTPUT="build-local/jpl/lib/${CONTRIB_PLATFORM}/${SO_FILE_NAME}" fi @@ -215,21 +211,19 @@ then -I"." \ -I"${DAEMON_SRC}" \ -I"${CONTRIB_PATH}/${CONTRIB_PLATFORM}${CONTRIB_PLATFORM_EXTRA}/include" \ - -I"${ONNX_PATH}/include/onnxruntime/session" \ -I"${PLUGINS_LIB}" \ ./../lib/common.cpp \ ./../lib/accel.cpp \ ./../lib/frameFilter.cpp \ ./../lib/frameUtils.cpp \ + ./../lib/resampler.cpp \ main.cpp \ TranscriptMediaHandler.cpp \ TranscriptAudioSubscriber.cpp \ TranscriptVideoSubscriber.cpp \ PluginPreferenceHandler.cpp \ - Preprocess.cpp \ - ModelProcessor.cpp \ + stt_whisper.cpp \ -L"${CONTRIB_PATH}/${CONTRIB_PLATFORM}${CONTRIB_PLATFORM_EXTRA}/lib/" \ - -L"${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}" \ -lavfilter \ -lswscale \ -lswresample \ @@ -237,44 +231,22 @@ then -lavcodec \ -lavutil \ -lvpx -lx264 -lbz2 -liconv -lz \ - -lonnxruntime \ - -lspeex \ - -lopus \ + "${CONTRIB_PATH}/${CONTRIB_PLATFORM}${CONTRIB_PLATFORM_EXTRA}/lib/libspeex.a" \ + "${CONTRIB_PATH}/${CONTRIB_PLATFORM}${CONTRIB_PLATFORM_EXTRA}/lib/libopus.a" \ -lfmt \ + -lwhisper \ "/usr/local/opt/libpng/lib/libpng.a" \ "/usr/local/opt/freetype/lib/libfreetype.a" \ ${EXTRA_DEBUG_LIBRARIES} \ -o "${OUTPUT}" - if [ ${DEBUG} ]; then - cp "${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}/libonnxruntime.dylib" "libonnxruntime.dylib" - cp "./modelSRC/mModelEncoder.onnx" "./data/assets/mModelEncoder.onnx" - cp "./modelSRC/mModelDecoder.onnx" "./data/assets/mModelDecoder.onnx" - cp "./modelSRC/mLogSoftMax.onnx" "./data/assets/mLogSoftMax.onnx" - install_name_tool -id "@loader_path/libonnxruntime.1.12.0.dylib" "libonnxruntime.dylib" - install_name_tool -id "@loader_path/${PLUGIN_NAME}" "${OUTPUT}" - else - cp "./modelSRC/mModelEncoder.onnx" "./build-local/jpl/data/assets/mModelEncoder.onnx" - cp "./modelSRC/mModelDecoder.onnx" "./build-local/jpl/data/assets/mModelDecoder.onnx" - cp "./modelSRC/mLogSoftMax.onnx" "./build-local/jpl/data/assets/mLogSoftMax.onnx" - cp "${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}/libonnxruntime.dylib" "build-local/jpl/lib/${CONTRIB_PLATFORM}/libonnxruntime.dylib" - install_name_tool -id "@loader_path/libonnxruntime.1.12.0.dylib" "build-local/jpl/lib/${CONTRIB_PLATFORM}/libonnxruntime.dylib" - install_name_tool -id "@loader_path/${SO_FILE_NAME}" "${OUTPUT}" - fi - install_name_tool -change "@rpath/libonnxruntime.1.12.0.dylib" "@loader_path/libonnxruntime.dylib" "${OUTPUT}" - if [ -n "${APPLE_SIGN_CERTIFICATE}" ]; then - codesign --force --verify --timestamp -o runtime --sign "${APPLE_SIGN_CERTIFICATE}" "build-local/jpl/lib/${CONTRIB_PLATFORM}/libonnxruntime.dylib" codesign --force --verify --timestamp -o runtime --sign "${APPLE_SIGN_CERTIFICATE}" "build-local/jpl/lib/${CONTRIB_PLATFORM}/${SO_FILE_NAME}" - ditto -c -k --rsrc "build-local/jpl/lib/${CONTRIB_PLATFORM}/libonnxruntime.dylib" "build-local/libonnxruntime.dylib.zip" - LIBRARYNAME=libonnxruntime.dylib sh ./../notarize.sh - ditto -x -k "build-local/libonnxruntime.dylib.zip" "build-local/notarized0" - cp "build-local/notarized0/libonnxruntime.dylib" "build-local/jpl/lib/${CONTRIB_PLATFORM}/libonnxruntime.dylib" ditto -c -k --rsrc "build-local/jpl/lib/${CONTRIB_PLATFORM}/${SO_FILE_NAME}" "build-local/${SO_FILE_NAME}.zip" LIBRARYNAME=${SO_FILE_NAME} sh ./../notarize.sh - ditto -x -k "build-local/${SO_FILE_NAME}.zip" "build-local/notarized1" - cp "build-local/notarized1/${SO_FILE_NAME}" "build-local/jpl/lib/${CONTRIB_PLATFORM}/${SO_FILE_NAME}" + ditto -x -k "build-local/${SO_FILE_NAME}.zip" "build-local/notarized" + cp "build-local/notarized/${SO_FILE_NAME}" 
"build-local/jpl/lib/${CONTRIB_PLATFORM}/${SO_FILE_NAME}" fi elif [ "${PLATFORM}" = "android" ] @@ -346,13 +318,15 @@ then CONTRIB_PLATFORM=x86_64-linux-android fi - if [ -f "${CONTRIB_PATH}/native-${CONTRIB_PLATFORM}/.ffmpeg" ]; then - rm "${CONTRIB_PATH}/native-${CONTRIB_PLATFORM}/.ffmpeg" + if [ -f "${CONTRIB_BUILD_PATH}-${CONTRIB_PLATFORM}/.ffmpeg" ]; then + rm "${CONTRIB_BUILD_PATH}-${CONTRIB_PLATFORM}/.ffmpeg" fi WORKPATH=$(pwd) - cd "${CONTRIB_PATH}/native-${CONTRIB_PLATFORM}/" + cd "${CONTRIB_BUILD_PATH}-${CONTRIB_PLATFORM}/" make .ffmpeg -j$(nproc) + make .whispercpp -j$(nproc) + rm .whispercpp rm .ffmpeg cd ${WORKPATH} @@ -360,12 +334,6 @@ then # Compile the plugin #========================================================= - ONNX_PATH="${EXTRALIBS_PATH}/${CURRENT_ABI}" - if [ -z ${EXTRALIBS_PATH} ] - then - ONNX_PATH="${CONTRIB_PATH}/${CONTRIB_PLATFORM}" - fi - # Create so destination folder $CXX --std=c++17 -O3 -fPIC \ -Wl,-Bsymbolic,-rpath,"\${ORIGIN}" \ @@ -376,10 +344,6 @@ then -I"." \ -I"${DAEMON_SRC}" \ -I"${CONTRIB_PATH}/${CONTRIB_PLATFORM}/include" \ - -I"${ONNX_PATH}/include/onnxruntime/session" \ - -I"${ONNX_PATH}/include/onnxruntime/providers/nnapi" \ - -I"${ONNX_PATH}/../include/onnxruntime/session" \ - -I"${ONNX_PATH}/../include/onnxruntime/providers/nnapi" \ -I"${PLUGINS_LIB}" \ ./../lib/common.cpp \ ./../lib/accel.cpp \ @@ -390,10 +354,9 @@ then TranscriptAudioSubscriber.cpp \ TranscriptVideoSubscriber.cpp \ PluginPreferenceHandler.cpp \ - Preprocess.cpp \ - ModelProcessor.cpp \ + stt_whisper.cpp \ + ./../lib/resampler.cpp \ -L"${CONTRIB_PATH}/${CONTRIB_PLATFORM}/lib/" \ - -L"${ONNX_PATH}/lib/" \ -lavfilter \ -lswscale \ -lswresample \ @@ -405,13 +368,11 @@ then -lspeex \ -lopus \ -lfmt \ + -lwhisper \ -l:libfreetype.a \ -llog -lz \ - -lonnxruntime \ --sysroot=$ANDROID_SYSROOT \ -o "build-local/jpl/lib/$CURRENT_ABI/${SO_FILE_NAME}" - - cp "${ONNX_PATH}/lib/libonnxruntime.so" "build-local/jpl/lib/${CURRENT_ABI}/libonnxruntime.so" } # Build the so @@ -419,14 +380,10 @@ then CURRENT_ABI=$i buildlib done - - cp "./modelSRC/mModelEncoder.ort" "./build-local/jpl/data/assets/mModelEncoder.ort" - cp "./modelSRC/mModelDecoder.ort" "./build-local/jpl/data/assets/mModelDecoder.ort" - cp "./modelSRC/mLogSoftMax.ort" "./build-local/jpl/data/assets/mLogSoftMax.ort" fi if [ ! 
${DEBUG} ]; then - python3 ./../SDK/jplManipulation.py --assemble --plugin=${PLUGIN_NAME} --distribution=${PLATFORM} --extraPath=${EXTRAPATH} +python3 ./../SDK/jplManipulation.py --assemble --plugin=${PLUGIN_NAME} --distribution=${PLATFORM} --extraPath=${EXTRAPATH} fi cd ${CONTRIB_PATH}/src/ffmpeg/ diff --git a/WhisperTranscript/data/accountpreferences.json b/WhisperTranscript/data/accountpreferences.json index 04b18e459a33742a90a72db05830f5d9535213a8..a7bbcded57ed9c0e293bdb8c7433199aeac26643 100644 --- a/WhisperTranscript/data/accountpreferences.json +++ b/WhisperTranscript/data/accountpreferences.json @@ -3,7 +3,7 @@ "type": "List", "key": "language", "title": "{{language_title}}", - "defaultValue": "en", + "defaultValue": "auto", "scope": "plugin,Transcript", "entryValues": [ "auto", @@ -210,6 +210,84 @@ "{{language_yo}}" ] }, + { + "type": "List", + "key": "background", + "title": "{{background_title}}", + "summary": "{{background_summary}}", + "defaultValue": "black", + "scope": "plugin,Transcript", + "entryValues": [ + "black", + "white" + ], + "entries": [ + "{{background_entries_1}}", + "{{background_entries_2}}" + ] + }, + { + "type": "List", + "key": "position", + "title": "{{position_title}}", + "defaultValue": "2", + "scope": "plugin,Transcript", + "entryValues": [ + "1", + "2", + "3", + "4" + ], + "entries": [ + "{{position_entries_1}}", + "{{position_entries_2}}", + "{{position_entries_3}}", + "{{position_entries_4}}" + ] + }, + { + "type": "List", + "key": "fontsize", + "title": "{{fontsize_title}}", + "defaultValue": "14", + "scope": "plugin,Transcript", + "entryValues": [ + "10", + "12", + "14", + "16", + "18", + "24", + "36", + "72" + ], + "entries": [ + "10", + "12", + "14", + "16", + "18", + "24", + "36", + "72" + ] + }, + { + "type": "List", + "key": "avstream", + "title": "{{avstream_title}}", + "summary": "{{avstream_summary}}", + "defaultValue": "in", + "scope": "plugin", + "entryValues": [ + "out", + "in" + ], + "entries": [ + "{{avstream_entries_1}}", + "{{avstream_entries_2}}" + ] + }, { "type": "Switch", "key": "TranscriptAlways", diff --git a/WhisperTranscript/data/assets/.gitignore b/WhisperTranscript/data/assets/.gitignore index e1a699ac37f449b6c2b99d720554c9122433b2de..42cbf4c9c0bb524a484e6fa5d936cb14d5582535 100644 --- a/WhisperTranscript/data/assets/.gitignore +++ b/WhisperTranscript/data/assets/.gitignore @@ -1 +1,2 @@ *.onnx +*.bin diff --git a/WhisperTranscript/data/assets/mel_filters.bin b/WhisperTranscript/data/assets/mel_filters.bin deleted file mode 100644 index 9e3c32b7b856f37c60392d2023be21d7e4d76022..0000000000000000000000000000000000000000 Binary files a/WhisperTranscript/data/assets/mel_filters.bin and /dev/null differ diff --git a/WhisperTranscript/data/assets/tokenizer.bin b/WhisperTranscript/data/assets/tokenizer.bin deleted file mode 100644 index aa1c457305d25d603161d77b0ad348255f5664e1..0000000000000000000000000000000000000000 Binary files a/WhisperTranscript/data/assets/tokenizer.bin and /dev/null differ diff --git a/WhisperTranscript/data/locale/WhisperTranscript_en.json b/WhisperTranscript/data/locale/WhisperTranscript_en.json index 69bc1cfda143e0f22ed55638253e72cd5e4f1e67..4577ef665470bc0036ff57a4ec7843c7d3136044 100644 --- a/WhisperTranscript/data/locale/WhisperTranscript_en.json +++ b/WhisperTranscript/data/locale/WhisperTranscript_en.json @@ -5,11 +5,10 @@ "avstream_entries_2": "Received", "TranscriptAlways_title": "Automatically activate transcription", "TranscriptAlways_summary": "Activate transcription when a call starts.", - 
"background_title": "Add background color", - "background_summary": "Add a partial transparency to the subtitle background if it isn't visible enough", - "background_entries_1": "None", - "background_entries_2": "Black", - "background_entries_3": "White", + "background_title": "Background color", + "background_summary": "Defines the subtitle background color", + "background_entries_1": "Black", + "background_entries_2": "White", "position_title": "Transcription position", "position_entries_1": "Top right", "position_entries_2": "Top left", @@ -116,9 +115,5 @@ "language_vi": "Vietnamese", "language_cy": "Welsh", "language_yi": "Yiddish", - "language_yo": "Yoruba", - "acceleration_title": "Use hardware acceleration", - "acceleration_summary": "Use CUDA or NNAPI where applicable", - "chunk_title": "Chunk size in seconds", - "step_title": "Step size in seconds" + "language_yo": "Yoruba" } \ No newline at end of file diff --git a/WhisperTranscript/data/preferences.json b/WhisperTranscript/data/preferences.json index e803e211495a979010569a72d653f0016c890301..0637a088a01e8ddab3bf3fa98dbe804cbde1a0dc 100644 --- a/WhisperTranscript/data/preferences.json +++ b/WhisperTranscript/data/preferences.json @@ -1,224 +1 @@ -[ - { - "type": "List", - "key": "background", - "title": "{{background_title}}", - "summary": "{{background_summary}}", - "defaultValue": "black@0.0", - "scope": "plugin,Transcript", - "entryValues": [ - "black@0.0", - "black@0.5", - "white@0.5" - ], - "entries": [ - "{{background_entries_1}}", - "{{background_entries_2}}", - "{{background_entries_3}}" - ] - }, - { - "type": "List", - "key": "position", - "title": "{{position_title}}", - "defaultValue": "2", - "scope": "plugin,Transcript", - "entryValues": [ - "1", - "2", - "3", - "4" - ], - "entries": [ - "{{position_entries_1}}", - "{{position_entries_2}}", - "{{position_entries_3}}", - "{{position_entries_4}}" - ] - }, - { - "type": "List", - "key": "fontsize", - "title": "{{fontsize_title}}", - "defaultValue": "14", - "scope": "plugin,Transcript", - "entryValues": [ - "10", - "12", - "14", - "16", - "18", - "24", - "36", - "72" - ], - "entries": [ - "10", - "12", - "14", - "16", - "18", - "24", - "36", - "72" - ] - }, - { - "type": "List", - "key": "avstream", - "title": "{{avstream_title}}", - "summary": "{{avstream_summary}}", - "defaultValue": "in", - "scope": "plugin", - "entryValues": [ - "out", - "in" - ], - "entries": [ - "{{avstream_entries_1}}", - "{{avstream_entries_2}}" - ] - }, - { - "type": "List", - "key": "chunksize", - "title": "{{chunk_title}}", - "defaultValue": "15", - "scope": "plugin,Transcript", - "entryValues": [ - "5", - "6", - "7", - "8", - "9", - "10", - "11", - "12", - "13", - "14", - "15", - "16", - "17", - "18", - "19", - "20", - "21", - "22", - "23", - "24", - "25", - "26", - "27", - "28", - "29", - "30" - ], - "entries": [ - "5", - "6", - "7", - "8", - "9", - "10", - "11", - "12", - "13", - "14", - "15", - "16", - "17", - "18", - "19", - "20", - "21", - "22", - "23", - "24", - "25", - "26", - "27", - "28", - "29", - "30" - ] - }, - { - "type": "List", - "key": "stepsize", - "title": "{{step_title}}", - "defaultValue": "3", - "scope": "plugin,Transcript", - "entryValues": [ - "1", - "2", - "3", - "4", - "5", - "6", - "7", - "8", - "9", - "10", - "11", - "12", - "13", - "14", - "15", - "16", - "17", - "18", - "19", - "20", - "21", - "22", - "23", - "24", - "25", - "26", - "27", - "28", - "29", - "30" - ], - "entries": [ - "1", - "2", - "3", - "4", - "5", - "6", - "7", - "8", - "9", - "10", - "11", - 
"12", - "13", - "14", - "15", - "16", - "17", - "18", - "19", - "20", - "21", - "22", - "23", - "24", - "25", - "26", - "27", - "28", - "29", - "30" - ] - }, - { - "type": "Switch", - "key": "acceleration", - "title": "{{acceleration_title}}", - "summary": "{{acceleration_summary}}", - "defaultValue": "1", - "scope": "plugin" - } -] \ No newline at end of file +[] \ No newline at end of file diff --git a/WhisperTranscript/ffmpeg/package.json b/WhisperTranscript/ffmpeg/package.json index fa3d2779f7527d102cf56d9846b3b8c8993200b9..428d0e7489517aa934badd98a60dce69a161fdc3 100644 --- a/WhisperTranscript/ffmpeg/package.json +++ b/WhisperTranscript/ffmpeg/package.json @@ -1,6 +1,6 @@ { "name": "ffmpeg", - "version": "n5.0", + "version": "n6.0", "url": "https://git.ffmpeg.org/gitweb/ffmpeg.git/snapshot/__VERSION__.tar.gz", "deps": [ "freetype", @@ -16,13 +16,12 @@ "libopusenc-reload-packet-loss-at-encode.patch", "libopusdec-enable-FEC.patch", "windows-configure.patch", - "windows-configure-ffnvcodec.patch", - "windows-configure-libmfx.patch" + "windows-configure-ffnvcodec.patch" ], "win_patches": [ ], "project_paths": [], - "with_env" : "10.0.16299.0", + "with_env" : "", "custom_scripts": { "pre_build": [], "build": [ diff --git a/WhisperTranscript/ffmpeg/rules.mak b/WhisperTranscript/ffmpeg/rules.mak index 7b5c55447f9757018014a913e3d25328b8577f46..24296dfdf3722d5a32885f0889cdbb152accef13 100644 --- a/WhisperTranscript/ffmpeg/rules.mak +++ b/WhisperTranscript/ffmpeg/rules.mak @@ -1,4 +1,4 @@ -FFMPEG_HASH := n5.0 +FFMPEG_HASH := n6.0 FFMPEG_URL := https://git.ffmpeg.org/gitweb/ffmpeg.git/snapshot/$(FFMPEG_HASH).tar.gz PKGS+=ffmpeg @@ -86,16 +86,10 @@ FFMPEGCONF += \ --enable-parser=mpeg4video \ --enable-parser=vp8 \ --enable-parser=vp9 \ - --enable-parser=opus \ - --enable-parser=w64 \ - --enable-parser=wav + --enable-parser=opus #encoders/decoders FFMPEGCONF += \ - --enable-encoder=w64 \ - --enable-encoder=wav \ - --enable-decoder=w64 \ - --enable-decoder=wav \ --enable-encoder=adpcm_g722 \ --enable-decoder=adpcm_g722 \ --enable-encoder=rawvideo \ @@ -344,7 +338,7 @@ $(TARBALLS)/ffmpeg-$(FFMPEG_HASH).tar.gz: ffmpeg: ffmpeg-$(FFMPEG_HASH).tar.gz rm -Rf $@ $@-$(FFMPEG_HASH) mkdir -p $@-$(FFMPEG_HASH) - (cd $@-$(FFMPEG_HASH) && tar x $(if ${BATCH_MODE},,-v) --strip-components=1 -f ../$<) + (cd $@-$(FFMPEG_HASH) && tar x $(if ${BATCH_MODE},,-v) --strip-components=1 -f $<) $(APPLY) $(SRC)/ffmpeg/remove-mjpeg-log.patch $(APPLY) $(SRC)/ffmpeg/change-RTCP-ratio.patch $(APPLY) $(SRC)/ffmpeg/rtp_ext_abs_send_time.patch @@ -352,6 +346,7 @@ ffmpeg: ffmpeg-$(FFMPEG_HASH).tar.gz $(APPLY) $(SRC)/ffmpeg/libopusenc-reload-packet-loss-at-encode.patch $(APPLY) $(SRC)/ffmpeg/ios-disable-b-frames.patch $(APPLY) $(SRC)/ffmpeg/screen-sharing-x11-fix.patch + $(APPLY) $(SRC)/ffmpeg/nvenc-fix-reorderqueueflush-crash.patch $(UPDATE_AUTOCONFIG) $(MOVE) diff --git a/WhisperTranscript/package.json b/WhisperTranscript/package.json index e0e95adbe6eeeb700fb94b3b1d772ad4299f1a81..71ac7733340637d5af44a7289becbe39270b48a1 100644 --- a/WhisperTranscript/package.json +++ b/WhisperTranscript/package.json @@ -4,7 +4,6 @@ "extractLibs": false, "deps": [], "defines": [ - "NVIDIA=False", "TESTPROCESS=False" ], "custom_scripts": { diff --git a/WhisperTranscript/stt_whisper.cpp b/WhisperTranscript/stt_whisper.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3df602afce51ae8edbe1a2ae2a578353d7428ef7 --- /dev/null +++ b/WhisperTranscript/stt_whisper.cpp @@ -0,0 +1,266 @@ +#include "stt_whisper.h" +#include "whisper.h" + 
+#ifdef WIN32
+#define _USE_MATH_DEFINES
+#include <math.h>
+#endif
+
+#include <atomic>
+#include <cmath>
+#include <mutex>
+#include <string>
+#include <thread>
+#include <vector>
+
+void print_array(const std::vector<float>& data)
+{
+    fprintf(stdout, "print array: [");
+    for (int i = 0; i < std::min((int)data.size(), 10); i++) {
+        fprintf(stdout, " %.8f,", data[i]);
+    }
+    fprintf(stdout, " ]\n");
+}
+
+void high_pass_filter(std::vector<float>& data, float cutoff, float sample_rate)
+{
+    const float rc = 1.0f / (2.0f * M_PI * cutoff);
+    const float dt = 1.0f / sample_rate;
+    const float alpha = dt / (rc + dt);
+
+    float y = data[0];
+
+    for (size_t i = 1; i < data.size(); i++) {
+        y = alpha * (y + data[i] - data[i - 1]);
+        data[i] = y;
+    }
+}
+
+/** Check if speech is ending. */
+bool vad_simple(std::vector<float>& pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose)
+{
+    const int n_samples = pcmf32.size();
+    const int n_samples_last = (sample_rate * last_ms) / 1000;
+
+    if (n_samples_last >= n_samples) {
+        // not enough samples - assume no speech
+        return false;
+    }
+
+    if (freq_thold > 0.0f) {
+        high_pass_filter(pcmf32, freq_thold, sample_rate);
+    }
+
+    float energy_all = 0.0f;
+    float energy_last = 0.0f;
+
+    for (int i = 0; i < n_samples; i++) {
+        energy_all += fabsf(pcmf32[i]);
+
+        if (i >= n_samples - n_samples_last) {
+            energy_last += fabsf(pcmf32[i]);
+        }
+    }
+
+    energy_all /= n_samples;
+    energy_last /= n_samples_last;
+
+    if (verbose) {
+        fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
+    }
+
+    if ((energy_all < 0.0001f && energy_last < 0.0001f) || energy_last > vad_thold * energy_all) {
+        return false;
+    }
+
+    return true;
+}
+
+RealtimeSttWhisper::RealtimeSttWhisper(const std::string& path_model)
+{
+    ctx = whisper_init_from_file(path_model.c_str());
+    is_running = true;
+    worker = std::thread(&RealtimeSttWhisper::Run, this);
+    t_last_iter = std::chrono::high_resolution_clock::now();
+}
+
+RealtimeSttWhisper::~RealtimeSttWhisper()
+{
+    is_running = false;
+    if (worker.joinable())
+        worker.join();
+    whisper_free(ctx);
+}
+
+/** Add audio data in PCM f32 format. */
+void RealtimeSttWhisper::AddAudioData(const float* data, size_t n_samples)
+{
+    std::lock_guard<std::mutex> lock(s_mutex);
+    // printf("AddAudioData: remaining: %d, new: %d\n", (int)s_queued_pcmf32.size(), (int)n_samples);
+    s_queued_pcmf32.insert(s_queued_pcmf32.end(), data, data + n_samples);
+}
+
+/** Get newly transcribed text. */
+std::vector<transcribed_msg> RealtimeSttWhisper::GetTranscribed()
+{
+    std::vector<transcribed_msg> transcribed;
+    std::lock_guard<std::mutex> lock(s_mutex);
+    transcribed = std::move(s_transcribed_msgs);
+    s_transcribed_msgs.clear();
+    return transcribed;
+}
+
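+// Illustrative numbers for vad_simple() as used by Run() below: with
+// vad_thold = 0.3, if the mean absolute sample over the whole 3s window is
+// 0.010 but only 0.002 over the last 500ms, then 0.002 <= 0.3 * 0.010, so
+// the function returns true and the current iteration is finalized.
+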
+/** Run Whisper in its own thread so it does not block the main thread. */
+void RealtimeSttWhisper::Run()
+{
+    struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);
+
+    // See here for example https://github.com/ggerganov/whisper.cpp/blob/master/examples/stream/stream.cpp#L302
+    wparams.n_threads = 4;
+    wparams.no_context = true;
+    wparams.single_segment = true;
+    wparams.print_progress = false;
+    wparams.print_realtime = false;
+    wparams.print_special = false;
+    wparams.print_timestamps = false;
+    wparams.max_tokens = 64;
+    wparams.translate = false;
+
+    /**
+     * Experimental optimization: reduce audio_ctx to 15s (half of the 30s
+     * chunk size Whisper is designed for) to speed up roughly 2x.
+     * https://github.com/ggerganov/whisper.cpp/issues/137#issuecomment-1318412267
+     */
+    wparams.audio_ctx = 768;
+
+    /* When more than this amount of audio is received, run an iteration. */
+    const int trigger_ms = 400;
+    const int n_samples_trigger = (trigger_ms / 1000.0) * WHISPER_SAMPLE_RATE;
+    /**
+     * When more than this amount of audio accumulates in the audio buffer,
+     * force-finalize the current audio context and clear the buffer. Note
+     * that VAD may finalize an iteration earlier.
+     */
+    // This is recommended to be smaller than the time wparams.audio_ctx
+    // represents so an iteration can fit in one chunk.
+    const int iter_threshold_ms = trigger_ms * 35;
+    const int n_samples_iter_threshold = (iter_threshold_ms / 1000.0) * WHISPER_SAMPLE_RATE;
+
+    /**
+     * ### Reminders
+     *
+     * - Whisper is designed to process audio in 30-second chunks, and the
+     *   execution time of processing smaller chunks may not be shorter.
+     * - The design of trigger and threshold allows inputting audio data at
+     *   arbitrary rates with zero config. Inspired by Assembly.ai's
+     *   real-time transcription API
+     *   (https://github.com/misraturp/Real-time-transcription-from-microphone/blob/main/speech_recognition.py)
+     */
+
+    /* VAD parameters */
+    // The most recent 3s.
+    const int vad_window_s = 3;
+    const int n_samples_vad_window = WHISPER_SAMPLE_RATE * vad_window_s;
+    // In VAD, compare the energy of the last 500ms to that of the total 3s.
+    const int vad_last_ms = 500;
+    // Keep the last 0.5s of an iteration to the next one for better
+    // transcription at begin/end.
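+    // (at WHISPER_SAMPLE_RATE = 16000, this carries 8000 samples over)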
+ const int n_samples_keep_iter = WHISPER_SAMPLE_RATE * 0.5; + const float vad_thold = 0.3f; + const float freq_thold = 200.0f; + + /* Audio buffer */ + std::vector<float> pcmf32; + + /* Processing loop */ + while (is_running) { + { + std::unique_lock<std::mutex> lock(s_mutex); + + if (s_queued_pcmf32.size() < n_samples_trigger) { + lock.unlock(); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + continue; + } + } + + { + std::lock_guard<std::mutex> lock(s_mutex); + + if (s_queued_pcmf32.size() > 2 * n_samples_iter_threshold) { + fprintf(stderr, "\n\n%s: WARNING: too much audio is going to be processed, result may not come out in real time\n\n", __func__); + } + } + + { + std::lock_guard<std::mutex> lock(s_mutex); + + pcmf32.insert(pcmf32.end(), s_queued_pcmf32.begin(), s_queued_pcmf32.end()); + + // printf("existing: %d, new: %d, will process: %d, threshold: %d\n", + // n_samples_old, n_samples_new, (int)pcmf32.size(), n_samples_iter_threshold); + + // print_array(pcmf32); + + s_queued_pcmf32.clear(); + wparams.language = lang_.c_str(); + } + + { + int ret = whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()); + if (ret != 0) { + fprintf(stderr, "Failed to process audio, returned %d\n", ret); + continue; + } + } + + { + transcribed_msg msg; + + const int n_segments = whisper_full_n_segments(ctx); + for (int i = 0; i < n_segments; ++i) { + const char* text = whisper_full_get_segment_text(ctx, i); + msg.text += text; + } + + /** + * Simple VAD from the "stream" example in whisper.cpp + * https://github.com/ggerganov/whisper.cpp/blob/231bebca7deaf32d268a8b207d15aa859e52dbbe/examples/stream/stream.cpp#L378 + */ + bool speech_has_end = false; + + /* Need enough accumulated audio to do VAD. */ + if ((int)pcmf32.size() >= n_samples_vad_window) { + std::vector<float> pcmf32_window(pcmf32.end() - n_samples_vad_window, pcmf32.end()); + speech_has_end = vad_simple(pcmf32_window, WHISPER_SAMPLE_RATE, vad_last_ms, + vad_thold, freq_thold, false); + if (speech_has_end) + printf("speech end detected\n"); + } + + /** + * Clear audio buffer when the size exceeds iteration threshold or + * speech end is detected. + */ + if (pcmf32.size() > n_samples_iter_threshold || speech_has_end) { + const auto t_now = std::chrono::high_resolution_clock::now(); + const auto t_diff = std::chrono::duration_cast<std::chrono::milliseconds>(t_now - t_last_iter).count(); + printf("iter took: %lldms\n", t_diff); + t_last_iter = t_now; + + msg.is_partial = false; + /** + * Keep the last few samples in the audio buffer, so the next + * iteration has a smoother start. 
+                */
+                std::vector<float> last(pcmf32.end() - n_samples_keep_iter, pcmf32.end());
+                pcmf32 = std::move(last);
+            } else {
+                msg.is_partial = true;
+            }
+
+            std::lock_guard<std::mutex> lock(s_mutex);
+            s_transcribed_msgs.insert(s_transcribed_msgs.end(), std::move(msg));
+        }
+    }
+}
\ No newline at end of file
diff --git a/WhisperTranscript/stt_whisper.h b/WhisperTranscript/stt_whisper.h
new file mode 100644
index 0000000000000000000000000000000000000000..4766faffc2b2448c3088678421c3d6cbb88913ac
--- /dev/null
+++ b/WhisperTranscript/stt_whisper.h
@@ -0,0 +1,44 @@
+#ifndef STT_WHISPER_H_
+#define STT_WHISPER_H_
+
+#include <atomic>
+#include <chrono>
+#include <mutex>
+#include <string>
+#include <thread>
+#include <vector>
+
+#define WHISPER_SAMPLE_RATE 16000
+
+struct transcribed_msg {
+    std::string text;
+    bool is_partial;
+};
+
+class WavWriter;
+
+class RealtimeSttWhisper
+{
+  public:
+    RealtimeSttWhisper(const std::string& path_model);
+    ~RealtimeSttWhisper();
+    void AddAudioData(const float* data, size_t n_samples);
+    std::vector<transcribed_msg> GetTranscribed();
+    void setLanguage(const std::string& lang) {
+        std::lock_guard<std::mutex> lock(s_mutex);
+        lang_ = lang;
+    }
+
+  private:
+    struct whisper_context* ctx;
+    std::string lang_;
+    std::atomic<bool> is_running;
+    std::vector<float> s_queued_pcmf32;
+    std::vector<transcribed_msg> s_transcribed_msgs;
+    std::mutex s_mutex; // for accessing shared variables from both main thread and worker thread
+    std::thread worker;
+    void Run();
+    std::chrono::time_point<std::chrono::high_resolution_clock> t_last_iter;
+};
+
+#endif  // STT_WHISPER_H_
diff --git a/WhisperTranscript/whispercpp/package.json b/WhisperTranscript/whispercpp/package.json
new file mode 100644
index 0000000000000000000000000000000000000000..20eb8b4727c481f22c14e1699b76a6cc85be2af6
--- /dev/null
+++ b/WhisperTranscript/whispercpp/package.json
@@ -0,0 +1,17 @@
+{
+    "name": "whispercpp",
+    "version": "v1.2.1",
+    "url": "https://github.com/ggerganov/whisper.cpp/archive/refs/tags/__VERSION__.tar.gz",
+    "deps": [],
+    "patches": ["project.patch"],
+    "win_patches": [],
+    "project_paths": ["whisper.vcxproj"],
+    "with_env" : "",
+    "custom_scripts": {
+        "pre_build": [
+            "wget --no-check-certificate --quiet --show-progress -O ggml-base.bin https://ggml.ggerganov.com/ggml-model-whisper-base.bin"
+        ],
+        "build": [],
+        "post_build": []
+    }
+}
diff --git a/WhisperTranscript/whispercpp/project.patch b/WhisperTranscript/whispercpp/project.patch
new file mode 100644
index 0000000000000000000000000000000000000000..f3130e9ac52a6421ae6abf5584f518c3e23bcfe2
--- /dev/null
+++ b/WhisperTranscript/whispercpp/project.patch
@@ -0,0 +1,97 @@
+---
+ whisper.vcxproj | 83 +++++++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 83 insertions(+)
+ create mode 100644 whisper.vcxproj
+
+diff --git a/whisper.vcxproj b/whisper.vcxproj
+new file mode 100644
+index 0000000..9cbfdb7
+--- /dev/null
++++ b/whisper.vcxproj
+@@ -0,0 +1,83 @@
++<?xml version="1.0" encoding="utf-8"?>
++<Project DefaultTargets="Build" ToolsVersion="17.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
++  <PropertyGroup>
++    <PreferredToolArchitecture>x64</PreferredToolArchitecture>
++  </PropertyGroup>
++  <ItemGroup Label="ProjectConfigurations">
++    <ProjectConfiguration Include="Release|x64">
++      <Configuration>Release</Configuration>
++      <Platform>x64</Platform>
++    </ProjectConfiguration>
++  </ItemGroup>
++  <PropertyGroup Label="Globals">
++    <ProjectGuid>{47B512DE-EE88-3A32-A01F-DF4317B53175}</ProjectGuid>
++    
<Keyword>Win32Proj</Keyword> ++ <WindowsTargetPlatformVersion>10.0.18362.0</WindowsTargetPlatformVersion> ++ <Platform>x64</Platform> ++ <ProjectName>whisper</ProjectName> ++ <VCProjectUpgraderObjectName>NoUpgrade</VCProjectUpgraderObjectName> ++ </PropertyGroup> ++ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> ++ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> ++ <ConfigurationType>StaticLibrary</ConfigurationType> ++ <CharacterSet>MultiByte</CharacterSet> ++ <PlatformToolset>v143</PlatformToolset> ++ </PropertyGroup> ++ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> ++ <ImportGroup Label="ExtensionSettings"> ++ </ImportGroup> ++ <ImportGroup Label="PropertySheets"> ++ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> ++ </ImportGroup> ++ <PropertyGroup Label="UserMacros" /> ++ <PropertyGroup> ++ <_ProjectFileVersion>10.0.18362.0</_ProjectFileVersion> ++ <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(ProjectDir)..\..\msvc\lib\$(Platform)\</OutDir> ++ <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|x64'">whisper.dir\Release\</IntDir> ++ <TargetName Condition="'$(Configuration)|$(Platform)'=='Release|x64'">whisper</TargetName> ++ <TargetExt Condition="'$(Configuration)|$(Platform)'=='Release|x64'">.lib</TargetExt> ++ </PropertyGroup> ++ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> ++ <ClCompile> ++ <AdditionalIncludeDirectories>$(ProjectDir).;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> ++ <AssemblerListingLocation>$(IntDir)</AssemblerListingLocation> ++ <EnableEnhancedInstructionSet>AdvancedVectorExtensions2</EnableEnhancedInstructionSet> ++ <ExceptionHandling>Sync</ExceptionHandling> ++ <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion> ++ <Optimization>MaxSpeed</Optimization> ++ <PrecompiledHeader>NotUsing</PrecompiledHeader> ++ <RuntimeLibrary>MultiThreaded</RuntimeLibrary> ++ <RuntimeTypeInfo>true</RuntimeTypeInfo> ++ <UseFullPaths>false</UseFullPaths> ++ <WarningLevel>Level3</WarningLevel> ++ <PreprocessorDefinitions>%(PreprocessorDefinitions);WIN32;_WINDOWS;NDEBUG;_CRT_SECURE_NO_WARNINGS;CMAKE_INTDIR="Release"</PreprocessorDefinitions> ++ <ObjectFileName>$(IntDir)</ObjectFileName> ++ <DebugInformationFormat> ++ </DebugInformationFormat> ++ </ClCompile> ++ <ResourceCompile> ++ <PreprocessorDefinitions>%(PreprocessorDefinitions);WIN32;_WINDOWS;NDEBUG;_CRT_SECURE_NO_WARNINGS;CMAKE_INTDIR=\"Release\"</PreprocessorDefinitions> ++ <AdditionalIncludeDirectories>$(ProjectDir).;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> ++ </ResourceCompile> ++ <Midl> ++ <AdditionalIncludeDirectories>$(ProjectDir).;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> ++ <OutputDirectory>$(ProjectDir)/$(IntDir)</OutputDirectory> ++ <HeaderFileName>%(Filename).h</HeaderFileName> ++ <TypeLibraryName>%(Filename).tlb</TypeLibraryName> ++ <InterfaceIdentifierFileName>%(Filename)_i.c</InterfaceIdentifierFileName> ++ <ProxyFileName>%(Filename)_p.c</ProxyFileName> ++ </Midl> ++ <Lib> ++ <AdditionalOptions>%(AdditionalOptions) /machine:x64</AdditionalOptions> ++ </Lib> ++ </ItemDefinitionGroup> ++ <ItemGroup> ++ <ClInclude Include="ggml.h" /> ++ <ClCompile Include="ggml.c" /> ++ <ClInclude Include="whisper.h" /> ++ <ClCompile Include="whisper.cpp" /> ++ </ItemGroup> ++ <Import 
Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> ++ <ImportGroup Label="ExtensionTargets"> ++ </ImportGroup> ++</Project> +-- +2.37.1.windows.1 + diff --git a/WhisperTranscript/whispercpp/rules.mak b/WhisperTranscript/whispercpp/rules.mak new file mode 100644 index 0000000000000000000000000000000000000000..22791824945b80fb32054176c4c5843fd550d5e6 --- /dev/null +++ b/WhisperTranscript/whispercpp/rules.mak @@ -0,0 +1,27 @@ +# whispercpp +WHISPERCPP_HASH := v1.2.1 +WHISPERCPP_GITURL := https://github.com/ggerganov/whisper.cpp.git + +WCONFIG := -DBUILD_SHARED_LIBS=OFF \ + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + +$(TARBALLS)/whispercpp-$(WHISPERCPP_HASH).tar.xz: + $(call download_git,$(WHISPERCPP_GITURL),master,$(WHISPERCPP_HASH)) + +.sum-whispercpp: whispercpp-$(WHISPERCPP_HASH).tar.xz + $(warning $@ not implemented) + touch $@ + +whispercpp: whispercpp-$(WHISPERCPP_HASH).tar.xz .sum-whispercpp + rm -Rf $@-$(WHISPERCPP_HASH) + mkdir -p $@-$(WHISPERCPP_HASH) + (cd $@-$(WHISPERCPP_HASH) && tar x $(if ${BATCH_MODE},,-v) --strip-components=1 -f $<) + $(UPDATE_AUTOCONFIG) + $(MOVE) + +.whispercpp: whispercpp + cd $< && cmake . $(WCONFIG) + cd $< && $(MAKE) + cd $< && cp libwhisper.a $(PREFIX)/lib + cd $< && cp whisper.h $(PREFIX)/include + touch $@ diff --git a/lib/accel.cpp b/lib/accel.cpp index 785a092c9591f815569c8b696977f935b31f0083..36e764d0e0a2bd307283ed0d581be06d86dee733 100644 --- a/lib/accel.cpp +++ b/lib/accel.cpp @@ -38,15 +38,12 @@ av_frame_new_side_data_from_buf(AVFrame* frame, enum AVFrameSideDataType type, A AVFrame* transferToMainMemory(const AVFrame* framePtr, AVPixelFormat desiredFormat) { - AVFrame* out = av_frame_alloc(); auto desc = av_pix_fmt_desc_get(static_cast<AVPixelFormat>(framePtr->format)); - if (desc && !(desc->flags & AV_PIX_FMT_FLAG_HWACCEL)) { - av_frame_unref(out); - av_frame_free(&out); return av_frame_clone(framePtr); } + AVFrame* out = av_frame_alloc(); out->format = desiredFormat; if (av_hwframe_transfer_data(out, framePtr, 0) < 0) { av_frame_unref(out); diff --git a/lib/common.cpp b/lib/common.cpp index c9534b13107a49f455640958aa159cd94b8a2087..0eeff59b68ff6d239a68052c3396fd28def86e88 100644 --- a/lib/common.cpp +++ b/lib/common.cpp @@ -59,6 +59,7 @@ void ffmpegFormatStringInline(std::string& str) void ffmpegScapeStringInline(std::string& str) { std::string newStr; + newStr.reserve(str.size()); for (size_t i = 0; i < str.size(); i ++) { switch (str[i]) { case '\'': diff --git a/lib/frameUtils.cpp b/lib/frameUtils.cpp index 72890269db8f2eb957ec82727fa4fbf2c7089f95..37a0666a1da266c42ba5b96c4f12741da97c15a3 100644 --- a/lib/frameUtils.cpp +++ b/lib/frameUtils.cpp @@ -28,7 +28,6 @@ void moveFrom(AVFrame* dst, AVFrame* src) { if (dst && src) { - av_frame_copy_props(src, dst); av_frame_unref(dst); av_frame_move_ref(dst, src); } @@ -37,7 +36,8 @@ moveFrom(AVFrame* dst, AVFrame* src) void frameFree(AVFrame* frame) { - av_frame_unref(frame); + if (frame) + av_frame_unref(frame); av_frame_free(&frame); } diff --git a/lib/resampler.cpp b/lib/resampler.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9d2a06d44e8784dfb6fba341d932734c05aa91b0 --- /dev/null +++ b/lib/resampler.cpp @@ -0,0 +1,140 @@ +/* + * Copyright (C) 2004-2023 Savoir-faire Linux Inc. 
*
+ * Author: Emmanuel Milou <emmanuel.milou@savoirfairelinux.com>
+ * Author: Alexandre Savard <alexandre.savard@savoirfairelinux.com>
+ * Author: Philippe Gorley <philippe.gorley@savoirfairelinux.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "resampler.h"
+
+extern "C" {
+#include <libswresample/swresample.h>
+#include <libavutil/opt.h>
+}
+
+#include <new>
+#include <stdexcept>
+#include <iostream>
+
+namespace jami {
+
+Resampler::Resampler()
+    : swrCtx_(swr_alloc())
+    , initCount_(0)
+{}
+
+Resampler::~Resampler()
+{
+    swr_free(&swrCtx_);
+}
+
+void
+Resampler::reinit(const AVFrame* in, const AVFrame* out)
+{
+    // NOTE swr_set_matrix should be called on an uninitialized context
+    auto swrCtx = swr_alloc();
+    if (!swrCtx) {
+        throw std::bad_alloc();
+    }
+
+    av_opt_set_chlayout(swrCtx, "ichl", &in->ch_layout, 0);
+    av_opt_set_int(swrCtx, "isr", in->sample_rate, 0);
+    av_opt_set_sample_fmt(swrCtx, "isf", static_cast<AVSampleFormat>(in->format), 0);
+
+    av_opt_set_chlayout(swrCtx, "ochl", &out->ch_layout, 0);
+    av_opt_set_int(swrCtx, "osr", out->sample_rate, 0);
+    av_opt_set_sample_fmt(swrCtx, "osf", static_cast<AVSampleFormat>(out->format), 0);
+
+    /**
+     * Downmixing from 5.1 requires extra setup, since libswresample can't do it automatically
+     * (not yet implemented).
+     *
+     * Source: https://www.atsc.org/wp-content/uploads/2015/03/A52-201212-17.pdf
+     * Section 7.8.2 for the algorithm
+     * Tables 5.9 and 5.10 for the coefficients clev and slev
+     *
+     * LFE downmixing is optional, so any coefficient can be used; we use +6dB for mono and
+     * +0dB in each channel for stereo.
+ */ + if (in->ch_layout.u.mask == AV_CH_LAYOUT_5POINT1 + || in->ch_layout.u.mask == AV_CH_LAYOUT_5POINT1_BACK) { + // NOTE MSVC can't allocate dynamic size arrays on the stack + if (out->ch_layout.nb_channels == 2) { + double matrix[2][6]; + // L = 1.0*FL + 0.707*FC + 0.707*BL + 1.0*LFE + matrix[0][0] = 1; + matrix[0][1] = 0; + matrix[0][2] = 0.707; + matrix[0][3] = 1; + matrix[0][4] = 0.707; + matrix[0][5] = 0; + // R = 1.0*FR + 0.707*FC + 0.707*BR + 1.0*LFE + matrix[1][0] = 0; + matrix[1][1] = 1; + matrix[1][2] = 0.707; + matrix[1][3] = 1; + matrix[1][4] = 0; + matrix[1][5] = 0.707; + swr_set_matrix(swrCtx, matrix[0], 6); + } else { + double matrix[1][6]; + // M = 1.0*FL + 1.414*FC + 1.0*FR + 0.707*BL + 0.707*BR + 2.0*LFE + matrix[0][0] = 1; + matrix[0][1] = 1; + matrix[0][2] = 1.414; + matrix[0][3] = 2; + matrix[0][4] = 0.707; + matrix[0][5] = 0.707; + swr_set_matrix(swrCtx, matrix[0], 6); + } + } + + if (swr_init(swrCtx) >= 0) { + std::swap(swrCtx_, swrCtx); + swr_free(&swrCtx); + ++initCount_; + } else { + throw std::runtime_error("Failed to initialize resampler context"); + } +} + +int +Resampler::resample(const AVFrame* input, AVFrame* output) +{ + if (!initCount_) + reinit(input, output); + + int ret = swr_convert_frame(swrCtx_, output, input); + if (ret & AVERROR_INPUT_CHANGED || ret & AVERROR_OUTPUT_CHANGED) { + // Under certain conditions, the resampler reinits itself in an infinite loop. This is + // indicative of an underlying problem in the code. This check is so the backtrace + // doesn't get mangled with a bunch of calls to Resampler::resample + if (initCount_ > 1) { + throw std::runtime_error("Infinite loop detected in audio resampler"); + } + reinit(input, output); + return resample(input, output); + } else if (ret < 0) { + return -1; + } + + // Resampling worked, reset count to 1 so reinit isn't called again + initCount_ = 1; + return 0; +} +} // namespace jami diff --git a/lib/resampler.h b/lib/resampler.h new file mode 100644 index 0000000000000000000000000000000000000000..cbf61799ea955b016b04bd20d124ed06929f4bcc --- /dev/null +++ b/lib/resampler.h @@ -0,0 +1,74 @@ +/* + * Copyright (C) 2004-2023 Savoir-faire Linux Inc. + * + * Author: Emmanuel Milou <emmanuel.milou@savoirfairelinux.com> + * Author: Alexandre Savard <alexandre.savard@savoirfairelinux.com> + * Author: Philippe Gorley <philippe.gorley@savoirfairelinux.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#pragma once + +extern "C" { +struct AVFrame; +struct SwrContext; +} + +namespace jami { + +/** + * @brief Wrapper class for libswresample + */ +class Resampler +{ +public: + Resampler(); + ~Resampler(); + + /** + * @brief Resample a frame. + * + * Resample from @input format to @output format. 
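+     * For example, a 44100 Hz stereo S16 input frame can be converted into
+     * the 16000 Hz mono float frame that the Whisper transcriber expects.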
+     *
+     * NOTE: sample_rate, ch_layout, and format should be set on @output
+     */
+    int resample(const AVFrame* input, AVFrame* output);
+private:
+    /**
+     * @brief Reinitializes the filter according to the new format.
+     *
+     * Reinitializes the resampler when new settings are detected. As long as both input and
+     * output formats don't change, this will only be called once.
+     */
+    void reinit(const AVFrame* in, const AVFrame* out);
+
+    /**
+     * @brief Libswresample resampler context.
+     *
+     * NOTE SwrContext is an incomplete type and cannot be stored in a smart pointer.
+     */
+    SwrContext* swrCtx_;
+
+    /**
+     * @brief Number of times @swrCtx_ has been initialized with no successful audio resampling.
+     *
+     * 0: Uninitialized
+     * 1: Initialized
+     * >1: Invalid frames or formats, reinit is going to be called in an infinite loop
+     */
+    unsigned initCount_;
+};
+} // namespace jami
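
For reference, below is a minimal sketch of how a host program could drive the RealtimeSttWhisper class added by this patch. The model path, the silent stand-in audio, and the 500 ms polling cadence are illustrative assumptions, not part of the patch; a real caller would feed audio resampled to 16 kHz mono float, which is what the lib/resampler.h added above provides.

    // sketch.cpp - hypothetical standalone driver; link against libwhisper
    // and the plugin sources added in this patch.
    #include "stt_whisper.h"

    #include <chrono>
    #include <cstdio>
    #include <thread>
    #include <vector>

    int main()
    {
        // Model path is an assumption; the build copies ggml-base.bin into data/assets.
        RealtimeSttWhisper stt("data/assets/ggml-base.bin");
        stt.setLanguage("en");

        // 0.5 s of silence per push as a stand-in for real 16 kHz mono f32 PCM.
        std::vector<float> pcm(WHISPER_SAMPLE_RATE / 2, 0.0f);

        for (int i = 0; i < 20; ++i) {
            stt.AddAudioData(pcm.data(), pcm.size());
            for (const auto& msg : stt.GetTranscribed())
                std::printf("%s %s\n", msg.is_partial ? "[partial]" : "[final]", msg.text.c_str());
            std::this_thread::sleep_for(std::chrono::milliseconds(500));
        }
        return 0; // ~RealtimeSttWhisper() joins the worker thread and frees the context
    }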