Commit 087d5597 authored by Adrien Béraud, committed by Sébastien Blin

whisper: use whispercpp to avoid glitches

Change-Id: I3def6db3eed39b1e9a5feb2d8f4b664de60bd1d2
parent 440e8127
Showing changed files with 310 additions and 1591 deletions
*.mp3
/WhisperTranscript*
libonnxruntime.so*
/libonnxruntime.dylib
/processed.mp4
*.so
......@@ -58,11 +58,11 @@ set(plugin_SRC main.cpp
PluginPreferenceHandler.cpp
TranscriptAudioSubscriber.cpp
TranscriptVideoSubscriber.cpp
Preprocess.cpp
ModelProcessor.cpp
stt_whisper.cpp
./../lib/accel.cpp
./../lib/frameUtils.cpp
./../lib/frameFilter.cpp
./../lib/resampler.cpp
./../lib/common.cpp
)
......@@ -70,8 +70,7 @@ set(plugin_HDR TranscriptAudioSubscriber.h
TranscriptVideoSubscriber.h
TranscriptMediaHandler.h
PluginPreferenceHandler.h
Preprocess.h
ModelProcessor.h
stt_whisper.h
./../lib/pluglog.h
./../lib/mediaStream.h
./../lib/audioFormat.h
......@@ -98,6 +97,7 @@ target_include_directories(${ProjectName} PUBLIC ${PROJECT_BINARY_DIR}
${ONNX_DIR}/../include/session
${ONNX_DIR}/../include/providers/cuda
${CONTRIB_PATH}/build/yaml-cpp/include
${CONTRIB_PATH}/build/whispercpp
)
target_link_directories(${ProjectName} PUBLIC ${CONTRIB_PATH}
${CONTRIB_PATH}/build/fmt/msvc/Release
......@@ -110,7 +110,7 @@ target_link_directories(${ProjectName} PUBLIC ${CONTRIB_PATH}
target_link_libraries(${ProjectName} PUBLIC libyaml-cppmd libavfilter libswscale libswresample
libavformat libavcodec libavutil libvpx libx264 libopus
libmfx fmt libzlib freetype ws2_32 Bcrypt Secur32 onnxruntime msvcrt)
libmfx fmt libzlib freetype whisper ws2_32 Bcrypt Secur32 msvcrt)
add_custom_command(
TARGET ${ProjectName}
......@@ -120,6 +120,8 @@ add_custom_command(
COMMAND ${CMAKE_COMMAND} -E copy_directory ${PROJECT_SOURCE_DIR}/../contrib/yaml-cpp ${CONTRIB_PATH}/src/yaml-cpp
COMMAND ${CMAKE_COMMAND} -E copy_directory ${PROJECT_SOURCE_DIR}/../contrib/freetype ${CONTRIB_PATH}/src/freetype
COMMAND ${CMAKE_COMMAND} -E copy_directory ${PROJECT_SOURCE_DIR}/ffmpeg/ ${CONTRIB_PATH}/src/ffmpeg
COMMAND ${CMAKE_COMMAND} -E copy_directory ${PROJECT_SOURCE_DIR}/whispercpp/ ${CONTRIB_PATH}/src/whispercpp
COMMAND python ${DAEMON}/compat/msvc/winmake.py -fb whispercpp
COMMAND python ${DAEMON}/compat/msvc/winmake.py -fb fmt
COMMAND python ${DAEMON}/compat/msvc/winmake.py -fb yaml-cpp
COMMAND python ${DAEMON}/compat/msvc/winmake.py -fb zlib
......@@ -136,34 +138,17 @@ if(TESTPROCESS)
PRE_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/testPreferences.yml ${PROJECT_BINARY_DIR}/
COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/jfk.wav ${PROJECT_BINARY_DIR}/
COMMAND ${CMAKE_COMMAND} -E copy ${ONNX_DIR}/onnxruntime.lib ${PROJECT_BINARY_DIR}/Debug
COMMAND ${CMAKE_COMMAND} -E copy ${ONNX_DIR}/onnxruntime.dll ${PROJECT_BINARY_DIR}/Debug
COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/modelSRC/mModelEncoder.onnx ${PROJECT_SOURCE_DIR}/data/assets
COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/modelSRC/mModelDecoder.onnx ${PROJECT_SOURCE_DIR}/data/assets
COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/modelSRC/mLogSoftMax.onnx ${PROJECT_SOURCE_DIR}/data/assets
COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/sample.mp4 ${PROJECT_BINARY_DIR}/
COMMAND ${CMAKE_COMMAND} -E copy ${CONTRIB_PATH}/build/whispercpp/ggml-base.bin ${PROJECT_SOURCE_DIR}/data/assets/
)
else()
add_custom_command(
TARGET ${ProjectName}
PRE_BUILD
COMMAND python ${PROJECT_SOURCE_DIR}/../SDK/jplManipulation.py --preassemble --plugin=${ProjectName}
COMMAND ${CMAKE_COMMAND} -E copy ${ONNX_DIR}/onnxruntime.lib ${JPL_DIRECTORY}/lib/${CONTRIB_PLATFORM}
COMMAND ${CMAKE_COMMAND} -E copy ${ONNX_DIR}/onnxruntime.dll ${JPL_DIRECTORY}/lib/${CONTRIB_PLATFORM}
COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/modelSRC/mModelEncoder.onnx ${JPL_DIRECTORY}/data/assets
COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/modelSRC/mModelDecoder.onnx ${JPL_DIRECTORY}/data/assets
COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/modelSRC/mLogSoftMax.onnx ${JPL_DIRECTORY}/data/assets
COMMAND ${CMAKE_COMMAND} -E copy ${CONTRIB_PATH}/build/whispercpp/ggml-base.bin ${JPL_DIRECTORY}/data/assets/
COMMENT "Assembling Plugin files"
)
if(NVIDIA)
add_custom_command(
TARGET ${ProjectName}
PRE_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${ONNX_DIR}/onnxruntime_providers_shared.lib ${JPL_DIRECTORY}/lib/${CONTRIB_PLATFORM}
COMMAND ${CMAKE_COMMAND} -E copy ${ONNX_DIR}/onnxruntime_providers_shared.dll ${JPL_DIRECTORY}/lib/${CONTRIB_PLATFORM}
COMMAND ${CMAKE_COMMAND} -E copy ${ONNX_DIR}/onnxruntime_providers_cuda.lib ${JPL_DIRECTORY}/lib/${CONTRIB_PLATFORM}
COMMAND ${CMAKE_COMMAND} -E copy ${ONNX_DIR}/onnxruntime_providers_cuda.dll ${JPL_DIRECTORY}/lib/${CONTRIB_PLATFORM}
)
endif()
add_custom_command(
TARGET ${ProjectName}
POST_BUILD
......
/**
* Copyright (C) 2022 Savoir-faire Linux Inc.
*
* Author: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
* USA.
*/
#include "ModelProcessor.h"
#include <pluglog.h>
#include <common.h>
#include <limits.h>
const char sep = separator();
const std::string TAG = "Transcript";
namespace jami {
ModelProcessor::ModelProcessor(const std::string& path, bool acc)
{
loadTokens(path + "/assets/tokenizer.bin", vocab_);
#ifdef __ANDROID__
initModels(path + "/assets/mModelEncoder.ort", path + "/assets/mModelDecoder.ort", path + "/assets/mLogSoftMax.ort", acc);
#else
initModels(path + "/assets/mModelEncoder.onnx", path + "/assets/mModelDecoder.onnx", path + "/assets/mLogSoftMax.onnx", acc);
#endif
}
ModelProcessor::~ModelProcessor()
{
endModels();
Plog::log(Plog::LogPriority::INFO, TAG, "~ModelProcessor");
}
void
ModelProcessor::endModels()
{
if (encoderSession_) {
delete encoderSession_;
encoderSession_ = nullptr;
}
if (decoderSession_) {
delete decoderSession_;
decoderSession_ = nullptr;
}
if (logSoftMaxSession_) {
delete logSoftMaxSession_;
logSoftMaxSession_ = nullptr;
}
#ifdef NVIDIA
if (cudaOptions_) {
ortApi.ReleaseCUDAProviderOptions(cudaOptions_);
cudaOptions_ = nullptr;
}
#endif
if (env_) {
env_.release();
env_ = NULL;
}
}
void
ModelProcessor::initModels(const std::string& encoderModelPath, const std::string& decoderModelPath, const std::string& logSoftMaxModelPath, bool activateAcc)
{
try {
sessOpt_ = Ort::SessionOptions();
try {
if (activateAcc) {
#ifdef NVIDIA
Ort::ThrowOnError(ortApi.CreateCUDAProviderOptions(&cudaOptions_));
// std::vector<const char*> keys{"device_id"};
// std::vector<const char*> values{"0"};
// Ort::ThrowOnError(ortApi.UpdateCUDAProviderOptions(cudaOptions_, keys.data(), values.data(), keys.size()));
Ort::ThrowOnError(ortApi.SessionOptionsAppendExecutionProvider_CUDA_V2(sessOpt_, cudaOptions_));
#endif
#ifdef __ANDROID__
Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_Nnapi(sessOpt_, 0));
#endif
}
} catch (std::exception& accelException) {
Plog::log(Plog::LogPriority::ERR, TAG, accelException.what());
Plog::log(Plog::LogPriority::ERR, TAG, "Acceleration not available, loading models for CPU.");
}
sessOpt_.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
#ifdef WIN32
encoderSession_ = new Ort::Session(env_, string_utils::to_wstring(encoderModelPath).c_str(), sessOpt_);
decoderSession_ = new Ort::Session(env_, string_utils::to_wstring(decoderModelPath).c_str(), sessOpt_);
logSoftMaxSession_ = new Ort::Session(env_, string_utils::to_wstring(logSoftMaxModelPath).c_str(), sessOpt_);
#else
encoderSession_ = new Ort::Session(env_, encoderModelPath.c_str(), sessOpt_);
decoderSession_ = new Ort::Session(env_, decoderModelPath.c_str(), sessOpt_);
logSoftMaxSession_ = new Ort::Session(env_, logSoftMaxModelPath.c_str(), sessOpt_);
#endif
isAllocated_ = true;
Plog::log(Plog::LogPriority::INFO, TAG, "Model is allocated");
} catch (std::exception& e) {
Plog::log(Plog::LogPriority::ERR, TAG, e.what());
}
}
/* from whisper.cpp */
// the most basic sampling scheme - select the top token
whisperTokenData
ModelProcessor::whisper_sample_best(const float * probs)
{
whisperTokenData result = {
0, 0, 0.0f, 0.0f, 0.0f, -1, -1, 0.0f,
};
int n_logits = vocab_.id_to_token.size();
std::vector<std::pair<double, int64_t>> probs_id;
probs_id.reserve(n_logits);
for (int i = 0; i < n_logits; i++) {
probs_id.emplace_back(std::make_pair(probs[i], i));
}
{
double sum_ts = 0.0;
double max_ts = -1.0;
double max_tx = -1.0;
for (int i = 0; i < vocab_.token_beg; i++) {
max_tx = std::max(max_tx, probs_id[i].first);
}
for (int i = vocab_.token_beg; i < n_logits; i++) {
sum_ts += probs_id[i].first;
if (probs_id[i].first > max_ts) {
max_ts = probs_id[i].first;
result.tid = probs_id[i].second;
}
}
// if the probability sum of all timestamp tokens is higher than the max probability of the text tokens - sample a
// timestamp token
if (sum_ts > max_tx) {
// ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L430-L438
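// e.g. if the timestamp tokens sum to 0.6 while the best text token only reaches 0.35,
// every text token is masked to -INT_MAX so the top-k selection below can only pick a
// timestamp token.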
for (int i = 0; i < vocab_.token_beg; i++) {
probs_id[i].first = -INT_MAX;
}
}
result.pt = max_ts/(sum_ts + 1e-10);
result.ptsum = sum_ts;
}
// find the top K tokens
const int top_k = 4;
std::partial_sort(
probs_id.begin(),
probs_id.begin() + top_k, probs_id.end(),
[](const std::pair<double, int64_t> & a, const std::pair<double, int64_t> & b) {
return a.first > b.first;
});
probs_id.resize(top_k);
int res = 0;
while ((probs_id[res].second == vocab_.token_sot ||
probs_id[res].second == vocab_.token_solm ||
probs_id[res].second == vocab_.token_beg) &&
res < (int) probs_id.size() - 1) {
res++;
}
result.id = probs_id[res].second;
result.p = probs_id[res].first;
return result;
}
void
ModelProcessor::filterLogits(std::vector<float>& logits, int offset)
{
// Remove all no speech tokens
for (const auto idx : vocab_.noSpeechTokens) {
logits[idx] = (float)-INT_MAX;
}
}
void
ModelProcessor::filterLanguageLogits(std::vector<float>& logits)
{
// Leave only the language tokens
for (size_t i = 0; i < logits.size(); i++) {
if (vocab_.languageId2Tokens[i].empty())
logits[i] = (float)(-INT_MAX);
}
}
whisperTokenData
ModelProcessor::getToken(std::vector<float>& logits)
{
Ort::RunOptions runOption;
std::vector<Ort::Value> logSoftMaxInputs;
logSoftMaxInputs.emplace_back(Ort::Value::CreateTensor<float>(allocatorInfo_,
logits.data(),
logits.size(),
logitsShape_.data(),
logitsShape_.size()));
auto softmaxOutputs = logSoftMaxSession_->Run(runOption,
logSoftMaxInputNames.data(),
logSoftMaxInputs.data(),
logSoftMaxInputNames.size(),
logSoftMaxOutputNames.data(),
logSoftMaxOutputNames.size());
float* probs = softmaxOutputs[1].GetTensorMutableData<float>();
return whisper_sample_best(probs);
}
std::string
ModelProcessor::feedInput(std::vector<float>& melInput, const std::string& preferenceLanguage)
{
std::lock_guard<std::mutex> l(mtx_);
if (!isAllocated_ || !logSoftMaxSession_ || !encoderSession_ || !decoderSession_)
return "";
Ort::RunOptions runOption;
try {
Ort::Value melInputTensor = Ort::Value::CreateTensor<float>(allocatorInfo_,
melInput.data(),
melInput.size(),
melInputShape_.data(),
melInputShape_.size());
audioFeaturesTensor_ = Ort::Value::CreateTensor<float>(allocatorInfo_,
audioFeatures_.data(),
audioFeatures_.size(),
audioFeaturesShape_.data(),
audioFeaturesShape_.size());
// Run the encoder graph
encoderSession_->Run(runOption,
encoderInputNames,
&melInputTensor,
1,
encoderOutputNames,
&audioFeaturesTensor_,
1);
} catch(Ort::Exception e) {
Plog::log(Plog::LogPriority::ERR, TAG, e.what());
return "";
} catch (...) { return ""; }
std::vector<float> currentTokensP {};
try {
auto isMultilingual = vocab_.is_multilingual();
std::vector<int64_t> currentTokens {};
currentTokens.emplace_back(vocab_.token_sot);
currentTokensP.emplace_back(1);
std::array<int64_t, 1> offsetShape {1};
if (isMultilingual) {
if (preferenceLanguage == "auto"
|| vocab_.languageTokens2Id.find(preferenceLanguage) == vocab_.languageTokens2Id.end()) {
std::vector<float> currentKVCache(MODELKVCACHESHAPE * 1 * currentTokens.size() * MODELFEATURESHAPE, 0.0f);
std::array<int64_t, 2> tokenShape {1, 1};
int64_t offset = 0;
std::array<int64_t, 4> kvCacheShape { MODELKVCACHESHAPE, 1, 1, MODELFEATURESHAPE };
std::vector<int64_t> token = { currentTokens.back() };
// Run the decoder graph
std::vector<Ort::Value> inputsVector; // {audioFeaturesTensor_, tokensTensor_, kvCacheTensor_, offsetTensor_};
inputsVector.emplace_back(Ort::Value::CreateTensor<float>(allocatorInfo_,
audioFeatures_.data(),
audioFeatures_.size(),
audioFeaturesShape_.data(),
audioFeaturesShape_.size()));
inputsVector.emplace_back(Ort::Value::CreateTensor<int64_t>(allocatorInfo_,
token.data(),
token.size(),
tokenShape.data(),
tokenShape.size()));
inputsVector.emplace_back(Ort::Value::CreateTensor<float>(allocatorInfo_,
currentKVCache.data(),
currentKVCache.size(),
kvCacheShape.data(),
kvCacheShape.size()));
inputsVector.emplace_back(Ort::Value::CreateTensor<int64_t>(allocatorInfo_,
&offset,
1,
offsetShape.data(),
0));
auto outputs = decoderSession_->Run(runOption,
decoderInputNames.data(),
inputsVector.data(),
decoderInputNames.size(),
decoderOutputNames.data(),
decoderOutputNames.size());
auto logitsTensorInfo = outputs[0].GetTensorTypeAndShapeInfo();
auto logitsData = outputs[0].GetTensorMutableData<float>();
{
std::vector<float>logits(logitsData, logitsData + logitsTensorInfo.GetElementCount());
filterLanguageLogits(logits);
auto it = std::max_element(logits.begin(), logits.end());
currentTokens.emplace_back(std::distance(logits.begin(), it));
}
} else
currentTokens.emplace_back(vocab_.languageTokens2Id[preferenceLanguage]);
currentTokens.emplace_back(vocab_.token_transcribe);
currentTokensP.emplace_back(1);
currentTokensP.emplace_back(1);
}
std::vector<float> currentKVCache(MODELKVCACHESHAPE * 1 * currentTokens.size() * MODELFEATURESHAPE, 0.0f);
std::array<int64_t, 2> tokenShape {1, (long)currentTokens.size()};
for (auto i = 0; i < sampleLen; i++) {
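// For multilingual models the first decoder step consumes the whole 3-token prompt
// (SOT, language, task), so later offsets jump ahead by two extra positions;
// English-only models feed a single SOT token and the offset simply tracks the step index.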
int64_t offset = isMultilingual ? ( i == 0 ? 0 : i + 2 ) : i;
std::array<int64_t, 4> kvCacheShape { MODELKVCACHESHAPE, 1, static_cast<int64_t>(currentTokens.size()), MODELFEATURESHAPE };
std::vector<int64_t> token = { currentTokens.back() };
if (i == 0) {
token = currentTokens;
tokenShape[1] = currentTokens.size();
} else {
tokenShape[1] = 1;
}
// Run the decoder graph
std::vector<Ort::Value> inputsVector; // {audioFeaturesTensor_, tokensTensor_, kvCacheTensor_, offsetTensor_};
inputsVector.emplace_back(Ort::Value::CreateTensor<float>(allocatorInfo_,
audioFeatures_.data(),
audioFeatures_.size(),
audioFeaturesShape_.data(),
audioFeaturesShape_.size()));
inputsVector.emplace_back(Ort::Value::CreateTensor<int64_t>(allocatorInfo_,
token.data(),
token.size(),
tokenShape.data(),
tokenShape.size()));
inputsVector.emplace_back(Ort::Value::CreateTensor<float>(allocatorInfo_,
currentKVCache.data(),
currentKVCache.size(),
kvCacheShape.data(),
kvCacheShape.size()));
inputsVector.emplace_back(Ort::Value::CreateTensor<int64_t>(allocatorInfo_,
&offset,
1,
offsetShape.data(),
0));
auto outputs = decoderSession_->Run(runOption,
decoderInputNames.data(),
inputsVector.data(),
decoderInputNames.size(),
decoderOutputNames.data(),
decoderOutputNames.size());
auto logitsTensorInfo = outputs[0].GetTensorTypeAndShapeInfo();
auto logitsData = outputs[0].GetTensorMutableData<float>();
{
std::vector<float>logits(logitsData, logitsData + logitsTensorInfo.GetElementCount());
if (isMultilingual && logits.size() > vocab_.n_vocab) {
std::vector<float>lastLogits;
lastLogits = std::vector<float>(logits.begin() + 2 * vocab_.n_vocab, logits.end());
std::swap(lastLogits, logits);
}
filterLogits(logits, offset);
auto tokenData = getToken(logits);
currentTokens.emplace_back(tokenData.id);
currentTokensP.emplace_back(tokenData.p);
}
// Grab kvCache for next iteration
auto kvCacheTensorInfo = outputs[1].GetTensorTypeAndShapeInfo();
auto nextKVCacheData = outputs[1].GetTensorMutableData<float>();
std::vector<float> nextKVCache;
std::vector<float> zeros(MODELFEATURESHAPE, 0.0f);
int delta = (currentTokens.size() - 1) * MODELFEATURESHAPE;
for (int currentKVIdx = 0; currentKVIdx < MODELKVCACHESHAPE; currentKVIdx++) {
nextKVCache.insert(nextKVCache.end(),
nextKVCacheData + (currentKVIdx * delta),
nextKVCacheData + ((currentKVIdx + 1) * delta));
nextKVCache.insert(nextKVCache.end(), zeros.begin(), zeros.end());
}
std::swap(currentKVCache, nextKVCache);
if (currentTokens.back() == vocab_.token_eot)
break;
}
std::swap(currentTokens, tokensOutput_);
} catch(Ort::Exception e) {
Plog::log(Plog::LogPriority::ERR, TAG, e.what());
return "";
} catch (...) {}
std::ostringstream oss;
std::ostringstream tokensStr;
auto idx = -1;
for (const auto& token : tokensOutput_) {
idx ++;
tokensStr << token << " " << currentTokensP[idx] << " ";
if (token >= vocab_.token_eot)
continue;
if (currentTokensP[idx] > -1.8)
oss << vocab_.id_to_token[token];
}
tokensOutput_.clear();
return oss.str();
}
} // namespace jami
/**
* Copyright (C) 2022 Savoir-faire Linux Inc.
*
* Author: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
* USA.
*/
#pragma once
#include <map>
#include <vector>
#include <algorithm>
#include <set>
#include <mutex>
#include <onnxruntime_cxx_api.h>
// #ifdef NVIDIA
// #include <cuda_provider_options.h>
// #endif
#ifdef __ANDROID__
#include <nnapi_provider_factory.h>
#endif
#include <functional>
#include "Preprocess.h"
namespace jami {
// Use the getonnxio.py script to grab the model input and output
// names and shapes.
// Note: "None" is an open shape. For an input, it is defined by the data
// we feed in; for an open output, it is recommended not to pre-allocate
// the tensor and to use the value returned by model.run instead.
static const char* encoderInputNames[4] = {"mel"};
static const char* encoderOutputNames[4] = {"617"};
#define MODELFEATURESHAPE 384
#define MODELKVCACHESHAPE 8
#define MODELLOGITSHAPE 51865 // 51864 for english models
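// These defaults appear to match the multilingual Whisper "tiny" model:
// d_model = 384, 2 * n_layers = 2 * 4 = 8 cached key/value tensors, and a
// 51865-token multilingual vocabulary (51864 for English-only models).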
static const std::vector<const char*> decoderInputNames = {"audio_features", "tokens", "kv_cache", "offset"};
static const std::vector<const char*> decoderOutputNames = {"logits", "output_kv_cache"};
static const std::vector<const char *> logSoftMaxInputNames = {"logits"};
static const std::vector<const char *> logSoftMaxOutputNames = {"token_ids", "probs"};
typedef struct whisperTokenData {
int64_t id; // token id
int64_t tid; // forced timestamp token id
float p; // probability of the token
float pt; // probability of the timestamp token
float ptsum; // sum of probabilities of all timestamp tokens
// token-level timestamp data
// do not use if you haven't computed token-level timestamps
int64_t t0; // start time of the token
int64_t t1; // end time of the token
float vlen; // voice length of the token
} whisperTokenData;
class ModelProcessor
{
public:
ModelProcessor(const std::string& path, bool acc);
~ModelProcessor();
void initModels(const std::string& encoderModelPath, const std::string& decoderModelPath, const std::string& logSoftMaxModelPath, bool activateAcc);
void endModels();
whisperTokenData whisper_sample_best(const float * probs);
/**
* @brief feedInput
* Takes an input and feeds it to the models to produce a transcription
* @param input
* @param preferenceLanguage
*/
std::string feedInput(std::vector<float>& input, const std::string& preferenceLanguage = "auto");
bool isAllocated() { return isAllocated_; }
private:
// Tokens
whisperVocab vocab_;
whisperTokenData getToken(std::vector<float>& logits);
void filterLogits(std::vector<float>& logits, int offset);
void filterLanguageLogits(std::vector<float>& logits);
// onnx related
Ort::MemoryInfo allocatorInfo_ = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
bool isAllocated_ {false};
Ort::Env env_ {ORT_LOGGING_LEVEL_WARNING, "whisperTest"};
Ort::Session* encoderSession_ {nullptr};
Ort::Session* decoderSession_ {nullptr};
Ort::Session* logSoftMaxSession_ {nullptr};
Ort::SessionOptions sessOpt_;
#ifdef NVIDIA
const OrtApi& ortApi = Ort::GetApi();
OrtCUDAProviderOptionsV2* cudaOptions_ = nullptr;
#endif
// Encoder tensors. 1 input and 1 output
std::vector<int64_t> melInputShape_ {1, 80, 3000}; // Input Data Type: 1 (float), Input Shape: [1, 80, 3000]
Ort::Value audioFeaturesTensor_ {nullptr};
std::vector<int64_t> audioFeaturesShape_ {1, 1500, MODELFEATURESHAPE}; // Output Data Type: 1 (float), Output Shape: [1, 1500, MODELFEATURESHAPE]
std::array<float, 1500 * MODELFEATURESHAPE> audioFeatures_ {};
std::vector<float> output_;
// Decoder tensors. 4 inputs and 2 outputs
std::vector<int64_t> tokensOutput_ { };
// LogProb check
std::array<int64_t, 3> logitsShape_ {1, 1, MODELLOGITSHAPE};
int sampleLen = 100;
std::mutex mtx_;
};
} // namespace jami
......@@ -89,9 +89,7 @@ PluginPreferenceHandler::preferenceMapHasKey(const std::string& key)
return (key == "background"
|| key == "position"
|| key == "fontsize"
|| key == "language"
|| key == "chunksize"
|| key == "stepsize");
|| key == "language");
}
std::string
......@@ -110,8 +108,7 @@ std::map<std::string, std::string>
PluginPreferenceHandler::getPreferences(const std::string& accountId)
{
std::lock_guard<std::mutex> lk(mtx_);
auto preferences = preferences_.emplace(accountId, preferences_["default"]).first->second;
return preferences;
return preferences_.emplace(accountId, preferences_["default"]).first->second;
}
PluginPreferenceHandler::~PluginPreferenceHandler()
......
......@@ -35,9 +35,7 @@ enum Parameter {
POSITION,
BACKGROUND,
FONTSIZE,
LANGUAGE,
CHUNK,
STEP
LANGUAGE
};
class TranscriptMediaHandler;
......
/**
* Copyright (C) 2022 Savoir-faire Linux Inc.
*
* Author: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include "Preprocess.h"
#ifdef WIN32
#define _USE_MATH_DEFINES
#endif
#include <thread>
#include <math.h>
#include <fstream>
#include <iostream>
// ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L92-L124
bool logMelSpectrogram(
const float *samples,
const int n_samples,
const int n_threads,
const whisperFilters &filters,
whisperMel &mel) {
// const int sample_rate = WHISPER_SAMPLE_RATE;
const int fft_size = WHISPER_N_FFT;
const int fft_step = WHISPER_HOP_LENGTH;
const int n_mel = WHISPER_N_MEL;
// Hanning window
std::vector<float> hann;
hann.resize(fft_size);
for (int i = 0; i < fft_size; i++) {
hann[i] = 0.5*(1.0 - cos((2.0*M_PI*i)/(fft_size)));
}
mel.n_mel = n_mel;
mel.n_len = (n_samples)/fft_step;
mel.data.resize(mel.n_mel*mel.n_len);
const int n_fft = 1 + fft_size/2;
std::vector<std::thread> workers(n_threads);
for (int iw = 0; iw < n_threads; ++iw) {
workers[iw] = std::thread([&](int ith) {
std::vector<float> fft_in;
fft_in.resize(fft_size);
for (int i = 0; i < fft_size; i++) {
fft_in[i] = 0.0;
}
std::vector<float> fft_out;
fft_out.resize(2*fft_size);
for (int i = ith; i < mel.n_len; i += n_threads) {
const int offset = i*fft_step;
// apply Hanning window
for (int j = 0; j < fft_size; j++) {
if (offset + j < n_samples) {
fft_in[j] = hann[j]*samples[offset + j];
} else {
fft_in[j] = 0.0;
}
}
// FFT -> mag^2
fft(fft_in, fft_out);
for (int j = 0; j < fft_size; j++) {
fft_out[j] = (fft_out[2*j + 0]*fft_out[2*j + 0] + fft_out[2*j + 1]*fft_out[2*j + 1]);
}
for (int j = 1; j < fft_size/2; j++) {
fft_out[j] += fft_out[fft_size - j];
}
// mel spectrogram
for (int j = 0; j < mel.n_mel; j++) {
double sum = 0.0;
for (int k = 0; k < n_fft; k++) {
sum += fft_out[k]*filters.data[j*n_fft + k];
}
if (sum < 1e-10) {
sum = 1e-10;
}
sum = log10(sum);
mel.data[j*mel.n_len + i] = sum;
}
}
}, iw);
}
for (int iw = 0; iw < n_threads; ++iw) {
workers[iw].join();
}
// clamping and normalization
double mmax = -1e20;
for (int i = 0; i < mel.n_mel*mel.n_len; i++) {
if (mel.data[i] > mmax) {
mmax = mel.data[i];
}
}
mmax -= 8.0;
for (int i = 0; i < mel.n_mel*mel.n_len; i++) {
if (mel.data[i] < mmax) {
mel.data[i] = mmax;
}
mel.data[i] = (mel.data[i] + 4.0)/4.0;
}
return true;
}
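// logMelSpectrogram usage (illustrative sketch; the path follows the plugin's assets layout):
//   whisperFilters filters;
//   loadMelFilters(dataPath + "/assets/mel_filters.bin", filters);
//   whisperMel mel;
//   logMelSpectrogram(samples.data(), samples.size(), 8, filters, mel);
//   inputPadTrim(mel); // pad/trim to ENCODER_INPUT_LEN frames before the encoder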
// Cooley-Tukey FFT
// poor man's implementation - use something better
// input is real-valued
// output is complex-valued
void fft(const std::vector<float> & in, std::vector<float> & out) {
out.resize(in.size()*2);
int N = in.size();
if (N == 1) {
out[0] = in[0];
out[1] = 0;
return;
}
if (N%2 == 1) {
dft(in, out);
return;
}
std::vector<float> even;
std::vector<float> odd;
for (int i = 0; i < N; i++) {
if (i % 2 == 0) {
even.emplace_back(in[i]);
} else {
odd.emplace_back(in[i]);
}
}
std::vector<float> even_fft;
std::vector<float> odd_fft;
fft(even, even_fft);
fft(odd, odd_fft);
for (int k = 0; k < N/2; k++) {
float theta = 2*M_PI*k/N;
float re = cos(theta);
float im = -sin(theta);
float re_odd = odd_fft[2*k + 0];
float im_odd = odd_fft[2*k + 1];
out[2*k + 0] = even_fft[2*k + 0] + re*re_odd - im*im_odd;
out[2*k + 1] = even_fft[2*k + 1] + re*im_odd + im*re_odd;
out[2*(k + N/2) + 0] = even_fft[2*k + 0] - re*re_odd + im*im_odd;
out[2*(k + N/2) + 1] = even_fft[2*k + 1] - re*im_odd - im*re_odd;
}
}
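// fft() (and dft() below) interleave real and imaginary parts: out[2k] is Re(X_k) and
// out[2k + 1] is Im(X_k), so out.size() == 2 * in.size(). For example, the transform of
// the impulse {1, 0, 0, 0} is four bins of (1 + 0i).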
// naive Discrete Fourier Transform
// input is real-valued
// output is complex-valued
void dft(const std::vector<float> & in, std::vector<float> & out) {
int N = in.size();
out.resize(N*2);
for (int k = 0; k < N; k++) {
float re = 0;
float im = 0;
for (int n = 0; n < N; n++) {
float angle = 2*M_PI*k*n/N;
re += in[n]*cos(angle);
im -= in[n]*sin(angle);
}
out[k*2 + 0] = re;
out[k*2 + 1] = im;
}
}
void loadMelFilters(const std::string& fileName, whisperFilters& filters) {
auto fin = std::ifstream(fileName, std::ios::binary);
if (!fin) {
fprintf(stderr, "%s: failed to open '%s'\n", __func__, fileName.c_str());
return;
}
fin.read((char *) &filters.n_mel, sizeof(filters.n_mel));
fin.read((char *) &filters.n_fft, sizeof(filters.n_fft));
filters.data.resize(filters.n_mel * filters.n_fft);
fin.read((char *) filters.data.data(), filters.data.size() * sizeof(float));
}
void loadTokens(const std::string& fileName, whisperVocab& vocab) {
auto fin = std::ifstream(fileName, std::ios::binary);
if (!fin) {
fprintf(stderr, "%s: failed to open '%s'\n", __func__, fileName.c_str());
return;
}
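// Binary layout read below: int32 model vocab size, int32 token count, then
// <token count> entries of (uint32 length, bytes). If the model vocab is larger,
// a language-token table follows (int32 count, then (int32 id, uint32 length, bytes)
// entries) and synthetic names are generated for the remaining special tokens.
// The file ends with the list of "no speech" token ids (int32 count, then uint32 ids).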
int32_t modelNVocab = 0;
fin.read((char *) &modelNVocab, sizeof(modelNVocab));
int32_t tokensNVocab = 0;
fin.read((char *) &tokensNVocab, sizeof(tokensNVocab));
std::string word;
for (int i = 0; i < tokensNVocab; i++) {
uint32_t len;
fin.read((char *) &len, sizeof(len));
word.resize(len);
fin.read((char *) word.data(), len);
vocab.token_to_id[word] = i;
vocab.id_to_token[i] = word;
}
vocab.n_vocab = modelNVocab;
if (vocab.is_multilingual()) {
vocab.token_eot++;
vocab.token_sot++;
vocab.token_prev++;
vocab.token_solm++;
vocab.token_not++;
vocab.token_beg++;
}
if (tokensNVocab < modelNVocab) {
// Read language tokens
{
int32_t languageTokensLen = 0;
fin.read((char *) &languageTokensLen, sizeof(languageTokensLen));
std::string word;
for (int i = 0; i < languageTokensLen; i++) {
int32_t id = 0;
fin.read((char *) &id, sizeof(id));
uint32_t len;
fin.read((char *) &len, sizeof(len));
word.resize(len);
fin.read((char *) word.data(), len);
vocab.token_to_id[word] = id;
vocab.id_to_token[id] = word;
vocab.languageId2Tokens.insert({id, word});
vocab.languageTokens2Id.insert({word, id});
}
}
fprintf(stderr, "%s: adding %d extra tokens\n", __func__, modelNVocab - tokensNVocab);
for (int i = tokensNVocab; i < modelNVocab; i++) {
if (!vocab.id_to_token[i].empty())
continue;
if (i > vocab.token_beg) {
word = "[_TT_" + std::to_string(i - vocab.token_beg) + "]";
} else if (i == vocab.token_eot) {
word = "[_EOT_]";
} else if (i == vocab.token_sot) {
word = "[_SOT_]";
} else if (i == vocab.token_prev) {
word = "[_PREV_]";
} else if (i == vocab.token_not) {
word = "[_NOT_]";
} else if (i == vocab.token_beg) {
word = "[_BEG_]";
} else {
word = "[_extra_token_" + std::to_string(i) + "]";
}
vocab.token_to_id[word] = i;
vocab.id_to_token[i] = word;
}
}
// Read no speech tokens
{
int32_t noSpeechTokensLen = 0;
fin.read((char *) &noSpeechTokensLen, sizeof(noSpeechTokensLen));
for (int i = 0; i < noSpeechTokensLen; i++) {
uint32_t id;
fin.read((char *) &id, sizeof(id));
vocab.noSpeechTokens.insert(id);
}
}
}
void
inputPadTrim(whisperMel &mel)
{
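// The encoder expects exactly ENCODER_INPUT_LEN (3000) frames per mel bin:
// longer spectrograms are truncated and shorter ones are zero-padded.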
if (mel.n_len == ENCODER_INPUT_LEN)
return;
std::vector<float> data;
std::vector<float> partialData;
int seek = 0;
auto dataLimit = std::min(mel.n_len, ENCODER_INPUT_LEN);
for (auto j = 0; j < mel.n_mel; j++) {
seek = j * mel.n_len;
for (auto i = seek; i < (j + 1) * dataLimit; i++) {
partialData.emplace_back(mel.data[i]);
}
if (mel.n_len < ENCODER_INPUT_LEN) {
for (auto i = mel.n_len; i < ENCODER_INPUT_LEN; i++) {
partialData.emplace_back(0.0f);
}
}
data.insert(data.end(), partialData.begin(), partialData.end());
partialData.clear();
}
std::swap(mel.data, data);
}
/**
* Copyright (C) 2022 Savoir-faire Linux Inc.
*
* Author: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#pragma once
#include <vector>
#include <cstdint>
#include <string>
#include <map>
#include <set>
// These values are defined by the model;
// see page 3 of the Whisper paper (https://cdn.openai.com/papers/whisper.pdf)
#define WHISPER_SAMPLE_RATE 16000
#define WHISPER_N_FFT 400
#define WHISPER_N_MEL 80
#define WHISPER_HOP_LENGTH 160
#define WHISPER_CHUNK_SIZE 30
#define ENCODER_INPUT_LEN 3000
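// Sanity check on how these constants relate:
// ENCODER_INPUT_LEN = WHISPER_CHUNK_SIZE * WHISPER_SAMPLE_RATE / WHISPER_HOP_LENGTH
//                   = 30 * 16000 / 160 = 3000 mel frames per 30-second window.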
struct whisperMel {
int n_len;
int n_mel;
std::vector<float> data;
};
struct whisperFilters {
int32_t n_mel;
int32_t n_fft;
std::vector<float> data;
};
struct whisperVocab {
size_t n_vocab = 51864;
std::map<std::string, int32_t> token_to_id;
std::map<int32_t, std::string> id_to_token;
int32_t token_eot = 50256;
int32_t token_sot = 50257;
int32_t token_prev = 50360;
int32_t token_solm = 50361; // no speech
int32_t token_not = 50362; // no timestamps
int32_t token_beg = 50363; // timestamp begin
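// The ids above are the English-only (51864-token) defaults; loadTokens()
// shifts each of them up by one when a multilingual (51865-token) vocab is loaded.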
// available tasks
const int32_t token_translate = 50358;
const int32_t token_transcribe = 50359;
bool is_multilingual() const {
return n_vocab == 51865;
}
std::map<std::string, int32_t> languageTokens2Id;
std::map<int32_t, std::string> languageId2Tokens;
std::set<int32_t> noSpeechTokens;
};
bool logMelSpectrogram(
const float * samples,
const int n_samples,
const int n_threads,
const whisperFilters & filters,
whisperMel &mel);
void fft(const std::vector<float> & in, std::vector<float> & out);
void dft(const std::vector<float> & in, std::vector<float> & out);
void loadMelFilters(const std::string& fileName, whisperFilters& filters);
void loadTokens(const std::string& fileName, whisperVocab& vocab);
void inputPadTrim(whisperMel &mel);
......@@ -24,27 +24,27 @@
#include <frameUtils.h>
#include <bitset>
#include <iostream>
#include <fmt/core.h>
#include <fmt/format.h>
const std::string TAG = "Transcript";
#include "stt_whisper.h"
const std::string TAG = "TranscriptAudio";
const char sep = separator();
namespace jami {
TranscriptAudioSubscriber::TranscriptAudioSubscriber(const std::string& dataPath, TranscriptVideoSubscriber* videoSubscriber, bool acc)
TranscriptAudioSubscriber::TranscriptAudioSubscriber(const std::string& dataPath,
TranscriptVideoSubscriber* videoSubscriber)
: path_ {dataPath}
, modelProcessor_ {dataPath, acc}
, mVS_ {videoSubscriber}
{
loadMelFilters(path_ + "/assets/mel_filters.bin", modelFilters_);
Plog::log(Plog::LogPriority::INFO, TAG, fmt::format("TranscriptAudioSubscriber {}", fmt::ptr(this)));
}
TranscriptAudioSubscriber::~TranscriptAudioSubscriber()
{
modelProcessor_.endModels();
formatFilter_.clean();
stop();
processFrameThread.join();
Plog::log(Plog::LogPriority::INFO, TAG, "~TranscriptMediaProcessor");
Plog::log(Plog::LogPriority::INFO, TAG, fmt::format("~TranscriptAudioSubscriber {}", fmt::ptr(this)));
}
/**
......@@ -53,83 +53,84 @@ TranscriptAudioSubscriber::~TranscriptAudioSubscriber()
void
TranscriptAudioSubscriber::processFrame()
{
while (running) {
auto data = modelInput_[modelIdx_];
if (data.size() <= WHISPER_STREAM_SAMPLES_CHUNK - WHISPER_STREAM_SAMPLES_CHUNK_STEP) {
std::this_thread::sleep_for(std::chrono::milliseconds(waitingPoint_));
continue;
if (!whisper_) {
whisper_ = std::make_unique<RealtimeSttWhisper>(path_ + "/assets/ggml-base.bin");
whisper_->setLanguage(language_);
}
if (!running)
break;
melSpectrogram_.data.clear();
melSpectrogram_.n_len = 0;
melSpectrogram_.n_mel = 0;
logMelSpectrogram(data.data(),
data.size(),
8,
modelFilters_,
melSpectrogram_);
inputPadTrim(melSpectrogram_);
auto text = modelProcessor_.feedInput(melSpectrogram_.data, language_);
if (text.empty()) {
while (running) {
decltype(frames_) frames;
{
std::unique_lock<std::mutex> l(inputLock);
modelInput_[0].clear();
modelInput_[1].clear();
modelIdx_ = 0;
cv_.wait(l, [&]{
return !running || !frames_.empty();
});
if (!running)
return;
frames = std::move(frames_);
}
for (auto& f : frames) {
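// whisper.cpp expects mono float samples at WHISPER_SAMPLE_RATE (16 kHz),
// so every captured frame is resampled to that format before being queued.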
uniqueFramePtr filteredFrame = getUniqueFrame();
filteredFrame->sample_rate = WHISPER_SAMPLE_RATE;
filteredFrame->format = AV_SAMPLE_FMT_FLT;
av_channel_layout_from_mask(&filteredFrame->ch_layout , AV_CH_LAYOUT_MONO);
try {
if (resampler_.resample(f.get(), filteredFrame.get()) == 0) {
whisper_->AddAudioData((float*) filteredFrame->buf[0]->data,
filteredFrame->nb_samples);
}
} catch (...) {
}
}
auto result = whisper_->GetTranscribed();
if (not result.empty()) {
std::string txt;
for (const auto& t : result) {
if (not t.is_partial)
txt += t.text;
}
if (!txt.empty())
mVS_->setText(txt);
}
mVS_->setText(text);
}
whisper_.reset();
}
void
TranscriptAudioSubscriber::stop()
{
Plog::log(Plog::LogPriority::INFO, TAG, "stop()");
{
std::unique_lock<std::mutex> l(inputLock);
running = false;
cv_.notify_all();
}
if (processFrameThread.joinable()) {
processFrameThread.join();
}
std::string str = "";
mVS_->setText(str);
mVS_->setText("");
}
void
TranscriptAudioSubscriber::start()
{
Plog::log(Plog::LogPriority::INFO, TAG, "start()");
running = true;
processFrameThread = std::thread([this] { processFrame(); });
processFrameThread = std::thread([this](){ processFrame(); });
mVS_->setText("");
}
void
TranscriptAudioSubscriber::setParameter(std::string& parameter, Parameter type)
TranscriptAudioSubscriber::setParameter(const std::string& parameter, Parameter type)
{
std::unique_lock<std::mutex> l(inputLock);
std::string str = "";
switch (type) {
case (Parameter::LANGUAGE):
language_ = parameter;
modelInput_[0].clear();
modelInput_[1].clear();
modelIdx_ = 0;
mVS_->setText(str);
break;
case (Parameter::CHUNK):
WHISPER_STREAM_SAMPLES_CHUNK = 16000 * std::stoi(parameter);
modelInput_[0].resize(0);
modelInput_[1].resize(0);
modelInput_[0].reserve(WHISPER_STREAM_SAMPLES_CHUNK);
modelInput_[1].reserve(WHISPER_STREAM_SAMPLES_CHUNK);
waitingPoint_ = (std::stoi(parameter) * 1000 - (WHISPER_STREAM_SAMPLES_CHUNK_STEP / 16)) / 3;
modelIdx_ = 0;
mVS_->setText(str);
break;
case (Parameter::STEP):
modelInput_[0].clear();
modelInput_[1].clear();
WHISPER_STREAM_SAMPLES_CHUNK_STEP = 16000 * std::stoi(parameter);
waitingPoint_ = ((WHISPER_STREAM_SAMPLES_CHUNK / 16) - std::stoi(parameter) * 1000) / 3;
modelIdx_ = 0;
mVS_->setText(str);
if (whisper_)
whisper_->setLanguage(parameter);
break;
default:
return;
......@@ -140,77 +141,30 @@ void
TranscriptAudioSubscriber::update(jami::Observable<AVFrame*>* obs, AVFrame* const& pluginFrame)
{
std::unique_lock<std::mutex> l(inputLock);
if (!pluginFrame || modelFilters_.data.empty() || obs != observable_)
return;
if (firstRun) {
samplesCount_ = 0;
currentModelInput_.clear();
futureModelInput_.clear();
formatFilter_.clean();
AudioFormat afmt = AudioFormat(pluginFrame->sample_rate,
pluginFrame->channels,
static_cast<AVSampleFormat>(pluginFrame->format));
MediaStream ms = MediaStream("input", afmt);
formatFilter_.initialize(filterDescription_, {ms});
firstRun = false;
}
if (!formatFilter_.initialized_)
if (!pluginFrame || obs != observable_)
return;
if (formatFilter_.feedInput(pluginFrame, "input") == 0) {
uniqueFramePtr filteredFrame = {formatFilter_.readOutput(), frameFree};
if (filteredFrame) {
for (size_t i = 0; i < filteredFrame->buf[0]->size; i += 2) {
#ifdef __DEBUG__
std::lock_guard<std::mutex> l(inputLock);
#endif
int16_t rawValue = (filteredFrame->buf[0]->data[i+1] << 8) | filteredFrame->buf[0]->data[i];
// If not a positive value, perform the 2's complement math on the value
if ((rawValue & 0x8000) != 0) {
rawValue = (~(rawValue - 0x0001)) * -1;
}
if (futureModelInput_.size() == WHISPER_STREAM_SAMPLES_CHUNK)
futureModelInput_.erase(futureModelInput_.begin());
futureModelInput_.emplace_back(float(rawValue)/32768.0f);
samplesCount_++;
auto value = float(rawValue) / 32768.0f;
if (modelInput_[modelIdx_].size() >= WHISPER_STREAM_SAMPLES_CHUNK) {
modelInput_[modelIdx_].clear();
modelIdx_ = modelIdx_ ? 0 : 1;
}
modelInput_[modelIdx_].emplace_back(value);
if (modelInput_[modelIdx_].size()
>= WHISPER_STREAM_SAMPLES_CHUNK - WHISPER_STREAM_SAMPLES_CHUNK_STEP) {
modelInput_[modelIdx_ ? 0 : 1].emplace_back(value);
}
}
}
}
frames_.emplace_back(uniqueFramePtr(av_frame_clone(pluginFrame), frameFree));
cv_.notify_all();
// audio returns as is
}
void
TranscriptAudioSubscriber::attached(jami::Observable<AVFrame*>* observable)
{
Plog::log(Plog::LogPriority::INFO, TAG, "::Attached ! ");
std::unique_lock<std::mutex> l(inputLock);
Plog::log(Plog::LogPriority::INFO, TAG, fmt::format("::Attached ! {} for {}", fmt::ptr(this), fmt::ptr(observable)));
observable_ = observable;
start();
}
void
TranscriptAudioSubscriber::detached(jami::Observable<AVFrame*>*)
TranscriptAudioSubscriber::detached(jami::Observable<AVFrame*>* observable)
{
firstRun = true;
observable_ = nullptr;
stop();
modelInput_[0].clear();
modelInput_[1].clear();
modelIdx_ = 0;
Plog::log(Plog::LogPriority::INFO, TAG, "::Detached()");
Plog::log(Plog::LogPriority::INFO, TAG, fmt::format("::Detached ! {} for {}", fmt::ptr(this), fmt::ptr(observable)));
}
void
......@@ -218,7 +172,6 @@ TranscriptAudioSubscriber::detach()
{
if (observable_) {
firstRun = true;
std::ostringstream oss;
Plog::log(Plog::LogPriority::INFO, TAG, "::Calling detach()");
observable_->detach(this);
}
......
......@@ -26,21 +26,25 @@ extern "C" {
#include <observer.h>
#include <frameFilter.h>
#include "Preprocess.h"
#include "ModelProcessor.h"
#include <frameUtils.h>
#include "TranscriptVideoSubscriber.h"
#include "PluginPreferenceHandler.h"
#include "resampler.h"
#include <thread>
#include <condition_variable>
#include <deque>
#include <atomic>
class RealtimeSttWhisper;
namespace jami {
class TranscriptAudioSubscriber : public Observer<AVFrame*>
{
public:
TranscriptAudioSubscriber(const std::string& dataPath, TranscriptVideoSubscriber* videoSubscriber, bool acc = false);
TranscriptAudioSubscriber(const std::string& dataPath,
TranscriptVideoSubscriber* videoSubscriber);
~TranscriptAudioSubscriber();
virtual void update(Observable<AVFrame*>*, AVFrame* const&) override;
......@@ -49,22 +53,11 @@ public:
void detach();
void setParameter(std::string& parameter, Parameter type);
void setParameter(const std::string& parameter, Parameter type);
private:
// Mel spectrogram filters
whisperFilters modelFilters_;
whisperMel melSpectrogram_;
// Observer pattern
Observable<AVFrame*>* observable_ {};
// Filter for audio formatting
const std::string filterDescription_ = "[input]aresample=16000,aformat=sample_fmts=s16:channel_layouts=mono";
FrameFilter formatFilter_;
std::array<std::vector<float>, 2> modelInput_ {};
int modelIdx_ {0};
int waitingPoint_ {1000};
std::string language_ {"auto"};
// Data
......@@ -72,12 +65,15 @@ private:
// Status variables of the processing
bool firstRun {true};
bool running {false};
std::atomic_bool running {false};
std::mutex inputLock;
std::condition_variable cv_;
// Model
ModelProcessor modelProcessor_;
std::unique_ptr<RealtimeSttWhisper> whisper_;
Resampler resampler_;
std::vector<uniqueFramePtr> frames_;
// Threading
std::thread processFrameThread;
......@@ -87,9 +83,5 @@ private:
// Video processor
TranscriptVideoSubscriber* mVS_ {};
size_t WHISPER_STREAM_SAMPLES_CHUNK = 16000 * 15; // 16 KHz * 15 seconds
size_t WHISPER_STREAM_SAMPLES_CHUNK_STEP = 16000 * 3; // 16 KHz * 3 seconds
};
} // namespace jami
......@@ -36,13 +36,11 @@ TranscriptMediaHandler::TranscriptMediaHandler(std::string&& datapath, PluginPre
aph_ = prefHandler;
setId(datapath_);
auto preferences = aph_->getPreferences("default");
auto it = preferences.find("acceleration");
auto useAcceleration = it == preferences.end() ? false : it->second == "1";
videoSubscriber_ = std::make_shared<TranscriptVideoSubscriber>(datapath_);
audioSubscriber_ = std::make_shared<TranscriptAudioSubscriber>(datapath_, videoSubscriber_.get(), useAcceleration);
audioSubscriber_ = std::make_shared<TranscriptAudioSubscriber>(datapath_, videoSubscriber_.get());
setParameters("default");
#ifdef __DEBUG__
it = preferences.find("subtitle");
auto it = preferences.find("subtitle");
if (it != preferences.end())
videoSubscriber_->setText(it->second);
#endif
......@@ -102,8 +100,6 @@ TranscriptMediaHandler::setParameters(const std::string& accountId)
videoSubscriber_->setParameter(preferences["background"], Parameter::BACKGROUND);
videoSubscriber_->setParameter(preferences["position"], Parameter::POSITION);
audioSubscriber_->setParameter(preferences["language"], Parameter::LANGUAGE);
audioSubscriber_->setParameter(preferences["chunksize"], Parameter::CHUNK);
audioSubscriber_->setParameter(preferences["stepsize"], Parameter::STEP);
} catch (std::exception& e) {
Plog::log(Plog::LogPriority::ERR, TAG, e.what());
}
......@@ -129,9 +125,7 @@ TranscriptMediaHandler::detach()
TranscriptMediaHandler::~TranscriptMediaHandler()
{
std::ostringstream oss;
oss << " ~TranscriptMediaHandler from WhisperTranscript Plugin" << std::endl;
Plog::log(Plog::LogPriority::INFO, TAG, oss.str());
Plog::log(Plog::LogPriority::INFO, TAG, "~TranscriptMediaHandler from WhisperTranscript Plugin");
detach();
}
} // namespace jami
......@@ -35,8 +35,11 @@ extern "C" {
#include <fmt/format.h>
#include <bitset>
#include <string_view>
const std::string TAG = "Transcript";
using namespace std::literals;
const std::string TAG = "TranscriptVideo";
const char sep = separator();
namespace jami {
......@@ -54,9 +57,10 @@ TranscriptVideoSubscriber::~TranscriptVideoSubscriber()
}
void
TranscriptVideoSubscriber::setText(std::string& text)
TranscriptVideoSubscriber::setText(const std::string& t)
{
text = string_utils::ffmpegScapeString(text);
Plog::log(Plog::LogPriority::INFO, TAG, "setText " + t);
auto text = string_utils::ffmpegScapeString(t);
std::vector<std::string> textWords = string_utils::getWords(text, " ");
subtitle_ = "";
......@@ -101,9 +105,28 @@ TranscriptVideoSubscriber::setParameter(std::string& parameter, Parameter type)
firstRun = true;
}
std::string_view getTransposeDescr(int rotation)
{
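// ffmpeg's transpose filter rotates 90° clockwise with transpose=1 and 90°
// counter-clockwise with transpose=2; chaining two transpose=1 gives 180°.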
switch (rotation) {
case 90:
case -270:
return "transpose=2,"sv;
case 180:
case -180:
return "transpose=1, transpose=1,"sv;
case 270:
case -90:
return "transpose=1,"sv;
default:
return {};
}
return {};
}
void
TranscriptVideoSubscriber::setFilterDescription()
{
Plog::log(Plog::LogPriority::INFO, TAG, "setFilterDescription() " + subtitle_);
if (pluginFrameSize_.first == 0 || pluginFrameSize_.second == 0)
return;
......@@ -119,35 +142,26 @@ TranscriptVideoSubscriber::setFilterDescription()
point_ = {pluginFrameSize_.first - margin, pluginFrameSize_.second - margin};
}
std::string rotateSides = "";
if (std::abs(angle_) == 90)
rotateSides = ":out_w=ih:out_h=iw";
auto baseInfosDescription
= fmt::format("[input]rotate={}{}"
",drawtext=fontcolor={}:fontsize={}:fontfile=\\'{}\\':expansion=none:text='{}"
= fmt::format("[input]{}"
"drawtext=fontcolor={}:fontsize={}:fontfile=\\'{}\\':expansion=none:text='{}"
"':line_spacing=5:box=1:boxcolor={}:boxborderw=5:x=",
rotation[angle_], rotateSides,
getTransposeDescr(angle_),
fontColor_, fontSize_, fontFile_, subtitle_, fontBackground_);
auto position = "{}-text_w:y={}";
auto position = "{}-text_w:y={}"sv;
if (position_ == "2")
position = "{}:y={}";
position = "{}:y={}"sv;
else if (position_ == "3")
position = "{}:y={}-text_h";
position = "{}:y={}-text_h"sv;
else if (position_ == "4")
position = "{}-text_w:y={}-text_h";
baseInfosDescription = baseInfosDescription + position + ",rotate={}{},format=yuv420p";
filterDescription_ = fmt::format(baseInfosDescription,
std::to_string(point_.first),
std::to_string(point_.second),
rotation[-angle_],
rotateSides);
position = "{}-text_w:y={}-text_h"sv;
filterDescription_ = baseInfosDescription + fmt::format(std::string(position) + ",{}format=yuv420p"s,
point_.first,
point_.second,
getTransposeDescr(-angle_));
#ifdef __DEBUG__
Plog::log(Plog::LogPriority::INFO, TAG, filterDescription_);
#endif
}
void
......@@ -156,9 +170,8 @@ TranscriptVideoSubscriber::update(jami::Observable<AVFrame*>*, AVFrame* const& p
if (!observable_ || !pluginFrame || subtitle_.empty())
return;
AVFrameSideData* side_data = av_frame_get_side_data(pluginFrame, AV_FRAME_DATA_DISPLAYMATRIX);
int newAngle {0};
if (side_data) {
if (AVFrameSideData* side_data = av_frame_get_side_data(pluginFrame, AV_FRAME_DATA_DISPLAYMATRIX)) {
auto matrix_rotation = reinterpret_cast<int32_t*>(side_data->data);
newAngle = static_cast<int>(av_display_rotation_get(matrix_rotation));
}
......@@ -170,12 +183,17 @@ TranscriptVideoSubscriber::update(jami::Observable<AVFrame*>*, AVFrame* const& p
//======================================================================================
// GET RAW FRAME
uniqueFramePtr rgbFrame = {transferToMainMemory(pluginFrame, AV_PIX_FMT_NV12), frameFree};
if (!rgbFrame.get())
return;
if ((AVPixelFormat)rgbFrame->format != AV_PIX_FMT_YUV420P)
rgbFrame.reset(FrameScaler::convertFormat(rgbFrame.get(), AV_PIX_FMT_YUV420P));
if (!rgbFrame.get())
return;
if (sourceTimeBase_.num != pluginFrame->time_base.num || sourceTimeBase_.den != pluginFrame->time_base.den)
firstRun = true;
if (rgbFrame->width != pluginFrameSize_.first || rgbFrame->height != pluginFrameSize_.second)
firstRun = true;
rgbFrame->pts = pluginFrame->pts;
rgbFrame->time_base = pluginFrame->time_base;
......@@ -184,8 +202,6 @@ TranscriptVideoSubscriber::update(jami::Observable<AVFrame*>*, AVFrame* const& p
if (firstRun) {
filter_.clean();
pluginFrameSize_ = {rgbFrame->width, rgbFrame->height};
if (std::abs(angle_) == 90)
pluginFrameSize_ = {rgbFrame->height, rgbFrame->width};
setFilterDescription();
rational<int> fr(sourceTimeBase_.den, sourceTimeBase_.num);
......
......@@ -43,7 +43,7 @@ public:
void detach();
void setText(std::string& text);
void setText(const std::string& text);
void setFilterDescription();
void setParameter(std::string& parameter, Parameter type);
......
......@@ -12,18 +12,21 @@ EXTRAPATH=''
# -d: debug program.
if [ -z "${DAEMON}" ]; then
DAEMON="./../../daemon"
echo "DAEMON not provided, building with ${DAEMON}"
echo "DAEMON not provided, building with ./../../daemon"
fi
DAEMON=${DAEMON:="./../../daemon"}
CONTRIB_PATH=${CONTRIB_PATH:="${DAEMON}/contrib"}
CONTRIB_BUILD_DIR=${CONTRIB_BUILD_DIR:="native"}
PLUGIN_NAME="WhisperTranscript"
JPL_FILE_NAME="${PLUGIN_NAME}.jpl"
SO_FILE_NAME="lib${PLUGIN_NAME}.so"
DAEMON_SRC="${DAEMON}/src"
CONTRIB_PATH="${DAEMON}/contrib"
PLUGINS_LIB="../lib"
LIBS_DIR="./../contrib/Libs"
PLATFORM=$(uname)
CONTRIB_BUILD_PATH="${CONTRIB_PATH}/${CONTRIB_BUILD_DIR}"
if [ "${PLATFORM}" = "Linux" ]; then
PLATFORM="linux-gnu"
......@@ -74,39 +77,57 @@ fi
echo $PROCESSOR
cp -r ffmpeg ${CONTRIB_PATH}/src/
cp -r whispercpp ${CONTRIB_PATH}/src/
cp -r ../contrib/rav1e ${CONTRIB_PATH}/src/
if [ ! -f "./data/assets/ggml-base.bin" ]; then
if [ -x "$(command -v wget)" ]; then
wget --quiet --show-progress -O ./data/assets/ggml-base.bin https://ggml.ggerganov.com/ggml-model-whisper-base.bin
elif [ -x "$(command -v curl)" ]; then
curl --output ./data/assets/ggml-base.bin https://ggml.ggerganov.com/ggml-model-whisper-base.bin
else
printf "Either wget or curl is required to download models.\n"
exit 1
fi
fi
if [ ! -f "./data/assets/ggml-base.bin" ]; then
printf "Model is required to build the plugin. Aborting.\n"
exit 1
fi
if [ "${PLATFORM}" = "linux-gnu" ] || [ "${PLATFORM}" = "redhat-linux" ]
then
if [ -f "${CONTRIB_PATH}/native/.ffmpeg" ]; then
rm "${CONTRIB_PATH}/native/.ffmpeg"
if [ -f "${CONTRIB_BUILD_PATH}/.ffmpeg" ]; then
rm "${CONTRIB_BUILD_PATH}/.ffmpeg"
rm -rf "${CONTRIB_BUILD_PATH}/ffmpeg"
fi
if [ -f "${CONTRIB_BUILD_PATH}/.whispercpp" ]; then
rm "${CONTRIB_BUILD_PATH}/.whispercpp"
rm -rf "${CONTRIB_BUILD_PATH}/whispercpp"
fi
WORKPATH=$(pwd)
cd "${CONTRIB_PATH}/native/"
make .ffmpeg -j$(nproc)
cd "${CONTRIB_BUILD_PATH}/"
make .ffmpeg -j$(nproc) install
make .whispercpp -j$(nproc) install
rm .whispercpp
rm .ffmpeg
cd ${WORKPATH}
CONTRIB_PLATFORM=${CONTRIB_PLATFORM_CURT}-${PLATFORM}
ONNX_PATH=${EXTRALIBS_PATH}
if [ -z "${EXTRALIBS_PATH}" ]
then
ONNX_PATH="${CONTRIB_PATH}/${CONTRIB_PLATFORM}"
fi
if [ ${DEBUG} ]; then
OUTPUT="${PLUGIN_NAME}"
CLANG_OPTS="-g -fsanitize=address"
CLANG_OPTS="-O0 -g -fsanitize=address"
EXTRA_DEBUG_LIBRARIES="-lyaml-cpp -lvdpau -lX11 -lva-drm -lva-x11 -lrav1e"
EXTRA_DEFINES="-D__DEBUG__"
else
python3 ./../SDK/jplManipulation.py --preassemble --plugin=${PLUGIN_NAME}
CLANG_OPTS="-O3 -shared"
CLANG_OPTS="-O3 -g -shared"
OUTPUT="build-local/jpl/lib/${CONTRIB_PLATFORM}/${SO_FILE_NAME}"
fi
# Compile
clang++ -std=c++17 -g -O0 -fPIC ${CLANG_OPTS} \
clang++ -std=c++17 -fPIC ${CLANG_OPTS} \
-Wl,-Bsymbolic,-rpath,"\${ORIGIN}" \
-Wall -Wextra \
-Wno-unused-parameter \
......@@ -115,30 +136,26 @@ then
-I"." \
-I"${DAEMON_SRC}" \
-I"${CONTRIB_PATH}/${CONTRIB_PLATFORM}/include" \
-I"${ONNX_PATH}/include/onnxruntime/session" \
-I"${ONNX_PATH}/include/onnxruntime/providers/cuda" \
-I"${CONTRIB_PATH}/native/onnx/onnxruntime" \
-I"${PLUGINS_LIB}" \
./../lib/common.cpp \
./../lib/accel.cpp \
./../lib/frameFilter.cpp \
./../lib/frameUtils.cpp \
./../lib/resampler.cpp \
main.cpp \
TranscriptMediaHandler.cpp \
TranscriptAudioSubscriber.cpp \
TranscriptVideoSubscriber.cpp \
PluginPreferenceHandler.cpp \
Preprocess.cpp \
ModelProcessor.cpp \
-L"${CONTRIB_PATH}/${CONTRIB_PLATFORM}/lib/" \
-L"${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}" \
-L"${CUDA_HOME}/lib64/" \
stt_whisper.cpp \
-L"${CONTRIB_PATH}/${CONTRIB_PLATFORM}/lib" \
-l:libavfilter.a \
-l:libswscale.a \
-l:libswresample.a \
-l:libavformat.a \
-l:libavcodec.a \
-l:libavutil.a \
-l:libwhisper.a \
-lfreetype \
-lvpx \
-lx264 \
......@@ -147,57 +164,36 @@ then
-lz \
-lva \
-lfmt \
-lonnxruntime \
${EXTRA_DEBUG_LIBRARIES} \
-o "${OUTPUT}"
if [ ${DEBUG} ]; then
cp "./modelSRC/mModelEncoder.onnx" "./data/assets/mModelEncoder.onnx"
cp "./modelSRC/mModelDecoder.onnx" "./data/assets/mModelDecoder.onnx"
cp "./modelSRC/mLogSoftMax.onnx" "./data/assets/mLogSoftMax.onnx"
cp "${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}/libonnxruntime.so" "libonnxruntime.so.1.12.0"
else
cp "./modelSRC/mModelEncoder.onnx" "./build-local/jpl/data/assets/mModelEncoder.onnx"
cp "./modelSRC/mModelDecoder.onnx" "./build-local/jpl/data/assets/mModelDecoder.onnx"
cp "./modelSRC/mLogSoftMax.onnx" "./build-local/jpl/data/assets/mLogSoftMax.onnx"
cp "${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}/libonnxruntime.so" "build-local/jpl/lib/$CONTRIB_PLATFORM/libonnxruntime.so.1.12.0"
fi
if [ "${PROCESSOR}" = "NVIDIA" ]; then
if [ ${DEBUG} ]; then
cp "${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}/libonnxruntime_providers_shared.so" "libonnxruntime_providers_shared.so"
cp "${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}/libonnxruntime_providers_cuda.so" "libonnxruntime_providers_cuda.so"
else
cp "${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}/libonnxruntime_providers_shared.so" "build-local/jpl/lib/$CONTRIB_PLATFORM/libonnxruntime_providers_shared.so"
cp "${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}/libonnxruntime_providers_cuda.so" "build-local/jpl/lib/$CONTRIB_PLATFORM/libonnxruntime_providers_cuda.so"
fi
fi
elif [ "${PLATFORM}" = "darwin" ]
then
if [ -f "${CONTRIB_PATH}/native/.ffmpeg" ]; then
rm "${CONTRIB_PATH}/native/.ffmpeg"
if [ -f "${CONTRIB_BUILD_PATH}/.ffmpeg" ]; then
rm "${CONTRIB_BUILD_PATH}/.ffmpeg"
rm -rf "${CONTRIB_BUILD_PATH}/ffmpeg"
fi
if [ -f "${CONTRIB_BUILD_PATH}/.whispercpp" ]; then
rm "${CONTRIB_BUILD_PATH}/.whispercpp"
rm -rf "${CONTRIB_BUILD_PATH}/whispercpp"
fi
WORKPATH=$(pwd)
cd "${CONTRIB_PATH}/native/"
make .ffmpeg -j$(nproc)
cd "${CONTRIB_BUILD_PATH}/"
make .whispercpp
make .ffmpeg
rm .whispercpp
rm .ffmpeg
cd ${WORKPATH}
CONTRIB_PLATFORM=${CONTRIB_PLATFORM_CURT}-${PLATFORM}
ONNX_PATH=${EXTRALIBS_PATH}
if [ -z "${EXTRALIBS_PATH}" ]
then
ONNX_PATH="${CONTRIB_PATH}/${CONTRIB_PLATFORM}${CONTRIB_PLATFORM_EXTRA}"
fi
if [ ${DEBUG} ]; then
OUTPUT="${PLUGIN_NAME}"
CLANG_OPTS="-g -fsanitize=address"
CLANG_OPTS="-O0 -g -fsanitize=address"
EXTRA_DEBUG_LIBRARIES="-lyaml-cpp -lrav1e"
EXTRA_DEFINES="-D__DEBUG__"
else
python3 ./../SDK/jplManipulation.py --preassemble --plugin=${PLUGIN_NAME}
CLANG_OPTS="-O3 -shared"
CLANG_OPTS="-O3 -g -shared"
OUTPUT="build-local/jpl/lib/${CONTRIB_PLATFORM}/${SO_FILE_NAME}"
fi
......@@ -215,21 +211,19 @@ then
-I"." \
-I"${DAEMON_SRC}" \
-I"${CONTRIB_PATH}/${CONTRIB_PLATFORM}${CONTRIB_PLATFORM_EXTRA}/include" \
-I"${ONNX_PATH}/include/onnxruntime/session" \
-I"${PLUGINS_LIB}" \
./../lib/common.cpp \
./../lib/accel.cpp \
./../lib/frameFilter.cpp \
./../lib/frameUtils.cpp \
./../lib/resampler.cpp \
main.cpp \
TranscriptMediaHandler.cpp \
TranscriptAudioSubscriber.cpp \
TranscriptVideoSubscriber.cpp \
PluginPreferenceHandler.cpp \
Preprocess.cpp \
ModelProcessor.cpp \
stt_whisper.cpp \
-L"${CONTRIB_PATH}/${CONTRIB_PLATFORM}${CONTRIB_PLATFORM_EXTRA}/lib/" \
-L"${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}" \
-lavfilter \
-lswscale \
-lswresample \
......@@ -237,44 +231,22 @@ then
-lavcodec \
-lavutil \
-lvpx -lx264 -lbz2 -liconv -lz \
-lonnxruntime \
-lspeex \
-lopus \
"${CONTRIB_PATH}/${CONTRIB_PLATFORM}${CONTRIB_PLATFORM_EXTRA}/lib/libspeex.a" \
"${CONTRIB_PATH}/${CONTRIB_PLATFORM}${CONTRIB_PLATFORM_EXTRA}/lib/libopus.a" \
-lfmt \
-lwhisper \
"/usr/local/opt/libpng/lib/libpng.a" \
"/usr/local/opt/freetype/lib/libfreetype.a" \
${EXTRA_DEBUG_LIBRARIES} \
-o "${OUTPUT}"
if [ ${DEBUG} ]; then
cp "${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}/libonnxruntime.dylib" "libonnxruntime.dylib"
cp "./modelSRC/mModelEncoder.onnx" "./data/assets/mModelEncoder.onnx"
cp "./modelSRC/mModelDecoder.onnx" "./data/assets/mModelDecoder.onnx"
cp "./modelSRC/mLogSoftMax.onnx" "./data/assets/mLogSoftMax.onnx"
install_name_tool -id "@loader_path/libonnxruntime.1.12.0.dylib" "libonnxruntime.dylib"
install_name_tool -id "@loader_path/${PLUGIN_NAME}" "${OUTPUT}"
else
cp "./modelSRC/mModelEncoder.onnx" "./build-local/jpl/data/assets/mModelEncoder.onnx"
cp "./modelSRC/mModelDecoder.onnx" "./build-local/jpl/data/assets/mModelDecoder.onnx"
cp "./modelSRC/mLogSoftMax.onnx" "./build-local/jpl/data/assets/mLogSoftMax.onnx"
cp "${ONNX_PATH}/lib/onnxruntime/${ONNX_LIBS}/libonnxruntime.dylib" "build-local/jpl/lib/${CONTRIB_PLATFORM}/libonnxruntime.dylib"
install_name_tool -id "@loader_path/libonnxruntime.1.12.0.dylib" "build-local/jpl/lib/${CONTRIB_PLATFORM}/libonnxruntime.dylib"
install_name_tool -id "@loader_path/${SO_FILE_NAME}" "${OUTPUT}"
fi
install_name_tool -change "@rpath/libonnxruntime.1.12.0.dylib" "@loader_path/libonnxruntime.dylib" "${OUTPUT}"
if [ -n "${APPLE_SIGN_CERTIFICATE}" ]; then
codesign --force --verify --timestamp -o runtime --sign "${APPLE_SIGN_CERTIFICATE}" "build-local/jpl/lib/${CONTRIB_PLATFORM}/libonnxruntime.dylib"
codesign --force --verify --timestamp -o runtime --sign "${APPLE_SIGN_CERTIFICATE}" "build-local/jpl/lib/${CONTRIB_PLATFORM}/${SO_FILE_NAME}"
ditto -c -k --rsrc "build-local/jpl/lib/${CONTRIB_PLATFORM}/libonnxruntime.dylib" "build-local/libonnxruntime.dylib.zip"
LIBRARYNAME=libonnxruntime.dylib sh ./../notarize.sh
ditto -x -k "build-local/libonnxruntime.dylib.zip" "build-local/notarized0"
cp "build-local/notarized0/libonnxruntime.dylib" "build-local/jpl/lib/${CONTRIB_PLATFORM}/libonnxruntime.dylib"
ditto -c -k --rsrc "build-local/jpl/lib/${CONTRIB_PLATFORM}/${SO_FILE_NAME}" "build-local/${SO_FILE_NAME}.zip"
LIBRARYNAME=${SO_FILE_NAME} sh ./../notarize.sh
ditto -x -k "build-local/${SO_FILE_NAME}.zip" "build-local/notarized1"
cp "build-local/notarized1/${SO_FILE_NAME}" "build-local/jpl/lib/${CONTRIB_PLATFORM}/${SO_FILE_NAME}"
ditto -x -k "build-local/${SO_FILE_NAME}.zip" "build-local/notarized"
cp "build-local/notarized/${SO_FILE_NAME}" "build-local/jpl/lib/${CONTRIB_PLATFORM}/${SO_FILE_NAME}"
fi
elif [ "${PLATFORM}" = "android" ]
@@ -346,13 +318,15 @@ then
CONTRIB_PLATFORM=x86_64-linux-android
fi
if [ -f "${CONTRIB_PATH}/native-${CONTRIB_PLATFORM}/.ffmpeg" ]; then
rm "${CONTRIB_PATH}/native-${CONTRIB_PLATFORM}/.ffmpeg"
if [ -f "${CONTRIB_BUILD_PATH}-${CONTRIB_PLATFORM}/.ffmpeg" ]; then
rm "${CONTRIB_BUILD_PATH}-${CONTRIB_PLATFORM}/.ffmpeg"
fi
WORKPATH=$(pwd)
cd "${CONTRIB_PATH}/native-${CONTRIB_PLATFORM}/"
cd "${CONTRIB_BUILD_PATH}-${CONTRIB_PLATFORM}/"
make .ffmpeg -j$(nproc)
make .whispercpp -j$(nproc)
rm .whispercpp
rm .ffmpeg
cd ${WORKPATH}
@@ -360,12 +334,6 @@ then
# Compile the plugin
#=========================================================
ONNX_PATH="${EXTRALIBS_PATH}/${CURRENT_ABI}"
if [ -z ${EXTRALIBS_PATH} ]
then
ONNX_PATH="${CONTRIB_PATH}/${CONTRIB_PLATFORM}"
fi
# Create so destination folder
$CXX --std=c++17 -O3 -fPIC \
-Wl,-Bsymbolic,-rpath,"\${ORIGIN}" \
@@ -376,10 +344,6 @@ then
-I"." \
-I"${DAEMON_SRC}" \
-I"${CONTRIB_PATH}/${CONTRIB_PLATFORM}/include" \
-I"${ONNX_PATH}/include/onnxruntime/session" \
-I"${ONNX_PATH}/include/onnxruntime/providers/nnapi" \
-I"${ONNX_PATH}/../include/onnxruntime/session" \
-I"${ONNX_PATH}/../include/onnxruntime/providers/nnapi" \
-I"${PLUGINS_LIB}" \
./../lib/common.cpp \
./../lib/accel.cpp \
@@ -390,10 +354,9 @@ then
TranscriptAudioSubscriber.cpp \
TranscriptVideoSubscriber.cpp \
PluginPreferenceHandler.cpp \
Preprocess.cpp \
ModelProcessor.cpp \
stt_whisper.cpp \
./../lib/resampler.cpp \
-L"${CONTRIB_PATH}/${CONTRIB_PLATFORM}/lib/" \
-L"${ONNX_PATH}/lib/" \
-lavfilter \
-lswscale \
-lswresample \
@@ -405,13 +368,11 @@ then
-lspeex \
-lopus \
-lfmt \
-lwhisper \
-l:libfreetype.a \
-llog -lz \
-lonnxruntime \
--sysroot=$ANDROID_SYSROOT \
-o "build-local/jpl/lib/$CURRENT_ABI/${SO_FILE_NAME}"
cp "${ONNX_PATH}/lib/libonnxruntime.so" "build-local/jpl/lib/${CURRENT_ABI}/libonnxruntime.so"
}
# Build the so
@@ -419,10 +380,6 @@ then
CURRENT_ABI=$i
buildlib
done
cp "./modelSRC/mModelEncoder.ort" "./build-local/jpl/data/assets/mModelEncoder.ort"
cp "./modelSRC/mModelDecoder.ort" "./build-local/jpl/data/assets/mModelDecoder.ort"
cp "./modelSRC/mLogSoftMax.ort" "./build-local/jpl/data/assets/mLogSoftMax.ort"
fi
if [ ! ${DEBUG} ]; then
......
@@ -3,7 +3,7 @@
"type": "List",
"key": "language",
"title": "{{language_title}}",
"defaultValue": "en",
"defaultValue": "auto",
"scope": "plugin,Transcript",
"entryValues": [
"auto",
@@ -210,6 +210,84 @@
"{{language_yo}}"
]
},
{
"type": "List",
"key": "background",
"title": "{{background_title}}",
"summary": "{{background_summary}}",
"defaultValue": "black",
"scope": "plugin,Transcript",
"entryValues": [
"black",
"white"
],
"entries": [
"{{background_entries_1}}",
"{{background_entries_2}}"
]
},
{
"type": "List",
"key": "position",
"title": "{{position_title}}",
"defaultValue": "2",
"scope": "plugin,Transcript",
"entryValues": [
"1",
"2",
"3",
"4"
],
"entries": [
"{{position_entries_1}}",
"{{position_entries_2}}",
"{{position_entries_3}}",
"{{position_entries_4}}"
]
},
{
"type": "List",
"key": "fontsize",
"title": "{{fontsize_title}}",
"defaultValue": "14",
"scope": "plugin,Transcript",
"entryValues": [
"10",
"12",
"14",
"16",
"18",
"24",
"36",
"72"
],
"entries": [
"10",
"12",
"14",
"16",
"18",
"24",
"36",
"72"
]
},
{
"type": "List",
"key": "avstream",
"title": "{{avstream_title}}",
"summary": "{{avstream_summary}}",
"defaultValue": "in",
"scope": "plugin",
"entryValues": [
"out",
"in"
],
"entries": [
"{{avstream_entries_1}}",
"{{avstream_entries_2}}"
]
},
{
"type": "Switch",
"key": "TranscriptAlways",
......
*.onnx
*.bin
File deleted
File deleted
@@ -5,11 +5,10 @@
"avstream_entries_2": "Received",
"TranscriptAlways_title": "Automatically activate transcription",
"TranscriptAlways_summary": "Activate transcription when a call starts.",
"background_title": "Add background color",
"background_summary": "Add a partial transparency to the subtitle background if it isn't visible enough",
"background_entries_1": "None",
"background_entries_2": "Black",
"background_entries_3": "White",
"background_title": "Background color",
"background_summary": "Defines the subtitle background color",
"background_entries_1": "Black",
"background_entries_2": "White",
"position_title": "Transcription position",
"position_entries_1": "Top right",
"position_entries_2": "Top left",
@@ -116,9 +115,5 @@
"language_vi": "Vietnamese",
"language_cy": "Welsh",
"language_yi": "Yiddish",
"language_yo": "Yoruba",
"acceleration_title": "Use hardware acceleration",
"acceleration_summary": "Use CUDA or NNAPI where applicable",
"chunk_title": "Chunk size in seconds",
"step_title": "Step size in seconds"
"language_yo": "Yoruba"
}
\ No newline at end of file
[
{
"type": "List",
"key": "background",
"title": "{{background_title}}",
"summary": "{{background_summary}}",
"defaultValue": "black@0.0",
"scope": "plugin,Transcript",
"entryValues": [
"black@0.0",
"black@0.5",
"white@0.5"
],
"entries": [
"{{background_entries_1}}",
"{{background_entries_2}}",
"{{background_entries_3}}"
]
},
{
"type": "List",
"key": "position",
"title": "{{position_title}}",
"defaultValue": "2",
"scope": "plugin,Transcript",
"entryValues": [
"1",
"2",
"3",
"4"
],
"entries": [
"{{position_entries_1}}",
"{{position_entries_2}}",
"{{position_entries_3}}",
"{{position_entries_4}}"
]
},
{
"type": "List",
"key": "fontsize",
"title": "{{fontsize_title}}",
"defaultValue": "14",
"scope": "plugin,Transcript",
"entryValues": [
"10",
"12",
"14",
"16",
"18",
"24",
"36",
"72"
],
"entries": [
"10",
"12",
"14",
"16",
"18",
"24",
"36",
"72"
]
},
{
"type": "List",
"key": "avstream",
"title": "{{avstream_title}}",
"summary": "{{avstream_summary}}",
"defaultValue": "in",
"scope": "plugin",
"entryValues": [
"out",
"in"
],
"entries": [
"{{avstream_entries_1}}",
"{{avstream_entries_2}}"
]
},
{
"type": "List",
"key": "chunksize",
"title": "{{chunk_title}}",
"defaultValue": "15",
"scope": "plugin,Transcript",
"entryValues": [
"5",
"6",
"7",
"8",
"9",
"10",
"11",
"12",
"13",
"14",
"15",
"16",
"17",
"18",
"19",
"20",
"21",
"22",
"23",
"24",
"25",
"26",
"27",
"28",
"29",
"30"
],
"entries": [
"5",
"6",
"7",
"8",
"9",
"10",
"11",
"12",
"13",
"14",
"15",
"16",
"17",
"18",
"19",
"20",
"21",
"22",
"23",
"24",
"25",
"26",
"27",
"28",
"29",
"30"
]
},
{
"type": "List",
"key": "stepsize",
"title": "{{step_title}}",
"defaultValue": "3",
"scope": "plugin,Transcript",
"entryValues": [
"1",
"2",
"3",
"4",
"5",
"6",
"7",
"8",
"9",
"10",
"11",
"12",
"13",
"14",
"15",
"16",
"17",
"18",
"19",
"20",
"21",
"22",
"23",
"24",
"25",
"26",
"27",
"28",
"29",
"30"
],
"entries": [
"1",
"2",
"3",
"4",
"5",
"6",
"7",
"8",
"9",
"10",
"11",
"12",
"13",
"14",
"15",
"16",
"17",
"18",
"19",
"20",
"21",
"22",
"23",
"24",
"25",
"26",
"27",
"28",
"29",
"30"
]
},
{
"type": "Switch",
"key": "acceleration",
"title": "{{acceleration_title}}",
"summary": "{{acceleration_summary}}",
"defaultValue": "1",
"scope": "plugin"
}
]
\ No newline at end of file
[]
\ No newline at end of file