-
Sébastien Blin authored
Change-Id: I8d5f968fbedbc884c91416246049a0ef4cd652eb
Sébastien Blin authoredChange-Id: I8d5f968fbedbc884c91416246049a0ef4cd652eb
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
htmlparser.h 4.52 KiB
/*
* Copyright (C) 2024 Savoir-faire Linux Inc.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
#pragma once
#include <QObject>
#include <QVariantMap>
#include "tidy.h"
#include "tidybuffio.h"
// This class is used to parse HTML strings. It uses the libtidy library to parse
// the HTML and traverse the DOM tree. It can be used to extract a list of tags
// and their values from an HTML string.
// Currently, it is used to extract the <a> and <pre> tags from a message body,
// and in the future it can be used in conjunction with QtNetwork to generate link
// previews without having to use QtWebEngine.
class HtmlParser : public QObject
{
Q_OBJECT
public:
HtmlParser(QObject* parent = nullptr)
: QObject(parent)
{
doc_ = tidyCreate();
tidyOptSetBool(doc_, TidyQuiet, yes);
tidyOptSetBool(doc_, TidyShowWarnings, no);
tidyOptSetInt(doc_, TidyUseCustomTags, TidyCustomEmpty);
}
~HtmlParser()
{
tidyRelease(doc_);
}
bool parseHtmlString(const QString& html)
{
return tidyParseString(doc_, html.toLocal8Bit().data()) >= 0;
}
using TagNodeList = QMap<TidyTagId, QList<TidyNode>>;
// A function that traverses the DOM tree and fills a QVariantMap with a list
// of the tags and their nodes. The result is structured as follows:
// {tagId1: [tagNode1, tagNode2, ...],
// tagId2: [tagNode3, tagNode4, ...],
// ... }
TagNodeList getTagsNodes(const QList<TidyTagId>& tags, int maxDepth = -1)
{
TagNodeList result;
traverseNode(
tidyGetRoot(doc_),
tags,
[&result](TidyNode node, TidyTagId tag) { result[tag].append(node); },
maxDepth);
return result;
}
// The same as the above function, only it returns the first node for a single tag.
TidyNode getFirstTagNode(TidyTagId tag, int maxDepth = -1)
{
TidyNode result = nullptr;
traverseNode(
tidyGetRoot(doc_),
{tag},
[&result](TidyNode node, TidyTagId) { result = node; },
maxDepth);
return result;
}
// Extract the text value from a node.
QString getNodeText(TidyNode node)
{
TidyBuffer nodeValue = {};
if (!node || tidyNodeGetText(doc_, node, &nodeValue) != yes) {
return QString();
}
QString result = QString::fromUtf8((char*) nodeValue.bp, nodeValue.size);
tidyBufFree(&nodeValue);
return result;
}
// Extract the attribute value from a node.
QString getNodeAttr(TidyNode node, TidyAttrId attrId)
{
TidyAttr attr = tidyAttrGetById(node, attrId);
if (!attr) {
return QString();
}
const auto* attrValue = tidyAttrValue(attr);
if (!attrValue) {
return QString();
}
return QString::fromLocal8Bit(attrValue);
}
// Extract the inner HTML of a node.
QString getNodeInnerHtml(TidyNode node)
{
if (!node) {
return QString();
}
const auto* child = tidyGetChild(node);
return child ? getNodeText(child) : QString();
}
QString getTagInnerHtml(TidyTagId tag)
{
return getNodeInnerHtml(getFirstTagNode(tag));
}
private:
// NOLINTNEXTLINE(misc-no-recursion)
void traverseNode(TidyNode node,
const QList<TidyTagId>& tags,
const std::function<void(TidyNode, TidyTagId)>& cb,
int depth = -1)
{
for (auto tag : tags) {
if (tidyNodeGetId(node) == tag && cb) {
cb(node, tag);
if (depth != -1 && --depth == 0) {
return;
}
}
}
// Traverse the children of the current node.
for (TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) {
traverseNode(child, tags, cb, depth);
}
}
TidyDoc doc_;
};