Skip to content
Snippets Groups Projects
Commit 61126cfa authored by Andreas Traczyk's avatar Andreas Traczyk Committed by Sébastien Blin
Browse files

messageparser: replace regexes with tidy API

Depend on tidy API for attribute extraction rather than regexes.

1. htmlparser methods return nodes instead of pre-parsed strings
2. htmlparser provides some methods to extract text/attr from nodes

Gitlab: #1248
Change-Id: I367d703680938fb0b7c5055ac41e079c1322da30
parent ec0feef7
No related branches found
No related tags found
No related merge requests found
......@@ -39,6 +39,7 @@ public:
doc_ = tidyCreate();
tidyOptSetBool(doc_, TidyQuiet, yes);
tidyOptSetBool(doc_, TidyShowWarnings, no);
tidyOptSetInt(doc_, TidyUseCustomTags, TidyCustomEmpty);
}
~HtmlParser()
......@@ -51,46 +52,88 @@ public:
return tidyParseString(doc_, html.toLocal8Bit().data()) >= 0;
}
using TagInfoList = QMap<TidyTagId, QList<QString>>;
using TagNodeList = QMap<TidyTagId, QList<TidyNode>>;
// A function that traverses the DOM tree and fills a QMap with a list
// of the tags and their values. The result is structured as follows:
// {tagId1: ["tagValue1", "tagValue2", ...],
// tagId: ["tagValue1", "tagValue2", ...],
// of the tags and their nodes. The result is structured as follows:
// {tagId1: [tagNode1, tagNode2, ...],
// tagId2: [tagNode3, tagNode4, ...],
// ... }
TagInfoList getTags(QList<TidyTagId> tags, int maxDepth = -1)
TagNodeList getTagsNodes(const QList<TidyTagId>& tags, int maxDepth = -1)
{
TagInfoList result;
TagNodeList result;
traverseNode(
tidyGetRoot(doc_),
tags,
[&result](const QString& value, TidyTagId tag) { result[tag].append(value); },
[&result](TidyNode node, TidyTagId tag) { result[tag].append(node); },
maxDepth);
return result;
}
QString getFirstTagValue(TidyTagId tag, int maxDepth = -1)
// The same as the above function, only it returns the first node for a single tag.
TidyNode getFirstTagNode(TidyTagId tag, int maxDepth = -1)
{
QString result;
TidyNode result = nullptr;
traverseNode(
tidyGetRoot(doc_),
{tag},
[&result](const QString& value, TidyTagId) { result = value; },
[&result](TidyNode node, TidyTagId) { result = node; },
maxDepth);
return result;
}
// Ask tidy for the text of `node` (tidyNodeGetText) and decode the bytes as UTF-8.
// Returns a default-constructed QString when `node` is null or tidy reports failure.
QString getNodeText(TidyNode node)
{
    if (!node) {
        return QString();
    }
    TidyBuffer buffer = {};
    QString text;
    if (tidyNodeGetText(doc_, node, &buffer) == yes) {
        text = QString::fromUtf8(reinterpret_cast<const char*>(buffer.bp), buffer.size);
        // The buffer is released only on the success path.
        tidyBufFree(&buffer);
    }
    return text;
}
// Extract the value of attribute `attrId` from a node.
// Returns an empty QString when the node is null, the attribute is not present,
// or the attribute carries no value.
QString getNodeAttr(TidyNode node, TidyAttrId attrId)
{
    // A null node must not be handed to tidy; bail out the same way
    // getNodeText() and getNodeInnerHtml() do.
    if (!node) {
        return QString();
    }
    TidyAttr attr = tidyAttrGetById(node, attrId);
    if (!attr) {
        return QString();
    }
    const auto* attrValue = tidyAttrValue(attr);
    if (!attrValue) {
        return QString();
    }
    // Decode as UTF-8 (tidy's string encoding), consistent with getNodeText().
    // Decoding with the local 8-bit codec corrupts non-ASCII values on
    // platforms whose locale is not UTF-8.
    return QString::fromUtf8(attrValue);
}
// Return the text of the first child of `node` (via getNodeText).
// Yields an empty QString when `node` is null or has no child.
// Note: only the first child is serialized, not every child of the node.
QString getNodeInnerHtml(TidyNode node)
{
    TidyNode firstChild = node ? tidyGetChild(node) : nullptr;
    if (!firstChild) {
        return QString();
    }
    return getNodeText(firstChild);
}
// Convenience wrapper: look up a node for `tag` with getFirstTagNode() and
// return its inner HTML as produced by getNodeInnerHtml().
QString getTagInnerHtml(TidyTagId tag)
{
    TidyNode match = getFirstTagNode(tag);
    return getNodeInnerHtml(match);
}
private:
// NOLINTNEXTLINE(misc-no-recursion)
void traverseNode(TidyNode node,
QList<TidyTagId> tags,
const std::function<void(const QString&, TidyTagId)>& cb,
const QList<TidyTagId>& tags,
const std::function<void(TidyNode, TidyTagId)>& cb,
int depth = -1)
{
TidyBuffer nodeValue = {};
for (auto tag : tags) {
if (tidyNodeGetId(node) == tag && tidyNodeGetText(doc_, node, &nodeValue) == yes && cb) {
cb(QString::fromLocal8Bit(nodeValue.bp), tag);
if (tidyNodeGetId(node) == tag && cb) {
cb(node, tag);
if (depth != -1 && --depth == 0) {
return;
}
......
......@@ -25,6 +25,18 @@
#include "md4c-html.h"
namespace {
// Output callback passed to md4c's `md_html`. `userData` is expected to point
// to a QByteArray; every non-empty chunk is appended to it.
void
htmlChunkCb(const MD_CHAR* data, MD_SIZE data_size, void* userData)
{
    if (data_size > 0) {
        auto* output = static_cast<QByteArray*>(userData);
        output->append(data, static_cast<int>(data_size));
    }
}
} // namespace
MessageParser::MessageParser(PreviewEngine* previewEngine, QObject* parent)
: QObject(parent)
, previewEngine_(previewEngine)
......@@ -51,9 +63,9 @@ MessageParser::parseMessage(const QString& messageId,
// Now that we have the HTML, we can parse it to get a list of tags and their values.
// We are only interested in the <a>, <del> and <pre> tags.
htmlParser_->parseHtmlString(html);
auto tagsMap = htmlParser_->getTags({TidyTag_A, TidyTag_DEL, TidyTag_PRE});
auto tagsMap = htmlParser_->getTagsNodes({TidyTag_A, TidyTag_DEL, TidyTag_PRE});
static QString styleTag("<style>%1</style>");
static const QString styleTag("<style>%1</style>");
QString style;
// Check for any <pre> tags. If there are any, we need to:
......@@ -89,11 +101,9 @@ MessageParser::parseMessage(const QString& messageId,
// If the user has enabled link previews, then we need to generate the link preview.
if (previewLinks) {
// Get the first link in the message.
auto anchorTag = tagsMap[TidyTag_A].first();
static QRegularExpression hrefRegex("href=\"(.*?)\"");
auto match = hrefRegex.match(anchorTag);
if (match.hasMatch()) {
Q_EMIT previewEngine_->parseLink(messageId, match.captured(1));
auto href = htmlParser_->getNodeAttr(tagsMap[TidyTag_A].first(), TidyAttr_HREF);
if (!href.isEmpty()) {
Q_EMIT previewEngine_->parseLink(messageId, href);
}
}
......@@ -110,13 +120,13 @@ void
MessageParser::preprocessMarkdown(QString& markdown)
{
// Match all instances of the linefeed character.
static QRegularExpression newlineRegex("\n");
static const QRegularExpression newlineRegex("\\r?\\n");
static const QString newline = " \n";
// Replace all instances of the linefeed character with 2 spaces + a linefeed character
// in order to force a line break in the HTML.
// Note: we should only do this for non-code fenced blocks.
static QRegularExpression codeFenceRe("`{1,3}([\\s\\S]*?)`{1,3}");
static const QRegularExpression codeFenceRe("`{1,3}([\\s\\S]*?)`{1,3}");
auto match = codeFenceRe.globalMatch(markdown);
// If there are no code blocks, then we can just replace all linefeeds with 2 spaces
......@@ -132,7 +142,7 @@ MessageParser::preprocessMarkdown(QString& markdown)
enum BlockType { Text, Code };
QVector<QPair<BlockType, QString>> codeBlocks;
int start = 0;
qsizetype start = 0;
while (match.hasNext()) {
auto m = match.next();
auto nonCodelength = m.capturedStart() - start;
......@@ -158,27 +168,16 @@ MessageParser::preprocessMarkdown(QString& markdown)
}
}
// Output callback passed to md4c's `md_html`. `userData` is expected to point
// to a QByteArray; every non-empty chunk is appended to it.
static void
htmlChunkCb(const MD_CHAR* data, MD_SIZE data_size, void* userData)
{
    if (data_size > 0) {
        auto* output = static_cast<QByteArray*>(userData);
        output->append(data, static_cast<int>(data_size));
    }
}
// Render a Markdown string to HTML using md4c's `md_html`.
// Returns an empty QString when `markdown` is null or empty, or when `md_html`
// returns a non-zero (failure) code; otherwise the collected output decoded as UTF-8.
QString
MessageParser::markdownToHtml(const char* markdown)
{
    static const auto md_flags = MD_FLAG_PERMISSIVEAUTOLINKS | MD_FLAG_NOINDENTEDCODEBLOCKS
                                 | MD_FLAG_TASKLISTS | MD_FLAG_STRIKETHROUGH | MD_FLAG_UNDERLINE;
    // strlen() on a null pointer is undefined behaviour, so reject it up front.
    if (markdown == nullptr) {
        return QString();
    }
    const size_t data_len = strlen(markdown);
    if (data_len == 0) {
        return QString();
    }
    // htmlChunkCb appends every chunk emitted by md_html to `array`.
    QByteArray array;
    const int result = md_html(markdown, MD_SIZE(data_len), &htmlChunkCb, &array, md_flags, 0);
    return result == 0 ? QString::fromUtf8(array) : QString();
}
......@@ -19,15 +19,6 @@
#include <QRegularExpression>
// Return the first run of characters found between a '>' and the next '<' in
// `tag`, or an empty QString when the pattern does not match.
static QString
getInnerHtml(const QString& tag)
{
    static const QRegularExpression innerTextRe(">([^<]+)<");
    const QRegularExpressionMatch found = innerTextRe.match(tag);
    if (!found.hasMatch()) {
        return QString {};
    }
    return found.captured(1);
}
// Portable newline regex.
const QRegularExpression PreviewEngine::newlineRe("\\r?\\n");
PreviewEngine::PreviewEngine(ConnectivityMonitor* cm, QObject* parent)
......@@ -39,12 +30,11 @@ PreviewEngine::PreviewEngine(ConnectivityMonitor* cm, QObject* parent)
}
QString
PreviewEngine::getTagContent(QList<QString>& tags, const QString& value)
PreviewEngine::getTagContent(const QList<QString>& tags, const QString& value)
{
Q_FOREACH (auto tag, tags) {
const QRegularExpression re("(property|name)=\"(og:|twitter:|)" + value
+ "\".*?content=\"([^\"]+)\"");
const auto match = re.match(tag.remove(newlineRe));
if (match.hasMatch()) {
return match.captured(3);
......@@ -54,45 +44,44 @@ PreviewEngine::getTagContent(QList<QString>& tags, const QString& value)
}
QString
PreviewEngine::getTitle(HtmlParser::TagInfoList& metaTags)
PreviewEngine::getTitle(const QList<QString>& metaTags)
{
// Try with opengraph/twitter props
QString title = getTagContent(metaTags[TidyTag_META], "title");
QString title = getTagContent(metaTags, "title");
if (title.isEmpty()) { // Try with title tag
title = getInnerHtml(htmlParser_->getFirstTagValue(TidyTag_TITLE));
title = htmlParser_->getTagInnerHtml(TidyTag_TITLE);
}
if (title.isEmpty()) { // Try with h1 tag
title = getInnerHtml(htmlParser_->getFirstTagValue(TidyTag_H1));
title = htmlParser_->getTagInnerHtml(TidyTag_H1);
}
if (title.isEmpty()) { // Try with h2 tag
title = getInnerHtml(htmlParser_->getFirstTagValue(TidyTag_H2));
title = htmlParser_->getTagInnerHtml(TidyTag_H2);
}
return title;
}
QString
PreviewEngine::getDescription(HtmlParser::TagInfoList& metaTags)
PreviewEngine::getDescription(const QList<QString>& metaTags)
{
// Try with og/twitter props
QString d = getTagContent(metaTags[TidyTag_META], "description");
if (d.isEmpty()) { // Try with first paragraph
d = getInnerHtml(htmlParser_->getFirstTagValue(TidyTag_P));
QString desc = getTagContent(metaTags, "description");
if (desc.isEmpty()) { // Try with first paragraph
desc = htmlParser_->getTagInnerHtml(TidyTag_P);
}
return d;
return desc;
}
QString
PreviewEngine::getImage(HtmlParser::TagInfoList& metaTags)
PreviewEngine::getImage(const QList<QString>& metaTags)
{
// Try with og/twitter props
QString image = getTagContent(metaTags[TidyTag_META], "image");
QString image = getTagContent(metaTags, "image");
if (image.isEmpty()) { // Try with href of link tag (rel="image_src")
auto tags = htmlParser_->getTags({TidyTag_LINK});
Q_FOREACH (auto tag, tags[TidyTag_LINK]) {
static const QRegularExpression re("rel=\"image_src\".*?href=\"([^\"]+)\"");
const auto match = re.match(tag.remove(newlineRe));
if (match.hasMatch()) {
return match.captured(1);
auto tagsNodes = htmlParser_->getTagsNodes({TidyTag_LINK});
Q_FOREACH (auto tag, tagsNodes[TidyTag_LINK]) {
QString href = htmlParser_->getNodeAttr(tag, TidyAttr_HREF);
if (!href.isEmpty()) {
return href;
}
}
}
......@@ -104,7 +93,12 @@ PreviewEngine::onParseLink(const QString& messageId, const QString& link)
{
sendGetRequest(QUrl(link), [this, messageId, link](const QByteArray& html) {
htmlParser_->parseHtmlString(html);
auto metaTags = htmlParser_->getTags({TidyTag_META});
auto tagsNodes = htmlParser_->getTagsNodes({TidyTag_META});
auto metaTagNodes = tagsNodes[TidyTag_META];
QList<QString> metaTags;
Q_FOREACH (auto tag, metaTagNodes) {
metaTags.append(htmlParser_->getNodeText(tag));
}
QString domain = QUrl(link).host();
if (domain.isEmpty()) {
domain = link;
......
......@@ -39,10 +39,10 @@ private:
// An instance of HtmlParser used to parse HTML.
HtmlParser* htmlParser_;
QString getTagContent(QList<QString>& tags, const QString& value);
QString getTitle(HtmlParser::TagInfoList& metaTags);
QString getDescription(HtmlParser::TagInfoList& metaTags);
QString getImage(HtmlParser::TagInfoList& metaTags);
QString getTagContent(const QList<QString>& tags, const QString& value);
QString getTitle(const QList<QString>& metaTags);
QString getDescription(const QList<QString>& metaTags);
QString getImage(const QList<QString>& metaTags);
static const QRegularExpression newlineRe;
};
......@@ -117,7 +117,6 @@ TEST_F(MessageParserFixture, EndOfLineCharactersAreParsedCorrectly)
auto backgroundColor = QColor::fromRgb(0, 0, 255);
QSignalSpy messageParsedSpy(globalEnv.messageParser.data(), &MessageParser::messageParsed);
QSignalSpy linkInfoReadySpy(globalEnv.messageParser.data(), &MessageParser::linkInfoReady);
// Parse a message with a link.
globalEnv.messageParser->parseMessage("msgId_03",
......@@ -148,7 +147,6 @@ TEST_F(MessageParserFixture, FencedCodeIsParsedCorrectly)
auto backgroundColor = QColor::fromRgb(0, 0, 255);
QSignalSpy messageParsedSpy(globalEnv.messageParser.data(), &MessageParser::messageParsed);
QSignalSpy linkInfoReadySpy(globalEnv.messageParser.data(), &MessageParser::linkInfoReady);
// Parse a message with a link.
globalEnv.messageParser->parseMessage("msgId_04",
......@@ -169,3 +167,41 @@ TEST_F(MessageParserFixture, FencedCodeIsParsedCorrectly)
"<style>pre,code{background-color:#0000ff;color:#ffffff;white-space:pre-wrap;"
"}</style><p>Text with</p>\n<pre><code>code\n</code></pre>\n");
}
/*!
 * WHEN We parse a text body with a youtube link.
 * THEN messageParsed should be emitted once, and linkInfoReady should be emitted
 *      once with the message id and a link-info map whose "url" is the link.
 */
TEST_F(MessageParserFixture, YoutubeLinkIsParsedCorrectly)
{
    auto url = "https://www.youtube.com/watch?v=1234567890";
    auto msg = "blah blah " + QString(url) + " blah blah";

    QSignalSpy messageParsedSpy(globalEnv.messageParser.data(), &MessageParser::messageParsed);
    QSignalSpy linkInfoReadySpy(globalEnv.messageParser.data(), &MessageParser::linkInfoReady);

    // Parse a message with a link.
    globalEnv.messageParser->parseMessage("msgId_05",
                                          msg,
                                          true,
                                          QColor::fromRgb(0, 0, 255),
                                          QColor::fromRgb(0, 0, 255));

    // Wait for the messageParsed signal which should be emitted once.
    messageParsedSpy.wait();
    // ASSERT rather than EXPECT: takeFirst() below is invalid on an empty spy,
    // so stop the test here if the signal never arrived.
    ASSERT_EQ(messageParsedSpy.count(), 1);

    QList<QVariant> messageParserArguments = messageParsedSpy.takeFirst();
    EXPECT_TRUE(messageParserArguments.at(0).typeId() == qMetaTypeId<QString>());

    // Wait for the linkInfoReady signal which should be emitted once.
    linkInfoReadySpy.wait();
    ASSERT_EQ(linkInfoReadySpy.count(), 1);

    QList<QVariant> linkInfoReadyArguments = linkInfoReadySpy.takeFirst();
    // Both arguments are indexed below; make sure they are present first.
    ASSERT_GE(linkInfoReadyArguments.size(), 2);
    EXPECT_TRUE(linkInfoReadyArguments.at(0).typeId() == qMetaTypeId<QString>());
    EXPECT_EQ(linkInfoReadyArguments.at(0).toString(), "msgId_05");
    EXPECT_TRUE(linkInfoReadyArguments.at(1).typeId() == qMetaTypeId<QVariantMap>());
    QVariantMap linkInfo = linkInfoReadyArguments.at(1).toMap();
    EXPECT_EQ(linkInfo["url"].toString(), url);
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment