diff --git a/src/app/htmlparser.h b/src/app/htmlparser.h index c656542b9a8937f718373a9ad63ba026b43d5e8a..3b23b573d07dc7706273d287a8764718523b26aa 100644 --- a/src/app/htmlparser.h +++ b/src/app/htmlparser.h @@ -39,6 +39,7 @@ public: doc_ = tidyCreate(); tidyOptSetBool(doc_, TidyQuiet, yes); tidyOptSetBool(doc_, TidyShowWarnings, no); + tidyOptSetInt(doc_, TidyUseCustomTags, TidyCustomEmpty); } ~HtmlParser() @@ -51,46 +52,88 @@ public: return tidyParseString(doc_, html.toLocal8Bit().data()) >= 0; } - using TagInfoList = QMap<TidyTagId, QList<QString>>; + using TagNodeList = QMap<TidyTagId, QList<TidyNode>>; // A function that traverses the DOM tree and fills a QVariantMap with a list - // of the tags and their values. The result is structured as follows: - // {tagId1: ["tagValue1", "tagValue2", ...], - // tagId: ["tagValue1", "tagValue2", ...], + // of the tags and their nodes. The result is structured as follows: + // {tagId1: [tagNode1, tagNode2, ...], + // tagId2: [tagNode3, tagNode4, ...], // ... } - TagInfoList getTags(QList<TidyTagId> tags, int maxDepth = -1) + TagNodeList getTagsNodes(const QList<TidyTagId>& tags, int maxDepth = -1) { - TagInfoList result; + TagNodeList result; traverseNode( tidyGetRoot(doc_), tags, - [&result](const QString& value, TidyTagId tag) { result[tag].append(value); }, + [&result](TidyNode node, TidyTagId tag) { result[tag].append(node); }, maxDepth); return result; } - QString getFirstTagValue(TidyTagId tag, int maxDepth = -1) + // The same as the above function, only it returns the first node for a single tag. + TidyNode getFirstTagNode(TidyTagId tag, int maxDepth = -1) { - QString result; + TidyNode result = nullptr; traverseNode( tidyGetRoot(doc_), {tag}, - [&result](const QString& value, TidyTagId) { result = value; }, + [&result](TidyNode node, TidyTagId) { result = node; }, maxDepth); return result; } + // Extract the text value from a node. + QString getNodeText(TidyNode node) + { + TidyBuffer nodeValue = {}; + if (!node || tidyNodeGetText(doc_, node, &nodeValue) != yes) { + return QString(); + } + QString result = QString::fromUtf8((char*) nodeValue.bp, nodeValue.size); + tidyBufFree(&nodeValue); + return result; + } + + // Extract the attribute value from a node. + QString getNodeAttr(TidyNode node, TidyAttrId attrId) + { + TidyAttr attr = tidyAttrGetById(node, attrId); + if (!attr) { + return QString(); + } + const auto* attrValue = tidyAttrValue(attr); + if (!attrValue) { + return QString(); + } + return QString::fromLocal8Bit(attrValue); + } + + // Extract the inner HTML of a node. + QString getNodeInnerHtml(TidyNode node) + { + if (!node) { + return QString(); + } + const auto* child = tidyGetChild(node); + return child ? getNodeText(child) : QString(); + } + + QString getTagInnerHtml(TidyTagId tag) + { + return getNodeInnerHtml(getFirstTagNode(tag)); + } + private: + // NOLINTNEXTLINE(misc-no-recursion) void traverseNode(TidyNode node, - QList<TidyTagId> tags, - const std::function<void(const QString&, TidyTagId)>& cb, + const QList<TidyTagId>& tags, + const std::function<void(TidyNode, TidyTagId)>& cb, int depth = -1) { - TidyBuffer nodeValue = {}; for (auto tag : tags) { - if (tidyNodeGetId(node) == tag && tidyNodeGetText(doc_, node, &nodeValue) == yes && cb) { - cb(QString::fromLocal8Bit(nodeValue.bp), tag); + if (tidyNodeGetId(node) == tag && cb) { + cb(node, tag); if (depth != -1 && --depth == 0) { return; } diff --git a/src/app/messageparser.cpp b/src/app/messageparser.cpp index 7941e12dbefc5ca424eef323abdd64bf1939c479..a501b0b4ee3fd3949676d9adb333109f168d1cff 100644 --- a/src/app/messageparser.cpp +++ b/src/app/messageparser.cpp @@ -25,6 +25,18 @@ #include "md4c-html.h" +namespace { +// A callback function that will be called by the md4c library (`md_html`) to output the HTML. +void +htmlChunkCb(const MD_CHAR* data, MD_SIZE data_size, void* userData) +{ + QByteArray* array = static_cast<QByteArray*>(userData); + if (data_size > 0) { + array->append(data, int(data_size)); + } +}; +} // namespace + MessageParser::MessageParser(PreviewEngine* previewEngine, QObject* parent) : QObject(parent) , previewEngine_(previewEngine) @@ -51,9 +63,9 @@ MessageParser::parseMessage(const QString& messageId, // Now that we have the HTML, we can parse it to get a list of tags and their values. // We are only interested in the <a> and <pre> tags. htmlParser_->parseHtmlString(html); - auto tagsMap = htmlParser_->getTags({TidyTag_A, TidyTag_DEL, TidyTag_PRE}); + auto tagsMap = htmlParser_->getTagsNodes({TidyTag_A, TidyTag_DEL, TidyTag_PRE}); - static QString styleTag("<style>%1</style>"); + static const QString styleTag("<style>%1</style>"); QString style; // Check for any <pre> tags. If there are any, we need to: @@ -89,11 +101,9 @@ MessageParser::parseMessage(const QString& messageId, // If the user has enabled link previews, then we need to generate the link preview. if (previewLinks) { // Get the first link in the message. - auto anchorTag = tagsMap[TidyTag_A].first(); - static QRegularExpression hrefRegex("href=\"(.*?)\""); - auto match = hrefRegex.match(anchorTag); - if (match.hasMatch()) { - Q_EMIT previewEngine_->parseLink(messageId, match.captured(1)); + auto href = htmlParser_->getNodeAttr(tagsMap[TidyTag_A].first(), TidyAttr_HREF); + if (!href.isEmpty()) { + Q_EMIT previewEngine_->parseLink(messageId, href); } } @@ -110,13 +120,13 @@ void MessageParser::preprocessMarkdown(QString& markdown) { // Match all instances of the linefeed character. - static QRegularExpression newlineRegex("\n"); + static const QRegularExpression newlineRegex("\\r?\\n"); static const QString newline = " \n"; // Replace all instances of the linefeed character with 2 spaces + a linefeed character // in order to force a line break in the HTML. // Note: we should only do this for non-code fenced blocks. - static QRegularExpression codeFenceRe("`{1,3}([\\s\\S]*?)`{1,3}"); + static const QRegularExpression codeFenceRe("`{1,3}([\\s\\S]*?)`{1,3}"); auto match = codeFenceRe.globalMatch(markdown); // If there are no code blocks, then we can just replace all linefeeds with 2 spaces @@ -132,7 +142,7 @@ MessageParser::preprocessMarkdown(QString& markdown) enum BlockType { Text, Code }; QVector<QPair<BlockType, QString>> codeBlocks; - int start = 0; + qsizetype start = 0; while (match.hasNext()) { auto m = match.next(); auto nonCodelength = m.capturedStart() - start; @@ -158,27 +168,16 @@ MessageParser::preprocessMarkdown(QString& markdown) } } -// A callback function that will be called by the md4c library (`md_html`) to output the HTML. -static void -htmlChunkCb(const MD_CHAR* data, MD_SIZE data_size, void* userData) -{ - QByteArray* array = static_cast<QByteArray*>(userData); - if (data_size > 0) { - array->append(data, int(data_size)); - } -}; - QString MessageParser::markdownToHtml(const char* markdown) { static auto md_flags = MD_FLAG_PERMISSIVEAUTOLINKS | MD_FLAG_NOINDENTEDCODEBLOCKS | MD_FLAG_TASKLISTS | MD_FLAG_STRIKETHROUGH | MD_FLAG_UNDERLINE; - size_t data_len = strlen(markdown); + const size_t data_len = strlen(markdown); if (data_len <= 0) { return QString(); - } else { - QByteArray array; - int result = md_html(markdown, MD_SIZE(data_len), &htmlChunkCb, &array, md_flags, 0); - return result == 0 ? QString::fromUtf8(array) : QString(); } + QByteArray array; + const int result = md_html(markdown, MD_SIZE(data_len), &htmlChunkCb, &array, md_flags, 0); + return result == 0 ? QString::fromUtf8(array) : QString(); } diff --git a/src/app/previewengine.cpp b/src/app/previewengine.cpp index 5f4490f1166027b754687f0fe77d7cc6f4e933ff..5e56fbb88bd2d734f6c18ab18ba42e348a0e642e 100644 --- a/src/app/previewengine.cpp +++ b/src/app/previewengine.cpp @@ -19,15 +19,6 @@ #include <QRegularExpression> -static QString -getInnerHtml(const QString& tag) -{ - static const QRegularExpression re(">([^<]+)<"); - const auto match = re.match(tag); - return match.hasMatch() ? match.captured(1) : QString {}; -}; - -// Portable newline regex. const QRegularExpression PreviewEngine::newlineRe("\\r?\\n"); PreviewEngine::PreviewEngine(ConnectivityMonitor* cm, QObject* parent) @@ -39,12 +30,11 @@ PreviewEngine::PreviewEngine(ConnectivityMonitor* cm, QObject* parent) } QString -PreviewEngine::getTagContent(QList<QString>& tags, const QString& value) +PreviewEngine::getTagContent(const QList<QString>& tags, const QString& value) { Q_FOREACH (auto tag, tags) { const QRegularExpression re("(property|name)=\"(og:|twitter:|)" + value + "\".*?content=\"([^\"]+)\""); - const auto match = re.match(tag.remove(newlineRe)); if (match.hasMatch()) { return match.captured(3); @@ -54,45 +44,44 @@ PreviewEngine::getTagContent(QList<QString>& tags, const QString& value) } QString -PreviewEngine::getTitle(HtmlParser::TagInfoList& metaTags) +PreviewEngine::getTitle(const QList<QString>& metaTags) { // Try with opengraph/twitter props - QString title = getTagContent(metaTags[TidyTag_META], "title"); + QString title = getTagContent(metaTags, "title"); if (title.isEmpty()) { // Try with title tag - title = getInnerHtml(htmlParser_->getFirstTagValue(TidyTag_TITLE)); + title = htmlParser_->getTagInnerHtml(TidyTag_TITLE); } if (title.isEmpty()) { // Try with h1 tag - title = getInnerHtml(htmlParser_->getFirstTagValue(TidyTag_H1)); + title = htmlParser_->getTagInnerHtml(TidyTag_H1); } if (title.isEmpty()) { // Try with h2 tag - title = getInnerHtml(htmlParser_->getFirstTagValue(TidyTag_H2)); + title = htmlParser_->getTagInnerHtml(TidyTag_H2); } return title; } QString -PreviewEngine::getDescription(HtmlParser::TagInfoList& metaTags) +PreviewEngine::getDescription(const QList<QString>& metaTags) { // Try with og/twitter props - QString d = getTagContent(metaTags[TidyTag_META], "description"); - if (d.isEmpty()) { // Try with first paragraph - d = getInnerHtml(htmlParser_->getFirstTagValue(TidyTag_P)); + QString desc = getTagContent(metaTags, "description"); + if (desc.isEmpty()) { // Try with first paragraph + desc = htmlParser_->getTagInnerHtml(TidyTag_P); } - return d; + return desc; } QString -PreviewEngine::getImage(HtmlParser::TagInfoList& metaTags) +PreviewEngine::getImage(const QList<QString>& metaTags) { // Try with og/twitter props - QString image = getTagContent(metaTags[TidyTag_META], "image"); + QString image = getTagContent(metaTags, "image"); if (image.isEmpty()) { // Try with href of link tag (rel="image_src") - auto tags = htmlParser_->getTags({TidyTag_LINK}); - Q_FOREACH (auto tag, tags[TidyTag_LINK]) { - static const QRegularExpression re("rel=\"image_src\".*?href=\"([^\"]+)\""); - const auto match = re.match(tag.remove(newlineRe)); - if (match.hasMatch()) { - return match.captured(1); + auto tagsNodes = htmlParser_->getTagsNodes({TidyTag_LINK}); + Q_FOREACH (auto tag, tagsNodes[TidyTag_LINK]) { + QString href = htmlParser_->getNodeAttr(tag, TidyAttr_HREF); + if (!href.isEmpty()) { + return href; } } } @@ -104,7 +93,12 @@ PreviewEngine::onParseLink(const QString& messageId, const QString& link) { sendGetRequest(QUrl(link), [this, messageId, link](const QByteArray& html) { htmlParser_->parseHtmlString(html); - auto metaTags = htmlParser_->getTags({TidyTag_META}); + auto tagsNodes = htmlParser_->getTagsNodes({TidyTag_META}); + auto metaTagNodes = tagsNodes[TidyTag_META]; + QList<QString> metaTags; + Q_FOREACH (auto tag, metaTagNodes) { + metaTags.append(htmlParser_->getNodeText(tag)); + } QString domain = QUrl(link).host(); if (domain.isEmpty()) { domain = link; diff --git a/src/app/previewengine.h b/src/app/previewengine.h index db14a96886179febfea78db377d11ed95e996a03..2f0144ad603c2f2039c3d967cbb324bad13e4603 100644 --- a/src/app/previewengine.h +++ b/src/app/previewengine.h @@ -39,10 +39,10 @@ private: // An instance of HtmlParser used to parse HTML. HtmlParser* htmlParser_; - QString getTagContent(QList<QString>& tags, const QString& value); - QString getTitle(HtmlParser::TagInfoList& metaTags); - QString getDescription(HtmlParser::TagInfoList& metaTags); - QString getImage(HtmlParser::TagInfoList& metaTags); + QString getTagContent(const QList<QString>& tags, const QString& value); + QString getTitle(const QList<QString>& metaTags); + QString getDescription(const QList<QString>& metaTags); + QString getImage(const QList<QString>& metaTags); static const QRegularExpression newlineRe; }; diff --git a/tests/unittests/messageparser_unittest.cpp b/tests/unittests/messageparser_unittest.cpp index 4cbe16d9509c878ba88ec6573dd25709a5c39748..8f91834bf583be90953bf0575976c04df4d8cf55 100644 --- a/tests/unittests/messageparser_unittest.cpp +++ b/tests/unittests/messageparser_unittest.cpp @@ -117,7 +117,6 @@ TEST_F(MessageParserFixture, EndOfLineCharactersAreParsedCorrectly) auto backgroundColor = QColor::fromRgb(0, 0, 255); QSignalSpy messageParsedSpy(globalEnv.messageParser.data(), &MessageParser::messageParsed); - QSignalSpy linkInfoReadySpy(globalEnv.messageParser.data(), &MessageParser::linkInfoReady); // Parse a message with a link. globalEnv.messageParser->parseMessage("msgId_03", @@ -148,7 +147,6 @@ TEST_F(MessageParserFixture, FencedCodeIsParsedCorrectly) auto backgroundColor = QColor::fromRgb(0, 0, 255); QSignalSpy messageParsedSpy(globalEnv.messageParser.data(), &MessageParser::messageParsed); - QSignalSpy linkInfoReadySpy(globalEnv.messageParser.data(), &MessageParser::linkInfoReady); // Parse a message with a link. globalEnv.messageParser->parseMessage("msgId_04", @@ -169,3 +167,41 @@ TEST_F(MessageParserFixture, FencedCodeIsParsedCorrectly) "<style>pre,code{background-color:#0000ff;color:#ffffff;white-space:pre-wrap;" "}</style><p>Text with</p>\n<pre><code>code\n</code></pre>\n"); } + +/*! + * WHEN We parse a text body with a youtube link. + * THEN PreviewEngine::parseLink should be called with the correct arguments. + */ +TEST_F(MessageParserFixture, YoutubeLinkIsParsedCorrectly) +{ + auto url = "https://www.youtube.com/watch?v=1234567890"; + auto msg = "blah blah " + QString(url) + " blah blah"; + + QSignalSpy messageParsedSpy(globalEnv.messageParser.data(), &MessageParser::messageParsed); + QSignalSpy linkInfoReadySpy(globalEnv.messageParser.data(), &MessageParser::linkInfoReady); + + // Parse a message with a link. + globalEnv.messageParser->parseMessage("msgId_05", + msg, + true, + QColor::fromRgb(0, 0, 255), + QColor::fromRgb(0, 0, 255)); + + // Wait for the messageParsed signal which should be emitted once. + messageParsedSpy.wait(); + EXPECT_EQ(messageParsedSpy.count(), 1); + + QList<QVariant> messageParserArguments = messageParsedSpy.takeFirst(); + EXPECT_TRUE(messageParserArguments.at(0).typeId() == qMetaTypeId<QString>()); + + // Wait for the linkInfoReady signal which should be emitted once. + linkInfoReadySpy.wait(); + EXPECT_EQ(linkInfoReadySpy.count(), 1); + + QList<QVariant> linkInfoReadyArguments = linkInfoReadySpy.takeFirst(); + EXPECT_TRUE(linkInfoReadyArguments.at(0).typeId() == qMetaTypeId<QString>()); + EXPECT_EQ(linkInfoReadyArguments.at(0).toString(), "msgId_05"); + EXPECT_TRUE(linkInfoReadyArguments.at(1).typeId() == qMetaTypeId<QVariantMap>()); + QVariantMap linkInfo = linkInfoReadyArguments.at(1).toMap(); + EXPECT_EQ(linkInfo["url"].toString(), url); +}