2022-04-05 15:04:26 -04:00
|
|
|
/*
|
2023-02-06 01:47:15 -05:00
|
|
|
* Copyright (C) 2021-2023 Savoir-faire Linux Inc.
|
2021-07-06 10:20:46 -04:00
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU General Public License as published by
|
|
|
|
* the Free Software Foundation; either version 3 of the License, or
|
|
|
|
* (at your option) any later version.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU General Public License
|
|
|
|
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "previewengine.h"
|
|
|
|
|
2023-03-20 16:26:37 -04:00
|
|
|
#include <QRegularExpression>
|
2023-01-06 14:07:33 -05:00
|
|
|
|
2023-03-20 16:26:37 -04:00
|
|
|
/**
 * Extract the first run of text located between a '>' and the next '<'.
 *
 * @param tag Text to search; presumably the serialized element returned by
 *            HtmlParser::getFirstTagValue (confirm against HtmlParser).
 * @return The captured text (at least one character long) when such a run
 *         exists, otherwise a default-constructed QString.
 */
static QString
getInnerHtml(const QString& tag)
{
    // Function-local static: the expression object is constructed only once.
    static const QRegularExpression re(">([^<]+)<");
    const auto match = re.match(tag);
    return match.hasMatch() ? match.captured(1) : QString {};
}
|
2021-07-06 10:20:46 -04:00
|
|
|
|
2023-05-18 10:06:29 -04:00
|
|
|
// Portable newline regex.
|
|
|
|
// Matches a line break written either as "\n" or as "\r\n". The tag scanners
// in this file remove every match from a tag before running their own regex.
const QRegularExpression PreviewEngine::newlineRe("\\r?\\n");
|
2022-04-05 15:04:26 -04:00
|
|
|
|
2023-03-20 16:26:37 -04:00
|
|
|
/**
 * Build the engine on top of NetworkManager and create the HTML parser,
 * parented to this object.
 *
 * @param cm     Connectivity monitor forwarded to NetworkManager.
 * @param parent QObject parent forwarded to NetworkManager.
 */
PreviewEngine::PreviewEngine(ConnectivityMonitor* cm, QObject* parent)
    : NetworkManager(cm, parent)
    , htmlParser_(new HtmlParser(this))
{
    // parseLink is delivered to onParseLink through a queued connection, so
    // emitting the signal does not run the slot inline in the caller.
    connect(this,
            &PreviewEngine::parseLink,
            this,
            &PreviewEngine::onParseLink,
            Qt::QueuedConnection);
}
|
2022-04-05 15:04:26 -04:00
|
|
|
|
2023-03-20 16:26:37 -04:00
|
|
|
/**
 * Scan a list of serialized tags for a metadata entry and return its content.
 *
 * A tag matches when it contains property="<prefix><value>" or
 * name="<prefix><value>" (prefix being "og:", "twitter:" or empty), followed
 * later in the same tag by content="...".
 *
 * @param tags  Tags to scan, in order. The list is only read.
 * @param value Metadata key to look for (e.g. "title"); matched literally.
 * @return The content attribute of the first matching tag, or a
 *         default-constructed QString when no tag matches.
 */
QString
PreviewEngine::getTagContent(QList<QString>& tags, const QString& value)
{
    // The pattern depends only on value, so it is built once rather than once
    // per tag. value is escaped so any regex metacharacter in it is treated
    // as plain text instead of altering the pattern.
    const QRegularExpression re("(property|name)=\"(og:|twitter:|)"
                                + QRegularExpression::escape(value)
                                + "\".*?content=\"([^\"]+)\"");

    // Iterate through a const reference so the range-for uses the const
    // iterators and never detaches the caller's list.
    const QList<QString>& constTags = tags;
    for (QString tag : constTags) {
        // tag is a copy: line breaks are stripped from it so that ".*?" can
        // span attributes that were split over several lines.
        const auto match = re.match(tag.remove(newlineRe));
        if (match.hasMatch()) {
            return match.captured(3);
        }
    }
    return QString {};
}
|
2021-07-06 10:20:46 -04:00
|
|
|
|
2023-03-20 16:26:37 -04:00
|
|
|
/**
 * Pick a title for the parsed page.
 *
 * @param metaTags Tag collection; its TidyTag_META entry is searched first.
 * @return The "title" meta content if present, otherwise the text of the
 *         first <title>, <h1> or <h2> element (tried in that order), or a
 *         default-constructed QString when none yields text.
 */
QString
PreviewEngine::getTitle(HtmlParser::TagInfoList& metaTags)
{
    // Preferred source: opengraph/twitter props.
    const QString fromMeta = getTagContent(metaTags[TidyTag_META], "title");
    if (!fromMeta.isEmpty()) {
        return fromMeta;
    }

    // Fallbacks, each queried only if the previous one produced nothing.
    for (const auto fallbackTag : {TidyTag_TITLE, TidyTag_H1, TidyTag_H2}) {
        const QString text = getInnerHtml(htmlParser_->getFirstTagValue(fallbackTag));
        if (!text.isEmpty()) {
            return text;
        }
    }
    return QString {};
}
|
|
|
|
|
2023-03-20 16:26:37 -04:00
|
|
|
/**
 * Pick a description for the parsed page.
 *
 * @param metaTags Tag collection; its TidyTag_META entry is searched first.
 * @return The "description" meta content if present, otherwise the text
 *         extracted from the first <p> element (possibly empty).
 */
QString
PreviewEngine::getDescription(HtmlParser::TagInfoList& metaTags)
{
    // Preferred source: og/twitter props.
    const QString fromMeta = getTagContent(metaTags[TidyTag_META], "description");
    if (!fromMeta.isEmpty()) {
        return fromMeta;
    }
    // Fallback: first paragraph of the document.
    return getInnerHtml(htmlParser_->getFirstTagValue(TidyTag_P));
}
|
|
|
|
|
2023-03-20 16:26:37 -04:00
|
|
|
/**
 * Pick an image URL for the parsed page.
 *
 * @param metaTags Tag collection; its TidyTag_META entry is searched first.
 * @return The "image" meta content if present; otherwise the href of the
 *         first <link> tag in which rel="image_src" is followed by an href
 *         attribute; otherwise an empty QString.
 */
QString
PreviewEngine::getImage(HtmlParser::TagInfoList& metaTags)
{
    // Preferred source: og/twitter props.
    const QString image = getTagContent(metaTags[TidyTag_META], "image");
    if (!image.isEmpty()) {
        return image;
    }

    // Fallback: href of a link tag carrying rel="image_src".
    static const QRegularExpression re("rel=\"image_src\".*?href=\"([^\"]+)\"");
    auto tags = htmlParser_->getTags({TidyTag_LINK});

    // Range-for over a const reference replaces Q_FOREACH: same read-only
    // traversal, without the macro and without detaching the container.
    const auto& linkTags = tags[TidyTag_LINK];
    for (auto tag : linkTags) {
        // tag is a copy; line breaks are removed from it before matching.
        const auto match = re.match(tag.remove(newlineRe));
        if (match.hasMatch()) {
            return match.captured(1);
        }
    }
    return image;
}
|
|
|
|
|
|
|
|
void
|
2023-03-20 16:26:37 -04:00
|
|
|
PreviewEngine::onParseLink(const QString& messageId, const QString& link)
|
2022-05-06 15:06:05 -04:00
|
|
|
{
|
2023-03-20 16:26:37 -04:00
|
|
|
sendGetRequest(QUrl(link), [this, messageId, link](const QByteArray& html) {
|
|
|
|
htmlParser_->parseHtmlString(html);
|
|
|
|
auto metaTags = htmlParser_->getTags({TidyTag_META});
|
|
|
|
QString domain = QUrl(link).host();
|
|
|
|
if (domain.isEmpty()) {
|
|
|
|
domain = link;
|
|
|
|
}
|
|
|
|
Q_EMIT infoReady(messageId,
|
|
|
|
{{"title", getTitle(metaTags)},
|
|
|
|
{"description", getDescription(metaTags)},
|
|
|
|
{"image", getImage(metaTags)},
|
|
|
|
{"url", link},
|
|
|
|
{"domain", domain}});
|
|
|
|
});
|
2022-05-06 15:06:05 -04:00
|
|
|
}
|