Commit 0843416c authored by Pascal Potvin

daemon: add utf8_utils to check validity of utf-8 strings

Based on work already existing in glib and dbus, two new functions to validate
and reformat utf-8 strings are added:

 * utf8_validate: returns true if the `char` sequence is valid utf-8.

 * utf8_make_valid: transforms an invalid utf-8 sequence into a valid one,
   by replacing each bad byte by U+FFFD REPLACEMENT CHARACTER. The same
   behaviour is implemented in other GNOME applications.

These two new functions are used in sip_utils::parseDisplayName().

Change-Id: Ie9a6c35b4df7883637374c325b05b6cf875f6b12
Signed-off-by: Pascal Potvin <pascal.potvin@extenway.com>
parent 8df0b8a0
......@@ -91,6 +91,7 @@ libsflphone_la_SOURCES = conference.cpp \
fileutils.cpp \
threadloop.cpp \
ip_utils.cpp \
utf8_utils.cpp \
threadloop.h \
conference.h \
voiplink.h \
......@@ -105,6 +106,7 @@ libsflphone_la_SOURCES = conference.cpp \
numbercleaner.h \
fileutils.h \
noncopyable.h \
utf8_utils.h \
sfl_types.h \
array_size.h \
account_schema.h \
......
......@@ -31,6 +31,7 @@
#include "sip_utils.h"
#include "logger.h"
#include "utf8_utils.h"
#include <pjsip.h>
#include <pjsip_ua.h>
......@@ -96,14 +97,6 @@ sip_utils::createRouteSet(const std::string &route, pj_pool_t *hdr_pool)
return route_set;
}
/*
 * Report whether the multibyte string can be converted to wide characters
 * by mbstowcs(). The conversion follows the current C locale, so the result
 * reflects that locale's multibyte encoding.
 */
static bool
isValidUtf8(const std::string &str)
{
    const size_t byteCount = str.size();

    // Scratch buffer: a wide string never needs more elements than the
    // multibyte source has bytes.
    std::wstring wide(byteCount, L' ');

    // mbstowcs() signals an undecodable sequence by returning (size_t)-1.
    const size_t converted = mbstowcs(&wide[0], str.c_str(), byteCount);
    if (converted == static_cast<size_t>(-1))
        return false;
    return true;
}
// FIXME: replace with regex
std::string
sip_utils::parseDisplayName(const char * buffer)
......@@ -148,10 +141,9 @@ sip_utils::parseDisplayName(const char * buffer)
std::string displayName = temp.substr(begin_displayName + 1,
end_displayName - begin_displayName - 1);
// Filter out invalid UTF-8 sequences to avoid getting kicked from D-Bus
if (not isValidUtf8(displayName)) {
ERROR("Invalid UTF-8 sequence detected: %s", displayName.c_str());
return "";
// Filter out invalid UTF-8 characters to avoid getting kicked from D-Bus
if (not utf8_validate(displayName)) {
return utf8_make_valid(displayName);
}
static const size_t MAX_DISPLAY_NAME_SIZE = 25;
......
/*
* Copyright (C) 1999 Tom Tromey
* Copyright (C) 2000 Red Hat, Inc.
* Copyright (C) 2014 Savoir-Faire Linux Inc.
*
* Author: Pascal Potvin <pascal.potvin@extenway.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Additional permission under GNU GPL version 3 section 7:
*
* If you modify this program, or any covered work, by linking or
* combining it with the OpenSSL project's OpenSSL library (or a
* modified version of that library), containing parts covered by the
* terms of the OpenSSL or SSLeay licenses, Savoir-Faire Linux Inc.
* grants you additional permission to convey the resulting work.
* Corresponding Source for a non-source form of such a combination
* shall include the source code for the parts of OpenSSL used as well
* as that of the covered work.
*/
#include <cstring>
#include <cassert>
#include "utf8_utils.h"
/*
 * The LIKELY and UNLIKELY macros let the programmer give hints to
 * the compiler about the expected result of an expression. Some compilers
 * can use this information for optimizations.
 *
 * The expression is normalized to a boolean with `!!` before being handed
 * to __builtin_expect: the builtin compares its (integral) argument against
 * the expected value, so a truthy value other than 1 would contradict the
 * hint and a pointer operand would not compile at all. The fallback applies
 * the same normalization so both build modes yield the same value.
 */
#if defined(__GNUC__) && (__GNUC__ > 2) && defined(__OPTIMIZE__)
#define LIKELY(expr) (__builtin_expect (!!(expr), 1))
#define UNLIKELY(expr) (__builtin_expect (!!(expr), 0))
#else
#define LIKELY(expr) (!!(expr))
#define UNLIKELY(expr) (!!(expr))
#endif
/*
 * Tell whether a decoded code point is a legal Unicode scalar value.
 *
 * Two ranges are excluded:
 *  - anything at or above 0x110000, which lies beyond the last code point
 *    (U+10FFFF) that UTF-16 is able to represent;
 *  - 0xD800..0xDFFF, the block reserved for UTF-16 surrogates (category Cs).
 *
 * The argument is evaluated more than once, so it must not have side effects.
 *
 * @param Char the code point to test
 */
#define UNICODE_VALID(Char)                     \
    ((Char) < 0x110000 &&                       \
     ((Char) < 0xD800 || (Char) > 0xDFFF))
/*
 * Consume one UTF-8 continuation byte (bit pattern 10xxxxxx).
 *
 * Expects the expansion site to provide:
 *  - `p`:     pointer to the byte being examined;
 *  - `val`:   accumulator for the code point being decoded;
 *  - `error`: a label to jump to when the byte is not a continuation byte.
 *
 * On success the six payload bits of *p are shifted into `val`.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and stays correct under an unbraced if/else.
 */
#define CONTINUATION_CHAR                                               \
    do {                                                                \
        if ((static_cast<unsigned char>(*p) & 0xc0) != 0x80) /* 10xxxxxx */ \
            goto error;                                                 \
        val <<= 6;                                                      \
        val |= static_cast<unsigned char>(*p) & 0x3f;                   \
    } while (0)
/*
 * Walk a NUL-terminated byte string and stop at the first sequence that is
 * not well-formed UTF-8.
 *
 * Rejected input: stray continuation bytes, lead bytes announcing more than
 * four bytes, truncated sequences, overlong encodings, surrogates and code
 * points at or above 0x110000.
 *
 * Returns a pointer to the terminating NUL when everything was valid,
 * otherwise a pointer to the first byte of the offending sequence.
 */
static const char *
fast_validate(const char *str)
{
    const unsigned char *cursor = reinterpret_cast<const unsigned char *>(str);

    while (*cursor != 0) {
        const unsigned char lead = *cursor;

        /* Plain ASCII: nothing to decode. */
        if (lead < 0x80) {
            ++cursor;
            continue;
        }

        const char *const sequence_start = reinterpret_cast<const char *>(cursor);

        /* 110xxxxx: two-byte sequence. */
        if ((lead & 0xe0) == 0xc0) {
            /* No payload bit above the lowest one: overlong encoding. */
            if (UNLIKELY((lead & 0x1e) == 0))
                return sequence_start;
            if (UNLIKELY((cursor[1] & 0xc0) != 0x80)) /* 10xxxxxx */
                return sequence_start;
            cursor += 2;
            continue;
        }

        int trailing;        /* continuation bytes still expected */
        char32_t lowest;     /* smallest code point allowed for this length */
        char32_t code_point; /* value decoded so far */

        if ((lead & 0xf0) == 0xe0) {        /* 1110xxxx */
            trailing = 2;
            lowest = (1 << 11);
            code_point = lead & 0x0f;
        } else if ((lead & 0xf8) == 0xf0) { /* 11110xxx */
            trailing = 3;
            lowest = (1 << 16);
            code_point = lead & 0x07;
        } else {
            return sequence_start;
        }

        /* A NUL is not a continuation byte, so this never reads past the
         * terminator. */
        for (int i = 1; i <= trailing; ++i) {
            const unsigned char byte = cursor[i];
            if ((byte & 0xc0) != 0x80) /* 10xxxxxx */
                return sequence_start;
            code_point = (code_point << 6) | (byte & 0x3f);
        }

        if (UNLIKELY(code_point < lowest))
            return sequence_start;
        if (UNLIKELY(!UNICODE_VALID(code_point)))
            return sequence_start;

        cursor += trailing + 1;
    }

    return reinterpret_cast<const char *>(cursor);
}
/*
 * Length-bounded variant of the UTF-8 scanner: examines at most max_len
 * bytes of str and also stops at a NUL byte.
 *
 * A multi-byte sequence that would extend past max_len is treated as
 * invalid, as are stray continuation bytes, overlong encodings, surrogates
 * and code points at or above 0x110000.
 *
 * Returns a pointer to where scanning stopped: str + max_len when all bytes
 * were consumed, the position of a NUL if one was met first, or the first
 * byte of the offending sequence. max_len must not be negative.
 */
static const char *
fast_validate_len(const char *str, ssize_t max_len)
{
    assert(max_len >= 0);

    const unsigned char *const begin = reinterpret_cast<const unsigned char *>(str);
    const unsigned char *cursor = begin;

    while ((cursor - begin) < max_len && *cursor != 0) {
        const unsigned char lead = *cursor;

        /* Plain ASCII: nothing to decode. */
        if (lead < 0x80) {
            ++cursor;
            continue;
        }

        const char *const sequence_start = reinterpret_cast<const char *>(cursor);
        /* Bytes left in the window, lead byte included. */
        const ssize_t available = max_len - (cursor - begin);

        /* 110xxxxx: two-byte sequence. */
        if ((lead & 0xe0) == 0xc0) {
            if (UNLIKELY(available < 2))
                return sequence_start;
            /* No payload bit above the lowest one: overlong encoding. */
            if (UNLIKELY((lead & 0x1e) == 0))
                return sequence_start;
            if (UNLIKELY((cursor[1] & 0xc0) != 0x80)) /* 10xxxxxx */
                return sequence_start;
            cursor += 2;
            continue;
        }

        int trailing;        /* continuation bytes still expected */
        char32_t lowest;     /* smallest code point allowed for this length */
        char32_t code_point; /* value decoded so far */

        if ((lead & 0xf0) == 0xe0) {        /* 1110xxxx */
            if (UNLIKELY(available < 3))
                return sequence_start;
            trailing = 2;
            lowest = (1 << 11);
            code_point = lead & 0x0f;
        } else if ((lead & 0xf8) == 0xf0) { /* 11110xxx */
            if (UNLIKELY(available < 4))
                return sequence_start;
            trailing = 3;
            lowest = (1 << 16);
            code_point = lead & 0x07;
        } else {
            return sequence_start;
        }

        /* The length checks above guarantee these reads stay in the window. */
        for (int i = 1; i <= trailing; ++i) {
            const unsigned char byte = cursor[i];
            if ((byte & 0xc0) != 0x80) /* 10xxxxxx */
                return sequence_start;
            code_point = (code_point << 6) | (byte & 0x3f);
        }

        if (UNLIKELY(code_point < lowest))
            return sequence_start;
        if (UNLIKELY(!UNICODE_VALID(code_point)))
            return sequence_start;

        cursor += trailing + 1;
    }

    return reinterpret_cast<const char *>(cursor);
}
/**
 * utf8_validate_c_str:
 * @str: a pointer to character data
 * @max_len: number of bytes to validate, or a negative value to read up to
 *           the terminating nul
 * @end: optional return location for the end of the valid data
 *
 * Checks that @str holds well-formed UTF-8.
 *
 * With a negative @max_len the text must be nul-terminated and is scanned
 * up to that terminator. Otherwise exactly @max_len bytes are expected to
 * be valid; a nul byte found among them makes the function return %false.
 *
 * When @end is non-%NULL it receives the position where scanning stopped:
 * the first byte of the first invalid sequence, or the end of the examined
 * text when nothing was wrong.
 *
 * Dbus requires valid UTF-8 as input and sip packets should also be encoded
 * in utf8, so data read from a file or the network should be checked before
 * doing anything else with it.
 *
 * Returns: true if the text was valid UTF-8
 */
bool
utf8_validate_c_str(const char *str, ssize_t max_len, const char **end)
{
    const bool bounded = (max_len >= 0);
    const char *const stop = bounded ? fast_validate_len(str, max_len)
                                     : fast_validate(str);

    if (end != nullptr)
        *end = stop;

    /* Bounded: every one of the max_len bytes must have been consumed. */
    if (bounded)
        return stop == str + max_len;

    /* Unbounded: the scan must have reached the terminator. */
    return *stop == '\0';
}
bool
utf8_validate(const std::string & str)
{
const char *p;
p = fast_validate(str.c_str());
return (*p == '\0');
}
/**
 * utf8_make_valid:
 * @name: the byte string to sanitize
 *
 * Builds a valid UTF-8 string from @name: every run of valid bytes is copied
 * unchanged and every byte at which validation fails (embedded nul bytes
 * included) is replaced by U+FFFD REPLACEMENT CHARACTER.
 *
 * The result is accumulated in a std::string so that the output of earlier
 * iterations is kept when several invalid bytes are present, and so that no
 * manually managed buffer can leak.
 *
 * Returns: a valid UTF-8 string; a copy of @name when it was already valid.
 */
std::string
utf8_make_valid(const std::string & name)
{
    /* UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER. */
    static const char REPLACEMENT_CHARACTER[] = "\357\277\275";
    static const size_t REPLACEMENT_SIZE = 3;

    ssize_t remaining_bytes = name.size();
    const char *remainder = name.c_str();
    const char *invalid = nullptr;
    std::string answer;

    while (remaining_bytes != 0) {
        if (utf8_validate_c_str(remainder, remaining_bytes, &invalid))
            break;

        /* First defect found: size the output once, up front. */
        if (answer.empty())
            answer.reserve(name.size() + REPLACEMENT_SIZE);

        const ssize_t valid_bytes = invalid - remainder;

        /* Keep the valid prefix, then stand in for the single bad byte. */
        answer.append(remainder, valid_bytes);
        answer.append(REPLACEMENT_CHARACTER, REPLACEMENT_SIZE);

        remaining_bytes -= valid_bytes + 1;
        remainder = invalid + 1;
    }

    /* Nothing was replaced (a replacement always adds bytes): the input was
     * already valid. */
    if (answer.empty())
        return name;

    /* Whatever is left after the last replacement is valid; keep it. */
    answer.append(remainder, remaining_bytes);

    assert(utf8_validate_c_str(answer.c_str(), -1, nullptr));

    return answer;
}
/*
* Copyright (C) 1999 Tom Tromey
* Copyright (C) 2000 Red Hat, Inc.
* Copyright (C) 2014 Savoir-Faire Linux Inc.
*
* Author: Pascal Potvin <pascal.potvin@extenway.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Additional permission under GNU GPL version 3 section 7:
*
* If you modify this program, or any covered work, by linking or
* combining it with the OpenSSL project's OpenSSL library (or a
* modified version of that library), containing parts covered by the
* terms of the OpenSSL or SSLeay licenses, Savoir-Faire Linux Inc.
* grants you additional permission to convey the resulting work.
* Corresponding Source for a non-source form of such a combination
* shall include the source code for the parts of OpenSSL used as well
* as that of the covered work.
*/
#ifndef H_UTF8_UTILS
#define H_UTF8_UTILS
#include <cstdlib>
#include <string>
/**
 * utf8_validate:
 * @str: the text to validate.
 *
 * Validates UTF-8 encoded text.
 *
 * Dbus requires valid UTF-8 as input; sip packets should also be encoded in
 * utf8; so data read from a file or the network should be checked with
 * utf8_validate() before doing anything else with it.
 *
 * Returns: true if the text was valid UTF-8
 */
bool
utf8_validate(const std::string & str);
/**
 * utf8_make_valid:
 * @name: the string to sanitize; its bytes may or may not be valid UTF-8.
 *
 * Transforms an arbitrary byte string into a valid utf8 encoded std::string.
 * Every unreadable or invalid byte will be transformed into U+FFFD
 * (REPLACEMENT CHARACTER).
 *
 * Returns: a valid utf8 string.
 */
std::string
utf8_make_valid(const std::string & name);
#endif // H_UTF8_UTILS
......@@ -384,7 +384,8 @@ void SIPTest::testParseDisplayName()
{"\nFrom: <sip:pinger@sipwise.local>;tag=01f516a4", ""},
{"\nFrom: sip:pinger@sipwise.local;tag=01f516a4", ""},
{"\nFrom: ", ""},
{"\nFrom: \"±\"", ""},
{"\nFrom: \"\xb1""Alejandro P\xc3\xa9rez\" <sip:1111@10.0.0.1>;tag=3a7516a63bdbo0", "\xef\xbf\xbd""Alejandro P\xc3\xa9rez"},
{"\nFrom: \"Alejandro P\xc3\xa9rez\" <sip:1111@10.0.0.1>;tag=3a7516a63bdbo0", "Alejandro P\xc3\xa9rez"},
{"\nFrom: sip:+1212555@server.example.com;tag=887s", ""}};
for (const auto &t : test_set) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment