diff --git a/daemon/src/Makefile.am b/daemon/src/Makefile.am index 76d37fe670e7342ec36e40f32d84f0cf3e62a23c..184241a6b8e790c493e284299faefb1b3a6d04ba 100644 --- a/daemon/src/Makefile.am +++ b/daemon/src/Makefile.am @@ -91,6 +91,7 @@ libsflphone_la_SOURCES = conference.cpp \ fileutils.cpp \ threadloop.cpp \ ip_utils.cpp \ + utf8_utils.cpp \ threadloop.h \ conference.h \ voiplink.h \ @@ -105,6 +106,7 @@ libsflphone_la_SOURCES = conference.cpp \ numbercleaner.h \ fileutils.h \ noncopyable.h \ + utf8_utils.h \ sfl_types.h \ array_size.h \ account_schema.h \ diff --git a/daemon/src/sip/sip_utils.cpp b/daemon/src/sip/sip_utils.cpp index e81d38753e9dc9f0c0c32f4ed3acf65fe186c1c0..9650104cceb1ab5a828dd8535b866b1f6fdc7c34 100644 --- a/daemon/src/sip/sip_utils.cpp +++ b/daemon/src/sip/sip_utils.cpp @@ -31,6 +31,7 @@ #include "sip_utils.h" #include "logger.h" +#include "utf8_utils.h" #include <pjsip.h> #include <pjsip_ua.h> @@ -96,14 +97,6 @@ sip_utils::createRouteSet(const std::string &route, pj_pool_t *hdr_pool) return route_set; } -static bool -isValidUtf8(const std::string &str) -{ - std::wstring ws(str.size(), u' '); - const size_t wideSize = mbstowcs(&ws[0], str.c_str(), str.size()); - return wideSize != std::wstring::npos; -} - // FIXME: replace with regex std::string sip_utils::parseDisplayName(const char * buffer) @@ -148,10 +141,9 @@ sip_utils::parseDisplayName(const char * buffer) std::string displayName = temp.substr(begin_displayName + 1, end_displayName - begin_displayName - 1); - // Filter out invalid UTF-8 sequences to avoid getting kicked from D-Bus - if (not isValidUtf8(displayName)) { - ERROR("Invalid UTF-8 sequence detected: %s", displayName.c_str()); - return ""; + // Filter out invalid UTF-8 characters to avoid getting kicked from D-Bus + if (not utf8_validate(displayName)) { + return utf8_make_valid(displayName); } static const size_t MAX_DISPLAY_NAME_SIZE = 25; diff --git a/daemon/src/utf8_utils.cpp b/daemon/src/utf8_utils.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..72af6a6d8e5d1a935f6cff0c18c60ee7b1e156a6 --- /dev/null +++ b/daemon/src/utf8_utils.cpp @@ -0,0 +1,304 @@ +/* + * Copyright (C) 1999 Tom Tromey + * Copyright (C) 2000 Red Hat, Inc. + * Copyright (C) 2014 Savoir-Faire Linux Inc. + * + * Author: Pascal Potvin <pascal.potvin@extenway.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Additional permission under GNU GPL version 3 section 7: + * + * If you modify this program, or any covered work, by linking or + * combining it with the OpenSSL project's OpenSSL library (or a + * modified version of that library), containing parts covered by the + * terms of the OpenSSL or SSLeay licenses, Savoir-Faire Linux Inc. + * grants you additional permission to convey the resulting work. + * Corresponding Source for a non-source form of such a combination + * shall include the source code for the parts of OpenSSL used as well + * as that of the covered work. + */ + + +#include <cstring> +#include <cassert> +#include "utf8_utils.h" + +/* + * The LIKELY and UNLIKELY macros let the programmer give hints to + * the compiler about the expected result of an expression. Some compilers + * can use this information for optimizations. 
+ */
+#if defined(__GNUC__) && (__GNUC__ > 2) && defined(__OPTIMIZE__)
+#define LIKELY(expr) (__builtin_expect (expr, 1))
+#define UNLIKELY(expr) (__builtin_expect (expr, 0))
+#else
+#define LIKELY(expr) (expr)
+#define UNLIKELY(expr) (expr)
+#endif
+
+
+/*
+ * Check whether a Unicode (5.2) char is in a valid range.
+ *
+ * The first check comes from the Unicode guarantee to never encode
+ * a point above 0x0010ffff, since UTF-16 couldn't represent it.
+ *
+ * The second check covers surrogate pairs (category Cs).
+ *
+ * @param Char the character
+ */
+#define UNICODE_VALID(Char) \
+    ((Char) < 0x110000 && \
+     (((Char) & 0xFFFFF800) != 0xD800))
+
+#define CONTINUATION_CHAR \
+    if ((*(unsigned char *)p & 0xc0) != 0x80) /* 10xxxxxx */ \
+        goto error; \
+    val <<= 6; \
+    val |= (*(unsigned char *)p) & 0x3f;
+
+/* Scan nul-terminated @str. Returns its terminating nul when every sequence
+ * is valid UTF-8, otherwise the first byte of the first invalid sequence. */
+static const char *
+fast_validate(const char *str)
+{
+    char32_t val = 0;
+    char32_t min = 0;
+    const char *p;
+
+    for (p = str; *p; p++) {
+        if (*(unsigned char *)p < 128)
+            /* done */;
+        else {
+            const char *last;
+
+            last = p;
+
+            if ((*(unsigned char *)p & 0xe0) == 0xc0) { /* 110xxxxx */
+                if (UNLIKELY((*(unsigned char *)p & 0x1e) == 0)) /* 0xc0/0xc1: overlong */
+                    goto error;
+
+                p++;
+
+                if (UNLIKELY((*(unsigned char *)p & 0xc0) != 0x80)) /* 10xxxxxx */
+                    goto error;
+            } else {
+                if ((*(unsigned char *)p & 0xf0) == 0xe0) { /* 1110xxxx */
+                    min = (1 << 11); /* smallest value that needs 3 bytes */
+                    val = *(unsigned char *)p & 0x0f;
+                    goto TWO_REMAINING;
+                } else if ((*(unsigned char *)p & 0xf8) == 0xf0) { /* 11110xxx */
+                    min = (1 << 16); /* smallest value that needs 4 bytes */
+                    val = *(unsigned char *)p & 0x07;
+                } else
+                    goto error;
+
+                p++;
+                CONTINUATION_CHAR;
+TWO_REMAINING:
+                p++;
+                CONTINUATION_CHAR;
+                p++;
+                CONTINUATION_CHAR;
+
+                if (UNLIKELY(val < min)) /* overlong encoding */
+                    goto error;
+
+                if (UNLIKELY(!UNICODE_VALID(val)))
+                    goto error;
+            }
+
+            continue;
+
+error:
+            return last; /* start of the offending sequence */
+        }
+    }
+
+    return p;
+}
+
+/* Same as fast_validate(), but never reads past @str + @max_len and returns
+ * @str + @max_len when all @max_len bytes are valid; a nul stops the scan. */
+static const char *
+fast_validate_len(const char *str, ssize_t max_len)
+{
+    char32_t val = 0;
+    char32_t min = 0;
+    const char *p;
+
+    assert(max_len >= 0);
+    for (p = str; ((p - str) < max_len) && *p; p++) {
+        if (*(unsigned char *)p < 128)
+            /* done */;
+        else {
+            const char *last;
+
+            last = p;
+
+            if ((*(unsigned char *)p & 0xe0) == 0xc0) { /* 110xxxxx */
+                if (UNLIKELY(max_len - (p - str) < 2)) /* sequence would cross max_len */
+                    goto error;
+
+                if (UNLIKELY((*(unsigned char *)p & 0x1e) == 0)) /* 0xc0/0xc1: overlong */
+                    goto error;
+
+                p++;
+
+                if (UNLIKELY((*(unsigned char *)p & 0xc0) != 0x80)) /* 10xxxxxx */
+                    goto error;
+            } else {
+                if ((*(unsigned char *)p & 0xf0) == 0xe0) { /* 1110xxxx */
+                    if (UNLIKELY(max_len - (p - str) < 3)) /* sequence would cross max_len */
+                        goto error;
+
+                    min = (1 << 11); /* smallest value that needs 3 bytes */
+                    val = *(unsigned char *)p & 0x0f;
+                    goto TWO_REMAINING;
+                } else if ((*(unsigned char *)p & 0xf8) == 0xf0) { /* 11110xxx */
+                    if (UNLIKELY(max_len - (p - str) < 4)) /* sequence would cross max_len */
+                        goto error;
+
+                    min = (1 << 16); /* smallest value that needs 4 bytes */
+                    val = *(unsigned char *)p & 0x07;
+                } else
+                    goto error;
+
+                p++;
+                CONTINUATION_CHAR;
+TWO_REMAINING:
+                p++;
+                CONTINUATION_CHAR;
+                p++;
+                CONTINUATION_CHAR;
+
+                if (UNLIKELY(val < min)) /* overlong encoding */
+                    goto error;
+
+                if (UNLIKELY(!UNICODE_VALID(val)))
+                    goto error;
+            }
+
+            continue;
+
+error:
+            return last; /* start of the offending sequence */
+        }
+    }
+
+    return p;
+}
+
+/**
+ * utf8_validate_c_str:
+ * @str: a pointer to character data
+ * @max_len: max bytes to validate, or -1 to go until the terminating nul
+ * @end: return location for end of valid data
+ *
+ * Validates UTF-8 encoded text. @str is the text to validate;
+ * if @str is nul-terminated, then @max_len can be -1, otherwise
+ * @max_len should be the number of bytes to validate.
+ * If @end is non-%NULL, then the end of the valid range
+ * will be stored there (i.e. the start of the first invalid
+ * character if some bytes were invalid, or the end of the text
+ * being validated otherwise).
+ *
+ * Note that utf8_validate_c_str() returns false if @max_len is
+ * non-negative and any of the @max_len bytes are nul.
+ *
+ * Returns true if all of @str was valid. Dbus requires valid UTF-8 as input;
+ * sip packets should also be encoded in utf8; so data read from a file or the
+ * network should be checked with utf8_validate() before doing anything else
+ * with it.
+ *
+ * Returns: true if the text was valid UTF-8
+ */
+bool
+utf8_validate_c_str(const char *str, ssize_t max_len, const char **end)
+{
+    const char *p;
+
+    if (max_len < 0)
+        p = fast_validate(str);
+    else
+        p = fast_validate_len(str, max_len);
+
+    if (end)
+        *end = p;
+
+    if ((max_len >= 0 && p != str + max_len) ||
+            (max_len < 0 && *p != '\0'))
+        return false;
+    else
+        return true;
+}
+
+bool
+utf8_validate(const std::string & str)
+{
+    /* Check all str.size() bytes: fast_validate() alone would stop at the
+     * first nul and accept whatever follows it. An embedded nul therefore
+     * counts as invalid here, and utf8_make_valid() replaces it with
+     * U+FFFD. */
+    return utf8_validate_c_str(str.c_str(), str.size(), NULL);
+}
+
+/**
+ * utf8_make_valid:
+ * @name: bytes to sanitize; they need not be valid UTF-8
+ *
+ * Copies @name, substituting U+FFFD REPLACEMENT CHARACTER for every byte at
+ * which utf8_validate_c_str() stops (embedded nul bytes included). Validation
+ * resumes on the byte right after each substituted one.
+ *
+ * Returns: a valid UTF-8 string; a plain copy of @name if it was already
+ * valid.
+ */
+std::string
+utf8_make_valid(const std::string & name)
+{
+    /* U+FFFD REPLACEMENT CHARACTER, encoded as UTF-8 */
+    static const char replacement[] = "\357\277\275";
+
+    ssize_t remaining_bytes = name.size();
+    const char *remainder = name.c_str();
+    const char *invalid;
+    std::string answer;
+
+    while (remaining_bytes != 0) {
+        if (utf8_validate_c_str(remainder, remaining_bytes, &invalid))
+            break;
+
+        if (answer.empty())
+            // If every byte is replaced by U+FFFD, answer.size() == 3 * name.size()
+            answer.reserve(3 * name.size());
+
+        /* keep the valid prefix, then stand in for the offending byte */
+        const ssize_t valid_bytes = invalid - remainder;
+        answer.append(remainder, valid_bytes);
+        answer.append(replacement, 3);
+
+        /* resume right after the single byte that was replaced */
+        remaining_bytes -= valid_bytes + 1;
+        remainder = invalid + 1;
+    }
+
+    /* whatever is left is valid (all of @name when nothing was replaced) */
+    answer.append(remainder, remaining_bytes);
+    assert(utf8_validate_c_str(answer.c_str(), -1, NULL));
+
+    return answer;
+}
diff --git a/daemon/src/utf8_utils.h b/daemon/src/utf8_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..88a1ca921915deb2e5c74160923497f11ab47b27
--- /dev/null
+++ b/daemon/src/utf8_utils.h
@@ -0,0 +1,69 @@
+/* + * Copyright (C) 1999 Tom Tromey + * Copyright (C) 2000 Red Hat, Inc. + * Copyright (C) 2014 Savoir-Faire Linux Inc. + * + * Author: Pascal Potvin <pascal.potvin@extenway.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Additional permission under GNU GPL version 3 section 7: + * + * If you modify this program, or any covered work, by linking or + * combining it with the OpenSSL project's OpenSSL library (or a + * modified version of that library), containing parts covered by the + * terms of the OpenSSL or SSLeay licenses, Savoir-Faire Linux Inc. + * grants you additional permission to convey the resulting work. + * Corresponding Source for a non-source form of such a combination + * shall include the source code for the parts of OpenSSL used as well + * as that of the covered work. + */ + +#ifndef H_UTF8_UTILS +#define H_UTF8_UTILS + +#include <cstdlib> +#include <string> + +/** + * utf8_validate: + * + * Validates UTF-8 encoded text. @str is the text to validate; + * + * Returns true if all of @str was valid. Dbus requires valid UTF-8 as input; + * sip packets should also be encoded in utf8; so data read from a file or the + * network should be checked with utf8_validate() before doing anything else + * with it. 
+ *
+ * Returns: true if the text was valid UTF-8
+ */
+
+bool
+utf8_validate(const std::string & str);
+
+/**
+ * utf8_make_valid:
+ * @name: the string to sanitize; it need not be valid UTF-8.
+ *
+ * Transforms an unknown string into a pretty utf8 encoded std::string.
+ * Every unreadable or invalid byte will be transformed into U+FFFD
+ * (REPLACEMENT CHARACTER).
+ *
+ * Returns: a valid utf8 string.
+ */
+std::string
+utf8_make_valid(const std::string & name);
+
+#endif // H_UTF8_UTILS
diff --git a/daemon/test/siptest.cpp b/daemon/test/siptest.cpp
index 2ff65f971ca4c1c08a90fcbb7e960847e5298774..22ed5bbec1b2c6f72654aa4a503c5406bddbb9f0 100644
--- a/daemon/test/siptest.cpp
+++ b/daemon/test/siptest.cpp
@@ -384,7 +384,8 @@ void SIPTest::testParseDisplayName()
         {"\nFrom: <sip:pinger@sipwise.local>;tag=01f516a4", ""},
         {"\nFrom: sip:pinger@sipwise.local;tag=01f516a4", ""},
         {"\nFrom: ", ""},
-        {"\nFrom: \"±\"", ""},
+        {"\nFrom: \"\xb1""Alejandro P\xc3\xa9rez\" <sip:1111@10.0.0.1>;tag=3a7516a63bdbo0", "\xef\xbf\xbd""Alejandro P\xc3\xa9rez"},
+        {"\nFrom: \"Alejandro P\xc3\xa9rez\" <sip:1111@10.0.0.1>;tag=3a7516a63bdbo0", "Alejandro P\xc3\xa9rez"},
         {"\nFrom: sip:+1212555@server.example.com;tag=887s", ""}};
 
     for (const auto &t : test_set) {