/* * Copyright (C) 1999 Tom Tromey * Copyright (C) 2000 Red Hat, Inc. * Copyright (C) 2004-2020 Savoir-faire Linux Inc. * * Author: Pascal Potvin <pascal.potvin@extenway.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ #include <cstring> #include <cassert> #include "utf8_utils.h" #if defined(_MSC_VER) #include <BaseTsd.h> using ssize_t = SSIZE_T; #endif /* * The LIKELY and UNLIKELY macros let the programmer give hints to * the compiler about the expected result of an expression. Some compilers * can use this information for optimizations. */ #if defined(__GNUC__) && (__GNUC__ > 2) && defined(__OPTIMIZE__) #define LIKELY(expr) (__builtin_expect(expr, 1)) #define UNLIKELY(expr) (__builtin_expect(expr, 0)) #else #define LIKELY(expr) (expr) #define UNLIKELY(expr) (expr) #endif /* * Check whether a Unicode (5.2) char is in a valid range. * * The first check comes from the Unicode guarantee to never encode * a point above 0x0010ffff, since UTF-16 couldn't represent it. * * The second check covers surrogate pairs (category Cs). * * @param Char the character */ #define UNICODE_VALID(Char) ((Char) < 0x110000 && (((Char) &0xFFFFF800) != 0xD800)) #define CONTINUATION_CHAR \ if ((*(unsigned char*) p & 0xc0) != 0x80) /* 10xxxxxx */ \ goto error; \ val <<= 6; \ val |= (*(unsigned char*) p) & 0x3f; namespace jami { bool utf8_validate_c_str(const char* str, ssize_t max_len, const char** end); static const char* fast_validate(const char* str) { char32_t val = 0; char32_t min = 0; const char* p; for (p = str; *p; p++) { if (*(unsigned char*) p < 128) /* done */; else { const char* last; last = p; if ((*(unsigned char*) p & 0xe0) == 0xc0) { /* 110xxxxx */ if (UNLIKELY((*(unsigned char*) p & 0x1e) == 0)) goto error; p++; if (UNLIKELY((*(unsigned char*) p & 0xc0) != 0x80)) /* 10xxxxxx */ goto error; } else { if ((*(unsigned char*) p & 0xf0) == 0xe0) { /* 1110xxxx */ min = (1 << 11); val = *(unsigned char*) p & 0x0f; goto TWO_REMAINING; } else if ((*(unsigned char*) p & 0xf8) == 0xf0) { /* 11110xxx */ min = (1 << 16); val = *(unsigned char*) p & 0x07; } else goto error; p++; CONTINUATION_CHAR; TWO_REMAINING: p++; CONTINUATION_CHAR; p++; CONTINUATION_CHAR; if (UNLIKELY(val < min)) goto error; if (UNLIKELY(!UNICODE_VALID(val))) goto error; } continue; error: return last; } } return p; } static const char* fast_validate_len(const char* str, ssize_t max_len) { char32_t val = 0; char32_t min = 0; const char* p; assert(max_len >= 0); for (p = str; ((p - str) < max_len) && *p; p++) { if (*(unsigned char*) p < 128) /* done */; else { const char* last; last = p; if ((*(unsigned char*) p & 0xe0) == 0xc0) { /* 110xxxxx */ if (UNLIKELY(max_len - (p - str) < 2)) goto error; if (UNLIKELY((*(unsigned char*) p & 0x1e) == 0)) goto error; p++; if (UNLIKELY((*(unsigned char*) p & 0xc0) != 0x80)) /* 10xxxxxx */ goto error; } else { if ((*(unsigned char*) p & 0xf0) == 0xe0) { /* 1110xxxx */ if (UNLIKELY(max_len - (p - str) < 3)) goto error; min = (1 << 11); val = *(unsigned char*) p & 0x0f; goto TWO_REMAINING; } else if ((*(unsigned char*) p & 0xf8) == 0xf0) { /* 11110xxx */ if (UNLIKELY(max_len - (p - str) < 4)) goto error; min = (1 << 16); val = *(unsigned char*) p & 0x07; } else goto error; p++; CONTINUATION_CHAR; TWO_REMAINING: p++; CONTINUATION_CHAR; p++; CONTINUATION_CHAR; if (UNLIKELY(val < min)) goto error; if (UNLIKELY(!UNICODE_VALID(val))) goto error; } continue; error: return last; } } return p; } /** * utf8_validate_c_str: * @str: a pointer to character data * @max_len: max bytes to validate, or -1 to go until NULL * @end: return location for end of valid data * * Validates UTF-8 encoded text. @str is the text to validate; * if @str is nul-terminated, then @max_len can be -1, otherwise * @max_len should be the number of bytes to validate. * If @end is non-%NULL, then the end of the valid range * will be stored there (i.e. the start of the first invalid * character if some bytes were invalid, or the end of the text * being validated otherwise). * * Note that utf8_validate() returns %false if @max_len is * positive and any of the @max_len bytes are nul. * * Returns true if all of @str was valid. Dbus requires valid UTF-8 as input; * sip packets should also be encoded in utf8; so data read from a file or the * network should be checked with utf8_validate() before doing anything else * with it. * * Returns: true if the text was valid UTF-8 */ bool utf8_validate_c_str(const char* str, ssize_t max_len, const char** end) { const char* p; if (max_len < 0) p = fast_validate(str); else p = fast_validate_len(str, max_len); if (end) *end = p; if ((max_len >= 0 && p != str + max_len) || (max_len < 0 && *p != '\0')) return false; else return true; } bool utf8_validate(const std::string& str) { const char* p; p = fast_validate(str.c_str()); return (*p == '\0'); } std::string utf8_make_valid(const std::string& name) { ssize_t remaining_bytes = name.size(); ssize_t valid_bytes; const char* remainder = name.c_str(); const char* invalid; char* str = NULL; char* pos; while (remaining_bytes != 0) { if (utf8_validate_c_str(remainder, remaining_bytes, &invalid)) break; valid_bytes = invalid - remainder; if (str == NULL) // If every byte is replaced by U+FFFD, max(strlen(string)) == 3 * name.size() str = new char[3 * remaining_bytes]; pos = str; strncpy(pos, remainder, valid_bytes); pos += valid_bytes; /* append U+FFFD REPLACEMENT CHARACTER */ pos[0] = '\357'; pos[1] = '\277'; pos[2] = '\275'; pos += 3; remaining_bytes -= valid_bytes + 1; remainder = invalid + 1; } if (str == NULL) return std::string(name); strncpy(pos, remainder, remaining_bytes); pos += remaining_bytes; std::string answer(str, pos - str); assert(utf8_validate_c_str(answer.c_str(), -1, NULL)); delete[] str; return answer; } } // namespace jami