Skip to content
Snippets Groups Projects
Select Git revision
  • 9a36c366a88be1ddf1f81b699ac0eeaecc6d4cdd
  • master default protected
  • release/202005
  • release/202001
  • release/201912
  • release/201911
  • release/releaseWindowsTestOne
  • release/windowsReleaseTest
  • release/releaseTest
  • release/releaseWindowsTest
  • release/201910
  • release/qt/201910
  • release/windows-test/201910
  • release/201908
  • release/201906
  • release/201905
  • release/201904
  • release/201903
  • release/201902
  • release/201901
  • release/201812
  • 4.0.0
  • 2.2.0
  • 2.1.0
  • 2.0.1
  • 2.0.0
  • 1.4.1
  • 1.4.0
  • 1.3.0
  • 1.2.0
  • 1.1.0
31 results

utf8_utils.cpp

Blame
  • Code owners
    Assign users and groups as approvers for specific file changes. Learn more.
    utf8_utils.cpp 8.10 KiB
    /*
     *  Copyright (C) 1999 Tom Tromey
     *  Copyright (C) 2000 Red Hat, Inc.
     *  Copyright (C) 2014-2017 Savoir-faire Linux Inc.
     *
     *  Author: Pascal Potvin <pascal.potvin@extenway.com>
     *
     *  This program is free software; you can redistribute it and/or modify
     *  it under the terms of the GNU General Public License as published by
     *  the Free Software Foundation; either version 3 of the License, or
     *  (at your option) any later version.
     *
     *  This program is distributed in the hope that it will be useful,
     *  but WITHOUT ANY WARRANTY; without even the implied warranty of
     *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     *  GNU General Public License for more details.
     *
     *  You should have received a copy of the GNU General Public License
     *  along with this program; if not, write to the Free Software
     *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA.
     */
    
    
    #include <cstring>
    #include <cassert>
    #include "utf8_utils.h"
    
    /* MSVC builds: pull SSIZE_T from <BaseTsd.h> and expose it under the
     * name ssize_t, which the function signatures below rely on. */
    #if defined(_MSC_VER)
    #include <BaseTsd.h>
    using ssize_t = SSIZE_T;
    #endif
    
    /*
     * The LIKELY and UNLIKELY macros let the programmer give hints to
     * the compiler about the expected result of an expression. Some compilers
     * can use this information for optimizations.
     *
     * The expression is normalised with `!!` before it is handed to
     * __builtin_expect: the builtin compares its first argument against the
     * expected constant (1 or 0), so a truthy value other than 1 would
     * otherwise be reported as a mispredicted hint, and a pointer argument
     * would not convert to the builtin's integer parameter at all. The fallback
     * branch applies the same normalisation so both configurations yield the
     * same 0/1 result.
     */
    #if defined(__GNUC__) && (__GNUC__ > 2) && defined(__OPTIMIZE__)
    #define LIKELY(expr) (__builtin_expect (!!(expr), 1))
    #define UNLIKELY(expr) (__builtin_expect (!!(expr), 0))
    #else
    #define LIKELY(expr) (!!(expr))
    #define UNLIKELY(expr) (!!(expr))
    #endif
    
    
    /*
     * UNICODE_VALID(Char): true when Char is an acceptable Unicode scalar value.
     *
     * Two conditions must hold:
     *   - the value is not in the surrogate block 0xD800..0xDFFF (category Cs),
     *     detected by masking off the low 11 bits and comparing with 0xD800;
     *   - the value is below 0x110000, i.e. not above U+10FFFF, the highest
     *     code point UTF-16 can represent.
     *
     * Note: Char is evaluated twice, so pass an expression without side effects.
     *
     * @param Char the code point to test
     */
    #define UNICODE_VALID(Char)                       \
        ((((Char) & 0xFFFFF800) != 0xD800) &&         \
         ((Char) < 0x110000))
    
    /*
     * CONTINUATION_CHAR: consume one UTF-8 continuation byte (10xxxxxx).
     *
     * Expects, in the enclosing scope:
     *   - `p`     : pointer to the byte to examine (not advanced by the macro),
     *   - `val`   : accumulator for the code point; shifted left by 6 and OR-ed
     *               with the low 6 bits of *p,
     *   - `error` : a label to jump to when *p is not a continuation byte.
     *
     * The body is wrapped in do { } while (0) so the macro expands to exactly
     * one statement and remains correct when used as the body of an unbraced
     * if/else. Use it followed by a semicolon: `CONTINUATION_CHAR;`.
     */
    #define CONTINUATION_CHAR                                               \
      do {                                                                  \
        if ((*(const unsigned char *)p & 0xc0) != 0x80) /* 10xxxxxx */      \
          goto error;                                                       \
        val <<= 6;                                                          \
        val |= (*(const unsigned char *)p) & 0x3f;                          \
      } while (0)
    
    namespace ring {
    
    /* Forward declaration; the definition (with its full documentation)
     * appears further down in this file. */
    bool
    utf8_validate_c_str(const char *str, ssize_t max_len, const char **end);
    
    /*
     * Scan the NUL-terminated byte string @str and return a pointer to the
     * first byte that does not begin a well-formed UTF-8 sequence, or to the
     * terminating NUL when everything before it is well-formed.
     *
     * Rejected: stray continuation bytes, lead bytes that match none of the
     * 2/3/4-byte patterns, sequences whose trailing bytes are not 10xxxxxx,
     * overlong encodings, and decoded values refused by UNICODE_VALID.
     *
     * Bytes are inspected strictly left to right and the scan stops at the
     * first mismatch, so the terminating NUL (never a continuation byte) is
     * never read past.
     */
    static const char *
    fast_validate(const char *str)
    {
        const char *cursor = str;

        while (*cursor != '\0') {
            /* Where the current sequence starts: this is what gets returned
             * if anything about the sequence turns out to be wrong. */
            const char *const seq_start = cursor;
            const unsigned char lead = static_cast<unsigned char>(*cursor);

            /* Plain ASCII: nothing to check. */
            if (lead < 128) {
                ++cursor;
                continue;
            }

            if ((lead & 0xe0) == 0xc0) { /* 110xxxxx: one trailing byte */
                /* All payload bits above the lowest one are zero: the value
                 * would have fit in a single byte (overlong). */
                if (UNLIKELY((lead & 0x1e) == 0))
                    return seq_start;

                ++cursor;

                if (UNLIKELY((static_cast<unsigned char>(*cursor) & 0xc0) != 0x80))  /* 10xxxxxx */
                    return seq_start;

                ++cursor;
                continue;
            }

            int trailing;       /* number of continuation bytes expected   */
            char32_t lowest;    /* smallest value legal for this length    */
            char32_t code;      /* code point being assembled              */

            if ((lead & 0xf0) == 0xe0) { /* 1110xxxx */
                trailing = 2;
                lowest = (1 << 11);
                code = lead & 0x0f;
            } else if ((lead & 0xf8) == 0xf0) { /* 11110xxx */
                trailing = 3;
                lowest = (1 << 16);
                code = lead & 0x07;
            } else {
                return seq_start;
            }

            for (int i = 0; i < trailing; ++i) {
                ++cursor;
                const unsigned char next = static_cast<unsigned char>(*cursor);

                if ((next & 0xc0) != 0x80) /* 10xxxxxx */
                    return seq_start;

                code <<= 6;
                code |= next & 0x3f;
            }

            /* Overlong encoding of a smaller value. */
            if (UNLIKELY(code < lowest))
                return seq_start;

            /* Surrogate or beyond U+10FFFF. */
            if (UNLIKELY(!UNICODE_VALID(code)))
                return seq_start;

            ++cursor;
        }

        return cursor;
    }
    
    /*
     * Length-bounded counterpart of fast_validate(): inspect at most @max_len
     * bytes of @str, stopping early at a NUL byte.
     *
     * Returns a pointer to the first byte that does not begin a well-formed
     * UTF-8 sequence lying entirely inside the @max_len window; otherwise the
     * position where scanning stopped (str + max_len, or the first NUL).
     *
     * @max_len must be non-negative (asserted). Before looking at any trailing
     * byte the remaining window is checked, so no byte at or beyond
     * str + max_len is read.
     */
    static const char *
    fast_validate_len(const char *str, ssize_t max_len)
    {
        assert(max_len >= 0);

        const char *cursor = str;

        while (((cursor - str) < max_len) && *cursor != '\0') {
            /* Start of the current sequence; returned on any failure. */
            const char *const seq_start = cursor;
            const unsigned char lead = static_cast<unsigned char>(*cursor);

            /* Plain ASCII: nothing to check. */
            if (lead < 128) {
                ++cursor;
                continue;
            }

            /* Bytes still inside the window, counting the lead byte. */
            const ssize_t available = max_len - (cursor - str);

            if ((lead & 0xe0) == 0xc0) { /* 110xxxxx: one trailing byte */
                if (UNLIKELY(available < 2))
                    return seq_start;

                /* Overlong: value would have fit in one byte. */
                if (UNLIKELY((lead & 0x1e) == 0))
                    return seq_start;

                ++cursor;

                if (UNLIKELY((static_cast<unsigned char>(*cursor) & 0xc0) != 0x80))  /* 10xxxxxx */
                    return seq_start;

                ++cursor;
                continue;
            }

            int trailing;       /* number of continuation bytes expected   */
            char32_t lowest;    /* smallest value legal for this length    */
            char32_t code;      /* code point being assembled              */

            if ((lead & 0xf0) == 0xe0) { /* 1110xxxx */
                if (UNLIKELY(available < 3))
                    return seq_start;

                trailing = 2;
                lowest = (1 << 11);
                code = lead & 0x0f;
            } else if ((lead & 0xf8) == 0xf0) { /* 11110xxx */
                if (UNLIKELY(available < 4))
                    return seq_start;

                trailing = 3;
                lowest = (1 << 16);
                code = lead & 0x07;
            } else {
                return seq_start;
            }

            for (int i = 0; i < trailing; ++i) {
                ++cursor;
                const unsigned char next = static_cast<unsigned char>(*cursor);

                if ((next & 0xc0) != 0x80) /* 10xxxxxx */
                    return seq_start;

                code <<= 6;
                code |= next & 0x3f;
            }

            /* Overlong encoding of a smaller value. */
            if (UNLIKELY(code < lowest))
                return seq_start;

            /* Surrogate or beyond U+10FFFF. */
            if (UNLIKELY(!UNICODE_VALID(code)))
                return seq_start;

            ++cursor;
        }

        return cursor;
    }
    
    /**
     * utf8_validate_c_str:
     * @str: pointer to the bytes to check
     * @max_len: number of bytes to check, or a negative value (conventionally
     *           -1) to scan up to the terminating NUL
     * @end: optional out-parameter; when non-null it receives the end of the
     *       valid prefix
     *
     * Checks that @str is well-formed UTF-8.
     *
     * With a negative @max_len the text must be NUL-terminated and is scanned
     * up to that terminator. With a non-negative @max_len exactly that many
     * bytes are expected to be valid; a NUL byte inside that range stops the
     * scan early and therefore makes the function return false.
     *
     * When @end is supplied it is set to the first byte of the first invalid
     * sequence, or to the position where scanning finished if nothing invalid
     * was found.
     *
     * Dbus requires valid UTF-8 as input and sip packets should also be
     * encoded in UTF-8, so data read from a file or the network should be
     * checked with this function before doing anything else with it.
     *
     * Returns: true if the text was valid UTF-8
     */
    bool
    utf8_validate_c_str(const char *str, ssize_t max_len, const char **end)
    {
        const bool bounded = (max_len >= 0);

        const char *const stop = bounded ? fast_validate_len(str, max_len)
                                         : fast_validate(str);

        if (end != nullptr)
            *end = stop;

        /* Bounded: every one of the max_len bytes must have been consumed.
         * Unbounded: the scan must have reached the terminating NUL. */
        if (bounded)
            return stop == str + max_len;

        return *stop == '\0';
    }
    
    bool
    utf8_validate(const std::string & str)
    {
        const char *p;
    
        p = fast_validate(str.c_str());
    
        return (*p == '\0');
    }
    
    /**
     * Return a copy of @name in which every byte that utf8_validate_c_str()
     * reports as the start of an invalid sequence is replaced by
     * U+FFFD REPLACEMENT CHARACTER (encoded as EF BF BD).
     *
     * The input is processed in passes: validate the remaining bytes, copy the
     * valid prefix, emit one replacement character for the single offending
     * byte, then resume right after that byte. Because validation is
     * length-bounded, an embedded NUL byte counts as invalid and is replaced
     * as well.
     *
     * The output is accumulated by appending to a std::string, so text emitted
     * for earlier invalid bytes is preserved when later ones are found, and no
     * manually managed buffer can leak.
     *
     * @param name the string to sanitize
     * @return @name unchanged if it was already valid, otherwise the repaired
     *         copy
     */
    std::string
    utf8_make_valid(const std::string & name)
    {
        /* UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER */
        static const char replacement[] = "\357\277\275";

        const char *remainder = name.data();
        ssize_t remaining_bytes = static_cast<ssize_t>(name.size());
        const char *invalid = nullptr;

        std::string answer;
        bool repaired = false;

        while (remaining_bytes != 0) {
            if (utf8_validate_c_str(remainder, remaining_bytes, &invalid))
                break;

            if (!repaired) {
                /* First problem found: the result is at least as long as the
                 * input, so reserve that much up front. */
                answer.reserve(name.size());
                repaired = true;
            }

            const ssize_t valid_bytes = invalid - remainder;

            /* Keep the valid prefix, then stand in for the offending byte. */
            answer.append(remainder, static_cast<std::string::size_type>(valid_bytes));
            answer.append(replacement, 3);

            /* Resume immediately after the single byte that was replaced. */
            remaining_bytes -= valid_bytes + 1;
            remainder = invalid + 1;
        }

        if (!repaired)
            return name;

        /* Whatever is left passed validation (or is empty). */
        answer.append(remainder, static_cast<std::string::size_type>(remaining_bytes));

        assert(utf8_validate_c_str(answer.c_str(), -1, nullptr));

        return answer;
    }
    
    } // namespace ring