// -*- mode: C++; indent-tabs-mode: nil; c-basic-offset: 2; -*-
#ifndef   __ustring_hh
#define   __ustring_hh

/*! @file ustring.hh
  @brief Provides a simple UTF-8 encoded string
*/

extern "C" {
#ifdef HAVE_STDINT_H
#  include <stdint.h>
#else
#  ifdef HAVE_SYS_TYPES_H
#    include <sys/types.h>
#  endif
#endif
}

#include <iterator>
#include <string>

namespace otk {

//! The number of bytes to skip to find the next character in the string
/*!
  Indexed by the value of a byte taken as an unsigned char. Bytes 0x00-0xBF
  map to 1, 0xC0-0xDF to 2, 0xE0-0xEF to 3, 0xF0-0xF7 to 4, 0xF8-0xFB to 5,
  0xFC-0xFD to 6, and 0xFE-0xFF to 1.
*/
const char g_utf8_skip[256] = {
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};

//! A single Unicode code point, stored in 32 bits
#ifdef HAVE_STDINT_H
typedef uint32_t unichar;
#else
typedef u_int32_t unichar;
#endif

//! The iterator type for ustring
/*!
  Note this is not a random access iterator but a bidirectional one, since all
  index operations need to iterate over the UTF-8 data. Use std::advance() to
  move to a certain position.

  A writeable iterator isn't provided because: The number of bytes of the old
  UTF-8 character and the new one to write could be different. Therefore, any
  write operation would invalidate all other iterators pointing into the same
  string.

  @tparam T The underlying byte iterator type (std::string::iterator or
            std::string::const_iterator).
*/
template <class T>
class ustring_Iterator
{
public:
  typedef std::bidirectional_iterator_tag   iterator_category;
  typedef unichar                           value_type;
  typedef std::string::difference_type      difference_type;
  // Dereferencing yields a decoded value, not a reference into the string.
  typedef value_type                        reference;
  typedef void                              pointer;

  //! Constructs an iterator holding a value-initialized underlying iterator
  inline ustring_Iterator() : _pos() {}

  //! Constructs from an iterator over a mutable string
  /*!
    For T = std::string::iterator this is the copy constructor; for
    T = std::string::const_iterator it converts a mutable iterator into a
    const one.
  */
  inline ustring_Iterator(const ustring_Iterator<std::string::iterator>& other)
    : _pos(other.base()) {}

  //! Decodes and returns the character at the iterator's position
  /*!
    The encoding is not validated: the lead byte selects a length through
    g_utf8_skip and that many bytes are read. The iterator must therefore
    point at the start of a complete character.
  */
  inline value_type operator*() const {
    // walk a copy of the position so the iterator itself is left untouched
    std::string::const_iterator pos = _pos;

    unichar result = static_cast<unsigned char>(*pos);
    // if its not a 7-bit ascii character
    if ((result & 0x80) != 0) {
      // len is the number of bytes this character takes up in the string
      unsigned char len = g_utf8_skip[result];
      // strip the length marker bits from the lead byte
      result &= 0x7F >> len;

      // every following byte contributes its low 6 bits
      while (--len != 0) {
        result <<= 6;
        result |= static_cast<unsigned char>(*++pos) & 0x3F;
      }
    }

    return result;
  }

  //! Advances to the next character, using g_utf8_skip on the current byte
  inline ustring_Iterator& operator++() {
    _pos += g_utf8_skip[static_cast<unsigned char>(*_pos)];
    return *this;
  }

  //! Moves back to the previous character
  /*!
    Steps backwards one byte at a time for as long as the byte reached is a
    continuation byte (bit pattern 10xxxxxx).
  */
  inline ustring_Iterator& operator--() {
    do {
      --_pos;
    } while ((static_cast<unsigned char>(*_pos) & 0xC0) == 0x80);
    return *this;
  }

  //! Wraps an iterator of the underlying byte string
  explicit inline ustring_Iterator(T pos) : _pos(pos) {}

  //! Returns the underlying byte iterator
  inline T base() const { return _pos; }

private:
  T _pos;  //!< Position within the underlying byte string
};


//! This class provides a simple wrapper to a std::string that is encoded as
//! UTF-8.
/*!
  This class does not handle extended 8-bit ASCII charsets like ISO-8859-1.

  More info on Unicode and UTF-8 can be found here:
  http://www.cl.cam.ac.uk/~mgk25/unicode.html

  This does not subclass std::string, because std::string was intended to be a
  final class.  For instance, it does not have a virtual destructor.

  The member functions are only declared in this header.
*/
class ustring {
  std::string _string;  //!< The wrapped string

public:
  typedef std::string::size_type                        size_type;
  typedef std::string::difference_type                  difference_type;

  typedef unichar                                       value_type;
  typedef unichar &                                     reference;
  typedef const unichar &                               const_reference;

  typedef ustring_Iterator<std::string::iterator>       iterator;
  typedef ustring_Iterator<std::string::const_iterator> const_iterator;

  static const size_type npos = std::string::npos;

  ustring();
  ~ustring();

  // make new strings

  ustring(const ustring& other);
  ustring& operator=(const ustring& other);
  ustring(const std::string& src);
  // declared unqualified: a qualified name is not allowed on a member
  // declaration inside its own class body
  ustring(const char* src);
};

}

#endif // __ustring_hh