summaryrefslogtreecommitdiff
path: root/textwolf/include/textwolf/charset_utf16.hpp
diff options
context:
space:
mode:
Diffstat (limited to 'textwolf/include/textwolf/charset_utf16.hpp')
-rw-r--r--textwolf/include/textwolf/charset_utf16.hpp224
1 files changed, 224 insertions, 0 deletions
diff --git a/textwolf/include/textwolf/charset_utf16.hpp b/textwolf/include/textwolf/charset_utf16.hpp
new file mode 100644
index 0000000..576c202
--- /dev/null
+++ b/textwolf/include/textwolf/charset_utf16.hpp
@@ -0,0 +1,224 @@
+/*
+---------------------------------------------------------------------
+ The template library textwolf implements an input iterator on
+ a set of XML path expressions without backward references on an
+ STL conforming input iterator as source. It does no buffering
+ or read ahead and is dedicated for stream processing of XML
+ for a small set of XML queries.
+ Stream processing in this context refers to processing the
+ document without buffering anything but the current result token
+ processed with its tag hierarchy information.
+
+ Copyright (C) 2010,2011,2012,2013,2014 Patrick Frey
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 3.0 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+--------------------------------------------------------------------
+
+ The latest version of textwolf can be found at 'http://github.com/patrickfrey/textwolf'
+ For documentation see 'http://patrickfrey.github.com/textwolf'
+
+--------------------------------------------------------------------
+*/
+/// \file textwolf/charset_utf16.hpp
+/// \brief Definition of UTF-16 encodings
+
+#ifndef __TEXTWOLF_CHARSET_UTF16_HPP__
+#define __TEXTWOLF_CHARSET_UTF16_HPP__
+#include "textwolf/char.hpp"
+#include "textwolf/charset_interface.hpp"
+#include "textwolf/exception.hpp"
+#include <cstddef>
+
+namespace textwolf {
+namespace charset {
+
+/// \class UTF16
+/// \brief Character set UTF16 (little/big endian)
+/// \tparam encoding ByteOrder::LE or ByteOrder::BE
+/// \remark BOM character sequences are not interpreted as such and byte swapping is not done implicitly
+/// It is left to the caller to detect BOM or its inverse and to switch the iterator.
+/// \remark See http://en.wikipedia.org/wiki/UTF-16/UCS-2: ... If the endian architecture of the decoder
+/// matches that of the encoder, the decoder detects the 0xFEFF value, but an opposite-endian decoder
+/// interprets the BOM as the non-character value U+FFFE reserved for this purpose. This incorrect
+/// result provides a hint to perform byte-swapping for the remaining values. If the BOM is missing,
+/// the standard says that big-endian encoding should be assumed....
+template <int encoding=ByteOrder::BE>
+class UTF16
+{
+private:
+ enum
+ {
+ LSB=(encoding==ByteOrder::BE), ///< index of the least significant byte within a 2 byte code unit (0 or 1)
+ MSB=(encoding==ByteOrder::LE), ///< index of the most significant byte within a 2 byte code unit (0 or 1)
+ Print1shift=(encoding==ByteOrder::BE)?8:0, ///< right shift that yields the 1st byte to print of a 16 bit code unit
+ Print2shift=(encoding==ByteOrder::LE)?8:0 ///< right shift that yields the 2nd byte to print of a 16 bit code unit
+ };
+
+public:
+ enum
+ {
+ MaxChar=0x10FFFF ///< maximum character in alphabet
+ };
+
+public:
+ /// \brief Read from 'itr' until the first 2 bytes (one code unit) are in 'buf'. See template<class Iterator>Interface::fetchbytes(char*,unsigned int&,Iterator&)
+ template <class Iterator>
+ static inline void fetchbytes( char* buf, unsigned int& bufpos, Iterator& itr)
+ {
+ if (bufpos<2) // nothing to do if the first code unit has already been fetched
+ {
+ if (bufpos<1)
+ {
+ buf[0] = *itr;
+ ++itr;
+ ++bufpos;
+ }
+ buf[1] = *itr;
+ ++itr;
+ ++bufpos;
+ }
+ }
+
+ /// \brief Get the size of the current character in bytes (variable length encoding): 4 if the first code unit is a high surrogate, 2 otherwise
+ /// \param [in] buf buffer for the character data (the first 2 bytes are fetched into it if not yet there)
+ /// \param [in,out] bufpos position in 'buf' (number of bytes fetched so far)
+ /// \param [in,out] itr iterator (advanced by the number of bytes fetched)
+ template <class Iterator>
+ static inline unsigned int size( char* buf, unsigned int& bufpos, Iterator& itr)
+ {
+ fetchbytes( buf, bufpos, itr);
+
+ UChar rt = (unsigned char)buf[ MSB];
+ if ((rt - 0xD8) > 0x03) // true unless MSB is in 0xD8..0xDB; NOTE(review): relies on UChar being unsigned (wrap around below 0xD8) - confirm in textwolf/char.hpp
+ {
+ return 2;
+ }
+ else
+ {
+ return 4;
+ }
+ }
+
+ /// \brief Advance 'itr' and 'bufpos' to the end of the current character without copying the remaining bytes into 'buf'. See template<class Iterator>Interface::skip(char*,unsigned int&,Iterator&)
+ template <class Iterator>
+ static inline void skip( char* buf, unsigned int& bufpos, Iterator& itr)
+ {
+ unsigned int bufsize = size( buf, bufpos, itr);
+ for (;bufpos < bufsize; ++bufpos)
+ {
+ ++itr;
+ }
+ }
+
+ /// \brief Decode the current character and return it as char if it is <= 127, else -1. See template<class Iterator>Interface::asciichar(char*,unsigned int&,Iterator&)
+ template <class Iterator>
+ static inline signed char asciichar( char* buf, unsigned int& bufpos, Iterator& itr)
+ {
+ UChar ch = value_impl( buf, bufpos, itr);
+ return (ch > 127)?-1:(char)ch;
+ }
+
+ /// \brief Decode the current character (fetching missing bytes from 'itr'); returns 0xFFFF if a high surrogate is not followed by a low surrogate. See template<class Iterator>Interface::value(char*,unsigned int&,Iterator&)
+ template <class Iterator>
+ static UChar value_impl( char* buf, unsigned int& bufpos, Iterator& itr)
+ {
+ unsigned int bufsize = size( buf, bufpos, itr);
+
+ UChar rt = (unsigned char)buf[ MSB];
+ rt = (rt << 8) + (unsigned char)buf[ LSB];
+
+ if (bufsize == 4)
+ {
+ // surrogate pair: fetch the second code unit (bytes 2 and 3) into 'buf'
+ while (bufpos < bufsize)
+ {
+ buf[bufpos] = *itr;
+ ++itr;
+ ++bufpos;
+ }
+ rt -= 0xD800; // payload of the high surrogate ...
+ rt *= 0x400; // ... forms the upper bits of the result (multiplied by 0x400)
+ unsigned short lo = (unsigned char)buf[ 2+MSB];
+ if (lo < 0xDC || lo > 0xDF) return 0xFFFF; // not a low surrogate; explicit range check because 'lo - 0xDC' is computed as signed int and would let values below 0xDC pass
+ lo = (lo << 8) + (unsigned char)buf[ 2+LSB];
+ return rt + lo - 0xDC00 + 0x010000;
+ }
+ return rt;
+ }
+
+ template <class Iterator>
+ inline UChar value( char* buf, unsigned int& bufpos, Iterator& itr) const // non-static entry point, forwards to value_impl
+ {
+ return value_impl( buf, bufpos, itr);
+ }
+
+ /// \brief Append the encoding of 'ch' to 'buf' with push_back, byte order according to 'encoding'. See template<class Buffer>Interface::print(UChar,Buffer&)
+ template <class Buffer_>
+ void print( UChar ch, Buffer_& buf) const
+ {
+ if (ch <= 0xFFFF)
+ {
+ if ((ch - 0xD800) < 0x400) // NOTE(review): only 0xD800..0xDBFF is caught here, a lone low surrogate (0xDC00..0xDFFF) is printed as is - confirm this is intended
+ {
+ //... reserved for encoding of characters in range [0x10000..0x10FFFF]; handled by the fallback at the end of this function
+ }
+ else
+ {
+ buf.push_back( (char)(unsigned char)((ch >> Print1shift) & 0xFF));
+ buf.push_back( (char)(unsigned char)((ch >> Print2shift) & 0xFF));
+ return;
+ }
+ }
+ else if (ch <= 0x10FFFF)
+ {
+ ch -= 0x10000; // encode as surrogate pair: quotient and remainder of division by 0x400
+ unsigned short hi = (ch / 0x400) + 0xD800;
+ unsigned short lo = (ch % 0x400) + 0xDC00;
+ buf.push_back( (char)(unsigned char)((hi >> Print1shift) & 0xFF));
+ buf.push_back( (char)(unsigned char)((hi >> Print2shift) & 0xFF));
+ buf.push_back( (char)(unsigned char)((lo >> Print1shift) & 0xFF));
+ buf.push_back( (char)(unsigned char)((lo >> Print2shift) & 0xFF));
+ return;
+ }
+ char tb[ 32]; // fallback for values not representable: reserved range above or ch > 0x10FFFF
+ char* cc = tb;
+ Encoder::encode( ch, tb, sizeof(tb)); // NOTE(review): Encoder is declared outside this file; presumably writes a zero terminated substitute text for 'ch' into 'tb' - confirm
+ while (*cc)
+ {
+ buf.push_back( (char)(unsigned char)(((UChar)*cc >> Print1shift) & 0xFF)); // each char of 'tb' is written as one 2 byte code unit
+ buf.push_back( (char)(unsigned char)(((UChar)*cc >> Print2shift) & 0xFF));
+ ++cc;
+ }
+ }
+
+ /// \brief Always true: the arguments are not inspected. See template<class Buffer>Interface::is_equal( const Interface&, const Interface&)
+ static inline bool is_equal( const UTF16&, const UTF16&)
+ {
+ return true;
+ }
+};
+
+/// \class UTF16LE
+/// \brief UTF-16 little endian character set encoding (UTF16 instantiated with ByteOrder::LE, no additional members)
+struct UTF16LE :public UTF16<ByteOrder::LE> {};
+/// \class UTF16BE
+/// \brief UTF-16 big endian character set encoding (UTF16 instantiated with ByteOrder::BE, no additional members)
+struct UTF16BE :public UTF16<ByteOrder::BE> {};
+
+}//namespace
+}//namespace
+#endif
+