summaryrefslogtreecommitdiff
path: root/textwolf
diff options
context:
space:
mode:
authorAndreas Baumann <abaumann@yahoo.com>2014-06-14 20:15:59 +0200
committerAndreas Baumann <abaumann@yahoo.com>2014-06-14 20:15:59 +0200
commit913e4215f22e16ad90a30b7e68e8cd2165c6812d (patch)
treed7aef8f6e7b29895f1b0160cb647e5427181198e /textwolf
parent4f6d08ce39cc430ed7ba90d143bf7af3fc8ca6d5 (diff)
downloadcrawler-913e4215f22e16ad90a30b7e68e8cd2165c6812d.tar.gz
crawler-913e4215f22e16ad90a30b7e68e8cd2165c6812d.tar.bz2
added textwolf and a test for it
Diffstat (limited to 'textwolf')
-rw-r--r--textwolf/.gitignore9
-rw-r--r--textwolf/README28
-rw-r--r--textwolf/include/textwolf.hpp57
-rw-r--r--textwolf/include/textwolf/char.hpp143
-rw-r--r--textwolf/include/textwolf/charset.hpp46
-rw-r--r--textwolf/include/textwolf/charset_interface.hpp146
-rw-r--r--textwolf/include/textwolf/charset_isolatin.hpp126
-rw-r--r--textwolf/include/textwolf/charset_ucs.hpp238
-rw-r--r--textwolf/include/textwolf/charset_utf16.hpp224
-rw-r--r--textwolf/include/textwolf/charset_utf8.hpp218
-rw-r--r--textwolf/include/textwolf/codepages.hpp182
-rw-r--r--textwolf/include/textwolf/cstringiterator.hpp120
-rw-r--r--textwolf/include/textwolf/exception.hpp106
-rw-r--r--textwolf/include/textwolf/istreamiterator.hpp89
-rw-r--r--textwolf/include/textwolf/sourceiterator.hpp136
-rw-r--r--textwolf/include/textwolf/staticbuffer.hpp179
-rw-r--r--textwolf/include/textwolf/textscanner.hpp225
-rw-r--r--textwolf/include/textwolf/traits.hpp65
-rw-r--r--textwolf/include/textwolf/xmlhdrparser.hpp411
-rw-r--r--textwolf/include/textwolf/xmlpathautomaton.hpp778
-rw-r--r--textwolf/include/textwolf/xmlpathautomatonparse.hpp245
-rw-r--r--textwolf/include/textwolf/xmlpathselect.hpp516
-rw-r--r--textwolf/include/textwolf/xmlprinter.hpp387
-rw-r--r--textwolf/include/textwolf/xmlscanner.hpp1355
-rw-r--r--textwolf/include/textwolf/xmltagstack.hpp146
-rw-r--r--textwolf/license.txt165
26 files changed, 6340 insertions, 0 deletions
diff --git a/textwolf/.gitignore b/textwolf/.gitignore
new file mode 100644
index 0000000..9bfe015
--- /dev/null
+++ b/textwolf/.gitignore
@@ -0,0 +1,9 @@
+src/
+tests/readStdinIterator
+tests/readStdinIterator.o
+tests/test_TextReader
+tests/test_TextReader.o
+tests/test_XMLPathSelect
+tests/test_XMLPathSelect.o
+tests/test_XMLScanner
+tests/test_XMLScanner.o
diff --git a/textwolf/README b/textwolf/README
new file mode 100644
index 0000000..66f1e0a
--- /dev/null
+++ b/textwolf/README
@@ -0,0 +1,28 @@
+Documentation
+* For using textwolf just include "include/textwolf.hpp".
+* textwolf can be compiled with the highest optimization level, especially with deep inline expansion
+* The textwolf home is at http://textwolf.net
+* A textwolf introduction can be found at http://textwolf.net/tutorial.html
+* A doxygen interface documentation is at http://patrickfrey.github.com/textwolf/html/index.html
+
+Examples
+* See the examples in tests:
+** readStdinIterator.cpp :Echoing stdin for all character sets
+** test_TextReader.cpp :Iterating on a set of generated characters and test if read/write works for all characters
+** test_XMLPathSelect.cpp :Iterating on the XML Path selected elements
+** test_XMLScanner.cpp :Iterating on the XML elements
+
+Projects using textwolf
+* textwolf is used in the wolframe project (see http://wolframe.net or http://github.com/Wolframe/Wolframe)
+
+Bugreports
+* textwolf bug reports are for the time being collected as a CSV file in BUGS.
+
+Project Schedule
+* 2014/06/12
+ Version 0.2.0
+ * Support of IsoLatin codepages besides UTF/UCS encodings
+ * Chunk by chunk feeding reimplementation (using longjmp instead of exceptions)
+
+
+
diff --git a/textwolf/include/textwolf.hpp b/textwolf/include/textwolf.hpp
new file mode 100644
index 0000000..8a71bde
--- /dev/null
+++ b/textwolf/include/textwolf.hpp
@@ -0,0 +1,57 @@
+/*
+---------------------------------------------------------------------
+ The template library textwolf implements an input iterator on
+ a set of XML path expressions without backward references on an
+ STL conforming input iterator as source. It does no buffering
+ or read ahead and is dedicated for stream processing of XML
+ for a small set of XML queries.
+ Stream processing in this context refers to processing the
+ document without buffering anything but the current result token
+ processed with its tag hierarchy information.
+
+ Copyright (C) 2010 Patrick Frey
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+--------------------------------------------------------------------
+
+ The latest version of textwolf can be found at 'http://github.com/patrickfrey/textwolf'
+ For documentation see 'http://patrickfrey.github.com/textwolf'
+
+--------------------------------------------------------------------
+*/
+
+#ifndef __TEXTWOLF_HPP__
+#define __TEXTWOLF_HPP__
+/// \file textwolf.hpp
+/// \brief Main include file
+
+#include "textwolf/char.hpp"
+#include "textwolf/exception.hpp"
+#include "textwolf/staticbuffer.hpp"
+#include "textwolf/charset_interface.hpp"
+#include "textwolf/charset.hpp"
+#include "textwolf/textscanner.hpp"
+#include "textwolf/xmlscanner.hpp"
+#include "textwolf/cstringiterator.hpp"
+#include "textwolf/sourceiterator.hpp"
+#include "textwolf/xmltagstack.hpp"
+#include "textwolf/xmlprinter.hpp"
+#include "textwolf/xmlhdrparser.hpp"
+#include "textwolf/xmlpathselect.hpp"
+
+#endif
+
+
diff --git a/textwolf/include/textwolf/char.hpp b/textwolf/include/textwolf/char.hpp
new file mode 100644
index 0000000..419cc24
--- /dev/null
+++ b/textwolf/include/textwolf/char.hpp
@@ -0,0 +1,143 @@
+/*
+---------------------------------------------------------------------
+ The template library textwolf implements an input iterator on
+ a set of XML path expressions without backward references on an
+ STL conforming input iterator as source. It does no buffering
+ or read ahead and is dedicated for stream processing of XML
+ for a small set of XML queries.
+ Stream processing in this context refers to processing the
+ document without buffering anything but the current result token
+ processed with its tag hierarchy information.
+
+ Copyright (C) 2010,2011,2012,2013,2014 Patrick Frey
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 3.0 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+--------------------------------------------------------------------
+
+ The latest version of textwolf can be found at 'http://github.com/patrickfrey/textwolf'
+ For documentation see 'http://patrickfrey.github.com/textwolf'
+
+--------------------------------------------------------------------
+*/
+/// \file textwolf/char.hpp
+/// \brief Definition of unicode characters
+#ifndef __TEXTWOLF_CHAR_HPP__
+#define __TEXTWOLF_CHAR_HPP__
+#include <cstddef>
+
+#ifdef BOOST_VERSION
+#include <boost/cstdint.hpp>
+namespace textwolf {
+ /// \typedef UChar
+ /// \brief Unicode character type (32 bit unsigned integer)
+ typedef boost::uint32_t UChar;
+ typedef boost::uint64_t EChar; ///< 64 bit unsigned integer type
+}//namespace
+#else
+#ifdef _MSC_VER
+#pragma warning(disable:4290)
+#include <BaseTsd.h>
+namespace textwolf {
+ /// \typedef UChar
+ /// \brief Unicode character type (32 bit unsigned integer)
+ typedef DWORD32 UChar;
+ typedef DWORD64 EChar; ///< 64 bit unsigned integer type
+}//namespace
+#else
+#include <stdint.h>
+namespace textwolf {
+ /// \typedef UChar
+ /// \brief Unicode character type (32 bit unsigned integer)
+ typedef uint32_t UChar;
+ typedef uint64_t EChar; ///< 64 bit unsigned integer type
+}//namespace
+#endif //_MSC_VER
+#endif //BOOST_VERSION
+
+namespace textwolf {
+/// \class CharMap
+/// \brief Character map for fast typing of a character byte
+/// \tparam RESTYPE result type of the map
+/// \tparam nullvalue_ default initialization value of the map
+/// \tparam RANGE domain of the input values of the map
+template <typename RESTYPE, RESTYPE nullvalue_, int RANGE=256>
+class CharMap
+{
+public:
+ typedef RESTYPE valuetype;
+ enum Constant {nullvalue=nullvalue_};
+
+private:
+ RESTYPE ar[ RANGE]; ///< the map elements
+public:
+ /// \brief Constructor, initializes all RANGE elements of the map with 'nullvalue'
+ CharMap() {for (unsigned int ii=0; ii<RANGE; ii++) ar[ii]=(valuetype)nullvalue;}
+ /// \brief Define the values of the elements in the interval [from,to]
+ /// \param[in] from start of the input interval (belongs also to the input)
+ /// \param[in] to end of the input interval (belongs also to the input)
+ /// \param[in] value value assigned to all elements in [from,to]
+ CharMap& operator()( unsigned char from, unsigned char to, valuetype value) {for (unsigned int ii=from; ii<=to; ii++) ar[ii]=value; return *this;}
+ /// \brief Define the value of the single element at 'at'
+ /// \param[in] at the input element
+ /// \param[in] value value assigned to the element 'at'
+ CharMap& operator()( unsigned char at, valuetype value) {ar[at] = value; return *this;}
+ /// \brief Read the element assigned to 'ii'
+ /// \param[in] ii the input element queried (not checked against RANGE)
+ /// \return the element at 'ii'
+ valuetype operator []( unsigned char ii) const {return ar[ii];}
+};
+
+/// \enum ControlCharacter
+/// \brief Enumeration of control characters needed as events for the XML scanner state machine
+enum ControlCharacter
+{
+ Undef=0, ///< not defined (beyond ascii)
+ EndOfText, ///< end of data (EOF,EOD,.)
+ EndOfLine, ///< end of line
+ Cntrl, ///< control character
+ Space, ///< space, tab, etc..
+ Amp, ///< ampersand ('&')
+ Lt, ///< less than '<'
+ Equal, ///< equal '='
+ Gt, ///< greater than '>'
+ Slash, ///< slash '/'
+ Dash, ///< dash (hyphen-minus) '-'
+ Exclam, ///< exclamation mark '!'
+ Questm, ///< question mark '?'
+ Sq, ///< single quote
+ Dq, ///< double quote
+ Osb, ///< open square bracket '['
+ Csb, ///< close square bracket ']'
+ Any ///< any ascii character with meaning
+};
+enum {NofControlCharacter=18}; ///< total number of control characters (must match the number of enumerators of ControlCharacter)
+
+/// \class ControlCharacterM
+/// \brief Map of the enumeration of control characters to their names for debug messages
+struct ControlCharacterM
+{
+ /// \brief Get the name of a control character as string (the identifier of the enumerator)
+ /// \param [in] c the control character to map (used as table index without range check, so it must be a valid ControlCharacter)
+ static const char* name( ControlCharacter c)
+ {
+ static const char* name[ NofControlCharacter] = {"Undef", "EndOfText", "EndOfLine", "Cntrl", "Space", "Amp", "Lt", "Equal", "Gt", "Slash", "Dash", "Exclam", "Questm", "Sq", "Dq", "Osb", "Csb", "Any"};
+ return name[ (unsigned int)(unsigned char)c];
+ }
+};
+
+}//namespace
+#endif
+
diff --git a/textwolf/include/textwolf/charset.hpp b/textwolf/include/textwolf/charset.hpp
new file mode 100644
index 0000000..93ac5c3
--- /dev/null
+++ b/textwolf/include/textwolf/charset.hpp
@@ -0,0 +1,46 @@
+/*
+---------------------------------------------------------------------
+ The template library textwolf implements an input iterator on
+ a set of XML path expressions without backward references on an
+ STL conforming input iterator as source. It does no buffering
+ or read ahead and is dedicated for stream processing of XML
+ for a small set of XML queries.
+ Stream processing in this context refers to processing the
+ document without buffering anything but the current result token
+ processed with its tag hierarchy information.
+
+ Copyright (C) 2010,2011,2012,2013,2014 Patrick Frey
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 3.0 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+--------------------------------------------------------------------
+
+ The latest version of textwolf can be found at 'http://github.com/patrickfrey/textwolf'
+ For documentation see 'http://patrickfrey.github.com/textwolf'
+
+--------------------------------------------------------------------
+*/
+/// \file textwolf/charset.hpp
+/// \brief Character set encodings already implemented in textwolf
+/// \note The interface that the classes defined in the files included must fulfill is defined in "charset_interface.hpp"
+
+#ifndef __TEXTWOLF_XML_CHARSET_HPP__
+#define __TEXTWOLF_XML_CHARSET_HPP__
+#include "textwolf/charset_utf8.hpp"
+#include "textwolf/charset_utf16.hpp"
+#include "textwolf/charset_ucs.hpp"
+#include "textwolf/charset_isolatin.hpp"
+#endif
+
diff --git a/textwolf/include/textwolf/charset_interface.hpp b/textwolf/include/textwolf/charset_interface.hpp
new file mode 100644
index 0000000..b99bdf7
--- /dev/null
+++ b/textwolf/include/textwolf/charset_interface.hpp
@@ -0,0 +1,146 @@
+/*
+---------------------------------------------------------------------
+ The template library textwolf implements an input iterator on
+ a set of XML path expressions without backward references on an
+ STL conforming input iterator as source. It does no buffering
+ or read ahead and is dedicated for stream processing of XML
+ for a small set of XML queries.
+ Stream processing in this context refers to processing the
+ document without buffering anything but the current result token
+ processed with its tag hierarchy information.
+
+ Copyright (C) 2010,2011,2012,2013,2014 Patrick Frey
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 3.0 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+--------------------------------------------------------------------
+
+ The latest version of textwolf can be found at 'http://github.com/patrickfrey/textwolf'
+ For documentation see 'http://patrickfrey.github.com/textwolf'
+
+--------------------------------------------------------------------
+*/
+/// \file textwolf/charset_interface.hpp
+/// \brief Interface that describes what a character set encoding implementation has to define to be used as character set template parameter for textwolf.
+/// \remark This interface is more a documentation because the template library relies on the properties of the character set classes rather than on the interface it implements.
+#ifndef __TEXTWOLF_CHARSET_INTERFACE_HPP__
+#define __TEXTWOLF_CHARSET_INTERFACE_HPP__
+#include <cstddef>
+#include "textwolf/staticbuffer.hpp"
+
+namespace textwolf {
+/// \namespace textwolf::charset
+/// \brief namespace of character set encoding definitions
+namespace charset {
+
+/// \class Encoder
+/// \brief Collection of functions for encode/decode XML character entities
+struct Encoder
+{
+ /// \brief Write the character 'chr' as hexadecimal XML character reference ("&#x...;") as nul-terminated string to a buffer
+ /// \param[in] chr unicode character to encode
+ /// \param[out] bufptr buffer to write to
+ /// \param[in] bufsize allocation size of the buffer pointed to by 'bufptr'
+ /// \return true on success, false if the buffer reported an overflow (result truncated)
+ static bool encode( UChar chr, char* bufptr, std::size_t bufsize)
+ {
+ static const char* HEX = "0123456789abcdef";
+ StaticBuffer buf( bufptr, bufsize);
+ char bb[ 32];
+ unsigned int ii=0;
+ // collect the hex digits in reverse order; do-while guarantees at least one digit, so that chr==0 yields "&#x0;" and not the malformed "&#x;"
+ do
+ {
+ bb[ii++] = HEX[ chr & 0xf];
+ chr /= 16;
+ }
+ while (chr > 0);
+ buf.push_back( '&');
+ buf.push_back( '#');
+ buf.push_back( 'x');
+ while (ii) buf.push_back( bb[ --ii]);
+ buf.push_back( ';');
+ buf.push_back( '\0');
+ return !buf.overflow();
+ }
+};
+
+/// \class Interface
+/// \brief This interface has to be implemented for a character set encoding
+struct Interface
+{
+ /// \brief Maximum character this character set encoding can represent
+ enum {MaxChar=0xFF};
+
+ /// \brief Skip to start of the next character
+ /// \param [in] buf buffer for the character data
+ /// \param [in,out] bufpos position in 'buf'
+ /// \param [in,out] itr iterator to skip
+ template <class Iterator>
+ static void skip( char* buf, unsigned int& bufpos, Iterator& itr);
+
+ /// \brief Fetches the ascii char representation of the current character
+ /// \param [in] buf buffer for the parsed character data
+ /// \param [in,out] bufpos position in 'buf'
+ /// \param [in,out] itr iterator on the source
+ /// \return the value of the ascii character or -1 if the character is not ascii
+ template <class Iterator>
+ static signed char asciichar( char* buf, unsigned int& bufpos, Iterator& itr);
+
+ /// \brief Fetches the bytes of the current character into a buffer
+ /// \param [in] buf buffer for the parsed character data
+ /// \param [in,out] bufpos position in 'buf'
+ /// \param [in,out] itr iterator on the source
+ template <class Iterator>
+ static void fetchbytes( char* buf, unsigned int& bufpos, Iterator& itr);
+
+ /// \brief Fetches the unicode character representation of the current character
+ /// \param [in] buf buffer for the parsed character data
+ /// \param [in,out] bufpos position in 'buf'
+ /// \param [in,out] itr iterator on the source
+ /// \return the value of the unicode character
+ template <class Iterator>
+ UChar value( char* buf, unsigned int& bufpos, Iterator& itr) const;
+
+ /// \brief Prints a unicode character to a buffer
+ /// \tparam Buffer_ STL back insertion sequence
+ /// \param [in] chr character to print
+ /// \param [out] buf buffer to print to
+ template <class Buffer_>
+ void print( UChar chr, Buffer_& buf) const;
+
+ /// \brief Evaluate if two character set encodings of the same type are equal in all properties (code page, etc.)
+ /// \return true if yes
+ static bool is_equal( const Interface&, const Interface&)
+ {
+ return true;
+ }
+};
+
+/// \class ByteOrder
+/// \brief Order of bytes for wide char character sets
+struct ByteOrder
+{
+ enum
+ {
+ LE=0, ///< little endian
+ BE=1 ///< big endian
+ };
+};
+
+}//namespace
+}//namespace
+#endif
+
diff --git a/textwolf/include/textwolf/charset_isolatin.hpp b/textwolf/include/textwolf/charset_isolatin.hpp
new file mode 100644
index 0000000..b6bd660
--- /dev/null
+++ b/textwolf/include/textwolf/charset_isolatin.hpp
@@ -0,0 +1,126 @@
+/*
+---------------------------------------------------------------------
+ The template library textwolf implements an input iterator on
+ a set of XML path expressions without backward references on an
+ STL conforming input iterator as source. It does no buffering
+ or read ahead and is dedicated for stream processing of XML
+ for a small set of XML queries.
+ Stream processing in this context refers to processing the
+ document without buffering anything but the current result token
+ processed with its tag hierarchy information.
+
+ Copyright (C) 2010,2011,2012,2013,2014 Patrick Frey
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 3.0 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+--------------------------------------------------------------------
+
+ The latest version of textwolf can be found at 'http://github.com/patrickfrey/textwolf'
+ For documentation see 'http://patrickfrey.github.com/textwolf'
+
+--------------------------------------------------------------------
+*/
+/// \file textwolf/charset_isolatin.hpp
+/// \brief Definition of IsoLatin encodings
+
+#ifndef __TEXTWOLF_CHARSET_ISOLATIN_HPP__
+#define __TEXTWOLF_CHARSET_ISOLATIN_HPP__
+#include "textwolf/char.hpp"
+#include "textwolf/charset_interface.hpp"
+#include "textwolf/exception.hpp"
+#include "textwolf/codepages.hpp"
+#include <cstddef>
+
+namespace textwolf {
+namespace charset {
+
+/// \class IsoLatin
+/// \brief Character set IsoLatin-1,..IsoLatin-9 (ISO-8859-1,...ISO-8859-9), one byte per character
+struct IsoLatin :public IsoLatinCodePage
+{
+ enum {MaxChar=0xFF}; ///< maximum character value of this encoding
+
+ IsoLatin( const IsoLatin& o)
+ :IsoLatinCodePage(o){}
+ IsoLatin( unsigned int codePageIdx=1)
+ :IsoLatinCodePage(codePageIdx){}
+
+ /// \brief See template<class Iterator>Interface::skip(char*,unsigned int&,Iterator&)
+ template <class Iterator>
+ static inline void skip( char*, unsigned int& bufpos, Iterator& itr)
+ {
+ if (bufpos==0) // the single byte of the character has not been consumed yet
+ {
+ ++itr;
+ ++bufpos;
+ }
+ }
+
+ /// \brief See template<class Iterator>Interface::fetchbytes(char*,unsigned int&,Iterator&)
+ template <class Iterator>
+ static inline void fetchbytes( char* buf, unsigned int& bufpos, Iterator& itr)
+ {
+ if (bufpos==0) // the single byte of the character has not been fetched yet
+ {
+ buf[0] = *itr;
+ ++itr;
+ ++bufpos;
+ }
+ }
+
+ /// \brief See template<class Iterator>Interface::asciichar(char*,unsigned int&,Iterator&)
+ template <class Iterator>
+ static inline signed char asciichar( char* buf, unsigned int& bufpos, Iterator& itr)
+ {
+ fetchbytes( buf, bufpos, itr);
+ return ((unsigned char)(buf[0])>127)?-1:buf[0]; // bytes beyond the 7 bit ascii range are reported as -1
+ }
+
+ /// \brief See template<class Iterator>Interface::value(char*,unsigned int&,Iterator&)
+ template <class Iterator>
+ inline UChar value( char* buf, unsigned int& bufpos, Iterator& itr) const
+ {
+ fetchbytes( buf, bufpos, itr);
+ return ucharcode( buf[0]);
+ }
+
+ /// \brief See template<class Buffer_>Interface::print(UChar,Buffer_&)
+ template <class Buffer_>
+ void print( UChar chr, Buffer_& buf) const
+ {
+ char chr_ = invcode( chr);
+ if (chr_ == 0) // NOTE(review): presumably invcode returns 0 for characters without representation in the code page - confirm in codepages.hpp
+ {
+ char tb[ 32];
+ char* cc = tb;
+ Encoder::encode( chr, tb, sizeof(tb));
+ while (*cc) buf.push_back( *cc++);
+ }
+ else
+ {
+ buf.push_back( chr_);
+ }
+ }
+
+ /// \brief See Interface::is_equal( const Interface&, const Interface&)
+ static inline bool is_equal( const IsoLatin& a, const IsoLatin& b)
+ {
+ return IsoLatinCodePage::is_equal( a, b);
+ }
+};
+
+}//namespace
+}//namespace
+#endif
diff --git a/textwolf/include/textwolf/charset_ucs.hpp b/textwolf/include/textwolf/charset_ucs.hpp
new file mode 100644
index 0000000..22f2cab
--- /dev/null
+++ b/textwolf/include/textwolf/charset_ucs.hpp
@@ -0,0 +1,238 @@
+/*
+---------------------------------------------------------------------
+ The template library textwolf implements an input iterator on
+ a set of XML path expressions without backward references on an
+ STL conforming input iterator as source. It does no buffering
+ or read ahead and is dedicated for stream processing of XML
+ for a small set of XML queries.
+ Stream processing in this context refers to processing the
+ document without buffering anything but the current result token
+ processed with its tag hierarchy information.
+
+ Copyright (C) 2010,2011,2012,2013,2014 Patrick Frey
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 3.0 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+--------------------------------------------------------------------
+
+ The latest version of textwolf can be found at 'http://github.com/patrickfrey/textwolf'
+ For documentation see 'http://patrickfrey.github.com/textwolf'
+
+--------------------------------------------------------------------
+*/
+/// \file textwolf/charset_ucs.hpp
+/// \brief Definition of UCS-2/UCS-4 encodings
+
+#ifndef __TEXTWOLF_CHARSET_UCS_HPP__
+#define __TEXTWOLF_CHARSET_UCS_HPP__
+#include "textwolf/char.hpp"
+#include "textwolf/charset_interface.hpp"
+#include "textwolf/exception.hpp"
+#include <cstddef>
+
+namespace textwolf {
+namespace charset {
+
+/// \class UCS2
+/// \brief Character set UCS-2 (little/big endian)
+/// \tparam byteorder charset::ByteOrder::LE or charset::ByteOrder::BE
+/// UCS-2 encoding is defined to be big-endian only. Although the similar designations 'UCS-2BE' and 'UCS-2LE'
+/// imitate the UTF-16 labels, they do not represent official encoding schemes. (http://en.wikipedia.org/wiki/UTF-16/UCS-2)
+/// Therefore we take byteorder=ByteOrder::BE as default.
+template <int byteorder=ByteOrder::BE>
+struct UCS2
+{
+ enum
+ {
+ LSB=(byteorder==ByteOrder::BE), ///< least significant byte index (0 or 1)
+ MSB=(byteorder==ByteOrder::LE), ///< most significant byte index (0 or 1)
+ Print1shift=(byteorder==ByteOrder::BE)?8:0, ///< value to shift with to get the 1st character to print
+ Print2shift=(byteorder==ByteOrder::LE)?8:0, ///< value to shift with to get the 2nd character to print
+ MaxChar=0xFFFF ///< maximum character value representable in two bytes
+ };
+
+ /// \brief See template<class Iterator>Interface::skip(char*,unsigned int&,Iterator&)
+ template <class Iterator>
+ static inline void skip( char*, unsigned int& bufpos, Iterator& itr)
+ {
+ for (;bufpos < 2; ++bufpos)
+ {
+ ++itr;
+ }
+ }
+
+ /// \brief See template<class Iterator>Interface::fetchbytes(char*,unsigned int&,Iterator&)
+ template <class Iterator>
+ static inline void fetchbytes( char* buf, unsigned int& bufpos, Iterator& itr)
+ {
+ if (bufpos<2)
+ {
+ if (bufpos<1)
+ {
+ buf[0] = *itr;
+ ++itr;
+ ++bufpos;
+ }
+ buf[1] = *itr;
+ ++itr;
+ ++bufpos;
+ }
+ }
+
+ /// \brief Fetch the two bytes of the current character and combine them to its value according to the byte order
+ template <class Iterator>
+ static inline UChar value_impl( char* buf, unsigned int& bufpos, Iterator& itr)
+ {
+ fetchbytes( buf, bufpos, itr);
+ UChar res = (unsigned char)buf[MSB];
+ return (res << 8) + (unsigned char)buf[LSB];
+ }
+
+ /// \brief See template<class Iterator>Interface::value(char*,unsigned int&,Iterator&)
+ template <class Iterator>
+ inline UChar value( char* buf, unsigned int& bufpos, Iterator& itr) const
+ {
+ return value_impl( buf, bufpos, itr);
+ }
+
+ /// \brief See template<class Iterator>Interface::asciichar(char*,unsigned int&,Iterator&)
+ template <class Iterator>
+ static inline signed char asciichar( char* buf, unsigned int& bufpos, Iterator& itr)
+ {
+ UChar ch = value_impl( buf, bufpos, itr);
+ return (ch > 127)?-1:(char)ch;
+ }
+
+ /// \brief See template<class Buffer_>Interface::print(UChar,Buffer_&); every push_back gets exactly one masked byte (as in UCS4::print)
+ template <class Buffer_>
+ inline void print( UChar chr, Buffer_& buf) const
+ {
+ if (chr>MaxChar)
+ {
+ // not representable in two bytes: print the XML character reference instead, each of its ascii characters as one two byte unit
+ char tb[ 32];
+ Encoder::encode( chr, tb, sizeof(tb));
+ for (const char* cc = tb; *cc; ++cc)
+ {
+ buf.push_back( (unsigned char)((((UChar)(unsigned char)*cc) >> Print1shift) & 0xFF));
+ buf.push_back( (unsigned char)((((UChar)(unsigned char)*cc) >> Print2shift) & 0xFF));
+ }
+ }
+ else
+ {
+ buf.push_back( (unsigned char)((chr >> Print1shift) & 0xFF));
+ buf.push_back( (unsigned char)((chr >> Print2shift) & 0xFF));
+ }
+ }
+
+ /// \brief See Interface::is_equal( const Interface&, const Interface&)
+ static inline bool is_equal( const UCS2&, const UCS2&)
+ {
+ return true;
+ }
+};
+
+/// \class UCS4
+/// \brief Character set UCS-4 (little/big endian)
+/// \tparam byteorder ByteOrder::LE or ByteOrder::BE
+template <int byteorder>
+struct UCS4
+{
+ enum
+ {
+ B0=(byteorder==ByteOrder::BE)?3:0, ///< buffer index of the least significant byte
+ B1=(byteorder==ByteOrder::BE)?2:1, ///< buffer index of the 2nd least significant byte
+ B2=(byteorder==ByteOrder::BE)?1:2, ///< buffer index of the 2nd most significant byte
+ B3=(byteorder==ByteOrder::BE)?0:3, ///< buffer index of the most significant byte
+ Print1shift=(byteorder==ByteOrder::BE)?24:0, ///< value to shift with to get the 1st character to print
+ Print2shift=(byteorder==ByteOrder::BE)?16:8, ///< value to shift with to get the 2nd character to print
+ Print3shift=(byteorder==ByteOrder::BE)?8:16, ///< value to shift with to get the 3rd character to print
+ Print4shift=(byteorder==ByteOrder::BE)?0:24, ///< value to shift with to get the 4th character to print
+ MaxChar=0xFFFFFFFF ///< maximum character value (all 32 bits set)
+ };
+
+ /// \brief See template<class Iterator>Interface::fetchbytes(char*,unsigned int&,Iterator&)
+ template <class Iterator>
+ static inline void fetchbytes( char* buf, unsigned int& bufpos, Iterator& itr)
+ {
+ for (;bufpos < 4; ++bufpos)
+ {
+ buf[ bufpos] = *itr;
+ ++itr;
+ }
+ }
+
+ /// \brief See template<class Iterator>Interface::value(char*,unsigned int&,Iterator&)
+ template <class Iterator>
+ static inline UChar value( char* buf, unsigned int& bufpos, Iterator& itr)
+ {
+ fetchbytes( buf, bufpos, itr);
+ UChar res = (unsigned char)buf[B3]; // start with the most significant byte and shift the others in
+ res = (res << 8) + (unsigned char)buf[B2];
+ res = (res << 8) + (unsigned char)buf[B1];
+ return (res << 8) + (unsigned char)buf[B0];
+ }
+
+ /// \brief See template<class Iterator>Interface::skip(char*,unsigned int&,Iterator&)
+ template <class Iterator>
+ static inline void skip( char*, unsigned int& bufpos, Iterator& itr)
+ {
+ for (;bufpos < 4; ++bufpos)
+ {
+ ++itr;
+ }
+ }
+
+ /// \brief See template<class Iterator>Interface::asciichar(char*,unsigned int&,Iterator&)
+ template <class Iterator>
+ static inline signed char asciichar( char* buf, unsigned int& bufpos, Iterator& itr)
+ {
+ UChar ch = value( buf, bufpos, itr);
+ return (ch > 127)?-1:(char)ch; // characters beyond the 7 bit ascii range are reported as -1
+ }
+
+ /// \brief See template<class Buffer_>Interface::print(UChar,Buffer_&)
+ template <class Buffer_>
+ static void print( UChar chr, Buffer_& buf)
+ {
+ buf.push_back( (unsigned char)((chr >> Print1shift) & 0xFF));
+ buf.push_back( (unsigned char)((chr >> Print2shift) & 0xFF));
+ buf.push_back( (unsigned char)((chr >> Print3shift) & 0xFF));
+ buf.push_back( (unsigned char)((chr >> Print4shift) & 0xFF));
+ }
+
+ /// \brief See Interface::is_equal( const Interface&, const Interface&)
+ static inline bool is_equal( const UCS4&, const UCS4&)
+ {
+ return true;
+ }
+};
+
+/// \class UCS2LE
+/// \brief UCS-2 little endian character set encoding (UCS2 with byteorder ByteOrder::LE)
+struct UCS2LE :public UCS2<ByteOrder::LE> {};
+/// \class UCS2BE
+/// \brief UCS-2 big endian character set encoding (UCS2 with byteorder ByteOrder::BE)
+struct UCS2BE :public UCS2<ByteOrder::BE> {};
+/// \class UCS4LE
+/// \brief UCS-4 little endian character set encoding (UCS4 with byteorder ByteOrder::LE)
+struct UCS4LE :public UCS4<ByteOrder::LE> {};
+/// \class UCS4BE
+/// \brief UCS-4 big endian character set encoding (UCS4 with byteorder ByteOrder::BE)
+struct UCS4BE :public UCS4<ByteOrder::BE> {};
+
+}//namespace
+}//namespace
+#endif
diff --git a/textwolf/include/textwolf/charset_utf16.hpp b/textwolf/include/textwolf/charset_utf16.hpp
new file mode 100644
index 0000000..576c202
--- /dev/null
+++ b/textwolf/include/textwolf/charset_utf16.hpp
@@ -0,0 +1,224 @@
+/*
+---------------------------------------------------------------------
+ The template library textwolf implements an input iterator on
+ a set of XML path expressions without backward references on an
+ STL conforming input iterator as source. It does no buffering
+ or read ahead and is dedicated for stream processing of XML
+ for a small set of XML queries.
+ Stream processing in this context refers to processing the
+ document without buffering anything but the current result token
+ processed with its tag hierarchy information.
+
+ Copyright (C) 2010,2011,2012,2013,2014 Patrick Frey
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 3.0 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+--------------------------------------------------------------------
+
+ The latest version of textwolf can be found at 'http://github.com/patrickfrey/textwolf'
+ For documentation see 'http://patrickfrey.github.com/textwolf'
+
+--------------------------------------------------------------------
+*/
+/// \file textwolf/charset_utf16.hpp
+/// \brief Definition of UTF-16 encodings
+
+#ifndef __TEXTWOLF_CHARSET_UTF16_HPP__
+#define __TEXTWOLF_CHARSET_UTF16_HPP__
+#include "textwolf/char.hpp"
+#include "textwolf/charset_interface.hpp"
+#include "textwolf/exception.hpp"
+#include <cstddef>
+
+namespace textwolf {
+namespace charset {
+
+/// \class UTF16
+/// \brief Character set UTF-16 (little/big endian)
+/// \tparam encoding ByteOrder::LE or ByteOrder::BE
+/// \remark BOM character sequences are not interpreted as such and byte swapping is not done implicitly.
+/// It is left to the caller to detect BOM or its inverse and to switch the iterator.
+/// \remark See http://en.wikipedia.org/wiki/UTF-16/UCS-2: ... If the endian architecture of the decoder
+/// matches that of the encoder, the decoder detects the 0xFEFF value, but an opposite-endian decoder
+/// interprets the BOM as the non-character value U+FFFE reserved for this purpose. This incorrect
+/// result provides a hint to perform byte-swapping for the remaining values. If the BOM is missing,
+/// the standard says that big-endian encoding should be assumed....
+template <int encoding=ByteOrder::BE>
+class UTF16
+{
+private:
+    enum
+    {
+        LSB=(encoding==ByteOrder::BE),             //< least significant byte index (0 or 1)
+        MSB=(encoding==ByteOrder::LE),             //< most significant byte index (0 or 1)
+        Print1shift=(encoding==ByteOrder::BE)?8:0, //< value to shift with to get the 1st character to print
+        Print2shift=(encoding==ByteOrder::LE)?8:0  //< value to shift with to get the 2nd character to print
+    };
+
+public:
+    enum
+    {
+        MaxChar=0x10FFFF //< maximum character in alphabet
+    };
+
+public:
+    /// \brief Fetch the first 16 bit unit of the current character, unless it is in the buffer already
+    /// \param [out] buf buffer for the character data (bytes 0 and 1 get filled)
+    /// \param [in,out] bufpos number of bytes in 'buf', at least 2 on return
+    /// \param [in,out] itr source iterator, advanced once for every byte fetched
+    template <class Iterator>
+    static inline void fetchbytes( char* buf, unsigned int& bufpos, Iterator& itr)
+    {
+        while (bufpos < 2)
+        {
+            buf[ bufpos] = *itr;
+            ++itr;
+            ++bufpos;
+        }
+    }
+
+    /// \brief Get the size of the current character in bytes (variable length encoding)
+    /// \param [in] buf buffer for the character data
+    /// \param [in,out] bufpos position in 'buf'
+    /// \param [in,out] itr iterator
+    /// \return 4 if the first unit is a high surrogate (0xD800..0xDBFF), 2 else
+    template <class Iterator>
+    static inline unsigned int size( char* buf, unsigned int& bufpos, Iterator& itr)
+    {
+        fetchbytes( buf, bufpos, itr);
+
+        // the most significant byte of a high surrogate is in the range 0xD8..0xDB
+        const unsigned int msb = (unsigned char)buf[ MSB];
+        return (msb >= 0xD8 && msb <= 0xDB) ? 4 : 2;
+    }
+
+    /// \brief Skip to the end of the current character
+    /// \param [in] buf buffer for the character data
+    /// \param [in,out] bufpos position in 'buf', set to the size of the character
+    /// \param [in,out] itr source iterator, advanced once for every byte not consumed yet
+    template <class Iterator>
+    static inline void skip( char* buf, unsigned int& bufpos, Iterator& itr)
+    {
+        const unsigned int bufsize = size( buf, bufpos, itr);
+        while (bufpos < bufsize)
+        {
+            ++itr;
+            ++bufpos;
+        }
+    }
+
+    /// \brief Get the current character as ASCII character
+    /// \return the value of the character if it is not bigger than 127, -1 else
+    template <class Iterator>
+    static inline signed char asciichar( char* buf, unsigned int& bufpos, Iterator& itr)
+    {
+        const UChar ch = value_impl( buf, bufpos, itr);
+        if (ch > 127)
+        {
+            return -1;
+        }
+        return (char)ch;
+    }
+
+    /// \brief Decode the current character
+    /// \param [in,out] buf buffer for the character data
+    /// \param [in,out] bufpos position in 'buf'
+    /// \param [in,out] itr source iterator
+    /// \return the character value, or 0xFFFF if a high surrogate is not followed by a low surrogate (0xDC00..0xDFFF)
+    template <class Iterator>
+    static UChar value_impl( char* buf, unsigned int& bufpos, Iterator& itr)
+    {
+        const unsigned int bufsize = size( buf, bufpos, itr);
+
+        UChar rt = (unsigned char)buf[ MSB];
+        rt = (rt << 8) + (unsigned char)buf[ LSB];
+        if (bufsize != 4)
+        {
+            return rt;
+        }
+        // two-part encoding (surrogate pair): fetch the second 16 bit unit
+        while (bufpos < bufsize)
+        {
+            buf[bufpos] = *itr;
+            ++itr;
+            ++bufpos;
+        }
+        // Explicit range test: the former '(lo - 0xDC) > 0x03' was evaluated as signed int
+        // and therefore accepted every byte below 0xDC as start of a low surrogate.
+        const unsigned int lomsb = (unsigned char)buf[ 2+MSB];
+        if (lomsb < 0xDC || lomsb > 0xDF)
+        {
+            return 0xFFFF;
+        }
+        const UChar lo = (lomsb << 8) + (unsigned char)buf[ 2+LSB];
+        return ((rt - 0xD800) * 0x400) + (lo - 0xDC00) + 0x010000;
+    }
+
+    /// \brief See template<class Iterator>Interface::value(char*,unsigned int&,Iterator&); forwards to value_impl
+    template <class Iterator>
+    inline UChar value( char* buf, unsigned int& bufpos, Iterator& itr) const
+    {
+        return value_impl( buf, bufpos, itr);
+    }
+
+    /// \brief Append the encoding of a character to a buffer
+    /// \param [in] ch character to print
+    /// \param [out] buf buffer the bytes are appended to with push_back
+    /// \remark A value in 0xD800..0xDBFF or above 0x10FFFF is not written as itself; instead every char
+    ///  of the null terminated text filled in by Encoder::encode is written as one 16 bit unit.
+    /// \remark NOTE(review): values in 0xDC00..0xDFFF (low surrogates) are written unchanged as a single unit;
+    ///  confirm whether this is intended.
+    template <class Buffer_>
+    void print( UChar ch, Buffer_& buf) const
+    {
+        if (ch <= 0xFFFF)
+        {
+            // 0xD800..0xDBFF is reserved for the encoding of characters in the range [0x10000..0x10FFFF]
+            const bool isHighSurrogate = (ch >= 0xD800 && ch < 0xDC00);
+            if (!isHighSurrogate)
+            {
+                buf.push_back( (char)(unsigned char)((ch >> Print1shift) & 0xFF));
+                buf.push_back( (char)(unsigned char)((ch >> Print2shift) & 0xFF));
+                return;
+            }
+        }
+        else if (ch <= 0x10FFFF)
+        {
+            ch -= 0x10000;
+            unsigned short hi = (ch / 0x400) + 0xD800;
+            unsigned short lo = (ch % 0x400) + 0xDC00;
+            buf.push_back( (char)(unsigned char)((hi >> Print1shift) & 0xFF));
+            buf.push_back( (char)(unsigned char)((hi >> Print2shift) & 0xFF));
+            buf.push_back( (char)(unsigned char)((lo >> Print1shift) & 0xFF));
+            buf.push_back( (char)(unsigned char)((lo >> Print2shift) & 0xFF));
+            return;
+        }
+        // not representable: print the replacement text, one 16 bit unit per char
+        char tb[ 32];
+        Encoder::encode( ch, tb, sizeof(tb));
+        for (const char* cc = tb; *cc; ++cc)
+        {
+            buf.push_back( (char)(unsigned char)(((UChar)*cc >> Print1shift) & 0xFF));
+            buf.push_back( (char)(unsigned char)(((UChar)*cc >> Print2shift) & 0xFF));
+        }
+    }
+
+    /// \brief See Interface::is_equal( const Interface&, const Interface&); always true here, the arguments are not inspected
+    static inline bool is_equal( const UTF16&, const UTF16&)
+    {
+        return true;
+    }
+};
+
+/// \class UTF16LE
+/// \brief UTF-16 little endian character set encoding (named type for UTF16<ByteOrder::LE>)
+struct UTF16LE :public UTF16<ByteOrder::LE> {};
+/// \class UTF16BE
+/// \brief UTF-16 big endian character set encoding (named type for UTF16<ByteOrder::BE>)
+struct UTF16BE :public UTF16<ByteOrder::BE> {};
+
+}//namespace
+}//namespace
+#endif
+
diff --git a/textwolf/include/textwolf/charset_utf8.hpp b/textwolf/include/textwolf/charset_utf8.hpp
new file mode 100644
index 0000000..f31277a
--- /dev/null
+++ b/textwolf/include/textwolf/charset_utf8.hpp
@@ -0,0 +1,218 @@
+/*
+---------------------------------------------------------------------
+ The template library textwolf implements an input iterator on
+ a set of XML path expressions without backward references on an
+ STL conforming input iterator as source. It does no buffering
+ or read ahead and is dedicated for stream processing of XML
+ for a small set of XML queries.
+ Stream processing in this context refers to processing the
+ document without buffering anything but the current result token
+ processed with its tag hierarchy information.
+
+ Copyright (C) 2010,2011,2012,2013,2014 Patrick Frey
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 3.0 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+--------------------------------------------------------------------
+
+ The latest version of textwolf can be found at 'http://github.com/patrickfrey/textwolf'
+ For documentation see 'http://patrickfrey.github.com/textwolf'
+
+--------------------------------------------------------------------
+*/
+/// \file textwolf/charset_utf8.hpp
+/// \brief Definition of UTF-8 encoding
+
+#ifndef __TEXTWOLF_CHARSET_UTF8_HPP__
+#define __TEXTWOLF_CHARSET_UTF8_HPP__
+#include "textwolf/char.hpp"
+#include "textwolf/charset_interface.hpp"
+#include "textwolf/exception.hpp"
+#include <cstddef>
+
+namespace textwolf {
+namespace charset {
+
+/// \class UTF8
+/// \brief Character set encoding UTF-8
+struct UTF8
+{
+    /// \brief Maximum character that can be represented by this encoding implementation (also returned by value() for malformed input)
+    enum {MaxChar=0x7FFFFFFF};
+    /// \brief Bit masks, named after their binary representation
+    enum {
+        B11111111=0xFF,
+        B01111111=0x7F,
+        B00111111=0x3F,
+        B00011111=0x1F,
+        B00001111=0x0F,
+        B00000111=0x07,
+        B00000011=0x03,
+        B00000001=0x01,
+        B00000000=0x00,
+        B10000000=0x80,
+        B11000000=0xC0,
+        B11100000=0xE0,
+        B11110000=0xF0,
+        B11111000=0xF8,
+        B11111100=0xFC,
+        B11111110=0xFE,
+
+        B11011111=B11000000|B00011111,
+        B11101111=B11100000|B00001111,
+        B11110111=B11110000|B00000111,
+        B11111011=B11111000|B00000011,
+        B11111101=B11111100|B00000001
+    };
+
+    /// \class CharLengthTab
+    /// \brief Table that maps the first byte of an UTF-8 character to the length of the character in bytes
+    /// \remark The bytes 0x80..0xBF are not assigned here; they keep the default of the CharMap (presumably 0, its second template argument)
+    struct CharLengthTab :public CharMap<unsigned char, 0>
+    {
+        CharLengthTab()
+        {
+            (*this)
+                ( B00000000, B01111111, 1)
+                ( B11000000, B11011111, 2)
+                ( B11100000, B11101111, 3)
+                ( B11110000, B11110111, 4)
+                ( B11111000, B11111011, 5)
+                ( B11111100, B11111101, 6)
+                ( B11111110, B11111110, 7)
+                ( B11111111, B11111111, 8);
+        }
+    };
+
+    /// \brief Get the size of the current character in bytes (variable length encoding)
+    /// \param [in,out] buf buffer for the character data; the first byte is fetched into it if 'bufpos' is 0
+    /// \param [in,out] bufpos position in 'buf'
+    /// \param [in,out] itr source iterator
+    /// \return the CharLengthTab entry of the first byte of the character
+    template <class Iterator>
+    static inline unsigned int size( char* buf, unsigned int& bufpos, Iterator& itr)
+    {
+        static CharLengthTab lengthOfFirstByte;
+        if (bufpos == 0)
+        {
+            buf[0] = *itr;
+            ++itr;
+            ++bufpos;
+        }
+        const unsigned char first = (unsigned char)buf[ 0];
+        return lengthOfFirstByte[ first];
+    }
+
+    /// \brief Skip to the end of the current character without storing its bytes
+    /// \param [in,out] buf buffer for the character data
+    /// \param [in,out] bufpos position in 'buf'
+    /// \param [in,out] itr source iterator, advanced once for every byte not consumed yet
+    template <class Iterator>
+    static inline void skip( char* buf, unsigned int& bufpos, Iterator& itr)
+    {
+        const unsigned int charsize = size( buf, bufpos, itr);
+        while (bufpos < charsize)
+        {
+            ++itr;
+            ++bufpos;
+        }
+    }
+
+    /// \brief Get the current character as ASCII character, looking at its first byte only
+    /// \return the first byte of the character if it is not bigger than 127, -1 else
+    template <class Iterator>
+    static inline signed char asciichar( char* buf, unsigned int& bufpos, Iterator& itr)
+    {
+        if (bufpos == 0)
+        {
+            buf[0] = *itr;
+            ++itr;
+            ++bufpos;
+        }
+        const char first = buf[0];
+        if ((unsigned char)first > 127)
+        {
+            return -1;
+        }
+        return first;
+    }
+
+    /// \brief Fetch all bytes of the current character that are not in the buffer yet
+    /// \param [in,out] buf buffer for the character data
+    /// \param [in,out] bufpos position in 'buf'
+    /// \param [in,out] itr source iterator, advanced once for every byte fetched
+    template <class Iterator>
+    static inline void fetchbytes( char* buf, unsigned int& bufpos, Iterator& itr)
+    {
+        // size() fetches the first byte itself if the buffer is still empty
+        const unsigned int charsize = size( buf, bufpos, itr);
+        while (bufpos < charsize)
+        {
+            buf[ bufpos] = *itr;
+            ++itr;
+            ++bufpos;
+        }
+    }
+
+    /// \brief Decode the current character
+    /// \param [in,out] buf buffer for the character data
+    /// \param [in,out] bufpos position in 'buf'
+    /// \param [in,out] itr source iterator
+    /// \return the character value, or MaxChar if the first byte is bigger than 127 but has no following bytes
+    ///  or if one of the following bytes does not have the form 10xxxxxx
+    template <class Iterator>
+    UChar value( char* buf, unsigned int& bufpos, Iterator& itr) const
+    {
+        fetchbytes( buf, bufpos, itr);
+
+        const unsigned char first = (unsigned char)buf[0];
+        if (first <= 127)
+        {
+            return first;
+        }
+        // index of the last following byte, counted from 0
+        const int lastidx = bufpos-2;
+        if (lastidx < 0)
+        {
+            return MaxChar;
+        }
+        // the payload of the first byte shrinks by one bit with every additional following byte
+        UChar res = first & (B00011111>>lastidx);
+        for (int idx=0; idx<=lastidx; ++idx)
+        {
+            const unsigned char next = (unsigned char)buf[idx+1];
+            if ((unsigned char)(next & B11000000) != B10000000)
+            {
+                return MaxChar;
+            }
+            res = (res<<6) | (next & B00111111);
+        }
+        return res;
+    }
+
+    /// \brief Append the encoding of a character to a buffer
+    /// \param [in] chr character to print
+    /// \param [out] buf buffer the bytes are appended to with push_back (1 byte for values up to 127, 2 to 6 bytes else)
+    template <class Buffer_>
+    void print( UChar chr, Buffer_& buf) const
+    {
+        if (chr <= 127)
+        {
+            buf.push_back( (char)(unsigned char)chr);
+            return;
+        }
+        // number of bytes following the first one: 1 below 1<<11, 2 below 1<<16, 3 below 1<<21, 4 below 1<<26, 5 else
+        unsigned int nofFollowBytes = 1;
+        unsigned int limitShift = 5;
+        while (nofFollowBytes < 5 && chr >= (unsigned int)((1<<6)<<limitShift))
+        {
+            ++nofFollowBytes;
+            limitShift += 5;
+        }
+        const unsigned int nofBytes = nofFollowBytes+1;
+        // marker of the first byte: as many leading 1 bits as the character has bytes
+        const unsigned char firstByteMark = (unsigned char)(B11111111 << (8-nofBytes));
+        unsigned char shift = (unsigned char)(nofFollowBytes*6);
+
+        buf.push_back( (char)(((unsigned char)(chr >> shift) & (~firstByteMark >> 1)) | firstByteMark));
+        for (unsigned int left=nofFollowBytes; left>0; --left)
+        {
+            shift -= 6;
+            buf.push_back( (char)(unsigned char) (((chr >> shift) & B00111111) | B10000000));
+        }
+    }
+
+    /// \brief See Interface::is_equal( const Interface&, const Interface&); always true here, the arguments are not inspected
+    static bool is_equal( const UTF8&, const UTF8&)
+    {
+        return true;
+    }
+};
+
+}//namespace
+}//namespace
+#endif
+
diff --git a/textwolf/include/textwolf/codepages.hpp b/textwolf/include/textwolf/codepages.hpp
new file mode 100644
index 0000000..4e8e7cf
--- /dev/null
+++ b/textwolf/include/textwolf/codepages.hpp
@@ -0,0 +1,182 @@
+/*
+---------------------------------------------------------------------
+ The template library textwolf implements an input iterator on
+ a set of XML path expressions without backward references on an
+ STL conforming input iterator as source. It does no buffering
+ or read ahead and is dedicated for stream processing of XML
+ for a small set of XML queries.
+ Stream processing in this context refers to processing the
+ document without buffering anything but the current result token
+ processed with its tag hierarchy information.
+
+ Copyright (C) 2010,2011,2012,2013,2014 Patrick Frey
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 3.0 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+--------------------------------------------------------------------
+
+ The latest version of textwolf can be found at 'http://github.com/patrickfrey/textwolf'
+ For documentation see 'http://patrickfrey.github.com/textwolf'
+
+--------------------------------------------------------------------
+*/
+/// \file textwolf/codepages.hpp
+/// \brief Definition of IsoLatin code pages
+
+#ifndef __TEXTWOLF_CODE_PAGES_HPP__
+#define __TEXTWOLF_CODE_PAGES_HPP__
+#include "textwolf/char.hpp"
+#include <map>
+
+namespace textwolf {
+namespace charset {
+
+/// \class IsoLatinCodePage
+/// \brief IsoLatin code page
+class IsoLatinCodePage
+{
+private:
+ struct InvOvlCodeMap // one map per code page from unicode character value to the byte of the character in the code page
+ {
+ InvOvlCodeMap() // fills m_map[idx] from the pairs in ovlar[idx] for every idx in 0..8
+ {
+ struct Element // one mapping pair
+ {
+ unsigned short first; // unicode character value (key in the map)
+ unsigned char second; // byte in the code page (value in the map)
+ };
+ struct ElementAr
+ {
+ Element ar[ 64]; // elements without initializer are zero
+ };
+ static const ElementAr ovlar[9] = // one row per code page, every row is terminated by {0,0}
+ {
+ {{{0,0}}},
+ {{{260,161}, {728,162}, {321,163}, {317,165}, {346,166}, {352,169}, {350,170}, {356,171}, {377,172}, {381,174}, {379,175}, {261,177}, {731,178}, {322,179}, {318,181}, {347,182}, {711,183}, {353,185}, {351,186}, {357,187}, {378,188}, {733,189}, {382,190}, {380,191}, {340,192}, {258,195}, {313,197}, {262,198}, {268,200}, {280,202}, {282,204}, {270,207}, {272,208}, {323,209}, {327,210}, {336,213}, {344,216}, {366,217}, {368,219}, {354,222}, {341,224}, {259,227}, {314,229}, {263,230}, {269,232}, {281,234}, {283,236}, {271,239}, {273,240}, {324,241}, {328,242}, {337,245}, {345,248}, {367,249}, {369,251}, {355,254}, {729,255}, {0,0}}},
+ {{{294,161}, {728,162}, {292,165}, {304,168}, {350,169}, {286,170}, {308,171}, {379,173}, {295,175}, {293,180}, {305,183}, {351,184}, {287,185}, {309,186}, {380,188}, {266,193}, {264,194}, {288,208}, {284,211}, {364,216}, {348,217}, {267,223}, {265,224}, {289,238}, {285,241}, {365,246}, {349,247}, {729,248}, {0,0}}},
+ {{{260,161}, {312,162}, {342,163}, {296,165}, {315,166}, {352,169}, {274,170}, {290,171}, {358,172}, {381,174}, {261,177}, {731,178}, {343,179}, {297,181}, {316,182}, {711,183}, {353,185}, {275,186}, {291,187}, {359,188}, {330,189}, {382,190}, {331,191}, {256,192}, {302,199}, {268,200}, {280,202}, {278,204}, {298,207}, {272,208}, {325,209}, {332,210}, {310,211}, {370,217}, {360,221}, {362,222}, {257,224}, {303,231}, {269,232}, {281,234}, {279,236}, {299,239}, {273,240}, {326,241}, {333,242}, {311,243}, {371,249}, {361,253}, {363,254}, {729,255}, {0,0}}},
+ {{{286,208}, {304,221}, {350,222}, {287,240}, {305,253}, {351,254}, {0,0}}},
+ {{{260,161}, {274,162}, {290,163}, {298,164}, {296,165}, {310,166}, {315,168}, {272,169}, {352,170}, {358,171}, {381,172}, {362,174}, {330,175}, {261,177}, {275,178}, {291,179}, {299,180}, {297,181}, {311,182}, {316,184}, {273,185}, {353,186}, {359,187}, {382,188}, {8213,189}, {363,190}, {331,191}, {256,192}, {302,199}, {268,200}, {280,202}, {278,204}, {325,209}, {332,210}, {360,215}, {370,217}, {257,224}, {303,231}, {269,232}, {281,234}, {279,236}, {326,241}, {333,242}, {361,247}, {371,249}, {312,255}, {0,0}}},
+ {{{8221,161}, {8222,165}, {342,170}, {8220,180}, {343,186}, {260,192}, {302,193}, {256,194}, {262,195}, {280,198}, {274,199}, {268,200}, {377,202}, {278,203}, {290,204}, {310,205}, {298,206}, {315,207}, {352,208}, {323,209}, {325,210}, {332,212}, {370,216}, {321,217}, {346,218}, {362,219}, {379,221}, {381,222}, {261,224}, {303,225}, {257,226}, {263,227}, {281,230}, {275,231}, {269,232}, {378,234}, {279,235}, {291,236}, {311,237}, {299,238}, {316,239}, {353,240}, {324,241}, {326,242}, {333,244}, {371,248}, {322,249}, {347,250}, {363,251}, {380,253}, {382,254}, {8217,255}, {0,0}}},
+ {{{7682,161}, {7683,162}, {266,164}, {267,165}, {7690,166}, {7808,168}, {7810,170}, {7691,171}, {7922,172}, {376,175}, {7710,176}, {7711,177}, {288,178}, {289,179}, {7744,180}, {7745,181}, {7766,183}, {7809,184}, {7767,185}, {7811,186}, {7776,187}, {7923,188}, {7812,189}, {7813,190}, {7777,191}, {372,208}, {7786,215}, {374,222}, {373,240}, {7787,247}, {375,254}, {0,0}}},
+ {{{8364,164}, {352,166}, {353,168}, {381,180}, {382,184}, {338,188}, {339,189}, {376,190}, {0,0}}}
+ };
+ unsigned int idx = 0;
+ for (; idx < 9; ++idx)
+ {
+ unsigned int ii = 0;
+ for (; ovlar[idx].ar[ii].first; ++ii) // stop at the first element with first == 0 (the row terminator)
+ {
+ m_map[idx][ ovlar[idx].ar[ii].first] = ovlar[idx].ar[ii].second;
+ }
+ }
+ }
+
+ inline const std::map<unsigned short, unsigned char>* get( unsigned int idx) const // idx is not range checked, valid are 0..8
+ {
+ return &m_map[ idx];
+ }
+ private:
+ std::map<unsigned short, unsigned char> m_map[9]; // m_map[idx] holds the pairs of ovlar[idx]
+ };
+
+public:
+ /// \brief Copy constructor (copies the three table pointers only; the tables they refer to are static data of the other constructor)
+ IsoLatinCodePage( const IsoLatinCodePage& o)
+ :m_cd(o.m_cd)
+ ,m_invcd(o.m_invcd)
+ ,m_invovlcd(o.m_invovlcd){}
+
+ /// \brief Constructor, selects the static tables of one code page; throws std::logic_error if idx is 0 or bigger than 9
+ /// \param[in] idx IsoLatin code page index in the range 1..9, 1 for "IsoLatin-1"
+ IsoLatinCodePage( unsigned int idx)
+ {
+ enum {NofCodePages=9};
+ struct CodePage
+ {
+ unsigned short ar[128]; // one entry for each of the bytes 128..255
+ };
+ static const CodePage codePage[ NofCodePages] = { // row idx-1: unicode value of byte 128+i at position i. NOTE(review): the 3rd row seems to list fewer than 128 values (e.g. 194 is directly followed by 196), which would shift its later entries and zero-fill its tail - verify against the ISO-8859-3 table
+ {{128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255}},
+ {{128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 260, 728, 321, 164, 317, 346, 167, 168, 352, 350, 356, 377, 173, 381, 379, 176, 261, 731, 322, 180, 318, 347, 711, 184, 353, 351, 357, 378, 733, 382, 380, 340, 193, 194, 258, 196, 313, 262, 199, 268, 201, 280, 203, 282, 205, 206, 270, 272, 323, 327, 211, 212, 336, 214, 215, 344, 366, 218, 368, 220, 221, 354, 223, 341, 225, 226, 259, 228, 314, 263, 231, 269, 233, 281, 235, 283, 237, 238, 271, 273, 324, 328, 243, 244, 337, 246, 247, 345, 367, 250, 369, 252, 253, 355, 729}},
+ {{128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 294, 728, 163, 164, 292, 167, 168, 304, 350, 286, 308, 173, 379, 176, 295, 178, 179, 180, 181, 293, 183, 184, 305, 351, 287, 309, 189, 380, 192, 193, 194, 196, 266, 264, 199, 200, 201, 202, 203, 204, 205, 206, 207, 209, 210, 211, 212, 288, 214, 215, 284, 217, 218, 219, 220, 364, 348, 223, 224, 225, 226, 228, 267, 265, 231, 232, 233, 234, 235, 236, 237, 238, 239, 241, 242, 243, 244, 289, 246, 247, 285, 249, 250, 251, 252, 365, 349, 729}},
+ {{128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 260, 312, 342, 164, 296, 315, 167, 168, 352, 274, 290, 358, 173, 381, 175, 176, 261, 731, 343, 180, 297, 316, 711, 184, 353, 275, 291, 359, 330, 382, 331, 256, 193, 194, 195, 196, 197, 198, 302, 268, 201, 280, 203, 278, 205, 206, 298, 272, 325, 332, 310, 212, 213, 214, 215, 216, 370, 218, 219, 220, 360, 362, 223, 257, 225, 226, 227, 228, 229, 230, 303, 269, 233, 281, 235, 279, 237, 238, 299, 273, 326, 333, 311, 244, 245, 246, 247, 248, 371, 250, 251, 252, 361, 363, 729}},
+ {{128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 286, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 304, 350, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 287, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 305, 351, 255}},
+ {{128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 260, 274, 290, 298, 296, 310, 167, 315, 272, 352, 358, 381, 173, 362, 330, 176, 261, 275, 291, 299, 297, 311, 183, 316, 273, 353, 359, 382, 8213, 363, 331, 256, 193, 194, 195, 196, 197, 198, 302, 268, 201, 280, 203, 278, 205, 206, 207, 208, 325, 332, 211, 212, 213, 214, 360, 216, 370, 218, 219, 220, 221, 222, 223, 257, 225, 226, 227, 228, 229, 230, 303, 269, 233, 281, 235, 279, 237, 238, 239, 240, 326, 333, 243, 244, 245, 246, 361, 248, 371, 250, 251, 252, 253, 254, 312}},
+ {{128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 8221, 162, 163, 164, 8222, 166, 167, 216, 169, 342, 171, 172, 173, 174, 198, 176, 177, 178, 179, 8220, 181, 182, 183, 248, 185, 343, 187, 188, 189, 190, 230, 260, 302, 256, 262, 196, 197, 280, 274, 268, 201, 377, 278, 290, 310, 298, 315, 352, 323, 325, 211, 332, 213, 214, 215, 370, 321, 346, 362, 220, 379, 381, 223, 261, 303, 257, 263, 228, 229, 281, 275, 269, 233, 378, 279, 291, 311, 299, 316, 353, 324, 326, 243, 333, 245, 246, 247, 371, 322, 347, 363, 252, 380, 382, 8217}},
+ {{128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 7682, 7683, 163, 266, 267, 7690, 167, 7808, 169, 7810, 7691, 7922, 173, 174, 376, 7710, 7711, 288, 289, 7744, 7745, 182, 7766, 7809, 7767, 7811, 7776, 7923, 7812, 7813, 7777, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 372, 209, 210, 211, 212, 213, 214, 7786, 216, 217, 218, 219, 220, 221, 374, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 373, 241, 242, 243, 244, 245, 246, 7787, 248, 249, 250, 251, 252, 253, 375, 255}},
+ {{128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 8364, 165, 352, 167, 353, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 381, 181, 182, 183, 382, 185, 186, 187, 338, 339, 376, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255}}
+ };
+ static const CodePage invcodePage[ NofCodePages] = { // row idx-1: code page byte of unicode value 128+i at position i, 0 where invcode() has to consult the overflow map instead
+ {{128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255}},
+ {{128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 0, 0, 0, 164, 0, 0, 167, 168, 0, 0, 0, 0, 173, 0, 0, 176, 0, 0, 0, 180, 0, 0, 0, 184, 0, 0, 0, 0, 0, 0, 0, 0, 193, 194, 0, 196, 0, 0, 199, 0, 201, 0, 203, 0, 205, 206, 0, 0, 0, 0, 211, 212, 0, 214, 215, 0, 0, 218, 0, 220, 221, 0, 223, 0, 225, 226, 0, 228, 0, 0, 231, 0, 233, 0, 235, 0, 237, 238, 0, 0, 0, 0, 243, 244, 0, 246, 247, 0, 0, 250, 0, 252, 253, 0, 0}},
+ {{128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 0, 0, 163, 164, 0, 0, 166, 167, 0, 0, 0, 0, 172, 0, 0, 174, 0, 176, 177, 178, 179, 0, 181, 182, 0, 0, 0, 0, 187, 0, 0, 189, 190, 191, 0, 192, 0, 0, 195, 196, 197, 198, 199, 200, 201, 202, 203, 0, 204, 205, 206, 207, 0, 209, 210, 0, 212, 213, 214, 215, 0, 0, 218, 219, 220, 221, 0, 222, 0, 0, 225, 226, 227, 228, 229, 230, 231, 232, 233, 0, 234, 235, 236, 237, 0, 239, 240, 0, 242, 243, 244, 245, 0, 0, 0}},
+ {{128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 0, 0, 0, 164, 0, 0, 167, 168, 0, 0, 0, 0, 173, 0, 175, 176, 0, 0, 0, 180, 0, 0, 0, 184, 0, 0, 0, 0, 0, 0, 0, 0, 193, 194, 195, 196, 197, 198, 0, 0, 201, 0, 203, 0, 205, 206, 0, 0, 0, 0, 0, 212, 213, 214, 215, 216, 0, 218, 219, 220, 0, 0, 223, 0, 225, 226, 227, 228, 229, 230, 0, 0, 233, 0, 235, 0, 237, 238, 0, 0, 0, 0, 0, 244, 245, 246, 247, 248, 0, 250, 251, 252, 0, 0, 0}},
+ {{128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 0, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 0, 0, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 0, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 0, 0, 255}},
+ {{128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 0, 0, 0, 0, 0, 0, 167, 0, 0, 0, 0, 0, 173, 0, 0, 176, 0, 0, 0, 0, 0, 0, 183, 0, 0, 0, 0, 0, 0, 0, 0, 0, 193, 194, 195, 196, 197, 198, 0, 0, 201, 0, 203, 0, 205, 206, 207, 208, 0, 0, 211, 212, 213, 214, 0, 216, 0, 218, 219, 220, 221, 222, 223, 0, 225, 226, 227, 228, 229, 230, 0, 0, 233, 0, 235, 0, 237, 238, 239, 240, 0, 0, 243, 244, 245, 246, 0, 248, 0, 250, 251, 252, 253, 254, 0}},
+ {{128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 0, 162, 163, 164, 0, 166, 167, 0, 169, 0, 171, 172, 173, 174, 0, 176, 177, 178, 179, 0, 181, 182, 183, 0, 185, 0, 187, 188, 189, 190, 0, 0, 0, 0, 0, 196, 197, 175, 0, 0, 201, 0, 0, 0, 0, 0, 0, 0, 0, 0, 211, 0, 213, 214, 215, 168, 0, 0, 0, 220, 0, 0, 223, 0, 0, 0, 0, 228, 229, 191, 0, 0, 233, 0, 0, 0, 0, 0, 0, 0, 0, 0, 243, 0, 245, 246, 247, 184, 0, 0, 0, 252, 0, 0, 0}},
+ {{128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 0, 0, 163, 0, 0, 0, 167, 0, 169, 0, 0, 0, 173, 174, 0, 0, 0, 0, 0, 0, 0, 182, 0, 0, 0, 0, 0, 0, 0, 0, 0, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 0, 209, 210, 211, 212, 213, 214, 0, 216, 217, 218, 219, 220, 221, 0, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 0, 241, 242, 243, 244, 245, 246, 0, 248, 249, 250, 251, 252, 253, 0, 255}},
+ {{128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 0, 165, 0, 167, 0, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 0, 181, 182, 183, 0, 185, 186, 187, 0, 0, 0, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255}}
+ };
+ static const InvOvlCodeMap invOvlCodeMap; // function local static, so the maps are built only once
+
+ if (idx > NofCodePages || idx == 0) throw std::logic_error( "code page index not supported"); // valid indices are 1..NofCodePages
+ m_cd = &codePage[ idx-1].ar[0];
+ m_invcd = &invcodePage[ idx-1].ar[0];
+ m_invovlcd = invOvlCodeMap.get( idx-1);
+ }
+
+ /// \brief Map a character of this codepage to its unicode character code
+ /// \param[in] ch character in this codepage
+ /// \return ch itself for the bytes 0..127, the entry of the code page table for the bytes 128..255
+ inline UChar ucharcode( char ch) const
+ {
+     const unsigned char byte = (unsigned char)ch;
+     if (byte < 128)
+     {
+         return byte;
+     }
+     return m_cd[ byte - 128];
+ }
+
+ /// \brief Get the character representation of a unicode character in this codepage
+ /// \param[in] ch unicode character
+ /// \return the representation of the passed unicode character in this codepage,
+ ///  0 if the character has no representation in this codepage
+ inline char invcode( UChar ch) const
+ {
+     if (ch <= 128) return (char)ch;
+     if (ch <= 255)
+     {
+         // direct table for 128..255, a 0 entry means that the overflow map has to be consulted
+         const char rt = (char)m_invcd[ ch - 128];
+         if (rt != 0) return rt;
+     }
+     // The keys of the overflow map are 16 bit values. Without this guard a character above 0xFFFF
+     // would be truncated by the conversion to the key type and could match an unrelated entry.
+     if (ch > 0xFFFF) return 0;
+
+     std::map<unsigned short, unsigned char>::const_iterator fi = m_invovlcd->find( (unsigned short)ch);
+     if (fi == m_invovlcd->end()) return 0;
+     return (char)fi->second;
+ }
+
+ /// \brief Evaluate if two code pages are equal (they are if both point to the same code page table)
+ static inline bool is_equal( const IsoLatinCodePage& a, const IsoLatinCodePage& b)
+ {
+ return a.m_cd == b.m_cd;
+ }
+
+private:
+ const unsigned short* m_cd; // table read by ucharcode(): entry [byte - 128] for the bytes 128..255
+ const unsigned short* m_invcd; // table read by invcode(): entry [ch - 128] for the unicode values 129..255, 0 if not available there
+ const std::map<unsigned short, unsigned char>* m_invovlcd; // map consulted by invcode() when m_invcd provides no value
+};
+
+}}
+#endif
+
+
diff --git a/textwolf/include/textwolf/cstringiterator.hpp b/textwolf/include/textwolf/cstringiterator.hpp
new file mode 100644
index 0000000..f2d5c12
--- /dev/null
+++ b/textwolf/include/textwolf/cstringiterator.hpp
@@ -0,0 +1,120 @@
+/*
+---------------------------------------------------------------------
+ The template library textwolf implements an input iterator on
+ a set of XML path expressions without backward references on an
+ STL conforming input iterator as source. It does no buffering
+ or read ahead and is dedicated for stream processing of XML
+ for a small set of XML queries.
+ Stream processing in this context refers to processing the
+ document without buffering anything but the current result token
+ processed with its tag hierarchy information.
+
+ Copyright (C) 2010,2011,2012,2013,2014 Patrick Frey
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 3.0 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+--------------------------------------------------------------------
+
+ The latest version of textwolf can be found at 'http://github.com/patrickfrey/textwolf'
+ For documentation see 'http://patrickfrey.github.com/textwolf'
+
+--------------------------------------------------------------------
+*/
+/// \file textwolf/cstringiterator.hpp
+/// \brief textwolf iterator on strings
+
+#ifndef __TEXTWOLF_CSTRING_ITERATOR_HPP__
+#define __TEXTWOLF_CSTRING_ITERATOR_HPP__
+#include <string>
+#include <cstring>
+#include <cstdlib>
+
+/// \namespace textwolf
+/// \brief Toplevel namespace of the library
+namespace textwolf {
+
+/// \class CStringIterator
+/// \brief Input iterator on a constant string returning null characters after EOF as required by textwolf scanners
+/// \remark The iterator only references the string passed, it does not copy it
+class CStringIterator
+{
+public:
+	/// \brief Default constructor (empty source, every access returns 0)
+	CStringIterator()
+		:m_src(0),m_size(0),m_pos(0)
+	{}
+
+	/// \brief Constructor
+	/// \param [in] src pointer to the bytes to iterate on
+	/// \param [in] size number of bytes to iterate on
+	CStringIterator( const char* src, unsigned int size)
+		:m_src(src),m_size(size),m_pos(0)
+	{}
+
+	/// \brief Constructor
+	/// \param [in] src null terminated C string to iterate on
+	CStringIterator( const char* src)
+		:m_src(src),m_size(std::strlen(src)),m_pos(0)
+	{}
+
+	/// \brief Constructor
+	/// \param [in] src string to iterate on
+	CStringIterator( const std::string& src)
+		:m_src(src.c_str()),m_size(src.size()),m_pos(0)
+	{}
+
+	/// \brief Copy constructor
+	/// \param [in] o iterator to copy
+	CStringIterator( const CStringIterator& o)
+		:m_src(o.m_src),m_size(o.m_size),m_pos(o.m_pos)
+	{}
+
+	/// \brief Element access
+	/// \return current character or 0 if the end of the string has been reached
+	inline char operator* ()
+	{
+		if (m_pos >= m_size) return 0;
+		return m_src[ m_pos];
+	}
+
+	/// \brief Preincrement
+	/// \return *this
+	inline CStringIterator& operator++()
+	{
+		++m_pos;
+		return *this;
+	}
+
+	/// \brief Return current char position
+	inline unsigned int pos() const
+	{
+		return m_pos;
+	}
+
+	/// \brief Set current char position
+	/// \param [in] i new position (positions behind the end of the string are cut to the end)
+	inline void pos( unsigned int i)
+	{
+		m_pos = (i >= m_size) ? m_size : i;
+	}
+
+	/// \brief Distance of two iterators in characters
+	/// \param [in] o iterator to subtract
+	/// \return the difference of the positions or 0 if the iterators do not refer to the same string
+	inline int operator - (const CStringIterator& o) const
+	{
+		return (m_src == o.m_src) ? (int)(m_pos - o.m_pos) : 0;
+	}
+
+private:
+	const char* m_src;	///< string iterated on (not owned)
+	unsigned int m_size;	///< number of bytes of m_src
+	unsigned int m_pos;	///< current position in m_src
+};
+
+}//namespace
+#endif
diff --git a/textwolf/include/textwolf/exception.hpp b/textwolf/include/textwolf/exception.hpp
new file mode 100644
index 0000000..bf236fe
--- /dev/null
+++ b/textwolf/include/textwolf/exception.hpp
@@ -0,0 +1,106 @@
+/*
+---------------------------------------------------------------------
+ The template library textwolf implements an input iterator on
+ a set of XML path expressions without backward references on an
+ STL conforming input iterator as source. It does no buffering
+ or read ahead and is dedicated for stream processing of XML
+ for a small set of XML queries.
+ Stream processing in this context refers to processing the
+ document without buffering anything but the current result token
+ processed with its tag hierarchy information.
+
+ Copyright (C) 2010,2011,2012,2013,2014 Patrick Frey
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 3.0 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+--------------------------------------------------------------------
+
+ The latest version of textwolf can be found at 'http://github.com/patrickfrey/textwolf'
+ For documentation see 'http://patrickfrey.github.com/textwolf'
+
+--------------------------------------------------------------------
+*/
+/// \file textwolf/exception.hpp
+/// \brief Definition of the exceptions thrown by textwolf and of the error codes they carry
+
+#ifndef __TEXTWOLF_EXCEPTION_HPP__
+#define __TEXTWOLF_EXCEPTION_HPP__
+#include <exception>
+#include <stdexcept>
+
+namespace textwolf {
+
+/// \class throws_exception
+/// \brief Base class for structures that can throw exceptions for non recoverable errors
+struct throws_exception
+{
+ /// \enum Cause
+ /// \brief Enumeration of error cases
+ enum Cause
+ {
+ Unknown, ///< unknown error
+ DimOutOfRange, ///< memory reserved for statically allocated table or memory block is too small. Increase the size of memory block passed to the XML path select automaton. Usage error !
+ StateNumbersNotAscending, ///< XML scanner automaton definition check failed. Labels of states must be equal to their indices. Internal textwolf error !
+ InvalidParamState, ///< parameter check (for state) in automaton definition failed. Internal textwolf error !
+ InvalidParamChar, ///< parameter check (for control character) in automaton definition failed. Internal textwolf error !
+ DuplicateStateTransition, ///< duplicate transition definition in automaton. Internal textwolf error !
+ InvalidState, ///< invalid state definition in automaton. Internal textwolf error !
+ IllegalParam, ///< parameter check in automaton definition failed. Internal textwolf error !
+ IllegalAttributeName, ///< invalid string for a tag or attribute in the automaton definition. Usage error !
+ OutOfMem, ///< out of memory in the automaton definition. System error (std::bad_alloc) !
+ ArrayBoundsReadWrite, ///< invalid array access. Internal textwolf error !
+ NotAllowedOperation ///< defining an operation in an automaton definition that is not allowed there. Usage error !
+ };
+};
+
+/// \class exception
+/// \brief textwolf exception class
+struct exception :public std::runtime_error
+{
+	typedef throws_exception::Cause Cause;
+	Cause cause;	///< exception cause tag
+
+	/// \brief Constructor
+	/// \param[in] p_cause cause of the error
+	exception (Cause p_cause) throw()
+		:std::runtime_error("textwolf error in XML"), cause(p_cause) {}
+	/// \brief Copy constructor
+	/// \param[in] orig exception to copy
+	exception (const exception& orig) throw()
+		:std::runtime_error("textwolf error in XML"), cause(orig.cause) {}
+	/// \brief Destructor
+	virtual ~exception() throw() {}
+
+	/// \brief Assignment
+	/// \param[in] orig exception to copy
+	/// \return *this
+	exception& operator= (const exception& orig) throw()
+	{cause=orig.cause; return *this;}
+
+	/// \brief Exception message
+	/// \return name of the exception cause as string ("Unknown" for a cause that is not part of the enumeration)
+	virtual const char* what() const throw()
+	{
+		// names of the exception causes, in the order of their declaration in throws_exception::Cause
+		static const char* const nameCause[] = {
+			"Unknown","DimOutOfRange","StateNumbersNotAscending","InvalidParamState",
+			"InvalidParamChar","DuplicateStateTransition","InvalidState","IllegalParam",
+			"IllegalAttributeName","OutOfMem","ArrayBoundsReadWrite","NotAllowedOperation"
+		};
+		const unsigned int nofCauses = sizeof(nameCause) / sizeof(nameCause[0]);
+		const unsigned int idx = (unsigned int) cause;
+		// a cause value outside of the enumeration (e.g. the result of a cast) must not
+		// index beyond the table
+		return (idx < nofCauses) ? nameCause[ idx] : nameCause[ 0];
+	}
+};
+
+}//namespace
+#endif
diff --git a/textwolf/include/textwolf/istreamiterator.hpp b/textwolf/include/textwolf/istreamiterator.hpp
new file mode 100644
index 0000000..5a09669
--- /dev/null
+++ b/textwolf/include/textwolf/istreamiterator.hpp
@@ -0,0 +1,89 @@
+/*
+---------------------------------------------------------------------
+ The template library textwolf implements an input iterator on
+ a set of XML path expressions without backward references on an
+ STL conforming input iterator as source. It does no buffering
+ or read ahead and is dedicated for stream processing of XML
+ for a small set of XML queries.
+ Stream processing in this context refers to processing the
+ document without buffering anything but the current result token
+ processed with its tag hierarchy information.
+
+ Copyright (C) 2010,2011,2012,2013,2014 Patrick Frey
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 3.0 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+--------------------------------------------------------------------
+
+ The latest version of textwolf can be found at 'http://github.com/patrickfrey/textwolf'
+ For documentation see 'http://patrickfrey.github.com/textwolf'
+
+--------------------------------------------------------------------
+*/
+/// \file textwolf/istreamiterator.hpp
+/// \brief Definition of iterators for textwolf on STL input streams (std::istream)
+
+#ifndef __TEXTWOLF_ISTREAM_ITERATOR_HPP__
+#define __TEXTWOLF_ISTREAM_ITERATOR_HPP__
+#include <iostream>
+#include <iterator>
+
+/// \namespace textwolf
+/// \brief Toplevel namespace of the library
+namespace textwolf {
+
+/// \class IStreamIterator
+/// \brief Input iterator on an STL input stream returning null characters after EOF
+class IStreamIterator
+{
+public:
+	/// \brief Default constructor (iterator at EOF)
+	IStreamIterator(){}
+
+	/// \brief Constructor
+	/// \param [in] input input to iterate on
+	/// \remark Switches off the skipping of whitespace (std::ios::skipws) on the stream passed
+	// std::istream_iterator is allowed to read its first element on construction.
+	// The skipws flag must therefore be cleared before m_itr is initialized and not
+	// in the constructor body, otherwise leading whitespace of the input gets lost.
+	IStreamIterator( std::istream& input)
+		:m_itr( input >> std::noskipws)
+	{}
+
+	/// \brief Copy constructor
+	/// \param [in] o iterator to copy
+	IStreamIterator( const IStreamIterator& o)
+		:m_itr(o.m_itr)
+		,m_end(o.m_end){}
+
+	/// \brief Element access
+	/// \return current character or 0 if the end of the stream has been reached
+	inline char operator* ()
+	{
+		return (m_itr != m_end)?*m_itr:0;
+	}
+
+	/// \brief Pre increment
+	/// \return *this
+	inline IStreamIterator& operator++()
+	{
+		// an end-of-stream iterator must not be incremented, stay at EOF instead
+		if (m_itr != m_end) ++m_itr;
+		return *this;
+	}
+
+private:
+	std::istream_iterator<unsigned char> m_itr;	///< iterator on the stream
+	std::istream_iterator<unsigned char> m_end;	///< end-of-stream iterator m_itr is compared with
+};
+
+}//namespace
+#endif
diff --git a/textwolf/include/textwolf/sourceiterator.hpp b/textwolf/include/textwolf/sourceiterator.hpp
new file mode 100644
index 0000000..98acfb5
--- /dev/null
+++ b/textwolf/include/textwolf/sourceiterator.hpp
@@ -0,0 +1,136 @@
+/*
+---------------------------------------------------------------------
+ The template library textwolf implements an input iterator on
+ a set of XML path expressions without backward references on an
+ STL conforming input iterator as source. It does no buffering
+ or read ahead and is dedicated for stream processing of XML
+ for a small set of XML queries.
+ Stream processing in this context refers to processing the
+ document without buffering anything but the current result token
+ processed with its tag hierarchy information.
+
+ Copyright (C) 2010,2011,2012,2013,2014 Patrick Frey
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 3.0 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+--------------------------------------------------------------------
+
+ The latest version of textwolf can be found at 'http://github.com/patrickfrey/textwolf'
+ For documentation see 'http://patrickfrey.github.com/textwolf'
+
+--------------------------------------------------------------------
+*/
+/// \file textwolf/sourceiterator.hpp
+/// \brief textwolf byte source iterator template
+
+#ifndef __TEXTWOLF_SOURCE_ITERATOR_HPP__
+#define __TEXTWOLF_SOURCE_ITERATOR_HPP__
+#include <cstdlib>
+#include <stdexcept>
+#include <setjmp.h>
+
+/// \namespace textwolf
+/// \brief Toplevel namespace of the library
+namespace textwolf {
+
+/// \class SrcIterator
+/// \brief Input iterator as source for the XML scanner with the possibility of being fed chunk by chunk
+/// \remark The iterator only references the chunk passed, it does not copy it
+class SrcIterator
+{
+public:
+	/// \brief Empty constructor
+	SrcIterator()
+		:m_start(0)
+		,m_itr(0)
+		,m_end(0)
+		,m_eom(0){}
+
+	/// \brief Copy constructor
+	/// \param [in] o iterator to copy
+	SrcIterator( const SrcIterator& o)
+		:m_start(o.m_start)
+		,m_itr(o.m_itr)
+		,m_end(o.m_end)
+		,m_eom(o.m_eom){}
+
+	/// \brief Constructor
+	/// \param [in] buf source chunk to iterate on
+	/// \param [in] size size of source chunk to iterate on in bytes
+	/// \param [in] eom_ trigger to activate if end of data has been reached (no next chunk anymore)
+	SrcIterator( const char* buf, std::size_t size, jmp_buf* eom_=0)
+		:m_start(buf)
+		,m_itr(buf)
+		,m_end(buf+size)
+		,m_eom(eom_){}
+
+	/// \brief Assignment operator
+	/// \param [in] o iterator to copy
+	/// \return *this
+	SrcIterator& operator=( const SrcIterator& o)
+	{
+		m_start = o.m_start;
+		m_itr = o.m_itr;
+		m_end = o.m_end;
+		m_eom = o.m_eom;
+		return *this;
+	}
+
+	/// \brief Element access operator (required by textwolf for an input iterator)
+	/// \return the current byte or 0 if the end of the chunk is reached and no end of message trigger is set
+	/// \remark Does a longjmp to the end of message trigger (with value 1) at the end of the chunk if one is set
+	inline char operator*()
+	{
+		if (m_itr >= m_end)
+		{
+			if (m_eom) longjmp(*m_eom,1);
+			return 0;
+		}
+		return *m_itr;
+	}
+
+	/// \brief Prefix increment operator (required by textwolf for an input iterator)
+	/// \return *this
+	inline SrcIterator& operator++()
+	{
+		++m_itr;
+		return *this;
+	}
+
+	/// \brief Get the iterator difference in bytes
+	/// \param [in] b iterator to subtract (must refer to the same chunk end and must not be ahead of this)
+	/// \return the distance in bytes
+	inline std::size_t operator-( const SrcIterator& b) const
+	{
+		if (b.m_end != m_end || m_itr < b.m_itr) throw std::logic_error( "illegal operation");
+		return m_itr - b.m_itr;
+	}
+
+	/// \brief Feed input to the source iterator
+	/// \param[in] buf pointer to start of input
+	/// \param[in] size size of input passed in bytes
+	/// \param[in] eom longjmp to call with parameter 1, if the end of data has been reached before EOF (null termination), eom=null, if the chunk passed contains the complete rest of the input and eof (null) can be returned if we reach the end
+	void putInput( const char* buf, std::size_t size, jmp_buf* eom=0)
+	{
+		m_start = buf;
+		m_itr = buf;
+		m_end = buf+size;
+		m_eom = eom;
+	}
+
+	/// \brief Get the current position in the current chunk parsed
+	/// \return the number of bytes consumed of the current chunk (at most the size of the chunk)
+	/// \remark Does not return the absolute position in the source parsed but the position in the chunk
+	std::size_t getPosition() const
+	{
+		// The position is the offset from the start of the chunk, not the number of
+		// bytes left. An iterator incremented behind the end is reported at the end.
+		const char* cur = (m_itr <= m_end) ? m_itr : m_end;
+		return cur - m_start;
+	}
+
+private:
+	const char* m_start;	///< start of the current chunk
+	const char* m_itr;	///< current position in the chunk
+	const char* m_end;	///< end of the current chunk (first byte behind it)
+	jmp_buf* m_eom;		///< end of message trigger or null if not defined
+};
+
+}//namespace
+#endif
+
+
diff --git a/textwolf/include/textwolf/staticbuffer.hpp b/textwolf/include/textwolf/staticbuffer.hpp
new file mode 100644
index 0000000..8bbadb8
--- /dev/null
+++ b/textwolf/include/textwolf/staticbuffer.hpp
@@ -0,0 +1,179 @@
+/*
+---------------------------------------------------------------------
+ The template library textwolf implements an input iterator on
+ a set of XML path expressions without backward references on an
+ STL conforming input iterator as source. It does no buffering
+ or read ahead and is dedicated for stream processing of XML
+ for a small set of XML queries.
+ Stream processing in this context refers to processing the
+ document without buffering anything but the current result token
+ processed with its tag hierarchy information.
+
+ Copyright (C) 2010,2011,2012,2013,2014 Patrick Frey
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 3.0 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+--------------------------------------------------------------------
+
+ The latest version of textwolf can be found at 'http://github.com/patrickfrey/textwolf'
+ For documentation see 'http://patrickfrey.github.com/textwolf'
+
+--------------------------------------------------------------------
+*/
+/// \file textwolf/staticbuffer.hpp
+/// \brief Fixed size buffer fulfilling the requirement of a back insertion sequence needed for textwolf output
+
+#ifndef __TEXTWOLF_STATIC_BUFFER_HPP__
+#define __TEXTWOLF_STATIC_BUFFER_HPP__
+#include "textwolf/exception.hpp"
+#include <cstddef>
+#include <cstring>
+#include <cstdlib>
+#include <stdexcept>
+
+namespace textwolf {
+
+/// \class StaticBuffer
+/// \brief Simple back insertion sequence for storing the outputs of textwolf in a constant size buffer
+/// \remark Writes that do not fit into the buffer are dropped and reported with overflow()
+class StaticBuffer :public throws_exception
+{
+public:
+	/// \brief Constructor allocating the buffer
+	/// \param [in] n size of the buffer in bytes
+	explicit StaticBuffer( std::size_t n)
+		:m_pos(0),m_size(n),m_ar(0),m_allocated(true),m_overflow(false)
+	{
+		// calloc(0,..) is allowed to return NULL, request at least one byte so that
+		// NULL always means out of memory
+		m_ar = (char*)std::calloc( n?n:1, sizeof(char));
+		if (!m_ar) throw std::bad_alloc();
+	}
+
+	/// \brief Constructor on a buffer owned by the caller
+	/// \param [in] p pointer to the buffer (not freed by this class)
+	/// \param [in] n size of the buffer in bytes
+	/// \param [in] i number of bytes of the buffer already in use (cut to n if bigger)
+	StaticBuffer( char* p, std::size_t n, std::size_t i=0)
+		:m_pos((i<n)?i:n)
+		,m_size(n)
+		,m_ar(p)
+		,m_allocated(false)
+		,m_overflow(false) {}
+
+	/// \brief Copy constructor (deep copy)
+	/// \param [in] o buffer to copy
+	StaticBuffer( const StaticBuffer& o)
+		:m_pos(o.m_pos)
+		,m_size(o.m_size)
+		,m_ar(0)
+		,m_allocated(true)	// the copy always owns its memory, even if 'o' does not
+		,m_overflow(o.m_overflow)
+	{
+		m_ar = (char*)std::malloc( m_size?m_size:1);
+		if (!m_ar) throw std::bad_alloc();
+		if (o.m_ar && m_size) std::memcpy( m_ar, o.m_ar, m_size);
+	}
+
+	/// \brief Assignment (deep copy)
+	/// \param [in] o buffer to copy
+	/// \return *this
+	/// \remark After the assignment this buffer owns a copy of the content of 'o'
+	StaticBuffer& operator=( const StaticBuffer& o)
+	{
+		if (this != &o)
+		{
+			// copy first, so that this object is unchanged if the allocation fails
+			StaticBuffer tmp( o);
+			swap( tmp);
+		}
+		return *this;
+	}
+
+	/// \brief Destructor
+	~StaticBuffer()
+	{
+		if (m_allocated) std::free(m_ar);
+	}
+
+	/// \brief Clear the buffer content
+	void clear()
+	{
+		m_pos = 0;
+		m_overflow = false;
+	}
+
+	/// \brief Append one character
+	/// \param[in] ch the character to append
+	void push_back( char ch)
+	{
+		if (m_pos < m_size)
+		{
+			m_ar[m_pos++] = ch;
+		}
+		else
+		{
+			m_overflow = true;
+		}
+	}
+
+	/// \brief Append an array of characters
+	/// \param[in] cc the characters to append
+	/// \param[in] ccsize the number of characters to append
+	/// \remark Only the characters that fit into the buffer are appended
+	void append( const char* cc, std::size_t ccsize)
+	{
+		// compare with the free space instead of adding up sizes (no wrap around)
+		const std::size_t room = m_size - m_pos;
+		if (ccsize > room)
+		{
+			m_overflow = true;
+			ccsize = room;
+		}
+		if (ccsize) std::memcpy( m_ar+m_pos, cc, ccsize);
+		m_pos += ccsize;
+	}
+
+	/// \brief Return the number of characters in the buffer
+	/// \return the number of characters (bytes)
+	std::size_t size() const {return m_pos;}
+
+	/// \brief Return the pointer to the buffer content
+	/// \return pointer to the first of size() bytes
+	/// \remark This class does not add a null termination to the content
+	const char* ptr() const {return m_ar;}
+
+	/// \brief Shrinks the size of the buffer or expands it with c
+	/// \param [in] n new size of the buffer (cut to the allocation size of the buffer)
+	/// \param [in] c fill character if n bigger than the current fill size
+	void resize( std::size_t n, char c=0)
+	{
+		if (m_pos>n)
+		{
+			m_pos=n;
+		}
+		else
+		{
+			if (m_size<n) n=m_size;
+			while (n>m_pos) push_back(c);
+		}
+	}
+
+	/// \brief random access of element
+	/// \param [in] ii index of the element
+	/// \return the character at this position
+	char operator []( std::size_t ii) const
+	{
+		// ii == size() is still accepted as before, but never outside of the memory block
+		if (ii > m_pos || ii >= m_size) throw exception( DimOutOfRange);
+		return m_ar[ii];
+	}
+
+	/// \brief random access of element reference
+	/// \param [in] ii index of the element
+	/// \return the reference to the character at this position
+	char& at( std::size_t ii) const
+	{
+		// ii == size() is still accepted as before, but never outside of the memory block
+		if (ii > m_pos || ii >= m_size) throw exception( DimOutOfRange);
+		return m_ar[ii];
+	}
+
+	/// \brief check for array bounds write
+	/// \return true if a push_back would have caused an array bounds write
+	bool overflow() const {return m_overflow;}
+private:
+	/// \brief Default constructor (private: a buffer cannot be created without memory)
+	StaticBuffer()
+		:m_pos(0),m_size(0),m_ar(0),m_allocated(false),m_overflow(false){}
+
+	/// \brief Exchange the complete state (including the ownership of the memory) with another buffer
+	void swap( StaticBuffer& o)
+	{
+		const std::size_t pos_ = m_pos; m_pos = o.m_pos; o.m_pos = pos_;
+		const std::size_t size_ = m_size; m_size = o.m_size; o.m_size = size_;
+		char* const ar_ = m_ar; m_ar = o.m_ar; o.m_ar = ar_;
+		const bool allocated_ = m_allocated; m_allocated = o.m_allocated; o.m_allocated = allocated_;
+		const bool overflow_ = m_overflow; m_overflow = o.m_overflow; o.m_overflow = overflow_;
+	}
+private:
+	std::size_t m_pos;		///< current cursor position of the buffer (number of added characters)
+	std::size_t m_size;		///< allocation size of the buffer in bytes
+	char* m_ar;			///< buffer content
+	bool m_allocated;		///< true, if the buffer is allocated by this class and not passed by constructor
+	bool m_overflow;		///< true, if an array bounds write would have happened with push_back
+};
+
+}//namespace
+#endif
diff --git a/textwolf/include/textwolf/textscanner.hpp b/textwolf/include/textwolf/textscanner.hpp
new file mode 100644
index 0000000..21fa568
--- /dev/null
+++ b/textwolf/include/textwolf/textscanner.hpp
@@ -0,0 +1,225 @@
+/*
+---------------------------------------------------------------------
+ The template library textwolf implements an input iterator on
+ a set of XML path expressions without backward references on an
+ STL conforming input iterator as source. It does no buffering
+ or read ahead and is dedicated for stream processing of XML
+ for a small set of XML queries.
+ Stream processing in this context refers to processing the
+ document without buffering anything but the current result token
+ processed with its tag hierarchy information.
+
+ Copyright (C) 2010,2011,2012,2013,2014 Patrick Frey
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 3.0 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+--------------------------------------------------------------------
+
+ The latest version of textwolf can be found at 'http://github.com/patrickfrey/textwolf'
+ For documentation see 'http://patrickfrey.github.com/textwolf'
+
+--------------------------------------------------------------------
+*/
+#ifndef __TEXTWOLF_TEXT_SCANNER_HPP__
+#define __TEXTWOLF_TEXT_SCANNER_HPP__
+/// \file textwolf/textscanner.hpp
+/// \brief Implementation of iterator for character-wise parsing of input
+
+#include "textwolf/char.hpp"
+#include "textwolf/charset_interface.hpp"
+#include "textwolf/exception.hpp"
+#include <cstddef>
+
+namespace textwolf {
+
+/// \class TextScanner
+/// \brief Reader for scanning the input character by character
+/// \tparam Iterator source iterator type (implements preincrement and '*' input byte access indirection)
+/// \tparam CharSet character set of the source stream
+template <class Iterator, class CharSet>
+class TextScanner
+{
+private:
+	Iterator start;			///< source iterator start of current chunk
+	Iterator input;			///< source iterator
+	char buf[8];			///< buffer for one character (the current character parsed)
+	UChar val;			///< Unicode character representation of the current character parsed
+	signed char cur;		///< ASCII character representation of the current character parsed or -1 if not in ASCII range
+	unsigned int state;		///< current state of the text scanner (byte position of iterator cursor in 'buf')
+	CharSet charset;		///< character set used to decode the source
+
+public:
+	/// \class ControlCharMap
+	/// \brief Map of ASCII characters to control character identifiers used in the XML scanner automaton
+	/// \remark The order of the definitions matters: later definitions replace earlier ones for the same character
+	struct ControlCharMap :public CharMap<ControlCharacter,Undef>
+	{
+		ControlCharMap()
+		{
+			(*this)
+			(0,EndOfText)
+			(1,31,Cntrl)
+			(5,Undef)
+			(33,127,Any)
+			(128,255,Undef)
+			('\t',Space)
+			('\r',Space)
+			('\n',EndOfLine)
+			(' ',Space)
+			('&',Amp)
+			('<',Lt)
+			('=',Equal)
+			('>',Gt)
+			('/',Slash)
+			('-',Dash)
+			('!',Exclam)
+			('?',Questm)
+			('\'',Sq)
+			('\"',Dq)
+			('[',Osb)
+			(']',Csb);
+		}
+	};
+
+	/// \brief Constructor without source
+	/// \param [in] charset_ character set of the source
+	TextScanner( const CharSet& charset_)
+		:val(0),cur(0),state(0),charset(charset_)
+	{
+		clearbuf();
+	}
+
+	/// \brief Constructor
+	/// \param [in] charset_ character set of the source
+	/// \param [in] p_iterator source iterator
+	TextScanner( const CharSet& charset_, const Iterator& p_iterator)
+		:start(p_iterator),input(p_iterator),val(0),cur(0),state(0),charset(charset_)
+	{
+		clearbuf();
+	}
+
+	/// \brief Constructor with a default constructed character set
+	/// \param [in] p_iterator source iterator
+	TextScanner( const Iterator& p_iterator)
+		:start(p_iterator),input(p_iterator),val(0),cur(0),state(0),charset(CharSet())
+	{
+		clearbuf();
+	}
+
+	/// \brief Copy constructor
+	/// \param [in] orig textscanner to copy
+	TextScanner( const TextScanner& orig)
+		:start(orig.start)
+		,input(orig.input)
+		,val(orig.val)
+		,cur(orig.cur)
+		,state(orig.state)
+		,charset(orig.charset)
+	{
+		for (unsigned int bi=0; bi<sizeof(buf); ++bi)
+		{
+			buf[bi] = orig.buf[bi];
+		}
+	}
+
+	/// \brief Assign something to the iterator while keeping the state
+	/// \param [in] a source iterator assignment
+	template <class IteratorAssignment>
+	void setSource( const IteratorAssignment& a)
+	{
+		input = a;
+		start = a;
+	}
+
+	/// \brief Get the current source iterator position
+	/// \return source iterator position in character words (usually bytes)
+	std::size_t getPosition() const
+	{
+		return input - start;
+	}
+
+	/// \brief Get the unicode representation of the current character
+	/// \return the unicode character
+	inline UChar chr()
+	{
+		// a value different from 0 has already been decoded for the current character
+		if (val != 0) return val;
+		val = charset.value( buf, state, input);
+		return val;
+	}
+
+	/// \brief Fill the internal buffer with as many current character bytes needed for reading the ASCII representation
+	inline void getcur()
+	{
+		cur = CharSet::asciichar( buf, state, input);
+	}
+
+	/// \brief Direct copy of a character from input to output without encoding/decoding it
+	/// \param [in] output_ character set of the output
+	/// \param [in,out] buf_ buffer the character is appended to
+	/// \remark Assumes the character set encodings to be of the same class
+	template <class Buffer>
+	inline void copychar( CharSet& output_, Buffer& buf_)
+	{
+		/// \todo more efficient solution of copy character to sink with same encoding here
+		/// \remark a check if the character sets fulfill is_equal(..) (IsoLatin code page !)
+		if (!CharSet::is_equal( charset, output_))
+		{
+			// ... different character sets: decode the character and print it in the output encoding
+			output_.print( chr(), buf_);
+			return;
+		}
+		// ... the character sets are equal and of the same subclass (code pages):
+		// the bytes of the character are copied to the output as they are
+		charset.fetchbytes( buf, state, input);
+		for (unsigned int bi=0; bi<state; ++bi)
+		{
+			buf_.push_back( buf[bi]);
+		}
+	}
+
+	/// \brief Get the control character representation of the current character
+	/// \return the control character
+	inline ControlCharacter control()
+	{
+		static ControlCharMap controlCharMap;
+		getcur();
+		return controlCharMap[ (unsigned char)cur];
+	}
+
+	/// \brief Get the ASCII character representation of the current character
+	/// \return the ASCII character or 0 if not defined
+	inline unsigned char ascii()
+	{
+		getcur();
+		return (cur < 0) ? 0 : (unsigned char)cur;
+	}
+
+	/// \brief Skip to the next character of the source
+	/// \return *this
+	inline TextScanner& skip()
+	{
+		CharSet::skip( buf, state, input);
+		// forget everything known about the character left behind
+		val = 0;
+		cur = 0;
+		state = 0;
+		return *this;
+	}
+
+	/// \brief see TextScanner::chr()
+	inline UChar operator*()
+	{
+		return chr();
+	}
+
+	/// \brief Preincrement: Skip to the next character of the source
+	/// \return *this
+	inline TextScanner& operator ++()
+	{
+		return skip();
+	}
+
+	/// \brief Postincrement: Skip to the next character of the source
+	/// \return copy of the scanner before the skip
+	inline TextScanner operator ++(int)
+	{
+		TextScanner before( *this);
+		skip();
+		return before;
+	}
+
+private:
+	/// \brief Fill the character buffer with null bytes
+	void clearbuf()
+	{
+		for (unsigned int bi=0; bi<sizeof(buf); ++bi)
+		{
+			buf[bi] = 0;
+		}
+	}
+};
+
+}//namespace
+#endif
diff --git a/textwolf/include/textwolf/traits.hpp b/textwolf/include/textwolf/traits.hpp
new file mode 100644
index 0000000..15a318d
--- /dev/null
+++ b/textwolf/include/textwolf/traits.hpp
@@ -0,0 +1,65 @@
+/*
+---------------------------------------------------------------------
+ The template library textwolf implements an input iterator on
+ a set of XML path expressions without backward references on an
+ STL conforming input iterator as source. It does no buffering
+ or read ahead and is dedicated for stream processing of XML
+ for a small set of XML queries.
+ Stream processing in this context refers to processing the
+ document without buffering anything but the current result token
+ processed with its tag hierarchy information.
+
+ Copyright (C) 2010,2011,2012,2013,2014 Patrick Frey
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 3.0 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+--------------------------------------------------------------------
+
+ The latest version of textwolf can be found at 'http://github.com/patrickfrey/textwolf'
+ For documentation see 'http://patrickfrey.github.com/textwolf'
+
+--------------------------------------------------------------------
+*/
+#ifndef __TEXTWOLF_TRAITS_HPP__
+#define __TEXTWOLF_TRAITS_HPP__
+/// \file textwolf/traits.hpp
+/// \brief Type traits
+
+namespace textwolf {
+namespace traits {
+
+/// \class TypeCheck
+/// \brief Tag types and a type comparison that can be used to steer the compiler (e.g. the selection of an overload)
+class TypeCheck
+{
+public:
+	struct YES {};	///< tag for a fulfilled check
+	struct NO {};	///< tag for a check that is not fulfilled
+
+	/// \brief Comparison of two types, general case: the types differ
+	template<typename T, typename U>
+	struct is_same
+	{
+		static const NO type()
+		{
+			NO differ;
+			return differ;
+		}
+	};
+
+	/// \brief Comparison of two types, special case: both types are the same
+	template<typename T>
+	struct is_same<T,T>
+	{
+		static const YES type()
+		{
+			YES equal;
+			return equal;
+		}
+	};
+};
+
+}}//namespace
+#endif
diff --git a/textwolf/include/textwolf/xmlhdrparser.hpp b/textwolf/include/textwolf/xmlhdrparser.hpp
new file mode 100644
index 0000000..f18b76b
--- /dev/null
+++ b/textwolf/include/textwolf/xmlhdrparser.hpp
@@ -0,0 +1,411 @@
+/*
+---------------------------------------------------------------------
+ The template library textwolf implements an input iterator on
+ a set of XML path expressions without backward references on an
+ STL conforming input iterator as source. It does no buffering
+ or read ahead and is dedicated for stream processing of XML
+ for a small set of XML queries.
+ Stream processing in this context refers to processing the
+ document without buffering anything but the current result token
+ processed with its tag hierarchy information.
+
+ Copyright (C) 2010,2011,2012,2013,2014 Patrick Frey
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 3.0 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+--------------------------------------------------------------------
+
+ The latest version of textwolf can be found at 'http://github.com/patrickfrey/textwolf'
+ For documentation see 'http://patrickfrey.github.com/textwolf'
+
+--------------------------------------------------------------------
+*/
+/// \file textwolf/xmlhdrparser.hpp
+/// \brief Class for parsing the header to get the character set encoding
+
+#ifndef __TEXTWOLF_XML_HEADER_PARSER_HPP__
+#define __TEXTWOLF_XML_HEADER_PARSER_HPP__
+#include <cstddef>
+#include <cstdlib>
+#include <stdexcept>
+#include <string>
+#include "textwolf/sourceiterator.hpp"
+
+/// \namespace textwolf
+/// \brief Toplevel namespace of the library
+namespace textwolf {
+
+/// \class XmlHdrParser
+/// \brief Class for parsing the header to get the character set encoding
+/// \remark Works with all single byte or multibyte character sets with ASCII as base
+/// \remark Input is added in chunks with putInput(const char*,std::size_t); parse() continues where the previous call stopped
+/// \remark Null bytes in the input are skipped (presumably to cope with UTF-16/UCS-4 encoded headers); four of them in a row raise a std::runtime_error
+class XmlHdrParser
+{
+public:
+    /// \brief Constructor
+    XmlHdrParser()
+        :m_state(Init)
+        ,m_attributetype(Encoding)
+        ,m_idx(0)
+        ,m_charsConsumed(0)
+        ,m_zeroCount(0){}
+
+    /// \brief Copy constructor
+    /// \param[in] o object to copy
+    /// \remark Copies the complete parser state including the encoding parsed and the last error
+    XmlHdrParser( const XmlHdrParser& o)
+        :m_state(o.m_state)
+        ,m_attributetype(o.m_attributetype)
+        ,m_idx(o.m_idx)
+        ,m_charsConsumed(o.m_charsConsumed)
+        ,m_zeroCount(o.m_zeroCount)
+        ,m_item(o.m_item)
+        ,m_src(o.m_src)
+        ,m_encoding(o.m_encoding)
+        ,m_lasterror(o.m_lasterror){}
+
+
+    /// \brief Add another input chunk to process
+    /// \param[in] src_ pointer to chunk
+    /// \param[in] srcsize_ size of chunk in bytes
+    void putInput( const char* src_, std::size_t srcsize_)
+    {
+        m_src.append( src_, srcsize_);
+    }
+
+    /// \brief Get the whole original data added with subsequent calls of putInput(const char*,std::size_t)
+    /// \return the data block as string reference
+    const std::string& consumedData() const
+    {
+        return m_src;
+    }
+
+    /// \brief Call the first/next iteration of parsing the header
+    /// \return true if the end of the header was reached (or the document starts with a tag instead of a header), false if more data is needed (putInput(const char*,std::size_t)) or if an error occurred. Check lasterror() for an error
+    /// \remark Throws std::runtime_error if four null bytes in a row are found in the input
+    bool parse()
+    {
+        for (unsigned char ch = nextChar(); ch != 0; ch = nextChar())
+        {
+            switch (m_state)
+            {
+                case Init:
+                    if (ch == '<')
+                    {
+                        m_state = ParseXmlOpen;
+                    }
+                    else if (ch > 32)
+                    {
+                        setError( "expected open tag angle bracket '<'");
+                        return false;
+                    }
+                    /*else: skip leading spaces and control characters*/
+                    break;
+
+                case ParseXmlOpen:
+                    if (ch == '?')
+                    {
+                        m_state = ParseXmlHdr;
+                    }
+                    else if (ch <= 32)
+                    {
+                        /*skip spaces*/
+                    }
+                    else if (isNameChar( ch))
+                    {
+                        /*document starts with an ordinary tag, there is no XML header*/
+                        return true;
+                    }
+                    else
+                    {
+                        setError( "expected xml header question mark '?' after open tag angle bracket '<'");
+                        return false;
+                    }
+                    break;
+
+                case ParseXmlHdr:
+                    if (ch <= 32 || ch == '?')
+                    {
+                        if (m_item != "xml")
+                        {
+                            setError( "expected '<?xml' as xml header start");
+                            return false;
+                        }
+                        m_item.clear();
+                        if (ch == '?') return true; /*...."<?xml?>"*/
+
+                        m_state = FindAttributeName;
+                    }
+                    else if (isNameChar( ch))
+                    {
+                        m_item.push_back(ch);
+                    }
+                    else if (ch == '>')
+                    {
+                        setError( "unexpected close angle bracket '>' in xml header after '<?xml'");
+                        return false;
+                    }
+                    else
+                    {
+                        setError( "expected '<?xml' as xml header start (invalid character)");
+                        return false;
+                    }
+                    break;
+
+                case FindAttributeName:
+                    if (ch <= 32)
+                    {
+                        /*skip spaces*/
+                    }
+                    else if (ch == '>')
+                    {
+                        setError( "unexpected close angle bracket '>' in xml header (missing '?')");
+                        return false;
+                    }
+                    else if (ch == '?')
+                    {
+                        /*end of header reached*/
+                        return true;
+                    }
+                    else if (isNameChar( ch))
+                    {
+                        m_item.push_back(ch);
+                        m_state = ParseAttributeName;
+                    }
+                    else
+                    {
+                        setError( "invalid character in xml header attribute name");
+                        return false;
+                    }
+                    break;
+
+                case ParseAttributeName:
+                    if (ch <= 32 || ch == '=')
+                    {
+                        if (m_item == "encoding")
+                        {
+                            m_attributetype = Encoding;
+                        }
+                        else if (m_item == "version")
+                        {
+                            m_attributetype = Version;
+                        }
+                        else if (m_item == "standalone")
+                        {
+                            m_attributetype = Standalone;
+                        }
+                        else
+                        {
+                            setError( "unknown xml header attribute name");
+                            return false;
+                        }
+                        m_item.clear();
+                        m_state = (ch == '=') ? FindAttributeValue : FindAttributeAssign;
+                    }
+                    else if (isNameChar( ch))
+                    {
+                        m_item.push_back(ch);
+                    }
+                    else
+                    {
+                        setError( "invalid character in xml header attribute name");
+                        return false;
+                    }
+                    break;
+
+                case FindAttributeAssign:
+                    if (ch == '=')
+                    {
+                        m_state = FindAttributeValue;
+                    }
+                    else if (ch > 32)
+                    {
+                        setError( "expected '=' after xml header attribute name");
+                        return false;
+                    }
+                    break;
+
+                case FindAttributeValue:
+                    if (ch == '"')
+                    {
+                        m_state = ParseAttributeValueDq;
+                    }
+                    else if (ch == '\'')
+                    {
+                        m_state = ParseAttributeValueSq;
+                    }
+                    else if (ch > 32)
+                    {
+                        setError( "expected single or double quote string as xml header attribute value");
+                        return false;
+                    }
+                    break;
+
+                case ParseAttributeValueSq:
+                case ParseAttributeValueDq:
+                {
+                    /*both quoting styles only differ in the terminating character*/
+                    const unsigned char quote = (m_state == ParseAttributeValueSq) ? '\'' : '"';
+                    if (ch == quote)
+                    {
+                        /*only the encoding is stored, version and standalone are parsed but ignored*/
+                        if (m_attributetype == Encoding)
+                        {
+                            m_encoding = m_item;
+                        }
+                        m_item.clear();
+                        m_state = FindAttributeName;
+                    }
+                    else
+                    {
+                        m_item.push_back( ch);
+                    }
+                    break;
+                }
+            }/*switch(..)*/
+        }/*for(;..;..)*/
+        return false;
+    }
+
+    /// \brief Get the last error occurred
+    /// \return the pointer to the last error or NULL if no error occurred
+    const char* lasterror() const
+    {
+        return m_lasterror.empty()?0:m_lasterror.c_str();
+    }
+
+    /// \brief Get the encoding specified as attribute in the header
+    /// \return the encoding or NULL if not specified or not encountered yet in the source parsed
+    const char* encoding() const
+    {
+        return m_encoding.empty()?0:m_encoding.c_str();
+    }
+
+    /// \brief Get the number of characters consumed
+    /// \return the number of non null bytes with a code above 32 (space) read so far; spaces and control characters are not counted
+    std::size_t charsConsumed() const
+    {
+        return m_charsConsumed;
+    }
+
+    /// \brief Clear the data, reset the state
+    void clear()
+    {
+        m_state = Init;
+        m_attributetype = Encoding;
+        m_idx = 0;
+        m_charsConsumed = 0;
+        m_zeroCount = 0;
+        m_item.clear();
+        m_src.clear();
+        m_encoding.clear();
+        m_lasterror.clear();
+    }
+
+private:
+    /// \brief Set the error reported by lasterror()
+    void setError( const std::string& m)
+    {
+        m_lasterror = m;
+    }
+
+    /// \brief Test if a character is an ASCII letter (any case) or an underscore
+    static bool isNameChar( unsigned char ch)
+    {
+        const unsigned char lower = (unsigned char)(ch|32);
+        return (lower >= 'a' && lower <= 'z') || ch == '_';
+    }
+
+    /// \brief Fetch the next non null byte of the buffered source
+    /// \return the byte or 0 if the buffered source is exhausted
+    /// \remark The null byte counter is kept across calls; the 4th null byte in a row raises a std::runtime_error
+    unsigned char nextChar()
+    {
+        for (; m_zeroCount<4; m_zeroCount++)
+        {
+            if (m_idx >= m_src.size()) return 0;
+            unsigned char ch = m_src[m_idx];
+            ++m_idx;
+            if (ch != 0)
+            {
+                m_zeroCount = 0;
+                if (ch > 32)
+                {
+                    ++m_charsConsumed;
+                }
+                return ch;
+            }
+        }
+        throw std::runtime_error( "illegal XML header (4 or more null bytes in a row)");
+    }
+
+    /// \brief States of the header parser
+    enum State
+    {
+        Init,
+        ParseXmlOpen,
+        ParseXmlHdr,
+        FindAttributeName,
+        ParseAttributeName,
+        FindAttributeAssign,
+        FindAttributeValue,
+        ParseAttributeValueSq,
+        ParseAttributeValueDq
+    };
+
+    /// \brief Header attributes known by the parser
+    enum AttributeType
+    {
+        Encoding,
+        Version,
+        Standalone
+    };
+
+    /// \brief Get the name of a parser state (the table has to follow the order of the enum State)
+    static const char* stateName( State i)
+    {
+        static const char* ar[] = {"Init","ParseXmlOpen","ParseXmlHdr","FindAttributeName","ParseAttributeName","FindAttributeAssign","FindAttributeValue","ParseAttributeValueSq","ParseAttributeValueDq"};
+        return ar[ (int)i];
+    }
+
+private:
+    State m_state;                  ///< header parsing state
+    AttributeType m_attributetype;  ///< currently parsed attribute type
+    std::size_t m_idx;              ///< source index (index in m_src)
+    std::size_t m_charsConsumed;    ///< number of characters consumed
+    std::size_t m_zeroCount;        ///< counter of subsequent null bytes
+    std::string m_item;             ///< parsed item
+    std::string m_src;              ///< source buffered
+    std::string m_encoding;         ///< character set encoding parsed
+    std::string m_lasterror;        ///< last error
+};
+
+}//namespace
+#endif
+
diff --git a/textwolf/include/textwolf/xmlpathautomaton.hpp b/textwolf/include/textwolf/xmlpathautomaton.hpp
new file mode 100644
index 0000000..9ce8896
--- /dev/null
+++ b/textwolf/include/textwolf/xmlpathautomaton.hpp
@@ -0,0 +1,778 @@
+/*
+---------------------------------------------------------------------
+ The template library textwolf implements an input iterator on
+ a set of XML path expressions without backward references on an
+ STL conforming input iterator as source. It does no buffering
+ or read ahead and is dedicated for stream processing of XML
+ for a small set of XML queries.
+ Stream processing in this context refers to processing the
+ document without buffering anything but the current result token
+ processed with its tag hierarchy information.
+
+ Copyright (C) 2010,2011,2012,2013,2014 Patrick Frey
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 3.0 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+--------------------------------------------------------------------
+
+ The latest version of textwolf can be found at 'http://github.com/patrickfrey/textwolf'
+ For documentation see 'http://patrickfrey.github.com/textwolf'
+
+--------------------------------------------------------------------
+*/
+/// \file textwolf/xmlpathautomaton.hpp
+/// \brief Automaton to select path expressions from an XML iterator
+
+#ifndef __TEXTWOLF_XML_PATH_AUTOMATON_HPP__
+#define __TEXTWOLF_XML_PATH_AUTOMATON_HPP__
+#include "textwolf/char.hpp"
+#include "textwolf/charset.hpp"
+#include "textwolf/exception.hpp"
+#include "textwolf/xmlscanner.hpp"
+#include "textwolf/staticbuffer.hpp"
+#include <limits>
+#include <sstream>
+#include <string>
+#include <vector>
+#include <map>
+#include <cstddef>
+#include <stdexcept>
+
+namespace textwolf {
+
+///\class XMLPathSelectAutomaton
+///\tparam CharSet_ character set of the token definitions of the automaton
+///\brief Automaton to define XML path expressions and assign types (int values) to them
+template <class CharSet_=charset::UTF8>
+class XMLPathSelectAutomaton :public throws_exception
+{
+public:
+ ///\brief Default dimensions used by the constructor
+ enum
+ {
+ defaultMemUsage=3*1024, ///< default memory usage of the XML path select process, if not specified else
+ defaultMaxDepth=32 ///< default max tag stack depth, if not specified else
+ };
+ // The following values are all assigned by setMemUsage(std::size_t,unsigned int)
+ std::size_t memUsage; ///< total memory usage
+ unsigned int maxDepth; ///< max tag stack depth
+ std::size_t maxScopeStackSize; ///< max scope stack depth
+ unsigned int maxFollows; ///< maximum number of tokens searched in depth
+ unsigned int maxTriggers; ///< maximum number of open triggers
+ unsigned int maxTokens; ///< maximum number of open tokens
+
+public:
+ ///\brief Constructor
+ ///\remark Dimensions the automaton with defaultMemUsage and defaultMaxDepth and throws exception(DimOutOfRange) if setMemUsage rejects them
+ XMLPathSelectAutomaton()
+     :memUsage(defaultMemUsage)
+     ,maxDepth(defaultMaxDepth)
+     ,maxScopeStackSize(0)
+     ,maxFollows(0)
+     ,maxTriggers(0)
+     ,maxTokens(0)
+ {
+     const bool dimensionsAccepted = setMemUsage( memUsage, maxDepth);
+     if (!dimensionsAccepted)
+     {
+         throw exception( DimOutOfRange);
+     }
+ }
+ ///\brief Character set of the token definitions (the template argument)
+ typedef CharSet_ CharSet;
+ ///\brief Hash value type
+ typedef int Hash;
+ ///\brief Type of this automaton
+ typedef XMLPathSelectAutomaton<CharSet> ThisXMLPathSelectAutomaton;
+
+ ///\brief Destructor (virtual)
+ virtual ~XMLPathSelectAutomaton(){}
+
+public:
+ ///\enum Operation
+ ///\brief Enumeration of operation types in the automaton definition
+ ///\remark The values are used as index into the name table of operationName(Operation), so the order has to be kept in sync
+ enum Operation
+ {
+ Content, ///< searching content token
+ Tag, ///< searching a tag
+ Attribute, ///< searching an attribute
+ ThisAttributeValue, ///< checking the value of the attribute just parsed (not an arbitrary but this one)
+ AttributeValue, ///< searching a value of an attribute
+ ContentStart ///< looking for the start of content (to signal the end of the XML header)
+ };
+
+ ///\brief Get the name of the operation as string
+ ///\param[in] op operation to get the name of (has to be a valid value of the enum Operation)
+ ///\return the operation as string
+ static const char* operationName( Operation op)
+ {
+     // Table indexed by the numeric value of the operation
+     static const char* name[ 6] = {
+         "Content",
+         "Tag",
+         "Attribute",
+         "ThisAttributeValue",
+         "AttributeValue",
+         "ContentStart"
+     };
+     const unsigned int idx = (unsigned int)op;
+     return name[ idx];
+ }
+
+ ///\class Mask
+ ///\brief Mask to query for element types, if they match or not
+ ///\remark Both bitmasks have one bit per XMLScannerBase::ElementType (bit index = numeric value of the element type)
+ struct Mask
+ {
+     unsigned short pos; ///< positively selected elements bitmask
+     unsigned short neg; ///< negatively selected elements bitmask that determines when a search pattern is given up completely
+
+     ///\brief Tells if mask does not select anything anymore
+     ///\return true if it is not active anymore
+     bool empty() const {return (pos==0);}
+
+     ///\brief Constructor by values
+     ///\param [in] p_pos positively selected elements bitmask
+     ///\param [in] p_neg negatively selected elements bitmask that determines when a search pattern is given up completely
+     Mask( unsigned short p_pos=0, unsigned short p_neg=0):pos(p_pos),neg(p_neg) {}
+
+     ///\brief Copy constructor
+     ///\param[in] orig mask to copy
+     Mask( const Mask& orig) :pos(orig.pos),neg(orig.neg) {}
+
+     ///\brief Constructor by operation type
+     ///\param[in] op operation defining the element types matched and rejected (see seekop(Operation))
+     Mask( Operation op) :pos(0),neg(0) {this->seekop(op);}
+
+     ///\brief Reset operation (deactivate)
+     void reset() {pos=0; neg=0;}
+
+     ///\brief Deactivate operation for a certain element type
+     void reject( XMLScannerBase::ElementType e) {neg |= (1<<(unsigned short)e);}
+     ///\brief Test if the mask is given up on a certain element type
+     bool hasReject( XMLScannerBase::ElementType e) const {return (neg & (1<<(unsigned short)e)) != 0;}
+
+     ///\brief Declare an operation to match on an element type
+     void match( XMLScannerBase::ElementType e) {pos |= (1<<(unsigned short)e);}
+     ///\brief Test if the mask selects a certain element type
+     bool hasMatch( XMLScannerBase::ElementType e) const {return (pos & (1<<(unsigned short)e)) != 0;}
+
+     ///\brief Declare an operation as seek operation
+     ///\param[in] op operation to translate into matched and rejected element types
+     void seekop( Operation op)
+     {
+         switch (op)
+         {
+             case Tag:
+                 this->match( XMLScannerBase::OpenTag);
+                 this->match( XMLScannerBase::HeaderStart);
+                 break;
+             case Attribute:
+                 this->match( XMLScannerBase::TagAttribName);
+                 this->match( XMLScannerBase::HeaderAttribName);
+                 this->reject( XMLScannerBase::Content);
+                 break;
+             case ThisAttributeValue:
+                 this->match( XMLScannerBase::TagAttribValue);
+                 this->match( XMLScannerBase::HeaderAttribValue);
+                 this->reject( XMLScannerBase::TagAttribName);
+                 this->reject( XMLScannerBase::HeaderAttribName);
+                 this->reject( XMLScannerBase::Content);
+                 this->reject( XMLScannerBase::OpenTag);
+                 break;
+             case AttributeValue:
+                 this->match( XMLScannerBase::TagAttribValue);
+                 this->match( XMLScannerBase::HeaderAttribValue);
+                 this->reject( XMLScannerBase::Content);
+                 break;
+             case Content:
+                 this->match( XMLScannerBase::Content);
+                 break;
+             case ContentStart:
+                 this->match( XMLScannerBase::HeaderEnd);
+                 break;
+         }
+     }
+
+     ///\brief Get the name of the seek operation this mask was built from (for debug output)
+     ///\return the operation name, "None" for an empty mask or "" if the mask does not correspond to a single operation
+     const char* seekopName() const
+     {
+         if (this->hasMatch( XMLScannerBase::OpenTag)
+         && this->hasMatch( XMLScannerBase::HeaderStart))
+             return "Tag";
+
+         if (this->hasMatch( XMLScannerBase::TagAttribName)
+         && this->hasMatch( XMLScannerBase::HeaderAttribName)
+         && this->hasReject( XMLScannerBase::Content))
+             return "Attribute";
+
+         // ThisAttributeValue has to be tested before AttributeValue, because its
+         // conditions are a superset of the conditions of AttributeValue
+         if (this->hasMatch( XMLScannerBase::TagAttribValue)
+         && this->hasMatch( XMLScannerBase::HeaderAttribValue)
+         && this->hasReject( XMLScannerBase::TagAttribName)
+         && this->hasReject( XMLScannerBase::HeaderAttribName)
+         && this->hasReject( XMLScannerBase::Content)
+         && this->hasReject( XMLScannerBase::OpenTag))
+             return "ThisAttributeValue";
+
+         if (this->hasMatch( XMLScannerBase::TagAttribValue)
+         && this->hasMatch( XMLScannerBase::HeaderAttribValue)
+         && this->hasReject( XMLScannerBase::Content))
+             return "AttributeValue";
+
+         if (this->hasMatch( XMLScannerBase::Content))
+             return "Content";
+
+         if (this->hasMatch( XMLScannerBase::HeaderEnd))
+             return "ContentStart";
+
+         if (pos == 0 && neg == 0)
+             return "None";
+
+         return "";
+     }
+
+     ///\brief Join two mask definitions
+     ///\param[in] mask definition of mask to join this with
+     void join( const Mask& mask) {pos |= mask.pos; neg |= mask.neg;}
+
+     ///\brief Check if an element type matches the mask
+     ///\param[in] e element type to check
+     bool matches( XMLScannerBase::ElementType e) const {return (0 != (pos & (1<<(unsigned short)e)));}
+
+     ///\brief Check if an element type should reset a mask
+     ///\param[in] e element type to check
+     bool rejects( XMLScannerBase::ElementType e) const {return (0 != (neg & (1<<(unsigned short)e)));}
+ };
+
+ ///\class Core
+ ///\brief Core of an automaton state definition that is used during XML processing
+ struct Core
+ {
+     Mask mask;      ///< mask defining what tokens are matching this state
+     bool follow;    ///< true, if the state is seeking tokens in all follow scopes in the XML tree
+     int typeidx;    ///< type of the element emitted by this state on a match
+     int cnt_start;  ///< lower bound of the element index matching (for index ranges)
+     int cnt_end;    ///< upper bound of the element index matching (for index ranges)
+
+     ///\brief Constructor (empty mask, no follow, no type, index range [0,-1])
+     Core()
+         :mask()
+         ,follow(false)
+         ,typeidx(0)
+         ,cnt_start(0)
+         ,cnt_end(-1)
+     {}
+
+     ///\brief Copy constructor
+     ///\param [in] o element to copy
+     Core( const Core& o)
+         :mask(o.mask)
+         ,follow(o.follow)
+         ,typeidx(o.typeidx)
+         ,cnt_start(o.cnt_start)
+         ,cnt_end(o.cnt_end)
+     {}
+ };
+
+ ///\class State
+ ///\brief State of an automaton in its definition
+ ///\remark The state owns private copies of its key strings (allocated with new[]), therefore copy constructor, assignment and destructor are all defined
+ struct State
+ {
+     Core core;              ///< core of the state (the part used in processing)
+     unsigned int keysize;   ///< key size of the element
+     char* key;              ///< key of the element
+     char* srckey;           ///< key of the element as in source (for debugging or reporting, etc.)
+     int next;               ///< follow state
+     int link;               ///< alternative state to check
+
+     ///\brief Constructor
+     State()
+         :keysize(0),key(0),srckey(0),next(-1),link(-1) {}
+
+     ///\brief Copy constructor
+     ///\param [in] orig element to copy
+     State( const State& orig)
+         :core(orig.core),keysize(0),key(0),srckey(0),next(orig.next),link(orig.link)
+     {
+         defineKey( orig.keysize, orig.key, orig.srckey);
+     }
+
+     ///\brief Assignment operator
+     ///\param [in] orig element to copy
+     ///\remark Needed because the compiler generated assignment would copy the key pointers and lead to a double delete
+     State& operator=( const State& orig)
+     {
+         if (this != &orig)
+         {
+             // copy the keys first: if the allocation fails, the state is left unchanged
+             defineKey( orig.keysize, orig.key, orig.srckey);
+             core = orig.core;
+             next = orig.next;
+             link = orig.link;
+         }
+         return *this;
+     }
+
+     ///\brief Destructor
+     ~State()
+     {
+         delete [] key;
+         delete [] srckey;
+     }
+
+     ///\brief Check if the state definition is empty
+     ///\return true for an empty state (no key and no output type defined)
+     bool isempty() const {return key==0&&core.typeidx==0;}
+
+     ///\brief Define the matching key of this state
+     ///\param[in] p_keysize size of the key in bytes
+     ///\param[in] p_key pointer to the key (NULL to remove the key)
+     ///\param[in] p_srckey the source form of the key (ASCII with encoded entities for everything else), null terminated (NULL to remove it)
+     ///\remark The new copies are allocated before the old ones are released, so the call is safe if an argument points to a key of this state or if an allocation fails
+     void defineKey( unsigned int p_keysize, const char* p_key, const char* p_srckey)
+     {
+         unsigned int ii;
+         char* newkey = 0;
+         char* newsrckey = 0;
+
+         if (p_key)
+         {
+             newkey = new char[ p_keysize];
+             for (ii=0; ii<p_keysize; ii++) newkey[ii]=p_key[ii];
+         }
+         if (p_srckey)
+         {
+             unsigned int srclen = 0;
+             while (p_srckey[ srclen] != 0) ++srclen;
+             try
+             {
+                 newsrckey = new char[ srclen+1];
+             }
+             catch (...)
+             {
+                 delete [] newkey;
+                 throw;
+             }
+             for (ii=0; ii<srclen; ii++) newsrckey[ii]=p_srckey[ii];
+             newsrckey[ srclen] = 0;
+         }
+         delete [] key;
+         delete [] srckey;
+         key = newkey;
+         srckey = newsrckey;
+         keysize = newkey ? p_keysize : 0;
+     }
+
+     ///\brief Define a state transition by key and operation
+     ///\param[in] op operation type
+     ///\param[in] p_keysize size of the key in bytes
+     ///\param[in] p_key pointer to the key
+     ///\param[in] p_srckey the source form of the key (ASCII with encoded entities for everything else)
+     ///\param[in] p_next follow state on a match
+     ///\param[in] p_follow true if the search reaches all included follow scopes of the definition scope
+     void defineNext( Operation op, unsigned int p_keysize, const char* p_key, const char* p_srckey, int p_next, bool p_follow=false)
+     {
+         core.mask.seekop( op);
+         defineKey( p_keysize, p_key, p_srckey);
+         next = p_next;
+         core.follow = p_follow;
+     }
+
+     ///\brief Define an element output operation
+     ///\param[in] mask mask defining the element types to output
+     ///\param[in] p_typeidx the type of the element produced
+     ///\param[in] p_follow true if the output reaches all included follow scopes of the definition scope
+     ///\param[in] p_start start index of the element range produced
+     ///\param[in] p_end upper bound index of the element range produced
+     void defineOutput( const Mask& mask, int p_typeidx, bool p_follow, int p_start, int p_end)
+     {
+         core.mask = mask;
+         core.typeidx = p_typeidx;
+         core.cnt_end = p_end;
+         core.cnt_start = p_start;
+         core.follow = p_follow;
+     }
+
+     ///\brief Link another state to check to the current state
+     ///\param[in] p_link the index of the state to link
+     void defLink( int p_link)
+     {
+         link = p_link;
+     }
+
+     ///\brief Get the state as string for debug output
+     ///\return the string with follow state ("->"), link ("~"), operation, key, index range and output type ("=>")
+     std::string tostring() const
+     {
+         std::ostringstream rt;
+         if (next >= 0) rt << " ->" << next;
+         if (link >= 0) rt << " ~" << link;
+         rt << ' ';
+         if (core.follow)
+         {
+             rt << '/';
+         }
+         rt << '/';
+         rt << core.mask.seekopName();
+         if (srckey)
+         {
+             rt << " '" << srckey << "'";
+         }
+         else
+         {
+             rt << " (null)";
+         }
+         if (core.cnt_end > 0)
+         {
+             rt << '[' << core.cnt_start << ',' << core.cnt_end << ']';
+         }
+         if (core.typeidx)
+         {
+             rt << " =>" << core.typeidx;
+         }
+         return rt.str();
+     }
+ };
+ std::vector<State> states; //< the states of the statemachine
+
+ ///\brief Returns the content of the automaton as pretty printed string for debug output
+ ///\return one line per state of the form "<state index>: <state as string>"
+ std::string tostring() const
+ {
+     std::ostringstream out;
+     typename std::vector<State>::const_iterator first = states.begin();
+     typename std::vector<State>::const_iterator cur = first;
+     typename std::vector<State>::const_iterator last = states.end();
+     while (cur != last)
+     {
+         const int stateno = (int)(cur - first);
+         out << stateno << ": " << cur->tostring() << std::endl;
+         ++cur;
+     }
+     return out.str();
+ }
+
+ ///\class Token
+ ///\brief Active or passive but still valid token of the XML processing (this is a trigger waiting to match)
+ struct Token
+ {
+     Core core;      ///< core of the state
+     int stateidx;   ///< index into the automaton, pointing to the state
+
+     ///\brief Constructor (token not bound to any state: stateidx is -1)
+     Token()
+         :core()
+         ,stateidx(-1)
+     {}
+
+     ///\brief Copy constructor
+     ///\param [in] orig token to copy
+     Token( const Token& orig)
+         :core(orig.core)
+         ,stateidx(orig.stateidx)
+     {}
+
+     ///\brief Constructor by value
+     ///\param [in] state state that generated this token
+     ///\param [in] p_stateidx index of the state that generated this token
+     Token( const State& state, int p_stateidx)
+         :core(state.core)
+         ,stateidx(p_stateidx)
+     {}
+ };
+
+ ///\class Scope
+ ///\brief Tag scope definition
+ struct Scope
+ {
+     Mask mask;          ///< joined mask of all tokens active in this scope
+     Mask followMask;    ///< joined mask of all tokens active in this and all sub scopes of this scope
+
+     ///\class Range
+     ///\brief Range on the token stack with all tokens that belong to this scope
+     struct Range
+     {
+         unsigned int tokenidx_from; ///< lower bound token index
+         unsigned int tokenidx_to;   ///< upper bound token index
+         unsigned int followidx;     ///< pointer to follow token stack with tokens active in this and all sub scopes of this scope
+
+         ///\brief Constructor
+         Range() :tokenidx_from(0),tokenidx_to(0),followidx(0) {}
+         ///\brief Copy constructor
+         ///\param[in] orig range to copy
+         Range( const Range& orig) :tokenidx_from(orig.tokenidx_from),tokenidx_to(orig.tokenidx_to),followidx(orig.followidx) {}
+     };
+     Range range;        ///< valid (active) token range of this scope (on the token stacks)
+
+     ///\brief Copy constructor
+     ///\param[in] orig scope to copy
+     Scope( const Scope& orig) :mask(orig.mask),followMask(orig.followMask),range(orig.range) {}
+     ///\brief Assignment operator
+     ///\param[in] orig scope to copy
+     Scope& operator =( const Scope& orig) {mask=orig.mask; followMask=orig.followMask; range=orig.range; return *this;}
+     ///\brief Constructor
+     Scope() {}
+ };
+
+ ///\brief Defines the usage of memory
+ ///\param [in] p_memUsage size of the memory block in bytes
+ ///\param [in] p_maxDepth maximum depth of the scope stack
+ ///\return true, if everything is OK, false if the memory block is too small for the scope stack, the follow and trigger index stacks and at least one token
+ ///\remark The memory is divided into the scope stack (p_maxDepth elements), the follow and trigger index stacks and the token stack that gets the rest
+ bool setMemUsage( std::size_t p_memUsage, unsigned int p_maxDepth)
+ {
+     memUsage = p_memUsage;
+     maxDepth = p_maxDepth;
+     maxScopeStackSize = maxDepth;
+     if (p_memUsage < maxScopeStackSize * sizeof(Scope))
+     {
+         maxScopeStackSize = 0;
+     }
+     else
+     {
+         p_memUsage -= maxScopeStackSize * sizeof(Scope);
+     }
+     maxFollows = (p_memUsage / sizeof(std::size_t)) / 32 + 2;
+     maxTriggers = (p_memUsage / sizeof(std::size_t)) / 32 + 3;
+
+     // The index stacks have a fixed minimum size, so they can need more than what is left.
+     // An unchecked subtraction would wrap around and report a huge number of tokens.
+     const std::size_t indexStackMem = sizeof(std::size_t) * maxFollows + sizeof(std::size_t) * maxTriggers;
+     if (p_memUsage < indexStackMem)
+     {
+         maxTokens = 0;
+     }
+     else
+     {
+         p_memUsage -= indexStackMem;
+         maxTokens = p_memUsage / sizeof(Token);
+     }
+     return (maxScopeStackSize != 0 && maxTokens != 0 && maxFollows != 0 && maxTriggers != 0);
+ }
+
+private:
+ ///\brief Defines a state transition
+ ///\param [in] stateidx from what source state
+ ///\param [in] op operation firing the state transition
+ ///\param [in] keysize length of the key firing the state transition in bytes
+ ///\param [in] key the key string firing the state transition in bytes
+ ///\param [in] srckey the ASCII encoded representation in the source
+ ///\param [in] follow true, if the state transition is active for all sub scopes of the activation state
+ ///\return the target state of the transition defined
+ ///\remark Throws exception(IllegalParam) for a state index out of range, exception(OutOfMem) on allocation failure and exception(Unknown) for any other error
+ ///\remark If the source state or one of its linked alternatives already has a transition with the same key and follow flag, the target of that transition is returned
+ int defineNext( int stateidx, Operation op, unsigned int keysize, const char* key, const char* srckey, bool follow=false) throw(exception)
+ {
+     try
+     {
+         State state;
+         if (states.size() == 0)
+         {
+             stateidx = states.size();
+             states.push_back( state);
+         }
+         if (stateidx < 0 || (std::size_t)stateidx >= states.size()) throw exception( IllegalParam);
+
+         // Search the chain of alternatives for an existing transition with the same key;
+         // after the loop stateidx refers to the last element of the chain
+         for (int ee=stateidx; ee != -1; stateidx=ee,ee=states[ee].link)
+         {
+             if (states[ee].key != 0 && keysize == states[ee].keysize && states[ee].core.follow == follow)
+             {
+                 unsigned int ii;
+                 for (ii=0; ii<keysize && states[ee].key[ii]==key[ii]; ii++);
+                 if (ii == keysize) return states[ee].next;
+             }
+         }
+         if (!states[stateidx].isempty())
+         {
+             // last alternative is occupied: append a new alternative to the chain
+             stateidx = states[stateidx].link = states.size();
+             states.push_back( state);
+         }
+         // create the target state of the transition
+         states.push_back( state);
+         unsigned int lastidx = states.size()-1;
+         states[ stateidx].defineNext( op, keysize, key, srckey, lastidx, follow);
+         return (int)lastidx;
+     }
+     catch (const std::bad_alloc&)
+     {
+         throw exception( OutOfMem);
+     }
+     catch (const exception&)
+     {
+         // errors of this library are passed on unchanged
+         throw;
+     }
+     catch (...)
+     {
+         throw exception( Unknown);
+     }
+ }
+
+ ///\brief Defines an output print action and output type for a state
+ ///\param [in] stateidx from what source state
+ ///\param [in] printOpMask mask for elements printed
+ ///\param [in] typeidx type identifier
+ ///\param [in] follow true, if the state transition is active for all sub scopes of the activation state
+ ///\param [in] start start of index range where this state transition fires
+ ///\param [in] end end of index range where this state transition fires
+ ///\return index of the state where this output action was defined
+ ///\remark Throws exception(IllegalParam) for a state index out of range, exception(OutOfMem) on allocation failure and exception(Unknown) for any other error
+ int defineOutput( int stateidx, const Mask& printOpMask, int typeidx, bool follow, int start, int end) throw(exception)
+ {
+     try
+     {
+         State state;
+         if (states.size() == 0)
+         {
+             stateidx = states.size();
+             states.push_back( state);
+         }
+         // the cast maps negative indices to values that are out of range too
+         if ((unsigned int)stateidx >= states.size()) throw exception( IllegalParam);
+
+         if (!states[stateidx].isempty())
+         {
+             // state is occupied: define the output in a new alternative linked to it
+             stateidx = states[stateidx].link = states.size();
+             states.push_back( state);
+         }
+         states[ stateidx].defineOutput( printOpMask, typeidx, follow, start, end);
+         return stateidx;
+     }
+     catch (const std::bad_alloc&)
+     {
+         throw exception( OutOfMem);
+     }
+     catch (const exception&)
+     {
+         // errors of this library (e.g. IllegalParam from above) are passed on unchanged
+         throw;
+     }
+     catch (...)
+     {
+         throw exception( Unknown);
+     }
+ }
+
+public:
+ ///\class PathElement
+ ///\brief Defines one node in the XML Path element tree in the construction phase.
+ ///\remark This is just a construct for building the tree with cascading operators forming a path representation
+ struct PathElement :throws_exception
+ {
+ private:
+ XMLPathSelectAutomaton* xs; //< XML Path select automaton where this node is an element of
+ int stateidx; //< state of this element in the automaton
+
+ ///\class Range
+ ///\brief Element counting range defining what are indices of valid elements
+ struct Range
+ {
+     int start;  ///< index of starting element starting with 0
+     int end;    ///< index of upper boundary element (not belonging to range anymore). -1 if undefined (unlimited)
+
+     ///\brief Constructor (range starting with 0 and without upper boundary)
+     Range()
+         :start(0)
+         ,end(-1)
+     {}
+     ///\brief Constructor by value
+     ///\param [in] count number of elements starting with the first one (with index 0)
+     Range( int count)
+         :start(0)
+         ,end(count)
+     {}
+     ///\brief Constructor by value
+     ///\param [in] p_start index of starting element
+     ///\param [in] p_end index of upper boundary element (not belonging to range anymore). -1 if undefined (unlimited)
+     Range( int p_start, int p_end)
+         :start(p_start)
+         ,end(p_end)
+     {}
+     ///\brief Copy constructor
+     ///\param [in] o range element to copy
+     Range( const Range& o)
+         :start(o.start)
+         ,end(o.end)
+     {}
+ };
+ Range range; //< Index range of this XML path element
+ bool follow; //< true, if this element is active (firing) for all sub scopes of the activation scope
+ Mask pushOpMask; //< mask for firing element actions
+ Mask printOpMask; //< mask for printing element actions
+
+ private:
+ ///\brief Define an output operation for a certain element type in this state
+ ///\param [in] op XML operation type of this output
+ ///\return *this
+ ///\remark Replaces the print mask of this element by the mask of the operation passed
+ PathElement& defineOutput( Operation op)
+ {
+     Mask& printmask = printOpMask;
+     printmask.reset();
+     printmask.seekop( op);
+     return *this;
+ }
+
+ ///\brief Define a state transition operation for a token of a certain element type in this state
+ ///\param [in] op XML operation type of this state transition
+ ///\param [in] value key value as ASCII with encoded entities for higher unicode characters of this state transition (NULL for a transition without key)
+ ///\return *this
+ ///\remark Does nothing if the element is not bound to an automaton
+ ///\remark Throws exception(IllegalAttributeName) if the key cannot be parsed as token
+ PathElement& doSelect( Operation op, const char* value) throw(exception)
+ {
+     static XMLScannerBase::IsTagCharMap isTagCharMap;
+     if (xs == 0)
+     {
+         return *this;
+     }
+     if (!value)
+     {
+         // transition without key
+         stateidx = xs->defineNext( stateidx, op, 0, 0, 0, follow);
+         return *this;
+     }
+     // translate the source form of the key into the character set of the automaton
+     typedef XMLScanner<char*,CharSet,CharSet,StaticBuffer> StaticXMLScanner;
+     char keymem[ 1024];
+     StaticBuffer keybuf( keymem, sizeof(keymem));
+     char* src = const_cast<char*>(value);
+     const bool parsed = StaticXMLScanner::parseStaticToken( isTagCharMap, src, keybuf);
+     if (!parsed)
+     {
+         throw exception( IllegalAttributeName);
+     }
+     stateidx = xs->defineNext( stateidx, op, keybuf.size(), keybuf.ptr(), value, follow);
+     return *this;
+ }
+
+ ///\brief Define this element as active (firing,printing) for all sub scopes of the activation scope
+ ///\return *this
+ PathElement& doFollow()
+ {
+     PathElement& self = *this;
+     self.follow = true;
+     return self;
+ }
+
+ ///\brief Define a valid range of token count for this element to be active
+ ///\param [in] p_start index of starting element starting with 0
+ ///\param [in] p_end index of upper boundary element (not belonging to range anymore). -1 if undefined (unlimited)
+ ///\return *this
+ ///\remark The first call on an element without upper boundary sets the range, subsequent calls narrow it:
+ ///        the upper boundary is lowered and the start is raised independently of each other
+ PathElement& doRange( int p_start, int p_end)
+ {
+     if (range.end == -1)
+     {
+         range = Range( p_start, p_end);
+     }
+     else
+     {
+         // both boundaries are checked, so that one call can narrow the range on both sides
+         if (p_end < range.end)
+         {
+             range.end = p_end;
+         }
+         if (p_start > range.start)
+         {
+             range.start = p_start;
+         }
+     }
+     return *this;
+ }
+
+ ///\brief Define a valid range of token count for this element to be active by the number of elements
+ ///\param [in] p_count number of elements starting with 0
+ ///\return *this
+ ///\remark Same as doRange( 0, p_count)
+ PathElement& doCount( int p_count)
+ {
+ return doRange( 0, p_count);
+ }
+
+ ///\brief Define the start of the range of token count for this element to be active
+ ///\param [in] p_start index of starting element starting with 0
+ ///\return *this
+ ///\remark The upper bound passed to doRange is the maximum int value and not -1
+ PathElement& doStart( int p_start)
+ {
+ return doRange( p_start, std::numeric_limits<int>::max());
+ }
+
+ ///\brief Define the output of the current element
+ ///\param [in] typeidx type of the element produced
+ ///\return *this
+ ///\remark Does nothing if this element is not bound to an automaton. Otherwise the output is defined in the automaton with the print operation mask, the follow flag and the range of this element and this element moves to the state index returned
+ PathElement& push( int typeidx) throw(exception)
+ {
+ if (xs != 0) stateidx = xs->defineOutput( stateidx, printOpMask, typeidx, follow, range.start, range.end);
+ return *this;
+ }
+
+ public:
+ ///\brief Constructor of an element that is not bound to any automaton (xs == 0)
+ ///\remark 'range' is not listed in the initializer list and is therefore default constructed
+ PathElement() :xs(0),stateidx(0),follow(false),pushOpMask(0),printOpMask(0){}
+ ///\brief Constructor by values
+ ///\param [in] p_xs automaton of this element
+ ///\param [in] p_si state index of this element in the automaton definition
+ ///\remark 'range' is not listed in the initializer list and is therefore default constructed
+ PathElement( XMLPathSelectAutomaton* p_xs, int p_si=0) :xs(p_xs),stateidx(p_si),follow(false),pushOpMask(0),printOpMask(0){}
+ ///\brief Copy constructor (copies all members including the range)
+ ///\param [in] orig element to copy
+ PathElement( const PathElement& orig) :xs(orig.xs),stateidx(orig.stateidx),range(orig.range),follow(orig.follow),pushOpMask(orig.pushOpMask),printOpMask(orig.printOpMask) {}
+
+ ///\brief Corresponds to "//" in abbreviated syntax of XPath
+ ///\return *this
+ ///\remark same as doFollow(); despite being a postfix operator it modifies and returns this element and not a copy
+ PathElement& operator --(int) {return doFollow();}
+ ///\brief Select a tag by name (operator form)
+ ///\param [in] name name of the tag
+ ///\return *this
+ ///\remark same as selectTag(const char*)
+ PathElement& operator []( const char* name) throw(exception)
+ {
+     return selectTag( name);
+ }
+ ///\brief Select a tag by name
+ ///\param [in] name name of the tag
+ ///\return *this
+ PathElement& selectTag( const char* name) throw(exception)
+ {
+     return doSelect( Tag, name);
+ }
+
+ ///\brief Select an attribute by name and define its value as output (operator form)
+ ///\param [in] name name of the attribute
+ ///\return *this
+ ///\remark same as selectAttribute(const char*)
+ PathElement& operator ()( const char* name) throw(exception)
+ {
+     return selectAttribute( name);
+ }
+ ///\brief Select an attribute by name and define its value as output
+ ///\param [in] name name of the attribute
+ ///\return *this
+ PathElement& selectAttribute( const char* name) throw(exception)
+ {
+     doSelect( Attribute, name);
+     return defineOutput( ThisAttributeValue);
+ }
+
+ ///\brief Select by an attribute,value condition (operator form)
+ ///\remark same as ifAttribute(const char*,const char*)
+ ///\param [in] name name of the attribute
+ ///\param [in] value value of the attribute
+ ///\return *this
+ PathElement& operator ()( const char* name, const char* value) throw(exception)
+ {
+     return ifAttribute( name, value);
+ }
+
+ ///\brief Select by an attribute,value condition
+ ///\param [in] name name of the attribute
+ ///\param [in] value value of the attribute
+ ///\return *this
+ PathElement& ifAttribute( const char* name, const char* value) throw(exception)
+ {
+     doSelect( Attribute, name);
+     return doSelect( ThisAttributeValue, value);
+ }
+
+ ///\brief Define maximum element index to push
+ ///\param [in] idx maximum element index (inclusive); a negative value is passed on as -1 (undefined upper bound)
+ ///\return *this
+ PathElement& TO(int idx) throw(exception) {return doCount((idx>=0)?(idx+1):-1);}
+ ///\brief Define minimum element index to push
+ ///\param [in] idx minimum element index
+ ///\return *this
+ PathElement& FROM(int idx) throw(exception) {return doStart(idx);}
+ ///\brief Define minimum and maximum element index to push
+ ///\param [in] idx1 minimum element index
+ ///\param [in] idx2 maximum element index (inclusive); a negative value is passed on as -1 (undefined upper bound)
+ ///\return *this
+ PathElement& RANGE(int idx1, int idx2) throw(exception) {return doRange(idx1,(idx2>=0)?(idx2+1):-1);}
+ ///\brief Define index of the element index to push
+ ///\param [in] idx element index
+ ///\return *this
+ ///\remark same as doRange( idx, idx+1)
+ PathElement& INDEX(int idx) throw(exception) {return doRange(idx,idx+1);}
+
+ ///\brief Define element type to push (operator form)
+ ///\param [in] type element type
+ ///\return *this
+ ///\remark same as assignType(int)
+ PathElement& operator =(int type) throw(exception)
+ {
+     return assignType( type);
+ }
+ ///\brief Define element type to push
+ ///\param [in] type element type
+ ///\return *this
+ PathElement& assignType(int type) throw(exception)
+ {
+     return push( type);
+ }
+
+ ///\brief Define the content as output of this element (operator form)
+ ///\remark same as selectContent()
+ ///\return *this
+ PathElement& operator ()() throw(exception)
+ {
+     return selectContent();
+ }
+ ///\brief Define the content as output of this element
+ ///\return *this
+ PathElement& selectContent() throw(exception)
+ {
+     return defineOutput( Content);
+ }
+ };
+
+ ///\brief Get automaton root element to start an XML path definition
+ ///\return the automaton root element (a new PathElement bound to this automaton with state index 0)
+ PathElement operator*()
+ {
+ return PathElement( this);
+ }
+};
+
+} //namespace
+#endif
diff --git a/textwolf/include/textwolf/xmlpathautomatonparse.hpp b/textwolf/include/textwolf/xmlpathautomatonparse.hpp
new file mode 100644
index 0000000..33856de
--- /dev/null
+++ b/textwolf/include/textwolf/xmlpathautomatonparse.hpp
@@ -0,0 +1,245 @@
+/*
+---------------------------------------------------------------------
+ The template library textwolf implements an input iterator on
+ a set of XML path expressions without backward references on an
+ STL conforming input iterator as source. It does no buffering
+ or read ahead and is dedicated for stream processing of XML
+ for a small set of XML queries.
+ Stream processing in this context refers to processing the
+ document without buffering anything but the current result token
+ processed with its tag hierarchy information.
+
+ Copyright (C) 2010,2011,2012,2013,2014 Patrick Frey
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 3.0 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+--------------------------------------------------------------------
+
+ The latest version of textwolf can be found at 'http://github.com/patrickfrey/textwolf'
+ For documentation see 'http://patrickfrey.github.com/textwolf'
+
+--------------------------------------------------------------------
+*/
+/// \file textwolf/xmlpathautomatonparse.hpp
+/// \brief Parser to create a path expression selector automaton from a source (list of path expression in abbreviated syntax of xpath)
+
+#ifndef __TEXTWOLF_XML_PATH_AUTOMATON_PARSE_HPP__
+#define __TEXTWOLF_XML_PATH_AUTOMATON_PARSE_HPP__
+#include "textwolf/xmlpathautomaton.hpp"
+#include "textwolf/charset.hpp"
+#include "textwolf/cstringiterator.hpp"
+#include <limits>
+#include <string>
+#include <vector>
+#include <cstring>
+#include <cstddef>
+#include <stdexcept>
+
+namespace textwolf {
+
+///\class XMLPathSelectAutomatonParser
+///\tparam SrcCharSet character set of the automaton definition source
+ ///\tparam AtmCharSet character set of the token definitions of the automaton
+///\brief Automaton to define XML path expressions and assign types (int values) to them
+template <class SrcCharSet=charset::UTF8, class AtmCharSet=charset::UTF8>
+class XMLPathSelectAutomatonParser :public XMLPathSelectAutomaton<AtmCharSet>
+{
+public:
+ typedef XMLPathSelectAutomaton<AtmCharSet> ThisAutomaton;
+ typedef typename ThisAutomaton::PathElement PathElement;
+ typedef XMLPathSelectAutomatonParser This;
+ typedef TextScanner<CStringIterator,SrcCharSet> SrcScanner;
+
+public:
+ ///\brief Constructor
+ XMLPathSelectAutomatonParser(){}
+ ///\brief Destructor
+ virtual ~XMLPathSelectAutomatonParser(){}
+
+ ///\brief Parse a path expression in abbreviated syntax of XPath and add it to this automaton
+ ///\param [in] typeidx type (int value) assigned to the elements selected by the expression
+ ///\param [in] esrc source of the expression
+ ///\param [in] esrcsize size of the source in bytes
+ ///\return 0 on success, else the scanner position of the error plus 1 (never 0)
+ ///\remark Supported are steps of the form '/name', '//name', '/@name', '//@name', '@name' and ranges '[n]', '[n,]' and '[n,m]'
+ int addExpression( int typeidx, const char* esrc, std::size_t esrcsize)
+ {
+     // First pass: collect all identifiers converted to the automaton character set.
+     // They are stored as offsets into one string, because the string may reallocate
+     // while growing; pointers are only taken in the second pass when it is complete.
+     std::string idstrings;
+     std::vector<std::size_t> idref;
+     CStringIterator pitr( esrc, esrcsize);
+     SrcScanner pp( m_srccharset, pitr);
+
+     for (; *pp; skipSpaces( pp))
+     {
+         switch (*pp)
+         {
+             case '/':
+             case '@':
+                 ++pp;
+                 continue;
+             case '[':
+                 // ranges contain no identifiers, skip them as a whole
+                 while (*pp != 0 && *pp != ']') ++pp;
+                 if (*pp == 0) return pp.getPosition()+1;
+                 ++pp;
+                 continue;
+             default:
+                 if (pp.control() == Undef || pp.control() == Any)
+                 {
+                     idref.push_back( parseIdentifier( pp, idstrings));
+                 }
+                 else
+                 {
+                     return pp.getPosition()+1;
+                 }
+         }
+     }
+     typename std::vector<std::size_t>::const_iterator di = idref.begin(), de = idref.end();
+
+     // Second pass: build the path, consuming the collected identifiers in order
+     CStringIterator itr( esrc, esrcsize);
+     SrcScanner src( m_srccharset, itr);
+     PathElement expr( this);
+
+     for (; *src; skipSpaces( src))
+     {
+         switch (*src)
+         {
+             case '@':
+             {
+                 if (di == de) return src.getPosition()+1;
+                 ++src;
+                 skipIdentifier( src);
+                 expr( getIdentifier( *di++, idstrings));
+                 // must not fall through into the '/' case
+                 continue;
+             }
+             case '/':
+             {
+                 ++src;
+                 if (*src == '/')
+                 {
+                     // "//": the following step is active in all sub scopes
+                     ++src;
+                     expr--;
+                 }
+                 if (di == de) return src.getPosition()+1;
+                 if (*src == '@')
+                 {
+                     ++src;
+                     skipIdentifier( src);
+                     expr( getIdentifier( *di++, idstrings));
+                 }
+                 else
+                 {
+                     skipIdentifier( src);
+                     expr[ getIdentifier( *di++, idstrings)];
+                 }
+                 continue;
+             }
+             case '[':
+             {
+                 // Range
+                 ++src; skipSpaces( src);
+                 const int range_start = parseNum( src);
+                 if (range_start < 0) return src.getPosition()+1;
+                 skipSpaces( src);
+
+                 if (*src == ',')
+                 {
+                     ++src; skipSpaces( src);
+                     if (*src == ']')
+                     {
+                         // "[n,]": open upper bound
+                         expr.FROM( range_start);
+                     }
+                     else
+                     {
+                         // "[n,m]": parseNum already stopped on the first character after the number
+                         const int range_end = parseNum( src);
+                         if (range_end < 0) return src.getPosition()+1;
+                         skipSpaces( src);
+                         if (*src != ']') return src.getPosition()+1;
+                         expr.RANGE( range_start, range_end);
+                     }
+                 }
+                 else if (*src == ']')
+                 {
+                     // "[n]": single index
+                     expr.INDEX( range_start);
+                 }
+                 else
+                 {
+                     return src.getPosition()+1;
+                 }
+                 ++src; // skip the closing ']'
+                 continue;
+             }
+             default:
+                 return src.getPosition()+1;
+         }
+     }
+     expr.assignType( typeidx);
+     return 0;
+ }
+
+private:
+ ///\brief Advance the scanner to the next character that is not a space
+ ///\param [in,out] src scanner to advance
+ static void skipSpaces( SrcScanner& src)
+ {
+     while (src.control() == Space)
+     {
+         ++src;
+     }
+ }
+
+ ///\brief Parse a non negative decimal number of at most 8 digits
+ ///\param [in,out] src scanner positioned at the first digit; on return positioned at the first character that is not a digit
+ ///\return the number parsed or -1 if there was no digit or more than 8 digits
+ ///\remark The value is accumulated directly, so that no declaration of std::atoi (header cstdlib, not included here) is needed
+ static int parseNum( SrcScanner& src)
+ {
+     enum {MaxDigits=8};
+     int value = 0;
+     unsigned int nofDigits = 0;
+     for (; *src>='0' && *src<='9'; ++src)
+     {
+         // all digits are consumed, but only the first MaxDigits are accumulated (no overflow possible)
+         if (++nofDigits <= MaxDigits) value = value * 10 + (int)(*src - '0');
+     }
+     if (nofDigits == 0 || nofDigits > MaxDigits) return -1;
+     return value;
+ }
+
+ ///\brief Parse an identifier and append it, printed with the automaton character set and followed by a printed 0 character, to a string
+ ///\param [in,out] src scanner positioned at the start of the identifier; on return positioned behind it
+ ///\param [in,out] idstrings string the identifier is appended to
+ ///\return offset of the identifier in idstrings
+ std::size_t parseIdentifier( SrcScanner& src, std::string& idstrings)
+ {
+     const std::size_t startidx = idstrings.size();
+     while (src.control() == Undef || src.control() == Any)
+     {
+         m_atmcharset.print( *src, idstrings);
+         ++src;
+     }
+     m_atmcharset.print( 0, idstrings);
+     return startidx;
+ }
+
+ ///\brief Advance the scanner behind an identifier without storing it
+ ///\param [in,out] src scanner to advance
+ static void skipIdentifier( SrcScanner& src)
+ {
+     while (src.control() == Undef || src.control() == Any)
+     {
+         ++src;
+     }
+ }
+
+ ///\brief Get the pointer to an identifier stored in a string of identifiers
+ ///\param [in] idx offset of the identifier in idstrings (as returned by parseIdentifier)
+ ///\param [in] idstrings string containing the identifiers
+ ///\return pointer into the buffer of idstrings; it does not outlive idstrings
+ const char* getIdentifier( std::size_t idx, const std::string& idstrings) const
+ {
+ return idstrings.c_str() + idx;
+ }
+
+private:
+ AtmCharSet m_atmcharset;
+ SrcCharSet m_srccharset;
+};
+
+} //namespace
+#endif
diff --git a/textwolf/include/textwolf/xmlpathselect.hpp b/textwolf/include/textwolf/xmlpathselect.hpp
new file mode 100644
index 0000000..a57969e
--- /dev/null
+++ b/textwolf/include/textwolf/xmlpathselect.hpp
@@ -0,0 +1,516 @@
+/*
+---------------------------------------------------------------------
+ The template library textwolf implements an input iterator on
+ a set of XML path expressions without backward references on an
+ STL conforming input iterator as source. It does no buffering
+ or read ahead and is dedicated for stream processing of XML
+ for a small set of XML queries.
+ Stream processing in this context refers to processing the
+ document without buffering anything but the current result token
+ processed with its tag hierarchy information.
+
+ Copyright (C) 2010,2011,2012,2013,2014 Patrick Frey
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 3.0 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+--------------------------------------------------------------------
+
+ The latest version of textwolf can be found at 'http://github.com/patrickfrey/textwolf'
+ For documentation see 'http://patrickfrey.github.com/textwolf'
+
+--------------------------------------------------------------------
+*/
+/// \file textwolf/xmlpathselect.hpp
+/// \brief Context of running automaton selecting path expressions from an XML iterator
+
+#ifndef __TEXTWOLF_XML_PATH_SELECT_HPP__
+#define __TEXTWOLF_XML_PATH_SELECT_HPP__
+#include "textwolf/char.hpp"
+#include "textwolf/charset_interface.hpp"
+#include "textwolf/exception.hpp"
+#include "textwolf/xmlscanner.hpp"
+#include "textwolf/staticbuffer.hpp"
+#include "textwolf/xmlpathautomaton.hpp"
+#include <limits>
+#include <string>
+#include <vector>
+#include <map>
+#include <cstddef>
+
+namespace textwolf {
+
+/// \brief XML path select template
+/// \tparam CharSet_ character set encoding of the automaton elements
+template <class CharSet_>
+class XMLPathSelect :public throws_exception
+{
+public:
+ typedef XMLPathSelectAutomaton<CharSet_> ThisXMLPathSelectAutomaton;
+ typedef XMLPathSelect<CharSet_> ThisXMLPathSelect;
+
+private:
+ const ThisXMLPathSelectAutomaton* atm; //< XML select automaton
+ typedef typename ThisXMLPathSelectAutomaton::Mask Mask;
+ typedef typename ThisXMLPathSelectAutomaton::Token Token;
+ typedef typename ThisXMLPathSelectAutomaton::Hash Hash;
+ typedef typename ThisXMLPathSelectAutomaton::State State;
+ typedef typename ThisXMLPathSelectAutomaton::Scope Scope;
+
+ /// \class Array
+ /// \brief Array with a capacity fixed at construction. The storage is never reallocated, so references to elements stay valid when elements are appended.
+ /// \tparam Element element type of the array (default constructible and assignable)
+ template <typename Element>
+ class Array :public throws_exception
+ {
+     Element* m_ar; //< pointer to elements
+     std::size_t m_size; //< fill size (number of elements inserted)
+     std::size_t m_maxSize; //< allocation size (space reserved for this number of elements)
+
+     /// \brief Allocate the storage for a number of elements
+     /// \param [in] p_maxSize number of elements to reserve
+     /// \return the storage allocated (throws OutOfMem on failure)
+     static Element* allocate( std::size_t p_maxSize)
+     {
+         Element* rt = new (std::nothrow) Element[ p_maxSize];
+         if (rt == 0) throw exception( OutOfMem);
+         return rt;
+     }
+ public:
+     /// \brief Constructor
+     /// \param [in] p_maxSize allocation size (number of elements) to reserve
+     Array( std::size_t p_maxSize) :m_ar(0),m_size(0),m_maxSize(p_maxSize)
+     {
+         m_ar = allocate( m_maxSize);
+     }
+
+     /// \brief Copy constructor (deep copy; the compiler generated one would share the storage and lead to a double delete)
+     /// \param [in] o array to copy
+     Array( const Array& o) :m_ar(0),m_size(o.m_size),m_maxSize(o.m_maxSize)
+     {
+         m_ar = allocate( m_maxSize);
+         for (std::size_t ii=0; ii<m_size; ++ii) m_ar[ ii] = o.m_ar[ ii];
+     }
+
+     /// \brief Assignment (deep copy)
+     /// \param [in] o array to copy
+     /// \return *this
+     Array& operator=( const Array& o)
+     {
+         if (this != &o)
+         {
+             // allocate and fill the new storage first, so that this array is unchanged if the allocation fails
+             Element* ar = allocate( o.m_maxSize);
+             for (std::size_t ii=0; ii<o.m_size; ++ii) ar[ ii] = o.m_ar[ ii];
+             delete [] m_ar;
+             m_ar = ar;
+             m_size = o.m_size;
+             m_maxSize = o.m_maxSize;
+         }
+         return *this;
+     }
+
+     /// \brief Destructor
+     ~Array()
+     {
+         delete [] m_ar;
+     }
+
+     /// \brief Append one element
+     /// \param [in] elem element to append
+     /// \remark throws OutOfMem if the capacity is exhausted
+     void push_back( const Element& elem)
+     {
+         if (m_size == m_maxSize) throw exception( OutOfMem);
+         m_ar[ m_size++] = elem;
+     }
+
+     /// \brief Remove one element from the end
+     /// \remark throws NotAllowedOperation if the array is empty
+     void pop_back()
+     {
+         if (m_size == 0) throw exception( NotAllowedOperation);
+         m_size--;
+     }
+
+     /// \brief Access element by index
+     /// \param [in] idx index of the element starting with 0
+     /// \return element reference
+     /// \remark throws ArrayBoundsReadWrite if idx is not smaller than the fill size
+     Element& operator[]( std::size_t idx)
+     {
+         if (idx >= m_size) throw exception( ArrayBoundsReadWrite);
+         return m_ar[ idx];
+     }
+
+     /// \brief Get a reference of the element at the end of the array
+     /// \return element reference
+     /// \remark throws ArrayBoundsReadWrite if the array is empty
+     Element& back()
+     {
+         if (m_size == 0) throw exception( ArrayBoundsReadWrite);
+         return m_ar[ m_size-1];
+     }
+
+     /// \brief Shrink the array (growing is not possible with this function)
+     /// \param [in] p_size new array size, not bigger than the current fill size
+     /// \remark throws ArrayBoundsReadWrite if p_size is bigger than the current fill size
+     void resize( std::size_t p_size)
+     {
+         if (p_size > m_size) throw exception( ArrayBoundsReadWrite);
+         m_size = p_size;
+     }
+     /// \brief Get the fill size (number of elements inserted)
+     std::size_t size() const {return m_size;}
+     /// \brief Evaluate if the array contains no elements
+     bool empty() const {return m_size==0;}
+ };
+
+ /// \class Context
+ /// \brief State variables without stacks of the automaton
+ struct Context
+ {
+     XMLScannerBase::ElementType type; //< element type processed
+     const char* key; //< string value of element processed
+     unsigned int keysize; //< size of string value in bytes of element processed
+     Scope scope; //< active scope
+     unsigned int scope_iter; //< position of currently visited token in the active scope
+
+     /// \brief Constructor
+     /// \remark scope_iter is initialized too, so that a context that was never passed to init() holds no indeterminate value
+     Context() :type(XMLScannerBase::Content),key(0),keysize(0),scope_iter(0) {}
+
+     /// \brief Initialization for the next element processed
+     /// \param [in] p_type type of the current element processed
+     /// \param [in] p_key current element processed
+     /// \param [in] p_keysize size of the key in bytes
+     /// \remark The iteration restarts at the first token of the active scope (scope.range.tokenidx_from)
+     void init( XMLScannerBase::ElementType p_type, const char* p_key, int p_keysize)
+     {
+         type = p_type;
+         key = p_key;
+         keysize = p_keysize;
+         scope_iter = scope.range.tokenidx_from;
+     }
+ };
+
+ Array<Scope> scopestk; //< stack of scopes opened
+ Array<unsigned int> follows; //< indices of tokens active in all descendant scopes
+ Array<int> triggers; //< triggered elements
+ Array<Token> tokens; //< list of waiting tokens
+ Context context; //< state variables without stacks of the automaton
+
+ /// \brief Activate a state by index together with all states chained to it by 'link'
+ /// \param stateidx index of the first state to activate (-1 for none)
+ /// \remark The arrays filled here have a fixed capacity, appending to a full one throws OutOfMem
+ void expand( int stateidx)
+ {
+ while (stateidx!=-1)
+ {
+ const State& st = atm->states[ stateidx];
+ context.scope.mask.join( st.core.mask);
+ if (st.core.mask.empty() && st.core.typeidx != 0)
+ {
+ // a state with an empty mask and an output type is fetched directly as triggered element
+ triggers.push_back( st.core.typeidx);
+ }
+ else
+ {
+ if (st.core.follow)
+ {
+ // remember the index the token gets in 'tokens' (pushed below) as follow
+ context.scope.followMask.join( st.core.mask);
+ follows.push_back( tokens.size());
+ }
+ tokens.push_back( Token( st, stateidx));
+ }
+ stateidx = st.link;
+ }
+ }
+
+ /// \brief Declares the currently processed element of the XMLScanner input. By calling fetch we get the output elements from it
+ /// \param [in] type type of the current element processed
+ /// \param [in] key current element processed
+ /// \param [in] keysize size of the key in bytes
+ /// \remark An open tag pushes the active scope on the scope stack, a close tag restores the scope from the stack (if there is one) and drops the tokens and follows added since
+ void initProcessElement( XMLScannerBase::ElementType type, const char* key, int keysize)
+ {
+ if (context.type == XMLScannerBase::OpenTag)
+ {
+ //last step of open scope has to be done after all tokens were visited,
+ //i.e. with the next element initialization
+ context.scope.range.tokenidx_from = context.scope.range.tokenidx_to;
+ }
+ context.scope.range.tokenidx_to = tokens.size();
+ context.scope.range.followidx = follows.size();
+ context.init( type, key, keysize);
+
+ if (type == XMLScannerBase::OpenTag)
+ {
+ //first step of open scope saves the scope of the context on the stack
+ scopestk.push_back( context.scope);
+ context.scope.mask = context.scope.followMask;
+ context.scope.mask.match( XMLScannerBase::OpenTag);
+ //... we reset the mask but ensure that this 'OpenTag' is processed for sure
+ }
+ else if (type == XMLScannerBase::CloseTag || type == XMLScannerBase::CloseTagIm)
+ {
+ if (!scopestk.empty())
+ {
+ //restore the scope saved with the open tag and cut the arrays back to the sizes stored in it
+ context.scope = scopestk.back();
+ scopestk.pop_back();
+ follows.resize( context.scope.range.followidx);
+ tokens.resize( context.scope.range.tokenidx_to);
+ }
+ }
+ }
+
+ /// \brief Activate the follow states of a matching token (addressed by token index) if the counters of the token allow it
+ /// \param [in] tokenidx index of the token in the list of active tokens
+ /// \param [in] st state from which the expand was triggered
+ /// \remark cnt_end == -1 means no limit. Otherwise cnt_end is counted down with every call (the token mask is reset when it reaches 0) and st.next is only expanded when cnt_start has been counted down to 0 or below
+ void produce( unsigned int tokenidx, const State& st)
+ {
+ const Token& tk = tokens[ tokenidx];
+ if (tk.core.cnt_end == -1)
+ {
+ expand( st.next);
+ }
+ else
+ {
+ if (tk.core.cnt_end > 0)
+ {
+ if (--tokens[ tokenidx].core.cnt_end == 0)
+ {
+ tokens[ tokenidx].core.mask.reset();
+ }
+ if (tk.core.cnt_start <= 0)
+ {
+ expand( st.next);
+ }
+ else
+ {
+ --tokens[ tokenidx].core.cnt_start;
+ }
+ }
+ }
+ }
+
+ /// \brief check if an active token addressed by index matches to the currently processed element
+ /// \param [in] tokenidx index of the token in the list of active tokens
+ /// \return matching token type (0 if there is nothing to output)
+ int match( unsigned int tokenidx)
+ {
+ int rt = 0;
+ if (context.key != 0)
+ {
+ if (tokenidx >= context.scope.range.tokenidx_to) return 0;
+
+ // tk stays valid when produce() appends tokens, because the storage of the token array is never reallocated
+ const Token& tk = tokens[ tokenidx];
+ if (tk.core.mask.matches( context.type))
+ {
+ const State& st = atm->states[ tk.stateidx];
+ if (st.key)
+ {
+ // state with key: follow states are only activated if the key equals the element value byte by byte
+ if (st.keysize == context.keysize)
+ {
+ unsigned int ii;
+ for (ii=0; ii<context.keysize && st.key[ii] == context.key[ii]; ii++);
+ if (ii==context.keysize)
+ {
+ produce( tokenidx, st);
+ }
+ }
+ }
+ else
+ {
+ produce( tokenidx, st);
+ }
+ if (tk.core.typeidx != 0)
+ {
+ // NOTE(review): produce() above may already have counted down cnt_end/cnt_start of this token for the same element - confirm that counting again here is intended
+ if (tk.core.cnt_end == -1)
+ {
+ rt = tk.core.typeidx;
+ }
+ else if (tk.core.cnt_end > 0)
+ {
+ if (--tokens[ tokenidx].core.cnt_end == 0)
+ {
+ tokens[ tokenidx].core.mask.reset();
+ }
+ if (tk.core.cnt_start <= 0)
+ {
+ rt = tk.core.typeidx;
+ }
+ else
+ {
+ --tokens[ tokenidx].core.cnt_start;
+ }
+ }
+ }
+ }
+ if (tk.core.mask.rejects( context.type))
+ {
+ //The token must not match anymore after encountering a reject item
+ tokens[ tokenidx].core.mask.reset();
+ }
+ }
+ return rt;
+ }
+
+ /// \brief fetch the next matching element
+ /// \return type of the matching element or 0 if there are no more candidates for the current element
+ /// \remark The candidates are visited in this order: tokens of the active scope, follows, triggered elements. When all candidates are consumed the key of the context is reset.
+ int fetch()
+ {
+ int type = 0;
+
+ if (context.scope.mask.matches( context.type))
+ {
+ while (!type)
+ {
+ if (context.scope_iter < context.scope.range.tokenidx_to)
+ {
+ type = match( context.scope_iter);
+ ++context.scope_iter;
+ }
+ else
+ {
+ // scope_iter beyond the tokens of the scope is used as iterator on the follows
+ unsigned int ii = context.scope_iter - context.scope.range.tokenidx_to;
+ //we match all follows that have not yet been checked in the current scope
+ if (ii < context.scope.range.followidx && context.scope.range.tokenidx_from > follows[ ii])
+ {
+ type = match( follows[ ii]);
+ ++context.scope_iter;
+ }
+ else if (!triggers.empty())
+ {
+ type = triggers.back();
+ triggers.pop_back();
+ }
+ else
+ {
+ context.key = 0;
+ context.keysize = 0;
+ return 0; //end of all candidates
+ }
+ }
+ }
+ }
+ else
+ {
+ // no active token is interested in this element type
+ context.key = 0;
+ context.keysize = 0;
+ }
+ return type;
+ }
+
+public:
+ /// \brief Constructor
+ /// \param[in] p_atm read only XML path select automaton reference (must not be 0, it is dereferenced here)
+ /// \remark The capacities of the internal arrays are taken from the automaton. The state with index 0 is activated if the automaton has states.
+ XMLPathSelect( const ThisXMLPathSelectAutomaton* p_atm)
+ :atm(p_atm),scopestk(p_atm->maxScopeStackSize),follows(p_atm->maxFollows),triggers(p_atm->maxTriggers),tokens(p_atm->maxTokens)
+ {
+ if (atm->states.size() > 0) expand(0);
+ }
+
+ /// \brief Copy constructor
+ /// \param [in] o element to copy
+ /// \remark The context (current element and active scope) is copied too, a copy without it would not continue in the state of the original
+ XMLPathSelect( const XMLPathSelect& o)
+     :atm(o.atm),scopestk(o.scopestk),follows(o.follows),triggers(o.triggers),tokens(o.tokens),context(o.context){}
+
+ /// \class iterator
+ /// \brief input iterator on the types of the elements selected by an XMLPathSelect for the element fed to it
+ class iterator
+ {
+ public:
+     typedef int value_type;
+     typedef std::size_t difference_type;
+     typedef int* pointer;
+     typedef int& reference;
+     typedef std::input_iterator_tag iterator_category;
+
+ private:
+     int element; //< currently visited element (type)
+     ThisXMLPathSelect* input; //< producing XML path selection stream
+
+     /// \brief Fetch the next element from the input (0 if there is no input)
+     /// \return *this
+     iterator& skip() throw(exception)
+     {
+         element = (input != 0) ? input->fetch() : 0;
+         return *this;
+     }
+
+     /// \brief Compare the visited elements of two iterators (the inputs are not compared)
+     /// \param [in] iter iterator to compare with
+     /// \return true, if the elements are equal
+     bool compare( const iterator& iter) const
+     {
+         return (element == iter.element);
+     }
+
+ public:
+     /// \brief Assign iterator
+     /// \param [in] orig iterator to copy
+     void assign( const iterator& orig)
+     {
+         element = orig.element;
+         input = orig.input;
+     }
+
+     /// \brief Copy constructor
+     /// \param [in] orig iterator to copy
+     iterator( const iterator& orig)
+         :element(orig.element),input(orig.input) {}
+
+     /// \brief Constructor by values. Feeds the element to the path selection and fetches the first result.
+     /// \param [in] p_input XML path selection stream to iterate through
+     /// \param [in] p_type XML element type to feed to XML path matcher
+     /// \param [in] p_key XML element value reference to feed to XML path matcher
+     /// \param [in] p_keysize XML element value size in bytes to feed to XML path matcher
+     iterator( ThisXMLPathSelect& p_input, XMLScannerBase::ElementType p_type, const char* p_key, int p_keysize)
+         :element(0),input( &p_input)
+     {
+         input->initProcessElement( p_type, p_key, p_keysize);
+         skip();
+     }
+
+     /// \brief Default constructor (end iterator: element 0, no input)
+     iterator()
+         :element(0),input(0) {}
+
+     /// \brief Assignment
+     /// \param [in] orig iterator to copy
+     /// \return *this
+     iterator& operator = (const iterator& orig)
+     {
+         assign( orig);
+         return *this;
+     }
+
+     /// \brief Element access
+     /// \return the currently visited element (type)
+     int operator*() const
+     {
+         return element;
+     }
+
+     /// \brief Element access
+     /// \return read only pointer to the currently visited element
+     const int* operator->() const
+     {
+         return &element;
+     }
+
+     /// \brief Preincrement
+     /// \return *this
+     iterator& operator++()
+     {
+         return skip();
+     }
+
+     /// \brief Postincrement
+     /// \return copy of this iterator before the increment
+     iterator operator++(int)
+     {
+         const iterator previous( *this);
+         skip();
+         return previous;
+     }
+
+     /// \brief Compare elements for equality
+     /// \return true, if they are equal
+     bool operator==( const iterator& iter) const
+     {
+         return compare( iter);
+     }
+
+     /// \brief Compare elements for inequality
+     /// \return true, if they are not equal
+     bool operator!=( const iterator& iter) const
+     {
+         return !compare( iter);
+     }
+ };
+
+ /// \brief Feed the path selector with the next token and get the start iterator for the results
+ /// \param [in] type type of the XML element fed
+ /// \param [in] key value of the XML element fed
+ /// \param [in] keysize size of the value in bytes
+ /// \return iterator pointing to the first of the selected XML path elements
+ /// \remark The pointer 'key' is stored and read when the iterator returned is incremented, so the value has to stay valid as long as the iterator is used
+ iterator push( XMLScannerBase::ElementType type, const char* key, int keysize)
+ {
+ return iterator( *this, type, key, keysize);
+ }
+
+ /// \brief Feed the path selector with the next token and get the start iterator for the results
+ /// \param [in] type type of the XML element fed
+ /// \param [in] key value of the XML element fed
+ /// \return iterator pointing to the first of the selected XML path elements
+ /// \remark A pointer into 'key' is stored and read when the iterator returned is incremented, so the string has to stay unchanged as long as the iterator is used
+ iterator push( XMLScannerBase::ElementType type, const std::string& key)
+ {
+     // size() has to be called; naming the member function without calling it does not compile when this overload is instantiated
+     return iterator( *this, type, key.c_str(), key.size());
+ }
+
+ /// \brief Get the end of results returned by 'push(XMLScannerBase::ElementType,const char*, int)'
+ /// \return the end iterator (default constructed, its element is 0)
+ /// \remark Iterators are compared by their element only, so every iterator that fetched 0 is equal to the end iterator
+ iterator end()
+ {
+ return iterator();
+ }
+};
+
+}//namespace
+#endif
diff --git a/textwolf/include/textwolf/xmlprinter.hpp b/textwolf/include/textwolf/xmlprinter.hpp
new file mode 100644
index 0000000..acf7bd4
--- /dev/null
+++ b/textwolf/include/textwolf/xmlprinter.hpp
@@ -0,0 +1,387 @@
+/*
+---------------------------------------------------------------------
+ The template library textwolf implements an input iterator on
+ a set of XML path expressions without backward references on an
+ STL conforming input iterator as source. It does no buffering
+ or read ahead and is dedicated for stream processing of XML
+ for a small set of XML queries.
+ Stream processing in this context refers to processing the
+ document without buffering anything but the current result token
+ processed with its tag hierarchy information.
+
+ Copyright (C) 2010,2011,2012,2013,2014 Patrick Frey
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 3.0 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+--------------------------------------------------------------------
+
+ The latest version of textwolf can be found at 'http://github.com/patrickfrey/textwolf'
+ For documentation see 'http://patrickfrey.github.com/textwolf'
+
+--------------------------------------------------------------------
+*/
+/// \file textwolf/xmlprinter.hpp
+/// \brief XML printer interface hiding character encoding properties
+
+#ifndef __TEXTWOLF_XML_PRINTER_HPP__
+#define __TEXTWOLF_XML_PRINTER_HPP__
+#include "textwolf/cstringiterator.hpp"
+#include "textwolf/textscanner.hpp"
+#include "textwolf/xmlscanner.hpp"
+#include "textwolf/charset.hpp"
+#include "textwolf/xmltagstack.hpp"
+#include <cstring>
+#include <cstdlib>
+
+/// \namespace textwolf
+/// \brief Toplevel namespace of the library
+namespace textwolf {
+
+/// \class XMLPrinter
+/// \brief Character encoding dependent XML printer
+/// \tparam IOCharset Character set encoding of input and output
+/// \tparam AppCharset Character set encoding of the application processor
+/// \tparam BufferType STL back insertion sequence to use for printing output
+/// \remark The printer keeps a small state (see State): printHeader() switches from Init to Content,
+/// printOpenTag() to TagElement, printAttribute() to TagAttribute and printValue() back to TagElement
+/// (attribute value) or to Content (content value).
+template <class IOCharset, class AppCharset,class BufferType>
+class XMLPrinter
+{
+private:
+ /// \brief Prints a character string to an STL back insertion sequence buffer in the IO character set encoding
+ /// \param [in] src pointer to string to print
+ /// \param [in] srcsize size of src in bytes
+ /// \param [out] buf buffer to append result to
+ /// \remark Printing stops at the first character the scanner reports as 0
+ void printToBuffer( const char* src, std::size_t srcsize, BufferType& buf) const
+ {
+ CStringIterator itr( src, srcsize);
+ TextScanner<CStringIterator,AppCharset> ts( itr);
+
+ UChar ch;
+ while ((ch = ts.chr()) != 0)
+ {
+ m_output.print( ch, buf);
+ ++ts;
+ }
+ }
+
+ /// \brief print a character substitute or the character itself
+ /// \param [in] ch character to print
+ /// \param [in,out] buf buffer to print to
+ /// \param [in] nof_echr number of elements in echr and estr
+ /// \param [in] echr ASCII characters to substitute
+ /// \param [in] estr ASCII strings to substitute with (array parallel to echr)
+ void printEsc( char ch, BufferType& buf, unsigned int nof_echr, const char* echr, const char** estr) const
+ {
+ const char* cc = (const char*)memchr( echr, ch, nof_echr);
+ if (cc)
+ {
+ // the position of the match in echr selects the substitute in estr
+ unsigned int ii = 0;
+ const char* tt = estr[ cc-echr];
+ while (tt[ii]) m_output.print( tt[ii++], buf);
+ }
+ else
+ {
+ m_output.print( ch, buf);
+ }
+ }
+
+ /// \brief print a value with some characters replaced by a string
+ /// \param [in] src pointer to attribute value string to print
+ /// \param [in] srcsize size of src in bytes
+ /// \param [in,out] buf buffer to print to
+ /// \param [in] nof_echr number of elements in echr and estr
+ /// \param [in] echr ASCII characters to substitute
+ /// \param [in] estr ASCII strings to substitute with (array parallel to echr)
+ void printToBufferSubstChr( const char* src, std::size_t srcsize, BufferType& buf, unsigned int nof_echr, const char* echr, const char** estr) const
+ {
+ CStringIterator itr( src, srcsize);
+ textwolf::TextScanner<CStringIterator,AppCharset> ts( itr);
+
+ textwolf::UChar ch;
+ while ((ch = ts.chr()) != 0)
+ {
+ if (ch < 128)
+ {
+ // only ASCII characters are candidates for substitution
+ printEsc( (char)ch, buf, nof_echr, echr, estr);
+ }
+ else
+ {
+ m_output.print( ch, buf);
+ }
+ ++ts;
+ }
+ }
+
+ /// \brief print attribute value string (enclosed in double quotes, special characters escaped)
+ /// \param [in] src pointer to attribute value string to print
+ /// \param [in] srcsize size of src in bytes
+ /// \param [in,out] buf buffer to print to
+ void printToBufferAttributeValue( const char* src, std::size_t srcsize, BufferType& buf) const
+ {
+ // nof_echr has to be the exact number of entries of the two parallel tables below
+ enum {nof_echr = 10};
+ static const char* estr[nof_echr] = {"&lt;", "&gt;", "&apos;", "&quot;", "&amp;", "&#0;", "&#8;", "&#9;", "&#10;", "&#13;"};
+ static const char echr[nof_echr+1] = "<>'\"&\0\b\t\n\r";
+ m_output.print( '"', buf);
+ printToBufferSubstChr( src, srcsize, buf, nof_echr, echr, estr);
+ m_output.print( '"', buf);
+ }
+
+ /// \brief print content value string (special characters escaped)
+ /// \param [in] src pointer to content string to print
+ /// \param [in] srcsize size of src in bytes
+ /// \param [in,out] buf buffer to print to
+ void printToBufferContent( const char* src, std::size_t srcsize, BufferType& buf) const
+ {
+ // nof_echr has to be the exact number of entries of the two parallel tables below
+ enum {nof_echr = 5};
+ static const char* estr[nof_echr] = {"&lt;", "&gt;", "&amp;", "&#0;", "&#8;"};
+ static const char echr[nof_echr+1] = "<>&\0\b";
+ printToBufferSubstChr( src, srcsize, buf, nof_echr, echr, estr);
+ }
+
+ /// \brief Prints a character to an STL back insertion sequence buffer in the IO character set encoding
+ /// \param [in] ch character to print
+ /// \param [in,out] buf buffer to print to
+ void printToBuffer( char ch, BufferType& buf) const
+ {
+ m_output.print( (textwolf::UChar)(unsigned char)ch, buf);
+ }
+
+public:
+ /// \brief Default constructor
+ XMLPrinter()
+ :m_state(Init){}
+
+ /// \brief Constructor
+ /// \param [in] output_ output character set encoding instance to use for printing
+ explicit XMLPrinter( const IOCharset& output_)
+ :m_state(Init),m_output(output_){}
+
+ /// \brief Copy constructor
+ /// \param [in] o printer to copy (including its last error)
+ XMLPrinter( const XMLPrinter& o)
+ :m_state(o.m_state),m_buf(o.m_buf),m_tagstack(o.m_tagstack),m_output(o.m_output),m_lasterror(o.m_lasterror)
+ {}
+
+ /// \brief Prints an XML header (version "1.0")
+ /// \param [in] encoding character set encoding name (NULL selects "UTF-8")
+ /// \param [in] standalone standalone attribute ("yes","no" or NULL for undefined)
+ /// \param [out] buf buffer to print to
+ /// \return true on success, false if failed (check lasterror())
+ bool printHeader( const char* encoding, const char* standalone, BufferType& buf)
+ {
+ if (m_state != Init)
+ {
+ m_lasterror = "printing document not starting with xml header";
+ return false;
+ }
+ std::string enc = encoding?encoding:"UTF-8";
+ printToBuffer( "<?xml version=\"1.0\" encoding=\"", 30, buf);
+ printToBuffer( enc.c_str(), enc.size(), buf);
+ if (standalone)
+ {
+ printToBuffer( "\" standalone=\"", 14, buf);
+ printToBuffer( standalone, std::strlen(standalone), buf);
+ }
+ // closes the last attribute value and the header, with or without standalone
+ printToBuffer( "\"?>\n", 4, buf);
+ m_state = Content;
+ return true;
+ }
+
+ /// \brief Prints an XML <!DOCTYPE ...> declaration
+ /// \param [in] rootid root element name (NULL: nothing is printed)
+ /// \param [in] publicid PUBLIC attribute (or NULL, requires systemid if defined)
+ /// \param [in] systemid SYSTEM attribute (or NULL)
+ /// \param [out] buf buffer to print to
+ /// \return true on success, false if failed (check lasterror())
+ bool printDoctype( const char* rootid, const char* publicid, const char* systemid, BufferType& buf)
+ {
+ if (!rootid) return true;
+ if (publicid && !systemid)
+ {
+ // checked before anything is printed, so that nothing is written in case of error
+ m_lasterror = "defined DOCTYPE with PUBLIC id but no SYSTEM id";
+ return false;
+ }
+ printToBuffer( "<!DOCTYPE ", 10, buf);
+ printToBuffer( rootid, std::strlen( rootid), buf);
+ if (publicid)
+ {
+ printToBuffer( " PUBLIC \"", 9, buf);
+ printToBuffer( publicid, std::strlen( publicid), buf);
+ printToBuffer( "\" \"", 3, buf);
+ printToBuffer( systemid, std::strlen( systemid), buf);
+ printToBuffer( "\">", 2, buf);
+ }
+ else if (systemid)
+ {
+ printToBuffer( " SYSTEM \"", 9, buf);
+ printToBuffer( systemid, std::strlen( systemid), buf);
+ printToBuffer( "\">", 2, buf);
+ }
+ else
+ {
+ printToBuffer( ">", 1, buf);
+ }
+ return true;
+ }
+
+ /// \brief Close the current tag attribute context opened
+ /// \param [out] buf buffer to print to
+ /// \return true on success, false if failed (check lasterror())
+ bool exitTagContext( BufferType& buf)
+ {
+ if (m_state != Content)
+ {
+ if (m_state == Init)
+ {
+ m_lasterror = "printed xml without root element";
+ return false;
+ }
+ // an open tag (with or without attributes) is pending: terminate it
+ printToBuffer( '>', buf);
+ m_state = Content;
+ }
+ return true;
+ }
+
+ /// \brief Print the start of an open tag
+ /// \param [in] src start of the tag name
+ /// \param [in] srcsize length of the tag name in bytes
+ /// \param [out] buf buffer to print to
+ /// \return true on success, false if failed (check lasterror())
+ bool printOpenTag( const char* src, std::size_t srcsize, BufferType& buf)
+ {
+ if (!exitTagContext( buf)) return false;
+ printToBuffer( '<', buf);
+ printToBuffer( (const char*)src, srcsize, buf);
+
+ // remember the tag name for the matching printCloseTag
+ m_tagstack.push( src, srcsize);
+ m_state = TagElement;
+ return true;
+ }
+
+ /// \brief Print the start of an attribute name
+ /// \param [in] src start of the attribute name
+ /// \param [in] srcsize length of the attribute name in bytes
+ /// \param [out] buf buffer to print to
+ /// \return true on success, false if failed (check lasterror())
+ bool printAttribute( const char* src, std::size_t srcsize, BufferType& buf)
+ {
+ if (m_state == TagElement)
+ {
+ printToBuffer( ' ', buf);
+ printToBuffer( (const char*)src, srcsize, buf);
+ printToBuffer( '=', buf);
+ m_state = TagAttribute;
+ return true;
+ }
+ m_lasterror = "printed attribute name outside of an open tag";
+ return false;
+ }
+
+ /// \brief Print a content or attribute value depending on context
+ /// \param [in] src start of the value
+ /// \param [in] srcsize length of the value in bytes
+ /// \param [out] buf buffer to print to
+ /// \return true on success, false if failed (check lasterror())
+ bool printValue( const char* src, std::size_t srcsize, BufferType& buf)
+ {
+ if (m_state == TagAttribute)
+ {
+ printToBufferAttributeValue( (const char*)src, srcsize, buf);
+ m_state = TagElement;
+ }
+ else
+ {
+ if (!exitTagContext( buf)) return false;
+ printToBufferContent( (const char*)src, srcsize, buf);
+ }
+ return true;
+ }
+
+ /// \brief Print the close of the current tag open
+ /// \param [out] buf buffer to print to
+ /// \return true on success, false if failed (check lasterror())
+ bool printCloseTag( BufferType& buf)
+ {
+ const void* cltag;
+ std::size_t cltagsize;
+
+ if (!m_tagstack.top( cltag, cltagsize) || !cltagsize)
+ {
+ m_lasterror = "printed close tag without a matching open tag";
+ return false;
+ }
+ if (m_state == TagElement)
+ {
+ // the open tag is still pending: close it immediately as '<tag/>'
+ printToBuffer( '/', buf);
+ printToBuffer( '>', buf);
+ m_state = Content;
+ }
+ else if (m_state != Content)
+ {
+ m_lasterror = "printed close tag in a state where it is not allowed";
+ return false;
+ }
+ else
+ {
+ printToBuffer( '<', buf);
+ printToBuffer( '/', buf);
+ printToBuffer( (const char*)cltag, cltagsize, buf);
+ printToBuffer( '>', buf);
+ }
+ m_tagstack.pop();
+ if (m_tagstack.empty())
+ {
+ // the root element has been closed: terminate the document with an end of line
+ printToBuffer( '\n', buf);
+ }
+ return true;
+ }
+
+ /// \brief Internal state
+ enum State
+ {
+ Init,
+ Content,
+ TagAttribute,
+ TagElement
+ };
+
+ /// \brief Get the current internal state
+ /// \return the current state
+ State state() const
+ {
+ return m_state;
+ }
+
+ /// \brief Get the last error occurred
+ /// \return the last error string or NULL if no error has been recorded
+ const char* lasterror() const
+ {
+ return m_lasterror.empty()?0:m_lasterror.c_str();
+ }
+
+private:
+ State m_state; ///< internal state
+ BufferType m_buf; ///< element output buffer
+ TagStack m_tagstack; ///< tag name stack of open tags
+ IOCharset m_output; ///< output character set encoding
+ std::string m_lasterror; ///< the last error occurred
+};
+
+} //namespace
+#endif
diff --git a/textwolf/include/textwolf/xmlscanner.hpp b/textwolf/include/textwolf/xmlscanner.hpp
new file mode 100644
index 0000000..9018816
--- /dev/null
+++ b/textwolf/include/textwolf/xmlscanner.hpp
@@ -0,0 +1,1355 @@
+/*
+---------------------------------------------------------------------
+ The template library textwolf implements an input iterator on
+ a set of XML path expressions without backward references on an
+ STL conforming input iterator as source. It does no buffering
+ or read ahead and is dedicated for stream processing of XML
+ for a small set of XML queries.
+ Stream processing in this context refers to processing the
+ document without buffering anything but the current result token
+ processed with its tag hierarchy information.
+
+ Copyright (C) 2010,2011,2012,2013,2014 Patrick Frey
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 3.0 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+--------------------------------------------------------------------
+
+ The latest version of textwolf can be found at 'http://github.com/patrickfrey/textwolf'
+ For documentation see 'http://patrickfrey.github.com/textwolf'
+
+--------------------------------------------------------------------
+*/
+/// \file textwolf/xmlscanner.hpp
+/// \brief XML parser iterator interface for processing the XML elements one by one
+
+#ifndef __TEXTWOLF_XML_SCANNER_HPP__
+#define __TEXTWOLF_XML_SCANNER_HPP__
+#include "textwolf/char.hpp"
+#include "textwolf/charset_interface.hpp"
+#include "textwolf/exception.hpp"
+#include "textwolf/textscanner.hpp"
+#include "textwolf/traits.hpp"
+#include <map>
+#include <cstddef>
+
+namespace textwolf {
+
+/// \class ScannerStatemachine
+/// \brief Class to build up the XML element scanner state machine in a descriptive way
+/// \remark States are stored in a fixed size array of MaxNofStates elements and have to be
+/// defined in ascending order of their index. Transitions may refer to states defined later.
+class ScannerStatemachine :public throws_exception
+{
+public:
+ enum
+ {
+ MaxNofStates=64 ///< maximum number of states (fixed allocated array for state machine)
+ };
+ /// \class Element
+ /// \brief One state in the state machine
+ struct Element
+ {
+ int fallbackState; ///< state transition if the event does not match (it belongs to the next state = fallbackState)
+ int missError; ///< error code in case of an event that does not match and there is no fallback
+
+ /// \class Action
+ /// \brief Definition of action fired by the state machine
+ struct Action
+ {
+ int op; ///< action operand
+ int arg; ///< action argument
+ };
+ Action action; ///< action executed after entering this state
+ unsigned char nofnext; ///< number of follow states defined
+ signed char next[ NofControlCharacter]; ///< follow state fired by an event (control character type parsed)
+
+ /// \brief Constructor (no fallback, no error, no action and no transition defined)
+ Element() :fallbackState(-1),missError(-1),nofnext(0)
+ {
+ action.op = -1;
+ action.arg = 0;
+ for (unsigned int ii=0; ii<NofControlCharacter; ii++) next[ii] = -1;
+ }
+ };
+ /// \brief Get state addressed by its index
+ /// \param [in] stateIdx index of the state (must be smaller than the number of states defined)
+ /// \return state definition reference
+ Element* get( int stateIdx) throw(exception)
+ {
+ // the cast maps negative indices to huge values that are rejected too
+ if ((unsigned int)stateIdx>=size) throw exception(InvalidState);
+ return tab + stateIdx;
+ }
+
+private:
+ Element tab[ MaxNofStates]; ///< states of the STM
+ unsigned int size; ///< number of states defined in the STM
+
+ /// \brief Create a new state
+ /// \param [in] stateIdx index of the state (must be the size of the STM array, so that state identifiers can be named by enumeration constants for better readability)
+ void newState( int stateIdx) throw(exception)
+ {
+ if (size != (unsigned int)stateIdx) throw exception( StateNumbersNotAscending);
+ if (size >= MaxNofStates) throw exception( DimOutOfRange);
+ size++;
+ }
+
+ /// \brief Define a transition for all control character types not firing yet in the last state defined
+ /// \param [in] nextState the follow state index defined for these transitions
+ void addOtherTransition( int nextState) throw(exception)
+ {
+ if (size == 0) throw exception( InvalidState);
+ // valid state indices are 0..MaxNofStates-1
+ if (nextState < 0 || nextState >= MaxNofStates) throw exception( InvalidParamState);
+ for (unsigned int inputchr=0; inputchr<NofControlCharacter; inputchr++)
+ {
+ if (tab[ size-1].next[ inputchr] == -1) tab[ size-1].next[ inputchr] = (unsigned char)nextState;
+ }
+ tab[ size-1].nofnext = NofControlCharacter;
+ }
+
+ /// \brief Define a transition for inputchr in the last state defined
+ /// \param [in] inputchr the firing input control character type
+ /// \param [in] nextState the follow state index defined for this transition
+ void addTransition( ControlCharacter inputchr, int nextState) throw(exception)
+ {
+ if (size == 0) throw exception( InvalidState);
+ if ((int)inputchr >= (int)NofControlCharacter) throw exception( InvalidParamChar);
+ // valid state indices are 0..MaxNofStates-1
+ if (nextState < 0 || nextState >= MaxNofStates) throw exception( InvalidParamState);
+ if (tab[ size-1].next[ inputchr] != -1) throw exception( DuplicateStateTransition);
+ tab[ size-1].next[ inputchr] = (unsigned char)nextState;
+ tab[ size-1].nofnext += 1;
+ }
+
+ /// \brief Define a self directing transition for inputchr in the last state defined (the state remains the same for this input)
+ /// \param [in] inputchr the firing input control character type
+ void addTransition( ControlCharacter inputchr) throw(exception)
+ {
+ addTransition( inputchr, size-1);
+ }
+
+ /// \brief Define an action in the last state defined (to be executed when entering the state)
+ /// \param [in] action_op action operand
+ /// \param [in] action_arg action argument
+ void addAction( int action_op, int action_arg=0) throw(exception)
+ {
+ if (size == 0) throw exception( InvalidState);
+ // only one action per state
+ if (tab[ size-1].action.op != -1) throw exception( InvalidState);
+ tab[ size-1].action.op = action_op;
+ tab[ size-1].action.arg = action_arg;
+ }
+
+ /// \brief Define an error in the last state defined to be reported when no fallback is defined and no firing input character parsed
+ /// \param [in] error code to be reported
+ void addMiss( int error) throw(exception)
+ {
+ if (size == 0) throw exception( InvalidState);
+ // only one miss error per state
+ if (tab[ size-1].missError != -1) throw exception( InvalidState);
+ tab[ size-1].missError = error;
+ }
+
+ /// \brief Define in the last state defined a fallback state transition that is fired when no firing input character parsed
+ /// \param [in] stateIdx follow state index
+ void addFallback( int stateIdx) throw(exception)
+ {
+ if (size == 0) throw exception( InvalidState);
+ // only one fallback per state
+ if (tab[ size-1].fallbackState != -1) throw exception( InvalidState);
+ // valid state indices are 0..MaxNofStates-1
+ if (stateIdx < 0 || stateIdx >= MaxNofStates) throw exception( InvalidParamState);
+ tab[ size-1].fallbackState = stateIdx;
+ }
+public:
+ /// \brief Constructor
+ ScannerStatemachine() :size(0){}
+
+ /// \brief See ScannerStatemachine::newState(int)
+ ScannerStatemachine& operator[]( int stateIdx) {newState(stateIdx); return *this;}
+ /// \brief See ScannerStatemachine::addTransition(ControlCharacter,int)
+ ScannerStatemachine& operator()( ControlCharacter inputchr, int ns) {addTransition(inputchr,ns); return *this;}
+ /// \brief See ScannerStatemachine::addTransition(ControlCharacter,int)
+ ScannerStatemachine& operator()( ControlCharacter i1, ControlCharacter i2, int ns) {addTransition(i1,ns); addTransition(i2,ns); return *this;}
+ /// \brief See ScannerStatemachine::addTransition(ControlCharacter,int)
+ ScannerStatemachine& operator()( ControlCharacter i1, ControlCharacter i2, ControlCharacter i3, int ns) {addTransition(i1,ns); addTransition(i2,ns); addTransition(i3,ns); return *this;}
+ /// \brief See ScannerStatemachine::addTransition(ControlCharacter)
+ ScannerStatemachine& operator()( ControlCharacter inputchr) {addTransition(inputchr); return *this;}
+ /// \brief See ScannerStatemachine::addAction(int,int)
+ ScannerStatemachine& action( int aa, int arg=0) {addAction(aa,arg); return *this;}
+ /// \brief See ScannerStatemachine::addMiss(int)
+ ScannerStatemachine& miss( int ee) {addMiss(ee); return *this;}
+ /// \brief See ScannerStatemachine::addFallback(int)
+ ScannerStatemachine& fallback( int stateIdx) {addFallback(stateIdx); return *this;}
+ /// \brief See ScannerStatemachine::addOtherTransition(int)
+ ScannerStatemachine& other( int stateIdx) {addOtherTransition(stateIdx); return *this;}
+};
+
+/// \class XMLScannerBase
+/// \brief XML scanner base class for things common for all XML scanners
+class XMLScannerBase
+{
+public:
+ /// \enum ElementType
+ /// \brief Enumeration of XML element types returned by an XML scanner
+ enum ElementType
+ {
+ None, ///< empty (NULL)
+ ErrorOccurred, ///< XML scanning error error reported
+ HeaderStart, ///< open XML header tag
+ HeaderAttribName, ///< tag attribute name in the XML header
+ HeaderAttribValue, ///< tag attribute value in the XML header
+ HeaderEnd, ///< end of XML header event (after parsing '?&gt;')
+ DocAttribValue, ///< document attribute value in a DOCTYPE or ENTITY definition
+ DocAttribEnd, ///< end of a document attribute definition <! .. !>
+ TagAttribName, ///< tag attribute name (e.g. "id" in &lt;person id='5'&gt;
+ TagAttribValue, ///< tag attribute value (e.g. "5" in &lt;person id='5'&gt;
+ OpenTag, ///< open tag (e.g. "bla" for "&lt;bla...")
+ CloseTag, ///< close tag (e.g. "bla" for "&lt;/bla&gt;")
+ CloseTagIm, ///< immediate close tag (e.g. "bla" for "&lt;bla /&gt;")
+ Content, ///< content element string (separated by spaces or end of line)
+ Exit ///< end of document
+ };
+ enum
+ {
+ NofElementTypes=Exit+1 ///< number of XML element types defined
+ };
+
+ /// \brief Get the XML element type as string
+ /// \param [in] ee XML element type
+ /// \return XML element type as string
+ static const char* getElementTypeName( ElementType ee)
+ {
+ static const char* names[ NofElementTypes] = {"None","ErrorOccurred","HeaderStart","HeaderAttribName","HeaderAttribValue","HeaderEnd", "DocAttribValue", "DocAttribEnd", "TagAttribName","TagAttribValue","OpenTag","CloseTag","CloseTagIm","Content","Exit"};
+ return names[ (unsigned int)ee];
+ }
+
+ /// \enum Error
+ /// \brief Enumeration of XML scanner error codes
+ enum Error
+ {
+ Ok, ///< no error, everything is OK
+ ErrIllegalDocumentAttributeDef, ///< error in document attribute or entity definition
+ ErrExpectedOpenTag, ///< expected an open tag in this state
+ ErrExpectedXMLTag, ///< expected an <?xml tag in this state
+ ErrUnexpectedEndOfText, ///< unexpected end of text in the middle of the XML definition
+ ErrSyntaxToken, ///< a specific string expected as token in XML but does not match
+ ErrStringNotTerminated, ///< attribute string in XML not terminated on the same line
+ ErrUndefinedCharacterEntity, ///< named entity is not defined in the entity map
+ ErrExpectedTagEnd, ///< expected end of tag
+ ErrExpectedEqual, ///< expected equal in tag attribute definition
+ ErrExpectedTagAttribute, ///< expected tag attribute
+ ErrExpectedCDATATag, ///< expected CDATA tag definition
+ ErrInternal, ///< internal error (textwolf implementation error)
+ ErrUnexpectedEndOfInput, ///< unexpected end of input stream
+ ErrExpectedEndOfLine, ///< expected mandatory end of line (after XML header)
+ ErrExpectedDash2 ///< expected second '-' after '<!-' to start an XML comment as '<!-- ... -->'
+ };
+
+ /// \brief Get the error code as string
+ /// \param [in] ee error code
+ /// \return the error code as string
+ static const char* getErrorString( Error ee)
+ {
+ enum {NofErrors=16};
+ static const char* sError[NofErrors]
+ = {0,"illegal document attribute definition",
+ "expected open tag",
+ "expected XML tag",
+ "unexpected end of text",
+ "syntax token",
+ "string not terminated",
+ "undefined character entity",
+ "expected tag end",
+ "expected equal",
+ "expected tag attribute",
+ "expected CDATA tag",
+ "internal",
+ "unexpected end of input",
+ "expected end of line",
+ "expected 2nd '-' to complete marker for start of comment '<!--'"
+ };
+ return sError[(unsigned int)ee];
+ }
+
+ /// \enum STMState
+ /// \brief Enumeration of states of the XML scanner state machine
+ enum STMState
+ {
+ START, STARTTAG, XTAG, PITAG, PITAGEND, XTAGEND, XTAGDONE, XTAGAISK, XTAGANAM, XTAGAESK, XTAGAVSK, XTAGAVID, XTAGAVSQ, XTAGAVDQ, XTAGAVQE,
+ DOCSTART, CONTENT, TOKEN, SEEKTOK, XMLTAG, OPENTAG, CLOSETAG, TAGCLSK, TAGAISK, TAGANAM, TAGAESK, TAGAVSK, TAGAVID, TAGAVSQ, TAGAVDQ, TAGAVQE,
+ TAGCLIM, ENTITYSL, ENTITY, ENTITYE, ENTITYID, ENTITYSQ, ENTITYDQ, ENTITYLC,
+ COMDASH2, COMSEEKE, COMENDD2, COMENDCL, CDATA, CDATA1, CDATA2, CDATA3, EXIT
+ };
+
+ /// \brief Get the scanner state machine state as string
+ /// \param [in] s the state
+ /// \return the state as string
+ static const char* getStateString( STMState s)
+ {
+ enum Constant {NofStates=48};
+ static const char* sState[NofStates]
+ = {
+ "START", "STARTTAG", "XTAG", "PITAG", "PITAGEND",
+ "XTAGEND", "XTAGDONE", "XTAGAISK", "XTAGANAM",
+ "XTAGAESK", "XTAGAVSK", "XTAGAVID", "XTAGAVSQ", "XTAGAVDQ",
+ "XTAGAVQE", "DOCSTART", "CONTENT", "TOKEN", "SEEKTOK", "XMLTAG",
+ "OPENTAG", "CLOSETAG", "TAGCLSK", "TAGAISK", "TAGANAM",
+ "TAGAESK", "TAGAVSK", "TAGAVID", "TAGAVSQ", "TAGAVDQ",
+ "TAGAVQE", "TAGCLIM", "ENTITYSL", "ENTITY", "ENTITYE",
+ "ENTITYID", "ENTITYSQ", "ENTITYDQ", "ENTITYLC",
+ "COMDASH2", "COMSEEKE", "COMENDD2", "COMENDCL",
+ "CDATA", "CDATA1", "CDATA2", "CDATA3", "EXIT"
+ };
+ return sState[(unsigned int)s];
+ }
+
+ /// \enum STMAction
+ /// \brief Enumeration of actions in the XML scanner state machine
+ enum STMAction
+ {
+ Return, ReturnWord, ReturnContent, ReturnIdentifier, ReturnSQString, ReturnDQString, ExpectIdentifierXML, ExpectIdentifierCDATA, ReturnEOF,
+ NofSTMActions = 9
+ };
+
+ /// \brief Get the scanner state machine action as string
+ /// \param [in] a the action
+ /// \return the action as string
+ static const char* getActionString( STMAction a)
+ {
+ static const char* name[ NofSTMActions] = {"Return", "ReturnWord", "ReturnContent", "ReturnIdentifier", "ReturnSQString", "ReturnDQString", "ExpectIdentifierXML", "ExpectIdentifierCDATA", "ReturnEOF"};
+ return name[ (unsigned int)a];
+ };
+
+ /// \class Statemachine
+ /// \brief XML scanner state machine implementation
+ struct Statemachine :public ScannerStatemachine
+ {
+ /// \brief Constructor (defines the state machine completely)
+ Statemachine()
+ {
+ (*this)
+ [ START ](EndOfText,EXIT)(EndOfLine)(Cntrl)(Space)(Lt,STARTTAG).miss(ErrExpectedOpenTag)
+ [ STARTTAG ](EndOfLine)(Cntrl)(Space)(Questm,XTAG)(Exclam,ENTITYSL).fallback(OPENTAG)
+ [ XTAG ].action(ExpectIdentifierXML)(EndOfLine,Cntrl,Space,XTAGAISK)(Questm,XTAGEND).miss(ErrExpectedXMLTag)
+ [ PITAG ](Questm,PITAGEND).other(PITAG)
+ [ PITAGEND ](Gt,CONTENT).miss(ErrExpectedTagEnd)
+ [ XTAGEND ](Gt,XTAGDONE)(EndOfLine)(Cntrl)(Space).miss(ErrExpectedTagEnd)
+ [ XTAGDONE ].action(Return,HeaderEnd).fallback(DOCSTART)
+ [ XTAGAISK ](EndOfLine)(Cntrl)(Space)(Questm,XTAGEND).fallback(XTAGANAM)
+ [ XTAGANAM ].action(ReturnIdentifier,HeaderAttribName)(EndOfLine,Cntrl,Space,XTAGAESK)(Equal,XTAGAVSK).miss(ErrExpectedEqual)
+ [ XTAGAESK ](EndOfLine)(Cntrl)(Space)(Equal,XTAGAVSK).miss(ErrExpectedEqual)
+ [ XTAGAVSK ](EndOfLine)(Cntrl)(Space)(Sq,XTAGAVSQ)(Dq,XTAGAVDQ).fallback(XTAGAVID)
+ [ XTAGAVID ].action(ReturnIdentifier,HeaderAttribValue)(EndOfLine,Cntrl,Space,XTAGAISK)(Questm,XTAGEND).miss(ErrExpectedTagAttribute)
+ [ XTAGAVSQ ].action(ReturnSQString,HeaderAttribValue)(Sq,XTAGAVQE).miss(ErrStringNotTerminated)
+ [ XTAGAVDQ ].action(ReturnDQString,HeaderAttribValue)(Dq,XTAGAVQE).miss(ErrStringNotTerminated)
+ [ XTAGAVQE ](EndOfLine,Cntrl,Space,XTAGAISK)(Questm,XTAGEND).miss(ErrExpectedTagAttribute)
+ [ DOCSTART ](EndOfText,EXIT)(EndOfLine)(Cntrl)(Space)(Lt,XMLTAG).fallback(TOKEN)
+ [ CONTENT ](EndOfText,EXIT)(Lt,XMLTAG).fallback(TOKEN)
+ [ TOKEN ].action(ReturnContent,Content)(EndOfText,EXIT)(EndOfLine,Cntrl,Space,CONTENT)(Lt,XMLTAG).fallback(CONTENT)
+ [ SEEKTOK ](EndOfText,EXIT)(EndOfLine)(Cntrl)(Space)(Lt,XMLTAG).fallback(TOKEN)
+ [ XMLTAG ](EndOfLine)(Cntrl)(Space)(Questm,PITAG)(Exclam,ENTITYSL)(Slash,CLOSETAG).fallback(OPENTAG)
+ [ OPENTAG ].action(ReturnIdentifier,OpenTag)(EndOfLine,Cntrl,Space,TAGAISK)(Slash,TAGCLIM)(Gt,CONTENT).miss(ErrExpectedTagAttribute)
+ [ CLOSETAG ].action(ReturnIdentifier,CloseTag)(EndOfLine,Cntrl,Space,TAGCLSK)(Gt,CONTENT).miss(ErrExpectedTagEnd)
+ [ TAGCLSK ](EndOfLine)(Cntrl)(Space)(Gt,CONTENT).miss(ErrExpectedTagEnd)
+ [ TAGAISK ](EndOfLine)(Cntrl)(Space)(Gt,CONTENT)(Slash,TAGCLIM).fallback(TAGANAM)
+ [ TAGANAM ].action(ReturnIdentifier,TagAttribName)(EndOfLine,Cntrl,Space,TAGAESK)(Equal,TAGAVSK).miss(ErrExpectedEqual)
+ [ TAGAESK ](EndOfLine)(Cntrl)(Space)(Equal,TAGAVSK).miss(ErrExpectedEqual)
+ [ TAGAVSK ](EndOfLine)(Cntrl)(Space)(Sq,TAGAVSQ)(Dq,TAGAVDQ).fallback(TAGAVID)
+ [ TAGAVID ].action(ReturnIdentifier,TagAttribValue)(EndOfLine,Cntrl,Space,TAGAISK)(Slash,TAGCLIM)(Gt,CONTENT).miss(ErrExpectedTagAttribute)
+ [ TAGAVSQ ].action(ReturnSQString,TagAttribValue)(Sq,TAGAVQE).miss(ErrStringNotTerminated)
+ [ TAGAVDQ ].action(ReturnDQString,TagAttribValue)(Dq,TAGAVQE).miss(ErrStringNotTerminated)
+ [ TAGAVQE ](EndOfLine,Cntrl,Space,TAGAISK)(Slash,TAGCLIM)(Gt,CONTENT).miss(ErrExpectedTagAttribute)
+ [ TAGCLIM ].action(Return,CloseTagIm)(EndOfLine)(Cntrl)(Space)(Gt,CONTENT).miss(ErrExpectedTagEnd)
+ [ ENTITYSL ](Osb,CDATA)(Dash,COMDASH2).fallback(ENTITY)
+ [ ENTITY ](Gt,ENTITYE)(EndOfLine)(Cntrl)(Space)(Dq,ENTITYDQ)(Sq,ENTITYSQ)(Osb,ENTITYLC).fallback(ENTITYID)
+ [ ENTITYE ].action(Return,DocAttribEnd).fallback(SEEKTOK)
+ [ ENTITYID ].action(ReturnIdentifier,DocAttribValue)(EndOfLine,Cntrl,Space,ENTITY)(Gt,ENTITYE).miss(ErrIllegalDocumentAttributeDef)
+ [ ENTITYSQ ].action(ReturnSQString,DocAttribValue)(Sq,ENTITY).miss(ErrStringNotTerminated)
+ [ ENTITYDQ ].action(ReturnDQString,DocAttribValue)(Dq,ENTITY).miss(ErrStringNotTerminated)
+ [ ENTITYLC ](Csb,ENTITY).other( ENTITYLC)
+ [ COMDASH2 ](Dash,COMSEEKE).miss(ErrExpectedDash2)
+ [ COMSEEKE ](Dash,COMENDD2).other(COMSEEKE)
+ [ COMENDD2 ](Dash,COMENDCL).other(COMSEEKE)
+ [ COMENDCL ](Gt,SEEKTOK)(Dash,COMENDD2).other(COMSEEKE)
+ [ CDATA ].action(ExpectIdentifierCDATA)(Osb,CDATA1).miss(ErrExpectedCDATATag)
+ [ CDATA1 ](Csb,CDATA2).other(CDATA1)
+ [ CDATA2 ](Csb,CDATA3).other(CDATA1)
+ [ CDATA3 ](Gt,CONTENT).other(CDATA1)
+ [ EXIT ].action(Return,Exit);
+ }
+ };
+
+ /// \typedef IsTokenCharMap
+ /// \brief Forms a set of characters by assigning (true/false) to the whole domain
+ typedef CharMap<bool,false,NofControlCharacter> IsTokenCharMap;
+
+ /// \class IsTagCharMap
+ /// \brief Defines the set of tag characters
+ struct IsTagCharMap :public IsTokenCharMap
+ {
+ IsTagCharMap()
+ {
+ (*this)(Undef,true)(Any,true)(Dash,true);
+ }
+ };
+
+ /// \class IsWordCharMap
+ /// \brief Defines the set of content word characters (for tokenization)
+ /// \deprecated automatic tokenization with whitespace separators option not provided anymore
+ struct IsWordCharMap :public IsTokenCharMap
+ {
+ IsWordCharMap()
+ {
+ (*this)(Undef,true)(Equal,true)(Gt,true)(Slash,true)(Dash,true)(Exclam,true)(Questm,true)(Sq,true)(Dq,true)(Osb,true)(Csb,true)(Any,true);
+ }
+ };
+
+ /// \class IsContentCharMap
+ /// \brief Defines the set of content token characters
+ /// \remark Assigns true to white space (Cntrl, Space, EndOfLine) and to the listed
+ ///  punctuation classes. Classes not listed here (for example Amp, which the scanner
+ ///  handles separately as entity start) are not assigned.
+ struct IsContentCharMap :public IsTokenCharMap
+ {
+ IsContentCharMap()
+ {
+ (*this)(Cntrl,true)(Space,true)(EndOfLine,true)(Undef,true)(Equal,true)(Gt,true)(Slash,true)(Dash,true)(Exclam,true)(Questm,true)(Sq,true)(Dq,true)(Osb,true)(Csb,true)(Any,true);
+ }
+ };
+
+ /// \class IsSQStringCharMap
+ /// \brief Defines the set of characters belonging to a single quoted string
+ /// \remark Starts from the content character set and assigns false to the single
+ ///  quote (Sq), which terminates the string. Space is assigned true again although
+ ///  IsContentCharMap already assigns it.
+ struct IsSQStringCharMap :public IsContentCharMap
+ {
+ IsSQStringCharMap()
+ {
+ (*this)(Sq,false)(Space,true);
+ }
+ };
+
+ /// \class IsDQStringCharMap
+ /// \brief Defines the set of characters belonging to a double quoted string
+ /// \remark Starts from the content character set and assigns false to the double
+ ///  quote (Dq), which terminates the string. Space is assigned true again although
+ ///  IsContentCharMap already assigns it.
+ struct IsDQStringCharMap :public IsContentCharMap
+ {
+ IsDQStringCharMap()
+ {
+ (*this)(Dq,false)(Space,true);
+ }
+ };
+};
+
+
+/// \class XMLScanner
+/// \brief XML scanner template that adds the functionality to the statemachine base definition
+/// \tparam InputIterator input iterator with ++ and read only * returning 0 as the last character of the input
+/// \tparam InputCharSet_ character set encoding of the input, read as stream of bytes
+/// \tparam OutputCharSet_ character set encoding of the output, printed as string of the item type of the character set,
+/// \tparam OutputBuffer_ buffer for output with STL back insertion sequence interface (e.g. std::string,std::vector<char>,textwolf::StaticBuffer)
+template
+<
+ class InputIterator,
+ class InputCharSet_,
+ class OutputCharSet_,
+ class OutputBuffer_
+>
+class XMLScanner :public XMLScannerBase
+{
+private:
+ /// \class TokState
+ /// \brief Token state variables
+ /// \remark Holds everything the token parser needs to continue where it stopped
+ ///  (parser position id, end of line mapping state and the partially parsed entity).
+ struct TokState
+ {
+ /// \enum Id
+ /// \brief Enumeration of token parser states.
+ /// \remark These states define where the scanner has to continue parsing when it was interrupted by an EoD exception and reentered again with more input to process.
+ enum Id
+ {
+ Start, ///< start state (no parsing action performed at the moment)
+ ParsingDone, ///< scanner was interrupted after parsing something when accessing the follow character
+ ParsingKey, ///< scanner was interrupted when parsing a key
+ ParsingEntity, ///< scanner was interrupted when parsing an XML character entity
+ ParsingNumericEntity, ///< scanner was interrupted when parsing an XML numeric character entity
+ ParsingNumericBaseEntity, ///< scanner was interrupted when parsing the digits of an XML numeric character entity (base already known)
+ ParsingNamedEntity, ///< scanner was interrupted when parsing an XML named character entity
+ ParsingToken ///< scanner was interrupted when parsing a token (not in entity context)
+ };
+ Id id; ///< the scanner token parser state
+
+ enum EolnState ///< end of line state to fulfill the W3C requirements for end of line mapping (see http://www.w3.org/TR/xml/: 2.11 End-of-Line Handling)
+ {
+ SRC,CR
+ };
+ EolnState eolnState; ///< the scanner end of line state
+
+ unsigned int pos; ///< entity buffer position (buf)
+ unsigned int base; ///< numeric entity base (10 for decimal/16 for hexadecimal)
+ EChar value; ///< parsed entity value
+ char buf[ 16]; ///< parsed entity buffer
+ UChar curchr_saved; ///< save current character parsed for the case we cannot print it (output buffer too small)
+
+ /// \brief Constructor
+ TokState() :id(Start),eolnState(SRC),pos(0),base(0),value(0),curchr_saved(0) {}
+
+ /// \brief Reset the state variables (after successful exit with a new token parsed)
+ /// \param [in] id_ the new entity parse state
+ /// \param [in] eolnState_ the end of line mapping state
+ /// \remark pos, base, value and curchr_saved are set to 0; the content of buf is left untouched
+ void init(Id id_=Start, EolnState eolnState_=SRC)
+ {
+ id=id_;eolnState=eolnState_;pos=0;base=0;value=0;curchr_saved=0;
+ }
+ };
+ TokState tokstate; ///< the entity parsing state of this XML scanner
+
+public:
+ typedef InputCharSet_ InputCharSet;
+ typedef OutputCharSet_ OutputCharSet;
+ class iterator;
+
+public:
+ typedef TextScanner<InputIterator,InputCharSet_> InputReader;
+ typedef XMLScanner<InputIterator,InputCharSet_,OutputCharSet_,OutputBuffer_> ThisXMLScanner;
+ typedef std::map<const char*,UChar> EntityMap;
+ typedef OutputBuffer_ OutputBuffer;
+
+private:
+ /// \brief Print a character to the output token buffer
+ /// \param [in] ch unicode character to print
+ /// \remark The character is encoded by the output character set object (m_output)
+ ///  and appended to m_outputBuf.
+ void push( UChar ch)
+ {
+ m_output.print( ch, m_outputBuf);
+ }
+
+ /// \brief Copy the current source character to the output buffer (variant selected with the tag TypeCheck::YES)
+ /// \remark Delegates to the source reader's copychar(), passing the output character set and buffer.
+ void copychar_impl( const traits::TypeCheck::YES&)
+ {
+ m_src.copychar( m_output, m_outputBuf);
+ }
+
+ /// \brief Copy the current source character to the output buffer (variant selected with the tag TypeCheck::NO)
+ /// \remark Fetches the current character with m_src.chr() and prints it with push().
+ void copychar_impl( const traits::TypeCheck::NO&)
+ {
+ push( m_src.chr());
+ }
+
+ /// \brief Copy the current source character to the output buffer
+ /// \remark Dispatches to one of the copychar_impl overloads by the result of
+ ///  is_same<InputCharSet,OutputCharSet>::type() (presumably YES when input and
+ ///  output character set types are identical - see traits.hpp).
+ void copychar()
+ {
+ copychar_impl( traits::TypeCheck::is_same<InputCharSet,OutputCharSet>::type());
+ }
+
+ /// \brief Map a hexadecimal digit to its value
+ /// \param [in] ch hexadecimal digit to map to its decimal value
+ /// \return the value 0..15 for '0'-'9','A'-'F','a'-'f'
+ /// \remark The lookup table is declared as CharMap<unsigned char,0xFF>; characters
+ ///  that are no hexadecimal digits presumably map to 0xFF. Callers detect invalid
+ ///  digits by comparing the result with the numeric base.
+ static unsigned char HEX( unsigned char ch)
+ {
+ struct HexCharMap :public CharMap<unsigned char, 0xFF>
+ {
+ HexCharMap()
+ {
+ (*this)
+ ('0',0) ('1', 1)('2', 2)('3', 3)('4', 4)('5', 5)('6', 6)('7', 7)('8', 8)('9', 9)
+ ('A',10)('B',11)('C',12)('D',13)('E',14)('F',15)('a',10)('b',11)('c',12)('d',13)('e',14)('f',15);
+ }
+ };
+ // function local static: the table is built on first use only
+ static HexCharMap hexCharMap;
+ return hexCharMap[ch];
+ }
+
+ /// \brief Parse a numeric entity value for a table definition (map it to the target character set)
+ /// \param [in] ir input reader, expected to point to the '#' that follows the '&'
+ /// \return the value of the entity parsed, 0 if the input is not a valid numeric entity
+ /// \remark On success the reader is left on the terminating ';'. On failure it is
+ ///  left on the first character that did not match.
+ static UChar parseStaticNumericEntityValue( InputReader& ir)
+ {
+     EChar value = 0;
+     unsigned char ch = ir.ascii();
+     unsigned int base;
+     if (ch != '#') return 0;
+     ir.skip();
+     ch = ir.ascii();
+     if (ch == 'x')
+     {
+         ir.skip();
+         ch = ir.ascii();
+         base = 16;
+     }
+     else
+     {
+         base = 10;
+     }
+     while (ch != ';')
+     {
+         unsigned char chval = HEX(ch);
+         // The digit (not the accumulated value) has to be checked against the base.
+         // This also rejects non digits and terminates on end of input.
+         if (chval >= base) return 0;
+         value = value * base + chval;
+         if (value >= 0xFFFFFFFF) return 0;
+         ir.skip();
+         ch = ir.ascii();
+     }
+     return (UChar)value;
+ }
+
+ /// \brief Print the characters of a sequence that was thought to form an entity but did not
+ /// \remark Replays the characters consumed so far, depending on how far the entity
+ ///  parser got (tokstate.id): the leading '&', the '#' of a numeric entity, the 'x'
+ ///  of a hexadecimal entity and the buffered digits or name characters.
+ ///  The token state itself is not reset here.
+ void fallbackEntity()
+ {
+     switch (tokstate.id)
+     {
+         case TokState::Start:
+         case TokState::ParsingDone:
+         case TokState::ParsingKey:
+         case TokState::ParsingToken:
+             // not inside an entity: nothing was swallowed
+             break;
+         case TokState::ParsingEntity:
+             push('&');
+             break;
+         case TokState::ParsingNumericEntity:
+             push('&');
+             push('#');
+             break;
+         case TokState::ParsingNumericBaseEntity:
+             push('&');
+             push('#');
+             // the 'x' of a hexadecimal entity was consumed without being buffered
+             if (tokstate.base == 16) push('x');
+             for (unsigned int ii=0; ii<tokstate.pos; ii++) push( tokstate.buf[ii]);
+             break;
+         case TokState::ParsingNamedEntity:
+             push('&');
+             for (unsigned int ii=0; ii<tokstate.pos; ii++) push( tokstate.buf[ii]);
+             break;
+     }
+ }
+
+ /// \brief Parse an entity; the leading '&' has already been consumed
+ /// \return true on success
+ /// \remark The parser state is recorded before the source is accessed, so that an
+ ///  interrupted scan can be resumed at this point.
+ bool parseEntity()
+ {
+     tokstate.id = TokState::ParsingEntity;
+     const unsigned char first = m_src.ascii();
+     if (first != '#')
+     {
+         // no numeric entity marker: must be a named entity
+         return parseNamedEntity();
+     }
+     m_src.skip();
+     return parseNumericEntity();
+ }
+
+ /// \brief Parse a numeric entity; the leading '&#' has already been consumed
+ /// \return true on success
+ /// \remark Determines the base (16 if an 'x' follows, 10 otherwise), consumes the
+ ///  'x' if present and continues with the digits.
+ bool parseNumericEntity()
+ {
+     tokstate.id = TokState::ParsingNumericEntity;
+     const unsigned char marker = m_src.ascii();
+     const bool isHex = (marker == 'x');
+     tokstate.base = isHex ? 16 : 10;
+     if (isHex)
+     {
+         m_src.skip();
+     }
+     return parseNumericBaseEntity();
+ }
+
+ /// \brief Try to parse a numeric entity with known base (we got '&#' and we know the base 10/16 of it)
+ /// \return true (a malformed entity is not an error, its characters are printed as they are)
+ /// \remark Digits are accumulated in tokstate.value and buffered in tokstate.buf,
+ ///  so that the text can be replayed by fallbackEntity() if no valid entity is formed.
+ bool parseNumericBaseEntity()
+ {
+     unsigned char ch;
+     tokstate.id = TokState::ParsingNumericBaseEntity;
+
+     while (tokstate.pos < sizeof(tokstate.buf))
+     {
+         ch = m_src.ascii();
+         if (ch == ';')
+         {
+             if (tokstate.value > 0xFFFFFFFF)
+             {
+                 // value out of range: print the entity text including the ';'
+                 tokstate.buf[ tokstate.pos++] = ch;
+                 fallbackEntity();
+                 // the ';' was printed from the buffer, so it has to be consumed
+                 // here; otherwise the caller would print it a second time
+                 m_src.skip();
+                 return true;
+             }
+             push( (UChar)tokstate.value);
+             tokstate.init( TokState::ParsingToken);
+             m_src.skip();
+             return true;
+         }
+         else
+         {
+             unsigned char chval = HEX(ch);
+             if (chval >= tokstate.base)
+             {
+                 // not a digit of this base: the offending character stays in the
+                 // source and is handled by the caller
+                 fallbackEntity();
+                 return true;
+             }
+             tokstate.buf[ tokstate.pos++] = ch;
+             tokstate.value = tokstate.value * tokstate.base + chval;
+             m_src.skip();
+         }
+     }
+     // more digits than the buffer can hold
+     fallbackEntity();
+     return true;
+ }
+
+ /// \brief Try to parse a named entity
+ /// \return true on success or if the text was printed as it is, false if the entity name is not defined (see pushEntity)
+ /// \remark The name is collected in tokstate.buf. Collecting continues at tokstate.pos,
+ ///  so the function can be reentered after an interruption.
+ bool parseNamedEntity()
+ {
+ unsigned char ch;
+ tokstate.id = TokState::ParsingNamedEntity;
+ ch = m_src.ascii();
+ // collect name characters; one byte of the buffer is reserved for the terminating '\0'
+ while (tokstate.pos < sizeof(tokstate.buf)-1 && ch != ';' && m_src.control() == Any)
+ {
+ tokstate.buf[ tokstate.pos] = ch;
+ m_src.skip();
+ tokstate.pos++;
+ ch = m_src.ascii();
+ }
+ if (ch == ';')
+ {
+ tokstate.buf[ tokstate.pos] = '\0';
+ if (!pushEntity( tokstate.buf)) return false;
+ tokstate.init( TokState::ParsingToken);
+ m_src.skip();
+ return true;
+ }
+ else
+ {
+ // name too long or not terminated by ';': print what was consumed as plain text
+ fallbackEntity();
+ return true;
+ }
+ }
+
+ /// \brief Try to recover from an interrupted token parsing state (end of input exception)
+ /// \return true on success
+ /// \remark Only the entity parsing states can be resumed here. Any other state sets
+ ///  ErrInternal and returns false.
+ bool parseTokenRecover()
+ {
+ bool rt = false;
+ // print the character that could not be printed before the interruption
+ if (tokstate.curchr_saved)
+ {
+ push( tokstate.curchr_saved);
+ tokstate.curchr_saved = 0;
+ }
+ switch (tokstate.id)
+ {
+ case TokState::Start:
+ case TokState::ParsingDone:
+ case TokState::ParsingKey:
+ case TokState::ParsingToken:
+ error = ErrInternal;
+ return false;
+ // reenter the entity parser at the stage where it was interrupted
+ case TokState::ParsingEntity: rt = parseEntity(); break;
+ case TokState::ParsingNumericEntity: rt = parseNumericEntity(); break;
+ case TokState::ParsingNumericBaseEntity: rt = parseNumericBaseEntity(); break;
+ case TokState::ParsingNamedEntity: rt = parseNamedEntity(); break;
+ }
+ tokstate.init( TokState::ParsingToken);
+ return rt;
+ }
+
+ /// \brief Parse a token defined by the set of valid token characters
+ /// \param [in] isTok set of valid token characters
+ /// \return true on success
+ /// \remark Characters of the set are copied to the output buffer with end of line
+ ///  normalization, entities ('&...') are resolved in between. On success the token
+ ///  state is left as ParsingDone, on failure it is reset to Start.
+ bool parseToken( const IsTokenCharMap& isTok)
+ {
+ if (tokstate.id == TokState::Start)
+ {
+ // new token: start with an empty output buffer
+ tokstate.id = TokState::ParsingToken;
+ m_outputBuf.clear();
+ }
+ else if (tokstate.id != TokState::ParsingToken)
+ {
+ // reentered in the middle of an entity: finish it first
+ if (!parseTokenRecover())
+ {
+ tokstate.init();
+ return false;
+ }
+ }
+ for (;;)
+ {
+ /// \todo When source and dest encoding are equal, then do not decode
+ /// the value in parsing and encode it when printing. Use some sort
+ /// of enable_if do redirect to a simple buffer copy.
+ ControlCharacter ch;
+ while (isTok[ (unsigned char)(ch=m_src.control())])
+ {
+ unsigned char aa = m_src.ascii();
+ // cheap pre-check: '\r' (0xD) and '\n' (0xA) are both <= 0xD
+ if (aa <= 0xD)
+ {
+ //handling W3C requirements for end of line translation in XML:
+ if (aa == '\r')
+ {
+ // CR is printed as LF; remember it to swallow a directly following LF
+ push( (unsigned char)'\n');
+ tokstate.eolnState = TokState::CR;
+ }
+ else if (aa == '\n')
+ {
+ // LF after CR was already printed with the CR
+ if (tokstate.eolnState != TokState::CR)
+ {
+ push( (unsigned char)'\n');
+ }
+ tokstate.eolnState = TokState::SRC;
+ }
+ else
+ {
+ copychar();
+ tokstate.eolnState = TokState::SRC;
+ }
+ }
+ else
+ {
+ copychar();
+ tokstate.eolnState = TokState::SRC;
+ }
+ m_src.skip();
+ }
+ if (ch == Amp)
+ {
+ // consume the '&' and resolve the entity, then continue with the token
+ m_src.skip();
+ if (!parseEntity()) break;
+ tokstate.init( TokState::ParsingToken);
+ continue;
+ }
+ else
+ {
+ // first character not belonging to the token: token complete
+ tokstate.init( TokState::ParsingDone);
+ return true;
+ }
+ }
+ // only reached when parseEntity failed
+ tokstate.init();
+ return false;
+ }
+
+public:
+ /// \brief Static version of parse a token for parsing table definition elements
+ /// \tparam OutputBufferType type buffer for output
+ /// \param [in] isTok set of valid token characters
+ /// \param [in] ir input reader iterator (passed by value, the caller's reader is not moved)
+ /// \param [out] buf buffer where to write the result to (cleared first)
+ /// \return true on success
+ /// \remark Only numeric entities ('&#NNN;' and '&#xHHH;') are resolved. An invalid
+ ///  entity is printed as character 0, as parseStaticNumericEntityValue reports it.
+ template <class OutputBufferType>
+ static bool parseStaticToken( const IsTokenCharMap& isTok, InputReader ir, OutputBufferType& buf)
+ {
+     static OutputCharSet output;
+     buf.clear();
+     for (;;)
+     {
+         ControlCharacter ch;
+         UChar pc;
+         if (isTok[ (unsigned char)(ch=ir.control())])
+         {
+             pc = ir.chr();
+         }
+         else if (ch == Amp)
+         {
+             // step over the '&': the entity parser expects to start at the '#'
+             ir.skip();
+             pc = parseStaticNumericEntityValue( ir);
+         }
+         else
+         {
+             return true;
+         }
+         output.print( pc, buf);
+         // skips the character printed, respectively the ';' terminating an entity
+         ir.skip();
+     }
+ }
+
+private:
+ /// \brief Skip a token defined by the set of valid token characters (same as parseToken but nothing written to the output buffer)
+ /// \param [in] isTok set of valid token characters
+ /// \return true on success (this function always returns true)
+ /// \remark Entities are not resolved here: an '&' is skipped like a token character
+ ///  and the entity text following it is skipped as far as it consists of token characters.
+ bool skipToken( const IsTokenCharMap& isTok)
+ {
+ do
+ {
+ ControlCharacter ch;
+ while (isTok[ (unsigned char)(ch=m_src.control())] || ch == Amp)
+ {
+ m_src.skip();
+ }
+ }
+ // repeat as long as the next character is of class Any
+ while (m_src.control() == Any);
+ return true;
+ }
+
+ /// \brief Consume the source as long as it matches a given string
+ /// \param [in] str string expected (zero terminated)
+ /// \return true if the whole string was matched, false otherwise (error is set)
+ /// \remark Matching continues at tokstate.pos. In both cases the token state is
+ ///  reset to ParsingDone before returning.
+ bool expectStr( const char* str)
+ {
+     tokstate.id = TokState::ParsingKey;
+     bool matched = true;
+     while (str[tokstate.pos] != '\0')
+     {
+         if (m_src.ascii() != str[ tokstate.pos])
+         {
+             // distinguish a truncated document from a wrong character
+             const ControlCharacter cc = m_src.control();
+             error = (cc == EndOfText) ? ErrUnexpectedEndOfText : ErrSyntaxToken;
+             matched = false;
+             break;
+         }
+         m_src.skip();
+         tokstate.pos++;
+     }
+     tokstate.init( TokState::ParsingDone);
+     return matched;
+ }
+
+ /// \brief Print the character of a predefined named entity
+ /// \param [in] str pointer to the zero terminated entity name
+ /// \return true if the name is one of quot, amp, apos, lt, gt, nbsp and its
+ ///  character was printed, false otherwise (nothing printed)
+ /// \remark nbsp is printed as a plain space character.
+ bool pushPredefinedEntity( const char* str)
+ {
+     struct Predefined
+     {
+         const char* name;
+         char value;
+     };
+     static const Predefined table[] =
+     {
+         {"quot", '\"'},
+         {"amp", '&'},
+         {"apos", '\''},
+         {"lt", '<'},
+         {"gt", '>'},
+         {"nbsp", ' '}
+     };
+     for (std::size_t ti=0; ti<sizeof(table)/sizeof(table[0]); ++ti)
+     {
+         const char* name = table[ ti].name;
+         std::size_t ci = 0;
+         // advance over the common prefix (never reads behind the end of str)
+         while (name[ ci] != '\0' && str[ ci] == name[ ci]) ++ci;
+         if (name[ ci] == '\0' && str[ ci] == '\0')
+         {
+             push( table[ ti].value);
+             return true;
+         }
+     }
+     return false;
+ }
+
+ /// \brief Print the character of an entity defined by name (predefined or defined in the entity table)
+ /// \param [in] str pointer to the zero terminated entity name
+ /// \return true on success, false if the entity is not defined (error is set to ErrUndefinedCharacterEntity)
+ /// \remark EntityMap is keyed by 'const char*', so its find() compares the addresses
+ ///  of the names and not their content. A lookup with the internal name buffer could
+ ///  therefore never succeed. The table is searched by comparing the content of the
+ ///  names instead, which is linear in the number of user defined entities.
+ bool pushEntity( const char* str)
+ {
+     if (pushPredefinedEntity( str))
+     {
+         return true;
+     }
+     if (m_entityMap)
+     {
+         EntityMap::const_iterator itr = m_entityMap->begin(), end = m_entityMap->end();
+         for (; itr != end; ++itr)
+         {
+             const char* key = itr->first;
+             if (!key) continue;
+             std::size_t ii = 0;
+             // advance over the common prefix; stops at the first difference or at the end of str
+             while (str[ ii] != '\0' && str[ ii] == key[ ii]) ++ii;
+             if (str[ ii] == key[ ii])
+             {
+                 // both strings ended at the same position: names are equal
+                 push( itr->second);
+                 return true;
+             }
+         }
+     }
+     error = ErrUndefinedCharacterEntity;
+     return false;
+ }
+
+private:
+ STMState state; ///< current state of the XML scanner
+ Error error; ///< last error code
+ InputReader m_src; ///< source input iterator
+ const EntityMap* m_entityMap; ///< map with entities defined by the caller
+ OutputBuffer m_outputBuf; ///< buffer to use for output
+ OutputCharSet m_output;
+
+public:
+ /// \brief Constructor
+ /// \param [in] p_src source iterator
+ /// \param [in] p_entityMap read only map of named entities defined by the user
+ /// \remark Only the address of p_entityMap is stored; the map has to outlive the scanner.
+ XMLScanner( const InputIterator& p_src, const EntityMap& p_entityMap)
+ :state(START),error(Ok),m_src(InputCharSet(),p_src),m_entityMap(&p_entityMap),m_output(OutputCharSet())
+ {}
+ /// \brief Constructor
+ /// \param [in] p_src source iterator
+ /// \remark No user defined entities: only the predefined entities are known.
+ XMLScanner( const InputIterator& p_src)
+ :state(START),error(Ok),m_src(InputCharSet(),p_src),m_entityMap(0),m_output(OutputCharSet())
+ {}
+ /// \brief Constructor
+ /// \param [in] p_charset character set encoding of input in case of non default settings (code page) needed
+ /// \param [in] p_src source iterator
+ /// \param [in] p_entityMap read only map of named entities defined by the user
+ /// \remark Only the address of p_entityMap is stored; the map has to outlive the scanner.
+ XMLScanner( const InputCharSet& p_charset, const InputIterator& p_src, const EntityMap& p_entityMap)
+ :state(START),error(Ok),m_src(p_charset,p_src),m_entityMap(&p_entityMap),m_output(OutputCharSet())
+ {}
+ /// \brief Constructor
+ /// \param [in] p_charset character set encoding of input in case of non default settings (code page) needed
+ /// \param [in] p_src source iterator
+ XMLScanner( const InputCharSet& p_charset, const InputIterator& p_src)
+ :state(START),error(Ok),m_src(p_charset,p_src),m_entityMap(0),m_output(OutputCharSet())
+ {}
+ /// \brief Constructor
+ /// \param [in] p_charset character set encoding of input in case of non default settings (code page) needed
+ /// \remark No source iterator is passed here; presumably it has to be assigned with setSource() before scanning.
+ XMLScanner( const InputCharSet& p_charset)
+ :state(START),error(Ok),m_src(p_charset),m_entityMap(0)
+ {}
+ /// \brief Default constructor
+ /// \remark No source iterator is passed here; presumably it has to be assigned with setSource() before scanning.
+ XMLScanner()
+ :state(START),error(Ok),m_src(InputCharSet()),m_entityMap(0)
+ {}
+
+ /// \brief Copy constructor
+ /// \param [in] o scanner to copy
+ /// \remark Copies the complete scanner state, including the token parsing state
+ ///  (needed to resume an interrupted token) and the output character set object
+ ///  (which may carry settings). The entity map is shared, not copied.
+ XMLScanner( const XMLScanner& o)
+     :tokstate(o.tokstate)
+     ,state(o.state)
+     ,error(o.error)
+     ,m_src(o.m_src)
+     ,m_entityMap(o.m_entityMap)
+     ,m_outputBuf(o.m_outputBuf)
+     ,m_output(o.m_output)
+ {}
+
+ /// \brief Assign something to the source iterator while keeping the state
+ /// \tparam IteratorAssignment type of the value forwarded to the source reader
+ /// \param [in] a source iterator assignment
+ /// \remark Scanner state, token state and output buffer are not touched.
+ template <class IteratorAssignment>
+ void setSource( const IteratorAssignment& a)
+ {
+ m_src.setSource( a);
+ }
+
+ /// \brief Get the current source iterator position
+ /// \return source iterator position in character words (usually bytes), as reported by the source reader
+ std::size_t getPosition() const
+ {
+ return m_src.getPosition();
+ }
+
+ /// \brief Get the current parsed XML element pointer, if it was not masked out, see nextItem(unsigned short)
+ /// \return pointer to the first element of the output buffer, or a pointer to a constant sequence of zero bytes if the buffer is empty
+ /// \remark Use getItemSize() for the length of the item.
+ const char* getItemPtr() const {return m_outputBuf.size()?&m_outputBuf.at(0):"\0\0\0\0";}
+
+ /// \brief Get the size of the current parsed XML element in bytes
+ /// \return the size of the output buffer
+ std::size_t getItemSize() const {return m_outputBuf.size();}
+
+ /// \brief Get the current parsed XML element, if it was not masked out, see nextItem(unsigned short)
+ /// \return the item string (reference to the internal output buffer)
+ const OutputBuffer& getItem() const
+ {
+ return m_outputBuf;
+ }
+
+ /// \brief Get the current XML scanner state machine state
+ /// \return pointer to the state machine element (action, transitions, fallback and error) of the current state
+ /// \remark The state machine table is a function local static, built on first use.
+ ScannerStatemachine::Element* getState()
+ {
+ static Statemachine stm;
+ return stm.get( state);
+ }
+
+ /// \brief Get the last error
+ /// \param [out] str the error as string (optional, may be 0)
+ /// \return the error code
+ /// \remark The stored error is reset to Ok by this call, so a second call returns Ok.
+ Error getError( const char** str=0)
+ {
+ Error rt = error;
+ error = Ok;
+ if (str) *str=getErrorString(rt);
+ return rt;
+ }
+
+ /// \brief Scan the next XML element
+ /// \param [in] mask element types that should be printed to the output buffer (1 -> print, 0 -> mask out, just return the element as event)
+ /// \return the type of the XML element, ErrorOccurred on error (see getError)
+ /// \remark Runs the state machine until a state produces an element. For each state
+ ///  the action of the state is executed first (unless it was already completed
+ ///  before an interruption), then the transition for the next source character is taken.
+ ElementType nextItem( unsigned short mask=0xFFFF)
+ {
+ static const IsWordCharMap wordC;
+ static const IsContentCharMap contentC;
+ static const IsTagCharMap tagC;
+ static const IsSQStringCharMap sqC;
+ static const IsDQStringCharMap dqC;
+ // both tables are indexed by the action operation id (sd->action.op):
+ // tokenDefs holds the character set for token actions, stringDefs the keyword for expect actions
+ static const IsTokenCharMap* tokenDefs[ NofSTMActions] = {0,&wordC,&contentC,&tagC,&sqC,&dqC,0,0,0};
+ static const char* stringDefs[ NofSTMActions] = {0,0,0,0,0,0,"xml","CDATA",0};
+
+ ElementType rt = None;
+ ControlCharacter ch;
+ do
+ {
+ ScannerStatemachine::Element* sd = getState();
+ if (sd->action.op != -1)
+ {
+ if (tokenDefs[sd->action.op])
+ {
+ // ParsingDone: the token was already completed, only the transition is missing
+ if (tokstate.id != TokState::ParsingDone)
+ {
+ // the action argument is the element type; its bit in mask selects print or skip
+ if ((mask&(1<<sd->action.arg)) != 0)
+ {
+ if (!parseToken( *tokenDefs[ sd->action.op])) return ErrorOccurred;
+ }
+ else
+ {
+ if (!skipToken( *tokenDefs[ sd->action.op])) return ErrorOccurred;
+ }
+ }
+ rt = (ElementType)sd->action.arg;
+ }
+ else if (stringDefs[sd->action.op])
+ {
+ if (tokstate.id != TokState::ParsingDone)
+ {
+ if (!expectStr( stringDefs[sd->action.op])) return ErrorOccurred;
+ if (sd->action.op == ExpectIdentifierXML)
+ {
+ //... special treatment for xml header for not
+ // enforcing the model too much just for this case
+ push( '?'); push( 'x'); push( 'm'); push( 'l');
+ rt = HeaderStart;
+ }
+ }
+ else if (sd->action.op == ExpectIdentifierXML)
+ {
+ //... special treatment for xml header for not
+ // enforcing the model too much just for this case
+ rt = HeaderStart;
+ }
+ }
+ else
+ {
+ // action without token or keyword: element with empty value
+ m_outputBuf.clear();
+ rt = (ElementType)sd->action.arg;
+ }
+ if (sd->nofnext == 0)
+ {
+ // state without transitions: follow the fallback (if any) without reading a character
+ if (sd->fallbackState != -1)
+ {
+ state = (STMState)sd->fallbackState;
+ }
+ return rt;
+ }
+ }
+ ch = m_src.control();
+ tokstate.id = TokState::Start;
+
+ if (sd->next[ ch] != -1)
+ {
+ // explicit transition: consumes the character
+ state = (STMState)sd->next[ ch];
+ m_src.skip();
+ }
+ else if (sd->fallbackState != -1)
+ {
+ // fallback transition: the character is not consumed
+ state = (STMState)sd->fallbackState;
+ }
+ else if (sd->missError != -1)
+ {
+ error = (Error)sd->missError;
+ return ErrorOccurred;
+ }
+ else if (ch == EndOfText)
+ {
+ error = ErrUnexpectedEndOfText;
+ return ErrorOccurred;
+ }
+ else
+ {
+ error = ErrInternal;
+ return ErrorOccurred;
+ }
+ }
+ while (rt == None);
+ return rt;
+ }
+
+ /// \class End
+ /// \brief end of input tag
+ /// \remark Empty tag type used to select the constructors that create the end iterator (see end()).
+ struct End {};
+
+ /// \class iterator
+ /// \brief Input iterator over the elements delivered by an XML scanner
+ /// \remark The iterator does not own the scanner. The element it refers to points
+ ///  into the output buffer of the scanner.
+ class iterator
+ {
+ public:
+     /// \class Element
+     /// \brief The element an iterator currently refers to (type, value pointer and value size)
+     class Element
+     {
+     private:
+         friend class iterator;
+         ElementType m_type; ///< type of the element
+         const char* m_content; ///< value string of the element
+         std::size_t m_size; ///< size of the value string in bytes
+     public:
+         /// \brief Default constructor (element of type None without value)
+         Element()
+             :m_type(None),m_content(0),m_size(0) {}
+         /// \brief Constructor of the end of input element (type Exit without value)
+         Element( const End&)
+             :m_type(Exit),m_content(0),m_size(0) {}
+         /// \brief Copy constructor
+         /// \param [in] orig element to copy
+         Element( const Element& orig)
+             :m_type(orig.m_type),m_content(orig.m_content),m_size(orig.m_size) {}
+
+         /// \brief Type of the current element
+         ElementType type() const
+         {
+             return m_type;
+         }
+         /// \brief Type of the current element as string
+         const char* name() const
+         {
+             return getElementTypeName( m_type);
+         }
+         /// \brief Value of the current element
+         const char* content() const
+         {
+             return m_content;
+         }
+         /// \brief Size of the value of the current element in bytes
+         std::size_t size() const
+         {
+             return m_size;
+         }
+     };
+     // input iterator traits
+     typedef Element value_type;
+     typedef std::size_t difference_type;
+     typedef std::size_t size_type;
+     typedef Element* pointer;
+     typedef Element& reference;
+     typedef std::input_iterator_tag iterator_category;
+
+ private:
+     Element element; ///< currently visited element
+     ThisXMLScanner* input; ///< XML scanner
+
+     /// \brief Fetch the next element from the scanner (no operation if there is no scanner)
+     /// \param [in] mask element types that should be printed to the output buffer (1 -> print, 0 -> mask out, just return the element as event)
+     /// \return *this
+     iterator& skip( unsigned short mask=0xFFFF)
+     {
+         if (input == 0) return *this;
+         element.m_type = input->nextItem(mask);
+         element.m_content = input->getItemPtr();
+         element.m_size = input->getItemSize();
+         return *this;
+     }
+
+     /// \brief Compare iterator with another
+     /// \param [in] iter iterator to compare with
+     /// \return true if both refer to an element of type Exit or both to an element of type None
+     bool compare( const iterator& iter) const
+     {
+         const ElementType mine = element.type();
+         if (mine != iter.element.type()) return false;
+         //equal only at beginning and end
+         return (mine == Exit || mine == None);
+     }
+ public:
+     /// \brief Constructor (iterator without scanner, element of type None)
+     iterator()
+         :input(0) {}
+     /// \brief Constructor of the end iterator
+     iterator( const End& et)
+         :element(et),input(0) {}
+     /// \brief Constructor
+     /// \param [in] p_input XML scanner to use for iteration
+     /// \param [in] doSkipToFirst true, if the iterator should fetch the first element immediately (default behaviour of STL conform iterators but maybe not exception safe)
+     iterator( ThisXMLScanner& p_input, bool doSkipToFirst=true)
+         :input( &p_input)
+     {
+         if (doSkipToFirst) skip();
+     }
+     /// \brief Copy constructor
+     /// \param [in] orig iterator to copy
+     iterator( const iterator& orig)
+         :element(orig.element),input(orig.input) {}
+
+     /// \brief Assign an iterator to another
+     /// \param [in] orig iterator to copy
+     void assign( const iterator& orig)
+     {
+         element = orig.element;
+         input = orig.input;
+     }
+     /// \brief Assignment operator
+     /// \param [in] orig iterator to assign to this
+     /// \return *this
+     iterator& operator = (const iterator& orig)
+     {
+         assign( orig);
+         return *this;
+     }
+
+     /// \brief Element dereference operator
+     const Element& operator*() const {return element;}
+     /// \brief Element member access operator
+     const Element* operator->() const {return &element;}
+
+     /// \brief Preincrement
+     /// \return *this
+     iterator& operator++()
+     {
+         return skip();
+     }
+     /// \brief Postincrement
+     /// \return copy of the iterator made before the increment
+     iterator operator++(int)
+     {
+         iterator previous(*this);
+         skip();
+         return previous;
+     }
+
+     /// \brief Compare to check for equality
+     /// \return true, if equal
+     bool operator==( const iterator& iter) const
+     {
+         return compare( iter);
+     }
+     /// \brief Compare to check for inequality
+     /// \return true, if not equal
+     bool operator!=( const iterator& iter) const
+     {
+         return !compare( iter);
+     }
+ };
+
+ /// \brief Get begin iterator
+ /// \return iterator bound to this scanner
+ /// \param [in] doSkipToFirst true, if the iterator should fetch the first element immediately (default behaviour of STL conform iterators but maybe not exception safe)
+ /// \remark The iterator advances this scanner; it does not work on a copy.
+ iterator begin( bool doSkipToFirst=true)
+ {
+ return iterator( *this, doSkipToFirst);
+ }
+ /// \brief Get the end iterator (marks the end of content)
+ /// \return iterator not bound to a scanner, constructed from the End tag
+ iterator end()
+ {
+ return iterator( End());
+ }
+};
+
+}//namespace
+#endif
+
+
diff --git a/textwolf/include/textwolf/xmltagstack.hpp b/textwolf/include/textwolf/xmltagstack.hpp
new file mode 100644
index 0000000..a4671fe
--- /dev/null
+++ b/textwolf/include/textwolf/xmltagstack.hpp
@@ -0,0 +1,146 @@
+/*
+---------------------------------------------------------------------
+ The template library textwolf implements an input iterator on
+ a set of XML path expressions without backward references on an
+ STL conforming input iterator as source. It does no buffering
+ or read ahead and is dedicated for stream processing of XML
+ for a small set of XML queries.
+ Stream processing in this context refers to processing the
+ document without buffering anything but the current result token
+ processed with its tag hierarchy information.
+
+ Copyright (C) 2010,2011,2012,2013,2014 Patrick Frey
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 3.0 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+--------------------------------------------------------------------
+
+ The latest version of textwolf can be found at 'http://github.com/patrickfrey/textwolf'
+ For documentation see 'http://patrickfrey.github.com/textwolf'
+
+--------------------------------------------------------------------
+*/
+/// \file textwolf/xmltagstack.hpp
+/// \brief textwolf XML printer tag stack
+
+#ifndef __TEXTWOLF_XML_TAG_STACK_HPP__
+#define __TEXTWOLF_XML_TAG_STACK_HPP__
+#include <cstdlib>
+#include <cstring>
+#include <new>
+#include <stdexcept>
+
+/// \namespace textwolf
+/// \brief Toplevel namespace of the library
+namespace textwolf {
+
+/// \class TagStack
+/// \brief stack of tag names
+/// \remark All tags are stored in one contiguous buffer. Each entry consists of the
+///  tag bytes, padding up to a multiple of sizeof(std::size_t) and the tag size as
+///  std::size_t. The size is stored behind the tag so that the topmost entry can be
+///  located from the current end position of the buffer.
+class TagStack
+{
+public:
+    /// \brief Destructor
+    ~TagStack()
+    {
+        std::free( m_ptr);
+    }
+
+    /// \brief Default constructor
+    /// \remark throws std::bad_alloc if the initial buffer cannot be allocated
+    TagStack()
+        :m_ptr(0),m_pos(0),m_size(InitSize)
+    {
+        if ((m_ptr=(char*)std::malloc( m_size)) == 0) throw std::bad_alloc();
+    }
+    /// \brief Copy constructor
+    /// \param[in] o stack to copy
+    TagStack( const TagStack& o)
+        :m_ptr(0),m_pos(o.m_pos),m_size(o.m_size)
+    {
+        if ((m_ptr=(char*)std::malloc( m_size)) == 0) throw std::bad_alloc();
+        std::memcpy( m_ptr, o.m_ptr, m_pos);
+    }
+    /// \brief Assignment operator
+    /// \param[in] o stack to copy
+    /// \return *this
+    /// \remark Needed because the implicitly generated assignment would share the
+    ///  buffer between both objects and free it twice. This stack is left unchanged
+    ///  if the allocation fails.
+    TagStack& operator=( const TagStack& o)
+    {
+        if (this != &o)
+        {
+            char* xx = (char*)std::malloc( o.m_size);
+            if (!xx) throw std::bad_alloc();
+            std::memcpy( xx, o.m_ptr, o.m_pos);
+            std::free( m_ptr);
+            m_ptr = xx;
+            m_pos = o.m_pos;
+            m_size = o.m_size;
+        }
+        return *this;
+    }
+
+    /// \brief Push a tag on top
+    /// \param[in] pp pointer to tag value to push
+    /// \param[in] nn size of tag value to push in bytes
+    /// \remark throws std::logic_error if the entry size overflows and std::bad_alloc
+    ///  if the buffer cannot be enlarged. The stack is left unchanged in both cases.
+    void push( const char* pp, std::size_t nn)
+    {
+        const std::size_t maxsize = (std::size_t)-1;
+        std::size_t align = getAlign( nn);
+        std::size_t ofs = nn + align + sizeof( std::size_t);
+        // checked before anything is copied: a wrapped around entry size must not pass the capacity test
+        if (nn > ofs) throw std::logic_error( "invalid tag offset");
+        if (ofs > maxsize - m_pos) throw std::bad_alloc();
+        if (m_pos + ofs > m_size)
+        {
+            std::size_t newsize = m_size;
+            while (m_pos + ofs > newsize)
+            {
+                if (newsize > maxsize / 2) throw std::bad_alloc();
+                newsize *= 2;
+            }
+            char* xx = (char*)std::realloc( m_ptr, newsize);
+            if (!xx) throw std::bad_alloc();
+            // the new capacity is recorded only after the buffer really has been enlarged
+            m_ptr = xx;
+            m_size = newsize;
+        }
+        if (nn) std::memcpy( m_ptr + m_pos, pp, nn);
+        m_pos += ofs;
+        // m_pos is always a multiple of sizeof(std::size_t), so the size slot is aligned
+        void* tt = m_ptr + m_pos - sizeof( std::size_t);
+        *(std::size_t*)(tt) = nn;
+    }
+
+    /// \brief Get the topmost tag
+    /// \param[out] element pointer to topmost tag value
+    /// \param[out] elementsize size of topmost tag value in bytes
+    /// \return true on success, false if the stack is empty
+    /// \remark The pointer refers to the internal buffer and is invalidated by push() and assignment.
+    bool top( const void*& element, std::size_t& elementsize) const
+    {
+        std::size_t ofs = topofs(elementsize);
+        if (!ofs) return false;
+        element = m_ptr + m_pos - ofs;
+        return true;
+    }
+
+    /// \brief Pop (remove) the topmost tag
+    /// \remark No operation on an empty stack. Throws std::runtime_error if the stack
+    ///  is not empty but no valid topmost entry can be located.
+    void pop()
+    {
+        std::size_t elementsize=0;
+        std::size_t ofs = topofs(elementsize);
+        if (!ofs)
+        {
+            if (m_pos) throw std::runtime_error( "corrupt tag stack");
+            return;
+        }
+        m_pos -= ofs;
+    }
+
+    /// \brief Find out if the stack is empty
+    /// \return true if yes
+    bool empty() const
+    {
+        return (m_pos == 0);
+    }
+
+private:
+    /// \brief Get the total size (tag, padding and size slot) of the topmost entry
+    /// \param[out] elementsize size of the topmost tag value in bytes
+    /// \return the entry size in bytes, 0 if there is no valid topmost entry
+    std::size_t topofs( std::size_t& elementsize) const
+    {
+        if (m_pos < sizeof( std::size_t)) return 0;
+        const void* tt = m_ptr + (m_pos - sizeof( std::size_t));
+        elementsize = *(const std::size_t*)(tt);
+        std::size_t align = getAlign( elementsize);
+        std::size_t ofs = elementsize + align + sizeof( std::size_t);
+        if (ofs > m_pos) return 0;
+        return ofs;
+    }
+private:
+    enum {InitSize=256};
+    char* m_ptr; ///< tag hierarchy stack buffer
+    std::size_t m_pos; ///< current position in the tag hierarchy stack buffer
+    std::size_t m_size; ///< allocated size of the tag hierarchy stack buffer in bytes
+
+    /// \brief Get the number of padding bytes needed to fill n up to a multiple of sizeof(std::size_t)
+    static std::size_t getAlign( std::size_t n)
+    {
+        return (sizeof(std::size_t) - (n & (sizeof(std::size_t)-1))) & (sizeof(std::size_t)-1);
+    }
+};
+
+} //namespace
+#endif
diff --git a/textwolf/license.txt b/textwolf/license.txt
new file mode 100644
index 0000000..65c5ca8
--- /dev/null
+++ b/textwolf/license.txt
@@ -0,0 +1,165 @@
+ GNU LESSER GENERAL PUBLIC LICENSE
+ Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+
+ This version of the GNU Lesser General Public License incorporates
+the terms and conditions of version 3 of the GNU General Public
+License, supplemented by the additional permissions listed below.
+
+ 0. Additional Definitions.
+
+ As used herein, "this License" refers to version 3 of the GNU Lesser
+General Public License, and the "GNU GPL" refers to version 3 of the GNU
+General Public License.
+
+ "The Library" refers to a covered work governed by this License,
+other than an Application or a Combined Work as defined below.
+
+ An "Application" is any work that makes use of an interface provided
+by the Library, but which is not otherwise based on the Library.
+Defining a subclass of a class defined by the Library is deemed a mode
+of using an interface provided by the Library.
+
+ A "Combined Work" is a work produced by combining or linking an
+Application with the Library. The particular version of the Library
+with which the Combined Work was made is also called the "Linked
+Version".
+
+ The "Minimal Corresponding Source" for a Combined Work means the
+Corresponding Source for the Combined Work, excluding any source code
+for portions of the Combined Work that, considered in isolation, are
+based on the Application, and not on the Linked Version.
+
+ The "Corresponding Application Code" for a Combined Work means the
+object code and/or source code for the Application, including any data
+and utility programs needed for reproducing the Combined Work from the
+Application, but excluding the System Libraries of the Combined Work.
+
+ 1. Exception to Section 3 of the GNU GPL.
+
+ You may convey a covered work under sections 3 and 4 of this License
+without being bound by section 3 of the GNU GPL.
+
+ 2. Conveying Modified Versions.
+
+ If you modify a copy of the Library, and, in your modifications, a
+facility refers to a function or data to be supplied by an Application
+that uses the facility (other than as an argument passed when the
+facility is invoked), then you may convey a copy of the modified
+version:
+
+ a) under this License, provided that you make a good faith effort to
+ ensure that, in the event an Application does not supply the
+ function or data, the facility still operates, and performs
+ whatever part of its purpose remains meaningful, or
+
+ b) under the GNU GPL, with none of the additional permissions of
+ this License applicable to that copy.
+
+ 3. Object Code Incorporating Material from Library Header Files.
+
+ The object code form of an Application may incorporate material from
+a header file that is part of the Library. You may convey such object
+code under terms of your choice, provided that, if the incorporated
+material is not limited to numerical parameters, data structure
+layouts and accessors, or small macros, inline functions and templates
+(ten or fewer lines in length), you do both of the following:
+
+ a) Give prominent notice with each copy of the object code that the
+ Library is used in it and that the Library and its use are
+ covered by this License.
+
+ b) Accompany the object code with a copy of the GNU GPL and this license
+ document.
+
+ 4. Combined Works.
+
+ You may convey a Combined Work under terms of your choice that,
+taken together, effectively do not restrict modification of the
+portions of the Library contained in the Combined Work and reverse
+engineering for debugging such modifications, if you also do each of
+the following:
+
+ a) Give prominent notice with each copy of the Combined Work that
+ the Library is used in it and that the Library and its use are
+ covered by this License.
+
+ b) Accompany the Combined Work with a copy of the GNU GPL and this license
+ document.
+
+ c) For a Combined Work that displays copyright notices during
+ execution, include the copyright notice for the Library among
+ these notices, as well as a reference directing the user to the
+ copies of the GNU GPL and this license document.
+
+ d) Do one of the following:
+
+ 0) Convey the Minimal Corresponding Source under the terms of this
+ License, and the Corresponding Application Code in a form
+ suitable for, and under terms that permit, the user to
+ recombine or relink the Application with a modified version of
+ the Linked Version to produce a modified Combined Work, in the
+ manner specified by section 6 of the GNU GPL for conveying
+ Corresponding Source.
+
+ 1) Use a suitable shared library mechanism for linking with the
+ Library. A suitable mechanism is one that (a) uses at run time
+ a copy of the Library already present on the user's computer
+ system, and (b) will operate properly with a modified version
+ of the Library that is interface-compatible with the Linked
+ Version.
+
+ e) Provide Installation Information, but only if you would otherwise
+ be required to provide such information under section 6 of the
+ GNU GPL, and only to the extent that such information is
+ necessary to install and execute a modified version of the
+ Combined Work produced by recombining or relinking the
+ Application with a modified version of the Linked Version. (If
+ you use option 4d0, the Installation Information must accompany
+ the Minimal Corresponding Source and Corresponding Application
+ Code. If you use option 4d1, you must provide the Installation
+ Information in the manner specified by section 6 of the GNU GPL
+ for conveying Corresponding Source.)
+
+ 5. Combined Libraries.
+
+ You may place library facilities that are a work based on the
+Library side by side in a single library together with other library
+facilities that are not Applications and are not covered by this
+License, and convey such a combined library under terms of your
+choice, if you do both of the following:
+
+ a) Accompany the combined library with a copy of the same work based
+ on the Library, uncombined with any other library facilities,
+ conveyed under the terms of this License.
+
+ b) Give prominent notice with the combined library that part of it
+ is a work based on the Library, and explaining where to find the
+ accompanying uncombined form of the same work.
+
+ 6. Revised Versions of the GNU Lesser General Public License.
+
+ The Free Software Foundation may publish revised and/or new versions
+of the GNU Lesser General Public License from time to time. Such new
+versions will be similar in spirit to the present version, but may
+differ in detail to address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Library as you received it specifies that a certain numbered version
+of the GNU Lesser General Public License "or any later version"
+applies to it, you have the option of following the terms and
+conditions either of that published version or of any later version
+published by the Free Software Foundation. If the Library as you
+received it does not specify a version number of the GNU Lesser
+General Public License, you may choose any version of the GNU Lesser
+General Public License ever published by the Free Software Foundation.
+
+ If the Library as you received it specifies that a proxy can decide
+whether future versions of the GNU Lesser General Public License shall
+apply, that proxy's public statement of acceptance of any version is
+permanent authorization for you to choose that version for the
+Library.