diff options
author | Andreas Baumann <abaumann@yahoo.com> | 2012-07-14 17:16:21 +0200 |
---|---|---|
committer | Andreas Baumann <abaumann@yahoo.com> | 2012-07-14 17:16:21 +0200 |
commit | 54cce110784d33d658b5f78286a98bee244a9eeb (patch) | |
tree | 9c4d998343e7dc88323ae8ef6d5a04c6b958df9c /tests/streamhtmlparser | |
parent | fcb682cb1955d362390665330fdf476cab7dc10b (diff) | |
download | crawler-54cce110784d33d658b5f78286a98bee244a9eeb.tar.gz crawler-54cce110784d33d658b5f78286a98bee244a9eeb.tar.bz2 |
added streamhtmlparser
Diffstat (limited to 'tests/streamhtmlparser')
-rw-r--r-- | tests/streamhtmlparser/GNUmakefile | 26 | ||||
-rw-r--r-- | tests/streamhtmlparser/test1.cpp | 74 | ||||
-rw-r--r-- | tests/streamhtmlparser/test1.html | 55 |
3 files changed, 155 insertions, 0 deletions
diff --git a/tests/streamhtmlparser/GNUmakefile b/tests/streamhtmlparser/GNUmakefile new file mode 100644 index 0000000..0fad0a5 --- /dev/null +++ b/tests/streamhtmlparser/GNUmakefile @@ -0,0 +1,26 @@ +TOPDIR = ../.. + +SUBDIRS = + +INCLUDE_DIRS = \ + -I$(TOPDIR)/streamhtmlparser/include + +INCLUDE_LDFLAGS = + +INCLUDE_LIBS = \ + $(TOPDIR)/streamhtmlparser/libstreamhtmlparser.a + +TEST_CPP_BINS = \ + test1$(EXE) + +OBJS = + +-include $(TOPDIR)/makefiles/gmake/sub.mk + +local_all: + +local_clean: + +local_distclean: + +local_test: diff --git a/tests/streamhtmlparser/test1.cpp b/tests/streamhtmlparser/test1.cpp new file mode 100644 index 0000000..2e13c5c --- /dev/null +++ b/tests/streamhtmlparser/test1.cpp @@ -0,0 +1,74 @@ +#include "htmlparser_cpp.h" + +#include <iostream> +#include <fstream> + +using namespace std; +using namespace streamhtmlparser; + +int main( int argc, char *argv[] ) +{ + if( argc != 2 ) { + cerr << "Usage: test1 <HTML file>\n" << endl; + return 1; + } + + char *htmlFileName = argv[1]; + + HtmlParser parser; + + ifstream htmlFile( htmlFileName ); + if( !htmlFile.good( ) ) { + cerr << "ERROR: Can't open file '" << htmlFileName << "'" << endl; + return 1; + } + + char buf[1] = {0}; + while( htmlFile.good( ) && !htmlFile.eof( ) ) { + buf[0] = htmlFile.get( ); + parser.Parse( buf, 1 ); + int state = parser.state( ); + switch( state ) { + case HtmlParser::STATE_TEXT: + cout << "TEXT '" << buf[0] << "'" + << " '" << ( ( parser.tag( ) != NULL ) ? parser.tag( ) : "NULL" ) << "'" + << " '" << ( ( parser.attribute( ) != NULL ) ? parser.attribute( ) : "NULL" ) << "'" + << " '" << ( ( parser.value( ) != NULL ) ? parser.value( ) : "NULL" ) << "'" + << endl; + break; + + case HtmlParser::STATE_TAG: + cout << "TAG" + << " '" << ( ( parser.tag( ) != NULL ) ? parser.tag( ) : "NULL" ) << "'" + << " '" << ( ( parser.attribute( ) != NULL ) ? parser.attribute( ) : "NULL" ) << "'" + << " '" << ( ( parser.value( ) != NULL ) ? parser.value( ) : "NULL" ) << "'" + << endl; + break; + + case HtmlParser::STATE_ATTR: + cout << "ATTR" + << " '" << ( ( parser.tag( ) != NULL ) ? parser.tag( ) : "NULL" ) << "'" + << " '" << ( ( parser.attribute( ) != NULL ) ? parser.attribute( ) : "NULL" ) << "'" + << " '" << ( ( parser.value( ) != NULL ) ? parser.value( ) : "NULL" ) << "'" + << endl; + break; + + case HtmlParser::STATE_VALUE: + cout << "VALUE" + << " '" << ( ( parser.tag( ) != NULL ) ? parser.tag( ) : "NULL" ) << "'" + << " '" << ( ( parser.attribute( ) != NULL ) ? parser.attribute( ) : "NULL" ) << "'" + << " '" << ( ( parser.value( ) != NULL ) ? parser.value( ) : "NULL" ) << "'" + << endl; + break; + + case HtmlParser::STATE_ERROR: + cerr << endl << "ERROR at " << endl; + return 1; + + default: + cout << "UNKNOWN state " << state << endl; + } + } + + return 0; +} diff --git a/tests/streamhtmlparser/test1.html b/tests/streamhtmlparser/test1.html new file mode 100644 index 0000000..27d9184 --- /dev/null +++ b/tests/streamhtmlparser/test1.html @@ -0,0 +1,55 @@ +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" + "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"> +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en"> +<head> + <title>Andreas Baumann's Personal Home Page</title> + <meta http-equiv="content-type" content="text/html; charset=iso-8859-1"/> + <meta name="author" content="Andreas Baumann"/> + <link rel="stylesheet" href="css/style.css" type="text/css" media="screen"/> +</head> + +<body> + <div id="container"> + + <div id="sitename"> + <h1>Andreas Baumann's Personal Home Page</h1> + </div> + + <div id="mainmenu"> + <ul> + <li><a href="/">Home</a></li> + <li><a href="/software.shtml">Software</a></li> + <li><a href="/toolbox.shtml">Toolbox</a></li> + </ul> +</div> + + <div id="wrap"> + + + +<div id="leftside"> +</div> + + +<div id="content"> + +<h1>Andreas Baumann's Personal Home Page</h1> + +<p>Written in plain HTML with some server-side includes. Synchronization is done + with rsync over ssh. If you ask why, well, the two wikis I had before + (I don't mention names) were hacked in no time.. :-) +</p> + +</div> <!-- end content --> +<div class="clearingdiv"> </div> +</div> <!-- end wrap --> +</div> <!-- end container --> + +<div id="footer"> + © 2009 - 2012, Zurich, Andreas Baumann +</div> + +</body> + +</html> + |