diff options
author | Andreas Baumann <abaumann@yahoo.com> | 2012-08-08 21:40:10 +0200 |
---|---|---|
committer | Andreas Baumann <abaumann@yahoo.com> | 2012-08-08 21:40:10 +0200 |
commit | 920ba33a9d12a3d2117112ce5676f606d708964c (patch) | |
tree | 059dfb115273f4b9617bd5016151d69212a4913f | |
parent | 3fb1aad6a170b546c193ecc12c8957a628840d52 (diff) | |
download | crawler-920ba33a9d12a3d2117112ce5676f606d708964c.tar.gz crawler-920ba33a9d12a3d2117112ce5676f606d708964c.tar.bz2 |
added a file rewind input stream
started to add MIME type detection and a module based on libmagic (not finished yet)
-rw-r--r-- | src/GNUmakefile | 3 | ||||
-rw-r--r-- | src/MIMEType.cpp | 5 | ||||
-rw-r--r-- | src/MIMEType.hpp | 93 | ||||
-rw-r--r-- | src/Processor.hpp | 3 | ||||
-rw-r--r-- | src/TypeDetect.hpp | 15 | ||||
-rw-r--r-- | src/crawlingwolf.cpp | 1 | ||||
-rw-r--r-- | src/modules/GNUmakefile | 4 | ||||
-rw-r--r-- | src/modules/fetcher/GNUmakefile | 2 | ||||
-rw-r--r-- | src/modules/fetcher/file/FileFetcher.cpp | 12 | ||||
-rw-r--r-- | src/modules/fetcher/file/FileFetcher.hpp | 19 | ||||
-rw-r--r-- | src/modules/fetcher/file/FileRewindInputStream.cpp | 20 | ||||
-rw-r--r-- | src/modules/fetcher/file/FileRewindInputStream.hpp | 16 | ||||
-rw-r--r-- | src/modules/fetcher/file/GNUmakefile | 40 | ||||
-rw-r--r-- | src/modules/typedetect/GNUmakefile | 17 | ||||
-rw-r--r-- | src/modules/typedetect/libmagic/GNUmakefile | 40 | ||||
-rw-r--r-- | src/modules/typedetect/libmagic/LibMagicTypeDetect.cpp | 42 | ||||
-rw-r--r-- | src/modules/typedetect/libmagic/LibMagicTypeDetect.hpp | 24 | ||||
-rw-r--r-- | tests/GNUmakefile | 2 | ||||
-rw-r--r-- | tests/typedetect/GNUmakefile | 39 | ||||
-rwxr-xr-x | tests/typedetect/exec_test | 12 | ||||
-rw-r--r-- | tests/typedetect/test1.MUST | 0 | ||||
-rw-r--r-- | tests/typedetect/test1.cpp | 65 |
22 files changed, 469 insertions, 5 deletions
diff --git a/src/GNUmakefile b/src/GNUmakefile index 8f1657e..11bc63f 100644 --- a/src/GNUmakefile +++ b/src/GNUmakefile @@ -23,7 +23,8 @@ INCLUDE_LIBS += \ endif LOCAL_STATIC_LIB_OBJS = \ - URL.o + URL.o \ + MIMEType.o CPP_OBJS = \ $(LOCAL_STATIC_LIB_OBJS) diff --git a/src/MIMEType.cpp b/src/MIMEType.cpp new file mode 100644 index 0000000..25dc20c --- /dev/null +++ b/src/MIMEType.cpp @@ -0,0 +1,5 @@ +#include "MIMEType.hpp" + +MIMEType MIMEType::Null; + + diff --git a/src/MIMEType.hpp b/src/MIMEType.hpp new file mode 100644 index 0000000..02fc503 --- /dev/null +++ b/src/MIMEType.hpp @@ -0,0 +1,93 @@ +#ifndef __MIMETYPE_H +#define __MIMETYPE_H + +#include <string> +#include <iostream> +#include <sstream> + +class MIMEType { + protected: + std::string m_type; + std::string m_subtype; + + public: + MIMEType( ) + : m_type( "" ), m_subtype( "" ) + { + } + + MIMEType( const std::string _type, const std::string _subtype ) + : m_type( _type ), m_subtype( _subtype ) + { + } + + MIMEType( const MIMEType &m ) + : m_type( m.m_type ), m_subtype( m.m_subtype ) + { + } + + MIMEType( const char *s ) + : m_type( s ), m_subtype( "" ) + { + } + + MIMEType& operator=( const MIMEType &m ) + { + if( this != &m ) { + this->m_type = m.m_type; + this->m_subtype = m.m_subtype; + } + return *this; + } + + const std::string type( ) const + { + return m_type; + } + + const std::string subtype( ) const + { + return m_subtype; + } + + std::string str( ) const + { + std::ostringstream os; + os << *this; + return os.str( ); + } + + static MIMEType Null; + + bool operator!=( const MIMEType &other ) const + { + return( str( ) != other.str( ) ); + } + + bool operator==( const MIMEType &other ) const + { + return( str( ) == other.str( ) ); + } + + bool operator<( const MIMEType &other ) const + { + return( str( ) < other.str( ) ); + } + + template< typename CharT, typename TraitsT > friend + std::basic_ostream< CharT, TraitsT >& operator<<( std::basic_ostream<CharT, TraitsT>&s, const MIMEType& m ); +}; + +template< typename CharT, typename TraitsT > +inline std::basic_ostream< CharT, TraitsT >& operator<<( std::basic_ostream< CharT, TraitsT > &s, const MIMEType &m ) +{ + if( m.type( ).empty( ) ) { + return s; + } + + s << m.type( ) << "/" << m.subtype( ); + + return s; +} + +#endif diff --git a/src/Processor.hpp b/src/Processor.hpp index b796e65..bc17ec0 100644 --- a/src/Processor.hpp +++ b/src/Processor.hpp @@ -5,7 +5,8 @@ class Processor { public: - virtual ~Processor( ) {}; + virtual ~Processor( ) { } + virtual void process( RewindInputStream *s ) = 0; }; diff --git a/src/TypeDetect.hpp b/src/TypeDetect.hpp new file mode 100644 index 0000000..7db714b --- /dev/null +++ b/src/TypeDetect.hpp @@ -0,0 +1,15 @@ +#ifndef __TYPE_DETECTION_H +#define __TYPE_DETECTION_H + +#include "RewindInputStream.hpp" +#include "MIMEType.hpp" + +class TypeDetect +{ + public: + virtual ~TypeDetect( ) { }; + + virtual MIMEType detect( RewindInputStream *s ) = 0; +}; + +#endif diff --git a/src/crawlingwolf.cpp b/src/crawlingwolf.cpp index c34b345..a1a5151 100644 --- a/src/crawlingwolf.cpp +++ b/src/crawlingwolf.cpp @@ -5,6 +5,7 @@ #include "URLSeen.hpp" #include "URLNormalizer.hpp" #include "URLFilter.hpp" +#include "TypeDetect.hpp" #include "ModuleLoader.hpp" diff --git a/src/modules/GNUmakefile b/src/modules/GNUmakefile index 31dc26c..4d032d5 100644 --- a/src/modules/GNUmakefile +++ b/src/modules/GNUmakefile @@ -1,6 +1,8 @@ TOPDIR = ../.. -SUBDIRS = urlnormalizer urlfilter frontier fetcher urlseen deduper processor +SUBDIRS = \ + urlnormalizer urlfilter frontier fetcher urlseen \ + deduper processor typedetect -include $(TOPDIR)/makefiles/gmake/sub.mk diff --git a/src/modules/fetcher/GNUmakefile b/src/modules/fetcher/GNUmakefile index 526e9e5..89dfe93 100644 --- a/src/modules/fetcher/GNUmakefile +++ b/src/modules/fetcher/GNUmakefile @@ -1,6 +1,6 @@ TOPDIR = ../../.. -SUBDIRS = libfetch +SUBDIRS = libfetch file -include $(TOPDIR)/makefiles/gmake/sub.mk diff --git a/src/modules/fetcher/file/FileFetcher.cpp b/src/modules/fetcher/file/FileFetcher.cpp new file mode 100644 index 0000000..8d66e14 --- /dev/null +++ b/src/modules/fetcher/file/FileFetcher.cpp @@ -0,0 +1,12 @@ +#include "FileFetcher.hpp" +#include "FileRewindInputStream.hpp" + +using namespace std; + +RewindInputStream *FileFetcher::fetch( const URL url ) +{ + FileRewindInputStream *s = new FileRewindInputStream( url ); + return s; +} + +REGISTER_MODULE( "file", Fetcher, FileFetcher ) diff --git a/src/modules/fetcher/file/FileFetcher.hpp b/src/modules/fetcher/file/FileFetcher.hpp new file mode 100644 index 0000000..747c9b0 --- /dev/null +++ b/src/modules/fetcher/file/FileFetcher.hpp @@ -0,0 +1,19 @@ +#ifndef __FILE_FETCHER_H +#define __FILE_FETCHER_H + +#include "Fetcher.hpp" +#include "ModuleRegistry.hpp" + +class FileFetcher : public Fetcher +{ + public: + FileFetcher( ) { } + + virtual ~FileFetcher( ) { } + + virtual RewindInputStream *fetch( const URL url ); +}; + +DECLARE_MODULE( Fetcher ) + +#endif diff --git a/src/modules/fetcher/file/FileRewindInputStream.cpp b/src/modules/fetcher/file/FileRewindInputStream.cpp new file mode 100644 index 0000000..034bd8d --- /dev/null +++ b/src/modules/fetcher/file/FileRewindInputStream.cpp @@ -0,0 +1,20 @@ +#include "FileRewindInputStream.hpp" + +#include <stdexcept> + +using namespace std; + +FileRewindInputStream::FileRewindInputStream( const URL &url ) + : RewindInputStream( url ), ifstream( ) +{ + if( url.protocol( ) != "file" || url.host( ) != "localhost" ) { + throw new runtime_error( "URL doesn't denote a local file" ); + } + + open( url.path( ).c_str( ), ios::binary ); +} + +FileRewindInputStream::~FileRewindInputStream( ) +{ + close( ); +} diff --git a/src/modules/fetcher/file/FileRewindInputStream.hpp b/src/modules/fetcher/file/FileRewindInputStream.hpp new file mode 100644 index 0000000..21a194a --- /dev/null +++ b/src/modules/fetcher/file/FileRewindInputStream.hpp @@ -0,0 +1,16 @@ +#ifndef __FILE_REWIND_INPUT_STREAM_H +#define __FILE_REWIND_INPUT_STREAM_H + +#include "RewindInputStream.hpp" + +#include <fstream> + +class FileRewindInputStream : public RewindInputStream, std::ifstream +{ + public: + FileRewindInputStream( const URL &url ); + + virtual ~FileRewindInputStream( ); +}; + +#endif diff --git a/src/modules/fetcher/file/GNUmakefile b/src/modules/fetcher/file/GNUmakefile new file mode 100644 index 0000000..c143110 --- /dev/null +++ b/src/modules/fetcher/file/GNUmakefile @@ -0,0 +1,40 @@ +TOPDIR = ../../../.. + +SUBDIRS = + +-include $(TOPDIR)/makefiles/gmake/platform.mk + +INCLUDE_DIRS = \ + -I. -I$(TOPDIR)/src + +INCLUDE_CXXFLAGS = \ + +INCLUDE_LDFLAGS = \ + +INCLUDE_LIBS = \ + $(TOPDIR)/src/libcrawlingwolf.a + +DYNAMIC_MODULE = \ + mod_fetcher_file.so + +STATIC_LIB = \ + libfilefetcher.a + +CPP_OBJS = \ + FileFetcher.o \ + FileRewindInputStream.o + +-include $(TOPDIR)/makefiles/gmake/sub.mk + +local_all: + +local_clean: + +local_distclean: + +local_install: + +local_uninstall: + +local_test: + diff --git a/src/modules/typedetect/GNUmakefile b/src/modules/typedetect/GNUmakefile new file mode 100644 index 0000000..3923feb --- /dev/null +++ b/src/modules/typedetect/GNUmakefile @@ -0,0 +1,17 @@ +TOPDIR = ../../.. + +SUBDIRS = libmagic + +-include $(TOPDIR)/makefiles/gmake/sub.mk + +local_all: + +local_clean: + +local_distclean: + +local_install: + +local_uninstall: + +local_test: diff --git a/src/modules/typedetect/libmagic/GNUmakefile b/src/modules/typedetect/libmagic/GNUmakefile new file mode 100644 index 0000000..c88c4dd --- /dev/null +++ b/src/modules/typedetect/libmagic/GNUmakefile @@ -0,0 +1,40 @@ +TOPDIR = ../../../.. + +SUBDIRS = + +-include $(TOPDIR)/makefiles/gmake/platform.mk + +INCLUDE_DIRS = \ + -I. -I$(TOPDIR)/src + +INCLUDE_CXXFLAGS = \ + +INCLUDE_LDFLAGS = \ + +INCLUDE_LIBS = \ + $(TOPDIR)/src/libcrawlingwolf.a \ + -lmagic + +DYNAMIC_MODULE = \ + mod_typedetect_libmagic.so + +STATIC_LIB = \ + liblibmagictypedetect.a + +CPP_OBJS = \ + LibMagicTypeDetect.o + +-include $(TOPDIR)/makefiles/gmake/sub.mk + +local_all: + +local_clean: + +local_distclean: + +local_install: + +local_uninstall: + +local_test: + diff --git a/src/modules/typedetect/libmagic/LibMagicTypeDetect.cpp b/src/modules/typedetect/libmagic/LibMagicTypeDetect.cpp new file mode 100644 index 0000000..4c5a442 --- /dev/null +++ b/src/modules/typedetect/libmagic/LibMagicTypeDetect.cpp @@ -0,0 +1,42 @@ +#include "LibMagicTypeDetect.hpp" +#include "Logger.hpp" + +#include <stdexcept> + +LibMagicTypeDetect::LibMagicTypeDetect( ) +{ + m_magic = magic_open( MAGIC_MIME_TYPE ); + if( m_magic == NULL ) { + throw runtime_error( "Unable to open magic file" ); + } + + if( magic_load( m_magic, NULL ) != 0 ) { + throw runtime_error( "Unable to load standard magic file database" ); + } +} + +LibMagicTypeDetect::~LibMagicTypeDetect( ) +{ + magic_close( m_magic ); +} + +MIMEType LibMagicTypeDetect::detect( RewindInputStream *s ) +{ + enum { BUFSIZE = 109056 }; + char buf[BUFSIZE]; + const char *res = 0; + + while( s->good( ) && !s->eof( ) ) { + s->read( buf, BUFSIZE ); + res = magic_buffer( m_magic, buf, BUFSIZE ); + if( res == NULL ) { + return MIMEType::Null; + } + // once + break; + } + + return MIMEType( res ); +} + +REGISTER_MODULE( "libmagic", TypeDetect, LibMagicTypeDetect ) diff --git a/src/modules/typedetect/libmagic/LibMagicTypeDetect.hpp b/src/modules/typedetect/libmagic/LibMagicTypeDetect.hpp new file mode 100644 index 0000000..748dfbf --- /dev/null +++ b/src/modules/typedetect/libmagic/LibMagicTypeDetect.hpp @@ -0,0 +1,24 @@ +#ifndef __LIBMAGIC_TYPE_DETECTION_H +#define __LIBMAGIC_TYPE_DETECTION_H + +#include "TypeDetect.hpp" +#include "ModuleRegistry.hpp" + +#include <magic.h> + +class LibMagicTypeDetect : public TypeDetect +{ + public: + LibMagicTypeDetect( ); + + virtual ~LibMagicTypeDetect( ); + + virtual MIMEType detect( RewindInputStream *s ); + + private: + magic_t m_magic; +}; + +DECLARE_MODULE( TypeDetect ) + +#endif diff --git a/tests/GNUmakefile b/tests/GNUmakefile index e2b08bb..09bc024 100644 --- a/tests/GNUmakefile +++ b/tests/GNUmakefile @@ -1,6 +1,6 @@ TOPDIR = .. -SUBDIRS = utils url streamhtmlparser libfetch curl psql sqlite +SUBDIRS = utils url streamhtmlparser libfetch curl psql sqlite typedetect -include $(TOPDIR)/makefiles/gmake/sub.mk diff --git a/tests/typedetect/GNUmakefile b/tests/typedetect/GNUmakefile new file mode 100644 index 0000000..ee52471 --- /dev/null +++ b/tests/typedetect/GNUmakefile @@ -0,0 +1,39 @@ +TOPDIR = ../.. + +SUBDIRS = + +#INCLUDE_CXXFLAGS = \ +# -DUSE_MODULELOADER + +INCLUDE_DIRS = \ + -I$(TOPDIR)/src \ + -I$(TOPDIR)/src/modules/typedetect/libmagic \ + -I$(TOPDIR)/src/modules/fetcher/file + +INCLUDE_LDFLAGS = + +INCLUDE_LIBS = \ + $(TOPDIR)/src/libcrawlingwolf.a \ + $(TOPDIR)/src/modules/typedetect/libmagic/liblibmagictypedetect.a \ + -lmagic \ + $(TOPDIR)/src/modules/fetcher/file/libfilefetcher.a + +TEST_CPP_BINS = \ + test1$(EXE) + +OBJS = + +-include $(TOPDIR)/makefiles/gmake/sub.mk + +local_all: + +local_clean: + -@rm -f *.RES *.DIFF + +local_distclean: + +local_test: + @-for METHOD in libmagic; do \ + echo "Using MIME type detector '$$METHOD'.." ; \ + ./exec_test test1 test1 "delete a simple C++ file" $$METHOD `pwd`/test1.cpp ; \ + done diff --git a/tests/typedetect/exec_test b/tests/typedetect/exec_test new file mode 100755 index 0000000..92b656f --- /dev/null +++ b/tests/typedetect/exec_test @@ -0,0 +1,12 @@ +#!/bin/sh + +BINARY=$1 +shift +ID=$1 +shift +TITLE=$1 +shift + +printf "$ID: $TITLE .. " +./$BINARY $* >$ID.RES 2>&1 +diff $ID.MUST $ID.RES > $ID.DIFF && printf "OK\n" || printf "ERROR\n" diff --git a/tests/typedetect/test1.MUST b/tests/typedetect/test1.MUST new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/tests/typedetect/test1.MUST diff --git a/tests/typedetect/test1.cpp b/tests/typedetect/test1.cpp new file mode 100644 index 0000000..b84fa55 --- /dev/null +++ b/tests/typedetect/test1.cpp @@ -0,0 +1,65 @@ +#ifdef USE_MODULELOADER +#include "TypeDetect.hpp" +#include "ModuleLoader.hpp" +#else +#include "LibMagicTypeDetect.hpp" +#endif + +#include "FileFetcher.hpp" + +#include <vector> +#include <iostream> +#include <string> +#include <cstring> + +using namespace std; + +int main( int argc, char *argv[] ) +{ + if( argc < 3 ) { + cerr << "usage: test1 <method> <file>\n" << endl; + return 1; + } + + char *method = argv[1]; + char *file = argv[2]; + +#ifdef USE_MODULELOADER + vector<string> modules; + modules.push_back( "../../src/modules/typedetect/libmagic/mod_typedetect_libmagic.so" ); + ModuleLoader<TypeDetect> typeDetectors( modules ); + + TypeDetect *typeDetect = typeDetectors.create( method ); +#else + TypeDetect *typeDetect; + if( strcmp( method, "libmagic" ) == 0 ) { + typeDetect = new LibMagicTypeDetect( ); + } else { + cerr << "Unknown type detection method '" << method << "'" << endl; + return 1; + } +#endif + + URL fileUrl( "file", "localhost", 0, file, "", "" ); + FileFetcher fetcher; + RewindInputStream *s = fetcher.fetch( fileUrl ); + + MIMEType type = typeDetect->detect( s ); + + delete s; + +#ifdef USE_MODULELOADER + typeDetects.destroy( typeDetect ); +#else + delete typeDetect; +#endif + + if( type == MIMEType::Null ) { + cerr << "Unable to detect MIME type!" << endl; + return 1; + } + + cout << "MIME type: " << type << endl; + + return 0; +} |