summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Andreas Baumann <abaumann@yahoo.com> 2012-08-08 21:40:10 +0200
committer Andreas Baumann <abaumann@yahoo.com> 2012-08-08 21:40:10 +0200
commit 920ba33a9d12a3d2117112ce5676f606d708964c (patch)
tree 059dfb115273f4b9617bd5016151d69212a4913f
parent 3fb1aad6a170b546c193ecc12c8957a628840d52 (diff)
download crawler-920ba33a9d12a3d2117112ce5676f606d708964c.tar.gz
crawler-920ba33a9d12a3d2117112ce5676f606d708964c.tar.bz2
added a file rewind input stream
started to add MIME type detection and a module based on libmagic (not finished yet)
-rw-r--r-- src/GNUmakefile 3
-rw-r--r-- src/MIMEType.cpp 5
-rw-r--r-- src/MIMEType.hpp 93
-rw-r--r-- src/Processor.hpp 3
-rw-r--r-- src/TypeDetect.hpp 15
-rw-r--r-- src/crawlingwolf.cpp 1
-rw-r--r-- src/modules/GNUmakefile 4
-rw-r--r-- src/modules/fetcher/GNUmakefile 2
-rw-r--r-- src/modules/fetcher/file/FileFetcher.cpp 12
-rw-r--r-- src/modules/fetcher/file/FileFetcher.hpp 19
-rw-r--r-- src/modules/fetcher/file/FileRewindInputStream.cpp 20
-rw-r--r-- src/modules/fetcher/file/FileRewindInputStream.hpp 16
-rw-r--r-- src/modules/fetcher/file/GNUmakefile 40
-rw-r--r-- src/modules/typedetect/GNUmakefile 17
-rw-r--r-- src/modules/typedetect/libmagic/GNUmakefile 40
-rw-r--r-- src/modules/typedetect/libmagic/LibMagicTypeDetect.cpp 42
-rw-r--r-- src/modules/typedetect/libmagic/LibMagicTypeDetect.hpp 24
-rw-r--r-- tests/GNUmakefile 2
-rw-r--r-- tests/typedetect/GNUmakefile 39
-rwxr-xr-x tests/typedetect/exec_test 12
-rw-r--r-- tests/typedetect/test1.MUST 0
-rw-r--r-- tests/typedetect/test1.cpp 65
22 files changed, 469 insertions, 5 deletions
diff --git a/src/GNUmakefile b/src/GNUmakefile
index 8f1657e..11bc63f 100644
--- a/src/GNUmakefile
+++ b/src/GNUmakefile
@@ -23,7 +23,8 @@ INCLUDE_LIBS += \
endif
LOCAL_STATIC_LIB_OBJS = \
- URL.o
+ URL.o \
+ MIMEType.o
CPP_OBJS = \
$(LOCAL_STATIC_LIB_OBJS)
diff --git a/src/MIMEType.cpp b/src/MIMEType.cpp
new file mode 100644
index 0000000..25dc20c
--- /dev/null
+++ b/src/MIMEType.cpp
@@ -0,0 +1,5 @@
+#include "MIMEType.hpp"
+
+// Out-of-class definition of the static member MIMEType::Null. It is
+// default-constructed, so both its type and its subtype are empty strings.
+MIMEType MIMEType::Null;
+
+
diff --git a/src/MIMEType.hpp b/src/MIMEType.hpp
new file mode 100644
index 0000000..02fc503
--- /dev/null
+++ b/src/MIMEType.hpp
@@ -0,0 +1,93 @@
+#ifndef __MIMETYPE_H
+#define __MIMETYPE_H
+
+#include <string>
+#include <iostream>
+#include <sstream>
+
+/**
+ * Value type describing a MIME type as a (type, subtype) pair, e.g.
+ * ("text", "plain") for "text/plain".
+ *
+ * A MIMEType whose type part is empty renders as the empty string; the
+ * static member MIMEType::Null is such an object. All comparison operators
+ * compare the rendered string form returned by str( ).
+ *
+ * Copy construction and copy assignment are the compiler-generated
+ * member-wise ones, which is all a pair of std::string members needs.
+ */
+class MIMEType {
+	protected:
+		std::string m_type;
+		std::string m_subtype;
+
+	public:
+		/** Creates an empty MIME type (equal to MIMEType::Null). */
+		MIMEType( )
+			: m_type( ), m_subtype( )
+		{
+		}
+
+		/** Creates a MIME type from separately given type and subtype. */
+		MIMEType( const std::string &_type, const std::string &_subtype )
+			: m_type( _type ), m_subtype( _subtype )
+		{
+		}
+
+		/**
+		 * Creates a MIME type from a C string of the form "type/subtype".
+		 *
+		 * The string is split at the first '/'. A string without '/' is
+		 * taken as the type with an empty subtype. A null pointer yields an
+		 * empty MIME type instead of constructing a std::string from a null
+		 * pointer (which is undefined behaviour).
+		 */
+		MIMEType( const char *s )
+			: m_type( ), m_subtype( )
+		{
+			if( s == 0 ) {
+				return;
+			}
+
+			const std::string full( s );
+			const std::string::size_type slash = full.find( '/' );
+			if( slash == std::string::npos ) {
+				m_type = full;
+			} else {
+				m_type = full.substr( 0, slash );
+				m_subtype = full.substr( slash + 1 );
+			}
+		}
+
+		/** Returns a copy of the type part (the text before the '/'). */
+		const std::string type( ) const
+		{
+			return m_type;
+		}
+
+		/** Returns a copy of the subtype part (the text after the '/'). */
+		const std::string subtype( ) const
+		{
+			return m_subtype;
+		}
+
+		/**
+		 * Returns "type/subtype", or the empty string when the type part is
+		 * empty. This is the same text operator<< writes to a char stream.
+		 */
+		std::string str( ) const
+		{
+			if( m_type.empty( ) ) {
+				return std::string( );
+			}
+			return m_type + "/" + m_subtype;
+		}
+
+		/** The empty MIME type; defined in MIMEType.cpp. */
+		static MIMEType Null;
+
+		bool operator==( const MIMEType &other ) const
+		{
+			return( str( ) == other.str( ) );
+		}
+
+		bool operator!=( const MIMEType &other ) const
+		{
+			return( !( *this == other ) );
+		}
+
+		/** Lexicographic order of the string forms, usable as a map key. */
+		bool operator<( const MIMEType &other ) const
+		{
+			return( str( ) < other.str( ) );
+		}
+
+		template< typename CharT, typename TraitsT > friend
+		std::basic_ostream< CharT, TraitsT >& operator<<( std::basic_ostream<CharT, TraitsT>&s, const MIMEType& m );
+};
+
+/**
+ * Writes "type/subtype" to the stream; writes nothing at all when the type
+ * part is empty. Returns the stream to allow chaining.
+ */
+template< typename CharT, typename TraitsT >
+inline std::basic_ostream< CharT, TraitsT >& operator<<( std::basic_ostream< CharT, TraitsT > &s, const MIMEType &m )
+{
+	if( !m.type( ).empty( ) ) {
+		s << m.type( ) << "/" << m.subtype( );
+	}
+
+	return s;
+}
+
+#endif
diff --git a/src/Processor.hpp b/src/Processor.hpp
index b796e65..bc17ec0 100644
--- a/src/Processor.hpp
+++ b/src/Processor.hpp
@@ -5,7 +5,8 @@
+// Abstract interface: process( ) is pure virtual, so every implementation
+// has to provide its own handling of the RewindInputStream it is given.
class Processor {
public:
- virtual ~Processor( ) {};
+ virtual ~Processor( ) { }
+
virtual void process( RewindInputStream *s ) = 0;
};
diff --git a/src/TypeDetect.hpp b/src/TypeDetect.hpp
new file mode 100644
index 0000000..7db714b
--- /dev/null
+++ b/src/TypeDetect.hpp
@@ -0,0 +1,15 @@
+#ifndef __TYPE_DETECTION_H
+#define __TYPE_DETECTION_H
+
+#include "RewindInputStream.hpp"
+#include "MIMEType.hpp"
+
+/**
+ * Abstract interface for MIME type detection.
+ *
+ * Implementations inspect the data readable from a RewindInputStream and
+ * report the result as a MIMEType.
+ */
+class TypeDetect
+{
+	public:
+		/** Virtual, so implementations can be deleted through a TypeDetect pointer. */
+		virtual ~TypeDetect( ) { }
+
+		/** Detects and returns the MIME type of the content of stream s. */
+		virtual MIMEType detect( RewindInputStream *s ) = 0;
+};
+
+#endif
diff --git a/src/crawlingwolf.cpp b/src/crawlingwolf.cpp
index c34b345..a1a5151 100644
--- a/src/crawlingwolf.cpp
+++ b/src/crawlingwolf.cpp
@@ -5,6 +5,7 @@
#include "URLSeen.hpp"
#include "URLNormalizer.hpp"
#include "URLFilter.hpp"
+#include "TypeDetect.hpp"
#include "ModuleLoader.hpp"
diff --git a/src/modules/GNUmakefile b/src/modules/GNUmakefile
index 31dc26c..4d032d5 100644
--- a/src/modules/GNUmakefile
+++ b/src/modules/GNUmakefile
@@ -1,6 +1,8 @@
TOPDIR = ../..
-SUBDIRS = urlnormalizer urlfilter frontier fetcher urlseen deduper processor
+SUBDIRS = \
+ urlnormalizer urlfilter frontier fetcher urlseen \
+ deduper processor typedetect
-include $(TOPDIR)/makefiles/gmake/sub.mk
diff --git a/src/modules/fetcher/GNUmakefile b/src/modules/fetcher/GNUmakefile
index 526e9e5..89dfe93 100644
--- a/src/modules/fetcher/GNUmakefile
+++ b/src/modules/fetcher/GNUmakefile
@@ -1,6 +1,6 @@
TOPDIR = ../../..
-SUBDIRS = libfetch
+SUBDIRS = libfetch file
-include $(TOPDIR)/makefiles/gmake/sub.mk
diff --git a/src/modules/fetcher/file/FileFetcher.cpp b/src/modules/fetcher/file/FileFetcher.cpp
new file mode 100644
index 0000000..8d66e14
--- /dev/null
+++ b/src/modules/fetcher/file/FileFetcher.cpp
@@ -0,0 +1,12 @@
+#include "FileFetcher.hpp"
+#include "FileRewindInputStream.hpp"
+
+using namespace std;
+
+// Creates a FileRewindInputStream for the given URL with `new` and returns
+// it. Nothing in this function frees the stream again, so the caller has to
+// delete it.
+RewindInputStream *FileFetcher::fetch( const URL url )
+{
+	return new FileRewindInputStream( url );
+}
+
+// NOTE(review): REGISTER_MODULE is not defined in this file (presumably it
+// comes from ModuleRegistry.hpp). It looks like it registers FileFetcher as
+// the Fetcher implementation named "file" - confirm against the macro.
+REGISTER_MODULE( "file", Fetcher, FileFetcher )
diff --git a/src/modules/fetcher/file/FileFetcher.hpp b/src/modules/fetcher/file/FileFetcher.hpp
new file mode 100644
index 0000000..747c9b0
--- /dev/null
+++ b/src/modules/fetcher/file/FileFetcher.hpp
@@ -0,0 +1,19 @@
+#ifndef __FILE_FETCHER_H
+#define __FILE_FETCHER_H
+
+#include "Fetcher.hpp"
+#include "ModuleRegistry.hpp"
+
+// Fetcher implementation whose fetch( ) returns a stream on a local file
+// (see FileFetcher.cpp, which creates a FileRewindInputStream).
+class FileFetcher : public Fetcher
+{
+ public:
+ FileFetcher( ) { }
+
+ virtual ~FileFetcher( ) { }
+
+ // Returns a newly allocated stream for `url`; the URL is taken by value.
+ virtual RewindInputStream *fetch( const URL url );
+};
+
+// NOTE(review): DECLARE_MODULE is defined outside this file (presumably in
+// ModuleRegistry.hpp); its exact expansion should be checked there.
+DECLARE_MODULE( Fetcher )
+
+#endif
diff --git a/src/modules/fetcher/file/FileRewindInputStream.cpp b/src/modules/fetcher/file/FileRewindInputStream.cpp
new file mode 100644
index 0000000..034bd8d
--- /dev/null
+++ b/src/modules/fetcher/file/FileRewindInputStream.cpp
@@ -0,0 +1,20 @@
+#include "FileRewindInputStream.hpp"
+
+#include <stdexcept>
+
+using namespace std;
+
+// Opens the file named by url.path( ) in binary mode.
+//
+// Only URLs with protocol "file" and host "localhost" are accepted; for any
+// other URL a std::runtime_error is thrown. The exception is thrown by value
+// (not as a pointer created with `new`), so it is caught by the usual
+// `catch( const std::exception & )` handlers and is not leaked.
+//
+// A failing open( ) is not reported here; it only leaves the stream in a
+// failed state which callers can test with good( ).
+FileRewindInputStream::FileRewindInputStream( const URL &url )
+	: RewindInputStream( url ), ifstream( )
+{
+	if( url.protocol( ) != "file" || url.host( ) != "localhost" ) {
+		throw runtime_error( "URL doesn't denote a local file" );
+	}
+
+	open( url.path( ).c_str( ), ios::binary );
+}
+
+// Closes the underlying file explicitly when the stream is destroyed.
+FileRewindInputStream::~FileRewindInputStream( )
+{
+	this->close( );
+}
diff --git a/src/modules/fetcher/file/FileRewindInputStream.hpp b/src/modules/fetcher/file/FileRewindInputStream.hpp
new file mode 100644
index 0000000..21a194a
--- /dev/null
+++ b/src/modules/fetcher/file/FileRewindInputStream.hpp
@@ -0,0 +1,16 @@
+#ifndef __FILE_REWIND_INPUT_STREAM_H
+#define __FILE_REWIND_INPUT_STREAM_H
+
+#include "RewindInputStream.hpp"
+
+#include <fstream>
+
+// RewindInputStream implementation backed by a file stream.
+//
+// NOTE(review): std::ifstream is listed without an access specifier, so it
+// is a *private* base of this class; only RewindInputStream is a public
+// base. Confirm that this is intended.
+class FileRewindInputStream : public RewindInputStream, std::ifstream
+{
+ public:
+ // Opens the local file denoted by `url` (see FileRewindInputStream.cpp).
+ FileRewindInputStream( const URL &url );
+
+ virtual ~FileRewindInputStream( );
+};
+
+#endif
diff --git a/src/modules/fetcher/file/GNUmakefile b/src/modules/fetcher/file/GNUmakefile
new file mode 100644
index 0000000..c143110
--- /dev/null
+++ b/src/modules/fetcher/file/GNUmakefile
@@ -0,0 +1,40 @@
+# Build description for the "file" fetcher module.
+#
+# TOPDIR is the relative path used below to reach makefiles/gmake/*.mk and
+# the src directory.
+TOPDIR = ../../../..
+
+SUBDIRS =
+
+# The leading '-' makes GNU make continue silently if the file is missing.
+-include $(TOPDIR)/makefiles/gmake/platform.mk
+
+INCLUDE_DIRS = \
+ -I. -I$(TOPDIR)/src
+
+INCLUDE_CXXFLAGS = \
+
+INCLUDE_LDFLAGS = \
+
+INCLUDE_LIBS = \
+ $(TOPDIR)/src/libcrawlingwolf.a
+
+# Names of the two artifacts of this directory: a loadable module and a
+# static library (the latter is linked by tests/typedetect).
+DYNAMIC_MODULE = \
+ mod_fetcher_file.so
+
+STATIC_LIB = \
+ libfilefetcher.a
+
+CPP_OBJS = \
+ FileFetcher.o \
+ FileRewindInputStream.o
+
+-include $(TOPDIR)/makefiles/gmake/sub.mk
+
+# The local_* targets have no recipes in this directory.
+# NOTE(review): presumably they are hooks called from sub.mk - confirm.
+local_all:
+
+local_clean:
+
+local_distclean:
+
+local_install:
+
+local_uninstall:
+
+local_test:
+
diff --git a/src/modules/typedetect/GNUmakefile b/src/modules/typedetect/GNUmakefile
new file mode 100644
index 0000000..3923feb
--- /dev/null
+++ b/src/modules/typedetect/GNUmakefile
@@ -0,0 +1,17 @@
+# Directory makefile for the type detection modules; it only lists the
+# subdirectories to descend into.
+TOPDIR = ../../..
+
+SUBDIRS = libmagic
+
+# The leading '-' makes GNU make continue silently if the file is missing.
+-include $(TOPDIR)/makefiles/gmake/sub.mk
+
+# The local_* targets have no recipes in this directory.
+# NOTE(review): presumably they are hooks called from sub.mk - confirm.
+local_all:
+
+local_clean:
+
+local_distclean:
+
+local_install:
+
+local_uninstall:
+
+local_test:
diff --git a/src/modules/typedetect/libmagic/GNUmakefile b/src/modules/typedetect/libmagic/GNUmakefile
new file mode 100644
index 0000000..c88c4dd
--- /dev/null
+++ b/src/modules/typedetect/libmagic/GNUmakefile
@@ -0,0 +1,40 @@
+# Build description for the libmagic based type detection module.
+#
+# TOPDIR is the relative path used below to reach makefiles/gmake/*.mk and
+# the src directory.
+TOPDIR = ../../../..
+
+SUBDIRS =
+
+# The leading '-' makes GNU make continue silently if the file is missing.
+-include $(TOPDIR)/makefiles/gmake/platform.mk
+
+INCLUDE_DIRS = \
+ -I. -I$(TOPDIR)/src
+
+INCLUDE_CXXFLAGS = \
+
+INCLUDE_LDFLAGS = \
+
+# -lmagic links against libmagic, whose magic_* functions are called in
+# LibMagicTypeDetect.cpp.
+INCLUDE_LIBS = \
+ $(TOPDIR)/src/libcrawlingwolf.a \
+ -lmagic
+
+# Names of the two artifacts of this directory: a loadable module and a
+# static library (the latter is linked by tests/typedetect).
+DYNAMIC_MODULE = \
+ mod_typedetect_libmagic.so
+
+STATIC_LIB = \
+ liblibmagictypedetect.a
+
+CPP_OBJS = \
+ LibMagicTypeDetect.o
+
+-include $(TOPDIR)/makefiles/gmake/sub.mk
+
+# The local_* targets have no recipes in this directory.
+# NOTE(review): presumably they are hooks called from sub.mk - confirm.
+local_all:
+
+local_clean:
+
+local_distclean:
+
+local_install:
+
+local_uninstall:
+
+local_test:
+
diff --git a/src/modules/typedetect/libmagic/LibMagicTypeDetect.cpp b/src/modules/typedetect/libmagic/LibMagicTypeDetect.cpp
new file mode 100644
index 0000000..4c5a442
--- /dev/null
+++ b/src/modules/typedetect/libmagic/LibMagicTypeDetect.cpp
@@ -0,0 +1,42 @@
+#include "LibMagicTypeDetect.hpp"
+#include "Logger.hpp"
+
+#include <stdexcept>
+
+// Creates a libmagic handle configured to report MIME types
+// (MAGIC_MIME_TYPE) and loads the default magic database into it.
+//
+// Throws std::runtime_error if the handle cannot be created or the database
+// cannot be loaded. In the second case the handle is closed before throwing:
+// the destructor does not run for an object whose constructor threw, so the
+// handle would leak otherwise.
+LibMagicTypeDetect::LibMagicTypeDetect( )
+	: m_magic( NULL )
+{
+	m_magic = magic_open( MAGIC_MIME_TYPE );
+	if( m_magic == NULL ) {
+		throw std::runtime_error( "Unable to open magic file" );
+	}
+
+	// a NULL file name selects the default database
+	if( magic_load( m_magic, NULL ) != 0 ) {
+		magic_close( m_magic );
+		m_magic = NULL;
+		throw std::runtime_error( "Unable to load standard magic file database" );
+	}
+}
+
+// Releases the libmagic handle obtained in the constructor.
+LibMagicTypeDetect::~LibMagicTypeDetect( )
+{
+ magic_close( m_magic );
+}
+
+// Detects the MIME type of the data at the current position of stream s.
+//
+// Reads at most BUFSIZE bytes from the stream once and passes exactly the
+// bytes that were read to magic_buffer( ), so libmagic never looks at
+// uninitialised parts of the buffer when the input is shorter than BUFSIZE.
+//
+// Returns MIMEType::Null if s is NULL, if the stream is not readable, or if
+// libmagic reports an error. Otherwise the result "type/subtype" is split at
+// the first '/' into the two parts of the returned MIMEType.
+//
+// NOTE(review): the stream is left positioned after the bytes read here;
+// rewinding it for later consumers is up to the caller.
+MIMEType LibMagicTypeDetect::detect( RewindInputStream *s )
+{
+	enum { BUFSIZE = 109056 };
+	char buf[BUFSIZE];
+
+	if( s == NULL || !s->good( ) ) {
+		return MIMEType::Null;
+	}
+
+	// a short read sets eofbit/failbit, gcount( ) still tells how many
+	// bytes actually arrived in buf
+	s->read( buf, BUFSIZE );
+	const size_t nofBytes = static_cast<size_t>( s->gcount( ) );
+
+	const char *res = magic_buffer( m_magic, buf, nofBytes );
+	if( res == NULL ) {
+		return MIMEType::Null;
+	}
+
+	const std::string mime( res );
+	const std::string::size_type slash = mime.find( '/' );
+	if( slash == std::string::npos ) {
+		return MIMEType( mime, "" );
+	}
+
+	return MIMEType( mime.substr( 0, slash ), mime.substr( slash + 1 ) );
+}
+
+// NOTE(review): REGISTER_MODULE is not defined in this file (presumably it
+// comes from ModuleRegistry.hpp). It looks like it registers
+// LibMagicTypeDetect as the TypeDetect implementation named "libmagic" -
+// confirm against the macro.
+REGISTER_MODULE( "libmagic", TypeDetect, LibMagicTypeDetect )
diff --git a/src/modules/typedetect/libmagic/LibMagicTypeDetect.hpp b/src/modules/typedetect/libmagic/LibMagicTypeDetect.hpp
new file mode 100644
index 0000000..748dfbf
--- /dev/null
+++ b/src/modules/typedetect/libmagic/LibMagicTypeDetect.hpp
@@ -0,0 +1,24 @@
+#ifndef __LIBMAGIC_TYPE_DETECTION_H
+#define __LIBMAGIC_TYPE_DETECTION_H
+
+#include "TypeDetect.hpp"
+#include "ModuleRegistry.hpp"
+
+#include <magic.h>
+
+/**
+ * TypeDetect implementation based on libmagic.
+ *
+ * Each object owns one libmagic handle (magic_t) which is created in the
+ * constructor and closed in the destructor. Because of that ownership the
+ * class is not copyable: a copy would share the handle and close it twice.
+ */
+class LibMagicTypeDetect : public TypeDetect
+{
+	public:
+		/** Opens the libmagic handle; throws std::runtime_error on failure. */
+		LibMagicTypeDetect( );
+
+		/** Closes the libmagic handle. */
+		virtual ~LibMagicTypeDetect( );
+
+		/** Returns the MIME type libmagic detects for the data read from s. */
+		virtual MIMEType detect( RewindInputStream *s );
+
+	private:
+		// declared but intentionally not defined: copying is forbidden
+		LibMagicTypeDetect( const LibMagicTypeDetect & );
+		LibMagicTypeDetect &operator=( const LibMagicTypeDetect & );
+
+		magic_t m_magic;
+};
+
+// NOTE(review): DECLARE_MODULE is defined outside this file (presumably in
+// ModuleRegistry.hpp); its exact expansion should be checked there.
+DECLARE_MODULE( TypeDetect )
+
+#endif
diff --git a/tests/GNUmakefile b/tests/GNUmakefile
index e2b08bb..09bc024 100644
--- a/tests/GNUmakefile
+++ b/tests/GNUmakefile
@@ -1,6 +1,6 @@
TOPDIR = ..
-SUBDIRS = utils url streamhtmlparser libfetch curl psql sqlite
+SUBDIRS = utils url streamhtmlparser libfetch curl psql sqlite typedetect
-include $(TOPDIR)/makefiles/gmake/sub.mk
diff --git a/tests/typedetect/GNUmakefile b/tests/typedetect/GNUmakefile
new file mode 100644
index 0000000..ee52471
--- /dev/null
+++ b/tests/typedetect/GNUmakefile
@@ -0,0 +1,39 @@
+# Build description for the type detection test (test1).
+#
+# TOPDIR is the relative path used below to reach makefiles/gmake/sub.mk and
+# the src directory.
+TOPDIR = ../..
+
+SUBDIRS =
+
+# Uncommenting the next two lines defines USE_MODULELOADER, which test1.cpp
+# checks with #ifdef to choose between ModuleLoader and a directly
+# constructed LibMagicTypeDetect.
+#INCLUDE_CXXFLAGS = \
+# -DUSE_MODULELOADER
+
+INCLUDE_DIRS = \
+ -I$(TOPDIR)/src \
+ -I$(TOPDIR)/src/modules/typedetect/libmagic \
+ -I$(TOPDIR)/src/modules/fetcher/file
+
+INCLUDE_LDFLAGS =
+
+# The test links the static libraries of the two modules it uses.
+INCLUDE_LIBS = \
+ $(TOPDIR)/src/libcrawlingwolf.a \
+ $(TOPDIR)/src/modules/typedetect/libmagic/liblibmagictypedetect.a \
+ -lmagic \
+ $(TOPDIR)/src/modules/fetcher/file/libfilefetcher.a
+
+TEST_CPP_BINS = \
+ test1$(EXE)
+
+OBJS =
+
+# The leading '-' makes GNU make continue silently if the file is missing.
+-include $(TOPDIR)/makefiles/gmake/sub.mk
+
+local_all:
+
+# Removes the result and diff files written by exec_test.
+local_clean:
+ -@rm -f *.RES *.DIFF
+
+local_distclean:
+
+# Runs test1 through exec_test once for every detection method named in the
+# loop. The arguments of exec_test are: binary, test id, title, followed by
+# the arguments passed on to the binary (method and file to inspect).
+# '@' keeps make from echoing the loop, '-' makes it ignore a failing status.
+local_test:
+	@-for METHOD in libmagic; do \
+		echo "Using MIME type detector '$$METHOD'.." ; \
+		./exec_test test1 test1 "detect a simple C++ file" $$METHOD "`pwd`/test1.cpp" ; \
+	done
diff --git a/tests/typedetect/exec_test b/tests/typedetect/exec_test
new file mode 100755
index 0000000..92b656f
--- /dev/null
+++ b/tests/typedetect/exec_test
@@ -0,0 +1,12 @@
+#!/bin/sh
+#
+# Usage: exec_test <binary> <id> <title> [arguments for the binary...]
+#
+# Runs ./<binary> with the remaining arguments, stores its stdout and stderr
+# in <id>.RES and compares that file with the expected output in <id>.MUST.
+# The differences go to <id>.DIFF; "OK" or "ERROR" is printed after the
+# title depending on the result of the comparison.
+
+BINARY=$1
+shift
+ID=$1
+shift
+TITLE=$1
+shift
+
+# id and title are passed as printf arguments, not as the format string, so
+# a '%' or '\' in a title is printed literally
+printf '%s: %s .. ' "$ID" "$TITLE"
+
+# "$@" hands every remaining argument on unchanged, also when it contains
+# blanks (e.g. a path below a directory with a space in its name)
+"./$BINARY" "$@" >"$ID.RES" 2>&1
+diff "$ID.MUST" "$ID.RES" >"$ID.DIFF" && printf "OK\n" || printf "ERROR\n"
diff --git a/tests/typedetect/test1.MUST b/tests/typedetect/test1.MUST
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/tests/typedetect/test1.MUST
diff --git a/tests/typedetect/test1.cpp b/tests/typedetect/test1.cpp
new file mode 100644
index 0000000..b84fa55
--- /dev/null
+++ b/tests/typedetect/test1.cpp
@@ -0,0 +1,65 @@
+#ifdef USE_MODULELOADER
+#include "TypeDetect.hpp"
+#include "ModuleLoader.hpp"
+#else
+#include "LibMagicTypeDetect.hpp"
+#endif
+
+#include "FileFetcher.hpp"
+
+#include <vector>
+#include <iostream>
+#include <string>
+#include <cstring>
+
+using namespace std;
+
+// Test driver: detects the MIME type of a local file and prints it.
+//
+// argv[1] is the name of the detection method, argv[2] the path of the file
+// to inspect. The type detector is created either through ModuleLoader (if
+// USE_MODULELOADER is defined) or directly; the file is opened through
+// FileFetcher using a file://localhost URL.
+//
+// Returns 0 and prints "MIME type: <type>" on success. Returns 1 on wrong
+// usage, on an unknown method (direct construction only) or if no MIME type
+// could be detected.
+int main( int argc, char *argv[] )
+{
+	if( argc < 3 ) {
+		cerr << "usage: test1 <method> <file>\n" << endl;
+		return 1;
+	}
+
+	char *method = argv[1];
+	char *file = argv[2];
+
+#ifdef USE_MODULELOADER
+	vector<string> modules;
+	modules.push_back( "../../src/modules/typedetect/libmagic/mod_typedetect_libmagic.so" );
+	ModuleLoader<TypeDetect> typeDetectors( modules );
+
+	TypeDetect *typeDetect = typeDetectors.create( method );
+#else
+	TypeDetect *typeDetect;
+	if( strcmp( method, "libmagic" ) == 0 ) {
+		typeDetect = new LibMagicTypeDetect( );
+	} else {
+		cerr << "Unknown type detection method '" << method << "'" << endl;
+		return 1;
+	}
+#endif
+
+	URL fileUrl( "file", "localhost", 0, file, "", "" );
+	FileFetcher fetcher;
+	RewindInputStream *s = fetcher.fetch( fileUrl );
+
+	MIMEType type = typeDetect->detect( s );
+
+	// the stream returned by fetch( ) is owned by the caller
+	delete s;
+
+#ifdef USE_MODULELOADER
+	// the detector has to be handed back to the loader which created it
+	typeDetectors.destroy( typeDetect );
+#else
+	delete typeDetect;
+#endif
+
+	if( type == MIMEType::Null ) {
+		cerr << "Unable to detect MIME type!" << endl;
+		return 1;
+	}
+
+	cout << "MIME type: " << type << endl;
+
+	return 0;
+}