summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author Andreas Baumann <abaumann@yahoo.com> 2012-08-08 21:40:10 +0200
committer Andreas Baumann <abaumann@yahoo.com> 2012-08-08 21:40:10 +0200
commit 920ba33a9d12a3d2117112ce5676f606d708964c (patch)
tree 059dfb115273f4b9617bd5016151d69212a4913f /src
parent 3fb1aad6a170b546c193ecc12c8957a628840d52 (diff)
download crawler-920ba33a9d12a3d2117112ce5676f606d708964c.tar.gz
crawler-920ba33a9d12a3d2117112ce5676f606d708964c.tar.bz2
added a file rewind input stream
started to add MIME type detection and a module based on libmagic (not finished yet)
Diffstat (limited to 'src')
-rw-r--r-- src/GNUmakefile 3
-rw-r--r-- src/MIMEType.cpp 5
-rw-r--r-- src/MIMEType.hpp 93
-rw-r--r-- src/Processor.hpp 3
-rw-r--r-- src/TypeDetect.hpp 15
-rw-r--r-- src/crawlingwolf.cpp 1
-rw-r--r-- src/modules/GNUmakefile 4
-rw-r--r-- src/modules/fetcher/GNUmakefile 2
-rw-r--r-- src/modules/fetcher/file/FileFetcher.cpp 12
-rw-r--r-- src/modules/fetcher/file/FileFetcher.hpp 19
-rw-r--r-- src/modules/fetcher/file/FileRewindInputStream.cpp 20
-rw-r--r-- src/modules/fetcher/file/FileRewindInputStream.hpp 16
-rw-r--r-- src/modules/fetcher/file/GNUmakefile 40
-rw-r--r-- src/modules/typedetect/GNUmakefile 17
-rw-r--r-- src/modules/typedetect/libmagic/GNUmakefile 40
-rw-r--r-- src/modules/typedetect/libmagic/LibMagicTypeDetect.cpp 42
-rw-r--r-- src/modules/typedetect/libmagic/LibMagicTypeDetect.hpp 24
17 files changed, 352 insertions, 4 deletions
diff --git a/src/GNUmakefile b/src/GNUmakefile
index 8f1657e..11bc63f 100644
--- a/src/GNUmakefile
+++ b/src/GNUmakefile
@@ -23,7 +23,8 @@ INCLUDE_LIBS += \
endif
LOCAL_STATIC_LIB_OBJS = \
- URL.o
+ URL.o \
+ MIMEType.o
CPP_OBJS = \
$(LOCAL_STATIC_LIB_OBJS)
diff --git a/src/MIMEType.cpp b/src/MIMEType.cpp
new file mode 100644
index 0000000..25dc20c
--- /dev/null
+++ b/src/MIMEType.cpp
@@ -0,0 +1,5 @@
+#include "MIMEType.hpp"
+
+MIMEType MIMEType::Null; // storage for the shared null MIME type (default-constructed: empty type and subtype)
+
+
diff --git a/src/MIMEType.hpp b/src/MIMEType.hpp
new file mode 100644
index 0000000..02fc503
--- /dev/null
+++ b/src/MIMEType.hpp
@@ -0,0 +1,93 @@
+#ifndef __MIMETYPE_H
+#define __MIMETYPE_H
+
+#include <string>
+#include <iostream>
+#include <sstream>
+
+// A MIME media type held as separate top-level type and subtype, e.g. "text" and "plain".
+// An empty type marks the null type, which formats as the empty string.
+class MIMEType {
+    protected:
+        std::string m_type;
+        std::string m_subtype;
+
+    public:
+        // Creates the null type.
+        MIMEType( ) : m_type( ), m_subtype( ) { }
+
+        // Builds a value from an already separated type and subtype.
+        MIMEType( const std::string &_type, const std::string &_subtype )
+            : m_type( _type ), m_subtype( _subtype ) { }
+
+        // Parses "type/subtype", splitting at the first '/'. Text without a '/'
+        // becomes the type with an empty subtype; a null pointer gives the null
+        // type rather than being passed on to std::string (undefined behaviour).
+        MIMEType( const char *s )
+            : m_type( ), m_subtype( )
+        {
+            if( s == 0 ) {
+                return;
+            }
+            const std::string text( s );
+            const std::string::size_type slash = text.find( '/' );
+            if( slash == std::string::npos ) {
+                m_type = text;
+            } else {
+                m_type = text.substr( 0, slash );
+                m_subtype = text.substr( slash + 1 );
+            }
+        }
+
+        // Copy construction and assignment: the compiler-generated member-wise versions are used.
+
+        // Top-level type, e.g. "text".
+        std::string type( ) const
+        {
+            return m_type;
+        }
+
+        // Subtype, e.g. "plain".
+        std::string subtype( ) const
+        {
+            return m_subtype;
+        }
+
+        // "type/subtype", or "" for the null type: the same text operator<< writes.
+        std::string str( ) const
+        {
+            return m_type.empty( ) ? std::string( ) : m_type + "/" + m_subtype;
+        }
+
+        // Shared null instance; its storage is defined in MIMEType.cpp.
+        static MIMEType Null;
+
+        // All comparisons work on the str( ) form, so any two values with an empty type are equal.
+        bool operator!=( const MIMEType &other ) const
+        {
+            return( str( ) != other.str( ) );
+        }
+
+        bool operator==( const MIMEType &other ) const
+        {
+            return( str( ) == other.str( ) );
+        }
+
+        bool operator<( const MIMEType &other ) const
+        {
+            return( str( ) < other.str( ) );
+        }
+};
+
+// Writes "type/subtype" to s; writes nothing for the null type.
+template< typename CharT, typename TraitsT >
+inline std::basic_ostream< CharT, TraitsT >& operator<<( std::basic_ostream< CharT, TraitsT > &s, const MIMEType &m )
+{
+    if( !m.type( ).empty( ) ) {
+        s << m.type( ) << "/" << m.subtype( );
+    }
+
+    return s;
+}
+
+#endif
diff --git a/src/Processor.hpp b/src/Processor.hpp
index b796e65..bc17ec0 100644
--- a/src/Processor.hpp
+++ b/src/Processor.hpp
@@ -5,7 +5,8 @@
class Processor {
public:
- virtual ~Processor( ) {};
+ virtual ~Processor( ) { } // virtual so an implementation can be destroyed through a Processor pointer
+
virtual void process( RewindInputStream *s ) = 0;
};
diff --git a/src/TypeDetect.hpp b/src/TypeDetect.hpp
new file mode 100644
index 0000000..7db714b
--- /dev/null
+++ b/src/TypeDetect.hpp
@@ -0,0 +1,15 @@
+#ifndef __TYPE_DETECTION_H
+#define __TYPE_DETECTION_H
+
+#include "RewindInputStream.hpp"
+#include "MIMEType.hpp"
+
+class TypeDetect // interface for modules that determine the MIME type of a stream
+{
+    public:
+        virtual ~TypeDetect( ) { } // virtual so a detector can be destroyed through a TypeDetect pointer
+
+        virtual MIMEType detect( RewindInputStream *s ) = 0; // supplied by each detector module
+};
+
+#endif
diff --git a/src/crawlingwolf.cpp b/src/crawlingwolf.cpp
index c34b345..a1a5151 100644
--- a/src/crawlingwolf.cpp
+++ b/src/crawlingwolf.cpp
@@ -5,6 +5,7 @@
#include "URLSeen.hpp"
#include "URLNormalizer.hpp"
#include "URLFilter.hpp"
+#include "TypeDetect.hpp"
#include "ModuleLoader.hpp"
diff --git a/src/modules/GNUmakefile b/src/modules/GNUmakefile
index 31dc26c..4d032d5 100644
--- a/src/modules/GNUmakefile
+++ b/src/modules/GNUmakefile
@@ -1,6 +1,8 @@
TOPDIR = ../..
-SUBDIRS = urlnormalizer urlfilter frontier fetcher urlseen deduper processor
+SUBDIRS = \
+ urlnormalizer urlfilter frontier fetcher urlseen \
+ deduper processor typedetect
-include $(TOPDIR)/makefiles/gmake/sub.mk
diff --git a/src/modules/fetcher/GNUmakefile b/src/modules/fetcher/GNUmakefile
index 526e9e5..89dfe93 100644
--- a/src/modules/fetcher/GNUmakefile
+++ b/src/modules/fetcher/GNUmakefile
@@ -1,6 +1,6 @@
TOPDIR = ../../..
-SUBDIRS = libfetch
+SUBDIRS = libfetch file
-include $(TOPDIR)/makefiles/gmake/sub.mk
diff --git a/src/modules/fetcher/file/FileFetcher.cpp b/src/modules/fetcher/file/FileFetcher.cpp
new file mode 100644
index 0000000..8d66e14
--- /dev/null
+++ b/src/modules/fetcher/file/FileFetcher.cpp
@@ -0,0 +1,12 @@
+#include "FileFetcher.hpp"
+#include "FileRewindInputStream.hpp"
+
+using namespace std;
+
+// Creates a FileRewindInputStream for url; the stream is heap-allocated and returned as a raw pointer.
+RewindInputStream *FileFetcher::fetch( const URL url )
+{
+    return new FileRewindInputStream( url );
+}
+
+REGISTER_MODULE( "file", Fetcher, FileFetcher )
diff --git a/src/modules/fetcher/file/FileFetcher.hpp b/src/modules/fetcher/file/FileFetcher.hpp
new file mode 100644
index 0000000..747c9b0
--- /dev/null
+++ b/src/modules/fetcher/file/FileFetcher.hpp
@@ -0,0 +1,19 @@
+#ifndef __FILE_FETCHER_H
+#define __FILE_FETCHER_H
+
+#include "Fetcher.hpp"
+#include "ModuleRegistry.hpp"
+
+class FileFetcher : public Fetcher // Fetcher implementation registered under the name "file" in FileFetcher.cpp
+{
+ public:
+ FileFetcher( ) { }
+
+ virtual ~FileFetcher( ) { }
+
+ virtual RewindInputStream *fetch( const URL url ); // returns a newly allocated FileRewindInputStream for url
+};
+
+DECLARE_MODULE( Fetcher )
+
+#endif
diff --git a/src/modules/fetcher/file/FileRewindInputStream.cpp b/src/modules/fetcher/file/FileRewindInputStream.cpp
new file mode 100644
index 0000000..034bd8d
--- /dev/null
+++ b/src/modules/fetcher/file/FileRewindInputStream.cpp
@@ -0,0 +1,20 @@
+#include "FileRewindInputStream.hpp"
+
+#include <stdexcept>
+
+using namespace std;
+
+// Opens url.path( ) in binary mode; throws std::runtime_error unless the URL has protocol "file" and host "localhost".
+FileRewindInputStream::FileRewindInputStream( const URL &url )
+    : RewindInputStream( url ), ifstream( )
+{
+    if( url.protocol( ) != "file" || url.host( ) != "localhost" ) {
+        throw runtime_error( "URL doesn't denote a local file" ); // thrown by value: catchable as std::exception&, nothing to leak
+    }
+    open( url.path( ).c_str( ), ios::binary ); // NOTE(review): a failed open is not reported here; the stream is just left in a failed state
+}
+
+FileRewindInputStream::~FileRewindInputStream( )
+{
+    ifstream::close( ); // explicit close; the std::ifstream base would also close the file on destruction
+}
diff --git a/src/modules/fetcher/file/FileRewindInputStream.hpp b/src/modules/fetcher/file/FileRewindInputStream.hpp
new file mode 100644
index 0000000..21a194a
--- /dev/null
+++ b/src/modules/fetcher/file/FileRewindInputStream.hpp
@@ -0,0 +1,16 @@
+#ifndef __FILE_REWIND_INPUT_STREAM_H
+#define __FILE_REWIND_INPUT_STREAM_H
+
+#include "RewindInputStream.hpp"
+
+#include <fstream>
+
+class FileRewindInputStream : public RewindInputStream, std::ifstream // NOTE(review): std::ifstream is inherited privately (the class default) — confirm this is intended
+{
+ public:
+ FileRewindInputStream( const URL &url ); // opens url.path( ) in binary mode; throws unless protocol is "file" and host is "localhost"
+
+ virtual ~FileRewindInputStream( ); // closes the file
+};
+
+#endif
diff --git a/src/modules/fetcher/file/GNUmakefile b/src/modules/fetcher/file/GNUmakefile
new file mode 100644
index 0000000..c143110
--- /dev/null
+++ b/src/modules/fetcher/file/GNUmakefile
@@ -0,0 +1,40 @@
+TOPDIR = ../../../..
+# No sub-directories are listed for this module.
+SUBDIRS =
+
+-include $(TOPDIR)/makefiles/gmake/platform.mk
+
+INCLUDE_DIRS = \
+ -I. -I$(TOPDIR)/src
+
+INCLUDE_CXXFLAGS = \
+
+INCLUDE_LDFLAGS = \
+
+INCLUDE_LIBS = \
+ $(TOPDIR)/src/libcrawlingwolf.a
+# Outputs named for this module; presumably both are built from CPP_OBJS by the rules in sub.mk (TODO confirm).
+DYNAMIC_MODULE = \
+ mod_fetcher_file.so
+
+STATIC_LIB = \
+ libfilefetcher.a
+
+CPP_OBJS = \
+ FileFetcher.o \
+ FileRewindInputStream.o
+
+-include $(TOPDIR)/makefiles/gmake/sub.mk
+# The local_* targets below have no prerequisites or recipes in this directory.
+local_all:
+
+local_clean:
+
+local_distclean:
+
+local_install:
+
+local_uninstall:
+
+local_test:
+
diff --git a/src/modules/typedetect/GNUmakefile b/src/modules/typedetect/GNUmakefile
new file mode 100644
index 0000000..3923feb
--- /dev/null
+++ b/src/modules/typedetect/GNUmakefile
@@ -0,0 +1,17 @@
+TOPDIR = ../../..
+# libmagic is the only type-detection module listed so far; presumably sub.mk recurses into SUBDIRS (TODO confirm).
+SUBDIRS = libmagic
+
+-include $(TOPDIR)/makefiles/gmake/sub.mk
+# The local_* targets below have no prerequisites or recipes in this directory.
+local_all:
+
+local_clean:
+
+local_distclean:
+
+local_install:
+
+local_uninstall:
+
+local_test:
diff --git a/src/modules/typedetect/libmagic/GNUmakefile b/src/modules/typedetect/libmagic/GNUmakefile
new file mode 100644
index 0000000..c88c4dd
--- /dev/null
+++ b/src/modules/typedetect/libmagic/GNUmakefile
@@ -0,0 +1,40 @@
+TOPDIR = ../../../..
+# No sub-directories are listed for this module.
+SUBDIRS =
+
+-include $(TOPDIR)/makefiles/gmake/platform.mk
+
+INCLUDE_DIRS = \
+ -I. -I$(TOPDIR)/src
+
+INCLUDE_CXXFLAGS = \
+
+INCLUDE_LDFLAGS = \
+
+INCLUDE_LIBS = \
+ $(TOPDIR)/src/libcrawlingwolf.a \
+ -lmagic
+# INCLUDE_LIBS above adds -lmagic, the libmagic library that LibMagicTypeDetect.cpp calls into.
+DYNAMIC_MODULE = \
+ mod_typedetect_libmagic.so
+
+STATIC_LIB = \
+ liblibmagictypedetect.a
+
+CPP_OBJS = \
+ LibMagicTypeDetect.o
+
+-include $(TOPDIR)/makefiles/gmake/sub.mk
+# The local_* targets below have no prerequisites or recipes in this directory.
+local_all:
+
+local_clean:
+
+local_distclean:
+
+local_install:
+
+local_uninstall:
+
+local_test:
+
diff --git a/src/modules/typedetect/libmagic/LibMagicTypeDetect.cpp b/src/modules/typedetect/libmagic/LibMagicTypeDetect.cpp
new file mode 100644
index 0000000..4c5a442
--- /dev/null
+++ b/src/modules/typedetect/libmagic/LibMagicTypeDetect.cpp
@@ -0,0 +1,42 @@
+#include "LibMagicTypeDetect.hpp"
+#include "Logger.hpp"
+
+#include <stdexcept>
+
+LibMagicTypeDetect::LibMagicTypeDetect( ) // opens a libmagic cookie in MIME-type mode and loads the default database; throws std::runtime_error on failure
+{
+    m_magic = magic_open( MAGIC_MIME_TYPE );
+    if( m_magic == NULL ) {
+        throw std::runtime_error( "Unable to open magic file" );
+    }
+    if( magic_load( m_magic, NULL ) != 0 ) {
+        magic_close( m_magic ); // the destructor does not run after a throwing constructor, so release the cookie here
+        throw std::runtime_error( "Unable to load standard magic file database" );
+    }
+}
+
+LibMagicTypeDetect::~LibMagicTypeDetect( )
+{
+    ::magic_close( m_magic ); // releases the cookie obtained from magic_open( ) in the constructor
+}
+
+// Guesses the MIME type of s from its first BUFSIZE bytes (fewer if the stream is shorter); returns MIMEType::Null when nothing can be read or libmagic fails.
+MIMEType LibMagicTypeDetect::detect( RewindInputStream *s )
+{
+    enum { BUFSIZE = 109056 };
+    char buf[BUFSIZE];
+
+    if( !s->good( ) ) {
+        return MIMEType::Null; // nothing to inspect; MIMEType( NULL ) must never be constructed
+    }
+
+    // A short read still stores gcount( ) bytes; only those are initialised and may be shown to libmagic.
+    s->read( buf, BUFSIZE );
+    const char *res = magic_buffer( m_magic, buf, static_cast<size_t>( s->gcount( ) ) );
+    if( res == NULL ) {
+        return MIMEType::Null;
+    }
+    return MIMEType( res ); // NOTE(review): the stream is left where the read ended — confirm who rewinds it
+}
+
+REGISTER_MODULE( "libmagic", TypeDetect, LibMagicTypeDetect )
diff --git a/src/modules/typedetect/libmagic/LibMagicTypeDetect.hpp b/src/modules/typedetect/libmagic/LibMagicTypeDetect.hpp
new file mode 100644
index 0000000..748dfbf
--- /dev/null
+++ b/src/modules/typedetect/libmagic/LibMagicTypeDetect.hpp
@@ -0,0 +1,24 @@
+#ifndef __LIBMAGIC_TYPE_DETECTION_H
+#define __LIBMAGIC_TYPE_DETECTION_H
+
+#include "TypeDetect.hpp"
+#include "ModuleRegistry.hpp"
+
+#include <magic.h>
+
+class LibMagicTypeDetect : public TypeDetect // MIME type detection backed by libmagic
+{
+    public:
+        LibMagicTypeDetect( ); // opens the libmagic cookie; throws std::runtime_error on failure
+        virtual ~LibMagicTypeDetect( ); // closes the cookie
+        virtual MIMEType detect( RewindInputStream *s );
+    private:
+        // Not copyable: a copy would share m_magic and magic_close( ) it twice. Declared, never defined.
+        LibMagicTypeDetect( const LibMagicTypeDetect & );
+        LibMagicTypeDetect &operator=( const LibMagicTypeDetect & );
+        magic_t m_magic; // libmagic cookie owned by this object
+};
+
+DECLARE_MODULE( TypeDetect )
+
+#endif