diff options
author | Andreas Baumann <abaumann@yahoo.com> | 2014-04-30 16:46:00 +0200 |
---|---|---|
committer | Andreas Baumann <abaumann@yahoo.com> | 2014-04-30 16:46:00 +0200 |
commit | 12c50867c04b2c2a11f5026466bbea02d5406b70 (patch) | |
tree | 4008a8d5e3660d823197f97b3c0b244fa37d3ea1 /src/modules | |
parent | eb3771cafb98451116a4f0ec0e7a371800770de1 (diff) | |
download | crawler-12c50867c04b2c2a11f5026466bbea02d5406b70.tar.gz crawler-12c50867c04b2c2a11f5026466bbea02d5406b70.tar.bz2 |
started a robots.txt parser
Diffstat (limited to 'src/modules')
-rw-r--r-- | src/modules/processor/GNUmakefile | 2 | ||||
-rwxr-xr-x | src/modules/processor/Makefile.W32 | 2 | ||||
-rw-r--r-- | src/modules/processor/robotstxt/GNUmakefile | 47 | ||||
-rwxr-xr-x | src/modules/processor/robotstxt/Makefile.W32 | 54 | ||||
-rw-r--r-- | src/modules/processor/robotstxt/RobotsTxtProcessor.cpp | 30 | ||||
-rw-r--r-- | src/modules/processor/robotstxt/RobotsTxtProcessor.hpp | 16 |
6 files changed, 149 insertions, 2 deletions
diff --git a/src/modules/processor/GNUmakefile b/src/modules/processor/GNUmakefile
index 8bfd814..8b91967 100644
--- a/src/modules/processor/GNUmakefile
+++ b/src/modules/processor/GNUmakefile
@@ -1,6 +1,6 @@
 TOPDIR = ../../..
 
-SUBDIRS = htmllinkextract
+SUBDIRS = htmllinkextract robotstxt
 
 -include $(TOPDIR)/makefiles/gmake/sub.mk
 
diff --git a/src/modules/processor/Makefile.W32 b/src/modules/processor/Makefile.W32
index f98b918..530fd98 100755
--- a/src/modules/processor/Makefile.W32
+++ b/src/modules/processor/Makefile.W32
@@ -1,6 +1,6 @@
 TOPDIR = ..\..\..
 
-SUBDIRS = htmllinkextract
+SUBDIRS = htmllinkextract robotstxt
 
 !INCLUDE $(TOPDIR)\makefiles\nmake\sub.mk
 
diff --git a/src/modules/processor/robotstxt/GNUmakefile b/src/modules/processor/robotstxt/GNUmakefile
new file mode 100644
index 0000000..d52c92e
--- /dev/null
+++ b/src/modules/processor/robotstxt/GNUmakefile
@@ -0,0 +1,47 @@
+TOPDIR = ../../../..
+
+SUBDIRS =
+
+-include $(TOPDIR)/makefiles/gmake/platform.mk
+
+INCLUDE_DIRS = \
+	-I. -I$(TOPDIR)/src \
+	-I$(TOPDIR)/include/logger \
+	-I$(TOPDIR)/include/util \
+	-I$(TOPDIR)/include/module \
+	-I$(TOPDIR)/include/crawler \
+	-I$(TOPDIR)/streamhtmlparser
+
+INCLUDE_CXXFLAGS = \
+
+INCLUDE_LDFLAGS = \
+	-L$(TOPDIR)/src/libcrawler \
+	-L$(TOPDIR)/streamhtmlparser
+
+INCLUDE_LIBS = \
+	-lcrawler \
+	-lstreamhtmlparser
+
+DYNAMIC_MODULE = \
+	mod_processor_robotstxt.so
+
+STATIC_LIB = \
+	librobotstxtprocessor.a
+
+CPP_OBJS = \
+	RobotsTxtProcessor.o
+
+-include $(TOPDIR)/makefiles/gmake/sub.mk
+
+local_all:
+
+local_clean:
+
+local_distclean:
+
+local_install:
+
+local_uninstall:
+
+local_test:
+
diff --git a/src/modules/processor/robotstxt/Makefile.W32 b/src/modules/processor/robotstxt/Makefile.W32
new file mode 100755
index 0000000..b67513a
--- /dev/null
+++ b/src/modules/processor/robotstxt/Makefile.W32
@@ -0,0 +1,54 @@
+TOPDIR = ..\..\..\..
+
+SUBDIRS =
+
+!INCLUDE $(TOPDIR)\makefiles\nmake\platform.mk
+
+INCLUDE_CXXFLAGS = \
+	/D_WIN32_WINNT=0x504
+
+INCLUDE_DIRS = \
+	/I. \
+	/I$(TOPDIR)\src \
+	/I$(TOPDIR)\include\module \
+	/I$(TOPDIR)\include\util \
+	/I$(TOPDIR)\include\logger \
+	/I$(TOPDIR)\include\crawler \
+	/I$(TOPDIR)\streamhtmlparser
+
+INCLUDE_LDFLAGS = \
+
+INCLUDE_LIBS = \
+	$(TOPDIR)\streamhtmlparser\streamhtmlparser.lib \
+	$(TOPDIR)\src\libcrawler\crawler.lib \
+	$(TOPDIR)\src\liblogger\logger.lib
+
+DYNAMIC_MODULE = \
+	mod_processor_robotstxt.dll
+
+STATIC_LIB = \
+	robotstxtprocessor.lib
+
+CPP_OBJS = \
+	RobotsTxtProcessor.obj
+
+SHARED_CPP_OBJS = \
+	RobotsTxtProcessor.dllobj
+
+!INCLUDE $(TOPDIR)\makefiles\nmake\sub.mk
+
+$(STATIC_LIB): $(CPP_OBJS)
+	$(LINK) /lib /nologo /out:$@ $(STATIC_LDFLAGS) $?
+
+$(DYNAMIC_MODULE): $(SHARED_CPP_OBJS)
+	$(LINK) /dll /nologo /out:$@ $(LDFLAGS) $(LIBS) $?
+
+local_all: $(STATIC_LIB) $(DYNAMIC_MODULE)
+
+local_clean:
+	@-erase $(LOCAL_STATIC_LIB) 2>NUL
+	@-erase $(CPP_OBJS) 2>NUL
+
+local_distclean:
+
+local_test:
diff --git a/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp b/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp
new file mode 100644
index 0000000..1b7dbc8
--- /dev/null
+++ b/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp
@@ -0,0 +1,30 @@
+#include "RobotsTxtProcessor.hpp"
+#include "Logger.hpp"
+
+#include <string>
+#include <cstring>
+
+using namespace std;
+
+RobotsTxtProcessor::RobotsTxtProcessor( )
+{
+}
+
+RobotsTxtProcessor::~RobotsTxtProcessor( )
+{
+}
+
+void RobotsTxtProcessor::process( RewindInputStream *s )
+{
+	char buf[2] = {0, 0};
+	URL url = s->getBaseUrl( );
+
+	while( s->good( ) && !s->eof( ) ) {
+		buf[0] = s->get( );
+		if( buf[0] ) {
+			cout << buf;
+		}
+	}
+}
+
+REGISTER_MODULE( "robotstxt_processor", Processor, RobotsTxtProcessor )
diff --git a/src/modules/processor/robotstxt/RobotsTxtProcessor.hpp b/src/modules/processor/robotstxt/RobotsTxtProcessor.hpp
new file mode 100644
index 0000000..a274f2b
--- /dev/null
+++ b/src/modules/processor/robotstxt/RobotsTxtProcessor.hpp
@@ -0,0 +1,16 @@
+#ifndef __ROBOTS_TXT_PROCESSOR_H
+#define __ROBOTS_TXT_PROCESSOR_H
+
+#include "Processor.hpp"
+#include "ModuleRegistry.hpp"
+
+class RobotsTxtProcessor : public Processor {
+	public:
+		RobotsTxtProcessor( );
+		virtual ~RobotsTxtProcessor( );
+		virtual void process( RewindInputStream *s );
+};
+
+DECLARE_MODULE( Processor )
+
+#endif