summaryrefslogtreecommitdiff
path: root/src/modules
diff options
context:
space:
mode:
author Andreas Baumann <abaumann@yahoo.com> 2014-04-30 16:46:00 +0200
committer Andreas Baumann <abaumann@yahoo.com> 2014-04-30 16:46:00 +0200
commit 12c50867c04b2c2a11f5026466bbea02d5406b70 (patch)
tree 4008a8d5e3660d823197f97b3c0b244fa37d3ea1 /src/modules
parent eb3771cafb98451116a4f0ec0e7a371800770de1 (diff)
download crawler-12c50867c04b2c2a11f5026466bbea02d5406b70.tar.gz
crawler-12c50867c04b2c2a11f5026466bbea02d5406b70.tar.bz2
started a robots.txt parser
Diffstat (limited to 'src/modules')
-rw-r--r-- src/modules/processor/GNUmakefile 2
-rwxr-xr-x src/modules/processor/Makefile.W32 2
-rw-r--r-- src/modules/processor/robotstxt/GNUmakefile 47
-rwxr-xr-x src/modules/processor/robotstxt/Makefile.W32 54
-rw-r--r-- src/modules/processor/robotstxt/RobotsTxtProcessor.cpp 30
-rw-r--r-- src/modules/processor/robotstxt/RobotsTxtProcessor.hpp 16
6 files changed, 149 insertions, 2 deletions
diff --git a/src/modules/processor/GNUmakefile b/src/modules/processor/GNUmakefile
index 8bfd814..8b91967 100644
--- a/src/modules/processor/GNUmakefile
+++ b/src/modules/processor/GNUmakefile
@@ -1,6 +1,6 @@
TOPDIR = ../../..
-SUBDIRS = htmllinkextract
+SUBDIRS = htmllinkextract robotstxt
-include $(TOPDIR)/makefiles/gmake/sub.mk
diff --git a/src/modules/processor/Makefile.W32 b/src/modules/processor/Makefile.W32
index f98b918..530fd98 100755
--- a/src/modules/processor/Makefile.W32
+++ b/src/modules/processor/Makefile.W32
@@ -1,6 +1,6 @@
TOPDIR = ..\..\..
-SUBDIRS = htmllinkextract
+SUBDIRS = htmllinkextract robotstxt
!INCLUDE $(TOPDIR)\makefiles\nmake\sub.mk
diff --git a/src/modules/processor/robotstxt/GNUmakefile b/src/modules/processor/robotstxt/GNUmakefile
new file mode 100644
index 0000000..d52c92e
--- /dev/null
+++ b/src/modules/processor/robotstxt/GNUmakefile
@@ -0,0 +1,47 @@
+# GNU make build description for the robots.txt processor module.
+TOPDIR = ../../../..
+
+# No sub-directories below this one.
+SUBDIRS =
+
+-include $(TOPDIR)/makefiles/gmake/platform.mk
+
+# Header search paths.
+INCLUDE_DIRS = -I. -I$(TOPDIR)/src
+INCLUDE_DIRS += -I$(TOPDIR)/include/logger
+INCLUDE_DIRS += -I$(TOPDIR)/include/util
+INCLUDE_DIRS += -I$(TOPDIR)/include/module
+INCLUDE_DIRS += -I$(TOPDIR)/include/crawler
+INCLUDE_DIRS += -I$(TOPDIR)/streamhtmlparser
+
+# No extra compiler flags.
+INCLUDE_CXXFLAGS =
+
+# Library search paths and the libraries to link against.
+INCLUDE_LDFLAGS = -L$(TOPDIR)/src/libcrawler -L$(TOPDIR)/streamhtmlparser
+INCLUDE_LIBS = -lcrawler -lstreamhtmlparser
+
+# Build products and the object file they are made from
+# (presumably picked up by the rules in sub.mk - confirm).
+DYNAMIC_MODULE = mod_processor_robotstxt.so
+STATIC_LIB = librobotstxtprocessor.a
+CPP_OBJS = RobotsTxtProcessor.o
+
+-include $(TOPDIR)/makefiles/gmake/sub.mk
+
+# Local hooks; nothing directory-specific to do for any of them.
+local_all local_clean local_distclean local_install local_uninstall local_test:
+
diff --git a/src/modules/processor/robotstxt/Makefile.W32 b/src/modules/processor/robotstxt/Makefile.W32
new file mode 100755
index 0000000..b67513a
--- /dev/null
+++ b/src/modules/processor/robotstxt/Makefile.W32
@@ -0,0 +1,54 @@
+# NMAKE build description for the robots.txt processor module (Windows).
+
+TOPDIR = ..\..\..\..
+
+# No sub-directories below this one.
+SUBDIRS =
+
+!INCLUDE $(TOPDIR)\makefiles\nmake\platform.mk
+
+# Extra compiler flags.
+# NOTE(review): 0x504 does not match a documented _WIN32_WINNT value
+# (for example 0x0501 or 0x0502) - confirm the intended Windows target.
+INCLUDE_CXXFLAGS = \
+ /D_WIN32_WINNT=0x504
+
+# Header search paths.
+INCLUDE_DIRS = \
+ /I. \
+ /I$(TOPDIR)\src \
+ /I$(TOPDIR)\include\module \
+ /I$(TOPDIR)\include\util \
+ /I$(TOPDIR)\include\logger \
+ /I$(TOPDIR)\include\crawler \
+ /I$(TOPDIR)\streamhtmlparser
+
+# No additional linker flags.
+INCLUDE_LDFLAGS = \
+
+# Libraries to link against.
+INCLUDE_LIBS = \
+ $(TOPDIR)\streamhtmlparser\streamhtmlparser.lib \
+ $(TOPDIR)\src\libcrawler\crawler.lib \
+ $(TOPDIR)\src\liblogger\logger.lib
+
+# Name of the loadable module built by the $(DYNAMIC_MODULE) rule below.
+DYNAMIC_MODULE = \
+ mod_processor_robotstxt.dll
+
+# Name of the static library built by the $(STATIC_LIB) rule below.
+STATIC_LIB = \
+ robotstxtprocessor.lib
+
+# Object files that go into the static library.
+CPP_OBJS = \
+ RobotsTxtProcessor.obj
+
+# Object files that go into the loadable module.
+SHARED_CPP_OBJS = \
+ RobotsTxtProcessor.dllobj
+
+!INCLUDE $(TOPDIR)\makefiles\nmake\sub.mk
+
+# Archive the object files into the static library.
+# $** names every dependent; $? would name only those newer than the
+# target, so an incremental rebuild would archive an incomplete object list.
+$(STATIC_LIB): $(CPP_OBJS)
+	$(LINK) /lib /nologo /out:$@ $(STATIC_LDFLAGS) $**
+
+# Link the loadable module from all of its object files ($** for the same
+# reason as above).
+$(DYNAMIC_MODULE): $(SHARED_CPP_OBJS)
+	$(LINK) /dll /nologo /out:$@ $(LDFLAGS) $(LIBS) $**
+
+# Local build: the static library and the loadable module.
+local_all: $(STATIC_LIB) $(DYNAMIC_MODULE)
+
+# Remove what the rules in this makefile produce. Each erase ignores errors
+# and hides its output, so a missing file or an empty macro is harmless.
+# LOCAL_STATIC_LIB is not defined in this makefile; the line is kept in case
+# an included makefile sets it.
+local_clean:
+	@-erase $(LOCAL_STATIC_LIB) 2>NUL
+	@-erase $(STATIC_LIB) 2>NUL
+	@-erase $(DYNAMIC_MODULE) 2>NUL
+	@-erase $(CPP_OBJS) 2>NUL
+	@-erase $(SHARED_CPP_OBJS) 2>NUL
+
+# Nothing directory-specific to do for distclean or test.
+local_distclean:
+
+local_test:
diff --git a/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp b/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp
new file mode 100644
index 0000000..1b7dbc8
--- /dev/null
+++ b/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp
@@ -0,0 +1,30 @@
+#include "RobotsTxtProcessor.hpp"
+#include "Logger.hpp"
+
+#include <string>
+#include <cstring>
+
+using namespace std;
+
+// Construction requires no work of its own.
+RobotsTxtProcessor::RobotsTxtProcessor( ) { }
+
+// Destruction requires no work of its own either.
+RobotsTxtProcessor::~RobotsTxtProcessor( ) { }
+
+// Reads the stream to its end and echoes every non-NUL character to
+// standard output. No robots.txt rules are interpreted here and the
+// stream's base URL is not consulted.
+//
+// s: stream to read from; must not be null.
+void RobotsTxtProcessor::process( RewindInputStream *s )
+{
+	while( s->good( ) ) {
+		const char c = s->get( );
+
+		// A read that runs into the end of the stream (or fails) yields
+		// no real character; stop instead of echoing the end-of-file
+		// marker. Assumes the stream reports this through good( ) the
+		// way std::istream does - confirm against RewindInputStream.
+		if( !s->good( ) ) {
+			break;
+		}
+
+		if( c ) {
+			cout << c;
+		}
+	}
+}
+
+// Passes the module name "robotstxt_processor", the base type Processor and
+// the implementing class to REGISTER_MODULE. The macro is defined outside
+// this file (presumably in ModuleRegistry.hpp - confirm).
+REGISTER_MODULE( "robotstxt_processor", Processor, RobotsTxtProcessor )
diff --git a/src/modules/processor/robotstxt/RobotsTxtProcessor.hpp b/src/modules/processor/robotstxt/RobotsTxtProcessor.hpp
new file mode 100644
index 0000000..a274f2b
--- /dev/null
+++ b/src/modules/processor/robotstxt/RobotsTxtProcessor.hpp
@@ -0,0 +1,16 @@
+#ifndef __ROBOTS_TXT_PROCESSOR_H
+#define __ROBOTS_TXT_PROCESSOR_H
+
+#include "Processor.hpp"
+#include "ModuleRegistry.hpp"
+
+// Processor implementation for robots.txt content.
+class RobotsTxtProcessor : public Processor
+{
+public:
+	RobotsTxtProcessor( );
+	virtual ~RobotsTxtProcessor( );
+
+	// Consumes the given input stream; implemented in RobotsTxtProcessor.cpp.
+	virtual void process( RewindInputStream *s );
+};
+
+// Module declaration for the Processor base type. DECLARE_MODULE is defined
+// outside this file (presumably in ModuleRegistry.hpp - confirm).
+DECLARE_MODULE( Processor )
+
+#endif