summary | refs | log | tree | commit | diff
path: root/src/crawl/crawl.conf
diff options
context:
space:
mode:
author: Andreas Baumann <mail@andreasbaumann.cc> 2014-10-16 13:29:01 +0200
committer: Andreas Baumann <mail@andreasbaumann.cc> 2014-10-16 13:29:01 +0200
commit: 971d5d22e7117acb95c7903dd5b911b96fc97dcf (patch)
tree: 079fb0e064e1c4a35dbd27821e993b573d268ba2 /src/crawl/crawl.conf
parent: ff403df10813717698dc47e0b22f19d62c007cff (diff)
download: crawler-971d5d22e7117acb95c7903dd5b911b96fc97dcf.tar.gz
download: crawler-971d5d22e7117acb95c7903dd5b911b96fc97dcf.tar.bz2
creating all module constructors now from Lua configuration
Diffstat (limited to 'src/crawl/crawl.conf')
-rw-r--r-- src/crawl/crawl.conf 38
1 file changed, 38 insertions, 0 deletions
diff --git a/src/crawl/crawl.conf b/src/crawl/crawl.conf
index bfcd07b..fd9776f 100644
--- a/src/crawl/crawl.conf
+++ b/src/crawl/crawl.conf
@@ -17,7 +17,45 @@ modules = {
urlnormalizers = {
"mod_urlnormalizer_simple",
"mod_urlnormalizer_googleurl"
+ },
+
+ urlfilters = {
+ "mod_urlfilter_host",
+ "mod_urlfilter_protocol"
+ },
+
+ urlchainfilters = {
+ "mod_urlfilter_chain"
+ },
+
+ urlfrontiers = {
+ "mod_frontier_memory"
+ },
+
+ fetchers = {
+ "mod_fetcher_libfetch",
+ "mod_fetcher_libcurl",
+ "mod_fetcher_winhttp"
+ },
+
+ urlseens = {
+ "mod_urlseen_memory"
+ },
+
+ dedupers = {
+ "mod_deduper_null"
+ },
+
+ processors = {
+ "mod_processor_htmllinkextract",
+ "mod_processor_robotstxt",
+ "mod_processor_sitemap"
+ },
+
+ typedetects = {
+ "mod_typedetect_libmagic"
}
+
}
-- seeds: URLS which are fed in the beginning to the URL frontier