summary | refs | log | tree | commit | diff
path: root/src/crawl/crawl.conf
diff options
context:
space:
mode:
Diffstat (limited to 'src/crawl/crawl.conf')
-rw-r--r-- src/crawl/crawl.conf 38
1 files changed, 38 insertions, 0 deletions
diff --git a/src/crawl/crawl.conf b/src/crawl/crawl.conf
index bfcd07b..fd9776f 100644
--- a/src/crawl/crawl.conf
+++ b/src/crawl/crawl.conf
@@ -17,7 +17,45 @@ modules = {
urlnormalizers = {
"mod_urlnormalizer_simple",
"mod_urlnormalizer_googleurl"
+ },
+
+ urlfilters = {
+ "mod_urlfilter_host",
+ "mod_urlfilter_protocol"
+ },
+
+ urlchainfilters = {
+ "mod_urlfilter_chain"
+ },
+
+ urlfrontiers = {
+ "mod_frontier_memory"
+ },
+
+ fetchers = {
+ "mod_fetcher_libfetch",
+ "mod_fetcher_libcurl",
+ "mod_fetcher_winhttp"
+ },
+
+ urlseens = {
+ "mod_urlseen_memory"
+ },
+
+ dedupers = {
+ "mod_deduper_null"
+ },
+
+ processors = {
+ "mod_processor_htmllinkextract",
+ "mod_processor_robotstxt",
+ "mod_processor_sitemap"
+ },
+
+ typedetects = {
+ "mod_typedetect_libmagic"
}
+
}
-- seeds: URLs which are initially fed to the URL frontier