diff options
Diffstat (limited to 'src/crawl/crawl.conf')
-rw-r--r-- | src/crawl/crawl.conf | 38 |
1 files changed, 38 insertions, 0 deletions
diff --git a/src/crawl/crawl.conf b/src/crawl/crawl.conf index bfcd07b..fd9776f 100644 --- a/src/crawl/crawl.conf +++ b/src/crawl/crawl.conf @@ -17,7 +17,45 @@ modules = { urlnormalizers = { "mod_urlnormalizer_simple", "mod_urlnormalizer_googleurl" + }, + + urlfilters = { + "mod_urlfilter_host", + "mod_urlfilter_protocol" + }, + + urlchainfilters = { + "mod_urlfilter_chain" + }, + + urlfrontiers = { + "mod_frontier_memory" + }, + + fetchers = { + "mod_fetcher_libfetch", + "mod_fetcher_libcurl", + "mod_fetcher_winhttp" + }, + + urlseens = { + "mod_urlseen_memory" + }, + + dedupers = { + "mod_deduper_null" + }, + + processors = { + "mod_processor_htmllinkextract", + "mod_processor_robotstxt", + "mod_processor_sitemap" + }, + + typedetects = { + "mod_typedetect_libmagic" } + } -- seeds: URLS which are fed in the beginning to the URL frontier |