diff options
author | Andreas Baumann <mail@andreasbaumann.cc> | 2014-09-28 21:32:29 +0200 |
---|---|---|
committer | Andreas Baumann <mail@andreasbaumann.cc> | 2014-09-28 21:32:29 +0200 |
commit | c8b5a508eb6c1cc2d1e5574e294c72d745a36fdc (patch) | |
tree | 018a2a1a9506fcc6921d5e9ebee86b9b89dde535 /src/crawl/crawl.conf | |
parent | b80687f17644766eb890598297c0f37bb898d76d (diff) | |
download | crawler-c8b5a508eb6c1cc2d1e5574e294c72d745a36fdc.tar.gz crawler-c8b5a508eb6c1cc2d1e5574e294c72d745a36fdc.tar.bz2 |
.
Diffstat (limited to 'src/crawl/crawl.conf')
-rw-r--r-- | src/crawl/crawl.conf | 16 |
1 file changed, 16 insertions, 0 deletions
diff --git a/src/crawl/crawl.conf b/src/crawl/crawl.conf index 95d0f75..8dfc895 100644 --- a/src/crawl/crawl.conf +++ b/src/crawl/crawl.conf @@ -3,6 +3,8 @@ crawler = { -- stop after N documents stop_after_N_operations = 10 + + } -- seeds: URLS which are fed in the beginning to the URL frontier @@ -11,3 +13,17 @@ seeds = { "http://www.wolframe.net", "http://wolframe.net" } + +filters = { + -- allowed protocols to be fetched + protocols = { + "http", + "https" + }, + + -- allowed hosts to be crawled (FQDN) + hosts = { + "www.wolframe.net", + "wolframe.net" + } +} |