summaryrefslogtreecommitdiff
path: root/src/crawl/crawl.conf
diff options
context:
space:
mode:
authorAndreas Baumann <mail@andreasbaumann.cc>2014-09-28 21:32:29 +0200
committerAndreas Baumann <mail@andreasbaumann.cc>2014-09-28 21:32:29 +0200
commitc8b5a508eb6c1cc2d1e5574e294c72d745a36fdc (patch)
tree018a2a1a9506fcc6921d5e9ebee86b9b89dde535 /src/crawl/crawl.conf
parentb80687f17644766eb890598297c0f37bb898d76d (diff)
downloadcrawler-c8b5a508eb6c1cc2d1e5574e294c72d745a36fdc.tar.gz
crawler-c8b5a508eb6c1cc2d1e5574e294c72d745a36fdc.tar.bz2
.
Diffstat (limited to 'src/crawl/crawl.conf')
-rw-r--r--src/crawl/crawl.conf16
1 files changed, 16 insertions, 0 deletions
diff --git a/src/crawl/crawl.conf b/src/crawl/crawl.conf
index 95d0f75..8dfc895 100644
--- a/src/crawl/crawl.conf
+++ b/src/crawl/crawl.conf
@@ -3,6 +3,8 @@
crawler = {
-- stop after N documents
stop_after_N_operations = 10
+
+
}
-- seeds: URLs which are fed to the URL frontier at the beginning
@@ -11,3 +13,17 @@ seeds = {
"http://www.wolframe.net",
"http://wolframe.net"
}
+
+filters = {
+ -- allowed protocols to be fetched
+ protocols = {
+ "http",
+ "https"
+ },
+
+ -- allowed hosts to be crawled (FQDN)
+ hosts = {
+ "www.wolframe.net",
+ "wolframe.net"
+ }
+}