diff options
Diffstat (limited to 'src/crawl/crawl.conf')
-rw-r--r-- | src/crawl/crawl.conf | 33 |
1 files changed, 24 insertions, 9 deletions
diff --git a/src/crawl/crawl.conf b/src/crawl/crawl.conf index 7c3fb80..816f23f 100644 --- a/src/crawl/crawl.conf +++ b/src/crawl/crawl.conf @@ -1,17 +1,12 @@ -local normalizer = GoogleURLNormalizer:new( ) -local baseUrl = normalizer:parseUrl( "http://www.base.com" ) -io.write( "base URL is: " .. baseUrl:str( ) .. "\n" ) -local url = normalizer:normalize( baseUrl, "/relativedir/relativefile.html" ) -io.write( "URL is: " .. url:str( ) .. "\n" ) -normalizer:delete( ) - -- global setting crawler = { -- stop after N documents - stop_after_N_operations = 10 + stop_after_N_operations = 10, + module_path = modules, + modules_search_recursive = true } logger = { @@ -25,6 +20,11 @@ seeds = { "http://wolframe.net" } +urlnormalizers = { + "mod_normalizer_simple", + "mod_normalizer_google" +} + filters = { -- allowed protocols to be fetched protocols = { @@ -39,6 +39,21 @@ filters = { } } + +function init( ) + io.write( "Init..\n" ) + normalizer = GoogleURLNormalizer:new( ) +end + +function destroy( ) + io.write( "Destroy..\n" ) + normalizer:delete( ) +end + function crawl( ) - io.write( "Crawling." ) + io.write( "Crawling..\n" ) + local baseUrl = normalizer:parseUrl( "http://www.base.com" ) + io.write( "base URL is: " .. baseUrl:str( ) .. "\n" ) + local url = normalizer:normalize( baseUrl, "/relativedir/relativefile.html" ) + io.write( "URL is: " .. url:str( ) .. "\n" ) end |