-- global settings
crawler = {
    -- stop after N documents have been processed
    stop_after_N_operations = 10,
    module_path = "modules",
    modules_search_recursive = true
}

logger = {
    level = "DEBUG"
}

modules = {
    urlnormalizers = {
        "mod_normalizer_simple",
        "mod_normalizer_google"
    }
}

-- seeds: URLs fed to the URL frontier at the start of the crawl
seeds = {
    "http://www.wolframe.net",
    "http://wolframe.net"
}

filters = {
    -- allowed protocols to be fetched
    protocols = {
        "http",
        "https"
    },
    -- allowed hosts to be crawled (FQDN)
    hosts = {
        "www.wolframe.net",
        "wolframe.net"
    }
}

-- initialization hook: create a Google URL normalizer
function init( )
    io.write( "Init..\n" )
    normalizer = GoogleURLNormalizer:new( )
end

-- cleanup hook: release the normalizer
function destroy( )
    io.write( "Destroy..\n" )
    normalizer:delete( )
end

-- crawl step: resolve a relative link against a base URL and print the result
function crawl( )
    io.write( "Crawling..\n" )
    local baseUrl = normalizer:parseUrl( "http://www.base.com" )
    io.write( "base URL is: " .. baseUrl:str( ) .. "\n" )
    local url = normalizer:normalize( baseUrl, "/relativedir/relativefile.html" )
    io.write( "URL is: " .. url:str( ) .. "\n" )
end