local dbg = require( "crawl/debugger" )

-- global settings

crawler = {
	-- stop after N documents
	stop_after_N_operations = 0,

	module_path = "modules",
	modules_search_recursive = true
}

logger = {
	level = "DEBUG"
}

modules = {
	urlnormalizers = {
		"mod_urlnormalizer_simple",
		"mod_urlnormalizer_googleurl"
	},
	urlfilters = {
		"mod_urlfilter_host",
		"mod_urlfilter_protocol"
	},
	urlchainfilters = {
		"mod_urlfilter_chain"
	},
	urlfrontiers = {
		"mod_frontier_memory"
	},
	fetchers = {
		"mod_fetcher_libfetch",
		"mod_fetcher_libcurl",
		"mod_fetcher_winhttp"
	},
	urlseens = {
		"mod_urlseen_memory"
	},
	dedupers = {
		"mod_deduper_null"
	},
	processors = {
		"mod_processor_htmllinkextract",
		"mod_processor_robotstxt",
		"mod_processor_sitemap"
	},
	typedetects = {
		"mod_typedetect_libmagic"
	}
}

-- seeds: URLs which are fed to the URL frontier at the beginning
seeds = {
	"http://www.wolframe.net",
	"http://wolframe.net"
}

filters = {
	-- allowed protocols to be fetched
	protocols = {
		"http",
		"https"
	},

	-- allowed hosts to be crawled (FQDN)
	hosts = {
		"www.wolframe.net",
		"wolframe.net"
	}
}

-- called once at startup: create the URL normalizers
function init( )
	log.notice( "Init.." )
	-- normalizer = urlnormalizers.create( "google_urlnormalizer" );
	normalizer = GoogleURLNormalizer:new( )
	-- normalizer2 = urlnormalizers.create( "simple_urlnormalizer" );
	normalizer2 = SimpleURLNormalizer:new( )
	base = tolua.cast( normalizer, "URLNormalizer" )
	log.debug( "type: " .. tolua.type( base ) )
end

-- called once at shutdown: release the normalizer
function destroy( )
	log.notice( "Destroy.." )
	normalizer:delete( )
end

-- crawl step: parse a base URL and normalize a relative link against it
function crawl( )
	--dbg( )
	log.notice( "Crawling.." )
	local baseUrl = base:parseUrl( "http://www.base.com" )
	log.debug( "base URL is: " .. baseUrl:str( ) )
	local url = base:normalize( baseUrl, "/relativedir/relativefile.html" )
	log.debug( "URL is: " .. url:str( ) )
end