diff options
Diffstat (limited to 'src/crawl/crawl.conf')
-rw-r--r-- | src/crawl/crawl.conf | 11 |
1 files changed, 8 insertions, 3 deletions
diff --git a/src/crawl/crawl.conf b/src/crawl/crawl.conf index fd9776f..154d90a 100644 --- a/src/crawl/crawl.conf +++ b/src/crawl/crawl.conf @@ -2,7 +2,7 @@ crawler = { -- stop after N documents - stop_after_N_operations = 10, + stop_after_N_operations = 0, module_path = "modules", @@ -82,7 +82,12 @@ filters = { function init( ) io.write( "Init..\n" ) + -- normalizer = urlnormalizers.create( "google_urlnormalizer" ); normalizer = GoogleURLNormalizer:new( ) + -- normalizer2 = urlnormalizers.create( "simple_urlnormalizer" ); + normalizer2 = SimpleURLNormalizer:new( ) + base = tolua.cast( normalizer, "URLNormalizer" ) + io.write( "type: " .. tolua.type( base ) .. "\n" ) end function destroy( ) @@ -92,8 +97,8 @@ end function crawl( ) io.write( "Crawling..\n" ) - local baseUrl = normalizer:parseUrl( "http://www.base.com" ) + local baseUrl = base:parseUrl( "http://www.base.com" ) io.write( "base URL is: " .. baseUrl:str( ) .. "\n" ) - local url = normalizer:normalize( baseUrl, "/relativedir/relativefile.html" ) + local url = base:normalize( baseUrl, "/relativedir/relativefile.html" ) io.write( "URL is: " .. url:str( ) .. "\n" ) end |