summary refs log tree commit diff
path: root/src/crawlingwolf.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/crawlingwolf.cpp')
-rwxr-xr-x src/crawlingwolf.cpp 28
1 files changed, 25 insertions, 3 deletions
diff --git a/src/crawlingwolf.cpp b/src/crawlingwolf.cpp
index f96ecbb..08f3eec 100755
--- a/src/crawlingwolf.cpp
+++ b/src/crawlingwolf.cpp
@@ -86,8 +86,8 @@ int main( void )
filterModules.push_back( "./modules/urlfilter/protocol/mod_urlfilter_protocol.so" );
filterModules.push_back( "./modules/urlfilter/host/mod_urlfilter_host.so" );
#else
- normalizerModules.push_back( ".\\modules\\urlfilter\\protocol\\mod_urlfilter_protocol.dll" );
- normalizerModules.push_back( ".\\modules\\urlfilter\\host\\mod_urlfilter_host.dll" );
+ filterModules.push_back( ".\\modules\\urlfilter\\protocol\\mod_urlfilter_protocol.dll" );
+ filterModules.push_back( ".\\modules\\urlfilter\\host\\mod_urlfilter_host.dll" );
#endif
ModuleLoader<URLFilter, TYPELIST_1( const set<string> ) > urlFilters( filterModules );
@@ -95,37 +95,47 @@ int main( void )
#ifndef _WIN32
filterChainModules.push_back( "./modules/urlfilter/chain/mod_urlfilter_chain.so" );
#else
- normalizerModules.push_back( ".\\modules\\urlfilter\\chain\\mod_urlfilter_chain.dll" );
+ filterChainModules.push_back( ".\\modules\\urlfilter\\chain\\mod_urlfilter_chain.dll" );
#endif
ModuleLoader<URLFilter, TYPELIST_1( const list<URLFilter *> ) > urlChainFilter( filterChainModules );
vector<string> frontierModules;
#ifndef _WIN32
frontierModules.push_back( "./modules/frontier/memory/mod_frontier_memory.so" );
+#else
+ frontierModules.push_back( ".\\modules\\frontier\\memory\\mod_frontier_memory.dll" );
#endif
ModuleLoader<Frontier> frontiers( frontierModules );
vector<string> fetcherModules;
#ifndef _WIN32
fetcherModules.push_back( "./modules/fetcher/libfetch/mod_fetcher_libfetch.so" );
+#else
+ fetcherModules.push_back( ".\\modules\\fetcher\\winhttp\\mod_fetcher_winhttp.dll" );
#endif
ModuleLoader<Fetcher> fetchers( fetcherModules );
vector<string> urlseenModules;
#ifndef _WIN32
urlseenModules.push_back( "./modules/urlseen/memory/mod_urlseen_memory.so" );
+#else
+ urlseenModules.push_back( ".\\modules\\urlseen\\memory\\mod_urlseen_memory.dll" );
#endif
ModuleLoader<URLSeen> urlSeens( urlseenModules );
vector<string> deduperModules;
#ifndef _WIN32
deduperModules.push_back( "./modules/deduper/null/mod_deduper_null.so" );
+#else
+ deduperModules.push_back( ".\\modules\\deduper\\null\\mod_deduper_null.dll" );
#endif
ModuleLoader<Deduper> dedupers( deduperModules );
vector<string> processorModules;
#ifndef _WIN32
processorModules.push_back( "./modules/processor/htmllinkextract/mod_processor_htmllinkextract.so" );
+#else
+ processorModules.push_back( ".\\modules\\processor\\htmllinkextract\\mod_processor_htmllinkextract.dll" );
#endif
ModuleLoader<Processor, TYPELIST_4( URLNormalizer *, Frontier *, URLFilter *, URLSeen * ) > processors( processorModules );
@@ -136,10 +146,16 @@ int main( void )
ModuleLoader<TypeDetect> typeDetectors( typeDetectModules );
Frontier *frontier = frontiers.create( "memory_frontier" );
+#ifndef _WIN32
Fetcher *fetcher = fetchers.create( "libfetch_fetcher" );
+#else
+ Fetcher *fetcher = fetchers.create( "winhttp_fetcher" );
+#endif
Deduper *deduper = dedupers.create( "null_deduper" );
URLSeen *urlSeen = urlSeens.create( "memory_urlseen" );
+#ifndef _WIN32
TypeDetect *typeDetect = typeDetectors.create( "libmagic_typedetect" );
+#endif
set<string> protocols;
protocols.insert( "http" );
@@ -180,6 +196,7 @@ int main( void )
continue;
}
+#ifndef _WIN32
MIMEType mimeType = typeDetect->detect( s );
if( mimeType != MIMEType::Null ) {
@@ -191,6 +208,9 @@ int main( void )
LOG( logINFO ) << "Storing archive " << url;
}
}
+#else
+ htmlParser->process( s );
+#endif
delete s;
}
@@ -200,7 +220,9 @@ int main( void )
urlChainFilter.destroy( chainFilter );
urlFilters.destroy( protocolFilter );
urlFilters.destroy( hostFilter );
+#ifndef _WIN32
typeDetectors.destroy( typeDetect );
+#endif
urlSeens.destroy( urlSeen );
dedupers.destroy( deduper );
fetchers.destroy( fetcher );