/* * LuceneAnalyzer - Lucene Index Analyzer * * Copyright (C) 2006 Andreas Baumann * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ package org.dyndns.andreasbaumann; import java.io.File; import java.io.IOException; import java.util.Collection; import java.util.Iterator; import java.util.ArrayList; import java.util.List; import java.util.Properties; import jargs.gnu.CmdLineParser; import jargs.gnu.CmdLineParser.Option; import jargs.gnu.CmdLineParser.OptionException; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.SimpleFSDirectory; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader.FieldOption; import org.apache.lucene.index.TermEnum; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermDocs; import org.apache.lucene.index.TermPositions; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import java.util.logging.LogManager; import java.util.logging.Logger; import org.apache.solr.core.CoreContainer; import org.apache.solr.core.CoreDescriptor; import org.apache.solr.core.SolrResourceLoader; import org.apache.solr.core.SolrConfig; import org.apache.solr.core.SolrCore; import org.apache.solr.schema.IndexSchema; import org.apache.solr.search.SolrIndexSearcher; import 
javax.xml.parsers.ParserConfigurationException;
import org.xml.sax.SAXException;

/**
 * Lucene index analyzer. Works for file system indexes only (not
 * for indexes fully in RAM or in different persistence systems such
 * as a JDBCDirectory).
 *
 * Note: requires at least lucene 3.0!
 *
 * @author Andreas Baumann
 * @version $Id$
 */
public class LuceneAnalyzer {

    private static final String programName = "lucenanalyzer";
    private static final String versionString = "0.0.4";

    /** Help lines collected by {@link #addHelp} in registration order, printed by {@link #printUsage}. */
    private static final List<String> optionHelpStrings = new ArrayList<String>( );

    /**
     * Prints index-wide statistics: document count, number of distinct terms
     * ("features"), the sum of all document frequencies (reported as "tokens"),
     * index version, currency, maximal document number and deletion flag.
     *
     * @param indexReader reader on the index to inspect
     * @param printHeaders whether to print a section header first
     * @throws IOException if reading the index fails
     */
    private static void printGlobalInfo( IndexReader indexReader, boolean printHeaders ) throws IOException {
        if( printHeaders ) {
            System.out.println( "Global Information:" );
            System.out.println( "===================" );
        }

        System.out.println( "\tnumber of documents: " + indexReader.numDocs( ) );

        // we should get the number of features differently, this is inefficient, but Lucene
        // has no notion of global statistics (because the default weighting schema doesn't
        // make use of it!)
        // long counters: summing document frequencies can exceed the int range on big indexes
        long nofFeatures = 0;
        long nofTokens = 0;
        TermEnum terms = indexReader.terms( );
        try {
            while( terms.next( ) ) {
                nofFeatures++;
                nofTokens += terms.docFreq( );
            }
        } finally {
            // the enumeration holds index resources, release them even on failure
            terms.close( );
        }

        System.out.println( "\ttotal number of features: " + nofFeatures );
        System.out.println( "\ttotal number of tokens: " + nofTokens );
        System.out.println( "\tversion: " + indexReader.getVersion( ) );
        System.out.println( "\tstill current: " + indexReader.isCurrent( ) );

        //TODO: we don't get segment information!
        //System.out.println( "is optimized:" + segmentInfos.size( ) == 1 && !indexReader.hasDeletions( ) );

        System.out.println( "\tmaximal document number: " + indexReader.maxDoc( ) );
        System.out.println( "\thas deletions: " + indexReader.hasDeletions( ) );
        System.out.println( "" );
    }

    /**
     * Prints the names of all fields matching one Lucene field option,
     * one per line; null and empty names are skipped.
     *
     * @param indexReader reader on the index to inspect
     * @param fieldOption the field category to list
     */
    private static void printFieldInfoPerFieldOption( IndexReader indexReader, IndexReader.FieldOption fieldOption ) {
        System.out.println( "Fields of type '" + fieldOption + "':" );

        Collection<String> fields = indexReader.getFieldNames( fieldOption );
        for( String field : fields ) {
            if( field != null && !field.equals( "" ) ) {
                // TODO: define data type here!
                System.out.println( "\t" + field );
            }
        }
    }

    /**
     * Prints the field names of the index, grouped by Lucene field option.
     *
     * @param indexReader reader on the index to inspect
     * @param printHeaders whether to print a section header first
     * @throws IOException declared for symmetry with the other print routines
     */
    private static void printFieldInfo( IndexReader indexReader, boolean printHeaders ) throws IOException {
        if( printHeaders ) {
            System.out.println( "Field Information:" );
            System.out.println( "==================" );
        }

        // print info per Lucene field type; the order here is the output order
        IndexReader.FieldOption[] fieldOptions = {
            IndexReader.FieldOption.ALL,
            IndexReader.FieldOption.INDEXED,
            IndexReader.FieldOption.INDEXED_NO_TERMVECTOR,
            IndexReader.FieldOption.INDEXED_WITH_TERMVECTOR,
            IndexReader.FieldOption.OMIT_POSITIONS,
            IndexReader.FieldOption.OMIT_TERM_FREQ_AND_POSITIONS,
            IndexReader.FieldOption.STORES_PAYLOADS,
            IndexReader.FieldOption.TERMVECTOR,
            IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET,
            IndexReader.FieldOption.TERMVECTOR_WITH_POSITION,
            IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET,
            IndexReader.FieldOption.UNINDEXED
        };
        for( IndexReader.FieldOption fieldOption : fieldOptions ) {
            printFieldInfoPerFieldOption( indexReader, fieldOption );
        }

        System.out.println( "" );
    }

    /**
     * Prints one line per term: field, text and then either the document
     * frequency, the list of document numbers, or the document numbers
     * with the term positions in each document.
     *
     * @param indexReader reader on the index to inspect
     * @param printHeaders whether to print a section header first
     * @param printDocNumbers print the matching document numbers instead of the document frequency
     * @param printPositions print document numbers plus positions; only
     *        considered when {@code printDocNumbers} is false
     * @throws IOException if reading the index fails
     */
    private static void printTerms( IndexReader indexReader, boolean printHeaders,
            boolean printDocNumbers, boolean printPositions ) throws IOException {
        if( printHeaders ) {
            System.out.println( "Terms:" );
            System.out.println( "======" );
        }

        TermEnum terms = indexReader.terms( );
        try {
            while( terms.next( ) ) {
                Term term = terms.term( );
                System.out.print( term.field( ) + "\t" + term.text( ) );

                if( printDocNumbers ) {
                    printTermDocs( indexReader, term );
                } else if( printPositions ) {
                    printTermPositions( indexReader, term );
                } else {
                    // the df is stored in the iterator and not in the term, weird...
                    System.out.print( "\t" + terms.docFreq( ) );
                }

                System.out.println( "" );
            }
        } finally {
            terms.close( );
        }

        System.out.println( "" );
    }

    /** Prints a tab followed by the comma-separated document numbers containing {@code term}. */
    private static void printTermDocs( IndexReader indexReader, Term term ) throws IOException {
        TermDocs termDocs = indexReader.termDocs( term );
        try {
            String separator = "\t";
            while( termDocs.next( ) ) {
                System.out.print( separator + termDocs.doc( ) );
                separator = ",";
            }
        } finally {
            termDocs.close( );
        }
    }

    /**
     * Prints a tab followed by comma-separated entries of the form
     * {@code doc[pos,pos,...]} for every document containing {@code term}.
     */
    private static void printTermPositions( IndexReader indexReader, Term term ) throws IOException {
        TermPositions termPositions = indexReader.termPositions( term );
        try {
            String separator = "\t";
            while( termPositions.next( ) ) {
                System.out.print( separator + termPositions.doc( ) );
                separator = ",";

                int freq = termPositions.freq( );
                if( freq > 0 ) {
                    StringBuilder positions = new StringBuilder( "[" );
                    for( int i = 0; i < freq; i++ ) {
                        if( i > 0 ) {
                            positions.append( ',' );
                        }
                        positions.append( termPositions.nextPosition( ) );
                    }
                    positions.append( ']' );
                    System.out.print( positions );
                }
            }
        } finally {
            termPositions.close( );
        }
    }

    /**
     * Records the help text of a command line option for {@link #printUsage}.
     *
     * @param option the option as returned by the parser's add method
     * @param helpString human-readable description of the option
     * @return {@code option} itself, so the call can wrap the registration
     */
    private static Option addHelp( Option option, String helpString ) {
        if( option.shortForm( ) != null ) {
            optionHelpStrings.add( " -" + option.shortForm( ) + "/--" + option.longForm( ) + ": " + helpString );
        } else {
            optionHelpStrings.add( " --" + option.longForm( ) + ": " + helpString );
        }
        return option;
    }

    /** Prints the usage line and the help text of all registered options to stderr. */
    private static void printUsage( ) {
        // the index directory is a mandatory positional argument (see main)
        System.err.println( "Usage: " + programName + " [options] <indexDir>\n" );
        for( String helpLine : optionHelpStrings ) {
            System.err.println( helpLine );
        }
    }

    /** Prints the class name and the version string to stdout. */
    private static void printVersion( ) {
        System.out.println( "Version " + LuceneAnalyzer.class.getName( ) + " " + versionString );
    }

    /** Reads a boolean option, treating an option that was not given as {@code false}. */
    private static boolean isSet( CmdLineParser parser, Option option ) {
        return ( (Boolean)parser.getOptionValue( option, Boolean.FALSE ) ).booleanValue( );
    }

    /**
     * Command line entry point. Parses the options, opens the index given as
     * the single positional argument and prints the requested sections.
     * Exits with status 1 on usage or configuration errors, 0 otherwise.
     *
     * @param args command line arguments
     * @throws IOException if the index cannot be opened or read
     */
    public static void main( String[] args ) throws IOException {
        CmdLineParser parser = new CmdLineParser( );

        // default options, well-known, should always be around
        Option verbose = addHelp( parser.addBooleanOption( 'v', "verbose" ),
            "print extra verbosity information" );
        Option help = addHelp( parser.addBooleanOption( 'h', "help" ),
            "print this help message" );
        Option version = addHelp( parser.addBooleanOption( "version" ),
            "print version information" );

        Option globals = addHelp( parser.addBooleanOption( 'g', "globals" ),
            "print global statistics" );
        Option fields = addHelp( parser.addBooleanOption( 'f', "fields" ),
            "print field information" );
        Option terms = addHelp( parser.addBooleanOption( 't', "terms" ),
            "print statistics per term" );
        Option headers = addHelp( parser.addBooleanOption( 'H', "headers" ),
            "print headers for sections" );
        Option solr = addHelp( parser.addBooleanOption( 's', "solr" ),
            "treat index as a Solr index, indexDir is the Solr base dir" );

        // read the command line options
        try {
            parser.parse( args );
        } catch( OptionException e ) {
            System.err.println( e.getMessage( ) );
            printUsage( );
            System.exit( 1 );
        }

        if( isSet( parser, help ) ) {
            printUsage( );
            System.exit( 0 );
        }

        if( isSet( parser, version ) ) {
            printVersion( );
            System.exit( 0 );
        }

        // verbosity as a level, increased with -vvv; the loop ends on the first
        // null, i.e. it relies on the parser handing out one occurrence per call
        int verbosity = 0;
        while( parser.getOptionValue( verbose ) != null ) {
            verbosity++;
        }

        boolean printHeaders = isSet( parser, headers );
        boolean isSolr = isSet( parser, solr );

        // read command line arguments
        String[] otherArgs = parser.getRemainingArgs( );
        if( otherArgs.length != 1 ) {
            System.err.println( "Missing a lucene index directory as first argument" );
            printUsage( );
            System.exit( 1 );
        }

        String basePath = otherArgs[0];
        String indexPath = otherArgs[0];
        if( isSolr ) {
            // for Solr the argument is the base dir, the Lucene index lives below it
            indexPath += "/data/index";
        }

        File indexDir = new File( indexPath );
        if( !indexDir.exists( ) ) {
            System.err.println( indexPath + " doesn't exist" );
            System.exit( 1 );
        }
        if( !indexDir.isDirectory( ) ) {
            System.err.println( indexPath + " is not a directory" );
            System.exit( 1 );
        }

        // NOTE(review): the Solr searcher is created but not used by any of the
        // print routines yet, and it is never closed explicitly - confirm intent
        SolrIndexSearcher solrSearcher;
        Directory luceneDirectory = new SimpleFSDirectory( indexDir );
        IndexReader indexReader = IndexReader.open( luceneDirectory );
        try {
            if( isSolr ) {
                try {
                    Properties p = System.getProperties( );
                    p.setProperty( "solr.solr.home", basePath );

                    // silence java.util.logging output before the Solr objects are built
                    LogManager.getLogManager( ).reset( );
                    Logger globalLogger = Logger.getLogger( "" /* java.util.logging.Logger.GLOBAL_LOGGER_NAME */ );
                    globalLogger.setLevel( java.util.logging.Level.OFF );

                    CoreContainer cores = new CoreContainer( new SolrResourceLoader( basePath ) );
                    SolrConfig solrConfig = new SolrConfig( basePath, SolrConfig.DEFAULT_CONF_FILE, null );
                    CoreDescriptor descrCore = new CoreDescriptor( cores, "", solrConfig.getResourceLoader( ).getInstanceDir( ) );
                    IndexSchema solrSchema = new IndexSchema( solrConfig, basePath + "/conf/schema.xml", null );
                    SolrCore solrCore = new SolrCore( basePath, solrSchema );
                    solrSearcher = new SolrIndexSearcher( solrCore, solrSchema, "test", luceneDirectory, true, false );
                } catch( ParserConfigurationException e ) {
                    System.err.println( "Illegal Solr configuration: " + e );
                    System.exit( 1 );
                } catch( SAXException e ) {
                    System.err.println( "Illegal Solr configuration: " + e );
                    System.exit( 1 );
                }
            }

            if( isSet( parser, globals ) ) {
                printGlobalInfo( indexReader, printHeaders );
            }

            if( isSet( parser, fields ) ) {
                printFieldInfo( indexReader, printHeaders );
            }

            if( isSet( parser, terms ) ) {
                // -v prints document numbers, -vv (or more) prints positions as well
                printTerms( indexReader, printHeaders, verbosity == 1, verbosity >= 2 );
            }
        } finally {
            // close the reader even if one of the print routines fails
            indexReader.close( );
        }

        System.exit( 0 );
    }
}