From 271357c05b42f2a32250f20e55a6ac3d99c9529d Mon Sep 17 00:00:00 2001 From: Andreas Baumann Date: Wed, 20 Aug 2008 11:33:56 +0200 Subject: initial load --- .../org/dyndns/andreasbaumann/LuceneAnalyzer.java | 284 +++++++++++++++++++++ 1 file changed, 284 insertions(+) create mode 100644 src/main/java/org/dyndns/andreasbaumann/LuceneAnalyzer.java (limited to 'src') diff --git a/src/main/java/org/dyndns/andreasbaumann/LuceneAnalyzer.java b/src/main/java/org/dyndns/andreasbaumann/LuceneAnalyzer.java new file mode 100644 index 0000000..aab2862 --- /dev/null +++ b/src/main/java/org/dyndns/andreasbaumann/LuceneAnalyzer.java @@ -0,0 +1,284 @@ +/* + * LuceneAnalyzer - Lucene Index Analyzer + * + * Copyright (C) 2006 Andreas Baumann + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +package org.dyndns.andreasbaumann; + +import java.io.File; +import java.io.IOException; +import java.util.Collection; +import java.util.Iterator; +import java.util.ArrayList; +import java.util.List; + +import jargs.gnu.CmdLineParser; +import jargs.gnu.CmdLineParser.Option; +import jargs.gnu.CmdLineParser.OptionException; + +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexReader.FieldOption; +import org.apache.lucene.index.TermEnum; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermDocs; +import org.apache.lucene.index.TermPositions; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; + +/** + * Lucene index analyzer. Works for file system indexes only (not + * for indexes fully in RAM or in different persistence systems as + * a JDBCDirectory. + * + * Note: requires lucene 1.9.1 + * + * @author Andreas Baumann, + * @version $Id$ + */ + +public class LuceneAnalyzer +{ + private static final String versionString = "0.0.2"; + + private static void printGlobalInfo( IndexReader indexReader ) throws IOException + { + System.out.println( "Global Information:" ); + System.out.println( "===================" ); + + System.out.println( "\tnumber of documents: " + indexReader.numDocs( ) ); + + // we should get the number of features differently, this is inefficient, but Lucene + // has no notion of global statistics (because the default weighting schema doesn't + // make use of it!) + int nofFeatures = 0; + int nofTokens = 0; + TermEnum terms = indexReader.terms( ); + while( terms.next( ) ) { + Term term = terms.term( ); + int df = terms.docFreq( ); + nofFeatures++; + nofTokens += df; + } + System.out.println( "\ttotal number of features: " + nofFeatures ); + System.out.println( "\ttotal number of tokens: " + nofTokens ); + + System.out.println( "\tversion: " + indexReader.getVersion( ) ); + System.out.println( "\tstill current: " + indexReader.isCurrent( ) ); + + //TODO: we don't get segment information! + //System.out.println( "is optimized:" + segmentInfos.size( ) == 1 && !indexReader.hasDeletions( ) ); + System.out.println( "\tmaximal document number: " + indexReader.maxDoc( ) ); + System.out.println( "\thas deletions: " + indexReader.hasDeletions( ) ); + + System.out.println( "" ); + } + + private static void printFieldInfoPerFieldOption( IndexReader indexReader, IndexReader.FieldOption fieldOption ) + { + System.out.println( "Fields of type '" + fieldOption + "':" ); + Collection fields = indexReader.getFieldNames( fieldOption ); + Iterator fieldIterator = fields.iterator( ); + while( fieldIterator.hasNext( ) ) { + String field = (String)fieldIterator.next( ); + if( field != null && !field.equals( "" ) ) { + // TODO: define data type here! + System.out.println( "\t" + field.toString( ) ); + } + } + } + + private static void printFieldInfo( IndexReader indexReader ) throws IOException + { + System.out.println( "Field Information:" ); + System.out.println( "==================" ); + + // very bad design, this field types! + printFieldInfoPerFieldOption( indexReader, IndexReader.FieldOption.ALL ); + printFieldInfoPerFieldOption( indexReader, IndexReader.FieldOption.INDEXED ); + printFieldInfoPerFieldOption( indexReader, IndexReader.FieldOption.INDEXED_NO_TERMVECTOR ); + printFieldInfoPerFieldOption( indexReader, IndexReader.FieldOption.INDEXED_WITH_TERMVECTOR ); + printFieldInfoPerFieldOption( indexReader, IndexReader.FieldOption.TERMVECTOR ); + printFieldInfoPerFieldOption( indexReader, IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET ); + printFieldInfoPerFieldOption( indexReader, IndexReader.FieldOption.TERMVECTOR_WITH_POSITION ); + printFieldInfoPerFieldOption( indexReader, IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET ); + printFieldInfoPerFieldOption( indexReader, IndexReader.FieldOption.UNINDEXED ); + System.out.println( "" ); + } + + private static void printTerms( IndexReader indexReader, + boolean printDocNumbers, + boolean printPositions ) throws IOException + { + System.out.println( "Terms:" ); + System.out.println( "======" ); + TermEnum terms = indexReader.terms( ); + while( terms.next( ) ) { + Term term = terms.term( ); + // the df is stored in the iterator and not in the term, weird... + int df = terms.docFreq( ); + if( !printDocNumbers && !printPositions ) { + System.out.print( term.field( ) + "\t" + term.text( ) + "\t" + df ); + } else { + System.out.print( term.field( ) + "\t" + term.text( ) ); + } + + if( printDocNumbers ) { + TermDocs termDocs = indexReader.termDocs( term ); + boolean first = true; + while( termDocs.next( ) ) { + if( first ) { + System.out.print( "\t" + termDocs.doc( ) ); + first = false; + } else { + System.out.print( "," + termDocs.doc( ) ); + } + } + termDocs.close( ); + } else if( printPositions ) { + TermPositions termPositions = indexReader.termPositions( term ); + boolean first = true; + while( termPositions.next( ) ) { + if( first ) { + System.out.print( "\t" + termPositions.doc( ) ); + first = false; + } else { + System.out.print( "," + termPositions.doc( ) ); + } + + for( int i = 0; i < termPositions.freq( ); i++ ) { + int position = termPositions.nextPosition( ); + if( i == 0 ) { + System.out.print( "[" ); + } + System.out.print( position ); + if( i > 0 && i < termPositions.freq( ) - 1 ) { + System.out.print( "," ); + } + if( i == termPositions.freq( ) - 1 ) { + System.out.print( "]" ); + } + } + } + termPositions.close( ); + } + + System.out.println( "" ); + } + System.out.println( "" ); + } + + private static List optionHelpStrings = new ArrayList(); + + private static Option addHelp( Option option, String helpString ) + { + if( option.shortForm( ) != null ) { + optionHelpStrings.add( " -" + option.shortForm( ) + "/--" + option.longForm( ) + ": " + helpString ); + } else { + optionHelpStrings.add( " --" + option.longForm( ) + ": " + helpString ); + } + return option; + } + + private static void printUsage() + { + System.err.println( "Usage: java " + LuceneAnalyzer.class.getName( ) + " " ); + for( Iterator i = optionHelpStrings.iterator( ); i.hasNext( ); ) { + System.err.println( i.next( ) ); + } + } + + private static void printVersion() + { + System.out.println( "Version " + LuceneAnalyzer.class.getName( ) + " " + versionString ); + } + + public static void main( String[] args ) throws IOException + { + CmdLineParser parser = new CmdLineParser( ); + + // default options, well-known, should always be around + Option verbose = addHelp( parser.addBooleanOption( 'v', "verbose" ), + "print extra verbosity information" ); + Option help = addHelp( parser.addBooleanOption( 'h', "help" ), + "print this help message" ); + Option version = addHelp( parser.addBooleanOption( "version" ), + "print version information" ); + + // read the command line options + try { + parser.parse( args ); + } catch( OptionException e ) { + System.err.println( e.getMessage( ) ); + printUsage( ); + System.exit( 1 ); + } + + if( (Boolean)parser.getOptionValue( help, Boolean.FALSE ) ) { + printUsage( ); + System.exit( 0 ); + } + + if( (Boolean)parser.getOptionValue( version, Boolean.FALSE ) ) { + printVersion( ); + System.exit( 0 ); + } + + // verbosity as a level, increased with -vvv + int verbosity = 0; + while( true ) { + Boolean verboseValue = (Boolean)parser.getOptionValue( verbose ); + if( verboseValue == null ) { + break; + } else { + verbosity++; + } + } + + // read command line arguments + String[] otherArgs = parser.getRemainingArgs( ); + + if( otherArgs.length != 1 ) { + System.err.println( "Missing a lucene index directory as first argument" ); + printUsage( ); + System.exit( 1 ); + } + + File indexDir = new File( otherArgs[0] ); + if( !indexDir.exists( ) ) { + System.err.println( indexDir + " doesn't exist" ); + System.exit( 1 ); + } + if( !indexDir.isDirectory( ) ) { + System.err.println( indexDir + " is not a directory" ); + System.exit( 1 ); + } + + Directory luceneDirectory = FSDirectory.getDirectory( indexDir, false ); + IndexReader indexReader = IndexReader.open( luceneDirectory ); + + printGlobalInfo( indexReader ); + printFieldInfo( indexReader ); + printTerms( indexReader, verbosity == 1, verbosity == 2 ); + + indexReader.close( ); + + System.exit( 0 ); + } +} -- cgit v1.2.3-54-g00ecf