initial load

author: Andreas Baumann <abaumann@yahoo.com> 2008-08-20 11:33:56 +0200
committer: Andreas Baumann <abaumann@yahoo.com> 2008-08-20 11:33:56 +0200
commit: 271357c05b42f2a32250f20e55a6ac3d99c9529d (patch)
tree: 3a8d20df61dda0c85bf66519aad9873bcc79c0cd /src
download: LuceneAnalyzer-271357c05b42f2a32250f20e55a6ac3d99c9529d.tar.gz
LuceneAnalyzer-271357c05b42f2a32250f20e55a6ac3d99c9529d.tar.bz2
1 files changed, 284 insertions, 0 deletions
diff --git a/src/main/java/org/dyndns/andreasbaumann/LuceneAnalyzer.java b/src/main/java/org/dyndns/andreasbaumann/LuceneAnalyzer.java
new file mode 100644
index 0000000..aab2862
--- /dev/null
+++ b/src/main/java/org/dyndns/andreasbaumann/LuceneAnalyzer.java
@@ -0,0 +1,284 @@
+/*
+ *   LuceneAnalyzer - Lucene Index Analyzer
+ *
+ *   Copyright (C) 2006  Andreas Baumann
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+ 
+package org.dyndns.andreasbaumann;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.ArrayList;
+import java.util.List;
+
+import jargs.gnu.CmdLineParser;
+import jargs.gnu.CmdLineParser.Option;
+import jargs.gnu.CmdLineParser.OptionException;
+
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexReader.FieldOption;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermDocs;
+import org.apache.lucene.index.TermPositions;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+
+/**
+ * Lucene index analyzer. Works for file system indexes only (not
+ * for indexes held fully in RAM or in other persistence systems
+ * such as a JDBCDirectory).
+ *
+ * Note: requires lucene 1.9.1
+ *
+ * @author Andreas Baumann, <abaumann@yahoo.com>
+ * @version $Id$
+ */
+
+public class LuceneAnalyzer
+{
+	private static final String versionString = "0.0.2"; // release number printed by printVersion( )
+	
+	/** Prints index-wide figures (document and term counts, version, deletions) to stdout. */
+	private static void printGlobalInfo( IndexReader indexReader ) throws IOException
+	{
+		System.out.println( "Global Information:" );
+		System.out.println( "===================" );
+		System.out.println( "\tnumber of documents: " + indexReader.numDocs( ) );
+
+		// Lucene offers no global statistics here, so the whole term enumeration is walked:
+		// every term counts as one feature, "tokens" is the sum of the document frequencies
+		int nofFeatures = 0;
+		int nofTokens = 0;
+		TermEnum terms = indexReader.terms( );
+		try {
+			while( terms.next( ) ) {
+				nofFeatures++;
+				nofTokens += terms.docFreq( );
+			}
+		} finally {
+			terms.close( ); // previously leaked: the enumeration was never closed
+		}
+		System.out.println( "\ttotal number of features: " + nofFeatures );
+		System.out.println( "\ttotal number of tokens: " + nofTokens );
+
+		System.out.println( "\tversion: " + indexReader.getVersion( ) );
+		System.out.println( "\tstill current: " + indexReader.isCurrent( ) );
+
+		//TODO: we don't get segment information, so "is optimized" cannot be reported yet
+		System.out.println( "\tmaximal document number: " + indexReader.maxDoc( ) );
+		System.out.println( "\thas deletions: " + indexReader.hasDeletions( ) );
+
+		System.out.println( "" );
+	}
+	
+	/** Lists on stdout the field names that indexReader.getFieldNames( ) returns for fieldOption. */
+	private static void printFieldInfoPerFieldOption( IndexReader indexReader, IndexReader.FieldOption fieldOption )
+	{
+		System.out.println( "Fields of type '" + fieldOption + "':" );
+		for( Iterator it = indexReader.getFieldNames( fieldOption ).iterator( ); it.hasNext( ); ) {
+			String name = (String)it.next( );
+			if( name == null || name.equals( "" ) ) {
+				continue; // entries without a name are not listed
+			}
+			// TODO: define data type here!
+			System.out.println( "\t" + name );
+		}
+	}
+	
+	/** Prints the "Field Information" section: one field listing per field option. */
+	private static void printFieldInfo( IndexReader indexReader ) throws IOException
+	{
+		System.out.println( "Field Information:" );
+		System.out.println( "==================" );
+		// each option gets its own listing, in this fixed order
+		IndexReader.FieldOption[] options = {
+			IndexReader.FieldOption.ALL, IndexReader.FieldOption.INDEXED,
+			IndexReader.FieldOption.INDEXED_NO_TERMVECTOR, IndexReader.FieldOption.INDEXED_WITH_TERMVECTOR,
+			IndexReader.FieldOption.TERMVECTOR, IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET,
+			IndexReader.FieldOption.TERMVECTOR_WITH_POSITION, IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET,
+			IndexReader.FieldOption.UNINDEXED };
+		for( int i = 0; i < options.length; i++ ) {
+			printFieldInfoPerFieldOption( indexReader, options[i] );
+		}
+		System.out.println( "" );
+	}
+	
+	/**
+	 * Prints every term of the index on its own tab-separated line: field, text,
+	 * then the document frequency (neither flag set), the matching document numbers
+	 * (printDocNumbers), or the document numbers each followed by the bracketed
+	 * term positions (printPositions). printDocNumbers wins if both flags are set.
+	 */
+	private static void printTerms( IndexReader indexReader,
+	                                boolean printDocNumbers,
+                                   boolean printPositions ) throws IOException
+	{
+		System.out.println( "Terms:" );
+		System.out.println( "======" );
+		TermEnum terms = indexReader.terms( );
+		try {
+			while( terms.next( ) ) {
+				Term term = terms.term( );
+				System.out.print( term.field( ) + "\t" + term.text( ) );
+				if( printDocNumbers ) {
+					// comma-separated document numbers
+					TermDocs termDocs = indexReader.termDocs( term );
+					try {
+						String separator = "\t";
+						while( termDocs.next( ) ) {
+							System.out.print( separator + termDocs.doc( ) );
+							separator = ",";
+						}
+					} finally {
+						termDocs.close( );
+					}
+				} else if( printPositions ) {
+					// comma-separated document numbers, each with its [position,...] list
+					TermPositions termPositions = indexReader.termPositions( term );
+					try {
+						String separator = "\t";
+						while( termPositions.next( ) ) {
+							System.out.print( separator + termPositions.doc( ) );
+							separator = ",";
+							int freq = termPositions.freq( );
+							for( int i = 0; i < freq; i++ ) {
+								// "[" opens the list, every later position is preceded by a comma
+								// (the old "i > 0" test dropped the comma after the first position)
+								System.out.print( ( i == 0 ? "[" : "," ) + termPositions.nextPosition( ) );
+							}
+							if( freq > 0 ) {
+								System.out.print( "]" );
+							}
+						}
+					} finally {
+						termPositions.close( );
+					}
+				} else {
+					// the df is stored in the iterator and not in the term
+					System.out.print( "\t" + terms.docFreq( ) );
+				}
+				System.out.println( "" );
+			}
+		} finally {
+			terms.close( );
+		}
+		System.out.println( "" );
+	}
+
+	/** Usage lines gathered by addHelp( ), in the order the options were registered. */
+	private static final List<String> optionHelpStrings = new ArrayList<String>();
+
+	/** Records helpString as the usage line of option and returns the option unchanged. */
+	private static Option addHelp( Option option, String helpString )
+	{
+		// options without a short form only advertise their long form
+		String shortPart = option.shortForm( ) != null ? "-" + option.shortForm( ) + "/" : "";
+		optionHelpStrings.add( " " + shortPart + "--" + option.longForm( ) + ": " + helpString );
+		return option;
+	}
+
+	private static void printUsage() // usage line, then one help line per registered option, on stderr
+	{
+		System.err.println( "Usage: java " + LuceneAnalyzer.class.getName( ) + " <lucene index dir>" );
+		for( Object helpLine : optionHelpStrings ) {
+			System.err.println( helpLine ); // already formatted by addHelp( )
+		}
+	}
+	
+	private static void printVersion() // prints "Version <class name> <versionString>" on stdout
+	{
+		System.out.println( String.format( "Version %s %s", LuceneAnalyzer.class.getName( ), versionString ) );
+	}
+                	
+	/**
+	 * Command line entry point: parses the options, validates the index directory
+	 * argument and prints global, field and term information for that index.
+	 * Exits with status 0 on success (also after --help/--version) and 1 on usage errors.
+	 * @param args options (-v/--verbose, -h/--help, --version) and exactly one index directory
+	 * @throws IOException if a Lucene call fails while opening or reading the index
+	 */
+	public static void main( String[] args ) throws IOException
+	{
+		CmdLineParser parser = new CmdLineParser( );
+		// default options, well-known, should always be around
+		Option verbose = addHelp( parser.addBooleanOption( 'v', "verbose" ),
+		                          "print extra verbosity information" );
+		Option help =    addHelp( parser.addBooleanOption( 'h', "help" ),
+		                          "print this help message" );
+		Option version = addHelp( parser.addBooleanOption( "version" ),
+		                          "print version information" );
+
+		try {
+			parser.parse( args );
+		} catch( OptionException e ) {
+			System.err.println( e.getMessage( ) );
+			printUsage( );
+			System.exit( 1 );
+		}
+		if( (Boolean)parser.getOptionValue( help, Boolean.FALSE ) ) {
+			printUsage( );
+			System.exit( 0 );
+		}
+		if( (Boolean)parser.getOptionValue( version, Boolean.FALSE ) ) {
+			printVersion( );
+			System.exit( 0 );
+		}
+
+		// verbosity as a level, increased with -vvv: count values until getOptionValue( ) yields null
+		int verbosity = 0;
+		while( parser.getOptionValue( verbose ) != null ) {
+			verbosity++;
+		}
+		String[] otherArgs = parser.getRemainingArgs( );
+		if( otherArgs.length != 1 ) {
+			System.err.println( otherArgs.length == 0
+				? "Missing a lucene index directory as first argument"
+				: "Too many arguments, expected exactly one lucene index directory" );
+			printUsage( );
+			System.exit( 1 );
+		}
+		File indexDir = new File( otherArgs[0] );
+		if( !indexDir.exists( ) ) {
+			System.err.println( indexDir + " doesn't exist" );
+			System.exit( 1 );
+		}
+		if( !indexDir.isDirectory( ) ) {
+			System.err.println( indexDir + " is not a directory" );
+			System.exit( 1 );
+		}
+		Directory luceneDirectory = FSDirectory.getDirectory( indexDir, false );
+		try {
+			IndexReader indexReader = IndexReader.open( luceneDirectory );
+			try {
+				printGlobalInfo( indexReader );
+				printFieldInfo( indexReader );
+				// level 1 adds document numbers; level 2 and above adds positions instead
+				printTerms( indexReader, verbosity == 1, verbosity >= 2 );
+			} finally {
+				indexReader.close( );
+			}
+		} finally {
+			luceneDirectory.close( );
+		}
+		System.exit( 0 );
+	}
+}
author	Andreas Baumann <abaumann@yahoo.com>	2008-08-20 11:33:56 +0200
committer	Andreas Baumann <abaumann@yahoo.com>	2008-08-20 11:33:56 +0200
commit	271357c05b42f2a32250f20e55a6ac3d99c9529d (patch)
tree	3a8d20df61dda0c85bf66519aad9873bcc79c0cd /src
download	LuceneAnalyzer-271357c05b42f2a32250f20e55a6ac3d99c9529d.tar.gz LuceneAnalyzer-271357c05b42f2a32250f20e55a6ac3d99c9529d.tar.bz2