summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAndreas Baumann <abaumann@yahoo.com>2008-08-20 11:33:56 +0200
committerAndreas Baumann <abaumann@yahoo.com>2008-08-20 11:33:56 +0200
commit271357c05b42f2a32250f20e55a6ac3d99c9529d (patch)
tree3a8d20df61dda0c85bf66519aad9873bcc79c0cd /src
downloadLuceneAnalyzer-271357c05b42f2a32250f20e55a6ac3d99c9529d.tar.gz
LuceneAnalyzer-271357c05b42f2a32250f20e55a6ac3d99c9529d.tar.bz2
initial load
Diffstat (limited to 'src')
-rw-r--r--src/main/java/org/dyndns/andreasbaumann/LuceneAnalyzer.java284
1 files changed, 284 insertions, 0 deletions
diff --git a/src/main/java/org/dyndns/andreasbaumann/LuceneAnalyzer.java b/src/main/java/org/dyndns/andreasbaumann/LuceneAnalyzer.java
new file mode 100644
index 0000000..aab2862
--- /dev/null
+++ b/src/main/java/org/dyndns/andreasbaumann/LuceneAnalyzer.java
@@ -0,0 +1,284 @@
+/*
+ * LuceneAnalyzer - Lucene Index Analyzer
+ *
+ * Copyright (C) 2006 Andreas Baumann
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+package org.dyndns.andreasbaumann;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.ArrayList;
+import java.util.List;
+
+import jargs.gnu.CmdLineParser;
+import jargs.gnu.CmdLineParser.Option;
+import jargs.gnu.CmdLineParser.OptionException;
+
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexReader.FieldOption;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermDocs;
+import org.apache.lucene.index.TermPositions;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+
+/**
+ * Lucene index analyzer. Works for file system indexes only (not
+ * for indexes fully in RAM or in different persistence systems as
+ * a JDBCDirectory.
+ *
+ * Note: requires lucene 1.9.1
+ *
+ * @author Andreas Baumann, <abaumann@yahoo.com>
+ * @version $Id$
+ */
+
+public class LuceneAnalyzer
+{
+ private static final String versionString = "0.0.2";
+
+ private static void printGlobalInfo( IndexReader indexReader ) throws IOException
+ {
+ System.out.println( "Global Information:" );
+ System.out.println( "===================" );
+
+ System.out.println( "\tnumber of documents: " + indexReader.numDocs( ) );
+
+ // we should get the number of features differently, this is inefficient, but Lucene
+ // has no notion of global statistics (because the default weighting schema doesn't
+ // make use of it!)
+ int nofFeatures = 0;
+ int nofTokens = 0;
+ TermEnum terms = indexReader.terms( );
+ while( terms.next( ) ) {
+ Term term = terms.term( );
+ int df = terms.docFreq( );
+ nofFeatures++;
+ nofTokens += df;
+ }
+ System.out.println( "\ttotal number of features: " + nofFeatures );
+ System.out.println( "\ttotal number of tokens: " + nofTokens );
+
+ System.out.println( "\tversion: " + indexReader.getVersion( ) );
+ System.out.println( "\tstill current: " + indexReader.isCurrent( ) );
+
+ //TODO: we don't get segment information!
+ //System.out.println( "is optimized:" + segmentInfos.size( ) == 1 && !indexReader.hasDeletions( ) );
+ System.out.println( "\tmaximal document number: " + indexReader.maxDoc( ) );
+ System.out.println( "\thas deletions: " + indexReader.hasDeletions( ) );
+
+ System.out.println( "" );
+ }
+
+ private static void printFieldInfoPerFieldOption( IndexReader indexReader, IndexReader.FieldOption fieldOption )
+ {
+ System.out.println( "Fields of type '" + fieldOption + "':" );
+ Collection fields = indexReader.getFieldNames( fieldOption );
+ Iterator fieldIterator = fields.iterator( );
+ while( fieldIterator.hasNext( ) ) {
+ String field = (String)fieldIterator.next( );
+ if( field != null && !field.equals( "" ) ) {
+ // TODO: define data type here!
+ System.out.println( "\t" + field.toString( ) );
+ }
+ }
+ }
+
+ private static void printFieldInfo( IndexReader indexReader ) throws IOException
+ {
+ System.out.println( "Field Information:" );
+ System.out.println( "==================" );
+
+ // very bad design, this field types!
+ printFieldInfoPerFieldOption( indexReader, IndexReader.FieldOption.ALL );
+ printFieldInfoPerFieldOption( indexReader, IndexReader.FieldOption.INDEXED );
+ printFieldInfoPerFieldOption( indexReader, IndexReader.FieldOption.INDEXED_NO_TERMVECTOR );
+ printFieldInfoPerFieldOption( indexReader, IndexReader.FieldOption.INDEXED_WITH_TERMVECTOR );
+ printFieldInfoPerFieldOption( indexReader, IndexReader.FieldOption.TERMVECTOR );
+ printFieldInfoPerFieldOption( indexReader, IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET );
+ printFieldInfoPerFieldOption( indexReader, IndexReader.FieldOption.TERMVECTOR_WITH_POSITION );
+ printFieldInfoPerFieldOption( indexReader, IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET );
+ printFieldInfoPerFieldOption( indexReader, IndexReader.FieldOption.UNINDEXED );
+ System.out.println( "" );
+ }
+
+ private static void printTerms( IndexReader indexReader,
+ boolean printDocNumbers,
+ boolean printPositions ) throws IOException
+ {
+ System.out.println( "Terms:" );
+ System.out.println( "======" );
+ TermEnum terms = indexReader.terms( );
+ while( terms.next( ) ) {
+ Term term = terms.term( );
+ // the df is stored in the iterator and not in the term, weird...
+ int df = terms.docFreq( );
+ if( !printDocNumbers && !printPositions ) {
+ System.out.print( term.field( ) + "\t" + term.text( ) + "\t" + df );
+ } else {
+ System.out.print( term.field( ) + "\t" + term.text( ) );
+ }
+
+ if( printDocNumbers ) {
+ TermDocs termDocs = indexReader.termDocs( term );
+ boolean first = true;
+ while( termDocs.next( ) ) {
+ if( first ) {
+ System.out.print( "\t" + termDocs.doc( ) );
+ first = false;
+ } else {
+ System.out.print( "," + termDocs.doc( ) );
+ }
+ }
+ termDocs.close( );
+ } else if( printPositions ) {
+ TermPositions termPositions = indexReader.termPositions( term );
+ boolean first = true;
+ while( termPositions.next( ) ) {
+ if( first ) {
+ System.out.print( "\t" + termPositions.doc( ) );
+ first = false;
+ } else {
+ System.out.print( "," + termPositions.doc( ) );
+ }
+
+ for( int i = 0; i < termPositions.freq( ); i++ ) {
+ int position = termPositions.nextPosition( );
+ if( i == 0 ) {
+ System.out.print( "[" );
+ }
+ System.out.print( position );
+ if( i > 0 && i < termPositions.freq( ) - 1 ) {
+ System.out.print( "," );
+ }
+ if( i == termPositions.freq( ) - 1 ) {
+ System.out.print( "]" );
+ }
+ }
+ }
+ termPositions.close( );
+ }
+
+ System.out.println( "" );
+ }
+ System.out.println( "" );
+ }
+
+ private static List optionHelpStrings = new ArrayList();
+
+ private static Option addHelp( Option option, String helpString )
+ {
+ if( option.shortForm( ) != null ) {
+ optionHelpStrings.add( " -" + option.shortForm( ) + "/--" + option.longForm( ) + ": " + helpString );
+ } else {
+ optionHelpStrings.add( " --" + option.longForm( ) + ": " + helpString );
+ }
+ return option;
+ }
+
+ private static void printUsage()
+ {
+ System.err.println( "Usage: java " + LuceneAnalyzer.class.getName( ) + " <lucene index dir>" );
+ for( Iterator i = optionHelpStrings.iterator( ); i.hasNext( ); ) {
+ System.err.println( i.next( ) );
+ }
+ }
+
+ private static void printVersion()
+ {
+ System.out.println( "Version " + LuceneAnalyzer.class.getName( ) + " " + versionString );
+ }
+
+ public static void main( String[] args ) throws IOException
+ {
+ CmdLineParser parser = new CmdLineParser( );
+
+ // default options, well-known, should always be around
+ Option verbose = addHelp( parser.addBooleanOption( 'v', "verbose" ),
+ "print extra verbosity information" );
+ Option help = addHelp( parser.addBooleanOption( 'h', "help" ),
+ "print this help message" );
+ Option version = addHelp( parser.addBooleanOption( "version" ),
+ "print version information" );
+
+ // read the command line options
+ try {
+ parser.parse( args );
+ } catch( OptionException e ) {
+ System.err.println( e.getMessage( ) );
+ printUsage( );
+ System.exit( 1 );
+ }
+
+ if( (Boolean)parser.getOptionValue( help, Boolean.FALSE ) ) {
+ printUsage( );
+ System.exit( 0 );
+ }
+
+ if( (Boolean)parser.getOptionValue( version, Boolean.FALSE ) ) {
+ printVersion( );
+ System.exit( 0 );
+ }
+
+ // verbosity as a level, increased with -vvv
+ int verbosity = 0;
+ while( true ) {
+ Boolean verboseValue = (Boolean)parser.getOptionValue( verbose );
+ if( verboseValue == null ) {
+ break;
+ } else {
+ verbosity++;
+ }
+ }
+
+ // read command line arguments
+ String[] otherArgs = parser.getRemainingArgs( );
+
+ if( otherArgs.length != 1 ) {
+ System.err.println( "Missing a lucene index directory as first argument" );
+ printUsage( );
+ System.exit( 1 );
+ }
+
+ File indexDir = new File( otherArgs[0] );
+ if( !indexDir.exists( ) ) {
+ System.err.println( indexDir + " doesn't exist" );
+ System.exit( 1 );
+ }
+ if( !indexDir.isDirectory( ) ) {
+ System.err.println( indexDir + " is not a directory" );
+ System.exit( 1 );
+ }
+
+ Directory luceneDirectory = FSDirectory.getDirectory( indexDir, false );
+ IndexReader indexReader = IndexReader.open( luceneDirectory );
+
+ printGlobalInfo( indexReader );
+ printFieldInfo( indexReader );
+ printTerms( indexReader, verbosity == 1, verbosity == 2 );
+
+ indexReader.close( );
+
+ System.exit( 0 );
+ }
+}