private void cluster( LanguageCode language )
{
    // Preprocess documents (tokenization, stemming, label candidate extraction)
    PreprocessingContext context = preprocessingPipeline.preprocess( documents, query, language );

    clusters = new ArrayList<Cluster>();

    if ( context.hasLabels() )
    {
        // Build the term-document and term-phrase matrices, then reduce the
        // term-document matrix to the desired number of base vectors
        VectorSpaceModelContext vsmContext = new VectorSpaceModelContext( context );
        ReducedVectorSpaceModelContext reducedVsmContext = new ReducedVectorSpaceModelContext( vsmContext );
        LingoProcessingContext lingoContext = new LingoProcessingContext( reducedVsmContext );

        matrixBuilder.buildTermDocumentMatrix( vsmContext );
        matrixBuilder.buildTermPhraseMatrix( vsmContext );

        matrixReducer.reduce( reducedVsmContext,
            computeClusterCount( desiredClusterCountBase, documents.size() ) );

        // Build cluster labels, assign documents to them, merge similar clusters
        clusterBuilder.buildLabels( lingoContext, matrixBuilder.termWeighting );
        clusterBuilder.assignDocuments( lingoContext );
        clusterBuilder.merge( lingoContext );

        // Turn the raw label/document assignments into Cluster objects
        int[] clusterLabelIndex = lingoContext.clusterLabelFeatureIndex;
        BitSet[] clusterDocuments = lingoContext.clusterDocuments;
        double[] clusterLabelScore = lingoContext.clusterLabelScore;

        for ( int i = 0; i < clusterLabelIndex.length; i++ )
        {
            Cluster cluster = new Cluster();

            int labelFeature = clusterLabelIndex[i];
            if ( labelFeature < 0 )
            {
                // Cluster removed during merging
                continue;
            }

            // Attach the label phrase and the label score
            cluster.addPhrases( labelFormatter.format( context, labelFeature ) );
            cluster.setAttribute( Cluster.SCORE, clusterLabelScore[i] );

            // Add every document whose bit is set for this cluster
            BitSet bs = clusterDocuments[i];
            for ( int bit = bs.nextSetBit( 0 ); bit >= 0; bit = bs.nextSetBit( bit + 1 ) )
            {
                cluster.addDocuments( documents.get( bit ) );
            }

            clusters.add( cluster );
        }

        Collections.sort( clusters,
            Cluster.byReversedWeightedScoreAndSizeComparator( scoreWeight ) );
    }

    // Documents not assigned to any cluster end up in the synthetic "Other Topics" group
    Cluster.appendOtherTopics( documents, clusters );
}
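// Note: computeClusterCount(...) is referenced above but its body is not part of this
// snippet. The sketch below is a hypothetical illustration of its contract only
// (scale the configured base count with the collection size and cap it at the number
// of documents), not the actual implementation.
static int computeClusterCount( int desiredClusterCountBase, int documentCount )
{
    // Grow roughly with the square root of the collection size.
    int scaled = (int) Math.round( desiredClusterCountBase * Math.sqrt( documentCount / 100.0 ) );
    // Never request more clusters than there are documents, and never fewer than one.
    return Math.max( 1, Math.min( scaled, documentCount ) );
}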
public LingoProcessingContext( ReducedVectorSpaceModelContext reducedVsmContext )
{
    this.reducedVsmContext = reducedVsmContext;
    this.vsmContext = reducedVsmContext.vsmContext;
    this.preprocessingContext = vsmContext.preprocessingContext;
}
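// The constructor above, together with the cluster(...) method, implies roughly the
// following fields on LingoProcessingContext. This skeleton is an assumption based
// only on how the fields are used in this snippet; it is not the complete class.
class LingoProcessingContext
{
    // Assigned by the constructor shown above: references to each level of the
    // processing pipeline's context.
    PreprocessingContext preprocessingContext;
    VectorSpaceModelContext vsmContext;
    ReducedVectorSpaceModelContext reducedVsmContext;

    // Filled in by the cluster builder and read back in cluster(...): the feature
    // index of each cluster's label (negative if the cluster was merged away), the
    // documents assigned to each cluster, and each label's score.
    int[] clusterLabelFeatureIndex;
    BitSet[] clusterDocuments;
    double[] clusterLabelScore;
}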