/// <summary>
/// Runs the clustering pipeline over <c>documents</c> for the given language and
/// stores the outcome in <c>clusters</c>.
/// </summary>
/// <remarks>
/// <c>clusters</c> is always replaced with a fresh list. When preprocessing reports
/// that labels are available, the term-document and term-phrase matrices are built,
/// reduced, and turned into labelled clusters which are then sorted with
/// <c>Cluster.byReversedWeightedScoreAndSizeComparator( scoreWeight )</c>.
/// <c>Cluster.appendOtherTopics</c> is invoked in every case, including when no
/// labels were found.
/// </remarks>
/// <param name="language">Language handed to the preprocessing pipeline.</param>
private void cluster( LanguageCode language )
 {
     PreprocessingContext context = preprocessingPipeline.preprocess( documents, query, language );
     clusters = new List<Cluster>();
     if ( context.hasLabels() )
     {
         // The three contexts wrap one another: preprocessing -> VSM -> reduced VSM -> Lingo.
         VectorSpaceModelContext vsmContext = new VectorSpaceModelContext( context );
         ReducedVectorSpaceModelContext reducedVsmContext = new ReducedVectorSpaceModelContext( vsmContext );
         LingoProcessingContext lingoContext = new LingoProcessingContext( reducedVsmContext );

         // Build both matrices on the VSM context before reducing.
         matrixBuilder.buildTermDocumentMatrix( vsmContext );
         matrixBuilder.buildTermPhraseMatrix( vsmContext );
         matrixReducer.reduce( reducedVsmContext, computeClusterCount( desiredClusterCountBase, documents.Count ) );

         // Label discovery, document assignment and merging all write into lingoContext.
         clusterBuilder.buildLabels( lingoContext, matrixBuilder.termWeighting );
         clusterBuilder.assignDocuments( lingoContext );
         clusterBuilder.merge( lingoContext );

         // Parallel arrays indexed by cluster number.
         int[] clusterLabelIndex = lingoContext.clusterLabelFeatureIndex;
         BitSet[] clusterDocuments = lingoContext.clusterDocuments;
         double[] clusterLabelScore = lingoContext.clusterLabelScore;

         // C# arrays expose Length (capital L); the Java-style 'length' does not compile.
         for ( int i = 0; i < clusterLabelIndex.Length; i++ )
         {
             int labelFeature = clusterLabelIndex[i];
             if ( labelFeature < 0 )
             {
                 // Cluster removed during merging; skip it before allocating a Cluster.
                 continue;
             }

             Cluster cluster = new Cluster();
             cluster.addPhrases( labelFormatter.format( context, labelFeature ) );
             cluster.setAttribute( Cluster.SCORE, clusterLabelScore[i] );

             // Each set bit is the position of a member document within 'documents'.
             BitSet bs = clusterDocuments[i];
             for ( int bit = bs.nextSetBit( 0 ); bit >= 0; bit = bs.nextSetBit( bit + 1 ) )
             {
                 cluster.addDocuments( documents.ElementAt( bit ) );
             }
             clusters.Add( cluster );
         }
         Collections.Sort( clusters, Cluster.byReversedWeightedScoreAndSizeComparator( scoreWeight ) );
     }

     // Runs even when no labels were found, in which case 'clusters' is still empty here.
     Cluster.appendOtherTopics( documents, clusters );
 }
コード例 #2
0
 /// <summary>
 /// Creates a Lingo processing context on top of a reduced vector space model context,
 /// also keeping references to the VSM context it exposes and to that VSM context's
 /// preprocessing context.
 /// </summary>
 /// <param name="reducedVsmContext">Reduced VSM context to wrap; it is dereferenced here.</param>
 public LingoProcessingContext( ReducedVectorSpaceModelContext reducedVsmContext )
 {
     this.reducedVsmContext = reducedVsmContext;

     // Read the parent context once and reuse it for both remaining assignments.
     var parentVsmContext = reducedVsmContext.vsmContext;
     this.vsmContext = parentVsmContext;
     this.preprocessingContext = parentVsmContext.preprocessingContext;
 }