// Assigns documents to clusters by looking up, for each cluster's label
// feature, the set of documents that contain that label.
void assignDocuments( LingoProcessingContext context )
{
    int[] clusterLabelFeatureIndex = context.clusterLabelFeatureIndex;
    BitSet[] clusterDocuments = new BitSet[clusterLabelFeatureIndex.Length];

    int[] labelsFeatureIndex = context.preprocessingContext.allLabels.featureIndex;
    BitSet[] documentIndices = context.preprocessingContext.allLabels.documentIndices;

    // Invert labelsFeatureIndex: map each label's feature value back to its
    // position, so a cluster's label feature can be resolved to a document set.
    IntIntOpenHashMap featureValueToIndex = new IntIntOpenHashMap();
    for ( int i = 0; i < labelsFeatureIndex.Length; i++ )
    {
        featureValueToIndex.put( labelsFeatureIndex[i], i );
    }

    for ( int clusterIndex = 0; clusterIndex < clusterDocuments.Length; clusterIndex++ )
    {
        clusterDocuments[clusterIndex] =
            documentIndices[featureValueToIndex.get( clusterLabelFeatureIndex[clusterIndex] )];
    }

    context.clusterDocuments = clusterDocuments;
}
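// A worked example of the inversion above (values are hypothetical): if
// labelsFeatureIndex = { 7, 12, 30 }, then featureValueToIndex maps
// 7 -> 0, 12 -> 1, 30 -> 2. A cluster whose label feature is 12 is then
// assigned documentIndices[1], i.e. the documents containing label 12.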
// Performs the actual clustering for documents in the given language.
private void cluster( LanguageCode language )
{
    PreprocessingContext context = preprocessingPipeline.preprocess( documents, query, language );
    clusters = new List<Cluster>();

    if ( context.hasLabels() )
    {
        VectorSpaceModelContext vsmContext = new VectorSpaceModelContext( context );
        ReducedVectorSpaceModelContext reducedVsmContext = new ReducedVectorSpaceModelContext( vsmContext );
        LingoProcessingContext lingoContext = new LingoProcessingContext( reducedVsmContext );

        // Build the term-document and term-phrase matrices, reduce the
        // term-document matrix, then build, assign and merge cluster labels.
        matrixBuilder.buildTermDocumentMatrix( vsmContext );
        matrixBuilder.buildTermPhraseMatrix( vsmContext );
        matrixReducer.reduce( reducedVsmContext,
            computeClusterCount( desiredClusterCountBase, documents.Count ) );
        clusterBuilder.buildLabels( lingoContext, matrixBuilder.termWeighting );
        clusterBuilder.assignDocuments( lingoContext );
        clusterBuilder.merge( lingoContext );

        // Materialize Cluster objects from the processing context.
        int[] clusterLabelIndex = lingoContext.clusterLabelFeatureIndex;
        BitSet[] clusterDocuments = lingoContext.clusterDocuments;
        double[] clusterLabelScore = lingoContext.clusterLabelScore;
        for ( int i = 0; i < clusterLabelIndex.Length; i++ )
        {
            int labelFeature = clusterLabelIndex[i];
            if ( labelFeature < 0 )
            {
                // Cluster removed during merging
                continue;
            }

            Cluster cluster = new Cluster();
            cluster.addPhrases( labelFormatter.format( context, labelFeature ) );
            cluster.setAttribute( Cluster.SCORE, clusterLabelScore[i] );

            // Walk the set bits of the cluster's document BitSet.
            BitSet bs = clusterDocuments[i];
            for ( int bit = bs.nextSetBit( 0 ); bit >= 0; bit = bs.nextSetBit( bit + 1 ) )
            {
                cluster.addDocuments( documents.ElementAt( bit ) );
            }

            clusters.Add( cluster );
        }

        // Assumes the ported comparator implements IComparer<Cluster>.
        clusters.Sort( Cluster.byReversedWeightedScoreAndSizeComparator( scoreWeight ) );
    }

    Cluster.appendOtherTopics( documents, clusters );
}
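// computeClusterCount is referenced above but not shown. A minimal sketch,
// assuming the usual Lingo heuristic of scaling the cluster count with the
// square root of the document count (the exact formula is an assumption):
static int computeClusterCount( double desiredClusterCountBase, int documentCount )
{
    // Clamp to at least 2 clusters and at most one cluster per document.
    return Math.Max( Math.Min(
        (int) ( ( desiredClusterCountBase / 10.0 ) * Math.Sqrt( documentCount ) ),
        documentCount ), 2 );
}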
// Builds cluster labels: scores one-word (stem) and phrase label candidates
// against the reduced term-document matrix, then delegates the final
// assignment to labelAssigner.
void buildLabels( LingoProcessingContext context, ITermWeighting termWeighting )
{
    PreprocessingContext preprocessingContext = context.preprocessingContext;
    VectorSpaceModelContext vsmContext = context.vsmContext;
    DoubleMatrix2D reducedTdMatrix = context.reducedVsmContext.baseMatrix;

    int[] wordsStemIndex = preprocessingContext.allWords.stemIndex;
    int[] labelsFeatureIndex = preprocessingContext.allLabels.featureIndex;
    int[] mostFrequentOriginalWordIndex = preprocessingContext.allStems.mostFrequentOriginalWordIndex;
    int[][] phrasesWordIndices = preprocessingContext.allPhrases.wordIndices;
    BitSet[] labelsDocumentIndices = preprocessingContext.allLabels.documentIndices;
    int wordCount = preprocessingContext.allWords.image.Length;
    int documentCount = preprocessingContext.documents.Count;

    // Collect stems of one-word label candidates. Features with an index
    // below wordCount are single words; the rest are phrases.
    BitSet oneWordCandidateStemIndices = new BitSet();
    for ( int i = 0; i < labelsFeatureIndex.Length; i++ )
    {
        int featureIndex = labelsFeatureIndex[i];
        if ( featureIndex >= wordCount )
        {
            break;
        }
        oneWordCandidateStemIndices.set( wordsStemIndex[featureIndex] );
    }

    // Restrict the matrix rows to stems of one-word candidates.
    IntIntOpenHashMap stemToRowIndex = vsmContext.stemToRowIndex;
    IntIntOpenHashMap filteredRowToStemIndex = new IntIntOpenHashMap();
    IntArrayList filteredRows = new IntArrayList();
    int filteredRowIndex = 0;
    foreach ( IntIntCursor it in stemToRowIndex )
    {
        if ( oneWordCandidateStemIndices.get( it.key ) )
        {
            filteredRowToStemIndex.put( filteredRowIndex++, it.key );
            filteredRows.add( it.value );
        }
    }

    double[] featureScores = featureScorer != null
        ? featureScorer.getFeatureScores( context )
        : null;

    // Map word feature index -> label index; -1 when the word is not a label.
    int[] wordLabelIndex = new int[wordCount];
    for ( int i = 0; i < wordCount; i++ )
    {
        wordLabelIndex[i] = -1;
    }
    for ( int i = 0; i < labelsFeatureIndex.Length; i++ )
    {
        int featureIndex = labelsFeatureIndex[i];
        if ( featureIndex < wordCount )
        {
            wordLabelIndex[featureIndex] = i;
        }
    }

    // Cosine similarities between one-word candidates and the base vectors,
    // scaled by the document-count and optional feature-score penalties.
    DoubleMatrix2D stemCos = reducedTdMatrix.viewSelection( filteredRows.toArray(), null ).copy();
    for ( int r = 0; r < stemCos.rows(); r++ )
    {
        int labelIndex = wordLabelIndex[mostFrequentOriginalWordIndex[filteredRowToStemIndex.get( r )]];
        double penalty = getDocumentCountPenalty( labelIndex, documentCount, labelsDocumentIndices );
        if ( featureScores != null )
        {
            penalty *= featureScores[labelIndex];
        }
        stemCos.viewRow( r ).assign( Functions.mult( penalty ) );
    }

    // Cosine similarities for phrase candidates, with a length penalty that
    // ramps down from phraseLengthPenaltyStart to zero at phraseLengthPenaltyStop.
    DoubleMatrix2D phraseMatrix = vsmContext.termPhraseMatrix;
    int firstPhraseIndex = preprocessingContext.allLabels.firstPhraseIndex;
    DoubleMatrix2D phraseCos = null;
    if ( phraseMatrix != null )
    {
        phraseCos = phraseMatrix.zMult( reducedTdMatrix, null, 1, 0, false, false );

        // Guard against inconsistent penalty bounds.
        if ( phraseLengthPenaltyStop < phraseLengthPenaltyStart )
        {
            phraseLengthPenaltyStop = phraseLengthPenaltyStart;
        }
        double penaltyStep = 1.0 / ( phraseLengthPenaltyStop - phraseLengthPenaltyStart + 1 );
        for ( int row = 0; row < phraseCos.rows(); row++ )
        {
            int phraseFeature = labelsFeatureIndex[row + firstPhraseIndex];
            int[] phraseWordIndices = phrasesWordIndices[phraseFeature - wordCount];
            double penalty;
            if ( phraseWordIndices.Length >= phraseLengthPenaltyStop )
            {
                penalty = 0;
            }
            else
            {
                penalty = getDocumentCountPenalty( row + firstPhraseIndex, documentCount, labelsDocumentIndices );
                if ( phraseWordIndices.Length >= phraseLengthPenaltyStart )
                {
                    penalty *= 1 - penaltyStep * ( phraseWordIndices.Length - phraseLengthPenaltyStart + 1 );
                }
                if ( featureScores != null )
                {
                    penalty *= featureScores[row + firstPhraseIndex];
                }
            }
            phraseCos.viewRow( row ).assign( Functions.mult( penalty * phraseLabelBoost ) );
        }
    }

    labelAssigner.assignLabels( context, stemCos, filteredRowToStemIndex, phraseCos );
}
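// getDocumentCountPenalty is referenced above but not shown. A minimal
// sketch, assuming the penalty is simply the fraction of all documents
// containing the label (the exact shape of the penalty curve is an assumption):
double getDocumentCountPenalty( int labelIndex, int documentCount, BitSet[] labelsDocumentIndices )
{
    return labelsDocumentIndices[labelIndex].cardinality() / (double) documentCount;
}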
// Merges clusters whose document sets overlap strongly, keeping the label
// of the highest-scoring cluster in each merged group.
void merge( LingoProcessingContext context )
{
    BitSet[] clusterDocuments = context.clusterDocuments;
    int[] clusterLabelFeatureIndex = context.clusterLabelFeatureIndex;
    double[] clusterLabelScore = context.clusterLabelScore;

    // The original Java code passed an anonymous inner class here; C# has no
    // anonymous classes, so the arc predicate is the named nested class below.
    // This assumes GraphUtils and its IArcPredicate interface have been ported
    // with the same contract as in the Java source.
    List<IntArrayList> mergedClusters = GraphUtils.findCoherentSubgraphs(
        clusterDocuments.Length,
        new DocumentOverlapPredicate( clusterDocuments, clusterMergingThreshold ),
        true );

    foreach ( IntArrayList clustersToMerge in mergedClusters )
    {
        // Pick the cluster with the highest label score as the merge base.
        int mergeBaseClusterIndex = -1;
        double maxScore = -1;
        int[] buf = clustersToMerge.buffer;
        int max = clustersToMerge.size();
        for ( int i = 0; i < max; i++ )
        {
            int clusterIndex = buf[i];
            if ( clusterLabelScore[clusterIndex] > maxScore )
            {
                mergeBaseClusterIndex = clusterIndex;
                maxScore = clusterLabelScore[clusterIndex];
            }
        }

        // Fold the remaining clusters into the base and mark them removed.
        for ( int i = 0; i < max; i++ )
        {
            int clusterIndex = buf[i];
            if ( clusterIndex != mergeBaseClusterIndex )
            {
                clusterDocuments[mergeBaseClusterIndex].or( clusterDocuments[clusterIndex] );
                clusterLabelFeatureIndex[clusterIndex] = -1;
                clusterDocuments[clusterIndex] = null;
            }
        }
    }
}

// Arc predicate restored from the commented-out Java block: two clusters are
// connected when the overlap of their document sets, relative to the larger
// set, reaches clusterMergingThreshold.
private sealed class DocumentOverlapPredicate : GraphUtils.IArcPredicate
{
    private readonly BitSet temp = new BitSet();
    private readonly BitSet[] clusterDocuments;
    private readonly double clusterMergingThreshold;

    public DocumentOverlapPredicate( BitSet[] clusterDocuments, double clusterMergingThreshold )
    {
        this.clusterDocuments = clusterDocuments;
        this.clusterMergingThreshold = clusterMergingThreshold;
    }

    public bool isArcPresent( int clusterA, int clusterB )
    {
        temp.clear();
        int size;
        BitSet setA = clusterDocuments[clusterA];
        BitSet setB = clusterDocuments[clusterB];

        // Suitable for flat clustering: a small subgroup contained within a
        // bigger group gives a small overlap ratio. Big ratios are produced
        // only for balanced group sizes.
        if ( setA.cardinality() < setB.cardinality() )
        {
            temp.or( setA );        // or == addAll
            temp.intersect( setB ); // intersect == retainAll
            size = (int) setB.cardinality();
        }
        else
        {
            temp.or( setB );
            temp.intersect( setA );
            size = (int) setA.cardinality();
        }

        return temp.cardinality() / (double) size >= clusterMergingThreshold;
    }
}
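// A worked example of the overlap test (numbers are hypothetical): with
// clusterMergingThreshold = 0.7, a cluster of 10 documents and a cluster of
// 12 documents sharing 9 documents give an overlap ratio of 9 / 12 = 0.75
// relative to the larger set, so the two clusters are merged; sharing only
// 8 documents gives 8 / 12 ≈ 0.67, so they stay separate.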