コード例 #1
0
        /// <summary>
        /// Stores one document bit set per cluster label in <c>context.clusterDocuments</c>.
        /// Each cluster label's feature value is looked up among the feature values of all
        /// labels, and the document set stored at the matching position is reused as-is
        /// (the bit set reference is shared, not copied).
        /// </summary>
        /// <param name="context">Processing context supplying the cluster label features and
        /// the per-label document sets; its <c>clusterDocuments</c> member is overwritten.</param>
        void assignDocuments( LingoProcessingContext context )
        {
            int[] clusterFeatures = context.clusterLabelFeatureIndex;
            BitSet[] assigned = new BitSet[clusterFeatures.Length];
            int[] allLabelFeatures = context.preprocessingContext.allLabels.featureIndex;
            BitSet[] allLabelDocuments = context.preprocessingContext.allLabels.documentIndices;

            // Reverse lookup: feature value -> position of that label within allLabels.
            IntIntOpenHashMap positionByFeature = new IntIntOpenHashMap();
            int position = 0;
            foreach ( int feature in allLabelFeatures )
            {
                positionByFeature.put( feature, position );
                position++;
            }

            // Resolve every cluster label to the document set of the label it refers to.
            int cluster = 0;
            foreach ( int clusterFeature in clusterFeatures )
            {
                int labelPosition = positionByFeature.get( clusterFeature );
                assigned[cluster] = allLabelDocuments[labelPosition];
                cluster++;
            }

            context.clusterDocuments = assigned;
        }
コード例 #2
0
        /// <summary>
        /// Builds two score matrices for label candidates -- one for single-word labels
        /// (rows selected from the reduced base matrix) and one for phrase labels (the term-phrase
        /// matrix multiplied by the reduced base matrix) -- scales every row by a per-label
        /// penalty, and passes both matrices to <c>labelAssigner.assignLabels</c>.
        /// </summary>
        /// <param name="context">Processing context supplying the preprocessing data, the
        /// vector space model and the reduced base matrix.</param>
        /// <param name="termWeighting">Not referenced anywhere in this method's body.</param>
        void buildLabels( LingoProcessingContext context, ITermWeighting termWeighting )
        {
            PreprocessingContext prep = context.preprocessingContext;
            VectorSpaceModelContext vsm = context.vsmContext;
            DoubleMatrix2D reducedBase = context.reducedVsmContext.baseMatrix;
            int[] stemOfWord = prep.allWords.stemIndex;
            int[] labelFeatures = prep.allLabels.featureIndex;
            int[] topWordOfStem = prep.allStems.mostFrequentOriginalWordIndex;
            int[][] wordsOfPhrase = prep.allPhrases.wordIndices;
            BitSet[] labelDocuments = prep.allLabels.documentIndices;
            // NOTE(review): `length` is lower-case here while arrays elsewhere in this method
            // use `.Length` -- confirm the declared type of `image`.
            int wordCount = prep.allWords.image.length;
            int documentCount = prep.documents.Size();

            // Collect the stems of single-word labels. Feature values below wordCount index
            // words; scanning stops at the first label whose feature is not a word
            // (presumably word labels precede phrase labels -- verify against the producer).
            BitSet candidateStems = new BitSet();
            for ( int label = 0;
                  label < labelFeatures.Length && labelFeatures[label] < wordCount;
                  label++ )
            {
                candidateStems.set( stemOfWord[labelFeatures[label]] );
            }

            // Keep only the matrix rows whose stem is a candidate, remembering which stem
            // each kept row (numbered 0, 1, 2, ... in iteration order) belongs to.
            IntIntOpenHashMap rowOfStem = vsm.stemToRowIndex;
            IntIntOpenHashMap stemOfKeptRow = new IntIntOpenHashMap();
            IntArrayList keptRows = new IntArrayList();
            int nextKeptRow = 0;
            foreach ( IntIntCursor entry in rowOfStem )
            {
                if ( !candidateStems.get( entry.key ) )
                {
                    continue;
                }
                stemOfKeptRow.put( nextKeptRow, entry.key );
                nextKeptRow++;
                keptRows.add( entry.value );
            }

            // Optional per-label scores; stays null when no scorer is configured.
            double[] featureScores = null;
            if ( featureScorer != null )
            {
                featureScores = featureScorer.getFeatureScores( context );
            }

            // Map word index -> label index, with -1 for words that are not labels.
            int[] labelOfWord = new int[wordCount];
            for ( int word = labelOfWord.Length - 1; word >= 0; word-- )
            {
                labelOfWord[word] = -1;
            }
            for ( int label = 0; label < labelFeatures.Length; label++ )
            {
                int feature = labelFeatures[label];
                if ( feature >= wordCount )
                {
                    continue;
                }
                labelOfWord[feature] = label;
            }

            // Single-word scores: a copy of the selected rows, each row then passed through
            // Functions.mult with that label's penalty (presumably a cell-wise multiply).
            DoubleMatrix2D stemCos = reducedBase.viewSelection( keptRows.toArray(), null ).copy();
            for ( int keptRow = 0; keptRow < stemCos.rows(); keptRow++ )
            {
                int stem = stemOfKeptRow.get( keptRow );
                int labelIndex = labelOfWord[topWordOfStem[stem]];
                double weight = getDocumentCountPenalty( labelIndex, documentCount, labelDocuments );
                if ( featureScores != null )
                {
                    weight *= featureScores[labelIndex];
                }
                stemCos.viewRow( keptRow ).assign( Functions.mult( weight ) );
            }

            // Phrase scores: only computed when a term-phrase matrix exists; otherwise null
            // is handed to the label assigner.
            DoubleMatrix2D termPhrase = vsm.termPhraseMatrix;
            int firstPhraseIndex = prep.allLabels.firstPhraseIndex;
            DoubleMatrix2D phraseCos = null;
            if ( termPhrase != null )
            {
                phraseCos = termPhrase.zMult( reducedBase, null, 1, 0, false, false );

                // Writes back to the member: the stop length is raised to the start length
                // when it is configured lower.
                if ( phraseLengthPenaltyStop < phraseLengthPenaltyStart )
                {
                    phraseLengthPenaltyStop = phraseLengthPenaltyStart;
                }
                double penaltyStep = 1.0 / ( phraseLengthPenaltyStop - phraseLengthPenaltyStart + 1 );

                for ( int row = 0; row < phraseCos.rows(); row++ )
                {
                    int labelIndex = row + firstPhraseIndex;
                    // Phrase features are offset by wordCount relative to the phrase arrays.
                    int[] phraseWords = wordsOfPhrase[labelFeatures[labelIndex] - wordCount];

                    // Phrases at or beyond the stop length get a zero weight.
                    double weight = 0;
                    if ( phraseWords.Length < phraseLengthPenaltyStop )
                    {
                        weight = getDocumentCountPenalty( labelIndex, documentCount, labelDocuments );

                        // From the start length on, the weight shrinks by one step per extra word.
                        if ( phraseWords.Length >= phraseLengthPenaltyStart )
                        {
                            weight *= 1 - penaltyStep
                                * ( phraseWords.Length - phraseLengthPenaltyStart + 1 );
                        }
                        if ( featureScores != null )
                        {
                            weight *= featureScores[labelIndex];
                        }
                    }
                    phraseCos.viewRow( row ).assign( Functions.mult( weight * phraseLabelBoost ) );
                }
            }

            labelAssigner.assignLabels( context, stemCos, stemOfKeptRow, phraseCos );
        }