/**
 * Assigns a document set to each cluster based on the feature chosen as the
 * cluster's label: cluster i receives the document indices recorded for the
 * label whose feature index equals {@code clusterLabelFeatureIndex[i]}.
 * Writes the result into {@code context.clusterDocuments}.
 *
 * @param context Lingo processing context; must have
 *        {@code clusterLabelFeatureIndex} and preprocessed label data populated.
 */
void assignDocuments( LingoProcessingContext context )
{
    final int [] clusterLabelFeatureIndex = context.clusterLabelFeatureIndex;
    final BitSet [] clusterDocuments = new BitSet [clusterLabelFeatureIndex.length];

    final int [] labelsFeatureIndex = context.preprocessingContext.allLabels.featureIndex;
    final BitSet [] documentIndices = context.preprocessingContext.allLabels.documentIndices;

    // Inverse mapping: feature index -> position in the labels arrays, so each
    // cluster's label feature can be resolved to its document set.
    final IntIntOpenHashMap featureValueToIndex = new IntIntOpenHashMap();
    for ( int i = 0; i < labelsFeatureIndex.length; i++ )
    {
        featureValueToIndex.put( labelsFeatureIndex[i], i );
    }

    for ( int clusterIndex = 0; clusterIndex < clusterDocuments.length; clusterIndex++ )
    {
        clusterDocuments[clusterIndex] =
            documentIndices[featureValueToIndex.get( clusterLabelFeatureIndex[clusterIndex] )];
    }

    context.clusterDocuments = clusterDocuments;
}
void buildLabels( LingoProcessingContext context, ITermWeighting termWeighting ) { PreprocessingContext preprocessingContext = context.preprocessingContext; VectorSpaceModelContext vsmContext = context.vsmContext; DoubleMatrix2D reducedTdMatrix = context.reducedVsmContext.baseMatrix; int[] wordsStemIndex = preprocessingContext.allWords.stemIndex; int[] labelsFeatureIndex = preprocessingContext.allLabels.featureIndex; int[] mostFrequentOriginalWordIndex = preprocessingContext.allStems.mostFrequentOriginalWordIndex; int[][] phrasesWordIndices = preprocessingContext.allPhrases.wordIndices; BitSet[] labelsDocumentIndices = preprocessingContext.allLabels.documentIndices; int wordCount = preprocessingContext.allWords.image.length; int documentCount = preprocessingContext.documents.Size(); BitSet oneWordCandidateStemIndices = new BitSet(); for ( int i = 0; i < labelsFeatureIndex.Length; i++ ) { int featureIndex = labelsFeatureIndex[i]; if ( featureIndex >= wordCount ) { break; } oneWordCandidateStemIndices.set( wordsStemIndex[featureIndex] ); } IntIntOpenHashMap stemToRowIndex = vsmContext.stemToRowIndex; IntIntOpenHashMap filteredRowToStemIndex = new IntIntOpenHashMap(); IntArrayList filteredRows = new IntArrayList(); int filteredRowIndex = 0; foreach ( IntIntCursor it in stemToRowIndex ) { if ( oneWordCandidateStemIndices.get( it.key ) ) { filteredRowToStemIndex.put( filteredRowIndex++, it.key ); filteredRows.add( it.value ); } } double[] featureScores = featureScorer != null ? 
featureScorer.getFeatureScores( context ) : null; int[] wordLabelIndex = new int[wordCount]; for ( int i = 0; i < wordCount; i++ ) { wordLabelIndex[i] = -1; } for ( int i = 0; i < labelsFeatureIndex.Length; i++ ) { int featureIndex = labelsFeatureIndex[i]; if ( featureIndex < wordCount ) { wordLabelIndex[featureIndex] = i; } } DoubleMatrix2D stemCos = reducedTdMatrix.viewSelection( filteredRows.toArray(), null ).copy(); for ( int r = 0; r < stemCos.rows(); r++ ) { int labelIndex = wordLabelIndex[mostFrequentOriginalWordIndex[filteredRowToStemIndex.get( r )]]; double penalty = getDocumentCountPenalty( labelIndex, documentCount, labelsDocumentIndices ); if ( featureScores != null ) { penalty *= featureScores[labelIndex]; } stemCos.viewRow( r ).assign( Functions.mult( penalty ) ); } DoubleMatrix2D phraseMatrix = vsmContext.termPhraseMatrix; int firstPhraseIndex = preprocessingContext.allLabels.firstPhraseIndex; DoubleMatrix2D phraseCos = null; if ( phraseMatrix != null ) { phraseCos = phraseMatrix.zMult( reducedTdMatrix, null, 1, 0, false, false ); if ( phraseLengthPenaltyStop < phraseLengthPenaltyStart ) { phraseLengthPenaltyStop = phraseLengthPenaltyStart; } double penaltyStep = 1.0 / ( phraseLengthPenaltyStop - phraseLengthPenaltyStart + 1 ); for ( int row = 0; row < phraseCos.rows(); row++ ) { int phraseFeature = labelsFeatureIndex[row + firstPhraseIndex]; int[] phraseWordIndices = phrasesWordIndices[phraseFeature - wordCount]; double penalty; if ( phraseWordIndices.Length >= phraseLengthPenaltyStop ) { penalty = 0; } else { penalty = getDocumentCountPenalty( row + firstPhraseIndex, documentCount, labelsDocumentIndices ); if ( phraseWordIndices.Length >= phraseLengthPenaltyStart ) { penalty *= 1 - penaltyStep * ( phraseWordIndices.Length - phraseLengthPenaltyStart + 1 ); } if ( featureScores != null ) { penalty *= featureScores[row + firstPhraseIndex]; } } phraseCos.viewRow( row ).assign( Functions.mult( penalty * phraseLabelBoost ) ); } } 
labelAssigner.assignLabels( context, stemCos, filteredRowToStemIndex, phraseCos ); }