public void assignLabels(LingoProcessingContext context, DoubleMatrix2D stemCos,
    IntIntOpenHashMap filteredRowToStemIndex, DoubleMatrix2D phraseCos)
{
    PreprocessingContext preprocessingContext = context.preprocessingContext;
    int firstPhraseIndex = preprocessingContext.allLabels.firstPhraseIndex;
    int[] labelsFeatureIndex = preprocessingContext.allLabels.featureIndex;
    int[] mostFrequentOriginalWordIndex = preprocessingContext.allStems.mostFrequentOriginalWordIndex;
    int desiredClusterCount = stemCos.columns();

    IntArrayList clusterLabelFeatureIndex = new IntArrayList(desiredClusterCount);
    DoubleArrayList clusterLabelScore = new DoubleArrayList(desiredClusterCount);

    for (int label = 0; label < desiredClusterCount; label++)
    {
        // Locate the largest remaining cosine in each candidate matrix.
        // Java generics require boxed types, hence Pair<Integer, Integer>.
        Pair<Integer, Integer> stemMax = max(stemCos);
        Pair<Integer, Integer> phraseMax = max(phraseCos);

        if (stemMax == null && phraseMax == null)
        {
            break;
        }

        double stemScore = stemMax != null
            ? stemCos.getQuick(stemMax.objectA, stemMax.objectB) : -1;
        double phraseScore = phraseMax != null
            ? phraseCos.getQuick(phraseMax.objectA, phraseMax.objectB) : -1;

        if (phraseScore > stemScore)
        {
            // The phrase label wins. Zero out its row and the chosen base
            // vector's column in both matrices so neither can be picked again.
            phraseCos.viewRow(phraseMax.objectA).assign(0);
            phraseCos.viewColumn(phraseMax.objectB).assign(0);
            stemCos.viewColumn(phraseMax.objectB).assign(0);

            clusterLabelFeatureIndex.add(labelsFeatureIndex[phraseMax.objectA + firstPhraseIndex]);
            clusterLabelScore.add(phraseScore);
        }
        else
        {
            // The single-word (stem) label wins.
            stemCos.viewRow(stemMax.objectA).assign(0);
            stemCos.viewColumn(stemMax.objectB).assign(0);
            if (phraseCos != null)
            {
                phraseCos.viewColumn(stemMax.objectB).assign(0);
            }

            clusterLabelFeatureIndex.add(
                mostFrequentOriginalWordIndex[filteredRowToStemIndex.get(stemMax.objectA)]);
            clusterLabelScore.add(stemScore);
        }
    }

    context.clusterLabelFeatureIndex = clusterLabelFeatureIndex.toArray();
    context.clusterLabelScore = clusterLabelScore.toArray();
}
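/*
 * The max(...) helper called above is not part of this listing. A minimal
 * sketch, assuming it returns null for a null or fully zeroed matrix and the
 * (row, column) coordinates of the largest entry otherwise -- which is what
 * the zeroing-and-break loop above relies on:
 */
Pair<Integer, Integer> max(DoubleMatrix2D matrix)
{
    if (matrix == null)
    {
        return null;
    }
    int maxRow = -1;
    int maxColumn = -1;
    double max = 0;
    for (int r = 0; r < matrix.rows(); r++)
    {
        for (int c = 0; c < matrix.columns(); c++)
        {
            double value = matrix.getQuick(r, c);
            if (value > max)
            {
                max = value;
                maxRow = r;
                maxColumn = c;
            }
        }
    }
    // All entries zeroed out: no candidate left.
    return maxRow >= 0 ? new Pair<Integer, Integer>(maxRow, maxColumn) : null;
}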
public void assignLabels(LingoProcessingContext context, DoubleMatrix2D stemCos,
    IntIntOpenHashMap filteredRowToStemIndex, DoubleMatrix2D phraseCos)
{
    PreprocessingContext preprocessingContext = context.preprocessingContext;
    int firstPhraseIndex = preprocessingContext.allLabels.firstPhraseIndex;
    int[] labelsFeatureIndex = preprocessingContext.allLabels.featureIndex;
    int[] mostFrequentOriginalWordIndex = preprocessingContext.allStems.mostFrequentOriginalWordIndex;
    int desiredClusterCount = stemCos.columns();

    // Best one-word candidate per base vector (column).
    int[] candidateStemIndices = new int[desiredClusterCount];
    double[] candidateStemScores = new double[desiredClusterCount];

    // Best phrase candidate per base vector; -1 marks "no phrase candidate".
    int[] candidatePhraseIndices = new int[desiredClusterCount];
    for (int i = 0; i < desiredClusterCount; i++)
    {
        candidatePhraseIndices[i] = -1;
    }
    double[] candidatePhraseScores = new double[desiredClusterCount];

    MatrixUtils.maxInColumns(stemCos, candidateStemIndices, candidateStemScores,
        Functions.ABS);
    if (phraseCos != null)
    {
        MatrixUtils.maxInColumns(phraseCos, candidatePhraseIndices,
            candidatePhraseScores, Functions.ABS);
    }

    // For each cluster, prefer the phrase if it scored higher than the best stem.
    int[] clusterLabelFeatureIndex = new int[desiredClusterCount];
    double[] clusterLabelScore = new double[desiredClusterCount];
    for (int i = 0; i < desiredClusterCount; i++)
    {
        int phraseFeatureIndex = candidatePhraseIndices[i];
        int stemIndex = filteredRowToStemIndex.get(candidateStemIndices[i]);
        double phraseScore = candidatePhraseScores[i];
        if (phraseFeatureIndex >= 0 && phraseScore > candidateStemScores[i])
        {
            clusterLabelFeatureIndex[i] =
                labelsFeatureIndex[phraseFeatureIndex + firstPhraseIndex];
            clusterLabelScore[i] = phraseScore;
        }
        else
        {
            clusterLabelFeatureIndex[i] = mostFrequentOriginalWordIndex[stemIndex];
            clusterLabelScore[i] = candidateStemScores[i];
        }
    }

    context.clusterLabelFeatureIndex = clusterLabelFeatureIndex;
    context.clusterLabelScore = clusterLabelScore;
}
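/*
 * A sketch of the contract this variant assumes of MatrixUtils.maxInColumns:
 * for each column, record the row index of the entry whose transformed value
 * (here Functions.ABS) is largest, together with that maximum. The name
 * maxInColumnsSketch is hypothetical -- this is an illustration of the assumed
 * behavior, not the library's implementation.
 */
static void maxInColumnsSketch(DoubleMatrix2D matrix, int[] indices,
    double[] maxima, DoubleFunction transform)
{
    for (int c = 0; c < matrix.columns(); c++)
    {
        for (int r = 0; r < matrix.rows(); r++)
        {
            double value = transform.apply(matrix.getQuick(r, c));
            if (r == 0 || value > maxima[c])
            {
                maxima[c] = value;
                indices[c] = r;
            }
        }
    }
}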
void assignDocuments(LingoProcessingContext context)
{
    int[] clusterLabelFeatureIndex = context.clusterLabelFeatureIndex;
    BitSet[] clusterDocuments = new BitSet[clusterLabelFeatureIndex.length];
    int[] labelsFeatureIndex = context.preprocessingContext.allLabels.featureIndex;
    BitSet[] documentIndices = context.preprocessingContext.allLabels.documentIndices;

    // Reverse map: feature index -> position in the label array.
    IntIntOpenHashMap featureValueToIndex = new IntIntOpenHashMap();
    for (int i = 0; i < labelsFeatureIndex.length; i++)
    {
        featureValueToIndex.put(labelsFeatureIndex[i], i);
    }

    // Each cluster receives the set of documents its label occurs in.
    for (int clusterIndex = 0; clusterIndex < clusterDocuments.length; clusterIndex++)
    {
        clusterDocuments[clusterIndex] =
            documentIndices[featureValueToIndex.get(clusterLabelFeatureIndex[clusterIndex])];
    }

    context.clusterDocuments = clusterDocuments;
}
void buildLabels(LingoProcessingContext context, ITermWeighting termWeighting)
{
    PreprocessingContext preprocessingContext = context.preprocessingContext;
    VectorSpaceModelContext vsmContext = context.vsmContext;
    DoubleMatrix2D reducedTdMatrix = context.reducedVsmContext.baseMatrix;

    int[] wordsStemIndex = preprocessingContext.allWords.stemIndex;
    int[] labelsFeatureIndex = preprocessingContext.allLabels.featureIndex;
    int[] mostFrequentOriginalWordIndex = preprocessingContext.allStems.mostFrequentOriginalWordIndex;
    int[][] phrasesWordIndices = preprocessingContext.allPhrases.wordIndices;
    BitSet[] labelsDocumentIndices = preprocessingContext.allLabels.documentIndices;
    int wordCount = preprocessingContext.allWords.image.length;
    int documentCount = preprocessingContext.documents.size();

    // Collect stems of one-word label candidates. Features are ordered so that
    // one-word labels (index < wordCount) precede phrases, hence the break.
    BitSet oneWordCandidateStemIndices = new BitSet();
    for (int i = 0; i < labelsFeatureIndex.length; i++)
    {
        int featureIndex = labelsFeatureIndex[i];
        if (featureIndex >= wordCount)
        {
            break;
        }
        oneWordCandidateStemIndices.set(wordsStemIndex[featureIndex]);
    }

    // Restrict the term-document matrix to rows of one-word candidate stems.
    IntIntOpenHashMap stemToRowIndex = vsmContext.stemToRowIndex;
    IntIntOpenHashMap filteredRowToStemIndex = new IntIntOpenHashMap();
    IntArrayList filteredRows = new IntArrayList();
    int filteredRowIndex = 0;
    for (IntIntCursor it : stemToRowIndex)
    {
        if (oneWordCandidateStemIndices.get(it.key))
        {
            filteredRowToStemIndex.put(filteredRowIndex++, it.key);
            filteredRows.add(it.value);
        }
    }

    double[] featureScores = featureScorer != null
        ? featureScorer.getFeatureScores(context) : null;

    // Reverse map: a word's feature index -> its position in the label array.
    int[] wordLabelIndex = new int[wordCount];
    for (int i = 0; i < wordCount; i++)
    {
        wordLabelIndex[i] = -1;
    }
    for (int i = 0; i < labelsFeatureIndex.length; i++)
    {
        int featureIndex = labelsFeatureIndex[i];
        if (featureIndex < wordCount)
        {
            wordLabelIndex[featureIndex] = i;
        }
    }

    // Similarities between one-word candidates and cluster base vectors,
    // scaled by the document-count penalty and optional feature scores.
    DoubleMatrix2D stemCos = reducedTdMatrix.viewSelection(filteredRows.toArray(), null).copy();
    for (int r = 0; r < stemCos.rows(); r++)
    {
        int labelIndex = wordLabelIndex[mostFrequentOriginalWordIndex[filteredRowToStemIndex.get(r)]];
        double penalty = getDocumentCountPenalty(labelIndex, documentCount, labelsDocumentIndices);
        if (featureScores != null)
        {
            penalty *= featureScores[labelIndex];
        }
        stemCos.viewRow(r).assign(Functions.mult(penalty));
    }

    DoubleMatrix2D phraseMatrix = vsmContext.termPhraseMatrix;
    int firstPhraseIndex = preprocessingContext.allLabels.firstPhraseIndex;
    DoubleMatrix2D phraseCos = null;
    if (phraseMatrix != null)
    {
        // Similarities between phrase candidates and cluster base vectors.
        phraseCos = phraseMatrix.zMult(reducedTdMatrix, null, 1, 0, false, false);

        // Length penalty: zero score at phraseLengthPenaltyStop words,
        // a linear discount starting at phraseLengthPenaltyStart words.
        if (phraseLengthPenaltyStop < phraseLengthPenaltyStart)
        {
            phraseLengthPenaltyStop = phraseLengthPenaltyStart;
        }
        double penaltyStep = 1.0 / (phraseLengthPenaltyStop - phraseLengthPenaltyStart + 1);
        for (int row = 0; row < phraseCos.rows(); row++)
        {
            int phraseFeature = labelsFeatureIndex[row + firstPhraseIndex];
            int[] phraseWordIndices = phrasesWordIndices[phraseFeature - wordCount];
            double penalty;
            if (phraseWordIndices.length >= phraseLengthPenaltyStop)
            {
                penalty = 0;
            }
            else
            {
                penalty = getDocumentCountPenalty(row + firstPhraseIndex,
                    documentCount, labelsDocumentIndices);
                if (phraseWordIndices.length >= phraseLengthPenaltyStart)
                {
                    penalty *= 1 - penaltyStep
                        * (phraseWordIndices.length - phraseLengthPenaltyStart + 1);
                }
                if (featureScores != null)
                {
                    penalty *= featureScores[row + firstPhraseIndex];
                }
            }
            phraseCos.viewRow(row).assign(Functions.mult(penalty * phraseLabelBoost));
        }
    }

    labelAssigner.assignLabels(context, stemCos, filteredRowToStemIndex, phraseCos);
}
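/*
 * getDocumentCountPenalty(...) is called above but not included in this
 * listing. A hypothetical sketch, assuming the penalty is a multiplier in
 * [0, 1] derived from the fraction of documents a label occurs in: labels
 * covering very few or nearly all documents make poor cluster labels, so
 * they are discounted. The 0.1 and 0.8 thresholds below are illustrative
 * assumptions, not values taken from the original code.
 */
double getDocumentCountPenalty(int labelIndex, int documentCount,
    BitSet[] labelsDocumentIndices)
{
    // Fraction of all documents containing this label.
    double coverage =
        labelsDocumentIndices[labelIndex].cardinality() / (double) documentCount;

    if (coverage <= 0.1)
    {
        // Linear ramp-up for rare labels.
        return coverage / 0.1;
    }
    else if (coverage >= 0.8)
    {
        // Linear falloff for near-ubiquitous labels.
        return Math.max(0, (1.0 - coverage) / 0.2);
    }
    // Mid-coverage labels keep their full score.
    return 1.0;
}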