/// <exception cref="System.IO.IOException"/> /// <exception cref="System.TypeLoadException"/> internal virtual ICounter <E> Convert2OneDim(string label, IToDoubleFunction <Pair <E, CandidatePhrase> > scoringFunction, ICollection <CandidatePhrase> allCandidatePhrases, TwoDimensionalCounter <E, CandidatePhrase> positivePatternsAndWords, bool sqrtPatScore, bool scorePhrasesInPatSelection, ICounter <CandidatePhrase> dictOddsWordWeights, bool useFreqPhraseExtractedByPat) { // if (Data.googleNGram.size() == 0 && Data.googleNGramsFile != null) { // Data.loadGoogleNGrams(); // } ICounter <E> patterns = new ClassicCounter <E>(); ICounter <CandidatePhrase> googleNgramNormScores = new ClassicCounter <CandidatePhrase>(); ICounter <CandidatePhrase> domainNgramNormScores = new ClassicCounter <CandidatePhrase>(); ICounter <CandidatePhrase> externalFeatWtsNormalized = new ClassicCounter <CandidatePhrase>(); ICounter <CandidatePhrase> editDistanceFromOtherSemanticBinaryScores = new ClassicCounter <CandidatePhrase>(); ICounter <CandidatePhrase> editDistanceFromAlreadyExtractedBinaryScores = new ClassicCounter <CandidatePhrase>(); double externalWtsDefault = 0.5; ICounter <string> classifierScores = null; if ((patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPat) || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPatLogP)) && scorePhrasesInPatSelection) { foreach (CandidatePhrase gc in allCandidatePhrases) { string g = gc.GetPhrase(); if (constVars.usePatternEvalEditDistOther) { editDistanceFromOtherSemanticBinaryScores.SetCount(gc, constVars.GetEditDistanceScoresOtherClassThreshold(label, g)); } if (constVars.usePatternEvalEditDistSame) { editDistanceFromAlreadyExtractedBinaryScores.SetCount(gc, 1 - constVars.GetEditDistanceScoresThisClassThreshold(label, g)); } if (constVars.usePatternEvalGoogleNgram) { googleNgramNormScores.SetCount(gc, PhraseScorer.GetGoogleNgramScore(gc)); } if (constVars.usePatternEvalDomainNgram) { // 
calculate domain-ngram wts if (Data.domainNGramRawFreq.ContainsKey(g)) { System.Diagnostics.Debug.Assert((Data.rawFreq.ContainsKey(gc))); domainNgramNormScores.SetCount(gc, scorePhrases.phraseScorer.GetDomainNgramScore(g)); } } if (constVars.usePatternEvalWordClass) { int num = constVars.GetWordClassClusters()[g]; if (num == null) { num = constVars.GetWordClassClusters()[g.ToLower()]; } if (num != null && constVars.distSimWeights[label].ContainsKey(num)) { externalFeatWtsNormalized.SetCount(gc, constVars.distSimWeights[label].GetCount(num)); } else { externalFeatWtsNormalized.SetCount(gc, externalWtsDefault); } } } if (constVars.usePatternEvalGoogleNgram) { googleNgramNormScores = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(googleNgramNormScores, true, true, false); } if (constVars.usePatternEvalDomainNgram) { domainNgramNormScores = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(domainNgramNormScores, true, true, false); } if (constVars.usePatternEvalWordClass) { externalFeatWtsNormalized = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(externalFeatWtsNormalized, true, true, false); } } else { if ((patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.Logreg) || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.LOGREGlogP)) && scorePhrasesInPatSelection) { Properties props2 = new Properties(); props2.PutAll(props); props2.SetProperty("phraseScorerClass", "edu.stanford.nlp.patterns.ScorePhrasesLearnFeatWt"); ScorePhrases scoreclassifier = new ScorePhrases(props2, constVars); System.Console.Out.WriteLine("file is " + props.GetProperty("domainNGramsFile")); ArgumentParser.FillOptions(typeof(Data), props2); classifierScores = scoreclassifier.phraseScorer.ScorePhrases(label, allCandidatePhrases, true); } } ICounter <CandidatePhrase> cachedScoresForThisIter = new ClassicCounter <CandidatePhrase>(); foreach (KeyValuePair <E, ClassicCounter <CandidatePhrase> > en in positivePatternsAndWords.EntrySet()) { 
foreach (KeyValuePair <CandidatePhrase, double> en2 in en.Value.EntrySet()) { CandidatePhrase word = en2.Key; ICounter <ConstantsAndVariables.ScorePhraseMeasures> scoreslist = new ClassicCounter <ConstantsAndVariables.ScorePhraseMeasures>(); double score = 1; if ((patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPat) || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPatLogP)) && scorePhrasesInPatSelection) { if (cachedScoresForThisIter.ContainsKey(word)) { score = cachedScoresForThisIter.GetCount(word); } else { if (constVars.GetOtherSemanticClassesWords().Contains(word) || constVars.GetCommonEngWords().Contains(word)) { score = 1; } else { if (constVars.usePatternEvalSemanticOdds) { double semanticClassOdds = 1; if (dictOddsWordWeights.ContainsKey(word)) { semanticClassOdds = 1 - dictOddsWordWeights.GetCount(word); } scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Semanticodds, semanticClassOdds); } if (constVars.usePatternEvalGoogleNgram) { double gscore = 0; if (googleNgramNormScores.ContainsKey(word)) { gscore = 1 - googleNgramNormScores.GetCount(word); } scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Googlengram, gscore); } if (constVars.usePatternEvalDomainNgram) { double domainscore; if (domainNgramNormScores.ContainsKey(word)) { domainscore = 1 - domainNgramNormScores.GetCount(word); } else { domainscore = 1 - scorePhrases.phraseScorer.GetPhraseWeightFromWords(domainNgramNormScores, word, scorePhrases.phraseScorer.OOVDomainNgramScore); } scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Domainngram, domainscore); } if (constVars.usePatternEvalWordClass) { double externalFeatureWt = externalWtsDefault; if (externalFeatWtsNormalized.ContainsKey(word)) { externalFeatureWt = 1 - externalFeatWtsNormalized.GetCount(word); } scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Distsim, externalFeatureWt); } if (constVars.usePatternEvalEditDistOther) { 
System.Diagnostics.Debug.Assert(editDistanceFromOtherSemanticBinaryScores.ContainsKey(word), "How come no edit distance info for word " + word + string.Empty); scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Editdistother, editDistanceFromOtherSemanticBinaryScores.GetCount(word)); } if (constVars.usePatternEvalEditDistSame) { scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Editdistsame, editDistanceFromAlreadyExtractedBinaryScores.GetCount(word)); } // taking average score = Counters.Mean(scoreslist); phInPatScores.SetCounter(word, scoreslist); } cachedScoresForThisIter.SetCount(word, score); } } else { if ((patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.Logreg) || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.LOGREGlogP)) && scorePhrasesInPatSelection) { score = 1 - classifierScores.GetCount(word); } } // score = 1 - scorePhrases.scoreUsingClassifer(classifier, // e.getKey(), label, true, null, null, dictOddsWordWeights); // throw new RuntimeException("not implemented yet"); if (useFreqPhraseExtractedByPat) { score = score * scoringFunction.ApplyAsDouble(new Pair <E, CandidatePhrase>(en.Key, word)); } if (constVars.sqrtPatScore) { patterns.IncrementCount(en.Key, Math.Sqrt(score)); } else { patterns.IncrementCount(en.Key, score); } } } return(patterns); }
/// <exception cref="System.IO.IOException"/> /// <exception cref="System.TypeLoadException"/> private ICounter <CandidatePhrase> LearnNewPhrasesPrivate(string label, PatternsForEachToken patternsForEachToken, ICounter <E> patternsLearnedThisIter, ICounter <E> allSelectedPatterns, ICollection <CandidatePhrase> alreadyIdentifiedWords, CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat, ICounter <CandidatePhrase> scoreForAllWordsThisIteration, TwoDimensionalCounter <CandidatePhrase, E> terms, TwoDimensionalCounter <CandidatePhrase, E> wordsPatExtracted, TwoDimensionalCounter <E , CandidatePhrase> patternsAndWords4Label, string identifier, ICollection <CandidatePhrase> ignoreWords, bool computeProcDataFreq) { ICollection <CandidatePhrase> alreadyLabeledWords = new HashSet <CandidatePhrase>(); if (constVars.doNotApplyPatterns) { // if want to get the stats by the lossy way of just counting without // applying the patterns ConstantsAndVariables.DataSentsIterator sentsIter = new ConstantsAndVariables.DataSentsIterator(constVars.batchProcessSents); while (sentsIter.MoveNext()) { Pair <IDictionary <string, DataInstance>, File> sentsf = sentsIter.Current; this.StatsWithoutApplyingPatterns(sentsf.First(), patternsForEachToken, patternsLearnedThisIter, wordsPatExtracted); } } else { if (patternsLearnedThisIter.Size() > 0) { this.ApplyPats(patternsLearnedThisIter, label, wordsPatExtracted, matchedTokensByPat, alreadyLabeledWords); } } if (computeProcDataFreq) { if (!phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.None)) { Redwood.Log(Redwood.Dbg, "computing processed freq"); foreach (KeyValuePair <CandidatePhrase, double> fq in Data.rawFreq.EntrySet()) { double @in = fq.Value; if (phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.Sqrt)) { @in = Math.Sqrt(@in); } else { if (phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.Log)) { @in = 1 + Math.Log(@in); } else { throw new Exception("can't understand the normalization"); } } 
System.Diagnostics.Debug.Assert(!double.IsNaN(@in), "Why is processed freq nan when rawfreq is " + @in); Data.processedDataFreq.SetCount(fq.Key, @in); } } else { Data.processedDataFreq = Data.rawFreq; } } if (constVars.wordScoring.Equals(GetPatternsFromDataMultiClass.WordScoring.Weightednorm)) { foreach (CandidatePhrase en in wordsPatExtracted.FirstKeySet()) { if (!constVars.GetOtherSemanticClassesWords().Contains(en) && (en.GetPhraseLemma() == null || !constVars.GetOtherSemanticClassesWords().Contains(CandidatePhrase.CreateOrGet(en.GetPhraseLemma()))) && !alreadyLabeledWords.Contains(en)) { terms.AddAll(en, wordsPatExtracted.GetCounter(en)); } } RemoveKeys(terms, ConstantsAndVariables.GetStopWords()); ICounter <CandidatePhrase> phraseScores = phraseScorer.ScorePhrases(label, terms, wordsPatExtracted, allSelectedPatterns, alreadyIdentifiedWords, false); System.Console.Out.WriteLine("count for word U.S. is " + phraseScores.GetCount(CandidatePhrase.CreateOrGet("U.S."))); ICollection <CandidatePhrase> ignoreWordsAll; if (ignoreWords != null && !ignoreWords.IsEmpty()) { ignoreWordsAll = CollectionUtils.UnionAsSet(ignoreWords, constVars.GetOtherSemanticClassesWords()); } else { ignoreWordsAll = new HashSet <CandidatePhrase>(constVars.GetOtherSemanticClassesWords()); } Sharpen.Collections.AddAll(ignoreWordsAll, constVars.GetSeedLabelDictionary()[label]); Sharpen.Collections.AddAll(ignoreWordsAll, constVars.GetLearnedWords(label).KeySet()); System.Console.Out.WriteLine("ignoreWordsAll contains word U.S. 
is " + ignoreWordsAll.Contains(CandidatePhrase.CreateOrGet("U.S."))); ICounter <CandidatePhrase> finalwords = ChooseTopWords(phraseScores, terms, phraseScores, ignoreWordsAll, constVars.thresholdWordExtract); phraseScorer.PrintReasonForChoosing(finalwords); scoreForAllWordsThisIteration.Clear(); Counters.AddInPlace(scoreForAllWordsThisIteration, phraseScores); Redwood.Log(ConstantsAndVariables.minimaldebug, "\n\n## Selected Words for " + label + " : " + Counters.ToSortedString(finalwords, finalwords.Size(), "%1$s:%2$.2f", "\t")); if (constVars.goldEntities != null) { IDictionary <string, bool> goldEntities4Label = constVars.goldEntities[label]; if (goldEntities4Label != null) { StringBuilder s = new StringBuilder(); finalwords.KeySet().Stream().ForEach(null); Redwood.Log(ConstantsAndVariables.minimaldebug, "\n\n## Gold labels for selected words for label " + label + " : " + s.ToString()); } else { Redwood.Log(Redwood.Dbg, "No gold entities provided for label " + label); } } if (constVars.outDir != null && !constVars.outDir.IsEmpty()) { string outputdir = constVars.outDir + "/" + identifier + "/" + label; IOUtils.EnsureDir(new File(outputdir)); TwoDimensionalCounter <CandidatePhrase, CandidatePhrase> reasonForWords = new TwoDimensionalCounter <CandidatePhrase, CandidatePhrase>(); foreach (CandidatePhrase word in finalwords.KeySet()) { foreach (E l in wordsPatExtracted.GetCounter(word).KeySet()) { foreach (CandidatePhrase w2 in patternsAndWords4Label.GetCounter(l)) { reasonForWords.IncrementCount(word, w2); } } } Redwood.Log(ConstantsAndVariables.minimaldebug, "Saving output in " + outputdir); string filename = outputdir + "/words.json"; // the json object is an array corresponding to each iteration - of list // of objects, // each of which is a bean of entity and reasons IJsonArrayBuilder obj = Javax.Json.Json.CreateArrayBuilder(); if (writtenInJustification.Contains(label) && writtenInJustification[label]) { IJsonReader jsonReader = Javax.Json.Json.CreateReader(new 
BufferedInputStream(new FileInputStream(filename))); IJsonArray objarr = jsonReader.ReadArray(); foreach (IJsonValue o in objarr) { obj.Add(o); } jsonReader.Close(); } IJsonArrayBuilder objThisIter = Javax.Json.Json.CreateArrayBuilder(); foreach (CandidatePhrase w in reasonForWords.FirstKeySet()) { IJsonObjectBuilder objinner = Javax.Json.Json.CreateObjectBuilder(); IJsonArrayBuilder l = Javax.Json.Json.CreateArrayBuilder(); foreach (CandidatePhrase w2 in reasonForWords.GetCounter(w).KeySet()) { l.Add(w2.GetPhrase()); } IJsonArrayBuilder pats = Javax.Json.Json.CreateArrayBuilder(); foreach (E p in wordsPatExtracted.GetCounter(w)) { pats.Add(p.ToStringSimple()); } objinner.Add("reasonwords", l); objinner.Add("patterns", pats); objinner.Add("score", finalwords.GetCount(w)); objinner.Add("entity", w.GetPhrase()); objThisIter.Add(objinner.Build()); } obj.Add(objThisIter); // Redwood.log(ConstantsAndVariables.minimaldebug, channelNameLogger, // "Writing justification at " + filename); IOUtils.WriteStringToFile(StringUtils.Normalize(StringUtils.ToAscii(obj.Build().ToString())), filename, "ASCII"); writtenInJustification[label] = true; } if (constVars.justify) { Redwood.Log(Redwood.Dbg, "\nJustification for phrases:\n"); foreach (CandidatePhrase word in finalwords.KeySet()) { Redwood.Log(Redwood.Dbg, "Phrase " + word + " extracted because of patterns: \t" + Counters.ToSortedString(wordsPatExtracted.GetCounter(word), wordsPatExtracted.GetCounter(word).Size(), "%1$s:%2$f", "\n")); } } // if (usePatternResultAsLabel) // if (answerLabel != null) // labelWords(sents, commonEngWords, finalwords.keySet(), // patterns.keySet(), outFile); // else // throw new RuntimeException("why is the answer label null?"); return(finalwords); } else { if (constVars.wordScoring.Equals(GetPatternsFromDataMultiClass.WordScoring.Bpb)) { Counters.AddInPlace(terms, wordsPatExtracted); ICounter <CandidatePhrase> maxPatWeightTerms = new ClassicCounter <CandidatePhrase>(); IDictionary <CandidatePhrase, 
E> wordMaxPat = new Dictionary <CandidatePhrase, E>(); foreach (KeyValuePair <CandidatePhrase, ClassicCounter <E> > en in terms.EntrySet()) { ICounter <E> weights = new ClassicCounter <E>(); foreach (E k in en.Value.KeySet()) { weights.SetCount(k, patternsLearnedThisIter.GetCount(k)); } maxPatWeightTerms.SetCount(en.Key, Counters.Max(weights)); wordMaxPat[en.Key] = Counters.Argmax(weights); } Counters.RemoveKeys(maxPatWeightTerms, alreadyIdentifiedWords); double maxvalue = Counters.Max(maxPatWeightTerms); ICollection <CandidatePhrase> words = Counters.KeysAbove(maxPatWeightTerms, maxvalue - 1e-10); CandidatePhrase bestw = null; if (words.Count > 1) { double max = double.NegativeInfinity; foreach (CandidatePhrase w in words) { if (terms.GetCount(w, wordMaxPat[w]) > max) { max = terms.GetCount(w, wordMaxPat[w]); bestw = w; } } } else { if (words.Count == 1) { bestw = words.GetEnumerator().Current; } else { return(new ClassicCounter <CandidatePhrase>()); } } Redwood.Log(ConstantsAndVariables.minimaldebug, "Selected Words: " + bestw); return(Counters.AsCounter(Arrays.AsList(bestw))); } else { throw new Exception("wordscoring " + constVars.wordScoring + " not identified"); } } }
/// <summary>
/// Scores every candidate phrase in <paramref name="terms"/> for <paramref name="label"/> by
/// averaging the per-phrase measures enabled in constVars (TF-IDF-like pattern weight,
/// dictionary odds, domain/Google n-gram scores, distributional-similarity class weight,
/// edit distances, word shape). Words already identified are skipped; words whose mean
/// score is NaN or infinite are dropped with a debug log.
/// </summary>
/// <param name="label">Class label being scored.</param>
/// <param name="terms">Candidate phrases with the patterns that extracted them.</param>
/// <param name="wordsPatExtracted">Phrase-by-pattern extraction counts (unused directly here; part of the override contract).</param>
/// <param name="allSelectedPatterns">Weights of all patterns selected so far, for the TF-IDF score.</param>
/// <param name="alreadyIdentifiedWords">Phrases to skip entirely.</param>
/// <param name="forLearningPatterns">Flag from the override contract; not read in this body.</param>
/// <returns>Counter mapping each surviving phrase to its mean measure score.</returns>
internal override ICounter<CandidatePhrase> ScorePhrases(string label, TwoDimensionalCounter<CandidatePhrase, E> terms, TwoDimensionalCounter<CandidatePhrase, E> wordsPatExtracted, ICounter<E> allSelectedPatterns, ICollection<CandidatePhrase> alreadyIdentifiedWords, bool forLearningPatterns)
{
  IDictionary<CandidatePhrase, ICounter<ConstantsAndVariables.ScorePhraseMeasures>> scores = new Dictionary<CandidatePhrase, ICounter<ConstantsAndVariables.ScorePhraseMeasures>>();
  if (Data.domainNGramsFile != null)
  {
    Data.LoadDomainNGrams();
  }
  Redwood.Log(ConstantsAndVariables.extremedebug, "Considering terms: " + terms.FirstKeySet());
  // calculate TF-IDF like scores
  ICounter<CandidatePhrase> tfidfScores = new ClassicCounter<CandidatePhrase>();
  if (constVars.usePhraseEvalPatWtByFreq)
  {
    foreach (KeyValuePair<CandidatePhrase, ClassicCounter<E>> en in terms.EntrySet())
    {
      double score = GetPatTFIDFScore(en.Key, en.Value, allSelectedPatterns);
      tfidfScores.SetCount(en.Key, score);
    }
    Redwood.Log(ConstantsAndVariables.extremedebug, "BEFORE IDF " + Counters.ToSortedString(tfidfScores, 100, "%1$s:%2$f", "\t"));
    // "IDF" step: divide each phrase's pattern-weight score by its processed data frequency.
    Counters.DivideInPlace(tfidfScores, Data.processedDataFreq);
  }
  // Per-phrase measure caches, filled only for the measures enabled in constVars.
  ICounter<CandidatePhrase> externalFeatWtsNormalized = new ClassicCounter<CandidatePhrase>();
  ICounter<CandidatePhrase> domainNgramNormScores = new ClassicCounter<CandidatePhrase>();
  ICounter<CandidatePhrase> googleNgramNormScores = new ClassicCounter<CandidatePhrase>();
  ICounter<CandidatePhrase> editDistanceOtherBinaryScores = new ClassicCounter<CandidatePhrase>();
  ICounter<CandidatePhrase> editDistanceSameBinaryScores = new ClassicCounter<CandidatePhrase>();
  foreach (CandidatePhrase gc in terms.FirstKeySet())
  {
    string g = gc.GetPhrase();
    if (constVars.usePhraseEvalEditDistOther)
    {
      // Complemented so that distance FROM other classes scores high — TODO confirm.
      editDistanceOtherBinaryScores.SetCount(gc, 1 - constVars.GetEditDistanceScoresOtherClassThreshold(label, g));
    }
    if (constVars.usePhraseEvalEditDistSame)
    {
      editDistanceSameBinaryScores.SetCount(gc, constVars.GetEditDistanceScoresThisClassThreshold(label, g));
    }
    if (constVars.usePhraseEvalDomainNgram)
    {
      // calculate domain-ngram wts
      if (Data.domainNGramRawFreq.ContainsKey(g))
      {
        System.Diagnostics.Debug.Assert((Data.rawFreq.ContainsKey(gc)));
        domainNgramNormScores.SetCount(gc, GetDomainNgramScore(g));
      }
      else
      {
        log.Info("why is " + g + " not present in domainNgram");
      }
    }
    if (constVars.usePhraseEvalGoogleNgram)
    {
      googleNgramNormScores.SetCount(gc, GetGoogleNgramScore(gc));
    }
    if (constVars.usePhraseEvalWordClass)
    {
      // calculate dist sim weights
      // NOTE(review): Sharpen artifact — `num` was a nullable Integer in the Java original;
      // here an int is compared against null. Verify the indexer/null semantics.
      int num = constVars.GetWordClassClusters()[g];
      if (num == null)
      {
        // Fall back to the lowercased phrase when the exact form has no cluster.
        num = constVars.GetWordClassClusters()[g.ToLower()];
      }
      if (num != null && constVars.distSimWeights[label].ContainsKey(num))
      {
        externalFeatWtsNormalized.SetCount(gc, constVars.distSimWeights[label].GetCount(num));
      }
      else
      {
        externalFeatWtsNormalized.SetCount(gc, OOVExternalFeatWt);
      }
    }
  }
  // Normalize each measure across all candidates (softmax/min-max, per the helper).
  ICounter<CandidatePhrase> normTFIDFScores = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(tfidfScores, true, true, false);
  ICounter<CandidatePhrase> dictOdddsScores = null;
  if (constVars.usePhraseEvalSemanticOdds)
  {
    System.Diagnostics.Debug.Assert(constVars.dictOddsWeights != null, "usePhraseEvalSemanticOdds is true but dictOddsWeights is null for the label " + label);
    dictOdddsScores = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(constVars.dictOddsWeights[label], true, true, false);
  }
  domainNgramNormScores = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(domainNgramNormScores, true, true, false);
  googleNgramNormScores = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(googleNgramNormScores, true, true, false);
  externalFeatWtsNormalized = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(externalFeatWtsNormalized, true, true, false);
  // Counters.max(googleNgramNormScores);
  // Counters.max(externalFeatWtsNormalized);
  foreach (CandidatePhrase word in terms.FirstKeySet())
  {
    if (alreadyIdentifiedWords.Contains(word))
    {
      continue;
    }
    // Assemble one counter of enabled measures per word; OOV words fall back to a
    // weight derived from their component tokens via GetPhraseWeightFromWords.
    ICounter<ConstantsAndVariables.ScorePhraseMeasures> scoreslist = new ClassicCounter<ConstantsAndVariables.ScorePhraseMeasures>();
    System.Diagnostics.Debug.Assert(normTFIDFScores.ContainsKey(word), "NormTFIDF score does not contain" + word);
    double tfscore = normTFIDFScores.GetCount(word);
    scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Patwtbyfreq, tfscore);
    if (constVars.usePhraseEvalSemanticOdds)
    {
      double dscore;
      if (dictOdddsScores.ContainsKey(word))
      {
        dscore = dictOdddsScores.GetCount(word);
      }
      else
      {
        dscore = GetPhraseWeightFromWords(dictOdddsScores, word, OOVdictOdds);
      }
      scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Semanticodds, dscore);
    }
    if (constVars.usePhraseEvalDomainNgram)
    {
      double domainscore;
      if (domainNgramNormScores.ContainsKey(word))
      {
        domainscore = domainNgramNormScores.GetCount(word);
      }
      else
      {
        domainscore = GetPhraseWeightFromWords(domainNgramNormScores, word, OOVDomainNgramScore);
      }
      scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Domainngram, domainscore);
    }
    if (constVars.usePhraseEvalGoogleNgram)
    {
      double googlescore;
      if (googleNgramNormScores.ContainsKey(word))
      {
        googlescore = googleNgramNormScores.GetCount(word);
      }
      else
      {
        googlescore = GetPhraseWeightFromWords(googleNgramNormScores, word, OOVGoogleNgramScore);
      }
      scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Googlengram, googlescore);
    }
    if (constVars.usePhraseEvalWordClass)
    {
      double externalFeatureWt;
      if (externalFeatWtsNormalized.ContainsKey(word))
      {
        externalFeatureWt = externalFeatWtsNormalized.GetCount(word);
      }
      else
      {
        externalFeatureWt = GetPhraseWeightFromWords(externalFeatWtsNormalized, word, OOVExternalFeatWt);
      }
      scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Distsim, externalFeatureWt);
    }
    if (constVars.usePhraseEvalEditDistOther)
    {
      System.Diagnostics.Debug.Assert(editDistanceOtherBinaryScores.ContainsKey(word), "How come no edit distance info?");
      double editD = editDistanceOtherBinaryScores.GetCount(word);
      scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Editdistother, editD);
    }
    if (constVars.usePhraseEvalEditDistSame)
    {
      double editDSame = editDistanceSameBinaryScores.GetCount(word);
      scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Editdistsame, editDSame);
    }
    if (constVars.usePhraseEvalWordShape)
    {
      scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Wordshape, this.GetWordShapeScore(word.GetPhrase(), label));
    }
    scores[word] = scoreslist;
    // Keep the per-measure breakdown for later justification output.
    phraseScoresNormalized.SetCounter(word, scoreslist);
  }
  // Final score per phrase = mean of its enabled measures; drop NaN/infinite means.
  ICounter<CandidatePhrase> phraseScores = new ClassicCounter<CandidatePhrase>();
  foreach (KeyValuePair<CandidatePhrase, ICounter<ConstantsAndVariables.ScorePhraseMeasures>> wEn in scores)
  {
    double avgScore = Counters.Mean(wEn.Value);
    // NOTE(review): avgScore.IsInfinite() is a Sharpen extension-method call on double;
    // plain C# would be double.IsInfinity(avgScore).
    if (!avgScore.IsInfinite() && !double.IsNaN(avgScore))
    {
      phraseScores.SetCount(wEn.Key, avgScore);
    }
    else
    {
      Redwood.Log(Redwood.Dbg, "Ignoring " + wEn.Key + " because score is " + avgScore);
    }
  }
  return (phraseScores);
}