/// <summary>Check if a unit exists in the literal string.</summary>
        /// <remarks>
        /// Check if a unit exists in the literal string. If so, parse it by making use of
        /// the compositionality; otherwise return null.
        /// </remarks>
        /// <param name="s"/>
        /// <param name="unit"/>
        /// <returns/>
        private static double CompositeAtUnitIfExists(string s, string unit)
        {
            // invalid unit
            if (!quantityUnitToValues.ContainsKey(unit))
            {
                return(null);
            }
            int idx = s.IndexOf(unit);

            if (idx != -1)
            {
                double first = double.ValueOf(1.0);
                // Here we need special handling for 十 and 百 when they occur as the first char
                // As in Chinese 十二 is very common, 百二十 is sometimes valid as well.
                if (("十".Equals(unit) || "百".Equals(unit)) && idx == 0)
                {
                }
                else
                {
                    // do nothing
                    // otherwise we try to parse the value before the unit
                    first = RecurNormalizeLiteralIntegerString(Sharpen.Runtime.Substring(s, 0, idx));
                }
                double second = RecurNormalizeLiteralIntegerString(Sharpen.Runtime.Substring(s, idx + 1));
                if (first != null && second != null)
                {
                    return(double.ValueOf(first * quantityUnitToValues.GetCount(unit) + second));
                }
            }
            // return null if unit is not present or fails to parse
            return(null);
        }
        public virtual void RunCoref(Document document)
        {
            IDictionary <Pair <int, int>, bool> mentionPairs = CorefUtils.GetUnlabeledMentionPairs(document);

            if (mentionPairs.Count == 0)
            {
                return;
            }
            Compressor <string>         compressor           = new Compressor <string>();
            DocumentExamples            examples             = extractor.Extract(0, document, mentionPairs, compressor);
            ICounter <Pair <int, int> > classificationScores = new ClassicCounter <Pair <int, int> >();
            ICounter <Pair <int, int> > rankingScores        = new ClassicCounter <Pair <int, int> >();
            ICounter <int> anaphoricityScores = new ClassicCounter <int>();

            foreach (Example example in examples.examples)
            {
                CorefUtils.CheckForInterrupt();
                Pair <int, int> mentionPair = new Pair <int, int>(example.mentionId1, example.mentionId2);
                classificationScores.IncrementCount(mentionPair, classificationModel.Predict(example, examples.mentionFeatures, compressor));
                rankingScores.IncrementCount(mentionPair, rankingModel.Predict(example, examples.mentionFeatures, compressor));
                if (!anaphoricityScores.ContainsKey(example.mentionId2))
                {
                    anaphoricityScores.IncrementCount(example.mentionId2, anaphoricityModel.Predict(new Example(example, false), examples.mentionFeatures, compressor));
                }
            }
            ClustererDataLoader.ClustererDoc doc = new ClustererDataLoader.ClustererDoc(0, classificationScores, rankingScores, anaphoricityScores, mentionPairs, null, document.predictedMentionsByID.Stream().Collect(Collectors.ToMap(null, null)));
            foreach (Pair <int, int> mentionPair_1 in clusterer.GetClusterMerges(doc))
            {
                CorefUtils.MergeCoreferenceClusters(mentionPair_1, document);
            }
        }
            public State(ClustererDataLoader.ClustererDoc doc)
            {
                currentDocId      = doc.id;
                this.doc          = doc;
                this.hashedScores = new Dictionary <Clusterer.MergeKey, bool>();
                this.hashedCosts  = new Dictionary <long, double>();
                this.clusters     = new List <Clusterer.Cluster>();
                this.hash         = 0;
                mentionToCluster  = new Dictionary <int, Clusterer.Cluster>();
                foreach (int m in doc.mentions)
                {
                    Clusterer.Cluster c = new Clusterer.Cluster(m);
                    clusters.Add(c);
                    mentionToCluster[m] = c;
                    hash ^= c.hash * 7;
                }
                IList <Pair <int, int> >    allPairs = new List <Pair <int, int> >(doc.classificationScores.KeySet());
                ICounter <Pair <int, int> > scores   = UseRanking ? doc.rankingScores : doc.classificationScores;

                allPairs.Sort(null);
                int i = 0;

                for (i = 0; i < allPairs.Count; i++)
                {
                    double score = scores.GetCount(allPairs[i]);
                    if (score < MinPairwiseScore && i > MinPairs)
                    {
                        break;
                    }
                    if (i >= EarlyStopThreshold && i / score > EarlyStopVal)
                    {
                        break;
                    }
                }
                mentionPairs = allPairs.SubList(0, i);
                ICounter <int> seenAnaphors    = new ClassicCounter <int>();
                ICounter <int> seenAntecedents = new ClassicCounter <int>();

                globalFeatures = new List <Clusterer.GlobalFeatures>();
                for (int j = 0; j < allPairs.Count; j++)
                {
                    Pair <int, int>          mentionPair = allPairs[j];
                    Clusterer.GlobalFeatures gf          = new Clusterer.GlobalFeatures();
                    gf.currentIndex = j;
                    gf.anaphorSeen  = seenAnaphors.ContainsKey(mentionPair.second);
                    gf.size         = mentionPairs.Count;
                    gf.docSize      = doc.mentions.Count / 300.0;
                    globalFeatures.Add(gf);
                    seenAnaphors.IncrementCount(mentionPair.second);
                    seenAntecedents.IncrementCount(mentionPair.first);
                }
                currentIndex = 0;
                SetClusters();
            }
        /// <summary>Recursively parse a integer String expressed in either Chinese or a mix of Chinese and arabic numbers.</summary>
        /// <param name="s"/>
        /// <returns/>
        private static double RecurNormalizeLiteralIntegerString(string s)
        {
            // If empty, return 0
            if (s.IsEmpty())
            {
                return(double.ValueOf(0));
            }
            // TODO: check if it is valid. It is possible that this is a vague number like "五六十" which cannot be parsed by current implementation.
            // In case of pure arabic numbers, return the straight value of it
            if (ArabicNumbersPattern.Matcher(s).Matches())
            {
                return(double.ValueOf(s));
            }
            //If s has more than 1 char and first char is 零 or 〇, it is likely
            // to be useless
            if (s.Length > 1 && (s.StartsWith("零") || s.StartsWith("〇")))
            {
                s = Sharpen.Runtime.Substring(s, 1);
            }
            //If there is only one char left and we can quantify it, we return the value of it
            if (s.Length == 1 && wordsToValues.ContainsKey(s))
            {
                return(double.ValueOf(wordsToValues.GetCount(s)));
            }
            // Now parse the integer, making use of the compositionality of Chinese literal numbers
            double value;

            value = CompositeAtUnitIfExists(s, "亿");
            if (value != null)
            {
                return(value);
            }
            else
            {
                value = CompositeAtUnitIfExists(s, "万");
            }
            if (value != null)
            {
                return(value);
            }
            else
            {
                value = CompositeAtUnitIfExists(s, "千");
            }
            if (value != null)
            {
                return(value);
            }
            else
            {
                value = CompositeAtUnitIfExists(s, "百");
            }
            if (value != null)
            {
                return(value);
            }
            else
            {
                value = CompositeAtUnitIfExists(s, "十");
            }
            if (value != null)
            {
                return(value);
            }
            // otherwise we fail to parse and just return null
            return(null);
        }
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        internal virtual ICounter <E> Convert2OneDim(string label, IToDoubleFunction <Pair <E, CandidatePhrase> > scoringFunction, ICollection <CandidatePhrase> allCandidatePhrases, TwoDimensionalCounter <E, CandidatePhrase> positivePatternsAndWords, bool
                                                     sqrtPatScore, bool scorePhrasesInPatSelection, ICounter <CandidatePhrase> dictOddsWordWeights, bool useFreqPhraseExtractedByPat)
        {
            //    if (Data.googleNGram.size() == 0 && Data.googleNGramsFile != null) {
            //      Data.loadGoogleNGrams();
            //    }
            ICounter <E> patterns = new ClassicCounter <E>();
            ICounter <CandidatePhrase> googleNgramNormScores     = new ClassicCounter <CandidatePhrase>();
            ICounter <CandidatePhrase> domainNgramNormScores     = new ClassicCounter <CandidatePhrase>();
            ICounter <CandidatePhrase> externalFeatWtsNormalized = new ClassicCounter <CandidatePhrase>();
            ICounter <CandidatePhrase> editDistanceFromOtherSemanticBinaryScores    = new ClassicCounter <CandidatePhrase>();
            ICounter <CandidatePhrase> editDistanceFromAlreadyExtractedBinaryScores = new ClassicCounter <CandidatePhrase>();
            double            externalWtsDefault = 0.5;
            ICounter <string> classifierScores   = null;

            if ((patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPat) || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPatLogP)) && scorePhrasesInPatSelection)
            {
                foreach (CandidatePhrase gc in allCandidatePhrases)
                {
                    string g = gc.GetPhrase();
                    if (constVars.usePatternEvalEditDistOther)
                    {
                        editDistanceFromOtherSemanticBinaryScores.SetCount(gc, constVars.GetEditDistanceScoresOtherClassThreshold(label, g));
                    }
                    if (constVars.usePatternEvalEditDistSame)
                    {
                        editDistanceFromAlreadyExtractedBinaryScores.SetCount(gc, 1 - constVars.GetEditDistanceScoresThisClassThreshold(label, g));
                    }
                    if (constVars.usePatternEvalGoogleNgram)
                    {
                        googleNgramNormScores.SetCount(gc, PhraseScorer.GetGoogleNgramScore(gc));
                    }
                    if (constVars.usePatternEvalDomainNgram)
                    {
                        // calculate domain-ngram wts
                        if (Data.domainNGramRawFreq.ContainsKey(g))
                        {
                            System.Diagnostics.Debug.Assert((Data.rawFreq.ContainsKey(gc)));
                            domainNgramNormScores.SetCount(gc, scorePhrases.phraseScorer.GetDomainNgramScore(g));
                        }
                    }
                    if (constVars.usePatternEvalWordClass)
                    {
                        int num = constVars.GetWordClassClusters()[g];
                        if (num == null)
                        {
                            num = constVars.GetWordClassClusters()[g.ToLower()];
                        }
                        if (num != null && constVars.distSimWeights[label].ContainsKey(num))
                        {
                            externalFeatWtsNormalized.SetCount(gc, constVars.distSimWeights[label].GetCount(num));
                        }
                        else
                        {
                            externalFeatWtsNormalized.SetCount(gc, externalWtsDefault);
                        }
                    }
                }
                if (constVars.usePatternEvalGoogleNgram)
                {
                    googleNgramNormScores = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(googleNgramNormScores, true, true, false);
                }
                if (constVars.usePatternEvalDomainNgram)
                {
                    domainNgramNormScores = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(domainNgramNormScores, true, true, false);
                }
                if (constVars.usePatternEvalWordClass)
                {
                    externalFeatWtsNormalized = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(externalFeatWtsNormalized, true, true, false);
                }
            }
            else
            {
                if ((patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.Logreg) || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.LOGREGlogP)) && scorePhrasesInPatSelection)
                {
                    Properties props2 = new Properties();
                    props2.PutAll(props);
                    props2.SetProperty("phraseScorerClass", "edu.stanford.nlp.patterns.ScorePhrasesLearnFeatWt");
                    ScorePhrases scoreclassifier = new ScorePhrases(props2, constVars);
                    System.Console.Out.WriteLine("file is " + props.GetProperty("domainNGramsFile"));
                    ArgumentParser.FillOptions(typeof(Data), props2);
                    classifierScores = scoreclassifier.phraseScorer.ScorePhrases(label, allCandidatePhrases, true);
                }
            }
            ICounter <CandidatePhrase> cachedScoresForThisIter = new ClassicCounter <CandidatePhrase>();

            foreach (KeyValuePair <E, ClassicCounter <CandidatePhrase> > en in positivePatternsAndWords.EntrySet())
            {
                foreach (KeyValuePair <CandidatePhrase, double> en2 in en.Value.EntrySet())
                {
                    CandidatePhrase word = en2.Key;
                    ICounter <ConstantsAndVariables.ScorePhraseMeasures> scoreslist = new ClassicCounter <ConstantsAndVariables.ScorePhraseMeasures>();
                    double score = 1;
                    if ((patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPat) || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPatLogP)) && scorePhrasesInPatSelection)
                    {
                        if (cachedScoresForThisIter.ContainsKey(word))
                        {
                            score = cachedScoresForThisIter.GetCount(word);
                        }
                        else
                        {
                            if (constVars.GetOtherSemanticClassesWords().Contains(word) || constVars.GetCommonEngWords().Contains(word))
                            {
                                score = 1;
                            }
                            else
                            {
                                if (constVars.usePatternEvalSemanticOdds)
                                {
                                    double semanticClassOdds = 1;
                                    if (dictOddsWordWeights.ContainsKey(word))
                                    {
                                        semanticClassOdds = 1 - dictOddsWordWeights.GetCount(word);
                                    }
                                    scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Semanticodds, semanticClassOdds);
                                }
                                if (constVars.usePatternEvalGoogleNgram)
                                {
                                    double gscore = 0;
                                    if (googleNgramNormScores.ContainsKey(word))
                                    {
                                        gscore = 1 - googleNgramNormScores.GetCount(word);
                                    }
                                    scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Googlengram, gscore);
                                }
                                if (constVars.usePatternEvalDomainNgram)
                                {
                                    double domainscore;
                                    if (domainNgramNormScores.ContainsKey(word))
                                    {
                                        domainscore = 1 - domainNgramNormScores.GetCount(word);
                                    }
                                    else
                                    {
                                        domainscore = 1 - scorePhrases.phraseScorer.GetPhraseWeightFromWords(domainNgramNormScores, word, scorePhrases.phraseScorer.OOVDomainNgramScore);
                                    }
                                    scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Domainngram, domainscore);
                                }
                                if (constVars.usePatternEvalWordClass)
                                {
                                    double externalFeatureWt = externalWtsDefault;
                                    if (externalFeatWtsNormalized.ContainsKey(word))
                                    {
                                        externalFeatureWt = 1 - externalFeatWtsNormalized.GetCount(word);
                                    }
                                    scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Distsim, externalFeatureWt);
                                }
                                if (constVars.usePatternEvalEditDistOther)
                                {
                                    System.Diagnostics.Debug.Assert(editDistanceFromOtherSemanticBinaryScores.ContainsKey(word), "How come no edit distance info for word " + word + string.Empty);
                                    scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Editdistother, editDistanceFromOtherSemanticBinaryScores.GetCount(word));
                                }
                                if (constVars.usePatternEvalEditDistSame)
                                {
                                    scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Editdistsame, editDistanceFromAlreadyExtractedBinaryScores.GetCount(word));
                                }
                                // taking average
                                score = Counters.Mean(scoreslist);
                                phInPatScores.SetCounter(word, scoreslist);
                            }
                            cachedScoresForThisIter.SetCount(word, score);
                        }
                    }
                    else
                    {
                        if ((patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.Logreg) || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.LOGREGlogP)) && scorePhrasesInPatSelection)
                        {
                            score = 1 - classifierScores.GetCount(word);
                        }
                    }
                    // score = 1 - scorePhrases.scoreUsingClassifer(classifier,
                    // e.getKey(), label, true, null, null, dictOddsWordWeights);
                    // throw new RuntimeException("not implemented yet");
                    if (useFreqPhraseExtractedByPat)
                    {
                        score = score * scoringFunction.ApplyAsDouble(new Pair <E, CandidatePhrase>(en.Key, word));
                    }
                    if (constVars.sqrtPatScore)
                    {
                        patterns.IncrementCount(en.Key, Math.Sqrt(score));
                    }
                    else
                    {
                        patterns.IncrementCount(en.Key, score);
                    }
                }
            }
            return(patterns);
        }
        internal override ICounter <CandidatePhrase> ScorePhrases(string label, TwoDimensionalCounter <CandidatePhrase, E> terms, TwoDimensionalCounter <CandidatePhrase, E> wordsPatExtracted, ICounter <E> allSelectedPatterns, ICollection <CandidatePhrase
                                                                                                                                                                                                                                               > alreadyIdentifiedWords, bool forLearningPatterns)
        {
            IDictionary <CandidatePhrase, ICounter <ConstantsAndVariables.ScorePhraseMeasures> > scores = new Dictionary <CandidatePhrase, ICounter <ConstantsAndVariables.ScorePhraseMeasures> >();

            if (Data.domainNGramsFile != null)
            {
                Data.LoadDomainNGrams();
            }
            Redwood.Log(ConstantsAndVariables.extremedebug, "Considering terms: " + terms.FirstKeySet());
            // calculate TF-IDF like scores
            ICounter <CandidatePhrase> tfidfScores = new ClassicCounter <CandidatePhrase>();

            if (constVars.usePhraseEvalPatWtByFreq)
            {
                foreach (KeyValuePair <CandidatePhrase, ClassicCounter <E> > en in terms.EntrySet())
                {
                    double score = GetPatTFIDFScore(en.Key, en.Value, allSelectedPatterns);
                    tfidfScores.SetCount(en.Key, score);
                }
                Redwood.Log(ConstantsAndVariables.extremedebug, "BEFORE IDF " + Counters.ToSortedString(tfidfScores, 100, "%1$s:%2$f", "\t"));
                Counters.DivideInPlace(tfidfScores, Data.processedDataFreq);
            }
            ICounter <CandidatePhrase> externalFeatWtsNormalized     = new ClassicCounter <CandidatePhrase>();
            ICounter <CandidatePhrase> domainNgramNormScores         = new ClassicCounter <CandidatePhrase>();
            ICounter <CandidatePhrase> googleNgramNormScores         = new ClassicCounter <CandidatePhrase>();
            ICounter <CandidatePhrase> editDistanceOtherBinaryScores = new ClassicCounter <CandidatePhrase>();
            ICounter <CandidatePhrase> editDistanceSameBinaryScores  = new ClassicCounter <CandidatePhrase>();

            foreach (CandidatePhrase gc in terms.FirstKeySet())
            {
                string g = gc.GetPhrase();
                if (constVars.usePhraseEvalEditDistOther)
                {
                    editDistanceOtherBinaryScores.SetCount(gc, 1 - constVars.GetEditDistanceScoresOtherClassThreshold(label, g));
                }
                if (constVars.usePhraseEvalEditDistSame)
                {
                    editDistanceSameBinaryScores.SetCount(gc, constVars.GetEditDistanceScoresThisClassThreshold(label, g));
                }
                if (constVars.usePhraseEvalDomainNgram)
                {
                    // calculate domain-ngram wts
                    if (Data.domainNGramRawFreq.ContainsKey(g))
                    {
                        System.Diagnostics.Debug.Assert((Data.rawFreq.ContainsKey(gc)));
                        domainNgramNormScores.SetCount(gc, GetDomainNgramScore(g));
                    }
                    else
                    {
                        log.Info("why is " + g + " not present in domainNgram");
                    }
                }
                if (constVars.usePhraseEvalGoogleNgram)
                {
                    googleNgramNormScores.SetCount(gc, GetGoogleNgramScore(gc));
                }
                if (constVars.usePhraseEvalWordClass)
                {
                    // calculate dist sim weights
                    int num = constVars.GetWordClassClusters()[g];
                    if (num == null)
                    {
                        num = constVars.GetWordClassClusters()[g.ToLower()];
                    }
                    if (num != null && constVars.distSimWeights[label].ContainsKey(num))
                    {
                        externalFeatWtsNormalized.SetCount(gc, constVars.distSimWeights[label].GetCount(num));
                    }
                    else
                    {
                        externalFeatWtsNormalized.SetCount(gc, OOVExternalFeatWt);
                    }
                }
            }
            ICounter <CandidatePhrase> normTFIDFScores = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(tfidfScores, true, true, false);
            ICounter <CandidatePhrase> dictOdddsScores = null;

            if (constVars.usePhraseEvalSemanticOdds)
            {
                System.Diagnostics.Debug.Assert(constVars.dictOddsWeights != null, "usePhraseEvalSemanticOdds is true but dictOddsWeights is null for the label " + label);
                dictOdddsScores = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(constVars.dictOddsWeights[label], true, true, false);
            }
            domainNgramNormScores     = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(domainNgramNormScores, true, true, false);
            googleNgramNormScores     = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(googleNgramNormScores, true, true, false);
            externalFeatWtsNormalized = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(externalFeatWtsNormalized, true, true, false);
            // Counters.max(googleNgramNormScores);
            // Counters.max(externalFeatWtsNormalized);
            foreach (CandidatePhrase word in terms.FirstKeySet())
            {
                if (alreadyIdentifiedWords.Contains(word))
                {
                    continue;
                }
                ICounter <ConstantsAndVariables.ScorePhraseMeasures> scoreslist = new ClassicCounter <ConstantsAndVariables.ScorePhraseMeasures>();
                System.Diagnostics.Debug.Assert(normTFIDFScores.ContainsKey(word), "NormTFIDF score does not contain" + word);
                double tfscore = normTFIDFScores.GetCount(word);
                scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Patwtbyfreq, tfscore);
                if (constVars.usePhraseEvalSemanticOdds)
                {
                    double dscore;
                    if (dictOdddsScores.ContainsKey(word))
                    {
                        dscore = dictOdddsScores.GetCount(word);
                    }
                    else
                    {
                        dscore = GetPhraseWeightFromWords(dictOdddsScores, word, OOVdictOdds);
                    }
                    scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Semanticodds, dscore);
                }
                if (constVars.usePhraseEvalDomainNgram)
                {
                    double domainscore;
                    if (domainNgramNormScores.ContainsKey(word))
                    {
                        domainscore = domainNgramNormScores.GetCount(word);
                    }
                    else
                    {
                        domainscore = GetPhraseWeightFromWords(domainNgramNormScores, word, OOVDomainNgramScore);
                    }
                    scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Domainngram, domainscore);
                }
                if (constVars.usePhraseEvalGoogleNgram)
                {
                    double googlescore;
                    if (googleNgramNormScores.ContainsKey(word))
                    {
                        googlescore = googleNgramNormScores.GetCount(word);
                    }
                    else
                    {
                        googlescore = GetPhraseWeightFromWords(googleNgramNormScores, word, OOVGoogleNgramScore);
                    }
                    scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Googlengram, googlescore);
                }
                if (constVars.usePhraseEvalWordClass)
                {
                    double externalFeatureWt;
                    if (externalFeatWtsNormalized.ContainsKey(word))
                    {
                        externalFeatureWt = externalFeatWtsNormalized.GetCount(word);
                    }
                    else
                    {
                        externalFeatureWt = GetPhraseWeightFromWords(externalFeatWtsNormalized, word, OOVExternalFeatWt);
                    }
                    scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Distsim, externalFeatureWt);
                }
                if (constVars.usePhraseEvalEditDistOther)
                {
                    System.Diagnostics.Debug.Assert(editDistanceOtherBinaryScores.ContainsKey(word), "How come no edit distance info?");
                    double editD = editDistanceOtherBinaryScores.GetCount(word);
                    scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Editdistother, editD);
                }
                if (constVars.usePhraseEvalEditDistSame)
                {
                    double editDSame = editDistanceSameBinaryScores.GetCount(word);
                    scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Editdistsame, editDSame);
                }
                if (constVars.usePhraseEvalWordShape)
                {
                    scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Wordshape, this.GetWordShapeScore(word.GetPhrase(), label));
                }
                scores[word] = scoreslist;
                phraseScoresNormalized.SetCounter(word, scoreslist);
            }
            ICounter <CandidatePhrase> phraseScores = new ClassicCounter <CandidatePhrase>();

            foreach (KeyValuePair <CandidatePhrase, ICounter <ConstantsAndVariables.ScorePhraseMeasures> > wEn in scores)
            {
                double avgScore = Counters.Mean(wEn.Value);
                if (!avgScore.IsInfinite() && !double.IsNaN(avgScore))
                {
                    phraseScores.SetCount(wEn.Key, avgScore);
                }
                else
                {
                    Redwood.Log(Redwood.Dbg, "Ignoring " + wEn.Key + " because score is " + avgScore);
                }
            }
            return(phraseScores);
        }
Example #7
0
        public override float Score(IntTaggedWord itw, string word)
        {
            // Label tagL = itw.tagLabel();
            // String tag = tagL.value();
            string tag  = itw.TagString(tagIndex);
            ILabel tagL = new Tag(tag);
            float  logProb;

            if (word.Matches(dateMatch))
            {
                //EncodingPrintWriter.out.println("Date match for " + word,encoding);
                if (tag.Equals("NT"))
                {
                    logProb = 0.0f;
                }
                else
                {
                    logProb = float.NegativeInfinity;
                }
            }
            else
            {
                if (word.Matches(numberMatch))
                {
                    //EncodingPrintWriter.out.println("Number match for " + word,encoding);
                    if (tag.Equals("CD") && (!word.Matches(ordinalMatch)))
                    {
                        logProb = 0.0f;
                    }
                    else
                    {
                        if (tag.Equals("OD") && word.Matches(ordinalMatch))
                        {
                            logProb = 0.0f;
                        }
                        else
                        {
                            logProb = float.NegativeInfinity;
                        }
                    }
                }
                else
                {
                    if (word.Matches(properNameMatch))
                    {
                        //EncodingPrintWriter.out.println("Proper name match for " + word,encoding);
                        if (tag.Equals("NR"))
                        {
                            logProb = 0.0f;
                        }
                        else
                        {
                            logProb = float.NegativeInfinity;
                        }
                    }
                    else
                    {
                        /* -------------
                        *  // this didn't seem to work -- too categorical
                        *  int type = Character.getType(word.charAt(0));
                        *  // the below may not normalize probs over options, but is probably okay
                        *  if (type == Character.START_PUNCTUATION) {
                        *  if (tag.equals("PU-LPAREN") || tag.equals("PU-PAREN") ||
                        *  tag.equals("PU-LQUOTE") || tag.equals("PU-QUOTE") ||
                        *  tag.equals("PU")) {
                        *  // if (VERBOSE) log.info("ChineseUWM: unknown L Punc");
                        *  logProb = 0.0f;
                        *  } else {
                        *  logProb = Float.NEGATIVE_INFINITY;
                        *  }
                        *  } else if (type == Character.END_PUNCTUATION) {
                        *  if (tag.equals("PU-RPAREN") || tag.equals("PU-PAREN") ||
                        *  tag.equals("PU-RQUOTE") || tag.equals("PU-QUOTE") ||
                        *  tag.equals("PU")) {
                        *  // if (VERBOSE) log.info("ChineseUWM: unknown R Punc");
                        *  logProb = 0.0f;
                        *  } else {
                        *  logProb = Float.NEGATIVE_INFINITY;
                        *  }
                        *  } else {
                        *  if (tag.equals("PU-OTHER") || tag.equals("PU-ENDSENT") ||
                        *  tag.equals("PU")) {
                        *  // if (VERBOSE) log.info("ChineseUWM: unknown O Punc");
                        *  logProb = 0.0f;
                        *  } else {
                        *  logProb = Float.NEGATIVE_INFINITY;
                        *  }
                        *  }
                        *  ------------- */
                        if (useFirst)
                        {
                            string first = Sharpen.Runtime.Substring(word, 0, 1);
                            if (useUnicodeType)
                            {
                                char ch   = word[0];
                                int  type = char.GetType(ch);
                                if (type != char.OtherLetter)
                                {
                                    // standard Chinese characters are of type "OTHER_LETTER"!!
                                    first = int.ToString(type);
                                }
                            }
                            if (!seenFirst.Contains(first))
                            {
                                if (useGT)
                                {
                                    logProb = ScoreGT(tag);
                                    goto first_break;
                                }
                                else
                                {
                                    first = unknown;
                                }
                            }
                            /* get the Counter of terminal rewrites for the relevant tag */
                            ClassicCounter <string> wordProbs = tagHash[tagL];

                            /* if the proposed tag has never been seen before, issue a
                             * warning and return probability 0. */
                            if (wordProbs == null)
                            {
                                logProb = float.NegativeInfinity;
                            }
                            else
                            {
                                if (wordProbs.ContainsKey(first))
                                {
                                    logProb = (float)wordProbs.GetCount(first);
                                }
                                else
                                {
                                    logProb = (float)wordProbs.GetCount(unknown);
                                }
                            }
                        }
                        else
                        {
                            if (useGT)
                            {
                                logProb = ScoreGT(tag);
                            }
                            else
                            {
                                logProb = float.NegativeInfinity;
                            }
                        }
                        first_break :;
                    }
                }
            }
            // should never get this!
            return(logProb);
        }
Example #8
0
        public static void Main(string[] args)
        {
            System.Console.Out.WriteLine("Testing unknown matching");
            string s = "\u5218\u00b7\u9769\u547d";

            if (s.Matches(properNameMatch))
            {
                System.Console.Out.WriteLine("hooray names!");
            }
            else
            {
                System.Console.Out.WriteLine("Uh-oh names!");
            }
            string s1 = "\uff13\uff10\uff10\uff10";

            if (s1.Matches(numberMatch))
            {
                System.Console.Out.WriteLine("hooray numbers!");
            }
            else
            {
                System.Console.Out.WriteLine("Uh-oh numbers!");
            }
            string s11 = "\u767e\u5206\u4e4b\u56db\u5341\u4e09\u70b9\u4e8c";

            if (s11.Matches(numberMatch))
            {
                System.Console.Out.WriteLine("hooray numbers!");
            }
            else
            {
                System.Console.Out.WriteLine("Uh-oh numbers!");
            }
            string s12 = "\u767e\u5206\u4e4b\u4e09\u5341\u516b\u70b9\u516d";

            if (s12.Matches(numberMatch))
            {
                System.Console.Out.WriteLine("hooray numbers!");
            }
            else
            {
                System.Console.Out.WriteLine("Uh-oh numbers!");
            }
            string s2 = "\u4e09\u6708";

            if (s2.Matches(dateMatch))
            {
                System.Console.Out.WriteLine("hooray dates!");
            }
            else
            {
                System.Console.Out.WriteLine("Uh-oh dates!");
            }
            System.Console.Out.WriteLine("Testing tagged word");
            ClassicCounter <TaggedWord> c = new ClassicCounter <TaggedWord>();
            TaggedWord tw1 = new TaggedWord("w", "t");

            c.IncrementCount(tw1);
            TaggedWord tw2 = new TaggedWord("w", "t2");

            System.Console.Out.WriteLine(c.ContainsKey(tw2));
            System.Console.Out.WriteLine(tw1.Equals(tw2));
            WordTag wt1 = ToWordTag(tw1);
            WordTag wt2 = ToWordTag(tw2);
            WordTag wt3 = new WordTag("w", "t2");

            System.Console.Out.WriteLine(wt1.Equals(wt2));
            System.Console.Out.WriteLine(wt2.Equals(wt3));
        }