/// <summary>
        /// Calculate sister annotation statistics suitable for doing
        /// selective sister splitting in the PCFGParser inside the
        /// FactoredParser.
        /// </summary>
        /// <remarks>
        /// Before the arguments are examined, the KL divergence of a small two-key counter
        /// against itself is printed as a quick sanity check.
        /// </remarks>
        /// <param name="args">
        /// <c>args[0]</c>: path to the Treebank (required).
        /// <c>args[1]</c>: character encoding used to read the treebank (optional, defaults to UTF-8).
        /// </param>
        public static void Main(string[] args)
        {
            // Sanity check: KL divergence of a counter with itself.
            ClassicCounter <string> c = new ClassicCounter <string>();

            c.SetCount("A", 0);
            c.SetCount("B", 1);
            double d = Counters.KlDivergence(c, c);

            System.Console.Out.WriteLine("KL Divergence: " + d);

            if (args.Length < 1)
            {
                // The message names this tool and mentions the optional encoding argument.
                System.Console.Out.WriteLine("Usage: SisterAnnotationStats treebankPath [encoding]");
                return;
            }

            // A second argument, when present, overrides the default encoding.
            string encoding = args.Length > 1 ? args[1] : "UTF-8";

            SisterAnnotationStats stats    = new SisterAnnotationStats();
            Treebank              treebank = new DiskTreebank(null, encoding);

            treebank.LoadPath(args[0]);
            treebank.Apply(stats);
            stats.PrintStats();
        }
        /// <summary>
        /// Checks that <c>Distribution.GetDistributionFromLogValues</c>, given the logs of the
        /// counts 1, 2, 3 and 4, yields a distribution over the same four keys with the values
        /// 0.1, 0.2, 0.3 and 0.4.
        /// </summary>
        public virtual void TestGetDistributionFromLogValues()
        {
            ICounter <string> c1 = new ClassicCounter <string>();

            c1.SetCount("p", 1.0);
            c1.SetCount("q", 2.0);
            c1.SetCount("r", 3.0);
            c1.SetCount("s", 4.0);
            // take log
            Counters.LogInPlace(c1);
            // now call distribution
            Distribution <string> distribution = Distribution.GetDistributionFromLogValues(c1);

            // NUnit's AreEqual takes the expected value first and the actual value second;
            // keeping that order makes failure messages read correctly.
            // size
            NUnit.Framework.Assert.AreEqual(4, distribution.KeySet().Count);
            // keys
            NUnit.Framework.Assert.IsTrue(distribution.ContainsKey("p"));
            NUnit.Framework.Assert.IsTrue(distribution.ContainsKey("q"));
            NUnit.Framework.Assert.IsTrue(distribution.ContainsKey("r"));
            NUnit.Framework.Assert.IsTrue(distribution.ContainsKey("s"));
            // values (compared with an absolute tolerance of 1e-10)
            NUnit.Framework.Assert.AreEqual(1.0E-1, distribution.GetCount("p"), 1E-10);
            NUnit.Framework.Assert.AreEqual(2.0E-1, distribution.GetCount("q"), 1E-10);
            NUnit.Framework.Assert.AreEqual(3.0E-1, distribution.GetCount("r"), 1E-10);
            NUnit.Framework.Assert.AreEqual(4.0E-1, distribution.GetCount("s"), 1E-10);
        }
        /// <summary>
        /// Converts the features of a datum from raw occurrences into tf-idf weights that are
        /// divided by the sum of all tf-idf weights.
        /// </summary>
        /// <remarks>
        /// The term frequency is the number of times a feature occurs in <paramref name="datum"/>;
        /// features without an entry in <paramref name="featureDocCounts"/> are dropped. The idf
        /// factor is <c>log((Size() + 1) / (docCount + 0.5))</c>.
        /// </remarks>
        /// <param name="datum">Datum with a collection of features.</param>
        /// <param name="featureDocCounts">Counter holding the document count of each feature.</param>
        /// <returns>An RVFDatum carrying the normalized tf-idf features and the label of <paramref name="datum"/>.</returns>
        public virtual RVFDatum <L, F> GetL1NormalizedTFIDFDatum(IDatum <L, F> datum, ICounter <F> featureDocCounts)
        {
            ICounter <F> weights = new ClassicCounter <F>();

            // Pass 1: term frequencies, restricted to features with a known document count.
            foreach (F candidate in datum.AsFeatures())
            {
                if (!featureDocCounts.ContainsKey(candidate))
                {
                    continue;
                }
                weights.IncrementCount(candidate, 1.0);
            }

            // Pass 2: scale each term frequency by its idf and accumulate the normalizer.
            double normalizer = 0;
            foreach (F key in weights.KeySet())
            {
                double idf    = Math.Log(((double)(this.Size() + 1)) / (featureDocCounts.GetCount(key) + 0.5));
                double tf     = weights.GetCount(key);
                double scaled = tf * idf;
                weights.SetCount(key, scaled);
                normalizer += scaled;
            }

            // Pass 3: divide every weight by the accumulated sum.
            foreach (F key in weights.KeySet())
            {
                weights.SetCount(key, weights.GetCount(key) / normalizer);
            }

            return new RVFDatum <L, F>(weights, datum.Label());
        }
示例#4
0
        /// <summary>Scores a real-valued datum for both classes.</summary>
        /// <param name="example">Datum whose feature counter is passed to <c>ScoreOf</c>.</param>
        /// <returns>
        /// A counter that maps <c>classes[0]</c> to the negated score and <c>classes[1]</c> to the score.
        /// </returns>
        private ICounter <L> ScoresOfRVFDatum(RVFDatum <L, F> example)
        {
            ICounter <F> featureCounts = example.AsFeaturesCounter();
            double       score         = ScoreOf(featureCounts);

            ICounter <L> result = new ClassicCounter <L>();
            result.SetCount(classes[0], -score);
            result.SetCount(classes[1], score);
            return result;
        }
示例#5
0
        /// <summary>Returns the scores for both the classes.</summary>
        /// <remarks>
        /// A datum that is an <c>RVFDatum&lt;L, F&gt;</c> is scored through its feature counter;
        /// any other datum is scored through its feature collection.
        /// </remarks>
        /// <param name="datum">The datum to score.</param>
        /// <returns>
        /// A counter that maps <c>classes[0]</c> to the negated score and <c>classes[1]</c> to the score.
        /// </returns>
        public virtual ICounter <L> ScoresOf(IDatum <L, F> datum)
        {
            // C# generic classes are invariant: an RVFDatum<L, F> is not an RVFDatum<object, object>,
            // so the check has to be made against the closed type that is actually used here.
            RVFDatum <L, F> rvfDatum = datum as RVFDatum <L, F>;

            if (rvfDatum != null)
            {
                return(ScoresOfRVFDatum(rvfDatum));
            }
            ICollection <F> features = datum.AsFeatures();
            double          sum      = ScoreOf(features);
            ICounter <L>    c        = new ClassicCounter <L>();

            c.SetCount(classes[0], -sum);
            c.SetCount(classes[1], sum);
            return(c);
        }
        /// <summary>
        /// Creates an entity mention covering the tokens from <paramref name="start"/> to
        /// <paramref name="end"/> of a sentence and gives its type a probability of 1.0.
        /// </summary>
        /// <param name="sentence">Sentence that contains the mention.</param>
        /// <param name="start">Start offset passed to the <c>Span</c> constructor.</param>
        /// <param name="end">End offset passed to the <c>Span</c> constructor.</param>
        /// <param name="label">
        /// Mention label. A leading "B-" or "I-" is stripped; any other label is used unchanged as the type.
        /// </param>
        /// <param name="identifier">Identifier handed to the mention factory.</param>
        /// <returns>The newly constructed mention.</returns>
        public virtual EntityMention MakeEntityMention(ICoreMap sentence, int start, int end, string label, string identifier)
        {
            Span span = new Span(start, end);

            // The prefixes are machine-generated tags, so compare them ordinally rather than
            // with the culture-sensitive default of string.StartsWith(string).
            bool hasPrefix = label.StartsWith("B-", System.StringComparison.Ordinal) ||
                             label.StartsWith("I-", System.StringComparison.Ordinal);
            string type = hasPrefix ? Sharpen.Runtime.Substring(label, 2) : label;

            // TODO: add support for subtypes! (needed at least in ACE)
            string subtype = null;

            // The same span is passed twice to the factory.
            EntityMention     entity = entityMentionFactory.ConstructEntityMention(identifier, sentence, span, span, type, subtype, null);
            ICounter <string> probs  = new ClassicCounter <string>();

            probs.SetCount(entity.GetType(), 1.0);
            entity.SetTypeProbabilities(probs);
            return(entity);
        }
示例#7
0
        /// <summary>
        /// Scores every pattern by combining the share of its words found in <c>p0Set</c>
        /// (specificity) with the share of <c>p0Set</c> it covers (sensitivity).
        /// </summary>
        /// <remarks>
        /// The result is <c>Scale(Division(product, add), 2)</c> of the two counters, i.e.
        /// 2 * sensitivity * specificity / (sensitivity + specificity), assuming the
        /// <c>Counters</c> helpers operate key by key. Patterns that share no word with
        /// <c>p0Set</c> receive no score.
        /// </remarks>
        /// <returns>Counter from pattern to combined score.</returns>
        public override ICounter <E> Score()
        {
            if (p0Set.KeySet().Count == 0)
            {
                throw new Exception("how come p0set size is empty for " + p0 + "?");
            }

            ICounter <E> specificity = new ClassicCounter <E>();
            ICounter <E> sensitivity = new ClassicCounter <E>();

            foreach (KeyValuePair <E, ClassicCounter <CandidatePhrase> > entry in patternsandWords4Label.EntrySet())
            {
                ClassicCounter <CandidatePhrase> words = entry.Value;
                int overlap = CollectionUtils.Intersection(words.KeySet(), p0Set.KeySet()).Count;

                if (overlap != 0)
                {
                    if (words.KeySet().Count == 0)
                    {
                        throw new Exception("how come counter for " + entry.Key + " is empty?");
                    }
                    specificity.SetCount(entry.Key, overlap / (double)words.KeySet().Count);
                    sensitivity.SetCount(entry.Key, overlap / (double)p0Set.Size());
                }
            }

            Counters.RetainNonZeros(specificity);
            Counters.RetainNonZeros(sensitivity);

            ICounter <E> sum     = Counters.Add(sensitivity, specificity);
            ICounter <E> product = Counters.Product(sensitivity, specificity);

            // Keep only non-zero products whose key also appears in the sum.
            Counters.RetainNonZeros(product);
            Counters.RetainKeys(product, sum.KeySet());

            return Counters.Scale(Counters.Division(product, sum), 2);
        }
 /// <summary>
 /// Completes training by turning the per-tag counts in <c>c</c> into log probabilities
 /// stored in <c>tagHash</c>, then returns the model.
 /// </summary>
 /// <remarks>
 /// For every tag the tag count is incremented and the UNKNOWN entry is set to 1.0 before
 /// the probabilities are computed. The original author describes this as "sort of broken",
 /// but it can be regarded as a Dirichlet prior.
 /// </remarks>
 /// <returns>The <c>model</c> field.</returns>
 public override IUnknownWordModel FinishTraining()
 {
     if (useGT)
     {
         unknownGTTrainer.FinishTraining();
     }
     // Outer loop: one entry per tag, holding the counts seen under that tag.
     foreach (KeyValuePair <ILabel, ClassicCounter <string> > tagEntry in c)
     {
         ILabel tag = tagEntry.Key;
         ClassicCounter <string> countsForTag = tagEntry.Value;

         if (!tagHash.Contains(tag))
         {
             tagHash[tag] = new ClassicCounter <string>();
         }
         // The UNKNOWN sequence is assumed to be seen once in each tag.
         tc.IncrementCount(tag);
         countsForTag.SetCount(UnknownWordModelTrainerConstants.unknown, 1.0);

         // The tag total does not change inside the inner loop, so read it once.
         double tagTotal = tc.GetCount(tag);

         // Inner loop: log p(key | tag) for every key counted under this tag.
         foreach (string key in countsForTag.KeySet())
         {
             double logProb = Math.Log(countsForTag.GetCount(key) / tagTotal);
             tagHash[tag].SetCount(key, logProb);
         }
     }
     return model;
 }
 /// <summary>
 /// Completes training by converting the counts stored per tag label in <c>c</c> into log
 /// probabilities in <c>tagHash</c>, then returns the model.
 /// </summary>
 /// <remarks>
 /// Each tag's count is incremented and its UNKNOWN entry set to 1.0 first, i.e. the UNKNOWN
 /// first character is assumed to be seen once in each tag (flagged as questionable by the
 /// original author).
 /// </remarks>
 /// <returns>The <c>model</c> field.</returns>
 public override IUnknownWordModel FinishTraining()
 {
     if (useGT)
     {
         unknownGTTrainer.FinishTraining();
     }
     // Outer loop: tags, as labels.
     foreach (ILabel tag in c.Keys)
     {
         ClassicCounter <string> countsForTag = c[tag];

         if (!tagHash.Contains(tag))
         {
             tagHash[tag] = new ClassicCounter <string>();
         }
         tc.IncrementCount(tag);
         countsForTag.SetCount(UnknownWordModelTrainerConstants.unknown, 1.0);

         // The tag total does not change inside the inner loop, so read it once.
         double tagTotal = tc.GetCount(tag);

         // Inner loop: the string keys counted under this tag.
         foreach (string key in countsForTag.KeySet())
         {
             double logProb = Math.Log(countsForTag.GetCount(key) / tagTotal);
             tagHash[tag].SetCount(key, logProb);
         }
     }
     return model;
 }
        /// <summary>
        /// Reads tab-separated signature counts from <paramref name="file"/> into
        /// <paramref name="sigs"/>.
        /// </summary>
        /// <remarks>
        /// Each line is split on tabs: the first field becomes the dictionary key and the
        /// remaining fields are consumed as (name, value) pairs. An existing entry with the same
        /// key is replaced. A trailing field without a partner (for example after a trailing
        /// tab) is ignored.
        /// </remarks>
        /// <param name="file">Handed to <c>IOUtils.ReaderFromString</c> to open the reader.</param>
        /// <param name="sigs">Dictionary that receives one counter per line.</param>
        /// <exception cref="Exception">Wraps any <c>IOException</c> raised while reading.</exception>
        private static void LoadSignatures(string file, IDictionary <string, ICounter <string> > sigs)
        {
            BufferedReader reader = null;

            try
            {
                reader = IOUtils.ReaderFromString(file);
                while (reader.Ready())
                {
                    string line = reader.ReadLine();
                    if (line == null)
                    {
                        // End of input reached even though the reader reported ready.
                        break;
                    }
                    string[]          split = line.Split('\t');
                    ICounter <string> cntr  = new ClassicCounter <string>();
                    sigs[split[0]] = cntr;
                    // Only read complete pairs: requiring i + 1 to be in range keeps a dangling
                    // field from indexing past the end of the array.
                    for (int i = 1; i + 1 < split.Length; i += 2)
                    {
                        // Values are machine-written, so parse them independently of the current culture.
                        cntr.SetCount(split[i], double.Parse(split[i + 1], System.Globalization.CultureInfo.InvariantCulture));
                    }
                }
            }
            catch (IOException e)
            {
                // Keep the original failure as the inner exception.
                throw new Exception(e.Message, e);
            }
            finally
            {
                IOUtils.CloseIgnoringExceptions(reader);
            }
        }
        // Sparse AdaGrad-style minimizer with lazily applied regularization.
        //
        // Does L1 or L2 using FOBOS and lazy update, so L1 should not be handled in the
        // objective.
        // Alternatively, you can handle other regularization in the objective,
        // but then, if the derivative is not sparse, this routine would not be very
        // efficient. However, might still be okay for CRFs.
        //
        // Parameters:
        //   function      - objective; queried for DataSize(), DerivativeAt() and ValueAt() on sampled data.
        //   x             - weights; updated in place (entries are set or removed) and returned.
        //   maxIterations - compared against timeStep, which is incremented once per updated feature.
        //
        // NOTE(review): Math.Signum and Console.Out.Format with "%d"/"%.2f" specifiers look like
        // Java-style APIs carried over by the port — confirm that they resolve and format as intended.
        public virtual ICounter <K> Minimize(F function, ICounter <K> x, int maxIterations)
        {
            Sayln("       Batch size of: " + batchSize);
            Sayln("       Data dimension of: " + function.DataSize());
            // Integer arithmetic: number of batches needed to cover DataSize() items (rounded up for positive sizes).
            int numBatches = (function.DataSize() - 1) / this.batchSize + 1;

            Sayln("       Batches per pass through data:  " + numBatches);
            Sayln("       Number of passes is = " + numPasses);
            Sayln("       Max iterations is = " + maxIterations);
            // Per feature: the timeStep value at which the feature was last updated.
            ICounter <K> lastUpdated = new ClassicCounter <K>();
            int          timeStep    = 0;
            Timing       total       = new Timing();

            total.Start();
            for (int iter = 0; iter < numPasses; iter++)
            {
                // Sum of the objective values of the batches processed in this pass.
                double totalObjValue = 0;
                for (int j = 0; j < numBatches; j++)
                {
                    int[] selectedData = GetSample(function, this.batchSize);
                    // the core adagrad
                    ICounter <K> gradient = function.DerivativeAt(x, selectedData);
                    totalObjValue = totalObjValue + function.ValueAt(x, selectedData);
                    foreach (K feature in gradient.KeySet())
                    {
                        double gradf              = gradient.GetCount(feature);
                        // Rate from the squared-gradient sum BEFORE this gradient is added; must be read first.
                        double prevrate           = eta / (Math.Sqrt(sumGradSquare.GetCount(feature)) + soften);
                        // IncrementCount's return value is used as the updated squared-gradient sum.
                        double sgsValue           = sumGradSquare.IncrementCount(feature, gradf * gradf);
                        double currentrate        = eta / (Math.Sqrt(sgsValue) + soften);
                        // Plain gradient step, before any regularization is applied.
                        double testupdate         = x.GetCount(feature) - (currentrate * gradient.GetCount(feature));
                        double lastUpdateTimeStep = lastUpdated.GetCount(feature);
                        // Number of time steps since this feature was last touched, excluding the current one.
                        double idleinterval       = timeStep - lastUpdateTimeStep - 1;
                        lastUpdated.SetCount(feature, (double)timeStep);
                        // does lazy update using idleinterval
                        // L1: shrink the magnitude by (currentrate + prevrate * idleinterval) * lambdaL1, clamped at 0.
                        double trunc      = Math.Max(0.0, (Math.Abs(testupdate) - (currentrate + prevrate * idleinterval) * this.lambdaL1));
                        // L2: scale by (1 - lambdaL2) raised to the same accumulated rate.
                        double trunc2     = trunc * Math.Pow(1 - this.lambdaL2, currentrate + prevrate * idleinterval);
                        double realupdate = Math.Signum(testupdate) * trunc2;
                        // NOTE(review): this comparison is signed, so every negative realupdate also
                        // removes the feature — confirm whether an absolute-value test was intended.
                        if (realupdate < Eps)
                        {
                            x.Remove(feature);
                        }
                        else
                        {
                            x.SetCount(feature, realupdate);
                        }
                        // reporting
                        timeStep++;
                        if (timeStep > maxIterations)
                        {
                            Sayln("Stochastic Optimization complete.  Stopped after max iterations");
                            // NOTE(review): this break leaves only the per-feature loop; the batch and
                            // pass loops keep running — confirm whether a full stop was intended.
                            break;
                        }
                        Sayln(System.Console.Out.Format("Iter %d \t batch: %d \t time=%.2f \t obj=%.4f", iter, timeStep, total.Report() / 1000.0, totalObjValue).ToString());
                    }
                }
            }
            return(x);
        }
        // Quick little sanity check: trains a KNN classifier on five labelled
        // (humidity, temperature) readings, then prints the scores and the predicted
        // class for a sixth, unlabelled reading.
        public static void Main(string[] args)
        {
            // Training readings as { humidity, temperature }, in insertion order, with their labels.
            double[][] readings = new double[][]
            {
                new double[] { 5.0, 35.0 },
                new double[] { 4.0, 32.0 },
                new double[] { 6.0, 30.0 },
                new double[] { 2.0, 33.0 },
                new double[] { 1.0, 34.0 }
            };
            string[] outcomes = new string[] { "rain", "rain", "rain", "dry", "dry" };

            ICollection <RVFDatum <string, string> > trainingInstances = new List <RVFDatum <string, string> >();

            for (int row = 0; row < readings.Length; row++)
            {
                ClassicCounter <string> features = new ClassicCounter <string>();
                features.SetCount("humidity", readings[row][0]);
                features.SetCount("temperature", readings[row][1]);
                trainingInstances.Add(new RVFDatum <string, string>(features, outcomes[row]));
            }

            Edu.Stanford.Nlp.Classify.KNNClassifier <string, string> classifier = new KNNClassifierFactory <string, string>(3, false, true).Train(trainingInstances);

            // Query with the same feature values as the fourth training reading.
            ClassicCounter <string> query = new ClassicCounter <string>();
            query.SetCount("humidity", 2.0);
            query.SetCount("temperature", 33.0);
            RVFDatum <string, string> testVec = new RVFDatum <string, string>(query);

            System.Console.Out.WriteLine(classifier.ScoresOf(testVec));
            System.Console.Out.WriteLine(classifier.ClassOf(testVec));
        }
示例#13
0
        /// <summary>
        /// Selects candidate phrases from <paramref name="newdt"/>, visiting them in the order
        /// produced by <c>Counters.ToPriorityQueue</c>, until <c>constVars.numWordsToAdd</c>
        /// phrases are chosen or a score falls below <paramref name="thresholdWordExtract"/>.
        /// </summary>
        /// <param name="newdt">Scores of the candidate phrases.</param>
        /// <param name="terms">Pattern counts per phrase, used for the non-redundant pattern check.</param>
        /// <param name="useThresholdNumPatternsForTheseWords">
        /// Phrases that must additionally reach <c>constVars.thresholdNumPatternsApplied</c> non-redundant patterns.
        /// </param>
        /// <param name="ignoreWords">
        /// Words used for fuzzy matching; may be null. A candidate that fuzzily matches one of them is
        /// rejected and added to this collection.
        /// </param>
        /// <param name="thresholdWordExtract">Minimum score a phrase needs to be considered.</param>
        /// <returns>Counter holding the selected phrases with their scores from <paramref name="newdt"/>.</returns>
        public virtual ICounter <CandidatePhrase> ChooseTopWords(ICounter <CandidatePhrase> newdt, TwoDimensionalCounter <CandidatePhrase, E> terms, ICounter <CandidatePhrase> useThresholdNumPatternsForTheseWords, ICollection <CandidatePhrase> ignoreWords
                                                                 , double thresholdWordExtract)
        {
            ICounter <CandidatePhrase> finalwords = new ClassicCounter <CandidatePhrase>();

            // Dispose the enumerator once both loops are done with it.
            using (IEnumerator <CandidatePhrase> termIter = Counters.ToPriorityQueue(newdt).GetEnumerator())
            {
                // Test the size limit BEFORE advancing: MoveNext() consumes an element, and a phrase
                // consumed only to hit the limit would be missing from the "next ten" log below.
                while (finalwords.Size() < constVars.numWordsToAdd && termIter.MoveNext())
                {
                    CandidatePhrase w = termIter.Current;
                    if (newdt.GetCount(w) < thresholdWordExtract)
                    {
                        Redwood.Log(ConstantsAndVariables.extremedebug, "not adding word " + w + " and any later words because the score " + newdt.GetCount(w) + " is less than the threshold of  " + thresholdWordExtract);
                        break;
                    }
                    System.Diagnostics.Debug.Assert((newdt.GetCount(w) != double.PositiveInfinity));
                    if (useThresholdNumPatternsForTheseWords.ContainsKey(w) && NumNonRedundantPatterns(terms, w) < constVars.thresholdNumPatternsApplied)
                    {
                        Redwood.Log("extremePatDebug", "Not adding " + w + " because the number of non redundant patterns are below threshold of " + constVars.thresholdNumPatternsApplied + ":" + terms.GetCounter(w).KeySet());
                        continue;
                    }
                    // Fuzzy matching only runs when it is enabled and there are words to match against.
                    CandidatePhrase matchedFuzzy = null;
                    if (constVars.minLen4FuzzyForPattern > 0 && ignoreWords != null)
                    {
                        matchedFuzzy = ConstantsAndVariables.ContainsFuzzy(ignoreWords, w, constVars.minLen4FuzzyForPattern);
                    }
                    if (matchedFuzzy == null)
                    {
                        Redwood.Log("extremePatDebug", "adding word " + w);
                        finalwords.SetCount(w, newdt.GetCount(w));
                    }
                    else
                    {
                        // matchedFuzzy is only non-null when ignoreWords is non-null.
                        Redwood.Log("extremePatDebug", "not adding " + w + " because it matched " + matchedFuzzy + " in common English word");
                        ignoreWords.Add(w);
                    }
                }

                // Log up to ten of the phrases that follow the ones examined above.
                string nextTen = string.Empty;
                int    n       = 0;

                while (n < 10 && termIter.MoveNext())
                {
                    n++;
                    CandidatePhrase w = termIter.Current;
                    nextTen += ";\t" + w + ":" + newdt.GetCount(w);
                }
                Redwood.Log(Redwood.Dbg, "Next ten phrases were " + nextTen);
            }
            return(finalwords);
        }
        /// <summary>
        /// Builds the feature datum for token <paramref name="i"/> of a sentence.
        /// </summary>
        /// <remarks>
        /// The label is <c>answerLabel</c> when the token's <c>answerClass</c> value equals it,
        /// otherwise "O". One "Cluster-&lt;id&gt;" feature is set per matched phrase (or for the
        /// token's own word when no matched phrases are recorded); a phrase without a cluster id
        /// uses -1. The context window is currently 0, so no PREV-/NEXT- features are produced.
        /// </remarks>
        /// <param name="sent">Tokens of the sentence.</param>
        /// <param name="i">Index of the token to featurize.</param>
        /// <returns>Datum with the collected features and the label.</returns>
        private RVFDatum <string, string> GetDatum(CoreLabel[] sent, int i)
        {
            ICounter <string> feat = new ClassicCounter <string>();
            CoreLabel         l    = sent[i];
            string            label;

            if (l.Get(answerClass).ToString().Equals(answerLabel))
            {
                label = answerLabel;
            }
            else
            {
                label = "O";
            }
            CollectionValuedMap <string, CandidatePhrase> matchedPhrases = l.Get(typeof(PatternsAnnotations.MatchedPhrases));

            if (matchedPhrases == null)
            {
                // No recorded matches: fall back to the token's own word under its label.
                matchedPhrases = new CollectionValuedMap <string, CandidatePhrase>();
                matchedPhrases.Add(label, CandidatePhrase.CreateOrGet(l.Word()));
            }
            foreach (CandidatePhrase w in matchedPhrases.AllValues())
            {
                // An int can never be null, and the indexer throws for a missing key, so the
                // "unknown phrase" case has to be detected with TryGetValue.
                int num;
                if (!this.clusterIds.TryGetValue(w.GetPhrase(), out num))
                {
                    num = -1;
                }
                feat.SetCount("Cluster-" + num, 1.0);
            }
            // feat.incrementCount("WORD-" + l.word());
            // feat.incrementCount("LEMMA-" + l.lemma());
            // feat.incrementCount("TAG-" + l.tag());
            // With a window of 0 neither of the two loops below executes.
            int window = 0;

            for (int j = Math.Max(0, i - window); j < i; j++)
            {
                CoreLabel lj = sent[j];
                feat.IncrementCount("PREV-" + "WORD-" + lj.Word());
                feat.IncrementCount("PREV-" + "LEMMA-" + lj.Lemma());
                feat.IncrementCount("PREV-" + "TAG-" + lj.Tag());
            }
            for (int j_1 = i + 1; j_1 < sent.Length && j_1 <= i + window; j_1++)
            {
                CoreLabel lj = sent[j_1];
                feat.IncrementCount("NEXT-" + "WORD-" + lj.Word());
                feat.IncrementCount("NEXT-" + "LEMMA-" + lj.Lemma());
                feat.IncrementCount("NEXT-" + "TAG-" + lj.Tag());
            }
            // System.out.println("adding " + l.word() + " as " + label);
            return(new RVFDatum <string, string>(feat, label));
        }
示例#15
0
        /// <summary>Exposes the weight vector as a counter keyed by feature.</summary>
        /// <returns>
        /// A new counter containing each feature of <c>featureIndex</c> whose weight is not
        /// exactly 0.0, mapped to that weight.
        /// </returns>
        public virtual ICounter <F> WeightsAsCounter()
        {
            ICounter <F> nonZeroWeights = new ClassicCounter <F>();

            foreach (F feature in featureIndex)
            {
                double weight = weights[featureIndex.IndexOf(feature)];
                if (weight == 0.0)
                {
                    // Zero weights are left out of the result.
                    continue;
                }
                nonZeroWeights.SetCount(feature, weight);
            }
            return nonZeroWeights;
        }
        /// <summary>Computes the fraction of correct labels for every recorded length.</summary>
        /// <returns>
        /// A counter mapping each key found in <c>lengthLabelsCorrect</c> or
        /// <c>lengthLabelsIncorrect</c> to correct / (correct + incorrect).
        /// </returns>
        public virtual ICounter <int> LengthAccuracies()
        {
            // Union of the keys seen in either counter.
            ICollection <int> lengths = Generics.NewHashSet();

            Sharpen.Collections.AddAll(lengths, lengthLabelsCorrect.KeySet());
            Sharpen.Collections.AddAll(lengths, lengthLabelsIncorrect.KeySet());

            ICounter <int> accuracies = new ClassicCounter <int>();

            foreach (int length in lengths)
            {
                double correct   = lengthLabelsCorrect.GetCount(length);
                double incorrect = lengthLabelsIncorrect.GetCount(length);
                accuracies.SetCount(length, correct / (correct + incorrect));
            }
            return accuracies;
        }
示例#17
0
        /// <summary>
        /// Builds two surface patterns from separately constructed contexts and tokens, then checks
        /// that they compare as equal and occupy a single slot in a counter and in a hash index.
        /// </summary>
        /// <remarks>
        /// The checks use <c>System.Diagnostics.Debug.Assert</c>, which is only compiled into
        /// builds that define the DEBUG symbol.
        /// </remarks>
        public virtual void TestSimplerTokens()
        {
            // First pattern.
            IDictionary <Type, string> firstPrev    = new _Dictionary_44();
            IDictionary <Type, string> firstNext    = new _Dictionary_49();
            PatternToken               firstToken   = new PatternToken("V", false, true, 2, null, false, false, null);
            SurfacePattern             firstPattern = new SurfacePattern(CreateContext(firstPrev), firstToken, CreateContext(firstNext), SurfacePatternFactory.Genre.Prevnext);

            // Second pattern, built from its own context dictionaries and token.
            IDictionary <Type, string> secondPrev    = new _Dictionary_58();
            IDictionary <Type, string> secondNext    = new _Dictionary_63();
            PatternToken               secondToken   = new PatternToken("V", false, true, 2, null, false, false, null);
            SurfacePattern             secondPattern = new SurfacePattern(CreateContext(secondPrev), secondToken, CreateContext(secondNext), SurfacePatternFactory.Genre.Prevnext);

            // The two patterns must compare as equal.
            System.Diagnostics.Debug.Assert(firstPattern.CompareTo(secondPattern) == 0);

            // Setting counts for both must leave a single key in the counter.
            ICounter <SurfacePattern> counter = new ClassicCounter <SurfacePattern>();
            counter.SetCount(firstPattern, 1);
            counter.SetCount(secondPattern, 1);
            System.Diagnostics.Debug.Assert(counter.Size() == 1);
            System.Console.Out.WriteLine("pats size is " + counter.Size());

            // Adding both to the index must leave a single entry.
            ConcurrentHashIndex <SurfacePattern> hashIndex = new ConcurrentHashIndex <SurfacePattern>();
            hashIndex.Add(firstPattern);
            hashIndex.Add(secondPattern);
            System.Diagnostics.Debug.Assert(hashIndex.Count == 1);
        }
示例#18
0
        /// <summary>
        /// Scores a datum against every label by asking that label's binary classifier for the
        /// score of its positive class.
        /// </summary>
        /// <param name="example">The datum to score.</param>
        /// <returns>Counter from each label in <c>labelIndex</c> to its positive-class score.</returns>
        public virtual ICounter <L> ScoresOf(IDatum <L, F> example)
        {
            ICounter <L> result = new ClassicCounter <L>();

            foreach (L candidate in labelIndex)
            {
                // Relabel the datum: the candidate label maps to PosLabel, NegLabel is passed as the fallback.
                IDictionary <L, string> relabeling = new ArrayMap <L, string>();
                relabeling[candidate] = PosLabel;
                IDatum <string, F> binaryDatum = GeneralDataset.MapDatum(example, relabeling, NegLabel);

                IClassifier <string, F> classifier   = GetBinaryClassifier(candidate);
                ICounter <string>       binaryScores = classifier.ScoresOf(binaryDatum);

                result.SetCount(candidate, binaryScores.GetCount(PosLabel));
            }
            return result;
        }
        /// <summary>Exposes the weight matrix as one feature counter per label.</summary>
        /// <returns>
        /// A dictionary from the label at each row index of <c>weights</c> to a counter holding
        /// the features whose weight in that row is not exactly 0.0.
        /// </returns>
        public virtual IDictionary <L, ICounter <F> > WeightsAsGenericCounter()
        {
            IDictionary <L, ICounter <F> > weightsByLabel = new Dictionary <L, ICounter <F> >();

            for (int row = 0; row < weights.Length; row++)
            {
                double[]     rowWeights = weights[row];
                ICounter <F> nonZero    = new ClassicCounter <F>();

                foreach (F feature in featureIndex)
                {
                    int column = featureIndex.IndexOf(feature);
                    if (rowWeights[column] == 0.0)
                    {
                        // Zero weights are left out of the counter.
                        continue;
                    }
                    nonZero.SetCount(feature, rowWeights[column]);
                }
                weightsByLabel[labelIndex.Get(row)] = nonZero;
            }
            return weightsByLabel;
        }
示例#20
0
        // todo: Fix javadoc, have unit tested
        /// <summary>Print SVM Light Format file.</summary>
        /// <remarks>
        /// Writes one line per datum: the label taken from <c>MakeSvmLabelMap()</c>, followed by
        /// <c>index:value</c> pairs in ascending feature-index order, where the printed index is
        /// the feature index plus one. Values are formatted with the invariant culture so the
        /// decimal separator is always a period.
        /// History: Ramesh (12/17/2009) switched to printing the exact label id; mihai
        /// (08/31/2010) restored the label map because svm_light has conventions the raw labels
        /// do not follow, e.g. multiclass labels start at 1 whereas these labels start at 0.
        /// </remarks>
        /// <param name="pw">Writer that receives the formatted lines.</param>
        public virtual void PrintSVMLightFormat(PrintWriter pw)
        {
            // Assumes each data item has a few features on, and sorts the feature keys while
            // collecting the values in a counter.
            string[] labelMap = MakeSvmLabelMap();
            for (int i = 0; i < size; i++)
            {
                RVFDatum <L, F>      d      = GetRVFDatum(i);
                ICounter <F>         c      = d.AsFeaturesCounter();
                // Re-key the feature values by feature index.
                ClassicCounter <int> printC = new ClassicCounter <int>();
                foreach (F f in c.KeySet())
                {
                    printC.SetCount(featureIndex.IndexOf(f), c.GetCount(f));
                }
                int[] features = Sharpen.Collections.ToArray(printC.KeySet(), new int[printC.KeySet().Count]);
                Arrays.Sort(features);
                StringBuilder sb = new StringBuilder();
                // labels[i] alone would break svm_light conventions, hence the label map.
                sb.Append(labelMap[labels[i]]).Append(' ');

                // Values are read from printC (keyed by index) rather than c (keyed by feature);
                // the original author (Sarah Spikes) noted this is presumably what was meant.
                foreach (int f_1 in features)
                {
                    // StringBuilder.Append(double) would use the current culture, which may emit
                    // a decimal comma; format the value explicitly instead.
                    string value = printC.GetCount(f_1).ToString(System.Globalization.CultureInfo.InvariantCulture);
                    sb.Append((f_1 + 1)).Append(':').Append(value).Append(' ');
                }
                pw.Println(sb.ToString());
            }
        }
 /// <summary>
 /// Given an instance to classify, scores and returns
 /// score by class.
 /// </summary>
 /// <remarks>
 /// Every stored instance is scored with <c>Counters.Cosine</c> against the datum's feature
 /// counter. The first <c>k</c> entries of <c>Counters.ToSortedList</c> then vote for their
 /// class, each with weight 1.0 or, when <c>weightedVotes</c> is set, with its cosine score.
 /// When <c>l2Normalize</c> is set the feature counter is copied and passed through
 /// <c>Counters.Normalize</c> first.
 /// NOTE: supports only RVFDatums.
 /// </remarks>
 /// <param name="datum">The instance to classify.</param>
 /// <returns>Accumulated votes per class, or null when <paramref name="datum"/> is not an RVFDatum.</returns>
 public virtual ClassicCounter <K> ScoresOf(IDatum <K, V> datum)
 {
     // C# generic classes are invariant: an RVFDatum<K, V> is not an RVFDatum<object, object>,
     // so the check has to be made against the closed type that is actually used here.
     RVFDatum <K, V> vec = datum as RVFDatum <K, V>;

     if (vec == null)
     {
         // Unsupported datum type; callers receive null, as before.
         return(null);
     }
     if (l2Normalize)
     {
         // Normalize a copy so the caller's datum is left untouched.
         ClassicCounter <V> featVec = new ClassicCounter <V>(vec.AsFeaturesCounter());
         Counters.Normalize(featVec);
         vec = new RVFDatum <K, V>(featVec);
     }
     // Score for every stored instance.
     ClassicCounter <ICounter <V> > scores = new ClassicCounter <ICounter <V> >();
     foreach (ICounter <V> instance in instances.AllValues())
     {
         scores.SetCount(instance, Counters.Cosine(vec.AsFeaturesCounter(), instance));
     }
     IList <ICounter <V> > sorted      = Counters.ToSortedList(scores);
     ClassicCounter <K>    classScores = new ClassicCounter <K>();
     // Let up to k instances from the front of the sorted list vote for their class.
     for (int i = 0; i < k && i < sorted.Count; i++)
     {
         K      label = classLookup[sorted[i]];
         double count = 1.0;
         if (weightedVotes)
         {
             count = scores.GetCount(sorted[i]);
         }
         classScores.IncrementCount(label, count);
     }
     return(classScores);
 }
示例#22
0
        /// <summary>
        /// Prints parent- and grandparent-annotation statistics to standard output and emits
        /// Java source for the <c>splitters</c> tables derived from them.
        /// </summary>
        /// <remarks>
        /// For every category in <c>nodeRules</c>, each parent-annotated key in <c>pRules</c> whose
        /// first element is that category is scored as
        /// <c>KL(annotated || unannotated) * support(annotated)</c>. The same is then done for each
        /// grandparent-annotated key in <c>gPRules</c> against the matching entry of <c>pRules</c>
        /// (entries whose support is below <c>Suppcutoff</c> are skipped). Keys whose score reaches
        /// <c>Cutoffs[j]</c> are appended to the j-th generated <c>splitters</c> array.
        /// </remarks>
        public virtual void PrintStats()
        {
            NumberFormat nf = NumberFormat.GetNumberInstance();
            nf.SetMaximumFractionDigits(2);

            // Store java code for selSplit: one array initializer per cutoff.
            StringBuilder[] javaSB = new StringBuilder[Cutoffs.Length];
            for (int i = 0; i < Cutoffs.Length; i++)
            {
                javaSB[i] = new StringBuilder("  private static String[] splitters" + (i + 1) + " = new String[] {");
            }
            ClassicCounter<IList<string>> allScores = new ClassicCounter<IList<string>>();

            // do value of parent
            foreach (string node in nodeRules.Keys)
            {
                List<Pair<IList<string>, double>> answers = new List<Pair<IList<string>, double>>();
                ClassicCounter<IList<string>> cntr = nodeRules[node];
                double support = cntr.TotalCount();
                System.Console.Out.WriteLine("Node " + node + " support is " + support);
                foreach (IList<string> key in pRules.Keys)
                {
                    if (!key[0].Equals(node))
                    {
                        // only do it if they match
                        continue;
                    }
                    ClassicCounter<IList<string>> cntr2 = pRules[key];
                    double support2 = cntr2.TotalCount();
                    double kl = Counters.KlDivergence(cntr2, cntr);
                    System.Console.Out.WriteLine("KL(" + FormatKey(key) + "||" + node + ") = " + nf.Format(kl) + "\t" + "support(" + FormatKey(key) + ") = " + support2);
                    double score = kl * support2;
                    answers.Add(new Pair<IList<string>, double>(key, score));
                    allScores.SetCount(key, score);
                }
                ReportRankedAnswers(answers, nf, javaSB, false);
                System.Console.Out.WriteLine();
            }

            // NOTE: the Java original carried a commented-out, unfinished sketch of an
            // information-gain variant of the parent analysis at this point; it was never
            // completed and is intentionally not reproduced here.

            // do value of grandparent
            foreach (IList<string> parentKey in pRules.Keys)
            {
                List<Pair<IList<string>, double>> answers = new List<Pair<IList<string>, double>>();
                ClassicCounter<IList<string>> cntr = pRules[parentKey];
                double support = cntr.TotalCount();
                if (support < Suppcutoff)
                {
                    continue;
                }
                System.Console.Out.WriteLine("Node " + FormatKey(parentKey) + " support is " + support);
                foreach (IList<string> key in gPRules.Keys)
                {
                    if (!(key[0].Equals(parentKey[0]) && key[1].Equals(parentKey[1])))
                    {
                        // only do it if they match
                        continue;
                    }
                    ClassicCounter<IList<string>> cntr2 = gPRules[key];
                    double support2 = cntr2.TotalCount();
                    double kl = Counters.KlDivergence(cntr2, cntr);
                    System.Console.Out.WriteLine("KL(" + FormatKey(key) + "||" + FormatKey(parentKey) + ") = " + nf.Format(kl) + "\t" + "support(" + FormatKey(key) + ") = " + support2);
                    double score = kl * support2;
                    answers.Add(new Pair<IList<string>, double>(key, score));
                    allScores.SetCount(key, score);
                }
                ReportRankedAnswers(answers, nf, javaSB, true);
                System.Console.Out.WriteLine();
            }

            System.Console.Out.WriteLine();
            System.Console.Out.WriteLine("All scores:");
            IPriorityQueue<IList<string>> pq = Counters.ToPriorityQueue(allScores);
            while (!pq.IsEmpty())
            {
                IList<string> key = pq.GetFirst();
                double score = pq.GetPriority(key);
                pq.RemoveFirst();
                System.Console.Out.WriteLine(FormatKey(key) + "\t" + score);
            }

            System.Console.Out.WriteLine("  // Automatically generated by ParentAnnotationStats -- preferably don't edit");
            for (int i = 0; i < Cutoffs.Length; i++)
            {
                StringBuilder sb = javaSB[i];
                int len = sb.Length;
                // Every appended entry ends in ", "; drop that separator before closing the
                // initializer. When nothing was appended the builder still ends in "{", which
                // must be kept so the generated Java reads "new String[] {};".
                if (len >= 2 && sb[len - 2] == ',' && sb[len - 1] == ' ')
                {
                    sb.Length = len - 2;
                }
                sb.Append("};");
                System.Console.Out.WriteLine(sb.ToString());
            }

            System.Console.Out.Write("  public static HashSet splitters = new HashSet(Arrays.asList(");
            for (int i = Cutoffs.Length; i > 0; i--)
            {
                if (i == 1)
                {
                    System.Console.Out.Write("splitters1");
                }
                else
                {
                    System.Console.Out.Write("selectiveSplit" + i + " ? splitters" + i + " : (");
                }
            }
            // need to print extra one to close other things open:
            // (Cutoffs.Length - 1) ternary parens plus "HashSet(" and "asList(".
            for (int i = Cutoffs.Length; i >= 0; i--)
            {
                System.Console.Out.Write(")");
            }
            System.Console.Out.WriteLine(";");
        }

        /// <summary>
        /// Prints the scored keys in descending score order and appends every key that reaches a
        /// cutoff to the corresponding generated-Java builder.
        /// </summary>
        /// <param name="answers">Scored keys for one node; sorted in place.</param>
        /// <param name="nf">Formatter used for the printed scores.</param>
        /// <param name="javaSB">One builder per entry of <c>Cutoffs</c>.</param>
        /// <param name="withGrandparent">
        /// When true the key's third element is appended as <c>~grandparent</c>.
        /// </param>
        private void ReportRankedAnswers(List<Pair<IList<string>, double>> answers, NumberFormat nf, StringBuilder[] javaSB, bool withGrandparent)
        {
            System.Console.Out.WriteLine("----");
            System.Console.Out.WriteLine("Sorted descending support * KL");
            // Highest score first, as the heading above promises.
            answers.Sort((a, b) => b.Second().CompareTo(a.Second()));
            foreach (Pair<IList<string>, double> answer in answers)
            {
                IList<string> key = answer.First();
                double psd = answer.Second();
                System.Console.Out.WriteLine(FormatKey(key) + ": " + nf.Format(psd));
                if (psd < Cutoffs[0])
                {
                    continue;
                }
                string label = "\"" + key[0] + "^" + key[1];
                if (withGrandparent)
                {
                    label += "~" + key[2];
                }
                label += "\", ";
                for (int j = 0; j < Cutoffs.Length; j++)
                {
                    if (psd >= Cutoffs[j])
                    {
                        javaSB[j].Append(label);
                    }
                }
            }
        }

        /// <summary>
        /// Renders a key as <c>[a, b, c]</c>. String concatenation of a BCL list would print only
        /// its type name, so the elements are joined explicitly.
        /// </summary>
        private static string FormatKey(IList<string> key)
        {
            return "[" + string.Join(", ", key) + "]";
        }
        /// <summary>
        /// Runs the Viterbi algorithm on the sequence model, and then proceeds to efficiently
        /// backwards decode the best k label sequence assignments.
        /// </summary>
        /// <remarks>
        /// This sequence finder only works on SequenceModel's with rightWindow == 0.
        /// A "product" below is an integer that encodes the labels of one window (the current
        /// position plus <c>leftWindow</c> positions before it) in mixed radix, with the current
        /// position as the least-significant digit. <paramref name="k"/> must be at least 1:
        /// the final selection step reads slot 0 of a length-k array.
        /// </remarks>
        /// <param name="ts">The SequenceModel to find the best k label sequence assignments of</param>
        /// <param name="k">The number of top-scoring assignments to find.</param>
        /// <returns>A Counter with up to k entries that map from a sequence assignment (int array) to a double score</returns>
        /// <exception cref="ArgumentException">If <c>ts.RightWindow()</c> is not 0.</exception>
        public virtual ICounter<int[]> KBestSequences(ISequenceModel ts, int k)
        {
            // Set up tag options
            int length      = ts.Length();
            int leftWindow  = ts.LeftWindow();
            int rightWindow = ts.RightWindow();

            if (rightWindow != 0)
            {
                throw new ArgumentException("KBestSequenceFinder only works with rightWindow == 0 not " + rightWindow);
            }
            int padLength = length + leftWindow + rightWindow;

            // tags[pos] holds the candidate labels at pos; tagNum[pos] is how many there are.
            int[][] tags   = new int[padLength][];
            int[]   tagNum = new int[padLength];
            for (int pos = 0; pos < padLength; pos++)
            {
                tags[pos]   = ts.GetPossibleValues(pos);
                tagNum[pos] = tags[pos].Length;
            }
            int[] tempTags = new int[padLength];

            // Set up product space sizes: productSizes[pos] is the number of label
            // combinations of the window that ends at pos.
            int[] productSizes = new int[padLength];
            int   curProduct   = 1;
            for (int i = 0; i < leftWindow; i++)
            {
                curProduct *= tagNum[i];
            }
            for (int pos = leftWindow; pos < padLength; pos++)
            {
                if (pos > leftWindow + rightWindow)
                {
                    // shift off
                    curProduct /= tagNum[pos - leftWindow - rightWindow - 1];
                }
                // shift on
                curProduct *= tagNum[pos];
                productSizes[pos - rightWindow] = curProduct;
            }

            // Score all of each window's options
            double[][] windowScore = new double[padLength][];
            for (int pos = leftWindow; pos < leftWindow + length; pos++)
            {
                windowScore[pos] = new double[productSizes[pos]];
                Arrays.Fill(tempTags, tags[0][0]);
                for (int product = 0; product < productSizes[pos]; product++)
                {
                    // Decode the product into the labels of the window ending at pos.
                    int p     = product;
                    int shift = 1;
                    for (int curPos = pos; curPos >= pos - leftWindow; curPos--)
                    {
                        tempTags[curPos] = tags[curPos][p % tagNum[curPos]];
                        p /= tagNum[curPos];
                        if (curPos > pos)
                        {
                            shift *= tagNum[curPos];
                        }
                    }
                    // Only query the model once per left context (when the current label is the
                    // first candidate) and spread the returned scores over all current labels.
                    if (tempTags[pos] == tags[pos][0])
                    {
                        // get all tags at once
                        double[] scores = ts.ScoresOf(tempTags, pos);
                        // fill in the relevant windowScores
                        for (int t = 0; t < tagNum[pos]; t++)
                        {
                            windowScore[pos][product + t * shift] = scores[t];
                        }
                    }
                }
            }

            // Set up score and backtrace arrays
            double[][][] score         = new double[padLength][][];
            int[][][][]  trace         = new int[padLength][][][];
            int[][]      numWaysToMake = new int[padLength][];
            for (int pos = 0; pos < padLength; pos++)
            {
                score[pos] = new double[productSizes[pos]][];
                // the 2 is for backtrace, and which of the k best for that backtrace
                trace[pos] = new int[productSizes[pos]][][];
                numWaysToMake[pos] = new int[productSizes[pos]];
                Arrays.Fill(numWaysToMake[pos], 1);
                for (int product = 0; product < productSizes[pos]; product++)
                {
                    if (pos > leftWindow)
                    {
                        // loop over possible predecessor types
                        int sharedProduct = product / tagNum[pos];
                        int factor        = productSizes[pos] / tagNum[pos];
                        numWaysToMake[pos][product] = 0;
                        for (int newTagNum = 0; newTagNum < tagNum[pos - leftWindow - 1] && numWaysToMake[pos][product] < k; newTagNum++)
                        {
                            int predProduct = newTagNum * factor + sharedProduct;
                            numWaysToMake[pos][product] += numWaysToMake[pos - 1][predProduct];
                        }
                        if (numWaysToMake[pos][product] > k)
                        {
                            numWaysToMake[pos][product] = k;
                        }
                    }
                    score[pos][product] = new double[numWaysToMake[pos][product]];
                    Arrays.Fill(score[pos][product], double.NegativeInfinity);
                    trace[pos][product] = new int[numWaysToMake[pos][product]][];
                    Arrays.Fill(trace[pos][product], new int[] { -1, -1 });
                }
            }

            // Do forward Viterbi algorithm
            // this is the hottest loop, so cache loop control variables hoping for a little speed....
            // loop over the classification spot
            for (int pos = leftWindow, posMax = leftWindow + length; pos < posMax; pos++)
            {
                // loop over window product types
                for (int product = 0, productMax = productSizes[pos]; product < productMax; product++)
                {
                    // check for initial spot
                    double[] scorePos = score[pos][product];
                    int[][]  tracePos = trace[pos][product];
                    if (pos == leftWindow)
                    {
                        // no predecessor type
                        scorePos[0] = windowScore[pos][product];
                    }
                    else
                    {
                        // loop over possible predecessor types/k-best
                        int sharedProduct = product / tagNum[pos + rightWindow];
                        int factor        = productSizes[pos] / tagNum[pos + rightWindow];
                        // The predecessor window differs only in the label that slides out of
                        // view, i.e. the one at pos - leftWindow - 1.
                        for (int newTagNum = 0, maxTagNum = tagNum[pos - leftWindow - 1]; newTagNum < maxTagNum; newTagNum++)
                        {
                            int      predProduct  = newTagNum * factor + sharedProduct;
                            double[] scorePosPrev = score[pos - 1][predProduct];
                            for (int k1 = 0; k1 < scorePosPrev.Length; k1++)
                            {
                                double predScore = scorePosPrev[k1] + windowScore[pos][product];
                                if (predScore > scorePos[0])
                                {
                                    // new value higher then lowest value we should keep
                                    int k2 = Arrays.BinarySearch(scorePos, predScore);
                                    k2 = k2 < 0 ? -k2 - 2 : k2 - 1;
                                    // open a spot at k2 by shifting off the lowest value
                                    System.Array.Copy(scorePos, 1, scorePos, 0, k2);
                                    System.Array.Copy(tracePos, 1, tracePos, 0, k2);
                                    scorePos[k2] = predScore;
                                    tracePos[k2] = new int[] { predProduct, k1 };
                                }
                            }
                        }
                    }
                }
            }

            // Project the actual tag sequence
            int[]    whichDerivation     = new int[k];
            int[]    bestCurrentProducts = new int[k];
            double[] bestFinalScores     = new double[k];
            Arrays.Fill(bestFinalScores, double.NegativeInfinity);
            // just the last guy
            for (int product = 0; product < productSizes[padLength - 1]; product++)
            {
                double[] scorePos = score[padLength - 1][product];
                for (int k1 = scorePos.Length - 1; k1 >= 0 && scorePos[k1] > bestFinalScores[0]; k1--)
                {
                    int k2 = Arrays.BinarySearch(bestFinalScores, scorePos[k1]);
                    k2 = k2 < 0 ? -k2 - 2 : k2 - 1;
                    // open a spot at k2 by shifting off the lowest value
                    System.Array.Copy(bestFinalScores, 1, bestFinalScores, 0, k2);
                    System.Array.Copy(whichDerivation, 1, whichDerivation, 0, k2);
                    System.Array.Copy(bestCurrentProducts, 1, bestCurrentProducts, 0, k2);
                    bestCurrentProducts[k2] = product;
                    whichDerivation[k2]     = k1;
                    bestFinalScores[k2]     = scorePos[k1];
                }
            }

            ClassicCounter<int[]> kBestWithScores = new ClassicCounter<int[]>();
            // bestFinalScores is ascending, so walk it from the top; unfilled slots are -infinity.
            for (int k1 = k - 1; k1 >= 0 && bestFinalScores[k1] > double.NegativeInfinity; k1--)
            {
                // Labels covered by the final window come straight from its product.
                int lastProduct = bestCurrentProducts[k1];
                for (int last = padLength - 1; last >= length - 1 && last >= 0; last--)
                {
                    tempTags[last] = tags[last][lastProduct % tagNum[last]];
                    lastProduct   /= tagNum[last];
                }
                // Follow the backtrace for the remaining positions.
                for (int pos = leftWindow + length - 2; pos >= leftWindow; pos--)
                {
                    int bestNextProduct = bestCurrentProducts[k1];
                    bestCurrentProducts[k1]    = trace[pos + 1][bestNextProduct][whichDerivation[k1]][0];
                    whichDerivation[k1]        = trace[pos + 1][bestNextProduct][whichDerivation[k1]][1];
                    tempTags[pos - leftWindow] = tags[pos - leftWindow][bestCurrentProducts[k1] / (productSizes[pos] / tagNum[pos - leftWindow])];
                }
                kBestWithScores.SetCount(Arrays.CopyOf(tempTags, tempTags.Length), bestFinalScores[k1]);
            }
            return(kBestWithScores);
        }
        //goldList null if not training
        public static SupervisedSieveTraining.FeaturesData Featurize(SupervisedSieveTraining.SieveData sd, IList <XMLToAnnotation.GoldQuoteInfo> goldList, bool isTraining)
        {
            Annotation doc = sd.doc;

            sieve = new Sieve(doc, sd.characterMap, sd.pronounCorefMap, sd.animacyList);
            IList <ICoreMap>  quotes    = doc.Get(typeof(CoreAnnotations.QuotationsAnnotation));
            IList <ICoreMap>  sentences = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));
            IList <CoreLabel> tokens    = doc.Get(typeof(CoreAnnotations.TokensAnnotation));
            IDictionary <int, IList <ICoreMap> > paragraphToQuotes = GetQuotesInParagraph(doc);
            GeneralDataset <string, string>      dataset           = new RVFDataset <string, string>();
            //necessary for 'ScoreBestMention'
            IDictionary <int, Pair <int, int> > mapQuoteToDataRange = new Dictionary <int, Pair <int, int> >();
            //maps quote to corresponding indices in the dataset
            IDictionary <int, Sieve.MentionData> mapDatumToMention = new Dictionary <int, Sieve.MentionData>();

            if (isTraining && goldList.Count != quotes.Count)
            {
                throw new Exception("Gold Quote List size doesn't match quote list size!");
            }
            for (int quoteIdx = 0; quoteIdx < quotes.Count; quoteIdx++)
            {
                int      initialSize = dataset.Size();
                ICoreMap quote       = quotes[quoteIdx];
                XMLToAnnotation.GoldQuoteInfo gold = null;
                if (isTraining)
                {
                    gold = goldList[quoteIdx];
                    if (gold.speaker == string.Empty)
                    {
                        continue;
                    }
                }
                ICoreMap        quoteFirstSentence = sentences[quote.Get(typeof(CoreAnnotations.SentenceBeginAnnotation))];
                Pair <int, int> quoteRun           = new Pair <int, int>(quote.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), quote.Get(typeof(CoreAnnotations.TokenEndAnnotation)));
                //      int quoteChapter = quoteFirstSentence.get(ChapterAnnotator.ChapterAnnotation.class);
                int quoteParagraphIdx = quoteFirstSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation));
                //add mentions before quote up to the previous paragraph
                int rightValue = quoteRun.first - 1;
                int leftValue  = quoteRun.first - 1;
                //move left value to be the first token idx of the previous paragraph
                for (int sentIdx = quote.Get(typeof(CoreAnnotations.SentenceBeginAnnotation)); sentIdx >= 0; sentIdx--)
                {
                    ICoreMap sentence = sentences[sentIdx];
                    if (sentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx)
                    {
                        continue;
                    }
                    if (sentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx - 1)
                    {
                        //quoteParagraphIdx - 1 for this and prev
                        leftValue = sentence.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
                    }
                    else
                    {
                        break;
                    }
                }
                IList <Sieve.MentionData> mentionsInPreviousParagraph = new List <Sieve.MentionData>();
                if (leftValue > -1 && rightValue > -1)
                {
                    mentionsInPreviousParagraph = EliminateDuplicates(sieve.FindClosestMentionsInSpanBackward(new Pair <int, int>(leftValue, rightValue)));
                }
                //mentions in next paragraph
                leftValue  = quoteRun.second + 1;
                rightValue = quoteRun.second + 1;
                for (int sentIdx_1 = quote.Get(typeof(CoreAnnotations.SentenceEndAnnotation)); sentIdx_1 < sentences.Count; sentIdx_1++)
                {
                    ICoreMap sentence = sentences[sentIdx_1];
                    //        if(sentence.get(CoreAnnotations.ParagraphIndexAnnotation.class) == quoteParagraphIdx) {
                    //          continue;
                    //        }
                    if (sentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx)
                    {
                        //quoteParagraphIdx + 1
                        rightValue = sentence.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - 1;
                    }
                    else
                    {
                        break;
                    }
                }
                IList <Sieve.MentionData> mentionsInNextParagraph = new List <Sieve.MentionData>();
                if (leftValue < tokens.Count && rightValue < tokens.Count)
                {
                    mentionsInNextParagraph = sieve.FindClosestMentionsInSpanForward(new Pair <int, int>(leftValue, rightValue));
                }
                IList <Sieve.MentionData> candidateMentions = new List <Sieve.MentionData>();
                Sharpen.Collections.AddAll(candidateMentions, mentionsInPreviousParagraph);
                Sharpen.Collections.AddAll(candidateMentions, mentionsInNextParagraph);
                //      System.out.println(candidateMentions.size());
                int rankedDistance = 1;
                int numBackwards   = mentionsInPreviousParagraph.Count;
                foreach (Sieve.MentionData mention in candidateMentions)
                {
                    IList <CoreLabel> mentionCandidateTokens   = doc.Get(typeof(CoreAnnotations.TokensAnnotation)).SubList(mention.begin, mention.end + 1);
                    ICoreMap          mentionCandidateSentence = sentences[mentionCandidateTokens[0].SentIndex()];
                    //        if (mentionCandidateSentence.get(ChapterAnnotator.ChapterAnnotation.class) != quoteChapter) {
                    //          continue;
                    //        }
                    ICounter <string> features = new ClassicCounter <string>();
                    bool isLeft   = true;
                    int  distance = quoteRun.first - mention.end;
                    if (distance < 0)
                    {
                        isLeft   = false;
                        distance = mention.begin - quoteRun.second;
                    }
                    if (distance < 0)
                    {
                        continue;
                    }
                    //disregard mention-in-quote cases.
                    features.SetCount("wordDistance", distance);
                    IList <CoreLabel> betweenTokens;
                    if (isLeft)
                    {
                        betweenTokens = tokens.SubList(mention.end + 1, quoteRun.first);
                    }
                    else
                    {
                        betweenTokens = tokens.SubList(quoteRun.second + 1, mention.begin);
                    }
                    //Punctuation in between
                    foreach (CoreLabel token in betweenTokens)
                    {
                        if (punctuation.Contains(token.Word()))
                        {
                            features.SetCount("punctuationPresence:" + token.Word(), 1);
                        }
                    }
                    // number of mentions away
                    features.SetCount("rankedDistance", rankedDistance);
                    rankedDistance++;
                    if (rankedDistance == numBackwards)
                    {
                        //reset for the forward
                        rankedDistance = 1;
                    }
                    //        int quoteParagraphIdx = quoteFirstSentence.get(CoreAnnotations.ParagraphIndexAnnotation.class);
                    //third distance: # of paragraphs away
                    int      mentionParagraphIdx        = -1;
                    ICoreMap sentenceInMentionParagraph = null;
                    int      quoteParagraphBeginToken   = GetParagraphBeginToken(quoteFirstSentence, sentences);
                    int      quoteParagraphEndToken     = GetParagraphEndToken(quoteFirstSentence, sentences);
                    if (isLeft)
                    {
                        if (quoteParagraphBeginToken <= mention.begin && mention.end <= quoteParagraphEndToken)
                        {
                            features.SetCount("leftParagraphDistance", 0);
                            mentionParagraphIdx        = quoteParagraphIdx;
                            sentenceInMentionParagraph = quoteFirstSentence;
                        }
                        else
                        {
                            int      paragraphDistance = 1;
                            int      currParagraphIdx  = quoteParagraphIdx - paragraphDistance;
                            ICoreMap currSentence      = quoteFirstSentence;
                            int      currSentenceIdx   = currSentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation));
                            while (currParagraphIdx >= 0)
                            {
                                //              Paragraph prevParagraph = paragraphs.get(prevParagraphIndex);
                                //extract begin and end tokens of
                                while (currSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) != currParagraphIdx)
                                {
                                    currSentenceIdx--;
                                    currSentence = sentences[currSentenceIdx];
                                }
                                int prevParagraphBegin = GetParagraphBeginToken(currSentence, sentences);
                                int prevParagraphEnd   = GetParagraphEndToken(currSentence, sentences);
                                if (prevParagraphBegin <= mention.begin && mention.end <= prevParagraphEnd)
                                {
                                    mentionParagraphIdx        = currParagraphIdx;
                                    sentenceInMentionParagraph = currSentence;
                                    features.SetCount("leftParagraphDistance", paragraphDistance);
                                    if (paragraphDistance % 2 == 0)
                                    {
                                        features.SetCount("leftParagraphDistanceEven", 1);
                                    }
                                    break;
                                }
                                paragraphDistance++;
                                currParagraphIdx--;
                            }
                        }
                    }
                    else
                    {
                        //right
                        if (quoteParagraphBeginToken <= mention.begin && mention.end <= quoteParagraphEndToken)
                        {
                            features.SetCount("rightParagraphDistance", 0);
                            sentenceInMentionParagraph = quoteFirstSentence;
                            mentionParagraphIdx        = quoteParagraphIdx;
                        }
                        else
                        {
                            int      paragraphDistance  = 1;
                            int      nextParagraphIndex = quoteParagraphIdx + paragraphDistance;
                            ICoreMap currSentence       = quoteFirstSentence;
                            int      currSentenceIdx    = currSentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation));
                            while (currSentenceIdx < sentences.Count)
                            {
                                while (currSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) != nextParagraphIndex)
                                {
                                    currSentenceIdx++;
                                    currSentence = sentences[currSentenceIdx];
                                }
                                int nextParagraphBegin = GetParagraphBeginToken(currSentence, sentences);
                                int nextParagraphEnd   = GetParagraphEndToken(currSentence, sentences);
                                if (nextParagraphBegin <= mention.begin && mention.end <= nextParagraphEnd)
                                {
                                    sentenceInMentionParagraph = currSentence;
                                    features.SetCount("rightParagraphDistance", paragraphDistance);
                                    break;
                                }
                                paragraphDistance++;
                                nextParagraphIndex++;
                            }
                        }
                    }
                    //2. mention features
                    if (sentenceInMentionParagraph != null)
                    {
                        int mentionParagraphBegin = GetParagraphBeginToken(sentenceInMentionParagraph, sentences);
                        int mentionParagraphEnd   = GetParagraphEndToken(sentenceInMentionParagraph, sentences);
                        if (!(mentionParagraphBegin == quoteParagraphBeginToken && mentionParagraphEnd == quoteParagraphEndToken))
                        {
                            IList <ICoreMap> quotesInMentionParagraph = paragraphToQuotes.GetOrDefault(mentionParagraphIdx, new List <ICoreMap>());
                            Pair <List <string>, List <Pair <int, int> > > namesInMentionParagraph = sieve.ScanForNames(new Pair <int, int>(mentionParagraphBegin, mentionParagraphEnd));
                            features.SetCount("quotesInMentionParagraph", quotesInMentionParagraph.Count);
                            features.SetCount("wordsInMentionParagraph", mentionParagraphEnd - mentionParagraphBegin + 1);
                            features.SetCount("namesInMentionParagraph", namesInMentionParagraph.first.Count);
                            //mention ordering in paragraph it is in
                            for (int i = 0; i < namesInMentionParagraph.second.Count; i++)
                            {
                                if (ExtractQuotesUtil.RangeContains(new Pair <int, int>(mention.begin, mention.end), namesInMentionParagraph.second[i]))
                                {
                                    features.SetCount("orderInParagraph", i);
                                }
                            }
                            //if mention paragraph is all one quote
                            if (quotesInMentionParagraph.Count == 1)
                            {
                                ICoreMap qInMentionParagraph = quotesInMentionParagraph[0];
                                if (qInMentionParagraph.Get(typeof(CoreAnnotations.TokenBeginAnnotation)) == mentionParagraphBegin && qInMentionParagraph.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - 1 == mentionParagraphEnd)
                                {
                                    features.SetCount("mentionParagraphIsInConversation", 1);
                                }
                                else
                                {
                                    features.SetCount("mentionParagraphIsInConversation", -1);
                                }
                            }
                            foreach (ICoreMap quoteIMP in quotesInMentionParagraph)
                            {
                                if (ExtractQuotesUtil.RangeContains(new Pair <int, int>(quoteIMP.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), quoteIMP.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - 1), new Pair <int, int>(mention.begin, mention.end)))
                                {
                                    features.SetCount("mentionInQuote", 1);
                                }
                            }
                            if (features.GetCount("mentionInQuote") != 1)
                            {
                                features.SetCount("mentionNotInQuote", 1);
                            }
                        }
                    }
                    // nearby word syntax types...make sure to check if there are previous or next words
                    // or there will be an array index crash
                    if (mention.begin > 0)
                    {
                        CoreLabel prevWord = tokens[mention.begin - 1];
                        features.SetCount("prevWordType:" + prevWord.Tag(), 1);
                        if (punctuationForFeatures.Contains(prevWord.Lemma()))
                        {
                            features.SetCount("prevWordPunct:" + prevWord.Lemma(), 1);
                        }
                    }
                    if (mention.end + 1 < tokens.Count)
                    {
                        CoreLabel nextWord = tokens[mention.end + 1];
                        features.SetCount("nextWordType:" + nextWord.Tag(), 1);
                        if (punctuationForFeatures.Contains(nextWord.Lemma()))
                        {
                            features.SetCount("nextWordPunct:" + nextWord.Lemma(), 1);
                        }
                    }
                    //                    features.setCount("prevAndNext:" + prevWord.tag()+ ";" + nextWord.tag(), 1);
                    //quote paragraph features
                    IList <ICoreMap> quotesInQuoteParagraph = paragraphToQuotes[quoteParagraphIdx];
                    features.SetCount("QuotesInQuoteParagraph", quotesInQuoteParagraph.Count);
                    features.SetCount("WordsInQuoteParagraph", quoteParagraphEndToken - quoteParagraphBeginToken + 1);
                    features.SetCount("NamesInQuoteParagraph", sieve.ScanForNames(new Pair <int, int>(quoteParagraphBeginToken, quoteParagraphEndToken)).first.Count);
                    //quote features
                    features.SetCount("quoteLength", quote.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - quote.Get(typeof(CoreAnnotations.TokenBeginAnnotation)) + 1);
                    for (int i_1 = 0; i_1 < quotesInQuoteParagraph.Count; i_1++)
                    {
                        if (quotesInQuoteParagraph[i_1].Equals(quote))
                        {
                            features.SetCount("quotePosition", i_1 + 1);
                        }
                    }
                    if (features.GetCount("quotePosition") == 0)
                    {
                        throw new Exception("Check this (equality not working)");
                    }
                    Pair <List <string>, List <Pair <int, int> > > namesData = sieve.ScanForNames(quoteRun);
                    foreach (string name in namesData.first)
                    {
                        features.SetCount("charactersInQuote:" + sd.characterMap[name][0].name, 1);
                    }
                    //if quote encompasses entire paragraph
                    if (quote.Get(typeof(CoreAnnotations.TokenBeginAnnotation)) == quoteParagraphBeginToken && quote.Get(typeof(CoreAnnotations.TokenEndAnnotation)) == quoteParagraphEndToken)
                    {
                        features.SetCount("isImplicitSpeaker", 1);
                    }
                    else
                    {
                        features.SetCount("isImplicitSpeaker", -1);
                    }
                    //Vocative detection
                    if (mention.type.Equals("name"))
                    {
                        IList <Person> pList = sd.characterMap[sieve.TokenRangeToString(new Pair <int, int>(mention.begin, mention.end))];
                        Person         p     = null;
                        if (pList != null)
                        {
                            p = pList[0];
                        }
                        else
                        {
                            Pair <List <string>, List <Pair <int, int> > > scanForNamesResultPair = sieve.ScanForNames(new Pair <int, int>(mention.begin, mention.end));
                            if (scanForNamesResultPair.first.Count != 0)
                            {
                                string scanForNamesResultString = scanForNamesResultPair.first[0];
                                if (scanForNamesResultString != null && sd.characterMap.Contains(scanForNamesResultString))
                                {
                                    p = sd.characterMap[scanForNamesResultString][0];
                                }
                            }
                        }
                        if (p != null)
                        {
                            foreach (string name_1 in namesData.first)
                            {
                                if (p.aliases.Contains(name_1))
                                {
                                    features.SetCount("nameInQuote", 1);
                                }
                            }
                            if (quoteParagraphIdx > 0)
                            {
                                //            Paragraph prevParagraph = paragraphs.get(ex.paragraph_idx - 1);
                                IList <ICoreMap>         quotesInPrevParagraph = paragraphToQuotes.GetOrDefault(quoteParagraphIdx - 1, new List <ICoreMap>());
                                IList <Pair <int, int> > exclusionList         = new List <Pair <int, int> >();
                                foreach (ICoreMap quoteIPP in quotesInPrevParagraph)
                                {
                                    Pair <int, int> quoteRange = new Pair <int, int>(quoteIPP.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), quoteIPP.Get(typeof(CoreAnnotations.TokenEndAnnotation)));
                                    exclusionList.Add(quoteRange);
                                    foreach (string name_2 in sieve.ScanForNames(quoteRange).first)
                                    {
                                        if (p.aliases.Contains(name_2))
                                        {
                                            features.SetCount("nameInPrevParagraphQuote", 1);
                                        }
                                    }
                                }
                                int      sentenceIdx             = quoteFirstSentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation));
                                ICoreMap sentenceInPrevParagraph = null;
                                for (int i = sentenceIdx - 1; i_1 >= 0; i_1--)
                                {
                                    ICoreMap currSentence = sentences[i_1];
                                    if (currSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx - 1)
                                    {
                                        sentenceInPrevParagraph = currSentence;
                                        break;
                                    }
                                }
                                int prevParagraphBegin = GetParagraphBeginToken(sentenceInPrevParagraph, sentences);
                                int prevParagraphEnd   = GetParagraphEndToken(sentenceInPrevParagraph, sentences);
                                IList <Pair <int, int> > prevParagraphNonQuoteRuns = GetRangeExclusion(new Pair <int, int>(prevParagraphBegin, prevParagraphEnd), exclusionList);
                                foreach (Pair <int, int> nonQuoteRange in prevParagraphNonQuoteRuns)
                                {
                                    foreach (string name_2 in sieve.ScanForNames(nonQuoteRange).first)
                                    {
                                        if (p.aliases.Contains(name_2))
                                        {
                                            features.SetCount("nameInPrevParagraphNonQuote", 1);
                                        }
                                    }
                                }
                            }
                        }
                    }
                    if (isTraining)
                    {
                        if (QuoteAttributionUtils.RangeContains(new Pair <int, int>(gold.mentionStartTokenIndex, gold.mentionEndTokenIndex), new Pair <int, int>(mention.begin, mention.end)))
                        {
                            RVFDatum <string, string> datum = new RVFDatum <string, string>(features, "isMention");
                            datum.SetID(int.ToString(dataset.Size()));
                            mapDatumToMention[dataset.Size()] = mention;
                            dataset.Add(datum);
                        }
                        else
                        {
                            RVFDatum <string, string> datum = new RVFDatum <string, string>(features, "isNotMention");
                            datum.SetID(int.ToString(dataset.Size()));
                            dataset.Add(datum);
                            mapDatumToMention[dataset.Size()] = mention;
                        }
                    }
                    else
                    {
                        RVFDatum <string, string> datum = new RVFDatum <string, string>(features, "none");
                        datum.SetID(int.ToString(dataset.Size()));
                        mapDatumToMention[dataset.Size()] = mention;
                        dataset.Add(datum);
                    }
                }
                mapQuoteToDataRange[quoteIdx] = new Pair <int, int>(initialSize, dataset.Size() - 1);
            }
            return(new SupervisedSieveTraining.FeaturesData(mapQuoteToDataRange, mapDatumToMention, dataset));
        }
示例#25
0
        /// <summary>
        /// Sweeps the graph breadth first, backwards from its end nodes, and
        /// accumulates arc outputs into a per-node "lambda" value.
        /// Takes time linear in number of arcs.
        /// </summary>
        /// <remarks>
        /// A node's lambda is the output of the arc through which it was reached
        /// plus the lambda of that arc's target. When a node is reachable at the
        /// same distance through several arcs, the arc whose input compares
        /// smallest wins and overwrites the earlier value.
        /// </remarks>
        /// <param name="graph">Graph supplying the nodes, the end nodes and the arcs entering each node.</param>
        /// <returns>Counter holding the lambda value recorded for each node.</returns>
        public static ClassicCounter ComputeLambda(TransducerGraph graph)
        {
            ArrayList      pending   = new ArrayList();
            ClassicCounter lambda    = new ClassicCounter();
            ClassicCounter hops      = new ClassicCounter();
            IDictionary    bestInput = new Hashtable();

            // Every node starts out unreached: lambda 0, distance infinite.
            foreach (object vertex in graph.GetNodes())
            {
                lambda.SetCount(vertex, 0);
                hops.SetCount(vertex, double.PositiveInfinity);
            }

            // The end nodes seed the search at distance 0.
            foreach (object seed in graph.GetEndNodes())
            {
                lambda.SetCount(seed, 0);
                hops.SetCount(seed, 0);
                pending.AddLast(seed);
            }

            // Breadth first search: keep pulling the oldest pending node.
            while (true)
            {
                object current = null;
                try
                {
                    current = pending.RemoveFirst();
                }
                catch (NoSuchElementException)
                {
                    // nothing left to visit
                    break;
                }
                if (current == null)
                {
                    // a null entry terminates the search as well
                    break;
                }

                double currentHops = hops.GetCount(current);
                ISet   incoming    = graph.GetArcsByTarget(current);
                if (incoming == null)
                {
                    continue;
                }

                foreach (object rawArc in incoming)
                {
                    TransducerGraph.Arc edge      = (TransducerGraph.Arc)rawArc;
                    object              source    = edge.GetSourceNode();
                    IComparable         input     = (IComparable)edge.GetInput();
                    double              output    = (double)edge.GetOutput();
                    double              knownHops = hops.GetCount(source);
                    bool                unseen    = knownHops == double.PositiveInfinity;

                    if (unseen)
                    {
                        // first time we meet this node: schedule it
                        pending.AddLast(source);
                    }

                    // Only compared when the node was seen before, so it is non-null then.
                    IComparable incumbent = (IComparable)bestInput[source];
                    bool        betterTie = !unseen && knownHops == currentHops + 1 && input.CompareTo(incumbent) < 0;

                    if (unseen || betterTie)
                    {
                        // May run several times for the same node as better arcs show up.
                        bestInput[source] = input;
                        hops.SetCount(source, currentHops + 1);
                        lambda.SetCount(source, output + lambda.GetCount(current));
                    }
                }
            }
            return lambda;
        }
示例#26
0
        /// <summary>
        /// Computes one weight per pattern from the sizes of the phrase counters
        /// stored for it in the positive, negative and unlabeled maps, using the
        /// formula selected by <c>patternScoring</c>.
        /// </summary>
        /// <remarks>
        /// Every supported scheme multiplies a ratio of those sizes by the log of
        /// the positive size. The YanGarber02 and LinICML03 schemes additionally
        /// keep only patterns retained by <c>Counters.RetainAbove</c> with a
        /// pos / (pos + neg) threshold of 0.8.
        /// </remarks>
        /// <returns>Counter holding the weight of each scored pattern.</returns>
        /// <exception cref="System.Exception">The configured scoring scheme is not handled here.</exception>
        public override ICounter <E> Score()
        {
            ICounter<E> posSizes   = new ClassicCounter<E>();
            ICounter<E> negSizes   = new ClassicCounter<E>();
            ICounter<E> unlabSizes = new ClassicCounter<E>();

            // Per pattern: size of its phrase counter in each of the three maps.
            foreach (KeyValuePair<E, ClassicCounter<CandidatePhrase>> negEntry in negPatternsandWords4Label.EntrySet())
            {
                negSizes.SetCount(negEntry.Key, negEntry.Value.Size());
            }
            foreach (KeyValuePair<E, ClassicCounter<CandidatePhrase>> unlabEntry in unLabeledPatternsandWords4Label.EntrySet())
            {
                unlabSizes.SetCount(unlabEntry.Key, unlabEntry.Value.Size());
            }
            foreach (KeyValuePair<E, ClassicCounter<CandidatePhrase>> posEntry in patternsandWords4Label.EntrySet())
            {
                posSizes.SetCount(posEntry.Key, posEntry.Value.Size());
            }

            // all = pos + neg + unlabeled
            ICounter<E> allSizes = Counters.Add(posSizes, negSizes);
            allSizes.AddAll(unlabSizes);

            ICounter<E> posNegSizes = Counters.Add(posSizes, negSizes);

            // log of the positive sizes, shared by every scheme below
            ICounter<E> logPos = new ClassicCounter<E>(posSizes);
            Counters.LogInPlace(logPos);

            if (patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.RlogF))
            {
                return Counters.Product(Counters.Division(posSizes, allSizes), logPos);
            }
            if (patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.RlogFPosNeg))
            {
                Redwood.Log("extremePatDebug", "computing rlogfposneg");
                return Counters.Product(Counters.Division(posSizes, posNegSizes), logPos);
            }
            if (patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.RlogFUnlabNeg))
            {
                Redwood.Log("extremePatDebug", "computing rlogfunlabeg");
                return Counters.Product(Counters.Division(posSizes, Counters.Add(negSizes, unlabSizes)), logPos);
            }
            if (patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.RlogFNeg))
            {
                Redwood.Log("extremePatDebug", "computing rlogfneg");
                return Counters.Product(Counters.Division(posSizes, negSizes), logPos);
            }

            const double PrecisionThreshold = 0.8;

            if (patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.YanGarber02))
            {
                ICounter<E> precision = Counters.Division(posSizes, Counters.Add(posSizes, negSizes));
                Counters.RetainAbove(precision, PrecisionThreshold);
                ICounter<E> confidence = Counters.Product(Counters.Division(posSizes, allSizes), logPos);

                // Only patterns that survived the precision filter receive a weight.
                ICounter<E> selected = new ClassicCounter<E>();
                foreach (E pattern in precision.KeySet())
                {
                    selected.SetCount(pattern, confidence.GetCount(pattern));
                }
                return selected;
            }
            if (patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.LinICML03))
            {
                ICounter<E> precision = Counters.Division(posSizes, Counters.Add(posSizes, negSizes));
                Counters.RetainAbove(precision, PrecisionThreshold);
                // Same as above, but the numerator is pos - neg.
                ICounter<E> posMinusNeg = Counters.Add(posSizes, Counters.Scale(negSizes, -1));
                ICounter<E> confidence  = Counters.Product(Counters.Division(posMinusNeg, allSizes), logPos);

                ICounter<E> selected = new ClassicCounter<E>();
                foreach (E pattern in precision.KeySet())
                {
                    selected.SetCount(pattern, confidence.GetCount(pattern));
                }
                return selected;
            }

            throw new Exception("not implemented " + patternScoring + " . check spelling!");
        }
        /// <summary>
        /// Scores the patterns of <c>label</c> as a numerator weight divided by a
        /// denominator weight, both obtained from <c>Convert2OneDim</c>; the
        /// arguments used for the denominator depend on <c>patternScoring</c>.
        /// </summary>
        /// <remarks>
        /// For the PhEvalInPatLogP and LOGREGlogP schemes the ratio is further
        /// multiplied by the log of the size of each pattern's phrase counter.
        /// Zero-valued entries are removed before returning.
        /// </remarks>
        /// <returns>Counter holding the non-zero weight of each pattern.</returns>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        public override ICounter <E> Score()
        {
            // External dictionary-odds weights are only used when present for this label.
            ICounter<CandidatePhrase> normalizedExternalWeights = null;
            if (constVars.dictOddsWeights.Contains(label))
            {
                normalizedExternalWeights = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(constVars.dictOddsWeights[label], true, true, false);
            }

            bool byExtractedFreq = patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.SqrtAllRatio);

            // NOTE(review): both scoring functions are passed as null here; this looks like
            // lambdas lost in an automated conversion -- confirm against the upstream source.
            IToDoubleFunction<Pair<E, CandidatePhrase>> numeratorFn   = null;
            IToDoubleFunction<Pair<E, CandidatePhrase>> denominatorFn = null;

            ICounter<E> numerator = this.Convert2OneDim(label, numeratorFn, allCandidatePhrases, patternsandWords4Label, constVars.sqrtPatScore, false, null, byExtractedFreq);
            ICounter<E> denominator;

            if (patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PosNegUnlabOdds)
                || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.RatioAll)
                || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PosNegOdds))
            {
                // These three schemes build the denominator with identical arguments.
                denominator = this.Convert2OneDim(label, denominatorFn, allCandidatePhrases, patternsandWords4Label, constVars.sqrtPatScore, false, normalizedExternalWeights, byExtractedFreq);
            }
            else if (patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPat)
                     || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPatLogP)
                     || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.Logreg)
                     || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.LOGREGlogP))
            {
                // Differs from the branch above only in the sixth argument (true).
                denominator = this.Convert2OneDim(label, denominatorFn, allCandidatePhrases, patternsandWords4Label, constVars.sqrtPatScore, true, normalizedExternalWeights, byExtractedFreq);
            }
            else if (patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.SqrtAllRatio))
            {
                // Passes a literal true in place of constVars.sqrtPatScore.
                denominator = this.Convert2OneDim(label, denominatorFn, allCandidatePhrases, patternsandWords4Label, true, false, normalizedExternalWeights, byExtractedFreq);
            }
            else
            {
                throw new Exception("Cannot understand patterns scoring");
            }

            ICounter<E> weights = Counters.DivisionNonNaN(numerator, denominator);

            // Multiplying by logP
            bool scaleByLogPos = patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPatLogP)
                                 || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.LOGREGlogP);
            if (scaleByLogPos)
            {
                ICounter<E> logPosSizes = new ClassicCounter<E>();
                foreach (KeyValuePair<E, ClassicCounter<CandidatePhrase>> entry in patternsandWords4Label.EntrySet())
                {
                    logPosSizes.SetCount(entry.Key, Math.Log(entry.Value.Size()));
                }
                Counters.MultiplyInPlace(weights, logPosSizes);
            }

            Counters.RetainNonZeros(weights);
            return weights;
        }
示例#28
0
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        private ICounter <CandidatePhrase> LearnNewPhrasesPrivate(string label, PatternsForEachToken patternsForEachToken, ICounter <E> patternsLearnedThisIter, ICounter <E> allSelectedPatterns, ICollection <CandidatePhrase> alreadyIdentifiedWords, CollectionValuedMap
                                                                  <E, Triple <string, int, int> > matchedTokensByPat, ICounter <CandidatePhrase> scoreForAllWordsThisIteration, TwoDimensionalCounter <CandidatePhrase, E> terms, TwoDimensionalCounter <CandidatePhrase, E> wordsPatExtracted, TwoDimensionalCounter <E
                                                                                                                                                                                                                                                                                                                       , CandidatePhrase> patternsAndWords4Label, string identifier, ICollection <CandidatePhrase> ignoreWords, bool computeProcDataFreq)
        {
            ICollection <CandidatePhrase> alreadyLabeledWords = new HashSet <CandidatePhrase>();

            if (constVars.doNotApplyPatterns)
            {
                // if want to get the stats by the lossy way of just counting without
                // applying the patterns
                ConstantsAndVariables.DataSentsIterator sentsIter = new ConstantsAndVariables.DataSentsIterator(constVars.batchProcessSents);
                while (sentsIter.MoveNext())
                {
                    Pair <IDictionary <string, DataInstance>, File> sentsf = sentsIter.Current;
                    this.StatsWithoutApplyingPatterns(sentsf.First(), patternsForEachToken, patternsLearnedThisIter, wordsPatExtracted);
                }
            }
            else
            {
                if (patternsLearnedThisIter.Size() > 0)
                {
                    this.ApplyPats(patternsLearnedThisIter, label, wordsPatExtracted, matchedTokensByPat, alreadyLabeledWords);
                }
            }
            if (computeProcDataFreq)
            {
                if (!phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.None))
                {
                    Redwood.Log(Redwood.Dbg, "computing processed freq");
                    foreach (KeyValuePair <CandidatePhrase, double> fq in Data.rawFreq.EntrySet())
                    {
                        double @in = fq.Value;
                        if (phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.Sqrt))
                        {
                            @in = Math.Sqrt(@in);
                        }
                        else
                        {
                            if (phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.Log))
                            {
                                @in = 1 + Math.Log(@in);
                            }
                            else
                            {
                                throw new Exception("can't understand the normalization");
                            }
                        }
                        System.Diagnostics.Debug.Assert(!double.IsNaN(@in), "Why is processed freq nan when rawfreq is " + @in);
                        Data.processedDataFreq.SetCount(fq.Key, @in);
                    }
                }
                else
                {
                    Data.processedDataFreq = Data.rawFreq;
                }
            }
            if (constVars.wordScoring.Equals(GetPatternsFromDataMultiClass.WordScoring.Weightednorm))
            {
                foreach (CandidatePhrase en in wordsPatExtracted.FirstKeySet())
                {
                    if (!constVars.GetOtherSemanticClassesWords().Contains(en) && (en.GetPhraseLemma() == null || !constVars.GetOtherSemanticClassesWords().Contains(CandidatePhrase.CreateOrGet(en.GetPhraseLemma()))) && !alreadyLabeledWords.Contains(en))
                    {
                        terms.AddAll(en, wordsPatExtracted.GetCounter(en));
                    }
                }
                RemoveKeys(terms, ConstantsAndVariables.GetStopWords());
                ICounter <CandidatePhrase> phraseScores = phraseScorer.ScorePhrases(label, terms, wordsPatExtracted, allSelectedPatterns, alreadyIdentifiedWords, false);
                System.Console.Out.WriteLine("count for word U.S. is " + phraseScores.GetCount(CandidatePhrase.CreateOrGet("U.S.")));
                ICollection <CandidatePhrase> ignoreWordsAll;
                if (ignoreWords != null && !ignoreWords.IsEmpty())
                {
                    ignoreWordsAll = CollectionUtils.UnionAsSet(ignoreWords, constVars.GetOtherSemanticClassesWords());
                }
                else
                {
                    ignoreWordsAll = new HashSet <CandidatePhrase>(constVars.GetOtherSemanticClassesWords());
                }
                Sharpen.Collections.AddAll(ignoreWordsAll, constVars.GetSeedLabelDictionary()[label]);
                Sharpen.Collections.AddAll(ignoreWordsAll, constVars.GetLearnedWords(label).KeySet());
                System.Console.Out.WriteLine("ignoreWordsAll contains word U.S. is " + ignoreWordsAll.Contains(CandidatePhrase.CreateOrGet("U.S.")));
                ICounter <CandidatePhrase> finalwords = ChooseTopWords(phraseScores, terms, phraseScores, ignoreWordsAll, constVars.thresholdWordExtract);
                phraseScorer.PrintReasonForChoosing(finalwords);
                scoreForAllWordsThisIteration.Clear();
                Counters.AddInPlace(scoreForAllWordsThisIteration, phraseScores);
                Redwood.Log(ConstantsAndVariables.minimaldebug, "\n\n## Selected Words for " + label + " : " + Counters.ToSortedString(finalwords, finalwords.Size(), "%1$s:%2$.2f", "\t"));
                if (constVars.goldEntities != null)
                {
                    IDictionary <string, bool> goldEntities4Label = constVars.goldEntities[label];
                    if (goldEntities4Label != null)
                    {
                        StringBuilder s = new StringBuilder();
                        finalwords.KeySet().Stream().ForEach(null);
                        Redwood.Log(ConstantsAndVariables.minimaldebug, "\n\n## Gold labels for selected words for label " + label + " : " + s.ToString());
                    }
                    else
                    {
                        Redwood.Log(Redwood.Dbg, "No gold entities provided for label " + label);
                    }
                }
                if (constVars.outDir != null && !constVars.outDir.IsEmpty())
                {
                    string outputdir = constVars.outDir + "/" + identifier + "/" + label;
                    IOUtils.EnsureDir(new File(outputdir));
                    TwoDimensionalCounter <CandidatePhrase, CandidatePhrase> reasonForWords = new TwoDimensionalCounter <CandidatePhrase, CandidatePhrase>();
                    foreach (CandidatePhrase word in finalwords.KeySet())
                    {
                        foreach (E l in wordsPatExtracted.GetCounter(word).KeySet())
                        {
                            foreach (CandidatePhrase w2 in patternsAndWords4Label.GetCounter(l))
                            {
                                reasonForWords.IncrementCount(word, w2);
                            }
                        }
                    }
                    Redwood.Log(ConstantsAndVariables.minimaldebug, "Saving output in " + outputdir);
                    string filename = outputdir + "/words.json";
                    // the json object is an array corresponding to each iteration - of list
                    // of objects,
                    // each of which is a bean of entity and reasons
                    IJsonArrayBuilder obj = Javax.Json.Json.CreateArrayBuilder();
                    if (writtenInJustification.Contains(label) && writtenInJustification[label])
                    {
                        IJsonReader jsonReader = Javax.Json.Json.CreateReader(new BufferedInputStream(new FileInputStream(filename)));
                        IJsonArray  objarr     = jsonReader.ReadArray();
                        foreach (IJsonValue o in objarr)
                        {
                            obj.Add(o);
                        }
                        jsonReader.Close();
                    }
                    IJsonArrayBuilder objThisIter = Javax.Json.Json.CreateArrayBuilder();
                    foreach (CandidatePhrase w in reasonForWords.FirstKeySet())
                    {
                        IJsonObjectBuilder objinner = Javax.Json.Json.CreateObjectBuilder();
                        IJsonArrayBuilder  l        = Javax.Json.Json.CreateArrayBuilder();
                        foreach (CandidatePhrase w2 in reasonForWords.GetCounter(w).KeySet())
                        {
                            l.Add(w2.GetPhrase());
                        }
                        IJsonArrayBuilder pats = Javax.Json.Json.CreateArrayBuilder();
                        foreach (E p in wordsPatExtracted.GetCounter(w))
                        {
                            pats.Add(p.ToStringSimple());
                        }
                        objinner.Add("reasonwords", l);
                        objinner.Add("patterns", pats);
                        objinner.Add("score", finalwords.GetCount(w));
                        objinner.Add("entity", w.GetPhrase());
                        objThisIter.Add(objinner.Build());
                    }
                    obj.Add(objThisIter);
                    // Redwood.log(ConstantsAndVariables.minimaldebug, channelNameLogger,
                    // "Writing justification at " + filename);
                    IOUtils.WriteStringToFile(StringUtils.Normalize(StringUtils.ToAscii(obj.Build().ToString())), filename, "ASCII");
                    writtenInJustification[label] = true;
                }
                if (constVars.justify)
                {
                    Redwood.Log(Redwood.Dbg, "\nJustification for phrases:\n");
                    foreach (CandidatePhrase word in finalwords.KeySet())
                    {
                        Redwood.Log(Redwood.Dbg, "Phrase " + word + " extracted because of patterns: \t" + Counters.ToSortedString(wordsPatExtracted.GetCounter(word), wordsPatExtracted.GetCounter(word).Size(), "%1$s:%2$f", "\n"));
                    }
                }
                // if (usePatternResultAsLabel)
                // if (answerLabel != null)
                // labelWords(sents, commonEngWords, finalwords.keySet(),
                // patterns.keySet(), outFile);
                // else
                // throw new RuntimeException("why is the answer label null?");
                return(finalwords);
            }
            else
            {
                if (constVars.wordScoring.Equals(GetPatternsFromDataMultiClass.WordScoring.Bpb))
                {
                    Counters.AddInPlace(terms, wordsPatExtracted);
                    ICounter <CandidatePhrase>       maxPatWeightTerms = new ClassicCounter <CandidatePhrase>();
                    IDictionary <CandidatePhrase, E> wordMaxPat        = new Dictionary <CandidatePhrase, E>();
                    foreach (KeyValuePair <CandidatePhrase, ClassicCounter <E> > en in terms.EntrySet())
                    {
                        ICounter <E> weights = new ClassicCounter <E>();
                        foreach (E k in en.Value.KeySet())
                        {
                            weights.SetCount(k, patternsLearnedThisIter.GetCount(k));
                        }
                        maxPatWeightTerms.SetCount(en.Key, Counters.Max(weights));
                        wordMaxPat[en.Key] = Counters.Argmax(weights);
                    }
                    Counters.RemoveKeys(maxPatWeightTerms, alreadyIdentifiedWords);
                    double maxvalue = Counters.Max(maxPatWeightTerms);
                    ICollection <CandidatePhrase> words = Counters.KeysAbove(maxPatWeightTerms, maxvalue - 1e-10);
                    CandidatePhrase bestw = null;
                    if (words.Count > 1)
                    {
                        double max = double.NegativeInfinity;
                        foreach (CandidatePhrase w in words)
                        {
                            if (terms.GetCount(w, wordMaxPat[w]) > max)
                            {
                                max   = terms.GetCount(w, wordMaxPat[w]);
                                bestw = w;
                            }
                        }
                    }
                    else
                    {
                        if (words.Count == 1)
                        {
                            bestw = words.GetEnumerator().Current;
                        }
                        else
                        {
                            return(new ClassicCounter <CandidatePhrase>());
                        }
                    }
                    Redwood.Log(ConstantsAndVariables.minimaldebug, "Selected Words: " + bestw);
                    return(Counters.AsCounter(Arrays.AsList(bestw)));
                }
                else
                {
                    throw new Exception("wordscoring " + constVars.wordScoring + " not identified");
                }
            }
        }
 /// <summary>
 /// Populates the static lookup tables used by this class: the set of quantifiable
 /// entity tags, the numeral and unit value counters, the currency-word maps, the
 /// relative year / month-day modifier offsets, and the full-width to half-width
 /// digit map.
 /// </summary>
 static ChineseQuantifiableEntityNormalizer()
 {
     // NOTE(review): the notes below appear to describe field declarations that live
     // outside this constructor; they are kept here so the information is not lost.
     //   - Entity types that are quantifiable; some tags are used by money.
     //   - TODO (yuhao): the number patterns do not consider 1) negative numbers,
     //     2) Chinese traditional characters.
     //   - The all-literal-number-characters sequence excludes unit characters like 十 or 万.
     //   - The decimal part of a float number should be exactly a literal number
     //     sequence without units.

     // Entity tags that are treated as quantifiable.
     quantifiable = Generics.NewHashSet();
     quantifiable.Add(NumberTag);
     quantifiable.Add(DateTag);
     quantifiable.Add(TimeTag);
     quantifiable.Add(MoneyTag);
     quantifiable.Add(PercentTag);
     quantifiable.Add(OrdinalTag);

     // Unit characters and the magnitude each one stands for.
     quantityUnitToValues = new ClassicCounter <string>();
     quantityUnitToValues.SetCount("十", 10.0);
     quantityUnitToValues.SetCount("百", 100.0);
     quantityUnitToValues.SetCount("千", 1000.0);
     quantityUnitToValues.SetCount("万", 10000.0);
     quantityUnitToValues.SetCount("亿", 100000000.0);

     // Numeral characters and their values.
     wordsToValues = new ClassicCounter <string>();
     wordsToValues.SetCount("零", 0.0);
     wordsToValues.SetCount("〇", 0.0);
     wordsToValues.SetCount("一", 1.0);
     wordsToValues.SetCount("二", 2.0);
     wordsToValues.SetCount("两", 2.0);
     wordsToValues.SetCount("三", 3.0);
     wordsToValues.SetCount("四", 4.0);
     wordsToValues.SetCount("五", 5.0);
     wordsToValues.SetCount("六", 6.0);
     wordsToValues.SetCount("七", 7.0);
     wordsToValues.SetCount("八", 8.0);
     wordsToValues.SetCount("九", 9.0);
     // all units are also quantifiable individually
     wordsToValues.AddAll(quantityUnitToValues);

     // Multi-character currency words mapped to a currency symbol.
     multiCharCurrencyWords = Generics.NewHashMap();
     multiCharCurrencyWords["美元"] = '$';
     multiCharCurrencyWords["美分"] = '$';
     multiCharCurrencyWords["英镑"] = '£';
     multiCharCurrencyWords["先令"] = '£';
     multiCharCurrencyWords["便士"] = '£';
     multiCharCurrencyWords["欧元"] = '€';
     multiCharCurrencyWords["日元"] = '¥';
     multiCharCurrencyWords["韩元"] = '₩';

     // Single-character currency words mapped to a currency symbol.
     oneCharCurrencyWords = Generics.NewHashMap();
     oneCharCurrencyWords["刀"] = '$';
     oneCharCurrencyWords["镑"] = '£';
     // We follow the tradition in English to use 元 instead of ¥ for RMB
     oneCharCurrencyWords["元"] = '元';
     // For all other currency, we use default currency symbol $

     // Relative-year modifier characters mapped to a signed year offset.
     yearModifiers = Generics.NewHashMap();
     yearModifiers["前"] = -2;
     yearModifiers["去"] = -1;
     yearModifiers["上"] = -1;
     yearModifiers["今"] = 0;
     yearModifiers["同"] = 0;
     yearModifiers["此"] = 0;
     yearModifiers["该"] = 0;
     yearModifiers["本"] = 0;
     yearModifiers["明"] = 1;
     yearModifiers["来"] = 1;
     yearModifiers["下"] = 1;
     yearModifiers["后"] = 2;

     // Relative month/day modifier characters mapped to a signed offset.
     monthDayModifiers = Generics.NewHashMap();
     monthDayModifiers["昨"] = -1;
     monthDayModifiers["上"] = -1;
     monthDayModifiers["今"] = 0;
     monthDayModifiers["同"] = 0;
     monthDayModifiers["此"] = 0;
     monthDayModifiers["该"] = 0;
     monthDayModifiers["本"] = 0;
     monthDayModifiers["来"] = 1;
     monthDayModifiers["明"] = 1;
     monthDayModifiers["下"] = 1;

     // Full-width digits (U+FF10..U+FF19) mapped to their ASCII counterparts.
     // The keys are written as escapes on purpose: previously they were ASCII digits,
     // which made this table an identity map that never converted anything.
     fullDigitToHalfDigit = Generics.NewHashMap();
     fullDigitToHalfDigit["\uFF11"] = "1";
     fullDigitToHalfDigit["\uFF12"] = "2";
     fullDigitToHalfDigit["\uFF13"] = "3";
     fullDigitToHalfDigit["\uFF14"] = "4";
     fullDigitToHalfDigit["\uFF15"] = "5";
     fullDigitToHalfDigit["\uFF16"] = "6";
     fullDigitToHalfDigit["\uFF17"] = "7";
     fullDigitToHalfDigit["\uFF18"] = "8";
     fullDigitToHalfDigit["\uFF19"] = "9";
     fullDigitToHalfDigit["\uFF10"] = "0";
 }
        /// <summary>
        /// Collapses the pattern-by-phrase counter <paramref name="positivePatternsAndWords"/>
        /// into one score per pattern by summing a per-phrase contribution over every
        /// phrase recorded for that pattern.
        /// </summary>
        /// <remarks>
        /// Each contribution starts at 1. When <paramref name="scorePhrasesInPatSelection"/> is set:
        /// <list type="bullet">
        /// <item><description>for the PhEvalInPat / PhEvalInPatLogP pattern scorings, a phrase that is in the
        /// other-semantic-class words or the common English words keeps the contribution 1; any other
        /// phrase gets the mean of the enabled phrase measures, and that per-phrase measure breakdown
        /// is stored in <c>phInPatScores</c>. Contributions are cached per phrase for this call;</description></item>
        /// <item><description>for the Logreg / LOGREGlogP pattern scorings, the contribution is one minus the
        /// classifier score of the phrase.</description></item>
        /// </list>
        /// The contribution is then optionally multiplied by <paramref name="scoringFunction"/> and
        /// optionally square-rooted before being added to the pattern's total.
        /// NOTE(review): the square root is controlled by <c>constVars.sqrtPatScore</c>; the
        /// <paramref name="sqrtPatScore"/> parameter is not read by this method - confirm that is intended.
        /// </remarks>
        /// <param name="label">Label whose patterns are being scored.</param>
        /// <param name="scoringFunction">Applied to each (pattern, phrase) pair when
        /// <paramref name="useFreqPhraseExtractedByPat"/> is true.</param>
        /// <param name="allCandidatePhrases">Phrases for which phrase-level features or classifier scores are computed.</param>
        /// <param name="positivePatternsAndWords">Pattern-by-phrase counter that is iterated to build the result.</param>
        /// <param name="sqrtPatScore">Not read by this method (see remarks).</param>
        /// <param name="scorePhrasesInPatSelection">Enables phrase-level scoring; when false every contribution starts at 1.</param>
        /// <param name="dictOddsWordWeights">Per-phrase weights used for the semantic-odds measure.</param>
        /// <param name="useFreqPhraseExtractedByPat">Whether to multiply each contribution by <paramref name="scoringFunction"/>.</param>
        /// <returns>A counter from pattern to its accumulated score.</returns>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        internal virtual ICounter <E> Convert2OneDim(string label, IToDoubleFunction <Pair <E, CandidatePhrase> > scoringFunction, ICollection <CandidatePhrase> allCandidatePhrases, TwoDimensionalCounter <E, CandidatePhrase> positivePatternsAndWords, bool
                                                     sqrtPatScore, bool scorePhrasesInPatSelection, ICounter <CandidatePhrase> dictOddsWordWeights, bool useFreqPhraseExtractedByPat)
        {
            ICounter <E> patterns = new ClassicCounter <E>();
            ICounter <CandidatePhrase> googleNgramNormScores     = new ClassicCounter <CandidatePhrase>();
            ICounter <CandidatePhrase> domainNgramNormScores     = new ClassicCounter <CandidatePhrase>();
            ICounter <CandidatePhrase> externalFeatWtsNormalized = new ClassicCounter <CandidatePhrase>();
            ICounter <CandidatePhrase> editDistanceFromOtherSemanticBinaryScores    = new ClassicCounter <CandidatePhrase>();
            ICounter <CandidatePhrase> editDistanceFromAlreadyExtractedBinaryScores = new ClassicCounter <CandidatePhrase>();
            double            externalWtsDefault = 0.5;
            ICounter <string> classifierScores   = null;

            // The scoring mode is fixed for the whole call, so decide it once instead of
            // repeating the same compound condition for every (pattern, phrase) pair.
            bool usePhraseEval = (patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPat) || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPatLogP)) && scorePhrasesInPatSelection;
            bool useClassifier = !usePhraseEval && (patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.Logreg) || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.LOGREGlogP)) && scorePhrasesInPatSelection;

            if (usePhraseEval)
            {
                // Pre-compute the enabled phrase-level features for every candidate phrase.
                foreach (CandidatePhrase gc in allCandidatePhrases)
                {
                    string g = gc.GetPhrase();
                    if (constVars.usePatternEvalEditDistOther)
                    {
                        editDistanceFromOtherSemanticBinaryScores.SetCount(gc, constVars.GetEditDistanceScoresOtherClassThreshold(label, g));
                    }
                    if (constVars.usePatternEvalEditDistSame)
                    {
                        editDistanceFromAlreadyExtractedBinaryScores.SetCount(gc, 1 - constVars.GetEditDistanceScoresThisClassThreshold(label, g));
                    }
                    if (constVars.usePatternEvalGoogleNgram)
                    {
                        googleNgramNormScores.SetCount(gc, PhraseScorer.GetGoogleNgramScore(gc));
                    }
                    if (constVars.usePatternEvalDomainNgram)
                    {
                        // calculate domain-ngram wts
                        if (Data.domainNGramRawFreq.ContainsKey(g))
                        {
                            System.Diagnostics.Debug.Assert((Data.rawFreq.ContainsKey(gc)));
                            domainNgramNormScores.SetCount(gc, scorePhrases.phraseScorer.GetDomainNgramScore(g));
                        }
                    }
                    if (constVars.usePatternEvalWordClass)
                    {
                        // Look the phrase up as written, then lower-cased. TryGetValue is required here:
                        // an int can never compare equal to null, so the previous "num == null" checks
                        // could not detect a missing entry and the fallbacks below were never taken.
                        int  clusterId;
                        bool hasCluster = constVars.GetWordClassClusters().TryGetValue(g, out clusterId) || constVars.GetWordClassClusters().TryGetValue(g.ToLower(), out clusterId);
                        if (hasCluster && constVars.distSimWeights[label].ContainsKey(clusterId))
                        {
                            externalFeatWtsNormalized.SetCount(gc, constVars.distSimWeights[label].GetCount(clusterId));
                        }
                        else
                        {
                            externalFeatWtsNormalized.SetCount(gc, externalWtsDefault);
                        }
                    }
                }
                if (constVars.usePatternEvalGoogleNgram)
                {
                    googleNgramNormScores = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(googleNgramNormScores, true, true, false);
                }
                if (constVars.usePatternEvalDomainNgram)
                {
                    domainNgramNormScores = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(domainNgramNormScores, true, true, false);
                }
                if (constVars.usePatternEvalWordClass)
                {
                    externalFeatWtsNormalized = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(externalFeatWtsNormalized, true, true, false);
                }
            }
            else if (useClassifier)
            {
                // Score all candidate phrases with a ScorePhrases instance configured from a
                // copy of props that selects the ScorePhrasesLearnFeatWt phrase scorer.
                Properties props2 = new Properties();
                props2.PutAll(props);
                props2.SetProperty("phraseScorerClass", "edu.stanford.nlp.patterns.ScorePhrasesLearnFeatWt");
                ScorePhrases scoreclassifier = new ScorePhrases(props2, constVars);
                System.Console.Out.WriteLine("file is " + props.GetProperty("domainNGramsFile"));
                ArgumentParser.FillOptions(typeof(Data), props2);
                classifierScores = scoreclassifier.phraseScorer.ScorePhrases(label, allCandidatePhrases, true);
            }

            // A phrase's contribution does not depend on the pattern, so compute it once per phrase.
            ICounter <CandidatePhrase> cachedScoresForThisIter = new ClassicCounter <CandidatePhrase>();

            foreach (KeyValuePair <E, ClassicCounter <CandidatePhrase> > en in positivePatternsAndWords.EntrySet())
            {
                foreach (KeyValuePair <CandidatePhrase, double> en2 in en.Value.EntrySet())
                {
                    CandidatePhrase word = en2.Key;
                    ICounter <ConstantsAndVariables.ScorePhraseMeasures> scoreslist = new ClassicCounter <ConstantsAndVariables.ScorePhraseMeasures>();
                    double score = 1;
                    if (usePhraseEval)
                    {
                        if (cachedScoresForThisIter.ContainsKey(word))
                        {
                            score = cachedScoresForThisIter.GetCount(word);
                        }
                        else
                        {
                            if (constVars.GetOtherSemanticClassesWords().Contains(word) || constVars.GetCommonEngWords().Contains(word))
                            {
                                score = 1;
                            }
                            else
                            {
                                if (constVars.usePatternEvalSemanticOdds)
                                {
                                    double semanticClassOdds = 1;
                                    if (dictOddsWordWeights.ContainsKey(word))
                                    {
                                        semanticClassOdds = 1 - dictOddsWordWeights.GetCount(word);
                                    }
                                    scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Semanticodds, semanticClassOdds);
                                }
                                if (constVars.usePatternEvalGoogleNgram)
                                {
                                    double gscore = 0;
                                    if (googleNgramNormScores.ContainsKey(word))
                                    {
                                        gscore = 1 - googleNgramNormScores.GetCount(word);
                                    }
                                    scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Googlengram, gscore);
                                }
                                if (constVars.usePatternEvalDomainNgram)
                                {
                                    double domainscore;
                                    if (domainNgramNormScores.ContainsKey(word))
                                    {
                                        domainscore = 1 - domainNgramNormScores.GetCount(word);
                                    }
                                    else
                                    {
                                        domainscore = 1 - scorePhrases.phraseScorer.GetPhraseWeightFromWords(domainNgramNormScores, word, scorePhrases.phraseScorer.OOVDomainNgramScore);
                                    }
                                    scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Domainngram, domainscore);
                                }
                                if (constVars.usePatternEvalWordClass)
                                {
                                    double externalFeatureWt = externalWtsDefault;
                                    if (externalFeatWtsNormalized.ContainsKey(word))
                                    {
                                        externalFeatureWt = 1 - externalFeatWtsNormalized.GetCount(word);
                                    }
                                    scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Distsim, externalFeatureWt);
                                }
                                if (constVars.usePatternEvalEditDistOther)
                                {
                                    System.Diagnostics.Debug.Assert(editDistanceFromOtherSemanticBinaryScores.ContainsKey(word), "How come no edit distance info for word " + word + string.Empty);
                                    scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Editdistother, editDistanceFromOtherSemanticBinaryScores.GetCount(word));
                                }
                                if (constVars.usePatternEvalEditDistSame)
                                {
                                    scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Editdistsame, editDistanceFromAlreadyExtractedBinaryScores.GetCount(word));
                                }
                                // taking average
                                score = Counters.Mean(scoreslist);
                                phInPatScores.SetCounter(word, scoreslist);
                            }
                            cachedScoresForThisIter.SetCount(word, score);
                        }
                    }
                    else if (useClassifier)
                    {
                        score = 1 - classifierScores.GetCount(word);
                    }
                    if (useFreqPhraseExtractedByPat)
                    {
                        score = score * scoringFunction.ApplyAsDouble(new Pair <E, CandidatePhrase>(en.Key, word));
                    }
                    // Reads the constVars flag, not the sqrtPatScore parameter (unchanged behavior).
                    if (constVars.sqrtPatScore)
                    {
                        patterns.IncrementCount(en.Key, Math.Sqrt(score));
                    }
                    else
                    {
                        patterns.IncrementCount(en.Key, score);
                    }
                }
            }
            return(patterns);
        }