/// <exception cref="System.IO.IOException"/>
        private void WriteObject(ObjectOutputStream stream)
        {
            //    log.info("\nBefore compression:");
            //    log.info("arg size: " + argCounter.size() + "  total: " + argCounter.totalCount());
            //    log.info("stop size: " + stopCounter.size() + "  total: " + stopCounter.totalCount());
            ClassicCounter <IntDependency> fullArgCounter = argCounter;

            argCounter = new ClassicCounter <IntDependency>();
            foreach (IntDependency dependency in fullArgCounter.KeySet())
            {
                if (dependency.head != wildTW && dependency.arg != wildTW && dependency.head.word != -1 && dependency.arg.word != -1)
                {
                    argCounter.IncrementCount(dependency, fullArgCounter.GetCount(dependency));
                }
            }
            ClassicCounter <IntDependency> fullStopCounter = stopCounter;

            stopCounter = new ClassicCounter <IntDependency>();
            foreach (IntDependency dependency_1 in fullStopCounter.KeySet())
            {
                if (dependency_1.head.word != -1)
                {
                    stopCounter.IncrementCount(dependency_1, fullStopCounter.GetCount(dependency_1));
                }
            }
            //    log.info("After compression:");
            //    log.info("arg size: " + argCounter.size() + "  total: " + argCounter.totalCount());
            //    log.info("stop size: " + stopCounter.size() + "  total: " + stopCounter.totalCount());
            stream.DefaultWriteObject();
            argCounter  = fullArgCounter;
            stopCounter = fullStopCounter;
        }
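WriteObject above temporarily swaps each counter for a filtered copy, serializes via DefaultWriteObject, and then restores the originals. The filter step is a plain keep-if-predicate copy; here is a minimal standalone sketch of that idiom using the same ClassicCounter API (the isCompressible predicate is a hypothetical stand-in for the wildcard/word checks above):

        // Sketch only: copy the entries that survive a predicate.
        private static ClassicCounter <IntDependency> FilterCounter(ClassicCounter <IntDependency> full, Func <IntDependency, bool> isCompressible)
        {
            ClassicCounter <IntDependency> kept = new ClassicCounter <IntDependency>();
            foreach (IntDependency d in full.KeySet())
            {
                if (isCompressible(d))
                {
                    kept.IncrementCount(d, full.GetCount(d));
                }
            }
            return(kept);
        }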
        public virtual ClassicCounter <L> ScoresOf(RVFDatum <L, F> example)
        {
            ClassicCounter <L> scores = new ClassicCounter <L>();

            Counters.AddInPlace(scores, priors);
            if (addZeroValued)
            {
                Counters.AddInPlace(scores, priorZero);
            }
            foreach (L l in labels)
            {
                double       score    = 0.0;
                ICounter <F> features = example.AsFeaturesCounter();
                foreach (F f in features.KeySet())
                {
                    int value = (int)features.GetCount(f);
                    score += Weight(l, f, value);
                    if (addZeroValued)
                    {
                        score -= Weight(l, f, zero);
                    }
                }
                scores.IncrementCount(l, score);
            }
            return(scores);
        }
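ScoresOf reads integer-valued feature counts from the datum (note the (int) cast above) and returns one additive score per label. A usage sketch, reusing the RVFDatum constructor shown in the Main example further down; nb stands for an already-trained classifier instance and is an assumption of this sketch:

            ClassicCounter <string> feats = new ClassicCounter <string>();
            feats.IncrementCount("fever", 2.0);
            feats.IncrementCount("cough", 1.0);
            RVFDatum <string, string> d = new RVFDatum <string, string>(feats, "flu");
            ClassicCounter <string> scores = nb.ScoresOf(d);   // one score per label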
        public override ICounter <E> Score()
        {
            ICounter <E> specificity = new ClassicCounter <E>();
            ICounter <E> sensitivity = new ClassicCounter <E>();

            if (p0Set.KeySet().Count == 0)
            {
                throw new Exception("how come p0set size is empty for " + p0 + "?");
            }
            foreach (KeyValuePair <E, ClassicCounter <CandidatePhrase> > en in patternsandWords4Label.EntrySet())
            {
                int common = CollectionUtils.Intersection(en.Value.KeySet(), p0Set.KeySet()).Count;
                if (common == 0)
                {
                    continue;
                }
                if (en.Value.KeySet().Count == 0)
                {
                    throw new Exception("how come counter for " + en.Key + " is empty?");
                }
                specificity.SetCount(en.Key, common / (double)en.Value.KeySet().Count);
                sensitivity.SetCount(en.Key, common / (double)p0Set.Size());
            }
            Counters.RetainNonZeros(specificity);
            Counters.RetainNonZeros(sensitivity);
            ICounter <E> add     = Counters.Add(sensitivity, specificity);
            ICounter <E> product = Counters.Product(sensitivity, specificity);

            Counters.RetainNonZeros(product);
            Counters.RetainKeys(product, add.KeySet());
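            // Scale(Division(product, add), 2) yields 2*s*p/(s+p) per key:
            // the harmonic mean (F1) of sensitivity and specificity.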
            ICounter <E> finalPat = Counters.Scale(Counters.Division(product, add), 2);

            return(finalPat);
        }
            public virtual void ComputeFinalValues()
            {
                double denom = (double)numTrees;

                meanDepth           = depth2 / denom;
                meanLength          = length2 / denom;
                meanBreadth         = breadth2 / denom;
                meanConstituents    = phrasalBranchingNum2.TotalCount() / denom;
                meanBranchingFactor = phrasalBranching2.TotalCount() / phrasalBranchingNum2.TotalCount();
                //Compute *actual* stddev (we iterate over the whole population)
                foreach (int d in depths)
                {
                    stddevDepth += Math.Pow(d - meanDepth, 2);
                }
                stddevDepth = Math.Sqrt(stddevDepth / denom);
                foreach (int l in lengths)
                {
                    stddevLength += Math.Pow(l - meanLength, 2);
                }
                stddevLength = Math.Sqrt(stddevLength / denom);
                foreach (int b in breadths)
                {
                    stddevBreadth += Math.Pow(b - meanBreadth, 2);
                }
                stddevBreadth        = Math.Sqrt(stddevBreadth / denom);
                meanBranchingByLabel = new ClassicCounter <string>();
                foreach (string label in phrasalBranching2.KeySet())
                {
                    double mean = phrasalBranching2.GetCount(label) / phrasalBranchingNum2.GetCount(label);
                    meanBranchingByLabel.IncrementCount(label, mean);
                }
                oovWords = Generics.NewHashSet(words.KeySet());
                oovWords.RemoveAll(trainVocab);
                OOVRate = (double)oovWords.Count / (double)words.KeySet().Count;
            }
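The three stddev loops in ComputeFinalValues repeat one pattern: the population standard deviation over the full sample. A small helper, sketched here rather than taken from the original class, makes the formula explicit:

            // Population standard deviation: sqrt(sum((x - mean)^2) / n).
            // Sketch only; the original code inlines this computation three times.
            private static double PopulationStdDev(ICollection <int> xs, double mean, double n)
            {
                double sumSq = 0.0;
                foreach (int x in xs)
                {
                    sumSq += Math.Pow(x - mean, 2);
                }
                return(Math.Sqrt(sumSq / n));
            }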
        /// <summary>Featurize a given sentence.</summary>
        /// <param name="sentence">The sentence to featurize.</param>
        /// <returns>A counter encoding the featurized sentence.</returns>
        private static ICounter <string> Featurize(ICoreMap sentence)
        {
            ClassicCounter <string> features = new ClassicCounter <string>();
            string lastLemma = "^";

            foreach (CoreLabel token in sentence.Get(typeof(CoreAnnotations.TokensAnnotation)))
            {
                string lemma = token.Lemma().ToLower();
                if (number.Matcher(lemma).Matches())
                {
                    features.IncrementCount("**num**");
                }
                else
                {
                    features.IncrementCount(lemma);
                }
                if (alpha.Matcher(lemma).Matches())
                {
                    features.IncrementCount(lastLemma + "__" + lemma);
                    lastLemma = lemma;
                }
            }
            features.IncrementCount(lastLemma + "__$");
            return(features);
        }
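For example, assuming the number pattern matches digit-only lemmas and the alpha pattern matches alphabetic ones, the lemma sequence "the price fell 5" yields the unigrams the, price, fell, **num**, the bigrams ^__the, the__price, price__fell, and the closing bigram fell__$ (the non-alphabetic "5" does not advance lastLemma).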
 // Records the number of times word/tag pair was seen in training data.
 // Counts of each tag (stored as a Label) on unknown words.
 // tag (Label) --> signature --> count
 public override void InitializeTraining(Options op, ILexicon lex, IIndex <string> wordIndex, IIndex <string> tagIndex, double totalTrees)
 {
     base.InitializeTraining(op, lex, wordIndex, tagIndex, totalTrees);
     seenCounter   = new ClassicCounter <IntTaggedWord>();
     unSeenCounter = new ClassicCounter <IntTaggedWord>();
     tagHash       = Generics.NewHashMap();
     tc            = new ClassicCounter <ILabel>();
     c             = Generics.NewHashMap();
     seenEnd       = Generics.NewHashSet();
     useEnd        = (op.lexOptions.unknownSuffixSize > 0 && op.lexOptions.useUnknownWordSignatures > 0);
     useFirstCap   = op.lexOptions.useUnknownWordSignatures > 0;
     useGT         = (op.lexOptions.useUnknownWordSignatures == 0);
     useFirst      = false;
     if (useFirst)
     {
         log.Info("Including first letter for unknown words.");
     }
     if (useFirstCap)
     {
         log.Info("Including whether first letter is capitalized for unknown words");
     }
     if (useEnd)
     {
         log.Info("Classing unknown word as the average of their equivalents by identity of last " + op.lexOptions.unknownSuffixSize + " letters.");
     }
     if (useGT)
     {
         log.Info("Using Good-Turing smoothing for unknown words.");
     }
     this.indexToStartUnkCounting = (totalTrees * op.trainOptions.fractionBeforeUnseenCounting);
     this.unknownGTTrainer        = (useGT) ? new UnknownGTTrainer() : null;
     this.model = BuildUWM();
 }
        public override void Train(TaggedWord tw, int loc, double weight)
        {
            if (useGT)
            {
                unknownGTTrainer.Train(tw, weight);
            }
            // scan data
            string word      = tw.Word();
            string subString = model.GetSignature(word, loc);
            ILabel tag       = new Tag(tw.Tag());

            if (!c.Contains(tag))
            {
                c[tag] = new ClassicCounter <string>();
            }
            c[tag].IncrementCount(subString, weight);
            tc.IncrementCount(tag, weight);
            seenEnd.Add(subString);
            string        tagStr = tw.Tag();
            IntTaggedWord iW     = new IntTaggedWord(word, IntTaggedWord.Any, wordIndex, tagIndex);

            seenCounter.IncrementCount(iW, weight);
            if (treesRead > indexToStartUnkCounting)
            {
                // start doing this once some way through trees;
                // treesRead is 1 based counting
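                // a count below 2 at this point means this is (at most) the word's
                // first occurrence, so it is tallied as an unseen/unknown event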
                if (seenCounter.GetCount(iW) < 2)
                {
                    IntTaggedWord iT = new IntTaggedWord(IntTaggedWord.Any, tagStr, wordIndex, tagIndex);
                    unSeenCounter.IncrementCount(iT, weight);
                    unSeenCounter.IncrementCount(UnknownWordModelTrainerConstants.NullItw, weight);
                }
            }
        }
        /// <summary>Method to convert features from counts to L1-normalized TFIDF based features</summary>
        /// <param name="datum">with a collection of features.</param>
        /// <param name="featureDocCounts">a counter of doc-count for each feature.</param>
        /// <returns>RVFDatum with l1-normalized tf-idf features.</returns>
        public virtual RVFDatum <L, F> GetL1NormalizedTFIDFDatum(IDatum <L, F> datum, ICounter <F> featureDocCounts)
        {
            ICounter <F> tfidfFeatures = new ClassicCounter <F>();

            foreach (F feature in datum.AsFeatures())
            {
                if (featureDocCounts.ContainsKey(feature))
                {
                    tfidfFeatures.IncrementCount(feature, 1.0);
                }
            }
            double l1norm = 0;

            foreach (F feature_1 in tfidfFeatures.KeySet())
            {
                double idf = Math.Log(((double)(this.Size() + 1)) / (featureDocCounts.GetCount(feature_1) + 0.5));
                double tf  = tfidfFeatures.GetCount(feature_1);
                tfidfFeatures.SetCount(feature_1, tf * idf);
                l1norm += tf * idf;
            }
            foreach (F feature_2 in tfidfFeatures.KeySet())
            {
                double tfidf = tfidfFeatures.GetCount(feature_2);
                tfidfFeatures.SetCount(feature_2, tfidf / l1norm);
            }
            RVFDatum <L, F> rvfDatum = new RVFDatum <L, F>(tfidfFeatures, datum.Label());

            return(rvfDatum);
        }
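As a worked example: with Size() = 99 documents and a feature occurring in 10 of them, idf = log(100 / 10.5) ≈ 2.25; after the final loop the tf-idf weights sum to 1, i.e. the datum is L1-normalized.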
        public virtual ICounter <L> ProbabilityOf(IDatum <L, F> example)
        {
            // calculate the feature indices and feature values
            int[]    featureIndices = LogisticUtils.IndicesOf(example.AsFeatures(), featureIndex);
            double[] featureValues;
            if (example is RVFDatum <object, object> )
            {
                ICollection <double> featureValuesCollection = ((RVFDatum <object, object>)example).AsFeaturesCounter().Values();
                featureValues = LogisticUtils.ConvertToArray(featureValuesCollection);
            }
            else
            {
                featureValues = new double[example.AsFeatures().Count];
                Arrays.Fill(featureValues, 1.0);
            }
            // calculate probability of each class
            ICounter <L> result     = new ClassicCounter <L>();
            int          numClasses = labelIndex.Size();

            double[] sigmoids = LogisticUtils.CalculateSigmoids(weights, featureIndices, featureValues);
            for (int c = 0; c < numClasses; c++)
            {
                L label = labelIndex.Get(c);
                result.IncrementCount(label, sigmoids[c]);
            }
            return(result);
        }
        public virtual ICounter <string> GetTopSpeakers(IList <Sieve.MentionData> closestMentions, IList <Sieve.MentionData> closestMentionsBackward, Person.Gender gender, ICoreMap quote, bool overrideGender)
        {
            ICounter <string> topSpeakerInRange               = new ClassicCounter <string>();
            ICounter <string> topSpeakerInRangeIgnoreGender   = new ClassicCounter <string>();
            ICollection <Sieve.MentionData> backwardsMentions = new HashSet <Sieve.MentionData>(closestMentionsBackward);

            foreach (Sieve.MentionData mention in closestMentions)
            {
                double weight = backwardsMentions.Contains(mention) ? BackwardWeight : ForwardWeight;
                if (mention.type.Equals(Name))
                {
                    if (!characterMap.Keys.Contains(mention.text))
                    {
                        continue;
                    }
                    Person p = characterMap[mention.text][0];
                    if ((gender == Person.Gender.Male && p.gender == Person.Gender.Male) || (gender == Person.Gender.Female && p.gender == Person.Gender.Female) || (gender == Person.Gender.Unk))
                    {
                        topSpeakerInRange.IncrementCount(p.name, weight);
                    }
                    topSpeakerInRangeIgnoreGender.IncrementCount(p.name, weight);
                    if (closestMentions.Count == 128 && closestMentionsBackward.Count == 94)
                    {
                        System.Console.Out.WriteLine(p.name + " " + weight + " name");
                    }
                }
                else
                {
                    if (mention.type.Equals(Pronoun))
                    {
                        int    charBeginKey = doc.Get(typeof(CoreAnnotations.TokensAnnotation))[mention.begin].BeginPosition();
                        Person p            = DoCoreference(charBeginKey, quote);
                        if (p != null)
                        {
                            if ((gender == Person.Gender.Male && p.gender == Person.Gender.Male) || (gender == Person.Gender.Female && p.gender == Person.Gender.Female) || (gender == Person.Gender.Unk))
                            {
                                topSpeakerInRange.IncrementCount(p.name, weight);
                            }
                            topSpeakerInRangeIgnoreGender.IncrementCount(p.name, weight);
                            if (closestMentions.Count == 128 && closestMentionsBackward.Count == 94)
                            {
                                System.Console.Out.WriteLine(p.name + " " + weight + " pronoun");
                            }
                        }
                    }
                }
            }
            if (topSpeakerInRange.Size() > 0)
            {
                return(topSpeakerInRange);
            }
            else
            {
                if (gender != Person.Gender.Unk && !overrideGender)
                {
                    return(topSpeakerInRange);
                }
            }
            return(topSpeakerInRangeIgnoreGender);
        }
        private NaiveBayesClassifier <L, F> TrainClassifier(int[][] data, int[] labels, int numFeatures, int numClasses, IIndex <L> labelIndex, IIndex <F> featureIndex)
        {
            ICollection <L> labelSet = Generics.NewHashSet();

            NaiveBayesClassifierFactory.NBWeights nbWeights = TrainWeights(data, labels, numFeatures, numClasses);
            ICounter <L> priors = new ClassicCounter <L>();

            double[] pr = nbWeights.priors;
            for (int i = 0; i < pr.Length; i++)
            {
                priors.IncrementCount(labelIndex.Get(i), pr[i]);
                labelSet.Add(labelIndex.Get(i));
            }
            ICounter <Pair <Pair <L, F>, Number> > weightsCounter = new ClassicCounter <Pair <Pair <L, F>, Number> >();

            double[][][] wts = nbWeights.weights;
            for (int c = 0; c < numClasses; c++)
            {
                L label = labelIndex.Get(c);
                for (int f = 0; f < numFeatures; f++)
                {
                    F           feature = featureIndex.Get(f);
                    Pair <L, F> p       = new Pair <L, F>(label, feature);
                    for (int val = 0; val < wts[c][f].Length; val++)
                    {
                        Pair <Pair <L, F>, Number> key = new Pair <Pair <L, F>, Number>(p, int.Parse(val));
                        weightsCounter.IncrementCount(key, wts[c][f][val]);
                    }
                }
            }
            return(new NaiveBayesClassifier <L, F>(weightsCounter, priors, labelSet));
        }
        public virtual RVFDatum <L, F> ScaleDatumGaussian(RVFDatum <L, F> datum)
        {
            // scale this dataset before scaling the datum
            if (means == null || stdevs == null)
            {
                ScaleFeaturesGaussian();
            }
            ICounter <F> scaledFeatures = new ClassicCounter <F>();

            foreach (F feature in datum.AsFeatures())
            {
                int fID = this.featureIndex.IndexOf(feature);
                if (fID >= 0)
                {
                    double oldVal = datum.AsFeaturesCounter().GetCount(feature);
                    double newVal;
                    if (stdevs[fID] != 0)
                    {
                        newVal = (oldVal - means[fID]) / stdevs[fID];
                    }
                    else
                    {
                        newVal = oldVal;
                    }
                    scaledFeatures.IncrementCount(feature, newVal);
                }
            }
            return(new RVFDatum <L, F>(scaledFeatures, datum.Label()));
        }
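For instance, a feature with raw value 5.0, mean 3.0, and standard deviation 2.0 is rescaled to (5.0 - 3.0) / 2.0 = 1.0; zero-variance features pass through unchanged, and features unseen at training time (fID < 0) are dropped.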
        public virtual void Train(TaggedWord tw, int loc, double weight)
        {
            uwModelTrainer.Train(tw, loc, weight);
            IntTaggedWord iTW = new IntTaggedWord(tw.Word(), tw.Tag(), wordIndex, tagIndex);

            seenCounter.IncrementCount(iTW, weight);
            IntTaggedWord iT = new IntTaggedWord(nullWord, iTW.tag);

            seenCounter.IncrementCount(iT, weight);
            IntTaggedWord iW = new IntTaggedWord(iTW.word, nullTag);

            seenCounter.IncrementCount(iW, weight);
            IntTaggedWord i = new IntTaggedWord(nullWord, nullTag);

            seenCounter.IncrementCount(i, weight);
            // rules.add(iTW);
            tags.Add(iT);
            words.Add(iW);
            string            tag     = tw.Tag();
            string            baseTag = op.Langpack().BasicCategory(tag);
            ICounter <string> counts  = baseTagCounts[baseTag];

            if (counts == null)
            {
                counts = new ClassicCounter <string>();
                baseTagCounts[baseTag] = counts;
            }
            counts.IncrementCount(tag, weight);
        }
        private static void ModifyUsingCoreNLPNER(Annotation doc)
        {
            Properties ann = new Properties();

            ann.SetProperty("annotators", "pos, lemma, ner");
            StanfordCoreNLP pipeline = new StanfordCoreNLP(ann, false);

            pipeline.Annotate(doc);
            foreach (ICoreMap sentence in doc.Get(typeof(CoreAnnotations.SentencesAnnotation)))
            {
                IList <EntityMention> entities = sentence.Get(typeof(MachineReadingAnnotations.EntityMentionsAnnotation));
                if (entities != null)
                {
                    IList <CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
                    foreach (EntityMention en in entities)
                    {
                        //System.out.println("old ner tag for " + en.getExtentString() + " was " + en.getType());
                        Span s = en.GetExtent();
                        ICounter <string> allNertagforSpan = new ClassicCounter <string>();
                        for (int i = s.Start(); i < s.End(); i++)
                        {
                            allNertagforSpan.IncrementCount(tokens[i].Ner());
                        }
                        string entityNertag = Counters.Argmax(allNertagforSpan);
                        en.SetType(entityNertag);
                    }
                }
            }
        }
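The per-span vote above only needs Counters.Argmax. A self-contained sketch of that step:

            ICounter <string> votes = new ClassicCounter <string>();
            votes.IncrementCount("PERSON");
            votes.IncrementCount("PERSON");
            votes.IncrementCount("ORGANIZATION");
            string majorityTag = Counters.Argmax(votes);   // "PERSON"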
        public static void Main(string[] args)
        {
            Edu.Stanford.Nlp.Classify.RVFDataset <string, string> data = new Edu.Stanford.Nlp.Classify.RVFDataset <string, string>();
            ClassicCounter <string> c1 = new ClassicCounter <string>();

            c1.IncrementCount("fever", 3.5);
            c1.IncrementCount("cough", 1.1);
            c1.IncrementCount("congestion", 4.2);
            ClassicCounter <string> c2 = new ClassicCounter <string>();

            c2.IncrementCount("fever", 1.5);
            c2.IncrementCount("cough", 2.1);
            c2.IncrementCount("nausea", 3.2);
            ClassicCounter <string> c3 = new ClassicCounter <string>();

            c3.IncrementCount("cough", 2.5);
            c3.IncrementCount("congestion", 3.2);
            data.Add(new RVFDatum <string, string>(c1, "cold"));
            data.Add(new RVFDatum <string, string>(c2, "flu"));
            data.Add(new RVFDatum <string, string>(c3, "cold"));
            data.SummaryStatistics();
            LinearClassifierFactory <string, string> factory = new LinearClassifierFactory <string, string>();

            factory.UseQuasiNewton();
            LinearClassifier <string, string> c  = factory.TrainClassifier(data);
            ClassicCounter <string>           c4 = new ClassicCounter <string>();

            c4.IncrementCount("cough", 2.3);
            c4.IncrementCount("fever", 1.3);
            RVFDatum <string, string> datum = new RVFDatum <string, string>(c4);

            c.JustificationOf((IDatum <string, string>)datum);
        }
        /// <summary>
        /// Calculate sister annotation statistics suitable for doing
        /// selective sister splitting in the PCFGParser inside the
        /// FactoredParser.
        /// </summary>
        /// <param name="args">One argument: path to the Treebank</param>
        public static void Main(string[] args)
        {
            ClassicCounter <string> c = new ClassicCounter <string>();

            c.SetCount("A", 0);
            c.SetCount("B", 1);
            double d = Counters.KlDivergence(c, c);

            System.Console.Out.WriteLine("KL Divergence: " + d);
            string encoding = "UTF-8";

            if (args.Length > 1)
            {
                encoding = args[1];
            }
            if (args.Length < 1)
            {
                System.Console.Out.WriteLine("Usage: ParentAnnotationStats treebankPath");
            }
            else
            {
                SisterAnnotationStats pas = new SisterAnnotationStats();
                Treebank treebank         = new DiskTreebank(null, encoding);
                treebank.LoadPath(args[0]);
                treebank.Apply(pas);
                pas.PrintStats();
            }
        }
        public virtual void RunCoref(Document document)
        {
            IDictionary <Pair <int, int>, bool> mentionPairs = CorefUtils.GetUnlabeledMentionPairs(document);

            if (mentionPairs.Count == 0)
            {
                return;
            }
            Compressor <string>         compressor           = new Compressor <string>();
            DocumentExamples            examples             = extractor.Extract(0, document, mentionPairs, compressor);
            ICounter <Pair <int, int> > classificationScores = new ClassicCounter <Pair <int, int> >();
            ICounter <Pair <int, int> > rankingScores        = new ClassicCounter <Pair <int, int> >();
            ICounter <int> anaphoricityScores = new ClassicCounter <int>();

            foreach (Example example in examples.examples)
            {
                CorefUtils.CheckForInterrupt();
                Pair <int, int> mentionPair = new Pair <int, int>(example.mentionId1, example.mentionId2);
                classificationScores.IncrementCount(mentionPair, classificationModel.Predict(example, examples.mentionFeatures, compressor));
                rankingScores.IncrementCount(mentionPair, rankingModel.Predict(example, examples.mentionFeatures, compressor));
                if (!anaphoricityScores.ContainsKey(example.mentionId2))
                {
                    anaphoricityScores.IncrementCount(example.mentionId2, anaphoricityModel.Predict(new Example(example, false), examples.mentionFeatures, compressor));
                }
            }
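            // The two null arguments to Collectors.ToMap below appear to be residue of
            // the automated Java -> C# conversion; the Java original passes key/value
            // mapper lambdas at this call site.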
            ClustererDataLoader.ClustererDoc doc = new ClustererDataLoader.ClustererDoc(0, classificationScores, rankingScores, anaphoricityScores, mentionPairs, null, document.predictedMentionsByID.Stream().Collect(Collectors.ToMap(null, null)));
            foreach (Pair <int, int> mentionPair_1 in clusterer.GetClusterMerges(doc))
            {
                CorefUtils.MergeCoreferenceClusters(mentionPair_1, document);
            }
        }
        public virtual EntityMention MakeEntityMention(ICoreMap sentence, int start, int end, string label, string identifier)
        {
            Span   span    = new Span(start, end);
            string type    = null;
            string subtype = null;

            if (!label.StartsWith("B-") && !label.StartsWith("I-"))
            {
                type    = label;
                subtype = null;
            }
            else
            {
                // TODO: add support for subtypes! (needed at least in ACE)
                type    = Sharpen.Runtime.Substring(label, 2);
                subtype = null;
            }
            // TODO: add support for subtypes! (needed at least in ACE)
            EntityMention     entity = entityMentionFactory.ConstructEntityMention(identifier, sentence, span, span, type, subtype, null);
            ICounter <string> probs  = new ClassicCounter <string>();

            probs.SetCount(entity.GetType(), 1.0);
            entity.SetTypeProbabilities(probs);
            return(entity);
        }
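So a BIO-style label such as "B-PER" or "I-PER" is stripped to the bare type "PER", while a label without a prefix is used unchanged; the mention's type distribution is then a point mass (probability 1.0) on that type.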
        public virtual void TestGetDistributionFromLogValues()
        {
            ICounter <string> c1 = new ClassicCounter <string>();

            c1.SetCount("p", 1.0);
            c1.SetCount("q", 2.0);
            c1.SetCount("r", 3.0);
            c1.SetCount("s", 4.0);
            // take log
            Counters.LogInPlace(c1);
            // now call distribution
            Distribution <string> distribution = Distribution.GetDistributionFromLogValues(c1);

            // test
            NUnit.Framework.Assert.AreEqual(distribution.KeySet().Count, 4);
            // size
            // keys
            NUnit.Framework.Assert.AreEqual(distribution.ContainsKey("p"), true);
            NUnit.Framework.Assert.AreEqual(distribution.ContainsKey("q"), true);
            NUnit.Framework.Assert.AreEqual(distribution.ContainsKey("r"), true);
            NUnit.Framework.Assert.AreEqual(distribution.ContainsKey("s"), true);
            // values
            NUnit.Framework.Assert.AreEqual(distribution.GetCount("p"), 1.0E-1, 1E-10);
            NUnit.Framework.Assert.AreEqual(distribution.GetCount("q"), 2.0E-1, 1E-10);
            NUnit.Framework.Assert.AreEqual(distribution.GetCount("r"), 3.0E-1, 1E-10);
            NUnit.Framework.Assert.AreEqual(distribution.GetCount("s"), 4.0E-1, 1E-10);
        }
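The expected values follow directly: exponentiating the logs recovers 1, 2, 3, 4, which sum to 10, so the normalized distribution is 0.1, 0.2, 0.3, 0.4.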
            public override Pair <double, double> GetScore(IList <IList <int> > clusters, IDictionary <int, IList <int> > mentionToGold)
            {
                double num = 0;
                int    dem = 0;

                foreach (IList <int> c in clusters)
                {
                    if (c.Count == 1)
                    {
                        continue;
                    }
                    ICounter <IList <int> > goldCounts = new ClassicCounter <IList <int> >();
                    double correct = 0;
                    foreach (int m in c)
                    {
                        IList <int> goldCluster = mentionToGold[m];
                        if (goldCluster != null)
                        {
                            goldCounts.IncrementCount(goldCluster);
                        }
                    }
                    foreach (KeyValuePair <IList <int>, double> e in goldCounts.EntrySet())
                    {
                        if (e.Key.Count != 1)
                        {
                            correct += e.Value * e.Value;
                        }
                    }
                    num += correct / c.Count;
                    dem += c.Count;
                }
                return(new Pair <double, double>(num, (double)dem));
            }
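Each predicted cluster c of size > 1 contributes sum_g |c ∩ g|^2 / |c| to the numerator (counting only gold clusters g with more than one mention) and |c| to the denominator.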
 public override IUnknownWordModel FinishTraining()
 {
     // Map<String,Float> unknownGT = null;
     if (useGT)
     {
         unknownGTTrainer.FinishTraining();
     }
     // unknownGT = unknownGTTrainer.unknownGT;
     foreach (ILabel tagLab in c.Keys)
     {
         // outer iteration is over tags as Labels
         ClassicCounter <string> wc = c[tagLab];
         // counts for words given a tag
         if (!tagHash.Contains(tagLab))
         {
             tagHash[tagLab] = new ClassicCounter <string>();
         }
         // the UNKNOWN first character is assumed to be seen once in each tag;
         // this is sort of broken, but it can be regarded as a Dirichlet prior
         // (compare the suffix-based model's FinishTraining below)
         tc.IncrementCount(tagLab);
         wc.SetCount(UnknownWordModelTrainerConstants.unknown, 1.0);
         // inner iteration is over words  as strings
         foreach (string first in wc.KeySet())
         {
             double prob = Math.Log(((wc.GetCount(first))) / tc.GetCount(tagLab));
             tagHash[tagLab].SetCount(first, prob);
         }
     }
     //if (Test.verbose)
     //EncodingPrintWriter.out.println(tag + " rewrites as " + first + " first char with probability " + prob,encoding);
     return(model);
 }
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        private void ReadObject(ObjectInputStream stream)
        {
            stream.DefaultReadObject();
            //    log.info("Before decompression:");
            //    log.info("arg size: " + argCounter.size() + "  total: " + argCounter.totalCount());
            //    log.info("stop size: " + stopCounter.size() + "  total: " + stopCounter.totalCount());
            ClassicCounter <IntDependency> compressedArgC = argCounter;

            argCounter = new ClassicCounter <IntDependency>();
            ClassicCounter <IntDependency> compressedStopC = stopCounter;

            stopCounter = new ClassicCounter <IntDependency>();
            foreach (IntDependency d in compressedArgC.KeySet())
            {
                double count = compressedArgC.GetCount(d);
                ExpandArg(d, d.distance, count);
            }
            foreach (IntDependency d_1 in compressedStopC.KeySet())
            {
                double count = compressedStopC.GetCount(d_1);
                ExpandStop(d_1, d_1.distance, count, false);
            }
            //    log.info("After decompression:");
            //    log.info("arg size: " + argCounter.size() + "  total: " + argCounter.totalCount());
            //    log.info("stop size: " + stopCounter.size() + "  total: " + stopCounter.totalCount());
            expandDependencyMap = null;
        }
        private static void LoadSignatures(string file, IDictionary <string, ICounter <string> > sigs)
        {
            BufferedReader reader = null;

            try
            {
                reader = IOUtils.ReaderFromString(file);
                while (reader.Ready())
                {
                    string[]          split = reader.ReadLine().Split("\t");
                    ICounter <string> cntr  = new ClassicCounter <string>();
                    sigs[split[0]] = cntr;
                    for (int i = 1; i < split.Length; i = i + 2)
                    {
                        cntr.SetCount(split[i], double.Parse(split[i + 1]));
                    }
                }
            }
            catch (IOException e)
            {
                throw new Exception(e.Message, e);
            }
            finally
            {
                IOUtils.CloseIgnoringExceptions(reader);
            }
        }
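Each line of the signature file is therefore expected to look like word<TAB>sig1<TAB>3.0<TAB>sig2<TAB>1.5: a key in the first field, followed by alternating signature/count pairs.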
        public override void PrintResults(PrintWriter pw, IList <ICoreMap> goldStandard, IList <ICoreMap> extractorOutput)
        {
            ResultsPrinter.Align(goldStandard, extractorOutput);
            // the mention factory cannot be null here
            System.Diagnostics.Debug.Assert(relationMentionFactory != null, "ERROR: RelationExtractorResultsPrinter.relationMentionFactory cannot be null in printResults!");
            // Count predicted-actual relation type pairs
            ICounter <Pair <string, string> > results    = new ClassicCounter <Pair <string, string> >();
            ClassicCounter <string>           labelCount = new ClassicCounter <string>();

            // TODO: assumes binary relations
            for (int goldSentenceIndex = 0; goldSentenceIndex < goldStandard.Count; goldSentenceIndex++)
            {
                foreach (RelationMention goldRelation in AnnotationUtils.GetAllRelations(relationMentionFactory, goldStandard[goldSentenceIndex], createUnrelatedRelations))
                {
                    ICoreMap extractorSentence = extractorOutput[goldSentenceIndex];
                    IList <RelationMention> extractorRelations = AnnotationUtils.GetRelations(relationMentionFactory, extractorSentence, goldRelation.GetArg(0), goldRelation.GetArg(1));
                    labelCount.IncrementCount(goldRelation.GetType());
                    foreach (RelationMention extractorRelation in extractorRelations)
                    {
                        results.IncrementCount(new Pair <string, string>(extractorRelation.GetType(), goldRelation.GetType()));
                    }
                }
            }
            PrintResultsInternal(pw, results, labelCount);
        }
        /// <summary>Destructively modifies the input and returns it as a convenience.</summary>
        public virtual Pair <UnaryGrammar, BinaryGrammar> Apply(Pair <UnaryGrammar, BinaryGrammar> bgug)
        {
            Alpha = trainOptions.ruleSmoothingAlpha;
            ICounter <string> symWeights = new ClassicCounter <string>();
            ICounter <string> symCounts  = new ClassicCounter <string>();

            //Tally unary rules
            foreach (UnaryRule rule in bgug.First())
            {
                if (!tagIndex.Contains(rule.parent))
                {
                    UpdateCounters(rule, symWeights, symCounts);
                }
            }
            //Tally binary rules
            foreach (BinaryRule rule_1 in bgug.Second())
            {
                UpdateCounters(rule_1, symWeights, symCounts);
            }
            //Compute smoothed rule scores, unary
            foreach (UnaryRule rule_2 in bgug.First())
            {
                if (!tagIndex.Contains(rule_2.parent))
                {
                    rule_2.score = SmoothRuleWeight(rule_2, symWeights, symCounts);
                }
            }
            //Compute smoothed rule scores, binary
            foreach (BinaryRule rule_3 in bgug.Second())
            {
                rule_3.score = SmoothRuleWeight(rule_3, symWeights, symCounts);
            }
            return(bgug);
        }
 public override IUnknownWordModel FinishTraining()
 {
     if (useGT)
     {
         unknownGTTrainer.FinishTraining();
     }
     foreach (KeyValuePair <ILabel, ClassicCounter <string> > entry in c)
     {
         /* outer iteration is over tags */
         ILabel key = entry.Key;
         ClassicCounter <string> wc = entry.Value;
         // counts for words given a tag
         if (!tagHash.Contains(key))
         {
             tagHash[key] = new ClassicCounter <string>();
         }
         /* the UNKNOWN sequence is assumed to be seen once in each tag */
         // This is sort of broken, but you can regard it as a Dirichlet prior.
         tc.IncrementCount(key);
         wc.SetCount(UnknownWordModelTrainerConstants.unknown, 1.0);
         /* inner iteration is over words */
         foreach (string end in wc.KeySet())
         {
             double prob = Math.Log((wc.GetCount(end)) / (tc.GetCount(key)));
             // p(sig|tag)
             tagHash[key].SetCount(end, prob);
         }
     }
     //if (Test.verbose)
     //EncodingPrintWriter.out.println(tag + " rewrites as " + end + " endchar with probability " + prob,encoding);
     return(model);
 }
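Because the UNKNOWN entry is seeded with count 1.0 and tc is incremented once per tag, each stored value is log(count(sig, tag) / total(tag)), a log conditional probability p(sig|tag); e.g. a signature seen twice under a tag with total count 8 stores log(0.25) ≈ -1.386.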
        private void PrintResultsInternal(PrintWriter pw, ICounter <Pair <string, string> > results, ClassicCounter <string> labelCount)
        {
            ClassicCounter <string> correct         = new ClassicCounter <string>();
            ClassicCounter <string> predictionCount = new ClassicCounter <string>();
            bool countGoldLabels = false;

            if (labelCount == null)
            {
                labelCount      = new ClassicCounter <string>();
                countGoldLabels = true;
            }
            foreach (Pair <string, string> predictedActual in results.KeySet())
            {
                string predicted = predictedActual.first;
                string actual    = predictedActual.second;
                if (predicted.Equals(actual))
                {
                    correct.IncrementCount(actual, results.GetCount(predictedActual));
                }
                predictionCount.IncrementCount(predicted, results.GetCount(predictedActual));
                if (countGoldLabels)
                {
                    labelCount.IncrementCount(actual, results.GetCount(predictedActual));
                }
            }
            DecimalFormat formatter = new DecimalFormat();

            formatter.SetMaximumFractionDigits(1);
            formatter.SetMinimumFractionDigits(1);
            double totalCount     = 0;
            double totalCorrect   = 0;
            double totalPredicted = 0;

            pw.Println("Label\tCorrect\tPredict\tActual\tPrecn\tRecall\tF");
            IList <string> labels = new List <string>(labelCount.KeySet());

            labels.Sort();
            foreach (string label in labels)
            {
                double numcorrect = correct.GetCount(label);
                double predicted  = predictionCount.GetCount(label);
                double trueCount  = labelCount.GetCount(label);
                double precision  = (predicted > 0) ? (numcorrect / predicted) : 0;
                double recall     = numcorrect / trueCount;
                double f          = (precision + recall > 0) ? 2 * precision * recall / (precision + recall) : 0.0;
                pw.Println(StringUtils.PadOrTrim(label, MaxLabelLength) + "\t" + numcorrect + "\t" + predicted + "\t" + trueCount + "\t" + formatter.Format(precision * 100) + "\t" + formatter.Format(100 * recall) + "\t" + formatter.Format(100 * f));
                if (!RelationMention.IsUnrelatedLabel(label))
                {
                    totalCount     += trueCount;
                    totalCorrect   += numcorrect;
                    totalPredicted += predicted;
                }
            }
            double precision_1 = (totalPredicted > 0) ? (totalCorrect / totalPredicted) : 0;
            double recall_1    = totalCorrect / totalCount;
            double f_1         = (totalPredicted > 0 && totalCorrect > 0) ? 2 * precision_1 * recall_1 / (precision_1 + recall_1) : 0.0;

            pw.Println("Total\t" + totalCorrect + "\t" + totalPredicted + "\t" + totalCount + "\t" + formatter.Format(100 * precision_1) + "\t" + formatter.Format(100 * recall_1) + "\t" + formatter.Format(100 * f_1));
        }
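As a check on the arithmetic: a label with 8 correct out of 10 predicted and 16 actual prints P = 80.0, R = 50.0, and F = 2 * 0.8 * 0.5 / 1.3 ≈ 61.5.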
        protected internal override void Calculate(double[] x)
        {
            classifier.SetWeights(To2D(x));
            if (derivative == null)
            {
                derivative = new double[x.Length];
            }
            else
            {
                Arrays.Fill(derivative, 0.0);
            }
            ICounter <Triple <int, int, int> > feature2classPairDerivatives = new ClassicCounter <Triple <int, int, int> >();

            value = 0.0;
            for (int n = 0; n < geFeatures.Count; n++)
            {
                //F feature = geFeatures.get(n);
                double[] modelDist = new double[numClasses];
                Arrays.Fill(modelDist, 0);
                //go over the unlabeled active data to compute expectations
                IList <int> activeData = geFeature2DatumList[n];
                foreach (int activeDatum in activeData)
                {
                    IDatum <L, F> datum = unlabeledDataList[activeDatum];
                    double[]      probs = GetModelProbs(datum);
                    for (int c = 0; c < numClasses; c++)
                    {
                        modelDist[c] += probs[c];
                    }
                    UpdateDerivative(datum, probs, feature2classPairDerivatives);
                }
                //computes p(y_d)*(1-p(y_d))*f_d for all active features.
                //now  compute the value (KL-divergence) and the final value of the derivative.
                if (activeData.Count > 0)
                {
                    for (int c = 0; c < numClasses; c++)
                    {
                        modelDist[c] /= activeData.Count;
                    }
                    SmoothDistribution(modelDist);
                    for (int c_1 = 0; c_1 < numClasses; c_1++)
                    {
                        value += -geFeature2EmpiricalDist[n][c_1] * Math.Log(modelDist[c_1]);
                    }
                    for (int f = 0; f < labeledDataset.FeatureIndex().Size(); f++)
                    {
                        for (int c_2 = 0; c_2 < numClasses; c_2++)
                        {
                            int wtIndex = IndexOf(f, c_2);
                            for (int cPrime = 0; cPrime < numClasses; cPrime++)
                            {
                                derivative[wtIndex] += feature2classPairDerivatives.GetCount(new Triple <int, int, int>(f, c_2, cPrime)) * geFeature2EmpiricalDist[n][cPrime] / modelDist[cPrime];
                            }
                            derivative[wtIndex] /= activeData.Count;
                        }
                    }
                }
            }
        }
 // true: DT JJ NN -> DT "JJ NN", false: DT "DT"
 /// <summary>
 /// If this is set to true, then the binarizer will choose selectively whether or not to
 /// split states based on how many counts the states had in a previous run.
 /// </summary>
 /// <remarks>
 /// If this is set to true, then the binarizer will choose selectively whether or not to
 /// split states based on how many counts the states had in a previous run. These counts are
 /// stored in an internal counter, which will be added to when doSelectiveSplit is false.
 /// If passed false, this will initialize (clear) the counts.
 /// </remarks>
 /// <param name="doSelectiveSplit">Record this value and reset internal counter if false</param>
 public virtual void SetDoSelectiveSplit(bool doSelectiveSplit)
 {
     this.doSelectiveSplit = doSelectiveSplit;
     if (!doSelectiveSplit)
     {
         stateCounter = new ClassicCounter <string>();
     }
 }
 // boundary tag -- assumed not a real tag
 public override void InitializeTraining(Options op, ILexicon lex, IIndex <string> wordIndex, IIndex <string> tagIndex, double totalTrees)
 {
     base.InitializeTraining(op, lex, wordIndex, tagIndex, totalTrees);
     indexToStartUnkCounting = (totalTrees * op.trainOptions.fractionBeforeUnseenCounting);
     seenCounter             = new ClassicCounter <IntTaggedWord>();
     unSeenCounter           = new ClassicCounter <IntTaggedWord>();
     model = new FrenchUnknownWordModel(op, lex, wordIndex, tagIndex, unSeenCounter);
 }