Beispiel #1
0
        /// <summary>Returns the highest-probability tag for the given word.</summary>
        private string GetTag(string word)
        {
            // Register the word in (or fetch it from) the shared word index.
            int indexedWord = wordIndex.AddToIndex(word);
            EnsureProbs(indexedWord, false);
            // Pick the label with the maximal log-probability.
            return Counters.Argmax(logProbs);
        }
Beispiel #2
0
        /// <summary>
        /// Re-annotates the document with a CoreNLP pipeline (pos, lemma, ner) and
        /// replaces each entity mention's type with the most frequent NER tag over
        /// the tokens the mention spans.
        /// </summary>
        private static void ModifyUsingCoreNLPNER(Annotation doc)
        {
            Properties props = new Properties();
            props.SetProperty("annotators", "pos, lemma, ner");
            StanfordCoreNLP pipeline = new StanfordCoreNLP(props, false);
            pipeline.Annotate(doc);

            foreach (ICoreMap sentence in doc.Get(typeof(CoreAnnotations.SentencesAnnotation)))
            {
                IList<EntityMention> mentions = sentence.Get(typeof(MachineReadingAnnotations.EntityMentionsAnnotation));
                if (mentions == null)
                {
                    continue;
                }
                IList<CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
                foreach (EntityMention mention in mentions)
                {
                    // Vote over the NER tags of every token the mention covers.
                    Span extent = mention.GetExtent();
                    ICounter<string> tagVotes = new ClassicCounter<string>();
                    for (int idx = extent.Start(); idx < extent.End(); idx++)
                    {
                        tagVotes.IncrementCount(tokens[idx].Ner());
                    }
                    // The majority tag becomes the mention's new type.
                    mention.SetType(Counters.Argmax(tagVotes));
                }
            }
        }
Beispiel #3
0
        /// <summary>
        /// Attempt to infer the part of speech of the given preterminal node, which
        /// was created during the expansion of a multi-word token.
        /// </summary>
        /// <param name="t">The preterminal node whose tag should be inferred.</param>
        /// <param name="parent">The node's parent, used to recover the containing phrase.</param>
        /// <param name="unigramTagger">Word-to-tag counts collected from training data.</param>
        /// <returns>The inferred POS tag.</returns>
        private static string InferPOS(Tree t, Tree parent, TwoDimensionalCounter<string, string> unigramTagger)
        {
            string word = t.FirstChild().Value();
            string containingPhraseStr = GetContainingPhrase(t, parent);

            // Overrides: let the manual POS model handle a few special cases first.
            string overrideTag = MultiWordPreprocessor.ManualUWModel.GetOverrideTag(word, containingPhraseStr);
            if (overrideTag != null)
            {
                return overrideTag;
            }

            // Try treating this word as a verb and stripping any clitic pronouns.
            // If the stripped version exists in the unigram tagger, stick with the
            // verb hypothesis.
            ICollection<string> taggedWords = unigramTagger.FirstKeySet();
            SpanishVerbStripper.StrippedVerb strippedVerb = verbStripper.SeparatePronouns(word);
            if (strippedVerb != null && taggedWords.Contains(strippedVerb.GetStem()))
            {
                string pos = Counters.Argmax(unigramTagger.GetCounter(strippedVerb.GetStem()));
                if (pos.StartsWith("v"))
                {
                    return pos;
                }
            }

            // Next, fall back to the unigram tagger for the word as-is.
            if (taggedWords.Contains(word))
            {
                return Counters.Argmax(unigramTagger.GetCounter(word), new MultiWordPreprocessor.POSTieBreaker());
            }

            // Last resort: the manual unknown-word model.
            return MultiWordPreprocessor.ManualUWModel.GetTag(word, containingPhraseStr);
        }
Beispiel #4
0
        /// <summary>
        /// Filters the predicted mentions in place: whenever several mentions within
        /// a sentence share the same head index, only the mention with the highest
        /// probability (per <c>ProbabilityOf</c>) is kept and the rest are removed.
        /// </summary>
        /// <param name="predictedMentions">Mentions grouped by sentence; modified in place.</param>
        /// <param name="dict">Dictionaries forwarded to the probability model.</param>
        /// <param name="props">Properties forwarded to the probability model.</param>
        public virtual void ClassifyMentions(IList <IList <Mention> > predictedMentions, Dictionaries dict, Properties props)
        {
            // Collect lowercase span strings of mentions whose head token carries a
            // non-"O" named-entity tag.
            ICollection <string> neStrings = Generics.NewHashSet();

            foreach (IList <Mention> predictedMention in predictedMentions)
            {
                foreach (Mention m in predictedMention)
                {
                    string ne = m.headWord.Ner();
                    if (ne.Equals("O"))
                    {
                        continue;
                    }
                    // NOTE(review): this inner loop has no effect -- the "continue"
                    // only advances the token loop, so neStrings.Add below runs
                    // regardless of the tokens' tags. It looks like it was meant to
                    // skip mentions whose span has mixed NER tags; confirm against
                    // the upstream source before changing.
                    foreach (CoreLabel cl in m.originalSpan)
                    {
                        if (!cl.Ner().Equals(ne))
                        {
                            continue;
                        }
                    }
                    neStrings.Add(m.LowercaseNormalizedSpanString());
                }
            }
            foreach (IList <Mention> predicts in predictedMentions)
            {
                // Group this sentence's mentions by their head index.
                IDictionary <int, ICollection <Mention> > headPositions = Generics.NewHashMap();
                foreach (Mention p in predicts)
                {
                    if (!headPositions.Contains(p.headIndex))
                    {
                        headPositions[p.headIndex] = Generics.NewHashSet();
                    }
                    headPositions[p.headIndex].Add(p);
                }
                ICollection <Mention> remove = Generics.NewHashSet();
                foreach (int hPos in headPositions.Keys)
                {
                    ICollection <Mention> shares = headPositions[hPos];
                    if (shares.Count > 1)
                    {
                        // Score every mention that shares this head; only the most
                        // probable one survives.
                        ICounter <Mention> probs = new ClassicCounter <Mention>();
                        foreach (Mention p_1 in shares)
                        {
                            double trueProb = ProbabilityOf(p_1, shares, neStrings, dict, props);
                            probs.IncrementCount(p_1, trueProb);
                        }
                        // add to remove
                        Mention keep = Counters.Argmax(probs, null);
                        probs.Remove(keep);
                        Sharpen.Collections.AddAll(remove, probs.KeySet());
                    }
                }
                foreach (Mention r in remove)
                {
                    predicts.Remove(r);
                }
            }
        }
Beispiel #5
0
        /// <summary>
        /// Classifies the given datum: returns the highest-scoring label, or the
        /// default label when no scores are available.
        /// </summary>
        public virtual L ClassOf(IDatum <L, F> example)
        {
            ICounter <L> scores = ScoresOf(example);
            // Fall back to the default label when the scorer yields nothing.
            return scores == null ? defaultLabel : Counters.Argmax(scores);
        }
Beispiel #6
0
        /// <summary>Select the most common element of the given type in the given span.</summary>
        /// <remarks>
        /// Useful for, e.g., finding the most likely NER span of a given span, or the
        /// most likely POS tag of a given span. Null entries are removed before the
        /// vote is taken.
        /// </remarks>
        /// <param name="span">The span of the sentence to find the mode element in. This must be entirely contained in the sentence.</param>
        /// <param name="selector">The property of the sentence we are getting the mode of. For example, <code>Sentence::posTags</code></param>
        /// <returns>The most common element of the given property in the sentence.</returns>
        public virtual E ModeInSpan <E>(Span span, IFunction <Sentence, IList <E> > selector)
        {
            // Reject spans that extend outside the sentence.
            Span wholeSentence = Span.FromValues(0, sentence.Length());
            if (!wholeSentence.Contains(span))
            {
                throw new ArgumentException("Span must be entirely contained in the sentence: " + span + " (sentence length=" + sentence.Length() + ")");
            }
            // Tally each selected element over the positions in the span.
            ICounter <E> votes = new ClassicCounter <E>();
            foreach (int position in span)
            {
                votes.IncrementCount(selector.Apply(sentence)[position]);
            }
            votes.Remove(null);
            return Counters.Argmax(votes);
        }
Beispiel #7
0
        /// <summary>A utility to get useful information out of a CorefMention.</summary>
        /// <remarks>
        /// A utility to get useful information out of a CorefMention. In particular, it returns the CoreLabels which are
        /// associated with this mention, and it returns a score for how much we think this mention should be the canonical
        /// mention.
        /// </remarks>
        /// <param name="doc">The document this mention is referenced into.</param>
        /// <param name="mention">The mention itself.</param>
        /// <returns>A pair of the tokens in the mention, and a score for how much we like this mention as the canonical mention.</returns>
        private static Pair <IList <CoreLabel>, double> GrokCorefMention(Annotation doc, CorefChain.CorefMention mention)
        {
            // sentNum / startIndex / endIndex appear to be 1-based (note the "- 1"
            // conversions) -- confirm against the CorefMention contract.
            IList <CoreLabel> tokens          = doc.Get(typeof(CoreAnnotations.SentencesAnnotation))[mention.sentNum - 1].Get(typeof(CoreAnnotations.TokensAnnotation));
            IList <CoreLabel> mentionAsTokens = tokens.SubList(mention.startIndex - 1, mention.endIndex - 1);
            // Try to assess this mention's NER type
            ICounter <string> nerVotes = new ClassicCounter <string>();

            // NOTE(review): the null arguments below look like lambdas lost in an
            // automated Java-to-C# translation -- as written, nerVotes is never
            // populated, so Argmax/GetCount operate on an empty counter and nerScore
            // is degenerate. Recover the original filter/accumulator lambdas from
            // the upstream source before relying on this score.
            mentionAsTokens.Stream().Filter(null).ForEach(null);
            string ner      = Counters.Argmax(nerVotes, null);
            double nerCount = nerVotes.GetCount(ner);
            double nerScore = nerCount * nerCount / ((double)mentionAsTokens.Count);

            // Return
            return(Pair.MakePair(mentionAsTokens, nerScore));
        }
		/// <summary>
		/// Score the given input, returning both the classification decision and the
		/// probability of that decision.
		/// </summary>
		/// <remarks>
		/// Score the given input, returning both the classification decision and the
		/// probability of that decision.
		/// Note that this method will not return a relation which does not type check.
		/// </remarks>
		/// <param name="input">The input to classify.</param>
		/// <returns>A pair with the relation we classified into, along with its confidence.</returns>
		public virtual Pair<string, double> Classify(KBPRelationExtractor.KBPInput input)
		{
			RVFDatum<string, string> datum = new RVFDatum<string, string>(Features(input));
			ICounter<string> scores = classifier.ScoresOf(datum);
			// Exponentiate and renormalize so the counter becomes a probability
			// distribution over relations.
			Counters.ExpInPlace(scores);
			Counters.Normalize(scores);
			string best = Counters.Argmax(scores);
			// While it doesn't type check, continue going down the list.
			// NO_RELATION is always an option somewhere in there, so safe to keep going...
			while (!KBPRelationExtractorConstants.NoRelation.Equals(best) && scores.Size() > 1 && (!KBPRelationExtractor.RelationType.FromString(best).Get().validNamedEntityLabels.Contains(input.objectType) || KBPRelationExtractor.RelationType.FromString
				(best).Get().entityType != input.subjectType))
			{
				// Drop the rejected relation, renormalize, and try the next best.
				scores.Remove(best);
				Counters.Normalize(scores);
				best = Counters.Argmax(scores);
			}
			return Pair.MakePair(best, scores.GetCount(best));
		}
Beispiel #9
0
        /// <summary>Guess the NER label of a token span by majority vote.</summary>
        /// <param name="tokens">The tokens of the sentence.</param>
        /// <param name="span">The token span to vote over.</param>
        /// <returns>
        /// The winning non-"O" NER tag, or "O" when no tag's count reaches
        /// <c>span.Size() / 2</c> (integer division).
        /// </returns>
        public static string GuessNER(IList <CoreLabel> tokens, Span span)
        {
            ICounter <string> votes = new ClassicCounter <string>();
            foreach (int tokenIndex in span)
            {
                votes.IncrementCount(tokens[tokenIndex].Ner());
            }
            // Ignore the background label and any missing tags.
            votes.Remove("O");
            votes.Remove(null);
            bool haveMajority = votes.Size() > 0 && Counters.Max(votes) >= span.Size() / 2;
            return haveMajority ? Counters.Argmax(votes) : "O";
        }
        /// <summary>
        /// Initializes the evaluation statistics for the given probabilistic
        /// classifier over the given dataset: fills in <c>total</c>, <c>correct</c>,
        /// <c>logLikelihood</c>, <c>accuracy</c>, and the parallel per-example
        /// <c>scores</c>/<c>isCorrect</c> arrays.
        /// </summary>
        public virtual void InitMC <F>(IProbabilisticClassifier <L, F> classifier, GeneralDataset <L, F> data)
        {
            //if (!(gData instanceof Dataset)) {
            //  throw new UnsupportedOperationException("Can only handle Datasets, not "+gData.getClass().getName());
            //}
            //
            //Dataset data = (Dataset)gData;
            // Queue of (example index, (guess score, was correct)) prioritized by
            // -guessScore; its ToSortedList presumably orders by confidence -- TODO confirm.
            IPriorityQueue <Pair <int, Pair <double, bool> > > q = new BinaryHeapPriorityQueue <Pair <int, Pair <double, bool> > >();

            total         = 0;
            correct       = 0;
            logLikelihood = 0.0;
            for (int i = 0; i < data.Size(); i++)
            {
                IDatum <L, F> d            = data.GetRVFDatum(i);
                ICounter <L>  scores       = classifier.LogProbabilityOf(d);
                L             guess        = Counters.Argmax(scores);
                L             correctLab   = d.Label();
                double        guessScore   = scores.GetCount(guess);
                double        correctScore = scores.GetCount(correctLab);
                int           guessInd     = data.LabelIndex().IndexOf(guess);
                int           correctInd   = data.LabelIndex().IndexOf(correctLab);
                total++;
                if (guessInd == correctInd)
                {
                    correct++;
                }
                // Accumulate the log-probability assigned to the gold label.
                logLikelihood += correctScore;
                q.Add(new Pair <int, Pair <double, bool> >(int.Parse(i), new Pair <double, bool>(guessScore, bool.ValueOf(guessInd == correctInd))), -guessScore);
            }
            accuracy = (double)correct / (double)total;
            IList <Pair <int, Pair <double, bool> > > sorted = q.ToSortedList();

            // Unpack the sorted (score, wasCorrect) pairs into parallel arrays.
            scores    = new double[sorted.Count];
            isCorrect = new bool[sorted.Count];
            for (int i_1 = 0; i_1 < sorted.Count; i_1++)
            {
                Pair <double, bool> next = sorted[i_1].Second();
                scores[i_1]    = next.First();
                isCorrect[i_1] = next.Second();
            }
        }
Beispiel #11
0
 /// <summary>
 /// Post-order traversal that repairs missing labels in the tree: preterminals
 /// with a missing POS get the most frequent tag observed for their word (or a
 /// manual unknown-word guess), and internal nodes with a missing phrasal
 /// category get the most frequent category observed for their child-label
 /// sequence.
 /// </summary>
 public static void TraverseAndFix(Tree t, TwoDimensionalCounter <string, string> pretermLabel, TwoDimensionalCounter <string, string> unigramTagger)
 {
     if (t.IsPreTerminal())
     {
         if (t.Value().Equals(FrenchXMLTreeReader.MissingPos))
         {
             nMissingPOS++;
             // Prefer the unigram tagger's most frequent tag for this word;
             // otherwise fall back to the manual unknown-word model.
             string word = t.FirstChild().Value();
             string tag;
             if (unigramTagger.FirstKeySet().Contains(word))
             {
                 tag = Counters.Argmax(unigramTagger.GetCounter(word));
             }
             else
             {
                 tag = MWEPreprocessor.ManualUWModel.GetTag(word);
             }
             t.SetValue(tag);
         }
         return;
     }
     // Fix all children before this node (post-order visit).
     foreach (Tree child in t.Children())
     {
         TraverseAndFix(child, pretermLabel, unigramTagger);
     }
     if (t.Value().Equals(FrenchXMLTreeReader.MissingPhrasal))
     {
         nMissingPhrasal++;
         // Build the space-separated sequence of child labels.
         StringBuilder posSeqBuilder = new StringBuilder();
         foreach (Tree child in t.Children())
         {
             posSeqBuilder.Append(child.Value()).Append(" ");
         }
         string posSequence = posSeqBuilder.ToString().Trim();
         if (pretermLabel.FirstKeySet().Contains(posSequence))
         {
             t.SetValue(Counters.Argmax(pretermLabel.GetCounter(posSequence)));
         }
         else
         {
             System.Console.Out.WriteLine("No phrasal cat for: " + posSequence);
         }
     }
 }
 /// <summary>
 /// Runs the Viterbi algorithm on the sequence model
 /// in order to find the best sequence.
 /// </summary>
 /// <remarks>
 /// This sequence finder only works on SequenceModel's with rightWindow == 0.
 /// </remarks>
 /// <returns>An array containing the int tags of the best sequence</returns>
 public virtual int[] BestSequence(ISequenceModel ts)
 {
     // Ask for the 1-best list, then take its highest-scoring entry.
     var kBest = KBestSequences(ts, 1);
     return Counters.Argmax(kBest);
 }
        /// <summary>
        /// Exercises historical ClassicCounter behavior end-to-end: min/max and
        /// argmin/argmax, comparator-based key ordering, in-place arithmetic
        /// (add, subtract, multiply, divide), keysAbove/keysAt/keysBelow, and a
        /// serialization round-trip. Uses the test fixture's counter <c>c</c> and
        /// the <c>integral</c>/<c>Tolerance</c> settings.
        /// </summary>
        public virtual void TestClassicCounterHistoricalMain()
        {
            c.SetCount("p", 0);
            c.SetCount("q", 2);
            ClassicCounter <string> small_c = new ClassicCounter <string>(c);
            ICounter <string>       c7      = c.GetFactory().Create();

            c7.AddAll(c);
            NUnit.Framework.Assert.AreEqual(c.TotalCount(), 2.0);
            c.IncrementCount("p");
            NUnit.Framework.Assert.AreEqual(c.TotalCount(), 3.0);
            c.IncrementCount("p", 2.0);
            NUnit.Framework.Assert.AreEqual(Counters.Min(c), 2.0);
            NUnit.Framework.Assert.AreEqual(Counters.Argmin(c), "q");
            // Now p is p=3.0, q=2.0
            c.SetCount("w", -5.0);
            c.SetCount("x", -4.5);
            IList <string> biggestKeys = new List <string>(c.KeySet());

            NUnit.Framework.Assert.AreEqual(biggestKeys.Count, 4);
            // The comparator flags yield the key order asserted below: w, x, p, q.
            biggestKeys.Sort(Counters.ToComparator(c, false, true));
            NUnit.Framework.Assert.AreEqual("w", biggestKeys[0]);
            NUnit.Framework.Assert.AreEqual("x", biggestKeys[1]);
            NUnit.Framework.Assert.AreEqual("p", biggestKeys[2]);
            NUnit.Framework.Assert.AreEqual("q", biggestKeys[3]);
            NUnit.Framework.Assert.AreEqual(Counters.Min(c), -5.0, Tolerance);
            NUnit.Framework.Assert.AreEqual(Counters.Argmin(c), "w");
            NUnit.Framework.Assert.AreEqual(Counters.Max(c), 3.0, Tolerance);
            NUnit.Framework.Assert.AreEqual(Counters.Argmax(c), "p");
            if (integral)
            {
                NUnit.Framework.Assert.AreEqual(Counters.Mean(c), -1.0);
            }
            else
            {
                NUnit.Framework.Assert.AreEqual(Counters.Mean(c), -1.125, Tolerance);
            }
            if (!integral)
            {
                // only do this for floating point counters.  Too much bother to rewrite
                c.SetCount("x", -2.5);
                ClassicCounter <string> c2 = new ClassicCounter <string>(c);
                NUnit.Framework.Assert.AreEqual(3.0, c2.GetCount("p"));
                NUnit.Framework.Assert.AreEqual(2.0, c2.GetCount("q"));
                NUnit.Framework.Assert.AreEqual(-5.0, c2.GetCount("w"));
                NUnit.Framework.Assert.AreEqual(-2.5, c2.GetCount("x"));
                // c3 starts with one count per key of c2.
                ICounter <string> c3 = c.GetFactory().Create();
                foreach (string str in c2.KeySet())
                {
                    c3.IncrementCount(str);
                }
                NUnit.Framework.Assert.AreEqual(1.0, c3.GetCount("p"));
                NUnit.Framework.Assert.AreEqual(1.0, c3.GetCount("q"));
                NUnit.Framework.Assert.AreEqual(1.0, c3.GetCount("w"));
                NUnit.Framework.Assert.AreEqual(1.0, c3.GetCount("x"));
                // In-place arithmetic: scaled add, plain add, and subtraction.
                Counters.AddInPlace(c2, c3, 10.0);
                NUnit.Framework.Assert.AreEqual(13.0, c2.GetCount("p"));
                NUnit.Framework.Assert.AreEqual(12.0, c2.GetCount("q"));
                NUnit.Framework.Assert.AreEqual(5.0, c2.GetCount("w"));
                NUnit.Framework.Assert.AreEqual(7.5, c2.GetCount("x"));
                c3.AddAll(c);
                NUnit.Framework.Assert.AreEqual(4.0, c3.GetCount("p"));
                NUnit.Framework.Assert.AreEqual(3.0, c3.GetCount("q"));
                NUnit.Framework.Assert.AreEqual(-4.0, c3.GetCount("w"));
                NUnit.Framework.Assert.AreEqual(-1.5, c3.GetCount("x"));
                Counters.SubtractInPlace(c3, c);
                NUnit.Framework.Assert.AreEqual(1.0, c3.GetCount("p"));
                NUnit.Framework.Assert.AreEqual(1.0, c3.GetCount("q"));
                NUnit.Framework.Assert.AreEqual(1.0, c3.GetCount("w"));
                NUnit.Framework.Assert.AreEqual(1.0, c3.GetCount("x"));
                foreach (string str_1 in c.KeySet())
                {
                    c3.IncrementCount(str_1);
                }
                NUnit.Framework.Assert.AreEqual(2.0, c3.GetCount("p"));
                NUnit.Framework.Assert.AreEqual(2.0, c3.GetCount("q"));
                NUnit.Framework.Assert.AreEqual(2.0, c3.GetCount("w"));
                NUnit.Framework.Assert.AreEqual(2.0, c3.GetCount("x"));
                // Element-wise and scalar division/multiplication.
                Counters.DivideInPlace(c2, c3);
                NUnit.Framework.Assert.AreEqual(6.5, c2.GetCount("p"));
                NUnit.Framework.Assert.AreEqual(6.0, c2.GetCount("q"));
                NUnit.Framework.Assert.AreEqual(2.5, c2.GetCount("w"));
                NUnit.Framework.Assert.AreEqual(3.75, c2.GetCount("x"));
                Counters.DivideInPlace(c2, 0.5);
                NUnit.Framework.Assert.AreEqual(13.0, c2.GetCount("p"));
                NUnit.Framework.Assert.AreEqual(12.0, c2.GetCount("q"));
                NUnit.Framework.Assert.AreEqual(5.0, c2.GetCount("w"));
                NUnit.Framework.Assert.AreEqual(7.5, c2.GetCount("x"));
                Counters.MultiplyInPlace(c2, 2.0);
                NUnit.Framework.Assert.AreEqual(26.0, c2.GetCount("p"));
                NUnit.Framework.Assert.AreEqual(24.0, c2.GetCount("q"));
                NUnit.Framework.Assert.AreEqual(10.0, c2.GetCount("w"));
                NUnit.Framework.Assert.AreEqual(15.0, c2.GetCount("x"));
                Counters.DivideInPlace(c2, 2.0);
                NUnit.Framework.Assert.AreEqual(13.0, c2.GetCount("p"));
                NUnit.Framework.Assert.AreEqual(12.0, c2.GetCount("q"));
                NUnit.Framework.Assert.AreEqual(5.0, c2.GetCount("w"));
                NUnit.Framework.Assert.AreEqual(7.5, c2.GetCount("x"));
                foreach (string str_2 in c2.KeySet())
                {
                    c2.IncrementCount(str_2);
                }
                NUnit.Framework.Assert.AreEqual(14.0, c2.GetCount("p"));
                NUnit.Framework.Assert.AreEqual(13.0, c2.GetCount("q"));
                NUnit.Framework.Assert.AreEqual(6.0, c2.GetCount("w"));
                NUnit.Framework.Assert.AreEqual(8.5, c2.GetCount("x"));
                foreach (string str_3 in c.KeySet())
                {
                    c2.IncrementCount(str_3);
                }
                NUnit.Framework.Assert.AreEqual(15.0, c2.GetCount("p"));
                NUnit.Framework.Assert.AreEqual(14.0, c2.GetCount("q"));
                NUnit.Framework.Assert.AreEqual(7.0, c2.GetCount("w"));
                NUnit.Framework.Assert.AreEqual(9.5, c2.GetCount("x"));
                // small_c was snapshotted before the mutations above (p=0, q=2).
                c2.AddAll(small_c);
                NUnit.Framework.Assert.AreEqual(15.0, c2.GetCount("p"));
                NUnit.Framework.Assert.AreEqual(16.0, c2.GetCount("q"));
                NUnit.Framework.Assert.AreEqual(7.0, c2.GetCount("w"));
                NUnit.Framework.Assert.AreEqual(9.5, c2.GetCount("x"));
                NUnit.Framework.Assert.AreEqual(new HashSet <string>(Arrays.AsList("p", "q")), Counters.KeysAbove(c2, 14));
                NUnit.Framework.Assert.AreEqual(new HashSet <string>(Arrays.AsList("q")), Counters.KeysAt(c2, 16));
                NUnit.Framework.Assert.AreEqual(new HashSet <string>(Arrays.AsList("x", "w")), Counters.KeysBelow(c2, 9.5));
                Counters.AddInPlace(c2, small_c, -6);
                NUnit.Framework.Assert.AreEqual(15.0, c2.GetCount("p"));
                NUnit.Framework.Assert.AreEqual(4.0, c2.GetCount("q"));
                NUnit.Framework.Assert.AreEqual(7.0, c2.GetCount("w"));
                NUnit.Framework.Assert.AreEqual(9.5, c2.GetCount("x"));
                Counters.SubtractInPlace(c2, small_c);
                Counters.SubtractInPlace(c2, small_c);
                // "q" reaches zero after the two subtractions and gets pruned here.
                Counters.RetainNonZeros(c2);
                NUnit.Framework.Assert.AreEqual(15.0, c2.GetCount("p"));
                NUnit.Framework.Assert.IsFalse(c2.ContainsKey("q"));
                NUnit.Framework.Assert.AreEqual(7.0, c2.GetCount("w"));
                NUnit.Framework.Assert.AreEqual(9.5, c2.GetCount("x"));
            }
            // serialize to Stream
            if (c is ISerializable)
            {
                try
                {
                    ByteArrayOutputStream baos = new ByteArrayOutputStream();
                    ObjectOutputStream    @out = new ObjectOutputStream(new BufferedOutputStream(baos));
                    @out.WriteObject(c);
                    @out.Close();
                    // reconstitute
                    byte[]            bytes = baos.ToByteArray();
                    ObjectInputStream @in   = new ObjectInputStream(new BufferedInputStream(new ByteArrayInputStream(bytes)));
                    c = IOUtils.ReadObjectFromObjectStream(@in);
                    @in.Close();
                    if (!this.integral)
                    {
                        NUnit.Framework.Assert.AreEqual(-2.5, c.TotalCount());
                        NUnit.Framework.Assert.AreEqual(-5.0, Counters.Min(c));
                        NUnit.Framework.Assert.AreEqual("w", Counters.Argmin(c));
                    }
                    c.Clear();
                    if (!this.integral)
                    {
                        NUnit.Framework.Assert.AreEqual(0.0, c.TotalCount());
                    }
                }
                catch (IOException ioe)
                {
                    Fail("IOException: " + ioe);
                }
                catch (TypeLoadException cce)
                {
                    Fail("ClassNotFoundException: " + cce);
                }
            }
        }
        /// <summary>
        /// Command-line entry point: trains a <c>FactoredLexicon</c> on a treebank
        /// and reports tagging accuracy on a dev set, printing the error
        /// distribution by gold tag.
        /// Usage: language features train_file dev_file.
        /// </summary>
        /// <param name="args"/>
        public static void Main(string[] args)
        {
            if (args.Length != 4)
            {
                System.Console.Error.Printf("Usage: java %s language features train_file dev_file%n", typeof(Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon).FullName);
                System.Environment.Exit(-1);
            }
            // Command line options
            Language language = Language.ValueOf(args[0]);
            ITreebankLangParserParams tlpp = language.@params;
            Treebank trainTreebank         = tlpp.DiskTreebank();

            trainTreebank.LoadPath(args[2]);
            Treebank devTreebank = tlpp.DiskTreebank();

            devTreebank.LoadPath(args[3]);
            MorphoFeatureSpecification morphoSpec;
            Options options = GetOptions(language);

            // Only Arabic and French have morphological feature specifications here.
            if (language.Equals(Language.Arabic))
            {
                morphoSpec = new ArabicMorphoFeatureSpecification();
                string[] languageOptions = new string[] { "-arabicFactored" };
                tlpp.SetOptionFlag(languageOptions, 0);
            }
            else
            {
                if (language.Equals(Language.French))
                {
                    morphoSpec = new FrenchMorphoFeatureSpecification();
                    string[] languageOptions = new string[] { "-frenchFactored" };
                    tlpp.SetOptionFlag(languageOptions, 0);
                }
                else
                {
                    throw new NotSupportedException();
                }
            }
            // Activate each comma-separated morphological feature from args[1].
            string featureList = args[1];

            string[] features = featureList.Trim().Split(",");
            foreach (string feature in features)
            {
                morphoSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.ValueOf(feature));
            }
            System.Console.Out.WriteLine("Language: " + language.ToString());
            System.Console.Out.WriteLine("Features: " + args[1]);
            // Create word and tag indices
            // Save trees in a collection since the interface requires that....
            System.Console.Out.Write("Loading training trees...");
            IList <Tree>    trainTrees = new List <Tree>(19000);
            IIndex <string> wordIndex  = new HashIndex <string>();
            IIndex <string> tagIndex   = new HashIndex <string>();

            foreach (Tree tree in trainTreebank)
            {
                foreach (Tree subTree in tree)
                {
                    if (!subTree.IsLeaf())
                    {
                        tlpp.TransformTree(subTree, tree);
                    }
                }
                trainTrees.Add(tree);
            }
            System.Console.Out.Printf("Done! (%d trees)%n", trainTrees.Count);
            // Setup and train the lexicon.
            System.Console.Out.Write("Collecting sufficient statistics for lexicon...");
            Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon lexicon = new Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon(options, morphoSpec, wordIndex, tagIndex);
            lexicon.InitializeTraining(trainTrees.Count);
            lexicon.Train(trainTrees, null);
            lexicon.FinishTraining();
            System.Console.Out.WriteLine("Done!");
            // Release the training trees for garbage collection.
            trainTrees = null;
            // Load the tuning set
            System.Console.Out.Write("Loading tuning set...");
            IList <FactoredLexiconEvent> tuningSet = GetTuningSet(devTreebank, lexicon, tlpp);

            System.Console.Out.Printf("...Done! (%d events)%n", tuningSet.Count);
            // Print the probabilities that we obtain
            // TODO(spenceg): Implement tagging accuracy with FactLex
            int nCorrect             = 0;
            ICounter <string> errors = new ClassicCounter <string>();

            foreach (FactoredLexiconEvent @event in tuningSet)
            {
                IEnumerator <IntTaggedWord> itr = lexicon.RuleIteratorByWord(@event.Word(), @event.GetLoc(), @event.FeatureStr());
                ICounter <int> logScores        = new ClassicCounter <int>();
                bool           noRules          = true;
                // goldTagId stays -1 when the gold tag never appears among the rules.
                int            goldTagId        = -1;
                while (itr.MoveNext())
                {
                    noRules = false;
                    IntTaggedWord iTW = itr.Current;
                    if (iTW.Tag() == @event.TagId())
                    {
                        log.Info("GOLD-");
                        goldTagId = iTW.Tag();
                    }
                    float tagScore = lexicon.Score(iTW, @event.GetLoc(), @event.Word(), @event.FeatureStr());
                    logScores.IncrementCount(iTW.Tag(), tagScore);
                }
                if (noRules)
                {
                    System.Console.Error.Printf("NO TAGGINGS: %s %s%n", @event.Word(), @event.FeatureStr());
                }
                else
                {
                    // Score the tagging
                    int hypTagId = Counters.Argmax(logScores);
                    if (hypTagId == goldTagId)
                    {
                        ++nCorrect;
                    }
                    else
                    {
                        string goldTag = goldTagId < 0 ? "UNSEEN" : lexicon.tagIndex.Get(goldTagId);
                        errors.IncrementCount(goldTag);
                    }
                }
                log.Info();
            }
            // Output accuracy
            double acc = (double)nCorrect / (double)tuningSet.Count;

            System.Console.Error.Printf("%n%nACCURACY: %.2f%n%n", acc * 100.0);
            log.Info("% of errors by type:");
            IList <string> biggestKeys = new List <string>(errors.KeySet());

            biggestKeys.Sort(Counters.ToComparator(errors, false, true));
            Counters.Normalize(errors);
            foreach (string key in biggestKeys)
            {
                System.Console.Error.Printf("%s\t%.2f%n", key, errors.GetCount(key) * 100.0);
            }
        }
        /// <summary>
        /// Builds a human-readable error report for a mention whose coreference decision
        /// by <paramref name="sieve"/> disagrees with the gold annotation.
        /// Returns the empty string when the sieve's decision was actually correct.
        /// </summary>
        /// <exception cref="System.Exception"/>
        public static string PrintErrorLog(Mention m, Document document, ICounter <int> probs, int mIdx, Dictionaries dict, RFSieve sieve)
        {
            StringBuilder buf = new StringBuilder();
            buf.Append("\nERROR START-----------------------------------------------------------------------\n");
            buf.Append("RESOLVER TYPE: mType: " + sieve.mType + ", aType: " + sieve.aType).Append("\n");
            buf.Append("DOCUMENT: " + document.docInfo["DOC_ID"] + ", " + document.docInfo["DOC_PART"]).Append("\n");
            // Dump the context sentences twice: once annotated with gold cluster ids,
            // once with mention ids.
            buf.Append("\nGOLD CLUSTER ID\n");
            for (int dist = m.sentNum; dist >= 0; dist--)
            {
                if (dist == sieve.maxSentDist)
                {
                    buf.Append("\tstart compare from here-------------\n");
                }
                int idx = m.sentNum - dist;
                buf.Append("\tSENT " + idx + "\t" + SentenceStringWithMention(idx, document, true, true)).Append("\n");
            }
            buf.Append("\nMENTION ID\n");
            for (int dist = m.sentNum; dist >= 0; dist--)
            {
                if (dist == sieve.maxSentDist)
                {
                    buf.Append("\tstart compare from here-------------\n");
                }
                int idx = m.sentNum - dist;
                buf.Append("\tSENT " + idx + "\t" + SentenceStringWithMention(idx, document, false, false)).Append("\n");
            }
            // Recover the dcoref antecedent ordering so each candidate row below can
            // report its rank.
            IList <Mention> orderedAnts = new List <Mention>();
            for (int dist = 0; dist <= Math.Min(sieve.maxSentDist, m.sentNum); dist++)
            {
                int idx = m.sentNum - dist;
                Sharpen.Collections.AddAll(orderedAnts, Edu.Stanford.Nlp.Coref.Hybrid.Sieve.Sieve.GetOrderedAntecedents(m, idx, mIdx, document.predictedMentions, dict));
            }
            IDictionary <int, int> rank = Generics.NewHashMap();
            for (int i = 0; i < orderedAnts.Count; i++)
            {
                rank[orderedAnts[i].mentionID] = i;
            }
            CorefCluster mC = document.corefClusters[m.corefClusterID];
            bool isFirstMention = IsFirstMention(m, document);
            bool foundCorefAnt = probs.Size() > 0 && Counters.Max(probs) > sieve.thresMerge;
            bool correctDecision = (isFirstMention && !foundCorefAnt) || (foundCorefAnt && Edu.Stanford.Nlp.Coref.Hybrid.Sieve.Sieve.IsReallyCoref(document, m.mentionID, Counters.Argmax(probs)));
            bool barePlural = m.originalSpan.Count == 1 && m.headWord.Tag().Equals("NNS");
            if (correctDecision)
            {
                // Nothing to report: the sieve handled this mention correctly.
                return string.Empty;
            }
            buf.Append("\nMENTION: " + m.SpanToString() + " (" + m.mentionID + ")\tperson: " + m.person + "\tsingleton? " + (!m.hasTwin) + "\t\tisFirstMention? " + isFirstMention + "\t\tfoundAnt? " + foundCorefAnt + "\t\tcorrectDecision? " + correctDecision
                       + "\tbarePlural? " + barePlural);
            buf.Append("\n\ttype: " + m.mentionType + "\tHeadword: " + m.headWord.Word() + "\tNEtype: " + m.nerString + "\tnumber: " + m.number + "\tgender: " + m.gender + "\tanimacy: " + m.animacy).Append("\n");
            if (m.contextParseTree != null)
            {
                buf.Append(m.contextParseTree.PennString());
            }
            buf.Append("\n\n\t\tOracle\t\tDcoref\t\t\tRF\t\tAntecedent\n");
            foreach (int antID in Counters.ToSortedList(probs))
            {
                Mention ant = document.predictedMentionsByID[antID];
                CorefCluster aC = document.corefClusters[ant.corefClusterID];
                bool oracle = Edu.Stanford.Nlp.Coref.Hybrid.Sieve.Sieve.IsReallyCoref(document, m.mentionID, antID);
                double prob = probs.GetCount(antID);
                int order = rank[antID];
                string oracleStr = oracle ? "coref   " : "notcoref";
                // The first dcoref sieve (in their fixed precedence order) that links
                // the pair determines the label shown in the Dcoref column.
                string dcorefStr = "notcoref";
                if (dcorefDiscourse.Coreferent(document, mC, aC, m, ant, dict, null))
                {
                    dcorefStr = "coref-discourse";
                }
                else if (dcorefExactString.Coreferent(document, mC, aC, m, ant, dict, null))
                {
                    dcorefStr = "coref-exactString";
                }
                else if (dcorefRelaxedExactString.Coreferent(document, mC, aC, m, ant, dict, null))
                {
                    dcorefStr = "coref-relaxedExact";
                }
                else if (dcorefPreciseConstructs.Coreferent(document, mC, aC, m, ant, dict, null))
                {
                    dcorefStr = "coref-preciseConstruct";
                }
                else if (dcorefHead1.Coreferent(document, mC, aC, m, ant, dict, null))
                {
                    dcorefStr = "coref-head1";
                }
                else if (dcorefHead2.Coreferent(document, mC, aC, m, ant, dict, null))
                {
                    dcorefStr = "coref-head2";
                }
                else if (dcorefHead3.Coreferent(document, mC, aC, m, ant, dict, null))
                {
                    dcorefStr = "coref-head3";
                }
                else if (dcorefHead4.Coreferent(document, mC, aC, m, ant, dict, null))
                {
                    dcorefStr = "coref-head4";
                }
                else if (dcorefRelaxedHead.Coreferent(document, mC, aC, m, ant, dict, null))
                {
                    dcorefStr = "coref-relaxedHead";
                }
                else if (dcorefPronounSieve.Coreferent(document, mC, aC, m, ant, dict, null))
                {
                    dcorefStr = "coref-pronounSieve";
                }
                else if (dcorefSpeaker.Coreferent(document, mC, aC, m, ant, dict, null))
                {
                    dcorefStr = "coref-speaker";
                }
                dcorefStr += "\t" + order.ToString();
                string probStr = df.Format(prob);
                buf.Append("\t\t" + oracleStr + "\t" + dcorefStr + "\t" + probStr + "\t\t" + ant.SpanToString() + " (" + ant.mentionID + ")\n");
            }
            buf.Append("ERROR END -----------------------------------------------------------------------\n");
            return buf.ToString();
        }
Beispiel #16
0
 /// <summary>Returns the key with the highest count in the underlying counter.</summary>
 public virtual E Argmax() => Counters.Argmax(counter);
Beispiel #17
0
        /// <summary>
        /// Scores and selects new phrases for <paramref name="label"/> from the words
        /// extracted by the patterns learned in this iteration.  With weighted-norm
        /// word scoring it returns the top-scoring phrases; with BPB scoring it
        /// returns the single best phrase (or an empty counter when none qualifies).
        /// </summary>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        private ICounter <CandidatePhrase> LearnNewPhrasesPrivate(string label, PatternsForEachToken patternsForEachToken, ICounter <E> patternsLearnedThisIter, ICounter <E> allSelectedPatterns, ICollection <CandidatePhrase> alreadyIdentifiedWords, CollectionValuedMap
                                                                  <E, Triple <string, int, int> > matchedTokensByPat, ICounter <CandidatePhrase> scoreForAllWordsThisIteration, TwoDimensionalCounter <CandidatePhrase, E> terms, TwoDimensionalCounter <CandidatePhrase, E> wordsPatExtracted, TwoDimensionalCounter <E
                                                                                                                                                                                                                                                                                                                       , CandidatePhrase> patternsAndWords4Label, string identifier, ICollection <CandidatePhrase> ignoreWords, bool computeProcDataFreq)
        {
            ICollection <CandidatePhrase> alreadyLabeledWords = new HashSet <CandidatePhrase>();

            if (constVars.doNotApplyPatterns)
            {
                // if want to get the stats by the lossy way of just counting without
                // applying the patterns
                ConstantsAndVariables.DataSentsIterator sentsIter = new ConstantsAndVariables.DataSentsIterator(constVars.batchProcessSents);
                while (sentsIter.MoveNext())
                {
                    Pair <IDictionary <string, DataInstance>, File> sentsf = sentsIter.Current;
                    this.StatsWithoutApplyingPatterns(sentsf.First(), patternsForEachToken, patternsLearnedThisIter, wordsPatExtracted);
                }
            }
            else
            {
                if (patternsLearnedThisIter.Size() > 0)
                {
                    this.ApplyPats(patternsLearnedThisIter, label, wordsPatExtracted, matchedTokensByPat, alreadyLabeledWords);
                }
            }
            if (computeProcDataFreq)
            {
                // Normalize the raw phrase frequencies (sqrt or 1+log) into
                // Data.processedDataFreq for use by the phrase scorer.
                if (!phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.None))
                {
                    Redwood.Log(Redwood.Dbg, "computing processed freq");
                    foreach (KeyValuePair <CandidatePhrase, double> fq in Data.rawFreq.EntrySet())
                    {
                        double @in = fq.Value;
                        if (phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.Sqrt))
                        {
                            @in = Math.Sqrt(@in);
                        }
                        else
                        {
                            if (phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.Log))
                            {
                                @in = 1 + Math.Log(@in);
                            }
                            else
                            {
                                throw new Exception("can't understand the normalization");
                            }
                        }
                        System.Diagnostics.Debug.Assert(!double.IsNaN(@in), "Why is processed freq nan when rawfreq is " + @in);
                        Data.processedDataFreq.SetCount(fq.Key, @in);
                    }
                }
                else
                {
                    Data.processedDataFreq = Data.rawFreq;
                }
            }
            if (constVars.wordScoring.Equals(GetPatternsFromDataMultiClass.WordScoring.Weightednorm))
            {
                // Collect extracted words that are not stop words, not in the "other
                // semantic classes" list, and not already labeled this iteration.
                foreach (CandidatePhrase en in wordsPatExtracted.FirstKeySet())
                {
                    if (!constVars.GetOtherSemanticClassesWords().Contains(en) && (en.GetPhraseLemma() == null || !constVars.GetOtherSemanticClassesWords().Contains(CandidatePhrase.CreateOrGet(en.GetPhraseLemma()))) && !alreadyLabeledWords.Contains(en))
                    {
                        terms.AddAll(en, wordsPatExtracted.GetCounter(en));
                    }
                }
                RemoveKeys(terms, ConstantsAndVariables.GetStopWords());
                ICounter <CandidatePhrase> phraseScores = phraseScorer.ScorePhrases(label, terms, wordsPatExtracted, allSelectedPatterns, alreadyIdentifiedWords, false);
                System.Console.Out.WriteLine("count for word U.S. is " + phraseScores.GetCount(CandidatePhrase.CreateOrGet("U.S.")));
                ICollection <CandidatePhrase> ignoreWordsAll;
                if (ignoreWords != null && !ignoreWords.IsEmpty())
                {
                    ignoreWordsAll = CollectionUtils.UnionAsSet(ignoreWords, constVars.GetOtherSemanticClassesWords());
                }
                else
                {
                    ignoreWordsAll = new HashSet <CandidatePhrase>(constVars.GetOtherSemanticClassesWords());
                }
                Sharpen.Collections.AddAll(ignoreWordsAll, constVars.GetSeedLabelDictionary()[label]);
                Sharpen.Collections.AddAll(ignoreWordsAll, constVars.GetLearnedWords(label).KeySet());
                System.Console.Out.WriteLine("ignoreWordsAll contains word U.S. is " + ignoreWordsAll.Contains(CandidatePhrase.CreateOrGet("U.S.")));
                // NOTE(review): phraseScores is passed twice here (2nd use in place of a
                // separate score counter) — verify against the upstream Java signature.
                ICounter <CandidatePhrase> finalwords = ChooseTopWords(phraseScores, terms, phraseScores, ignoreWordsAll, constVars.thresholdWordExtract);
                phraseScorer.PrintReasonForChoosing(finalwords);
                scoreForAllWordsThisIteration.Clear();
                Counters.AddInPlace(scoreForAllWordsThisIteration, phraseScores);
                Redwood.Log(ConstantsAndVariables.minimaldebug, "\n\n## Selected Words for " + label + " : " + Counters.ToSortedString(finalwords, finalwords.Size(), "%1$s:%2$.2f", "\t"));
                if (constVars.goldEntities != null)
                {
                    IDictionary <string, bool> goldEntities4Label = constVars.goldEntities[label];
                    if (goldEntities4Label != null)
                    {
                        StringBuilder s = new StringBuilder();
                        // NOTE(review): the consumer lambda was lost in the Java->C#
                        // conversion; ForEach(null) will throw at runtime. Reconstruct the
                        // lambda (it appended each word's gold-label status to 's').
                        finalwords.KeySet().Stream().ForEach(null);
                        Redwood.Log(ConstantsAndVariables.minimaldebug, "\n\n## Gold labels for selected words for label " + label + " : " + s.ToString());
                    }
                    else
                    {
                        Redwood.Log(Redwood.Dbg, "No gold entities provided for label " + label);
                    }
                }
                if (constVars.outDir != null && !constVars.outDir.IsEmpty())
                {
                    // Write a JSON justification file: for each selected word, the words
                    // and patterns that caused its selection.
                    string outputdir = constVars.outDir + "/" + identifier + "/" + label;
                    IOUtils.EnsureDir(new File(outputdir));
                    TwoDimensionalCounter <CandidatePhrase, CandidatePhrase> reasonForWords = new TwoDimensionalCounter <CandidatePhrase, CandidatePhrase>();
                    foreach (CandidatePhrase word in finalwords.KeySet())
                    {
                        foreach (E l in wordsPatExtracted.GetCounter(word).KeySet())
                        {
                            foreach (CandidatePhrase w2 in patternsAndWords4Label.GetCounter(l))
                            {
                                reasonForWords.IncrementCount(word, w2);
                            }
                        }
                    }
                    Redwood.Log(ConstantsAndVariables.minimaldebug, "Saving output in " + outputdir);
                    string filename = outputdir + "/words.json";
                    // the json object is an array corresponding to each iteration - of list
                    // of objects,
                    // each of which is a bean of entity and reasons
                    IJsonArrayBuilder obj = Javax.Json.Json.CreateArrayBuilder();
                    if (writtenInJustification.Contains(label) && writtenInJustification[label])
                    {
                        // Earlier iterations already wrote this file; re-read and keep them.
                        IJsonReader jsonReader = Javax.Json.Json.CreateReader(new BufferedInputStream(new FileInputStream(filename)));
                        IJsonArray  objarr     = jsonReader.ReadArray();
                        foreach (IJsonValue o in objarr)
                        {
                            obj.Add(o);
                        }
                        jsonReader.Close();
                    }
                    IJsonArrayBuilder objThisIter = Javax.Json.Json.CreateArrayBuilder();
                    foreach (CandidatePhrase w in reasonForWords.FirstKeySet())
                    {
                        IJsonObjectBuilder objinner = Javax.Json.Json.CreateObjectBuilder();
                        IJsonArrayBuilder  l        = Javax.Json.Json.CreateArrayBuilder();
                        foreach (CandidatePhrase w2 in reasonForWords.GetCounter(w).KeySet())
                        {
                            l.Add(w2.GetPhrase());
                        }
                        IJsonArrayBuilder pats = Javax.Json.Json.CreateArrayBuilder();
                        foreach (E p in wordsPatExtracted.GetCounter(w))
                        {
                            pats.Add(p.ToStringSimple());
                        }
                        objinner.Add("reasonwords", l);
                        objinner.Add("patterns", pats);
                        objinner.Add("score", finalwords.GetCount(w));
                        objinner.Add("entity", w.GetPhrase());
                        objThisIter.Add(objinner.Build());
                    }
                    obj.Add(objThisIter);
                    // Redwood.log(ConstantsAndVariables.minimaldebug, channelNameLogger,
                    // "Writing justification at " + filename);
                    IOUtils.WriteStringToFile(StringUtils.Normalize(StringUtils.ToAscii(obj.Build().ToString())), filename, "ASCII");
                    writtenInJustification[label] = true;
                }
                if (constVars.justify)
                {
                    Redwood.Log(Redwood.Dbg, "\nJustification for phrases:\n");
                    foreach (CandidatePhrase word in finalwords.KeySet())
                    {
                        Redwood.Log(Redwood.Dbg, "Phrase " + word + " extracted because of patterns: \t" + Counters.ToSortedString(wordsPatExtracted.GetCounter(word), wordsPatExtracted.GetCounter(word).Size(), "%1$s:%2$f", "\n"));
                    }
                }
                // if (usePatternResultAsLabel)
                // if (answerLabel != null)
                // labelWords(sents, commonEngWords, finalwords.keySet(),
                // patterns.keySet(), outFile);
                // else
                // throw new RuntimeException("why is the answer label null?");
                return(finalwords);
            }
            else
            {
                if (constVars.wordScoring.Equals(GetPatternsFromDataMultiClass.WordScoring.Bpb))
                {
                    // BPB scoring: pick the single word whose best extracting pattern has
                    // the highest weight (ties broken by extraction count).
                    Counters.AddInPlace(terms, wordsPatExtracted);
                    ICounter <CandidatePhrase>       maxPatWeightTerms = new ClassicCounter <CandidatePhrase>();
                    IDictionary <CandidatePhrase, E> wordMaxPat        = new Dictionary <CandidatePhrase, E>();
                    foreach (KeyValuePair <CandidatePhrase, ClassicCounter <E> > en in terms.EntrySet())
                    {
                        ICounter <E> weights = new ClassicCounter <E>();
                        foreach (E k in en.Value.KeySet())
                        {
                            weights.SetCount(k, patternsLearnedThisIter.GetCount(k));
                        }
                        maxPatWeightTerms.SetCount(en.Key, Counters.Max(weights));
                        wordMaxPat[en.Key] = Counters.Argmax(weights);
                    }
                    Counters.RemoveKeys(maxPatWeightTerms, alreadyIdentifiedWords);
                    double maxvalue = Counters.Max(maxPatWeightTerms);
                    ICollection <CandidatePhrase> words = Counters.KeysAbove(maxPatWeightTerms, maxvalue - 1e-10);
                    CandidatePhrase bestw = null;
                    if (words.Count > 1)
                    {
                        double max = double.NegativeInfinity;
                        foreach (CandidatePhrase w in words)
                        {
                            if (terms.GetCount(w, wordMaxPat[w]) > max)
                            {
                                max   = terms.GetCount(w, wordMaxPat[w]);
                                bestw = w;
                            }
                        }
                    }
                    else
                    {
                        if (words.Count == 1)
                        {
                            // BUGFIX: the previous code read GetEnumerator().Current without
                            // first calling MoveNext(), which yields the default value (null)
                            // instead of the single element — a mistranslation of Java's
                            // iterator().next().
                            IEnumerator <CandidatePhrase> it = words.GetEnumerator();
                            it.MoveNext();
                            bestw = it.Current;
                        }
                        else
                        {
                            // No candidate cleared the threshold: nothing to return.
                            return(new ClassicCounter <CandidatePhrase>());
                        }
                    }
                    Redwood.Log(ConstantsAndVariables.minimaldebug, "Selected Words: " + bestw);
                    return(Counters.AsCounter(Arrays.AsList(bestw)));
                }
                else
                {
                    throw new Exception("wordscoring " + constVars.wordScoring + " not identified");
                }
            }
        }
 /// <summary>Classifies the datum by returning the label with the highest score.</summary>
 public virtual L ClassOf(IDatum <L, F> example) => Counters.Argmax(ScoresOf(example));
Beispiel #19
0
        /// <summary>Classifies the RVF datum by returning the label with the highest score.</summary>
        public virtual L ClassOf(RVFDatum <L, F> example)
        {
            return Counters.Argmax(ScoresOf(example));
        }
Beispiel #20
0
 /// <summary>
 /// Returns the argmax over the counter after linearly combining it with the
 /// prior's counter (weighted by priorMultiplier).
 /// </summary>
 public override E Argmax()
 {
     var combined = Counters.LinearCombination(this.counter, 1.0, prior.counter, priorMultiplier);
     return Counters.Argmax(combined);
 }