// Checks SetCount, IncrementCount, GetCount and ReverseIndexOrder on a small
// string-keyed TwoDimensionalIntCounter, cell by cell.
public virtual void TestTraditionalMain()
        {
            // Failure messages, one per phase of the test.
            const string setupMessage     = "Error in counter setup";
            const string incrementMessage = "Error in counter increment";
            const string reverseMessage   = "Error in counter reverseIndexOrder";
            // Tolerance handed to every numeric comparison below.
            const double tolerance = 1e-8;

            TwoDimensionalIntCounter <string, string> counter = new TwoDimensionalIntCounter <string, string>();

            // Phase 1: populate. ("a", "d") is set twice; the assertions below
            // expect the second value (-1.0) to be the one that is kept.
            counter.SetCount("a", "c", 1.0);
            counter.SetCount("b", "c", 1.0);
            counter.SetCount("a", "d", 1.0);
            counter.SetCount("a", "d", -1.0);
            counter.SetCount("b", "d", 1.0);

            NUnit.Framework.Assert.AreEqual(setupMessage, 1.0, counter.GetCount("a", "c"), tolerance);
            NUnit.Framework.Assert.AreEqual(setupMessage, 1.0, counter.GetCount("b", "c"), tolerance);
            NUnit.Framework.Assert.AreEqual(setupMessage, -1.0, counter.GetCount("a", "d"), tolerance);
            NUnit.Framework.Assert.AreEqual(setupMessage, 1.0, counter.GetCount("b", "d"), tolerance);
            // A pair that was never set is expected to read as zero.
            NUnit.Framework.Assert.AreEqual(setupMessage, 0.0, counter.GetCount("a", "a"), tolerance);

            // Phase 2: incrementing one cell must leave the other cells alone.
            counter.IncrementCount("b", "d", 1.0);

            NUnit.Framework.Assert.AreEqual(incrementMessage, -1.0, counter.GetCount("a", "d"), tolerance);
            NUnit.Framework.Assert.AreEqual(incrementMessage, 2.0, counter.GetCount("b", "d"), tolerance);
            NUnit.Framework.Assert.AreEqual(incrementMessage, 0.0, counter.GetCount("a", "a"), tolerance);

            // Phase 3: the reversed counter is expected to hold the same values
            // with first and second keys swapped.
            TwoDimensionalIntCounter <string, string> reversed = TwoDimensionalIntCounter.ReverseIndexOrder(counter);

            NUnit.Framework.Assert.AreEqual(reverseMessage, 1.0, reversed.GetCount("c", "a"), tolerance);
            NUnit.Framework.Assert.AreEqual(reverseMessage, 1.0, reversed.GetCount("c", "b"), tolerance);
            NUnit.Framework.Assert.AreEqual(reverseMessage, -1.0, reversed.GetCount("d", "a"), tolerance);
            NUnit.Framework.Assert.AreEqual(reverseMessage, 2.0, reversed.GetCount("d", "b"), tolerance);
            NUnit.Framework.Assert.AreEqual(reverseMessage, 0.0, reversed.GetCount("a", "a"), tolerance);
            // The original key order must no longer resolve in the reversed counter.
            NUnit.Framework.Assert.AreEqual(reverseMessage, 0.0, reversed.GetCount("a", "c"), tolerance);
        }
        /// <summary>
        /// Scores a morphological analysis against a tag. This method should never return 0!
        /// </summary>
        /// <remarks>
        /// When the analysis has been counted more than 100 times and has been observed with
        /// the tag at least once, the result is <c>(cMT / cM) * p_M / p_T</c>, where
        /// <c>p_M</c> and <c>p_T</c> are the relative frequencies of the analysis and of the tag.
        /// Otherwise a single add-one style constant,
        /// <c>1 / (morphTag.TotalCount() + tagIndex.Size() + 1)</c>, is returned; no smoothing
        /// against an unseen-tag distribution is applied.
        /// </remarks>
        /// <param name="tagId">Identifier of the tag, as used by <c>tagCounter</c> and <c>morphTag</c>.</param>
        /// <param name="morphId">Identifier of the morphological analysis, as used by <c>morphTag</c>.</param>
        /// <returns>The score described in the remarks.</returns>
        private double ProbMorphTag(int tagId, int morphId)
        {
            // Locals are declared as double on purpose: the counters may hand back
            // integral values, and every division below must be floating-point.
            double morphCount = morphTag.TotalCount(morphId);
            double jointCount = morphTag.GetCount(morphId, tagId);
            // p_M
            double morphPrior = morphCount / morphTag.TotalCount();
            // p_T
            double tagCount = tagCounter.GetCount(tagId);
            double tagPrior = tagCount / tagCounter.TotalCount();

            bool wellAttested = morphCount > 100.0 && jointCount > 0.0;
            if (!wellAttested)
            {
                // Rare or unseen analysis. Hack: unseen morph tags are extremely
                // rare, so a flat add-one estimate is used.
                return 1.0 / (morphTag.TotalCount() + tagIndex.Size() + 1.0);
            }

            // p_T_M, then turned around with p_M and p_T.
            double tagGivenMorph = jointCount / morphCount;
            return tagGivenMorph * morphPrior / tagPrior;
        }
        /// <summary>
        /// Scores a lemma against a tag. This method should never return 0!!
        /// </summary>
        /// <remarks>
        /// For a lemma with a positive count the result is <c>p_T_L * p_L / p_T</c>. The
        /// <c>p_T_L</c> term is the raw ratio <c>cLT / cL</c> when the lemma has been counted
        /// more than 100 times and seen with the tag; otherwise it is smoothed with
        /// <c>smooth[1]</c> against the tag's share of <c>lemmaTagUnseen</c>.
        /// For a lemma with no count the result is the tag's count in <c>lemmaTagUnseen</c>
        /// divided by <c>tagCounter.TotalCount()</c>.
        /// NOTE(review): that last branch yields 0 when the tag has no count in
        /// <c>lemmaTagUnseen</c>, which conflicts with the "never return 0" intent — confirm.
        /// </remarks>
        /// <param name="word">Surface form; not read by the current implementation.</param>
        /// <param name="loc">Position of the word; not read by the current implementation.</param>
        /// <param name="tagId">Identifier of the tag.</param>
        /// <param name="lemmaId">Identifier of the lemma.</param>
        /// <returns>The score described in the remarks.</returns>
        private double ProbLemmaTag(string word, int loc, int tagId, int lemmaId)
        {
            // Locals are declared as double on purpose so that every division
            // below is floating-point regardless of what the counters return.
            double lemmaCount = lemmaTag.TotalCount(lemmaId);
            double jointCount = lemmaTag.GetCount(lemmaId, tagId);
            // p_L
            double lemmaPrior = lemmaCount / lemmaTag.TotalCount();
            // p_T
            double tagCount = tagCounter.GetCount(tagId);
            double tagPrior = tagCount / tagCounter.TotalCount();

            bool lemmaSeen = lemmaCount > 0.0;
            if (!lemmaSeen)
            {
                // Unseen lemma (hack): fall back to the unseen-lemma tag counts.
                double unseenForTag = lemmaTagUnseen.GetCount(tagId);
                return unseenForTag / tagCounter.TotalCount();
            }

            // Seen lemma: estimate p_T_L.
            double tagGivenLemma;
            if (lemmaCount > 100.0 && jointCount > 0.0)
            {
                tagGivenLemma = jointCount / lemmaCount;
            }
            else
            {
                double unseenTagCount = lemmaTagUnseen.GetCount(tagId);
                // TODO(spenceg): p_T_U is 0??
                double unseenTagShare = unseenTagCount / lemmaTagUnseen.TotalCount();
                tagGivenLemma = (jointCount + smooth[1] * unseenTagShare) / (lemmaCount + smooth[1]);
            }

            return tagGivenLemma * lemmaPrior / tagPrior;
        }
        /// <summary>
        /// Scores a word against a tag.
        /// </summary>
        /// <remarks>
        /// For a word with a positive count the result is <c>p_T_W * p_W / p_T</c>. The
        /// <c>p_T_W</c> term is the raw ratio <c>cWT / cW</c> when the word has been counted
        /// more than 100 times and seen with the tag; otherwise it is smoothed with
        /// <c>smooth[1]</c> against the tag's share of <c>wordTagUnseen</c>.
        /// For a word with no count the result is <c>Math.Exp</c> of the score produced by
        /// the unknown word model.
        /// </remarks>
        /// <param name="word">Surface form, forwarded to the unknown word model.</param>
        /// <param name="loc">Position of the word, forwarded to the unknown word model.</param>
        /// <param name="wordId">Identifier of the word.</param>
        /// <param name="tagId">Identifier of the tag.</param>
        /// <returns>The score described in the remarks.</returns>
        private double ProbWordTag(string word, int loc, int wordId, int tagId)
        {
            // Locals are declared as double on purpose so that every division
            // below is floating-point regardless of what the counters return.
            double wordCount  = wordTag.TotalCount(wordId);
            double jointCount = wordTag.GetCount(wordId, tagId);
            // p_W
            double wordPrior = wordCount / wordTag.TotalCount();
            // p_T
            double tagCount = tagCounter.GetCount(tagId);
            double tagPrior = tagCount / tagCounter.TotalCount();

            bool wordSeen = wordCount > 0.0;
            if (!wordSeen)
            {
                // Unseen word. Score based on the word signature (of the surface form).
                IntTaggedWord taggedWord = new IntTaggedWord(wordId, tagId);
                double        tagTotal   = tagCounter.GetCount(tagId);
                return Math.Exp(GetUnknownWordModel().Score(taggedWord, loc, tagTotal, tagCounter.TotalCount(), smooth[0], word));
            }

            // Seen word: estimate p_T_W.
            double tagGivenWord;
            if (wordCount > 100.0 && jointCount > 0.0)
            {
                tagGivenWord = jointCount / wordCount;
            }
            else
            {
                double unseenTagCount = wordTagUnseen.GetCount(tagId);
                // TODO p_T_U is 0?
                double unseenTagShare = unseenTagCount / wordTagUnseen.TotalCount();
                tagGivenWord = (jointCount + smooth[1] * unseenTagShare) / (wordCount + smooth[1]);
            }

            return tagGivenWord * wordPrior / tagPrior;
        }
Exemple #5
0
        //  private static String stripTag(String tag) {
        //    if (tag.startsWith("DT")) {
        //      String newTag = tag.substring(2, tag.length());
        //      return newTag.length() > 0 ? newTag : tag;
        //    }
        //    return tag;
        //  }
        /// <summary>
        /// Command-line entry point. Loads a treebank, tallies word, lemma, tag and
        /// morphological-feature statistics over every token, and prints a report to
        /// standard output.
        /// </summary>
        /// <remarks>
        /// The report has four parts: summary counts; words whose single lemma occurs with
        /// more than one reduced tag; words with no lemma or with several lemmas; and, for
        /// each tag in sorted order, the reduced tags it occurs with.
        /// </remarks>
        /// <param name="args">
        /// Exactly three values: the language name, the treebank path, and a comma-separated
        /// list of morphological feature names to activate. Any other length prints a usage
        /// message and exits with status -1.
        /// </param>
        public static void Main(string[] args)
        {
            if (args.Length != 3)
            {
                System.Console.Error.Printf("Usage: java %s language filename features%n", typeof(TreebankFactoredLexiconStats).FullName);
                System.Environment.Exit(-1);
            }
            Language language = Language.ValueOf(args[0]);
            ITreebankLangParserParams tlpp = language.@params;

            // Arabic gets its own factored option; every other language is given
            // the French one.
            if (language.Equals(Language.Arabic))
            {
                string[] options = new string[] { "-arabicFactored" };
                tlpp.SetOptionFlag(options, 0);
            }
            else
            {
                string[] options = new string[] { "-frenchFactored" };
                tlpp.SetOptionFlag(options, 0);
            }
            Treebank tb = tlpp.DiskTreebank();

            tb.LoadPath(args[1]);
            MorphoFeatureSpecification morphoSpec = language.Equals(Language.Arabic) ? new ArabicMorphoFeatureSpecification() : new FrenchMorphoFeatureSpecification();

            string[] features = args[2].Trim().Split(",");
            foreach (string feature in features)
            {
                morphoSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.ValueOf(feature));
            }
            // Counters
            ICounter <string> wordTagCounter  = new ClassicCounter <string>(30000);
            ICounter <string> morphTagCounter = new ClassicCounter <string>(500);
            ICounter <string> morphCounter           = new ClassicCounter <string>(500);
            ICounter <string> wordCounter            = new ClassicCounter <string>(30000);
            ICounter <string> tagCounter             = new ClassicCounter <string>(300);
            ICounter <string> lemmaCounter           = new ClassicCounter <string>(25000);
            ICounter <string> lemmaTagCounter        = new ClassicCounter <string>(25000);
            ICounter <string> richTagCounter         = new ClassicCounter <string>(1000);
            ICounter <string> reducedTagCounter      = new ClassicCounter <string>(500);
            ICounter <string> reducedTagLemmaCounter = new ClassicCounter <string>(500);
            // word -> every lemma observed for that word
            IDictionary <string, ICollection <string> > wordLemmaMap           = Generics.NewHashMap();
            TwoDimensionalIntCounter <string, string>   lemmaReducedTagCounter = new TwoDimensionalIntCounter <string, string>(30000);
            // Despite its name, the first key of this counter is lemma + reducedTag.
            TwoDimensionalIntCounter <string, string>   reducedTagTagCounter   = new TwoDimensionalIntCounter <string, string>(500);
            TwoDimensionalIntCounter <string, string>   tagReducedTagCounter   = new TwoDimensionalIntCounter <string, string>(300);
            int numTrees = 0;

            foreach (Tree tree in tb)
            {
                // Apply the language-specific transformation to every non-leaf node
                // before reading the yields.
                foreach (Tree subTree in tree)
                {
                    if (!subTree.IsLeaf())
                    {
                        tlpp.TransformTree(subTree, tree);
                    }
                }
                IList <ILabel> pretermList = tree.PreTerminalYield();
                IList <ILabel> yield       = tree.Yield();
                System.Diagnostics.Debug.Assert(yield.Count == pretermList.Count);
                int yieldLen = yield.Count;
                for (int i = 0; i < yieldLen; ++i)
                {
                    string tag   = pretermList[i].Value();
                    string word  = yield[i].Value();
                    string morph = ((CoreLabel)yield[i]).OriginalText();
                    // Note: if there is no lemma, then we use the surface form.
                    Pair <string, string> lemmaTag = MorphoFeatureSpecification.SplitMorphString(word, morph);
                    string lemma   = lemmaTag.First();
                    string richTag = lemmaTag.Second();
                    // WSGDEBUG
                    if (tag.Contains("MW"))
                    {
                        lemma += "-MWE";
                    }
                    lemmaCounter.IncrementCount(lemma);
                    lemmaTagCounter.IncrementCount(lemma + tag);
                    richTagCounter.IncrementCount(richTag);
                    string reducedTag = morphoSpec.StrToFeatures(richTag).ToString();
                    reducedTagCounter.IncrementCount(reducedTag);
                    reducedTagLemmaCounter.IncrementCount(reducedTag + lemma);
                    wordTagCounter.IncrementCount(word + tag);
                    morphTagCounter.IncrementCount(morph + tag);
                    morphCounter.IncrementCount(morph);
                    wordCounter.IncrementCount(word);
                    tagCounter.IncrementCount(tag);
                    // From here on an empty reduced tag is recorded as "NONE"; the
                    // one-dimensional counters above were keyed on the raw value.
                    reducedTag = reducedTag.Equals(string.Empty) ? "NONE" : reducedTag;

                    // Record the lemma for this word. The set is created on first
                    // sight of the word and the lemma is added in every case, so the
                    // first lemma of a word is no longer dropped.
                    ICollection <string> lemmasForWord;
                    if (!wordLemmaMap.TryGetValue(word, out lemmasForWord))
                    {
                        lemmasForWord      = Generics.NewHashSet(1);
                        wordLemmaMap[word] = lemmasForWord;
                    }
                    lemmasForWord.Add(lemma);

                    lemmaReducedTagCounter.IncrementCount(lemma, reducedTag);
                    reducedTagTagCounter.IncrementCount(lemma + reducedTag, tag);
                    tagReducedTagCounter.IncrementCount(tag, reducedTag);
                }
                ++numTrees;
            }
            // Barf...
            System.Console.Out.WriteLine("Language: " + language.ToString());
            System.Console.Out.Printf("#trees:\t%d%n", numTrees);
            System.Console.Out.Printf("#tokens:\t%d%n", (int)wordCounter.TotalCount());
            System.Console.Out.Printf("#words:\t%d%n", wordCounter.KeySet().Count);
            System.Console.Out.Printf("#tags:\t%d%n", tagCounter.KeySet().Count);
            System.Console.Out.Printf("#wordTagPairs:\t%d%n", wordTagCounter.KeySet().Count);
            System.Console.Out.Printf("#lemmas:\t%d%n", lemmaCounter.KeySet().Count);
            System.Console.Out.Printf("#lemmaTagPairs:\t%d%n", lemmaTagCounter.KeySet().Count);
            System.Console.Out.Printf("#feattags:\t%d%n", reducedTagCounter.KeySet().Count);
            System.Console.Out.Printf("#feattag+lemmas:\t%d%n", reducedTagLemmaCounter.KeySet().Count);
            System.Console.Out.Printf("#richtags:\t%d%n", richTagCounter.KeySet().Count);
            System.Console.Out.Printf("#richtag+lemma:\t%d%n", morphCounter.KeySet().Count);
            System.Console.Out.Printf("#richtag+lemmaTagPairs:\t%d%n", morphTagCounter.KeySet().Count);
            // Extra
            System.Console.Out.WriteLine("==================");
            // Words without a usable single lemma are collected here and printed
            // after the per-word listing.
            StringBuilder sbNoLemma    = new StringBuilder();
            StringBuilder sbMultLemmas = new StringBuilder();

            foreach (KeyValuePair <string, ICollection <string> > wordLemmas in wordLemmaMap)
            {
                string word = wordLemmas.Key;
                ICollection <string> lemmas = wordLemmas.Value;
                if (lemmas.Count == 0)
                {
                    sbNoLemma.Append("NO LEMMAS FOR WORD: " + word + "\n");
                    continue;
                }
                if (lemmas.Count > 1)
                {
                    sbMultLemmas.Append("MULTIPLE LEMMAS: " + word + " " + SetToString(lemmas) + "\n");
                    continue;
                }
                // Exactly one lemma remains. Take it by iterating: an enumerator's
                // Current is not valid until MoveNext has been called.
                string lemma = null;
                foreach (string onlyLemma in lemmas)
                {
                    lemma = onlyLemma;
                    break;
                }
                ICollection <string> reducedTags = lemmaReducedTagCounter.GetCounter(lemma).KeySet();
                if (reducedTags.Count > 1)
                {
                    System.Console.Out.Printf("%s --> %s%n", word, lemma);
                    foreach (string reducedTag in reducedTags)
                    {
                        int    count   = lemmaReducedTagCounter.GetCount(lemma, reducedTag);
                        string posTags = SetToString(reducedTagTagCounter.GetCounter(lemma + reducedTag).KeySet());
                        System.Console.Out.Printf("\t%s\t%d\t%s%n", reducedTag, count, posTags);
                    }
                    System.Console.Out.WriteLine();
                }
            }
            System.Console.Out.WriteLine("==================");
            System.Console.Out.WriteLine(sbNoLemma.ToString());
            System.Console.Out.WriteLine(sbMultLemmas.ToString());
            System.Console.Out.WriteLine("==================");
            // Declared as List<string> because IList<string> has no Sort method.
            // Ordinal comparison keeps the order independent of the current culture.
            List <string> tags = new List <string>(tagReducedTagCounter.FirstKeySet());

            tags.Sort(System.StringComparer.Ordinal);
            foreach (string tag_1 in tags)
            {
                System.Console.Out.WriteLine(tag_1);
                ICollection <string> reducedTags = tagReducedTagCounter.GetCounter(tag_1).KeySet();
                foreach (string reducedTag in reducedTags)
                {
                    int count = tagReducedTagCounter.GetCount(tag_1, reducedTag);
                    System.Console.Out.Printf("\t%s\t%d%n", reducedTag, count);
                }
                System.Console.Out.WriteLine();
            }
            System.Console.Out.WriteLine("==================");
        }