/// <summary>
        /// Runs every tree of the development treebank through the language-specific
        /// transformer and converts the resulting trees into lexicon tuning events.
        /// </summary>
        /// <param name="devTreebank">The development treebank to read trees from.</param>
        /// <param name="lexicon">The lexicon handed to <c>TreebankToLexiconEvents</c>.</param>
        /// <param name="tlpp">Language parameters whose <c>TransformTree</c> is applied to each non-leaf node.</param>
        /// <returns>The events produced by <c>TreebankToLexiconEvents</c> for the collected trees.</returns>
        private static IList <FactoredLexiconEvent> GetTuningSet(Treebank devTreebank, Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon lexicon, ITreebankLangParserParams tlpp)
        {
            IList <Tree> collected = new List <Tree>(3000);

            foreach (Tree devTree in devTreebank)
            {
                foreach (Tree node in devTree)
                {
                    if (subTreeIsLeaf(node))
                    {
                        continue;
                    }
                    // The return value is ignored, so only changes made in place are kept.
                    tlpp.TransformTree(node, devTree);
                }
                collected.Add(devTree);
            }
            return(TreebankToLexiconEvents(collected, lexicon));

            // Local predicate: leaves are skipped by the transformer pass.
            bool subTreeIsLeaf(Tree candidate)
            {
                return(candidate.IsLeaf());
            }
        }
Example #2
0
        //  private static String stripTag(String tag) {
        //    if (tag.startsWith("DT")) {
        //      String newTag = tag.substring(2, tag.length());
        //      return newTag.length() > 0 ? newTag : tag;
        //    }
        //    return tag;
        //  }
        /// <summary>
        /// Command-line entry point that prints lexical and morphological statistics for a treebank.
        /// </summary>
        /// <remarks>
        /// Every tree is passed through the language-specific tree transformer, then per-token
        /// counts (words, tags, lemmas, rich tags and reduced feature tags) are accumulated and
        /// written to standard output. Arabic gets the Arabic factored options; every other
        /// language value is configured as French.
        /// </remarks>
        /// <param name="args">
        /// Exactly three values: the language name, the treebank path, and a comma-separated
        /// list of morphological feature names to activate. Any other argument count prints a
        /// usage message and exits with status -1.
        /// </param>
        public static void Main(string[] args)
        {
            if (args.Length != 3)
            {
                System.Console.Error.Printf("Usage: java %s language filename features%n", typeof(TreebankFactoredLexiconStats).FullName);
                System.Environment.Exit(-1);
            }
            Language language = Language.ValueOf(args[0]);
            ITreebankLangParserParams tlpp = language.@params;

            if (language.Equals(Language.Arabic))
            {
                string[] options = new string[] { "-arabicFactored" };
                tlpp.SetOptionFlag(options, 0);
            }
            else
            {
                string[] options = new string[] { "-frenchFactored" };
                tlpp.SetOptionFlag(options, 0);
            }
            Treebank tb = tlpp.DiskTreebank();

            tb.LoadPath(args[1]);
            MorphoFeatureSpecification morphoSpec = language.Equals(Language.Arabic) ? new ArabicMorphoFeatureSpecification() : new FrenchMorphoFeatureSpecification();

            string[] features = args[2].Trim().Split(",");
            foreach (string feature in features)
            {
                morphoSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.ValueOf(feature));
            }
            // Counters
            ICounter <string> wordTagCounter  = new ClassicCounter <string>(30000);
            ICounter <string> morphTagCounter = new ClassicCounter <string>(500);
            ICounter <string> morphCounter           = new ClassicCounter <string>(500);
            ICounter <string> wordCounter            = new ClassicCounter <string>(30000);
            ICounter <string> tagCounter             = new ClassicCounter <string>(300);
            ICounter <string> lemmaCounter           = new ClassicCounter <string>(25000);
            ICounter <string> lemmaTagCounter        = new ClassicCounter <string>(25000);
            ICounter <string> richTagCounter         = new ClassicCounter <string>(1000);
            ICounter <string> reducedTagCounter      = new ClassicCounter <string>(500);
            ICounter <string> reducedTagLemmaCounter = new ClassicCounter <string>(500);
            IDictionary <string, ICollection <string> > wordLemmaMap           = Generics.NewHashMap();
            TwoDimensionalIntCounter <string, string>   lemmaReducedTagCounter = new TwoDimensionalIntCounter <string, string>(30000);
            TwoDimensionalIntCounter <string, string>   reducedTagTagCounter   = new TwoDimensionalIntCounter <string, string>(500);
            TwoDimensionalIntCounter <string, string>   tagReducedTagCounter   = new TwoDimensionalIntCounter <string, string>(300);
            int numTrees = 0;

            foreach (Tree tree in tb)
            {
                foreach (Tree subTree in tree)
                {
                    if (!subTree.IsLeaf())
                    {
                        tlpp.TransformTree(subTree, tree);
                    }
                }
                IList <ILabel> pretermList = tree.PreTerminalYield();
                IList <ILabel> yield       = tree.Yield();
                System.Diagnostics.Debug.Assert(yield.Count == pretermList.Count);
                int yieldLen = yield.Count;
                for (int i = 0; i < yieldLen; ++i)
                {
                    string tag   = pretermList[i].Value();
                    string word  = yield[i].Value();
                    string morph = ((CoreLabel)yield[i]).OriginalText();
                    // Note: if there is no lemma, then we use the surface form.
                    Pair <string, string> lemmaTag = MorphoFeatureSpecification.SplitMorphString(word, morph);
                    string lemma   = lemmaTag.First();
                    string richTag = lemmaTag.Second();
                    // WSGDEBUG
                    if (tag.Contains("MW"))
                    {
                        lemma += "-MWE";
                    }
                    lemmaCounter.IncrementCount(lemma);
                    lemmaTagCounter.IncrementCount(lemma + tag);
                    richTagCounter.IncrementCount(richTag);
                    string reducedTag = morphoSpec.StrToFeatures(richTag).ToString();
                    reducedTagCounter.IncrementCount(reducedTag);
                    reducedTagLemmaCounter.IncrementCount(reducedTag + lemma);
                    wordTagCounter.IncrementCount(word + tag);
                    morphTagCounter.IncrementCount(morph + tag);
                    morphCounter.IncrementCount(morph);
                    wordCounter.IncrementCount(word);
                    tagCounter.IncrementCount(tag);
                    // The one- and two-key counters below use "NONE" for an empty feature string;
                    // the counters above were already updated with the raw (possibly empty) value.
                    reducedTag = reducedTag.Equals(string.Empty) ? "NONE" : reducedTag;
                    // Record the lemma for this word. The set is created on first sight of the
                    // word and the lemma is always added, so the first occurrence is not lost.
                    ICollection <string> lemmasForWord;
                    if (!wordLemmaMap.TryGetValue(word, out lemmasForWord))
                    {
                        lemmasForWord      = Generics.NewHashSet(1);
                        wordLemmaMap[word] = lemmasForWord;
                    }
                    lemmasForWord.Add(lemma);
                    lemmaReducedTagCounter.IncrementCount(lemma, reducedTag);
                    reducedTagTagCounter.IncrementCount(lemma + reducedTag, tag);
                    tagReducedTagCounter.IncrementCount(tag, reducedTag);
                }
                ++numTrees;
            }
            // Summary statistics
            System.Console.Out.WriteLine("Language: " + language.ToString());
            System.Console.Out.Printf("#trees:\t%d%n", numTrees);
            System.Console.Out.Printf("#tokens:\t%d%n", (int)wordCounter.TotalCount());
            System.Console.Out.Printf("#words:\t%d%n", wordCounter.KeySet().Count);
            System.Console.Out.Printf("#tags:\t%d%n", tagCounter.KeySet().Count);
            System.Console.Out.Printf("#wordTagPairs:\t%d%n", wordTagCounter.KeySet().Count);
            System.Console.Out.Printf("#lemmas:\t%d%n", lemmaCounter.KeySet().Count);
            System.Console.Out.Printf("#lemmaTagPairs:\t%d%n", lemmaTagCounter.KeySet().Count);
            System.Console.Out.Printf("#feattags:\t%d%n", reducedTagCounter.KeySet().Count);
            System.Console.Out.Printf("#feattag+lemmas:\t%d%n", reducedTagLemmaCounter.KeySet().Count);
            System.Console.Out.Printf("#richtags:\t%d%n", richTagCounter.KeySet().Count);
            // NOTE(review): the next two labels mention lemmas, but the values are the number of
            // distinct morph strings and distinct morph+tag strings respectively.
            System.Console.Out.Printf("#richtag+lemma:\t%d%n", morphCounter.KeySet().Count);
            System.Console.Out.Printf("#richtag+lemmaTagPairs:\t%d%n", morphTagCounter.KeySet().Count);
            // Extra
            System.Console.Out.WriteLine("==================");
            StringBuilder sbNoLemma    = new StringBuilder();
            StringBuilder sbMultLemmas = new StringBuilder();

            foreach (KeyValuePair <string, ICollection <string> > wordLemmas in wordLemmaMap)
            {
                string word = wordLemmas.Key;
                ICollection <string> lemmas = wordLemmas.Value;
                if (lemmas.Count == 0)
                {
                    sbNoLemma.Append("NO LEMMAS FOR WORD: " + word + "\n");
                    continue;
                }
                if (lemmas.Count > 1)
                {
                    sbMultLemmas.Append("MULTIPLE LEMMAS: " + word + " " + SetToString(lemmas) + "\n");
                    continue;
                }
                // Exactly one lemma remains; take it by enumerating, since an enumerator's
                // Current is not valid until it has been advanced.
                string lemma = null;
                foreach (string onlyLemma in lemmas)
                {
                    lemma = onlyLemma;
                    break;
                }
                ICollection <string> reducedTags = lemmaReducedTagCounter.GetCounter(lemma).KeySet();
                if (reducedTags.Count > 1)
                {
                    System.Console.Out.Printf("%s --> %s%n", word, lemma);
                    foreach (string reducedTag in reducedTags)
                    {
                        int    count   = lemmaReducedTagCounter.GetCount(lemma, reducedTag);
                        string posTags = SetToString(reducedTagTagCounter.GetCounter(lemma + reducedTag).KeySet());
                        System.Console.Out.Printf("\t%s\t%d\t%s%n", reducedTag, count, posTags);
                    }
                    System.Console.Out.WriteLine();
                }
            }
            System.Console.Out.WriteLine("==================");
            System.Console.Out.WriteLine(sbNoLemma.ToString());
            System.Console.Out.WriteLine(sbMultLemmas.ToString());
            System.Console.Out.WriteLine("==================");
            IList <string> tags = new List <string>(tagReducedTagCounter.FirstKeySet());

            tags.Sort();
            foreach (string tag_1 in tags)
            {
                System.Console.Out.WriteLine(tag_1);
                ICollection <string> reducedTags = tagReducedTagCounter.GetCounter(tag_1).KeySet();
                foreach (string reducedTag in reducedTags)
                {
                    int count = tagReducedTagCounter.GetCount(tag_1, reducedTag);
                    System.Console.Out.Printf("\t%s\t%d%n", reducedTag, count);
                }
                System.Console.Out.WriteLine();
            }
            System.Console.Out.WriteLine("==================");
        }
        /// <summary>
        /// Command-line entry point: trains a factored lexicon on a training treebank, then
        /// scores it on a development treebank and reports tagging accuracy and errors by gold tag.
        /// </summary>
        /// <param name="args">
        /// Exactly four values: language, comma-separated feature list, training treebank path,
        /// development treebank path. Any other count prints usage and exits with status -1.
        /// Only Arabic and French are accepted; other languages raise NotSupportedException.
        /// </param>
        public static void Main(string[] args)
        {
            if (args.Length != 4)
            {
                System.Console.Error.Printf("Usage: java %s language features train_file dev_file%n", typeof(Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon).FullName);
                System.Environment.Exit(-1);
            }

            // Resolve the language and load both treebanks.
            Language language = Language.ValueOf(args[0]);
            ITreebankLangParserParams tlpp = language.@params;
            Treebank trainTreebank = tlpp.DiskTreebank();
            trainTreebank.LoadPath(args[2]);
            Treebank devTreebank = tlpp.DiskTreebank();
            devTreebank.LoadPath(args[3]);

            Options options = GetOptions(language);
            MorphoFeatureSpecification morphoSpec;
            string factoredFlag;
            if (language.Equals(Language.Arabic))
            {
                morphoSpec   = new ArabicMorphoFeatureSpecification();
                factoredFlag = "-arabicFactored";
            }
            else if (language.Equals(Language.French))
            {
                morphoSpec   = new FrenchMorphoFeatureSpecification();
                factoredFlag = "-frenchFactored";
            }
            else
            {
                throw new NotSupportedException();
            }
            tlpp.SetOptionFlag(new string[] { factoredFlag }, 0);

            // Activate each requested morphological feature.
            string featureList = args[1];
            foreach (string featureName in featureList.Trim().Split(","))
            {
                morphoSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.ValueOf(featureName));
            }
            System.Console.Out.WriteLine("Language: " + language.ToString());
            System.Console.Out.WriteLine("Features: " + args[1]);

            // The training trees are materialised in a list because Train takes a collection.
            System.Console.Out.Write("Loading training trees...");
            IList <Tree>    trainTrees = new List <Tree>(19000);
            IIndex <string> wordIndex  = new HashIndex <string>();
            IIndex <string> tagIndex   = new HashIndex <string>();
            foreach (Tree trainTree in trainTreebank)
            {
                foreach (Tree node in trainTree)
                {
                    if (node.IsLeaf())
                    {
                        continue;
                    }
                    tlpp.TransformTree(node, trainTree);
                }
                trainTrees.Add(trainTree);
            }
            System.Console.Out.Printf("Done! (%d trees)%n", trainTrees.Count);

            // Train the lexicon on the collected trees.
            System.Console.Out.Write("Collecting sufficient statistics for lexicon...");
            Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon lexicon = new Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon(options, morphoSpec, wordIndex, tagIndex);
            lexicon.InitializeTraining(trainTrees.Count);
            lexicon.Train(trainTrees, null);
            lexicon.FinishTraining();
            System.Console.Out.WriteLine("Done!");
            trainTrees = null;

            // Build the tuning events from the development treebank.
            System.Console.Out.Write("Loading tuning set...");
            IList <FactoredLexiconEvent> tuningSet = GetTuningSet(devTreebank, lexicon, tlpp);
            System.Console.Out.Printf("...Done! (%d events)%n", tuningSet.Count);

            // Score every event: the hypothesis is the highest-scoring tag among the candidates.
            // TODO(spenceg): Implement tagging accuracy with FactLex
            int nCorrect = 0;
            ICounter <string> errors = new ClassicCounter <string>();
            foreach (FactoredLexiconEvent @event in tuningSet)
            {
                IEnumerator <IntTaggedWord> candidates = lexicon.RuleIteratorByWord(@event.Word(), @event.GetLoc(), @event.FeatureStr());
                ICounter <int> logScores = new ClassicCounter <int>();
                bool sawCandidate = false;
                int  goldTagId    = -1;
                while (candidates.MoveNext())
                {
                    sawCandidate = true;
                    IntTaggedWord candidate = candidates.Current;
                    if (candidate.Tag() == @event.TagId())
                    {
                        log.Info("GOLD-");
                        goldTagId = candidate.Tag();
                    }
                    float tagScore = lexicon.Score(candidate, @event.GetLoc(), @event.Word(), @event.FeatureStr());
                    logScores.IncrementCount(candidate.Tag(), tagScore);
                }
                if (!sawCandidate)
                {
                    System.Console.Error.Printf("NO TAGGINGS: %s %s%n", @event.Word(), @event.FeatureStr());
                }
                else
                {
                    int hypTagId = Counters.Argmax(logScores);
                    if (hypTagId != goldTagId)
                    {
                        // A gold id of -1 means the gold tag was never among the candidates.
                        string goldTag = goldTagId < 0 ? "UNSEEN" : lexicon.tagIndex.Get(goldTagId);
                        errors.IncrementCount(goldTag);
                    }
                    else
                    {
                        ++nCorrect;
                    }
                }
                log.Info();
            }

            // Report accuracy over all tuning events, then the error distribution by gold tag.
            double acc = (double)nCorrect / (double)tuningSet.Count;
            System.Console.Error.Printf("%n%nACCURACY: %.2f%n%n", acc * 100.0);
            log.Info("% of errors by type:");
            IList <string> biggestKeys = new List <string>(errors.KeySet());
            // Keys are ordered on the raw counts before the counter is normalised.
            biggestKeys.Sort(Counters.ToComparator(errors, false, true));
            Counters.Normalize(errors);
            foreach (string errorTag in biggestKeys)
            {
                System.Console.Error.Printf("%s\t%.2f%n", errorTag, errors.GetCount(errorTag) * 100.0);
            }
        }
Example #4
0
        /// <summary>Do the category splitting of the tree passed in.</summary>
        /// <remarks>
        /// This is initially called on the root node of a tree, and it recursively
        /// calls itself on children.  A depth first left-to-right traversal is
        /// done whereby a tree node's children are first transformed and then
        /// the parent is transformed.  At the time of calling, the original root
        /// always sits above the current node.  This routine can be assumed to,
        /// and does, change the tree passed in: it destructively modifies tree nodes,
        /// and makes new tree structure when it needs to.
        /// </remarks>
        /// <param name="t">The tree node to subcategorize.  May be null, in which case null is returned.</param>
        /// <param name="root">
        /// The root of the tree.  It must contain
        /// <paramref name="t"/>
        /// or
        /// this code will throw a NullReferenceException when it reads the parent's label.
        /// </param>
        /// <returns>
        /// The annotated tree: <paramref name="t"/> itself for leaves and, when tag splitting is
        /// disabled, for pre-terminals; otherwise the result of the language-specific transform.
        /// </returns>
        /// <exception cref="Exception">
        /// The head finder returns no head (or a head without a label) for a phrasal node, or the
        /// head label carries no word or no tag.
        /// </exception>
        private Tree TransformTreeHelper(Tree t, Tree root)
        {
            if (t == null)
            {
                // handle null
                return(null);
            }
            if (t.IsLeaf())
            {
                //No need to change the label
                return(t);
            }
            string cat = t.Label().Value();
            Tree   parent;
            string parentStr;
            string grandParentStr;

            if (root == null || t.Equals(root))
            {
                parent    = null;
                parentStr = string.Empty;
            }
            else
            {
                parent    = t.Parent(root);
                parentStr = parent.Label().Value();
            }
            if (parent == null || parent.Equals(root))
            {
                grandParentStr = string.Empty;
            }
            else
            {
                grandParentStr = parent.Parent(root).Label().Value();
            }
            string baseParentStr      = tlpParams.TreebankLanguagePack().BasicCategory(parentStr);
            string baseGrandParentStr = tlpParams.TreebankLanguagePack().BasicCategory(grandParentStr);

            if (t.IsPreTerminal())
            {
                // handle tags
                // The only child is passed with a null root; for a leaf the call returns it unchanged.
                Tree childResult = TransformTreeHelper(t.Children()[0], null);
                string word = childResult.Value();
                // would be nicer if Word/CWT ??
                if (!trainOptions.noTagSplit)
                {
                    if (trainOptions.tagPA)
                    {
                        string test = cat + "^" + baseParentStr;
                        if (!trainOptions.tagSelectiveSplit || trainOptions.splitters.Contains(test))
                        {
                            cat = test;
                        }
                    }
                    // parent is null when t is the root (or root is null); such a tag has no
                    // parent to be the only child of, so it is not marked.  This mirrors the
                    // null check used for markUnary == 2 on phrasal nodes below.
                    if (trainOptions.markUnaryTags && parent != null && parent.NumChildren() == 1)
                    {
                        cat = cat + "^U";
                    }
                }
                // otherwise, leave the tags alone!
                ILabel label = t.Label().LabelFactory().NewLabel(t.Label());
                label.SetValue(cat);
                if (label is IHasCategory)
                {
                    ((IHasCategory)label).SetCategory(cat);
                }
                if (label is IHasWord)
                {
                    ((IHasWord)label).SetWord(word);
                }
                if (label is IHasTag)
                {
                    ((IHasTag)label).SetTag(cat);
                }
                t.SetLabel(label);
                t.SetChild(0, childResult);
                // just in case word is changed
                if (trainOptions.noTagSplit)
                {
                    return(t);
                }
                else
                {
                    // language-specific transforms
                    return(tlpParams.TransformTree(t, root));
                }
            }
            // end isPreTerminal()
            // handle phrasal categories: children are transformed before the node itself
            Tree[] kids = t.Children();
            for (int childNum = 0; childNum < kids.Length; childNum++)
            {
                Tree child       = kids[childNum];
                Tree childResult = TransformTreeHelper(child, root);
                // recursive call
                t.SetChild(childNum, childResult);
            }
            Tree headChild = hf.DetermineHead(t);

            if (headChild == null || headChild.Label() == null)
            {
                throw new Exception("TreeAnnotator: null head found for tree [suggesting incomplete/wrong HeadFinder]:\n" + t);
            }
            ILabel headLabel = headChild.Label();

            if (!(headLabel is IHasWord))
            {
                throw new Exception("TreeAnnotator: Head label lacks a Word annotation!");
            }
            if (!(headLabel is IHasTag))
            {
                throw new Exception("TreeAnnotator: Head label lacks a Tag annotation!");
            }
            // The phrasal node inherits the word and tag of its head child.
            string word_1 = ((IHasWord)headLabel).Word();
            string tag    = ((IHasTag)headLabel).Tag();
            string baseCat = tlpParams.TreebankLanguagePack().BasicCategory(cat);

            /* Sister annotation. Potential problem: if multiple sisters are
             * strong indicators for a single category's expansions.  This
             * happens concretely in the Chinese Treebank when NP (object)
             * has left sisters VV and AS.  Could lead to too much
             * sparseness.  The ideal solution would be to give the
             * splitting list an ordering, and take only the highest (~most
             * informative/reliable) sister annotation.
             */
            if (trainOptions.sisterAnnotate && !trainOptions.smoothing && baseParentStr.Length > 0)
            {
                IList <string> leftSis  = ListBasicCategories(SisterAnnotationStats.LeftSisterLabels(t, parent));
                IList <string> rightSis = ListBasicCategories(SisterAnnotationStats.RightSisterLabels(t, parent));
                IList <string> leftAnn  = new List <string>();
                IList <string> rightAnn = new List <string>();
                foreach (string s in leftSis)
                {
                    leftAnn.Add(baseCat + "=l=" + tlpParams.TreebankLanguagePack().BasicCategory(s));
                }
                foreach (string s_1 in rightSis)
                {
                    rightAnn.Add(baseCat + "=r=" + tlpParams.TreebankLanguagePack().BasicCategory(s_1));
                }
                // Only the first matching splitter is applied.
                foreach (string annCat in trainOptions.sisterSplitters)
                {
                    if (leftAnn.Contains(annCat) || rightAnn.Contains(annCat))
                    {
                        // NOTE(review): presumably ReplaceAll treats its first argument as a regex,
                        // so "^" anchors the match and the leading base category is stripped - confirm.
                        cat = cat + annCat.ReplaceAll("^" + baseCat, string.Empty);
                        break;
                    }
                }
            }
            // Parent annotation
            if (trainOptions.Pa && !trainOptions.smoothing && baseParentStr.Length > 0)
            {
                string cat2 = baseCat + "^" + baseParentStr;
                if (!trainOptions.selectiveSplit || trainOptions.splitters.Contains(cat2))
                {
                    cat = cat + "^" + baseParentStr;
                }
            }
            // Grandparent annotation
            if (trainOptions.gPA && !trainOptions.smoothing && grandParentStr.Length > 0)
            {
                if (trainOptions.selectiveSplit)
                {
                    string cat2 = baseCat + "^" + baseParentStr + "~" + baseGrandParentStr;
                    // Under selective splitting the grandparent is added only if a "^" annotation is already present.
                    if (cat.Contains("^") && trainOptions.splitters.Contains(cat2))
                    {
                        cat = cat + "~" + baseGrandParentStr;
                    }
                }
                else
                {
                    cat = cat + "~" + baseGrandParentStr;
                }
            }
            // Unary marking: mode 1 marks a node with a single child, mode 2 a node that is an only child.
            if (trainOptions.markUnary > 0)
            {
                if (trainOptions.markUnary == 1 && kids.Length == 1 && kids[0].Depth() >= 2)
                {
                    cat = cat + "-U";
                }
                else
                {
                    if (trainOptions.markUnary == 2 && parent != null && parent.NumChildren() == 1 && t.Depth() >= 2)
                    {
                        cat = cat + "-u";
                    }
                }
            }
            if (trainOptions.rightRec && RightRec(t, baseCat))
            {
                cat = cat + "-R";
            }
            if (trainOptions.leftRec && LeftRec(t, baseCat))
            {
                cat = cat + "-L";
            }
            if (trainOptions.splitPrePreT && t.IsPrePreTerminal())
            {
                cat = cat + "-PPT";
            }
            // Build a fresh label carrying the annotated category plus the head word and tag.
            ILabel label_1 = t.Label().LabelFactory().NewLabel(t.Label());

            label_1.SetValue(cat);
            if (label_1 is IHasCategory)
            {
                ((IHasCategory)label_1).SetCategory(cat);
            }
            if (label_1 is IHasWord)
            {
                ((IHasWord)label_1).SetWord(word_1);
            }
            if (label_1 is IHasTag)
            {
                ((IHasTag)label_1).SetTag(tag);
            }
            t.SetLabel(label_1);
            return(tlpParams.TransformTree(t, root));
        }