/// <summary>
/// Runs the base-class transformation on a node and then, when the node is a
/// preterminal and a tag specification is configured, replaces its category with
/// the tag derived from the morphological analysis stored on its leaf.
/// </summary>
/// <param name="t">The node to transform.</param>
/// <param name="root">Root of the tree containing the node; forwarded to the base implementation.</param>
/// <returns>The node returned by the base transformation, with its value (and tag, when the label is an IHasTag) updated.</returns>
/// <exception cref="Exception">
/// The node is a preterminal, a tag specification is set, and the leaf label is either
/// not a CoreLabel or has no original text.
/// </exception>
public override Tree TransformTree(Tree t, Tree root)
        {
            // Perform tregex-powered annotations
            t = base.TransformTree(t, root);
            string cat = t.Value();
            bool isPreTerminal = t.IsPreTerminal();

            // Add morphosyntactic features if this is a POS tag
            if (isPreTerminal && tagSpec != null)
            {
                // The morphological analysis is read from the leaf's original-text slot.
                CoreLabel leafLabel = t.FirstChild().Label() as CoreLabel;
                string morphoStr = leafLabel == null ? null : leafLabel.OriginalText();
                if (morphoStr == null)
                {
                    // .NET composite formatting uses {n}; Java-style %s would be emitted verbatim.
                    throw new Exception(string.Format("{0}: Term lacks morpho analysis: {1}", this.GetType().FullName, t.ToString()));
                }
                Pair<string, string> lemmaMorph = MorphoFeatureSpecification.SplitMorphString(string.Empty, morphoStr);
                MorphoFeatures feats = tagSpec.StrToFeatures(lemmaMorph.Second());
                cat = feats.GetTag(cat);
            }

            // Update the label(s)
            t.SetValue(cat);
            if (isPreTerminal)
            {
                IHasTag tagged = t.Label() as IHasTag;
                if (tagged != null)
                {
                    tagged.SetTag(cat);
                }
            }
            return t;
        }
        /// <summary>
        /// Scores a word/tag pair as the sum of the log word-given-tag and log
        /// morph-given-tag factors.
        /// </summary>
        /// <param name="iTW">The word/tag id pair being scored.</param>
        /// <param name="loc">Position value forwarded to ProbWordTag.</param>
        /// <param name="word">Surface form of the word.</param>
        /// <param name="featureSpec">Morphological analysis string for the word.</param>
        /// <returns>
        /// 0.0f for the boundary word paired with the boundary tag; otherwise the combined
        /// log score, or negative infinity when that score is not greater than -100.
        /// </returns>
        public override float Score(IntTaggedWord iTW, int loc, string word, string featureSpec)
        {
            int wordId = iTW.Word();
            int tagId = iTW.Tag();

            // Force the 1-best path through the boundary symbol (deterministic tagging).
            int boundaryWordId = wordIndex.IndexOf(LexiconConstants.Boundary);
            int boundaryTagId = tagIndex.IndexOf(LexiconConstants.BoundaryTag);
            bool isBoundary = wordId == boundaryWordId && tagId == boundaryTagId;
            if (isBoundary)
            {
                return 0.0f;
            }

            // Morphological features. The tag string and lemma id are looked up but not
            // used below; the lookups are retained as-is.
            string tag = tagIndex.Get(iTW.Tag());
            Pair<string, string> split = MorphoFeatureSpecification.SplitMorphString(word, featureSpec);
            string lemma = split.First();
            int lemmaId = wordIndex.IndexOf(lemma);
            string richMorphTag = split.Second();

            string reducedMorphTag = morphoSpec.StrToFeatures(richMorphTag).ToString().Trim();
            if (reducedMorphTag.Length == 0)
            {
                reducedMorphTag = NoMorphAnalysis;
            }
            int morphId = morphIndex.AddToIndex(reducedMorphTag);

            // Score the individual factors; the lemma factor is currently a constant zero.
            double logWordGivenTag = Math.Log(ProbWordTag(word, loc, wordId, tagId));
            double logLemmaGivenTag = 0.0;
            double logMorphGivenTag = Math.Log(ProbMorphTag(tagId, morphId));
            double total = logWordGivenTag + logLemmaGivenTag + logMorphGivenTag;

            // Filter low probability taggings
            if (total > -100.0)
            {
                return (float)total;
            }
            return float.NegativeInfinity;
        }
        /// <summary>
        /// Reads trees from a UTF-8 encoded file and prints one "word lemma morph" line per
        /// token, followed by a blank line after each tree.
        /// </summary>
        /// <param name="args">Exactly one value: the path of the tree file to read.</param>
        public static void Main(string[] args)
        {
            if (args.Length != 1)
            {
                System.Console.Error.Printf("Usage: java %s tree_file%n", typeof(TreeToMorfette).FullName);
                System.Environment.Exit(-1);
            }
            string treeFile = args[0];
            ITreeReaderFactory trf = new FrenchTreeReaderFactory();

            try
            {
                ITreeReader tr = trf.NewTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8")));
                try
                {
                    Tree tree1;
                    while ((tree1 = tr.ReadTree()) != null)
                    {
                        IList<ILabel> pretermYield = tree1.PreTerminalYield();
                        IList<ILabel> yield = tree1.Yield();
                        int yieldLen = yield.Count;
                        for (int i = 0; i < yieldLen; ++i)
                        {
                            CoreLabel rawToken = (CoreLabel)yield[i];
                            string word = rawToken.Value();
                            string morphStr = rawToken.OriginalText();
                            Pair<string, string> lemmaMorph = MorphoFeatureSpecification.SplitMorphString(word, morphStr);
                            string lemma = lemmaMorph.First();
                            string morph = lemmaMorph.Second();
                            // Fall back to the preterminal's value when no usable analysis is present.
                            if (string.IsNullOrEmpty(morph) || morph.Equals("XXX"))
                            {
                                morph = ((CoreLabel)pretermYield[i]).Value();
                            }
                            System.Console.Out.Printf("%s %s %s%n", word, lemma, morph);
                        }
                        System.Console.Out.WriteLine();
                    }
                }
                finally
                {
                    // Close the reader even when reading fails; a failure to close is still
                    // handled by the catch clauses below.
                    tr.Close();
                }
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (FileNotFoundException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }
        /// <summary>
        /// Convert a treebank to factored lexicon events for fast iteration in the
        /// optimizer.
        /// </summary>
        /// <param name="treebank">Trees whose leaves are CoreLabels carrying the morphological analysis as original text.</param>
        /// <param name="lexicon">Lexicon supplying the word, tag and morph indices and the morphological feature specification.</param>
        /// <returns>One event per token whose tag is present in the lexicon's tag index; other tokens are logged and skipped.</returns>
        private static IList <FactoredLexiconEvent> TreebankToLexiconEvents(IList <Tree> treebank, Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon lexicon)
        {
            IList<FactoredLexiconEvent> events = new List<FactoredLexiconEvent>(70000);

            foreach (Tree tree in treebank)
            {
                IList<ILabel> yield = tree.Yield();
                IList<ILabel> preterm = tree.PreTerminalYield();
                System.Diagnostics.Debug.Assert(yield.Count == preterm.Count);
                int yieldLen = yield.Count;
                for (int i = 0; i < yieldLen; ++i)
                {
                    string tag = preterm[i].Value();
                    string word = yield[i].Value();
                    int tagId = lexicon.tagIndex.IndexOf(tag);
                    int wordId = lexicon.wordIndex.IndexOf(word);

                    // Keep the example only when its tag is known to the lexicon.
                    if (tagId < 0)
                    {
                        log.Info("Discarding training example: " + word + " " + tag);
                        continue;
                    }

                    string featureStr = ((CoreLabel)yield[i]).OriginalText();
                    Pair<string, string> lemmaMorph = MorphoFeatureSpecification.SplitMorphString(word, featureStr);
                    string lemma = lemmaMorph.First();
                    string richTag = lemmaMorph.Second();

                    // Trim before the lookup so the key has the same form that the lexicon's
                    // training and scoring code use when they add entries to morphIndex.
                    string reducedTag = lexicon.morphoSpec.StrToFeatures(richTag).ToString().Trim();
                    if (reducedTag.Length == 0)
                    {
                        reducedTag = NoMorphAnalysis;
                    }

                    int lemmaId = lexicon.wordIndex.IndexOf(lemma);
                    int morphId = lexicon.morphIndex.IndexOf(reducedTag);
                    events.Add(new FactoredLexiconEvent(wordId, tagId, lemmaId, morphId, i, word, featureStr));
                }
            }
            return events;
        }
Esempio n. 5
0
        //  private static String stripTag(String tag) {
        //    if (tag.startsWith("DT")) {
        //      String newTag = tag.substring(2, tag.length());
        //      return newTag.length() > 0 ? newTag : tag;
        //    }
        //    return tag;
        //  }
        /// <summary>
        /// Loads a treebank, transforms every non-leaf node with the language's parser
        /// parameters, and prints counts of words, tags, lemmas and morphological tags,
        /// followed by per-word and per-tag breakdowns.
        /// </summary>
        /// <param name="args">
        /// Exactly three values: the language name, the treebank path, and a comma-separated
        /// list of morphological feature types to activate.
        /// </param>
        public static void Main(string[] args)
        {
            if (args.Length != 3)
            {
                System.Console.Error.Printf("Usage: java %s language filename features%n", typeof(TreebankFactoredLexiconStats).FullName);
                System.Environment.Exit(-1);
            }
            Language language = Language.ValueOf(args[0]);
            ITreebankLangParserParams tlpp = language.@params;
            bool isArabic = language.Equals(Language.Arabic);

            // Any language other than Arabic is handled with the French factored settings.
            string[] options = new string[] { isArabic ? "-arabicFactored" : "-frenchFactored" };
            tlpp.SetOptionFlag(options, 0);

            Treebank tb = tlpp.DiskTreebank();
            tb.LoadPath(args[1]);

            MorphoFeatureSpecification morphoSpec;
            if (isArabic)
            {
                morphoSpec = new ArabicMorphoFeatureSpecification();
            }
            else
            {
                morphoSpec = new FrenchMorphoFeatureSpecification();
            }

            string[] features = args[2].Trim().Split(",");
            foreach (string feature in features)
            {
                morphoSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.ValueOf(feature));
            }

            // Counters
            ICounter<string> wordTagCounter = new ClassicCounter<string>(30000);
            ICounter<string> morphTagCounter = new ClassicCounter<string>(500);
            ICounter<string> morphCounter = new ClassicCounter<string>(500);
            ICounter<string> wordCounter = new ClassicCounter<string>(30000);
            ICounter<string> tagCounter = new ClassicCounter<string>(300);
            ICounter<string> lemmaCounter = new ClassicCounter<string>(25000);
            ICounter<string> lemmaTagCounter = new ClassicCounter<string>(25000);
            ICounter<string> richTagCounter = new ClassicCounter<string>(1000);
            ICounter<string> reducedTagCounter = new ClassicCounter<string>(500);
            ICounter<string> reducedTagLemmaCounter = new ClassicCounter<string>(500);
            IDictionary<string, ICollection<string>> wordLemmaMap = Generics.NewHashMap();
            TwoDimensionalIntCounter<string, string> lemmaReducedTagCounter = new TwoDimensionalIntCounter<string, string>(30000);
            TwoDimensionalIntCounter<string, string> reducedTagTagCounter = new TwoDimensionalIntCounter<string, string>(500);
            TwoDimensionalIntCounter<string, string> tagReducedTagCounter = new TwoDimensionalIntCounter<string, string>(300);
            int numTrees = 0;

            foreach (Tree tree in tb)
            {
                foreach (Tree subTree in tree)
                {
                    if (!subTree.IsLeaf())
                    {
                        tlpp.TransformTree(subTree, tree);
                    }
                }
                IList<ILabel> pretermList = tree.PreTerminalYield();
                IList<ILabel> yield = tree.Yield();
                System.Diagnostics.Debug.Assert(yield.Count == pretermList.Count);
                int yieldLen = yield.Count;
                for (int i = 0; i < yieldLen; ++i)
                {
                    string tag = pretermList[i].Value();
                    string word = yield[i].Value();
                    string morph = ((CoreLabel)yield[i]).OriginalText();
                    // Note: if there is no lemma, then we use the surface form.
                    Pair<string, string> lemmaTag = MorphoFeatureSpecification.SplitMorphString(word, morph);
                    string lemma = lemmaTag.First();
                    string richTag = lemmaTag.Second();
                    // WSGDEBUG
                    if (tag.Contains("MW"))
                    {
                        lemma += "-MWE";
                    }
                    lemmaCounter.IncrementCount(lemma);
                    lemmaTagCounter.IncrementCount(lemma + tag);
                    richTagCounter.IncrementCount(richTag);
                    string reducedTag = morphoSpec.StrToFeatures(richTag).ToString();
                    // These two counters see the raw reduced tag, before the "NONE" substitution below.
                    reducedTagCounter.IncrementCount(reducedTag);
                    reducedTagLemmaCounter.IncrementCount(reducedTag + lemma);
                    wordTagCounter.IncrementCount(word + tag);
                    morphTagCounter.IncrementCount(morph + tag);
                    morphCounter.IncrementCount(morph);
                    wordCounter.IncrementCount(word);
                    tagCounter.IncrementCount(tag);
                    reducedTag = reducedTag.Equals(string.Empty) ? "NONE" : reducedTag;

                    // Record the lemma for this word, creating the set on first sight. The
                    // lemma is added in both cases so the first occurrence is not lost.
                    ICollection<string> knownLemmas;
                    if (!wordLemmaMap.TryGetValue(word, out knownLemmas))
                    {
                        ICollection<string> freshLemmas = Generics.NewHashSet(1);
                        wordLemmaMap[word] = freshLemmas;
                        knownLemmas = freshLemmas;
                    }
                    knownLemmas.Add(lemma);

                    lemmaReducedTagCounter.IncrementCount(lemma, reducedTag);
                    reducedTagTagCounter.IncrementCount(lemma + reducedTag, tag);
                    tagReducedTagCounter.IncrementCount(tag, reducedTag);
                }
                ++numTrees;
            }

            // Summary counts
            System.Console.Out.WriteLine("Language: " + language.ToString());
            System.Console.Out.Printf("#trees:\t%d%n", numTrees);
            System.Console.Out.Printf("#tokens:\t%d%n", (int)wordCounter.TotalCount());
            System.Console.Out.Printf("#words:\t%d%n", wordCounter.KeySet().Count);
            System.Console.Out.Printf("#tags:\t%d%n", tagCounter.KeySet().Count);
            System.Console.Out.Printf("#wordTagPairs:\t%d%n", wordTagCounter.KeySet().Count);
            System.Console.Out.Printf("#lemmas:\t%d%n", lemmaCounter.KeySet().Count);
            System.Console.Out.Printf("#lemmaTagPairs:\t%d%n", lemmaTagCounter.KeySet().Count);
            System.Console.Out.Printf("#feattags:\t%d%n", reducedTagCounter.KeySet().Count);
            System.Console.Out.Printf("#feattag+lemmas:\t%d%n", reducedTagLemmaCounter.KeySet().Count);
            System.Console.Out.Printf("#richtags:\t%d%n", richTagCounter.KeySet().Count);
            System.Console.Out.Printf("#richtag+lemma:\t%d%n", morphCounter.KeySet().Count);
            System.Console.Out.Printf("#richtag+lemmaTagPairs:\t%d%n", morphTagCounter.KeySet().Count);

            // Extra: per-word lemma report
            System.Console.Out.WriteLine("==================");
            StringBuilder sbNoLemma = new StringBuilder();
            StringBuilder sbMultLemmas = new StringBuilder();

            foreach (KeyValuePair<string, ICollection<string>> wordLemmas in wordLemmaMap)
            {
                string entryWord = wordLemmas.Key;
                ICollection<string> lemmas = wordLemmas.Value;
                if (lemmas.Count == 0)
                {
                    sbNoLemma.Append("NO LEMMAS FOR WORD: " + entryWord + "\n");
                    continue;
                }
                if (lemmas.Count > 1)
                {
                    sbMultLemmas.Append("MULTIPLE LEMMAS: " + entryWord + " " + SetToString(lemmas) + "\n");
                    continue;
                }

                // Exactly one lemma remains. The enumerator must be advanced before Current
                // is read.
                string soleLemma;
                using (IEnumerator<string> lemmaItr = lemmas.GetEnumerator())
                {
                    lemmaItr.MoveNext();
                    soleLemma = lemmaItr.Current;
                }

                ICollection<string> reducedTags = lemmaReducedTagCounter.GetCounter(soleLemma).KeySet();
                if (reducedTags.Count > 1)
                {
                    System.Console.Out.Printf("%s --> %s%n", entryWord, soleLemma);
                    foreach (string reducedTag in reducedTags)
                    {
                        int count = lemmaReducedTagCounter.GetCount(soleLemma, reducedTag);
                        string posTags = SetToString(reducedTagTagCounter.GetCounter(soleLemma + reducedTag).KeySet());
                        System.Console.Out.Printf("\t%s\t%d\t%s%n", reducedTag, count, posTags);
                    }
                    System.Console.Out.WriteLine();
                }
            }
            System.Console.Out.WriteLine("==================");
            System.Console.Out.WriteLine(sbNoLemma.ToString());
            System.Console.Out.WriteLine(sbMultLemmas.ToString());
            System.Console.Out.WriteLine("==================");

            // Per-tag breakdown of reduced tags, in sorted tag order
            IList<string> tags = new List<string>(tagReducedTagCounter.FirstKeySet());
            tags.Sort();
            foreach (string tag_1 in tags)
            {
                System.Console.Out.WriteLine(tag_1);
                ICollection<string> reducedTags = tagReducedTagCounter.GetCounter(tag_1).KeySet();
                foreach (string reducedTag in reducedTags)
                {
                    int count = tagReducedTagCounter.GetCount(tag_1, reducedTag);
                    System.Console.Out.Printf("\t%s\t%d%n", reducedTag, count);
                }
                System.Console.Out.WriteLine();
            }
            System.Console.Out.WriteLine("==================");
        }
        /// <summary>This method should populate wordIndex, tagIndex, and morphIndex.</summary>
        /// <param name="trees">Trees that supply the tags (and the words, when <paramref name="rawTrees"/> is null).</param>
        /// <param name="rawTrees">
        /// Optional trees, read in the same order as <paramref name="trees"/>, whose leaves
        /// supply the words and morphological analyses. May be null.
        /// </param>
        /// <exception cref="System.InvalidOperationException">
        /// <paramref name="rawTrees"/> is non-null but has fewer elements than <paramref name="trees"/>.
        /// </exception>
        public override void Train(ICollection <Tree> trees, ICollection <Tree> rawTrees)
        {
            double weight = 1.0;

            // Train uw model on words
            uwModelTrainer.Train(trees, weight);
            double numTrees = trees.Count;

            // Train factored lexicon on lemmas and morph tags
            int treeId = 0;
            using (IEnumerator<Tree> rawTreesItr = rawTrees == null ? null : rawTrees.GetEnumerator())
            {
                foreach (Tree tree in trees)
                {
                    // CoreLabels, with morph analysis in the originalText annotation
                    IList<ILabel> yield;
                    if (rawTreesItr == null)
                    {
                        yield = tree.Yield();
                    }
                    else
                    {
                        // Advance the raw-tree iterator in lockstep with the annotated trees;
                        // Current is only valid after a successful MoveNext.
                        if (!rawTreesItr.MoveNext())
                        {
                            throw new System.InvalidOperationException("rawTrees has fewer elements than trees");
                        }
                        yield = rawTreesItr.Current.Yield();
                    }

                    // Annotated, binarized tree for the tags (labels are usually CategoryWordTag)
                    IList<ILabel> pretermYield = tree.PreTerminalYield();
                    int yieldLen = yield.Count;
                    for (int i = 0; i < yieldLen; ++i)
                    {
                        string word = yield[i].Value();
                        int wordId = wordIndex.AddToIndex(word);
                        string tag = pretermYield[i].Value();
                        int tagId = tagIndex.AddToIndex(tag);

                        // Use the word as backup if there is no lemma
                        string featureStr = ((CoreLabel)yield[i]).OriginalText();
                        Pair<string, string> lemmaMorph = MorphoFeatureSpecification.SplitMorphString(word, featureStr);
                        string lemma = lemmaMorph.First();
                        int lemmaId = wordIndex.AddToIndex(lemma);
                        string richMorphTag = lemmaMorph.Second();
                        string reducedMorphTag = morphoSpec.StrToFeatures(richMorphTag).ToString().Trim();
                        if (reducedMorphTag.Length == 0)
                        {
                            reducedMorphTag = NoMorphAnalysis;
                        }
                        int morphId = morphIndex.AddToIndex(reducedMorphTag);

                        // Seen event counts
                        wordTag.IncrementCount(wordId, tagId);
                        lemmaTag.IncrementCount(lemmaId, tagId);
                        morphTag.IncrementCount(morphId, tagId);
                        tagCounter.IncrementCount(tagId);

                        // Unseen event counts: only gathered once past the configured fraction of the trees.
                        if (treeId > op.trainOptions.fractionBeforeUnseenCounting * numTrees)
                        {
                            if (!wordTag.FirstKeySet().Contains(wordId) || wordTag.GetCounter(wordId).TotalCount() < 2)
                            {
                                wordTagUnseen.IncrementCount(tagId);
                            }
                            if (!lemmaTag.FirstKeySet().Contains(lemmaId) || lemmaTag.GetCounter(lemmaId).TotalCount() < 2)
                            {
                                lemmaTagUnseen.IncrementCount(tagId);
                            }
                            if (!morphTag.FirstKeySet().Contains(morphId) || morphTag.GetCounter(morphId).TotalCount() < 2)
                            {
                                morphTagUnseen.IncrementCount(tagId);
                            }
                        }
                    }

                    ++treeId;
                    // Progress output when debugging
                    if (Debug && (treeId % 100) == 0)
                    {
                        System.Console.Error.Printf("[%d]", treeId);
                    }
                    if (Debug && (treeId % 10000) == 0)
                    {
                        log.Info();
                    }
                }
            }
        }
 /// <summary>
 /// Normalizes a whole tree: prunes and splices nodes using the configured filters,
 /// moves morphological analyses from leaf values into the leaf's original-text slot,
 /// copies preterminal values into their tags, wraps bare leaves under phrasal nodes
 /// in a DUMMYTAG preterminal, applies the optional PRD-verb and NP-subject rewrites,
 /// and finally ensures the tree is rooted at the configured root label.
 /// </summary>
 /// <param name="tree">The tree to normalize.</param>
 /// <param name="tf">Factory used for pruning, splicing and creating new nodes.</param>
 /// <returns>
 /// The normalized tree, or null when unwrapping empty-valued enclosing nodes leaves nothing.
 /// </returns>
 public override Tree NormalizeWholeTree(Tree tree, ITreeFactory tf)
 {
     tree = tree.Prune(emptyFilter, tf).SpliceOut(aOverAFilter, tf);
     string className = this.GetType().FullName;

     foreach (Tree t in tree)
     {
         if (t.IsLeaf())
         {
             // Strip off morphological analyses and place them in the OriginalTextAnnotation,
             // which is specified by HasContext.
             string leafValue = t.Value();
             if (!leafValue.Contains(MorphoFeatureSpecification.MorphoMark))
             {
                 continue;
             }
             string[] toks = leafValue.Split(MorphoFeatureSpecification.MorphoMark);
             if (toks.Length != 2)
             {
                 log.Err(string.Format("{0}: Word contains malformed morph annotation: {1}", className, leafValue));
                 continue;
             }
             CoreLabel cl = t.Label() as CoreLabel;
             if (cl == null)
             {
                 log.Error(string.Format("{0}: Cannot store morph analysis in non-CoreLabel: {1}", className, t.Label().GetType().FullName));
                 continue;
             }

             string surface = string.Intern(toks[0].Trim());
             cl.SetValue(surface);
             cl.SetWord(surface);

             Pair<string, string> lemmaMorph = MorphoFeatureSpecification.SplitMorphString(toks[0], toks[1]);
             string lemma = lemmaMorph.First();
             string morphAnalysis = lemmaMorph.Second();
             if (lemma.Equals(toks[0]))
             {
                 // Lemma equals the untrimmed surface token: keep the annotation as written.
                 cl.SetOriginalText(string.Intern(toks[1].Trim()));
             }
             else
             {
                 // TODO(spenceg): Does this help?
                 string newLemma = lexMapper.Map(null, lemma);
                 if (newLemma == null || newLemma.Trim().IsEmpty())
                 {
                     newLemma = lemma;
                 }
                 string newMorphAnalysis = newLemma + MorphoFeatureSpecification.LemmaMark + morphAnalysis;
                 cl.SetOriginalText(string.Intern(newMorphAnalysis));
             }
             continue;
         }

         if (t.IsPreTerminal())
         {
             if (t.Value() == null || t.Value().IsEmpty())
             {
                 log.Warn(string.Format("{0}: missing tag for {1}", className, t.PennString()));
             }
             else
             {
                 IHasTag tagged = t.Label() as IHasTag;
                 if (tagged != null)
                 {
                     tagged.SetTag(t.Value());
                 }
             }
             continue;
         }

         // Phrasal nodes
         // there are some nodes "/" missing preterminals.  We'll splice in a tag for these.
         int nk = t.NumChildren();
         IList<Tree> newKids = new List<Tree>(nk);
         for (int j = 0; j < nk; j++)
         {
             Tree child = t.GetChild(j);
             if (child.IsLeaf())
             {
                 log.Warn(string.Format("{0}: Splicing in DUMMYTAG for {1}", className, t.ToString()));
                 newKids.Add(tf.NewTreeNode("DUMMYTAG", Java.Util.Collections.SingletonList(child)));
             }
             else
             {
                 newKids.Add(child);
             }
         }
         t.SetChildren(newKids);
     }

     // Every node in the tree has now been processed
     //
     // Additional processing for specific phrasal annotations
     //
     // special global coding for moving PRD annotation from constituent to verb tag.
     if (markPRDverb)
     {
         TregexMatcher m = prdVerbPattern.Matcher(tree);
         Tree match = null;
         while (m.Find())
         {
             // Skip repeated hits on the node that was just handled.
             if (m.GetMatch() != match)
             {
                 match = m.GetMatch();
                 match.Label().SetValue(match.Label().Value() + "-PRDverb");
                 Tree prd = m.GetNode("prd");
                 prd.Label().SetValue(base.NormalizeNonterminal(prd.Label().Value()));
             }
         }
     }

     // Mark *only* subjects in verb-initial clauses
     if (retainNPSbj)
     {
         TregexMatcher m = npSbjPattern.Matcher(tree);
         while (m.Find())
         {
             Tree match = m.GetMatch();
             match.Label().SetValue("NP");
         }
     }

     if (tree.IsPreTerminal())
     {
         // The whole tree is a bare tag: bad!
         string val = tree.Label().Value();
         if (val.Equals("CC") || val.StartsWith("PUNC") || val.Equals("CONJ"))
         {
             log.Warn(string.Format("{0}: Bare tagged word being wrapped in FRAG {1}", className, tree.PennString()));
             tree = tf.NewTreeNode("FRAG", Java.Util.Collections.SingletonList(tree));
         }
         else
         {
             log.Warn(string.Format("{0}: Bare tagged word {1}", className, tree.PennString()));
         }
     }

     // Add start symbol so that the root has only one sub-state. Escape any enclosing brackets.
     // If the "tree" consists entirely of enclosing brackets e.g. ((())) then this method
     // will return null. In this case, readers e.g. PennTreeReader will try to read the next tree.
     while (tree != null && (tree.Value() == null || tree.Value().IsEmpty()) && tree.NumChildren() <= 1)
     {
         tree = tree.FirstChild();
     }
     if (tree != null && !tree.Value().Equals(rootLabel))
     {
         tree = tf.NewTreeNode(rootLabel, Java.Util.Collections.SingletonList(tree));
     }
     return tree;
 }