Exemple #1
0
        /// <summary>
        /// Updates the lexicon counts for one tagged word and forwards the word to the
        /// unknown-word model trainer.
        /// </summary>
        /// <param name="tw">The tagged word to train on</param>
        /// <param name="loc">The position of the word; passed through to the unknown-word trainer</param>
        /// <param name="weight">The amount added to every count touched by this word</param>
        public virtual void Train(TaggedWord tw, int loc, double weight)
        {
            uwModelTrainer.Train(tw, loc, weight);

            // Count the (word, tag) pair itself plus its three marginals:
            // (nullWord, tag), (word, nullTag) and (nullWord, nullTag).
            IntTaggedWord iTW = new IntTaggedWord(tw.Word(), tw.Tag(), wordIndex, tagIndex);
            seenCounter.IncrementCount(iTW, weight);

            IntTaggedWord iT = new IntTaggedWord(nullWord, iTW.tag);
            seenCounter.IncrementCount(iT, weight);

            IntTaggedWord iW = new IntTaggedWord(iTW.word, nullTag);
            seenCounter.IncrementCount(iW, weight);

            IntTaggedWord i = new IntTaggedWord(nullWord, nullTag);
            seenCounter.IncrementCount(i, weight);

            // rules.add(iTW);
            tags.Add(iT);
            words.Add(iW);

            // Track how often each full tag occurs under its basic category.
            string tag     = tw.Tag();
            string baseTag = op.Langpack().BasicCategory(tag);

            // A .NET dictionary indexer throws KeyNotFoundException for a missing key
            // (it does not return null as Java's Map.get does), so look the counter up
            // with TryGetValue and create it on first use.
            ICounter <string> counts;
            if (!baseTagCounts.TryGetValue(baseTag, out counts) || counts == null)
            {
                counts = new ClassicCounter <string>();
                baseTagCounts[baseTag] = counts;
            }
            counts.IncrementCount(tag, weight);
        }
        /// <summary>
        /// Updates the signature-based unknown word counts with one tagged word.
        /// </summary>
        /// <param name="tw">The tagged word to train on</param>
        /// <param name="loc">The position of the word; used when computing its signature</param>
        /// <param name="weight">The amount added to every count touched by this word</param>
        public override void Train(TaggedWord tw, int loc, double weight)
        {
            // Feed the GT trainer first when it is enabled.
            if (useGT)
            {
                unknownGTTrainer.Train(tw, weight);
            }

            // Count the word's signature under its tag.
            string token     = tw.Word();
            string signature = model.GetSignature(token, loc);
            ILabel tagLabel  = new Tag(tw.Tag());

            if (!c.Contains(tagLabel))
            {
                c[tagLabel] = new ClassicCounter <string>();
            }
            c[tagLabel].IncrementCount(signature, weight);
            tc.IncrementCount(tagLabel, weight);
            seenEnd.Add(signature);

            // Count the word itself, independent of its tag.
            string        rawTag   = tw.Tag();
            IntTaggedWord wordOnly = new IntTaggedWord(token, IntTaggedWord.Any, wordIndex, tagIndex);
            seenCounter.IncrementCount(wordOnly, weight);

            // Unseen counts are only collected once treesRead (1-based) has passed the
            // configured starting index, and only for words whose seen count is below 2.
            if (treesRead > indexToStartUnkCounting && seenCounter.GetCount(wordOnly) < 2)
            {
                IntTaggedWord tagOnly = new IntTaggedWord(IntTaggedWord.Any, rawTag, wordIndex, tagIndex);
                unSeenCounter.IncrementCount(tagOnly, weight);
                unSeenCounter.IncrementCount(UnknownWordModelTrainerConstants.NullItw, weight);
            }
        }
Exemple #3
0
        /// <summary>Updates the seen and unseen word counts with a single tagged word.</summary>
        /// <param name="tw">The tagged word to train on</param>
        /// <param name="loc">The position of the word; used when computing its signature index</param>
        /// <param name="weight">The amount added to every count touched by this word</param>
        public override void Train(TaggedWord tw, int loc, double weight)
        {
            IntTaggedWord wordAndTag = new IntTaggedWord(tw.Word(), tw.Tag(), wordIndex, tagIndex);
            IntTaggedWord tagOnly    = new IntTaggedWord(UnknownWordModelTrainerConstants.nullWord, wordAndTag.tag);
            IntTaggedWord wordOnly   = new IntTaggedWord(wordAndTag.word, UnknownWordModelTrainerConstants.nullTag);

            seenCounter.IncrementCount(wordOnly, weight);
            IntTaggedWord total = UnknownWordModelTrainerConstants.NullItw;

            // Unseen counts are only collected once treesRead (1-based) has passed the
            // configured starting index, and only for words whose seen count is below 2.
            if (treesRead > indexToStartUnkCounting && seenCounter.GetCount(wordOnly) < 2)
            {
                // Replace the word by its signature and count the signature with and
                // without the tag, alongside the tag-only and overall totals.
                int           signature        = model.GetSignatureIndex(wordAndTag.word, loc, wordIndex.Get(wordAndTag.word));
                IntTaggedWord signatureAndTag  = new IntTaggedWord(signature, wordAndTag.tag);
                IntTaggedWord signatureOnly    = new IntTaggedWord(signature, UnknownWordModelTrainerConstants.nullTag);

                unSeenCounter.IncrementCount(signatureAndTag, weight);
                unSeenCounter.IncrementCount(tagOnly, weight);
                unSeenCounter.IncrementCount(signatureOnly, weight);
                unSeenCounter.IncrementCount(total, weight);
            }
        }
Exemple #4
0
 /// <summary>
 /// Binarizes one local tree, choosing the strategy from the markovFactor and
 /// insideFactor settings.
 /// </summary>
 /// <param name="t">The local tree to binarize; relabelled in place when markovFactor is set</param>
 /// <param name="headNum">The index of the head child of <paramref name="t"/></param>
 /// <param name="head">The head word and tag used in the labels of the created nodes</param>
 /// <returns>The tree produced by the selected binarization routine</returns>
 internal virtual Tree BinarizeLocalTree(Tree t, int headNum, TaggedWord head)
 {
     if (!markovFactor)
     {
         // Plain (non-Markov) binarization, inside or outside.
         if (insideFactor)
         {
             return InsideBinarizeLocalTree(t, headNum, head, 0, 0);
         }
         return OutsideBinarizeLocalTree(t, t.Label().Value(), t.Label().Value(), headNum, head, 0, string.Empty, 0, string.Empty);
     }

     // Markov binarization: first give the root a category/word/tag label.
     string category       = t.Label().Value();
     ILabel lexicalisedTop = new CategoryWordTag(category, head.Word(), head.Tag());
     t.SetLabel(lexicalisedTop);

     if (insideFactor)
     {
         return MarkovInsideBinarizeLocalTreeNew(t, headNum, 0, t.NumChildren() - 1, true);
     }
     return MarkovOutsideBinarizeLocalTree(t, head, headNum, category, new LinkedList <Tree>(), false);
 }
        /// <summary>Trains the first-character based unknown word model.</summary>
        /// <remarks>
        /// When useUnicodeType is set, a first character whose Unicode general category is
        /// not OtherLetter is replaced by the numeric value of that category. The numbers are
        /// those of <see cref="System.Globalization.UnicodeCategory"/>, which differ from the
        /// codes returned by Java's Character.getType, so whatever code scores with this
        /// model must derive the key the same way.
        /// </remarks>
        /// <param name="tw">The word we are currently training on</param>
        /// <param name="loc">The position of that word</param>
        /// <param name="weight">The weight to give this word in terms of training</param>
        public override void Train(TaggedWord tw, int loc, double weight)
        {
            if (useGT)
            {
                unknownGTTrainer.Train(tw, weight);
            }
            string word  = tw.Word();
            string tag   = tw.Tag();
            ILabel tagL  = new Tag(tag);
            string first = Sharpen.Runtime.Substring(word, 0, 1);

            if (useUnicodeType)
            {
                // char.GetType(ch) / char.OtherLetter / int.ToString(int) are Java idioms
                // that do not exist in C#; the general category comes from
                // char.GetUnicodeCategory instead.
                char ch = word[0];
                System.Globalization.UnicodeCategory category = char.GetUnicodeCategory(ch);
                if (category != System.Globalization.UnicodeCategory.OtherLetter)
                {
                    // standard Chinese characters are of type "OTHER_LETTER"!!
                    first = ((int)category).ToString(System.Globalization.CultureInfo.InvariantCulture);
                }
            }

            // Count the first-character key under the word's tag.
            if (!c.Contains(tagL))
            {
                c[tagL] = new ClassicCounter <string>();
            }
            c[tagL].IncrementCount(first, weight);
            tc.IncrementCount(tagL, weight);
            seenFirst.Add(first);

            // Count the word itself, independent of its tag.
            IntTaggedWord iW = new IntTaggedWord(word, IntTaggedWord.Any, wordIndex, tagIndex);
            seenCounter.IncrementCount(iW, weight);

            if (treesRead > indexToStartUnkCounting)
            {
                // start doing this once some way through trees;
                // treesRead is 1 based counting
                if (seenCounter.GetCount(iW) < 2)
                {
                    IntTaggedWord iT = new IntTaggedWord(IntTaggedWord.Any, tag, wordIndex, tagIndex);
                    unSeenCounter.IncrementCount(iT, weight);
                    unSeenCounter.IncrementCount(iTotal, weight);
                }
            }
        }
Exemple #6
0
        /// <summary>
        /// Recursively binarizes the children of <paramref name="t"/> that have not been
        /// consumed yet, peeling one child off the left or right edge per call.
        /// </summary>
        /// <remarks>
        /// Nodes created below the top level are labelled "@ " followed by the labels of
        /// their two children (with the leading two characters of a recursive child's label
        /// stripped); the top-level node keeps the label of <paramref name="t"/>.
        /// </remarks>
        /// <param name="t">The local tree being binarized</param>
        /// <param name="headNum">The index of the head child of <paramref name="t"/></param>
        /// <param name="head">The head word and tag stored in every created label</param>
        /// <param name="leftProcessed">How many children have already been taken from the left edge</param>
        /// <param name="rightProcessed">How many children have already been taken from the right edge</param>
        /// <returns>A new node with at most two children covering the unprocessed span</returns>
        private Tree InsideBinarizeLocalTree(Tree t, int headNum, TaggedWord head, int leftProcessed, int rightProcessed)
        {
            string       headWord   = head.Word();
            string       headTag    = head.Tag();
            int          childCount = t.NumChildren();
            bool         atTop      = leftProcessed == 0 && rightProcessed == 0;
            IList <Tree> kids       = new List <Tree>(2);
            string       nodeLabel;

            if (childCount <= leftProcessed + rightProcessed + 2)
            {
                // Base case: one or two children remain.
                Tree first = t.GetChild(leftProcessed);
                kids.Add(first);
                if (childCount == leftProcessed + rightProcessed + 1)
                {
                    // unary ... so top level
                    nodeLabel = t.Label().Value();
                }
                else
                {
                    Tree second = t.GetChild(leftProcessed + 1);
                    kids.Add(second);
                    nodeLabel = t.Label().Value();
                    if (!atTop)
                    {
                        nodeLabel = ("@ " + first.Label().Value() + ' ' + second.Label().Value());
                    }
                }
            }
            else if (headNum > leftProcessed)
            {
                // The head lies further right: take the leftmost remaining child.
                Tree outer = t.GetChild(leftProcessed);
                Tree inner = InsideBinarizeLocalTree(t, headNum, head, leftProcessed + 1, rightProcessed);
                kids.Add(outer);
                kids.Add(inner);
                nodeLabel = ("@ " + outer.Label().Value() + ' ' + Sharpen.Runtime.Substring(inner.Label().Value(), 2));
                if (atTop)
                {
                    nodeLabel = t.Label().Value();
                }
            }
            else
            {
                // Otherwise take the rightmost remaining child.
                Tree inner = InsideBinarizeLocalTree(t, headNum, head, leftProcessed, rightProcessed + 1);
                Tree outer = t.GetChild(childCount - rightProcessed - 1);
                kids.Add(inner);
                kids.Add(outer);
                nodeLabel = ("@ " + Sharpen.Runtime.Substring(inner.Label().Value(), 2) + ' ' + outer.Label().Value());
                if (atTop)
                {
                    nodeLabel = t.Label().Value();
                }
            }

            return tf.NewTreeNode(new CategoryWordTag(nodeLabel, headWord, headTag), kids);
        }
Exemple #7
0
        /// <summary>
        /// Recursively binarizes the children of <paramref name="t"/> that have not been
        /// consumed yet, labelling each intermediate node with "@" plus the final category
        /// and, unless simpleLabels is set, the labels of the children consumed so far.
        /// </summary>
        /// <param name="t">The local tree being binarized</param>
        /// <param name="labelStr">The category string for the node created by this call</param>
        /// <param name="finalCat">The category used after "@" in intermediate labels</param>
        /// <param name="headNum">The index of the head child of <paramref name="t"/></param>
        /// <param name="head">The head word and tag stored in every created label</param>
        /// <param name="leftProcessed">How many children have already been taken from the left edge</param>
        /// <param name="leftStr">The accumulated labels of the children taken from the left</param>
        /// <param name="rightProcessed">How many children have already been taken from the right edge</param>
        /// <param name="rightStr">The accumulated labels of the children taken from the right</param>
        /// <returns>A new node with at most two children covering the unprocessed span</returns>
        private Tree OutsideBinarizeLocalTree(Tree t, string labelStr, string finalCat, int headNum, TaggedWord head, int leftProcessed, string leftStr, int rightProcessed, string rightStr)
        {
            IList <Tree> kids      = new List <Tree>(2);
            ILabel       nodeLabel = new CategoryWordTag(labelStr, head.Word(), head.Tag());

            // Base case: at most two children are left, so attach them directly.
            int remaining = t.NumChildren() - leftProcessed - rightProcessed;
            if (remaining <= 2)
            {
                kids.Add(t.GetChild(leftProcessed));
                if (remaining == 2)
                {
                    kids.Add(t.GetChild(leftProcessed + 1));
                }
                return tf.NewTreeNode(nodeLabel, kids);
            }

            // Take a child from the left while the head lies further right,
            // otherwise take one from the right, and extend the matching context string.
            bool   takeLeft = headNum > leftProcessed;
            Tree   consumed;
            string leftContext;
            string rightContext;
            if (takeLeft)
            {
                consumed     = t.GetChild(leftProcessed);
                leftContext  = leftStr + ' ' + consumed.Label().Value();
                rightContext = rightStr;
            }
            else
            {
                consumed     = t.GetChild(t.NumChildren() - rightProcessed - 1);
                leftContext  = leftStr;
                rightContext = ' ' + consumed.Label().Value() + rightStr;
            }

            string innerLabel = simpleLabels
                ? '@' + finalCat
                : '@' + finalCat + " :" + leftContext + " ..." + rightContext;

            Tree inner = OutsideBinarizeLocalTree(
                t, innerLabel, finalCat, headNum, head,
                takeLeft ? leftProcessed + 1 : leftProcessed, leftContext,
                takeLeft ? rightProcessed : rightProcessed + 1, rightContext);

            // Keep the children in their original left-to-right order.
            if (takeLeft)
            {
                kids.Add(consumed);
                kids.Add(inner);
            }
            else
            {
                kids.Add(inner);
                kids.Add(consumed);
            }
            return tf.NewTreeNode(nodeLabel, kids);
        }
        /// <summary>
        /// Adds one tagged word to the token total, the (word, tag) and tag counters,
        /// and the set of seen words.
        /// </summary>
        /// <param name="tw">The tagged word to train on</param>
        /// <param name="weight">The amount added to the total and to each counter</param>
        public virtual void Train(TaggedWord tw, double weight)
        {
            tokens += weight;

            string observedWord = tw.Word();
            string observedTag  = tw.Tag();

            // A plain (word, tag) pair is used as the key because TaggedWord has
            // crummy equality conditions.
            Pair <string, string> key = new Pair <string, string>(observedWord, observedTag);
            wtCount.IncrementCount(key, weight);

            tagCount.IncrementCount(observedTag, weight);
            seenWords.Add(observedWord);
        }
Exemple #9
0
        /// <summary>Builds a flat, one-level phrasal tree from a sentence.</summary>
        /// <remarks>
        /// The result has the shape S -&gt; tag*, with every tag node dominating one word leaf.
        /// For a <c>TaggedWord</c> the tag node carries a StringLabel holding its tag; for
        /// any other item the tag node is labelled "WD" through <paramref name="lf"/>.
        /// Word leaves are labelled through <paramref name="lf"/> and the root is a
        /// StringLabel "S".
        /// </remarks>
        /// <param name="s">The Sentence to make the Tree from</param>
        /// <param name="lf">The LabelFactory used for the word labels and the "WD" tag label</param>
        /// <returns>The one phrasal level Tree</returns>
        public static Tree ToFlatTree <_T0>(IList <_T0> s, ILabelFactory lf)
            where _T0 : IHasWord
        {
            IList <Tree> preterminals = new List <Tree>(s.Count);

            foreach (IHasWord item in s)
            {
                Tree       leaf   = new LabeledScoredTreeNode(lf.NewLabel(item.Word()));
                TaggedWord tagged = item as TaggedWord;
                Tree       preterminal;

                if (tagged != null)
                {
                    preterminal = new LabeledScoredTreeNode(new StringLabel(tagged.Tag()), Java.Util.Collections.SingletonList(leaf));
                }
                else
                {
                    preterminal = new LabeledScoredTreeNode(lf.NewLabel("WD"), Java.Util.Collections.SingletonList(leaf));
                }
                preterminals.Add(preterminal);
            }

            return new LabeledScoredTreeNode(new StringLabel("S"), preterminals);
        }
Exemple #10
0
        /// <summary>
        /// Recursively binarizes the local tree <paramref name="t"/> around its head child,
        /// giving each intermediate node it creates a label that starts with "@" followed by
        /// <paramref name="topCat"/>.
        /// </summary>
        /// <remarks>
        /// While <paramref name="headLoc"/> is positive, the first child is split off and the
        /// remaining children are binarized recursively. Once the head is the first child, a
        /// unary node is inserted (unless <paramref name="topCat"/> is a start symbol) and the
        /// last child is then split off on each call until a single child remains.
        /// Unless simpleLabels is set, intermediate labels also contain the head child's label
        /// and the labels of the children held in <paramref name="ll"/>, which is capped at
        /// markovOrder entries.
        /// </remarks>
        /// <param name="t">The local tree (or intermediate subtree) to binarize</param>
        /// <param name="head">The head word and tag stored in every created label</param>
        /// <param name="headLoc">The index of the head child within <paramref name="t"/></param>
        /// <param name="topCat">The category of the original local tree</param>
        /// <param name="ll">The most recently split-off children; mutated by this method</param>
        /// <param name="doneLeft">True once the unary separating the left and right sides has been handled</param>
        /// <returns>The binarized tree, or <paramref name="t"/> itself when headLoc is negative</returns>
        private Tree MarkovOutsideBinarizeLocalTree(Tree t, TaggedWord head, int headLoc, string topCat, LinkedList <Tree> ll, bool doneLeft)
        {
            string       word        = head.Word();
            string       tag         = head.Tag();
            IList <Tree> newChildren = new List <Tree>(2);

            // initial call is made with t, headNum, head, topCat, a fresh list and false
            if (headLoc == 0)
            {
                // The head is the first child: only right siblings remain to be split off.
                if (!doneLeft)
                {
                    // insert a unary to separate the sides
                    if (tlp.IsStartSymbol(topCat))
                    {
                        // No unary is inserted for a start symbol; continue on the same
                        // tree with a fresh list.
                        return(MarkovOutsideBinarizeLocalTree(t, head, headLoc, topCat, new LinkedList <Tree>(), true));
                    }
                    string subLabelStr;
                    if (simpleLabels)
                    {
                        subLabelStr = '@' + topCat;
                    }
                    else
                    {
                        string headStr = t.GetChild(headLoc).Label().Value();
                        subLabelStr = '@' + topCat + ": " + headStr + " ]";
                    }
                    // The unary child takes over all children of t and is binarized with a fresh list.
                    ILabel subLabel = new CategoryWordTag(subLabelStr, word, tag);
                    Tree   subTree  = tf.NewTreeNode(subLabel, t.GetChildrenAsList());
                    newChildren.Add(MarkovOutsideBinarizeLocalTree(subTree, head, headLoc, topCat, new LinkedList <Tree>(), true));
                    return(tf.NewTreeNode(t.Label(), newChildren));
                }
                int len = t.NumChildren();
                // Only the head child is left: wrap it in a node carrying t's label and stop.
                if (len == 1)
                {
                    return(tf.NewTreeNode(t.Label(), Java.Util.Collections.SingletonList(t.GetChild(0))));
                }
                // Record the last child at the front of the list and drop the entry at the
                // back once the list holds more than markovOrder children.
                ll.AddFirst(t.GetChild(len - 1));
                if (ll.Count > markovOrder)
                {
                    ll.RemoveLast();
                }
                // generate a right
                string subLabelStr_1;
                if (simpleLabels)
                {
                    subLabelStr_1 = '@' + topCat;
                }
                else
                {
                    string headStr  = t.GetChild(headLoc).Label().Value();
                    // "... " is prepended when len exceeds markovOrder - 1.
                    string rightStr = (len > markovOrder - 1 ? "... " : string.Empty) + Join(ll);
                    subLabelStr_1 = '@' + topCat + ": " + headStr + ' ' + rightStr;
                }
                // The new left child covers every child except the last (SubList is presumably
                // end-exclusive, as in Java's List.subList); the last child is attached as the
                // right child.
                ILabel subLabel_1 = new CategoryWordTag(subLabelStr_1, word, tag);
                Tree   subTree_1  = tf.NewTreeNode(subLabel_1, t.GetChildrenAsList().SubList(0, len - 1));
                newChildren.Add(MarkovOutsideBinarizeLocalTree(subTree_1, head, headLoc, topCat, ll, true));
                newChildren.Add(t.GetChild(len - 1));
                return(tf.NewTreeNode(t.Label(), newChildren));
            }
            if (headLoc > 0)
            {
                // The head is not the first child: split off the first child. It is recorded at
                // the back of the list, and the entry at the front is dropped once the list
                // holds more than markovOrder children.
                ll.AddLast(t.GetChild(0));
                if (ll.Count > markovOrder)
                {
                    ll.RemoveFirst();
                }
                // generate a left
                string subLabelStr;
                if (simpleLabels)
                {
                    subLabelStr = '@' + topCat;
                }
                else
                {
                    string headStr = t.GetChild(headLoc).Label().Value();
                    // " ..." is appended when headLoc exceeds markovOrder - 1.
                    string leftStr = Join(ll) + (headLoc > markovOrder - 1 ? " ..." : string.Empty);
                    subLabelStr = '@' + topCat + ": " + leftStr + ' ' + headStr + " ]";
                }
                // The new right child covers every child except the first, so the head index
                // shifts down by one in the recursive call.
                ILabel subLabel = new CategoryWordTag(subLabelStr, word, tag);
                Tree   subTree  = tf.NewTreeNode(subLabel, t.GetChildrenAsList().SubList(1, t.NumChildren()));
                newChildren.Add(t.GetChild(0));
                newChildren.Add(MarkovOutsideBinarizeLocalTree(subTree, head, headLoc - 1, topCat, ll, false));
                return(tf.NewTreeNode(t.Label(), newChildren));
            }
            // Negative headLoc: nothing to binarize, return the tree unchanged.
            return(t);
        }
Exemple #11
0
 /// <summary>Copies the word and tag of a <c>TaggedWord</c> into a new <c>WordTag</c>.</summary>
 /// <param name="tw">The tagged word to convert</param>
 /// <returns>A new WordTag built from the word and the tag of <paramref name="tw"/></returns>
 private static WordTag ToWordTag(TaggedWord tw)
 {
     string word = tw.Word();
     string tag  = tw.Tag();

     return new WordTag(word, tag);
 }