/// <summary>
/// Records one weighted observation of a tagged word: forwards it to the unknown word
/// model trainer, updates <c>seenCounter</c>, the <c>tags</c> and <c>words</c> collections,
/// and the per-base-tag counter in <c>baseTagCounts</c>.
/// </summary>
/// <param name="tw">The tagged word being observed.</param>
/// <param name="loc">Position value handed unchanged to <c>uwModelTrainer.Train</c>.</param>
/// <param name="weight">Amount added to every counter touched by this observation.</param>
public virtual void Train(TaggedWord tw, int loc, double weight)
{
    uwModelTrainer.Train(tw, loc, weight);

    // Count the indexed (word, tag) pair first, then the three variants in which the
    // word and/or the tag is replaced by nullWord / nullTag.
    IntTaggedWord wordAndTag = new IntTaggedWord(tw.Word(), tw.Tag(), wordIndex, tagIndex);
    seenCounter.IncrementCount(wordAndTag, weight);

    IntTaggedWord tagOnly = new IntTaggedWord(nullWord, wordAndTag.tag);
    seenCounter.IncrementCount(tagOnly, weight);

    IntTaggedWord wordOnly = new IntTaggedWord(wordAndTag.word, nullTag);
    seenCounter.IncrementCount(wordOnly, weight);

    IntTaggedWord neither = new IntTaggedWord(nullWord, nullTag);
    seenCounter.IncrementCount(neither, weight);

    tags.Add(tagOnly);
    words.Add(wordOnly);

    // Tally the full tag under its basic category as reported by the language pack.
    string fullTag = tw.Tag();
    string baseTag = op.Langpack().BasicCategory(fullTag);

    // NOTE(review): this lookup assumes the map yields null for a missing key - confirm
    // against the declared type of baseTagCounts.
    ICounter<string> perBaseTag = baseTagCounts[baseTag];
    if (perBaseTag == null)
    {
        perBaseTag = new ClassicCounter<string>();
        baseTagCounts[baseTag] = perBaseTag;
    }
    perBaseTag.IncrementCount(fullTag, weight);
}
/// <summary>
/// Updates the signature-based unknown word statistics with one weighted tagged word.
/// </summary>
/// <param name="tw">The tagged word being observed.</param>
/// <param name="loc">Position value passed to <c>model.GetSignature</c>.</param>
/// <param name="weight">Amount added to every counter touched by this observation.</param>
public override void Train(TaggedWord tw, int loc, double weight)
{
    if (useGT)
    {
        unknownGTTrainer.Train(tw, weight);
    }

    string word = tw.Word();
    string signature = model.GetSignature(word, loc);
    ILabel tagLabel = new Tag(tw.Tag());

    // Per-tag signature counts; the counter for a tag is created the first time it is seen.
    if (!c.Contains(tagLabel))
    {
        c[tagLabel] = new ClassicCounter<string>();
    }
    c[tagLabel].IncrementCount(signature, weight);
    tc.IncrementCount(tagLabel, weight);
    seenEnd.Add(signature);

    string tagName = tw.Tag();
    IntTaggedWord wordAnyTag = new IntTaggedWord(word, IntTaggedWord.Any, wordIndex, tagIndex);
    seenCounter.IncrementCount(wordAnyTag, weight);

    // Only start collecting "unseen" statistics some way through the trees
    // (treesRead uses 1-based counting), and only while this word's count is below 2.
    if (treesRead > indexToStartUnkCounting && seenCounter.GetCount(wordAnyTag) < 2)
    {
        IntTaggedWord anyWordTag = new IntTaggedWord(IntTaggedWord.Any, tagName, wordIndex, tagIndex);
        unSeenCounter.IncrementCount(anyWordTag, weight);
        unSeenCounter.IncrementCount(UnknownWordModelTrainerConstants.NullItw, weight);
    }
}
/// <summary>
/// Updates the seen-word count for one weighted tagged word and, once enough trees have
/// been read and the word's count is still below 2, the signature-based unseen counts.
/// </summary>
/// <param name="tw">The tagged word being observed.</param>
/// <param name="loc">Position value passed to <c>model.GetSignatureIndex</c>.</param>
/// <param name="weight">Amount added to every counter touched by this observation.</param>
public override void Train(TaggedWord tw, int loc, double weight)
{
    IntTaggedWord wordAndTag = new IntTaggedWord(tw.Word(), tw.Tag(), wordIndex, tagIndex);
    IntTaggedWord tagOnly = new IntTaggedWord(UnknownWordModelTrainerConstants.nullWord, wordAndTag.tag);
    IntTaggedWord wordOnly = new IntTaggedWord(wordAndTag.word, UnknownWordModelTrainerConstants.nullTag);
    seenCounter.IncrementCount(wordOnly, weight);
    IntTaggedWord total = UnknownWordModelTrainerConstants.NullItw;

    // Start doing this only some way through the trees (treesRead uses 1-based counting),
    // and only for a word whose count is still below 2, which the code treats as unknown.
    if (treesRead > indexToStartUnkCounting && seenCounter.GetCount(wordOnly) < 2)
    {
        int signature = model.GetSignatureIndex(wordAndTag.word, loc, wordIndex.Get(wordAndTag.word));
        IntTaggedWord signatureAndTag = new IntTaggedWord(signature, wordAndTag.tag);
        IntTaggedWord signatureOnly = new IntTaggedWord(signature, UnknownWordModelTrainerConstants.nullTag);

        unSeenCounter.IncrementCount(signatureAndTag, weight);
        unSeenCounter.IncrementCount(tagOnly, weight);
        unSeenCounter.IncrementCount(signatureOnly, weight);
        unSeenCounter.IncrementCount(total, weight);
    }
}
/// <summary>
/// Binarizes one local tree by dispatching to the inside/outside (and Markov or
/// non-Markov) helper selected by the <c>markovFactor</c> and <c>insideFactor</c> settings.
/// </summary>
/// <remarks>
/// When <c>markovFactor</c> is set, the label of <paramref name="t"/> is replaced in place
/// by a <c>CategoryWordTag</c> built from its category and the head's word and tag before
/// the helper is called.
/// </remarks>
/// <param name="t">The local tree to binarize.</param>
/// <param name="headNum">Index of the head child within <paramref name="t"/>.</param>
/// <param name="head">Word and tag of the head.</param>
/// <returns>The tree returned by the selected helper.</returns>
internal virtual Tree BinarizeLocalTree(Tree t, int headNum, TaggedWord head)
{
    if (!markovFactor)
    {
        if (insideFactor)
        {
            return InsideBinarizeLocalTree(t, headNum, head, 0, 0);
        }
        return OutsideBinarizeLocalTree(t, t.Label().Value(), t.Label().Value(), headNum, head, 0, string.Empty, 0, string.Empty);
    }

    // Markov variants: relabel the root with head information first.
    string category = t.Label().Value();
    ILabel lexicalizedLabel = new CategoryWordTag(category, head.Word(), head.Tag());
    t.SetLabel(lexicalizedLabel);

    if (insideFactor)
    {
        return MarkovInsideBinarizeLocalTreeNew(t, headNum, 0, t.NumChildren() - 1, true);
    }
    return MarkovOutsideBinarizeLocalTree(t, head, headNum, category, new LinkedList<Tree>(), false);
}
/// <summary>Updates the first-character based unknown word model with one tagged word.</summary>
/// <param name="tw">The word we are currently training on</param>
/// <param name="loc">The position of that word (not read by this method)</param>
/// <param name="weight">The weight to give this word in terms of training</param>
public override void Train(TaggedWord tw, int loc, double weight)
{
    if (useGT)
    {
        unknownGTTrainer.Train(tw, weight);
    }

    string word = tw.Word();
    ILabel tagLabel = new Tag(tw.Tag());

    // The feature is the leading one-character substring of the word, unless the
    // Unicode type option replaces it with the character's type code.
    string firstFeature = Sharpen.Runtime.Substring(word, 0, 1);
    if (useUnicodeType)
    {
        char leading = word[0];
        int type = char.GetType(leading);
        // standard Chinese characters are of type "OTHER_LETTER", so they keep the
        // character itself as the feature
        if (type != char.OtherLetter)
        {
            firstFeature = int.ToString(type);
        }
    }

    string tagName = tw.Tag();

    // Per-tag feature counts; the counter for a tag is created the first time it is seen.
    if (!c.Contains(tagLabel))
    {
        c[tagLabel] = new ClassicCounter<string>();
    }
    c[tagLabel].IncrementCount(firstFeature, weight);
    tc.IncrementCount(tagLabel, weight);
    seenFirst.Add(firstFeature);

    IntTaggedWord wordAnyTag = new IntTaggedWord(word, IntTaggedWord.Any, wordIndex, tagIndex);
    seenCounter.IncrementCount(wordAnyTag, weight);

    // Only start collecting "unseen" statistics some way through the trees
    // (treesRead uses 1-based counting), and only while this word's count is below 2.
    if (treesRead > indexToStartUnkCounting && seenCounter.GetCount(wordAnyTag) < 2)
    {
        IntTaggedWord anyWordTag = new IntTaggedWord(IntTaggedWord.Any, tagName, wordIndex, tagIndex);
        unSeenCounter.IncrementCount(anyWordTag, weight);
        unSeenCounter.IncrementCount(iTotal, weight);
    }
}
/// <summary>
/// Recursively binarizes the children of <paramref name="t"/> that have not yet been
/// consumed, peeling one child off the left or the right per call.
/// </summary>
/// <remarks>
/// Intermediate nodes are labelled <c>"@ " + left + ' ' + right</c>; the node produced by
/// the outermost call (both processed counts zero) keeps the label of <paramref name="t"/>.
/// Every created node carries the head's word and tag in a <c>CategoryWordTag</c>.
/// </remarks>
/// <param name="t">The local tree being binarized.</param>
/// <param name="headNum">Index of the head child; children to its left are consumed first.</param>
/// <param name="head">Word and tag of the head.</param>
/// <param name="leftProcessed">Number of children already consumed from the left.</param>
/// <param name="rightProcessed">Number of children already consumed from the right.</param>
/// <returns>A new node with at most two children covering the unconsumed children.</returns>
private Tree InsideBinarizeLocalTree(Tree t, int headNum, TaggedWord head, int leftProcessed, int rightProcessed)
{
    string headWord = head.Word();
    string headTag = head.Tag();
    IList<Tree> kids = new List<Tree>(2);
    int consumed = leftProcessed + rightProcessed;
    bool atTop = leftProcessed == 0 && rightProcessed == 0;

    // Base case: at most two children remain.
    if (t.NumChildren() <= consumed + 2)
    {
        Tree first = t.GetChild(leftProcessed);
        kids.Add(first);
        if (t.NumChildren() == consumed + 1)
        {
            // unary ... so top level
            return tf.NewTreeNode(new CategoryWordTag(t.Label().Value(), headWord, headTag), kids);
        }

        // binary
        Tree second = t.GetChild(leftProcessed + 1);
        kids.Add(second);
        string pairLabel = atTop
            ? t.Label().Value()
            : ("@ " + first.Label().Value() + ' ' + second.Label().Value());
        return tf.NewTreeNode(new CategoryWordTag(pairLabel, headWord, headTag), kids);
    }

    Tree left;
    Tree right;
    string label;
    if (headNum > leftProcessed)
    {
        // eat left word
        left = t.GetChild(leftProcessed);
        right = InsideBinarizeLocalTree(t, headNum, head, leftProcessed + 1, rightProcessed);
        // The nested result's label starts with "@ "; presumably Substring(.., 2) drops that marker.
        label = ("@ " + left.Label().Value() + ' ' + Sharpen.Runtime.Substring(right.Label().Value(), 2));
    }
    else
    {
        // eat right word
        left = InsideBinarizeLocalTree(t, headNum, head, leftProcessed, rightProcessed + 1);
        right = t.GetChild(t.NumChildren() - rightProcessed - 1);
        label = ("@ " + Sharpen.Runtime.Substring(left.Label().Value(), 2) + ' ' + right.Label().Value());
    }

    kids.Add(left);
    kids.Add(right);
    if (atTop)
    {
        // The outermost node keeps the original category.
        label = t.Label().Value();
    }
    return tf.NewTreeNode(new CategoryWordTag(label, headWord, headTag), kids);
}
/// <summary>
/// Recursively binarizes the children of <paramref name="t"/> that have not yet been
/// consumed, labelling each intermediate node with <c>'@' + finalCat</c> and, unless
/// <c>simpleLabels</c> is set, the labels of the children consumed so far on each side.
/// </summary>
/// <param name="t">The local tree being binarized.</param>
/// <param name="labelStr">Category string for the node created by this call.</param>
/// <param name="finalCat">Category used as the base of every intermediate label.</param>
/// <param name="headNum">Index of the head child; children to its left are consumed first.</param>
/// <param name="head">Word and tag of the head, stored in every created label.</param>
/// <param name="leftProcessed">Number of children already consumed from the left.</param>
/// <param name="leftStr">Accumulated labels of the children consumed from the left.</param>
/// <param name="rightProcessed">Number of children already consumed from the right.</param>
/// <param name="rightStr">Accumulated labels of the children consumed from the right.</param>
/// <returns>A new node with at most two children covering the unconsumed children.</returns>
private Tree OutsideBinarizeLocalTree(Tree t, string labelStr, string finalCat, int headNum, TaggedWord head, int leftProcessed, string leftStr, int rightProcessed, string rightStr)
{
    IList<Tree> kids = new List<Tree>(2);
    ILabel nodeLabel = new CategoryWordTag(labelStr, head.Word(), head.Tag());

    // check if there are <=2 children left; if so, attach them directly and stop
    int remaining = t.NumChildren() - leftProcessed - rightProcessed;
    if (remaining <= 2)
    {
        kids.Add(t.GetChild(leftProcessed));
        if (remaining == 2)
        {
            kids.Add(t.GetChild(leftProcessed + 1));
        }
        return tf.NewTreeNode(nodeLabel, kids);
    }

    if (headNum > leftProcessed)
    {
        // eat a left word
        Tree peeled = t.GetChild(leftProcessed);
        string grownLeft = leftStr + ' ' + peeled.Label().Value();
        string innerLabel = simpleLabels
            ? '@' + finalCat
            : '@' + finalCat + " :" + grownLeft + " ..." + rightStr;
        Tree rest = OutsideBinarizeLocalTree(t, innerLabel, finalCat, headNum, head, leftProcessed + 1, grownLeft, rightProcessed, rightStr);
        kids.Add(peeled);
        kids.Add(rest);
    }
    else
    {
        // eat a right word
        Tree peeled = t.GetChild(t.NumChildren() - rightProcessed - 1);
        string grownRight = ' ' + peeled.Label().Value() + rightStr;
        string innerLabel = simpleLabels
            ? '@' + finalCat
            : '@' + finalCat + " :" + leftStr + " ..." + grownRight;
        Tree rest = OutsideBinarizeLocalTree(t, innerLabel, finalCat, headNum, head, leftProcessed, leftStr, rightProcessed + 1, grownRight);
        kids.Add(rest);
        kids.Add(peeled);
    }
    return tf.NewTreeNode(nodeLabel, kids);
}
/// <summary>
/// Adds one weighted tagged word to the running totals: the token mass, the
/// (word, tag) pair count, the tag count and the set of seen words.
/// </summary>
/// <param name="tw">The tagged word being observed.</param>
/// <param name="weight">Amount added to each total.</param>
public virtual void Train(TaggedWord tw, double weight)
{
    tokens += weight;

    string word = tw.Word();
    string tag = tw.Tag();

    // Key the pair count on a (word, tag) Pair rather than on the TaggedWord itself;
    // the original author notes that TaggedWord's equality is unsuitable for this.
    Pair<string, string> wordTagKey = new Pair<string, string>(word, tag);
    wtCount.IncrementCount(wordTagKey, weight);

    tagCount.IncrementCount(tag, weight);
    seenWords.Add(word);
}
/// <summary>Turns a sentence into a flat phrasal tree.</summary>
/// <remarks>
/// The structure is S -> tag*, and each tag node dominates a single word leaf.
/// The root carries a StringLabel "S". A word that is a TaggedWord gets a StringLabel
/// holding its tag; any other word gets a "WD" label created through
/// <paramref name="lf"/>. Word leaves are labelled through <paramref name="lf"/> as well.
/// </remarks>
/// <param name="s">The Sentence to make the Tree from</param>
/// <param name="lf">The LabelFactory with which to create the new Tree labels</param>
/// <returns>The one phrasal level Tree</returns>
public static Tree ToFlatTree<_T0>(IList<_T0> s, ILabelFactory lf)
    where _T0 : IHasWord
{
    IList<Tree> preterminals = new List<Tree>(s.Count);
    foreach (IHasWord item in s)
    {
        Tree leaf = new LabeledScoredTreeNode(lf.NewLabel(item.Word()));
        Tree preterminal;
        if (item is TaggedWord)
        {
            string tag = ((TaggedWord)item).Tag();
            preterminal = new LabeledScoredTreeNode(new StringLabel(tag), Java.Util.Collections.SingletonList(leaf));
        }
        else
        {
            preterminal = new LabeledScoredTreeNode(lf.NewLabel("WD"), Java.Util.Collections.SingletonList(leaf));
        }
        preterminals.Add(preterminal);
    }
    return new LabeledScoredTreeNode(new StringLabel("S"), preterminals);
}
/// <summary>
/// Recursively binarizes a local tree, first peeling children off the left until the
/// head is the leftmost child, then peeling children off the right.
/// </summary>
/// <remarks>
/// Intermediate nodes are labelled <c>'@' + topCat</c>; unless <c>simpleLabels</c> is set
/// the label also contains the head child's label and the labels of the siblings held in
/// <paramref name="ll"/>. <paramref name="ll"/> is mutated in place and trimmed to at most
/// <c>markovOrder</c> entries. Every created label carries the head's word and tag.
/// </remarks>
/// <param name="t">The (sub)tree whose children are being binarized.</param>
/// <param name="head">Word and tag of the head.</param>
/// <param name="headLoc">Index of the head child within <paramref name="t"/>.</param>
/// <param name="topCat">Category of the original node, used as the base of intermediate labels.</param>
/// <param name="ll">Recently peeled sibling trees used to build the label history.</param>
/// <param name="doneLeft">
/// False until the left side has been fully consumed; the first call with
/// <paramref name="headLoc"/> equal to 0 and this flag false switches to the right side.
/// </param>
/// <returns>The binarized tree, or <paramref name="t"/> itself when <paramref name="headLoc"/> is negative.</returns>
private Tree MarkovOutsideBinarizeLocalTree(Tree t, TaggedWord head, int headLoc, string topCat, LinkedList<Tree> ll, bool doneLeft)
{
    string word = head.Word();
    string tag = head.Tag();
    IList<Tree> newChildren = new List<Tree>(2);

    // call with t, headNum, head, topCat, false
    if (headLoc == 0)
    {
        if (!doneLeft)
        {
            // For a start symbol no separating unary is inserted; just restart on the
            // right side with a fresh history.
            if (tlp.IsStartSymbol(topCat))
            {
                return MarkovOutsideBinarizeLocalTree(t, head, headLoc, topCat, new LinkedList<Tree>(), true);
            }

            // insert a unary to separate the sides
            string unaryLabelStr;
            if (simpleLabels)
            {
                unaryLabelStr = '@' + topCat;
            }
            else
            {
                string headStr = t.GetChild(headLoc).Label().Value();
                unaryLabelStr = '@' + topCat + ": " + headStr + " ]";
            }
            ILabel unaryLabel = new CategoryWordTag(unaryLabelStr, word, tag);
            Tree unaryTree = tf.NewTreeNode(unaryLabel, t.GetChildrenAsList());
            newChildren.Add(MarkovOutsideBinarizeLocalTree(unaryTree, head, headLoc, topCat, new LinkedList<Tree>(), true));
            return tf.NewTreeNode(t.Label(), newChildren);
        }

        int len = t.NumChildren();
        // len = 1: only the head is left, nothing more to peel
        if (len == 1)
        {
            return tf.NewTreeNode(t.Label(), Java.Util.Collections.SingletonList(t.GetChild(0)));
        }

        // Push the rightmost child onto the front of the history and trim it to markovOrder.
        ll.AddFirst(t.GetChild(len - 1));
        if (ll.Count > markovOrder)
        {
            ll.RemoveLast();
        }

        // generate a right
        string rightLabelStr;
        if (simpleLabels)
        {
            rightLabelStr = '@' + topCat;
        }
        else
        {
            string headStr = t.GetChild(headLoc).Label().Value();
            // The ellipsis literal had been split by a raw line break, which a regular C#
            // string literal cannot contain; restored to mirror " ..." on the left side.
            string rightStr = (len > markovOrder - 1 ? "... " : string.Empty) + Join(ll);
            rightLabelStr = '@' + topCat + ": " + headStr + ' ' + rightStr;
        }
        ILabel rightLabel = new CategoryWordTag(rightLabelStr, word, tag);
        Tree rightRest = tf.NewTreeNode(rightLabel, t.GetChildrenAsList().SubList(0, len - 1));
        newChildren.Add(MarkovOutsideBinarizeLocalTree(rightRest, head, headLoc, topCat, ll, true));
        newChildren.Add(t.GetChild(len - 1));
        return tf.NewTreeNode(t.Label(), newChildren);
    }

    if (headLoc > 0)
    {
        // Append the leftmost child to the history and trim it to markovOrder.
        ll.AddLast(t.GetChild(0));
        if (ll.Count > markovOrder)
        {
            ll.RemoveFirst();
        }

        // generate a left
        string leftLabelStr;
        if (simpleLabels)
        {
            leftLabelStr = '@' + topCat;
        }
        else
        {
            string headStr = t.GetChild(headLoc).Label().Value();
            string leftStr = Join(ll) + (headLoc > markovOrder - 1 ? " ..." : string.Empty);
            leftLabelStr = '@' + topCat + ": " + leftStr + ' ' + headStr + " ]";
        }
        ILabel leftLabel = new CategoryWordTag(leftLabelStr, word, tag);
        Tree leftRest = tf.NewTreeNode(leftLabel, t.GetChildrenAsList().SubList(1, t.NumChildren()));
        newChildren.Add(t.GetChild(0));
        newChildren.Add(MarkovOutsideBinarizeLocalTree(leftRest, head, headLoc - 1, topCat, ll, false));
        return tf.NewTreeNode(t.Label(), newChildren);
    }

    // headLoc < 0: nothing to binarize, hand the tree back unchanged.
    return t;
}
/// <summary>Builds a <c>WordTag</c> from the word and tag of a <c>TaggedWord</c>.</summary>
/// <param name="tw">The tagged word to convert.</param>
/// <returns>A new <c>WordTag</c> holding the same word and tag strings.</returns>
private static WordTag ToWordTag(TaggedWord tw)
{
    string word = tw.Word();
    string tag = tw.Tag();
    return new WordTag(word, tag);
}