/// <summary>
/// Updates the unknown word counts for one tagged training word, keyed by the
/// signature that <c>model.GetSignature</c> produces for the word.
/// </summary>
/// <param name="tw">The tagged word being trained on</param>
/// <param name="loc">The position of the word, passed through to the signature function</param>
/// <param name="weight">The amount added to every counter touched for this word</param>
public override void Train(TaggedWord tw, int loc, double weight)
{
    // Optionally forward the same observation to the secondary trainer.
    if (useGT)
    {
        unknownGTTrainer.Train(tw, weight);
    }

    // scan data
    string word = tw.Word();
    string tagName = tw.Tag();
    string signature = model.GetSignature(word, loc);
    ILabel tagLabel = new Tag(tagName);

    // Per-tag signature counts; the inner counter is created on first use of a tag.
    if (!c.Contains(tagLabel))
    {
        c[tagLabel] = new ClassicCounter<string>();
    }
    c[tagLabel].IncrementCount(signature, weight);
    tc.IncrementCount(tagLabel, weight);
    seenEnd.Add(signature);

    // Count this word under a wildcard tag so repeat sightings can be detected below.
    IntTaggedWord wordKey = new IntTaggedWord(word, IntTaggedWord.Any, wordIndex, tagIndex);
    seenCounter.IncrementCount(wordKey, weight);

    // Start doing this once some way through trees; treesRead is 1 based counting.
    // Only words whose accumulated count is still below 2 contribute to the unseen counts.
    if (treesRead > indexToStartUnkCounting && seenCounter.GetCount(wordKey) < 2)
    {
        IntTaggedWord tagKey = new IntTaggedWord(IntTaggedWord.Any, tagName, wordIndex, tagIndex);
        unSeenCounter.IncrementCount(tagKey, weight);
        unSeenCounter.IncrementCount(UnknownWordModelTrainerConstants.NullItw, weight);
    }
}
/// <summary>Trains the first-character based unknown word model.</summary>
/// <remarks>
/// The word is indexed at position 0 without a length check, so an empty word
/// is not supported here.
/// </remarks>
/// <param name="tw">The word we are currently training on</param>
/// <param name="loc">The position of that word</param>
/// <param name="weight">The weight to give this word in terms of training</param>
public override void Train(TaggedWord tw, int loc, double weight)
{
    // Optionally forward the same observation to the secondary trainer.
    if (useGT)
    {
        unknownGTTrainer.Train(tw, weight);
    }

    string word = tw.Word();
    string tag = tw.Tag();
    ILabel tagL = new Tag(tag);

    // Default key is the first character itself.
    string first = Sharpen.Runtime.Substring(word, 0, 1);
    if (useUnicodeType)
    {
        // char.GetUnicodeCategory is the .NET counterpart of Java's Character.getType;
        // the Java-style calls left by the converter do not exist on System.Char/Int32.
        System.Globalization.UnicodeCategory category = char.GetUnicodeCategory(word[0]);
        if (category != System.Globalization.UnicodeCategory.OtherLetter)
        {
            // standard Chinese characters are of type "OTHER_LETTER"!!
            // Anything else is collapsed to the numeric code of its category.
            // NOTE(review): .NET category numbers differ from Java's Character.getType
            // constants; confirm the scoring side derives its key the same way.
            first = ((int)category).ToString(System.Globalization.CultureInfo.InvariantCulture);
        }
    }

    // Per-tag first-character counts; the inner counter is created on first use of a tag.
    if (!c.Contains(tagL))
    {
        c[tagL] = new ClassicCounter<string>();
    }
    c[tagL].IncrementCount(first, weight);
    tc.IncrementCount(tagL, weight);
    seenFirst.Add(first);

    // Count this word under a wildcard tag so repeat sightings can be detected below.
    IntTaggedWord iW = new IntTaggedWord(word, IntTaggedWord.Any, wordIndex, tagIndex);
    seenCounter.IncrementCount(iW, weight);

    // Start doing this once some way through trees; treesRead is 1 based counting.
    // Only words whose accumulated count is still below 2 contribute to the unseen counts.
    if (treesRead > indexToStartUnkCounting && seenCounter.GetCount(iW) < 2)
    {
        IntTaggedWord iT = new IntTaggedWord(IntTaggedWord.Any, tag, wordIndex, tagIndex);
        unSeenCounter.IncrementCount(iT, weight);
        unSeenCounter.IncrementCount(iTotal, weight);
    }
}