/// <summary>Value equality: two Symbols are equal iff their character, type, and unknown-class string all match.</summary>
/// <param name="o">the object to compare against</param>
/// <returns>true if <paramref name="o"/> is a Symbol with identical ch, type, and unkClass</returns>
public override bool Equals(object o)
{
    if (ReferenceEquals(this, o))
    {
        return true;
    }
    if (!(o is ChineseCharacterBasedLexicon.Symbol))
    {
        return false;
    }
    ChineseCharacterBasedLexicon.Symbol other = (ChineseCharacterBasedLexicon.Symbol)o;
    // unkClass may be null on either side; both-null counts as equal.
    bool sameUnkClass = unkClass == null ? other.unkClass == null : unkClass.Equals(other.unkClass);
    return ch == other.ch && type == other.type && sameUnkClass;
}
/// <summary>
/// Scores a word under the character-based generative model: the log probability of
/// generating the word's character sequence (terminated by END_WORD) conditioned on
/// the POS tag, optionally discounted by a word-length penalty.
/// </summary>
/// <param name="iTW">the tagged word; its tag index is resolved via tagIndex</param>
/// <param name="loc">the position of the word in the sentence (unused here)</param>
/// <param name="word">the surface form to score; must not be the boundary symbol</param>
/// <param name="featureSpec">feature specification (unused here)</param>
/// <returns>the (possibly length-penalized) log probability of the word given the tag</returns>
public virtual float Score(IntTaggedWord iTW, int loc, string word, string featureSpec)
{
    string tag = tagIndex.Get(iTW.tag);
    System.Diagnostics.Debug.Assert(!word.Equals(LexiconConstants.Boundary));
    char[] chars = word.ToCharArray();
    // charList is constructed backward:
    // END_WORD char[length-1] char[length-2] ... char[0] BEGIN_WORD BEGIN_WORD
    // This starts off storing Symbols and then starts storing strings. Clean this up someday!
    IList<ISerializable> charList = new List<ISerializable>(chars.Length + ContextLength + 1);
    charList.Add(ChineseCharacterBasedLexicon.Symbol.EndWord);
    for (int i = chars.Length - 1; i >= 0; i--)
    {
        ChineseCharacterBasedLexicon.Symbol ch = ChineseCharacterBasedLexicon.Symbol.CannonicalSymbol(chars[i]);
        if (knownChars.Contains(ch))
        {
            charList.Add(ch);
        }
        else
        {
            // Unseen characters back off to their unknown-character class.
            charList.Add(UnknownCharClass(ch));
        }
    }
    for (int i = 0; i < ContextLength; i++)
    {
        charList.Add(ChineseCharacterBasedLexicon.Symbol.BeginWord);
    }
    double score = 0.0;
    // BUGFIX: the original loop bound referenced an undefined `size`; it is the
    // list length (chars.Length + ContextLength + 1), as in the Java original
    // (`int size = charList.size();`).
    int size = charList.Count;
    for (int i = 0; i < size - ContextLength; i++)
    {
        ChineseCharacterBasedLexicon.Symbol nextChar = (ChineseCharacterBasedLexicon.Symbol)charList[i];
        // Replace the just-scored character with the tag so it becomes part of the
        // conditioning context for subsequent characters.
        charList.Set(i, tag);
        double charScore = GetBackedOffDist(charList.SubList(i, i + ContextLength + 1)).ProbabilityOf(nextChar);
        score += Math.Log(charScore);
    }
    switch (penaltyType)
    {
        case 0:
        {
            // No length penalty.
            break;
        }
        case 1:
        {
            // Quadratic length penalty.
            score -= (chars.Length * (chars.Length + 1)) * (lengthPenalty / 2);
            break;
        }
        case 2:
        {
            // Linear penalty per character beyond the first.
            score -= (chars.Length - 1) * lengthPenalty;
            break;
        }
    }
    return (float)score;
}
/// <summary>Maps a singleton/unseen character symbol to its unknown-character class.</summary>
/// <param name="ch">the character symbol to generalize</param>
/// <returns>a radical-based Symbol when the unknown-character model is enabled; otherwise the generic UNKNOWN symbol</returns>
private ChineseCharacterBasedLexicon.Symbol UnknownCharClass(ChineseCharacterBasedLexicon.Symbol ch)
{
    if (!useUnknownCharacterModel)
    {
        return ChineseCharacterBasedLexicon.Symbol.Unknown;
    }
    // Back off to the character's radical as its unknown class; interned so
    // equal classes share one canonical instance.
    string radical = char.ToString(RadicalMap.GetRadical(ch.GetCh()));
    return new ChineseCharacterBasedLexicon.Symbol(radical).Intern();
}
/// <summary>Samples from the distribution over words with this POS according to the lexicon.</summary>
/// <param name="tag">the POS of the word to sample</param>
/// <returns>a sampled word</returns>
public virtual string SampleFrom(string tag)
{
    StringBuilder buf = new StringBuilder();
    // context must contain [tag prevChar prevPrevChar]
    IList<ISerializable> context = new List<ISerializable>(ContextLength + 1);
    context.Add(tag);
    for (int i = 0; i < ContextLength; i++)
    {
        context.Add(ChineseCharacterBasedLexicon.Symbol.BeginWord);
    }
    Distribution<ChineseCharacterBasedLexicon.Symbol> d = GetBackedOffDist(context);
    ChineseCharacterBasedLexicon.Symbol gen = d.SampleFrom();
    // Generate characters until the model emits END_WORD, or a length penalty
    // stochastically truncates the word via the goto below.
    while (gen != ChineseCharacterBasedLexicon.Symbol.EndWord)
    {
        buf.Append(gen.GetCh());
        switch (penaltyType)
        {
            case 1:
            {
                // Quadratic penalty: continue with probability lengthPenalty^currentLength.
                // NOTE(review): Math.Random() is not a standard .NET API -- presumably a
                // Java-compat (Sharpen) shim returning a double in [0,1); confirm.
                if (Math.Random() > Math.Pow(lengthPenalty, buf.Length))
                {
                    goto genLoop_break;
                }
                break;
            }
            case 2:
            {
                // Linear penalty: continue with probability lengthPenalty.
                if (Math.Random() > lengthPenalty)
                {
                    goto genLoop_break;
                }
                break;
            }
        }
        // Shift the context window by one character: each prev slot moves one
        // position older, then the newly generated char becomes slot 1.
        for (int i_1 = 1; i_1 < ContextLength; i_1++)
        {
            context.Set(i_1 + 1, context[i_1]);
        }
        context.Set(1, gen);
        d = GetBackedOffDist(context);
        gen = d.SampleFrom();
        genLoop_continue :;
    }
    genLoop_break :;
    return (buf.ToString());
}
/// <summary>Prints corpus statistics for the given trees: singleton characters and words, the POS distribution of singleton words, the radical distribution of singleton chars, and the word-length distribution.</summary>
/// <param name="trees">the parse trees to collect statistics over</param>
/// <param name="pw">the writer the report is printed to</param>
public static void PrintStats(ICollection<Tree> trees, PrintWriter pw)
{
    ClassicCounter<int> wordLengthCounter = new ClassicCounter<int>();
    ClassicCounter<TaggedWord> wordCounter = new ClassicCounter<TaggedWord>();
    ClassicCounter<ChineseCharacterBasedLexicon.Symbol> charCounter = new ClassicCounter<ChineseCharacterBasedLexicon.Symbol>();
    int counter = 0;
    foreach (Tree tree in trees)
    {
        counter++;
        IList<TaggedWord> taggedWords = tree.TaggedYield();
        foreach (TaggedWord taggedWord in taggedWords)
        {
            string word = taggedWord.Word();
            if (word.Equals(LexiconConstants.Boundary))
            {
                // Skip the artificial sentence-boundary token.
                continue;
            }
            wordCounter.IncrementCount(taggedWord);
            // BUGFIX: the original called int.Parse(word.Length); int.Parse takes a
            // string, and the length is already an int.
            wordLengthCounter.IncrementCount(word.Length);
            // BUGFIX: the original loop bound referenced an undefined `length`;
            // it is the word's character count.
            for (int j = 0; j < word.Length; j++)
            {
                ChineseCharacterBasedLexicon.Symbol sym = ChineseCharacterBasedLexicon.Symbol.CannonicalSymbol(word[j]);
                charCounter.IncrementCount(sym);
            }
            charCounter.IncrementCount(ChineseCharacterBasedLexicon.Symbol.EndWord);
        }
    }
    // "Below 1.5" selects items with count exactly 1 (singletons).
    ICollection<ChineseCharacterBasedLexicon.Symbol> singletonChars = Counters.KeysBelow(charCounter, 1.5);
    ICollection<TaggedWord> singletonWords = Counters.KeysBelow(wordCounter, 1.5);
    ClassicCounter<string> singletonWordPOSes = new ClassicCounter<string>();
    foreach (TaggedWord taggedWord_1 in singletonWords)
    {
        singletonWordPOSes.IncrementCount(taggedWord_1.Tag());
    }
    Distribution<string> singletonWordPOSDist = Distribution.GetDistribution(singletonWordPOSes);
    ClassicCounter<char> singletonCharRads = new ClassicCounter<char>();
    foreach (ChineseCharacterBasedLexicon.Symbol s in singletonChars)
    {
        // BUGFIX: the original wrapped this in the nonexistent char.ValueOf();
        // the char returned by RadicalMap.GetRadical can be counted directly.
        singletonCharRads.IncrementCount(RadicalMap.GetRadical(s.GetCh()));
    }
    Distribution<char> singletonCharRadDist = Distribution.GetDistribution(singletonCharRads);
    Distribution<int> wordLengthDist = Distribution.GetDistribution(wordLengthCounter);
    NumberFormat percent = new DecimalFormat("##.##%");
    pw.Println("There are " + singletonChars.Count + " singleton chars out of " + (int)charCounter.TotalCount() + " tokens and " + charCounter.Size() + " types found in " + counter + " trees.");
    pw.Println("Thus singletonChars comprise " + percent.Format(singletonChars.Count / charCounter.TotalCount()) + " of tokens and " + percent.Format((double)singletonChars.Count / charCounter.Size()) + " of types.");
    pw.Println();
    pw.Println("There are " + singletonWords.Count + " singleton words out of " + (int)wordCounter.TotalCount() + " tokens and " + wordCounter.Size() + " types.");
    pw.Println("Thus singletonWords comprise " + percent.Format(singletonWords.Count / wordCounter.TotalCount()) + " of tokens and " + percent.Format((double)singletonWords.Count / wordCounter.Size()) + " of types.");
    pw.Println();
    pw.Println("Distribution over singleton word POS:");
    pw.Println(singletonWordPOSDist.ToString());
    pw.Println();
    pw.Println("Distribution over singleton char radicals:");
    pw.Println(singletonCharRadDist.ToString());
    pw.Println();
    pw.Println("Distribution over word length:");
    pw.Println(wordLengthDist);
}
/// <summary>
/// Trains the character n-gram model from the accumulated training sentences:
/// counts character tokens, identifies singleton characters for the unknown-character
/// model, counts POS-conditioned character n-grams up to ContextLength, and builds
/// backed-off Dirichlet-smoothed distributions for every observed context.
/// </summary>
public virtual void FinishTraining()
{
    Timing.Tick("Counting characters...");
    ClassicCounter<ChineseCharacterBasedLexicon.Symbol> charCounter = new ClassicCounter<ChineseCharacterBasedLexicon.Symbol>();
    // First pass: count every character token so we can find chars that occur only once.
    foreach (IList<TaggedWord> labels in trainingSentences)
    {
        foreach (TaggedWord label in labels)
        {
            string word = label.Word();
            if (word.Equals(LexiconConstants.Boundary))
            {
                continue;
            }
            // BUGFIX: the original loop bound referenced an undefined `length`;
            // it is the word's character count.
            for (int j = 0; j < word.Length; j++)
            {
                ChineseCharacterBasedLexicon.Symbol sym = ChineseCharacterBasedLexicon.Symbol.CannonicalSymbol(word[j]);
                charCounter.IncrementCount(sym);
            }
            charCounter.IncrementCount(ChineseCharacterBasedLexicon.Symbol.EndWord);
        }
    }
    // "Below 1.5" selects chars seen exactly once.
    ICollection<ChineseCharacterBasedLexicon.Symbol> singletons = Counters.KeysBelow(charCounter, 1.5);
    knownChars = Generics.NewHashSet(charCounter.KeySet());
    Timing.Tick("Counting nGrams...");
    GeneralizedCounter[] POSspecificCharNGrams = new GeneralizedCounter[ContextLength + 1];
    for (int i = 0; i <= ContextLength; i++)
    {
        POSspecificCharNGrams[i] = new GeneralizedCounter(i + 2);
    }
    ClassicCounter<string> POSCounter = new ClassicCounter<string>();
    IList<ISerializable> context = new List<ISerializable>(ContextLength + 1);
    foreach (IList<TaggedWord> words in trainingSentences)
    {
        foreach (TaggedWord taggedWord in words)
        {
            string word = taggedWord.Word();
            string tag = taggedWord.Tag();
            tagIndex.Add(tag);
            if (word.Equals(LexiconConstants.Boundary))
            {
                continue;
            }
            POSCounter.IncrementCount(tag);
            // One step per character position, plus a final step for END_WORD.
            // BUGFIX: the original bounds referenced an undefined `size`; it is
            // the word's character count.
            for (int i_1 = 0; i_1 <= word.Length; i_1++)
            {
                ChineseCharacterBasedLexicon.Symbol sym;
                ChineseCharacterBasedLexicon.Symbol unknownCharClass = null;
                // context is constructed incrementally: tag prevChar prevPrevChar
                // this could be made faster using .sublist like in score
                context.Clear();
                context.Add(tag);
                if (i_1 < word.Length)
                {
                    char thisCh = word[i_1];
                    sym = ChineseCharacterBasedLexicon.Symbol.CannonicalSymbol(thisCh);
                    if (singletons.Contains(sym))
                    {
                        // Singletons also train the unknown-character class model.
                        unknownCharClass = UnknownCharClass(sym);
                        charCounter.IncrementCount(unknownCharClass);
                    }
                }
                else
                {
                    sym = ChineseCharacterBasedLexicon.Symbol.EndWord;
                }
                // POS-specific 1-gram
                POSspecificCharNGrams[0].IncrementCount(context, sym);
                if (unknownCharClass != null)
                {
                    // for unknown ch model
                    POSspecificCharNGrams[0].IncrementCount(context, unknownCharClass);
                }
                for (int j = 1; j <= ContextLength; j++)
                {
                    // poly grams
                    if (i_1 - j < 0)
                    {
                        // Ran off the front of the word: pad with BEGIN_WORD and stop extending.
                        context.Add(ChineseCharacterBasedLexicon.Symbol.BeginWord);
                        POSspecificCharNGrams[j].IncrementCount(context, sym);
                        if (unknownCharClass != null)
                        {
                            POSspecificCharNGrams[j].IncrementCount(context, unknownCharClass);
                        }
                        break;
                    }
                    else
                    {
                        ChineseCharacterBasedLexicon.Symbol prev = ChineseCharacterBasedLexicon.Symbol.CannonicalSymbol(word[i_1 - j]);
                        if (singletons.Contains(prev))
                        {
                            context.Add(UnknownCharClass(prev));
                        }
                        else
                        {
                            context.Add(prev);
                        }
                        POSspecificCharNGrams[j].IncrementCount(context, sym);
                        if (unknownCharClass != null)
                        {
                            POSspecificCharNGrams[j].IncrementCount(context, unknownCharClass);
                        }
                    }
                }
            }
        }
    }
    POSDistribution = Distribution.GetDistribution(POSCounter);
    Timing.Tick("Creating character prior distribution...");
    charDistributions = Generics.NewHashMap();
    // Reserve probability mass for unseen events: each singleton may also show up
    // as its unknown-character class.
    int numberOfKeys = charCounter.Size() + singletons.Count;
    Distribution<ChineseCharacterBasedLexicon.Symbol> prior = Distribution.GoodTuringSmoothedCounter(charCounter, numberOfKeys);
    charDistributions[Java.Util.Collections.EmptyList] = prior;
    for (int i_2 = 0; i_2 <= ContextLength; i_2++)
    {
        ICollection<KeyValuePair<IList<ISerializable>, ClassicCounter<ChineseCharacterBasedLexicon.Symbol>>> counterEntries = POSspecificCharNGrams[i_2].LowestLevelCounterEntrySet();
        Timing.Tick("Creating " + counterEntries.Count + " character " + (i_2 + 1) + "-gram distributions...");
        foreach (KeyValuePair<IList<ISerializable>, ClassicCounter<ChineseCharacterBasedLexicon.Symbol>> entry in counterEntries)
        {
            context = entry.Key;
            ClassicCounter<ChineseCharacterBasedLexicon.Symbol> c = entry.Value;
            // Back off to the distribution over the shorter context (drop the oldest element).
            Distribution<ChineseCharacterBasedLexicon.Symbol> thisPrior = charDistributions[context.SubList(0, context.Count - 1)];
            double priorWeight = thisPrior.GetNumberOfKeys() / 200.0;
            Distribution<ChineseCharacterBasedLexicon.Symbol> newDist = Distribution.DynamicCounterWithDirichletPrior(c, thisPrior, priorWeight);
            charDistributions[context] = newDist;
        }
    }
}