/// <summary>
/// Builds a counter over every possible label sequence of the given window size,
/// weighting each sequence by its probability at <paramref name="position"/>.
/// </summary>
/// <param name="position">Position passed through to <c>Prob</c> for scoring.</param>
/// <param name="window">Number of label slots enumerated jointly.</param>
/// <returns>A counter mapping each label-sequence (as a list) to its probability.</returns>
public virtual GeneralizedCounter<E> Probs(int position, int window)
{
    GeneralizedCounter<E> gc = new GeneralizedCounter<E>(window);
    // Odometer over all assignments: each cell ranges over [0, numClasses).
    // No explicit zero-fill needed -- new int[] elements default to 0
    // (same guarantee the original Java comment cited from JLS 4.12.5).
    int[] labels = new int[window];
    bool exhausted = false;
    while (!exhausted)
    {
        IList<E> labelsList = IntArrayToListE(labels);
        gc.IncrementCount(labelsList, Prob(position, labels));
        // Advance the odometer; if every digit rolls over, enumeration is complete.
        exhausted = true;
        for (int i = 0; i < labels.Length; i++)
        {
            labels[i]++;
            if (labels[i] < numClasses)
            {
                exhausted = false;
                break;
            }
            labels[i] = 0;
        }
    }
    return gc;
}
/// <summary>
/// Prepares this lexicon for a training run. Only used at training time.
/// </summary>
/// <param name="numTrees">Expected number of training trees, forwarded to the wrapped lexicon.</param>
public virtual void InitializeTraining(double numTrees)
{
    // Let the underlying lexicon size itself for the corpus first.
    lex.InitializeTraining(numTrees);
    // Fresh counters for this run: initial-state counts and binary rule counts.
    initial = new ClassicCounter<string>();
    ruleCounter = new GeneralizedCounter(2);
}
/// <summary>
/// Finalizes training: counts characters over the training sentences, identifies
/// singleton characters (mapped to unknown-character classes), accumulates
/// POS-conditioned character n-gram counts, and converts all counts into smoothed
/// distributions (<c>POSDistribution</c>, <c>charDistributions</c>).
/// </summary>
public virtual void FinishTraining()
{
    Timing.Tick("Counting characters...");
    ClassicCounter<ChineseCharacterBasedLexicon.Symbol> charCounter = new ClassicCounter<ChineseCharacterBasedLexicon.Symbol>();
    // first find all chars that occur only once
    foreach (IList<TaggedWord> labels in trainingSentences)
    {
        foreach (TaggedWord label in labels)
        {
            string word = label.Word();
            if (word.Equals(LexiconConstants.Boundary))
            {
                // Sentence-boundary token carries no character content.
                continue;
            }
            // NOTE(review): `length` is not declared in this method -- presumably this
            // should iterate over word.Length (the original Java declared
            // `int length = word.length();` here). Verify against the full file.
            for (int j = 0; j < length; j++)
            {
                ChineseCharacterBasedLexicon.Symbol sym = ChineseCharacterBasedLexicon.Symbol.CannonicalSymbol(word[j]);
                charCounter.IncrementCount(sym);
            }
            // Every word also contributes one end-of-word marker.
            charCounter.IncrementCount(ChineseCharacterBasedLexicon.Symbol.EndWord);
        }
    }
    // Characters with count 1 (i.e., below 1.5) are treated as singletons.
    ICollection<ChineseCharacterBasedLexicon.Symbol> singletons = Counters.KeysBelow(charCounter, 1.5);
    knownChars = Generics.NewHashSet(charCounter.KeySet());
    Timing.Tick("Counting nGrams...");
    // POSspecificCharNGrams[i] holds (i+1)-gram counts keyed by [tag, prevChar, ...] -> char.
    GeneralizedCounter[] POSspecificCharNGrams = new GeneralizedCounter[ContextLength + 1];
    for (int i = 0; i <= ContextLength; i++)
    {
        POSspecificCharNGrams[i] = new GeneralizedCounter(i + 2);
    }
    ClassicCounter<string> POSCounter = new ClassicCounter<string>();
    // Reusable context buffer: tag followed by up to ContextLength previous characters.
    IList<ISerializable> context = new List<ISerializable>(ContextLength + 1);
    foreach (IList<TaggedWord> words in trainingSentences)
    {
        foreach (TaggedWord taggedWord in words)
        {
            string word = taggedWord.Word();
            string tag = taggedWord.Tag();
            tagIndex.Add(tag);
            if (word.Equals(LexiconConstants.Boundary))
            {
                continue;
            }
            POSCounter.IncrementCount(tag);
            // NOTE(review): `size` is not declared in this method -- presumably word.Length
            // (the original Java declared `int size = word.length();`), so position
            // i_1 == size is the end-of-word slot. Verify against the full file.
            for (int i_1 = 0; i_1 <= size; i_1++)
            {
                ChineseCharacterBasedLexicon.Symbol sym;
                ChineseCharacterBasedLexicon.Symbol unknownCharClass = null;
                context.Clear();
                context.Add(tag);
                if (i_1 < size)
                {
                    char thisCh = word[i_1];
                    sym = ChineseCharacterBasedLexicon.Symbol.CannonicalSymbol(thisCh);
                    if (singletons.Contains(sym))
                    {
                        // Singleton chars also feed the unknown-character model.
                        unknownCharClass = UnknownCharClass(sym);
                        charCounter.IncrementCount(unknownCharClass);
                    }
                }
                else
                {
                    sym = ChineseCharacterBasedLexicon.Symbol.EndWord;
                }
                POSspecificCharNGrams[0].IncrementCount(context, sym); // POS-specific 1-gram
                if (unknownCharClass != null)
                {
                    POSspecificCharNGrams[0].IncrementCount(context, unknownCharClass); // for unknown ch model
                }
                // context is constructed incrementally:
                //   tag prevChar prevPrevChar
                // this could be made faster using .sublist like in score
                for (int j = 1; j <= ContextLength; j++)
                {
                    // poly grams
                    if (i_1 - j < 0)
                    {
                        // Ran off the start of the word: pad with BeginWord and stop extending.
                        context.Add(ChineseCharacterBasedLexicon.Symbol.BeginWord);
                        POSspecificCharNGrams[j].IncrementCount(context, sym);
                        if (unknownCharClass != null)
                        {
                            POSspecificCharNGrams[j].IncrementCount(context, unknownCharClass); // for unknown ch model
                        }
                        break;
                    }
                    else
                    {
                        ChineseCharacterBasedLexicon.Symbol prev = ChineseCharacterBasedLexicon.Symbol.CannonicalSymbol(word[i_1 - j]);
                        // Singleton context chars are replaced by their unknown class.
                        if (singletons.Contains(prev))
                        {
                            context.Add(UnknownCharClass(prev));
                        }
                        else
                        {
                            context.Add(prev);
                        }
                        POSspecificCharNGrams[j].IncrementCount(context, sym);
                        if (unknownCharClass != null)
                        {
                            POSspecificCharNGrams[j].IncrementCount(context, unknownCharClass); // for unknown ch model
                        }
                    }
                }
            }
        }
    }
    POSDistribution = Distribution.GetDistribution(POSCounter);
    Timing.Tick("Creating character prior distribution...");
    charDistributions = Generics.NewHashMap();
    // charDistributions = Generics.newHashMap();  // 1.5
    // charCounter.incrementCount(Symbol.UNKNOWN, singletons.size());
    // Reserve extra key mass for the singleton-derived unknown classes.
    int numberOfKeys = charCounter.Size() + singletons.Count;
    Distribution<ChineseCharacterBasedLexicon.Symbol> prior = Distribution.GoodTuringSmoothedCounter(charCounter, numberOfKeys);
    // Empty context maps to the character prior.
    charDistributions[Java.Util.Collections.EmptyList] = prior;
    for (int i_2 = 0; i_2 <= ContextLength; i_2++)
    {
        ICollection<KeyValuePair<IList<ISerializable>, ClassicCounter<ChineseCharacterBasedLexicon.Symbol>>> counterEntries = POSspecificCharNGrams[i_2].LowestLevelCounterEntrySet();
        Timing.Tick("Creating " + counterEntries.Count + " character " + (i_2 + 1) + "-gram distributions...");
        foreach (KeyValuePair<IList<ISerializable>, ClassicCounter<ChineseCharacterBasedLexicon.Symbol>> entry in counterEntries)
        {
            context = entry.Key;
            ClassicCounter<ChineseCharacterBasedLexicon.Symbol> c = entry.Value;
            // Back off to the distribution for the one-shorter context as a Dirichlet prior,
            // weighted by its key count / 200.
            Distribution<ChineseCharacterBasedLexicon.Symbol> thisPrior = charDistributions[context.SubList(0, context.Count - 1)];
            double priorWeight = thisPrior.GetNumberOfKeys() / 200.0;
            Distribution<ChineseCharacterBasedLexicon.Symbol> newDist = Distribution.DynamicCounterWithDirichletPrior(c, thisPrior, priorWeight);
            charDistributions[context] = newDist;
        }
    }
}