Exemple #1
0
        /// <summary>
        /// Enumerates every label assignment of length <paramref name="window"/> and
        /// records <c>Prob(position, labels)</c> for each one in a new
        /// <c>GeneralizedCounter</c>, keyed by the list produced by
        /// <c>IntArrayToListE</c>.
        /// </summary>
        /// <param name="position">Position forwarded unchanged to <c>Prob</c>.</param>
        /// <param name="window">Number of labels in each enumerated assignment.</param>
        /// <returns>The counter, incremented once per enumerated assignment.</returns>
        public virtual GeneralizedCounter <E> Probs(int position, int window)
        {
            GeneralizedCounter <E> gc = new GeneralizedCounter <E>(window);

            // A freshly allocated int[] is zero-filled, so the enumeration starts at
            // the all-zero assignment without an explicit fill.
            int[] labels = new int[window];

            bool advanced;
            do
            {
                IList <E> labelsList = IntArrayToListE(labels);
                gc.IncrementCount(labelsList, Prob(position, labels));

                // Step to the next assignment like an odometer: bump the lowest
                // position, and on overflow reset it and carry into the next one.
                advanced = false;
                for (int i = 0; i < labels.Length && !advanced; i++)
                {
                    labels[i]++;
                    if (labels[i] < numClasses)
                    {
                        advanced = true;
                    }
                    else if (i < labels.Length - 1)
                    {
                        labels[i] = 0;
                    }
                    // Overflow of the last position means every assignment has been
                    // visited; 'advanced' stays false and the outer loop ends.
                }
                // With an empty labels array nothing can advance, so the loop ends
                // after the single (empty) assignment instead of spinning forever.
            }
            while (advanced);

            return gc;
        }
 // Only used at training time
 public virtual void InitializeTraining(double numTrees)
 {
     // Let `lex` prepare itself first, passing the tree count straight through.
     lex.InitializeTraining(numTrees);

     // Replace both counters with fresh, empty instances.
     ruleCounter = new GeneralizedCounter(2);
     initial = new ClassicCounter <string>();
 }
        /// <summary>
        /// Builds the character-level model from <c>trainingSentences</c>.
        /// </summary>
        /// <remarks>
        /// Three phases, each announced through <c>Timing.Tick</c>:
        /// 1. Count every character of every non-boundary word (plus one end-of-word
        ///    symbol per word) and collect the characters whose count is below 1.5.
        /// 2. For every character position, count the symbol under its tag and under
        ///    progressively longer contexts (tag, previous char, char before that, ...)
        ///    up to <c>ContextLength</c> previous characters.
        /// 3. Turn the counts into distributions stored in <c>POSDistribution</c> and
        ///    <c>charDistributions</c>, each context using the distribution of its
        ///    one-element-shorter prefix as a prior.
        /// </remarks>
        public virtual void FinishTraining()
        {
            Timing.Tick("Counting characters...");
            ClassicCounter <ChineseCharacterBasedLexicon.Symbol> charCounter = new ClassicCounter <ChineseCharacterBasedLexicon.Symbol>();

            // first find all chars that occur only once
            foreach (IList <TaggedWord> labels in trainingSentences)
            {
                foreach (TaggedWord label in labels)
                {
                    string word = label.Word();
                    if (word.Equals(LexiconConstants.Boundary))
                    {
                        continue;
                    }
                    // Visit each character of the word exactly once.
                    for (int j = 0; j < word.Length; j++)
                    {
                        ChineseCharacterBasedLexicon.Symbol sym = ChineseCharacterBasedLexicon.Symbol.CannonicalSymbol(word[j]);
                        charCounter.IncrementCount(sym);
                    }
                    charCounter.IncrementCount(ChineseCharacterBasedLexicon.Symbol.EndWord);
                }
            }
            ICollection <ChineseCharacterBasedLexicon.Symbol> singletons = Counters.KeysBelow(charCounter, 1.5);

            knownChars = Generics.NewHashSet(charCounter.KeySet());
            Timing.Tick("Counting nGrams...");
            // One counter per context length; index k holds contexts of tag + k previous characters.
            GeneralizedCounter[] POSspecificCharNGrams = new GeneralizedCounter[ContextLength + 1];
            for (int i = 0; i <= ContextLength; i++)
            {
                POSspecificCharNGrams[i] = new GeneralizedCounter(i + 2);
            }
            ClassicCounter <string> POSCounter = new ClassicCounter <string>();
            IList <ISerializable>   context    = new List <ISerializable>(ContextLength + 1);

            foreach (IList <TaggedWord> words in trainingSentences)
            {
                foreach (TaggedWord taggedWord in words)
                {
                    string word = taggedWord.Word();
                    string tag  = taggedWord.Tag();
                    // The tag is indexed even for boundary words, which are otherwise skipped.
                    tagIndex.Add(tag);
                    if (word.Equals(LexiconConstants.Boundary))
                    {
                        continue;
                    }
                    POSCounter.IncrementCount(tag);

                    // pos runs one past the last character on purpose: the extra
                    // iteration (pos == wordLength) emits the end-of-word symbol.
                    int wordLength = word.Length;
                    for (int pos = 0; pos <= wordLength; pos++)
                    {
                        ChineseCharacterBasedLexicon.Symbol sym;
                        ChineseCharacterBasedLexicon.Symbol unknownCharClass = null;
                        context.Clear();
                        context.Add(tag);
                        if (pos < wordLength)
                        {
                            char thisCh = word[pos];
                            sym = ChineseCharacterBasedLexicon.Symbol.CannonicalSymbol(thisCh);
                            if (singletons.Contains(sym))
                            {
                                // Rare characters are also counted under their unknown-character class.
                                unknownCharClass = UnknownCharClass(sym);
                                charCounter.IncrementCount(unknownCharClass);
                            }
                        }
                        else
                        {
                            sym = ChineseCharacterBasedLexicon.Symbol.EndWord;
                        }
                        POSspecificCharNGrams[0].IncrementCount(context, sym);
                        // POS-specific 1-gram
                        if (unknownCharClass != null)
                        {
                            // for unknown ch model
                            POSspecificCharNGrams[0].IncrementCount(context, unknownCharClass);
                        }
                        // context is constructed incrementally:
                        // tag prevChar prevPrevChar
                        // this could be made faster using .sublist like in score
                        for (int j = 1; j <= ContextLength; j++)
                        {
                            // poly grams
                            bool beforeWordStart = pos - j < 0;
                            if (beforeWordStart)
                            {
                                context.Add(ChineseCharacterBasedLexicon.Symbol.BeginWord);
                            }
                            else
                            {
                                ChineseCharacterBasedLexicon.Symbol prev = ChineseCharacterBasedLexicon.Symbol.CannonicalSymbol(word[pos - j]);
                                if (singletons.Contains(prev))
                                {
                                    context.Add(UnknownCharClass(prev));
                                }
                                else
                                {
                                    context.Add(prev);
                                }
                            }
                            POSspecificCharNGrams[j].IncrementCount(context, sym);
                            if (unknownCharClass != null)
                            {
                                // for unknown ch model
                                POSspecificCharNGrams[j].IncrementCount(context, unknownCharClass);
                            }
                            if (beforeWordStart)
                            {
                                // Nothing precedes the begin-of-word marker, so longer contexts are not counted.
                                break;
                            }
                        }
                    }
                }
            }
            POSDistribution = Distribution.GetDistribution(POSCounter);
            Timing.Tick("Creating character prior distribution...");
            charDistributions = Generics.NewHashMap();
            //    charDistributions = Generics.newHashMap();  // 1.5
            //    charCounter.incrementCount(Symbol.UNKNOWN, singletons.size());
            int numberOfKeys = charCounter.Size() + singletons.Count;
            Distribution <ChineseCharacterBasedLexicon.Symbol> prior = Distribution.GoodTuringSmoothedCounter(charCounter, numberOfKeys);

            // NOTE(review): this key is the context-free prior. The lookup below for a
            // tag-only context asks for context.SubList(0, 0), so the two keys must
            // compare equal for that lookup to succeed - confirm against the helper types.
            charDistributions[Java.Util.Collections.EmptyList] = prior;
            // Ascending order matters: each context's prior is the distribution stored
            // for its prefix, which was created in an earlier iteration.
            for (int order = 0; order <= ContextLength; order++)
            {
                ICollection <KeyValuePair <IList <ISerializable>, ClassicCounter <ChineseCharacterBasedLexicon.Symbol> > > counterEntries = POSspecificCharNGrams[order].LowestLevelCounterEntrySet();
                Timing.Tick("Creating " + counterEntries.Count + " character " + (order + 1) + "-gram distributions...");
                foreach (KeyValuePair <IList <ISerializable>, ClassicCounter <ChineseCharacterBasedLexicon.Symbol> > entry in counterEntries)
                {
                    context = entry.Key;
                    ClassicCounter <ChineseCharacterBasedLexicon.Symbol> c         = entry.Value;
                    Distribution <ChineseCharacterBasedLexicon.Symbol>   thisPrior = charDistributions[context.SubList(0, context.Count - 1)];
                    double priorWeight = thisPrior.GetNumberOfKeys() / 200.0;
                    Distribution <ChineseCharacterBasedLexicon.Symbol> newDist = Distribution.DynamicCounterWithDirichletPrior(c, thisPrior, priorWeight);
                    charDistributions[context] = newDist;
                }
            }
        }