public override bool Equals(object o)
 {
     if (this == o)
     {
         return(true);
     }
     if (!(o is ChineseCharacterBasedLexicon.Symbol))
     {
         return(false);
     }
     ChineseCharacterBasedLexicon.Symbol symbol = (ChineseCharacterBasedLexicon.Symbol)o;
     if (ch != symbol.ch)
     {
         return(false);
     }
     if (type != symbol.type)
     {
         return(false);
     }
     if (unkClass != null ? !unkClass.Equals(symbol.unkClass) : symbol.unkClass != null)
     {
         return(false);
     }
     return(true);
 }
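        /// <summary>
        /// Scores a word given its POS tag as the sum of the log probabilities of its characters
        /// (plus the end-of-word symbol) under the tag-conditioned, backed-off character n-gram
        /// distributions, optionally reduced by a word-length penalty.
        /// </summary>
        /// <param name="iTW">the tagged word whose tag conditions the character model</param>
        /// <param name="loc">the position of the word in the sentence</param>
        /// <param name="word">the word to score; must not be the boundary symbol</param>
        /// <param name="featureSpec">the feature specification string</param>
        /// <returns>the log probability of the word given the tag</returns>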
        public virtual float Score(IntTaggedWord iTW, int loc, string word, string featureSpec)
        {
            string tag = tagIndex.Get(iTW.tag);

            System.Diagnostics.Debug.Assert(!word.Equals(LexiconConstants.Boundary));
            char[] chars = word.ToCharArray();
            IList <ISerializable> charList = new List <ISerializable>(chars.Length + ContextLength + 1);

            // this starts off storing Symbols and then starts storing Strings. Clean this up someday!
            // charList is constructed backward
            // END_WORD char[length-1] char[length-2] ... char[0] BEGIN_WORD BEGIN_WORD
            charList.Add(ChineseCharacterBasedLexicon.Symbol.EndWord);
            for (int i = chars.Length - 1; i >= 0; i--)
            {
                ChineseCharacterBasedLexicon.Symbol ch = ChineseCharacterBasedLexicon.Symbol.CannonicalSymbol(chars[i]);
                if (knownChars.Contains(ch))
                {
                    charList.Add(ch);
                }
                else
                {
                    charList.Add(UnknownCharClass(ch));
                }
            }
            for (int i_1 = 0; i_1 < ContextLength; i_1++)
            {
                charList.Add(ChineseCharacterBasedLexicon.Symbol.BeginWord);
            }
            double score = 0.0;

            // walk the reversed char list; each scored character is then overwritten with the tag so that
            // the next lookup window [tag, prevChar, ..., prevPrevChar] can be taken with SubList below
            for (int i_2 = 0; i_2 < charList.Count - ContextLength; i_2++)
            {
                ChineseCharacterBasedLexicon.Symbol nextChar = (ChineseCharacterBasedLexicon.Symbol)charList[i_2];
                charList.Set(i_2, tag);
                double charScore = GetBackedOffDist(charList.SubList(i_2, i_2 + ContextLength + 1)).ProbabilityOf(nextChar);
                score += Math.Log(charScore);
            }
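            // optional word-length penalty: 0 = none, 1 = quadratic in word length, 2 = linear in word length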
            switch (penaltyType)
            {
            case 0:
            {
                break;
            }

            case 1:
            {
                score -= (chars.Length * (chars.Length + 1)) * (lengthPenalty / 2);
                break;
            }

            case 2:
            {
                score -= (chars.Length - 1) * lengthPenalty;
                break;
            }
            }
            return((float)score);
        }
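 /// <summary>
 /// Maps a character Symbol to its unknown-character class: a Symbol built from the character's
 /// radical when the unknown character model is in use, otherwise the generic unknown Symbol.
 /// </summary>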
 private ChineseCharacterBasedLexicon.Symbol UnknownCharClass(ChineseCharacterBasedLexicon.Symbol ch)
 {
     if (useUnknownCharacterModel)
     {
         return(new ChineseCharacterBasedLexicon.Symbol(char.ToString(RadicalMap.GetRadical(ch.GetCh()))).Intern());
     }
     else
     {
         return(ChineseCharacterBasedLexicon.Symbol.Unknown);
     }
 }
        /// <summary>Samples from the distribution over words with this POS according to the lexicon.</summary>
        /// <param name="tag">the POS of the word to sample</param>
        /// <returns>a sampled word</returns>
        public virtual string SampleFrom(string tag)
        {
            StringBuilder         buf     = new StringBuilder();
            Random                rand    = new Random();     // stand-in for Java's Math.random() in the length-penalty checks
            IList <ISerializable> context = new List <ISerializable>(ContextLength + 1);

            // context must contain [tag prevChar prevPrevChar]
            context.Add(tag);
            for (int i = 0; i < ContextLength; i++)
            {
                context.Add(ChineseCharacterBasedLexicon.Symbol.BeginWord);
            }
            Distribution <ChineseCharacterBasedLexicon.Symbol> d = GetBackedOffDist(context);

            ChineseCharacterBasedLexicon.Symbol gen = d.SampleFrom();
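            // generate characters conditioned on [tag prevChar prevPrevChar] until EndWord is sampled,
            // or until the length penalty cuts the word off early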
            while (gen != ChineseCharacterBasedLexicon.Symbol.EndWord)
            {
                buf.Append(gen.GetCh());
                switch (penaltyType)
                {
                case 1:
                {
                    if (rand.NextDouble() > Math.Pow(lengthPenalty, buf.Length))
                    {
                        goto genLoop_break;
                    }
                    break;
                }

                case 2:
                {
                    if (rand.NextDouble() > lengthPenalty)
                    {
                        goto genLoop_break;
                    }
                    break;
                }
                }
                for (int i_1 = 1; i_1 < ContextLength; i_1++)
                {
                    context.Set(i_1 + 1, context[i_1]);
                }
                context.Set(1, gen);
                d   = GetBackedOffDist(context);
                gen = d.SampleFrom();
            }
            genLoop_break :;
            return(buf.ToString());
        }
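        /// <summary>
        /// Prints statistics over the given trees: how many characters and words are singletons,
        /// the POS distribution of singleton words, the radical distribution of singleton characters,
        /// and the distribution over word lengths.
        /// </summary>
        /// <param name="trees">the trees to collect statistics from</param>
        /// <param name="pw">the writer to print the statistics to</param>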
        public static void PrintStats(ICollection <Tree> trees, PrintWriter pw)
        {
            ClassicCounter <int>        wordLengthCounter = new ClassicCounter <int>();
            ClassicCounter <TaggedWord> wordCounter       = new ClassicCounter <TaggedWord>();
            ClassicCounter <ChineseCharacterBasedLexicon.Symbol> charCounter = new ClassicCounter <ChineseCharacterBasedLexicon.Symbol>();
            int counter = 0;

            foreach (Tree tree in trees)
            {
                counter++;
                IList <TaggedWord> taggedWords = tree.TaggedYield();
                foreach (TaggedWord taggedWord in taggedWords)
                {
                    string word = taggedWord.Word();
                    if (word.Equals(LexiconConstants.Boundary))
                    {
                        continue;
                    }
                    wordCounter.IncrementCount(taggedWord);
                    wordLengthCounter.IncrementCount(word.Length);
                    for (int j = 0; j < word.Length; j++)
                    {
                        ChineseCharacterBasedLexicon.Symbol sym = ChineseCharacterBasedLexicon.Symbol.CannonicalSymbol(word[j]);
                        charCounter.IncrementCount(sym);
                    }
                    charCounter.IncrementCount(ChineseCharacterBasedLexicon.Symbol.EndWord);
                }
            }
            ICollection <ChineseCharacterBasedLexicon.Symbol> singletonChars = Counters.KeysBelow(charCounter, 1.5);
            ICollection <TaggedWord> singletonWords     = Counters.KeysBelow(wordCounter, 1.5);
            ClassicCounter <string>  singletonWordPOSes = new ClassicCounter <string>();

            foreach (TaggedWord taggedWord_1 in singletonWords)
            {
                singletonWordPOSes.IncrementCount(taggedWord_1.Tag());
            }
            Distribution <string> singletonWordPOSDist = Distribution.GetDistribution(singletonWordPOSes);
            ClassicCounter <char> singletonCharRads    = new ClassicCounter <char>();

            foreach (ChineseCharacterBasedLexicon.Symbol s in singletonChars)
            {
                singletonCharRads.IncrementCount(RadicalMap.GetRadical(s.GetCh()));
            }
            Distribution <char> singletonCharRadDist = Distribution.GetDistribution(singletonCharRads);
            Distribution <int>  wordLengthDist       = Distribution.GetDistribution(wordLengthCounter);
            NumberFormat        percent = new DecimalFormat("##.##%");

            pw.Println("There are " + singletonChars.Count + " singleton chars out of " + (int)charCounter.TotalCount() + " tokens and " + charCounter.Size() + " types found in " + counter + " trees.");
            pw.Println("Thus singletonChars comprise " + percent.Format(singletonChars.Count / charCounter.TotalCount()) + " of tokens and " + percent.Format((double)singletonChars.Count / charCounter.Size()) + " of types.");
            pw.Println();
            pw.Println("There are " + singletonWords.Count + " singleton words out of " + (int)wordCounter.TotalCount() + " tokens and " + wordCounter.Size() + " types.");
            pw.Println("Thus singletonWords comprise " + percent.Format(singletonWords.Count / wordCounter.TotalCount()) + " of tokens and " + percent.Format((double)singletonWords.Count / wordCounter.Size()) + " of types.");
            pw.Println();
            pw.Println("Distribution over singleton word POS:");
            pw.Println(singletonWordPOSDist.ToString());
            pw.Println();
            pw.Println("Distribution over singleton char radicals:");
            pw.Println(singletonCharRadDist.ToString());
            pw.Println();
            pw.Println("Distribution over word length:");
            pw.Println(wordLengthDist);
        }
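        /// <summary>
        /// Finishes training: counts characters in the training sentences to find singletons and the
        /// known-character set, accumulates tag-conditioned character n-gram counts up to ContextLength
        /// characters of history, and converts them into backed-off character distributions.
        /// </summary>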
        public virtual void FinishTraining()
        {
            Timing.Tick("Counting characters...");
            ClassicCounter <ChineseCharacterBasedLexicon.Symbol> charCounter = new ClassicCounter <ChineseCharacterBasedLexicon.Symbol>();

            // first find all chars that occur only once
            foreach (IList <TaggedWord> labels in trainingSentences)
            {
                foreach (TaggedWord label in labels)
                {
                    string word = label.Word();
                    if (word.Equals(LexiconConstants.Boundary))
                    {
                        continue;
                    }
                    for (int j = 0; j < word.Length; j++)
                    {
                        ChineseCharacterBasedLexicon.Symbol sym = ChineseCharacterBasedLexicon.Symbol.CannonicalSymbol(word[j]);
                        charCounter.IncrementCount(sym);
                    }
                    charCounter.IncrementCount(ChineseCharacterBasedLexicon.Symbol.EndWord);
                }
            }
            ICollection <ChineseCharacterBasedLexicon.Symbol> singletons = Counters.KeysBelow(charCounter, 1.5);

            knownChars = Generics.NewHashSet(charCounter.KeySet());
            Timing.Tick("Counting nGrams...");
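            // POSspecificCharNGrams[j] counts next characters conditioned on [tag prevChar ... prevChar_j] (j characters of history)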
            GeneralizedCounter[] POSspecificCharNGrams = new GeneralizedCounter[ContextLength + 1];
            for (int i = 0; i <= ContextLength; i++)
            {
                POSspecificCharNGrams[i] = new GeneralizedCounter(i + 2);
            }
            ClassicCounter <string> POSCounter = new ClassicCounter <string>();
            IList <ISerializable>   context    = new List <ISerializable>(ContextLength + 1);

            foreach (IList <TaggedWord> words in trainingSentences)
            {
                foreach (TaggedWord taggedWord in words)
                {
                    string word = taggedWord.Word();
                    string tag  = taggedWord.Tag();
                    tagIndex.Add(tag);
                    if (word.Equals(LexiconConstants.Boundary))
                    {
                        continue;
                    }
                    POSCounter.IncrementCount(tag);
                    for (int i_1 = 0; i_1 <= word.Length; i_1++)
                    {
                        ChineseCharacterBasedLexicon.Symbol sym;
                        ChineseCharacterBasedLexicon.Symbol unknownCharClass = null;
                        context.Clear();
                        context.Add(tag);
                        if (i_1 < word.Length)
                        {
                            char thisCh = word[i_1];
                            sym = ChineseCharacterBasedLexicon.Symbol.CannonicalSymbol(thisCh);
                            if (singletons.Contains(sym))
                            {
                                unknownCharClass = UnknownCharClass(sym);
                                charCounter.IncrementCount(unknownCharClass);
                            }
                        }
                        else
                        {
                            sym = ChineseCharacterBasedLexicon.Symbol.EndWord;
                        }
                        POSspecificCharNGrams[0].IncrementCount(context, sym);   // POS-specific 1-gram
                        if (unknownCharClass != null)
                        {
                            POSspecificCharNGrams[0].IncrementCount(context, unknownCharClass);   // for unknown ch model
                        }
                        // context is constructed incrementally:
                        // tag prevChar prevPrevChar
                        // this could be made faster using .sublist like in score
                        for (int j = 1; j <= ContextLength; j++)
                        {
                            // poly grams
                            if (i_1 - j < 0)
                            {
                                context.Add(ChineseCharacterBasedLexicon.Symbol.BeginWord);
                                POSspecificCharNGrams[j].IncrementCount(context, sym);
                                if (unknownCharClass != null)
                                {
                                    POSspecificCharNGrams[j].IncrementCount(context, unknownCharClass);   // for unknown ch model
                                }
                                break;
                            }
                            else
                            {
                                ChineseCharacterBasedLexicon.Symbol prev = ChineseCharacterBasedLexicon.Symbol.CannonicalSymbol(word[i_1 - j]);
                                if (singletons.Contains(prev))
                                {
                                    context.Add(UnknownCharClass(prev));
                                }
                                else
                                {
                                    context.Add(prev);
                                }
                                POSspecificCharNGrams[j].IncrementCount(context, sym);
                                if (unknownCharClass != null)
                                {
                                    POSspecificCharNGrams[j].IncrementCount(context, unknownCharClass);   // for unknown ch model
                                }
                            }
                        }
                    }
                }
            }
            POSDistribution = Distribution.GetDistribution(POSCounter);
            Timing.Tick("Creating character prior distribution...");
            charDistributions = Generics.NewHashMap();
            //    charDistributions = Generics.newHashMap();  // 1.5
            //    charCounter.incrementCount(Symbol.UNKNOWN, singletons.size());
            int numberOfKeys = charCounter.Size() + singletons.Count;
            Distribution <ChineseCharacterBasedLexicon.Symbol> prior = Distribution.GoodTuringSmoothedCounter(charCounter, numberOfKeys);

            charDistributions[Java.Util.Collections.EmptyList] = prior;
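            // each context's distribution is its observed counts smoothed with a Dirichlet prior given by
            // the distribution of the same context with one less character of history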
            for (int i_2 = 0; i_2 <= ContextLength; i_2++)
            {
                ICollection <KeyValuePair <IList <ISerializable>, ClassicCounter <ChineseCharacterBasedLexicon.Symbol> > > counterEntries = POSspecificCharNGrams[i_2].LowestLevelCounterEntrySet();
                Timing.Tick("Creating " + counterEntries.Count + " character " + (i_2 + 1) + "-gram distributions...");
                foreach (KeyValuePair <IList <ISerializable>, ClassicCounter <ChineseCharacterBasedLexicon.Symbol> > entry in counterEntries)
                {
                    context = entry.Key;
                    ClassicCounter <ChineseCharacterBasedLexicon.Symbol> c         = entry.Value;
                    Distribution <ChineseCharacterBasedLexicon.Symbol>   thisPrior = charDistributions[context.SubList(0, context.Count - 1)];
                    double priorWeight = thisPrior.GetNumberOfKeys() / 200.0;
                    Distribution <ChineseCharacterBasedLexicon.Symbol> newDist = Distribution.DynamicCounterWithDirichletPrior(c, thisPrior, priorWeight);
                    charDistributions[context] = newDist;
                }
            }
        }