public void NoSamples()
 {
     // Build the estimator on top of a frequency distribution that has had nothing added to it.
     var emptyCounts = new FrequencyDistribution<string>();
     var dist = new MaxLikelihoodProbabilityDistribution<string>(emptyCounts);

     // Every lookup is asserted to be exactly zero.
     foreach (string sample in new[] { "a", "b", "c" })
     {
         Assert.That(dist[sample], Is.EqualTo(0));
     }
 }
        public void NoSamples()
        {
            // The estimator wraps a frequency distribution to which no sample was ever added.
            var noObservations = new FrequencyDistribution<string>();
            var estimator = new MaxLikelihoodProbabilityDistribution<string>(noObservations);
            string[] probes = { "a", "b", "c" };

            // Each probe is asserted to have a probability of exactly zero.
            for (int i = 0; i < probes.Length; i++)
            {
                Assert.That(estimator[probes[i]], Is.EqualTo(0));
            }
        }
        public void Probability()
        {
            // Absolute tolerance applied to every non-zero expectation below.
            const double tolerance = 0.00001;

            var dist = new MaxLikelihoodProbabilityDistribution<string>(_fd);

            // Expected probabilities for samples of the shared _fd fixture, checked in this order.
            var expectations = new[]
            {
                new { Sample = "a", Value = 0.01136 },
                new { Sample = "c", Value = 0.02272 },
                new { Sample = "d", Value = 0.03409 },
                new { Sample = "o", Value = 0.09090 },
                new { Sample = "q", Value = 0.11363 }
            };
            foreach (var expectation in expectations)
            {
                Assert.That(dist[expectation.Sample], Is.EqualTo(expectation.Value).Within(tolerance));
            }

            // These two samples are asserted to have exactly zero probability.
            foreach (string zeroSample in new[] { "t", "z" })
            {
                Assert.That(dist[zeroSample], Is.EqualTo(0));
            }
        }
        public void Probability()
        {
            // Absolute tolerance used for all of the non-zero comparisons.
            const double epsilon = 0.00001;

            // Parallel arrays: expected[i] is the probability asserted for samples[i].
            string[] samples = { "a", "c", "d", "o", "q" };
            double[] expected = { 0.01136, 0.02272, 0.03409, 0.09090, 0.11363 };

            var estimator = new MaxLikelihoodProbabilityDistribution<string>(_fd);

            for (int i = 0; i < samples.Length; i++)
            {
                Assert.That(estimator[samples[i]], Is.EqualTo(expected[i]).Within(epsilon));
            }

            // "t" and "z" are asserted to have a probability of exactly zero.
            string[] zeroSamples = { "t", "z" };
            for (int i = 0; i < zeroSamples.Length; i++)
            {
                Assert.That(estimator[zeroSamples[i]], Is.EqualTo(0));
            }
        }
예제 #5
0
        /// <summary>
        /// Collects candidate affixes from <paramref name="sequences"/>, scores each one and yields the results.
        /// </summary>
        /// <remarks>
        /// This is an iterator: nothing is computed until the returned sequence is enumerated.
        /// Each yielded candidate has its <c>ZScore</c> set to the product of four factors computed below:
        /// curve drop, random adjustment, normalized frequency and syllable score.
        /// </remarks>
        /// <param name="sequences">The sequences to analyze; its <c>Count</c> is passed to every created <see cref="AffixInfo"/>.</param>
        /// <param name="type">
        /// <c>AffixType.Suffix</c> makes the items be traversed right-to-left; any other value uses left-to-right.
        /// </param>
        /// <returns>
        /// The scored <see cref="AffixInfo"/> entries, followed by one final entry for the empty n-gram whose
        /// stems are the complete words (that last entry is not scored here).
        /// </returns>
        protected IEnumerable<AffixInfo> ComputeAffixes(ICollection<TSeq> sequences, AffixType type)
        {
            // Suffixes are read from the right edge; everything else is read from the left edge.
            var dir = type == AffixType.Suffix ? Direction.RightToLeft : Direction.LeftToRight;

            // Counts are conditioned on n-gram length.
            var edgeCounts    = new ConditionalFrequencyDistribution<int, Ngram<TItem>>();
            var anyCounts     = new ConditionalFrequencyDistribution<int, Ngram<TItem>>();
            var unigramCounts = new FrequencyDistribution<TItem>();

            var affixTable     = new Dictionary<Ngram<TItem>, AffixInfo>();
            var wholeWordAffix = new AffixInfo(sequences.Count, new Ngram<TItem>());

            foreach (TSeq sequence in sequences)
            {
                var word = new Ngram<TItem>(_syllablesSelector(sequence).SelectMany(chunk => chunk));
                wholeWordAffix.Stems.Add(word);
                foreach (TItem unit in word)
                {
                    unigramCounts.Increment(unit);
                }

                // Words of length 0 or 1 only contribute to the item counts and the empty-affix stems.
                if (word.Length <= 1)
                {
                    continue;
                }

                // Flatten the syllables in traversal order, remembering the index of each syllable's last item.
                var ordered      = new List<TItem>();
                var syllableEnds = new HashSet<int>();
                foreach (IEnumerable<TItem> syllable in _syllablesSelector(sequence).Items(dir))
                {
                    ordered.AddRange(syllable.Items(dir));
                    syllableEnds.Add(ordered.Count - 1);
                }

                int total     = ordered.Count;
                int lastIndex = total - 1;

                // Grow the edge n-gram one item at a time. Lengths up to MaxAffixLength + 1 are counted
                // (the extra length is looked up later for the curve computation), but only lengths up to
                // MaxAffixLength that leave a non-empty remainder are recorded as affixes.
                var edge      = new Ngram<TItem>();
                var remainder = new Ngram<TItem>(ordered, dir);
                var edgeLimit = Math.Min(MaxAffixLength + 1, total);
                for (int pos = 0; pos < edgeLimit; pos++)
                {
                    edge = edge.Concat(ordered[pos], dir);
                    edgeCounts[edge.Length].Increment(edge);

                    if (pos >= lastIndex || edge.Length > MaxAffixLength)
                    {
                        continue;
                    }

                    AffixInfo record = affixTable.GetOrCreate(edge, () => new AffixInfo(sequences.Count, edge));
                    remainder = remainder.SkipFirst(dir);
                    record.Stems.Add(remainder);
                    if (syllableEnds.Contains(pos))
                    {
                        record.SyllableBreakCount++;
                    }
                }

                // Count every n-gram of up to MaxAffixLength items, starting at every position.
                for (int start = 0; start < total; start++)
                {
                    var window = new Ngram<TItem>();
                    var stop   = Math.Min(MaxAffixLength + start, total);
                    for (int cursor = start; cursor < stop; cursor++)
                    {
                        window = window.Concat(ordered[cursor], dir);
                        anyCounts[window.Length].Increment(window);
                    }
                }
            }

            var unigramProbs = new MaxLikelihoodProbabilityDistribution<TItem>(unigramCounts);

            // Bin count handed to the Witten-Bell distributions: (number of observed items) ^ (n-gram length),
            // clamped to int.MaxValue when the checked conversion overflows.
            Func<int, int> binsForLength = ngramLength =>
            {
                try
                {
                    return checked((int)Math.Pow(unigramCounts.ObservedSamples.Count, ngramLength));
                }
                catch (OverflowException)
                {
                    return int.MaxValue;
                }
            };

            // Length-1 n-grams use maximum likelihood; every other length uses Witten-Bell.
            var edgeProbs = new ConditionalProbabilityDistribution<int, Ngram<TItem>>(edgeCounts, (edgeLength, edgeFd) =>
            {
                if (edgeLength == 1)
                {
                    return new MaxLikelihoodProbabilityDistribution<Ngram<TItem>>(edgeFd);
                }
                return new WittenBellProbabilityDistribution<Ngram<TItem>>(edgeFd, binsForLength(edgeLength));
            });
            var anyProbs = new ConditionalProbabilityDistribution<int, Ngram<TItem>>(anyCounts, (anyLength, anyFd) =>
            {
                if (anyLength == 1)
                {
                    return new MaxLikelihoodProbabilityDistribution<Ngram<TItem>>(anyFd);
                }
                return new WittenBellProbabilityDistribution<Ngram<TItem>>(anyFd, binsForLength(anyLength));
            });

            foreach (AffixInfo candidate in affixTable.Values)
            {
                var shape = candidate.Ngram;
                var size  = shape.Length;
                int freq  = edgeCounts[size][shape];

                // Among all one-item extensions of this affix, pick the one that keeps the largest share
                // of the affix's own frequency.
                var steepest = unigramCounts.ObservedSamples
                               .Select(next => new { Item = next, Curve = (double)edgeCounts[size + 1][shape.Concat(next, dir)] / freq })
                               .MaxBy(pair => pair.Curve);
                double curveDrop = (1 - steepest.Curve) / (1 - unigramProbs[steepest.Item]);

                // Ratio of the n-gram's probability at the word edge to its probability anywhere in a word;
                // falls back to 1.0 when the latter is zero.
                double pw  = edgeProbs[size][shape];
                double npw = anyProbs[size][shape];
                double randomAdj;
                if (npw == 0)
                {
                    randomAdj = 1.0;
                }
                else
                {
                    randomAdj = pw / npw;
                }

                double normalizedFreq = size * Math.Log(freq);

                // When enabled, maps the share of occurrences that end on a syllable boundary into [0.5, 1.0].
                double syllableScore = 1.0;
                if (AffixesOccurOnSyllableBoundaries)
                {
                    syllableScore = (0.5 * ((double)candidate.SyllableBreakCount / freq)) + 0.5;
                }

                candidate.ZScore = curveDrop * randomAdj * normalizedFreq * syllableScore;
                yield return candidate;
            }

            yield return wholeWordAffix;
        }