/// <summary>
        /// Lazily enumerates the known affixes of the requested type that can be peeled off the edge of
        /// <paramref name="word"/>, pairing each with what is left of the word once that affix is removed.
        /// </summary>
        /// <param name="word">The word to split.</param>
        /// <param name="type">
        /// <c>AffixType.Prefix</c> scans left-to-right against <c>_prefixes</c>; any other value scans
        /// right-to-left against <c>_suffixes</c>.
        /// </param>
        /// <returns>
        /// Tuples of (affix info, remaining word). The first tuple always pairs the entry stored for the empty
        /// n-gram with the whole word; later tuples appear only for candidates found in the lookup table.
        /// </returns>
        private IEnumerable <Tuple <AffixInfo, Ngram <TItem> > > GetAffixes(Ngram <TItem> word, AffixType type)
        {
            bool      isPrefix = type == AffixType.Prefix;
            Direction dir      = isPrefix ? Direction.LeftToRight : Direction.RightToLeft;
            Dictionary <Ngram <TItem>, AffixInfo> table = isPrefix ? _prefixes : _suffixes;

            var           candidate = new Ngram <TItem>();
            Ngram <TItem> remainder = word;

            // The empty affix is looked up with the indexer, so it must be present in the table.
            yield return Tuple.Create(table[candidate], remainder);

            // At least one item is always left in the remainder, hence Length - 1.
            int limit = Math.Min(MaxAffixLength, word.Length - 1);
            foreach (TItem item in word.GetItems(dir).Take(limit))
            {
                candidate = candidate.Concat(item, dir);
                remainder = remainder.SkipFirst(dir);

                AffixInfo info;
                if (!table.TryGetValue(candidate, out info))
                {
                    continue;
                }

                yield return Tuple.Create(info, remainder);
            }
        }
Example #2
0
        /// <summary>
        /// Extracts candidate affixes of the given type from <paramref name="sequences"/> and assigns each a score.
        /// </summary>
        /// <remarks>
        /// For every sequence with more than one item, the edge n-grams (read from the left for prefixes, from the
        /// right for suffixes) of up to <c>MaxAffixLength</c> items are registered as affixes together with the
        /// stem that remains after removing them. Each affix's <c>ZScore</c> is the product of four factors:
        /// the drop in continuation frequency after the affix, the ratio of its probability as an edge n-gram to
        /// its probability as an n-gram anywhere in a word, its length-weighted log frequency, and (when
        /// <c>AffixesOccurOnSyllableBoundaries</c> is set) how often it ends at the end of a syllable.
        /// This is an iterator: no work is done until the returned sequence is enumerated.
        /// </remarks>
        /// <param name="sequences">The sequences to analyse; each is split into syllables by <c>_syllablesSelector</c>.</param>
        /// <param name="type">
        /// <c>AffixType.Suffix</c> reads sequences right-to-left; <c>AffixType.Prefix</c> (and any other value)
        /// reads them left-to-right.
        /// </param>
        /// <returns>
        /// Every scored affix, followed by a final entry for the empty affix whose stems are all the words.
        /// The empty affix's <c>ZScore</c> is not assigned here.
        /// </returns>
        protected IEnumerable <AffixInfo> ComputeAffixes(ICollection <TSeq> sequences, AffixType type)
        {
            var dir = Direction.LeftToRight;

            switch (type)
            {
            case AffixType.Prefix:
                dir = Direction.LeftToRight;
                break;

            case AffixType.Suffix:
                dir = Direction.RightToLeft;
                break;
            }

            // Both conditional distributions are keyed by n-gram length.
            var affixFreqDist = new ConditionalFrequencyDistribution <int, Ngram <TItem> >(); // n-grams at the word edge
            var ngramFreqDist = new ConditionalFrequencyDistribution <int, Ngram <TItem> >(); // n-grams at any position
            var itemFreqDist  = new FrequencyDistribution <TItem>();

            var affixes   = new Dictionary <Ngram <TItem>, AffixInfo>();
            var nullAffix = new AffixInfo(sequences.Count, new Ngram <TItem>());

            foreach (TSeq seq in sequences)
            {
                var wordNgram = new Ngram <TItem>(_syllablesSelector(seq).SelectMany(s => s));
                nullAffix.Stems.Add(wordNgram);
                foreach (TItem item in wordNgram)
                {
                    itemFreqDist.Increment(item);
                }
                // A single-item word cannot be split into an affix plus a non-empty stem.
                if (wordNgram.Length <= 1)
                {
                    continue;
                }

                // Flatten the syllables in reading direction, remembering the index of the last item of each
                // syllable so that affixes ending exactly there can be recognised below.
                var items         = new List <TItem>();
                var syllableStart = new HashSet <int>();
                foreach (IEnumerable <TItem> syllable in _syllablesSelector(seq).Items(dir))
                {
                    items.AddRange(syllable.Items(dir));
                    syllableStart.Add(items.Count - 1);
                }

                var affix = new Ngram <TItem>();
                var stem  = new Ngram <TItem>(items, dir);
                // Count one item beyond MaxAffixLength: the scoring pass looks up frequencies of length + 1.
                for (int i = 0; i < Math.Min(MaxAffixLength + 1, items.Count); i++)
                {
                    affix = affix.Concat(items[i], dir);
                    affixFreqDist[affix.Length].Increment(affix);
                    // Only register the affix when it leaves a non-empty stem and is within the length limit.
                    if (i < items.Count - 1 && affix.Length <= MaxAffixLength)
                    {
                        AffixInfo ai = affixes.GetOrCreate(affix, () => new AffixInfo(sequences.Count, affix));
                        stem = stem.SkipFirst(dir);
                        ai.Stems.Add(stem);
                        if (syllableStart.Contains(i))
                        {
                            ai.SyllableBreakCount++;
                        }
                    }
                }

                // Count every n-gram of up to MaxAffixLength items starting at every position.
                for (int i = 0; i < items.Count; i++)
                {
                    var ngram = new Ngram <TItem>();
                    for (int j = i; j < Math.Min(MaxAffixLength + i, items.Count); j++)
                    {
                        ngram = ngram.Concat(items[j], dir);
                        ngramFreqDist[ngram.Length].Increment(ngram);
                    }
                }
            }

            var itemProbDist  = new MaxLikelihoodProbabilityDistribution <TItem>(itemFreqDist);
            // Length-1 n-grams use maximum likelihood; longer ones use Witten-Bell with a bin count of
            // (distinct items) ^ length, clamped to int.MaxValue.
            var affixProbDist = new ConditionalProbabilityDistribution <int, Ngram <TItem> >(affixFreqDist, (c, fd) =>
            {
                if (c == 1)
                {
                    return(new MaxLikelihoodProbabilityDistribution <Ngram <TItem> >(fd));
                }
                return(new WittenBellProbabilityDistribution <Ngram <TItem> >(fd, ComputeNgramBinCount(itemFreqDist.ObservedSamples.Count, c)));
            });
            var ngramProbDist = new ConditionalProbabilityDistribution <int, Ngram <TItem> >(ngramFreqDist, (c, fd) =>
            {
                if (c == 1)
                {
                    return(new MaxLikelihoodProbabilityDistribution <Ngram <TItem> >(fd));
                }
                return(new WittenBellProbabilityDistribution <Ngram <TItem> >(fd, ComputeNgramBinCount(itemFreqDist.ObservedSamples.Count, c)));
            });

            foreach (AffixInfo affix in affixes.Values)
            {
                int freq = affixFreqDist[affix.Ngram.Length][affix.Ngram];

                // Find the item that most often continues this affix, as a fraction of the affix's own frequency.
                var maxCurveItem = itemFreqDist.ObservedSamples.Select(item => new { Item = item, Curve = (double)affixFreqDist[affix.Ngram.Length + 1][affix.Ngram.Concat(item, dir)] / freq })
                                   .MaxBy(item => item.Curve);
                // NOTE(review): if itemProbDist[...] is 1 (e.g. only one distinct item was observed) the
                // denominator is zero and the score becomes NaN or Infinity -- confirm whether that can occur.
                double curveDrop = (1 - maxCurveItem.Curve) / (1 - itemProbDist[maxCurveItem.Item]);

                double pw        = affixProbDist[affix.Ngram.Length][affix.Ngram];
                double npw       = ngramProbDist[affix.Ngram.Length][affix.Ngram];
                double randomAdj = npw == 0 ? 1.0 : pw / npw;

                double normalizedFreq = affix.Ngram.Length * Math.Log(freq);

                // Maps the fraction of syllable-final occurrences from [0, 1] onto [0.5, 1].
                double syllableScore = AffixesOccurOnSyllableBoundaries ? (0.5 * ((double)affix.SyllableBreakCount / freq)) + 0.5 : 1.0;

                affix.ZScore = curveDrop * randomAdj * normalizedFreq * syllableScore;
                yield return(affix);
            }

            yield return(nullAffix);
        }

        /// <summary>
        /// Returns <paramref name="sampleCount"/> raised to <paramref name="order"/>, truncated to an integer
        /// and clamped to <see cref="int.MaxValue"/> when the power does not fit in an <see cref="int"/>.
        /// </summary>
        /// <param name="sampleCount">Number of distinct items observed (expected to be non-negative).</param>
        /// <param name="order">The n-gram length.</param>
        private static int ComputeNgramBinCount(int sampleCount, int order)
        {
            double bins = Math.Pow(sampleCount, order);
            // A plain range check replaces the former checked-cast/catch(OverflowException) pair; written
            // as "less than" so that Infinity and NaN also fall through to the clamp.
            return bins < int.MaxValue ? (int)bins : int.MaxValue;
        }