Ejemplo n.º 1
0
        /// <summary>
        /// Rebuilds the cognate sound correspondence probability distribution for <paramref name="pair"/>
        /// from its correspondence frequency distribution and reports whether the result agrees with the
        /// distribution already stored on the pair.
        /// </summary>
        /// <param name="pair">
        /// The variety pair whose correspondence frequencies are read and whose stored probability
        /// distribution may be replaced.
        /// </param>
        /// <returns>
        /// <c>true</c> when the stored distribution has the same number of conditions, every new condition
        /// is present in it with the same number of samples, and no sample probability differs by more than
        /// 0.0001. Otherwise <c>false</c>, and the new distribution and the default correspondence
        /// probability have been written to the pair.
        /// </returns>
        private bool M(VarietyPair pair)
        {
            IWordAligner wordAligner = _project.WordAligners[AlignerId];
            int observedSegments = pair.Variety2.SegmentFrequencyDistribution.ObservedSamples.Count;

            // Size of the correspondence space handed to the Witten-Bell distributions; it gains a
            // quadratic term when the aligner has expansion/compression enabled.
            int possibleCorrespondences;
            if (wordAligner.ExpansionCompressionEnabled)
            {
                possibleCorrespondences = (observedSegments * observedSegments) + observedSegments + 1;
            }
            else
            {
                possibleCorrespondences = observedSegments + 1;
            }

            var rebuilt = new ConditionalProbabilityDistribution<SoundContext, Ngram<Segment>>(
                pair.CognateSoundCorrespondenceFrequencyDistribution,
                (ctx, freqDist) => new WittenBellProbabilityDistribution<Ngram<Segment>>(freqDist, possibleCorrespondences));

            var previous = pair.CognateSoundCorrespondenceProbabilityDistribution;

            // Without a previous distribution of the same size there is nothing to compare against.
            bool converged = previous != null && previous.Conditions.Count == rebuilt.Conditions.Count;

            if (converged)
            {
                foreach (SoundContext condition in rebuilt.Conditions)
                {
                    IProbabilityDistribution<Ngram<Segment>> current = rebuilt[condition];
                    IProbabilityDistribution<Ngram<Segment>> earlier;

                    bool matches = previous.TryGetProbabilityDistribution(condition, out earlier)
                                   && current.Samples.Count == earlier.Samples.Count;

                    if (matches)
                    {
                        foreach (Ngram<Segment> sample in current.Samples)
                        {
                            double delta = Math.Abs(current[sample] - earlier[sample]);
                            if (delta > 0.0001)
                            {
                                matches = false;
                                break;
                            }
                        }
                    }

                    if (!matches)
                    {
                        // One mismatching condition is enough; stop scanning.
                        converged = false;
                        break;
                    }
                }
            }

            if (converged)
            {
                return true;
            }

            // Not converged: the rebuilt distribution becomes the stored one.
            pair.CognateSoundCorrespondenceProbabilityDistribution = rebuilt;
            pair.DefaultSoundCorrespondenceProbability = 1.0 / possibleCorrespondences;
            return false;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Collects candidate affixes of the requested type from <paramref name="sequences"/>, assigns each
        /// one a score, and yields them followed by a null (empty) affix.
        /// </summary>
        /// <remarks>
        /// This is an iterator method: nothing runs until the returned sequence is enumerated, and each
        /// enumeration repeats the whole computation.
        /// </remarks>
        /// <param name="sequences">
        /// The sequences to analyze. Each one is turned into a single n-gram by flattening the syllables
        /// returned by the syllable selector.
        /// </param>
        /// <param name="type">
        /// <see cref="AffixType.Suffix"/> scans each word right-to-left; <see cref="AffixType.Prefix"/> and
        /// any other value scan left-to-right.
        /// </param>
        /// <returns>
        /// One <see cref="AffixInfo"/> per distinct candidate affix with its <c>ZScore</c> set, followed by
        /// the null affix whose stems are the complete words.
        /// </returns>
        protected IEnumerable<AffixInfo> ComputeAffixes(ICollection<TSeq> sequences, AffixType type)
        {
            Direction dir = type == AffixType.Suffix ? Direction.RightToLeft : Direction.LeftToRight;

            var affixFreqDist = new ConditionalFrequencyDistribution<int, Ngram<TItem>>();
            var ngramFreqDist = new ConditionalFrequencyDistribution<int, Ngram<TItem>>();
            var itemFreqDist = new FrequencyDistribution<TItem>();

            var affixes = new Dictionary<Ngram<TItem>, AffixInfo>();
            var nullAffix = new AffixInfo(sequences.Count, new Ngram<TItem>());

            foreach (TSeq seq in sequences)
            {
                var wordNgram = new Ngram<TItem>(_syllablesSelector(seq).SelectMany(s => s));
                nullAffix.Stems.Add(wordNgram);
                foreach (TItem item in wordNgram)
                {
                    itemFreqDist.Increment(item);
                }
                // A word of at most one item cannot be split into an affix and a stem.
                if (wordNgram.Length <= 1)
                {
                    continue;
                }

                // Items in scan order, plus the index of the last item of every syllable so that an
                // affix ending at index i can be recognized as ending on a syllable boundary.
                var items = new List<TItem>();
                var syllableEnds = new HashSet<int>();
                foreach (IEnumerable<TItem> syllable in _syllablesSelector(seq).Items(dir))
                {
                    items.AddRange(syllable.Items(dir));
                    syllableEnds.Add(items.Count - 1);
                }

                // Grow the affix one item at a time. Affixes one item longer than MaxAffixLength are
                // still counted because the scoring step looks up the frequency of each affix
                // extended by one item.
                var affix = new Ngram<TItem>();
                var stem = new Ngram<TItem>(items, dir);
                for (int i = 0; i < Math.Min(MaxAffixLength + 1, items.Count); i++)
                {
                    affix = affix.Concat(items[i], dir);
                    affixFreqDist[affix.Length].Increment(affix);
                    // Only record the affix when at least one item is left over for the stem.
                    if (i < items.Count - 1 && affix.Length <= MaxAffixLength)
                    {
                        AffixInfo ai = affixes.GetOrCreate(affix, () => new AffixInfo(sequences.Count, affix));
                        stem = stem.SkipFirst(dir);
                        ai.Stems.Add(stem);
                        if (syllableEnds.Contains(i))
                        {
                            ai.SyllableBreakCount++;
                        }
                    }
                }

                // Count every n-gram of up to MaxAffixLength items starting at any position.
                for (int i = 0; i < items.Count; i++)
                {
                    var ngram = new Ngram<TItem>();
                    for (int j = i; j < Math.Min(MaxAffixLength + i, items.Count); j++)
                    {
                        ngram = ngram.Concat(items[j], dir);
                        ngramFreqDist[ngram.Length].Increment(ngram);
                    }
                }
            }

            // Bin count for n-grams of a given length: (number of observed item types)^length, clamped
            // to int.MaxValue. The negated comparison also sends NaN to the clamp, matching what a
            // checked double-to-int conversion would reject, without throwing and catching an
            // OverflowException for every long n-gram length.
            Func<int, int> possibleNgramCount = length =>
            {
                double count = Math.Pow(itemFreqDist.ObservedSamples.Count, length);
                if (!(count < int.MaxValue))
                {
                    return int.MaxValue;
                }
                return (int)count;
            };

            var itemProbDist = new MaxLikelihoodProbabilityDistribution<TItem>(itemFreqDist);
            var affixProbDist = new ConditionalProbabilityDistribution<int, Ngram<TItem>>(affixFreqDist, (c, fd) =>
            {
                if (c == 1)
                {
                    return new MaxLikelihoodProbabilityDistribution<Ngram<TItem>>(fd);
                }
                return new WittenBellProbabilityDistribution<Ngram<TItem>>(fd, possibleNgramCount(c));
            });
            var ngramProbDist = new ConditionalProbabilityDistribution<int, Ngram<TItem>>(ngramFreqDist, (c, fd) =>
            {
                if (c == 1)
                {
                    return new MaxLikelihoodProbabilityDistribution<Ngram<TItem>>(fd);
                }
                return new WittenBellProbabilityDistribution<Ngram<TItem>>(fd, possibleNgramCount(c));
            });

            foreach (AffixInfo info in affixes.Values)
            {
                int freq = affixFreqDist[info.Ngram.Length][info.Ngram];

                // Find the item that most often continues this affix, as a fraction of the affix's
                // own frequency.
                var maxCurveItem = itemFreqDist.ObservedSamples
                                   .Select(item => new { Item = item, Curve = (double)affixFreqDist[info.Ngram.Length + 1][info.Ngram.Concat(item, dir)] / freq })
                                   .MaxBy(item => item.Curve);
                double curveDrop = (1 - maxCurveItem.Curve) / (1 - itemProbDist[maxCurveItem.Item]);

                // Ratio of the n-gram's probability as an affix to its probability anywhere in a word.
                double pw = affixProbDist[info.Ngram.Length][info.Ngram];
                double npw = ngramProbDist[info.Ngram.Length][info.Ngram];
                double randomAdj = npw == 0 ? 1.0 : pw / npw;

                double normalizedFreq = info.Ngram.Length * Math.Log(freq);

                double syllableScore = AffixesOccurOnSyllableBoundaries ? (0.5 * ((double)info.SyllableBreakCount / freq)) + 0.5 : 1.0;

                info.ZScore = curveDrop * randomAdj * normalizedFreq * syllableScore;
                yield return info;
            }

            yield return nullAffix;
        }