Example #1
        public VarietyPairSurrogate(VarietyPair vp)
        {
            Variety1 = vp.Variety1.Name;
            Variety2 = vp.Variety2.Name;
            var wordPairSurrogates = new Dictionary <WordPair, WordPairSurrogate>();

            _wordPairs = vp.WordPairs.Select(wp => wordPairSurrogates.GetValue(wp, () => new WordPairSurrogate(wp))).ToList();
            PhoneticSimilarityScore = vp.PhoneticSimilarityScore;
            LexicalSimilarityScore  = vp.LexicalSimilarityScore;
            DefaultSoundCorrespondenceProbability            = vp.DefaultSoundCorrespondenceProbability;
            _cognateSoundCorrespondenceFrequencyDistribution = new Dictionary <SoundContextSurrogate, Tuple <string[], int>[]>();
            foreach (SoundContext lhs in vp.CognateSoundCorrespondenceFrequencyDistribution.Conditions)
            {
                FrequencyDistribution <Ngram <Segment> > freqDist = vp.CognateSoundCorrespondenceFrequencyDistribution[lhs];
                _cognateSoundCorrespondenceFrequencyDistribution[new SoundContextSurrogate(lhs)] = freqDist.ObservedSamples.Select(ngram => Tuple.Create(ngram.Select(seg => seg.StrRep).ToArray(), freqDist[ngram])).ToArray();
            }
            _cognateSoundCorrespondenceByPosition = new Dictionary <string, List <SoundCorrespondenceSurrogate> >();
            foreach (KeyValuePair <FeatureSymbol, SoundCorrespondenceCollection> kvp in vp.CognateSoundCorrespondencesByPosition)
            {
                string pos;
                if (kvp.Key == CogFeatureSystem.Onset)
                {
                    pos = "onset";
                }
                else if (kvp.Key == CogFeatureSystem.Nucleus)
                {
                    pos = "nucleus";
                }
                else
                {
                    pos = "coda";
                }
                _cognateSoundCorrespondenceByPosition[pos] = kvp.Value.Select(corr => new SoundCorrespondenceSurrogate(wordPairSurrogates, corr)).ToList();
            }
        }
Example #2
        private bool IsRegular(WordPair wordPair, IWordAlignerResult alignerResult, Alignment <Word, ShapeNode> alignment, int column,
                               Ngram <Segment> v)
        {
            VarietyPair  vp      = wordPair.VarietyPair;
            SoundContext context = alignment.ToSoundContext(_segmentPool, 0, column, alignerResult.WordAligner.ContextualSoundClasses);
            FrequencyDistribution <Ngram <Segment> > freqDist = vp.CognateSoundCorrespondenceFrequencyDistribution[context];
            int threshold;

            if (AutomaticRegularCorrespondenceThreshold)
            {
                int seg2Count = vp.CognateSoundCorrespondenceFrequencyDistribution.Conditions
                                .Where(sc => sc.LeftEnvironment == context.LeftEnvironment && sc.RightEnvironment == context.RightEnvironment)
                                .Sum(sc => vp.CognateSoundCorrespondenceFrequencyDistribution[sc][v]);
                if (!_regularCorrespondenceThresholdTable.TryGetThreshold(vp.CognateCount, freqDist.SampleOutcomeCount, seg2Count,
                                                                          out threshold))
                {
                    threshold = DefaultRegularCorrespondenceThreshold;
                }
            }
            else
            {
                threshold = DefaultRegularCorrespondenceThreshold;
            }
            return(freqDist[v] >= threshold);
        }
Example #3
        public void Process(Variety data)
        {
            var posFreqDists = new Dictionary <FeatureSymbol, FrequencyDistribution <Segment> >
            {
                { CogFeatureSystem.Onset, new FrequencyDistribution <Segment>() },
                { CogFeatureSystem.Nucleus, new FrequencyDistribution <Segment>() },
                { CogFeatureSystem.Coda, new FrequencyDistribution <Segment>() }
            };

            var freqDist = new FrequencyDistribution <Segment>();

            foreach (Word word in data.Words)
            {
                foreach (ShapeNode node in word.Shape.Where(n => n.Type().IsOneOf(CogFeatureSystem.VowelType, CogFeatureSystem.ConsonantType)))
                {
                    Segment seg = _segmentPool.Get(node);
                    SymbolicFeatureValue pos;
                    if (node.Annotation.FeatureStruct.TryGetValue(CogFeatureSystem.SyllablePosition, out pos))
                    {
                        posFreqDists[(FeatureSymbol)pos].Increment(seg);
                    }
                    freqDist.Increment(seg);
                }
            }

            foreach (KeyValuePair <FeatureSymbol, FrequencyDistribution <Segment> > kvp in posFreqDists)
            {
                data.SyllablePositionSegmentFrequencyDistributions[kvp.Key] = kvp.Value;
            }

            data.SegmentFrequencyDistribution = freqDist;
        }
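
After Process runs, the per-position distributions can be read back off the variety. A short usage sketch (the variety instance is hypothetical; only members already shown in these examples are used):

        // Hypothetical usage after Process(variety) has run.
        FrequencyDistribution<Segment> onsets =
            variety.SyllablePositionSegmentFrequencyDistributions[CogFeatureSystem.Onset];
        foreach (Segment seg in onsets.ObservedSamples.OrderByDescending(s => onsets[s]).Take(5))
            Console.WriteLine("{0}: {1}", seg, onsets[seg]); // five most frequent onsets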
Example #4
        public void Calculate_Any_ShouldNotCreateEmptyGroups()
        {
            var numbers       = new[] { 10d };
            var unitUnderTest = new FrequencyDistribution(10);

            var distribution = unitUnderTest.Calculate(numbers);

            Assert.AreEqual(1, distribution.Keys.Count);
        }
Example #5
        public void Calculate_MultipleOccurrences_ShouldApportionCorrectly()
        {
            var numbers       = new[] { 9.9, 0, 1 };
            var unitUnderTest = new FrequencyDistribution(10);

            var distribution = unitUnderTest.Calculate(numbers);

            Assert.AreEqual(3, distribution[new Tuple <double, double>(0d, 10d)]);
        }
Example #6
        public void Calculate_HighDecimal_ShouldNotRoundUp()
        {
            var numbers       = new[] { 9.9d };
            var unitUnderTest = new FrequencyDistribution(10);

            var distribution = unitUnderTest.Calculate(numbers);

            Assert.AreEqual(1, distribution[new Tuple <double, double>(0d, 10d)]);
        }
Example #7
        public void Calculate_MinusNumbers_ShouldApportionCorrectly()
        {
            var numbers       = new[] { -10d };
            var unitUnderTest = new FrequencyDistribution(10);

            var distribution = unitUnderTest.Calculate(numbers);

            Assert.AreEqual(1, distribution[new Tuple <double, double>(-10d, 0d)]);
        }
Example #8
        public double GetProbability(TItem item, Ngram <TItem> context)
        {
            FrequencyDistribution <TItem> fd = _cfd[context];

            if (fd.SampleOutcomeCount == 0)
            {
                return(0);
            }
            return((double)fd[item] / fd.SampleOutcomeCount);
        }
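
The class these examples exercise is not shown. For following along, the members used by the examples above, Increment, the count indexer, ObservedSamples, and SampleOutcomeCount, can be approximated in a few lines. This is a minimal sketch inferred from the call sites, not the library's implementation (the collection-constructed variant with Keys/ContainsKey/FrequencyOf in Example #15 is a different class):

using System.Collections.Generic;

public class FrequencyDistribution<T>
{
    private readonly Dictionary<T, int> _counts = new Dictionary<T, int>();

    // Total number of samples observed (sum of all counts).
    public int SampleOutcomeCount { get; private set; }

    // Distinct samples that have been observed at least once.
    public ICollection<T> ObservedSamples => _counts.Keys;

    public void Increment(T sample) => Increment(sample, 1);

    public void Increment(T sample, int count)
    {
        _counts.TryGetValue(sample, out int cur);
        _counts[sample] = cur + count;
        SampleOutcomeCount += count;
    }

    // Count of a sample; zero if it was never observed.
    public int this[T sample] => _counts.TryGetValue(sample, out int c) ? c : 0;
}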
Example #9
        public VarietyPair ToVarietyPair(SegmentPool segmentPool, CogProject project)
        {
            var vp = new VarietyPair(project.Varieties[Variety1], project.Varieties[Variety2])
            {
                PhoneticSimilarityScore          = PhoneticSimilarityScore,
                LexicalSimilarityScore           = LexicalSimilarityScore,
                DefaultCorrespondenceProbability = DefaultCorrespondenceProbability
            };
            var wordPairs = new Dictionary <WordPairSurrogate, WordPair>();

            vp.WordPairs.AddRange(_wordPairs.Select(surrogate => wordPairs.GetValue(surrogate, () => surrogate.ToWordPair(project, vp))));
            var soundChanges = new ConditionalFrequencyDistribution <SoundContext, Ngram <Segment> >();

            foreach (KeyValuePair <SoundContextSurrogate, Tuple <string[], int>[]> fd in _soundChanges)
            {
                SoundContext ctxt = fd.Key.ToSoundContext(project, segmentPool);
                FrequencyDistribution <Ngram <Segment> > freqDist = soundChanges[ctxt];
                foreach (Tuple <string[], int> sample in fd.Value)
                {
                    Ngram <Segment> corr = sample.Item1 == null ? new Ngram <Segment>() : new Ngram <Segment>(sample.Item1.Select(segmentPool.GetExisting));
                    freqDist.Increment(corr, sample.Item2);
                }
            }
            vp.SoundChangeFrequencyDistribution = soundChanges;
            IWordAligner aligner       = project.WordAligners[ComponentIdentifiers.PrimaryWordAligner];
            int          segmentCount  = vp.Variety2.SegmentFrequencyDistribution.ObservedSamples.Count;
            int          possCorrCount = aligner.ExpansionCompressionEnabled ? (segmentCount * segmentCount) + segmentCount + 1 : segmentCount + 1;

            vp.SoundChangeProbabilityDistribution = new ConditionalProbabilityDistribution <SoundContext, Ngram <Segment> >(soundChanges,
                                                                                                                            (sc, freqDist) => new WittenBellProbabilityDistribution <Ngram <Segment> >(freqDist, possCorrCount));

            foreach (KeyValuePair <string, List <SoundCorrespondenceSurrogate> > kvp in _soundCorrespondenceCollections)
            {
                if (kvp.Value != null)
                {
                    FeatureSymbol pos = null;
                    switch (kvp.Key)
                    {
                    case "onset":
                        pos = CogFeatureSystem.Onset;
                        break;

                    case "nucleus":
                        pos = CogFeatureSystem.Nucleus;
                        break;

                    case "coda":
                        pos = CogFeatureSystem.Coda;
                        break;
                    }
                    vp.SoundCorrespondenceCollections[pos].AddRange(kvp.Value.Select(surrogate => surrogate.ToSoundCorrespondence(segmentPool, wordPairs)));
                }
            }
            return(vp);
        }
Example #10
        public void Calculate_BandingOfTwo_ShouldApportionCorrectly()
        {
            var numbers       = new[] { -1d, 0d, 1d, 2d, 3d };
            var unitUnderTest = new FrequencyDistribution(2);

            var distribution = unitUnderTest.Calculate(numbers);

            Assert.AreEqual(1, distribution[new Tuple <double, double>(-2d, 0d)]);
            Assert.AreEqual(2, distribution[new Tuple <double, double>(0d, 2d)]);
            Assert.AreEqual(2, distribution[new Tuple <double, double>(2d, 4d)]);
        }
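
Examples #4 through #7 and #10 test a different, non-generic FrequencyDistribution that bands doubles. The assertions pin down its behavior: each value is floored into a half-open band of the configured width, keyed by a (lower, upper) tuple. A minimal implementation consistent with those assertions (a sketch, not the code under test):

using System;
using System.Collections.Generic;

public class FrequencyDistribution
{
    private readonly double _bandWidth;

    public FrequencyDistribution(double bandWidth)
    {
        _bandWidth = bandWidth;
    }

    // Buckets each value into [lower, lower + width): with width 10,
    // 9.9 lands in the band keyed (0, 10) and -10 in (-10, 0),
    // matching the assertions above.
    public Dictionary<Tuple<double, double>, int> Calculate(IEnumerable<double> numbers)
    {
        var result = new Dictionary<Tuple<double, double>, int>();
        foreach (double n in numbers)
        {
            double lower = Math.Floor(n / _bandWidth) * _bandWidth;
            var band = Tuple.Create(lower, lower + _bandWidth);
            result.TryGetValue(band, out int count);
            result[band] = count + 1;
        }
        return result;
    }
}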
Example #11
        public static string ToDisplayName(this FrequencyDistribution distribution)
        {
            switch (distribution)
            {
            case FrequencyDistribution.Exponential: return("Exponential (-3 dB/Octave)");

            case FrequencyDistribution.Linear: return("Linear (White)");

            default:
                UnityEngine.Debug.LogError($"Unexpected FrequencyDistribution: {distribution}");
                return("");
            }
        }
Example #12
        public double GetProbability(TItem item, Ngram <TItem> context)
        {
            FrequencyDistribution <TItem> freqDist = _cfd[context];

            if (freqDist.ObservedSamples.Count == 0)
            {
                return(0);
            }

            double numer = freqDist[item] + (freqDist.ObservedSamples.Count * (_lowerOrderModel == null ? 1.0 / freqDist.ObservedSamples.Count
                                : _lowerOrderModel.GetProbability(item, context.SkipFirst(_dir))));
            double denom = freqDist.SampleOutcomeCount + freqDist.ObservedSamples.Count;

            return(numer / denom);
        }
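
Example #12 is Witten-Bell interpolation: with T observed types and N total samples in the context, P(item) = (c(item) + T * P_backoff(item)) / (N + T), so T / (N + T) of the probability mass goes to the backoff estimate (uniform 1/T when no lower-order model exists). A toy check of that arithmetic, with made-up counts and no library types:

// Toy check of the Witten-Bell arithmetic used above.
// Context has N = 10 samples over T = 4 distinct types; the item was seen
// 3 times and a (hypothetical) lower-order model gives it probability 0.2.
int c = 3, N = 10, T = 4;
double pLower = 0.2;
double p = (c + T * pLower) / (double)(N + T); // (3 + 0.8) / 14 ≈ 0.271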
Example #13
 public void FixtureSetUp()
 {
     _fd = new FrequencyDistribution<string>();
     _fd.Increment("a", 1);
     _fd.Increment("b", 1);
     _fd.Increment("c", 2);
     _fd.Increment("d", 3);
     _fd.Increment("e", 4);
     _fd.Increment("f", 4);
     _fd.Increment("g", 4);
     _fd.Increment("h", 5);
     _fd.Increment("i", 5);
     _fd.Increment("j", 6);
     _fd.Increment("k", 6);
     _fd.Increment("l", 6);
     _fd.Increment("m", 7);
     _fd.Increment("n", 7);
     _fd.Increment("o", 8);
     _fd.Increment("p", 9);
     _fd.Increment("q", 10);
 }
Example #15
        /// <summary>
        /// Gets the Modified N-Gram precision score.
        /// </summary>
        /// <param name="references">The collection of reference sentences.</param>
        /// <param name="candidate">The MT candidate.</param>
        /// <param name="grams">The number of grams (default is 2).</param>
        /// <returns>The precision as a double.</returns>
        public double ModifiedNGramPrecision(ICollection <string> references, string candidate, int grams = 2)
        {
            var count     = new FrequencyDistribution <string>(new NGramCollector(candidate, grams).Collect());
            var countClip = new Dictionary <string, int>();

            foreach (var word in count.Keys)
            {
                countClip[word] = 0;
                foreach (var reference in references)
                {
                    var dist = new FrequencyDistribution <string>(new NGramCollector(reference, grams).Collect());
                    if (dist.ContainsKey(word))
                    {
                        countClip[word] = dist.FrequencyOf(word);
                        break;
                    }
                }
            }

            return(countClip.Values.Sum() / (double)count.Values.Sum());
        }
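
With grams = 1 this reproduces the clipped-count example from the BLEU paper: a candidate of seven "the" tokens against a reference containing "the" twice scores 2/7. A hypothetical call (scorer stands for an instance of the containing class, and NGramCollector is assumed to tokenize on whitespace):

// Hypothetical usage; `scorer` is an instance of the class above.
var references = new List<string> { "the cat is on the mat" };
double p = scorer.ModifiedNGramPrecision(references, "the the the the the the the", grams: 1);
// count["the"] = 7, countClip["the"] = 2, so p = 2 / 7.0

Note one design choice: the break takes the count from the first reference containing the n-gram, whereas BLEU as published clips against the maximum count across all references.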
Example #16
        public void Export(Stream stream, IWordAligner aligner, VarietyPair varietyPair)
        {
            using (var writer = new StreamWriter(new NonClosingStreamWrapper(stream)))
            {
                writer.WriteLine("Similarity");
                writer.WriteLine("----------");
                writer.WriteLine("Lexical: {0:p}", varietyPair.LexicalSimilarityScore);
                writer.WriteLine("Phonetic: {0:p}", varietyPair.PhoneticSimilarityScore);
                writer.WriteLine();

                writer.WriteLine("Likely cognates");
                writer.WriteLine("--------------");
                WriteWordPairs(writer, aligner, varietyPair.WordPairs.Where(wp => wp.Cognacy));
                writer.WriteLine();

                writer.WriteLine("Likely non-cognates");
                writer.WriteLine("-------------------");
                WriteWordPairs(writer, aligner, varietyPair.WordPairs.Where(wp => !wp.Cognacy));
                writer.WriteLine();

                writer.WriteLine("Sound correspondences");
                writer.WriteLine("---------------------");
                bool first = true;
                foreach (SoundContext lhs in varietyPair.CognateSoundCorrespondenceProbabilityDistribution.Conditions)
                {
                    if (!first)
                    {
                        writer.WriteLine();
                    }
                    IProbabilityDistribution <Ngram <Segment> > probDist = varietyPair.CognateSoundCorrespondenceProbabilityDistribution[lhs];
                    FrequencyDistribution <Ngram <Segment> >    freqDist = varietyPair.CognateSoundCorrespondenceFrequencyDistribution[lhs];
                    writer.WriteLine(lhs.ToString());
                    foreach (var correspondence in freqDist.ObservedSamples.Select(corr => new { Segment = corr, Probability = probDist[corr], Frequency = freqDist[corr] }).OrderByDescending(corr => corr.Probability))
                    {
                        writer.WriteLine("{0}: {1:p}, {2}", correspondence.Segment, correspondence.Probability, correspondence.Frequency);
                    }
                    first = false;
                }
            }
        }
Example #17
        public double GetProbability(TItem item, Ngram <TItem> context)
        {
            FrequencyDistribution <TItem> freqDist = _cfd[context];

            if (freqDist.ObservedSamples.Count == 0)
            {
                return(0);
            }

            if (context.Length == 0)
            {
                return((double)freqDist[item] / freqDist.SampleOutcomeCount);
            }

            int count = freqDist[item];
            Tuple <int, int, int> bigN = _bigNs[context];
            double gamma = ((_discount1 * bigN.Item1) + (_discount2 * bigN.Item2) + (_discount3 * bigN.Item3)) / freqDist.SampleOutcomeCount;
            double d     = 0;

            if (count == 1)
            {
                d = _discount1;
            }
            else if (count == 2)
            {
                d = _discount2;
            }
            else if (count > 2)
            {
                d = _discount3;
            }

            double prob = (count - d) / freqDist.SampleOutcomeCount;

            return(prob + (gamma * _lowerOrderModel.GetProbability(item, context.SkipFirst(_dir))));
        }
Example #18
        protected IEnumerable <AffixInfo> ComputeAffixes(ICollection <TSeq> sequences, AffixType type)
        {
            var dir = Direction.LeftToRight;

            switch (type)
            {
            case AffixType.Prefix:
                dir = Direction.LeftToRight;
                break;

            case AffixType.Suffix:
                dir = Direction.RightToLeft;
                break;
            }

            var affixFreqDist = new ConditionalFrequencyDistribution <int, Ngram <TItem> >();
            var ngramFreqDist = new ConditionalFrequencyDistribution <int, Ngram <TItem> >();
            var itemFreqDist  = new FrequencyDistribution <TItem>();

            var affixes   = new Dictionary <Ngram <TItem>, AffixInfo>();
            var nullAffix = new AffixInfo(sequences.Count, new Ngram <TItem>());

            foreach (TSeq seq in sequences)
            {
                var wordNgram = new Ngram <TItem>(_syllablesSelector(seq).SelectMany(s => s));
                nullAffix.Stems.Add(wordNgram);
                foreach (TItem item in wordNgram)
                {
                    itemFreqDist.Increment(item);
                }
                if (wordNgram.Length <= 1)
                {
                    continue;
                }

                var items         = new List <TItem>();
                var syllableStart = new HashSet <int>();
                foreach (IEnumerable <TItem> syllable in _syllablesSelector(seq).Items(dir))
                {
                    items.AddRange(syllable.Items(dir));
                    syllableStart.Add(items.Count - 1);
                }
                var affix = new Ngram <TItem>();
                var stem  = new Ngram <TItem>(items, dir);
                for (int i = 0; i < Math.Min(MaxAffixLength + 1, items.Count); i++)
                {
                    affix = affix.Concat(items[i], dir);
                    affixFreqDist[affix.Length].Increment(affix);
                    if (i < items.Count - 1 && affix.Length <= MaxAffixLength)
                    {
                        AffixInfo ai = affixes.GetOrCreate(affix, () => new AffixInfo(sequences.Count, affix));
                        stem = stem.SkipFirst(dir);
                        ai.Stems.Add(stem);
                        if (syllableStart.Contains(i))
                        {
                            ai.SyllableBreakCount++;
                        }
                    }
                }

                for (int i = 0; i < items.Count; i++)
                {
                    var ngram = new Ngram <TItem>();
                    for (int j = i; j < Math.Min(MaxAffixLength + i, items.Count); j++)
                    {
                        ngram = ngram.Concat(items[j], dir);
                        ngramFreqDist[ngram.Length].Increment(ngram);
                    }
                }
            }

            var itemProbDist  = new MaxLikelihoodProbabilityDistribution <TItem>(itemFreqDist);
            var affixProbDist = new ConditionalProbabilityDistribution <int, Ngram <TItem> >(affixFreqDist, (c, fd) =>
            {
                if (c == 1)
                {
                    return(new MaxLikelihoodProbabilityDistribution <Ngram <TItem> >(fd));
                }
                int binCount;
                try
                {
                    binCount = checked ((int)Math.Pow(itemFreqDist.ObservedSamples.Count, c));
                }
                catch (OverflowException)
                {
                    binCount = int.MaxValue;
                }
                return(new WittenBellProbabilityDistribution <Ngram <TItem> >(fd, binCount));
            });
            var ngramProbDist = new ConditionalProbabilityDistribution <int, Ngram <TItem> >(ngramFreqDist, (c, fd) =>
            {
                if (c == 1)
                {
                    return(new MaxLikelihoodProbabilityDistribution <Ngram <TItem> >(fd));
                }
                int binCount;
                try
                {
                    binCount = checked ((int)Math.Pow(itemFreqDist.ObservedSamples.Count, c));
                }
                catch (OverflowException)
                {
                    binCount = int.MaxValue;
                }
                return(new WittenBellProbabilityDistribution <Ngram <TItem> >(fd, binCount));
            });

            foreach (AffixInfo affix in affixes.Values)
            {
                int freq = affixFreqDist[affix.Ngram.Length][affix.Ngram];

                var maxCurveItem = itemFreqDist.ObservedSamples.Select(item => new { Item = item, Curve = (double)affixFreqDist[affix.Ngram.Length + 1][affix.Ngram.Concat(item, dir)] / freq })
                                   .MaxBy(item => item.Curve);
                double curveDrop = (1 - maxCurveItem.Curve) / (1 - itemProbDist[maxCurveItem.Item]);

                double pw        = affixProbDist[affix.Ngram.Length][affix.Ngram];
                double npw       = ngramProbDist[affix.Ngram.Length][affix.Ngram];
                double randomAdj = npw == 0 ? 1.0 : pw / npw;

                double normalizedFreq = affix.Ngram.Length * Math.Log(freq);

                double syllableScore = AffixesOccurOnSyllableBoundaries ? (0.5 * ((double)affix.SyllableBreakCount / freq)) + 0.5 : 1.0;

                affix.ZScore = curveDrop * randomAdj * normalizedFreq * syllableScore;
                yield return(affix);
            }

            yield return(nullAffix);
        }
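
The score assembled at the end combines four signals: the drop in the successor curve after the affix, a randomness adjustment (affix probability relative to the raw n-gram probability), a length-weighted log frequency, and an optional syllable-boundary factor. A toy computation of that product, mirroring the formulas above with made-up values:

// Made-up values, mirroring the z-score arithmetic in ComputeAffixes.
double maxCurve = 0.4, itemProb = 0.1;                // successor-curve peak and its item's probability
double curveDrop = (1 - maxCurve) / (1 - itemProb);   // 0.6 / 0.9 ≈ 0.667
double pw = 0.02, npw = 0.005;                        // affix prob vs raw n-gram prob
double randomAdj = npw == 0 ? 1.0 : pw / npw;         // 4.0
int affixLength = 2, freq = 50;
double normalizedFreq = affixLength * Math.Log(freq); // ≈ 7.824
int syllableBreaks = 40;
double syllableScore = 0.5 * ((double)syllableBreaks / freq) + 0.5; // 0.9
double zScore = curveDrop * randomAdj * normalizedFreq * syllableScore; // ≈ 18.8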
Example #19
        /// <summary>
        /// Calculates the modified unigram precision score as described in section 2.1 of the paper.
        /// </summary>
        /// <param name="reference">The reference as a collector.</param>
        /// <param name="candidate">The candidate as a collector.</param>
        /// <returns>The Modified n-gram precision score.</returns>
        public double ModifiedUnigramPrecision(ICollector <string> reference, ICollector <string> candidate)
        {
            var referenceDist = new FrequencyDistribution <string>(reference.Collect());

            return(referenceDist.MostFrequentValue() / (double)candidate.Size());
        }
Example #20
        public void TestMostFrequentValue()
        {
            var dist = new FrequencyDistribution <string>("a a a a b c d e e ffff".Split(" "));

            Assert.AreEqual(4, dist.MostFrequentValue());
        }
Example #21
        public void Smooth(int ngramSize, TSeq[] sequences, Func <TSeq, IEnumerable <TItem> > itemsSelector, Direction dir, ConditionalFrequencyDistribution <Ngram <TItem>, TItem> cfd)
        {
            _cfd = cfd;
            _dir = dir;

            int totalN1 = 0, totalN2 = 0, totalN3 = 0, totalN4 = 0;

            _bigNs.Clear();
            foreach (Ngram <TItem> cond in cfd.Conditions)
            {
                int n1 = 0, n2 = 0, n3 = 0, n4 = 0;
                int nGreater = 0;
                FrequencyDistribution <TItem> freqDist = cfd[cond];
                foreach (TItem item in freqDist.ObservedSamples)
                {
                    if (freqDist[item] == 1)
                    {
                        n1++;
                    }
                    else if (freqDist[item] == 2)
                    {
                        n2++;
                    }
                    else if (freqDist[item] > 2)
                    {
                        if (freqDist[item] == 3)
                        {
                            n3++;
                        }
                        else if (freqDist[item] == 4)
                        {
                            n4++;
                        }
                        nGreater++;
                    }
                }

                totalN1 += n1;
                totalN2 += n2;
                totalN3 += n3;
                totalN4 += n4;

                _bigNs[cond] = Tuple.Create(n1, n2, nGreater);
            }

            _discount1 = 0;
            _discount2 = 0;
            _discount3 = 0;
            double y = 0;

            if (totalN1 > 0)
            {
                y          = (double)totalN1 / (totalN1 + (2 * totalN2));
                _discount1 = 1 - (2 * y * ((double)totalN2 / totalN1));
            }
            if (totalN2 > 0)
            {
                _discount2 = 2 - (3 * y * ((double)totalN3 / totalN2));
            }
            if (totalN3 > 0)
            {
                _discount3 = 3 - (4 * y * ((double)totalN4 / totalN3));
            }

            if (ngramSize > 1)
            {
                _lowerOrderModel = new NgramModel <TSeq, TItem>(ngramSize - 1, sequences, itemsSelector, dir, new ModifiedKneserNeySmoother <TSeq, TItem>());
            }
        }
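
The discounts at the end of Example #21 follow Chen and Goodman's modified Kneser-Ney estimates: Y = n1 / (n1 + 2*n2), D1 = 1 - 2Y*(n2/n1), D2 = 2 - 3Y*(n3/n2), D3+ = 3 - 4Y*(n4/n3), where nK is the number of types observed exactly K times. A toy computation with made-up totals:

// Made-up count-of-counts totals: nK = number of types seen exactly K times.
int n1 = 100, n2 = 40, n3 = 20, n4 = 10;
double y  = (double)n1 / (n1 + 2 * n2);    // 100 / 180 ≈ 0.556
double d1 = 1 - 2 * y * ((double)n2 / n1); // 1 - 2·0.556·0.4 ≈ 0.556
double d2 = 2 - 3 * y * ((double)n3 / n2); // 2 - 3·0.556·0.5 ≈ 1.167
double d3 = 3 - 4 * y * ((double)n4 / n3); // 3 - 4·0.556·0.5 ≈ 1.889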