public VarietyPairSurrogate(VarietyPair vp)
{
    Variety1 = vp.Variety1.Name;
    Variety2 = vp.Variety2.Name;
    // Deduplicate word-pair surrogates so that sound-correspondence surrogates can
    // refer to the same instances.
    var wordPairSurrogates = new Dictionary<WordPair, WordPairSurrogate>();
    _wordPairs = vp.WordPairs.Select(wp => wordPairSurrogates.GetValue(wp, () => new WordPairSurrogate(wp))).ToList();
    PhoneticSimilarityScore = vp.PhoneticSimilarityScore;
    LexicalSimilarityScore = vp.LexicalSimilarityScore;
    DefaultSoundCorrespondenceProbability = vp.DefaultSoundCorrespondenceProbability;
    _cognateSoundCorrespondenceFrequencyDistribution = new Dictionary<SoundContextSurrogate, Tuple<string[], int>[]>();
    foreach (SoundContext lhs in vp.CognateSoundCorrespondenceFrequencyDistribution.Conditions)
    {
        FrequencyDistribution<Ngram<Segment>> freqDist = vp.CognateSoundCorrespondenceFrequencyDistribution[lhs];
        _cognateSoundCorrespondenceFrequencyDistribution[new SoundContextSurrogate(lhs)] = freqDist.ObservedSamples
            .Select(ngram => Tuple.Create(ngram.Select(seg => seg.StrRep).ToArray(), freqDist[ngram]))
            .ToArray();
    }
    _cognateSoundCorrespondenceByPosition = new Dictionary<string, List<SoundCorrespondenceSurrogate>>();
    foreach (KeyValuePair<FeatureSymbol, SoundCorrespondenceCollection> kvp in vp.CognateSoundCorrespondencesByPosition)
    {
        string pos;
        if (kvp.Key == CogFeatureSystem.Onset)
            pos = "onset";
        else if (kvp.Key == CogFeatureSystem.Nucleus)
            pos = "nucleus";
        else
            pos = "coda";
        _cognateSoundCorrespondenceByPosition[pos] = kvp.Value.Select(corr => new SoundCorrespondenceSurrogate(wordPairSurrogates, corr)).ToList();
    }
}
private bool IsRegular(WordPair wordPair, IWordAlignerResult alignerResult, Alignment<Word, ShapeNode> alignment, int column, Ngram<Segment> v)
{
    VarietyPair vp = wordPair.VarietyPair;
    SoundContext context = alignment.ToSoundContext(_segmentPool, 0, column, alignerResult.WordAligner.ContextualSoundClasses);
    FrequencyDistribution<Ngram<Segment>> freqDist = vp.CognateSoundCorrespondenceFrequencyDistribution[context];
    int threshold;
    if (AutomaticRegularCorrespondenceThreshold)
    {
        // Total frequency of this correspondence across all contexts that share the
        // same left and right environments.
        int seg2Count = vp.CognateSoundCorrespondenceFrequencyDistribution.Conditions
            .Where(sc => sc.LeftEnvironment == context.LeftEnvironment && sc.RightEnvironment == context.RightEnvironment)
            .Sum(sc => vp.CognateSoundCorrespondenceFrequencyDistribution[sc][v]);
        if (!_regularCorrespondenceThresholdTable.TryGetThreshold(vp.CognateCount, freqDist.SampleOutcomeCount, seg2Count, out threshold))
            threshold = DefaultRegularCorrespondenceThreshold;
    }
    else
    {
        threshold = DefaultRegularCorrespondenceThreshold;
    }
    // A correspondence counts as regular if it has been observed at least
    // threshold times in this context.
    return freqDist[v] >= threshold;
}
public void Process(Variety data)
{
    var posFreqDists = new Dictionary<FeatureSymbol, FrequencyDistribution<Segment>>
    {
        { CogFeatureSystem.Onset, new FrequencyDistribution<Segment>() },
        { CogFeatureSystem.Nucleus, new FrequencyDistribution<Segment>() },
        { CogFeatureSystem.Coda, new FrequencyDistribution<Segment>() }
    };
    var freqDist = new FrequencyDistribution<Segment>();
    foreach (Word word in data.Words)
    {
        foreach (ShapeNode node in word.Shape.Where(n => n.Type().IsOneOf(CogFeatureSystem.VowelType, CogFeatureSystem.ConsonantType)))
        {
            Segment seg = _segmentPool.Get(node);
            SymbolicFeatureValue pos;
            if (node.Annotation.FeatureStruct.TryGetValue(CogFeatureSystem.SyllablePosition, out pos))
                posFreqDists[(FeatureSymbol)pos].Increment(seg);
            freqDist.Increment(seg);
        }
    }
    foreach (KeyValuePair<FeatureSymbol, FrequencyDistribution<Segment>> kvp in posFreqDists)
        data.SyllablePositionSegmentFrequencyDistributions[kvp.Key] = kvp.Value;
    data.SegmentFrequencyDistribution = freqDist;
}
public void Calculate_Any_ShouldNotCreateEmptyGroups()
{
    var numbers = new[] { 10d };
    var unitUnderTest = new FrequencyDistribution(10);
    var distribution = unitUnderTest.Calculate(numbers);
    Assert.AreEqual(1, distribution.Keys.Count);
}
public void Calculate_MultipleOccurrences_ShouldApportionCorrectly()
{
    var numbers = new[] { 9.9, 0, 1 };
    var unitUnderTest = new FrequencyDistribution(10);
    var distribution = unitUnderTest.Calculate(numbers);
    Assert.AreEqual(3, distribution[new Tuple<double, double>(0d, 10d)]);
}
public void Calculate_HighDecimal_ShouldNotRoundUp()
{
    var numbers = new[] { 9.9d };
    var unitUnderTest = new FrequencyDistribution(10);
    var distribution = unitUnderTest.Calculate(numbers);
    Assert.AreEqual(1, distribution[new Tuple<double, double>(0d, 10d)]);
}
public void Calculate_MinusNumbers_ShouldApportionCorrectly()
{
    var numbers = new[] { -10d };
    var unitUnderTest = new FrequencyDistribution(10);
    var distribution = unitUnderTest.Calculate(numbers);
    Assert.AreEqual(1, distribution[new Tuple<double, double>(-10d, 0d)]);
}
public double GetProbability(TItem item, Ngram<TItem> context)
{
    FrequencyDistribution<TItem> fd = _cfd[context];
    if (fd.SampleOutcomeCount == 0)
        return 0;
    return (double)fd[item] / fd.SampleOutcomeCount;
}
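// A minimal usage sketch (illustrative, not from any of the sources above) of the
// maximum-likelihood estimate computed by GetProbability: an item's probability is its
// count divided by the total number of observed outcomes. Only members that appear
// elsewhere in this file (Increment, the indexer, SampleOutcomeCount) are used; the
// method name is hypothetical.
public double MleExample()
{
    var fd = new FrequencyDistribution<string>();
    fd.Increment("a", 3); // "a" observed three times
    fd.Increment("b", 1); // "b" observed once
    return (double)fd["a"] / fd.SampleOutcomeCount; // 3 / 4 = 0.75
}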
public VarietyPair ToVarietyPair(SegmentPool segmentPool, CogProject project)
{
    var vp = new VarietyPair(project.Varieties[Variety1], project.Varieties[Variety2])
    {
        PhoneticSimilarityScore = PhoneticSimilarityScore,
        LexicalSimilarityScore = LexicalSimilarityScore,
        DefaultCorrespondenceProbability = DefaultCorrespondenceProbability
    };
    var wordPairs = new Dictionary<WordPairSurrogate, WordPair>();
    vp.WordPairs.AddRange(_wordPairs.Select(surrogate => wordPairs.GetValue(surrogate, () => surrogate.ToWordPair(project, vp))));
    var soundChanges = new ConditionalFrequencyDistribution<SoundContext, Ngram<Segment>>();
    foreach (KeyValuePair<SoundContextSurrogate, Tuple<string[], int>[]> fd in _soundChanges)
    {
        SoundContext ctxt = fd.Key.ToSoundContext(project, segmentPool);
        FrequencyDistribution<Ngram<Segment>> freqDist = soundChanges[ctxt];
        foreach (Tuple<string[], int> sample in fd.Value)
        {
            Ngram<Segment> corr = sample.Item1 == null
                ? new Ngram<Segment>()
                : new Ngram<Segment>(sample.Item1.Select(segmentPool.GetExisting));
            freqDist.Increment(corr, sample.Item2);
        }
    }
    vp.SoundChangeFrequencyDistribution = soundChanges;
    IWordAligner aligner = project.WordAligners[ComponentIdentifiers.PrimaryWordAligner];
    // The Witten-Bell bin count is the number of possible correspondences for a segment:
    // with expansion/compression enabled, every segment pair, single segment, or deletion
    // (n^2 + n + 1); otherwise, every single segment or deletion (n + 1).
    int segmentCount = vp.Variety2.SegmentFrequencyDistribution.ObservedSamples.Count;
    int possCorrCount = aligner.ExpansionCompressionEnabled
        ? (segmentCount * segmentCount) + segmentCount + 1
        : segmentCount + 1;
    vp.SoundChangeProbabilityDistribution = new ConditionalProbabilityDistribution<SoundContext, Ngram<Segment>>(soundChanges,
        (sc, freqDist) => new WittenBellProbabilityDistribution<Ngram<Segment>>(freqDist, possCorrCount));
    foreach (KeyValuePair<string, List<SoundCorrespondenceSurrogate>> kvp in _soundCorrespondenceCollections)
    {
        if (kvp.Value != null)
        {
            FeatureSymbol pos = null;
            switch (kvp.Key)
            {
                case "onset":
                    pos = CogFeatureSystem.Onset;
                    break;
                case "nucleus":
                    pos = CogFeatureSystem.Nucleus;
                    break;
                case "coda":
                    pos = CogFeatureSystem.Coda;
                    break;
            }
            vp.SoundCorrespondenceCollections[pos].AddRange(kvp.Value.Select(surrogate => surrogate.ToSoundCorrespondence(segmentPool, wordPairs)));
        }
    }
    return vp;
}
public void Calculate_BandingOfTwo_ShouldApportionCorrectly()
{
    var numbers = new[] { -1d, 0d, 1d, 2d, 3d };
    var unitUnderTest = new FrequencyDistribution(2);
    var distribution = unitUnderTest.Calculate(numbers);
    Assert.AreEqual(1, distribution[new Tuple<double, double>(-2d, 0d)]);
    Assert.AreEqual(2, distribution[new Tuple<double, double>(0d, 2d)]);
    Assert.AreEqual(2, distribution[new Tuple<double, double>(2d, 4d)]);
}
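// A minimal sketch (hypothetical, not the implementation under test) of a banding
// FrequencyDistribution consistent with the tests above: each value falls into the
// half-open band [lower, lower + bandSize) with lower = floor(value / bandSize) * bandSize,
// bands are keyed as Tuple<double, double>(lower, upper), and empty bands are never created.
public class FrequencyDistribution
{
    private readonly double _bandSize;

    public FrequencyDistribution(double bandSize)
    {
        _bandSize = bandSize;
    }

    public Dictionary<Tuple<double, double>, int> Calculate(IEnumerable<double> numbers)
    {
        var distribution = new Dictionary<Tuple<double, double>, int>();
        foreach (double number in numbers)
        {
            // floor() keeps negative values in the band below zero, so -10 lands in [-10, 0).
            double lower = Math.Floor(number / _bandSize) * _bandSize;
            var band = Tuple.Create(lower, lower + _bandSize);
            int count;
            distribution[band] = distribution.TryGetValue(band, out count) ? count + 1 : 1;
        }
        return distribution;
    }
}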
public static string ToDisplayName(this FrequencyDistribution distribution)
{
    switch (distribution)
    {
        case FrequencyDistribution.Exponential:
            return "Exponential (-3 dB/Octave)";
        case FrequencyDistribution.Linear:
            return "Linear (White)";
        default:
            UnityEngine.Debug.LogError($"Unexpected FrequencyDistribution: {distribution}");
            return "";
    }
}
public double GetProbability(TItem item, Ngram<TItem> context)
{
    FrequencyDistribution<TItem> freqDist = _cfd[context];
    if (freqDist.ObservedSamples.Count == 0)
        return 0;
    double numer = freqDist[item] + (freqDist.ObservedSamples.Count
        * (_lowerOrderModel == null
            ? 1.0 / freqDist.ObservedSamples.Count
            : _lowerOrderModel.GetProbability(item, context.SkipFirst(_dir))));
    double denom = freqDist.SampleOutcomeCount + freqDist.ObservedSamples.Count;
    return numer / denom;
}
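// In the Witten-Bell estimate above, with T = freqDist.ObservedSamples.Count (distinct
// types observed after the context) and N = freqDist.SampleOutcomeCount (total tokens):
//
//     P(w | h) = (c(h, w) + T * P_backoff(w | h')) / (N + T)
//
// i.e. T units of probability mass are reserved for unseen events and distributed
// according to the lower-order model, or uniformly (1/T) when no lower-order model exists.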
public void FixtureSetUp()
{
    _fd = new FrequencyDistribution<string>();
    _fd.Increment("a", 1);
    _fd.Increment("b", 1);
    _fd.Increment("c", 2);
    _fd.Increment("d", 3);
    _fd.Increment("e", 4);
    _fd.Increment("f", 4);
    _fd.Increment("g", 4);
    _fd.Increment("h", 5);
    _fd.Increment("i", 5);
    _fd.Increment("j", 6);
    _fd.Increment("k", 6);
    _fd.Increment("l", 6);
    _fd.Increment("m", 7);
    _fd.Increment("n", 7);
    _fd.Increment("o", 8);
    _fd.Increment("p", 9);
    _fd.Increment("q", 10);
}
/// <summary>
/// Gets the modified n-gram precision score.
/// </summary>
/// <param name="references">The collection of reference sentences.</param>
/// <param name="candidate">The MT candidate.</param>
/// <param name="grams">The number of grams (default is 2).</param>
/// <returns>The precision as a double.</returns>
public double ModifiedNGramPrecision(ICollection<string> references, string candidate, int grams = 2)
{
    var count = new FrequencyDistribution<string>(new NGramCollector(candidate, grams).Collect());
    var countClip = new Dictionary<string, int>();
    foreach (var word in count.Keys)
    {
        // Clip each candidate n-gram count at the largest count observed for it in any
        // single reference (Count_clip in the BLEU paper), so precision never exceeds 1.
        int maxRefCount = 0;
        foreach (var reference in references)
        {
            var dist = new FrequencyDistribution<string>(new NGramCollector(reference, grams).Collect());
            if (dist.ContainsKey(word))
                maxRefCount = Math.Max(maxRefCount, dist.FrequencyOf(word));
        }
        countClip[word] = Math.Min(count.FrequencyOf(word), maxRefCount);
    }
    return countClip.Values.Sum() / (double)count.Values.Sum();
}
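// A hedged usage sketch reproducing the degenerate example from section 2.1 of the BLEU
// paper, assuming this lives in the same class as ModifiedNGramPrecision and that
// NGramCollector tokenizes on whitespace: every candidate unigram is "the", its clipped
// count is min(7, 2) = 2, so the modified unigram precision is 2/7.
public double ModifiedPrecisionExample()
{
    var references = new List<string>
    {
        "the cat is on the mat",
        "there is a cat on the mat"
    };
    return ModifiedNGramPrecision(references, "the the the the the the the", grams: 1); // 2.0 / 7.0
}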
public void Export(Stream stream, IWordAligner aligner, VarietyPair varietyPair)
{
    using (var writer = new StreamWriter(new NonClosingStreamWrapper(stream)))
    {
        writer.WriteLine("Similarity");
        writer.WriteLine("----------");
        writer.WriteLine("Lexical: {0:p}", varietyPair.LexicalSimilarityScore);
        writer.WriteLine("Phonetic: {0:p}", varietyPair.PhoneticSimilarityScore);
        writer.WriteLine();
        writer.WriteLine("Likely cognates");
        writer.WriteLine("---------------");
        WriteWordPairs(writer, aligner, varietyPair.WordPairs.Where(wp => wp.Cognacy));
        writer.WriteLine();
        writer.WriteLine("Likely non-cognates");
        writer.WriteLine("-------------------");
        WriteWordPairs(writer, aligner, varietyPair.WordPairs.Where(wp => !wp.Cognacy));
        writer.WriteLine();
        writer.WriteLine("Sound correspondences");
        writer.WriteLine("---------------------");
        bool first = true;
        foreach (SoundContext lhs in varietyPair.CognateSoundCorrespondenceProbabilityDistribution.Conditions)
        {
            if (!first)
                writer.WriteLine();
            IProbabilityDistribution<Ngram<Segment>> probDist = varietyPair.CognateSoundCorrespondenceProbabilityDistribution[lhs];
            FrequencyDistribution<Ngram<Segment>> freqDist = varietyPair.CognateSoundCorrespondenceFrequencyDistribution[lhs];
            writer.WriteLine(lhs.ToString());
            // List every observed correspondence with its probability and frequency,
            // most probable first.
            foreach (var correspondence in freqDist.ObservedSamples
                .Select(corr => new { Segment = corr, Probability = probDist[corr], Frequency = freqDist[corr] })
                .OrderByDescending(corr => corr.Probability))
            {
                writer.WriteLine("{0}: {1:p}, {2}", correspondence.Segment, correspondence.Probability, correspondence.Frequency);
            }
            first = false;
        }
    }
}
public double GetProbability(TItem item, Ngram<TItem> context)
{
    FrequencyDistribution<TItem> freqDist = _cfd[context];
    if (freqDist.ObservedSamples.Count == 0)
        return 0;
    if (context.Length == 0)
        return (double)freqDist[item] / freqDist.SampleOutcomeCount;
    int count = freqDist[item];
    Tuple<int, int, int> bigN = _bigNs[context];
    double gamma = ((_discount1 * bigN.Item1) + (_discount2 * bigN.Item2) + (_discount3 * bigN.Item3)) / freqDist.SampleOutcomeCount;
    double d = 0;
    if (count == 1)
        d = _discount1;
    else if (count == 2)
        d = _discount2;
    else if (count > 2)
        d = _discount3;
    double prob = (count - d) / freqDist.SampleOutcomeCount;
    return prob + (gamma * _lowerOrderModel.GetProbability(item, context.SkipFirst(_dir)));
}
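// The interpolated modified Kneser-Ney estimate computed above, with N = SampleOutcomeCount:
//
//     P(w | h) = (c(h, w) - D(c)) / N + gamma(h) * P_lower(w | h')
//     gamma(h) = (D1 * n1(h) + D2 * n2(h) + D3 * n3+(h)) / N
//
// where D(c) is _discount1, _discount2, or _discount3 for counts of 1, 2, and 3 or more,
// and n1, n2, n3+ are the per-context counts-of-counts stored in _bigNs. The discounted
// mass gamma(h) is redistributed through the lower-order model.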
protected IEnumerable<AffixInfo> ComputeAffixes(ICollection<TSeq> sequences, AffixType type)
{
    // Prefixes are scanned left-to-right, suffixes right-to-left.
    var dir = Direction.LeftToRight;
    switch (type)
    {
        case AffixType.Prefix:
            dir = Direction.LeftToRight;
            break;
        case AffixType.Suffix:
            dir = Direction.RightToLeft;
            break;
    }
    var affixFreqDist = new ConditionalFrequencyDistribution<int, Ngram<TItem>>();
    var ngramFreqDist = new ConditionalFrequencyDistribution<int, Ngram<TItem>>();
    var itemFreqDist = new FrequencyDistribution<TItem>();
    var affixes = new Dictionary<Ngram<TItem>, AffixInfo>();
    var nullAffix = new AffixInfo(sequences.Count, new Ngram<TItem>());
    foreach (TSeq seq in sequences)
    {
        var wordNgram = new Ngram<TItem>(_syllablesSelector(seq).SelectMany(s => s));
        nullAffix.Stems.Add(wordNgram);
        foreach (TItem item in wordNgram)
            itemFreqDist.Increment(item);
        if (wordNgram.Length <= 1)
            continue;
        var items = new List<TItem>();
        var syllableStart = new HashSet<int>();
        foreach (IEnumerable<TItem> syllable in _syllablesSelector(seq).Items(dir))
        {
            items.AddRange(syllable.Items(dir));
            syllableStart.Add(items.Count - 1);
        }
        // Count every word-edge affix candidate up to MaxAffixLength and record the
        // stem that remains when it is stripped off.
        var affix = new Ngram<TItem>();
        var stem = new Ngram<TItem>(items, dir);
        for (int i = 0; i < Math.Min(MaxAffixLength + 1, items.Count); i++)
        {
            affix = affix.Concat(items[i], dir);
            affixFreqDist[affix.Length].Increment(affix);
            if (i < items.Count - 1 && affix.Length <= MaxAffixLength)
            {
                AffixInfo ai = affixes.GetOrCreate(affix, () => new AffixInfo(sequences.Count, affix));
                stem = stem.SkipFirst(dir);
                ai.Stems.Add(stem);
                if (syllableStart.Contains(i))
                    ai.SyllableBreakCount++;
            }
        }
        // Count every n-gram up to MaxAffixLength at every position, for comparison
        // against the word-edge counts.
        for (int i = 0; i < items.Count; i++)
        {
            var ngram = new Ngram<TItem>();
            for (int j = i; j < Math.Min(MaxAffixLength + i, items.Count); j++)
            {
                ngram = ngram.Concat(items[j], dir);
                ngramFreqDist[ngram.Length].Increment(ngram);
            }
        }
    }
    var itemProbDist = new MaxLikelihoodProbabilityDistribution<TItem>(itemFreqDist);
    var affixProbDist = new ConditionalProbabilityDistribution<int, Ngram<TItem>>(affixFreqDist, (c, fd) =>
    {
        if (c == 1)
            return new MaxLikelihoodProbabilityDistribution<Ngram<TItem>>(fd);
        int binCount;
        try
        {
            binCount = checked((int)Math.Pow(itemFreqDist.ObservedSamples.Count, c));
        }
        catch (OverflowException)
        {
            binCount = int.MaxValue;
        }
        return new WittenBellProbabilityDistribution<Ngram<TItem>>(fd, binCount);
    });
    var ngramProbDist = new ConditionalProbabilityDistribution<int, Ngram<TItem>>(ngramFreqDist, (c, fd) =>
    {
        if (c == 1)
            return new MaxLikelihoodProbabilityDistribution<Ngram<TItem>>(fd);
        int binCount;
        try
        {
            binCount = checked((int)Math.Pow(itemFreqDist.ObservedSamples.Count, c));
        }
        catch (OverflowException)
        {
            binCount = int.MaxValue;
        }
        return new WittenBellProbabilityDistribution<Ngram<TItem>>(fd, binCount);
    });
    foreach (AffixInfo affix in affixes.Values)
    {
        int freq = affixFreqDist[affix.Ngram.Length][affix.Ngram];
        var maxCurveItem = itemFreqDist.ObservedSamples
            .Select(item => new { Item = item, Curve = (double)affixFreqDist[affix.Ngram.Length + 1][affix.Ngram.Concat(item, dir)] / freq })
            .MaxBy(item => item.Curve);
        double curveDrop = (1 - maxCurveItem.Curve) / (1 - itemProbDist[maxCurveItem.Item]);
        double pw = affixProbDist[affix.Ngram.Length][affix.Ngram];
        double npw = ngramProbDist[affix.Ngram.Length][affix.Ngram];
        double randomAdj = npw == 0 ? 1.0 : pw / npw;
        double normalizedFreq = affix.Ngram.Length * Math.Log(freq);
        double syllableScore = AffixesOccurOnSyllableBoundaries
            ? (0.5 * ((double)affix.SyllableBreakCount / freq)) + 0.5
            : 1.0;
        affix.ZScore = curveDrop * randomAdj * normalizedFreq * syllableScore;
        yield return affix;
    }
    yield return nullAffix;
}
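// The affix score above is the product of four heuristics: curveDrop measures how sharply
// the best successor's relative frequency falls at the proposed affix boundary; randomAdj
// (pw / npw) measures how much more probable the string is as a word-edge affix than as an
// arbitrary n-gram of the same length; normalizedFreq (length * log(frequency)) favors long,
// frequent affixes; and syllableScore, when AffixesOccurOnSyllableBoundaries is set, rewards
// affixes whose boundary coincides with a syllable break.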
/// <summary>
/// Calculates the modified unigram precision score as described in section 2.1 of the paper.
/// Note that this shortcut is only valid for the paper's degenerate example, where the
/// candidate consists entirely of repetitions of the reference's most frequent unigram.
/// </summary>
/// <param name="reference">The reference as a collector.</param>
/// <param name="candidate">The candidate as a collector.</param>
/// <returns>The modified unigram precision score.</returns>
public double ModifiedUnigramPrecision(ICollector<string> reference, ICollector<string> candidate)
{
    var referenceDist = new FrequencyDistribution<string>(reference.Collect());
    return referenceDist.MostFrequentValue() / (double)candidate.Size();
}
public void TestMostFrequentValue()
{
    var dist = new FrequencyDistribution<string>("a a a a b c d e e ffff".Split(' '));
    Assert.AreEqual(4, dist.MostFrequentValue());
}
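// A plausible sketch (an assumption, not the tested source) of MostFrequentValue,
// consistent with the test above and with its use in ModifiedUnigramPrecision: it
// returns the highest frequency in the distribution, not the sample that has it.
public int MostFrequentValue()
{
    return Values.Max(); // e.g. 4 for "a a a a b c d e e ffff"
}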
public void Smooth(int ngramSize, TSeq[] sequences, Func<TSeq, IEnumerable<TItem>> itemsSelector, Direction dir,
    ConditionalFrequencyDistribution<Ngram<TItem>, TItem> cfd)
{
    _cfd = cfd;
    _dir = dir;
    int totalN1 = 0, totalN2 = 0, totalN3 = 0, totalN4 = 0;
    _bigNs.Clear();
    foreach (Ngram<TItem> cond in cfd.Conditions)
    {
        // Counts-of-counts for this context: n1-n4 count items seen exactly one to
        // four times; nGreater counts items seen more than twice.
        int n1 = 0, n2 = 0, n3 = 0, n4 = 0;
        int nGreater = 0;
        FrequencyDistribution<TItem> freqDist = cfd[cond];
        foreach (TItem item in freqDist.ObservedSamples)
        {
            if (freqDist[item] == 1)
            {
                n1++;
            }
            else if (freqDist[item] == 2)
            {
                n2++;
            }
            else if (freqDist[item] > 2)
            {
                if (freqDist[item] == 3)
                    n3++;
                else if (freqDist[item] == 4)
                    n4++;
                nGreater++;
            }
        }
        totalN1 += n1;
        totalN2 += n2;
        totalN3 += n3;
        totalN4 += n4;
        _bigNs[cond] = Tuple.Create(n1, n2, nGreater);
    }
    _discount1 = 0;
    _discount2 = 0;
    _discount3 = 0;
    double y = 0;
    if (totalN1 > 0)
    {
        y = (double)totalN1 / (totalN1 + (2 * totalN2));
        _discount1 = 1 - (2 * y * ((double)totalN2 / totalN1));
    }
    if (totalN2 > 0)
        _discount2 = 2 - (3 * y * ((double)totalN3 / totalN2));
    if (totalN3 > 0)
        _discount3 = 3 - (4 * y * ((double)totalN4 / totalN3));
    if (ngramSize > 1)
    {
        _lowerOrderModel = new NgramModel<TSeq, TItem>(ngramSize - 1, sequences, itemsSelector, dir,
            new ModifiedKneserNeySmoother<TSeq, TItem>());
    }
}
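// The closed-form discounts computed in Smooth are Chen and Goodman's (1998) estimates
// for modified Kneser-Ney smoothing:
//
//     Y  = n1 / (n1 + 2 * n2)
//     D1 = 1 - 2 * Y * (n2 / n1)
//     D2 = 2 - 3 * Y * (n3 / n2)
//     D3 = 3 - 4 * Y * (n4 / n3)
//
// where nk is the total number of n-grams observed exactly k times across all contexts.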