public void NoSamples()
{
	// An empty frequency distribution has no observed samples, so the
	// maximum-likelihood estimate for any sample must be zero.
	var emptyDist = new MaxLikelihoodProbabilityDistribution<string>(new FrequencyDistribution<string>());

	Assert.That(emptyDist["a"], Is.EqualTo(0));
	Assert.That(emptyDist["b"], Is.EqualTo(0));
	Assert.That(emptyDist["c"], Is.EqualTo(0));
}
public void NoSamples()
{
	// With no observations at all, every queried sample's probability is 0.
	var ml = new MaxLikelihoodProbabilityDistribution<string>(new FrequencyDistribution<string>());
	foreach (string sample in new[] { "a", "b", "c" })
		Assert.That(ml[sample], Is.EqualTo(0));
}
public void Probability()
{
	// Maximum-likelihood estimates over the shared fixture distribution (_fd):
	// observed samples get count/total, unobserved samples get exactly 0.
	var dist = new MaxLikelihoodProbabilityDistribution<string>(_fd);

	Assert.That(dist["a"], Is.EqualTo(0.01136).Within(0.00001));
	Assert.That(dist["c"], Is.EqualTo(0.02272).Within(0.00001));
	Assert.That(dist["d"], Is.EqualTo(0.03409).Within(0.00001));
	Assert.That(dist["o"], Is.EqualTo(0.09090).Within(0.00001));
	Assert.That(dist["q"], Is.EqualTo(0.11363).Within(0.00001));
	Assert.That(dist["t"], Is.EqualTo(0));
	Assert.That(dist["z"], Is.EqualTo(0));
}
public void Probability()
{
	// Observed samples should match count/total from the fixture distribution;
	// unobserved samples ("t", "z") get exactly zero probability.
	var ml = new MaxLikelihoodProbabilityDistribution<string>(_fd);

	(string Sample, double Expected)[] observed =
	{
		("a", 0.01136),
		("c", 0.02272),
		("d", 0.03409),
		("o", 0.09090),
		("q", 0.11363)
	};
	foreach ((string sample, double expected) in observed)
		Assert.That(ml[sample], Is.EqualTo(expected).Within(0.00001));

	Assert.That(ml["t"], Is.EqualTo(0));
	Assert.That(ml["z"], Is.EqualTo(0));
}
/// <summary>
/// Enumerates candidate affixes found in <paramref name="sequences"/>, each scored
/// via its <c>ZScore</c> property, followed by a final "null" (empty) affix whose
/// stems are all of the words. Prefixes are gathered scanning words left-to-right,
/// suffixes right-to-left.
/// </summary>
/// <param name="sequences">The word sequences to analyze.</param>
/// <param name="type">Whether to look for prefixes or suffixes.</param>
/// <returns>All scored affix candidates; the null affix is yielded last.</returns>
protected IEnumerable<AffixInfo> ComputeAffixes(ICollection<TSeq> sequences, AffixType type)
{
	// Scan direction follows the affix type: prefixes are read from the front of
	// the word, suffixes from the back.
	var dir = Direction.LeftToRight;
	switch (type)
	{
		case AffixType.Prefix:
			dir = Direction.LeftToRight;
			break;
		case AffixType.Suffix:
			dir = Direction.RightToLeft;
			break;
	}

	// Frequency of each candidate affix, conditioned on affix length.
	var affixFreqDist = new ConditionalFrequencyDistribution<int, Ngram<TItem>>();
	// Frequency of every word-internal n-gram (any position), conditioned on length;
	// used below to estimate how often an affix string occurs "by chance".
	var ngramFreqDist = new ConditionalFrequencyDistribution<int, Ngram<TItem>>();
	// Unigram frequencies of individual items.
	var itemFreqDist = new FrequencyDistribution<TItem>();
	var affixes = new Dictionary<Ngram<TItem>, AffixInfo>();
	// The empty affix: every word counts as one of its stems.
	var nullAffix = new AffixInfo(sequences.Count, new Ngram<TItem>());

	foreach (TSeq seq in sequences)
	{
		// Flatten the word's syllables into one n-gram of items.
		var wordNgram = new Ngram<TItem>(_syllablesSelector(seq).SelectMany(s => s));
		nullAffix.Stems.Add(wordNgram);
		foreach (TItem item in wordNgram)
		{
			itemFreqDist.Increment(item);
		}

		// A single-item word cannot be split into affix + non-empty stem.
		if (wordNgram.Length <= 1)
		{
			continue;
		}

		// Re-read the word in scan order, remembering the item indices at which a
		// syllable ends (used for the syllable-boundary score below).
		var items = new List<TItem>();
		var syllableStart = new HashSet<int>();
		foreach (IEnumerable<TItem> syllable in _syllablesSelector(seq).Items(dir))
		{
			items.AddRange(syllable.Items(dir));
			syllableStart.Add(items.Count - 1);
		}

		// Grow the candidate affix one item at a time from the scan-order start of
		// the word, shrinking the corresponding stem in step.
		var affix = new Ngram<TItem>();
		var stem = new Ngram<TItem>(items, dir);
		for (int i = 0; i < Math.Min(MaxAffixLength + 1, items.Count); i++)
		{
			affix = affix.Concat(items[i], dir);
			affixFreqDist[affix.Length].Increment(affix);
			// Record the candidate only while a non-empty stem remains and the affix
			// is within MaxAffixLength. (Counts of length MaxAffixLength + 1 are still
			// accumulated above, since the curve-drop computation below looks up
			// affixes extended by one item.)
			if (i < items.Count - 1 && affix.Length <= MaxAffixLength)
			{
				AffixInfo ai = affixes.GetOrCreate(affix, () => new AffixInfo(sequences.Count, affix));
				stem = stem.SkipFirst(dir);
				ai.Stems.Add(stem);
				if (syllableStart.Contains(i))
				{
					ai.SyllableBreakCount++;
				}
			}
		}

		// Count every n-gram of length 1..MaxAffixLength starting at every position,
		// regardless of where in the word it occurs.
		for (int i = 0; i < items.Count; i++)
		{
			var ngram = new Ngram<TItem>();
			for (int j = i; j < Math.Min(MaxAffixLength + i, items.Count); j++)
			{
				ngram = ngram.Concat(items[j], dir);
				ngramFreqDist[ngram.Length].Increment(ngram);
			}
		}
	}

	var itemProbDist = new MaxLikelihoodProbabilityDistribution<TItem>(itemFreqDist);
	// Smoothed probability estimates: maximum likelihood for length-1 n-grams,
	// Witten-Bell for longer ones, with the bin count taken to be the number of
	// possible n-grams of that length (clamped to int.MaxValue on overflow).
	var affixProbDist = new ConditionalProbabilityDistribution<int, Ngram<TItem>>(affixFreqDist, (c,
		fd) =>
	{
		if (c == 1)
		{
			return (new MaxLikelihoodProbabilityDistribution<Ngram<TItem>>(fd));
		}
		int binCount;
		try
		{
			binCount = checked((int) Math.Pow(itemFreqDist.ObservedSamples.Count, c));
		}
		catch (OverflowException)
		{
			binCount = int.MaxValue;
		}
		return (new WittenBellProbabilityDistribution<Ngram<TItem>>(fd, binCount));
	});
	// Same smoothing scheme for position-independent n-gram probabilities.
	var ngramProbDist = new ConditionalProbabilityDistribution<int, Ngram<TItem>>(ngramFreqDist, (c, fd) =>
	{
		if (c == 1)
		{
			return (new MaxLikelihoodProbabilityDistribution<Ngram<TItem>>(fd));
		}
		int binCount;
		try
		{
			binCount = checked((int) Math.Pow(itemFreqDist.ObservedSamples.Count, c));
		}
		catch (OverflowException)
		{
			binCount = int.MaxValue;
		}
		return (new WittenBellProbabilityDistribution<Ngram<TItem>>(fd, binCount));
	});

	foreach (AffixInfo affix in affixes.Values)
	{
		int freq = affixFreqDist[affix.Ngram.Length][affix.Ngram];
		// For each observed item, Curve is the fraction of this affix's occurrences
		// that are continued by that item; the best continuation's sharpness
		// (relative to that item's unigram probability) contributes to the score.
		// NOTE(review): MaxBy returns null when itemFreqDist has no observed samples;
		// presumably sequences is never empty here — confirm with callers.
		var maxCurveItem = itemFreqDist.ObservedSamples.Select(item => new
		{
			Item = item,
			Curve = (double) affixFreqDist[affix.Ngram.Length + 1][affix.Ngram.Concat(item, dir)] / freq
		})
			.MaxBy(item => item.Curve);
		double curveDrop = (1 - maxCurveItem.Curve) / (1 - itemProbDist[maxCurveItem.Item]);
		// Probability in affix position vs. probability as an arbitrary word-internal
		// n-gram; a ratio above 1 means the string appears in affix position more
		// often than its general rate of occurrence would predict.
		double pw = affixProbDist[affix.Ngram.Length][affix.Ngram];
		double npw = ngramProbDist[affix.Ngram.Length][affix.Ngram];
		double randomAdj = npw == 0 ? 1.0 : pw / npw;
		// Length-scaled log frequency weighting.
		double normalizedFreq = affix.Ngram.Length * Math.Log(freq);
		// Optionally reward affixes that coincide with syllable boundaries,
		// scaling the factor into [0.5, 1].
		double syllableScore = AffixesOccurOnSyllableBoundaries
			? (0.5 * ((double) affix.SyllableBreakCount / freq)) + 0.5
			: 1.0;
		affix.ZScore = curveDrop * randomAdj * normalizedFreq * syllableScore;
		yield return (affix);
	}
	yield return (nullAffix);
}