/// <summary>
/// Enumerates the known affixes of the requested type that can be stripped from <paramref name="word"/>,
/// pairing each one with the part of the word that is left over.
/// </summary>
/// <param name="word">The word to split.</param>
/// <param name="type">
/// <c>AffixType.Prefix</c> scans left-to-right against the prefix table; any other value scans
/// right-to-left against the suffix table.
/// </param>
/// <returns>
/// A lazy sequence that starts with the entry registered for the empty affix (paired with the whole word),
/// followed by one tuple per progressively longer affix that is present in the table.
/// </returns>
private IEnumerable<Tuple<AffixInfo, Ngram<TItem>>> GetAffixes(Ngram<TItem> word, AffixType type)
{
    // Start from the suffix configuration and switch over when a prefix is requested.
    Direction dir = Direction.RightToLeft;
    Dictionary<Ngram<TItem>, AffixInfo> affixes = _suffixes;
    if (type == AffixType.Prefix)
    {
        dir = Direction.LeftToRight;
        affixes = _prefixes;
    }

    // The empty affix is looked up with the indexer, so the table must contain it.
    var candidate = new Ngram<TItem>();
    yield return Tuple.Create(affixes[candidate], word);

    // Never consume the whole word: at least one item has to remain as the stem.
    Ngram<TItem> remainder = word;
    foreach (TItem item in word.GetItems(dir).Take(Math.Min(MaxAffixLength, word.Length - 1)))
    {
        candidate = candidate.Concat(item, dir);
        remainder = remainder.SkipFirst(dir);

        AffixInfo info;
        if (!affixes.TryGetValue(candidate, out info))
        {
            continue;
        }

        yield return Tuple.Create(info, remainder);
    }
}
/// <summary>
/// Collects every candidate affix of the requested type from <paramref name="sequences"/>, gathers
/// frequency statistics for them, and assigns each one a score in <c>ZScore</c>.
/// </summary>
/// <param name="sequences">The sequences to analyze; each is split into syllables by the syllable selector.</param>
/// <param name="type">
/// <c>AffixType.Suffix</c> scans each sequence right-to-left; every other value scans left-to-right.
/// </param>
/// <returns>
/// A lazy sequence of all affixes found (with <c>ZScore</c> assigned), followed by the null (empty) affix,
/// whose stems are the complete sequences and whose <c>ZScore</c> is not assigned here.
/// </returns>
protected IEnumerable<AffixInfo> ComputeAffixes(ICollection<TSeq> sequences, AffixType type)
{
    Direction dir = Direction.LeftToRight;
    if (type == AffixType.Suffix)
    {
        dir = Direction.RightToLeft;
    }

    var affixCounts = new ConditionalFrequencyDistribution<int, Ngram<TItem>>();
    var ngramCounts = new ConditionalFrequencyDistribution<int, Ngram<TItem>>();
    var itemCounts = new FrequencyDistribution<TItem>();
    var found = new Dictionary<Ngram<TItem>, AffixInfo>();
    var emptyAffix = new AffixInfo(sequences.Count, new Ngram<TItem>());

    foreach (TSeq seq in sequences)
    {
        var whole = new Ngram<TItem>(_syllablesSelector(seq).SelectMany(s => s));
        emptyAffix.Stems.Add(whole);
        foreach (TItem unit in whole)
        {
            itemCounts.Increment(unit);
        }

        // A word of one item (or none) cannot be divided into affix + stem.
        if (whole.Length <= 1)
        {
            continue;
        }

        // Flatten the syllables in scan order, remembering the index of the last item of each
        // syllable: an affix that ends on such an index ends on a syllable boundary.
        var ordered = new List<TItem>();
        var boundaryIndexes = new HashSet<int>();
        foreach (IEnumerable<TItem> syllable in _syllablesSelector(seq).Items(dir))
        {
            ordered.AddRange(syllable.Items(dir));
            boundaryIndexes.Add(ordered.Count - 1);
        }

        // Grow the affix one item at a time. Counts are kept for one length beyond MaxAffixLength
        // because scoring looks at how often each affix is extended by a further item.
        var running = new Ngram<TItem>();
        var remainder = new Ngram<TItem>(ordered, dir);
        int affixLimit = Math.Min(MaxAffixLength + 1, ordered.Count);
        for (int pos = 0; pos < affixLimit; pos++)
        {
            running = running.Concat(ordered[pos], dir);
            affixCounts[running.Length].Increment(running);

            // Only register the affix when a non-empty stem remains and the length is allowed.
            if (pos >= ordered.Count - 1 || running.Length > MaxAffixLength)
            {
                continue;
            }

            AffixInfo info = found.GetOrCreate(running, () => new AffixInfo(sequences.Count, running));
            remainder = remainder.SkipFirst(dir);
            info.Stems.Add(remainder);
            if (boundaryIndexes.Contains(pos))
            {
                info.SyllableBreakCount++;
            }
        }

        // Count every n-gram of up to MaxAffixLength items starting at any position in the word.
        for (int start = 0; start < ordered.Count; start++)
        {
            var window = new Ngram<TItem>();
            int stop = Math.Min(MaxAffixLength + start, ordered.Count);
            for (int k = start; k < stop; k++)
            {
                window = window.Concat(ordered[k], dir);
                ngramCounts[window.Length].Increment(window);
            }
        }
    }

    // Number of bins for n-grams of a given length: (distinct items)^length, clamped to
    // int.MaxValue when the checked conversion overflows.
    Func<int, int> binCountFor = order =>
    {
        try
        {
            return checked((int)Math.Pow(itemCounts.ObservedSamples.Count, order));
        }
        catch (OverflowException)
        {
            return int.MaxValue;
        }
    };

    var itemProbs = new MaxLikelihoodProbabilityDistribution<TItem>(itemCounts);
    var affixProbs = new ConditionalProbabilityDistribution<int, Ngram<TItem>>(affixCounts, (len, dist) =>
    {
        if (len != 1)
        {
            return new WittenBellProbabilityDistribution<Ngram<TItem>>(dist, binCountFor(len));
        }
        return new MaxLikelihoodProbabilityDistribution<Ngram<TItem>>(dist);
    });
    var ngramProbs = new ConditionalProbabilityDistribution<int, Ngram<TItem>>(ngramCounts, (len, dist) =>
    {
        if (len != 1)
        {
            return new WittenBellProbabilityDistribution<Ngram<TItem>>(dist, binCountFor(len));
        }
        return new MaxLikelihoodProbabilityDistribution<Ngram<TItem>>(dist);
    });

    foreach (AffixInfo scored in found.Values)
    {
        var gram = scored.Ngram;
        int count = affixCounts[gram.Length][gram];

        // Find the item that most often extends this affix, as a fraction of the affix's own count.
        var steepest = itemCounts.ObservedSamples
            .Select(next => new
            {
                Item = next,
                Curve = (double)affixCounts[gram.Length + 1][gram.Concat(next, dir)] / count
            })
            .MaxBy(entry => entry.Curve);

        // NOTE(review): if the selected item has probability 1 the divisor is 0 and the score
        // becomes infinite or NaN - confirm whether such input can occur.
        double curveDrop = (1 - steepest.Curve) / (1 - itemProbs[steepest.Item]);

        // Ratio of the affix's probability at the word edge to its probability anywhere in a word.
        double edgeProb = affixProbs[gram.Length][gram];
        double anywhereProb = ngramProbs[gram.Length][gram];
        double randomAdj = 1.0;
        if (anywhereProb != 0)
        {
            randomAdj = edgeProb / anywhereProb;
        }

        double normalizedFreq = gram.Length * Math.Log(count);

        double syllableScore = 1.0;
        if (AffixesOccurOnSyllableBoundaries)
        {
            syllableScore = (0.5 * ((double)scored.SyllableBreakCount / count)) + 0.5;
        }

        scored.ZScore = curveDrop * randomAdj * normalizedFreq * syllableScore;
        yield return scored;
    }

    yield return emptyAffix;
}