public void Learn(TPhrase phrase)
{
    Logger.Info($"Learning phrase: '{phrase}'");

    if (phrase == null || phrase.Equals(default(TPhrase)))
    {
        return;
    }

    // Ignore particularly short phrases
    if (SplitTokens(phrase).Count() < Level)
    {
        Logger.Info($"Phrase {phrase} too short - skipped");
        return;
    }

    // Add it to the source lines so we can ignore it
    // when learning in future
    if (!SourcePhrases.Contains(phrase))
    {
        Logger.Debug($"Adding phrase {phrase} to source lines");
        SourcePhrases.Add(phrase);
    }

    // Split the sentence to an array of words
    var tokens = SplitTokens(phrase).ToArray();

    LearnTokens(tokens);

    var lastCol = new List<TUnigram>();
    for (var j = Level; j > 0; j--)
    {
        TUnigram previous;
        try
        {
            previous = tokens[tokens.Length - j];
            Logger.Debug($"Adding TGram ({typeof(TUnigram)}) {previous} to lastCol");
            lastCol.Add(previous);
        }
        catch (IndexOutOfRangeException e)
        {
            Logger.Warn($"Caught an exception: {e}");
            previous = GetPrepadUnigram();
            lastCol.Add(previous);
        }
    }

    Logger.Debug($"Reached final key for phrase {phrase}");
    var finalKey = new NgramContainer<TUnigram>(lastCol.ToArray());
    Chain.AddOrCreate(finalKey, GetTerminatorUnigram());
}
public void Learn(TPhrase phrase)
{
    if (phrase == null || phrase.Equals(default(TPhrase)))
    {
        return;
    }

    // Ignore particularly short phrases
    if (SplitTokens(phrase).Count() < Level)
    {
        return;
    }

    // Add it to the source lines so we can ignore it
    // when learning in future
    if (!SourcePhrases.Contains(phrase))
    {
        SourcePhrases.Add(phrase);
    }

    // Split the sentence to an array of words
    var tokens = SplitTokens(phrase).ToArray();

    LearnTokens(tokens);

    var lastCol = new List<TUnigram>();
    for (var j = Level; j > 0; j--)
    {
        TUnigram previous;
        try
        {
            previous = tokens[tokens.Length - j];
            lastCol.Add(previous);
        }
        catch (IndexOutOfRangeException)
        {
            previous = GetPrepadUnigram();
            lastCol.Add(previous);
        }
    }

    var finalKey = new NgramContainer<TUnigram>(lastCol.ToArray());
    Chain.AddOrCreate(finalKey, GetTerminatorUnigram());
}
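For context, here is a minimal usage sketch of Learn. It assumes a concrete string-based subclass, called StringMarkov below, with a constructor taking the n-gram level and a Walk method for generating phrases; those member names are illustrative and may differ from the actual implementation.

using System;
using System.Linq;

// Minimal sketch, assuming a string-based subclass named StringMarkov with an
// (int level) constructor and a Walk() generator; these names are illustrative.
var model = new StringMarkov(2);   // Level = 2: keys are built from the two preceding words

// Each call tokenises the phrase, records it in SourcePhrases, and feeds the
// tokens into LearnTokens before terminating the phrase in the chain.
model.Learn("the quick brown fox jumps over the lazy dog");
model.Learn("the quick brown cat sleeps in the sun");

// Generate one phrase by walking the learned chain.
Console.WriteLine(model.Walk().First());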
/// <summary>
/// Iterate over a list of TGrams and store each of them in the model at a composite key generated from the prior [Level] number of TGrams
/// </summary>
/// <param name="tokens"></param>
private void LearnTokens(IReadOnlyList<TUnigram> tokens)
{
    for (var i = 0; i < tokens.Count; i++)
    {
        var current = tokens[i];
        var previousCol = new List<TUnigram>();

        // From the current token's index, get hold of the previous [Level] number of tokens that came before it
        for (var j = Level; j > 0; j--)
        {
            TUnigram previous;
            try
            {
                // This case addresses when we are at a token index less than the value of [Level],
                // and we would effectively be looking at tokens before the beginning of the phrase
                if (i - j < 0)
                {
                    previousCol.Add(GetPrepadUnigram());
                }
                else
                {
                    previous = tokens[i - j];
                    previousCol.Add(previous);
                }
            }
            catch (IndexOutOfRangeException)
            {
                previous = GetPrepadUnigram();
                previousCol.Add(previous);
            }
        }

        // Create the composite key based on the previous tokens
        var key = new NgramContainer<TUnigram>(previousCol.ToArray());

        // Add the current token to the Markov model at the composite key
        Chain.AddOrCreate(key, current);
    }
}
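To make the key construction concrete, the comment block below traces LearnTokens and the final step of Learn for a small example; the prepad and terminator placeholders stand for whatever GetPrepadUnigram and GetTerminatorUnigram return in a concrete subclass.

// Worked example (illustrative): Level = 2, tokens = ["the", "quick", "brown"].
// Here _ stands for the prepad unigram and <end> for the terminator unigram.
//
// LearnTokens adds one (key -> value) entry per token:
//   ( _,     _       ) -> "the"
//   ( _,     "the"   ) -> "quick"
//   ( "the", "quick" ) -> "brown"
//
// Learn then records the end of the phrase under the last [Level] tokens:
//   ( "quick", "brown" ) -> <end>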