// Returns any viable options for the next word based on
// what was provided as input, based on the trained model.
public List<TUnigram> GetMatches(TPhrase input)
{
    var inputArray = SplitTokens(input).ToArray();
    if (inputArray.Count() > Level)
    {
        inputArray = inputArray.Skip(inputArray.Length - Level).ToArray();
    }
    else if (inputArray.Count() < Level)
    {
        inputArray = PadArrayLow(inputArray);
    }

    var key = new NgramContainer<TUnigram>(inputArray);
    var chosen = new List<TUnigram>();
    try
    {
        chosen = Chain.GetValuesForKey(key);
    }
    catch (KeyNotFoundException) { }

    return chosen;
}
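As a rough usage sketch of the lookup above, assuming a concrete string-based subclass (called StringMarkov here, with both TPhrase and TUnigram as string) that is not part of the code shown:

// Hypothetical string-based model; the class name, constructor and Level value are assumptions.
var model = new StringMarkov(2);
model.Learn("the cat sat on the mat");

// "sat on the" is trimmed internally to the last Level tokens ("on", "the"),
// which form the composite key used for the lookup.
var candidates = model.GetMatches("sat on the"); // List<string>
// With only this training phrase, candidates would contain "mat".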
public void Learn(TPhrase phrase)
{
    Logger.Info($"Learning phrase: '{phrase}'");
    if (phrase == null || phrase.Equals(default(TPhrase)))
    {
        return;
    }

    // Ignore particularly short phrases
    if (SplitTokens(phrase).Count() < Level)
    {
        Logger.Info($"Phrase {phrase} too short - skipped");
        return;
    }

    // Add it to the source lines so we can ignore it
    // when learning in future
    if (!SourcePhrases.Contains(phrase))
    {
        Logger.Debug($"Adding phrase {phrase} to source lines");
        SourcePhrases.Add(phrase);
    }

    // Split the sentence to an array of words
    var tokens = SplitTokens(phrase).ToArray();

    LearnTokens(tokens);

    var lastCol = new List<TUnigram>();
    for (var j = Level; j > 0; j--)
    {
        TUnigram previous;
        try
        {
            previous = tokens[tokens.Length - j];
            Logger.Debug($"Adding TGram ({typeof(TUnigram)}) {previous} to lastCol");
            lastCol.Add(previous);
        }
        catch (IndexOutOfRangeException e)
        {
            Logger.Warn($"Caught an exception: {e}");
            previous = GetPrepadUnigram();
            lastCol.Add(previous);
        }
    }

    Logger.Debug($"Reached final key for phrase {phrase}");
    var finalKey = new NgramContainer<TUnigram>(lastCol.ToArray());
    Chain.AddOrCreate(finalKey, GetTerminatorUnigram());
}
public void Learn(TPhrase phrase)
{
    if (phrase == null || phrase.Equals(default(TPhrase)))
    {
        return;
    }

    // Ignore particularly short phrases
    if (SplitTokens(phrase).Count() < Level)
    {
        return;
    }

    // Add it to the source lines so we can ignore it
    // when learning in future
    if (!SourcePhrases.Contains(phrase))
    {
        SourcePhrases.Add(phrase);
    }

    // Split the sentence to an array of words
    var tokens = SplitTokens(phrase).ToArray();

    LearnTokens(tokens);

    var lastCol = new List<TUnigram>();
    for (var j = Level; j > 0; j--)
    {
        TUnigram previous;
        try
        {
            previous = tokens[tokens.Length - j];
            lastCol.Add(previous);
        }
        catch (IndexOutOfRangeException)
        {
            previous = GetPrepadUnigram();
            lastCol.Add(previous);
        }
    }

    var finalKey = new NgramContainer<TUnigram>(lastCol.ToArray());
    Chain.AddOrCreate(finalKey, GetTerminatorUnigram());
}
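To make the closing loop above concrete, here is a hand-worked sketch for Level = 2 and the phrase "the cat sat", using plain strings with illustrative names only:

using System.Collections.Generic;

// lastCol gathers the last Level tokens of the phrase: tokens[tokens.Length - 2] and tokens[tokens.Length - 1].
var lastCol = new List<string> { "cat", "sat" };

// The chain then records ["cat", "sat"] -> <terminator>, marking a context
// at which generation may legitimately end a phrase.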
/// <summary>
/// Generate a single phrase of output data based on the current model
/// </summary>
/// <param name="seed">Optionally provide the start of the phrase to generate from</param>
/// <returns></returns>
private TPhrase WalkLine(TPhrase seed)
{
    IEnumerable<TUnigram> tokensSeed = SplitTokens(seed);
    var paddedSeed = PadArrayLow(tokensSeed != null ? tokensSeed.ToArray() : null);
    var built = new List<TUnigram>();

    // Allocate a queue to act as the memory, which is n
    // levels deep of previous words that were used
    var q = new Queue(paddedSeed);

    // If the start of the generated text has been seeded,
    // append that before generating the rest
    if (!seed.Equals(GetPrepadUnigram()))
    {
        built.AddRange(SplitTokens(seed));
    }

    while (built.Count < 1500)
    {
        // Choose a new token to add from the model
        var key = new NgramContainer<TUnigram>(q.Cast<TUnigram>().ToArray());
        if (Chain.Contains(key))
        {
            TUnigram chosen;
            if (built.Count == 0)
            {
                chosen = new UnweightedRandomUnigramSelector<TUnigram>().SelectUnigram(Chain.GetValuesForKey(key));
            }
            else
            {
                chosen = UnigramSelector.SelectUnigram(Chain.GetValuesForKey(key));
            }

            q.Dequeue();
            q.Enqueue(chosen);
            built.Add(chosen);
        }
        else
        {
            break;
        }
    }

    return RebuildPhrase(built);
}
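The queue acts as a sliding context window that stays exactly Level tokens deep. A minimal standalone sketch of a single generation step, assuming Level = 2 and plain string tokens:

using System.Collections.Generic;

var q = new Queue<string>(new[] { "the", "cat" }); // current two-token context
var chosen = "sat";                                // token selected from the model for this context
q.Dequeue();                                       // discard the oldest token ("the")
q.Enqueue(chosen);                                 // next lookup key becomes ["cat", "sat"]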
/// <summary>
/// Iterate over a list of TGrams and store each of them in the model at a composite key generated from its prior [Level] number of TGrams
/// </summary>
/// <param name="tokens"></param>
private void LearnTokens(IReadOnlyList<TUnigram> tokens)
{
    for (var i = 0; i < tokens.Count; i++)
    {
        var current = tokens[i];
        var previousCol = new List<TUnigram>();

        // From the current token's index, get hold of the previous [Level] number of tokens that came before it
        for (var j = Level; j > 0; j--)
        {
            TUnigram previous;
            try
            {
                // this case addresses when we are at a token index less than the value of [Level],
                // and we effectively would be looking at tokens before the beginning of the phrase
                if (i - j < 0)
                {
                    previousCol.Add(GetPrepadUnigram());
                }
                else
                {
                    previous = tokens[i - j];
                    previousCol.Add(previous);
                }
            }
            catch (IndexOutOfRangeException)
            {
                previous = GetPrepadUnigram();
                previousCol.Add(previous);
            }
        }

        // create the composite key based on the previous tokens
        var key = new NgramContainer<TUnigram>(previousCol.ToArray());

        // add the current token to the markov model at the composite key
        Chain.AddOrCreate(key, current);
    }
}
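For a concrete picture of the composite keys this loop produces, here is a minimal standalone sketch assuming Level = 2 and plain string tokens, with "_" standing in for GetPrepadUnigram():

using System;
using System.Linq;

var tokens = new[] { "the", "cat", "sat" };
const int level = 2;

for (var i = 0; i < tokens.Length; i++)
{
    // Rebuild the [level]-token context that precedes tokens[i],
    // padding with "_" where the context runs off the start of the phrase.
    var context = Enumerable.Range(1, level)
                            .Reverse()
                            .Select(j => i - j < 0 ? "_" : tokens[i - j]);
    Console.WriteLine($"[{string.Join(", ", context)}] -> {tokens[i]}");
}

// Output:
// [_, _] -> the
// [_, the] -> cat
// [the, cat] -> sat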