/// <summary>
/// Add a new word (for a particular subtopic) to the hash.
///
/// This should be used while parsing raw text.
/// </summary>
/// <param name="word">Word to record as a transition out of the state formed by <paramref name="previousWords"/>.</param>
/// <param name="subtopic">Subtopic the word belongs to; every "-" is replaced by a space before it is used.</param>
/// <param name="previousWords">Words forming the state; must contain exactly <c>Order</c> items.</param>
/// <exception cref="InvalidOperationException">The probability hash is already finalized.</exception>
/// <exception cref="ArgumentNullException"><paramref name="previousWords"/> or <paramref name="subtopic"/> is null.</exception>
/// <exception cref="ArgumentException">The number of previous words does not match the order.</exception>
public void Add(string word, string subtopic, List<string> previousWords)
{
    // Once the probability hash is finalized, adding new words
    // will mess up the weighting of state transition edges.
    if (finalized)
    {
        throw new InvalidOperationException("Probability hash is already finalized");
    }

    // Fail with a descriptive argument exception instead of a bare
    // NullReferenceException when a caller passes null.
    if (previousWords == null)
    {
        throw new ArgumentNullException("previousWords");
    }

    // Only n words can be used as a state in an n-order chain
    if (previousWords.Count != Order)
    {
        throw new ArgumentException("Number of previous words does not match order");
    }

    if (subtopic == null)
    {
        throw new ArgumentNullException("subtopic");
    }

    // The subtopic is likely coming from a raw text file, whose
    // filenames are mangled to remove spaces.
    subtopic = subtopic.Replace("-", " ");

    if (!Subtopics.Contains(subtopic))
    {
        Subtopics.Add(subtopic);
    }

    var key = new ProbabilityHashKey(previousWords, subtopic);

    // Single lookup: fetch the existing state, or create and register it.
    ProbabilityHashValue transitions;
    if (!wordDict.TryGetValue(key, out transitions))
    {
        transitions = new ProbabilityHashValue();
        wordDict[key] = transitions;
    }

    transitions.Add(word);
}
/// <summary>
/// Registers a complete state, together with its transition edges, in the hash.
///
/// Intended for loading serialized data into a probability hash that
/// has already been finalized.
/// </summary>
/// <param name="key">State to register; its topic is recorded in <c>Subtopics</c> if it is not there yet.</param>
/// <param name="value">Transition edges stored for <paramref name="key"/>.</param>
/// <exception cref="InvalidOperationException">The probability hash is not yet finalized.</exception>
public void Add(ProbabilityHashKey key, ProbabilityHashValue value)
{
    // Serialized data already carries its weighting, so it may only be
    // loaded into a hash that is in the finalized state.
    if (!finalized)
    {
        throw new InvalidOperationException("Probability hash is not yet finalized");
    }

    var topic = key.Topic;
    var topicIsKnown = Subtopics.Contains(topic);
    if (!topicIsKnown)
    {
        Subtopics.Add(topic);
    }

    wordDict.Add(key, value);
}
/// <summary>
/// Parse one line of a serialized probability hash and add the
/// resulting Markov node to the hash being built.
/// </summary>
/// <param name="data">Parsing state data (supplies the current line and the target probability hash)</param>
/// <exception cref="InvalidDataException">The line does not have the expected layout.</exception>
private static void ParseLine(CorpusParseData data)
{
    // A line consists of exactly two sections: the key and the value list.
    var sections = data.Line.Split(CorpusWriter.ListDelimiter);
    if (sections.Length != 2)
    {
        throw new InvalidDataException("Two list delimiters on a line");
    }

    // The key section in turn consists of the key words and a sub-topic.
    var keySections = sections[0].Split(CorpusWriter.TopicDelimiter);
    if (keySections.Length != 2)
    {
        throw new InvalidDataException("Two topic delimiters in a key");
    }

    // An n-order Markov chain stores exactly n words per key.
    var stateWords = keySections[0].Split(CorpusWriter.ItemDelimiter);
    if (stateWords.Length != data.ProbabilityHash.Order)
    {
        throw new InvalidDataException("Invalid number of keywords");
    }

    var subtopic = keySections[1];

    // The value section lists the words that follow the key in the corpus,
    // each one paired with its probability.
    var entries = sections[1].Split(CorpusWriter.ItemDelimiter);

    var nodeKey = new ProbabilityHashKey(stateWords.ToList(), subtopic);

    // The probabilities are already part of the serialized data, so the value
    // is finalized up front rather than being calculated later.
    var nodeValue = new ProbabilityHashValue();
    nodeValue.Finalize();

    for (int index = 0; index < entries.Length; index++)
    {
        // Each entry is a word and its probability.
        var parts = entries[index].Split(CorpusWriter.ProbabilityDelimiter);
        if (parts.Length != 2)
        {
            throw new InvalidDataException("Multiple probabilities for a word");
        }

        // NOTE(review): this overload parses with the current culture; confirm
        // that the writer formats probabilities the same way.
        double probability;
        var parsed = Double.TryParse(parts[1], out probability);
        if (!parsed)
        {
            throw new InvalidDataException("Probability is not a real number");
        }

        nodeValue.Add(parts[0], probability);
    }

    data.ProbabilityHash.Add(nodeKey, nodeValue);
}
// Shared generator for picking the fallback key. Creating a new Random on
// every call can repeat the same pick on runtimes that seed from the clock,
// which would defeat the purpose of a random fallback. Random is not
// thread-safe, so access is serialized through the lock object.
private static readonly Random fallbackKeyRandom = new Random();
private static readonly object fallbackKeyRandomLock = new object();

/// <summary>
/// Find the closest key in the corpus to a provided key using longest common
/// subsequence.
/// </summary>
/// <param name="idealKey">The key that was requested; only keys with the same topic (compared case-insensitively) are considered.</param>
/// <returns>
/// The same-topic key with the highest summed per-word match weight, or a
/// randomly chosen same-topic key when no key scores above zero.
/// </returns>
/// <exception cref="ArgumentOutOfRangeException">No key exists for the topic of <paramref name="idealKey"/>.</exception>
private async Task<ProbabilityHashKey> FindNextBestKeyAsync(ProbabilityHashKey idealKey)
{
    // Only consider keys from the current subtopic. Topics are identifiers, so
    // they are compared ordinally rather than through culture-sensitive lowercasing.
    var requestedTopic = idealKey.Topic;
    var keys = wordDict.Keys
        .Where(key => string.Equals(key.Topic, requestedTopic, StringComparison.OrdinalIgnoreCase))
        .ToList();

    // Without candidates there is nothing to fall back to; say so explicitly
    // instead of failing inside the list indexer.
    if (keys.Count == 0)
    {
        throw new ArgumentOutOfRangeException("idealKey", "No keys exist for the requested topic");
    }

    // Start with a random key as a default if no good match is found. This prevents the
    // same key from being chosen over and over again in cases where matching keys repeatedly
    // cannot be found.
    int fallbackIndex;
    lock (fallbackKeyRandomLock)
    {
        fallbackIndex = fallbackKeyRandom.Next(keys.Count);
    }

    var fallbackKey = keys[fallbackIndex];

    // Score all candidates in one background task; starting and awaiting a
    // separate task per word adds scheduling overhead without any parallelism.
    return await Task.Run(() => SelectClosestKey(keys, idealKey, fallbackKey));
}

// Returns the candidate with the highest summed per-word LCS ratio against
// idealKey, or fallbackKey when no candidate scores above zero.
private static ProbabilityHashKey SelectClosestKey(
    List<ProbabilityHashKey> candidates,
    ProbabilityHashKey idealKey,
    ProbabilityHashKey fallbackKey)
{
    var bestKey = fallbackKey;
    double bestMatchWeight = 0;

    foreach (var candidate in candidates)
    {
        double matchWeight = 0;

        for (int i = 0; i < candidate.Length; i++)
        {
            var idealWord = idealKey[i];

            // An empty ideal word would make the ratio 0/0 (NaN), and a NaN
            // weight can never win a comparison, so skip it.
            if (idealWord.Length == 0)
            {
                continue;
            }

            int lcs = candidate[i].LongestCommonSubsequenceWith(idealWord);
            matchWeight += lcs / (double)idealWord.Length;
        }

        // Strict comparison: on a tie the earliest candidate is kept.
        if (matchWeight > bestMatchWeight)
        {
            bestMatchWeight = matchWeight;
            bestKey = candidate;
        }
    }

    return bestKey;
}
/// <summary>
/// Pick a random word from the text corpus, given the previous
/// n words that were used.
///
/// When those n words (with the topic) are not an existing key, the
/// best matching key is looked up and used in their place.
/// </summary>
/// <param name="previousWords">The previously used words, which form the lookup key together with <paramref name="topic"/>.</param>
/// <param name="topic">Topic the lookup key belongs to.</param>
/// <returns>A task producing the word returned by the selected state's <c>GetRandomWord</c>.</returns>
public async Task<string> GetRandomWordAsync(List<string> previousWords, string topic)
{
    var requestedKey = new ProbabilityHashKey(previousWords, topic);

    // Use the exact state when it exists; otherwise fall back to the
    // next best matching key.
    ProbabilityHashValue state;
    if (!wordDict.TryGetValue(requestedKey, out state))
    {
        var substituteKey = await FindNextBestKeyAsync(requestedKey);
        state = wordDict[substituteKey];
    }

    return state.GetRandomWord();
}
/// <summary>
/// Gets the word list stored for a key: each word paired with a numeric
/// value (presumably its probability or weight; confirm against
/// <c>ProbabilityHashValue.Words</c>).
/// </summary>
/// <param name="key">Key whose word list is requested.</param>
/// <returns>The <c>Words</c> list of the value stored under <paramref name="key"/>.</returns>
public List<KeyValuePair<string, double>> this[ProbabilityHashKey key]
{
    get
    {
        var state = wordDict[key];
        return state.Words;
    }
}