/// <summary>
/// Add a new word (for a particular subtopic) to the hash.
///
/// This should be used while parsing raw text.
/// </summary>
/// <param name="word">Word to record for the state formed by <paramref name="previousWords"/> and <paramref name="subtopic"/>.</param>
/// <param name="subtopic">Subtopic of the word; hyphens are replaced with spaces before it is stored.</param>
/// <param name="previousWords">Words that make up the state; must contain exactly <c>Order</c> entries.</param>
/// <exception cref="InvalidOperationException">The probability hash is already finalized.</exception>
/// <exception cref="ArgumentNullException"><paramref name="previousWords"/> or <paramref name="subtopic"/> is null.</exception>
/// <exception cref="ArgumentException">The number of previous words differs from <c>Order</c>.</exception>
public void Add(string word, string subtopic, List<string> previousWords)
{
    // Once the probability hash is finalized, adding new words
    // will mess up the weighting of state transition edges.
    if (finalized)
    {
        throw new InvalidOperationException("Probability hash is already finalized");
    }

    // Fail with a descriptive argument error instead of a bare
    // NullReferenceException further down.
    if (previousWords == null)
    {
        throw new ArgumentNullException("previousWords");
    }

    // Only n words can be used as a state in an n-order chain
    if (previousWords.Count != Order)
    {
        throw new ArgumentException("Number of previous words does not match order");
    }

    if (subtopic == null)
    {
        throw new ArgumentNullException("subtopic");
    }

    // The subtopic is likely coming from a raw text file, whose
    // filenames are mangled to remove spaces.
    subtopic = subtopic.Replace("-", " ");
    if (!Subtopics.Contains(subtopic))
    {
        Subtopics.Add(subtopic);
    }

    var key = new ProbabilityHashKey(previousWords, subtopic);

    // Single lookup: fetch the existing value for this state, or create
    // and register a fresh one the first time the state is seen.
    ProbabilityHashValue transitions;
    if (!wordDict.TryGetValue(key, out transitions))
    {
        transitions = new ProbabilityHashValue();
        wordDict[key] = transitions;
    }

    transitions.Add(word);
}
/// <summary>
/// Registers a complete state, together with its transition edges, in the hash.
///
/// Intended for loading serialized data into a probability hash that
/// has already been finalized.
/// </summary>
/// <param name="key">State to register; its topic is recorded in the subtopic collection if not already present.</param>
/// <param name="value">Transition data stored for <paramref name="key"/>.</param>
/// <exception cref="InvalidOperationException">The probability hash is not yet finalized.</exception>
public void Add(ProbabilityHashKey key, ProbabilityHashValue value)
{
    // Serialized data already carries its weighting, so it may only be
    // loaded into a hash that has been finalized.
    if (finalized == false)
    {
        throw new InvalidOperationException("Probability hash is not yet finalized");
    }

    // Record the topic the first time it is encountered.
    var topic = key.Topic;
    var alreadyKnown = Subtopics.Contains(topic);
    if (!alreadyKnown)
    {
        Subtopics.Add(topic);
    }

    wordDict.Add(key, value);
}
/// <summary>
/// Converts one line of a serialized probability hash into a single
/// Markov node and adds it to the hash being built.
/// </summary>
/// <remarks>
/// A line consists of a key and a value separated by
/// <c>CorpusWriter.ListDelimiter</c>. The key is a list of key words
/// (separated by <c>CorpusWriter.ItemDelimiter</c>) and a sub-topic, joined
/// by <c>CorpusWriter.TopicDelimiter</c>. The value is a list of items
/// (separated by <c>CorpusWriter.ItemDelimiter</c>), each a word and its
/// probability joined by <c>CorpusWriter.ProbabilityDelimiter</c>.
/// </remarks>
/// <param name="data">Parsing state data; supplies the current line and the target probability hash.</param>
/// <exception cref="InvalidDataException">The line does not match the layout described above.</exception>
private static void ParseLine(CorpusParseData data)
{
    // A well-formed line splits into exactly two parts: key and value.
    var lineParts = data.Line.Split(CorpusWriter.ListDelimiter);
    if (lineParts.Length != 2)
    {
        throw new InvalidDataException("Two list delimiters on a line");
    }

    // A well-formed key splits into exactly two parts: key words and sub-topic.
    var keyParts = lineParts[0].Split(CorpusWriter.TopicDelimiter);
    if (keyParts.Length != 2)
    {
        throw new InvalidDataException("Two topic delimiters in a key");
    }

    // An n-order Markov chain has exactly n words per state.
    var stateWords = keyParts[0].Split(CorpusWriter.ItemDelimiter);
    if (stateWords.Length != data.ProbabilityHash.Order)
    {
        throw new InvalidDataException("Invalid number of keywords");
    }

    var subtopic = keyParts[1];

    // Words that follow the key in the corpus, each paired with its probability.
    var successors = lineParts[1].Split(CorpusWriter.ItemDelimiter);

    var key = new ProbabilityHashKey(stateWords.ToList(), subtopic);

    // The probabilities are already present in the serialized data, so
    // the value is finalized up front rather than computed later.
    var value = new ProbabilityHashValue();
    value.Finalize();

    for (var index = 0; index < successors.Length; index++)
    {
        // Each item must be exactly one word and one probability.
        var pair = successors[index].Split(CorpusWriter.ProbabilityDelimiter);
        if (pair.Length != 2)
        {
            throw new InvalidDataException("Multiple probabilities for a word");
        }

        // NOTE(review): this TryParse overload is culture-sensitive; confirm
        // the writer formats probabilities under the same culture.
        double weight;
        var parsed = Double.TryParse(pair[1], out weight);
        if (!parsed)
        {
            throw new InvalidDataException("Probability is not a real number");
        }

        value.Add(pair[0], weight);
    }

    data.ProbabilityHash.Add(key, value);
}