/// <summary>
/// Deserializes one line of a stored probability hash and adds the
/// resulting key/value pair (a single Markov node) to the hash being built.
/// </summary>
/// <remarks>
/// The line is split by the list delimiter into a key and a value. The key is
/// split by the topic delimiter into a keyword list and a sub-topic; the value
/// is split by the item delimiter into word/probability pairs.
/// </remarks>
/// <param name="data">
/// Parsing state: supplies the current <c>Line</c> and the
/// <c>ProbabilityHash</c> that receives the parsed entry.
/// </param>
/// <exception cref="InvalidDataException">
/// The line, the key, or a value item does not split into exactly two parts,
/// the keyword count differs from the hash order, or a probability cannot be
/// parsed as a double.
/// </exception>
private static void ParseLine(CorpusParseData data)
{
    // A line has exactly two halves: the key and its value list.
    var halves = data.Line.Split(CorpusWriter.ListDelimiter);
    if (halves.Length != 2)
    {
        throw new InvalidDataException("Two list delimiters on a line");
    }

    // The key half holds the keyword list followed by the sub-topic.
    var keyParts = halves[0].Split(CorpusWriter.TopicDelimiter);
    if (keyParts.Length != 2)
    {
        throw new InvalidDataException("Two topic delimiters in a key");
    }

    // An n-order Markov chain keys every node on exactly n words.
    var words = keyParts[0].Split(CorpusWriter.ItemDelimiter);
    if (words.Length != data.ProbabilityHash.Order)
    {
        throw new InvalidDataException("Invalid number of keywords");
    }

    // The value half lists each follower word together with its probability.
    var entries = halves[1].Split(CorpusWriter.ItemDelimiter);

    var nodeKey = new ProbabilityHashKey(words.ToList(), keyParts[1]);

    // The serialized form already carries the probabilities, so the value is
    // finalized up front rather than having them calculated later.
    var nodeValue = new ProbabilityHashValue();
    nodeValue.Finalize();

    for (int i = 0; i < entries.Length; i++)
    {
        // Each entry is a word paired with one probability.
        var pair = entries[i].Split(CorpusWriter.ProbabilityDelimiter);
        if (pair.Length != 2)
        {
            throw new InvalidDataException("Multiple probabilities for a word");
        }

        // Reject entries whose probability is not a parsable number.
        double weight;
        bool parsed = Double.TryParse(pair[1], out weight);
        if (!parsed)
        {
            throw new InvalidDataException("Probability is not a real number");
        }

        nodeValue.Add(pair[0], weight);
    }

    data.ProbabilityHash.Add(nodeKey, nodeValue);
}
/// <summary>
/// Parses a serialized probability hash from disk.
/// </summary>
/// <remarks>
/// Spaces in <paramref name="name"/> are replaced with dashes before the
/// filename is built from <c>CorpusWriter.FilenamePattern</c>.
/// </remarks>
/// <param name="name">Major topic name</param>
/// <param name="order">Markov order</param>
/// <returns>
/// A finalized probability hash of the given order, populated with one entry
/// per line of the data file.
/// </returns>
/// <exception cref="FileNotFoundException">
/// The data file for the given topic and order doesn't exist yet.
/// </exception>
/// <exception cref="InvalidDataException">
/// A line of the data file is not in the expected format.
/// </exception>
public static ProbabilityHash Parse(string name, int order)
{
    // Mangle the filenames to not contain spaces
    name = name.Replace(" ", "-");

    // Probability hash is finalized because word count data is already
    // present in the file, and doesn't need to be recalculated.
    var probabilityHash = new ProbabilityHash(order);
    probabilityHash.Finalize();

    var data = new CorpusParseData()
    {
        ProbabilityHash = probabilityHash,
    };

    // This was a rushed design decision: the working directory is changed to
    // get to the corpus data. A lock needs to be held because the client can
    // be multithreaded, and a race condition could cause the directory to be
    // entered twice.
    lock (DataDirectories.DirectoryControl)
    {
        DataDirectories.EnterDirectory(CorpusWriter.DirectoryName);
        try
        {
            string filename = String.Format(CorpusWriter.FilenamePattern, name, order);
            if (!File.Exists(filename))
            {
                throw new FileNotFoundException("Corpus data file not found", filename);
            }

            // Read each line of the data file into an individual Markov node.
            using (var file = File.OpenText(filename))
            {
                while (!file.EndOfStream)
                {
                    data.Line = file.ReadLine();
                    ParseLine(data);
                }
            }
        }
        finally
        {
            // Always leave the directory, even when the file is missing or a
            // line is malformed; otherwise the directory entered above is
            // never left and later callers start from the wrong place.
            DataDirectories.LeaveDirectory();
        }
    }

    return probabilityHash;
}