/// <summary>
/// Generate a probability hash from raw text data, and serialize
/// the hash for later use.
/// </summary>
/// <param name="topic">Major topic name whose subtopic files are read from disk.</param>
/// <param name="order">Markov order for the generated probability hash.</param>
/// <returns>The finalized probability hash built from the topic's raw text.</returns>
/// <exception cref="FileNotFoundException">
/// Thrown when no subtopic files exist on disk for <paramref name="topic"/>.
/// </exception>
public static ProbabilityHash Parse(string topic, int order)
{
    var probabilityHash = new ProbabilityHash(order);

    // This was a rushed design decision. I actually change the working directory to
    // get to the corpus data. A lock needs to be held because the client can be multithreaded,
    // and a race condition could cause the directory to be entered twice.
    lock (DataDirectories.DirectoryControl)
    {
        DataDirectories.EnterDirectory(RawTextWriter.DirectoryName);

        // FIX: the original only called LeaveDirectory on the success path and the
        // explicit no-files path; an exception thrown mid-parse (ParseLine, I/O)
        // left the process stuck in the data directory. try/finally balances
        // EnterDirectory on every exit path.
        try
        {
            // Find all subtopic files for the given topic
            var filenames = FindFilenames(topic);

            // If there are no subtopic files, then the topic must not
            // exist on disk.
            if (filenames.Count == 0)
                throw new FileNotFoundException();

            // Read each subtopic into the probability hash.
            // Iterate the pairs directly to avoid the Keys + indexer double lookup.
            foreach (var entry in filenames)
            {
                var data = new RawTextParseData()
                {
                    ProbabilityHash = probabilityHash,
                    PreviousWords = new List<string>(),
                    Subtopic = entry.Key
                };

                using (var file = File.OpenText(entry.Value))
                {
                    while (!file.EndOfStream)
                    {
                        data.Line = file.ReadLine();
                        ParseLine(data);
                    }
                }
            }

            // Finalize the probability hash so it can be used
            // in the Markov chain.
            probabilityHash.Finalize();
        }
        finally
        {
            DataDirectories.LeaveDirectory();
        }
    }

    // Produce a serialized version of the probability hash for later use.
    CorpusWriter.Write(topic, probabilityHash);

    return probabilityHash;
}
/// <summary>
/// Parses a serialized probability hash from disk.
///
/// Throws a FileNotFound exception if the specified data doesn't yet exist.
/// Throws an InvalidDataException if the data format is invalid.
/// </summary>
/// <param name="name">Major topic name</param>
/// <param name="order">Markov order</param>
/// <returns>The parsed, already-finalized probability hash.</returns>
public static ProbabilityHash Parse(string name, int order)
{
    // Mangle the filenames to not contain spaces
    name = name.Replace(" ", "-");

    // Probability hash is finalized because word count data is already
    // present in the file, and doesn't need to be recalculated.
    var probabilityHash = new ProbabilityHash(order);
    probabilityHash.Finalize();

    var data = new CorpusParseData()
    {
        ProbabilityHash = probabilityHash,
    };

    // This was a rushed design decision. I actually change the working directory to
    // get to the corpus data. A lock needs to be held because the client can be multithreaded,
    // and a race condition could cause the directory to be entered twice.
    lock (DataDirectories.DirectoryControl)
    {
        DataDirectories.EnterDirectory(CorpusWriter.DirectoryName);

        // FIX: the original threw FileNotFoundException without calling
        // LeaveDirectory, leaving the directory stack unbalanced for the next
        // caller (the raw-text Parse overload leaves the directory before
        // throwing). try/finally also covers InvalidDataException from ParseLine.
        try
        {
            string filename = String.Format(CorpusWriter.FilenamePattern, name, order);
            if (!File.Exists(filename))
                throw new FileNotFoundException();

            // Read each line of the data file into an individual
            // Markov node.
            using (var file = File.OpenText(filename))
            {
                while (!file.EndOfStream)
                {
                    data.Line = file.ReadLine();
                    ParseLine(data);
                }
            }
        }
        finally
        {
            DataDirectories.LeaveDirectory();
        }
    }

    return probabilityHash;
}