/// <summary>
/// Builds a Markov chain around an existing probability hash and a
/// starting topic.
///
/// The probability hash is stored as given; this constructor only sets
/// up the chain's own state (topic, sentence length limit, empty seed).
/// </summary>
/// <param name="probabilityHash">Probability hash the chain will keep a reference to.</param>
/// <param name="topic">Topic used both as the initial topic and as the current subtopic.</param>
private MarkovChain(ProbabilityHash probabilityHash, string topic)
{
    this.probabilityHash = probabilityHash;

    // The chain starts out on its own topic: the initial topic is recorded
    // first, then the current subtopic is pointed at the same value.
    CurrentSubtopic = initialTopic = topic;

    // Default sentence length limit and an empty seed.
    MaxSentenceLength = 30;
    CurrentSeed = new List<string>();
}
/// <summary>
/// Generate a probability hash from raw text data, and serialize
/// the hash for later use.
/// </summary>
/// <param name="topic">Topic whose subtopic files are located with FindFilenames.</param>
/// <param name="order">Order passed to the ProbabilityHash constructor.</param>
/// <returns>The finalized probability hash built from every subtopic file found.</returns>
/// <exception cref="FileNotFoundException">
/// No subtopic files were found for <paramref name="topic"/>.
/// </exception>
public static ProbabilityHash Parse(string topic, int order)
{
    var probabilityHash = new ProbabilityHash(order);

    // This was a rushed design decision. I actually change the working directory to
    // get to the corpus data. A lock needs to be held because the client can be multithreaded,
    // and a race condition could cause the directory to be entered twice.
    lock (DataDirectories.DirectoryControl)
    {
        DataDirectories.EnterDirectory(RawTextWriter.DirectoryName);

        // Once the directory has been entered it must always be left again,
        // even if reading or parsing throws; otherwise every later caller
        // would start from the wrong working directory.
        try
        {
            // Find all subtopic files for the given topic
            var filenames = FindFilenames(topic);

            // If there are no subtopic files, then the topic must not
            // exist on disk.
            if (filenames.Count == 0)
            {
                throw new FileNotFoundException(
                    String.Format("No raw text files were found for topic \"{0}\".", topic));
            }

            // Read each subtopic into the probability hash
            foreach (var subtopic in filenames.Keys)
            {
                var data = new RawTextParseData()
                {
                    ProbabilityHash = probabilityHash,
                    PreviousWords = new List<string>(),
                    Subtopic = subtopic
                };

                using (var file = File.OpenText(filenames[subtopic]))
                {
                    while (!file.EndOfStream)
                    {
                        data.Line = file.ReadLine();
                        ParseLine(data);
                    }
                }
            }

            // Finalize the probability hash so it can be used
            // in the Markov chain.
            probabilityHash.Finalize();
        }
        finally
        {
            DataDirectories.LeaveDirectory();
        }
    }

    // Produce a serialized version of the probability hash for later use.
    // Done outside the lock above because CorpusWriter.Write is called
    // only after the raw-text directory has been left.
    CorpusWriter.Write(topic, probabilityHash);

    return probabilityHash;
}
/// <summary>
/// Serializes a probability hash and writes it to disk.
/// </summary>
/// <param name="topic">
/// Topic name; it is lowercased and its spaces are replaced with dashes
/// before being substituted into the filename pattern.
/// </param>
/// <param name="probabilityHash">
/// Hash to serialize. Its Order is part of the filename and one line is
/// written per key.
/// </param>
public static void Write(string topic, ProbabilityHash probabilityHash)
{
    // Mangle the topic name to save the file with no spaces
    // in the filename.
    topic = topic.ToLower().Replace(" ", "-");

    // This was a rushed design decision. I actually change the working directory to
    // get to the corpus data. A lock needs to be held because the client can be multithreaded,
    // and a race condition could cause the directory to be entered twice.
    lock (DataDirectories.DirectoryControl)
    {
        // Create the corpus directory if it doesn't exist already.
        DataDirectories.ForceEnterDirectory(DirectoryName);

        // From here on the directory has been entered, so it must be left
        // again even when deleting, opening or writing the file throws.
        try
        {
            // Create the filename with string substitution
            string filename = String.Format(FilenamePattern, topic, probabilityHash.Order);

            // I can't think of a reason why the filename would exist
            // already, but just in case, delete it.
            if (File.Exists(filename))
            {
                File.Delete(filename);
            }

            using (var writer = new StreamWriter(filename, false))
            {
                // Use a struct to simplify argument passing
                var data = new CorpusWriteData()
                {
                    Writer = writer,
                    ProbabilityHash = probabilityHash
                };

                // Write each probability hash entry as a
                // line in the data file
                foreach (var key in probabilityHash.Keys)
                {
                    data.Key = key;
                    WriteLine(data);
                }
            }
        }
        finally
        {
            DataDirectories.LeaveDirectory();
        }
    }
}
/// <summary>
/// Parses a serialized probability hash from disk.
/// </summary>
/// <param name="name">
/// Major topic name. It is lowercased and its spaces are replaced with
/// dashes, the same mangling CorpusWriter.Write applies when it creates
/// the file, so both sides agree on the filename.
/// </param>
/// <param name="order">Markov order; also part of the filename.</param>
/// <returns>The probability hash populated from the data file.</returns>
/// <exception cref="FileNotFoundException">The specified data doesn't yet exist.</exception>
/// <exception cref="InvalidDataException">
/// The data format is invalid (presumably raised by ParseLine; not visible here).
/// </exception>
public static ProbabilityHash Parse(string name, int order)
{
    // Mangle the filename exactly as CorpusWriter.Write does (lowercase,
    // no spaces); otherwise a topic containing capitals could not be found
    // again on a case-sensitive filesystem.
    name = name.ToLower().Replace(" ", "-");

    // Probability hash is finalized because word count data is already
    // present in the file, and doesn't need to be recalculated.
    var probabilityHash = new ProbabilityHash(order);
    probabilityHash.Finalize();

    var data = new CorpusParseData()
    {
        ProbabilityHash = probabilityHash,
    };

    // This was a rushed design decision. I actually change the working directory to
    // get to the corpus data. A lock needs to be held because the client can be multithreaded,
    // and a race condition could cause the directory to be entered twice.
    lock (DataDirectories.DirectoryControl)
    {
        DataDirectories.EnterDirectory(CorpusWriter.DirectoryName);

        // The directory must be left on every path out of this block,
        // including the missing-file and bad-data exceptions below.
        try
        {
            string filename = String.Format(CorpusWriter.FilenamePattern, name, order);

            if (!File.Exists(filename))
            {
                throw new FileNotFoundException(
                    "No serialized probability hash exists for the requested topic and order.",
                    filename);
            }

            // Read each line of the data file into an individual
            // Markov node.
            using (var file = File.OpenText(filename))
            {
                while (!file.EndOfStream)
                {
                    data.Line = file.ReadLine();
                    ParseLine(data);
                }
            }
        }
        finally
        {
            DataDirectories.LeaveDirectory();
        }
    }

    return probabilityHash;
}