/// <summary>
/// Generate a probability hash from raw text data, and serialize
/// the hash for later use.
/// </summary>
/// <param name="topic">The topic whose raw text corpus should be parsed.</param>
/// <param name="order">The order of the Markov chain (the number of words per state).</param>
/// <returns>The finalized probability hash for the topic.</returns>
public static ProbabilityHash Parse(string topic, int order)
{
    var probabilityHash = new ProbabilityHash(order);

    // This was a rushed design decision: the working directory is changed to
    // reach the corpus data. A lock needs to be held because the client can be
    // multithreaded, and a race condition could cause the directory to be
    // entered twice.
    lock (DataDirectories.DirectoryControl)
    {
        DataDirectories.EnterDirectory(RawTextWriter.DirectoryName);

        try
        {
            // Find all subtopic files for the given topic.
            var filenames = FindFilenames(topic);

            // If there are no subtopic files, then the topic must not
            // exist on disk.
            if (filenames.Count == 0)
            {
                throw new FileNotFoundException();
            }

            // Read each subtopic into the probability hash.
            foreach (var subtopic in filenames.Keys)
            {
                var data = new RawTextParseData()
                {
                    ProbabilityHash = probabilityHash,
                    PreviousWords = new List<string>(),
                    Subtopic = subtopic
                };

                using (var file = File.OpenText(filenames[subtopic]))
                {
                    while (!file.EndOfStream)
                    {
                        data.Line = file.ReadLine();
                        ParseLine(data);
                    }
                }
            }

            // Finalize the probability hash so it can be used
            // in the Markov chain.
            probabilityHash.Finalize();
        }
        finally
        {
            // Always restore the working directory, even if parsing throws;
            // the original code leaked the directory change on I/O errors.
            DataDirectories.LeaveDirectory();
        }
    }

    // Produce a serialized version of the probability hash for later use.
    CorpusWriter.Write(topic, probabilityHash);

    return probabilityHash;
}
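// A minimal usage sketch, not part of the original source: the topic name
// "Cats" and order 2 are hypothetical example values, and the caller is
// assumed to handle the FileNotFoundException thrown when no raw text
// corpus exists on disk for the topic.
private static void ParseExample()
{
    try
    {
        // Build a second-order (two-words-per-state) probability hash.
        ProbabilityHash hash = Parse("Cats", 2);
        // At this point the hash is finalized and has also been
        // serialized to disk via CorpusWriter.
    }
    catch (FileNotFoundException)
    {
        // No raw text has been written for this topic yet.
    }
}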
/// <summary>
/// Generate Markov states and state transitions from a line of raw text data.
/// </summary>
/// <param name="data">The parse state for the current subtopic, including the line to parse.</param>
private static void ParseLine(RawTextParseData data)
{
    // Remove citation references from the line.
    string line = wikiReferencePattern.Replace(data.Line, "");

    // Clean the words of other unwanted characters, such as double quotes.
    var words = TextCorpus.GetCleanWords(line);

    foreach (string word in words)
    {
        if (data.PreviousWords.Count < data.ProbabilityHash.Order)
        {
            // Build the initial state by populating the "PreviousWords"
            // list until it holds a full window of Order words.
            data.PreviousWords.Add(word);
        }
        else
        {
            // Once there is an initial state, each successive word generates
            // a transition edge (and possibly a new state), and the window
            // then slides forward by one word.
            data.ProbabilityHash.Add(word, data.Subtopic, data.PreviousWords);
            data.PreviousWords.Add(word);
            data.PreviousWords.RemoveAt(0);
        }
    }
}
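// The wikiReferencePattern field is used above but not defined in this
// excerpt. A minimal sketch of what it might look like; the exact pattern is
// an assumption, matching Wikipedia-style citation markers such as "[1]" or
// "[23]" (requires using System.Text.RegularExpressions).
private static readonly Regex wikiReferencePattern =
    new Regex(@"\[\d+\]", RegexOptions.Compiled);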