コード例 #1
0
 /// <summary>
 /// Builds a Markov chain on top of an existing probability hash,
 /// starting from the given topic.
 ///
 /// The probability hash is stored as supplied; this constructor does
 /// not perform any further setup on it.
 /// </summary>
 /// <param name="probabilityHash">Probability hash the chain keeps a reference to.</param>
 /// <param name="topic">Topic stored as both the initial topic and the current subtopic.</param>
 private MarkovChain(ProbabilityHash probabilityHash, string topic)
 {
     // Sentence length cap applied to every newly created chain.
     const int defaultMaxSentenceLength = 30;

     this.probabilityHash = probabilityHash;

     // The chain starts out on the topic it was created for.
     initialTopic = topic;
     CurrentSubtopic = topic;

     MaxSentenceLength = defaultMaxSentenceLength;

     // A new chain starts with an empty seed.
     CurrentSeed = new List<string>();
 }
コード例 #2
0
        /// <summary>
        /// Generate a probability hash from raw text data, and serialize
        /// the hash for later use.
        /// </summary>
        /// <param name="topic">Topic whose subtopic files are located with <c>FindFilenames</c>.</param>
        /// <param name="order">Markov order passed to the new probability hash.</param>
        /// <returns>The finalized probability hash built from every subtopic file.</returns>
        /// <exception cref="FileNotFoundException">
        /// No subtopic files were found for <paramref name="topic"/>.
        /// </exception>
        public static ProbabilityHash Parse(string topic, int order)
        {
            var probabilityHash = new ProbabilityHash(order);

            // This was a rushed design decision. The working directory is changed to
            // get to the corpus data. A lock needs to be held because the client can be
            // multithreaded, and a race condition could cause the directory to be entered twice.
            lock (DataDirectories.DirectoryControl)
            {
                DataDirectories.EnterDirectory(RawTextWriter.DirectoryName);

                // Everything after a successful EnterDirectory runs under try/finally so
                // the directory is always left again, even when a file cannot be opened
                // or a line fails to parse.
                try
                {
                    // Find all subtopic files for the given topic
                    var filenames = FindFilenames(topic);

                    // If there are no subtopic files, then the topic must not
                    // exist on disk.
                    if (filenames.Count == 0)
                    {
                        throw new FileNotFoundException(
                            "No raw text data found for topic \"" + topic + "\".");
                    }

                    // Read each subtopic into the probability hash
                    foreach (var subtopic in filenames.Keys)
                    {
                        // A fresh parse state per subtopic: previous words do not
                        // carry over from one file to the next.
                        var data = new RawTextParseData()
                        {
                            ProbabilityHash = probabilityHash,
                            PreviousWords = new List<string>(),
                            Subtopic = subtopic
                        };

                        using (var file = File.OpenText(filenames[subtopic]))
                        {
                            while (!file.EndOfStream)
                            {
                                data.Line = file.ReadLine();
                                ParseLine(data);
                            }
                        }
                    }

                    // Finalize the probability hash so it can be used
                    // in the Markov chain.
                    probabilityHash.Finalize();
                }
                finally
                {
                    DataDirectories.LeaveDirectory();
                }
            }

            // Produce a serialized version of the probability hash for later use.
            // Done outside the lock above because CorpusWriter.Write is called
            // without holding it here, exactly as before.
            CorpusWriter.Write(topic, probabilityHash);
            return probabilityHash;
        }
コード例 #3
0
        /// <summary>
        /// Serializes a probability hash and writes it to disk, one line per key.
        /// </summary>
        /// <param name="topic">
        /// Topic name; it is lower-cased and its spaces are replaced with dashes
        /// before being substituted into the filename pattern.
        /// </param>
        /// <param name="probabilityHash">
        /// Probability hash to write; its order is also part of the filename.
        /// </param>
        public static void Write(string topic, ProbabilityHash probabilityHash)
        {
            // Mangle the topic name to save the file with no spaces
            // in the filename.
            // NOTE(review): ToLower() is culture-sensitive; it is kept as-is so the
            // produced filenames stay identical to files written previously.
            topic = topic.ToLower().Replace(" ", "-");

            // This was a rushed design decision. The working directory is changed to
            // get to the corpus data. A lock needs to be held because the client can be
            // multithreaded, and a race condition could cause the directory to be entered twice.
            lock (DataDirectories.DirectoryControl)
            {
                // Create the corpus directory if it doesn't exist already.
                DataDirectories.ForceEnterDirectory(DirectoryName);

                // Once the directory has been entered it must be left again no matter
                // what happens while writing, otherwise later callers would run with
                // the wrong working directory.
                try
                {
                    // Create the filename with string substitution
                    string filename = String.Format(FilenamePattern, topic, probabilityHash.Order);

                    // The file is not expected to exist already, but remove it
                    // first just in case.
                    if (File.Exists(filename))
                    {
                        File.Delete(filename);
                    }

                    using (var writer = new StreamWriter(filename, false))
                    {
                        // Bundle the writer and hash together to simplify argument passing
                        var data = new CorpusWriteData()
                        {
                            Writer = writer,
                            ProbabilityHash = probabilityHash
                        };

                        // Write each probability hash entry as a
                        // line in the data file
                        foreach (var key in probabilityHash.Keys)
                        {
                            data.Key = key;
                            WriteLine(data);
                        }
                    }
                }
                finally
                {
                    DataDirectories.LeaveDirectory();
                }
            }
        }
コード例 #4
0
        /// <summary>
        /// Parses a serialized probability hash from disk.
        /// </summary>
        /// <param name="name">
        /// Major topic name; it is lower-cased and its spaces are replaced with
        /// dashes, mirroring the mangling <c>CorpusWriter.Write</c> applies when
        /// it creates the file.
        /// </param>
        /// <param name="order">Markov order; also part of the filename.</param>
        /// <returns>The probability hash populated from the data file.</returns>
        /// <exception cref="FileNotFoundException">The specified data doesn't yet exist.</exception>
        /// <exception cref="InvalidDataException">
        /// The data format is invalid (per the original documentation; presumably
        /// raised by <c>ParseLine</c> - confirm there).
        /// </exception>
        public static ProbabilityHash Parse(string name, int order)
        {
            // Mangle the name exactly like CorpusWriter.Write does (lower-case, no
            // spaces). Without the lower-casing a mixed-case topic would never match
            // the file that was written on a case-sensitive file system.
            name = name.ToLower().Replace(" ", "-");

            // Probability hash is finalized because word count data is already
            // present in the file, and doesn't need to be recalculated.
            var probabilityHash = new ProbabilityHash(order);
            probabilityHash.Finalize();

            var data = new CorpusParseData()
            {
                ProbabilityHash = probabilityHash,
            };

            // This was a rushed design decision. The working directory is changed to
            // get to the corpus data. A lock needs to be held because the client can be
            // multithreaded, and a race condition could cause the directory to be entered twice.
            lock (DataDirectories.DirectoryControl)
            {
                DataDirectories.EnterDirectory(CorpusWriter.DirectoryName);

                // The directory must be left on every exit path. In particular the
                // "file not found" case below is an expected outcome and must not
                // leave the process inside the corpus directory.
                try
                {
                    string filename = String.Format(CorpusWriter.FilenamePattern, name, order);
                    if (!File.Exists(filename))
                    {
                        throw new FileNotFoundException(
                            "No serialized probability hash found for \"" + name + "\".",
                            filename);
                    }

                    // Read each line of the data file into an individual
                    // Markov node.
                    using (var file = File.OpenText(filename))
                    {
                        while (!file.EndOfStream)
                        {
                            data.Line = file.ReadLine();
                            ParseLine(data);
                        }
                    }
                }
                finally
                {
                    DataDirectories.LeaveDirectory();
                }
            }

            return probabilityHash;
        }