コード例 #1
0
        /// <summary>
        /// Add a new word (for a particular subtopic) to the hash.
        /// 
        /// This should be used while parsing raw text, before the hash is finalized.
        /// </summary>
        /// <param name="word">The word observed directly after <paramref name="previousWords"/>.</param>
        /// <param name="subtopic">
        /// Subtopic the word belongs to. Every "-" is replaced with a space before the
        /// subtopic is recorded or used in the key.
        /// </param>
        /// <param name="previousWords">
        /// The words forming the state the transition starts from; must contain exactly
        /// <c>Order</c> entries.
        /// </param>
        /// <exception cref="InvalidOperationException">The probability hash is already finalized.</exception>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="previousWords"/> or <paramref name="subtopic"/> is null.
        /// </exception>
        /// <exception cref="ArgumentException">
        /// The number of previous words does not match the chain order.
        /// </exception>
        public void Add(string word, string subtopic, List<string> previousWords)
        {
            // Once the probability hash is finalized, adding new words
            // will mess up the weighting of state transition edges.
            if (finalized)
                throw new InvalidOperationException("Probability hash is already finalized");

            // Fail with a descriptive argument error instead of a bare NullReferenceException.
            if (previousWords == null)
                throw new ArgumentNullException("previousWords");

            // Only n words can be used as a state in an n-order chain
            if (previousWords.Count != Order)
                throw new ArgumentException("Number of previous words does not match order");

            if (subtopic == null)
                throw new ArgumentNullException("subtopic");

            // The subtopic is likely coming from a raw text file, whose
            // filenames are mangled to remove spaces.
            subtopic = subtopic.Replace("-", " ");

            if (!Subtopics.Contains(subtopic))
                Subtopics.Add(subtopic);

            var key = new ProbabilityHashKey(previousWords, subtopic);

            // Single lookup: fetch the existing transition set, or create and
            // register a new one when this state has not been seen before.
            ProbabilityHashValue transitions;
            if (!wordDict.TryGetValue(key, out transitions))
            {
                transitions = new ProbabilityHashValue();
                wordDict[key] = transitions;
            }

            transitions.Add(word);
        }
コード例 #2
0
        /// <summary>
        /// Registers a complete state, together with its transition edges, in the hash.
        /// 
        /// Intended for loading serialized data into a probability hash that has
        /// already been finalized.
        /// </summary>
        /// <param name="key">The state to register; its topic is recorded as a subtopic if new.</param>
        /// <param name="value">The transition edges stored for <paramref name="key"/>.</param>
        /// <exception cref="InvalidOperationException">The probability hash is not yet finalized.</exception>
        public void Add(ProbabilityHashKey key, ProbabilityHashValue value)
        {
            // Serialized entries already carry their weights, so they may only
            // be loaded into a hash that is in the finalized state.
            if (!finalized)
            {
                throw new InvalidOperationException("Probability hash is not yet finalized");
            }

            // Remember the topic the first time it is encountered.
            var topic = key.Topic;
            var topicIsKnown = Subtopics.Contains(topic);
            if (!topicIsKnown)
            {
                Subtopics.Add(topic);
            }

            wordDict.Add(key, value);
        }
コード例 #3
0
        /// <summary>
        /// Read an individual line from a serialized probability hash
        /// into a single Markov node and add it to the probability hash
        /// held by <paramref name="data"/>.
        /// </summary>
        /// <param name="data">
        /// Parsing state data; supplies the line to parse (<c>Line</c>) and the
        /// probability hash that receives the parsed node (<c>ProbabilityHash</c>).
        /// </param>
        /// <exception cref="InvalidDataException">
        /// The line does not split into exactly one key and one value, the key does not
        /// split into exactly one word list and one topic, the number of key words does
        /// not match the chain order, a value item does not split into exactly one word
        /// and one probability, or a probability is not a number.
        /// </exception>
        private static void ParseLine(CorpusParseData data)
        {
            // Line is split into two parts (key and value)
            var components = data.Line.Split(CorpusWriter.ListDelimiter);
            if (components.Length != 2)
                throw new InvalidDataException("Two list delimiters on a line");

            // The key is split in two parts (list of key words and a sub-topic)
            var keyComponents = components[0].Split(CorpusWriter.TopicDelimiter);
            if (keyComponents.Length != 2)
                throw new InvalidDataException("Two topic delimiters in a key");

            // There are n keywords for an n-order Markov chain
            var keyWords = keyComponents[0].Split(CorpusWriter.ItemDelimiter);
            if (keyWords.Length != data.ProbabilityHash.Order)
                throw new InvalidDataException("Invalid number of keywords");

            var topic = keyComponents[1];

            // Get the list of words which follow the key in the corpus, including
            // their probabilities.
            var valueItems = components[1].Split(CorpusWriter.ItemDelimiter);

            var key = new ProbabilityHashKey(keyWords.ToList(), topic);

            // Finalize the value because the probabilities are already
            // present in the value and don't need to be calculated later
            var value = new ProbabilityHashValue();
            value.Finalize();

            foreach (var valueItem in valueItems)
            {
                // Separate each word from its probability
                var valueComponents = valueItem.Split(CorpusWriter.ProbabilityDelimiter);
                if (valueComponents.Length != 2)
                    throw new InvalidDataException("Multiple probabilities for a word");

                // Serialized data is machine-readable, so parse it with the invariant
                // culture first; otherwise "0.5" is read as 5 under cultures that use
                // ',' as the decimal separator. The current-culture parse is kept as a
                // fallback so text the previous implementation accepted still loads.
                // NOTE(review): the writer is not visible here - confirm it emits
                // invariant-culture numbers, then the fallback can be dropped.
                var probabilityText = valueComponents[1];
                double probability;
                var parsed =
                    Double.TryParse(
                        probabilityText,
                        System.Globalization.NumberStyles.Float,
                        System.Globalization.CultureInfo.InvariantCulture,
                        out probability)
                    || Double.TryParse(probabilityText, out probability);

                if (!parsed)
                    throw new InvalidDataException("Probability is not a real number");

                value.Add(valueComponents[0], probability);
            }

            data.ProbabilityHash.Add(key, value);
        }
コード例 #4
0
        /// <summary>
        /// Find the closest key in the corpus to a provided key using longest common
        /// subsequence.
        /// 
        /// Only keys whose topic matches the ideal key's topic (ignoring case) are
        /// considered. Each candidate is scored by summing, per word position, the
        /// longest-common-subsequence length divided by the length of the ideal word;
        /// empty ideal words contribute nothing to the score. If no candidate scores
        /// above zero, a randomly chosen candidate is returned.
        /// </summary>
        /// <param name="idealKey">The key that was requested but is not present in the corpus.</param>
        /// <returns>The best matching key from the same topic.</returns>
        /// <exception cref="ArgumentOutOfRangeException">
        /// The corpus contains no keys for the topic of <paramref name="idealKey"/>.
        /// </exception>
        private async Task<ProbabilityHashKey> FindNextBestKeyAsync(ProbabilityHashKey idealKey)
        {
            // Only consider keys from the current subtopic. Topics are compared
            // ordinally (ignoring case) so the match does not depend on the current culture.
            var keys = wordDict.Keys
                .Where(key => string.Equals(key.Topic, idealKey.Topic, StringComparison.OrdinalIgnoreCase))
                .ToList();

            // Without candidates there is nothing to choose from; report that clearly
            // rather than letting the list indexer fail with an opaque message.
            if (keys.Count == 0)
                throw new ArgumentOutOfRangeException(
                    "idealKey",
                    "No keys exist for topic '" + idealKey.Topic + "'");

            // Start with a random key as a default if no good match is found. This prevents the
            // same key from being chosen over and over again in cases where matching keys repeatedly
            // cannot be found.
            // NOTE(review): a Random created per call is time-seeded on older .NET Framework
            // runtimes, so calls in quick succession can pick the same default; consider a
            // shared instance at class level.
            var random = new Random();
            var fallbackKey = keys[random.Next(keys.Count)];

            // Score every candidate in one background task; the scoring is CPU-bound and
            // each step depends on the previous one, so one task per word only added
            // scheduling overhead.
            return await Task.Run(() =>
            {
                var bestKey = fallbackKey;
                double bestMatchWeight = 0;

                foreach (var key in keys)
                {
                    double matchWeight = 0;
                    for (int i = 0; i < key.Length; i++)
                    {
                        var idealWord = idealKey[i];

                        // An empty ideal word would give 0 / 0.0 = NaN, which poisons the
                        // whole weight and makes every comparison below false.
                        if (idealWord.Length == 0)
                            continue;

                        var lcs = key[i].LongestCommonSubsequenceWith(idealWord);
                        matchWeight += lcs / (double)idealWord.Length;
                    }

                    if (matchWeight > bestMatchWeight)
                    {
                        bestMatchWeight = matchWeight;
                        bestKey = key;
                    }
                }

                return bestKey;
            });
        }
コード例 #5
0
        /// <summary>
        /// Produces a random word from the text corpus, chosen from the
        /// transitions recorded for the previous n words within a topic.
        /// 
        /// When those words and topic do not form a key that exists in the
        /// corpus, the key returned by <c>FindNextBestKeyAsync</c> is used instead.
        /// </summary>
        /// <param name="previousWords">The most recently used words, forming the lookup state.</param>
        /// <param name="topic">The topic the lookup key is built with.</param>
        /// <returns>A random word drawn from the value stored for the resolved key.</returns>
        public async Task<string> GetRandomWordAsync(List<string> previousWords, string topic)
        {
            var requestedKey = new ProbabilityHashKey(previousWords, topic);

            // Use the exact state when the corpus has it; otherwise fall
            // back to the closest key that can be found.
            var resolvedKey = wordDict.ContainsKey(requestedKey)
                ? requestedKey
                : await FindNextBestKeyAsync(requestedKey);

            var transitions = wordDict[resolvedKey];
            return transitions.GetRandomWord();
        }
コード例 #6
0
 /// <summary>
 /// Gets the word list (word paired with a double) stored for the given key.
 /// </summary>
 /// <param name="key">The state whose stored words are requested.</param>
 /// <returns>The <c>Words</c> list of the value stored under <paramref name="key"/>.</returns>
 public List<KeyValuePair<string, double>> this[ProbabilityHashKey key]
 {
     get
     {
         var entry = wordDict[key];
         return entry.Words;
     }
 }