コード例 #1
0
        /// <summary>
        /// Add a new word (for a particular subtopic) to the hash, recording that
        /// <paramref name="word"/> followed the state made up of
        /// <paramref name="previousWords"/>.
        /// 
        /// This should be used while parsing raw text.
        /// </summary>
        /// <param name="word">The word that follows the state.</param>
        /// <param name="subtopic">
        /// Subtopic the state belongs to. Every "-" is replaced with a space
        /// before the subtopic is stored or used in the key.
        /// </param>
        /// <param name="previousWords">
        /// The words forming the state; must contain exactly Order entries.
        /// </param>
        /// <exception cref="InvalidOperationException">
        /// The probability hash has already been finalized.
        /// </exception>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="previousWords"/> or <paramref name="subtopic"/> is null.
        /// </exception>
        /// <exception cref="ArgumentException">
        /// The number of previous words does not match the order.
        /// </exception>
        public void Add(string word, string subtopic, List<string> previousWords)
        {
            // Once the probability hash is finalized, adding new words
            // will mess up the weighting of state transition edges.
            if (finalized)
                throw new InvalidOperationException("Probability hash is already finalized");

            // Fail with an argument exception rather than a bare
            // NullReferenceException when the state is missing.
            if (previousWords == null)
                throw new ArgumentNullException("previousWords");

            // Only n words can be used as a state in an n-order chain
            if (previousWords.Count != Order)
                throw new ArgumentException("Number of previous words does not match order");

            if (subtopic == null)
                throw new ArgumentNullException("subtopic");

            // The subtopic is likely coming from a raw text file, whose
            // filenames are mangled to remove spaces.
            subtopic = subtopic.Replace("-", " ");

            if (!Subtopics.Contains(subtopic))
                Subtopics.Add(subtopic);

            var key = new ProbabilityHashKey(previousWords, subtopic);

            // Single lookup: fetch the existing value, or create and register
            // a fresh one the first time this state is seen.
            ProbabilityHashValue followers;
            if (!wordDict.TryGetValue(key, out followers))
            {
                followers = new ProbabilityHashValue();
                wordDict[key] = followers;
            }

            followers.Add(word);
        }
コード例 #2
0
        /// <summary>
        /// Registers an already-built state, together with its transition
        /// edges, in the hash.
        /// 
        /// Intended for loading serialized data into a probability hash that
        /// has already been finalized.
        /// </summary>
        /// <param name="key">The state; its topic is recorded as a subtopic if not yet known.</param>
        /// <param name="value">The transition edges stored for <paramref name="key"/>.</param>
        /// <exception cref="InvalidOperationException">
        /// The probability hash has not been finalized yet.
        /// </exception>
        public void Add(ProbabilityHashKey key, ProbabilityHashValue value)
        {
            // Serialized data already carries its weighting, so it may only
            // be loaded into a hash that is in the finalized state.
            if (!finalized)
            {
                throw new InvalidOperationException("Probability hash is not yet finalized");
            }

            // Remember the topic the first time it shows up.
            var topic = key.Topic;
            var isKnownTopic = Subtopics.Contains(topic);
            if (!isKnownTopic)
            {
                Subtopics.Add(topic);
            }

            wordDict.Add(key, value);
        }
コード例 #3
0
        /// <summary>
        /// Parses the current line of a serialized probability hash and adds
        /// the resulting key/value pair to the hash being built.
        /// </summary>
        /// <param name="data">Parsing state; supplies the line to read and the hash to fill</param>
        /// <exception cref="InvalidDataException">
        /// The line does not split into the expected number of parts, or a
        /// probability cannot be parsed as a number.
        /// </exception>
        private static void ParseLine(CorpusParseData data)
        {
            // A line must split into exactly a key half and a value half
            var halves = data.Line.Split(CorpusWriter.ListDelimiter);
            if (halves.Length != 2)
            {
                throw new InvalidDataException("Two list delimiters on a line");
            }

            // The key half must split into the key words and the sub-topic
            var keyParts = halves[0].Split(CorpusWriter.TopicDelimiter);
            if (keyParts.Length != 2)
            {
                throw new InvalidDataException("Two topic delimiters in a key");
            }

            // An n-order Markov chain uses exactly n words per key
            var stateWords = keyParts[0].Split(CorpusWriter.ItemDelimiter);
            if (stateWords.Length != data.ProbabilityHash.Order)
            {
                throw new InvalidDataException("Invalid number of keywords");
            }

            var key = new ProbabilityHashKey(stateWords.ToList(), keyParts[1]);

            // The value is finalized up front: the probabilities are read
            // straight from the line and don't need to be calculated later
            var value = new ProbabilityHashValue();
            value.Finalize();

            // Each entry is a following word paired with its probability
            var entries = halves[1].Split(CorpusWriter.ItemDelimiter);
            for (var i = 0; i < entries.Length; i++)
            {
                var pair = entries[i].Split(CorpusWriter.ProbabilityDelimiter);
                if (pair.Length != 2)
                {
                    throw new InvalidDataException("Multiple probabilities for a word");
                }

                // NOTE(review): this overload parses with the current culture;
                // confirm the writer formats probabilities the same way.
                double probability;
                var parsed = Double.TryParse(pair[1], out probability);
                if (!parsed)
                {
                    throw new InvalidDataException("Probability is not a real number");
                }

                value.Add(pair[0], probability);
            }

            data.ProbabilityHash.Add(key, value);
        }