/// <summary>
/// Smoke test for per-node attributes: clears the model, preprocesses a fixed
/// sentence, tags every node whose text contains "i" with "perNodeAttribute",
/// adds the chain together with two global attributes and finally looks the
/// same sentence up again. Nothing is asserted; the lookup result is only
/// held in a local for inspection.
/// </summary>
private void TestPerInputNodeAttributes()
{
    const string inputString = "This string contains per node attributes.";

    // Attributes applied to the whole chain when it is added.
    Dictionary<string, double> globalAttributes = new Dictionary<string, double>
    {
        { "globalAttribute1", 1 },
        { "globalAttribute2", 1 }
    };

    // Start from an empty model.
    _markovChainString.Children.Clear();

    MarkovChainInputString preprocessed = Preprocess(inputString, int.MaxValue);

    for (int i = 0; i < preprocessed.Nodes.Count; i++)
    {
        MarkovChainInputNodeString node = preprocessed.Nodes[i];

        if (!node.String.Contains("i"))
        {
            continue;
        }

        node.Attributes.Add("perNodeAttribute", 1);
    }

    AddMarkovChainString(preprocessed, false, globalAttributes);

    MarkovChainString lookupResult = GetMarkovChainString(inputString, false, false);
}
/// <summary>
/// Splits <paramref name="input"/> into words via
/// <c>UserDefinedFunctions.ExtractWords</c> (using the instance's
/// <c>_extractText</c> / <c>_extractDistinctWords</c> settings) and wraps each
/// word in a <see cref="MarkovChainInputNodeString"/> with an empty attribute
/// dictionary.
/// </summary>
/// <param name="input">Text to split into words.</param>
/// <param name="maximumNumberOfNodesToProcess">
/// Upper bound on the number of nodes returned; zero or a negative value
/// yields an empty node list.
/// </param>
/// <returns>
/// A new <see cref="MarkovChainInputString"/> whose <c>Nodes</c> holds at most
/// <paramref name="maximumNumberOfNodesToProcess"/> entries, in extraction order.
/// </returns>
public MarkovChainInputString Preprocess(string input, int maximumNumberOfNodesToProcess)
{
    MarkovChainInputString markovChainInputString = new MarkovChainInputString();
    markovChainInputString.Nodes = new List<MarkovChainInputNodeString>();

    foreach (string word in UserDefinedFunctions.ExtractWords(input, _extractText, _extractDistinctWords))
    {
        // Stop as soon as the cap is reached instead of building a node for
        // every word and truncating (and copying) the list afterwards.
        if (markovChainInputString.Nodes.Count >= maximumNumberOfNodesToProcess)
        {
            break;
        }

        MarkovChainInputNodeString markovChainInputNodeString = new MarkovChainInputNodeString
        {
            String = word,
            Attributes = new Dictionary<string, double>()
        };

        markovChainInputString.Nodes.Add(markovChainInputNodeString);
    }

    return markovChainInputString;
}
/// <summary>
/// Inserts the words of <paramref name="markovChainInputString"/> as one chain
/// below the root node (<c>MarkovChainString</c>): missing child nodes are
/// created, and <c>ChainCount</c> is incremented on the root and on every node
/// the chain passes through.
/// </summary>
/// <param name="markovChainInputString">
/// Preprocessed input. When the model is not case sensitive the node strings
/// are lower-cased in place, i.e. the caller's object is modified. Only the
/// nodes' <c>String</c> is read here; the nodes' own <c>Attributes</c> are not
/// used by this method.
/// </param>
/// <param name="addMarkovChainBreaks">
/// Experimental. When true, the node of every word after the first is also
/// registered as a direct child of the root if the root has no child under
/// that word yet.
/// </param>
/// <param name="attributes">
/// Attributes merged (via <c>ManageAttributes</c>) into the root once per call
/// and into every node of the chain. May be null, in which case nothing is merged.
/// </param>
/// <returns>
/// The root node, or null when the input is null, has no nodes, or its
/// <c>ToString()</c> is null or empty.
/// </returns>
public MarkovChainNodeString AddMarkovChainString(MarkovChainInputString markovChainInputString, bool addMarkovChainBreaks, Dictionary<string, double> attributes)
{
    if (markovChainInputString == null || markovChainInputString.Nodes == null || markovChainInputString.Nodes.Count == 0 || string.IsNullOrEmpty(markovChainInputString.ToString()))
    {
        return null;
    }

    MarkovChainNodeString parent = null;
    MarkovChainNodeString currentMarkovChainNodeString = null;
    bool isFirstWord = true;
    string lastWord = null;

    if (!_isCaseSensitive)
    {
        for (int i = 0; i < markovChainInputString.Nodes.Count; i++)
        {
            markovChainInputString.Nodes[i].String = markovChainInputString.Nodes[i].String.ToLowerInvariant();
        }
    }

    // 1-based position of the word inside this input.
    int index = 1;

    // Backslash-terminated concatenation of the words seen so far.
    string path = null;

    foreach (MarkovChainInputNodeString markovChainInputNodeString in markovChainInputString.Nodes)
    {
        string word2 = string.Intern(markovChainInputNodeString.String.Trim());
        lastWord = word2;
        path += word2 + "\\";

        if (isFirstWord)
        {
            // The first word hangs directly off the root.
            isFirstWord = false;

            if (!MarkovChainString.Children.ContainsKey(word2))
            {
                MarkovChainNodeString markovChainNodeString = new MarkovChainNodeString(typeof(MCStorageString), false, _onDiskDirectoryBasePath, path);
                MarkovChainString.Children.Add(word2, markovChainNodeString);
            }

            MarkovChainString.ChainCount++;
            ManageAttributes(MarkovChainString, attributes);
            MarkovChainString.Update();

            currentMarkovChainNodeString = MarkovChainString.Children[word2];
            currentMarkovChainNodeString.ChainCount++;
            currentMarkovChainNodeString.Children.Path = path;
            currentMarkovChainNodeString.IsInputBoundary = false;
            currentMarkovChainNodeString.IsWordBoundary = true;

            // The Parent back-reference is deliberately not assigned so the
            // model can be serialized to JSON; it should be restored once
            // disk-backed storage is added.
            currentMarkovChainNodeString.Index = index++;
            ManageAttributes(currentMarkovChainNodeString, attributes);

            // Write the node back through the indexer after mutating it.
            // NOTE(review): presumably required for storage implementations
            // that do not hand out live references - confirm.
            MarkovChainString.Children[word2] = currentMarkovChainNodeString;

            continue;
        }

        if (!currentMarkovChainNodeString.Children.ContainsKey(word2))
        {
            MarkovChainNodeString markovChainNodeString = new MarkovChainNodeString(typeof(MCStorageString), false, _onDiskDirectoryBasePath, path);
            currentMarkovChainNodeString.Children.Add(word2, markovChainNodeString);
        }

        // Descend one level.
        parent = currentMarkovChainNodeString;
        currentMarkovChainNodeString = currentMarkovChainNodeString.Children[word2];
        currentMarkovChainNodeString.Path = path;
        currentMarkovChainNodeString.ChainCount++;
        currentMarkovChainNodeString.Children.Path = path;
        currentMarkovChainNodeString.IsInputBoundary = false;
        currentMarkovChainNodeString.IsWordBoundary = true;

        // The Parent back-reference is deliberately not assigned so the model
        // can be serialized to JSON; it should be restored once disk-backed
        // storage is added.
        currentMarkovChainNodeString.Index = index++;
        ManageAttributes(currentMarkovChainNodeString, attributes);

        if (addMarkovChainBreaks)
        {
            // Experimental: also make the node reachable from the root.
            if (!MarkovChainString.Children.ContainsKey(word2))
            {
                MarkovChainString.Children.Add(word2, currentMarkovChainNodeString);
            }
            else
            {
                // Merges the node's own attributes into itself, which doubles
                // each value it currently holds.
                // NOTE(review): confirm this self-merge is intended.
                ManageAttributes(currentMarkovChainNodeString, currentMarkovChainNodeString.Attributes);
            }
        }

        parent.Children[word2] = currentMarkovChainNodeString;
    }

    // Flag the final node of a multi-word input as the end of an input.
    // NOTE(review): for a single-word input 'parent' is still null, so that
    // node is never flagged as an input boundary - confirm this is intended.
    if (currentMarkovChainNodeString != null && parent != null && !string.IsNullOrEmpty(lastWord))
    {
        currentMarkovChainNodeString.IsInputBoundary = true;
        parent.Children[lastWord] = currentMarkovChainNodeString;
    }

    return MarkovChainString;
}

/// <summary>
/// Merges <paramref name="attributes"/> into the node's <c>Attributes</c>:
/// keys the node does not have yet are added, values of existing keys are
/// summed. Does nothing when <paramref name="attributes"/> is null; creates
/// the node's dictionary when it is null.
/// </summary>
/// <param name="currentMarkovChainNodeString">Node whose attributes are updated.</param>
/// <param name="attributes">Attributes to merge in; may be null.</param>
private void ManageAttributes(MarkovChainNodeString currentMarkovChainNodeString, Dictionary<string, double> attributes)
{
    if (attributes == null)
    {
        return;
    }

    if (currentMarkovChainNodeString.Attributes == null)
    {
        currentMarkovChainNodeString.Attributes = new Dictionary<string, double>();
    }

    // Snapshot the keys: 'attributes' may be the very dictionary being
    // updated (a caller passes the node's own Attributes), and writing to a
    // dictionary while enumerating it is not safe on every runtime.
    List<string> keys = attributes.Keys.ToList();

    foreach (string key in keys)
    {
        double increment = attributes[key];
        double existing;

        if (currentMarkovChainNodeString.Attributes.TryGetValue(key, out existing))
        {
            currentMarkovChainNodeString.Attributes[key] = existing + increment;
        }
        else
        {
            currentMarkovChainNodeString.Attributes.Add(key, increment);
        }
    }
}

/// <summary>
/// Rebuilds the child storage of every node in the model, starting at the
/// root, from its children ordered by descending chain count.
/// </summary>
public void PrepareMarkovChainString()
{
    PrepareMarkovChainString(MarkovChainString);
}

/// <summary>
/// Replaces the node's <c>Children</c> with a new
/// <c>MarkovChainNodeStorageInMemoryDictionaryString</c> built from
/// <c>Children.OrderByDescendingChainCount()</c>, then recurses into each child.
/// </summary>
/// <param name="markovChainNodeString">Node whose subtree is rebuilt.</param>
private void PrepareMarkovChainString(MarkovChainNodeString markovChainNodeString)
{
    markovChainNodeString.Children = new MarkovChainNodeStorageInMemoryDictionaryString(markovChainNodeString.Children.OrderByDescendingChainCount(), _onDiskDirectoryBasePath);

    foreach (KeyValuePair<string, MarkovChainNodeString> keyValuePair in markovChainNodeString.Children)
    {
        PrepareMarkovChainString(keyValuePair.Value);
    }
}

/// <summary>
/// Follows the words of <paramref name="input"/> through the model and,
/// optionally, keeps extending the chain along the first child returned by
/// <c>OrderByDescendingChainCount()</c> until a node without children is reached.
/// </summary>
/// <remarks>
/// <para>
/// Matching: for each extracted word, the walk follows the current node's
/// child for that word. If there is none but the root has one, the walk
/// restarts at the root and the result is marked <c>IsBrokenChain</c>. If
/// neither has one, matching stops; when the walk is at the root at that
/// moment the result is returned immediately, without <c>String</c>,
/// <c>ChainCount</c>, <c>ChainCountTotal</c> or the <c>*Known</c> totals set.
/// </para>
/// <para>
/// <c>Nodes</c> receives the node each step starts from (the root first); the
/// last node reached is not appended.
/// </para>
/// <para>
/// Side effects on the model: <c>Path</c> and <c>Children.Path</c> of every
/// node stepped onto are overwritten with the path accumulated during this
/// lookup, and <c>ChainCount</c> is decremented when
/// <paramref name="decrementChainCount"/> is true.
/// </para>
/// </remarks>
/// <param name="input">Text to look up; lower-cased first when the model is not case sensitive.</param>
/// <param name="decrementChainCount">When true, decrements <c>ChainCount</c> of every node stepped onto.</param>
/// <param name="alwaysContinueToEndOfChain">
/// When true, the chain is extended past the input even if every input word
/// was matched; otherwise it is extended only when the number of recorded
/// nodes differs from the number of input words.
/// </param>
/// <returns>
/// Null when <paramref name="input"/> is null or empty; otherwise the lookup
/// result. When the model has no root children, only <c>IsBrokenChain</c>,
/// <c>Nodes</c> (empty) and <c>Input</c> are populated.
/// </returns>
public MarkovChainString GetMarkovChainString(string input, bool decrementChainCount, bool alwaysContinueToEndOfChain)
{
    if (string.IsNullOrEmpty(input))
    {
        return null;
    }

    MarkovChainString markovChainString = new MarkovChainString();
    markovChainString.IsBrokenChain = false;
    markovChainString.Nodes = new List<MarkovChainNodeString>();
    markovChainString.Input = input;

    if (MarkovChainString.Children.Count == 0)
    {
        return markovChainString;
    }

    bool continueProcessing = true;
    MarkovChainNodeString startingNodeString = MarkovChainString;
    decimal chainCount = 0;
    decimal chainCountTotal = 0;
    string path = null;
    StringBuilder chainString = new StringBuilder();

    if (!_isCaseSensitive)
    {
        input = input.ToLowerInvariant();
    }

    // Steps from the current node onto its child for 'word' and accumulates
    // the counters. Returns false if anything throws.
    // NOTE(review): the exception is swallowed and callers ignore the result;
    // kept as-is because it looks like a deliberate best-effort - confirm.
    Func<string, bool> processWord = word =>
    {
        try
        {
            markovChainString.Nodes.Add(startingNodeString);
            chainCount += startingNodeString.Children[word].ChainCount;
            chainCountTotal += startingNodeString.Children.Values.Sum(_ => _.ChainCount);
            chainString.Append(word + " ");
            startingNodeString = startingNodeString.Children[word];

            if (decrementChainCount)
            {
                startingNodeString.ChainCount--;
            }

            startingNodeString.Path = path;
            startingNodeString.Children.Path = path;
        }
        catch (Exception)
        {
            return false;
        }

        return true;
    };

    List<string> words = UserDefinedFunctions.ExtractWords(input, _extractText, _extractDistinctWords).Cast<string>().ToList();

    foreach (string word in words)
    {
        path += word + "\\";

        if (startingNodeString.Children.ContainsKey(word))
        {
            processWord(word);
        }
        else if (MarkovChainString.Children.ContainsKey(word))
        {
            // The chain breaks here; restart from the root.
            startingNodeString = MarkovChainString;
            markovChainString.IsBrokenChain = true;
            processWord(word);
        }
        else
        {
            if (startingNodeString == MarkovChainString)
            {
                // Unknown word while at the root: return what was gathered so far.
                return markovChainString;
            }

            break;
        }
    }

    // One node is recorded per matched word, so equal counts mean every
    // input word was matched.
    if (markovChainString.Nodes.Count == words.Count)
    {
        continueProcessing = false;
    }

    // Totals for the part of the chain that came from the input itself.
    markovChainString.ChainCountKnown = chainCount;
    markovChainString.ChainCountTotalKnown = chainCountTotal;

    if (continueProcessing || alwaysContinueToEndOfChain)
    {
        KeyValuePair<string, MarkovChainNodeString> keyValuePair = startingNodeString.Children.OrderByDescendingChainCount().FirstOrDefault();

        // FirstOrDefault yields a pair with a null key once a node has no children.
        while (keyValuePair.Key != null && startingNodeString.Children.Values.Count != 0)
        {
            markovChainString.Nodes.Add(startingNodeString);
            chainCount += startingNodeString.Children[keyValuePair.Key].ChainCount;
            chainCountTotal += startingNodeString.Children.Values.Sum(_ => _.ChainCount);
            chainString.Append(keyValuePair.Key + " ");
            startingNodeString = startingNodeString.Children[keyValuePair.Key];

            if (decrementChainCount)
            {
                keyValuePair.Value.ChainCount--;
            }

            path += keyValuePair.Key + "\\";
            startingNodeString.Path = path;
            startingNodeString.Children.Path = path;
            keyValuePair = startingNodeString.Children.OrderByDescendingChainCount().FirstOrDefault();
        }
    }

    markovChainString.String = chainString.ToString().Trim();
    markovChainString.ChainCount = chainCount;
    markovChainString.ChainCountTotal = chainCountTotal;

    return markovChainString;
}