/// <summary>
/// Turns raw text into the node list consumed by the Markov chain: one node per extracted word,
/// each starting with an empty attribute dictionary.
/// </summary>
/// <param name="input">Text passed to <c>UserDefinedFunctions.ExtractWords</c> together with this instance's <c>_extractText</c> and <c>_extractDistinctWords</c> settings.</param>
/// <param name="maximumNumberOfNodesToProcess">Upper bound on the number of nodes returned; zero or a negative value yields an empty node list.</param>
/// <returns>A new <c>MarkovChainInputString</c> whose <c>Nodes</c> holds at most <paramref name="maximumNumberOfNodesToProcess"/> nodes, in extraction order.</returns>
public MarkovChainInputString Preprocess(string input, int maximumNumberOfNodesToProcess)
{
    MarkovChainInputString markovChainInputString = new MarkovChainInputString();

    // The limit is applied while enumerating, so words beyond it never get a node or a dictionary
    // allocated for them, and the list is materialised exactly once.
    markovChainInputString.Nodes = UserDefinedFunctions.ExtractWords(input, _extractText, _extractDistinctWords)
        .Cast<string>()
        .Take(maximumNumberOfNodesToProcess)
        .Select(word => new MarkovChainInputNodeString
        {
            String = word,
            Attributes = new Dictionary<string, double>()
        })
        .ToList();

    return markovChainInputString;
}
/// <summary>
/// Stress run for the full-text storage path: downloads a fixed set of locally served pages several
/// times over, extracts their words, creates one directory per (word, document) pair and writes the
/// per-document word set to a distributed cache, printing timing figures along the way.
/// </summary>
/// <remarks>
/// Needs a web server answering at http://localhost:56830/Test/{i}.htm. The two caches are constructed
/// with roots under c:\DeleteMe (presumably where they write; confirm against DistributedCache).
/// Nothing is asserted; the output is the Debug.Print timing trace.
/// </remarks>
public void TestFullTextDistributedCacheStorageStress()
{
    string directoryRoot1 = "c:\\DeleteMe\\WordsPerAbsoluteUri";
    string directoryRoot2 = "c:\\DeleteMe\\InvertedIndexPerWord";

    // Clean-up of earlier runs, left disabled as in the original:
    //Directories.DeleteFilesInDirectory(directoryRoot1 + "\\arachnode.cache");
    //Directories.DeleteFilesInDirectory(directoryRoot2 + "\\arachnode.cache");

    DistributedCache_Accessor wordsPerAbsoluteUriDistributedCache = new DistributedCache_Accessor(directoryRoot1);
    wordsPerAbsoluteUriDistributedCache.UseSlidingWindowCache = false;

    DistributedCache_Accessor invertedIndexPerWordDistributedCache = new DistributedCache_Accessor(directoryRoot2);
    invertedIndexPerWordDistributedCache.UseSlidingWindowCache = true;
    invertedIndexPerWordDistributedCache.CacheItemPriority = CacheItemPriority.Normal;
    invertedIndexPerWordDistributedCache.CacheItemSlidingExpiration = TimeSpan.FromSeconds(10);

    Stopwatch perDocumentStopwatch = new Stopwatch();
    Stopwatch corpusStopwatch = new Stopwatch();

    int iterations = 10;
    int numberOfWebPages = 10000;

    // Kept as a double so the per-page averages below use floating-point division.
    double totalNumberOfWebPages = 0;

    // Both the hash algorithm and the web client are IDisposable; release them when the run ends,
    // including when a download or cache call throws.
    using (SHA1 sha1 = SHA1.Create())
    using (WebClient webClient = new WebClient())
    {
        corpusStopwatch.Reset();
        corpusStopwatch.Start();

        for (int j = 1; j <= iterations; j++)
        {
            for (int i = 1; i <= numberOfWebPages; i++)
            {
                totalNumberOfWebPages++;

                string absoluteUri = "http://localhost:56830/Test/" + i + ".htm";
                string downloadedString = webClient.DownloadString(absoluteUri);

                IEnumerable words = UserDefinedFunctions.ExtractWords(downloadedString, true, true);

                // The download is deliberately excluded from the per-document timing.
                perDocumentStopwatch.Reset();
                perDocumentStopwatch.Start();

                HashSet<string> wordsPerAbsoluteUri = new HashSet<string>();
                bool writeWordsPerAbsoluteUri = false;

                // The iteration number is part of the key, so every pass stores a fresh entry.
                string documentKey = absoluteUri + "_" + j;

                object o2 = wordsPerAbsoluteUriDistributedCache.Read("WORDS_" + documentKey, sha1);

                // 'true ||' forces the indexing path even when the document is already cached.
                if (true || o2 == null)
                {
                    // foreach (rather than a raw IEnumerator) disposes the enumerator if it is disposable.
                    foreach (object current in words)
                    {
                        string word = current.ToString();

                        wordsPerAbsoluteUri.Add(word);

                        // Directory-per-posting layout: <directory for the word>\<file name for the document>.
                        string directory = invertedIndexPerWordDistributedCache.GenerateFullTextUniqueDirectory(word, false, sha1);
                        string directory2 = directory + "\\" + invertedIndexPerWordDistributedCache.GetFileNameWithoutDirectory(documentKey, sha1);

                        if (!Directory.Exists(directory2))
                        {
                            Directory.CreateDirectory(directory2);
                        }

                        // Earlier HashSet-per-word inverted index, left disabled:
                        //HashSet<string> invertedIndexPerWord;
                        //object o = invertedIndexPerWordDistributedCache.Read(word, sha1);
                        //if (o == null)
                        //{
                        //    invertedIndexPerWord = new HashSet<string>();
                        //    invertedIndexPerWordDistributedCache.Write(word, invertedIndexPerWord, sha1, false);
                        //}
                        //else
                        //{
                        //    invertedIndexPerWord = (HashSet<string>) o;
                        //}
                        //if (!invertedIndexPerWord.Contains(documentKey))
                        //{
                        //    invertedIndexPerWord.Add(documentKey);
                        //    writeWordsPerAbsoluteUri = true;
                        //}
                        //Debug.Print(invertedIndexPerWord.Count.ToString());
                    }

                    // 'true ||' forces the write; the flag is only ever set by the disabled code above.
                    if (true || writeWordsPerAbsoluteUri)
                    {
                        wordsPerAbsoluteUriDistributedCache.Write("WORDS_" + documentKey, wordsPerAbsoluteUri, sha1, true);

                        // Sliding expiration is reset to the time this document has taken so far plus two seconds.
                        invertedIndexPerWordDistributedCache.CacheItemSlidingExpiration = TimeSpan.FromSeconds(perDocumentStopwatch.Elapsed.TotalSeconds + 2);
                    }
                }

                perDocumentStopwatch.Stop();

                // Running document number, time for this document, average seconds per document so far.
                Debug.Print((((j - 1) * numberOfWebPages) + i).ToString());
                Debug.Print(perDocumentStopwatch.Elapsed.ToString());
                Debug.Print((corpusStopwatch.Elapsed.TotalSeconds / totalNumberOfWebPages).ToString());
            }

            Debug.Print(j.ToString());
            Debug.Print(corpusStopwatch.Elapsed.ToString());
            Debug.Print((corpusStopwatch.Elapsed.TotalSeconds / totalNumberOfWebPages).ToString());
        }

        corpusStopwatch.Stop();

        Debug.Print(corpusStopwatch.Elapsed.ToString());
        Debug.Print((corpusStopwatch.Elapsed.TotalSeconds / totalNumberOfWebPages).ToString());
    }
}
/// <summary>
/// Feeds the first lines of TextDatabases\Dracula.txt through the chain in three passes
/// (interleaved write/read, write only, read only) and asserts in debug builds that each line
/// read back equals the line that was written.
/// </summary>
/// <remarks>
/// Throughput figures are written to the console in DEBUG builds only. The stopwatch brackets only
/// the chain calls, so loading and tokenising the file is never part of the reported rate.
/// </remarks>
private void TestLongFormInput()
{
    _markovChainString.Children.Clear();

    MarkovChainString mostMarkovChainUsingStrings = null;
    int numberOfExemplars = 0;
    Stopwatch stopwatch = new Stopwatch();
    int numberOfLinesToTake = 1000;

    // Load and normalise the corpus once. Each line is rebuilt as its extracted words joined by single
    // spaces; lines that yield no words are dropped. All three passes use the same input.
    string[] exemplars = File.ReadAllLines("TextDatabases\\Dracula.txt")
        .Take(numberOfLinesToTake)
        .Select(line => string.Join(" ", UserDefinedFunctions.ExtractWords(line, true, false).Cast<string>().ToArray()))
        .Where(normalizedLine => !string.IsNullOrEmpty(normalizedLine))
        .ToArray();

    // Pass 1: write each exemplar and read it straight back.
    foreach (string exemplar in exemplars)
    {
        numberOfExemplars++;

        stopwatch.Start();
        AddMarkovChainString(Preprocess(exemplar, int.MaxValue), false, null);
        mostMarkovChainUsingStrings = GetMarkovChainString(exemplar, false, false);
        stopwatch.Stop();

#if DEBUG
        if (numberOfExemplars % 100 == 0)
        {
            Console.WriteLine("TestLongFormInput: R/W :: " + (numberOfExemplars / stopwatch.Elapsed.TotalSeconds) + "/sec. :: " + numberOfExemplars);
        }
#endif

        // The comparison is against the lower-cased exemplar when the chain is not case sensitive.
        // The null check makes a failed lookup surface as an assertion, not a NullReferenceException.
        Debug.Assert(
            mostMarkovChainUsingStrings != null &&
            mostMarkovChainUsingStrings.String == (_isCaseSensitive ? exemplar.TrimEnd() : exemplar.ToLowerInvariant().TrimEnd()),
            "TestLongFormInput: the chain did not return the exemplar that was just added.");
    }

    // Pass 2: start from an empty chain and measure writes only.
    _markovChainString.Children.Clear();
    numberOfExemplars = 0;
    stopwatch.Reset();

    foreach (string exemplar in exemplars)
    {
        numberOfExemplars++;

        stopwatch.Start();
        AddMarkovChainString(Preprocess(exemplar, int.MaxValue), false, null);
        stopwatch.Stop();

#if DEBUG
        if (numberOfExemplars % 100 == 0)
        {
            Console.WriteLine("TestLongFormInput: W :: " + (numberOfExemplars / stopwatch.Elapsed.TotalSeconds) + "/sec. :: " + numberOfExemplars);
        }
#endif
    }

    // Pass 3: measure reads only, against the chain populated by pass 2.
    numberOfExemplars = 0;
    stopwatch.Reset();

    foreach (string exemplar in exemplars)
    {
        numberOfExemplars++;

        stopwatch.Start();
        mostMarkovChainUsingStrings = GetMarkovChainString(exemplar, false, false);
        stopwatch.Stop();

#if DEBUG
        if (numberOfExemplars % 100 == 0)
        {
            Console.WriteLine("TestLongFormInput: R :: " + (numberOfExemplars / stopwatch.Elapsed.TotalSeconds) + "/sec. :: " + numberOfExemplars);
        }
#endif

        Debug.Assert(
            mostMarkovChainUsingStrings != null &&
            mostMarkovChainUsingStrings.String == (_isCaseSensitive ? exemplar.TrimEnd() : exemplar.ToLowerInvariant().TrimEnd()),
            "TestLongFormInput: the chain did not return the exemplar that was previously added.");
    }
}