Exemplo n.º 1
0
        /// <summary>
        /// Splits <paramref name="input"/> into words via UserDefinedFunctions.ExtractWords (using the
        /// _extractText and _extractDistinctWords settings) and wraps each word in a node that carries
        /// an empty attribute dictionary.
        /// </summary>
        /// <param name="input">The text handed to the word extractor.</param>
        /// <param name="maximumNumberOfNodesToProcess">Upper bound on the number of nodes returned; words beyond this count are dropped.</param>
        /// <returns>A new MarkovChainInputString whose Nodes list holds at most <paramref name="maximumNumberOfNodesToProcess"/> entries, in extraction order.</returns>
        public MarkovChainInputString Preprocess(string input, int maximumNumberOfNodesToProcess)
        {
            MarkovChainInputString markovChainInputString = new MarkovChainInputString();

            markovChainInputString.Nodes = new List<MarkovChainInputNodeString>();

            // Apply the limit to the word sequence itself so that no node (and no attribute
            // dictionary) is allocated for a word that would be thrown away afterwards.
            IEnumerable<string> words = UserDefinedFunctions.ExtractWords(input, _extractText, _extractDistinctWords)
                                                            .Cast<string>()
                                                            .Take(maximumNumberOfNodesToProcess);

            foreach (string word in words)
            {
                MarkovChainInputNodeString node = new MarkovChainInputNodeString();

                node.String     = word;
                node.Attributes = new Dictionary<string, double>();

                markovChainInputString.Nodes.Add(node);
            }

            return markovChainInputString;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Stress test for two DistributedCache_Accessor instances rooted at c:\DeleteMe.
        /// For each of 10 iterations it downloads pages 1..10000 from the local test site, extracts the
        /// words of each page, creates one directory per (word, page, iteration) under the inverted-index
        /// cache root and writes the page's word set to the words-per-URI cache. Per-document and
        /// corpus-wide timings are emitted through Debug.Print.
        /// </summary>
        /// <remarks>
        /// Requires the test site at http://localhost:56830/Test/ to be running; it performs real
        /// network and file-system I/O and makes no assertions.
        /// </remarks>
        public void TestFullTextDistributedCacheStorageStress()
        {
            string directoryRoot1 = "c:\\DeleteMe\\WordsPerAbsoluteUri";
            string directoryRoot2 = "c:\\DeleteMe\\InvertedIndexPerWord";

            // Cleanup of artifacts from earlier runs is currently disabled:
            //Directories.DeleteFilesInDirectory(directoryRoot1 + "\\arachnode.cache");
            //Directories.DeleteFilesInDirectory(directoryRoot2 + "\\arachnode.cache");

            DistributedCache_Accessor wordsPerAbsoluteUriDistributedCache = new DistributedCache_Accessor(directoryRoot1);

            wordsPerAbsoluteUriDistributedCache.UseSlidingWindowCache = false;

            DistributedCache_Accessor invertedIndexPerWordDistributedCache = new DistributedCache_Accessor(directoryRoot2);

            invertedIndexPerWordDistributedCache.UseSlidingWindowCache      = true;
            invertedIndexPerWordDistributedCache.CacheItemPriority          = CacheItemPriority.Normal;
            invertedIndexPerWordDistributedCache.CacheItemSlidingExpiration = TimeSpan.FromSeconds(10);

            Stopwatch perDocumentStopwatch = new Stopwatch();
            Stopwatch corpusStopwatch      = new Stopwatch();

            int iterations       = 10;
            int numberOfWebPages = 10000;

            double totalNumberOfWebPages = 0;

            // SHA1 and WebClient are IDisposable; release them even when a download or a
            // file-system call throws part-way through the run.
            using (SHA1 sha1 = SHA1.Create())
            using (WebClient webClient = new WebClient())
            {
                corpusStopwatch.Reset();
                corpusStopwatch.Start();

                for (int j = 1; j <= iterations; j++)
                {
                    for (int i = 1; i <= numberOfWebPages; i++)
                    {
                        totalNumberOfWebPages++;

                        string absoluteUri = "http://localhost:56830/Test/" + i + ".htm";

                        string downloadedString = webClient.DownloadString(absoluteUri);

                        IEnumerator enumerator = UserDefinedFunctions.ExtractWords(downloadedString, true, true).GetEnumerator();

                        // The download is excluded from the per-document timing.
                        perDocumentStopwatch.Reset();
                        perDocumentStopwatch.Start();

                        HashSet<string> wordsPerAbsoluteUri = new HashSet<string>();

                        // Only ever set by the disabled inverted-index code below.
                        bool writeWordsPerAbsoluteUri = false;

                        object o2 = wordsPerAbsoluteUriDistributedCache.Read("WORDS_" + absoluteUri + "_" + j, sha1);

                        // "true ||" forces re-indexing even when the words for this URI are already cached.
                        if (true || o2 == null)
                        {
                            while (enumerator.MoveNext())
                            {
                                string word = enumerator.Current.ToString();

                                wordsPerAbsoluteUri.Add(word);

                                // The inverted index is represented on disk as one directory per
                                // (word, page, iteration) beneath the word's own directory.
                                string directory = invertedIndexPerWordDistributedCache.GenerateFullTextUniqueDirectory(word, false, sha1);

                                string directory2 = directory + "\\" + invertedIndexPerWordDistributedCache.GetFileNameWithoutDirectory(absoluteUri + "_" + j, sha1);

                                if (!Directory.Exists(directory2))
                                {
                                    Directory.CreateDirectory(directory2);
                                }

                                // Disabled alternative: keep the inverted index as a cached HashSet per word.
                                //HashSet<string> invertedIndexPerWord;

                                //object o = invertedIndexPerWordDistributedCache.Read(enumerator.Current.ToString(), sha1);

                                //if (o == null)
                                //{
                                //    invertedIndexPerWord = new HashSet<string>();

                                //    invertedIndexPerWordDistributedCache.Write(enumerator.Current.ToString(), invertedIndexPerWord, sha1, false);
                                //}
                                //else
                                //{
                                //    invertedIndexPerWord = (HashSet<string>) o;
                                //}

                                //if (!invertedIndexPerWord.Contains(absoluteUri + "_" + j))
                                //{
                                //    invertedIndexPerWord.Add(absoluteUri + "_" + j);

                                //    writeWordsPerAbsoluteUri = true;
                                //}
                                //Debug.Print(invertedIndexPerWord.Count.ToString());
                            }

                            // "true ||" forces the write regardless of writeWordsPerAbsoluteUri.
                            if (true || writeWordsPerAbsoluteUri)
                            {
                                wordsPerAbsoluteUriDistributedCache.Write("WORDS_" + absoluteUri + "_" + j, wordsPerAbsoluteUri, sha1, true);

                                // Sliding expiration tracks the time the current document took, plus two seconds.
                                invertedIndexPerWordDistributedCache.CacheItemSlidingExpiration = TimeSpan.FromSeconds(perDocumentStopwatch.Elapsed.TotalSeconds + 2);
                            }
                        }

                        perDocumentStopwatch.Stop();

                        // Running document number, time for this document, average seconds per document so far.
                        Debug.Print((((j - 1) * numberOfWebPages) + i).ToString());
                        Debug.Print(perDocumentStopwatch.Elapsed.ToString());
                        Debug.Print((corpusStopwatch.Elapsed.TotalSeconds / totalNumberOfWebPages).ToString());
                    }

                    Debug.Print(j.ToString());
                    Debug.Print(corpusStopwatch.Elapsed.ToString());
                    Debug.Print((corpusStopwatch.Elapsed.TotalSeconds / totalNumberOfWebPages).ToString());
                }

                corpusStopwatch.Stop();
                Debug.Print(corpusStopwatch.Elapsed.ToString());
                Debug.Print((corpusStopwatch.Elapsed.TotalSeconds / totalNumberOfWebPages).ToString());
            }
        }
Exemplo n.º 3
0
        /// <summary>
        /// Exercises the chain with up to the first 1000 lines of "TextDatabases\Dracula.txt" in three
        /// timed passes: interleaved add + read-back (R/W), add only (W) and read only (R).
        /// Each line is normalized by joining its extracted words with single spaces; lines that
        /// normalize to an empty string are skipped. In DEBUG builds the throughput is written to the
        /// console every 100 exemplars, and every read-back is asserted to reproduce the exemplar
        /// (lower-cased when the chain is not case sensitive).
        /// </summary>
        private void TestLongFormInput()
        {
            const int numberOfLinesToTake = 1000;

            // Read and normalize the corpus once; all three passes work on exactly the same exemplars,
            // and this preparation was never part of the timed sections.
            string[] exemplars = File.ReadAllLines("TextDatabases\\Dracula.txt")
                                     .Take(numberOfLinesToTake)
                                     .Select(text => string.Join(" ", UserDefinedFunctions.ExtractWords(text, true, false).Cast<string>().ToArray()))
                                     .Where(text => !string.IsNullOrEmpty(text))
                                     .ToArray();

            Stopwatch stopwatch = new Stopwatch();

            int numberOfExemplars = 0;

            // Pass 1 (R/W): add each exemplar and read it back immediately, starting from an empty chain.
            _markovChainString.Children.Clear();

            foreach (string exemplar in exemplars)
            {
                numberOfExemplars++;

                stopwatch.Start();
                AddMarkovChainString(Preprocess(exemplar, int.MaxValue), false, null);

                MarkovChainString retrieved = GetMarkovChainString(exemplar, false, false);
                stopwatch.Stop();
#if DEBUG
                if (numberOfExemplars % 100 == 0)
                {
                    Console.WriteLine("TestLongFormInput: R/W :: " + (numberOfExemplars / stopwatch.Elapsed.TotalSeconds) + "/sec. :: " + numberOfExemplars);
                }
#endif
                // The null check turns a missing result into a failed assertion rather than a NullReferenceException.
                Debug.Assert(
                    retrieved != null && retrieved.String == (_isCaseSensitive ? exemplar.TrimEnd() : exemplar.ToLowerInvariant().TrimEnd()),
                    "TestLongFormInput: R/W read-back does not match exemplar: " + exemplar);
            }

            // Pass 2 (W): rebuild the chain from scratch, timing only the writes.
            _markovChainString.Children.Clear();

            numberOfExemplars = 0;
            stopwatch.Reset();

            foreach (string exemplar in exemplars)
            {
                numberOfExemplars++;

                stopwatch.Start();
                AddMarkovChainString(Preprocess(exemplar, int.MaxValue), false, null);
                stopwatch.Stop();
#if DEBUG
                if (numberOfExemplars % 100 == 0)
                {
                    Console.WriteLine("TestLongFormInput: W :: " + (numberOfExemplars / stopwatch.Elapsed.TotalSeconds) + "/sec. :: " + numberOfExemplars);
                }
#endif
            }

            // Pass 3 (R): read every exemplar back from the chain built in pass 2, timing only the reads.
            numberOfExemplars = 0;
            stopwatch.Reset();

            foreach (string exemplar in exemplars)
            {
                numberOfExemplars++;

                stopwatch.Start();
                MarkovChainString retrieved = GetMarkovChainString(exemplar, false, false);
                stopwatch.Stop();
#if DEBUG
                if (numberOfExemplars % 100 == 0)
                {
                    Console.WriteLine("TestLongFormInput: R :: " + (numberOfExemplars / stopwatch.Elapsed.TotalSeconds) + "/sec. :: " + numberOfExemplars);
                }
#endif
                Debug.Assert(
                    retrieved != null && retrieved.String == (_isCaseSensitive ? exemplar.TrimEnd() : exemplar.ToLowerInvariant().TrimEnd()),
                    "TestLongFormInput: R read-back does not match exemplar: " + exemplar);
            }
        }