Example #1
        /// <summary>
        ///  Threaded worker for scanning webpages. Automatically creates other copies
        ///  of itself and divides work for quicker scanning.
        /// </summary>
        /// <param name="id"></param>
        /// <param name="url"></param>
        /// <param name="startDepth"></param>
        /// <param name="sidedness"></param>
        /// <param name="data"></param>
        public CrawlWorker(int id, string url, int startDepth, bool sidedness, CrawlStruct data)
        {
            this.id           = id;
            this.startURL     = url;
            this.workingURL   = url;
            this.sidedness    = sidedness;
            this.startDepth   = startDepth;
            this.workingDepth = startDepth;
            this.data         = data;

            if (data.iterative)
            {
                this.workingURL += data.iteratorStart;
            }

            downloadManager   = new DownloadManager(data);
            this.pagesCrawled = 0;

            webStringUtils = new WebStringUtils(data.outputFolder);
            //Constructs string name from id and sidedness
            StringBuilder nameBuilder = new StringBuilder("Worker ").Append(id);

            if (!sidedness)
            {
                nameBuilder.Append(" (Left)");
            }
            else
            {
                nameBuilder.Append(" (Right)");
            }
            this.threadName = nameBuilder.ToString();
            Console.WriteLine("Creating " + threadName);
        }
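
For reference, a caller might construct workers like the one above as follows. This is only a usage sketch: CrawlStruct's definition is not shown here, so the GetCrawlSettings helper, the 'data' variable, and the example URL are assumptions based on how the constructor reads its parameters.

        // Hypothetical usage sketch; 'data' stands in for an already-populated CrawlStruct,
        // whose exact definition is not part of this example.
        CrawlStruct data = GetCrawlSettings(); // assumed helper that fills in the crawl options

        // id 0, start depth 0, sidedness false => thread named "Worker 0 (Left)"
        CrawlWorker leftWorker  = new CrawlWorker(0, "https://example.com/", 0, false, data);
        // Same id with sidedness true => "Worker 0 (Right)", as in the iterative branch of Crawl()
        CrawlWorker rightWorker = new CrawlWorker(0, "https://example.com/", 0, true, data);
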
        /// <summary>
        ///  Entry point for a crawl: clears old output if requested, spawns the crawl
        ///  workers, tracks progress, optionally prints a sentence from the generated
        ///  Markov chain, and saves the scan state.
        /// </summary>
        /// <param name="urlString">URL the crawl starts from.</param>
        /// <param name="data">Shared crawl settings.</param>
        private static void Crawl(string urlString, CrawlStruct data)
        {
            //Clears old crawl content
            if (data.overwrite)
            {
                try
                {
                    Directory.Delete(data.outputFolder, true);
                    Console.WriteLine("Deleted old scan files.");
                }
                catch (IOException)
                {
                    Console.WriteLine("First scan - no files to delete.");
                }
            }
            Directory.CreateDirectory(data.outputFolder);
            //Spawns the crawl workers
            if (shouldCrawl)
            {
                if (data.iterative)
                {
                    CrawlWorker evenCrawl = new CrawlWorker(0, urlString, 0, false, data);
                    CrawlWorker oddCrawl  = new CrawlWorker(0, urlString, 0, true, data);
                    SpawnCrawler(evenCrawl);
                    SpawnCrawler(oddCrawl);
                }
                else
                {
                    //Defaults to searching the left side
                    CrawlWorker crawlWorker = new CrawlWorker(0, urlString, 0, false, data);
                    SpawnCrawler(crawlWorker);
                }
                //Automatic saving
                Thread saveThread = new Thread(() => {
                    while (true)
                    {
                        Thread.Sleep((int)TimeSpan.FromMinutes(SAVERATE).TotalMilliseconds);
                        saveScanState();
                    }
                });
                saveThread.IsBackground = true;
                saveThread.Start();
                //Checks pages crawled
                Thread titleThread = new Thread(() => {
                    while (true)
                    {
                        Console.Title = ("Scanned " + pagesCrawled + " pages (updates infrequently), saved " + timesSaved + " backups of pages visited");
                        Thread.Sleep((int)TimeSpan.FromSeconds(1).TotalMilliseconds);
                    }
                });
                titleThread.IsBackground = true;
                titleThread.Start();
                //Waits briefly for the workers to start, then blocks until they complete
                Thread.Sleep(1000);
                while (crawlTasks.Count > 0)
                {
                    Task<int> finishedTask = crawlTasks.Dequeue();
                    pagesCrawled += finishedTask.Result;
                }
                SaveQueue.killService();
                Console.WriteLine("Done! Scanned " + pagesCrawled + " pages.");
            }
            //Starts chain generation
            if (printMarkov)
            {
                Console.WriteLine("Printing sentence from generated chain... ");
                markovChain.addWords(File.ReadAllText(data.outputFolder + "textDigest.txt"));
                markovChain.generateSentence(markovSentences);
            }
            //Save scan results
            saveScanState();
        }
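
Crawl() above depends on a SpawnCrawler helper and a crawlTasks queue that are not included in this example. A minimal sketch of what they might look like, assuming each worker exposes a Run() method returning the number of pages it crawled (the method name and the Task-based threading are assumptions, not the original implementation):

        // Hypothetical sketch; only the usage in Crawl() (Dequeue + .Result) is taken from this page.
        // Requires System.Collections.Generic and System.Threading.Tasks.
        private static readonly Queue<Task<int>> crawlTasks = new Queue<Task<int>>();

        private static void SpawnCrawler(CrawlWorker worker)
        {
            // Runs the worker on the thread pool and keeps its result task so that
            // Crawl() can later add finishedTask.Result to pagesCrawled.
            Task<int> crawlTask = Task.Run(() => worker.Run());
            crawlTasks.Enqueue(crawlTask);
        }
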
        /// <summary>
        ///  Sets up the shared crawl data, the string utilities, and the save queue
        ///  used while downloading.
        /// </summary>
        /// <param name="data">Shared crawl settings (output folder, etc.).</param>
        public DownloadManager(CrawlStruct data)
        {
            DownloadManager.data = data;
            webStringUtils       = new WebStringUtils(data.outputFolder);
            saveQueue            = new SaveQueue();
        }