Example #1
        public void Crawl(object URL)
        {
            //Get URL from scheduler
            string url = (string)URL;

            Console.WriteLine("Crawler " + crawlerID + " got the following URL: {0}", url);

            //Check if URL is valid
            if (isURLValid(url))
            {
                //Console.WriteLine("valid URL");
                //Access URL text
                string data = urlText(url);
                //Console.WriteLine("Gets Text");

                //Send (data,date) tuple to Index Scheduler
                IndexScheduler indexScheduler = Program.indexScheduler;
                //Console.WriteLine("Waiting");
                indexScheduler.dataSema.WaitOne();
                //Console.WriteLine("Got it");
                indexScheduler.dataQueue.Enqueue(Tuple.Create(url, data, DateTime.Now));
                indexScheduler.dataSema.Release();
                Console.WriteLine("Sent data to Index Scheduler");

            }
            else
            {
                Program.database.addInvalidURL();
            }

            //Put self back into URL scheduler in both cases, so the crawler
            //pool does not shrink every time an invalid URL is drawn
            URLScheduler urlScheduler = Program.urlScheduler;
            urlScheduler.crawlerSema.WaitOne();
            urlScheduler.crawlerQueue.Enqueue(this);
            urlScheduler.crawlerSema.Release();
        }
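
        //isURLValid and urlText are not shown in this listing. A minimal sketch
        //of what they might look like, assuming System.Net and a plain
        //http/https scheme check (the real filtering rules may differ):
        private bool isURLValid(string url)
        {
            //Accept only absolute http/https URIs that parse cleanly
            Uri result;
            return Uri.TryCreate(url, UriKind.Absolute, out result)
                && (result.Scheme == Uri.UriSchemeHttp || result.Scheme == Uri.UriSchemeHttps);
        }

        private string urlText(string url)
        {
            //Download the raw page markup; an empty string signals failure
            using (var client = new System.Net.WebClient())
            {
                try { return client.DownloadString(url); }
                catch (System.Net.WebException) { return ""; }
            }
        }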
        public void Index(object urlDataDateTuple)
        {
            //Get Data from Indexer
            Tuple<string, string, DateTime> urlDataDate = (Tuple<string, string, DateTime>)urlDataDateTuple;

            Console.WriteLine("Indexer " + indexerID + " got the following URL: {0}", urlDataDate.Item1);

            //Process Data
            Tuple<Dictionary<string, int>, List<string>> frequenciesAndURLs = getFrequenciesAndURLs(urlDataDate.Item1, urlDataDate.Item2);

            //Send info to database
            URLData newData = new URLData(urlDataDate.Item1, urlDataDate.Item3, frequenciesAndURLs.Item1);

            Program.database.addURLData(newData);
            //Console.WriteLine("Sent info into database");

            //Send new URLs to Scheduler if they haven't been crawled
            URLScheduler scheduler = Program.urlScheduler;

            scheduler.queueSema.WaitOne();
            foreach (var newURL in frequenciesAndURLs.Item2)
            {
                scheduler.urlQueue.Enqueue(newURL);
            }
            scheduler.queueSema.Release();
            //Console.WriteLine("Sent new URL's to URL scheduler");

            //Put self back into Index Scheduler
            IndexScheduler indexScheduler = Program.indexScheduler;

            indexScheduler.indexerSema.WaitOne();
            indexScheduler.indexerQueue.Enqueue(this);
            indexScheduler.indexerSema.Release();

            //Debugging
            if (Program.debugMode)
            {
                Console.WriteLine(frequenciesAndURLs.Item2.Count + " URLs added to scheduler");
            }
        }
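
        //getFrequenciesAndURLs is also not part of this listing. A rough sketch,
        //assuming whitespace tokenization for the word counts and the same
        //href="..." regex the commented-out test code in Main experiments with
        //(requires using System.Text.RegularExpressions):
        private Tuple<Dictionary<string, int>, List<string>> getFrequenciesAndURLs(string url, string data)
        {
            //Count how often each whitespace-delimited token occurs
            var frequencies = new Dictionary<string, int>();
            foreach (var word in data.Split(new[] { ' ', '\t', '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries))
            {
                int count;
                frequencies.TryGetValue(word, out count);
                frequencies[word] = count + 1;
            }

            //Pull outgoing links out of href="..." attributes; the url parameter
            //could be used to resolve relative links, which this sketch skips
            var newURLs = new List<string>();
            foreach (Match match in Regex.Matches(data, "href=\"[a-zA-Z./:&\\d_-]+\""))
            {
                newURLs.Add(match.Value.Replace("href=\"", "").TrimEnd('"'));
            }

            return Tuple.Create(frequencies, newURLs);
        }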
        //Add old URLs to the scheduler again
        public void autoCache()
        {
            while (keepAutoCaching)
            {
                storageSema.WaitOne();
                //Re-cache only once the scheduler is no longer active
                if (!Program.urlScheduler.keepScheduling)
                {
                    foreach (var entry in urlData)
                    {
                        if (DateTime.Now.Subtract(entry.Value.dateModified).Days > days)
                        {
                            //Re-queue URL
                            URLScheduler scheduler = Program.urlScheduler;
                            scheduler.queueSema.WaitOne();
                            scheduler.urlQueue.Enqueue((string)entry.Key);
                            scheduler.queueSema.Release();
                        }
                    }
                }
                storageSema.Release();
                Thread.Sleep(60000 * 60 * 24); //Sleep for a day
            }
        }
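
        //Crawl's object parameter suggests the URL scheduler (not shown here)
        //starts crawlers via ParameterizedThreadStart. One iteration of that
        //dispatch loop might look roughly like this (field names assumed):
        //
        //    crawlerSema.WaitOne();
        //    Crawler crawler = crawlerQueue.Dequeue();
        //    crawlerSema.Release();
        //
        //    queueSema.WaitOne();
        //    string nextURL = urlQueue.Dequeue();
        //    queueSema.Release();
        //
        //    new Thread(crawler.Crawl).Start(nextURL);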
        static void Main(string[] args)
        {
            Console.Title = "5412 Crawler";


            /*Parts needed
             * 1. Storage - Stores all the program data
             * 2. URL Scheduler - assigns URLs to web crawlers (and creates crawlers)
             * 3. Index Scheduler - assigns workers to index content (and creates workers)
             * 4. Web crawlers - Get data from page, and return content to Index scheduler
             * 5. Index workers - Process data from the web crawl, and add new links to URL scheduler
             * 6. User Interface - Simple interface that takes a string and searches storage for it
             * */

            //Set-up storage
            database = new DataStorage(DateTime.Now, autoCacheDays);
            if (enableAutoCache)
            {
                new Thread(database.autoCache).Start();
            }

            //Set up URL scheduler
            urlScheduler = new URLScheduler(maxCrawlers, numCrawlsToPerform, startURL);
            new Thread(urlScheduler.schedule).Start();

            //Set up Index Scheduler
            indexScheduler = new IndexScheduler(maxIndexers);
            new Thread(indexScheduler.index).Start();

            //Launch Interface
            //Console.WriteLine("Type 's' to enter search mode or 'q' to quit");
            //char key = Console.ReadKey().KeyChar;
            //while (key != 's' && key != 'q')
            //{
            //    Console.WriteLine();
            //    Console.WriteLine("'s' and 'q' are the only accepted keys");
            //    key = Console.ReadKey().KeyChar;
            //}
            //Console.WriteLine();
            //if (key == 's')
            //{
            //    IntPtr hWnd = FindWindow(null, Console.Title);
            //    if (hWnd != IntPtr.Zero)
            //    {
            //        ShowWindow(hWnd, 0);
            //    }
            //    SearchWindow ui = new SearchWindow();
            //    ui.ShowDialog();
            //    ShowWindow(hWnd, 1);
            //    SetForegroundWindow(hWnd);
            //}

            //if (searchString != "" && searchMade)
            //{
            //    Console.WriteLine("Search string was {0}", searchString);
            //}

            //Debugging
            if (debugMode)
            {
                Console.WriteLine("Type 'q' to quit crawling");
                char key = Console.ReadKey().KeyChar;
                while (key != 'q')
                {
                    Console.WriteLine("");
                    Console.WriteLine("'q' is the only accepted key");
                    key = Console.ReadKey().KeyChar;
                }
                Console.WriteLine("");
                stopCrawling();
                //Console.WriteLine("Stopped Crawling");
                createReport();
                //Console.WriteLine("Created Report");
            }

            //string test = "<link rel=\"stylesheet\" type=\"text/css\" href=\"http://o.aolcdn.com/os/dmoz/editors/css/dmoznew.jpg\">";
            //MatchCollection urls = Regex.Matches(test, "href=\"[a-zA-Z./:&\\d_-]+\"");
            //string url;
            //foreach (Match match in urls)
            //{
            //    url = match.Value.Replace("href=\"", "");
            //    url = url.Substring(0, url.IndexOf("\""));
            //    Console.WriteLine(url);
            //    if (unwantedExtensions.Any(url.Contains)) Console.WriteLine("BAD");
            //    //if (url.EndsWith(".css")) Console.WriteLine("BAD");
            //}

            //string currentData = "http://dmoz.org/|n|i|c|c|";
            //string[] urlAndData = Regex.Split(currentData, @"\|n\|i\|c\|c\|");
            //string currentURL = urlAndData[0];
            //currentData = urlAndData[1];
            //Console.WriteLine("\n\nURL:\n" + currentURL + "\n\n");
            //Console.WriteLine("\n\nData:\n" + currentData + "\n\n");

            //Reached end of code
            Console.WriteLine("End of program. Press any key to exit.");

            //Press any key to exit
            Console.ReadKey();
        }
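
        //stopCrawling and createReport are not included in this listing. Given
        //the flags used above (keepScheduling, keepAutoCaching), stopCrawling is
        //presumably just a matter of clearing them so each loop winds down;
        //a minimal sketch under that assumption:
        static void stopCrawling()
        {
            //Ask the worker loops to exit on their next pass (assumed flags)
            urlScheduler.keepScheduling = false;
            database.keepAutoCaching = false;
        }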