public void Crawl(object URL)
{
    //Get URL from scheduler (passed as object by ParameterizedThreadStart)
    string url = (string)URL;
    Console.WriteLine("Crawler " + crawlerID + " got the following URL: {0}", url);

    //Check if URL is valid
    if (isURLValid(url))
    {
        //Access URL text
        string data = urlText(url);

        //Send (url, data, date) tuple to the Index Scheduler
        IndexScheduler indexScheduler = Program.indexScheduler;
        indexScheduler.dataSema.WaitOne();
        indexScheduler.dataQueue.Enqueue(new Tuple<string, string, DateTime>(url, data, DateTime.Now));
        indexScheduler.dataSema.Release();
        Console.WriteLine("Sent data to Index Scheduler");

        //Put self back into the URL scheduler's pool of idle crawlers
        URLScheduler urlScheduler = Program.urlScheduler;
        urlScheduler.crawlerSema.WaitOne();
        urlScheduler.crawlerQueue.Enqueue(this);
        urlScheduler.crawlerSema.Release();
    }
    else
    {
        Program.database.addInvalidURL();
    }
}
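//The two helpers below are illustrative sketches only, not this project's actual
//isURLValid/urlText implementations (hence the "Sketch" suffix). They assume a plain
//HTTP(S) crawl over .NET Framework's System.Net.WebClient.
private bool isURLValidSketch(string url)
{
    //Accept only well-formed absolute HTTP/HTTPS URIs
    Uri result;
    return Uri.TryCreate(url, UriKind.Absolute, out result)
        && (result.Scheme == Uri.UriSchemeHttp || result.Scheme == Uri.UriSchemeHttps);
}

private string urlTextSketch(string url)
{
    //Download the raw page HTML; an empty string signals a failed fetch
    try
    {
        using (var client = new System.Net.WebClient())
        {
            return client.DownloadString(url);
        }
    }
    catch (System.Net.WebException)
    {
        return "";
    }
}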
public void Index(object urlDataDateTuple)
{
    //Get (url, data, date) tuple from the Index Scheduler
    Tuple<string, string, DateTime> urlDataDate = (Tuple<string, string, DateTime>)urlDataDateTuple;
    Console.WriteLine("Indexer " + indexerID + " got the following URL: {0}", urlDataDate.Item1);

    //Process data: extract word frequencies and outgoing links
    Tuple<Dictionary<string, int>, List<string>> frequenciesAndURLs = getFrequenciesAndURLs(urlDataDate.Item1, urlDataDate.Item2);

    //Send info to the database
    URLData newData = new URLData(urlDataDate.Item1, urlDataDate.Item3, frequenciesAndURLs.Item1);
    Program.database.addURLData(newData);

    //Send new URLs to the URL Scheduler if they haven't been crawled
    URLScheduler scheduler = Program.urlScheduler;
    scheduler.queueSema.WaitOne();
    foreach (var newURL in frequenciesAndURLs.Item2)
    {
        scheduler.urlQueue.Enqueue(newURL);
    }
    scheduler.queueSema.Release();

    //Put self back into the Index Scheduler's pool of idle indexers
    IndexScheduler indexScheduler = Program.indexScheduler;
    indexScheduler.indexerSema.WaitOne();
    indexScheduler.indexerQueue.Enqueue(this);
    indexScheduler.indexerSema.Release();

    //Debugging
    if (Program.debugMode)
    {
        Console.WriteLine(frequenciesAndURLs.Item2.Count + " URLs added to scheduler");
    }
}
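//Illustrative sketch only: the real getFrequenciesAndURLs lives elsewhere in this class,
//so this stand-in carries a "Sketch" suffix. It assumes the page data is raw HTML and that
//the file imports System.Text.RegularExpressions (as the Regex code in Main suggests).
private Tuple<Dictionary<string, int>, List<string>> getFrequenciesAndURLsSketch(string url, string data)
{
    //url is kept to match the real method's signature; a fuller version
    //would resolve relative links against it

    //Count case-insensitive word frequencies in the page text
    var frequencies = new Dictionary<string, int>();
    foreach (Match word in Regex.Matches(data, "[a-zA-Z]+"))
    {
        string key = word.Value.ToLower();
        int count;
        frequencies.TryGetValue(key, out count);
        frequencies[key] = count + 1;
    }

    //Collect absolute href targets as candidate URLs for the scheduler
    var newURLs = new List<string>();
    foreach (Match link in Regex.Matches(data, "href=\"(http[^\"]+)\""))
    {
        newURLs.Add(link.Groups[1].Value);
    }

    return new Tuple<Dictionary<string, int>, List<string>>(frequencies, newURLs);
}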
static void Main(string[] args)
{
    Console.Title = "5412 Crawler";

    /* Parts needed
     * 1. Storage - stores all the program data
     * 2. URL Scheduler - assigns URLs to web crawlers (and creates crawlers)
     * 3. Index Scheduler - assigns workers to index content (and creates workers)
     * 4. Web crawlers - get data from pages and return content to the Index Scheduler
     * 5. Index workers - process data from the web crawl and add new links to the URL Scheduler
     * 6. User Interface - simple interface that takes a string and searches storage for it
     */

    //Set up storage
    database = new DataStorage(DateTime.Now, autoCacheDays);
    if (enableAutoCache)
    {
        new Thread(database.autoCache).Start();
    }

    //Set up URL Scheduler
    urlScheduler = new URLScheduler(maxCrawlers, numCrawlsToPerform, startURL);
    new Thread(urlScheduler.schedule).Start();

    //Set up Index Scheduler
    indexScheduler = new IndexScheduler(maxIndexers);
    new Thread(indexScheduler.index).Start();

    //Launch Interface (disabled; hides the console and opens the search window)
    //Console.WriteLine("Type 's' to enter search mode or 'q' to quit");
    //char key = Console.ReadKey().KeyChar;
    //while (key != 's' && key != 'q')
    //{
    //    Console.WriteLine();
    //    Console.WriteLine("'s' and 'q' are the only accepted keys");
    //    key = Console.ReadKey().KeyChar;
    //}
    //Console.WriteLine();
    //if (key == 's')
    //{
    //    IntPtr hWnd = FindWindow(null, Console.Title);
    //    if (hWnd != IntPtr.Zero)
    //    {
    //        ShowWindow(hWnd, 0);
    //    }
    //    SearchWindow ui = new SearchWindow();
    //    ui.ShowDialog();
    //    ShowWindow(hWnd, 1);
    //    SetForegroundWindow(hWnd);
    //}
    //if (searchString != "" && searchMade)
    //{
    //    Console.WriteLine("Search string was {0}", searchString);
    //}

    //Debugging
    if (debugMode)
    {
        Console.WriteLine("Type 'q' to quit crawling");
        char key = Console.ReadKey().KeyChar;
        while (key != 'q')
        {
            Console.WriteLine();
            Console.WriteLine("'q' is the only accepted key");
            key = Console.ReadKey().KeyChar;
        }
        Console.WriteLine();
        stopCrawling();
        createReport();
    }

    //Reached end of code
    Console.WriteLine("End of Program. Press any key to exit.");
    Console.ReadKey();
}
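//Illustrative sketch only: one plausible shape for URLScheduler.schedule, the loop Main
//starts on its own thread. The fields (urlQueue, queueSema, crawlerQueue, crawlerSema)
//mirror their usage in Crawl/Index above; the Crawler type name and the stopRequested
//shutdown flag are assumed names, not taken from this project.
public void schedule()
{
    while (!stopRequested)
    {
        //Take an idle crawler, if one is available
        crawlerSema.WaitOne();
        Crawler crawler = crawlerQueue.Count > 0 ? crawlerQueue.Dequeue() : null;
        crawlerSema.Release();

        if (crawler == null)
        {
            Thread.Sleep(10); //No idle crawlers; avoid busy-waiting
            continue;
        }

        //Take a pending URL; if none, return the crawler to the pool and wait
        queueSema.WaitOne();
        string url = urlQueue.Count > 0 ? urlQueue.Dequeue() : null;
        queueSema.Release();

        if (url == null)
        {
            crawlerSema.WaitOne();
            crawlerQueue.Enqueue(crawler);
            crawlerSema.Release();
            Thread.Sleep(10);
            continue;
        }

        //Hand the URL to the crawler on its own worker thread
        //(Crawl takes an object parameter, matching ParameterizedThreadStart)
        new Thread(crawler.Crawl).Start(url);
    }
}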