/**
 * Invokes the worker to start working on the tasks - never returns.
 */
public void run()
{
    int requestNum = 0, timeoutCounter = 0;
    bool needToTerminate = false;
    TimeSpan totalProcessTime = TimeSpan.Zero;
    Thread.Sleep(10000);
    while (needToTerminate == false)
    {
        DateTime startTime = DateTime.Now;
        try
        {
            SyncAccessor.getSlot(2, 1);
            Url task = SyncAccessor.getFromQueue<Url>(_tasks, _timer);
            ResourceContent content = _fetchers.fetchResource(task.getUrl());
            if (content.isValid() != true)
            {
                // the fetch failed; count the timeout and skip this task
                timeoutCounter++;
                continue;
            }
            ResourceContent modifiedContent = new ResourceContent(content.getResourceUrl(),
                content.getResourceType(), content.getResourceContent(),
                content.getReturnCode(), task.getRank());
            DateTime startProcess = DateTime.Now;
            _processors.processResource(modifiedContent);
            DateTime endProcess = DateTime.Now;
            totalProcessTime = endProcess - startProcess;
            System.Console.WriteLine(" URL Processed Successfully ... ");
        }
        catch (Exception)
        {
            // count the failure and move on to the next task
            RuntimeStatistics.addToErrors(1);
            continue;
        }
        DateTime endTime = DateTime.Now;
        TimeSpan totalRequestTime = endTime - startTime;
        if (LogDebuggerControl.getInstance().enableTiming)
        {
            // write the request time to the per-thread timing log file
            using (StreamWriter sw = new StreamWriter("_DEBUG_INFO_TIMING@" +
                System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
            {
                sw.WriteLine(" TIMING FOR REQ - " + requestNum++ + " takes about " +
                    totalRequestTime.TotalSeconds + " s, Processed At " +
                    totalProcessTime.TotalSeconds + " s");
            }
        }
    }
}
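For context, the sketch below shows one way a pool of such workers might be launched. The Worker class, its constructor wiring, and the pool size are assumptions for illustration only, since this section shows just the run() method itself.

using System.Threading;

public class WorkerPoolExample
{
    public static void Main(string[] args)
    {
        const int workerCount = 4; // assumed pool size, not from the original code
        Thread[] threads = new Thread[workerCount];
        for (int i = 0; i < workerCount; i++)
        {
            // hypothetical Worker class exposing the run() method above,
            // with _tasks, _fetchers and _processors wired up elsewhere
            Worker worker = new Worker();
            threads[i] = new Thread(worker.run);
            threads[i].IsBackground = true; // run() never returns, so let the process exit anyway
            threads[i].Start();
        }
        foreach (Thread t in threads)
        {
            t.Join(); // effectively blocks forever, since run() loops until shutdown
        }
    }
}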
/**
 * This is a test method that checks that the ResourceContent class works correctly.
 */
public static void Test()
{
    String rsrcUrl = "www.adamsearch.com";
    ResourceType rsrcType = ResourceType.HtmlResource;
    String rsrcContent = "Please click the next button to start crawling !!";
    int rtrnCode = 200;
    ResourceContent htmlResource1 = new ResourceContent(rsrcUrl, rsrcType, rsrcContent, rtrnCode, 0);
    Console.WriteLine("The resource is valid: " + htmlResource1.isValid());
    Console.WriteLine("Get url: " + htmlResource1.getResourceUrl());
    Console.WriteLine("Get resourceType: " + htmlResource1.getResourceType());
    Console.WriteLine("Get resourceContent: " + htmlResource1.getResourceContent());
    Console.WriteLine("to string:\n" + htmlResource1);
}
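For reference, here is one possible shape of the ResourceContent class that this test exercises, reconstructed from the calls made here and in the worker and processor methods. The field layout and the validity rule are assumptions, not the project's actual implementation.

using System;

// A possible shape for ResourceContent, inferred from its call sites; the real
// class may differ. isValid() here treats any HTTP 2xx return code with
// non-empty content as valid, which is an assumption.
public class ResourceContent
{
    private readonly String _url;
    private readonly ResourceType _type;
    private readonly String _content;
    private readonly int _returnCode;
    private readonly int _rank;

    public ResourceContent(String url, ResourceType type, String content, int returnCode, int rank)
    {
        _url = url;
        _type = type;
        _content = content;
        _returnCode = returnCode;
        _rank = rank;
    }

    public bool isValid()
    {
        return _returnCode >= 200 && _returnCode < 300 && !String.IsNullOrEmpty(_content);
    }

    public String getResourceUrl() { return _url; }
    public ResourceType getResourceType() { return _type; }
    public String getResourceContent() { return _content; }
    public int getReturnCode() { return _returnCode; }
    public int getRankOfUrl() { return _rank; }

    public override String ToString()
    {
        return "url: " + _url + ", type: " + _type + ", code: " + _returnCode;
    }
}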
/**
 * Tries to process the given content, assuming that the given content
 * can be processed via this processor.
 */
public void process(ResourceContent resource)
{
    DateTime startTime = DateTime.Now;

    // extract all the links in the page
    List<LinkItem> listOfLinks = extractor.extractLinks(resource.getResourceUrl(),
        resource.getResourceContent());
    RuntimeStatistics.addToExtractedUrls(listOfLinks.Count);
    DateTime extEndTime = DateTime.Now;
    /*** 1. Extracting the links from the request ***/
    TimeSpan extRequest = extEndTime - startTime;

    // reset the dictionary in the filter that contains the urls from the same page
    filter.resetDictionary();
    int filteredUrlsCount = 0;
    foreach (LinkItem item in listOfLinks)
    {
        // filter the links and keep only the links that can be crawled
        List<String> links = new List<String>();
        links.Add(item.getLink());
        List<String> filteredLinks = filter.filterLinks(links);
        if (filteredLinks.Count > 0)
        {
            filteredUrlsCount++;
            Url url = new Url(filteredLinks[0], hashUrl(filteredLinks[0]),
                ranker.rankUrl(resource, item),
                item.getDomainUrl(), hashUrl(item.getDomainUrl()));
            deployLinksToFrontier(url);
            RuntimeStatistics.addToFeedUrls(1);
        }
    }
    DateTime catStartTime = DateTime.Now;
    /*** 2. Ranking and deployment to the frontier ***/
    TimeSpan rankTotalRequest = catStartTime - extEndTime;

    // assign the url to all the categories it belongs to
    List<Result> classifiedResults = categorizer.classifyContent(resource.getResourceContent(),
        resource.getResourceUrl());
    if (classifiedResults.Count != 0)
    {
        RuntimeStatistics.addToCrawledUrls(1);
    }
    DateTime catEndTime = DateTime.Now;
    /*** 3. Classification of the current request ***/
    TimeSpan catTotalRequest = catEndTime - catStartTime;

    // save all the results to storage
    foreach (Result classifiedResult in classifiedResults)
    {
        Result result = new Result("0", classifiedResult.getUrl(), classifiedResult.getCategoryID(),
            resource.getRankOfUrl(), classifiedResult.getTrustMeter());
        deployResourceToStorage(result);
    }
    DateTime endTime = DateTime.Now;
    /*** 4. Deployment to the database (result) ***/
    TimeSpan deployRequest = endTime - catEndTime;
    /*** $. Total processing time ***/
    TimeSpan totalRequest = endTime - startTime;

    // write the request times to the per-thread timing log file
    if (LogDebuggerControl.getInstance().enableTiming)
    {
        using (StreamWriter sw = new StreamWriter("_DEBUG_INFO_PROCESSOR_TIMING@" +
            System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
        {
            sw.WriteLine(" TIMING FOR REQ - [] ");
            sw.WriteLine(" - Extractor Time   " + extRequest.TotalSeconds + " seconds ");
            // was logging extRequest twice; the ranker line now reports rankTotalRequest
            sw.WriteLine(" - Ranker Time      " + rankTotalRequest.TotalSeconds + " seconds ");
            sw.WriteLine(" - Categorizer Time " + catTotalRequest.TotalSeconds + " seconds ");
            sw.WriteLine(" - Deploy Time      " + deployRequest.TotalSeconds + " seconds ");
            sw.WriteLine(" - Total Timing     " + totalRequest.TotalSeconds + " seconds ");
        }
    }
}
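The process method above relies on a hashUrl helper that is not shown in this section. A minimal sketch of such a helper follows, assuming the hash serves as a fixed-width key for duplicate detection; the MD5 choice and hex encoding are illustrative assumptions, not necessarily what the project uses.

using System;
using System.Security.Cryptography;
using System.Text;

// Hypothetical hashUrl helper: maps a URL to a fixed-width hex digest so that
// the frontier can compare and deduplicate urls cheaply. Illustrative only.
private static String hashUrl(String url)
{
    using (MD5 md5 = MD5.Create())
    {
        byte[] digest = md5.ComputeHash(Encoding.UTF8.GetBytes(url));
        StringBuilder sb = new StringBuilder(digest.Length * 2);
        foreach (byte b in digest)
        {
            sb.Append(b.ToString("x2")); // lowercase hex, two chars per byte
        }
        return sb.ToString();
    }
}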