/**
 * Invokes the worker to start working on the tasks - never returns.
 * Pulls Url tasks from the shared queue, fetches each resource, hands valid
 * content to the processors, and (optionally) logs per-request timings.
 */
public void run()
{
    int requestNum = 0, timeoutCounter = 0;
    bool needToTerminate = false;
    // Initialized explicitly so the timing log can never observe an unset value.
    TimeSpan totalProcessTime = TimeSpan.Zero;
    // Startup grace period before consuming tasks - TODO confirm why 10s was chosen.
    Thread.Sleep(10000);
    while (needToTerminate == false)
    {
        DateTime startTime = DateTime.Now;
        try
        {
            SyncAccessor.getSlot(2, 1);
            Url task = SyncAccessor.getFromQueue<Url>(_tasks, _timer);
            ResourceContent content = _fetchers.fetchResource(task.getUrl());
            if (content.isValid() != true)
            {
                // Fetch failed - count it and move on to the next task.
                timeoutCounter++;
                continue;
            }
            // Re-wrap the fetched content so it carries the task's rank.
            ResourceContent modifiedContent = new ResourceContent(content.getResourceUrl(), content.getResourceType(),
                content.getResourceContent(), content.getReturnCode(), task.getRank());
            DateTime startProcess = DateTime.Now;
            _processors.processResource(modifiedContent);
            DateTime endProcess = DateTime.Now;
            totalProcessTime = endProcess - startProcess;
            System.Console.WriteLine(" URL Processed Successfully ... ");
        }
        catch (Exception)
        {
            // Any failure is counted and the worker moves on to the next task.
            RuntimeStatistics.addToErrors(1);
            continue;
        }
        DateTime endTime = DateTime.Now;
        TimeSpan totalRequestTime = endTime - startTime;
        if (LogDebuggerControl.getInstance().enableTiming)
        {
            // Write request time to the per-thread timing log file.
            // using-statement guarantees the writer is closed even if a write throws.
            using (StreamWriter sw = new StreamWriter("_DEBUG_INFO_TIMING@" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
            {
                sw.WriteLine(" TIMING FOR REQ - " + requestNum++ + " takes about " + totalRequestTime.TotalSeconds +
                             " s, Processed At " + totalProcessTime.TotalSeconds + " s");
            }
        }
    }
}
/**
 * This is a test method that checks if the class resourceContent works fine.
 * Builds a sample HTML resource and prints each accessor's result.
 */
public static void Test()
{
    String url = "www.adamsearch.com";
    String body = "Please click the next buttom to start crawling !!";
    ResourceContent htmlResource1 =
        new ResourceContent(url, ResourceType.HtmlResource, body, 200, 0);

    Console.WriteLine("The resource is : " + htmlResource1.isValid());
    Console.WriteLine("Get url: " + htmlResource1.getResourceUrl());
    Console.WriteLine("Get resourceType: " + htmlResource1.getResourceType());
    Console.WriteLine("Get resourceContent: " + htmlResource1.getResourceContent());
    Console.WriteLine("to string:\n" + htmlResource1);
}
/**
 * This is a test method that checks if the class resourceContent works fine.
 * Constructs one HtmlResource sample and exercises every public accessor.
 */
public static void Test()
{
    ResourceContent htmlResource1 = new ResourceContent(
        "www.adamsearch.com",
        ResourceType.HtmlResource,
        "Please click the next buttom to start crawling !!",
        200,
        0);

    Console.WriteLine("The resource is : " + htmlResource1.isValid());
    Console.WriteLine("Get url: " + htmlResource1.getResourceUrl());
    Console.WriteLine("Get resourceType: " + htmlResource1.getResourceType());
    Console.WriteLine("Get resourceContent: " + htmlResource1.getResourceContent());
    Console.WriteLine("to string:\n" + htmlResource1);
}
/**
 * This method calculates the rank of a given url and returns it.
 * The final rank blends an "inherited" score (parent url rank + whole-page
 * content rank) with a "neighborhood" score (anchor text + nearby text),
 * weighted by RankerOptions.GAMMA.
 */
public int rankUrl(ResourceContent parentResource, LinkItem item)
{
    // Component ranks for the anchor match, nearby-text match, their blend
    // (neighborhood) and the parent-derived blend (inherited).
    int rankParentUrl = parentResource.getRankOfUrl();
    int anchorRank = 0;
    int nearbyTextRank = 0;
    int neighborhood = 0;
    int context = 0;
    int inherited = 0;
    char[] separators = { ' ', '\t', '\n' };
    NumOfLinks++;
    sumOfTotalNearbyWords += item.getText().Split(separators).Length;
    sumOfTotalAnchorWords += item.getAnchor().Split(separators, StringSplitOptions.RemoveEmptyEntries).Length;
    if (LogDebuggerControl.getInstance().debugRanker)
    {
        // using-statement guarantees the debug log is closed even if a write throws.
        using (StreamWriter sw = new StreamWriter("DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
        {
            sw.WriteLine(" *********HEAD REQUEST *********************************************");
            sw.WriteLine(" ***** DATA FOR RANKER******************************************** ");
            sw.WriteLine(" URL : " + item.getLink());
            sw.WriteLine(" PARENT URL : " + item.getParentUrl());
        }
    }
    // Rank of the whole page - recomputed only when the parent content differs
    // from the previous call (wholePageRank / lastResourceContent act as a cache).
    if (!((lastResourceContent != null) && (lastResourceContent.Equals(parentResource.getResourceContent()))))
    {
        lastResourceContent = parentResource.getResourceContent();
        wholePageRank = getRankOfWholeContent(parentResource);
    }
    // Rank of the nearby text.
    nearbyTextRank = getRankOfNearbyText(item);
    // Rank of the anchor url.
    anchorRank = getRankOfAnchor(item);
    // Neighborhood blends anchor and nearby-text ranks; a sufficiently confident
    // anchor match replaces the nearby-text contribution with the maximum (100).
    if (anchorRank > RankerOptions.ConfidenceLevelOfAnchor)
    {
        context = 100;
    }
    else
    {
        context = nearbyTextRank;
    }
    neighborhood = (int)(RankerOptions.BETTA * anchorRank + (1 - RankerOptions.BETTA) * context);
    // Inherited blends the rank of the parent url and the parent content.
    inherited = (int)(RankerOptions.ALPHA * rankParentUrl + (1 - RankerOptions.ALPHA) * wholePageRank);
    // Final rank computed once and reused for both the debug log and the return
    // value (the original evaluated the expression twice).
    int urlRank = (int)(RankerOptions.GAMMA * inherited + (1 - RankerOptions.GAMMA) * neighborhood);
    if (LogDebuggerControl.getInstance().debugRanker)
    {
        using (StreamWriter sw = new StreamWriter("DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
        {
            sw.WriteLine("************************DATA CONCLUSION*************************");
            sw.WriteLine(" .PARENT RANK: ");
            sw.WriteLine(rankParentUrl);
            sw.WriteLine(" .RANK OF NEARBY TEXT: ");
            sw.WriteLine(nearbyTextRank);
            sw.WriteLine(" .AVG OF NEARBY WORDS");
            sw.WriteLine((int)(sumOfTotalNearbyWords / NumOfLinks));
            sw.WriteLine(" .RANK OF ANCHOR: ");
            sw.WriteLine(anchorRank);
            sw.WriteLine(" .AVG OF ANCHOR TEXT");
            sw.WriteLine((int)(sumOfTotalAnchorWords / NumOfLinks));
            sw.WriteLine(" .NEIGHBORHOOD: ");
            sw.WriteLine(neighborhood);
            sw.WriteLine(" .RANK OF WHOLE CONTENT: ");
            sw.WriteLine(wholePageRank);
            sw.WriteLine(" .INHERITED: ");
            sw.WriteLine(inherited);
            sw.WriteLine(" .RANK OF THE URL: ");
            sw.WriteLine(urlRank);
        }
    }
    return urlRank;
}
/**
 * This method returns a rank for the whole page content.
 * Classifies the resource content against all categories and blends the max
 * and average match levels by RankerOptions.MinAndMaxRATIO.
 */
private int getRankOfWholeContent(ResourceContent resource)
{
    int maxMatchLevelForContent = 0;
    int avgMatchLevelForContent = 0;
    if (LogDebuggerControl.getInstance().debugRanker)
    {
        // using-statement guarantees the debug log is closed even if a write throws.
        using (StreamWriter sw = new StreamWriter("DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
        {
            sw.WriteLine(" ***** REQUEST FOR WHOLE CONTENT RANK******************************** ");
        }
    }
    // Lazily load the categorizer options the first time they are needed.
    if (wholeContentOptions == null)
    {
        wholeContentOptions = getOptions("wholeContent");
    }
    // Match levels of the whole resource content against every category.
    // (Dropped the original's Substring(0), which only produced a needless
    // full copy of the page content.)
    List<int> matchLevelsForContent =
        categorizer.classifyContentToAllCategories(resource.getResourceContent(), wholeContentOptions);
    maxMatchLevelForContent = calculateMax(matchLevelsForContent);
    avgMatchLevelForContent = calculateAvg(matchLevelsForContent);
    if (LogDebuggerControl.getInstance().debugRanker)
    {
        using (StreamWriter sw = new StreamWriter("DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
        {
            sw.WriteLine(" .MAX MATCH LEVEL OF WHOLE CONTENT: ");
            sw.WriteLine(maxMatchLevelForContent);
            sw.WriteLine(" .AVG MATCH LEVEL OF WHOLE CONTENT: ");
            sw.WriteLine(avgMatchLevelForContent);
        }
    }
    // Blend the max and average match levels according to the configured ratio.
    return (int)(RankerOptions.MinAndMaxRATIO * maxMatchLevelForContent
                 + (1 - RankerOptions.MinAndMaxRATIO) * avgMatchLevelForContent);
}
/**
 * This method calculates the rank of a given url and returns it.
 * The rank is GAMMA-weighted blend of an "inherited" score (parent url rank +
 * whole-page rank) and a "neighborhood" score (anchor + nearby text).
 */
public int rankUrl(ResourceContent parentResource, LinkItem item)
{
    // Component ranks for anchor, nearby text, neighborhood and inherited.
    int rankParentUrl = parentResource.getRankOfUrl();
    int anchorRank = 0;
    int nearbyTextRank = 0;
    int neighborhood = 0;
    int context = 0;
    int inherited = 0;
    char[] separators = { ' ', '\t', '\n' };
    NumOfLinks++;
    sumOfTotalNearbyWords += item.getText().Split(separators).Length;
    sumOfTotalAnchorWords += item.getAnchor().Split(separators, StringSplitOptions.RemoveEmptyEntries).Length;
    if (LogDebuggerControl.getInstance().debugRanker)
    {
        // using-statement guarantees the debug log is closed even if a write throws.
        using (StreamWriter sw = new StreamWriter("DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
        {
            sw.WriteLine(" *********HEAD REQUEST *********************************************");
            sw.WriteLine(" ***** DATA FOR RANKER******************************************** ");
            sw.WriteLine(" URL : " + item.getLink());
            sw.WriteLine(" PARENT URL : " + item.getParentUrl());
        }
    }
    // Rank of the whole page - recomputed only when the parent content changed
    // since the previous call (wholePageRank / lastResourceContent are cached fields).
    if (!((lastResourceContent != null) && (lastResourceContent.Equals(parentResource.getResourceContent()))))
    {
        lastResourceContent = parentResource.getResourceContent();
        wholePageRank = getRankOfWholeContent(parentResource);
    }
    // Rank of the nearby text.
    nearbyTextRank = getRankOfNearbyText(item);
    // Rank of the anchor url.
    anchorRank = getRankOfAnchor(item);
    // Neighborhood blends the anchor and nearby-text ranks; a highly confident
    // anchor match overrides the nearby text with the maximum context (100).
    if (anchorRank > RankerOptions.ConfidenceLevelOfAnchor)
    {
        context = 100;
    }
    else
    {
        context = nearbyTextRank;
    }
    neighborhood = (int)(RankerOptions.BETTA * anchorRank + (1 - RankerOptions.BETTA) * context);
    // Inherited blends the rank of the parent url and the parent content.
    inherited = (int)(RankerOptions.ALPHA * rankParentUrl + (1 - RankerOptions.ALPHA) * wholePageRank);
    // Compute the final rank once; the original evaluated this expression twice.
    int urlRank = (int)(RankerOptions.GAMMA * inherited + (1 - RankerOptions.GAMMA) * neighborhood);
    if (LogDebuggerControl.getInstance().debugRanker)
    {
        using (StreamWriter sw = new StreamWriter("DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
        {
            sw.WriteLine("************************DATA CONCLUSION*************************");
            sw.WriteLine(" .PARENT RANK: ");
            sw.WriteLine(rankParentUrl);
            sw.WriteLine(" .RANK OF NEARBY TEXT: ");
            sw.WriteLine(nearbyTextRank);
            sw.WriteLine(" .AVG OF NEARBY WORDS");
            sw.WriteLine((int)(sumOfTotalNearbyWords / NumOfLinks));
            sw.WriteLine(" .RANK OF ANCHOR: ");
            sw.WriteLine(anchorRank);
            sw.WriteLine(" .AVG OF ANCHOR TEXT");
            sw.WriteLine((int)(sumOfTotalAnchorWords / NumOfLinks));
            sw.WriteLine(" .NEIGHBORHOOD: ");
            sw.WriteLine(neighborhood);
            sw.WriteLine(" .RANK OF WHOLE CONTENT: ");
            sw.WriteLine(wholePageRank);
            sw.WriteLine(" .INHERITED: ");
            sw.WriteLine(inherited);
            sw.WriteLine(" .RANK OF THE URL: ");
            sw.WriteLine(urlRank);
        }
    }
    return urlRank;
}
/**
 * This method returns a rank for the whole page content.
 * The rank is a MinAndMaxRATIO-weighted blend of the maximum and average
 * category match levels of the resource's content.
 */
private int getRankOfWholeContent(ResourceContent resource)
{
    int maxMatchLevelForContent = 0;
    int avgMatchLevelForContent = 0;
    if (LogDebuggerControl.getInstance().debugRanker)
    {
        // using-statement guarantees the debug log is closed even if a write throws.
        using (StreamWriter sw = new StreamWriter("DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
        {
            sw.WriteLine(" ***** REQUEST FOR WHOLE CONTENT RANK******************************** ");
        }
    }
    // Lazily load the categorizer options the first time they are needed.
    if (wholeContentOptions == null)
    {
        wholeContentOptions = getOptions("wholeContent");
    }
    // Match levels of the whole resource content against every category.
    // (Removed the original's Substring(0) - it only made a pointless copy.)
    List<int> matchLevelsForContent =
        categorizer.classifyContentToAllCategories(resource.getResourceContent(), wholeContentOptions);
    maxMatchLevelForContent = calculateMax(matchLevelsForContent);
    avgMatchLevelForContent = calculateAvg(matchLevelsForContent);
    if (LogDebuggerControl.getInstance().debugRanker)
    {
        using (StreamWriter sw = new StreamWriter("DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
        {
            sw.WriteLine(" .MAX MATCH LEVEL OF WHOLE CONTENT: ");
            sw.WriteLine(maxMatchLevelForContent);
            sw.WriteLine(" .AVG MATCH LEVEL OF WHOLE CONTENT: ");
            sw.WriteLine(avgMatchLevelForContent);
        }
    }
    // Blend the max and average match levels according to the configured ratio.
    return (int)(RankerOptions.MinAndMaxRATIO * maxMatchLevelForContent
                 + (1 - RankerOptions.MinAndMaxRATIO) * avgMatchLevelForContent);
}
/**
 * This method tries to process the given content assuming that the given content
 * can be processed via this processor.
 * Pipeline: extract links -> filter & rank -> feed the frontier -> classify the
 * content -> store the classification results, with optional phase timing.
 */
public void process(ResourceContent resource)
{
    DateTime startTime = DateTime.Now;
    // Extract all the links in the page.
    List<LinkItem> listOfLinks = extractor.extractLinks(resource.getResourceUrl(), resource.getResourceContent());
    RuntimeStatistics.addToExtractedUrls(listOfLinks.Count);
    DateTime extEndTime = DateTime.Now;
    /*** 1. Extracting the link from the request ***/
    TimeSpan extRequest = extEndTime - startTime;
    // Reset the dictionary in filter that contains the urls from the same page.
    filter.resetDictionary();
    int filteredUrlsCount = 0;
    foreach (LinkItem item in listOfLinks)
    {
        // Filter the links and keep only links that can be crawled.
        List<String> links = new List<String>();
        links.Add(item.getLink());
        List<String> filteredLinks = filter.filterLinks(links);
        if (filteredLinks.Count > 0)
        {
            filteredUrlsCount++;
            Url url = new Url(filteredLinks[0], hashUrl(filteredLinks[0]), ranker.rankUrl(resource, item),
                              item.getDomainUrl(), hashUrl(item.getDomainUrl()));
            deployLinksToFrontier(url);
            RuntimeStatistics.addToFeedUrls(1);
        }
    }
    DateTime catStartTime = DateTime.Now;
    /*** 2. Ranking and deployment to the frontier ***/
    TimeSpan rankTotalRequest = catStartTime - extEndTime;
    // Ascribe the url to all the categories it is belonged to.
    List<Result> classifiedResults = categorizer.classifyContent(resource.getResourceContent(), resource.getResourceUrl());
    if (classifiedResults.Count != 0)
    {
        RuntimeStatistics.addToCrawledUrls(1);
    }
    DateTime catEndTime = DateTime.Now;
    /*** 3. Classification of the current request ***/
    TimeSpan catTotalRequest = catEndTime - catStartTime;
    // Save all the results to Storage.
    foreach (Result classifiedResult in classifiedResults)
    {
        Result result = new Result("0", classifiedResult.getUrl(), classifiedResult.getCategoryID(),
                                   resource.getRankOfUrl(), classifiedResult.getTrustMeter());
        deployResourceToStorage(result);
    }
    DateTime endTime = DateTime.Now;
    /*** 4. deployment to the database (result) ***/
    TimeSpan deployRequest = endTime - catEndTime;
    /*** $. Total processing time ***/
    TimeSpan totalRequest = endTime - startTime;
    // Write request time to timing log file.
    if (LogDebuggerControl.getInstance().enableTiming)
    {
        // using-statement guarantees the log is closed even if a write throws.
        using (StreamWriter sw = new StreamWriter("_DEBUG_INFO_PROCESSOR_TIMING@" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
        {
            sw.WriteLine(" TIMING FOR REQ - [] ");
            sw.WriteLine(" - Extractor Time " + extRequest.TotalSeconds + " seconds ");
            // BUG FIX: originally printed extRequest here; the ranking phase is rankTotalRequest.
            sw.WriteLine(" - Ranker Time " + rankTotalRequest.TotalSeconds + " seconds ");
            sw.WriteLine(" - Categori. Time " + catTotalRequest.TotalSeconds + " seconds ");
            sw.WriteLine(" - Deploy Time " + deployRequest.TotalSeconds + " seconds ");
            sw.WriteLine(" - Total Timing " + totalRequest.TotalSeconds + " seconds ");
        }
    }
}
/**
 * This method tries to process the given content assuming that the given content
 * can be processed via this processor.
 * Phases: link extraction, filtering + ranking + frontier deployment, content
 * classification, and storage of the results; each phase is timed.
 */
public void process(ResourceContent resource)
{
    DateTime startTime = DateTime.Now;
    // Extract all the links in the page.
    List<LinkItem> listOfLinks = extractor.extractLinks(resource.getResourceUrl(), resource.getResourceContent());
    RuntimeStatistics.addToExtractedUrls(listOfLinks.Count);
    DateTime extEndTime = DateTime.Now;
    /*** 1. Extracting the link from the request ***/
    TimeSpan extRequest = extEndTime - startTime;
    // Reset the dictionary in filter that contains the urls from the same page.
    filter.resetDictionary();
    int filteredUrlsCount = 0;
    foreach (LinkItem item in listOfLinks)
    {
        // Filter the links and keep only links that can be crawled.
        List<String> links = new List<String>();
        links.Add(item.getLink());
        List<String> filteredLinks = filter.filterLinks(links);
        if (filteredLinks.Count > 0)
        {
            filteredUrlsCount++;
            Url url = new Url(filteredLinks[0], hashUrl(filteredLinks[0]), ranker.rankUrl(resource, item),
                              item.getDomainUrl(), hashUrl(item.getDomainUrl()));
            deployLinksToFrontier(url);
            RuntimeStatistics.addToFeedUrls(1);
        }
    }
    DateTime catStartTime = DateTime.Now;
    /*** 2. Ranking and deployment to the frontier ***/
    TimeSpan rankTotalRequest = catStartTime - extEndTime;
    // Ascribe the url to all the categories it is belonged to.
    List<Result> classifiedResults = categorizer.classifyContent(resource.getResourceContent(), resource.getResourceUrl());
    if (classifiedResults.Count != 0)
    {
        RuntimeStatistics.addToCrawledUrls(1);
    }
    DateTime catEndTime = DateTime.Now;
    /*** 3. Classification of the current request ***/
    TimeSpan catTotalRequest = catEndTime - catStartTime;
    // Save all the results to Storage.
    foreach (Result classifiedResult in classifiedResults)
    {
        Result result = new Result("0", classifiedResult.getUrl(), classifiedResult.getCategoryID(),
                                   resource.getRankOfUrl(), classifiedResult.getTrustMeter());
        deployResourceToStorage(result);
    }
    DateTime endTime = DateTime.Now;
    /*** 4. deployment to the database (result) ***/
    TimeSpan deployRequest = endTime - catEndTime;
    /*** $. Total processing time ***/
    TimeSpan totalRequest = endTime - startTime;
    // Write request time to timing log file.
    if (LogDebuggerControl.getInstance().enableTiming)
    {
        // using-statement guarantees the log is closed even if a write throws.
        using (StreamWriter sw = new StreamWriter("_DEBUG_INFO_PROCESSOR_TIMING@" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
        {
            sw.WriteLine(" TIMING FOR REQ - [] ");
            sw.WriteLine(" - Extractor Time " + extRequest.TotalSeconds + " seconds ");
            // BUG FIX: originally printed extRequest here; the ranking phase is rankTotalRequest.
            sw.WriteLine(" - Ranker Time " + rankTotalRequest.TotalSeconds + " seconds ");
            sw.WriteLine(" - Categori. Time " + catTotalRequest.TotalSeconds + " seconds ");
            sw.WriteLine(" - Deploy Time " + deployRequest.TotalSeconds + " seconds ");
            sw.WriteLine(" - Total Timing " + totalRequest.TotalSeconds + " seconds ");
        }
    }
}