/**
 * This method receives a word list which contains the content of the
 * resource (HTML page) and classifies it into the suitable categories.
 */
public List<Result> classifyContent(String resource, String url)
{
    if (options == null)
    {
        options = getCategorizerOptions();
    }
    List<Result> results = new List<Result>();
    foreach (Category category in categoryList)
    {
        if (LogDebuggerControl.getInstance().debugCategorization)
        {
            StreamWriter sw = new StreamWriter("_DEBUG_INFO_CATEGORIZER@" +
                System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true);
            sw.WriteLine(" ***** HEAD REQUEST ************************************************* ");
            sw.WriteLine(" URL : " + url);
            sw.Close();
        }
        int matchLevel = category.getMatchLevel(resource, options);
        if (matchLevel > category.getConfidenceLevel())
        {
            results.Add(new Result("0", url, category.getCategoryID(), 0, matchLevel));
        }
    }
    //results.Add(new Result("0", url, "0", 0, 100));
    return results;
}
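/*
 * A minimal usage sketch (not part of the original source): it assumes the
 * classifier type is called Categorizer and that a fetched page body is
 * available as "pageHtml"; only classifyContent and the Result getters used
 * elsewhere in this listing (getCategoryID, getTrustMeter) are relied on.
 */
public void reportCategories(Categorizer categorizer, String pageHtml, String url)
{
    // classify the page content and print every category whose match level
    // exceeded its confidence threshold
    List<Result> results = categorizer.classifyContent(pageHtml, url);
    foreach (Result result in results)
    {
        System.Console.WriteLine(url + " -> category " + result.getCategoryID() +
                                 " (match " + result.getTrustMeter() + ")");
    }
}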
/**
 * Invokes the worker to start working on the tasks - this method never returns.
 */
public void run()
{
    int requestNum = 0, timeoutCounter = 0;
    bool needToTerminate = false;
    TimeSpan totalProcessTime;
    Thread.Sleep(10000);
    while (needToTerminate == false)
    {
        DateTime startTime = DateTime.Now;
        try
        {
            //System.Console.WriteLine("-<>--------------------------------------------------------------------------");
            SyncAccessor.getSlot(2, 1);
            Url task = SyncAccessor.getFromQueue<Url>(_tasks, _timer);
            //System.Console.WriteLine(" Start Working on : " + task.getUrl() + " ...");
            ResourceContent content = _fetchers.fetchResource(task.getUrl());
            if (content.isValid() != true)
            {
                timeoutCounter++;
                //System.Console.WriteLine(" Fetch Failed Ignoring ... ");
                continue;
            }
            //System.Console.WriteLine(" Fetched Successfully ... ");
            ResourceContent modifiedContent = new ResourceContent(content.getResourceUrl(), content.getResourceType(),
                                                                  content.getResourceContent(), content.getReturnCode(), task.getRank());
            DateTime startProcess = DateTime.Now;
            _processors.processResource(modifiedContent);
            DateTime endProcess = DateTime.Now;
            totalProcessTime = endProcess - startProcess;
            System.Console.WriteLine(" URL Processed Successfully ... ");
        }
        catch (Exception e)
        {
            //System.Console.WriteLine("[Exception Happened] " + e);
            RuntimeStatistics.addToErrors(1);
            continue;
        }
        DateTime endTime = DateTime.Now;
        TimeSpan totalRequestTime = endTime - startTime;
        if (LogDebuggerControl.getInstance().enableTiming)
        {
            // write request time to the timing log file
            StreamWriter sw = new StreamWriter("_DEBUG_INFO_TIMING@" +
                System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true);
            sw.WriteLine(" TIMING FOR REQ - " + requestNum++ + " takes about " + totalRequestTime.TotalSeconds +
                " s, Processed At " + totalProcessTime.TotalSeconds + " s");
            sw.Close();
        }
    }
}
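/*
 * A minimal launch sketch (an assumption, not from the original source): since
 * run() never returns, a worker like the one above would typically be placed on
 * its own background thread. The "Worker" type name and its constructor
 * arguments are hypothetical placeholders.
 */
Worker worker = new Worker(/* tasks queue, fetchers, processors ... */);
Thread workerThread = new Thread(new ThreadStart(worker.run));
workerThread.IsBackground = true;  // do not keep the process alive on shutdown
workerThread.Start();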
/**
 * This method returns a rank for the anchor url.
 */
private int getRankOfAnchor(LinkItem item)
{
    //These variables will contain the max and avg of the match levels of the anchor url
    int maxMatchLevelForAnchor = 0;
    int avgMatchLevelForAnchor = 0;
    if (item.getAnchor() == null)
    {
        return 0;
    }
    StreamWriter sw = null;
    if (LogDebuggerControl.getInstance().debugRanker)
    {
        sw = new StreamWriter("DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true);
        sw.WriteLine(" ***** REQUEST FOR ANCHOR URL RANK************************************ ");
        sw.WriteLine(" URL : " + item.getLink());
        sw.WriteLine(" CONTENT OF ANCHOR:");
        sw.WriteLine(item.getAnchor());
        sw.Close();
    }
    //calculate the max and avg of the match levels of the anchor url against the categories.
    if (anchorOptions == null)
    {
        anchorOptions = getOptions("anchor");
    }
    List<int> matchLevelsForAnchor = categorizer.classifyContentToAllCategories(item.getAnchor(), anchorOptions);
    maxMatchLevelForAnchor = calculateMax(matchLevelsForAnchor);
    avgMatchLevelForAnchor = calculateAvg(matchLevelsForAnchor);
    if (LogDebuggerControl.getInstance().debugRanker)
    {
        sw = new StreamWriter("DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true);
        sw.WriteLine(" .MAX MATCH LEVEL OF ANCHOR: ");
        sw.WriteLine(maxMatchLevelForAnchor);
        sw.WriteLine(" .AVG MATCH LEVEL OF ANCHOR: ");
        sw.WriteLine(avgMatchLevelForAnchor);
        //sw.WriteLine(" .RANK OF ANCHOR: ");
        //sw.WriteLine((int)(RankParams.MinAndMaxRATIO * maxMatchLevelForAnchor + (1 - RankParams.MinAndMaxRATIO) * avgMatchLevelForAnchor));
        //sw.WriteLine(" * END ****************************************************************** ");
        sw.Close();
    }
    return (int)(RankerOptions.MinAndMaxRATIO * maxMatchLevelForAnchor + (1 - RankerOptions.MinAndMaxRATIO) * avgMatchLevelForAnchor);
}
/**
 * This method returns a rank for the nearby text of the url.
 */
private int getRankOfNearbyText(LinkItem item)
{
    //These variables will contain the max and avg of the match levels of the nearby
    //text of the extracted url.
    int maxMatchLevelForNearby = 0;
    int avgMatchLevelForNearby = 0;
    StreamWriter sw = null;
    if (LogDebuggerControl.getInstance().debugRanker)
    {
        sw = new StreamWriter("DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true);
        sw.WriteLine(" ***** REQUEST FOR NEARBY TEXT RANK************************************ ");
        sw.WriteLine(" URL : " + item.getLink());
        sw.WriteLine(" CONTENT OF NEARBY TEXT:");
        sw.WriteLine(item.getText());
        sw.Close();
    }
    //calculate the max and avg of the match levels of the nearby text against the categories.
    if (nearbyOptions == null)
    {
        nearbyOptions = getOptions("nearby");
    }
    List<int> matchLevelsForNearby = categorizer.classifyContentToAllCategories(item.getText(), nearbyOptions);
    maxMatchLevelForNearby = calculateMax(matchLevelsForNearby);
    avgMatchLevelForNearby = calculateAvg(matchLevelsForNearby);
    if (LogDebuggerControl.getInstance().debugRanker)
    {
        sw = new StreamWriter("DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true);
        sw.WriteLine(" .MAX MATCH LEVEL OF NEARBY TEXT: ");
        sw.WriteLine(maxMatchLevelForNearby);
        sw.WriteLine(" .AVG MATCH LEVEL OF NEARBY TEXT: ");
        sw.WriteLine(avgMatchLevelForNearby);
        //sw.WriteLine(" .RANK OF NEARBY TEXT: ");
        //sw.WriteLine((int)(RankParams.MinAndMaxRATIO * maxMatchLevelForNearby + (1 - RankParams.MinAndMaxRATIO) * avgMatchLevelForNearby));
        //sw.WriteLine(" * END ****************************************************************** ");
        sw.Close();
    }
    return (int)(RankerOptions.MinAndMaxRATIO * maxMatchLevelForNearby + (1 - RankerOptions.MinAndMaxRATIO) * avgMatchLevelForNearby);
}
/**
 * This method returns a rank for the whole page content.
 */
private int getRankOfWholeContent(ResourceContent resource)
{
    //These variables will contain the max and avg of the match levels of the whole
    //content of the parent resource.
    int maxMatchLevelForContent = 0;
    int avgMatchLevelForContent = 0;
    StreamWriter sw = null;
    if (LogDebuggerControl.getInstance().debugRanker)
    {
        sw = new StreamWriter("DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true);
        sw.WriteLine(" ***** REQUEST FOR WHOLE CONTENT RANK******************************** ");
        //sw.WriteLine(" URL : " + resource.getResourceUrl());
        sw.Close();
    }
    //calculate the max and avg of the match levels of the whole resource content against the categories.
    if (wholeContentOptions == null)
    {
        wholeContentOptions = getOptions("wholeContent");
    }
    List<int> matchLevelsForContent = categorizer.classifyContentToAllCategories(resource.getResourceContent().Substring(0), wholeContentOptions);
    maxMatchLevelForContent = calculateMax(matchLevelsForContent);
    avgMatchLevelForContent = calculateAvg(matchLevelsForContent);
    if (LogDebuggerControl.getInstance().debugRanker)
    {
        sw = new StreamWriter("DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true);
        sw.WriteLine(" .MAX MATCH LEVEL OF WHOLE CONTENT: ");
        sw.WriteLine(maxMatchLevelForContent);
        sw.WriteLine(" .AVG MATCH LEVEL OF WHOLE CONTENT: ");
        sw.WriteLine(avgMatchLevelForContent);
        //sw.WriteLine(" .RANK OF WHOLE CONTENT: ");
        //sw.WriteLine((int)(RankParams.MinAndMaxRATIO * maxMatchLevelForContent + (1 - RankParams.MinAndMaxRATIO) * avgMatchLevelForContent));
        //sw.WriteLine(" * END ****************************************************************** ");
        sw.Close();
    }
    return (int)(RankerOptions.MinAndMaxRATIO * maxMatchLevelForContent + (1 - RankerOptions.MinAndMaxRATIO) * avgMatchLevelForContent);
}
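/*
 * calculateMax and calculateAvg are called by the three rank methods above but
 * are not shown in this listing. The names come from those calls; the bodies
 * below are only an illustrative sketch of helpers that reduce the
 * per-category match-level list, not the original implementation.
 */
private int calculateMax(List<int> matchLevels)
{
    // highest match level over all categories, 0 for an empty list
    int max = 0;
    foreach (int level in matchLevels)
    {
        if (level > max)
        {
            max = level;
        }
    }
    return max;
}

private int calculateAvg(List<int> matchLevels)
{
    // integer average of the match levels, 0 for an empty list
    if (matchLevels.Count == 0)
    {
        return 0;
    }
    int sum = 0;
    foreach (int level in matchLevels)
    {
        sum += level;
    }
    return sum / matchLevels.Count;
}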
/**
 * Main method of the console application.
 */
public static void Main(String[] args)
{
    bool toContinue = ParseArguements(args), needToRestart = false;
    if (toContinue == false)
    {
        return;
    }
    Queue<int> keepAlive = new Queue<int>();
    String currentUser = "******", currentTask = "";
    LogDebuggerControl.getInstance().debugCategorization = false;
    LogDebuggerControl.getInstance().debugCategorizationInRanker = false;
    LogDebuggerControl.getInstance().debugRanker = false;
    while (true)
    {
        // select which task to invoke
        SelectTask(ref currentUser, ref currentTask);
        // update the WorkDetails class with the new taskId
        WorkDetails.setTaskId(currentTask);
        // set ALL constants of the task
        SetAllConstants();
        // get the initialization data
        SetInitializer(currentTask);
        // initialize the queues
        InitQueues(currentTask);
        // initialize the worker and frontier threads
        InvokeThreads();
        // poll for user requests
        while (needToRestart == false)
        {
            Thread.Sleep(_refreshRate * 1000);
            StatusDisplay.DisplayOnScreen(_feedBackQueue, _serversQueues);
            if (_operationMode == operationMode_t.Auto)
            {
                List<TaskStatus> tasks = StorageSystem.StorageSystem.getInstance().getWorkDetails(currentUser, QueryOption.ActiveTasks);
                needToRestart = true;
                foreach (TaskStatus task in tasks)
                {
                    if (task.getTaskID() == currentTask)
                    {
                        task.setTaskElapsedTime(task.getTaskElapsedTime() + _refreshRate);
                        StorageSystem.StorageSystem.getInstance().changeWorkDetails(task);
                        needToRestart = false;
                        continue;
                    }
                }
            }
        }
        // terminate all the threads
        TerminateThreads();
        needToRestart = false;
        RuntimeStatistics.resetStatistics();
    }
    //RankerTest test = new RankerTest();
    //test.Test2();
}
/**
 * This method calculates the rank of a given url and returns it.
 */
public int rankUrl(ResourceContent parentResource, LinkItem item)
{
    //These variables will contain the ranks for the whole content match, the nearby text match,
    //the anchor match and the parent rank.
    int rankParentUrl = parentResource.getRankOfUrl();
    int anchorRank = 0;
    //int wholePageRank = 0;
    int nearbyTextRank = 0;
    int neighborhood = 0;
    int context = 0;
    int inherited = 0;
    char[] separators = { ' ', '\t', '\n' };
    NumOfLinks++;
    sumOfTotalNearbyWords += item.getText().Split(separators).Length;
    sumOfTotalAnchorWords += item.getAnchor().Split(separators, StringSplitOptions.RemoveEmptyEntries).Length;
    StreamWriter sw = null;
    if (LogDebuggerControl.getInstance().debugRanker)
    {
        sw = new StreamWriter("DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true);
        sw.WriteLine(" *********HEAD REQUEST *********************************************");
        sw.WriteLine(" ***** DATA FOR RANKER******************************************** ");
        sw.WriteLine(" URL : " + item.getLink());
        sw.WriteLine(" PARENT URL : " + item.getParentUrl());
        sw.Close();
    }
    //rank of the whole page (recomputed only when the parent content changes)
    if (!((lastResourceContent != null) && (lastResourceContent.Equals(parentResource.getResourceContent()))))
    {
        lastResourceContent = parentResource.getResourceContent();
        wholePageRank = getRankOfWholeContent(parentResource);
    }
    //rank of the nearby text
    nearbyTextRank = getRankOfNearbyText(item);
    DateTime endTimeOfNearby = DateTime.Now;
    //rank of the anchor url
    anchorRank = getRankOfAnchor(item);
    //rank of the neighborhood, which combines the rank of the anchor and the nearby text
    if (anchorRank > RankerOptions.ConfidenceLevelOfAnchor)
    {
        context = 100;
    }
    else
    {
        //nearbyTextRank = getRankOfNearbyText(item);
        context = nearbyTextRank;
    }
    neighborhood = (int)(RankerOptions.BETTA * anchorRank + (1 - RankerOptions.BETTA) * context);
    //rank of the inherited part, which combines the rank of the parent url and the parent content
    inherited = (int)(RankerOptions.ALPHA * rankParentUrl + (1 - RankerOptions.ALPHA) * wholePageRank);
    if (LogDebuggerControl.getInstance().debugRanker)
    {
        sw = new StreamWriter("DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true);
        sw.WriteLine("************************DATA CONCLUSION*************************");
        sw.WriteLine(" .PARENT RANK: ");
        sw.WriteLine(rankParentUrl);
        sw.WriteLine(" .RANK OF NEARBY TEXT: ");
        sw.WriteLine(nearbyTextRank);
        sw.WriteLine(" .AVG OF NEARBY WORDS");
        sw.WriteLine((int)(sumOfTotalNearbyWords / NumOfLinks));
        sw.WriteLine(" .RANK OF ANCHOR: ");
        sw.WriteLine(anchorRank);
        sw.WriteLine(" .AVG OF ANCHOR TEXT");
        sw.WriteLine((int)(sumOfTotalAnchorWords / NumOfLinks));
        sw.WriteLine(" .NEIGHBORHOOD: ");
        sw.WriteLine(neighborhood);
        sw.WriteLine(" .RANK OF WHOLE CONTENT: ");
        sw.WriteLine(wholePageRank);
        sw.WriteLine(" .INHERITED: ");
        sw.WriteLine(inherited);
        sw.WriteLine(" .RANK OF THE URL: ");
        sw.WriteLine((int)(RankerOptions.GAMMA * inherited + (1 - RankerOptions.GAMMA) * neighborhood));
        //sw.WriteLine(" * END ****************************************************************** ");
        sw.Close();
    }
    //Console.WriteLine(totalRankingTime.TotalSeconds);
    return (int)(RankerOptions.GAMMA * inherited + (1 - RankerOptions.GAMMA) * neighborhood);
}
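/*
 * A worked example of the weighting above (illustrative only - the real
 * ALPHA/BETTA/GAMMA constants live in RankerOptions): with ALPHA = BETTA =
 * GAMMA = 0.5, parent rank 80, whole-page rank 40, anchor rank 20 (assumed to
 * be below the anchor confidence level) and nearby-text rank 60:
 *
 *   inherited    = 0.5 * 80 + 0.5 * 40 = 60
 *   neighborhood = 0.5 * 20 + 0.5 * 60 = 40
 *   rank         = 0.5 * 60 + 0.5 * 40 = 50
 */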
/**
 * This method tries to process the given content, assuming that the given content
 * can be processed via this processor.
 */
public void process(ResourceContent resource)
{
    DateTime startTime = DateTime.Now;
    List<LinkItem> listOfLinks;
    //extract all the links in the page
    listOfLinks = extractor.extractLinks(resource.getResourceUrl(), resource.getResourceContent());
    RuntimeStatistics.addToExtractedUrls(listOfLinks.Count);
    DateTime extEndTime = DateTime.Now;
    /*** 1. Extracting the links from the request ***/
    TimeSpan extRequest = extEndTime - startTime;
    //reset the dictionary in the filter that contains the urls from the same page
    filter.resetDictionary();
    int filteredUrlsCount = 0;
    foreach (LinkItem item in listOfLinks)
    {
        //filter the links and keep only links that can be crawled
        List<String> links = new List<String>();
        links.Add(item.getLink());
        List<String> filteredLinks = filter.filterLinks(links);
        //if filteredLinks is not empty, rank the link and feed it to the frontier
        if (filteredLinks.Count > 0)
        {
            filteredUrlsCount++;
            Url url = new Url(filteredLinks[0], hashUrl(filteredLinks[0]), ranker.rankUrl(resource, item),
                              item.getDomainUrl(), hashUrl(item.getDomainUrl()));
            deployLinksToFrontier(url);
            RuntimeStatistics.addToFeedUrls(1);
        }
    }
    DateTime catStartTime = DateTime.Now;
    /*** 2. Ranking and deployment to the frontier ***/
    TimeSpan rankTotalRequest = catStartTime - extEndTime;
    //ascribe the url to all the categories it belongs to
    List<Result> classifiedResults = categorizer.classifyContent(resource.getResourceContent(), resource.getResourceUrl());
    if (classifiedResults.Count != 0)
    {
        RuntimeStatistics.addToCrawledUrls(1);
    }
    DateTime catEndTime = DateTime.Now;
    /*** 3. Classification of the current request ***/
    TimeSpan catTotalRequest = catEndTime - catStartTime;
    //save all the results to storage
    foreach (Result classifiedResult in classifiedResults)
    {
        Result result = new Result("0", classifiedResult.getUrl(), classifiedResult.getCategoryID(),
                                   resource.getRankOfUrl(), classifiedResult.getTrustMeter());
        deployResourceToStorage(result);
    }
    DateTime endTime = DateTime.Now;
    /*** 4. Deployment to the database (result) ***/
    TimeSpan deployRequest = endTime - catEndTime;
    /*** 5. Total processing time ***/
    TimeSpan totalRequest = endTime - startTime;
    // write request times to the timing log file
    if (LogDebuggerControl.getInstance().enableTiming)
    {
        StreamWriter sw = new StreamWriter("_DEBUG_INFO_PROCESSOR_TIMING@" +
            System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true);
        sw.WriteLine(" TIMING FOR REQ - [] ");
        sw.WriteLine(" - Extractor Time " + extRequest.TotalSeconds + " seconds ");
        sw.WriteLine(" - Ranker Time    " + rankTotalRequest.TotalSeconds + " seconds ");
        sw.WriteLine(" - Categori. Time " + catTotalRequest.TotalSeconds + " seconds ");
        sw.WriteLine(" - Deploy Time    " + deployRequest.TotalSeconds + " seconds ");
        sw.WriteLine(" - Total Timing   " + totalRequest.TotalSeconds + " seconds ");
        sw.Close();
    }
}