Пример #1
0
        /**
         * Classifies the content of a resource (e.g. an html page) against every
         * category in categoryList.
         *
         * resource - the content to classify.
         * url      - the url of the resource; it is passed on to every Result created.
         *
         * Returns one Result for each category whose match level is strictly greater
         * than that category's confidence level. The list is empty when no category
         * qualifies.
         */
        public List <Result> classifyContent(String resource, String url)
        {
            // The categorizer options are fetched lazily, on first use.
            if (options == null)
            {
                options = getCategorizerOptions();
            }

            List <Result> results = new List <Result>();

            foreach (Category category in categoryList)
            {
                if (LogDebuggerControl.getInstance().debugCategorization)
                {
                    // One debug file per thread, opened in append mode. The using block
                    // releases the file handle even when one of the writes throws.
                    using (StreamWriter sw = new StreamWriter(
                               "_DEBUG_INFO_CATEGORIZER@" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
                    {
                        sw.WriteLine(" ***** HEAD REQUEST ************************************************* ");
                        sw.WriteLine(" URL : " + url);
                    }
                }

                int matchLevel = category.getMatchLevel(resource, options);

                // Only categories that beat their own confidence threshold are reported.
                if (matchLevel > category.getConfidenceLevel())
                {
                    results.Add(new Result("0", url, category.getCategoryID(), 0, matchLevel));
                }
            }

            return results;
        }
Пример #2
0
        /**
         * Worker main loop: repeatedly takes a url task from the task queue, fetches
         * the resource and hands the fetched content to the processors.
         *
         * The loop has no exit condition, so this method never returns.
         */
        public void run()
        {
            int      requestNum = 0;
            TimeSpan totalProcessTime;

            // Initial delay before the first request is served.
            Thread.Sleep(10000);
            while (true)
            {
                DateTime startTime = DateTime.Now;
                try
                {
                    SyncAccessor.getSlot(2, 1);
                    Url task = SyncAccessor.getFromQueue <Url>(_tasks, _timer);

                    ResourceContent content = _fetchers.fetchResource(task.getUrl());

                    if (content.isValid() != true)
                    {
                        // The fetch failed; drop this task and move on to the next one.
                        continue;
                    }

                    // Re-wrap the fetched content so that it carries the rank of the task.
                    ResourceContent modifiedContent = new ResourceContent(content.getResourceUrl(), content.getResourceType(),
                                                                          content.getResourceContent(), content.getReturnCode(), task.getRank());

                    DateTime startProcess = DateTime.Now;
                    _processors.processResource(modifiedContent);
                    totalProcessTime = DateTime.Now - startProcess;

                    System.Console.WriteLine(" URL Processed Successfully ... ");
                }
                catch (Exception)
                {
                    // Best effort: a failing request is counted as an error and skipped,
                    // so that a single bad url does not stop the worker.
                    RuntimeStatistics.addToErrors(1);
                    continue;
                }

                TimeSpan totalRequestTime = DateTime.Now - startTime;

                if (LogDebuggerControl.getInstance().enableTiming)
                {
                    // Append the request timing to the per-thread timing log. The using
                    // block releases the file handle even when the write throws.
                    using (StreamWriter sw = new StreamWriter(
                               "_DEBUG_INFO_TIMING@" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
                    {
                        // requestNum only advances when a timing line is actually written.
                        sw.WriteLine(" TIMING FOR REQ - " + requestNum++ + " takes about " + totalRequestTime.TotalSeconds + " s, Processed At " + totalProcessTime.TotalSeconds + " s");
                    }
                }
            }
        }
Пример #3
0
        /**
         * Returns a rank for the anchor of the given link.
         *
         * The anchor is classified against all the categories; the rank is a weighted
         * combination (RankerOptions.MinAndMaxRATIO) of the maximum and the average
         * match level. A link without an anchor gets rank 0.
         */
        private int getRankOfAnchor(LinkItem item)
        {
            if (item.getAnchor() == null)
            {
                return 0;
            }

            String debugFileName = "DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt";

            if (LogDebuggerControl.getInstance().debugRanker)
            {
                // Append to the per-thread ranker log. The using block releases the file
                // handle even when one of the writes throws.
                using (StreamWriter sw = new StreamWriter(debugFileName, true))
                {
                    sw.WriteLine(" ***** REQUEST FOR ANCHOR URL RANK************************************ ");
                    sw.WriteLine(" URL : " + item.getLink());
                    sw.WriteLine(" CONTENT OF ANCHOR:");
                    sw.WriteLine(item.getAnchor());
                }
            }

            // The anchor options are fetched lazily, on first use.
            if (anchorOptions == null)
            {
                anchorOptions = getOptions("anchor");
            }

            // Calculate the max and the average of the match levels of the anchor to the categories.
            List <int> matchLevelsForAnchor = categorizer.classifyContentToAllCategories(item.getAnchor(), anchorOptions);

            int maxMatchLevelForAnchor = calculateMax(matchLevelsForAnchor);
            int avgMatchLevelForAnchor = calculateAvg(matchLevelsForAnchor);

            if (LogDebuggerControl.getInstance().debugRanker)
            {
                using (StreamWriter sw = new StreamWriter(debugFileName, true))
                {
                    sw.WriteLine(" .MAX MATCH LEVEL OF ANCHOR: ");
                    sw.WriteLine(maxMatchLevelForAnchor);
                    sw.WriteLine(" .AVG MATCH LEVEL OF ANCHOR: ");
                    sw.WriteLine(avgMatchLevelForAnchor);
                }
            }

            return (int)(RankerOptions.MinAndMaxRATIO * maxMatchLevelForAnchor + (1 - RankerOptions.MinAndMaxRATIO) * avgMatchLevelForAnchor);
        }
Пример #4
0
        /**
         * Returns a rank for the nearby text of the given link.
         *
         * The nearby text is classified against all the categories; the rank is a
         * weighted combination (RankerOptions.MinAndMaxRATIO) of the maximum and the
         * average match level.
         */
        private int getRankOfNearbyText(LinkItem item)
        {
            String debugFileName = "DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt";

            if (LogDebuggerControl.getInstance().debugRanker)
            {
                // Append to the per-thread ranker log. The using block releases the file
                // handle even when one of the writes throws.
                using (StreamWriter sw = new StreamWriter(debugFileName, true))
                {
                    sw.WriteLine(" ***** REQUEST FOR NEARBY TEXT RANK************************************ ");
                    sw.WriteLine(" URL : " + item.getLink());
                    sw.WriteLine(" CONTENT OF NEARBY TEXT:");
                    sw.WriteLine(item.getText());
                }
            }

            // The nearby-text options are fetched lazily, on first use.
            if (nearbyOptions == null)
            {
                nearbyOptions = getOptions("nearby");
            }

            // Calculate the max and the average of the match levels of the nearby text to the categories.
            List <int> matchLevelsForNearby = categorizer.classifyContentToAllCategories(item.getText(), nearbyOptions);

            int maxMatchLevelForNearby = calculateMax(matchLevelsForNearby);
            int avgMatchLevelForNearby = calculateAvg(matchLevelsForNearby);

            if (LogDebuggerControl.getInstance().debugRanker)
            {
                using (StreamWriter sw = new StreamWriter(debugFileName, true))
                {
                    sw.WriteLine(" .MAX MATCH LEVEL OF NEARBY TEXT: ");
                    sw.WriteLine(maxMatchLevelForNearby);
                    sw.WriteLine(" .AVG MATCH LEVEL OF NEARBY TEXT: ");
                    sw.WriteLine(avgMatchLevelForNearby);
                }
            }

            return (int)(RankerOptions.MinAndMaxRATIO * maxMatchLevelForNearby + (1 - RankerOptions.MinAndMaxRATIO) * avgMatchLevelForNearby);
        }
Пример #5
0
        /**
         * Returns a rank for the whole content of the given resource.
         *
         * The content is classified against all the categories; the rank is a weighted
         * combination (RankerOptions.MinAndMaxRATIO) of the maximum and the average
         * match level.
         */
        private int getRankOfWholeContent(ResourceContent resource)
        {
            String debugFileName = "DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt";

            if (LogDebuggerControl.getInstance().debugRanker)
            {
                // Append to the per-thread ranker log. The using block releases the file
                // handle even when the write throws.
                using (StreamWriter sw = new StreamWriter(debugFileName, true))
                {
                    sw.WriteLine(" ***** REQUEST FOR WHOLE CONTENT RANK******************************** ");
                }
            }

            // The whole-content options are fetched lazily, on first use.
            if (wholeContentOptions == null)
            {
                wholeContentOptions = getOptions("wholeContent");
            }

            // Calculate the max and the average of the match levels of the whole resource
            // content to the categories.
            // NOTE(review): Substring(0) does not shorten the content; it is kept so that
            // the behaviour for a null content (an exception here) stays unchanged.
            List <int> matchLevelsForContent = categorizer.classifyContentToAllCategories(resource.getResourceContent().Substring(0), wholeContentOptions);

            int maxMatchLevelForContent = calculateMax(matchLevelsForContent);
            int avgMatchLevelForContent = calculateAvg(matchLevelsForContent);

            if (LogDebuggerControl.getInstance().debugRanker)
            {
                using (StreamWriter sw = new StreamWriter(debugFileName, true))
                {
                    sw.WriteLine(" .MAX MATCH LEVEL OF WHOLE CONTENT: ");
                    sw.WriteLine(maxMatchLevelForContent);
                    sw.WriteLine(" .AVG MATCH LEVEL OF WHOLE CONTENT: ");
                    sw.WriteLine(avgMatchLevelForContent);
                }
            }

            return (int)(RankerOptions.MinAndMaxRATIO * maxMatchLevelForContent + (1 - RankerOptions.MinAndMaxRATIO) * avgMatchLevelForContent);
        }
Пример #6
0
        /**
         * Main method of the console application.
         *
         * Parses the arguments and then loops forever: selects a task, initializes the
         * constants, the initializer and the queues, starts the threads and polls every
         * _refreshRate seconds. In Auto operation mode, when the current task is no
         * longer among the active tasks of the user, the threads are terminated and a
         * new task is selected.
         */
        public static void Main(String[] args)
        {
            if (ParseArguements(args) == false)
            {
                return;
            }

            bool   needToRestart = false;
            String currentUser = "******", currentTask = "";

            // All the debug logs are switched off for a regular run.
            LogDebuggerControl.getInstance().debugCategorization = false;
            LogDebuggerControl.getInstance().debugCategorizationInRanker = false;
            LogDebuggerControl.getInstance().debugRanker = false;

            while (true)
            {
                // select which task to invoke
                SelectTask(ref currentUser, ref currentTask);

                // update the WorkDetails class with the new taskId
                WorkDetails.setTaskId(currentTask);

                // set ALL constants of the task
                SetAllConstants();

                // getting init data
                SetInitializer(currentTask);

                // init queues
                InitQueues(currentTask);

                // initing worker and frontier threads
                InvokeThreads();

                // polling to the user requests
                while (needToRestart == false)
                {
                    Thread.Sleep(_refreshRate * 1000);
                    StatusDisplay.DisplayOnScreen(_feedBackQueue, _serversQueues);
                    if (_operationMode == operationMode_t.Auto)
                    {
                        List <TaskStatus> tasks =
                            StorageSystem.StorageSystem.getInstance().getWorkDetails(currentUser, QueryOption.ActiveTasks);

                        // Assume the task is gone until it is found among the active tasks.
                        needToRestart = true;
                        foreach (TaskStatus task in tasks)
                        {
                            if (task.getTaskID() == currentTask)
                            {
                                // Still active: account for the time that passed since the last poll.
                                task.setTaskElapsedTime(task.getTaskElapsedTime() + _refreshRate);
                                StorageSystem.StorageSystem.getInstance().changeWorkDetails(task);
                                needToRestart = false;
                            }
                        }
                    }
                }

                // Terminate all the threads
                TerminateThreads();
                needToRestart = false;
                RuntimeStatistics.resetStatistics();
            }
        }
Пример #7
0
        /**
         * Calculates the rank of a given url and returns it.
         *
         * parentResource - the resource (page) the link was extracted from.
         * item           - the extracted link, with its anchor and nearby text.
         *
         * The rank combines two parts, weighted by RankerOptions.GAMMA:
         *  - inherited:    the rank of the parent url and the rank of the parent content
         *                  (weighted by RankerOptions.ALPHA);
         *  - neighborhood: the rank of the anchor and the context (weighted by
         *                  RankerOptions.BETTA), where the context is 100 when the anchor
         *                  rank exceeds RankerOptions.ConfidenceLevelOfAnchor and the rank
         *                  of the nearby text otherwise.
         */
        public int rankUrl(ResourceContent parentResource, LinkItem item)
        {
            int rankParentUrl = parentResource.getRankOfUrl();

            // Running word statistics over all the ranked links (only reported in the debug log).
            char[] separators = { ' ', '\t', '\n' };
            NumOfLinks++;
            sumOfTotalNearbyWords += item.getText().Split(separators).Length;

            // getRankOfAnchor treats a null anchor as a valid input (rank 0), so a link
            // without an anchor must not fail here; it just contributes no anchor words.
            String anchor = item.getAnchor();
            if (anchor != null)
            {
                sumOfTotalAnchorWords += anchor.Split(separators, StringSplitOptions.RemoveEmptyEntries).Length;
            }

            String debugFileName = "DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt";

            if (LogDebuggerControl.getInstance().debugRanker)
            {
                // Append to the per-thread ranker log. The using block releases the file
                // handle even when one of the writes throws.
                using (StreamWriter sw = new StreamWriter(debugFileName, true))
                {
                    sw.WriteLine(" *********HEAD REQUEST *********************************************");
                    sw.WriteLine(" ***** DATA FOR RANKER******************************************** ");
                    sw.WriteLine(" URL : " + item.getLink());
                    sw.WriteLine(" PARENT URL : " + item.getParentUrl());
                }
            }

            // rank of the whole page - recalculated only when the parent content differs
            // from the content seen on the previous call
            if ((lastResourceContent == null) || !lastResourceContent.Equals(parentResource.getResourceContent()))
            {
                lastResourceContent = parentResource.getResourceContent();
                wholePageRank       = getRankOfWholeContent(parentResource);
            }

            // rank of the nearby text
            int nearbyTextRank = getRankOfNearbyText(item);

            // rank of the anchor url
            int anchorRank = getRankOfAnchor(item);

            // rank of the neighborhood, that includes the rank of the anchor and the context
            int context = (anchorRank > RankerOptions.ConfidenceLevelOfAnchor) ? 100 : nearbyTextRank;
            int neighborhood = (int)(RankerOptions.BETTA * anchorRank + (1 - RankerOptions.BETTA) * context);

            // rank of the inherited, that includes the rank of the parent url and the parent content
            int inherited = (int)(RankerOptions.ALPHA * rankParentUrl + (1 - RankerOptions.ALPHA) * wholePageRank);

            int rank = (int)(RankerOptions.GAMMA * inherited + (1 - RankerOptions.GAMMA) * neighborhood);

            if (LogDebuggerControl.getInstance().debugRanker)
            {
                using (StreamWriter sw = new StreamWriter(debugFileName, true))
                {
                    sw.WriteLine("************************DATA CONCLUSION*************************");
                    sw.WriteLine(" .PARENT RANK: ");
                    sw.WriteLine(rankParentUrl);
                    sw.WriteLine(" .RANK OF NEARBY TEXT: ");
                    sw.WriteLine(nearbyTextRank);
                    sw.WriteLine(" .AVG OF NEARBY WORDS");
                    sw.WriteLine((int)(sumOfTotalNearbyWords / NumOfLinks));
                    sw.WriteLine(" .RANK OF ANCHOR: ");
                    sw.WriteLine(anchorRank);
                    sw.WriteLine(" .AVG OF ANCHOR TEXT");
                    sw.WriteLine((int)(sumOfTotalAnchorWords / NumOfLinks));
                    sw.WriteLine(" .NEIGHBORHOOD: ");
                    sw.WriteLine(neighborhood);
                    sw.WriteLine(" .RANK OF WHOLE CONTENT: ");
                    sw.WriteLine(wholePageRank);
                    sw.WriteLine(" .INHERITED: ");
                    sw.WriteLine(inherited);
                    sw.WriteLine(" .RANK OF THE URL: ");
                    sw.WriteLine(rank);
                }
            }

            return rank;
        }
        /**
         * Processes the given content, assuming that it can be processed by this
         * processor:
         *  1. extracts all the links of the page;
         *  2. filters every link, ranks the links that passed the filter and deploys
         *     them to the frontier;
         *  3. classifies the page content to the categories;
         *  4. deploys every classification result to the storage.
         *
         * When timing is enabled, the time spent in each stage is appended to a
         * per-thread timing log.
         */
        public void process(ResourceContent resource)
        {
            DateTime startTime = DateTime.Now;

            // extract all the links in page
            List <LinkItem> listOfLinks = extractor.extractLinks(resource.getResourceUrl(), resource.getResourceContent());
            RuntimeStatistics.addToExtractedUrls(listOfLinks.Count);

            DateTime extEndTime = DateTime.Now;

            /*** 1. Extracting the links from the request ***/
            TimeSpan extRequest = extEndTime - startTime;

            // reset the dictionary in filter that contains the urls from the same page
            filter.resetDictionary();

            foreach (LinkItem item in listOfLinks)
            {
                // Filter the link; the result is empty when the link can not be crawled.
                List <String> links = new List <String>();
                links.Add(item.getLink());
                List <String> filteredLinks = filter.filterLinks(links);

                if (filteredLinks.Count > 0)
                {
                    Url url = new Url(filteredLinks[0], hashUrl(filteredLinks[0]), ranker.rankUrl(resource, item),
                                      item.getDomainUrl(), hashUrl(item.getDomainUrl()));
                    deployLinksToFrontier(url);
                    RuntimeStatistics.addToFeedUrls(1);
                }
            }

            DateTime catStartTime = DateTime.Now;

            /*** 2. Filtering, ranking and deployment to the frontier ***/
            TimeSpan rankTotalRequest = catStartTime - extEndTime;

            // Ascribe the url to all the categories it belongs to.
            List <Result> classifiedResults = categorizer.classifyContent(resource.getResourceContent(),
                                                                          resource.getResourceUrl());

            // A page counts as crawled only when it was classified to at least one category.
            if (classifiedResults.Count != 0)
            {
                RuntimeStatistics.addToCrawledUrls(1);
            }
            DateTime catEndTime = DateTime.Now;

            /*** 3. Classification of the current request ***/
            TimeSpan catTotalRequest = catEndTime - catStartTime;

            // Save all the results to Storage, replacing the rank with the rank of the resource url.
            foreach (Result classifiedResult in classifiedResults)
            {
                Result result = new Result("0", classifiedResult.getUrl(), classifiedResult.getCategoryID(),
                                           resource.getRankOfUrl(), classifiedResult.getTrustMeter());
                deployResourceToStorage(result);
            }

            DateTime endTime = DateTime.Now;

            /*** 4. Deployment to the database (result) ***/
            TimeSpan deployRequest = endTime - catEndTime;

            /*** Total processing time ***/
            TimeSpan totalRequest = endTime - startTime;

            // write request time to timing log file
            if (LogDebuggerControl.getInstance().enableTiming)
            {
                // The using block releases the file handle even when one of the writes throws.
                using (StreamWriter sw = new StreamWriter(
                           "_DEBUG_INFO_PROCESSOR_TIMING@" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
                {
                    sw.WriteLine(" TIMING FOR REQ - [] ");
                    sw.WriteLine(" - Extractor Time " + extRequest.TotalSeconds + " seconds ");
                    // The ranker line reports the ranking stage, not the extraction stage.
                    sw.WriteLine(" - Ranker    Time " + rankTotalRequest.TotalSeconds + " seconds ");
                    sw.WriteLine(" - Categori. Time " + catTotalRequest.TotalSeconds + " seconds ");
                    sw.WriteLine(" - Deploy    Time " + deployRequest.TotalSeconds + " seconds ");
                    sw.WriteLine(" - Total Timing " + totalRequest.TotalSeconds + " seconds ");
                }
            }
        }