예제 #1
0
        /**
         * Classifies the content of a resource (HTML page) against every category
         * in categoryList.
         *
         * resource - text content of the resource to classify.
         * url      - URL of the resource; stored in each result and written to the
         *            debug log.
         *
         * Returns one Result per category whose match level is strictly greater
         * than that category's confidence level (empty list when none qualifies).
         */
        public List<Result> classifyContent(String resource, String url)
        {
            // The categorizer options are created lazily on first use and then reused.
            if (options == null)
            {
                options = getCategorizerOptions();
            }

            List<Result> results = new List<Result>();

            foreach (Category category in categoryList)
            {
                if (LogDebuggerControl.getInstance().debugCategorization)
                {
                    // One header per category, appended to a per-thread debug file.
                    // "using" guarantees the file handle is released even if a write
                    // throws, and closes the file before getMatchLevel is called.
                    using (StreamWriter sw = new StreamWriter(
                               "_DEBUG_INFO_CATEGORIZER@" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
                    {
                        sw.WriteLine(" ***** HEAD REQUEST ************************************************* ");
                        sw.WriteLine(" URL : " + url);
                    }
                }

                int matchLevel = category.getMatchLevel(resource, options);

                // Only categories that exceed their own confidence level are reported.
                if (matchLevel > category.getConfidenceLevel())
                {
                    results.Add(new Result("0", url, category.getCategoryID(), 0, matchLevel));
                }
            }

            return results;
        }
예제 #2
0
        /**
         * Classifies the content of a resource (HTML page) against every category
         * in categoryList.
         *
         * resource - text content of the resource to classify.
         * url      - URL of the resource; stored in each result and written to the
         *            debug log.
         *
         * Returns one Result per category whose match level is strictly greater
         * than that category's confidence level (empty list when none qualifies).
         */
        public List<Result> classifyContent(String resource, String url)
        {
            // The categorizer options are created lazily on first use and then reused.
            if (options == null)
            {
                options = getCategorizerOptions();
            }

            List<Result> results = new List<Result>();
            foreach (Category category in categoryList)
            {
                if (LogDebuggerControl.getInstance().debugCategorization)
                {
                    // One header per category, appended to a per-thread debug file.
                    // "using" guarantees the file handle is released even if a write
                    // throws, and closes the file before getMatchLevel is called.
                    using (StreamWriter sw = new StreamWriter(
                               "_DEBUG_INFO_CATEGORIZER@" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
                    {
                        sw.WriteLine(" ***** HEAD REQUEST ************************************************* ");
                        sw.WriteLine(" URL : " + url);
                    }
                }

                int matchLevel = category.getMatchLevel(resource, options);

                // Only categories that exceed their own confidence level are reported.
                if (matchLevel > category.getConfidenceLevel())
                {
                    results.Add(new Result("0", url, category.getCategoryID(), 0, matchLevel));
                }
            }

            return results;
        }
예제 #3
0
        /**
         * Computes the match level of the given content against every category.
         *
         * resource - text to evaluate (e.g. the content of an HTML page).
         * options  - options handed unchanged to each category's getMatchLevel.
         *
         * Returns the match levels in the iteration order of categoryList, one
         * entry per category.
         */
        public List<int> classifyContentToAllCategories(String resource, CategorizerOptions options)
        {
            List<int> levels = new List<int>();

            foreach (Category current in categoryList)
            {
                levels.Add(current.getMatchLevel(resource, options));
            }

            return levels;
        }
예제 #4
0
        /**
         * Computes the match level of the given content against every category.
         *
         * resource - text to evaluate (e.g. the content of an HTML page).
         * options  - options handed unchanged to each category's getMatchLevel.
         *
         * Returns the match levels in the iteration order of categoryList, one
         * entry per category.
         */
        public List<int> classifyContentToAllCategories(String resource, CategorizerOptions options)
        {
            List<int> levels = new List<int>();

            foreach (Category current in categoryList)
            {
                levels.Add(current.getMatchLevel(resource, options));
            }

            return levels;
        }
예제 #5
0
        /**
         * Computes a rank for the anchor text of a link.
         *
         * The anchor is matched against all categories; the rank is the weighted
         * blend  RATIO * max + (1 - RATIO) * avg  of the resulting match levels,
         * where RATIO is RankerOptions.MinAndMaxRATIO, cast to int.
         *
         * item - the link whose anchor is ranked.
         *
         * Returns 0 when the link has no anchor, otherwise the blended rank.
         */
        private int getRankOfAnchor(LinkItem item)
        {
            if (item.getAnchor() == null)
            {
                return 0;
            }

            if (LogDebuggerControl.getInstance().debugRanker)
            {
                // Appends to a per-thread log file; "using" releases the handle even
                // if a write throws and closes the file before the categorizer runs.
                using (StreamWriter sw = new StreamWriter(
                           "DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
                {
                    sw.WriteLine(" ***** REQUEST FOR ANCHOR URL RANK************************************ ");
                    sw.WriteLine(" URL : " + item.getLink());
                    sw.WriteLine(" CONTENT OF ANCHOR:");
                    sw.WriteLine(item.getAnchor());
                }
            }

            // The anchor options are created lazily on first use and then reused.
            if (anchorOptions == null)
            {
                anchorOptions = getOptions("anchor");
            }

            // Max and average of the anchor's match levels over all categories.
            List<int> matchLevelsForAnchor = categorizer.classifyContentToAllCategories(item.getAnchor(), anchorOptions);
            int maxMatchLevelForAnchor = calculateMax(matchLevelsForAnchor);
            int avgMatchLevelForAnchor = calculateAvg(matchLevelsForAnchor);

            if (LogDebuggerControl.getInstance().debugRanker)
            {
                using (StreamWriter sw = new StreamWriter(
                           "DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
                {
                    sw.WriteLine(" .MAX MATCH LEVEL OF ANCHOR: ");
                    sw.WriteLine(maxMatchLevelForAnchor);
                    sw.WriteLine(" .AVG MATCH LEVEL OF ANCHOR: ");
                    sw.WriteLine(avgMatchLevelForAnchor);
                }
            }

            return (int)(RankerOptions.MinAndMaxRATIO * maxMatchLevelForAnchor + (1 - RankerOptions.MinAndMaxRATIO) * avgMatchLevelForAnchor);
        }
예제 #6
0
        /**
         * Builds the CategorizerOptions used for classification.
         *
         * In automatic operation mode the ALPHA, BETA, GAMMA, minimum-words limit
         * and minimum-words penalty are copied from the CAT_* values held by
         * RankerOptions. In any other mode the object is returned exactly as its
         * constructor left it.
         */
        private CategorizerOptions getCategorizerOptions()
        {
            CategorizerOptions result = new CategorizerOptions();

            // Nothing to override outside automatic mode: keep the constructor defaults.
            if (WorkDetails.getOperationMode() != operationMode_t.Auto)
            {
                return result;
            }

            result.ALPHA = RankerOptions.CAT_ALPHA;
            result.BETA = RankerOptions.CAT_BETA;
            result.GAMMA = RankerOptions.CAT_GAMMA;
            result.MIN_WORDS_LIMIT = RankerOptions.CAT_MIN;
            result.MIN_WORDS_PENLTY = RankerOptions.CAT_PENLTY;

            return result;
        }
예제 #7
0
        /**
         * Computes a rank for the text that surrounds a link.
         *
         * The nearby text is matched against all categories; the rank is the
         * weighted blend  RATIO * max + (1 - RATIO) * avg  of the resulting match
         * levels, where RATIO is RankerOptions.MinAndMaxRATIO, cast to int.
         *
         * item - the link whose nearby text (item.getText()) is ranked.
         */
        private int getRankOfNearbyText(LinkItem item)
        {
            if (LogDebuggerControl.getInstance().debugRanker)
            {
                // Appends to a per-thread log file; "using" releases the handle even
                // if a write throws and closes the file before the categorizer runs.
                using (StreamWriter sw = new StreamWriter(
                           "DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
                {
                    sw.WriteLine(" ***** REQUEST FOR NEARBY TEXT RANK************************************ ");
                    sw.WriteLine(" URL : " + item.getLink());
                    sw.WriteLine(" CONTENT OF NEARBY TEXT:");
                    sw.WriteLine(item.getText());
                }
            }

            // The nearby-text options are created lazily on first use and then reused.
            if (nearbyOptions == null)
            {
                nearbyOptions = getOptions("nearby");
            }

            // Max and average of the nearby text's match levels over all categories.
            List<int> matchLevelsForNearby = categorizer.classifyContentToAllCategories(item.getText(), nearbyOptions);
            int maxMatchLevelForNearby = calculateMax(matchLevelsForNearby);
            int avgMatchLevelForNearby = calculateAvg(matchLevelsForNearby);

            if (LogDebuggerControl.getInstance().debugRanker)
            {
                using (StreamWriter sw = new StreamWriter(
                           "DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
                {
                    sw.WriteLine(" .MAX MATCH LEVEL OF NEARBY TEXT: ");
                    sw.WriteLine(maxMatchLevelForNearby);
                    sw.WriteLine(" .AVG MATCH LEVEL OF NEARBY TEXT: ");
                    sw.WriteLine(avgMatchLevelForNearby);
                }
            }

            return (int)(RankerOptions.MinAndMaxRATIO * maxMatchLevelForNearby + (1 - RankerOptions.MinAndMaxRATIO) * avgMatchLevelForNearby);
        }
예제 #8
0
        /**
         * Builds the CategorizerOptions for one kind of ranked text.
         *
         * optionsType - "anchor", "nearby" or "wholeContent"; any other value is
         *               treated like "wholeContent".
         *
         * In automatic operation mode the values are copied from the matching
         * ANC_*, NER_* or CAT_* members of RankerOptions and isRank is set to true.
         * NONZERO_MAX_EFFECT is set to 0 for "anchor" and 40 for "nearby", and is
         * not touched for "wholeContent". In any other mode the object is returned
         * exactly as its constructor left it.
         */
        private CategorizerOptions getOptions(String optionsType)
        {
            CategorizerOptions result = new CategorizerOptions();

            // Nothing to override outside automatic mode: keep the constructor defaults.
            if (WorkDetails.getOperationMode() != operationMode_t.Auto)
            {
                return result;
            }

            switch (optionsType)
            {
                case "anchor":
                    result.ALPHA = RankerOptions.ANC_ALPHA;
                    result.BETA = RankerOptions.ANC_BETA;
                    result.GAMMA = RankerOptions.ANC_GAMMA;
                    result.MIN_WORDS_LIMIT = RankerOptions.ANC_MIN;
                    result.MIN_WORDS_PENLTY = RankerOptions.ANC_PENLTY;
                    result.isRank = true;
                    result.NONZERO_MAX_EFFECT = 0;
                    break;

                case "nearby":
                    result.ALPHA = RankerOptions.NER_ALPHA;
                    result.BETA = RankerOptions.NER_BETA;
                    result.GAMMA = RankerOptions.NER_GAMMA;
                    result.MIN_WORDS_LIMIT = RankerOptions.NER_MIN;
                    result.MIN_WORDS_PENLTY = RankerOptions.NER_PENLTY;
                    result.NONZERO_MAX_EFFECT = 40;
                    result.isRank = true;
                    break;

                // Unrecognized types share the whole-content settings.
                case "wholeContent":
                default:
                    result.ALPHA = RankerOptions.CAT_ALPHA;
                    result.BETA = RankerOptions.CAT_BETA;
                    result.GAMMA = RankerOptions.CAT_GAMMA;
                    result.MIN_WORDS_LIMIT = RankerOptions.CAT_MIN;
                    result.MIN_WORDS_PENLTY = RankerOptions.CAT_PENLTY;
                    result.isRank = true;
                    break;
            }

            return result;
        }
예제 #9
0
        /**
         * Computes a rank for the whole content of a page.
         *
         * The content is matched against all categories; the rank is the weighted
         * blend  RATIO * max + (1 - RATIO) * avg  of the resulting match levels,
         * where RATIO is RankerOptions.MinAndMaxRATIO, cast to int.
         *
         * resource - the page whose content (resource.getResourceContent()) is ranked.
         */
        private int getRankOfWholeContent(ResourceContent resource)
        {
            if (LogDebuggerControl.getInstance().debugRanker)
            {
                // Appends to a per-thread log file; "using" releases the handle even
                // if a write throws and closes the file before the categorizer runs.
                using (StreamWriter sw = new StreamWriter(
                           "DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
                {
                    sw.WriteLine(" ***** REQUEST FOR WHOLE CONTENT RANK******************************** ");
                }
            }

            // The whole-content options are created lazily on first use and then reused.
            if (wholeContentOptions == null)
            {
                wholeContentOptions = getOptions("wholeContent");
            }

            // Max and average of the content's match levels over all categories.
            // The content is passed directly: the former Substring(0) call returned
            // the very same string and added nothing.
            List<int> matchLevelsForContent = categorizer.classifyContentToAllCategories(resource.getResourceContent(), wholeContentOptions);
            int maxMatchLevelForContent = calculateMax(matchLevelsForContent);
            int avgMatchLevelForContent = calculateAvg(matchLevelsForContent);

            if (LogDebuggerControl.getInstance().debugRanker)
            {
                using (StreamWriter sw = new StreamWriter(
                           "DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
                {
                    sw.WriteLine(" .MAX MATCH LEVEL OF WHOLE CONTENT: ");
                    sw.WriteLine(maxMatchLevelForContent);
                    sw.WriteLine(" .AVG MATCH LEVEL OF WHOLE CONTENT: ");
                    sw.WriteLine(avgMatchLevelForContent);
                }
            }

            return (int)(RankerOptions.MinAndMaxRATIO * maxMatchLevelForContent + (1 - RankerOptions.MinAndMaxRATIO) * avgMatchLevelForContent);
        }
예제 #10
0
        /**
         * Sets all the constants needed for the task.
         *
         * In order: sets the number of threads, copies the "anchor", "nearby" and
         * "Category" option sets obtained from getOptions into the ANC_*, NER_* and
         * CAT_* members of RankerOptions, loads the ranker options, and finally
         * reads the symmetric-line property of the current task (the value is not
         * used yet).
         */
        public static void SetAllConstants()
        {
            SetNumberOfThreads();

            // Anchor settings -> RankerOptions.ANC_*
            CategorizerOptions forAnchor = getOptions("anchor");
            RankerOptions.ANC_ALPHA = forAnchor.ALPHA;
            RankerOptions.ANC_BETA = forAnchor.BETA;
            RankerOptions.ANC_GAMMA = forAnchor.GAMMA;
            RankerOptions.ANC_MIN = forAnchor.MIN_WORDS_LIMIT;
            RankerOptions.ANC_PENLTY = forAnchor.MIN_WORDS_PENLTY;

            // Nearby-text settings -> RankerOptions.NER_*
            CategorizerOptions forNearby = getOptions("nearby");
            RankerOptions.NER_ALPHA = forNearby.ALPHA;
            RankerOptions.NER_BETA = forNearby.BETA;
            RankerOptions.NER_GAMMA = forNearby.GAMMA;
            RankerOptions.NER_MIN = forNearby.MIN_WORDS_LIMIT;
            RankerOptions.NER_PENLTY = forNearby.MIN_WORDS_PENLTY;

            // Category settings -> RankerOptions.CAT_*
            CategorizerOptions forCategory = getOptions("Category");
            RankerOptions.CAT_ALPHA = forCategory.ALPHA;
            RankerOptions.CAT_BETA = forCategory.BETA;
            RankerOptions.CAT_GAMMA = forCategory.GAMMA;
            RankerOptions.CAT_MIN = forCategory.MIN_WORDS_LIMIT;
            RankerOptions.CAT_PENLTY = forCategory.MIN_WORDS_PENLTY;

            getRankerOptions();

            // The symmetric line is read but not yet applied anywhere.
            String symmetricLine = StorageSystem.StorageSystem.getInstance().getProperty(
                WorkDetails.getTaskId(), TaskProperty.SYMMETRIC_LINE.ToString());
            //TODO: continue the assigning of the symmetric line
        }
예제 #11
0
        /**
         * Builds the CategorizerOptions of the requested kind from the properties
         * stored for the current task.
         *
         * optionsType - "anchor", "Category" or "nearby"; any other value is
         *               treated like "Category".
         *
         * In automatic operation mode the five properties (alpha, beta, gamma,
         * minimum words, penalty) are read from the storage system, and each one
         * that isRealNum accepts is converted with Convert.ToDouble and stored in
         * the result; the others keep the constructor default. In any other mode
         * the object is returned exactly as its constructor left it.
         */
        private static CategorizerOptions getOptions(String optionsType)
        {
            CategorizerOptions result = new CategorizerOptions();

            // Nothing to load outside automatic mode: keep the constructor defaults.
            if (WorkDetails.getOperationMode() != operationMode_t.Auto)
            {
                return result;
            }

            // Pick the property names that belong to the requested option set.
            String alphaKey, betaKey, gammaKey, minKey, penaltyKey;
            switch (optionsType)
            {
                case "anchor":
                    alphaKey = TaskProperty.ANC_ALPHA.ToString();
                    betaKey = TaskProperty.ANC_BETA.ToString();
                    gammaKey = TaskProperty.ANC_GAMMA.ToString();
                    minKey = TaskProperty.ANC_MIN.ToString();
                    penaltyKey = TaskProperty.ANC_PENLTY.ToString();
                    break;

                case "nearby":
                    alphaKey = TaskProperty.NER_ALPHA.ToString();
                    betaKey = TaskProperty.NER_BETA.ToString();
                    gammaKey = TaskProperty.NER_GAMMA.ToString();
                    minKey = TaskProperty.NER_MIN.ToString();
                    penaltyKey = TaskProperty.NER_PENLTY.ToString();
                    break;

                // Unrecognized types share the category settings.
                case "Category":
                default:
                    alphaKey = TaskProperty.CAT_ALPHA.ToString();
                    betaKey = TaskProperty.CAT_BETA.ToString();
                    gammaKey = TaskProperty.CAT_GAMMA.ToString();
                    minKey = TaskProperty.CAT_MIN.ToString();
                    penaltyKey = TaskProperty.CAT_PENLTY.ToString();
                    break;
            }

            // All five values are fetched before any of them is converted.
            String alphaText = StorageSystem.StorageSystem.getInstance().getProperty(WorkDetails.getTaskId(), alphaKey);
            String betaText = StorageSystem.StorageSystem.getInstance().getProperty(WorkDetails.getTaskId(), betaKey);
            String gammaText = StorageSystem.StorageSystem.getInstance().getProperty(WorkDetails.getTaskId(), gammaKey);
            String minText = StorageSystem.StorageSystem.getInstance().getProperty(WorkDetails.getTaskId(), minKey);
            String penaltyText = StorageSystem.StorageSystem.getInstance().getProperty(WorkDetails.getTaskId(), penaltyKey);

            if (isRealNum(alphaText))
            {
                result.ALPHA = Convert.ToDouble(alphaText);
            }

            if (isRealNum(betaText))
            {
                result.BETA = Convert.ToDouble(betaText);
            }

            if (isRealNum(gammaText))
            {
                result.GAMMA = Convert.ToDouble(gammaText);
            }

            if (isRealNum(minText))
            {
                result.MIN_WORDS_LIMIT = Convert.ToDouble(minText);
            }

            if (isRealNum(penaltyText))
            {
                result.MIN_WORDS_PENLTY = Convert.ToDouble(penaltyText);
            }

            return result;
        }
예제 #12
0
        /**
         * Computes a rank for the whole content of a page.
         *
         * The content is matched against all categories; the rank is the weighted
         * blend  RATIO * max + (1 - RATIO) * avg  of the resulting match levels,
         * where RATIO is RankerOptions.MinAndMaxRATIO, cast to int.
         *
         * resource - the page whose content (resource.getResourceContent()) is ranked.
         */
        private int getRankOfWholeContent(ResourceContent resource)
        {
            if (LogDebuggerControl.getInstance().debugRanker)
            {
                // Appends to a per-thread log file; "using" releases the handle even
                // if a write throws and closes the file before the categorizer runs.
                using (StreamWriter sw = new StreamWriter(
                           "DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
                {
                    sw.WriteLine(" ***** REQUEST FOR WHOLE CONTENT RANK******************************** ");
                }
            }

            // The whole-content options are created lazily on first use and then reused.
            if (wholeContentOptions == null)
            {
                wholeContentOptions = getOptions("wholeContent");
            }

            // Max and average of the content's match levels over all categories.
            // The content is passed directly: the former Substring(0) call returned
            // the very same string and added nothing.
            List<int> matchLevelsForContent = categorizer.classifyContentToAllCategories(resource.getResourceContent(), wholeContentOptions);
            int maxMatchLevelForContent = calculateMax(matchLevelsForContent);
            int avgMatchLevelForContent = calculateAvg(matchLevelsForContent);

            if (LogDebuggerControl.getInstance().debugRanker)
            {
                using (StreamWriter sw = new StreamWriter(
                           "DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
                {
                    sw.WriteLine(" .MAX MATCH LEVEL OF WHOLE CONTENT: ");
                    sw.WriteLine(maxMatchLevelForContent);
                    sw.WriteLine(" .AVG MATCH LEVEL OF WHOLE CONTENT: ");
                    sw.WriteLine(avgMatchLevelForContent);
                }
            }

            return (int)(RankerOptions.MinAndMaxRATIO * maxMatchLevelForContent + (1 - RankerOptions.MinAndMaxRATIO) * avgMatchLevelForContent);
        }
예제 #13
0
        /**
         * Computes a rank for the anchor text of a link.
         *
         * The anchor is matched against all categories; the rank is the weighted
         * blend  RATIO * max + (1 - RATIO) * avg  of the resulting match levels,
         * where RATIO is RankerOptions.MinAndMaxRATIO, cast to int.
         *
         * item - the link whose anchor is ranked.
         *
         * Returns 0 when the link has no anchor, otherwise the blended rank.
         */
        private int getRankOfAnchor(LinkItem item)
        {
            if (item.getAnchor() == null)
            {
                return 0;
            }

            if (LogDebuggerControl.getInstance().debugRanker)
            {
                // Appends to a per-thread log file; "using" releases the handle even
                // if a write throws and closes the file before the categorizer runs.
                using (StreamWriter sw = new StreamWriter(
                           "DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
                {
                    sw.WriteLine(" ***** REQUEST FOR ANCHOR URL RANK************************************ ");
                    sw.WriteLine(" URL : " + item.getLink());
                    sw.WriteLine(" CONTENT OF ANCHOR:");
                    sw.WriteLine(item.getAnchor());
                }
            }

            // The anchor options are created lazily on first use and then reused.
            if (anchorOptions == null)
            {
                anchorOptions = getOptions("anchor");
            }

            // Max and average of the anchor's match levels over all categories.
            List<int> matchLevelsForAnchor = categorizer.classifyContentToAllCategories(item.getAnchor(), anchorOptions);
            int maxMatchLevelForAnchor = calculateMax(matchLevelsForAnchor);
            int avgMatchLevelForAnchor = calculateAvg(matchLevelsForAnchor);

            if (LogDebuggerControl.getInstance().debugRanker)
            {
                using (StreamWriter sw = new StreamWriter(
                           "DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
                {
                    sw.WriteLine(" .MAX MATCH LEVEL OF ANCHOR: ");
                    sw.WriteLine(maxMatchLevelForAnchor);
                    sw.WriteLine(" .AVG MATCH LEVEL OF ANCHOR: ");
                    sw.WriteLine(avgMatchLevelForAnchor);
                }
            }

            return (int)(RankerOptions.MinAndMaxRATIO * maxMatchLevelForAnchor + (1 - RankerOptions.MinAndMaxRATIO) * avgMatchLevelForAnchor);
        }
예제 #14
0
        /**
         * Builds the CategorizerOptions for one kind of ranked text.
         *
         * optionsType - "anchor", "nearby" or "wholeContent"; any other value is
         *               treated like "wholeContent".
         *
         * In automatic operation mode the values are copied from the matching
         * ANC_*, NER_* or CAT_* members of RankerOptions and isRank is set to true.
         * NONZERO_MAX_EFFECT is set to 0 for "anchor" and 40 for "nearby", and is
         * not touched for "wholeContent". In any other mode the object is returned
         * exactly as its constructor left it.
         */
        private CategorizerOptions getOptions(String optionsType)
        {
            CategorizerOptions result = new CategorizerOptions();

            // Nothing to override outside automatic mode: keep the constructor defaults.
            if (WorkDetails.getOperationMode() != operationMode_t.Auto)
            {
                return result;
            }

            switch (optionsType)
            {
                case "anchor":
                    result.ALPHA = RankerOptions.ANC_ALPHA;
                    result.BETA = RankerOptions.ANC_BETA;
                    result.GAMMA = RankerOptions.ANC_GAMMA;
                    result.MIN_WORDS_LIMIT = RankerOptions.ANC_MIN;
                    result.MIN_WORDS_PENLTY = RankerOptions.ANC_PENLTY;
                    result.isRank = true;
                    result.NONZERO_MAX_EFFECT = 0;
                    break;

                case "nearby":
                    result.ALPHA = RankerOptions.NER_ALPHA;
                    result.BETA = RankerOptions.NER_BETA;
                    result.GAMMA = RankerOptions.NER_GAMMA;
                    result.MIN_WORDS_LIMIT = RankerOptions.NER_MIN;
                    result.MIN_WORDS_PENLTY = RankerOptions.NER_PENLTY;
                    result.NONZERO_MAX_EFFECT = 40;
                    result.isRank = true;
                    break;

                // Unrecognized types share the whole-content settings.
                case "wholeContent":
                default:
                    result.ALPHA = RankerOptions.CAT_ALPHA;
                    result.BETA = RankerOptions.CAT_BETA;
                    result.GAMMA = RankerOptions.CAT_GAMMA;
                    result.MIN_WORDS_LIMIT = RankerOptions.CAT_MIN;
                    result.MIN_WORDS_PENLTY = RankerOptions.CAT_PENLTY;
                    result.isRank = true;
                    break;
            }

            return result;
        }
예제 #15
0
        /**
         * This method returns the match level of the given wordlist according to a
         * certain formula.
         */
        public int getMatchLevel(String wordList,CategorizerOptions parameters)
        {
            char[] separators = {' ', '\t', '\n'};
            int numOfWords    = Math.Max(1, wordList.Split(separators).Length);
            int numOfKeywords = Math.Max(1, keywordList.Count);
            int nonZero = 0;
            double sumOfhistogram = 0;
            double threshold = Math.Max(2.0, ((numOfWords * parameters.BETA) / numOfKeywords));

            // keywordList and wordList are copied to a new arrays so that we won't change them(the originals)
            List<String> keywordListCopied = new List<string>(keywordList);
            String wordListCopied = (String)wordList.Clone();

            // Transforming the keywordListCopied and wordListCopied to canonical form
            //keywordListCopied.ForEach(canonicForm);
            canonicForm(ref wordListCopied);
            //wordListCopied = wordListCopied.ToLower();
            int[] histogram = new int[numOfKeywords];
            //Initialising the histogram array to zeros
            for (int i = 0; i < numOfKeywords; i++)
            {
                histogram[i] = 0;
            }

            foreach (String keyword in keywordListCopied)
            {
                Regex objPattern = new Regex(keyword.ToLower());
                int count = objPattern.Matches(wordListCopied,0).Count;

                int index = keywordListCopied.IndexOf(keyword);
                if (count != 0 && histogram[index] == 0)
                        nonZero++;

                if (histogram[index] < threshold)
                {
                    int add = Math.Min(histogram[index] + count, (int)threshold) - histogram[index];
                    histogram[index] = histogram[index] + add;
                    sumOfhistogram = sumOfhistogram + add;
                }
            }

            double nonZeroBonus = (nonZero * parameters.GAMMA) / numOfKeywords;
            nonZeroBonus = Math.Min(parameters.NONZERO_MAX_EFFECT, nonZeroBonus);
            double matchPercent = (sumOfhistogram * parameters.ALPHA) / numOfWords;
            matchPercent = Math.Min(parameters.MATCH_MAX_EFFECT, matchPercent);
            double total = parameters.MAX_MATCH_LEVEL * (nonZeroBonus + matchPercent) / (parameters.MATCH_MAX_EFFECT + parameters.NONZERO_MAX_EFFECT);
            if (numOfWords < parameters.MIN_WORDS_LIMIT)
            {
                total = parameters.MIN_WORDS_PENLTY * total;
            }

            StreamWriter sw = null;

            if (LogDebuggerControl.getInstance().debugCategorization)
            {
                if (!parameters.isRank)
                {
                    sw = new
                        StreamWriter("_DEBUG_INFO_CATEGORIZER@" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true);
                    sw.WriteLine(" ***** DATA FOR REQUEST ************************************************* ");
                    //sw.WriteLine(" .CONTENT WORDS: ");
                    //sw.WriteLine(wordListCopied.ToString());
                    sw.WriteLine(" .NUM OF WORDS: ");
                    sw.WriteLine(numOfWords.ToString());
                    sw.WriteLine(" .KEY WORDS: ");
                    sw.WriteLine(keywordListCopied.ToString());
                    sw.WriteLine(" .NUM OF KEY WORDS: ");
                    sw.WriteLine(numOfKeywords.ToString());
                    sw.WriteLine(" .THRESOLD PARAM: ");
                    sw.WriteLine(threshold.ToString());
                    sw.WriteLine(" .SUM OF HISTOGRAM: ");
                    sw.WriteLine(sumOfhistogram.ToString());
                    sw.WriteLine(" .NONZERO PARAM: ");
                    sw.WriteLine(nonZero.ToString());
                    sw.WriteLine(" .HISTOGRAM DATA:");
                    for (int j = 0; j < numOfKeywords; j++)
                    {
                        sw.WriteLine(" .[" + keywordList[j] + "] -> " + histogram[j].ToString());
                    }
                    sw.WriteLine(" .NON-ZERO BONUS: ");
                    sw.WriteLine(nonZeroBonus.ToString());
                    sw.WriteLine(" .MATCH PERCENT: ");
                    sw.WriteLine(matchPercent.ToString());
                    sw.WriteLine(" .TOTAL TRUST: ");
                    sw.WriteLine(total.ToString());
                    sw.WriteLine(" * END ****************************************************************** ");
                    sw.Close();
                }
            }

            if (LogDebuggerControl.getInstance().debugCategorizationInRanker && LogDebuggerControl.getInstance().debugRanker)
            {
                if (parameters.isRank)
                {
                    sw = new StreamWriter("DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true);
                    sw.WriteLine(" ***** DATA FOR Categorizer ************************************************* ");
                    //sw.WriteLine(" .CONTENT WORDS: ");
                    //sw.WriteLine(wordListCopied.ToString());
                    sw.WriteLine(" .NUM OF WORDS: ");
                    sw.WriteLine(numOfWords.ToString());
                    //String[] wordListSplited = wordListCopied.Split(separators);
                    //for (int k = 0; k < numOfWords;k++ )
                    //{
                    //    sw.WriteLine(" .[" + k + "] -> " + wordListSplited[k]);
                    //}
                    sw.WriteLine(" .KEY WORDS: ");
                    sw.WriteLine(keywordListCopied.ToString());
                    sw.WriteLine(" .NUM OF KEY WORDS: ");
                    sw.WriteLine(numOfKeywords.ToString());
                    sw.WriteLine(" .THRESOLD PARAM: ");
                    sw.WriteLine(threshold.ToString());
                    sw.WriteLine(" .SUM OF HISTOGRAM: ");
                    sw.WriteLine(sumOfhistogram.ToString());
                    sw.WriteLine(" .NONZERO PARAM: ");
                    sw.WriteLine(nonZero.ToString());
                    sw.WriteLine(" .HISTOGRAM DATA:");
                    for (int j = 0; j < numOfKeywords; j++)
                    {
                        sw.WriteLine(" .[" + keywordList[j] + "] -> " + histogram[j].ToString());
                    }
                    sw.WriteLine(" .NON-ZERO BONUS: ");
                    sw.WriteLine(nonZeroBonus.ToString());
                    sw.WriteLine(" .MATCH PERCENT: ");
                    sw.WriteLine(matchPercent.ToString());
                    sw.WriteLine(" .TOTAL TRUST: ");
                    sw.WriteLine(total.ToString());
                    sw.WriteLine(" * END ****************************************************************** ");
                    sw.Close();
                }
            }
            return Convert.ToInt32(total);
        }
예제 #16
0
        /**
         * Builds the CategorizerOptions used by the categorizer.
         *
         * A new CategorizerOptions object is always created. When the operation
         * mode reported by WorkDetails is Auto, its ALPHA, BETA, GAMMA,
         * MIN_WORDS_LIMIT and MIN_WORDS_PENLTY members are overwritten with the
         * matching CAT_* values held by RankerOptions. In any other mode the
         * object is returned exactly as its constructor left it.
         */
        private CategorizerOptions getCategorizerOptions()
        {
            CategorizerOptions result = new CategorizerOptions();

            // Outside Auto mode the constructor's values are used untouched.
            if (WorkDetails.getOperationMode() != operationMode_t.Auto)
            {
                return result;
            }

            result.ALPHA = RankerOptions.CAT_ALPHA;
            result.BETA = RankerOptions.CAT_BETA;
            result.GAMMA = RankerOptions.CAT_GAMMA;
            result.MIN_WORDS_LIMIT = RankerOptions.CAT_MIN;
            result.MIN_WORDS_PENLTY = RankerOptions.CAT_PENLTY;

            return result;
        }
예제 #17
0
        /**
         * Builds the CategorizerOptions for the requested kind of text.
         *
         * optionsType selects which group of task properties is read:
         *   "anchor"   - the ANC_* properties,
         *   "nearby"   - the NER_* properties,
         *   "Category" - the CAT_* properties (also used for any other value).
         *
         * The properties are only consulted when the operation mode reported by
         * WorkDetails is Auto; otherwise the freshly constructed options object
         * is returned unchanged. In Auto mode every stored value is applied only
         * if isRealNum accepts it, so a rejected value leaves the corresponding
         * option as the constructor set it.
         */
        private static CategorizerOptions getOptions(String optionsType)
        {
            CategorizerOptions result = new CategorizerOptions();

            // Nothing is read from storage unless the task runs in Auto mode.
            if (WorkDetails.getOperationMode() != operationMode_t.Auto)
            {
                return result;
            }

            // Names of the task properties that hold each tuning value.
            String alphaKey, betaKey, gammaKey, minKey, penaltyKey;
            switch (optionsType)
            {
                case "anchor":
                    alphaKey   = TaskProperty.ANC_ALPHA.ToString();
                    betaKey    = TaskProperty.ANC_BETA.ToString();
                    gammaKey   = TaskProperty.ANC_GAMMA.ToString();
                    minKey     = TaskProperty.ANC_MIN.ToString();
                    penaltyKey = TaskProperty.ANC_PENLTY.ToString();
                    break;

                case "nearby":
                    alphaKey   = TaskProperty.NER_ALPHA.ToString();
                    betaKey    = TaskProperty.NER_BETA.ToString();
                    gammaKey   = TaskProperty.NER_GAMMA.ToString();
                    minKey     = TaskProperty.NER_MIN.ToString();
                    penaltyKey = TaskProperty.NER_PENLTY.ToString();
                    break;

                // Unrecognised types fall back to the category settings.
                case "Category":
                default:
                    alphaKey   = TaskProperty.CAT_ALPHA.ToString();
                    betaKey    = TaskProperty.CAT_BETA.ToString();
                    gammaKey   = TaskProperty.CAT_GAMMA.ToString();
                    minKey     = TaskProperty.CAT_MIN.ToString();
                    penaltyKey = TaskProperty.CAT_PENLTY.ToString();
                    break;
            }

            // Fetch all raw values first, then convert the ones that look numeric.
            String alphaText   = StorageSystem.StorageSystem.getInstance().getProperty(WorkDetails.getTaskId(), alphaKey);
            String betaText    = StorageSystem.StorageSystem.getInstance().getProperty(WorkDetails.getTaskId(), betaKey);
            String gammaText   = StorageSystem.StorageSystem.getInstance().getProperty(WorkDetails.getTaskId(), gammaKey);
            String minText     = StorageSystem.StorageSystem.getInstance().getProperty(WorkDetails.getTaskId(), minKey);
            String penaltyText = StorageSystem.StorageSystem.getInstance().getProperty(WorkDetails.getTaskId(), penaltyKey);

            if (isRealNum(alphaText))
            {
                result.ALPHA = Convert.ToDouble(alphaText);
            }

            if (isRealNum(betaText))
            {
                result.BETA = Convert.ToDouble(betaText);
            }

            if (isRealNum(gammaText))
            {
                result.GAMMA = Convert.ToDouble(gammaText);
            }

            if (isRealNum(minText))
            {
                result.MIN_WORDS_LIMIT = Convert.ToDouble(minText);
            }

            if (isRealNum(penaltyText))
            {
                result.MIN_WORDS_PENLTY = Convert.ToDouble(penaltyText);
            }

            return result;
        }
예제 #18
0
        /**
         * Returns a rank for the nearby text of an extracted url.
         *
         * The nearby text (item.getText()) is matched against all categories
         * through categorizer.classifyContentToAllCategories, using the "nearby"
         * options. Those options are obtained from getOptions("nearby") the first
         * time they are needed and kept in nearbyOptions afterwards.
         *
         * The returned rank blends the maximum and the average match level:
         *   MinAndMaxRATIO * max + (1 - MinAndMaxRATIO) * avg
         * and is cast to int.
         *
         * When LogDebuggerControl's debugRanker flag is set, the request and the
         * computed max/avg values are appended to the per-thread file
         * "DataForRank<managed thread id>.txt".
         */
        private int getRankOfNearbyText(LinkItem item)
        {
            if (LogDebuggerControl.getInstance().debugRanker)
            {
                // 'using' releases the log file handle even if one of the writes
                // (or item.getLink()/item.getText()) throws.
                using (StreamWriter sw = new StreamWriter("DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
                {
                    sw.WriteLine(" ***** REQUEST FOR NEARBY TEXT RANK************************************ ");
                    sw.WriteLine(" URL : " + item.getLink());
                    sw.WriteLine(" CONTENT OF NEARBY TEXT:");
                    sw.WriteLine(item.getText());
                }
            }

            // The "nearby" options are loaded on first use and reused afterwards.
            if (nearbyOptions == null)
            {
                nearbyOptions = getOptions("nearby");
            }

            // Max and average of the match levels of the nearby text against the categories.
            List<int> matchLevelsForNearby = categorizer.classifyContentToAllCategories(item.getText(), nearbyOptions);
            int maxMatchLevelForNearby = calculateMax(matchLevelsForNearby);
            int avgMatchLevelForNearby = calculateAvg(matchLevelsForNearby);

            if (LogDebuggerControl.getInstance().debugRanker)
            {
                using (StreamWriter sw = new StreamWriter("DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
                {
                    sw.WriteLine(" .MAX MATCH LEVEL OF NEARBY TEXT: ");
                    sw.WriteLine(maxMatchLevelForNearby);
                    sw.WriteLine(" .AVG MATCH LEVEL OF NEARBY TEXT: ");
                    sw.WriteLine(avgMatchLevelForNearby);
                }
            }

            return ((int)(RankerOptions.MinAndMaxRATIO * maxMatchLevelForNearby + (1 - RankerOptions.MinAndMaxRATIO) * avgMatchLevelForNearby));
        }