/**
 * Classifies the content of a resource (e.g. an HTML page) against every
 * category in categoryList.
 *
 * resource - the textual content to classify.
 * url      - the URL the content belongs to; it is stored in every Result
 *            and written to the debug log.
 *
 * Returns one Result for each category whose match level is strictly greater
 * than that category's confidence level. The list is empty when no category
 * qualifies.
 */
public List<Result> classifyContent(String resource, String url)
{
    // The options are created on first use and reused on later calls.
    if (options == null)
    {
        options = getCategorizerOptions();
    }

    List<Result> results = new List<Result>();
    foreach (Category category in categoryList)
    {
        if (LogDebuggerControl.getInstance().debugCategorization)
        {
            // 'using' closes the per-thread debug file even when a write throws;
            // the second constructor argument (true) appends to the file.
            using (StreamWriter sw = new StreamWriter("_DEBUG_INFO_CATEGORIZER@" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
            {
                sw.WriteLine(" ***** HEAD REQUEST ************************************************* ");
                sw.WriteLine(" URL : " + url);
            }
        }

        int matchLevel = category.getMatchLevel(resource, options);
        if (matchLevel > category.getConfidenceLevel())
        {
            results.Add(new Result("0", url, category.getCategoryID(), 0, matchLevel));
        }
    }

    return results;
}
/**
 * Classifies the content of a resource (e.g. an HTML page) against every
 * category in categoryList.
 *
 * resource - the textual content to classify.
 * url      - the URL the content belongs to; it is stored in every Result
 *            and written to the debug log.
 *
 * Returns one Result for each category whose match level is strictly greater
 * than that category's confidence level. The list is empty when no category
 * qualifies.
 */
public List<Result> classifyContent(String resource, String url)
{
    // The options are created on first use and reused on later calls.
    if (options == null)
    {
        options = getCategorizerOptions();
    }

    List<Result> results = new List<Result>();
    foreach (Category category in categoryList)
    {
        if (LogDebuggerControl.getInstance().debugCategorization)
        {
            // 'using' closes the per-thread debug file even when a write throws;
            // the second constructor argument (true) appends to the file.
            using (StreamWriter sw = new StreamWriter("_DEBUG_INFO_CATEGORIZER@" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
            {
                sw.WriteLine(" ***** HEAD REQUEST ************************************************* ");
                sw.WriteLine(" URL : " + url);
            }
        }

        int matchLevel = category.getMatchLevel(resource, options);
        if (matchLevel > category.getConfidenceLevel())
        {
            results.Add(new Result("0", url, category.getCategoryID(), 0, matchLevel));
        }
    }

    return results;
}
/**
 * Computes the match level of the given content against every category.
 *
 * resource - the text (e.g. an HTML page) to evaluate.
 * options  - the categorizer options handed to each category's getMatchLevel.
 *
 * Returns the match levels, one per category, in the iteration order of
 * categoryList.
 */
public List<int> classifyContentToAllCategories(String resource, CategorizerOptions options)
{
    List<int> levels = new List<int>();

    foreach (Category current in categoryList)
    {
        levels.Add(current.getMatchLevel(resource, options));
    }

    return levels;
}
/**
 * Computes the match level of the given content against every category.
 *
 * resource - the text (e.g. an HTML page) to evaluate.
 * options  - the categorizer options handed to each category's getMatchLevel.
 *
 * Returns the match levels, one per category, in the iteration order of
 * categoryList.
 */
public List<int> classifyContentToAllCategories(String resource, CategorizerOptions options)
{
    List<int> levels = new List<int>();

    foreach (Category current in categoryList)
    {
        levels.Add(current.getMatchLevel(resource, options));
    }

    return levels;
}
/**
 * Returns a rank for the anchor text of the given link.
 *
 * The anchor is matched against all categories; the rank is
 *   MinAndMaxRATIO * max + (1 - MinAndMaxRATIO) * avg
 * of the resulting match levels, truncated to an int by the cast.
 *
 * item - the link whose anchor is ranked.
 *
 * Returns 0 when the link has no anchor, otherwise the weighted rank.
 *
 * When LogDebuggerControl's debugRanker flag is set, the inputs and the
 * max/avg values are appended to the per-thread file DataForRank<threadId>.txt.
 */
private int getRankOfAnchor(LinkItem item)
{
    if (item.getAnchor() == null)
    {
        return 0;
    }

    if (LogDebuggerControl.getInstance().debugRanker)
    {
        // 'using' closes the debug file even when a write throws.
        using (StreamWriter sw = new StreamWriter("DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
        {
            sw.WriteLine(" ***** REQUEST FOR ANCHOR URL RANK************************************ ");
            sw.WriteLine(" URL : " + item.getLink());
            sw.WriteLine(" CONTENT OF ANCHOR:");
            sw.WriteLine(item.getAnchor());
        }
    }

    // The anchor options are created on first use and reused afterwards.
    if (anchorOptions == null)
    {
        anchorOptions = getOptions("anchor");
    }

    // Max and average of the anchor's match levels over all categories.
    List<int> matchLevelsForAnchor = categorizer.classifyContentToAllCategories(item.getAnchor(), anchorOptions);
    int maxMatchLevelForAnchor = calculateMax(matchLevelsForAnchor);
    int avgMatchLevelForAnchor = calculateAvg(matchLevelsForAnchor);

    if (LogDebuggerControl.getInstance().debugRanker)
    {
        using (StreamWriter sw = new StreamWriter("DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
        {
            sw.WriteLine(" .MAX MATCH LEVEL OF ANCHOR: ");
            sw.WriteLine(maxMatchLevelForAnchor);
            sw.WriteLine(" .AVG MATCH LEVEL OF ANCHOR: ");
            sw.WriteLine(avgMatchLevelForAnchor);
        }
    }

    return (int)(RankerOptions.MinAndMaxRATIO * maxMatchLevelForAnchor + (1 - RankerOptions.MinAndMaxRATIO) * avgMatchLevelForAnchor);
}
/**
 * Builds the CategorizerOptions used for categorizing content.
 *
 * In Auto operation mode ALPHA, BETA, GAMMA, MIN_WORDS_LIMIT and
 * MIN_WORDS_PENLTY are copied from the RankerOptions CAT_* values.
 * In any other mode the object is returned exactly as its constructor
 * built it.
 */
private CategorizerOptions getCategorizerOptions()
{
    CategorizerOptions result = new CategorizerOptions();

    bool autoMode = WorkDetails.getOperationMode() == operationMode_t.Auto;
    if (!autoMode)
    {
        return result;
    }

    result.ALPHA = RankerOptions.CAT_ALPHA;
    result.BETA = RankerOptions.CAT_BETA;
    result.GAMMA = RankerOptions.CAT_GAMMA;
    result.MIN_WORDS_LIMIT = RankerOptions.CAT_MIN;
    result.MIN_WORDS_PENLTY = RankerOptions.CAT_PENLTY;
    return result;
}
/**
 * Returns a rank for the text that surrounds the given link.
 *
 * The nearby text is matched against all categories; the rank is
 *   MinAndMaxRATIO * max + (1 - MinAndMaxRATIO) * avg
 * of the resulting match levels, truncated to an int by the cast.
 *
 * item - the link whose nearby text (item.getText()) is ranked.
 *
 * When LogDebuggerControl's debugRanker flag is set, the inputs and the
 * max/avg values are appended to the per-thread file DataForRank<threadId>.txt.
 */
private int getRankOfNearbyText(LinkItem item)
{
    if (LogDebuggerControl.getInstance().debugRanker)
    {
        // 'using' closes the debug file even when a write throws.
        using (StreamWriter sw = new StreamWriter("DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
        {
            sw.WriteLine(" ***** REQUEST FOR NEARBY TEXT RANK************************************ ");
            sw.WriteLine(" URL : " + item.getLink());
            sw.WriteLine(" CONTENT OF NEARBY TEXT:");
            sw.WriteLine(item.getText());
        }
    }

    // The nearby-text options are created on first use and reused afterwards.
    if (nearbyOptions == null)
    {
        nearbyOptions = getOptions("nearby");
    }

    // Max and average of the nearby text's match levels over all categories.
    List<int> matchLevelsForNearby = categorizer.classifyContentToAllCategories(item.getText(), nearbyOptions);
    int maxMatchLevelForNearby = calculateMax(matchLevelsForNearby);
    int avgMatchLevelForNearby = calculateAvg(matchLevelsForNearby);

    if (LogDebuggerControl.getInstance().debugRanker)
    {
        using (StreamWriter sw = new StreamWriter("DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
        {
            sw.WriteLine(" .MAX MATCH LEVEL OF NEARBY TEXT: ");
            sw.WriteLine(maxMatchLevelForNearby);
            sw.WriteLine(" .AVG MATCH LEVEL OF NEARBY TEXT: ");
            sw.WriteLine(avgMatchLevelForNearby);
        }
    }

    return (int)(RankerOptions.MinAndMaxRATIO * maxMatchLevelForNearby + (1 - RankerOptions.MinAndMaxRATIO) * avgMatchLevelForNearby);
}
/**
 * Builds the CategorizerOptions used when ranking a given kind of text.
 *
 * optionsType - "anchor", "nearby" or "wholeContent"; any other value
 *               (including null) is treated like "wholeContent".
 *
 * In Auto operation mode the fields are copied from the matching
 * RankerOptions values (ANC_*, NER_* or CAT_*) and isRank is set to true;
 * "anchor" and "nearby" additionally set NONZERO_MAX_EFFECT (0 and 40).
 * In any other mode the object is returned exactly as its constructor
 * built it.
 */
private CategorizerOptions getOptions(String optionsType)
{
    CategorizerOptions result = new CategorizerOptions();

    bool autoMode = WorkDetails.getOperationMode() == operationMode_t.Auto;
    if (!autoMode)
    {
        return result;
    }

    if (optionsType == "anchor")
    {
        result.ALPHA = RankerOptions.ANC_ALPHA;
        result.BETA = RankerOptions.ANC_BETA;
        result.GAMMA = RankerOptions.ANC_GAMMA;
        result.MIN_WORDS_LIMIT = RankerOptions.ANC_MIN;
        result.MIN_WORDS_PENLTY = RankerOptions.ANC_PENLTY;
        result.isRank = true;
        result.NONZERO_MAX_EFFECT = 0;
    }
    else if (optionsType == "nearby")
    {
        result.ALPHA = RankerOptions.NER_ALPHA;
        result.BETA = RankerOptions.NER_BETA;
        result.GAMMA = RankerOptions.NER_GAMMA;
        result.MIN_WORDS_LIMIT = RankerOptions.NER_MIN;
        result.MIN_WORDS_PENLTY = RankerOptions.NER_PENLTY;
        result.NONZERO_MAX_EFFECT = 40;
        result.isRank = true;
    }
    else
    {
        // "wholeContent" and every unrecognised type share the CAT_* values.
        result.ALPHA = RankerOptions.CAT_ALPHA;
        result.BETA = RankerOptions.CAT_BETA;
        result.GAMMA = RankerOptions.CAT_GAMMA;
        result.MIN_WORDS_LIMIT = RankerOptions.CAT_MIN;
        result.MIN_WORDS_PENLTY = RankerOptions.CAT_PENLTY;
        result.isRank = true;
    }

    return result;
}
/**
 * Returns a rank for the whole content of the given resource.
 *
 * The content is matched against all categories; the rank is
 *   MinAndMaxRATIO * max + (1 - MinAndMaxRATIO) * avg
 * of the resulting match levels, truncated to an int by the cast.
 *
 * resource - the resource whose content (getResourceContent()) is ranked.
 *
 * When LogDebuggerControl's debugRanker flag is set, a header and the
 * max/avg values are appended to the per-thread file DataForRank<threadId>.txt.
 */
private int getRankOfWholeContent(ResourceContent resource)
{
    if (LogDebuggerControl.getInstance().debugRanker)
    {
        // 'using' closes the debug file even when a write throws.
        using (StreamWriter sw = new StreamWriter("DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
        {
            sw.WriteLine(" ***** REQUEST FOR WHOLE CONTENT RANK******************************** ");
        }
    }

    // The whole-content options are created on first use and reused afterwards.
    if (wholeContentOptions == null)
    {
        wholeContentOptions = getOptions("wholeContent");
    }

    // Max and average of the content's match levels over all categories.
    // Substring(0) passes the complete content string.
    List<int> matchLevelsForContent = categorizer.classifyContentToAllCategories(resource.getResourceContent().Substring(0), wholeContentOptions);
    int maxMatchLevelForContent = calculateMax(matchLevelsForContent);
    int avgMatchLevelForContent = calculateAvg(matchLevelsForContent);

    if (LogDebuggerControl.getInstance().debugRanker)
    {
        using (StreamWriter sw = new StreamWriter("DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
        {
            sw.WriteLine(" .MAX MATCH LEVEL OF WHOLE CONTENT: ");
            sw.WriteLine(maxMatchLevelForContent);
            sw.WriteLine(" .AVG MATCH LEVEL OF WHOLE CONTENT: ");
            sw.WriteLine(avgMatchLevelForContent);
        }
    }

    return (int)(RankerOptions.MinAndMaxRATIO * maxMatchLevelForContent + (1 - RankerOptions.MinAndMaxRATIO) * avgMatchLevelForContent);
}
/**
 * Sets all the constants needed for the task: the number of threads, the
 * anchor / nearby / category values of RankerOptions, and the ranker options.
 */
public static void SetAllConstants()
{
    // Number of worker threads.
    SetNumberOfThreads();

    // Anchor values.
    CategorizerOptions anchor = getOptions("anchor");
    RankerOptions.ANC_ALPHA = anchor.ALPHA;
    RankerOptions.ANC_BETA = anchor.BETA;
    RankerOptions.ANC_GAMMA = anchor.GAMMA;
    RankerOptions.ANC_MIN = anchor.MIN_WORDS_LIMIT;
    RankerOptions.ANC_PENLTY = anchor.MIN_WORDS_PENLTY;

    // Nearby-text values.
    CategorizerOptions nearby = getOptions("nearby");
    RankerOptions.NER_ALPHA = nearby.ALPHA;
    RankerOptions.NER_BETA = nearby.BETA;
    RankerOptions.NER_GAMMA = nearby.GAMMA;
    RankerOptions.NER_MIN = nearby.MIN_WORDS_LIMIT;
    RankerOptions.NER_PENLTY = nearby.MIN_WORDS_PENLTY;

    // Category values.
    CategorizerOptions category = getOptions("Category");
    RankerOptions.CAT_ALPHA = category.ALPHA;
    RankerOptions.CAT_BETA = category.BETA;
    RankerOptions.CAT_GAMMA = category.GAMMA;
    RankerOptions.CAT_MIN = category.MIN_WORDS_LIMIT;
    RankerOptions.CAT_PENLTY = category.MIN_WORDS_PENLTY;

    // Ranker options.
    getRankerOptions();

    // The symmetric-line property is read here but not used yet.
    String symmetricLine = StorageSystem.StorageSystem.getInstance().getProperty(WorkDetails.getTaskId(), TaskProperty.SYMMETRIC_LINE.ToString());
    // TODO: continue the assignment of the symmetric line.
}
/**
 * Builds the CategorizerOptions of the given type from the task's stored
 * properties.
 *
 * optionsType - "anchor", "nearby" or "Category"; any other value
 *               (including null) is treated like "Category".
 *
 * In Auto operation mode the five matching task properties (alpha, beta,
 * gamma, minimum words and penalty) are read from the storage system, and
 * each one that isRealNum accepts is converted to a double and stored in the
 * options; the others keep the constructor's value. In any other mode the
 * object is returned exactly as its constructor built it.
 */
private static CategorizerOptions getOptions(String optionsType)
{
    CategorizerOptions result = new CategorizerOptions();

    bool autoMode = WorkDetails.getOperationMode() == operationMode_t.Auto;
    if (!autoMode)
    {
        return result;
    }

    // Names of the task properties to look up for the requested type.
    String alphaKey;
    String betaKey;
    String gammaKey;
    String minKey;
    String penaltyKey;

    switch (optionsType)
    {
        case "anchor":
            alphaKey = TaskProperty.ANC_ALPHA.ToString();
            betaKey = TaskProperty.ANC_BETA.ToString();
            gammaKey = TaskProperty.ANC_GAMMA.ToString();
            minKey = TaskProperty.ANC_MIN.ToString();
            penaltyKey = TaskProperty.ANC_PENLTY.ToString();
            break;

        case "nearby":
            alphaKey = TaskProperty.NER_ALPHA.ToString();
            betaKey = TaskProperty.NER_BETA.ToString();
            gammaKey = TaskProperty.NER_GAMMA.ToString();
            minKey = TaskProperty.NER_MIN.ToString();
            penaltyKey = TaskProperty.NER_PENLTY.ToString();
            break;

        case "Category":
        default:
            alphaKey = TaskProperty.CAT_ALPHA.ToString();
            betaKey = TaskProperty.CAT_BETA.ToString();
            gammaKey = TaskProperty.CAT_GAMMA.ToString();
            minKey = TaskProperty.CAT_MIN.ToString();
            penaltyKey = TaskProperty.CAT_PENLTY.ToString();
            break;
    }

    String alphaText = StorageSystem.StorageSystem.getInstance().getProperty(WorkDetails.getTaskId(), alphaKey);
    String betaText = StorageSystem.StorageSystem.getInstance().getProperty(WorkDetails.getTaskId(), betaKey);
    String gammaText = StorageSystem.StorageSystem.getInstance().getProperty(WorkDetails.getTaskId(), gammaKey);
    String minText = StorageSystem.StorageSystem.getInstance().getProperty(WorkDetails.getTaskId(), minKey);
    String penaltyText = StorageSystem.StorageSystem.getInstance().getProperty(WorkDetails.getTaskId(), penaltyKey);

    // Only values that isRealNum accepts replace the constructor's values.
    if (isRealNum(alphaText))
    {
        result.ALPHA = Convert.ToDouble(alphaText);
    }
    if (isRealNum(betaText))
    {
        result.BETA = Convert.ToDouble(betaText);
    }
    if (isRealNum(gammaText))
    {
        result.GAMMA = Convert.ToDouble(gammaText);
    }
    if (isRealNum(minText))
    {
        result.MIN_WORDS_LIMIT = Convert.ToDouble(minText);
    }
    if (isRealNum(penaltyText))
    {
        result.MIN_WORDS_PENLTY = Convert.ToDouble(penaltyText);
    }

    return result;
}
/**
 * Returns a rank for the whole content of the given resource.
 *
 * The content is matched against all categories; the rank is
 *   MinAndMaxRATIO * max + (1 - MinAndMaxRATIO) * avg
 * of the resulting match levels, truncated to an int by the cast.
 *
 * resource - the resource whose content (getResourceContent()) is ranked.
 *
 * When LogDebuggerControl's debugRanker flag is set, a header and the
 * max/avg values are appended to the per-thread file DataForRank<threadId>.txt.
 */
private int getRankOfWholeContent(ResourceContent resource)
{
    if (LogDebuggerControl.getInstance().debugRanker)
    {
        // 'using' closes the debug file even when a write throws.
        using (StreamWriter sw = new StreamWriter("DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
        {
            sw.WriteLine(" ***** REQUEST FOR WHOLE CONTENT RANK******************************** ");
        }
    }

    // The whole-content options are created on first use and reused afterwards.
    if (wholeContentOptions == null)
    {
        wholeContentOptions = getOptions("wholeContent");
    }

    // Max and average of the content's match levels over all categories.
    // Substring(0) passes the complete content string.
    List<int> matchLevelsForContent = categorizer.classifyContentToAllCategories(resource.getResourceContent().Substring(0), wholeContentOptions);
    int maxMatchLevelForContent = calculateMax(matchLevelsForContent);
    int avgMatchLevelForContent = calculateAvg(matchLevelsForContent);

    if (LogDebuggerControl.getInstance().debugRanker)
    {
        using (StreamWriter sw = new StreamWriter("DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
        {
            sw.WriteLine(" .MAX MATCH LEVEL OF WHOLE CONTENT: ");
            sw.WriteLine(maxMatchLevelForContent);
            sw.WriteLine(" .AVG MATCH LEVEL OF WHOLE CONTENT: ");
            sw.WriteLine(avgMatchLevelForContent);
        }
    }

    return (int)(RankerOptions.MinAndMaxRATIO * maxMatchLevelForContent + (1 - RankerOptions.MinAndMaxRATIO) * avgMatchLevelForContent);
}
/**
 * Returns a rank for the anchor text of the given link.
 *
 * The anchor is matched against all categories; the rank is
 *   MinAndMaxRATIO * max + (1 - MinAndMaxRATIO) * avg
 * of the resulting match levels, truncated to an int by the cast.
 *
 * item - the link whose anchor is ranked.
 *
 * Returns 0 when the link has no anchor, otherwise the weighted rank.
 *
 * When LogDebuggerControl's debugRanker flag is set, the inputs and the
 * max/avg values are appended to the per-thread file DataForRank<threadId>.txt.
 */
private int getRankOfAnchor(LinkItem item)
{
    if (item.getAnchor() == null)
    {
        return 0;
    }

    if (LogDebuggerControl.getInstance().debugRanker)
    {
        // 'using' closes the debug file even when a write throws.
        using (StreamWriter sw = new StreamWriter("DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
        {
            sw.WriteLine(" ***** REQUEST FOR ANCHOR URL RANK************************************ ");
            sw.WriteLine(" URL : " + item.getLink());
            sw.WriteLine(" CONTENT OF ANCHOR:");
            sw.WriteLine(item.getAnchor());
        }
    }

    // The anchor options are created on first use and reused afterwards.
    if (anchorOptions == null)
    {
        anchorOptions = getOptions("anchor");
    }

    // Max and average of the anchor's match levels over all categories.
    List<int> matchLevelsForAnchor = categorizer.classifyContentToAllCategories(item.getAnchor(), anchorOptions);
    int maxMatchLevelForAnchor = calculateMax(matchLevelsForAnchor);
    int avgMatchLevelForAnchor = calculateAvg(matchLevelsForAnchor);

    if (LogDebuggerControl.getInstance().debugRanker)
    {
        using (StreamWriter sw = new StreamWriter("DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
        {
            sw.WriteLine(" .MAX MATCH LEVEL OF ANCHOR: ");
            sw.WriteLine(maxMatchLevelForAnchor);
            sw.WriteLine(" .AVG MATCH LEVEL OF ANCHOR: ");
            sw.WriteLine(avgMatchLevelForAnchor);
        }
    }

    return (int)(RankerOptions.MinAndMaxRATIO * maxMatchLevelForAnchor + (1 - RankerOptions.MinAndMaxRATIO) * avgMatchLevelForAnchor);
}
/**
 * Builds the CategorizerOptions used when ranking a given kind of text.
 *
 * optionsType - "anchor", "nearby" or "wholeContent"; any other value
 *               (including null) is treated like "wholeContent".
 *
 * In Auto operation mode the fields are copied from the matching
 * RankerOptions values (ANC_*, NER_* or CAT_*) and isRank is set to true;
 * "anchor" and "nearby" additionally set NONZERO_MAX_EFFECT (0 and 40).
 * In any other mode the object is returned exactly as its constructor
 * built it.
 */
private CategorizerOptions getOptions(String optionsType)
{
    CategorizerOptions result = new CategorizerOptions();

    bool autoMode = WorkDetails.getOperationMode() == operationMode_t.Auto;
    if (!autoMode)
    {
        return result;
    }

    if (optionsType == "anchor")
    {
        result.ALPHA = RankerOptions.ANC_ALPHA;
        result.BETA = RankerOptions.ANC_BETA;
        result.GAMMA = RankerOptions.ANC_GAMMA;
        result.MIN_WORDS_LIMIT = RankerOptions.ANC_MIN;
        result.MIN_WORDS_PENLTY = RankerOptions.ANC_PENLTY;
        result.isRank = true;
        result.NONZERO_MAX_EFFECT = 0;
    }
    else if (optionsType == "nearby")
    {
        result.ALPHA = RankerOptions.NER_ALPHA;
        result.BETA = RankerOptions.NER_BETA;
        result.GAMMA = RankerOptions.NER_GAMMA;
        result.MIN_WORDS_LIMIT = RankerOptions.NER_MIN;
        result.MIN_WORDS_PENLTY = RankerOptions.NER_PENLTY;
        result.NONZERO_MAX_EFFECT = 40;
        result.isRank = true;
    }
    else
    {
        // "wholeContent" and every unrecognised type share the CAT_* values.
        result.ALPHA = RankerOptions.CAT_ALPHA;
        result.BETA = RankerOptions.CAT_BETA;
        result.GAMMA = RankerOptions.CAT_GAMMA;
        result.MIN_WORDS_LIMIT = RankerOptions.CAT_MIN;
        result.MIN_WORDS_PENLTY = RankerOptions.CAT_PENLTY;
        result.isRank = true;
    }

    return result;
}
/**
 * Returns the match level of the given text against this category's keywords.
 *
 * For every keyword the number of regex matches in the canonical form of the
 * text is counted and accumulated in a histogram, each slot capped at
 *   threshold = max(2, numOfWords * BETA / numOfKeywords).
 * The result combines
 *   - a non-zero bonus: (matched keywords * GAMMA / numOfKeywords), capped at
 *     NONZERO_MAX_EFFECT, and
 *   - a match percent: (histogram sum * ALPHA / numOfWords), capped at
 *     MATCH_MAX_EFFECT,
 * scaled to MAX_MATCH_LEVEL. Texts with fewer than MIN_WORDS_LIMIT words are
 * multiplied by MIN_WORDS_PENLTY.
 *
 * wordList   - the text to evaluate; it is split on space, tab and newline
 *              to count words.
 * parameters - the weights and limits of the formula; its isRank flag also
 *              selects which debug file is written.
 *
 * Returns the total converted with Convert.ToInt32 (rounds to nearest).
 *
 * Debug output is appended to a per-thread file when the matching
 * LogDebuggerControl flags are set.
 */
public int getMatchLevel(String wordList, CategorizerOptions parameters)
{
    char[] separators = { ' ', '\t', '\n' };

    // Both counts are at least 1 so they can safely be used as divisors.
    int numOfWords = Math.Max(1, wordList.Split(separators).Length);
    int numOfKeywords = Math.Max(1, keywordList.Count);
    int nonZero = 0;
    double sumOfhistogram = 0;
    double threshold = Math.Max(2.0, ((numOfWords * parameters.BETA) / numOfKeywords));

    // Work on a copy of the keyword list and on a local reference to the text,
    // so canonicForm (which takes the text by ref) never rebinds the caller's variable.
    List<String> keywordListCopied = new List<String>(keywordList);
    String wordListCopied = wordList;
    canonicForm(ref wordListCopied);

    // A new int array is already zero-filled.
    int[] histogram = new int[numOfKeywords];

    foreach (String keyword in keywordListCopied)
    {
        // The lower-cased keyword is used verbatim as a regular expression pattern.
        // NOTE(review): keywords containing regex metacharacters are therefore
        // interpreted as patterns - confirm this is intended.
        Regex objPattern = new Regex(keyword.ToLower());
        int count = objPattern.Matches(wordListCopied, 0).Count;

        // IndexOf returns the first occurrence, so duplicate keywords share one slot.
        int index = keywordListCopied.IndexOf(keyword);

        if (count != 0 && histogram[index] == 0)
        {
            nonZero++;
        }

        if (histogram[index] < threshold)
        {
            // Add the matches, but never let the slot exceed the threshold.
            int add = Math.Min(histogram[index] + count, (int)threshold) - histogram[index];
            histogram[index] = histogram[index] + add;
            sumOfhistogram = sumOfhistogram + add;
        }
    }

    double nonZeroBonus = (nonZero * parameters.GAMMA) / numOfKeywords;
    nonZeroBonus = Math.Min(parameters.NONZERO_MAX_EFFECT, nonZeroBonus);

    double matchPercent = (sumOfhistogram * parameters.ALPHA) / numOfWords;
    matchPercent = Math.Min(parameters.MATCH_MAX_EFFECT, matchPercent);

    double total = parameters.MAX_MATCH_LEVEL * (nonZeroBonus + matchPercent) / (parameters.MATCH_MAX_EFFECT + parameters.NONZERO_MAX_EFFECT);
    if (numOfWords < parameters.MIN_WORDS_LIMIT)
    {
        total = parameters.MIN_WORDS_PENLTY * total;
    }

    // Pick the debug file and header: the categorizer log for plain
    // categorization, the ranker log when called on behalf of the ranker.
    String debugFile = null;
    String debugHeader = null;
    if (LogDebuggerControl.getInstance().debugCategorization && !parameters.isRank)
    {
        debugFile = "_DEBUG_INFO_CATEGORIZER@" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt";
        debugHeader = " ***** DATA FOR REQUEST ************************************************* ";
    }
    else if (LogDebuggerControl.getInstance().debugCategorizationInRanker && LogDebuggerControl.getInstance().debugRanker && parameters.isRank)
    {
        debugFile = "DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt";
        debugHeader = " ***** DATA FOR Categorizer ************************************************* ";
    }

    if (debugFile != null)
    {
        // 'using' closes the debug file even when a write throws.
        using (StreamWriter sw = new StreamWriter(debugFile, true))
        {
            sw.WriteLine(debugHeader);
            sw.WriteLine(" .NUM OF WORDS: ");
            sw.WriteLine(numOfWords.ToString());
            sw.WriteLine(" .KEY WORDS: ");
            // Print the keywords themselves; List.ToString() would only print the type name.
            sw.WriteLine(String.Join(", ", keywordListCopied.ToArray()));
            sw.WriteLine(" .NUM OF KEY WORDS: ");
            sw.WriteLine(numOfKeywords.ToString());
            sw.WriteLine(" .THRESOLD PARAM: ");
            sw.WriteLine(threshold.ToString());
            sw.WriteLine(" .SUM OF HISTOGRAM: ");
            sw.WriteLine(sumOfhistogram.ToString());
            sw.WriteLine(" .NONZERO PARAM: ");
            sw.WriteLine(nonZero.ToString());
            sw.WriteLine(" .HISTOGRAM DATA:");
            // Iterate over the real keyword count: numOfKeywords is clamped to 1,
            // which would index past the end of an empty keyword list.
            for (int j = 0; j < keywordListCopied.Count; j++)
            {
                sw.WriteLine(" .[" + keywordListCopied[j] + "] -> " + histogram[j].ToString());
            }
            sw.WriteLine(" .NON-ZERO BONUS: ");
            sw.WriteLine(nonZeroBonus.ToString());
            sw.WriteLine(" .MATCH PERCENT: ");
            sw.WriteLine(matchPercent.ToString());
            sw.WriteLine(" .TOTAL TRUST: ");
            sw.WriteLine(total.ToString());
            sw.WriteLine(" * END ****************************************************************** ");
        }
    }

    return Convert.ToInt32(total);
}
/**
 * Builds the CategorizerOptions used for categorizing content.
 *
 * In Auto operation mode ALPHA, BETA, GAMMA, MIN_WORDS_LIMIT and
 * MIN_WORDS_PENLTY are copied from the RankerOptions CAT_* values.
 * In any other mode the object is returned exactly as its constructor
 * built it.
 */
private CategorizerOptions getCategorizerOptions()
{
    CategorizerOptions result = new CategorizerOptions();

    bool autoMode = WorkDetails.getOperationMode() == operationMode_t.Auto;
    if (!autoMode)
    {
        return result;
    }

    result.ALPHA = RankerOptions.CAT_ALPHA;
    result.BETA = RankerOptions.CAT_BETA;
    result.GAMMA = RankerOptions.CAT_GAMMA;
    result.MIN_WORDS_LIMIT = RankerOptions.CAT_MIN;
    result.MIN_WORDS_PENLTY = RankerOptions.CAT_PENLTY;
    return result;
}
/**
 * Builds the CategorizerOptions of the given type from the task's stored
 * properties.
 *
 * optionsType - "anchor", "nearby" or "Category"; any other value
 *               (including null) is treated like "Category".
 *
 * In Auto operation mode the five matching task properties (alpha, beta,
 * gamma, minimum words and penalty) are read from the storage system, and
 * each one that isRealNum accepts is converted to a double and stored in the
 * options; the others keep the constructor's value. In any other mode the
 * object is returned exactly as its constructor built it.
 */
private static CategorizerOptions getOptions(String optionsType)
{
    CategorizerOptions result = new CategorizerOptions();

    bool autoMode = WorkDetails.getOperationMode() == operationMode_t.Auto;
    if (!autoMode)
    {
        return result;
    }

    // Names of the task properties to look up for the requested type.
    String alphaKey;
    String betaKey;
    String gammaKey;
    String minKey;
    String penaltyKey;

    switch (optionsType)
    {
        case "anchor":
            alphaKey = TaskProperty.ANC_ALPHA.ToString();
            betaKey = TaskProperty.ANC_BETA.ToString();
            gammaKey = TaskProperty.ANC_GAMMA.ToString();
            minKey = TaskProperty.ANC_MIN.ToString();
            penaltyKey = TaskProperty.ANC_PENLTY.ToString();
            break;

        case "nearby":
            alphaKey = TaskProperty.NER_ALPHA.ToString();
            betaKey = TaskProperty.NER_BETA.ToString();
            gammaKey = TaskProperty.NER_GAMMA.ToString();
            minKey = TaskProperty.NER_MIN.ToString();
            penaltyKey = TaskProperty.NER_PENLTY.ToString();
            break;

        case "Category":
        default:
            alphaKey = TaskProperty.CAT_ALPHA.ToString();
            betaKey = TaskProperty.CAT_BETA.ToString();
            gammaKey = TaskProperty.CAT_GAMMA.ToString();
            minKey = TaskProperty.CAT_MIN.ToString();
            penaltyKey = TaskProperty.CAT_PENLTY.ToString();
            break;
    }

    String alphaText = StorageSystem.StorageSystem.getInstance().getProperty(WorkDetails.getTaskId(), alphaKey);
    String betaText = StorageSystem.StorageSystem.getInstance().getProperty(WorkDetails.getTaskId(), betaKey);
    String gammaText = StorageSystem.StorageSystem.getInstance().getProperty(WorkDetails.getTaskId(), gammaKey);
    String minText = StorageSystem.StorageSystem.getInstance().getProperty(WorkDetails.getTaskId(), minKey);
    String penaltyText = StorageSystem.StorageSystem.getInstance().getProperty(WorkDetails.getTaskId(), penaltyKey);

    // Only values that isRealNum accepts replace the constructor's values.
    if (isRealNum(alphaText))
    {
        result.ALPHA = Convert.ToDouble(alphaText);
    }
    if (isRealNum(betaText))
    {
        result.BETA = Convert.ToDouble(betaText);
    }
    if (isRealNum(gammaText))
    {
        result.GAMMA = Convert.ToDouble(gammaText);
    }
    if (isRealNum(minText))
    {
        result.MIN_WORDS_LIMIT = Convert.ToDouble(minText);
    }
    if (isRealNum(penaltyText))
    {
        result.MIN_WORDS_PENLTY = Convert.ToDouble(penaltyText);
    }

    return result;
}
/**
 * Returns a rank for the text that surrounds the given link.
 *
 * The nearby text is matched against all categories; the rank is
 *   MinAndMaxRATIO * max + (1 - MinAndMaxRATIO) * avg
 * of the resulting match levels, truncated to an int by the cast.
 *
 * item - the link whose nearby text (item.getText()) is ranked.
 *
 * When LogDebuggerControl's debugRanker flag is set, the inputs and the
 * max/avg values are appended to the per-thread file DataForRank<threadId>.txt.
 */
private int getRankOfNearbyText(LinkItem item)
{
    if (LogDebuggerControl.getInstance().debugRanker)
    {
        // 'using' closes the debug file even when a write throws.
        using (StreamWriter sw = new StreamWriter("DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
        {
            sw.WriteLine(" ***** REQUEST FOR NEARBY TEXT RANK************************************ ");
            sw.WriteLine(" URL : " + item.getLink());
            sw.WriteLine(" CONTENT OF NEARBY TEXT:");
            sw.WriteLine(item.getText());
        }
    }

    // The nearby-text options are created on first use and reused afterwards.
    if (nearbyOptions == null)
    {
        nearbyOptions = getOptions("nearby");
    }

    // Max and average of the nearby text's match levels over all categories.
    List<int> matchLevelsForNearby = categorizer.classifyContentToAllCategories(item.getText(), nearbyOptions);
    int maxMatchLevelForNearby = calculateMax(matchLevelsForNearby);
    int avgMatchLevelForNearby = calculateAvg(matchLevelsForNearby);

    if (LogDebuggerControl.getInstance().debugRanker)
    {
        using (StreamWriter sw = new StreamWriter("DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
        {
            sw.WriteLine(" .MAX MATCH LEVEL OF NEARBY TEXT: ");
            sw.WriteLine(maxMatchLevelForNearby);
            sw.WriteLine(" .AVG MATCH LEVEL OF NEARBY TEXT: ");
            sw.WriteLine(avgMatchLevelForNearby);
        }
    }

    return (int)(RankerOptions.MinAndMaxRATIO * maxMatchLevelForNearby + (1 - RankerOptions.MinAndMaxRATIO) * avgMatchLevelForNearby);
}