/**
 * Returns the link items found in the given page.
 *
 * Every anchor element matched in the page whose tag text contains LINK_ATTR
 * becomes a LinkItem carrying the parent url, the raw tag text and the
 * character offset of the tag inside the page. The resulting list is then
 * passed through getText, getLinks and getAnchors (in that order) before it
 * is returned.
 *
 * url  - address of the page; stored as the parent of each extracted item.
 * page - raw page content; when null an empty list is returned.
 */
public List<LinkItem> extractLinks(String url, String page)
{
    List<LinkItem> list = new List<LinkItem>();
    if (page == null)
    {
        return(list);
    }

    // 1. Find all <a ...>...</a> elements. Singleline lets '.' cross line
    //    breaks so tags that span several lines are still matched.
    MatchCollection anchorTags = Regex.Matches(page, @"(<[aA][ \t\n].*?>.*?</[aA]>)", RegexOptions.Singleline);

    foreach (Match match in anchorTags)
    {
        string tagValue = match.Groups[1].Value;

        // Tags without the link attribute are never returned, so do not
        // build an item for them at all.
        if (!tagValue.Contains(LINK_ATTR))
        {
            continue;
        }

        LinkItem item = new LinkItem();
        item.setParent(url);
        item.setTag(tagValue);
        item.setIndex(match.Index);
        list.Add(item);
    }

    // 2. Fill in the text near each link.
    getText(list, page);

    // 3. Fill in the link of each tag.
    getLinks(list);

    // 4. Fill in the anchor of each link.
    getAnchors(list);

    return(list);
}
/**
 * Returns a rank for the anchor of the given link.
 *
 * The anchor is classified against all categories using the "anchor"
 * options (loaded on first use and kept in anchorOptions). The rank is
 *   MinAndMaxRATIO * max + (1 - MinAndMaxRATIO) * avg
 * of the resulting match levels, truncated to int.
 *
 * Returns 0 when the item has no anchor. When the ranker debug flag is on,
 * the request and the computed levels are appended to a per-thread
 * DataForRank file.
 */
private int getRankOfAnchor(LinkItem item)
{
    string anchor = item.getAnchor();
    if (anchor == null)
    {
        return(0);
    }

    // One log file per managed thread, opened in append mode.
    string logPath = "DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt";

    if (LogDebuggerControl.getInstance().debugRanker)
    {
        // 'using' guarantees the writer is closed even if a write throws.
        using (StreamWriter sw = new StreamWriter(logPath, true))
        {
            sw.WriteLine(" ***** REQUEST FOR ANCHOR URL RANK************************************ ");
            sw.WriteLine(" URL : " + item.getLink());
            sw.WriteLine(" CONTENT OF ANCHOR:");
            sw.WriteLine(anchor);
        }
    }

    // Load the classifier options for anchors only once.
    if (anchorOptions == null)
    {
        anchorOptions = getOptions("anchor");
    }

    // Max and avg of the match levels of the anchor against the categories.
    List<int> matchLevelsForAnchor = categorizer.classifyContentToAllCategories(anchor, anchorOptions);
    int maxMatchLevelForAnchor = calculateMax(matchLevelsForAnchor);
    int avgMatchLevelForAnchor = calculateAvg(matchLevelsForAnchor);

    if (LogDebuggerControl.getInstance().debugRanker)
    {
        using (StreamWriter sw = new StreamWriter(logPath, true))
        {
            sw.WriteLine(" .MAX MATCH LEVEL OF ANCHOR: ");
            sw.WriteLine(maxMatchLevelForAnchor);
            sw.WriteLine(" .AVG MATCH LEVEL OF ANCHOR: ");
            sw.WriteLine(avgMatchLevelForAnchor);
        }
    }

    return((int)(RankerOptions.MinAndMaxRATIO * maxMatchLevelForAnchor + (1 - RankerOptions.MinAndMaxRATIO) * avgMatchLevelForAnchor));
}
/**
 * Returns a rank for the nearby text of the given link.
 *
 * The nearby text is classified against all categories using the "nearby"
 * options (loaded on first use and kept in nearbyOptions). The rank is
 *   MinAndMaxRATIO * max + (1 - MinAndMaxRATIO) * avg
 * of the resulting match levels, truncated to int.
 *
 * When the ranker debug flag is on, the request and the computed levels are
 * appended to a per-thread DataForRank file.
 */
private int getRankOfNearbyText(LinkItem item)
{
    // One log file per managed thread, opened in append mode.
    string logPath = "DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt";

    if (LogDebuggerControl.getInstance().debugRanker)
    {
        // 'using' guarantees the writer is closed even if a write throws.
        using (StreamWriter sw = new StreamWriter(logPath, true))
        {
            sw.WriteLine(" ***** REQUEST FOR NEARBY TEXT RANK************************************ ");
            sw.WriteLine(" URL : " + item.getLink());
            sw.WriteLine(" CONTENT OF NEARBY TEXT:");
            sw.WriteLine(item.getText());
        }
    }

    // Load the classifier options for nearby text only once.
    if (nearbyOptions == null)
    {
        nearbyOptions = getOptions("nearby");
    }

    // Max and avg of the match levels of the nearby text against the categories.
    List<int> matchLevelsForNearby = categorizer.classifyContentToAllCategories(item.getText(), nearbyOptions);
    int maxMatchLevelForNearby = calculateMax(matchLevelsForNearby);
    int avgMatchLevelForNearby = calculateAvg(matchLevelsForNearby);

    if (LogDebuggerControl.getInstance().debugRanker)
    {
        using (StreamWriter sw = new StreamWriter(logPath, true))
        {
            sw.WriteLine(" .MAX MATCH LEVEL OF NEARBY TEXT: ");
            sw.WriteLine(maxMatchLevelForNearby);
            sw.WriteLine(" .AVG MATCH LEVEL OF NEARBY TEXT: ");
            sw.WriteLine(avgMatchLevelForNearby);
        }
    }

    return((int)(RankerOptions.MinAndMaxRATIO * maxMatchLevelForNearby + (1 - RankerOptions.MinAndMaxRATIO) * avgMatchLevelForNearby));
}
/**
 * Calculates the rank of a given url and returns it.
 *
 * The rank blends two components:
 *   neighborhood = BETTA * anchorRank + (1 - BETTA) * context
 *   inherited    = ALPHA * parentUrlRank + (1 - ALPHA) * wholePageRank
 *   rank         = GAMMA * inherited + (1 - GAMMA) * neighborhood
 * where context is 100 when the anchor rank exceeds
 * RankerOptions.ConfidenceLevelOfAnchor and the nearby-text rank otherwise.
 *
 * The whole-page rank is recomputed only when the parent's resource content
 * differs from the one seen on the previous call. The method also updates the
 * running link / word counters and, when the ranker debug flag is on, appends
 * the intermediate values to a per-thread DataForRank file.
 *
 * parentResource - the page the link was extracted from.
 * item           - the extracted link to rank.
 */
public int rankUrl(ResourceContent parentResource, LinkItem item)
{
    int rankParentUrl = parentResource.getRankOfUrl();
    char[] separators = { ' ', '\t', '\n' };

    // Running statistics used for the average word counts in the debug log.
    NumOfLinks++;
    // NOTE(review): unlike the anchor count below, this count keeps empty
    // entries produced by consecutive separators - confirm whether intended.
    sumOfTotalNearbyWords += item.getText().Split(separators).Length;

    // getRankOfAnchor treats a missing anchor as rank 0, so a link without an
    // anchor must not fail here; it simply contributes no anchor words.
    string anchorText = item.getAnchor();
    if (anchorText != null)
    {
        sumOfTotalAnchorWords += anchorText.Split(separators, StringSplitOptions.RemoveEmptyEntries).Length;
    }

    // One log file per managed thread, opened in append mode.
    string logPath = "DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt";

    if (LogDebuggerControl.getInstance().debugRanker)
    {
        // 'using' guarantees the writer is closed even if a write throws.
        using (StreamWriter sw = new StreamWriter(logPath, true))
        {
            sw.WriteLine(" *********HEAD REQUEST *********************************************");
            sw.WriteLine(" ***** DATA FOR RANKER******************************************** ");
            sw.WriteLine(" URL : " + item.getLink());
            sw.WriteLine(" PARENT URL : " + item.getParentUrl());
        }
    }

    // Rank of the whole page: reuse the cached value while consecutive links
    // come from the same parent content.
    if (lastResourceContent == null || !lastResourceContent.Equals(parentResource.getResourceContent()))
    {
        lastResourceContent = parentResource.getResourceContent();
        wholePageRank = getRankOfWholeContent(parentResource);
    }

    // Rank of the nearby text, then of the anchor.
    int nearbyTextRank = getRankOfNearbyText(item);
    int anchorRank = getRankOfAnchor(item);

    // A sufficiently confident anchor overrides the nearby text entirely.
    int context;
    if (anchorRank > RankerOptions.ConfidenceLevelOfAnchor)
    {
        context = 100;
    }
    else
    {
        context = nearbyTextRank;
    }

    // Neighborhood combines the anchor rank and the context.
    int neighborhood = (int)(RankerOptions.BETTA * anchorRank + (1 - RankerOptions.BETTA) * context);

    // Inherited combines the rank of the parent url and of the parent content.
    int inherited = (int)(RankerOptions.ALPHA * rankParentUrl + (1 - RankerOptions.ALPHA) * wholePageRank);

    // Computed once so the logged value and the returned value cannot diverge.
    int rankOfUrl = (int)(RankerOptions.GAMMA * inherited + (1 - RankerOptions.GAMMA) * neighborhood);

    if (LogDebuggerControl.getInstance().debugRanker)
    {
        using (StreamWriter sw = new StreamWriter(logPath, true))
        {
            sw.WriteLine("************************DATA CONCLUSION*************************");
            sw.WriteLine(" .PARENT RANK: ");
            sw.WriteLine(rankParentUrl);
            sw.WriteLine(" .RANK OF NEARBY TEXT: ");
            sw.WriteLine(nearbyTextRank);
            sw.WriteLine(" .AVG OF NEARBY WORDS");
            sw.WriteLine((int)(sumOfTotalNearbyWords / NumOfLinks));
            sw.WriteLine(" .RANK OF ANCHOR: ");
            sw.WriteLine(anchorRank);
            sw.WriteLine(" .AVG OF ANCHOR TEXT");
            sw.WriteLine((int)(sumOfTotalAnchorWords / NumOfLinks));
            sw.WriteLine(" .NEIGHBORHOOD: ");
            sw.WriteLine(neighborhood);
            sw.WriteLine(" .RANK OF WHOLE CONTENT: ");
            sw.WriteLine(wholePageRank);
            sw.WriteLine(" .INHERITED: ");
            sw.WriteLine(inherited);
            sw.WriteLine(" .RANK OF THE URL: ");
            sw.WriteLine(rankOfUrl);
        }
    }

    return(rankOfUrl);
}