예제 #1
0
        /**
         * Extracts the anchor elements (<a ...> ... </a>) found in the given page and
         * returns one LinkItem for every element whose tag text contains LINK_ATTR.
         *
         * Every returned item has its parent set to url, its tag set to the matched
         * element text and its index set to the offset of the match inside page.
         * The list is then handed to getText, getLinks and getAnchors, in that
         * order, before it is returned.
         *
         * url  - address of the page; stored as the parent of every item.
         * page - content to scan; when null an empty list is returned.
         */
        public List <LinkItem> extractLinks(String url, String page)
        {
            List <LinkItem> list = new List <LinkItem>();

            if (page == null)
            {
                return(list);
            }

            // Singleline lets '.' match line breaks, so an element may span several lines.
            MatchCollection anchorMatches = Regex.Matches(page, @"(<[aA][ \t\n].*?>.*?</[aA]>)",
                                                          RegexOptions.Singleline);

            foreach (Match match in anchorMatches)
            {
                string tagValue = match.Groups[1].Value;

                // Elements without the link attribute are not links; skip them
                // before allocating an item for them.
                if (!tagValue.Contains(LINK_ATTR))
                {
                    continue;
                }

                // Fully initialise the item before it becomes visible through the list.
                LinkItem item = new LinkItem();
                item.setParent(url);
                item.setTag(tagValue);
                item.setIndex(match.Index);
                list.Add(item);
            }

            // gets the text near each link
            getText(list, page);

            // gets the links from the tags
            getLinks(list);

            // gets the anchor of each link
            getAnchors(list);

            return(list);
        }
예제 #2
0
        /**
         * Returns a rank for the anchor of the given link item.
         *
         * The anchor is classified against all categories (using the cached
         * "anchor" options, which are loaded on first use) and the rank is the
         * blend
         *     MinAndMaxRATIO * max + (1 - MinAndMaxRATIO) * avg
         * of the resulting match levels, truncated to int.
         *
         * Returns 0 when the item has no anchor (getAnchor() is null).
         *
         * When LogDebuggerControl's debugRanker flag is set, the request and the
         * computed max/avg are appended to "DataForRank<managed thread id>.txt".
         */
        private int getRankOfAnchor(LinkItem item)
        {
            if (item.getAnchor() == null)
            {
                return(0);
            }

            if (LogDebuggerControl.getInstance().debugRanker)
            {
                // 'using' releases the log file even if one of the writes throws.
                using (StreamWriter sw = new StreamWriter("DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
                {
                    sw.WriteLine(" ***** REQUEST FOR ANCHOR URL RANK************************************ ");
                    sw.WriteLine(" URL : " + item.getLink());
                    sw.WriteLine(" CONTENT OF ANCHOR:");
                    sw.WriteLine(item.getAnchor());
                }
            }

            // The options are fetched once and reused by later calls.
            if (anchorOptions == null)
            {
                anchorOptions = getOptions("anchor");
            }

            // Match levels of the anchor against every category.
            List <int> matchLevelsForAnchor = categorizer.classifyContentToAllCategories(item.getAnchor(), anchorOptions);

            int maxMatchLevelForAnchor = calculateMax(matchLevelsForAnchor);
            int avgMatchLevelForAnchor = calculateAvg(matchLevelsForAnchor);

            if (LogDebuggerControl.getInstance().debugRanker)
            {
                using (StreamWriter sw = new StreamWriter("DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
                {
                    sw.WriteLine(" .MAX MATCH LEVEL OF ANCHOR: ");
                    sw.WriteLine(maxMatchLevelForAnchor);
                    sw.WriteLine(" .AVG MATCH LEVEL OF ANCHOR: ");
                    sw.WriteLine(avgMatchLevelForAnchor);
                }
            }

            return((int)(RankerOptions.MinAndMaxRATIO * maxMatchLevelForAnchor + (1 - RankerOptions.MinAndMaxRATIO) * avgMatchLevelForAnchor));
        }
예제 #3
0
        /**
         * Returns a rank for the nearby text of the given link item.
         *
         * The text returned by item.getText() is classified against all
         * categories (using the cached "nearby" options, which are loaded on
         * first use) and the rank is the blend
         *     MinAndMaxRATIO * max + (1 - MinAndMaxRATIO) * avg
         * of the resulting match levels, truncated to int.
         *
         * When LogDebuggerControl's debugRanker flag is set, the request and the
         * computed max/avg are appended to "DataForRank<managed thread id>.txt".
         *
         * NOTE(review): unlike getRankOfAnchor there is no null guard here, so a
         * null text is passed straight to the categorizer - confirm that is intended.
         */
        private int getRankOfNearbyText(LinkItem item)
        {
            if (LogDebuggerControl.getInstance().debugRanker)
            {
                // 'using' releases the log file even if one of the writes throws.
                using (StreamWriter sw = new StreamWriter("DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
                {
                    sw.WriteLine(" ***** REQUEST FOR NEARBY TEXT RANK************************************ ");
                    sw.WriteLine(" URL : " + item.getLink());
                    sw.WriteLine(" CONTENT OF NEARBY TEXT:");
                    sw.WriteLine(item.getText());
                }
            }

            // The options are fetched once and reused by later calls.
            if (nearbyOptions == null)
            {
                nearbyOptions = getOptions("nearby");
            }

            // Match levels of the nearby text against every category.
            List <int> matchLevelsForNearby = categorizer.classifyContentToAllCategories(item.getText(), nearbyOptions);

            int maxMatchLevelForNearby = calculateMax(matchLevelsForNearby);
            int avgMatchLevelForNearby = calculateAvg(matchLevelsForNearby);

            if (LogDebuggerControl.getInstance().debugRanker)
            {
                using (StreamWriter sw = new StreamWriter("DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
                {
                    sw.WriteLine(" .MAX MATCH LEVEL OF NEARBY TEXT: ");
                    sw.WriteLine(maxMatchLevelForNearby);
                    sw.WriteLine(" .AVG MATCH LEVEL OF NEARBY TEXT: ");
                    sw.WriteLine(avgMatchLevelForNearby);
                }
            }

            return((int)(RankerOptions.MinAndMaxRATIO * maxMatchLevelForNearby + (1 - RankerOptions.MinAndMaxRATIO) * avgMatchLevelForNearby));
        }
예제 #4
0
        /**
         * Calculates the rank of a given url and returns it.
         *
         * The rank is built from two parts:
         *   neighborhood = BETTA * anchorRank + (1 - BETTA) * context
         *       where context is 100 when anchorRank exceeds
         *       RankerOptions.ConfidenceLevelOfAnchor, otherwise the nearby text rank;
         *   inherited    = ALPHA * parent url rank + (1 - ALPHA) * whole page rank
         * and the result is GAMMA * inherited + (1 - GAMMA) * neighborhood.
         * Each step is truncated to int.
         *
         * The whole page rank is recomputed only when the parent's resource
         * content differs from the one seen on the previous call.
         *
         * Side effects: increments NumOfLinks, adds to sumOfTotalNearbyWords and
         * sumOfTotalAnchorWords, may update lastResourceContent / wholePageRank,
         * and (when the debugRanker flag is set) appends to
         * "DataForRank<managed thread id>.txt".
         *
         * parentResource - resource the link was extracted from.
         * item           - the link to rank; getText() and getAnchor() are
         *                  dereferenced here, so both must be non-null.
         */
        public int rankUrl(ResourceContent parentResource, LinkItem item)
        {
            int rankParentUrl = parentResource.getRankOfUrl();

            // Running totals, used for the word-count averages in the debug log below.
            // NOTE(review): the nearby-text count keeps empty entries while the anchor
            // count drops them - confirm whether that difference is intended.
            char[] separators = { ' ', '\t', '\n' };
            NumOfLinks++;
            sumOfTotalNearbyWords += item.getText().Split(separators).Length;
            sumOfTotalAnchorWords += item.getAnchor().Split(separators, StringSplitOptions.RemoveEmptyEntries).Length;

            if (LogDebuggerControl.getInstance().debugRanker)
            {
                // 'using' releases the log file even if one of the writes throws.
                using (StreamWriter sw = new StreamWriter("DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
                {
                    sw.WriteLine(" *********HEAD REQUEST *********************************************");
                    sw.WriteLine(" ***** DATA FOR RANKER******************************************** ");
                    sw.WriteLine(" URL : " + item.getLink());
                    sw.WriteLine(" PARENT URL : " + item.getParentUrl());
                }
            }

            // rank of the whole page: reuse the previous value while the parent content is unchanged
            if ((lastResourceContent == null) || !lastResourceContent.Equals(parentResource.getResourceContent()))
            {
                lastResourceContent = parentResource.getResourceContent();
                wholePageRank       = getRankOfWholeContent(parentResource);
            }

            // rank of the nearby text
            int nearbyTextRank = getRankOfNearbyText(item);

            // rank of the anchor url
            int anchorRank = getRankOfAnchor(item);

            // rank of the neighborhood, that includes rank of the anchor and the nearby text;
            // an anchor rank above the confidence level replaces the nearby text rank with 100
            int context      = (anchorRank > RankerOptions.ConfidenceLevelOfAnchor) ? 100 : nearbyTextRank;
            int neighborhood = (int)(RankerOptions.BETTA * anchorRank + (1 - RankerOptions.BETTA) * context);

            // rank of the inherited, that includes the rank of the parent url and parent content
            int inherited = (int)(RankerOptions.ALPHA * rankParentUrl + (1 - RankerOptions.ALPHA) * wholePageRank);

            // Computed once so the logged value and the returned value cannot diverge.
            int rankOfUrl = (int)(RankerOptions.GAMMA * inherited + (1 - RankerOptions.GAMMA) * neighborhood);

            if (LogDebuggerControl.getInstance().debugRanker)
            {
                using (StreamWriter sw = new StreamWriter("DataForRank" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true))
                {
                    sw.WriteLine("************************DATA CONCLUSION*************************");
                    sw.WriteLine(" .PARENT RANK: ");
                    sw.WriteLine(rankParentUrl);
                    sw.WriteLine(" .RANK OF NEARBY TEXT: ");
                    sw.WriteLine(nearbyTextRank);
                    sw.WriteLine(" .AVG OF NEARBY WORDS");
                    sw.WriteLine((int)(sumOfTotalNearbyWords / NumOfLinks));
                    sw.WriteLine(" .RANK OF ANCHOR: ");
                    sw.WriteLine(anchorRank);
                    sw.WriteLine(" .AVG OF ANCHOR TEXT");
                    sw.WriteLine((int)(sumOfTotalAnchorWords / NumOfLinks));
                    sw.WriteLine(" .NEIGHBORHOOD: ");
                    sw.WriteLine(neighborhood);
                    sw.WriteLine(" .RANK OF WHOLE CONTENT: ");
                    sw.WriteLine(wholePageRank);
                    sw.WriteLine(" .INHERITED: ");
                    sw.WriteLine(inherited);
                    sw.WriteLine(" .RANK OF THE URL: ");
                    sw.WriteLine(rankOfUrl);
                }
            }

            return(rankOfUrl);
        }