Esempio n. 1
0
        //prepare information for a given HTML
        /// <summary>
        /// Builds an <c>element</c> holding word and link statistics for an HTML fragment.
        /// </summary>
        /// <param name="html_content">Raw markup that is scanned for anchor text and link markers.</param>
        /// <param name="inner_text">Text stored as the element's bag of words and handed to the word counter.</param>
        /// <returns>
        /// A new <c>element</c> with <c>BagofWords</c>, <c>wordCount</c>, <c>LinkCount</c>,
        /// <c>wordCountinLink</c>, <c>meanofWordinLinks</c> and <c>meanofWordinLinksAllWords</c> assigned.
        /// </returns>
        public element AnalyzeGivenHTML(string html_content, string inner_text)
        {
            //html_content = RemoveScripts(html_content);
            element _element = new element();

            _element.BagofWords = inner_text;
            _element.wordCount  = HTML.WordsCountGivenText(_element.BagofWords);

            // Capture whatever sits between the '>' that follows an href attribute and the next "</a".
            // Singleline lets '.' cross line breaks, so anchors spread over several lines still match.
            string pattern     = "href=.*?>(.*?)</a";
            Regex  anchorRegex = new Regex(pattern, RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace);

            MatchCollection anchors = anchorRegex.Matches(html_content);

            // Accumulate the anchor texts in a builder; repeated string concatenation
            // copies the whole buffer on every iteration and degrades on link-heavy pages.
            // Every match contains the literal "href=", so no empty-match guard is needed.
            System.Text.StringBuilder linkText = new System.Text.StringBuilder();
            foreach (Match anchor in anchors)
            {
                linkText.Append(' ').Append(HTML.stripHtml(anchor.Groups[1].Value));
            }

            // The link total is the sum of the occurrence counts reported for "<a ", "<A "
            // and "onclick=" (the last one to pick up javascript-driven links).
            int linkTotal = webfilter.CountStringOccurrences(html_content, "<a ");
            linkTotal += webfilter.CountStringOccurrences(html_content, "<A ");
            linkTotal += webfilter.CountStringOccurrences(html_content, "onclick="); //interesting javascript with link

            _element.LinkCount       = linkTotal;
            _element.wordCountinLink = HTML.WordsCountGivenText(linkText.ToString());

            // Both ratios fall back to 0 when their denominator is 0 to avoid dividing by zero.
            _element.meanofWordinLinks = _element.LinkCount != 0
                ? (double)_element.wordCountinLink / _element.LinkCount
                : 0;

            _element.meanofWordinLinksAllWords = _element.wordCount != 0
                ? (double)_element.wordCountinLink / _element.wordCount
                : 0;

            return _element;
        }
Esempio n. 2
0
        //prepare information for a given HTML
        /// <summary>
        /// Builds an <c>element</c> holding the "_AE" word and link statistics for an HTML fragment.
        /// Here the link count is the number of anchor matches found by the regular expression.
        /// </summary>
        /// <param name="html_content">Raw markup that is scanned for anchor text.</param>
        /// <param name="inner_text">Text stored as the element's "_AE" bag of words and handed to the word counter.</param>
        /// <returns>
        /// A new <c>element</c> with <c>BagofWords_AE</c>, <c>wordCount_AE</c>, <c>LinkCount_AE</c>,
        /// <c>wordCountinLink_AE</c>, <c>meanofWordinLinks_AE</c> and <c>meanofWordinLinksAllWords_AE</c> assigned.
        /// </returns>
        public element AnalyzeGivenHTML_AE(string html_content, string inner_text)
        {
            //html_content = RemoveScripts(html_content);
            element _element = new element();

            _element.BagofWords_AE = inner_text;
            _element.wordCount_AE  = HTML.WordsCountGivenText(_element.BagofWords_AE);

            // Capture whatever sits between the '>' that follows an href attribute and the next "</a".
            // Singleline lets '.' cross line breaks, so anchors spread over several lines still match.
            string pattern     = "href=.*?>(.*?)</a";
            Regex  anchorRegex = new Regex(pattern, RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace);

            MatchCollection anchors = anchorRegex.Matches(html_content);

            // Accumulate the anchor texts in a builder; repeated string concatenation
            // copies the whole buffer on every iteration and degrades on link-heavy pages.
            // Every match contains the literal "href=", so no empty-match guard is needed.
            System.Text.StringBuilder linkText = new System.Text.StringBuilder();
            foreach (Match anchor in anchors)
            {
                linkText.Append(' ').Append(HTML.stripHtml(anchor.Groups[1].Value));
            }

            _element.LinkCount_AE       = anchors.Count;
            _element.wordCountinLink_AE = HTML.WordsCountGivenText(linkText.ToString());

            // Both ratios fall back to 0 when their denominator is 0 to avoid dividing by zero.
            _element.meanofWordinLinks_AE = _element.LinkCount_AE != 0
                ? (double)_element.wordCountinLink_AE / _element.LinkCount_AE
                : 0;

            // The zero-word fallback writes the _AE field; the previous code reset the
            // non-AE meanofWordinLinksAllWords here, which was a copy-paste slip.
            _element.meanofWordinLinksAllWords_AE = _element.wordCount_AE != 0
                ? (double)_element.wordCountinLink_AE / _element.wordCount_AE
                : 0;

            return _element;
        }