Пример #1
0
        /// <summary>
        /// Downloads the page at <paramref name="inputwebsite"/>'s current path, stores the page
        /// text on it, and registers every usable link found on the page: unknown targets are
        /// enqueued, already known or already queued targets just get a back-reference added.
        /// </summary>
        /// <param name="inputwebsite">The page to download and scan for links.</param>
        /// <remarks>
        /// Best effort: a failure while loading the page abandons the page, and a failure on a
        /// single link only skips that link. Failures are reported through
        /// <c>System.Diagnostics.Debug</c> instead of being thrown to the caller.
        /// </remarks>
        public void ProcessNewPage(Website inputwebsite)
        {
            try
            {
                string URL = inputwebsite.currentPath;
                HtmlDocument htmlDocument = new HtmlWeb().Load(URL);
                inputwebsite.HTMLContent = htmlDocument.Text;

                // SelectNodes returns null (not an empty collection) when nothing matches.
                List<string> urls = new List<string>();
                var anchors = htmlDocument.DocumentNode.SelectNodes("//a[@href]");
                if (anchors != null)
                {
                    urls = anchors.Select(a => a.GetAttributeValue("href", null))
                                  .Where(href => !string.IsNullOrWhiteSpace(href))
                                  .ToList();
                }

                // Links to facebook.com and to PDF documents are never followed.
                urls.RemoveAll(link => link.Contains("facebook.com")
                                       || link.IndexOf(".pdf", StringComparison.OrdinalIgnoreCase) >= 0);

                foreach (string url in urls)
                {
                    try
                    {
                        string url1 = ResolveLinkUrl(URL, url);
                        if (url1 == null)
                        {
                            continue; // fragment-only, non-http(s) or unparsable link
                        }

                        string domain = new Uri(url1).Host;
                        Domain dom = domains.Find(x => x.URL == domain);
                        if (dom == null)
                        {
                            dom = new Domain(domain, RobotTXTHandler.FindRestrictions(domain));
                            domains.Add(dom);
                        }

                        Website tempwebsite = new Website(dom, url1);

                        // NOTE(review): this strips as many leading characters from currentPath as the
                        // domain string is long; kept exactly as before - confirm that currentPath
                        // really starts with the bare domain at this point.
                        string pathAfterDomain = tempwebsite.currentPath.Remove(0, tempwebsite.DomainURL.URL.Length);
                        if (dom.restriction.disallow.Contains(pathAfterDomain))
                        {
                            continue;
                        }

                        // IndexOf uses the same equality as Contains, so a page that counts as
                        // "known" is always found here (a lambda using == might not find it).
                        int knownIndex = websites.IndexOf(tempwebsite);
                        if (knownIndex >= 0)
                        {
                            websites[knownIndex].LinkedFrom.Add(inputwebsite);
                        }
                        else if (queue.Contains(tempwebsite))
                        {
                            queue.First(q => EqualityComparer<Website>.Default.Equals(q, tempwebsite))
                                 .LinkedFrom.Add(inputwebsite);
                        }
                        else
                        {
                            tempwebsite.LinkedFrom.Add(inputwebsite);
                            queue.Enqueue(tempwebsite);
                        }
                    }
                    catch (Exception e)
                    {
                        // One bad link must not stop the rest of the page from being processed.
                        System.Diagnostics.Debug.WriteLine("ProcessNewPage: skipped link '" + url + "': " + e.Message);
                    }
                }
            }
            catch (Exception e)
            {
                // Loading or parsing the page failed; the page is abandoned.
                System.Diagnostics.Debug.WriteLine("ProcessNewPage: failed to process page: " + e.Message);
            }
        }

        /// <summary>
        /// Turns an href value found on <paramref name="pageUrl"/> into an absolute http(s) URL.
        /// </summary>
        /// <param name="pageUrl">Absolute URL of the page the link was found on.</param>
        /// <param name="link">Raw href value.</param>
        /// <returns>
        /// The absolute URL, or <c>null</c> when the link is empty, is only a fragment, cannot be
        /// parsed, or does not use the http or https scheme (for example mailto: or javascript:).
        /// </returns>
        private static string ResolveLinkUrl(string pageUrl, string link)
        {
            if (string.IsNullOrWhiteSpace(link))
            {
                return null;
            }

            string candidate = link.Trim();
            if (candidate[0] == '#')
            {
                return null; // points back into the current page
            }

            string absolute;
            if (candidate.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                || candidate.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
            {
                // Already absolute: keep it exactly as written on the page.
                absolute = candidate;
            }
            else if (candidate.StartsWith("//", StringComparison.Ordinal))
            {
                // Protocol-relative link; the scheme needs the colon ("http:" + "//host/...").
                absolute = "http:" + candidate;
            }
            else if (candidate.StartsWith("www.", StringComparison.OrdinalIgnoreCase))
            {
                // Host written without a scheme.
                absolute = "http://" + candidate;
            }
            else
            {
                // Relative link: let Uri resolve it against the page URL instead of
                // concatenating strings, which only worked for pages ending in '/'.
                Uri baseUri;
                Uri combined;
                if (!Uri.TryCreate(pageUrl, UriKind.Absolute, out baseUri)
                    || !Uri.TryCreate(baseUri, candidate, out combined))
                {
                    return null;
                }

                absolute = combined.AbsoluteUri;
            }

            Uri parsed;
            bool isHttp = Uri.TryCreate(absolute, UriKind.Absolute, out parsed)
                          && (parsed.Scheme == Uri.UriSchemeHttp || parsed.Scheme == Uri.UriSchemeHttps);
            return isHttp ? absolute : null;
        }
Пример #2
0
        /// <summary>
        /// Estimates how similar the HTML content of two pages is by comparing their hash
        /// signatures position by position.
        /// </summary>
        /// <param name="input1">First page; its signature is cached in <c>Hashnumber</c>.</param>
        /// <param name="input2">Second page; its signature is cached in <c>Hashnumber</c>.</param>
        /// <returns>
        /// The percentage (0-100) of signature positions that are equal. Returns 100 when both
        /// pages have no content, and 0 when only one has content or when a page has too few
        /// space-separated tokens to compare.
        /// </returns>
        public double Jaccard(Website input1, Website input2)
        {
            string text1input = input1.HTMLContent;
            string text2input = input2.HTMLContent;

            if (string.IsNullOrEmpty(text1input))
            {
                return string.IsNullOrEmpty(text2input) ? 100 : 0;
            }

            // NOTE(review): the minimum token counts differ (4 for input1, 6 for input2);
            // kept as they were - confirm whether the asymmetry is intended.
            if (string.IsNullOrEmpty(text2input)
                || text1input.Split(' ').Length < 4
                || text2input.Split(' ').Length < 6)
            {
                return 0;
            }

            List<int> shift1 = GetOrComputeSignature(input1);
            List<int> shift2 = GetOrComputeSignature(input2);

            // Compare every position both signatures have (the last one included); positions
            // present in only one signature count as mismatches.
            int comparable = Math.Min(shift1.Count, shift2.Count);
            int total = Math.Max(shift1.Count, shift2.Count);
            if (total == 0)
            {
                return 0;
            }

            double identicalcounter = 0;
            for (int i = 0; i < comparable; i++)
            {
                if (shift1[i] == shift2[i])
                {
                    identicalcounter++;
                }
            }

            // Divide by the real signature length rather than a hard-coded constant.
            return (identicalcounter / total) * 100;
        }

        /// <summary>
        /// Returns the cached signature of <paramref name="site"/>, or computes it (one minimum
        /// of BigShiftHash per entry of randomList), stores it on the site and remembers the site.
        /// </summary>
        private List<int> GetOrComputeSignature(Website site)
        {
            if (knownwebsitees.Contains(site) && site.Hashnumber != null)
            {
                return site.Hashnumber;
            }

            List<int> hashes = FindHashNumber(site.HTMLContent);
            List<int> signature = new List<int>(randomList.Count);
            for (int i = 0; i < randomList.Count; i++)
            {
                signature.Add(BigShiftHash(hashes, randomList[i]).Min());
            }

            site.Hashnumber = signature;
            if (!knownwebsitees.Contains(site))
            {
                knownwebsitees.Add(site);
            }

            return signature;
        }