/// <summary>
/// Downloads the page at <paramref name="inputwebsite"/>'s <c>currentPath</c>, stores the page
/// text in its <c>HTMLContent</c>, and registers every followable link found on the page:
/// unseen links are enqueued in <c>queue</c>, already-known links only get
/// <paramref name="inputwebsite"/> appended to their <c>LinkedFrom</c> list.
/// </summary>
/// <param name="inputwebsite">The page to download and scan for links.</param>
/// <remarks>
/// The method is deliberately best-effort: a failure while loading the page aborts the whole
/// call silently, and a failure while handling one link only skips that link.
/// </remarks>
public void ProcessNewPage(Website inputwebsite)
{
    try
    {
        string URL = inputwebsite.currentPath;
        HtmlWeb htmlweb = new HtmlWeb();
        HtmlDocument htmlDocument = htmlweb.Load(URL);
        inputwebsite.HTMLContent = htmlDocument.Text;

        // Base for resolving relative, root-relative and protocol-relative hrefs.
        Uri pageUri = new Uri(URL);

        // SelectNodes yields null (not an empty collection) when nothing matches.
        var anchors = htmlDocument.DocumentNode.SelectNodes("//a[@href]");
        if (anchors == null)
        {
            return;
        }

        foreach (var anchor in anchors)
        {
            string href = anchor.GetAttributeValue("href", null);
            if (href == null || href.Trim().Length == 0)
            {
                continue;
            }

            // Links that are never followed: Facebook pages and PDF documents.
            if (href.Contains("facebook.com")
                || href.IndexOf(".pdf", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                continue;
            }

            try
            {
                // Let Uri do the resolution instead of gluing strings together; this covers
                // absolute links, "//host/path", "/path" and "page.html" alike.
                Uri uri;
                if (!Uri.TryCreate(pageUri, href, out uri))
                {
                    continue;
                }

                // Only web links can be crawled; drops mailto:, javascript:, tel:, file: etc.
                if (uri.Scheme != Uri.UriSchemeHttp && uri.Scheme != Uri.UriSchemeHttps)
                {
                    continue;
                }

                string url1 = uri.AbsoluteUri;
                string domain = uri.Host;

                // One Domain object (with its robots.txt restrictions) per host.
                Domain dom = domains.Find(x => x.URL == domain);
                if (dom == null)
                {
                    dom = new Domain(domain, RobotTXTHandler.FindRestrictions(domain));
                    domains.Add(dom);
                }

                Website tempwebsite = new Website(dom, url1);

                // NOTE(review): this strips the first <host length> characters from currentPath.
                // If currentPath still carries the scheme ("http://..."), the result is not the
                // URL path that robots.txt rules refer to - confirm against the Website class.
                string robotsKey = tempwebsite.currentPath.Remove(0, tempwebsite.DomainURL.URL.Length);
                if (dom.restriction.disallow.Contains(robotsKey))
                {
                    continue;
                }

                // IndexOf uses the same equality as Contains, so the lookup cannot come back
                // empty for an element that Contains reported as present.
                int knownIndex = websites.IndexOf(tempwebsite);
                if (knownIndex >= 0)
                {
                    websites[knownIndex].LinkedFrom.Add(inputwebsite);
                    continue;
                }

                Website queued = queue.FirstOrDefault(
                    q => EqualityComparer<Website>.Default.Equals(q, tempwebsite));
                if (queued != null)
                {
                    queued.LinkedFrom.Add(inputwebsite);
                }
                else
                {
                    tempwebsite.LinkedFrom.Add(inputwebsite);
                    queue.Enqueue(tempwebsite);
                }
            }
            catch (Exception)
            {
                // Best-effort: a link that cannot be processed is skipped, the rest continue.
            }
        }
    }
    catch (Exception)
    {
        // Best-effort: a page that cannot be loaded or parsed is ignored.
    }
}
/// <summary>
/// Estimates how similar the <c>HTMLContent</c> of two pages is by comparing their hash
/// signatures slot by slot.
/// </summary>
/// <param name="input1">First page; its <c>Hashnumber</c> is computed and cached if missing.</param>
/// <param name="input2">Second page; its <c>Hashnumber</c> is computed and cached if missing.</param>
/// <returns>
/// A percentage in the range 0-100: 100 when both pages have no content, 0 when only one has
/// content or either page is below the minimum word count, otherwise the share of signature
/// slots that hold equal values.
/// </returns>
public double Jaccard(Website input1, Website input2)
{
    bool firstEmpty = string.IsNullOrEmpty(input1.HTMLContent);
    bool secondEmpty = string.IsNullOrEmpty(input2.HTMLContent);

    if (firstEmpty)
    {
        return secondEmpty ? 100 : 0;
    }

    // NOTE(review): the minimum word counts differ (4 for input1, 6 for input2), which makes
    // the result depend on argument order for very short pages - confirm this is intended.
    if (secondEmpty
        || input1.HTMLContent.Split(' ').Length < 4
        || input2.HTMLContent.Split(' ').Length < 6)
    {
        return 0;
    }

    List<int> signature1 = GetOrComputeSignature(input1);
    List<int> signature2 = GetOrComputeSignature(input2);

    // Compare every slot both signatures have; guards against signatures of unequal length.
    int comparedSlots = Math.Min(signature1.Count, signature2.Count);
    if (comparedSlots == 0)
    {
        return 0;
    }

    double identicalcounter = 0;
    for (int i = 0; i < comparedSlots; i++)
    {
        if (signature1[i] == signature2[i])
        {
            identicalcounter++;
        }
    }

    // Divide by the number of slots actually compared so identical signatures score 100.
    return (identicalcounter / comparedSlots) * 100;
}

/// <summary>
/// Returns the cached signature of <paramref name="site"/>, or builds it: one entry per value
/// in <c>randomList</c>, each the minimum of <c>BigShiftHash</c> over the page's hash numbers.
/// A freshly built signature is stored in <c>Hashnumber</c> and the page is recorded in
/// <c>knownwebsitees</c>.
/// </summary>
private List<int> GetOrComputeSignature(Website site)
{
    bool alreadyKnown = knownwebsitees.Contains(site);
    if (alreadyKnown && site.Hashnumber != null)
    {
        return site.Hashnumber;
    }

    List<int> hashNumbers = FindHashNumber(site.HTMLContent);
    List<int> signature = new List<int>(randomList.Count);
    for (int i = 0; i < randomList.Count; i++)
    {
        signature.Add(BigShiftHash(hashNumbers, randomList[i]).Min());
    }

    site.Hashnumber = signature;
    if (!alreadyKnown)
    {
        // Avoid registering the same page twice when only its cached signature was missing.
        knownwebsitees.Add(site);
    }

    return signature;
}