/// <summary>
/// Seeds the crawl frontier with a fixed set of start pages, enqueued in the order listed.
/// </summary>
/// <remarks>
/// Two further candidates were constructed by earlier versions of this method but never
/// enqueued. The original note ("XML loader fails on these links") appears to refer to them:
///   http://stackoverflow.com/questions/10984336/net-using-using-blocks-vs-calling-dispose
///   http://raid1.newz.dk/western-digital-annoncerer-10-tb-harddisk
/// </remarks>
public void initiateSeedToFrontier()
{
    string[] seedUrls =
    {
        "http://en.wikipedia.org/wiki/World_War_II",
        "http://en.wikipedia.org/wiki/MMA",
        "http://en.wikipedia.org/wiki/Republic_of_China_%281912%E2%80%9349%29"
    };

    foreach (string seedUrl in seedUrls)
    {
        HTMLLink seedLink = new HTMLLink();
        seedLink.hrefLink = new Uri(seedUrl);
        frontier.Enqueue(seedLink);
    }
}
/// <summary>
/// Fetches the page behind <paramref name="currentWebsite"/> and collects the anchors
/// (<c>&lt;a ...&gt;...&lt;/a&gt;</c>) found in it as absolute links.
/// </summary>
/// <remarks>
/// The page is scanned one line at a time, so an anchor that spans several lines is not
/// matched. Hrefs that do not start with "http" are made absolute against the scheme and
/// host of the current page (not against its path). Links containing '#' or ".php", and
/// links that do not form a valid absolute URI, are skipped.
/// </remarks>
/// <param name="currentWebsite">The page to scan; its <c>hrefLink</c> is the address that is downloaded.</param>
/// <returns>One <c>HTMLLink</c> per accepted anchor, with <c>title</c> and <c>hrefLink</c> set.</returns>
public List<HTMLLink> FindLinks(HTMLLink currentWebsite)
{
    List<HTMLLink> listOfLinks = new List<HTMLLink>();

    Uri pageUri = currentWebsite.hrefLink;
    // Built directly from the scheme: deriving it by stripping the host text out of
    // hostUrl breaks when the host text also occurs inside the scheme.
    string protocolPrefix = pageUri.Scheme + "://";
    string hostUrl = protocolPrefix + pageUri.Host;

    // The reader is disposed when scanning finishes or fails.
    using (StreamReader streamReader = new StreamReader(getDataStream(pageUri)))
    {
        while (!streamReader.EndOfStream)
        {
            MatchCollection anchorMatches = Regex.Matches(
                streamReader.ReadLine(), @"(<a.*?>.*?</a>)", RegexOptions.Singleline);

            foreach (Match anchor in anchorMatches)
            {
                string anchorHtml = anchor.Groups[1].Value;
                Match hrefMatch = Regex.Match(anchorHtml, @"href=\""(.*?)\""", RegexOptions.Singleline);
                if (!hrefMatch.Success)
                {
                    continue;
                }

                string absoluteUrl = hrefMatch.Groups[1].Value;
                if (absoluteUrl.StartsWith("http", StringComparison.Ordinal))
                {
                    // Already absolute.
                }
                else if (absoluteUrl.StartsWith("//", StringComparison.Ordinal))
                {
                    // Protocol-relative link: reuse the scheme of the current page.
                    absoluteUrl = protocolPrefix + absoluteUrl.Remove(0, 2);
                }
                else if (absoluteUrl.StartsWith("/", StringComparison.Ordinal))
                {
                    // Host-relative link.
                    absoluteUrl = hostUrl + absoluteUrl;
                }
                else
                {
                    absoluteUrl = hostUrl + "/" + absoluteUrl;
                }

                // TODO: extend this to filter out all unwanted links and file types (.avi, .php, etc.).
                bool isUnwanted = absoluteUrl.Contains("#") || Regex.IsMatch(absoluteUrl, @"\.php");
                if (isUnwanted)
                {
                    continue;
                }

                // A single malformed href must not abort the scan of the whole page.
                Uri parsedUri;
                if (!Uri.TryCreate(absoluteUrl, UriKind.Absolute, out parsedUri))
                {
                    continue;
                }

                HTMLLink htmlLink = new HTMLLink();
                // Title is the anchor markup with all tags (and the whitespace around them) removed.
                htmlLink.title = Regex.Replace(anchorHtml, @"\s*<.*?>\s*", "", RegexOptions.Singleline);
                htmlLink.hrefLink = parsedUri;
                listOfLinks.Add(htmlLink);
            }
        }
    }

    return listOfLinks;
}
/// <summary>
/// Decides whether a link may be crawled, based on the allow/disallow information
/// collected from the robots.txt file.
/// </summary>
/// <remarks>
/// The link's <c>AbsolutePath</c> is looked up in <c>disallowedList</c>; a path found there
/// is still accepted when <c>allowedList</c> contains it as well.
/// </remarks>
/// <param name="link">The link to filter</param>
/// <returns>true if the link should be crawled, false otherwise</returns>
public bool FilterUrlsByRobotsTxt(HTMLLink link)
{
    string requestedPath = link.hrefLink.AbsolutePath;

    // Anything that is not disallowed is crawlable.
    if (!disallowedList.Contains(requestedPath))
    {
        return true;
    }

    // Disallowed paths are only crawlable when they are also explicitly allowed.
    return allowedList.Contains(requestedPath);
}