/// <summary>
/// Runs the crawl loop: seeds <c>Unvisited</c> from <c>Frontier</c> when it is empty, then
/// repeatedly processes every pending URL until nothing is pending or
/// <c>Visited.Count</c> reaches <c>MaxAllowedPages</c>.
/// </summary>
/// <remarks>
/// For each pending URL the page is obtained via <c>PageFromUrl</c>, handed to
/// <see cref="ProcessNewPaths"/>, and every entry of <c>Visited</c> whose <c>Processed</c>
/// flag is false is persisted through <c>PersistenceInserter.PersistData</c>.
/// The page cap is only checked between batches, so one batch may overshoot it.
/// </remarks>
public void Start()
{
    // NOTE(review): this accepts ANY server certificate for the whole process, not just
    // for this spider. Kept because the crawler relies on it, but confirm it is intended.
    ServicePointManager.ServerCertificateValidationCallback = (obj, certificate, chain, errors) => true;

    // Seed the queue with the frontier URL unless the caller pre-populated it.
    var starter = UrlObject.FromString(Frontier);
    if (!Unvisited.Any())
    {
        Unvisited.Add(starter.GetFullPath(false), starter);
    }

    // Serialises every read/write of Unvisited and Visited made by the workers below.
    // It is local on purpose: only the workers of this call share the collections here.
    var gate = new object();

    while (Unvisited.Any() && Visited.Count < MaxAllowedPages)
    {
        // Snapshot the pending set: the workers add to and remove from Unvisited, and a
        // collection must not be modified while it is being enumerated.
        var batch = Unvisited.ToArray();

        Parallel.ForEach(batch, urlPair =>
        {
            try
            {
                try
                {
                    // Obtaining the page runs outside the gate so workers still overlap
                    // (presumably this is the slow, network-bound step).
                    var p = PageFromUrl(urlPair.Value);

                    lock (gate)
                    {
                        ProcessNewPaths(p, urlPair.Value);
                    }
                }
                catch (ArgumentOutOfRangeException)
                {
                    // Deliberately ignored, as before: this URL is skipped but pages that
                    // are already in Visited still get persisted below.
                }

                lock (gate)
                {
                    // Materialise first so persisting never iterates a live view of Visited.
                    var unprocessed = Visited.Where(x => x.Value.Processed == false).ToList();
                    foreach (var page in unprocessed)
                    {
                        if (this.JobType == SpiderJobType.PAGE_ONLY)
                        {
                            // PAGE_ONLY jobs persist the page without its links.
                            page.Value.LinkTags = new List<LinkTag>();
                        }

                        PersistenceInserter.PersistData(page.Value);
                        page.Value.Processed = true;
                    }
                }
            }
            catch (ArgumentException)
            {
                // Deliberately ignored, as before (e.g. a duplicate key on Add).
            }
            catch (Exception e)
            {
                Console.WriteLine(e);
            }
            finally
            {
                // Always retire the attempted URL. Otherwise a page that fails to load,
                // comes back null, or resolves to a different path would stay in
                // Unvisited forever and the outer loop would never terminate.
                lock (gate)
                {
                    Unvisited.Remove(urlPair.Key);
                }
            }
        });
    }
}
/// <summary>
/// Records <paramref name="p"/> as visited and queues the links it contains that are
/// not yet known and belong to the same domain as <paramref name="domainObject"/>.
/// </summary>
/// <param name="p">The page that was just loaded; nothing happens when it is null.</param>
/// <param name="domainObject">
/// The URL whose <c>GetDomain()</c> value a link must match to be queued; nothing
/// happens when it is null.
/// </param>
public void ProcessNewPaths(Page p, UrlObject domainObject)
{
    if (p == null || domainObject == null)
    {
        return;
    }

    // Computed once; assumes GetFullPath(false) returns the same value on every call.
    var pagePath = p.Link.GetFullPath(false);

    Console.WriteLine("Visited: " + pagePath);

    // Move the page from the pending set to the visited set.
    Unvisited.Remove(pagePath);
    if (!Visited.ContainsKey(pagePath))
    {
        Visited.Add(pagePath, p);
    }

    foreach (LinkTag l in p.LinkTags)
    {
        var linkPath = l.Url.GetFullPath(false);

        // Plain membership tests instead of catching KeyNotFoundException from the indexer.
        if (Unvisited.ContainsKey(linkPath) || Visited.ContainsKey(linkPath))
        {
            continue;
        }

        // Only links on the same domain as domainObject are followed.
        if (l.Url.GetDomain() == domainObject.GetDomain())
        {
            Unvisited.Add(linkPath, l.Url);
        }
    }
}