Example #1
0
        // Serializes every access to Unvisited/Visited (and the pages stored in them)
        // made from the parallel workers spawned by Start().
        private readonly object _crawlStateLock = new object();

        /// <summary>
        /// Runs the crawl: seeds <c>Unvisited</c> from <c>Frontier</c> when it is empty, then
        /// repeatedly fetches every currently unvisited URL in parallel, records newly
        /// discovered links through <c>ProcessNewPaths</c>, and persists every visited page
        /// that has not been persisted yet. The loop ends when <c>Unvisited</c> is empty or
        /// <c>Visited</c> holds at least <c>MaxAllowedPages</c> entries (the limit is checked
        /// between batches, so a batch may overshoot it).
        /// </summary>
        public void Start()
        {
            // SECURITY NOTE(review): this accepts ANY server certificate, and the callback is
            // process-wide, so it also affects unrelated HTTPS calls. Kept because the crawler
            // deliberately ignores SSL errors; confirm this is acceptable for the host process.
            ServicePointManager.ServerCertificateValidationCallback = (obj, certificate, chain, errors) => (true);

            // Seed the frontier only when no work is already queued.
            var starter = UrlObject.FromString(Frontier);

            if (!Unvisited.Any())
            {
                Unvisited.Add(starter.GetFullPath(false), starter);
            }

            // while still pages unprocessed
            while (Unvisited.Any() && Visited.Count < MaxAllowedPages)
            {
                // Snapshot the batch: ProcessNewPaths adds to and removes from Unvisited, and
                // enumerating the live collection while it is mutated throws.
                var batch = Unvisited.ToList();

                Parallel.ForEach(batch, (urlPair) =>
                {
                    try
                    {
                        try
                        {
                            // The fetch is the slow part and stays outside the lock so that
                            // downloads still run concurrently.
                            var p = PageFromUrl(urlPair.Value);

                            lock (_crawlStateLock)
                            {
                                ProcessNewPaths(p, urlPair.Value);
                            }
                        }
                        catch (ArgumentOutOfRangeException) { }
                        finally
                        {
                            // Always retire the attempted URL. Without this, a fetch that
                            // fails, returns null, or yields a page whose own link differs
                            // from the requested key would stay queued and the outer loop
                            // would never terminate.
                            lock (_crawlStateLock)
                            {
                                Unvisited.Remove(urlPair.Key);
                            }
                        }

                        lock (_crawlStateLock)
                        {
                            // Materialize first: the query must not be re-evaluated while
                            // page state changes inside the loop below. Holding the lock also
                            // prevents two workers from persisting the same page.
                            var unprocessed = Visited.Where(x => x.Value.Processed == false).ToList();
                            foreach (var page in unprocessed)
                            {
                                if (this.JobType == SpiderJobType.PAGE_ONLY)
                                {
                                    // Page-only jobs persist the page without its links.
                                    page.Value.LinkTags = new List<LinkTag>();
                                }
                                PersistenceInserter.PersistData(page.Value);
                                page.Value.Processed = true;
                            }
                        }
                    }
                    // Best-effort crawl: argument errors for a single URL are skipped silently,
                    // as in the original implementation.
                    catch (ArgumentException) { }
                    catch (Exception e)
                    {
                        Console.WriteLine(e);
                    }
                });
            }
        }
Example #2
0
        /// <summary>
        /// Marks <paramref name="p"/> as visited and queues the links it contains.
        /// The page's own URL is removed from <c>Unvisited</c> and added to <c>Visited</c>
        /// (if not already there). Each link on the page is added to <c>Unvisited</c> when it
        /// is in neither collection and its domain equals that of
        /// <paramref name="domainObject"/>.
        /// </summary>
        /// <param name="p">The fetched page; when null the call does nothing.</param>
        /// <param name="domainObject">URL whose domain bounds the crawl; when null the call does nothing.</param>
        /// <remarks>
        /// This method mutates <c>Unvisited</c> and <c>Visited</c> without any synchronization
        /// of its own, so concurrent callers must serialize their calls.
        /// </remarks>
        public void ProcessNewPaths(Page p, UrlObject domainObject)
        {
            if (p == null || domainObject == null)
            {
                return;
            }

            // Compute the page key once instead of on every use.
            var pageKey = p.Link.GetFullPath(false);
            Console.WriteLine("Visited: " + pageKey);

            Unvisited.Remove(pageKey);
            if (!Visited.ContainsKey(pageKey))
            {
                Visited.Add(pageKey, p);
            }

            foreach (LinkTag l in p.LinkTags)
            {
                var linkKey = l.Url.GetFullPath(false);

                // Membership test via ContainsKey rather than catching KeyNotFoundException:
                // most links on a page are new, so the exception path was the common case.
                if (Unvisited.ContainsKey(linkKey) || Visited.ContainsKey(linkKey))
                {
                    continue;
                }

                // Stay on the starting domain; off-site links are ignored.
                if (l.Url.GetDomain() == domainObject.GetDomain())
                {
                    Unvisited.Add(linkKey, l.Url);
                }
            }
        }