コード例 #1
0
ファイル: Spider.cs プロジェクト: jonfast565/Spidr
        /// <summary>
        /// Downloads every file linked from an anchor tag in the page whose URL's
        /// last path segment contains one of the <c>ValidFileExtensions</c>.
        /// </summary>
        /// <param name="pageId">Identifier handed to each <c>BinaryFile</c> constructor.</param>
        /// <param name="address">Address against which relative hrefs are resolved.</param>
        /// <param name="content">HTML markup of the page.</param>
        /// <returns>
        /// The files that were downloaded successfully; an empty list when the page
        /// has no matching anchors or when link extraction fails.
        /// </returns>
        private List <BinaryFile> GetFiles(Guid pageId, string address, string content)
        {
            HtmlDocument document = new HtmlDocument();

            document.LoadHtml(content);
            List <BinaryFile> files = new List <BinaryFile>();

            try
            {
                // SelectNodes yields null (not an empty collection) when nothing matches,
                // so handle that case explicitly instead of relying on a NullReferenceException.
                var anchors = document.DocumentNode.SelectNodes("//a[@href]");
                if (anchors == null)
                {
                    log.Warn("No files in the document");
                    return(files);
                }

                // WebClient is disposable; release it once all downloads are done.
                using (WebClient downloadClient = new WebClient())
                {
                    foreach (var link in anchors)
                    {
                        var hrefAttribute = link.Attributes["href"];
                        var urlObject     = UrlObject.FromRelativeString(address, hrefAttribute.Value);
                        var fileName      = urlObject.Path.LastOrDefault();
                        if (fileName == null)
                        {
                            continue;
                        }

                        bool hasValidExtension = false;
                        foreach (var extension in ValidFileExtensions)
                        {
                            if (fileName.Contains("." + extension))
                            {
                                hasValidExtension = true;
                                break; // one match is enough
                            }
                        }

                        if (!hasValidExtension)
                        {
                            continue;
                        }

                        var fullPath = urlObject.GetFullPath(false);
                        try
                        {
                            byte[] fileBytes = downloadClient.DownloadData(fullPath);
                            files.Add(new BinaryFile(pageId)
                            {
                                Url      = urlObject,
                                Tag      = link.OuterHtml,
                                Name     = fileName,
                                Contents = new MemoryStream(fileBytes)
                            });
                            Console.WriteLine("Found files: " + fullPath);
                        }
                        catch (WebException wex)
                        {
                            // A single failed download must not abort the remaining links.
                            log.Warn(wex);
                        }
                    }
                }
            }
            catch (Exception e)
            {
                log.Warn("No files in the document", e);
            }
            return(files);
        }
コード例 #2
0
ファイル: Spider.cs プロジェクト: jonfast565/Spidr
        /// <summary>
        /// Downloads every image referenced by an <c>img</c> tag in the page whose URL's
        /// last path segment contains one of the <c>ValidImgExtensions</c>.
        /// </summary>
        /// <param name="pageId">Identifier handed to each <c>BinaryFile</c> constructor.</param>
        /// <param name="address">Address against which relative <c>src</c> values are resolved.</param>
        /// <param name="content">HTML markup of the page.</param>
        /// <returns>
        /// The images that were downloaded successfully; an empty list when the page
        /// has no matching image tags or when extraction fails.
        /// </returns>
        private List <BinaryFile> GetImages(Guid pageId, string address, string content)
        {
            HtmlDocument document = new HtmlDocument();

            document.LoadHtml(content);
            List <BinaryFile> images = new List <BinaryFile>();

            try
            {
                // SelectNodes yields null (not an empty collection) when nothing matches,
                // so handle that case explicitly instead of relying on a NullReferenceException.
                var imageNodes = document.DocumentNode.SelectNodes("//img[@src]");
                if (imageNodes == null)
                {
                    log.Warn("No image tags present in document");
                    return(images);
                }

                // WebClient is disposable; release it once all downloads are done.
                using (WebClient downloadClient = new WebClient())
                {
                    foreach (HtmlNode image in imageNodes)
                    {
                        HtmlAttribute imgSrcAttribute = image.Attributes["src"];
                        UrlObject     urlObject       = UrlObject.FromRelativeString(address, imgSrcAttribute.Value);
                        var           fileName        = urlObject.Path.LastOrDefault();
                        if (fileName == null)
                        {
                            continue;
                        }

                        bool hasValidExtension = false;
                        foreach (var extension in ValidImgExtensions)
                        {
                            if (fileName.Contains("." + extension))
                            {
                                hasValidExtension = true;
                                break; // one match is enough
                            }
                        }

                        if (!hasValidExtension)
                        {
                            continue;
                        }

                        var fullPath = urlObject.GetFullPath(false);
                        try
                        {
                            byte[] fileBytes = downloadClient.DownloadData(fullPath);
                            images.Add(new BinaryFile(pageId)
                            {
                                Url      = urlObject,
                                Tag      = image.OuterHtml,
                                Name     = fileName,
                                Contents = new MemoryStream(fileBytes),
                            });
                            Console.WriteLine("Found image: " + fullPath);
                        }
                        catch (WebException wex)
                        {
                            // A single failed download must not abort the remaining images.
                            log.Warn(wex);
                        }
                    }
                }
            }
            catch (Exception e)
            {
                log.Warn("No image tags present in document", e);
            }
            return(images);
        }
コード例 #3
0
ファイル: UrlObject.cs プロジェクト: jonfast565/Spidr
 /// <summary>
 /// Compares this URL with another object by their full paths.
 /// </summary>
 /// <param name="o1">Object to compare against; may be null or of any type.</param>
 /// <returns>
 /// True when <paramref name="o1"/> is a <c>UrlObject</c> whose
 /// <c>GetFullPath(false)</c> equals this instance's; false otherwise.
 /// </returns>
 public override bool Equals(Object o1)
 {
     // 'as' gives null for both a null argument and an argument of a different
     // type, so Equals never throws InvalidCastException on foreign objects.
     UrlObject other = o1 as UrlObject;
     if (ReferenceEquals(other, null))
     {
         return(false);
     }

     return(other.GetFullPath(false) == this.GetFullPath(false));
 }
コード例 #4
0
ファイル: Spider.cs プロジェクト: jonfast565/Spidr
        /// <summary>
        /// Fetches the page at <paramref name="address"/> and assembles a <c>Page</c>
        /// from its content, title, links and (depending on the job type) images and files.
        /// </summary>
        /// <param name="address">URL of the page to fetch.</param>
        /// <returns>The assembled page, or null when any step throws (the exception is logged).</returns>
        private Page GetPage(UrlObject address)
        {
            try
            {
                string url       = address.GetFullPath(false);
                string html      = StringFromAddress(url);
                string pageTitle = GetTitle(html);
                Guid   id        = Guid.NewGuid();

                List <LinkTag> linkTags = GetLinks(id, url, html);

                // Images are only collected when the job is neither PAGE_ONLY nor PING_ONLY.
                List <BinaryFile> imageTags;
                if (this.JobType != SpiderJobType.PAGE_ONLY &&
                    this.JobType != SpiderJobType.PING_ONLY)
                {
                    imageTags = GetImages(id, url, html);
                }
                else
                {
                    imageTags = new List <BinaryFile>();
                }

                // Same rule for linked files.
                List <BinaryFile> fileTags;
                if (this.JobType != SpiderJobType.PAGE_ONLY &&
                    this.JobType != SpiderJobType.PING_ONLY)
                {
                    fileTags = GetFiles(id, url, html);
                }
                else
                {
                    fileTags = new List <BinaryFile>();
                }

                Page result = new Page
                {
                    Content = html,
                    Name = pageTitle,
                    FileTags = fileTags,
                    ImageTags = imageTags,
                    LinkTags = linkTags,
                    Link = address,
                    PageId = id
                };
                return(result);
            }
            catch (Exception failure)
            {
                log.Warn(failure);
                return(null);
            }
        }
コード例 #5
0
ファイル: Spider.cs プロジェクト: jonfast565/Spidr
        /// <summary>
        /// Collects the anchor tags of a page that do not point at a downloadable file,
        /// i.e. whose URL's last path segment contains none of the <c>ValidFileExtensions</c>.
        /// </summary>
        /// <param name="pageId">Identifier handed to each <c>LinkTag</c> constructor.</param>
        /// <param name="address">Address against which relative hrefs are resolved.</param>
        /// <param name="content">HTML markup of the page.</param>
        /// <returns>
        /// The link tags found; an empty list when the page has no anchors with an href
        /// or when extraction fails.
        /// </returns>
        private List <LinkTag> GetLinks(Guid pageId, string address, string content)
        {
            //attempt to parse the document
            var document = new HtmlDocument();

            document.LoadHtml(content);
            var tags = new List <LinkTag>();

            try
            {
                // SelectNodes yields null (not an empty collection) when nothing matches,
                // so handle that case explicitly instead of relying on a NullReferenceException.
                var anchors = document.DocumentNode.SelectNodes("//a[@href]");
                if (anchors == null)
                {
                    log.Warn("No links available in the document");
                    return(tags);
                }

                foreach (var link in anchors)
                {
                    var hrefAttribute = link.Attributes["href"];
                    var urlObject     = UrlObject.FromRelativeString(address, hrefAttribute.Value);
                    var lastSegment   = urlObject.Path.LastOrDefault();
                    if (lastSegment == null)
                    {
                        continue;
                    }

                    var hasValidExtension = false;
                    foreach (var extension in ValidFileExtensions)
                    {
                        if (lastSegment.Contains("." + extension))
                        {
                            hasValidExtension = true;
                            break; // one match is enough
                        }
                    }

                    // URLs that look like files are skipped here rather than followed as pages.
                    if (hasValidExtension)
                    {
                        continue;
                    }

                    tags.Add(new LinkTag(pageId)
                    {
                        Tag = link.OuterHtml,
                        Url = urlObject
                    });
                    Console.WriteLine("Found link: " + urlObject.GetFullPath(false));
                }
            }
            catch (Exception e)
            {
                log.Warn("No links available in the document", e);
            }
            return(tags);
        }
コード例 #6
0
ファイル: Spider.cs プロジェクト: jonfast565/Spidr
        /// <summary>
        /// Runs the crawl: seeds <c>Unvisited</c> with the <c>Frontier</c> URL when it is empty,
        /// then repeatedly processes every entry of <c>Unvisited</c> in parallel until it is
        /// empty or <c>Visited</c> holds at least <c>MaxAllowedPages</c> entries.
        /// </summary>
        public void Start()
        {
            // Accept every server certificate, i.e. SSL validation errors are ignored.
            // NOTE(review): this is set on the static ServicePointManager, so it presumably
            // affects other HTTPS requests in the process as well -- confirm this is intended.
            ServicePointManager.ServerCertificateValidationCallback = (obj, certificate, chain, errors) => (true);

            // Build the starting URL from the configured frontier.
            var starter = UrlObject.FromString(Frontier);

            // Only seed when nothing is queued yet (the queue may already hold entries).
            if (!Unvisited.Any())
            {
                Unvisited.Add(starter.GetFullPath(false), starter);
            }

            // Keep going while there are unprocessed URLs and the page budget is not used up.
            // NOTE(review): ProcessNewPaths does nothing when the page is null, so a URL whose
            // fetch fails appears to stay in Unvisited and be retried on every pass -- verify
            // that this loop can terminate in that case.
            while (Unvisited.Any() && Visited.Count < MaxAllowedPages)
            {
                // NOTE(review): ProcessNewPaths adds to and removes from Unvisited while this
                // Parallel.ForEach is enumerating it; whether that is safe depends on the
                // collection type, which is not visible here -- TODO confirm.
                Parallel.ForEach(Unvisited, (urlPair) =>
                {
                    try
                    {
                        try
                        {
                            // Fetch the page and queue any newly discovered links.
                            var p = PageFromUrl(urlPair.Value);
                            ProcessNewPaths(p, urlPair.Value);
                        }
                        // Deliberately ignored; the reason is not documented in this code.
                        catch (ArgumentOutOfRangeException) { }

                        // Persist every visited page that has not been persisted yet.
                        var unprocessed = Visited.Where(x => x.Value.Processed == false);
                        foreach (var page in unprocessed)
                        {
                            // PAGE_ONLY jobs drop the link tags before persisting.
                            if (this.JobType == SpiderJobType.PAGE_ONLY)
                            {
                                page.Value.LinkTags = new List <LinkTag>();
                            }
                            PersistenceInserter.PersistData(page.Value);
                            page.Value.Processed = true;
                        }
                    }
                    // Silently ignored. NOTE(review): possibly duplicate-key errors from
                    // concurrent Add calls -- confirm.
                    catch (ArgumentException) { }
                    catch (Exception e)
                    {
                        // Any other failure is written to the console and the crawl continues.
                        Console.WriteLine(e);
                    }
                });
            }
        }
コード例 #7
0
ファイル: Spider.cs プロジェクト: jonfast565/Spidr
        /// <summary>
        /// Marks <paramref name="p"/> as visited and queues each of its links that is on the
        /// same domain as <paramref name="domainObject"/> and is neither queued nor visited yet.
        /// </summary>
        /// <param name="p">The page that was just fetched; nothing happens when null.</param>
        /// <param name="domainObject">URL whose domain links must match; nothing happens when null.</param>
        public void ProcessNewPaths(Page p, UrlObject domainObject)
        {
            if (p == null || domainObject == null)
            {
                return;
            }

            var pageKey = p.Link.GetFullPath(false);
            Console.WriteLine("Visited: " + pageKey);

            // Move the page from the pending set to the visited set.
            Unvisited.Remove(pageKey);
            if (!Visited.ContainsKey(pageKey))
            {
                Visited.Add(pageKey, p);
            }

            foreach (LinkTag l in p.LinkTags)
            {
                var linkKey = l.Url.GetFullPath(false);

                // Membership is tested with ContainsKey rather than by catching
                // KeyNotFoundException from the indexer: same result, no exception cost.
                if (Unvisited.ContainsKey(linkKey) || Visited.ContainsKey(linkKey))
                {
                    continue;
                }

                // Only links on the same domain are queued.
                if (l.Url.GetDomain() == domainObject.GetDomain())
                {
                    Unvisited.Add(linkKey, l.Url);
                }
            }
        }
コード例 #8
0
ファイル: Spider.cs プロジェクト: jonfast565/Spidr
 /// <summary>
 /// Public entry point that returns the result of <c>GetPage</c> for the given address.
 /// </summary>
 /// <param name="address">URL of the page to fetch.</param>
 /// <returns>Whatever <c>GetPage</c> returns for <paramref name="address"/>.</returns>
 public Page PageFromUrl(UrlObject address)
 {
     Page fetched = GetPage(address);
     return fetched;
 }
コード例 #9
0
ファイル: Spider.cs プロジェクト: jonfast565/Spidr
        /// <summary>
        /// Fetches the page at <paramref name="address"/> and assembles a <c>Page</c>
        /// from its content, title, links and (depending on the job type) images and files.
        /// </summary>
        /// <param name="address">URL of the page to fetch.</param>
        /// <returns>The assembled page, or null when any step throws (the exception is logged).</returns>
        private Page GetPage(UrlObject address)
        {
            try
            {
                string url = address.GetFullPath(false);
                string html = StringFromAddress(url);
                string pageTitle = GetTitle(html);
                Guid id = Guid.NewGuid();

                List<LinkTag> linkTags = GetLinks(id, url, html);

                // Images are only collected when the job is neither PAGE_ONLY nor PING_ONLY.
                List<BinaryFile> imageTags;
                if (this.JobType != SpiderJobType.PAGE_ONLY
                    && this.JobType != SpiderJobType.PING_ONLY)
                {
                    imageTags = GetImages(id, url, html);
                }
                else
                {
                    imageTags = new List<BinaryFile>();
                }

                // Same rule for linked files.
                List<BinaryFile> fileTags;
                if (this.JobType != SpiderJobType.PAGE_ONLY
                    && this.JobType != SpiderJobType.PING_ONLY)
                {
                    fileTags = GetFiles(id, url, html);
                }
                else
                {
                    fileTags = new List<BinaryFile>();
                }

                Page result = new Page
                {
                    Content = html,
                    Name = pageTitle,
                    FileTags = fileTags,
                    ImageTags = imageTags,
                    LinkTags = linkTags,
                    Link = address,
                    PageId = id
                };
                return result;
            }
            catch (Exception failure)
            {
                log.Warn(failure);
                return null;
            }
        }
コード例 #10
0
ファイル: Spider.cs プロジェクト: jonfast565/Spidr
        /// <summary>
        /// Marks <paramref name="p"/> as visited and queues each of its links that is on the
        /// same domain as <paramref name="domainObject"/> and is neither queued nor visited yet.
        /// </summary>
        /// <param name="p">The page that was just fetched; nothing happens when null.</param>
        /// <param name="domainObject">URL whose domain links must match; nothing happens when null.</param>
        public void ProcessNewPaths(Page p, UrlObject domainObject)
        {
            if (p == null || domainObject == null)
            {
                return;
            }

            var pageKey = p.Link.GetFullPath(false);
            Console.WriteLine("Visited: " + pageKey);

            // Move the page from the pending set to the visited set.
            Unvisited.Remove(pageKey);
            if (!Visited.ContainsKey(pageKey))
            {
                Visited.Add(pageKey, p);
            }

            foreach (LinkTag l in p.LinkTags)
            {
                var linkKey = l.Url.GetFullPath(false);

                // Membership is tested with ContainsKey rather than by catching
                // KeyNotFoundException from the indexer: same result, no exception cost.
                if (Unvisited.ContainsKey(linkKey) || Visited.ContainsKey(linkKey))
                {
                    continue;
                }

                // Only links on the same domain are queued.
                if (l.Url.GetDomain() == domainObject.GetDomain())
                {
                    Unvisited.Add(linkKey, l.Url);
                }
            }
        }
コード例 #11
0
ファイル: Spider.cs プロジェクト: jonfast565/Spidr
 /// <summary>
 /// Public entry point that returns the result of <c>GetPage</c> for the given address.
 /// </summary>
 /// <param name="address">URL of the page to fetch.</param>
 /// <returns>Whatever <c>GetPage</c> returns for <paramref name="address"/>.</returns>
 public Page PageFromUrl(UrlObject address)
 {
     Page fetched = GetPage(address);
     return fetched;
 }