コード例 #1
0
ファイル: Spider.cs プロジェクト: jonfast565/Spidr
        /// <summary>
        /// Downloads the targets of the page's anchors whose last URL path segment
        /// contains "." followed by one of the entries in <c>ValidFileExtensions</c>.
        /// </summary>
        /// <param name="pageId">Identifier handed to every <see cref="BinaryFile"/> that is created.</param>
        /// <param name="address">Address that each href is resolved against.</param>
        /// <param name="content">HTML markup of the page to scan.</param>
        /// <returns>
        /// The files that were downloaded successfully. Never null; empty when the page
        /// has no matching anchors or when processing failed.
        /// </returns>
        private List <BinaryFile> GetFiles(Guid pageId, string address, string content)
        {
            HtmlDocument document = new HtmlDocument();

            document.LoadHtml(content);
            List <BinaryFile> files = new List <BinaryFile>();

            try
            {
                // SelectNodes yields null (not an empty collection) when nothing matches,
                // so guard explicitly instead of relying on a caught NullReferenceException.
                var anchors = document.DocumentNode.SelectNodes("//a[@href]");
                if (anchors == null)
                {
                    return(files);
                }

                // WebClient is IDisposable; one instance is shared by all downloads
                // for this page and released when the scan is finished.
                using (WebClient downloadClient = new WebClient())
                {
                    foreach (var link in anchors)
                    {
                        var urlObject = UrlObject.FromRelativeString(address, link.Attributes["href"].Value);
                        var fileName  = urlObject.Path.LastOrDefault();
                        if (fileName == null)
                        {
                            continue;
                        }

                        // NOTE(review): this is a substring match, not a suffix match, so
                        // "a.pdf.html" matches "pdf" — kept as-is; confirm whether intended.
                        bool hasValidExtension = false;
                        foreach (var extension in ValidFileExtensions)
                        {
                            if (fileName.Contains("." + extension))
                            {
                                hasValidExtension = true;
                                break;
                            }
                        }

                        if (!hasValidExtension)
                        {
                            continue;
                        }

                        var fullPath = urlObject.GetFullPath(false);
                        try
                        {
                            byte[] fileBytes = downloadClient.DownloadData(fullPath);
                            files.Add(new BinaryFile(pageId)
                            {
                                Url      = urlObject,
                                Tag      = link.OuterHtml,
                                Name     = fileName,
                                Contents = new MemoryStream(fileBytes)
                            });
                            Console.WriteLine("Found files: " + fullPath);
                        }
                        catch (WebException wex)
                        {
                            // A failed download only skips this file; the scan continues.
                            log.Warn(wex);
                        }
                    }
                }
            }
            catch (Exception e)
            {
                // Best effort: any other failure ends the scan and whatever was
                // collected so far is returned.
                log.Warn("Failed to extract files from the document", e);
            }
            return(files);
        }
コード例 #2
0
ファイル: Spider.cs プロジェクト: jonfast565/Spidr
        /// <summary>
        /// Downloads the sources of the page's <c>img</c> tags whose last URL path segment
        /// contains "." followed by one of the entries in <c>ValidImgExtensions</c>.
        /// </summary>
        /// <param name="pageId">Identifier handed to every <see cref="BinaryFile"/> that is created.</param>
        /// <param name="address">Address that each src is resolved against.</param>
        /// <param name="content">HTML markup of the page to scan.</param>
        /// <returns>
        /// The images that were downloaded successfully. Never null; empty when the page
        /// has no matching image tags or when processing failed.
        /// </returns>
        private List <BinaryFile> GetImages(Guid pageId, string address, string content)
        {
            HtmlDocument document = new HtmlDocument();

            document.LoadHtml(content);
            List <BinaryFile> images = new List <BinaryFile>();

            try
            {
                // SelectNodes yields null (not an empty collection) when nothing matches,
                // so guard explicitly instead of relying on a caught NullReferenceException.
                var imageNodes = document.DocumentNode.SelectNodes("//img[@src]");
                if (imageNodes == null)
                {
                    return(images);
                }

                // WebClient is IDisposable; one instance is shared by all downloads
                // for this page and released when the scan is finished.
                using (WebClient downloadClient = new WebClient())
                {
                    foreach (HtmlNode image in imageNodes)
                    {
                        UrlObject urlObject = UrlObject.FromRelativeString(address, image.Attributes["src"].Value);
                        var       fileName  = urlObject.Path.LastOrDefault();
                        if (fileName == null)
                        {
                            continue;
                        }

                        // NOTE(review): this is a substring match, not a suffix match, so
                        // "a.png.html" matches "png" — kept as-is; confirm whether intended.
                        bool hasValidExtension = false;
                        foreach (var extension in ValidImgExtensions)
                        {
                            if (fileName.Contains("." + extension))
                            {
                                hasValidExtension = true;
                                break;
                            }
                        }

                        if (!hasValidExtension)
                        {
                            continue;
                        }

                        var fullPath = urlObject.GetFullPath(false);
                        try
                        {
                            byte[] fileBytes = downloadClient.DownloadData(fullPath);
                            images.Add(new BinaryFile(pageId)
                            {
                                Url      = urlObject,
                                Tag      = image.OuterHtml,
                                Name     = fileName,
                                Contents = new MemoryStream(fileBytes),
                            });
                            Console.WriteLine("Found image: " + fullPath);
                        }
                        catch (WebException wex)
                        {
                            // A failed download only skips this image; the scan continues.
                            log.Warn(wex);
                        }
                    }
                }
            }
            catch (Exception e)
            {
                // Best effort: any other failure ends the scan and whatever was
                // collected so far is returned.
                log.Warn("Failed to extract images from the document", e);
            }
            return(images);
        }
コード例 #3
0
ファイル: Spider.cs プロジェクト: jonfast565/Spidr
        /// <summary>
        /// Collects the page's anchors as <see cref="LinkTag"/> entries, skipping any anchor
        /// whose last URL path segment contains "." followed by one of the entries in
        /// <c>ValidFileExtensions</c>.
        /// </summary>
        /// <param name="pageId">Identifier handed to every <see cref="LinkTag"/> that is created.</param>
        /// <param name="address">Address that each href is resolved against.</param>
        /// <param name="content">HTML markup of the page to scan.</param>
        /// <returns>
        /// The link tags found. Never null; empty when the page has no anchors with an
        /// href or when processing failed.
        /// </returns>
        private List <LinkTag> GetLinks(Guid pageId, string address, string content)
        {
            //attempt to parse the document
            var document = new HtmlDocument();

            document.LoadHtml(content);
            var tags = new List <LinkTag>();

            try
            {
                // SelectNodes yields null (not an empty collection) when nothing matches,
                // so guard explicitly instead of relying on a caught NullReferenceException.
                var anchors = document.DocumentNode.SelectNodes("//a[@href]");
                if (anchors == null)
                {
                    return(tags);
                }

                foreach (var link in anchors)
                {
                    var urlObject   = UrlObject.FromRelativeString(address, link.Attributes["href"].Value);
                    var lastSegment = urlObject.Path.LastOrDefault();
                    if (lastSegment == null)
                    {
                        continue;
                    }

                    // NOTE(review): this is a substring match, not a suffix match, so
                    // "a.pdf.html" matches "pdf" — kept as-is; confirm whether intended.
                    var pointsAtFile = false;
                    foreach (var extension in ValidFileExtensions)
                    {
                        if (lastSegment.Contains("." + extension))
                        {
                            pointsAtFile = true;
                            break;
                        }
                    }

                    // Anchors that look like file downloads are not reported as links.
                    if (pointsAtFile)
                    {
                        continue;
                    }

                    tags.Add(new LinkTag(pageId)
                    {
                        Tag = link.OuterHtml,
                        Url = urlObject
                    });
                    Console.WriteLine("Found link: " + urlObject.GetFullPath(false));
                }
            }
            catch (Exception e)
            {
                // Best effort: any failure ends the scan and whatever was
                // collected so far is returned.
                log.Warn("Failed to extract links from the document", e);
            }
            return(tags);
        }