コード例 #1
0
ファイル: Spider.cs プロジェクト: jonfast565/Spidr
        /// <summary>
        /// Scans the given HTML for <c>img</c> elements that carry a <c>src</c> attribute,
        /// downloads each one whose file name contains an extension listed in
        /// <c>ValidImgExtensions</c>, and returns the downloaded images.
        /// </summary>
        /// <param name="pageId">Identifier passed to the constructor of every <c>BinaryFile</c> created here.</param>
        /// <param name="address">Base address handed to <c>UrlObject.FromRelativeString</c> together with each <c>src</c> value.</param>
        /// <param name="content">HTML markup to scan.</param>
        /// <returns>
        /// The images that were downloaded successfully. The list is never null; it is empty
        /// when the document has no matching tags or nothing could be downloaded.
        /// </returns>
        private List <BinaryFile> GetImages(Guid pageId, string address, string content)
        {
            HtmlDocument document = new HtmlDocument();

            document.LoadHtml(content);
            List <BinaryFile> images = new List <BinaryFile>();

            // WebClient is IDisposable; the using block releases it on every exit path.
            using (WebClient downloadClient = new WebClient())
            {
                try
                {
                    // SelectNodes yields null (not an empty collection) when nothing matches,
                    // so check explicitly instead of relying on a NullReferenceException.
                    var imageNodes = document.DocumentNode.SelectNodes("//img[@src]");
                    if (imageNodes == null)
                    {
                        log.Warn("No image tags present in document");
                        return(images);
                    }

                    foreach (HtmlNode image in imageNodes)
                    {
                        HtmlAttribute imgSrcAttribute = image.Attributes["src"];
                        UrlObject     urlObject       = UrlObject.FromRelativeString(address, imgSrcAttribute.Value.ToString());

                        // The last path segment serves as the file name; skip URLs without one.
                        var fileName = urlObject.Path.LastOrDefault();
                        if (fileName == null)
                        {
                            continue;
                        }

                        bool hasValidExtension = false;
                        foreach (var extension in ValidImgExtensions)
                        {
                            if (fileName.Contains("." + extension))
                            {
                                hasValidExtension = true;
                                break; // one match is enough
                            }
                        }

                        if (!hasValidExtension)
                        {
                            continue;
                        }

                        try
                        {
                            string imageUrl  = urlObject.GetFullPath(false);
                            byte[] fileBytes = downloadClient.DownloadData(imageUrl);
                            images.Add(new BinaryFile(pageId)
                            {
                                Url      = urlObject,
                                Tag      = image.OuterHtml,
                                Name     = fileName,
                                Contents = new MemoryStream(fileBytes),
                            });
                            Console.WriteLine("Found image: " + imageUrl);
                        }
                        catch (WebException wex)
                        {
                            // A failed download only skips this image; the remaining ones are still tried.
                            log.Warn(wex);
                        }
                    }
                }
                catch (Exception e)
                {
                    // Best effort: any other failure stops the scan but keeps what was collected so far.
                    log.Warn("Failed to extract images from document", e);
                }
            }

            return(images);
        }
コード例 #2
0
ファイル: UrlObject.cs プロジェクト: jonfast565/Spidr
 /// <summary>
 /// Determines equality by comparing the strings returned from <c>GetFullPath(false)</c>.
 /// </summary>
 /// <param name="o1">Object to compare with this instance; may be null or of any type.</param>
 /// <returns>
 /// <c>true</c> when <paramref name="o1"/> is a <c>UrlObject</c> whose <c>GetFullPath(false)</c>
 /// equals this instance's; <c>false</c> for null or for any other type.
 /// </returns>
 public override bool Equals(Object o1)
 {
     // "as" gives null for both a null argument and an argument of another type,
     // so comparing against an unrelated object returns false instead of throwing
     // InvalidCastException.
     UrlObject other = o1 as UrlObject;

     // ReferenceEquals avoids going through any user-defined == operator.
     if (Object.ReferenceEquals(other, null))
     {
         return(false);
     }

     // NOTE(review): GetHashCode should be overridden consistently with this — not visible here, confirm.
     return(other.GetFullPath(false) == this.GetFullPath(false));
 }
コード例 #3
0
ファイル: Spider.cs プロジェクト: jonfast565/Spidr
        /// <summary>
        /// Builds a <c>Page</c> for the given address: obtains the content through
        /// <c>StringFromAddress</c>, extracts the title and links, and collects images and
        /// files unless the job type is <c>PAGE_ONLY</c> or <c>PING_ONLY</c>.
        /// </summary>
        /// <param name="address">URL object identifying the page; stored as the page's <c>Link</c>.</param>
        /// <returns>The populated page, or <c>null</c> when any step throws (the exception is logged as a warning).</returns>
        private Page GetPage(UrlObject address)
        {
            try
            {
                string pageUrl   = address.GetFullPath(false);
                string html      = StringFromAddress(pageUrl);
                string pageTitle = GetTitle(html);
                Guid   id        = Guid.NewGuid();

                List<LinkTag> linkTags = GetLinks(id, pageUrl, html);

                // These job types keep links only; images and files stay empty.
                bool linksOnly = this.JobType == SpiderJobType.PAGE_ONLY ||
                                 this.JobType == SpiderJobType.PING_ONLY;

                List<BinaryFile> imageTags = linksOnly
                    ? new List<BinaryFile>()
                    : GetImages(id, pageUrl, html);

                List<BinaryFile> fileTags = linksOnly
                    ? new List<BinaryFile>()
                    : GetFiles(id, pageUrl, html);

                Page result = new Page
                {
                    Content = html,
                    Name = pageTitle,
                    FileTags = fileTags,
                    ImageTags = imageTags,
                    LinkTags = linkTags,
                    Link = address,
                    PageId = id
                };
                return(result);
            }
            catch (Exception ex)
            {
                // Any failure is logged and reported to the caller as a missing page.
                log.Warn(ex);
                return(null);
            }
        }
コード例 #4
0
ファイル: Spider.cs プロジェクト: jonfast565/Spidr
        /// <summary>
        /// Assembles a <c>Page</c> from the content found at the given address. Links are always
        /// extracted; images and files are gathered only when the job type is neither
        /// <c>PAGE_ONLY</c> nor <c>PING_ONLY</c>.
        /// </summary>
        /// <param name="address">URL object for the page; becomes the <c>Link</c> of the result.</param>
        /// <returns>The assembled page, or <c>null</c> if an exception occurred (it is logged as a warning).</returns>
        private Page GetPage(UrlObject address)
        {
            try
            {
                string url = address.GetFullPath(false);
                string body = StringFromAddress(url);

                string heading = GetTitle(body);
                Guid newPageId = Guid.NewGuid();

                List<LinkTag> foundLinks = GetLinks(newPageId, url, body);

                // Start with empty collections and fill them only for jobs that want binaries.
                List<BinaryFile> foundImages = new List<BinaryFile>();
                List<BinaryFile> foundFiles = new List<BinaryFile>();

                if (this.JobType != SpiderJobType.PAGE_ONLY
                    && this.JobType != SpiderJobType.PING_ONLY)
                {
                    foundImages = GetImages(newPageId, url, body);
                    foundFiles = GetFiles(newPageId, url, body);
                }

                return new Page
                {
                    Content = body,
                    Name = heading,
                    FileTags = foundFiles,
                    ImageTags = foundImages,
                    LinkTags = foundLinks,
                    Link = address,
                    PageId = newPageId
                };
            }
            catch (Exception error)
            {
                // Failures are logged and surfaced to the caller as null.
                log.Warn(error);
                return null;
            }
        }