示例#1
0
        /// <summary>
        /// Starts a crawl at <c>InitLink</c> and saves whatever the crawl left pending.
        /// </summary>
        public void DoJob()
        {
            // NOTE(review): presumably a manual switch to rebuild the database
            // before a crawl - confirm, or delete these two lines.
            //_db.Database.Delete();
            //_db.Database.Create();
            var init = new LinkModel {
                Url = InitLink
            };

            // Register the start URL as handled so a page that links back to it
            // with the exact same URL does not make the crawler fetch it again.
            _handleLinks.Add(InitLink);

            VisitLink(init);

            // VisitLink only saves when the handled-link count is a multiple of 20,
            // so anything added after the last such save would otherwise be lost.
            _db.SaveChanges();
        }
示例#2
0
        /// <summary>
        /// Downloads the page behind <paramref name="link"/>, adds it to
        /// <c>_db.RawEntryModels</c>, and recursively visits every anchor on the
        /// page that passes the URL filters and has not been handled yet.
        /// </summary>
        /// <param name="link">
        /// Link to fetch. Its <c>Content</c> is overwritten with the downloaded HTML.
        /// </param>
        public void VisitLink(LinkModel link)
        {
            // Remember the deepest level seen so far.
            if (link.ParentLevel > depth)
            {
                depth = link.ParentLevel;
            }

            // Save pending changes whenever the handled-link count is a multiple of 20.
            if (_handleLinks.Count % 20 == 0)
            {
                _db.SaveChanges();
            }

            // 1. Download the HTML and stage it for saving.
            link.Content = _wc.DownloadString(link.Url);
            _db.RawEntryModels.Add(new RawEntryModel()
            {
                Url = link.Url, Content = link.Content, Parent = null
            });

            // 2. Extract the anchors.
            var doc = new HtmlDocument();
            doc.LoadHtml(link.Content);
            var nodes = doc.DocumentNode.SelectNodes("//a");
            if (nodes == null)
            {
                // SelectNodes returns null (not an empty collection) when the page
                // has no matching element, so there is nothing to follow.
                return;
            }

            // URL filters, built once per page instead of once per anchor.
            var regx1 = new Regex(@"\.([h][t][m][l])?$");
            var regx2 = new Regex(@"\.([h][t][m])?$");
            var regx3 = new Regex(@"\.([a-z][a-z][a-z])?$");

            // Every anchor on this page sits exactly one level below this page.
            // Computed up front so it does not drift from one sibling to the next.
            var childLevel = link.ParentLevel + 1;

            // 3. Visit each anchor that survives the filters.
            foreach (var node in nodes)
            {
                var href = node.Attributes["href"];
                if (href == null)
                {
                    continue;
                }
                var url = href.Value;

                // File case: skip three-letter extensions unless they are .htm/.html.
                if (regx3.IsMatch(url) && !(regx1.IsMatch(url) || regx2.IsMatch(url)))
                {
                    continue;
                }

                if (url == "/" || url == "/?removecookie=true")
                {
                    continue;
                }

                // Only follow site-relative links or links that contain InitLink.
                if (!url.StartsWith("/") && !url.Contains(InitLink))
                {
                    continue;
                }

                if (url.StartsWith("/"))
                {
                    url = InitLink + url;
                }

                if (_handleLinks.Contains(url))
                {
                    continue;
                }

                System.Diagnostics.Debug.WriteLine(">> Handle:" + url);

                // Use a separate variable for the child: overwriting the 'link'
                // parameter here would give later siblings the wrong ParentLevel.
                var child = new LinkModel {
                    Url = url, ParentLevel = childLevel
                };
                //_newLinks.Add(link);
                _handleLinks.Add(url);
                VisitLink(child);
            }
        }