예제 #1
0
        /// <summary>
        /// Reads the HTML body of a crawled page, queues every same-domain link that has not
        /// been seen yet, and records a result entry for the page itself.
        /// </summary>
        /// <param name="response">HTTP response whose content is read as the HTML document.</param>
        /// <returns>
        /// The URLs newly added to <c>_crawlBag</c> by this call. An empty list is returned when
        /// the body is blank, the parsed document has no root node, or no anchor element is found.
        /// </returns>
        /// <remarks>
        /// NOTE(review): the early-return paths below do not add an entry to <c>_results</c>, so a
        /// page without any anchor is never recorded. Behaviour kept as-is; confirm it is intended.
        /// </remarks>
        private async Task<List<string>> ProcessResponse(HttpResponseMessage response)
        {
            var html = await response.Content.ReadAsStringAsync();
            if (string.IsNullOrWhiteSpace(html))
            {
                return new List<string>();
            }

            var doc = new HtmlDocument();
            doc.LoadHtml(html);
            if (doc.DocumentNode == null)
            {
                return new List<string>();
            }

            var anchors = doc.DocumentNode.SelectNodes("//a");
            if (anchors == null || anchors.Count == 0)
            {
                return new List<string>();
            }

            var mainDomain = _baseUrl.UrlGetDomain();
            var nextBatch  = new List<string>();

            foreach (var anchor in anchors)
            {
                var url = anchor.GetAttributeValue("href", string.Empty);
                if (url.IsNullOrEmpty())
                {
                    continue;
                }

                if (url.StartsWith("//", StringComparison.Ordinal))
                {
                    // Protocol-relative link ("//host/path"): reuse the base URL's scheme.
                    // Uri.Scheme carries no ":" separator, so it has to be added explicitly;
                    // without it the result was "http//host/path".
                    // (assumes ToUri() returns a System.Uri - confirm against the extension)
                    url = _baseUrl.ToUri().Scheme + ":" + url;
                }
                else if (url.StartsWith("/", StringComparison.Ordinal))
                {
                    // Root-relative link ("/path"): resolve it against the base URL.
                    // This also covers "/www...." links; the former dedicated branch for
                    // them could never be reached and has been removed.
                    url = (_baseUrl + url).UrlFixUrl();
                }

                // Only links on the same domain as the base URL are followed.
                if (url.UrlGetDomain() != mainDomain)
                {
                    continue;
                }

                // Skip pages that already have a result or are already queued.
                if (_results.Any(r => r.Url.Equals(url, StringComparison.InvariantCultureIgnoreCase)))
                {
                    continue;
                }

                if (_crawlBag.Any(queued => queued.Equals(url, StringComparison.InvariantCultureIgnoreCase)))
                {
                    continue;
                }

                nextBatch.Add(url);
                _crawlBag.Add(url);
            }

            // Record the current page together with the MD5 of its extracted content.
            var contentParser = new PageContentParser();
            var pageContent   = contentParser.GetPageContnent(doc);
            var pageResult    = new CrawlingPageResult(response.RequestMessage.RequestUri.OriginalString)
            {
                HttpStatusCode = response.StatusCode,
                MD5            = pageContent.Md5Get()
            };

            _results.Add(pageResult);
            return nextBatch;
        }