Ejemplo n.º 1
0
        /// <summary>
        /// Records the redirect target on <paramref name="document"/> and queues that target as a
        /// discovered URL flagged as a redirect.
        /// </summary>
        /// <remarks>
        /// Nothing happens when <c>MustEnqueueUrl</c> rejects <paramref name="url"/>. When the target is
        /// the document itself or already part of its redirections, the document is flagged as a
        /// redirection loop and nothing is queued; <c>RedirectUrl</c> is still set in that case.
        /// </remarks>
        /// <param name="document">Document whose redirect is being followed.</param>
        /// <param name="node">Element handed to <c>GetExcerpt</c> to produce the excerpt.</param>
        /// <param name="url">Redirect target; its fragment is stripped before use.</param>
        /// <param name="ct">Token forwarded to the discovered-URL collection.</param>
        private void EnqueueRedirect(Document document, IElement node, string url, CancellationToken ct)
        {
            var accepted = MustEnqueueUrl(url);
            if (!accepted)
            {
                return;
            }

            // The fragment is dropped from the target before it is stored or queued.
            var target     = new Url(url) { Fragment = null };
            var targetHref = target.Href;

            document.RedirectUrl = targetHref;

            var loops = document.IsSelfOrInRedirections(targetHref);
            if (loops)
            {
                document.IsRedirectionLoop = true;
                return;
            }

            var pending = new DiscoveredUrl();

            pending.Url            = targetHref;
            pending.SourceDocument = document;
            pending.IsRedirect     = true;
            pending.Excerpt        = GetExcerpt(node);

            _discoveredUrls.Add(pending, ct);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Processes one discovered URL: skips it when <c>MustProcess</c> rejects it, attaches a
        /// reference to an already-known document when there is one, and otherwise fetches the
        /// document and registers it in <paramref name="result"/>.
        /// </summary>
        /// <param name="result">Crawl result whose <c>Documents</c> collection is read and updated under a lock.</param>
        /// <param name="discoveredUrl">URL to process, with its optional source document and excerpt.</param>
        /// <param name="ct">Token forwarded to <c>GetAsync</c>.</param>
        /// <returns>A task that completes once the URL has been handled.</returns>
        private async Task ProcessItemAsync(CrawlResult result, DiscoveredUrl discoveredUrl, CancellationToken ct)
        {
            // Let MustProcess decide whether this URL is handled at all.
            if (!MustProcess(result, discoveredUrl))
            {
                return;
            }

            // Already processed: only record the new reference.
            Document existingDocument;

            lock (result.Documents)
            {
                existingDocument = result.Documents.FirstOrDefault(d => discoveredUrl.IsSame(d));
            }

            if (existingDocument != null)
            {
                AddReference(discoveredUrl, existingDocument);
                return;
            }

            var doc = await GetAsync(discoveredUrl, ct).ConfigureAwait(false);

            // The duplicate check and the insertion must happen in the same critical section.
            // Otherwise two workers that fetched the same URL concurrently can both pass the check
            // and both add their document.
            lock (result.Documents)
            {
                existingDocument = result.Documents.FirstOrDefault(d => doc.IsSame(d)); // Another thread has processed the same URL at the same time
                if (existingDocument == null)
                {
                    if (discoveredUrl.SourceDocument != null)
                    {
                        // Attach the initial reference before the document becomes visible to other workers.
                        lock (doc.ReferencedBy)
                        {
                            doc.ReferencedBy.Add(new DocumentRef
                            {
                                SourceDocument = discoveredUrl.SourceDocument,
                                TargetDocument = doc,
                                Excerpt        = discoveredUrl.Excerpt
                            });
                        }
                    }

                    result.Documents.Add(doc);
                }
            }

            if (existingDocument != null)
            {
                // Call AddReference outside the collection lock so its callbacks never run while the lock is held.
                AddReference(discoveredUrl, existingDocument);
                return;
            }

            OnDocumentParsed(doc);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Adds a reference from the source document of <paramref name="discoveredUrl"/> to
        /// <paramref name="document"/>, then calls <c>OnDocumentRefAdded</c> and <c>OnDocumentUpdated</c>.
        /// </summary>
        /// <param name="discoveredUrl">Supplies the source document and the excerpt of the reference.</param>
        /// <param name="document">Target document; its <c>ReferencedBy</c> collection is modified under a lock.</param>
        private void AddReference(DiscoveredUrl discoveredUrl, Document document)
        {
            var reference = new DocumentRef
            {
                SourceDocument = discoveredUrl.SourceDocument,
                TargetDocument = document,
                Excerpt        = discoveredUrl.Excerpt
            };

            var referencedBy = document.ReferencedBy;
            lock (referencedBy)
            {
                referencedBy.Add(reference);
            }

            // Both callbacks run after the lock has been released.
            OnDocumentRefAdded(reference);
            OnDocumentUpdated(document);
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Queues <paramref name="url"/> (without its fragment) as a discovered URL originating from
        /// <paramref name="document"/>. Does nothing when <c>MustEnqueueUrl</c> rejects the URL.
        /// </summary>
        /// <param name="document">Document in which the URL was found.</param>
        /// <param name="url">URL to queue.</param>
        /// <param name="language">Language stored on the discovered URL.</param>
        /// <param name="excerpt">Excerpt stored on the discovered URL.</param>
        /// <param name="ct">Token forwarded to the discovered-URL collection.</param>
        private void Enqueue(Document document, string url, string language, string excerpt, CancellationToken ct)
        {
            var accepted = MustEnqueueUrl(url);
            if (!accepted)
            {
                return;
            }

            // The fragment is dropped before the URL is queued.
            var target = new Url(url) { Fragment = null };

            var pending = new DiscoveredUrl
            {
                Url            = target.Href,
                Language       = language,
                SourceDocument = document,
                Excerpt        = excerpt
            };

            _discoveredUrls.Add(pending, ct);
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Decides whether a discovered URL should be processed.
        /// </summary>
        /// <remarks>
        /// Returns <c>true</c> when any of the following holds, checked in this order:
        /// the URL has no source document (root page); it is a redirect; <c>IsSameHost</c> accepts
        /// the URL itself; <c>IsSameHost</c> accepts the URL of its source document (external link
        /// by one level); or one of the configured <c>Includes</c> matches the URL.
        /// </remarks>
        /// <param name="result">Crawl result passed to <c>IsSameHost</c>.</param>
        /// <param name="discoveredUrl">URL under consideration.</param>
        /// <returns><c>true</c> to process the URL; otherwise <c>false</c>.</returns>
        private bool MustProcess(CrawlResult result, DiscoveredUrl discoveredUrl)
        {
            // Root page, or the target of a redirect we are following.
            if (discoveredUrl.SourceDocument == null || discoveredUrl.IsRedirect)
            {
                return true;
            }

            // Same domain.
            if (IsSameHost(result, discoveredUrl.Url))
            {
                return true;
            }

            // External link by one level: the URL is external, but the page linking to it is not.
            if (IsSameHost(result, discoveredUrl.SourceDocument.Url))
            {
                return true;
            }

            var includes = _options.Includes;
            if (includes == null)
            {
                return false;
            }

            foreach (var pattern in includes)
            {
                if (pattern.IsMatch(discoveredUrl.Url))
                {
                    return true;
                }
            }

            return false;
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Sends a GET request for <paramref name="discoveredUrl"/> and builds the resulting
        /// <c>Document</c>: status, reason phrase, request/response headers and analyser results.
        /// </summary>
        /// <remarks>
        /// For a redirect status the <c>Location</c> target (when present) is passed to
        /// <c>EnqueueRedirect</c> and no content is analysed. Otherwise, when the response has
        /// content, it is handled as HTML (also when no media type is given) or as CSS, the matching
        /// analysers run, and finally every <c>IDocumentAnalyser</c> runs.
        /// Any exception thrown inside the request/analysis section is caught and stored in
        /// <c>ErrorMessage</c> / <c>FullErrorMessage</c> instead of propagating; the catch is
        /// unfiltered, so this applies to every exception type.
        /// </remarks>
        /// <param name="discoveredUrl">URL to fetch; its language, when set, is sent as <c>Accept-Language</c>.</param>
        /// <param name="ct">Token passed to the HTTP call and to the content handlers.</param>
        /// <returns>The populated document, possibly carrying an error message.</returns>
        private async Task <Document> GetAsync(DiscoveredUrl discoveredUrl, CancellationToken ct = default(CancellationToken))
        {
            var doc = new Document
            {
                CrawledOn = DateTime.UtcNow,
                Url       = discoveredUrl.Url,
                Language  = discoveredUrl.Language
            };

            try
            {
                using (var request = new HttpRequestMessage(HttpMethod.Get, doc.Url))
                {
                    request.Headers.Accept.Add(new MediaTypeWithQualityHeaderValue("*/*"));

                    if (discoveredUrl.Language != null)
                    {
                        request.Headers.AcceptLanguage.Add(new StringWithQualityHeaderValue(doc.Language));
                    }

                    using (var response = await _client.SendAsync(request, HttpCompletionOption.ResponseHeadersRead, ct).ConfigureAwait(false))
                    {
                        doc.StatusCode      = response.StatusCode;
                        doc.ReasonPhrase    = response.ReasonPhrase;
                        doc.RequestHeaders  = Combine(CloneHeaders(response.RequestMessage.Headers), CloneHeaders(response.RequestMessage.Content?.Headers));
                        doc.ResponseHeaders = Combine(CloneHeaders(response.Headers), CloneHeaders(response.Content?.Headers));

                        // Redirect: queue the target and stop, the body is not analysed.
                        if (Utilities.IsRedirect(response.StatusCode))
                        {
                            if (response.Headers.TryGetValues("Location", out var locationValues))
                            {
                                var target = locationValues.FirstOrDefault();
                                EnqueueRedirect(doc, GetAbsoluteUrl(new Url(doc.Url), target), ct);
                            }

                            return doc;
                        }

                        // Nothing to analyse without content.
                        if (response.Content == null)
                        {
                            return doc;
                        }

                        var mediaType = response.Content.Headers.ContentType?.MediaType;

                        if (mediaType == null || Utilities.IsHtmlMimeType(mediaType))
                        {
                            var html = await HandleHtmlAsync(doc, response, ct).ConfigureAwait(false);

                            foreach (var htmlAnalyser in _options.Analysers.OfType<IHtmlAnalyser>())
                            {
                                AddAnalyserResult(doc, htmlAnalyser.Analyse(new HtmlAnalyseArgs(doc, html)));
                            }

                            // CSS analysers targeting <style> elements.
                            var styleTagAnalysers = _options.Analysers
                                .OfType<ICssAnalyser>()
                                .Where(a => (a.Targets & CssAnalyserTargets.HtmlStyleTag) == CssAnalyserTargets.HtmlStyleTag)
                                .ToList();

                            if (styleTagAnalysers.Count > 0)
                            {
                                var styleElements = html.QuerySelectorAll<IHtmlStyleElement>("style").ToList();
                                foreach (var cssAnalyser in styleTagAnalysers)
                                {
                                    foreach (var styleElement in styleElements)
                                    {
                                        var args = new CssAnalyseArgs(doc, CssAnalyserTargets.HtmlStyleTag, ParseCss(styleElement.InnerHtml), styleElement);
                                        AddAnalyserResult(doc, cssAnalyser.Analyse(args));
                                    }
                                }
                            }

                            // CSS analysers targeting inline style attributes.
                            var styleAttributeAnalysers = _options.Analysers
                                .OfType<ICssAnalyser>()
                                .Where(a => (a.Targets & CssAnalyserTargets.HtmlStyleAttribute) == CssAnalyserTargets.HtmlStyleAttribute)
                                .ToList();

                            if (styleAttributeAnalysers.Count > 0)
                            {
                                var styledElements = html.QuerySelectorAll("*[style]").ToList();
                                foreach (var cssAnalyser in styleAttributeAnalysers)
                                {
                                    foreach (var styledElement in styledElements)
                                    {
                                        // The inline declaration is turned into a rule so it can be parsed as CSS.
                                        var inlineRule = CreateCssRuleFromInlineStyle(styledElement.GetAttribute("style"));
                                        var args       = new CssAnalyseArgs(doc, CssAnalyserTargets.HtmlStyleAttribute, ParseCss(inlineRule), styledElement);
                                        AddAnalyserResult(doc, cssAnalyser.Analyse(args));
                                    }
                                }
                            }
                        }
                        else if (Utilities.IsCssMimeType(mediaType))
                        {
                            var sheet = await HandleCssAsync(doc, response, ct).ConfigureAwait(false);

                            foreach (var cssAnalyser in _options.Analysers.OfType<ICssAnalyser>())
                            {
                                AddAnalyserResult(doc, cssAnalyser.Analyse(new CssAnalyseArgs(doc, CssAnalyserTargets.StyleSheet, sheet)));
                            }
                        }

                        // Document-level analysers run for every response that has content.
                        foreach (var documentAnalyser in _options.Analysers.OfType<IDocumentAnalyser>())
                        {
                            AddAnalyserResult(doc, documentAnalyser.Analyse(new AnalyseArgs(doc)));
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                // Failures are reported through the document rather than thrown to the caller.
                doc.ErrorMessage     = GetErrorMessage(ex);
                doc.FullErrorMessage = ex.ToString();
            }

            return doc;
        }