/// <summary>
/// Queues the target of a redirect found on <paramref name="document"/> for crawling.
/// </summary>
/// <param name="document">Document that issued the redirect; its <c>RedirectUrl</c> is updated.</param>
/// <param name="node">Element the redirect was found on; passed to <c>GetExcerpt</c>.</param>
/// <param name="url">Redirect target; any fragment is stripped before use.</param>
/// <param name="ct">Token forwarded to the discovered-URL queue.</param>
private void EnqueueRedirect(Document document, IElement node, string url, CancellationToken ct)
{
    if (!MustEnqueueUrl(url))
    {
        return;
    }

    // Strip the fragment before recording or queuing the target.
    var target = new Url(url) { Fragment = null };
    var targetHref = target.Href;

    // The redirect target is recorded on the document even when it is not queued below.
    document.RedirectUrl = targetHref;

    if (document.IsSelfOrInRedirections(targetHref))
    {
        // Flag the loop on the document instead of queuing the target again.
        document.IsRedirectionLoop = true;
        return;
    }

    var redirect = new DiscoveredUrl();
    redirect.Url = targetHref;
    redirect.SourceDocument = document;
    redirect.IsRedirect = true;
    redirect.Excerpt = GetExcerpt(node);

    _discoveredUrls.Add(redirect, ct);
}
/// <summary>
/// Processes one discovered URL: fetches it via <c>GetAsync</c> unless an equivalent document is
/// already in <paramref name="result"/>, in which case only a reference is recorded.
/// </summary>
/// <param name="result">Crawl result whose <c>Documents</c> collection is read and updated under a lock.</param>
/// <param name="discoveredUrl">The URL to process, with its source document and excerpt.</param>
/// <param name="ct">Token forwarded to the fetch.</param>
private async Task ProcessItemAsync(CrawlResult result, DiscoveredUrl discoveredUrl, CancellationToken ct)
{
    // Test the domain, same domain as start url or external by 1 level
    if (!MustProcess(result, discoveredUrl))
    {
        return;
    }

    // Already processed
    Document existingDocument;
    lock (result.Documents)
    {
        existingDocument = result.Documents.FirstOrDefault(d => discoveredUrl.IsSame(d));
    }

    if (existingDocument != null)
    {
        AddReference(discoveredUrl, existingDocument);
        return;
    }

    var doc = await GetAsync(discoveredUrl, ct).ConfigureAwait(false);

    // The duplicate check and the insertion must happen in the same critical section:
    // with two separate locks, two workers that fetched the same URL concurrently could
    // both pass the check and both add their document.
    lock (result.Documents)
    {
        existingDocument = result.Documents.FirstOrDefault(d => doc.IsSame(d));
        if (existingDocument == null)
        {
            if (discoveredUrl.SourceDocument != null)
            {
                // doc has not been added to result.Documents yet, so this inner lock is
                // uncontended; it is kept for consistency with AddReference.
                lock (doc.ReferencedBy)
                {
                    doc.ReferencedBy.Add(new DocumentRef
                    {
                        SourceDocument = discoveredUrl.SourceDocument,
                        TargetDocument = doc,
                        Excerpt = discoveredUrl.Excerpt
                    });
                }
            }

            result.Documents.Add(doc);
        }
    }

    if (existingDocument != null)
    {
        // Another thread has processed the same URL at the same time. The reference is
        // recorded outside the Documents lock so event handlers do not run while it is held.
        AddReference(discoveredUrl, existingDocument);
        return;
    }

    OnDocumentParsed(doc);
}
/// <summary>
/// Records that <paramref name="document"/> is referenced by the source of
/// <paramref name="discoveredUrl"/>, then raises the reference-added and document-updated notifications.
/// </summary>
/// <param name="discoveredUrl">Supplies the source document and excerpt of the reference.</param>
/// <param name="document">The referenced (target) document.</param>
private void AddReference(DiscoveredUrl discoveredUrl, Document document)
{
    var reference = new DocumentRef
    {
        SourceDocument = discoveredUrl.SourceDocument,
        TargetDocument = document,
        Excerpt = discoveredUrl.Excerpt
    };

    // ReferencedBy is guarded by locking on the collection itself.
    lock (document.ReferencedBy)
    {
        document.ReferencedBy.Add(reference);
    }

    // Notifications are raised after the lock is released.
    OnDocumentRefAdded(reference);
    OnDocumentUpdated(document);
}
/// <summary>
/// Queues a URL found in <paramref name="document"/> for crawling, if <c>MustEnqueueUrl</c> accepts it.
/// </summary>
/// <param name="document">Document the URL was found in; becomes the source document.</param>
/// <param name="url">URL to queue; any fragment is stripped first.</param>
/// <param name="language">Language stored on the queued entry.</param>
/// <param name="excerpt">Excerpt stored on the queued entry.</param>
/// <param name="ct">Token forwarded to the discovered-URL queue.</param>
private void Enqueue(Document document, string url, string language, string excerpt, CancellationToken ct)
{
    if (!MustEnqueueUrl(url))
    {
        return;
    }

    // Strip the fragment before queuing.
    var withoutFragment = new Url(url) { Fragment = null };

    var entry = new DiscoveredUrl
    {
        Url = withoutFragment.Href,
        Language = language,
        SourceDocument = document,
        Excerpt = excerpt
    };

    _discoveredUrls.Add(entry, ct);
}
/// <summary>
/// Decides whether a discovered URL should be fetched.
/// </summary>
/// <param name="result">Crawl result used for the host comparisons.</param>
/// <param name="discoveredUrl">The candidate URL.</param>
/// <returns>
/// <c>true</c> for the root page, for redirect targets, for URLs on the same host, for URLs
/// linked from a same-host document, and for URLs matching one of the configured includes;
/// otherwise <c>false</c>.
/// </returns>
private bool MustProcess(CrawlResult result, DiscoveredUrl discoveredUrl)
{
    // The root page has no source document; redirect targets are always followed.
    if (discoveredUrl.SourceDocument == null || discoveredUrl.IsRedirect)
    {
        return true;
    }

    // Same host, or an external link exactly one level away from a same-host document.
    // The second check only runs when the first one fails.
    if (IsSameHost(result, discoveredUrl.Url) || IsSameHost(result, discoveredUrl.SourceDocument.Url))
    {
        return true;
    }

    // Last chance: explicitly included URL patterns.
    var includePatterns = _options.Includes;
    if (includePatterns == null)
    {
        return false;
    }

    foreach (var pattern in includePatterns)
    {
        if (pattern.IsMatch(discoveredUrl.Url))
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Fetches <paramref name="discoveredUrl"/> with an HTTP GET and builds the corresponding
/// <c>Document</c>, running the configured analysers on the response.
/// </summary>
/// <param name="discoveredUrl">URL (and optional language) to request.</param>
/// <param name="ct">Token forwarded to the HTTP call and the content handlers.</param>
/// <returns>
/// The populated document. Exceptions thrown while sending the request or processing the
/// response are not propagated; they are stored in <c>ErrorMessage</c> / <c>FullErrorMessage</c>.
/// </returns>
private async Task<Document> GetAsync(DiscoveredUrl discoveredUrl, CancellationToken ct = default(CancellationToken))
{
    var doc = new Document
    {
        CrawledOn = DateTime.UtcNow,
        Url = discoveredUrl.Url,
        Language = discoveredUrl.Language
    };

    try
    {
        using (var request = new HttpRequestMessage(HttpMethod.Get, doc.Url))
        {
            request.Headers.Accept.Add(new MediaTypeWithQualityHeaderValue("*/*"));
            if (discoveredUrl.Language != null)
            {
                request.Headers.AcceptLanguage.Add(new StringWithQualityHeaderValue(doc.Language));
            }

            // Only the headers are awaited here; the content handlers below read the body.
            using (var response = await _client.SendAsync(request, HttpCompletionOption.ResponseHeadersRead, ct).ConfigureAwait(false))
            {
                doc.StatusCode = response.StatusCode;
                doc.ReasonPhrase = response.ReasonPhrase;
                doc.RequestHeaders = Combine(CloneHeaders(response.RequestMessage.Headers), CloneHeaders(response.RequestMessage.Content?.Headers));
                doc.ResponseHeaders = Combine(CloneHeaders(response.Headers), CloneHeaders(response.Content?.Headers));

                // Redirect: queue the Location target (if any) and skip content analysis.
                if (Utilities.IsRedirect(response.StatusCode))
                {
                    if (response.Headers.TryGetValues("Location", out var locationValues))
                    {
                        var target = locationValues.FirstOrDefault();
                        EnqueueRedirect(doc, GetAbsoluteUrl(new Url(doc.Url), target), ct);
                    }

                    return doc;
                }

                if (response.Content == null)
                {
                    return doc;
                }

                var mediaType = response.Content.Headers.ContentType?.MediaType;

                // A missing content type is handled as HTML.
                if (mediaType == null || Utilities.IsHtmlMimeType(mediaType))
                {
                    var html = await HandleHtmlAsync(doc, response, ct).ConfigureAwait(false);

                    foreach (var htmlAnalyser in _options.Analysers.OfType<IHtmlAnalyser>())
                    {
                        AddAnalyserResult(doc, htmlAnalyser.Analyse(new HtmlAnalyseArgs(doc, html)));
                    }

                    // CSS analysers that target <style> elements.
                    var styleTagAnalysers = _options.Analysers
                        .OfType<ICssAnalyser>()
                        .Where(a => (a.Targets & CssAnalyserTargets.HtmlStyleTag) == CssAnalyserTargets.HtmlStyleTag)
                        .ToList();
                    if (styleTagAnalysers.Count > 0)
                    {
                        var styleElements = html.QuerySelectorAll<IHtmlStyleElement>("style").ToList();
                        foreach (var cssAnalyser in styleTagAnalysers)
                        {
                            foreach (var styleElement in styleElements)
                            {
                                AddAnalyserResult(doc, cssAnalyser.Analyse(new CssAnalyseArgs(doc, CssAnalyserTargets.HtmlStyleTag, ParseCss(styleElement.InnerHtml), styleElement)));
                            }
                        }
                    }

                    // CSS analysers that target inline style="" attributes.
                    var styleAttributeAnalysers = _options.Analysers
                        .OfType<ICssAnalyser>()
                        .Where(a => (a.Targets & CssAnalyserTargets.HtmlStyleAttribute) == CssAnalyserTargets.HtmlStyleAttribute)
                        .ToList();
                    if (styleAttributeAnalysers.Count > 0)
                    {
                        var styledElements = html.QuerySelectorAll("*[style]").ToList();
                        foreach (var cssAnalyser in styleAttributeAnalysers)
                        {
                            foreach (var styledElement in styledElements)
                            {
                                var inlineStyle = styledElement.GetAttribute("style");
                                var inlineRule = CreateCssRuleFromInlineStyle(inlineStyle);
                                AddAnalyserResult(doc, cssAnalyser.Analyse(new CssAnalyseArgs(doc, CssAnalyserTargets.HtmlStyleAttribute, ParseCss(inlineRule), styledElement)));
                            }
                        }
                    }
                }
                else if (Utilities.IsCssMimeType(mediaType))
                {
                    var stylesheet = await HandleCssAsync(doc, response, ct).ConfigureAwait(false);
                    foreach (var cssAnalyser in _options.Analysers.OfType<ICssAnalyser>())
                    {
                        AddAnalyserResult(doc, cssAnalyser.Analyse(new CssAnalyseArgs(doc, CssAnalyserTargets.StyleSheet, stylesheet)));
                    }
                }

                // Document-level analysers run for every non-redirect response that has content.
                foreach (var documentAnalyser in _options.Analysers.OfType<IDocumentAnalyser>())
                {
                    AddAnalyserResult(doc, documentAnalyser.Analyse(new AnalyseArgs(doc)));
                }
            }
        }
    }
    catch (Exception ex)
    {
        // Failures are recorded on the document instead of aborting the crawl.
        doc.ErrorMessage = GetErrorMessage(ex);
        doc.FullErrorMessage = ex.ToString();
    }

    return doc;
}