/// <summary>
/// Creates a HtmlDocument (WebPage) that references downloaded content.  If the Discovery isn't available locally, a remote (hotlinked) request is made.
/// </summary>
/// <remarks>
/// The source is always loaded into the returned document.  The document's nodes are only passed to
/// <c>QualifyNode</c> when <paramref name="uriQualificationType"/> is <c>Absolute</c> or
/// <c>AbsoluteWhenDownloadedDiscoveryIsUnavailable</c>; for <c>None</c> and <c>Relative</c> the document is returned as loaded.
/// </remarks>
/// <param name = "webPageAbsoluteUri">The web page absolute URI.  Its scheme and authority (host plus any non-default port) form the base that is handed to <c>QualifyNode</c>.</param>
/// <param name = "fullTextIndexType">Passed through unchanged to <c>QualifyNode</c>.</param>
/// <param name = "source">The HTML source to load.</param>
/// <param name = "uriQualificationType">Type of the URI qualification.</param>
/// <param name = "arachnodeDAO">The arachnode DAO.  Passed to <c>QualifyNode</c> and used to record any exception thrown while loading or qualifying.</param>
/// <param name = "prepareForLocalBrowsing">Passed through unchanged to <c>QualifyNode</c>.</param>
/// <returns>The HtmlDocument.  If an exception was thrown it is recorded through <paramref name="arachnodeDAO"/> and the document is returned in whatever state it had reached.</returns>
public override HtmlDocument CreateHtmlDocument(string webPageAbsoluteUri, string fullTextIndexType, string source, UriQualificationType uriQualificationType, IArachnodeDAO arachnodeDAO, bool prepareForLocalBrowsing)
{
    HtmlDocument htmlDocument = new HtmlDocument();

    try
    {
        htmlDocument.LoadHtml(source);

        switch (uriQualificationType)
        {
            case UriQualificationType.None:
            case UriQualificationType.Relative:
                //nothing to qualify...
                break;
            case UriQualificationType.Absolute:
            case UriQualificationType.AbsoluteWhenDownloadedDiscoveryIsUnavailable:
                Uri uri = new Uri(webPageAbsoluteUri);

                //Uri.Authority (unlike Uri.Host) keeps a non-default port, so links on a page served from
                //http://host:8080/ are qualified against http://host:8080/ and not against http://host/...
                string baseAbsoluteUri = uri.Scheme + "://" + uri.Authority + "/";

                QualifyNode(baseAbsoluteUri, fullTextIndexType, htmlDocument.DocumentNode, uriQualificationType, arachnodeDAO, prepareForLocalBrowsing);
                break;
        }
    }
    catch (Exception exception)
    {
        //best effort: the failure is recorded and the document is still returned...
        arachnodeDAO.InsertException(webPageAbsoluteUri, null, exception, false);
    }

    return htmlDocument;
}
/// <summary>
/// Qualifies an HTML node so that the resulting WebPage references downloaded content.  If the Discovery isn't available locally, a remote (hotlinked) request is made.
/// </summary>
/// <param name = "absoluteUri">The absolute URI.</param>
/// <param name = "fullTextIndexType">The full text index type.</param>
/// <param name = "htmlNode">The HTML node.</param>
/// <param name = "uriQualificationType">Type of the URI qualification.</param>
/// <param name = "arachnodeDAO">The arachnode DAO.</param>
/// <param name = "prepareForLocalBrowsing"><c>true</c> to additionally prepare the node for local browsing; the exact effect is defined by the implementation.</param>
public abstract void QualifyNode(string absoluteUri, string fullTextIndexType, HtmlNode htmlNode, UriQualificationType uriQualificationType, IArachnodeDAO arachnodeDAO, bool prepareForLocalBrowsing);
/// <summary>
/// Qualifies the <c>src</c> and <c>href</c> attributes of <paramref name="htmlNode"/> and then recurses into each of its child nodes, so that the WebPage references downloaded content.  If the Discovery isn't available locally, a remote (hotlinked) request is made.
/// </summary>
/// <param name = "absoluteUri">The base URI that relative attribute values are appended to.  Also passed to <c>InsertCrawlRequest</c> when a crawl request is created for a missing Discovery.</param>
/// <param name = "fullTextIndexType">Not read by this method; passed through unchanged on recursion.</param>
/// <param name = "htmlNode">The HTML node.</param>
/// <param name = "uriQualificationType">Type of the URI qualification.  Only <c>AbsoluteWhenDownloadedDiscoveryIsUnavailable</c> causes attributes to be pointed at locally downloaded Discoveries; otherwise attributes receive the absolute URI.</param>
/// <param name = "arachnodeDAO">The arachnode DAO.  Used to insert crawl requests for Discoveries that are not available locally.</param>
/// <param name = "prepareForLocalBrowsing">When <c>true</c>, nodes are wrapped in <c>discoveredDiscovery</c>/<c>undiscoveredDiscovery</c> spans and links to downloaded WebPages are pointed at <c>/Browse.aspx</c>.</param>
public override void QualifyNode(string absoluteUri, string fullTextIndexType, HtmlNode htmlNode, UriQualificationType uriQualificationType, IArachnodeDAO arachnodeDAO, bool prepareForLocalBrowsing)
{
    if (htmlNode.HasAttributes)
    {
        foreach (HtmlAttribute htmlAttribute in htmlNode.Attributes)
        {
            bool isUriAttribute = string.Equals(htmlAttribute.Name, "src", StringComparison.OrdinalIgnoreCase) ||
                                  string.Equals(htmlAttribute.Name, "href", StringComparison.OrdinalIgnoreCase);

            if (isUriAttribute)
            {
                //each attribute is handled in its own method so that finishing early with one attribute
                //never skips the remaining attributes or the recursion into the child nodes below...
                QualifyAttribute(absoluteUri, htmlNode, htmlAttribute, uriQualificationType, arachnodeDAO, prepareForLocalBrowsing);
            }
        }
    }

    //ChildNodes.Count is re-read on every pass because qualifying a child may reassign InnerHtml...
    for (int i = 0; i < htmlNode.ChildNodes.Count; i++)
    {
        HtmlNode childNode = htmlNode.ChildNodes[i];

        QualifyNode(absoluteUri, fullTextIndexType, childNode, uriQualificationType, arachnodeDAO, prepareForLocalBrowsing);
    }
}

/// <summary>
/// Qualifies a single <c>src</c>/<c>href</c> attribute of <paramref name="htmlNode"/>.  Attributes whose value cannot be resolved to a hierarchical absolute URI are left untouched.
/// </summary>
private void QualifyAttribute(string absoluteUri, HtmlNode htmlNode, HtmlAttribute htmlAttribute, UriQualificationType uriQualificationType, IArachnodeDAO arachnodeDAO, bool prepareForLocalBrowsing)
{
    Uri uri;

    if (!TryResolveAttributeUri(absoluteUri, htmlAttribute.Value, out uri))
    {
        return;
    }

    string downloadedFileDiscoveryPath = _discoveryManager.GetDiscoveryPath(ApplicationSettings.DownloadedFilesDirectory, uri.AbsoluteUri, string.Empty);
    string downloadedImageDiscoveryPath = _discoveryManager.GetDiscoveryPath(ApplicationSettings.DownloadedImagesDirectory, uri.AbsoluteUri, string.Empty);
    string downloadedWebPageDiscoveryPath = _discoveryManager.GetDiscoveryPath(ApplicationSettings.DownloadedWebPagesDirectory, uri.AbsoluteUri, string.Empty);

    bool doesDownloadedFileDiscoveryExist = false;
    bool doesDownloadedImageDiscoveryExist = false;
    bool doesDownloadedWebPageDiscoveryExist = false;

    if (prepareForLocalBrowsing)
    {
        //NOTE(review): the InnerHtml assignments below presumably re-parse the markup and replace the existing
        //child nodes (siblings of htmlNode included) with new instances, which would leave htmlNode detached
        //before its attribute is rewritten further down - confirm this is intended...
        if (htmlNode.Name == "applet" || htmlNode.Name == "embed" || htmlNode.Name == "img")
        {
            if (!_discoveryManager.DoesDiscoveryExist(downloadedImageDiscoveryPath))
            {
                htmlNode.ParentNode.InnerHtml = "<span class=\"undiscoveredDiscovery\">" + htmlNode.OuterHtml + "</span>";
            }
            else
            {
                doesDownloadedImageDiscoveryExist = true;

                htmlNode.ParentNode.InnerHtml = "<span class=\"discoveredDiscovery\">" + htmlNode.OuterHtml + "</span>";
            }
        }

        if (htmlNode.Name == "link" || htmlNode.Name == "script")
        {
            //favicons are referenced by <link> but are images... (thus the check in the images path...)
            if (!_discoveryManager.DoesDiscoveryExist(downloadedFileDiscoveryPath) && !_discoveryManager.DoesDiscoveryExist(downloadedImageDiscoveryPath))
            {
                htmlNode.OwnerDocument.DocumentNode.InnerHtml = "<span class=\"undiscoveredDiscovery\">Undiscovered: " + uri.AbsoluteUri + "</span>" + htmlNode.OwnerDocument.DocumentNode.InnerHtml;
            }
            else
            {
                doesDownloadedFileDiscoveryExist = true;
            }
        }

        if (htmlNode.Name == "a")
        {
            if (!_discoveryManager.DoesDiscoveryExist(downloadedWebPageDiscoveryPath))
            {
                htmlNode.InnerHtml = "<span class=\"undiscoveredDiscovery\">" + htmlNode.InnerHtml + "</span>";
            }
            else
            {
                htmlNode.InnerHtml = "<span class=\"discoveredDiscovery\">" + htmlNode.InnerHtml + "</span>";

                doesDownloadedWebPageDiscoveryExist = true;
            }
        }
    }

    if (uriQualificationType == UriQualificationType.AbsoluteWhenDownloadedDiscoveryIsUnavailable)
    {
        string discoveryExtension;

        //downloaded File: reference the local copy (the attribute is left as-is when no extension is found)...
        if (doesDownloadedFileDiscoveryExist || _discoveryManager.DoesDiscoveryExist(downloadedFileDiscoveryPath))
        {
            discoveryExtension = _discoveryManager.GetDiscoveryExtension(downloadedFileDiscoveryPath);

            if (!string.IsNullOrEmpty(discoveryExtension))
            {
                htmlAttribute.Value = _discoveryManager.GetDiscoveryPath(_webSettings.DownloadedFilesVirtualDirectory, downloadedFileDiscoveryPath.Replace(ApplicationSettings.DownloadedFilesDirectory, _webSettings.DownloadedFilesVirtualDirectory) + discoveryExtension);
            }

            return;
        }

        //downloaded Image: reference the local copy (the attribute is left as-is when no extension is found)...
        if (doesDownloadedImageDiscoveryExist || _discoveryManager.DoesDiscoveryExist(downloadedImageDiscoveryPath))
        {
            discoveryExtension = _discoveryManager.GetDiscoveryExtension(downloadedImageDiscoveryPath);

            if (!string.IsNullOrEmpty(discoveryExtension))
            {
                htmlAttribute.Value = _discoveryManager.GetDiscoveryPath(_webSettings.DownloadedImagesVirtualDirectory, downloadedImageDiscoveryPath.Replace(ApplicationSettings.DownloadedImagesDirectory, _webSettings.DownloadedImagesVirtualDirectory) + discoveryExtension);
            }

            return;
        }

        //downloaded WebPage: browse the local copy...
        if (prepareForLocalBrowsing && doesDownloadedWebPageDiscoveryExist)
        {
            //NOTE(review): uri.AbsoluteUri is appended without escaping; a target URI containing '&' or '#'
            //would presumably be truncated when Browse.aspx reads the query string - confirm...
            htmlAttribute.Value = "/Browse.aspx?absoluteUri=" + uri.AbsoluteUri;

            return;
        }

        //nothing is available locally: optionally ask for the Discovery to be crawled, then hotlink below...
        if (_webSettings.CreateCrawlRequestsForMissingFilesAndImages && ApplicationSettings.InsertCrawlRequests)
        {
            arachnodeDAO.InsertCrawlRequest(SqlDateTime.MinValue.Value, null, absoluteUri, uri.AbsoluteUri, 1, 1, (short)UriClassificationType.Host, (short)UriClassificationType.Host, Double.MaxValue, (byte)RenderType.None, (byte)RenderType.None);
        }
    }

    htmlAttribute.Value = uri.AbsoluteUri;
}

/// <summary>
/// Resolves an attribute value against <paramref name="absoluteUri"/> and removes doubled slashes and any trailing slash.  Returns <c>false</c> (with <paramref name="uri"/> set to <c>null</c>) when the value cannot be turned into a hierarchical absolute URI.
/// </summary>
private static bool TryResolveAttributeUri(string absoluteUri, string attributeValue, out Uri uri)
{
    //an unparsable value used to leave 'uri' null and fail on the next dereference, aborting the whole document...
    if (!Uri.TryCreate(attributeValue, UriKind.RelativeOrAbsolute, out uri))
    {
        return false;
    }

    if (!uri.IsAbsoluteUri && !Uri.TryCreate(absoluteUri + uri.OriginalString, UriKind.Absolute, out uri))
    {
        return false;
    }

    string schemePrefix = uri.Scheme + "://";

    //mailto:, javascript:, tel:, data:, etc. have no "//" after the scheme; forcing them through the
    //normalization below would build an invalid URI, so such values are left untouched...
    if (!uri.AbsoluteUri.StartsWith(schemePrefix, StringComparison.Ordinal))
    {
        uri = null;

        return false;
    }

    //remove double "//"...
    string normalizedAbsoluteUri = schemePrefix + uri.AbsoluteUri.Replace(schemePrefix, string.Empty).Replace("//", "/").TrimEnd('/');

    return Uri.TryCreate(normalizedAbsoluteUri, UriKind.Absolute, out uri);
}
/// <summary>
/// Creates a HtmlDocument (WebPage) that references downloaded content.  If the Discovery isn't available locally, a remote (hotlinked) request is made.
/// </summary>
/// <param name = "webPageAbsoluteUri">The web page absolute URI.</param>
/// <param name = "fullTextIndexType">The full text index type.</param>
/// <param name = "source">The source.</param>
/// <param name = "uriQualificationType">Type of the URI qualification.</param>
/// <param name = "arachnodeDAO">The arachnode DAO.</param>
/// <param name = "prepareForLocalBrowsing"><c>true</c> to additionally prepare the document for local browsing; the exact effect is defined by the implementation.</param>
/// <returns>The HtmlDocument created for the web page.</returns>
public abstract HtmlDocument CreateHtmlDocument(string webPageAbsoluteUri, string fullTextIndexType, string source, UriQualificationType uriQualificationType, IArachnodeDAO arachnodeDAO, bool prepareForLocalBrowsing);