コード例 #1
0
        /// <summary>
        ///     Parses <paramref name = "source" /> into a HtmlDocument (WebPage) and, when requested, qualifies the "src"/"href" attributes of its nodes via QualifyNode.
        ///     For the absolute qualification types the base address handed to QualifyNode is the root of the web page's site (scheme, host, explicit non-default port and a trailing "/").
        ///     Any exception raised while parsing or qualifying is recorded through <paramref name = "arachnodeDAO" /> and the document is returned in whatever state it reached.
        /// </summary>
        /// <param name = "webPageAbsoluteUri">The web page absolute URI.</param>
        /// <param name = "fullTextIndexType">The full text index type; passed through to QualifyNode unchanged.</param>
        /// <param name = "source">The HTML source to load.</param>
        /// <param name = "uriQualificationType">Type of the URI qualification.  None and Relative leave the parsed document untouched.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO; used here to record exceptions and passed through to QualifyNode.</param>
        /// <param name = "prepareForLocalBrowsing">Passed through to QualifyNode unchanged.</param>
        /// <returns>The parsed (and possibly qualified) document; never null.</returns>
        public override HtmlDocument CreateHtmlDocument(string webPageAbsoluteUri, string fullTextIndexType, string source, UriQualificationType uriQualificationType, IArachnodeDAO arachnodeDAO, bool prepareForLocalBrowsing)
        {
            HtmlDocument htmlDocument = new HtmlDocument();

            try
            {
                htmlDocument.LoadHtml(source);

                switch (uriQualificationType)
                {
                case UriQualificationType.None:
                case UriQualificationType.Relative:
                    break;

                case UriQualificationType.Absolute:
                case UriQualificationType.AbsoluteWhenDownloadedDiscoveryIsUnavailable:
                    Uri uri = new Uri(webPageAbsoluteUri);

                    //Uri.Host never contains the port: keep an explicit, non-default port so that links on e.g. http://host:8080/ are not qualified against http://host/...
                    string authority = uri.IsDefaultPort ? uri.Host : uri.Host + ":" + uri.Port;

                    QualifyNode(uri.Scheme + "://" + authority + "/", fullTextIndexType, htmlDocument.DocumentNode, uriQualificationType, arachnodeDAO, prepareForLocalBrowsing);
                    break;
                }
            }
            catch (Exception exception)
            {
                //best effort: the failure is recorded and the (possibly partially qualified) document is still returned...
                arachnodeDAO.InsertException(webPageAbsoluteUri, null, exception, false);
            }

            return(htmlDocument);
        }
コード例 #2
0
 /// <summary>
 ///     Qualifies the URIs referenced by an HTML node so that a WebPage references downloaded content.  If the Discovery isn't available locally, a remote (hotlinked) request is made.
 ///     The exact rewriting rules are defined by the implementing class.
 /// </summary>
 /// <param name = "absoluteUri">The absolute URI.</param>
 /// <param name = "fullTextIndexType">The full text index type (meaning is implementation-defined).</param>
 /// <param name = "htmlNode">The HTML node.</param>
 /// <param name = "uriQualificationType">Type of the URI qualification.</param>
 /// <param name = "arachnodeDAO">The arachnode DAO.</param>
 /// <param name = "prepareForLocalBrowsing">Whether the node should be prepared for local browsing (meaning is implementation-defined).</param>
 public abstract void QualifyNode(string absoluteUri, string fullTextIndexType, HtmlNode htmlNode, UriQualificationType uriQualificationType, IArachnodeDAO arachnodeDAO, bool prepareForLocalBrowsing);
コード例 #3
0
        /// <summary>
        ///     Qualifies every "src" and "href" attribute of <paramref name = "htmlNode" /> and then recurses into each of its child nodes.
        ///     Attributes whose value cannot be turned into a hierarchical ("scheme://...") absolute URI are left untouched.
        /// </summary>
        /// <param name = "absoluteUri">The absolute URI that relative attribute values are appended to.</param>
        /// <param name = "fullTextIndexType">The full text index type; not used here other than being passed to the recursive calls.</param>
        /// <param name = "htmlNode">The HTML node.</param>
        /// <param name = "uriQualificationType">Type of the URI qualification.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO; used to insert CrawlRequests for undiscovered Discoveries.</param>
        /// <param name = "prepareForLocalBrowsing">When true, nodes are wrapped in "discoveredDiscovery"/"undiscoveredDiscovery" spans and links to downloaded WebPages are pointed at Browse.aspx.</param>
        public override void QualifyNode(string absoluteUri, string fullTextIndexType, HtmlNode htmlNode, UriQualificationType uriQualificationType, IArachnodeDAO arachnodeDAO, bool prepareForLocalBrowsing)
        {
            if (htmlNode.HasAttributes)
            {
                foreach (HtmlAttribute htmlAttribute in htmlNode.Attributes)
                {
                    bool isSrc  = string.Equals(htmlAttribute.Name, "src", StringComparison.OrdinalIgnoreCase);
                    bool isHref = string.Equals(htmlAttribute.Name, "href", StringComparison.OrdinalIgnoreCase);

                    if (isSrc || isHref)
                    {
                        QualifyUriAttribute(absoluteUri, htmlNode, htmlAttribute, uriQualificationType, arachnodeDAO, prepareForLocalBrowsing);
                    }
                }
            }

            //the child nodes are always visited, even when an attribute of this node was resolved to a local Discovery...
            for (int i = 0; i < htmlNode.ChildNodes.Count; i++)
            {
                QualifyNode(absoluteUri, fullTextIndexType, htmlNode.ChildNodes[i], uriQualificationType, arachnodeDAO, prepareForLocalBrowsing);
            }
        }

        /// <summary>
        ///     Qualifies a single "src"/"href" attribute.  The value is made absolute (relative values are appended to <paramref name = "absoluteUri" />) and normalized.
        ///     For AbsoluteWhenDownloadedDiscoveryIsUnavailable the value is pointed at the downloaded File, Image or (when preparing for local browsing) WebPage if one exists; otherwise the absolute URI is written back.
        /// </summary>
        /// <param name = "absoluteUri">The absolute URI that relative attribute values are appended to.</param>
        /// <param name = "htmlNode">The HTML node that owns the attribute.</param>
        /// <param name = "htmlAttribute">The "src" or "href" attribute to qualify.</param>
        /// <param name = "uriQualificationType">Type of the URI qualification.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO.</param>
        /// <param name = "prepareForLocalBrowsing">Whether to decorate nodes for local browsing.</param>
        private void QualifyUriAttribute(string absoluteUri, HtmlNode htmlNode, HtmlAttribute htmlAttribute, UriQualificationType uriQualificationType, IArachnodeDAO arachnodeDAO, bool prepareForLocalBrowsing)
        {
            Uri uri;
            if (!Uri.TryCreate(htmlAttribute.Value, UriKind.RelativeOrAbsolute, out uri))
            {
                //unparsable value: leave the attribute as it is rather than failing the whole document...
                return;
            }

            if (!uri.IsAbsoluteUri && !Uri.TryCreate(absoluteUri + uri.OriginalString, UriKind.Absolute, out uri))
            {
                return;
            }

            string schemePrefix = uri.Scheme + "://";

            if (!uri.AbsoluteUri.StartsWith(schemePrefix, StringComparison.Ordinal))
            {
                //"javascript:", "mailto:", "data:" and similar values have no "scheme://" form; the normalization below cannot represent them, so they are left untouched...
                return;
            }

            //remove double "//"...
            //every occurrence of "scheme://" is stripped before "//" is collapsed and the trailing "/" is trimmed; this exact form is what the Discovery paths below are computed from...
            string normalizedAbsoluteUri = schemePrefix + uri.AbsoluteUri.Replace(schemePrefix, string.Empty).Replace("//", "/").TrimEnd('/');

            if (!Uri.TryCreate(normalizedAbsoluteUri, UriKind.Absolute, out uri))
            {
                return;
            }

            string downloadedFileDiscoveryPath    = _discoveryManager.GetDiscoveryPath(ApplicationSettings.DownloadedFilesDirectory, uri.AbsoluteUri, string.Empty);
            string downloadedImageDiscoveryPath   = _discoveryManager.GetDiscoveryPath(ApplicationSettings.DownloadedImagesDirectory, uri.AbsoluteUri, string.Empty);
            string downloadedWebPageDiscoveryPath = _discoveryManager.GetDiscoveryPath(ApplicationSettings.DownloadedWebPagesDirectory, uri.AbsoluteUri, string.Empty);

            bool doesDownloadedFileDiscoveryExist    = false;
            bool doesDownloadedImageDiscoveryExist   = false;
            bool doesDownloadedWebPageDiscoveryExist = false;

            if (prepareForLocalBrowsing)
            {
                if (htmlNode.Name == "applet" || htmlNode.Name == "embed" || htmlNode.Name == "img")
                {
                    //NOTE(review): assigning ParentNode.InnerHtml re-creates the parent's content from this node alone - confirm that siblings of this node are not meant to survive and that the re-created node still gets qualified...
                    if (!_discoveryManager.DoesDiscoveryExist(downloadedImageDiscoveryPath))
                    {
                        htmlNode.ParentNode.InnerHtml = "<span class=\"undiscoveredDiscovery\">" + htmlNode.OuterHtml + "</span>";
                    }
                    else
                    {
                        doesDownloadedImageDiscoveryExist = true;

                        htmlNode.ParentNode.InnerHtml = "<span class=\"discoveredDiscovery\">" + htmlNode.OuterHtml + "</span>";
                    }
                }

                if (htmlNode.Name == "link" || htmlNode.Name == "script")
                {
                    //favicons are referenced by <link> but are images... (thus the check in the images path...)
                    if (!_discoveryManager.DoesDiscoveryExist(downloadedFileDiscoveryPath) && !_discoveryManager.DoesDiscoveryExist(downloadedImageDiscoveryPath))
                    {
                        //NOTE(review): this re-creates the content of the whole document while it is being traversed - confirm that later changes made through the current nodes still reach the document...
                        htmlNode.OwnerDocument.DocumentNode.InnerHtml = "<span class=\"undiscoveredDiscovery\">Undiscovered: " + uri.AbsoluteUri + "</span>" + htmlNode.OwnerDocument.DocumentNode.InnerHtml;
                    }
                    else
                    {
                        //set when either the File or the Image Discovery exists...
                        doesDownloadedFileDiscoveryExist = true;
                    }
                }

                if (htmlNode.Name == "a")
                {
                    if (!_discoveryManager.DoesDiscoveryExist(downloadedWebPageDiscoveryPath))
                    {
                        htmlNode.InnerHtml = "<span class=\"undiscoveredDiscovery\">" + htmlNode.InnerHtml + "</span>";
                    }
                    else
                    {
                        htmlNode.InnerHtml = "<span class=\"discoveredDiscovery\">" + htmlNode.InnerHtml + "</span>";

                        doesDownloadedWebPageDiscoveryExist = true;
                    }
                }
            }

            if (uriQualificationType == UriQualificationType.AbsoluteWhenDownloadedDiscoveryIsUnavailable)
            {
                string discoveryExtension;

                if (doesDownloadedFileDiscoveryExist || _discoveryManager.DoesDiscoveryExist(downloadedFileDiscoveryPath))
                {
                    discoveryExtension = _discoveryManager.GetDiscoveryExtension(downloadedFileDiscoveryPath);

                    //without an extension the attribute is left as it was found...
                    if (!string.IsNullOrEmpty(discoveryExtension))
                    {
                        htmlAttribute.Value = _discoveryManager.GetDiscoveryPath(_webSettings.DownloadedFilesVirtualDirectory, downloadedFileDiscoveryPath.Replace(ApplicationSettings.DownloadedFilesDirectory, _webSettings.DownloadedFilesVirtualDirectory) + discoveryExtension);
                    }
                    return;
                }

                if (doesDownloadedImageDiscoveryExist || _discoveryManager.DoesDiscoveryExist(downloadedImageDiscoveryPath))
                {
                    discoveryExtension = _discoveryManager.GetDiscoveryExtension(downloadedImageDiscoveryPath);

                    if (!string.IsNullOrEmpty(discoveryExtension))
                    {
                        htmlAttribute.Value = _discoveryManager.GetDiscoveryPath(_webSettings.DownloadedImagesVirtualDirectory, downloadedImageDiscoveryPath.Replace(ApplicationSettings.DownloadedImagesDirectory, _webSettings.DownloadedImagesVirtualDirectory) + discoveryExtension);
                    }
                    return;
                }

                if (prepareForLocalBrowsing && doesDownloadedWebPageDiscoveryExist)
                {
                    htmlAttribute.Value = "/Browse.aspx?absoluteUri=" + uri.AbsoluteUri;
                    return;
                }

                //nothing was found locally: optionally ask for the Discovery to be crawled, then hotlink to the remote resource below...
                if (_webSettings.CreateCrawlRequestsForMissingFilesAndImages && ApplicationSettings.InsertCrawlRequests)
                {
                    arachnodeDAO.InsertCrawlRequest(SqlDateTime.MinValue.Value, null, absoluteUri, uri.AbsoluteUri, 1, 1, (short)UriClassificationType.Host, (short)UriClassificationType.Host, Double.MaxValue, (byte)RenderType.None, (byte)RenderType.None);
                }
            }

            htmlAttribute.Value = uri.AbsoluteUri;
        }
コード例 #4
0
 /// <summary>
 ///     Creates a HtmlDocument (WebPage) that references downloaded content.  If the Discovery isn't available locally, a remote (hotlinked) request is made.
 ///     The exact rewriting rules are defined by the implementing class.
 /// </summary>
 /// <param name = "webPageAbsoluteUri">The web page absolute URI.</param>
 /// <param name = "fullTextIndexType">The full text index type (meaning is implementation-defined).</param>
 /// <param name = "source">The source.</param>
 /// <param name = "uriQualificationType">Type of the URI qualification.</param>
 /// <param name = "arachnodeDAO">The arachnode DAO.</param>
 /// <param name = "prepareForLocalBrowsing">Whether the document should be prepared for local browsing (meaning is implementation-defined).</param>
 /// <returns>The created HtmlDocument.</returns>
 public abstract HtmlDocument CreateHtmlDocument(string webPageAbsoluteUri, string fullTextIndexType, string source, UriQualificationType uriQualificationType, IArachnodeDAO arachnodeDAO, bool prepareForLocalBrowsing);