Ejemplo n.º 1
0
        // ------------------------------------------------------------------
        #endregion

        #region Private methods.
        // ------------------------------------------------------------------

        /// <summary>
        /// Re-checks the type of a link so that documents which are not
        /// text (e.g. PDFs or images) are not treated as parsable content.
        /// </summary>
        /// <param name="absoluteUri">The absolute URI of the link.</param>
        /// <param name="linkType">The type the link was originally
        /// classified as.</param>
        /// <returns>
        /// The unchanged <paramref name="linkType"/> if it already is
        /// <c>UriType.Resource</c>, or if the downloaded head text mentions
        /// none of "pdf", "application" or "image". In every other case
        /// (URI not processable, empty head, or one of those markers present)
        /// <c>UriType.Resource</c> is returned.
        /// </returns>
        private UriType CheckVerifyLinkType(
            Uri absoluteUri,
            UriType linkType)
        {
            // Resources never need verification; hand back the original.
            if (linkType == UriType.Resource)
            {
                return linkType;
            }

            // Anything that is not processable is downgraded to a resource.
            if (!DoIsProcessableUri(absoluteUri, linkType))
            {
                return UriType.Resource;
            }

            // Ensure PDFs don't get parsed: inspect the head of the response.
            // NOTE(review): presumably the head carries the content type;
            // confirm against ResourceDownloader.DownloadHead.
            string headText = ResourceDownloader.DownloadHead(
                absoluteUri,
                _options);

            // No head information available: play safe, treat as resource.
            if (string.IsNullOrEmpty(headText))
            {
                return UriType.Resource;
            }

            // Compare case-insensitively by normalizing once.
            headText = headText.ToLowerInvariant();

            bool looksLikeResource =
                headText.Contains(@"pdf") ||
                headText.Contains(@"application") ||
                headText.Contains(@"image");

            if (looksLikeResource)
            {
                return UriType.Resource;
            }

            // Everything that remains is expected to be a text document.
            Debug.Assert(
                headText.Contains(@"text"),
                @"no text document type but marked as content.");

            // The original.
            return linkType;
        }
Ejemplo n.º 2
0
        // ------------------------------------------------------------------
        #endregion

        #region Private methods.
        // ------------------------------------------------------------------

        /// <summary>
        /// Processes one single URI: downloads and stores it and, for
        /// content (non-resource) URIs, recursively processes every link
        /// extracted from the downloaded document.
        /// </summary>
        /// <remarks>
        /// Processing stops early, without downloading anything, when
        /// <list type="bullet">
        /// <item>the URI has the ".html" extension and
        /// <paramref name="depth"/> exceeds the configured maximum link
        /// depth (only checked when that maximum is not negative),</item>
        /// <item><paramref name="depth"/> exceeds the maximum recursion
        /// depth; in that case the URI is registered through
        /// <c>AddDownloadedResourceInfo</c> to continue later, unless it was
        /// already downloaded,</item>
        /// <item>the URI is not processable or was already downloaded.</item>
        /// </list>
        /// A <c>StopProcessingException</c> is thrown when a background
        /// worker is present and has a cancellation pending.
        /// </remarks>
        /// <param name="uriInfo">The URI info.</param>
        /// <param name="depth">The current recursion depth.</param>
        private void ProcessUrl(
            DownloadedResourceInformation uriInfo,
            int depth)
        {
            Trace.WriteLine(
                string.Format(
                    @"Processing URI '{0}', with depth {1}.",
                    uriInfo.AbsoluteUri.AbsoluteUri,
                    depth));

            string extension =
                DownloadedResourceInformation.CorrectFileExtension(
                    DownloadedResourceInformation.TryExtractFileExtension(
                        uriInfo.AbsoluteUri));

            // The configured link depth only limits HTML documents, and only
            // when a non-negative maximum is configured.
            bool exceedsConfiguredDepth =
                extension == ".html" &&
                _settings.Options.MaximumLinkDepth >= 0 &&
                depth > _settings.Options.MaximumLinkDepth;

            if (exceedsConfiguredDepth)
            {
                Trace.WriteLine(
                    string.Format(
                        @"Depth {1} exceeds maximum configured depth. Ending recursion " +
                        @"at URI '{0}'.",
                        uriInfo.AbsoluteUri.AbsoluteUri,
                        depth));
                return;
            }

            if (depth > _maxDepth)
            {
                Trace.WriteLine(
                    string.Format(
                        @"Depth {1} exceeds maximum allowed recursion depth. " +
                        @"Ending recursion at URI '{0}' to possible continue later.",
                        uriInfo.AbsoluteUri.AbsoluteUri,
                        depth));

                // Remember this URI to start there later, but only if it was
                // not processed yet; otherwise we would never finish.
                if (!_settings.HasDownloadedUri(uriInfo))
                {
                    _settings.AddDownloadedResourceInfo(uriInfo);

                    Trace.WriteLine(
                        string.Format(
                            @"Added URI '{0}' to continue later.",
                            uriInfo.AbsoluteUri.AbsoluteUri));
                }
                else
                {
                    Trace.WriteLine(
                        string.Format(
                            @"URI '{0}' was already downloaded. NOT continuing later.",
                            uriInfo.AbsoluteUri.AbsoluteUri));
                }

                return;
            }

            // In asynchronous mode, periodically check for stop requests.
            if (processAsyncBackgroundWorker != null &&
                processAsyncBackgroundWorker.CancellationPending)
            {
                throw new StopProcessingException();
            }

            // Notify event sinks about this URL.
            if (ProcessingUrl != null)
            {
                ProcessingUrlEventArgs args = new ProcessingUrlEventArgs(
                    uriInfo,
                    depth);

                ProcessingUrl(this, args);
            }

            if (!uriInfo.IsProcessableUri)
            {
                Trace.WriteLine(
                    string.Format(
                        @"URI '{0}' is not processable. Skipping.",
                        uriInfo.AbsoluteUri.AbsoluteUri));
                return;
            }

            if (_settings.HasDownloadedUri(uriInfo))
            {
                Trace.WriteLine(
                    string.Format(
                        @"URI '{0}' was already downloaded. Skipping.",
                        uriInfo.AbsoluteUri.AbsoluteUri));
                return;
            }

            Trace.WriteLine(
                string.Format(
                    @"URI '{0}' was not already downloaded. Processing.",
                    uriInfo.AbsoluteUri.AbsoluteUri));

            if (uriInfo.LinkType == UriType.Resource)
            {
                Trace.WriteLine(
                    string.Format(
                        @"Processing resource URI '{0}', with depth {1}.",
                        uriInfo.AbsoluteUri.AbsoluteUri,
                        depth));

                // Resources are downloaded and stored as raw bytes.
                byte[] resourceBytes;

                ResourceDownloader.DownloadBinary(
                    uriInfo.AbsoluteUri,
                    out resourceBytes,
                    _settings.Options);

                new ResourceStorer(_settings).StoreBinary(
                    resourceBytes,
                    uriInfo);

                _settings.AddDownloadedResourceInfo(uriInfo);
                _settings.PersistDownloadedResourceInfo(uriInfo);
            }
            else
            {
                Trace.WriteLine(
                    string.Format(
                        @"Processing content URI '{0}', with depth {1}.",
                        uriInfo.AbsoluteUri.AbsoluteUri,
                        depth));

                // Only the text and the encoding are used below; the other
                // two out values are required by the DownloadHtml call.
                string   html;
                string   charsetName;
                Encoding htmlEncoding;
                byte[]   rawBytes;

                ResourceDownloader.DownloadHtml(
                    uriInfo.AbsoluteUri,
                    out html,
                    out charsetName,
                    out htmlEncoding,
                    out rawBytes,
                    _settings.Options);

                // Extract the links from the text as downloaded, before the
                // links inside it get replaced.
                List <UriResourceInformation> childLinks =
                    new ResourceParser(
                        _settings,
                        uriInfo,
                        html).ExtractLinks();

                html = new ResourceRewriter(_settings).ReplaceLinks(
                    html,
                    uriInfo);

                new ResourceStorer(_settings).StoreHtml(
                    html,
                    htmlEncoding,
                    uriInfo);

                // Add before parsing children.
                _settings.AddDownloadedResourceInfo(uriInfo);

                foreach (UriResourceInformation childLink in childLinks)
                {
                    DownloadedResourceInformation childInfo =
                        new DownloadedResourceInformation(
                            childLink,
                            uriInfo.LocalFolderPath,
                            uriInfo.LocalBaseFolderPath);

                    // Recurse. Do not return or break immediately if too
                    // deep, because this would omit certain pages at this
                    // recursion level.
                    ProcessUrl(childInfo, depth + 1);
                }

                // Persist after all children were completely processed.
                _settings.PersistDownloadedResourceInfo(uriInfo);
            }

            Trace.WriteLine(
                string.Format(
                    @"Finished processing URI '{0}'.",
                    uriInfo.AbsoluteUri.AbsoluteUri));
        }