Beispiel #1
0
 /// <summary>
 /// Creates the event arguments from the given resource information
 /// and link depth, storing both in the instance.
 /// </summary>
 /// <param name="uriInfo">The resource information the event is about.</param>
 /// <param name="depth">The depth associated with the resource.</param>
 internal ProcessingUrlEventArgs(
     DownloadedResourceInformation uriInfo,
     int depth)
 {
     this.depth = depth;
     this.uriInfo = uriInfo;
 }
Beispiel #2
0
        /// <summary>
        /// Check whether a file was already downloaded.
        /// </summary>
        /// <param name="uriInfo">The URI info to look up in the list of
        /// temporarily recorded downloads.</param>
        /// <returns>
        /// <c>false</c> if no matching entry is recorded. Otherwise
        /// <c>true</c> if the entry was added by the current process or was
        /// added less than ten hours ago; for any other entry, whether the
        /// entry reports that its file exists.
        /// </returns>
        public bool HasDownloadedUri(
            DownloadedResourceInformation uriInfo)
        {
            // Search whether exists in list.
            int foundPosition =
                _temporaryDownloadedResourceInfos.IndexOf(uriInfo);

            if (foundPosition < 0)
            {
                return false;
            }

            // Found. Check various attributes.
            DownloadedResourceInformation foundInfo =
                _temporaryDownloadedResourceInfos[foundPosition];

            // Process is IDisposable; release the underlying handle as soon
            // as the ID has been read instead of leaking it on every call.
            int currentProcessId;
            using (Process currentProcess = Process.GetCurrentProcess())
            {
                currentProcessId = currentProcess.Id;
            }

            if (foundInfo.AddedByProcessID == currentProcessId)
            {
                return true;
            }

            if (foundInfo.DateAdded.AddHours(10) > DateTime.Now)
            {
                return true;
            }

            return foundInfo.FileExists;
        }
Beispiel #3
0
        /// <summary>
        /// Add information about a downloaded resource to the list of
        /// temporarily recorded downloads. An equal entry that is already
        /// present is replaced, so the new entry ends up at the end of the
        /// list.
        /// </summary>
        /// <param name="info">The info to record.</param>
        public void AddDownloadedResourceInfo(
            DownloadedResourceInformation info)
        {
            // Remove is a no-op when no equal entry exists, so a preceding
            // Contains check would only scan the list a second time.
            _temporaryDownloadedResourceInfos.Remove(info);

            _temporaryDownloadedResourceInfos.Add(info);
        }
Beispiel #4
0
        /// <summary>
        /// Adds a resource to the list of entries from which processing is
        /// to be continued later, and persists the change. An equal entry
        /// that is already present is replaced, so the new entry ends up at
        /// the end of the list.
        /// </summary>
        /// <param name="resourceInfo">The resource to continue with later.</param>
        public void AddContinueDownloadedResourceInfos(
            DownloadedResourceInformation resourceInfo)
        {
            // Remove is a no-op when no equal entry exists, so a preceding
            // Contains check would only scan the list a second time.
            _continueDownloadedResourceInfos.Remove(resourceInfo);

            _continueDownloadedResourceInfos.Add(resourceInfo);
            Persist();
        }
        /// <summary>
        /// Writes the text of an HTML resource to its local file using the
        /// given encoding. Hyperlinks inside the text are left untouched.
        /// </summary>
        /// <param name="textContent">The text to write.</param>
        /// <param name="encoding">The encoding used for writing the text.</param>
        /// <param name="uriInfo">The URI information from which the
        /// description of the local file is built.</param>
        /// <returns>
        /// The info about the stored data. It is returned even if an
        /// <see cref="IOException"/> or an
        /// <see cref="UnauthorizedAccessException"/> occurred while
        /// storing; such errors are only traced.
        /// </returns>
        public DownloadedResourceInformation StoreHtml(
            string textContent,
            Encoding encoding,
            UriResourceInformation uriInfo)
        {
            DownloadedResourceInformation storedInfo =
                new DownloadedResourceInformation(
                    uriInfo,
                    _settings.Options.DestinationFolderPath);

            try
            {
                // Get rid of a previously stored version of the file.
                if (storedInfo.LocalFilePath.Exists)
                {
                    storedInfo.LocalFilePath.Delete();
                }

                // The destination folder may not exist yet.
                if (!storedInfo.LocalFilePath.Directory.Exists)
                {
                    storedInfo.LocalFilePath.Directory.Create();
                }

                string announcement = string.Format(
                    @"Writing text content to file '{0}'.",
                    storedInfo.LocalFilePath);
                Trace.WriteLine(announcement);

                using (FileStream fileStream = new FileStream(
                           storedInfo.LocalFilePath.FullName,
                           FileMode.Create,
                           FileAccess.Write))
                {
                    using (StreamWriter writer =
                               new StreamWriter(fileStream, encoding))
                    {
                        writer.Write(textContent);
                    }
                }
            }
            catch (IOException ioError)
            {
                // Best effort: trace the failure and carry on.
                string complaint = string.Format(
                    @"Ignoring IO exception while storing HTML file: '{0}'.",
                    ioError.Message);
                Trace.WriteLine(complaint);
            }
            catch (UnauthorizedAccessException accessError)
            {
                // Best effort: trace the failure and carry on.
                string complaint = string.Format(
                    @"Ignoring exception while storing HTML file: '{0}'.",
                    accessError.Message);
                Trace.WriteLine(complaint);
            }

            return storedInfo;
        }
        /// <summary>
        /// Stores a binary resource to the local file system. An existing
        /// local file is deleted first; a new file is only written when
        /// there is content to write.
        /// </summary>
        /// <param name="binaryContent">The bytes to write. When <c>null</c>
        /// or empty, no file is written.</param>
        /// <param name="uriInfo">The URI information from which the
        /// description of the local file is built.</param>
        /// <returns>
        /// The info about the stored data. It is returned even if an
        /// <see cref="IOException"/> or an
        /// <see cref="UnauthorizedAccessException"/> occurred while
        /// storing; such errors are only traced.
        /// </returns>
        public DownloadedResourceInformation StoreBinary(
            byte[] binaryContent,
            UriResourceInformation uriInfo)
        {
            DownloadedResourceInformation result =
                new DownloadedResourceInformation(
                    uriInfo,
                    _settings.Options.DestinationFolderPath);

            try
            {
                if (result.LocalFilePath.Exists)
                {
                    result.LocalFilePath.Delete();
                }

                if (binaryContent != null && binaryContent.Length > 0)
                {
                    // Same as when storing HTML: make sure the destination
                    // folder exists. Without it, OpenWrite fails with a
                    // DirectoryNotFoundException, which is an IOException
                    // and would be swallowed below, silently losing the file.
                    if (!result.LocalFilePath.Directory.Exists)
                    {
                        result.LocalFilePath.Directory.Create();
                    }

                    Trace.WriteLine(
                        string.Format(
                            @"Writing binary content to file '{0}'.",
                            result.LocalFilePath));

                    using (FileStream s = result.LocalFilePath.OpenWrite())
                    {
                        s.Write(binaryContent, 0, binaryContent.Length);
                    }
                }
            }
            catch (IOException x)
            {
                // Best effort: trace the failure and carry on.
                Trace.WriteLine(
                    string.Format(
                        @"Ignoring IO exception while storing binary file: '{0}'.",
                        x.Message));
            }
            catch (UnauthorizedAccessException x)
            {
                // Best effort: trace the failure and carry on.
                Trace.WriteLine(
                    string.Format(
                        @"Ignoring exception while storing binary file: '{0}'.",
                        x.Message));
            }

            return result;
        }
Beispiel #7
0
        /// <summary>
        /// Takes the first entry off the list of resources to continue
        /// with and persists the shortened list.
        /// </summary>
        /// <returns>
        /// The entry that was first in the list, or <c>null</c> if the
        /// list is empty (in which case nothing is persisted).
        /// </returns>
        public DownloadedResourceInformation PopContinueDownloadedResourceInfos()
        {
            // Nothing queued, nothing to hand out.
            if (_continueDownloadedResourceInfos.Count <= 0)
            {
                return null;
            }

            DownloadedResourceInformation firstEntry =
                _continueDownloadedResourceInfos[0];
            _continueDownloadedResourceInfos.RemoveAt(0);

            // Store the list without the entry just taken.
            Persist();

            return firstEntry;
        }
Beispiel #8
0
        /// <summary>
        /// Persist information about a downloaded resource: the matching
        /// entry of the temporary list is moved over to the persistent list
        /// (replacing an equal entry there) and the state is stored.
        /// </summary>
        /// <param name="uriInfo">The URI info to persist.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="uriInfo"/> is <c>null</c>.
        /// </exception>
        public void PersistDownloadedResourceInfo(
            DownloadedResourceInformation uriInfo)
        {
            if (uriInfo == null)
            {
                throw new ArgumentNullException("uriInfo");
            }

            int foundPosition =
                _temporaryDownloadedResourceInfos.IndexOf(uriInfo);

            DownloadedResourceInformation infoToPersist;

            if (foundPosition >= 0)
            {
                infoToPersist =
                    _temporaryDownloadedResourceInfos[foundPosition];
            }
            else
            {
                // IndexOf yields -1 for an unknown entry; indexing with it
                // would fail with an unhelpful out-of-range error. Persist
                // the passed instance instead and leave a trace.
                Trace.WriteLine(
                    string.Format(
                        @"No temporary entry found for '{0}'. Persisting the passed info.",
                        uriInfo.AbsoluteUri));

                infoToPersist = uriInfo;
            }

            // --

            // Move over. Remove is a no-op when no equal entry exists.
            _persistentDownloadedResourceInfos.Remove(infoToPersist);
            _persistentDownloadedResourceInfos.Add(infoToPersist);

            // And store.
            Persist();
        }
Beispiel #9
0
        /// <summary>
        /// Replace URIs inside a given HTML document that was previously
        /// downloaded with the local URIs.
        /// </summary>
        /// <param name="textContent">The HTML text whose links are replaced.</param>
        /// <param name="uriInfo">The URI information of the document; it is
        /// handed to the parser that extracts the links.</param>
        /// <returns>Returns the content text with the replaced links.</returns>
        public string ReplaceLinks(
            string textContent,
            UriResourceInformation uriInfo)
        {
            const RegexOptions replaceOptions =
                RegexOptions.IgnoreCase | RegexOptions.Multiline;

            ResourceParser parser = new ResourceParser(
                _settings,
                uriInfo,
                textContent);

            List <UriResourceInformation> linkInfos =
                parser.ExtractLinks();

            // For remembering duplicates. This is bookkeeping only; it does
            // not influence the returned text.
            Dictionary <string, string> replacedLinks =
                new Dictionary <string, string>();

            // --

            foreach (UriResourceInformation linkInfo in linkInfos)
            {
                if (!linkInfo.WantFollowUri && !linkInfo.IsResourceUri)
                {
                    continue;
                }

                DownloadedResourceInformation dlInfo =
                    new DownloadedResourceInformation(
                        linkInfo,
                        _settings.Options.DestinationFolderPath);

                if (string.IsNullOrEmpty(linkInfo.OriginalUrl))
                {
                    continue;
                }

                string textContentBefore = textContent;

                string link =
                    Regex.Escape(linkInfo.OriginalUrl);

                // In a regex replacement string '$' starts a substitution
                // ("$&", "$_", "$0", ...). Double it so that a file name
                // containing '$' is inserted literally.
                string localTarget =
                    string.Format(@"Resources\{0}", dlInfo.LocalFileName)
                    .Replace("$", "$$");

                // Double-quoted links.
                textContent = Regex.Replace(
                    textContent,
                    string.Format(@"""{0}""", link),
                    string.Format(@"""{0}""", localTarget),
                    replaceOptions);

                // Single-quoted links.
                textContent = Regex.Replace(
                    textContent,
                    string.Format(@"'{0}'", link),
                    string.Format(@"'{0}'", localTarget),
                    replaceOptions);

                // For style-"url(...)"-links.
                textContent = Regex.Replace(
                    textContent,
                    string.Format(@"\(\s*{0}\s*\)", link),
                    string.Format(@"({0})", localTarget),
                    replaceOptions);

                // Some checking.
                // 2007-07-27, Uwe Keim.
                // Remember the link unless it differs from the local name,
                // nothing was replaced, and it was not remembered before.
                if (linkInfo.OriginalUrl == dlInfo.LocalFileName.Name ||
                    textContentBefore != textContent ||
                    replacedLinks.ContainsKey(linkInfo.AbsoluteUri.AbsolutePath))
                {
                    replacedLinks[linkInfo.AbsoluteUri.AbsolutePath] =
                        linkInfo.AbsoluteUri.AbsolutePath;
                }
            }

            // --

            return textContent;
        }
Beispiel #10
0
        /// <summary>
        /// Performs the complete downloading (synchronously).
        /// Does return only when completely finished or when an exception
        /// occurred.
        /// </summary>
        public void Process()
        {
            // Start from the configured URL without its query part.
            string baseUrl =
                _settings.Options.DownloadUri.OriginalString.TrimEnd('/').
                Split('?')[0];

            // If the URL has a path beyond the root, cut off everything
            // from the last slash on.
            if (_settings.Options.DownloadUri.AbsolutePath.IndexOf('/') >= 0 &&
                _settings.Options.DownloadUri.AbsolutePath.Length > 1)
            {
                baseUrl = baseUrl.Substring(0, baseUrl.LastIndexOf('/'));
            }

            // --

            // The URI that is configured to be the start URI.
            Uri baseUri = new Uri(baseUrl, UriKind.Absolute);

            // The initial seed.
            DownloadedResourceInformation seedInfo =
                new DownloadedResourceInformation(
                    _settings.Options,
                    @"/",
                    _settings.Options.DownloadUri,
                    baseUri,
                    _settings.Options.DestinationFolderPath,
                    _settings.Options.DestinationFolderPath,
                    UriType.Content);

            // --

            // Add the first one as the seed, unless there are entries left
            // to continue with.
            if (!_settings.HasContinueDownloadedResourceInfos)
            {
                _settings.AddContinueDownloadedResourceInfos(seedInfo);
            }

            // 2007-07-27, Uwe Keim:
            // Doing a multiple looping, to avoid stack overflows.
            // Since a download-"tree" (i.e. the hierarchy of all downloadable
            // pages) can get _very_ deep, process one part at a time only.
            // The state is already persisted, so we need to set up again at
            // the previous position.
            int index = 0;

            while (_settings.HasContinueDownloadedResourceInfos)
            {
                // Fetch one.
                DownloadedResourceInformation processInfo =
                    _settings.PopContinueDownloadedResourceInfos();

                Trace.WriteLine(
                    string.Format(
                        @"{0}. loop: Starting processing URLs from '{1}'.",
                        index + 1,
                        processInfo.AbsoluteUri.AbsoluteUri));

                // Process the URI, add any continue URIs to start
                // again, later.
                ProcessUrl(processInfo, 0);

                index++;
            }

            // After the loop, index equals the number of the last loop that
            // ran; adding one here would report a loop that never started.
            Trace.WriteLine(
                string.Format(
                    @"{0}. loop: Finished processing URLs from seed '{1}'.",
                    index,
                    _settings.Options.DownloadUri));
        }
Beispiel #11
0
        // ------------------------------------------------------------------
        #endregion

        #region Private methods.
        // ------------------------------------------------------------------

        /// <summary>
        /// Process one single URI: download it, store it locally and, for
        /// content (HTML) URIs, recurse into the links found in it.
        /// Resource URIs are downloaded and stored as binary data.
        /// </summary>
        /// <remarks>
        /// Nothing is downloaded when the configured maximum link depth is
        /// exceeded (checked for ".html" URIs only), when the URI is not
        /// processable, or when it was already downloaded. When the internal
        /// maximum recursion depth is exceeded, the URI is queued to be
        /// continued with later instead.
        /// </remarks>
        /// <param name="uriInfo">The URI info.</param>
        /// <param name="depth">The depth.</param>
        /// <exception cref="StopProcessingException">
        /// The asynchronous background worker has a cancellation pending.
        /// </exception>
        private void ProcessUrl(
            DownloadedResourceInformation uriInfo,
            int depth)
        {
            Trace.WriteLine(
                string.Format(
                    @"Processing URI '{0}', with depth {1}.",
                    uriInfo.AbsoluteUri.AbsoluteUri,
                    depth));

            string ext =
                DownloadedResourceInformation.CorrectFileExtension(
                    DownloadedResourceInformation.TryExtractFileExtension(
                        uriInfo.AbsoluteUri));

            // The configured link depth limits ".html" URIs only; a negative
            // value disables the limit.
            if (ext == ".html" && _settings.Options.MaximumLinkDepth >= 0 &&
                depth > _settings.Options.MaximumLinkDepth)
            {
                Trace.WriteLine(
                    string.Format(
                        @"Depth {1} exceeds maximum configured depth. Ending recursion " +
                        @"at URI '{0}'.",
                        uriInfo.AbsoluteUri.AbsoluteUri,
                        depth));

                return;
            }

            if (depth > _maxDepth)
            {
                Trace.WriteLine(
                    string.Format(
                        @"Depth {1} exceeds maximum allowed recursion depth. " +
                        @"Ending recursion at URI '{0}' to possible continue later.",
                        uriInfo.AbsoluteUri.AbsoluteUri,
                        depth));

                // Add myself to start there later.
                // But only if not yet processed, otherwise we would never finish.
                if (_settings.HasDownloadedUri(uriInfo))
                {
                    Trace.WriteLine(
                        string.Format(
                            @"URI '{0}' was already downloaded. NOT continuing later.",
                            uriInfo.AbsoluteUri.AbsoluteUri));
                }
                else
                {
                    // Queue it in the continue list. Recording it as
                    // downloaded here would mark a URI that was never
                    // fetched as done, and it would never be picked up again.
                    _settings.AddContinueDownloadedResourceInfos(uriInfo);

                    Trace.WriteLine(
                        string.Format(
                            @"Added URI '{0}' to continue later.",
                            uriInfo.AbsoluteUri.AbsoluteUri));
                }

                return;
            }

            // If we are in asynchronous mode, periodically check for stops.
            if (processAsyncBackgroundWorker != null &&
                processAsyncBackgroundWorker.CancellationPending)
            {
                throw new StopProcessingException();
            }

            // --

            // Notify event sinks about this URL.
            if (ProcessingUrl != null)
            {
                ProcessingUrlEventArgs e = new ProcessingUrlEventArgs(
                    uriInfo,
                    depth);

                ProcessingUrl(this, e);
            }

            // --

            if (!uriInfo.IsProcessableUri)
            {
                Trace.WriteLine(
                    string.Format(
                        @"URI '{0}' is not processable. Skipping.",
                        uriInfo.AbsoluteUri.AbsoluteUri));

                return;
            }

            if (_settings.HasDownloadedUri(uriInfo))
            {
                Trace.WriteLine(
                    string.Format(
                        @"URI '{0}' was already downloaded. Skipping.",
                        uriInfo.AbsoluteUri.AbsoluteUri));

                return;
            }

            Trace.WriteLine(
                string.Format(
                    @"URI '{0}' was not already downloaded. Processing.",
                    uriInfo.AbsoluteUri.AbsoluteUri));

            if (uriInfo.LinkType == UriType.Resource)
            {
                Trace.WriteLine(
                    string.Format(
                        @"Processing resource URI '{0}', with depth {1}.",
                        uriInfo.AbsoluteUri.AbsoluteUri,
                        depth));

                byte[] binaryContent;

                ResourceDownloader.DownloadBinary(
                    uriInfo.AbsoluteUri,
                    out binaryContent,
                    _settings.Options);

                ResourceStorer storer =
                    new ResourceStorer(_settings);

                storer.StoreBinary(
                    binaryContent,
                    uriInfo);

                _settings.AddDownloadedResourceInfo(uriInfo);
                _settings.PersistDownloadedResourceInfo(uriInfo);
            }
            else
            {
                Trace.WriteLine(
                    string.Format(
                        @"Processing content URI '{0}', with depth {1}.",
                        uriInfo.AbsoluteUri.AbsoluteUri,
                        depth));

                string   textContent;
                string   encodingName;
                Encoding encoding;
                byte[]   binaryContent;

                ResourceDownloader.DownloadHtml(
                    uriInfo.AbsoluteUri,
                    out textContent,
                    out encodingName,
                    out encoding,
                    out binaryContent,
                    _settings.Options);

                // Extract the links from the original text, before they
                // get rewritten to local ones.
                ResourceParser parser = new ResourceParser(
                    _settings,
                    uriInfo,
                    textContent);

                List <UriResourceInformation> linkInfos =
                    parser.ExtractLinks();

                ResourceRewriter rewriter =
                    new ResourceRewriter(_settings);
                textContent = rewriter.ReplaceLinks(
                    textContent,
                    uriInfo);

                ResourceStorer storer =
                    new ResourceStorer(_settings);

                storer.StoreHtml(
                    textContent,
                    encoding,
                    uriInfo);

                // Add before parsing children, so that links back to this
                // URI are recognized as already downloaded.
                _settings.AddDownloadedResourceInfo(uriInfo);

                foreach (UriResourceInformation linkInfo in linkInfos)
                {
                    DownloadedResourceInformation dlInfo =
                        new DownloadedResourceInformation(
                            linkInfo,
                            uriInfo.LocalFolderPath,
                            uriInfo.LocalBaseFolderPath);

                    // Recurse.
                    ProcessUrl(dlInfo, depth + 1);

                    // Do not return or break immediately if too deep,
                    // because this would omit certain pages at this
                    // recursion level.
                }

                // Persist after completely parsed children.
                _settings.PersistDownloadedResourceInfo(uriInfo);
            }

            Trace.WriteLine(
                string.Format(
                    @"Finished processing URI '{0}'.",
                    uriInfo.AbsoluteUri.AbsoluteUri));
        }