예제 #1
0
        /// <summary>
        /// Replace URIs inside a given HTML document that was previously 
        /// downloaded with the local URIs.
        /// </summary>
        /// <param name="textContent">The HTML text whose links get rewritten.</param>
        /// <param name="uriInfo">The URI information handed to the link parser
        /// together with the text.</param>
        /// <returns>Returns the content text with the replaced links.</returns>
        public string ReplaceLinks(
            string textContent,
            UriResourceInformation uriInfo)
        {
            ResourceParser parser = new ResourceParser(
                _settings,
                uriInfo,
                textContent );

            List<UriResourceInformation> linkInfos =
                parser.ExtractLinks();

            // For remembering duplicates.
            Dictionary<string, string> replacedLinks =
                new Dictionary<string, string>();

            RegexOptions options =
                RegexOptions.IgnoreCase | RegexOptions.Multiline;

            foreach ( UriResourceInformation linkInfo in linkInfos )
            {
                // Only links that are followed or that are resources get rewritten.
                if ( !linkInfo.WantFollowUri && !linkInfo.IsResourceUri )
                {
                    continue;
                }

                DownloadedResourceInformation dlInfo =
                    new DownloadedResourceInformation(
                    linkInfo,
                    _settings.Options.DestinationFolderPath );

                if ( string.IsNullOrEmpty( linkInfo.OriginalUrl ) )
                {
                    continue;
                }

                string textContentBefore = textContent;

                string link =
                    Regex.Escape( linkInfo.OriginalUrl );

                // The local path is used as a regex *replacement pattern*,
                // where '$' introduces a substitution ($0, $&, $_, ...).
                // Double every '$' so that a file name containing it is
                // inserted literally instead of being expanded.
                string localLink =
                    string.Format( @"Resources\{0}", dlInfo.LocalFileName )
                    .Replace( "$", "$$" );

                // Double-quoted links.
                textContent = Regex.Replace(
                    textContent,
                    string.Format( @"""{0}""", link ),
                    string.Format( @"""{0}""", localLink ),
                    options );

                // Single-quoted links.
                textContent = Regex.Replace(
                    textContent,
                    string.Format( @"'{0}'", link ),
                    string.Format( @"'{0}'", localLink ),
                    options );

                // For style-"url(...)"-links.
                textContent = Regex.Replace(
                    textContent,
                    string.Format( @"\(\s*{0}\s*\)", link ),
                    string.Format( @"({0})", localLink ),
                    options );

                // Some checking.
                // 2007-07-27, Uwe Keim.
                // The link counts as "not replaced" when it differs from the
                // local name, the text did not change, and it was not seen
                // before. No action is taken for that case here; every
                // other link is remembered.
                bool notReplaced =
                    linkInfo.OriginalUrl != dlInfo.LocalFileName.Name &&
                    textContentBefore == textContent &&
                    !replacedLinks.ContainsKey( linkInfo.AbsoluteUri.AbsolutePath );

                if ( !notReplaced )
                {
                    // Remember.
                    replacedLinks[linkInfo.AbsoluteUri.AbsolutePath] =
                        linkInfo.AbsoluteUri.AbsolutePath;
                }
            }

            return textContent;
        }
예제 #2
0
		/// <summary>
		/// Replace URIs inside a given HTML document that was previously 
		/// downloaded with the local URIs.
		/// </summary>
		/// <param name="textContent">The HTML text whose links get rewritten.</param>
		/// <param name="uriInfo">The URI information handed to the link parser
		/// together with the text.</param>
		/// <returns>Returns the content text with the replaced links. If an
		/// exception occurs, its message is written to the console and the
		/// text as rewritten up to that point is returned.</returns>
        public string ReplaceLinks(
            string textContent,
            UriResourceInformation uriInfo)
        {
            try
            {
                ResourceParser parser = new ResourceParser(
                    _settings,
                    uriInfo,
                    textContent);

                List<UriResourceInformation> linkInfos =
                    parser.ExtractLinks();

                // For remembering duplicates.
                Dictionary<string, string> replacedLinks =
                    new Dictionary<string, string>();

                RegexOptions options =
                    RegexOptions.IgnoreCase | RegexOptions.Multiline;

                foreach (UriResourceInformation linkInfo in linkInfos)
                {
                    // Only links that are followed or that are resources get rewritten.
                    if (!linkInfo.WantFollowUri && !linkInfo.IsResourceUri)
                    {
                        continue;
                    }

                    DownloadedResourceInformation dlInfo =
                        new DownloadedResourceInformation(
                        linkInfo,
                        _settings.Options.DestinationFolderPath);

                    if (string.IsNullOrEmpty(linkInfo.OriginalUrl))
                    {
                        continue;
                    }

                    string textContentBefore = textContent;

                    string link =
                        Regex.Escape(linkInfo.OriginalUrl);

                    // The local file name is used as a regex *replacement
                    // pattern*, where '$' introduces a substitution
                    // ($0, $&, $_, ...). Double every '$' so that a name
                    // containing it is inserted literally.
                    string localLink =
                        string.Format(@"{0}", dlInfo.LocalFileName)
                        .Replace("$", "$$");

                    // Double-quoted links.
                    textContent = Regex.Replace(
                        textContent,
                        string.Format(@"""{0}""", link),
                        string.Format(@"""{0}""", localLink),
                        options);

                    // Single-quoted links.
                    textContent = Regex.Replace(
                        textContent,
                        string.Format(@"'{0}'", link),
                        string.Format(@"'{0}'", localLink),
                        options);

                    // For style-"url(...)"-links.
                    textContent = Regex.Replace(
                        textContent,
                        string.Format(@"\(\s*{0}\s*\)", link),
                        string.Format(@"({0})", localLink),
                        options);

                    // Some checking.
                    // 2007-07-27, Uwe Keim.
                    bool notReplaced =
                        linkInfo.OriginalUrl != dlInfo.LocalFileName.Name &&
                        textContentBefore == textContent &&
                        !replacedLinks.ContainsKey(linkInfo.AbsoluteUri.AbsolutePath);

                    if (notReplaced)
                    {
                        // Previously the message was formatted (including the
                        // complete HTML text) and then discarded. Report the
                        // failure instead, without dumping the document.
                        Console.WriteLine(
                            string.Format(
                                @"Failed to replace URI '{0}' with URI '{1}'.",
                                linkInfo.OriginalUrl,
                                dlInfo.LocalFileName));
                    }
                    else
                    {
                        // Remember.
                        replacedLinks[linkInfo.AbsoluteUri.AbsolutePath] =
                            linkInfo.AbsoluteUri.AbsolutePath;
                    }
                }
            }
            catch (Exception ex)
            {
                // Best effort: report the problem and fall through to return
                // whatever has been rewritten so far.
                Console.WriteLine(ex.Message);
            }

            return textContent;
        }
예제 #3
0
        // ------------------------------------------------------------------
        #endregion

        #region Private methods.
        // ------------------------------------------------------------------

        /// <summary>
        /// Processes a single URI. A resource URI is downloaded and stored
        /// as binary data. Any other processable URI is downloaded as HTML,
        /// gets its links rewritten, is stored, and then every link
        /// extracted from it is processed recursively one level deeper.
        /// </summary>
        /// <param name="uriInfo">The URI info.</param>
        /// <param name="depth">The depth.</param>
        /// <exception cref="StopProcessingException">Thrown when a background
        /// worker is set and has a cancellation pending.</exception>
        private void ProcessUrl(
            DownloadedResourceInformation uriInfo,
            int depth)
        {
            Trace.WriteLine(string.Format(
                @"Processing URI '{0}', with depth {1}.",
                uriInfo.AbsoluteUri.AbsoluteUri,
                depth));

            string ext = DownloadedResourceInformation.CorrectFileExtension(
                DownloadedResourceInformation.TryExtractFileExtension(
                    uriInfo.AbsoluteUri));

            // The configured link depth is only applied to ".html" URIs,
            // and only when it is not negative.
            bool beyondConfiguredDepth =
                ext == ".html" &&
                _settings.Options.MaximumLinkDepth >= 0 &&
                depth > _settings.Options.MaximumLinkDepth;

            if (beyondConfiguredDepth)
            {
                Trace.WriteLine(string.Format(
                    @"Depth {1} exceeds maximum configured depth. Ending recursion at URI '{0}'.",
                    uriInfo.AbsoluteUri.AbsoluteUri,
                    depth));
                return;
            }

            if (depth > _maxDepth)
            {
                Trace.WriteLine(string.Format(
                    @"Depth {1} exceeds maximum allowed recursion depth. Ending recursion at URI '{0}' to possible continue later.",
                    uriInfo.AbsoluteUri.AbsoluteUri,
                    depth));

                // Register this URI so it can be picked up later, unless it
                // is already known; otherwise we would never finish.
                if (_settings.HasDownloadedUri(uriInfo))
                {
                    Trace.WriteLine(string.Format(
                        @"URI '{0}' was already downloaded. NOT continuing later.",
                        uriInfo.AbsoluteUri.AbsoluteUri));
                }
                else
                {
                    _settings.AddDownloadedResourceInfo(uriInfo);

                    Trace.WriteLine(string.Format(
                        @"Added URI '{0}' to continue later.",
                        uriInfo.AbsoluteUri.AbsoluteUri));
                }

                return;
            }

            // In asynchronous mode, check for a requested stop.
            if (processAsyncBackgroundWorker != null &&
                processAsyncBackgroundWorker.CancellationPending)
            {
                throw new StopProcessingException();
            }

            // Tell subscribers which URL is about to be processed.
            if (ProcessingUrl != null)
            {
                ProcessingUrl(
                    this,
                    new ProcessingUrlEventArgs(uriInfo, depth));
            }

            if (!uriInfo.IsProcessableUri)
            {
                Trace.WriteLine(string.Format(
                    @"URI '{0}' is not processable. Skipping.",
                    uriInfo.AbsoluteUri.AbsoluteUri));
                return;
            }

            if (_settings.HasDownloadedUri(uriInfo))
            {
                Trace.WriteLine(string.Format(
                    @"URI '{0}' was already downloaded. Skipping.",
                    uriInfo.AbsoluteUri.AbsoluteUri));
                return;
            }

            Trace.WriteLine(string.Format(
                @"URI '{0}' was not already downloaded. Processing.",
                uriInfo.AbsoluteUri.AbsoluteUri));

            if (uriInfo.LinkType == UriType.Resource)
            {
                Trace.WriteLine(string.Format(
                    @"Processing resource URI '{0}', with depth {1}.",
                    uriInfo.AbsoluteUri.AbsoluteUri,
                    depth));

                byte[] resourceBytes;

                ResourceDownloader.DownloadBinary(
                    uriInfo.AbsoluteUri,
                    out resourceBytes,
                    _settings.Options);

                ResourceStorer binaryStorer = new ResourceStorer(_settings);
                binaryStorer.StoreBinary(resourceBytes, uriInfo);

                _settings.AddDownloadedResourceInfo(uriInfo);
                _settings.PersistDownloadedResourceInfo(uriInfo);
            }
            else
            {
                Trace.WriteLine(string.Format(
                    @"Processing content URI '{0}', with depth {1}.",
                    uriInfo.AbsoluteUri.AbsoluteUri,
                    depth));

                string html;
                string htmlEncodingName;
                Encoding htmlEncoding;
                byte[] htmlBytes;

                ResourceDownloader.DownloadHtml(
                    uriInfo.AbsoluteUri,
                    out html,
                    out htmlEncodingName,
                    out htmlEncoding,
                    out htmlBytes,
                    _settings.Options);

                // Links are extracted from the text as downloaded, before
                // it is rewritten.
                ResourceParser linkParser =
                    new ResourceParser(_settings, uriInfo, html);
                List<UriResourceInformation> childLinks =
                    linkParser.ExtractLinks();

                ResourceRewriter linkRewriter = new ResourceRewriter(_settings);
                html = linkRewriter.ReplaceLinks(html, uriInfo);

                ResourceStorer htmlStorer = new ResourceStorer(_settings);
                htmlStorer.StoreHtml(html, htmlEncoding, uriInfo);

                // Add before parsing childs.
                _settings.AddDownloadedResourceInfo(uriInfo);

                foreach (UriResourceInformation childLink in childLinks)
                {
                    // Recurse. Do not return or break when a child is too
                    // deep, because that would omit the remaining pages of
                    // this recursion level.
                    ProcessUrl(
                        new DownloadedResourceInformation(
                            childLink,
                            uriInfo.LocalFolderPath,
                            uriInfo.LocalBaseFolderPath),
                        depth + 1);
                }

                // Persist after completely parsed childs.
                _settings.PersistDownloadedResourceInfo(uriInfo);
            }

            Trace.WriteLine(string.Format(
                @"Finished processing URI '{0}'.",
                uriInfo.AbsoluteUri.AbsoluteUri));
        }
예제 #4
0
        /// <summary>
        /// Replace URIs inside a given HTML document that was previously
        /// downloaded with the local URIs.
        /// </summary>
        /// <param name="textContent">The HTML text whose links get rewritten.</param>
        /// <param name="uriInfo">The URI information handed to the link parser
        /// together with the text.</param>
        /// <returns>Returns the content text with the replaced links.</returns>
        public string ReplaceLinks(
            string textContent,
            UriResourceInformation uriInfo)
        {
            ResourceParser parser = new ResourceParser(
                _settings,
                uriInfo,
                textContent);

            List <UriResourceInformation> linkInfos =
                parser.ExtractLinks();

            // For remembering duplicates.
            Dictionary <string, string> replacedLinks =
                new Dictionary <string, string>();

            RegexOptions options =
                RegexOptions.IgnoreCase | RegexOptions.Multiline;

            foreach (UriResourceInformation linkInfo in linkInfos)
            {
                // Only links that are followed or that are resources get rewritten.
                if (!linkInfo.WantFollowUri && !linkInfo.IsResourceUri)
                {
                    continue;
                }

                DownloadedResourceInformation dlInfo =
                    new DownloadedResourceInformation(
                        linkInfo,
                        _settings.Options.DestinationFolderPath);

                if (string.IsNullOrEmpty(linkInfo.OriginalUrl))
                {
                    continue;
                }

                string textContentBefore = textContent;

                string link =
                    Regex.Escape(linkInfo.OriginalUrl);

                // The local path ends up in a regex *replacement pattern*.
                // There '$' starts a substitution ($0, $&, $_, ...), so each
                // '$' is doubled to make the file name appear literally.
                string localLink =
                    string.Format(@"Resources\{0}", dlInfo.LocalFileName)
                    .Replace("$", "$$");

                // Double-quoted links.
                textContent = Regex.Replace(
                    textContent,
                    string.Format(@"""{0}""", link),
                    string.Format(@"""{0}""", localLink),
                    options);

                // Single-quoted links.
                textContent = Regex.Replace(
                    textContent,
                    string.Format(@"'{0}'", link),
                    string.Format(@"'{0}'", localLink),
                    options);

                // For style-"url(...)"-links.
                textContent = Regex.Replace(
                    textContent,
                    string.Format(@"\(\s*{0}\s*\)", link),
                    string.Format(@"({0})", localLink),
                    options);

                // Some checking.
                // 2007-07-27, Uwe Keim.
                // A link is "not replaced" when it differs from the local
                // name, the text is unchanged, and it was not seen before.
                // That case is left without any action here; all other
                // links are remembered.
                bool notReplaced =
                    linkInfo.OriginalUrl != dlInfo.LocalFileName.Name &&
                    textContentBefore == textContent &&
                    !replacedLinks.ContainsKey(linkInfo.AbsoluteUri.AbsolutePath);

                if (!notReplaced)
                {
                    // Remember.
                    replacedLinks[linkInfo.AbsoluteUri.AbsolutePath] =
                        linkInfo.AbsoluteUri.AbsolutePath;
                }
            }

            return textContent;
        }
예제 #5
0
		// ------------------------------------------------------------------
		#endregion

		#region Private methods.
		// ------------------------------------------------------------------

		/// <summary>
		/// Processes a single URI. A resource URI is downloaded and stored
		/// as binary data. Any other processable URI is downloaded as HTML,
		/// gets its links rewritten, is stored, and then every link
		/// extracted from it is processed recursively one level deeper.
		/// </summary>
		/// <param name="uriInfo">The URI info.</param>
		/// <param name="depth">The depth.</param>
		private void ProcessUrl(
			DownloadedResourceInformation uriInfo,
			int depth )
		{
			Trace.WriteLine( string.Format(
				@"Processing URI '{0}', with depth {1}.",
				uriInfo.AbsoluteUri.AbsoluteUri,
				depth ) );

			// The configured link depth only applies when it is positive.
			bool beyondConfiguredDepth =
				_settings.Options.MaximumLinkDepth > 0 &&
				depth > _settings.Options.MaximumLinkDepth;

			if ( beyondConfiguredDepth )
			{
				Trace.WriteLine( string.Format(
					@"Depth {1} exceeds maximum configured depth. Ending recursion at URI '{0}'.",
					uriInfo.AbsoluteUri.AbsoluteUri,
					depth ) );
				return;
			}

			if ( depth > _maxDepth )
			{
				Trace.WriteLine( string.Format(
					@"Depth {1} exceeds maximum allowed recursion depth. Ending recursion at URI '{0}' to possible continue later.",
					uriInfo.AbsoluteUri.AbsoluteUri,
					depth ) );

				// Register this URI so it can be picked up later, unless it
				// is already known; otherwise we would never finish.
				if ( _settings.HasDownloadedUri( uriInfo ) )
				{
					Trace.WriteLine( string.Format(
						@"URI '{0}' was already downloaded. NOT continuing later.",
						uriInfo.AbsoluteUri.AbsoluteUri ) );
				}
				else
				{
					_settings.AddDownloadedResourceInfo( uriInfo );

					Trace.WriteLine( string.Format(
						@"Added URI '{0}' to continue later.",
						uriInfo.AbsoluteUri.AbsoluteUri ) );
				}

				return;
			}

			// In asynchronous mode the pending-cancellation flag is read here.
			if ( processAsyncBackgroundWorker != null &&
				processAsyncBackgroundWorker.CancellationPending )
			{
				// NOTE(review): the throw below is disabled, so a pending
				// cancellation currently has no effect in this method.
				// Confirm whether that is intended.
				//throw new StopProcessingException();
			}

			// Tell subscribers which URL is about to be processed.
			if ( ProcessingUrl != null )
			{
				ProcessingUrl(
					this,
					new ProcessingUrlEventArgs( uriInfo, depth ) );
			}

			if ( !uriInfo.IsProcessableUri )
			{
				Trace.WriteLine( string.Format(
					@"URI '{0}' is not processable. Skipping.",
					uriInfo.AbsoluteUri.AbsoluteUri ) );
				return;
			}

			if ( _settings.HasDownloadedUri( uriInfo ) )
			{
				Trace.WriteLine( string.Format(
					@"URI '{0}' was already downloaded. Skipping.",
					uriInfo.AbsoluteUri.AbsoluteUri ) );
				return;
			}

			Trace.WriteLine( string.Format(
				@"URI '{0}' was not already downloaded. Processing.",
				uriInfo.AbsoluteUri.AbsoluteUri ) );

			if ( uriInfo.LinkType == UriType.Resource )
			{
				Trace.WriteLine( string.Format(
					@"Processing resource URI '{0}', with depth {1}.",
					uriInfo.AbsoluteUri.AbsoluteUri,
					depth ) );

				byte[] resourceBytes;

				ResourceDownloader.DownloadBinary(
					uriInfo.AbsoluteUri,
					out resourceBytes,
					_settings.Options );

				ResourceStorer binaryStorer = new ResourceStorer( _settings );
				binaryStorer.StoreBinary( resourceBytes, uriInfo );

				_settings.AddDownloadedResourceInfo( uriInfo );
				_settings.PersistDownloadedResourceInfo( uriInfo );
			}
			else
			{
				Trace.WriteLine( string.Format(
					@"Processing content URI '{0}', with depth {1}.",
					uriInfo.AbsoluteUri.AbsoluteUri,
					depth ) );

				string html;
				string htmlEncodingName;
				Encoding htmlEncoding;
				byte[] htmlBytes;

				ResourceDownloader.DownloadHtml(
					uriInfo.AbsoluteUri,
					out html,
					out htmlEncodingName,
					out htmlEncoding,
					out htmlBytes,
					_settings.Options );

				// Links are extracted from the text as downloaded, before
				// it is rewritten.
				ResourceParser linkParser =
					new ResourceParser( _settings, uriInfo, html );
				List<UriResourceInformation> childLinks =
					linkParser.ExtractLinks();

				ResourceRewriter linkRewriter = new ResourceRewriter( _settings );
				html = linkRewriter.ReplaceLinks( html, uriInfo );

				ResourceStorer htmlStorer = new ResourceStorer( _settings );
				htmlStorer.StoreHtml( html, htmlEncoding, uriInfo );

				// Add before parsing childs.
				_settings.AddDownloadedResourceInfo( uriInfo );

				foreach ( UriResourceInformation childLink in childLinks )
				{
					// Recurse. Do not return or break when a child is too
					// deep, because that would omit the remaining pages of
					// this recursion level.
					ProcessUrl(
						new DownloadedResourceInformation(
							childLink,
							uriInfo.LocalFolderPath,
							uriInfo.LocalBaseFolderPath ),
						depth + 1 );
				}

				// Persist after completely parsed childs.
				_settings.PersistDownloadedResourceInfo( uriInfo );
			}

			Trace.WriteLine( string.Format(
				@"Finished processing URI '{0}'.",
				uriInfo.AbsoluteUri.AbsoluteUri ) );
		}