/// <summary>
/// Replace URIs inside a given HTML document that was previously
/// downloaded with the local URIs.
/// </summary>
/// <param name="textContent">The HTML text whose links are rewritten.</param>
/// <param name="uriInfo">The URI information of the document; it is handed
/// to the <c>ResourceParser</c> that extracts the links.</param>
/// <returns>Returns the content text with the replaced links.</returns>
public string ReplaceLinks(
    string textContent,
    UriResourceInformation uriInfo )
{
    const RegexOptions options =
        RegexOptions.IgnoreCase | RegexOptions.Multiline;

    ResourceParser parser =
        new ResourceParser( _settings, uriInfo, textContent );
    List<UriResourceInformation> linkInfos = parser.ExtractLinks();

    // For remembering duplicates.
    Dictionary<string, string> replacedLinks =
        new Dictionary<string, string>();

    foreach ( UriResourceInformation linkInfo in linkInfos )
    {
        // Only links that are followed or that are resources get rewritten.
        if ( !( linkInfo.WantFollowUri || linkInfo.IsResourceUri ) )
        {
            continue;
        }

        DownloadedResourceInformation dlInfo =
            new DownloadedResourceInformation(
                linkInfo,
                _settings.Options.DestinationFolderPath );

        if ( string.IsNullOrEmpty( linkInfo.OriginalUrl ) )
        {
            continue;
        }

        string textContentBefore = textContent;

        // The original URL is matched literally, so escape it for the pattern.
        string link = Regex.Escape( linkInfo.OriginalUrl );

        // '$' introduces a substitution in a Regex.Replace replacement string
        // ($0, $&, $_ ...). Double it so that a local file name containing
        // '$' is inserted literally instead of being expanded.
        string localPath =
            string.Format( @"Resources\{0}", dlInfo.LocalFileName )
                .Replace( "$", "$$" );

        // Double-quoted attribute values.
        textContent = Regex.Replace(
            textContent,
            string.Format( @"""{0}""", link ),
            "\"" + localPath + "\"",
            options );

        // Single-quoted attribute values.
        textContent = Regex.Replace(
            textContent,
            string.Format( @"'{0}'", link ),
            "'" + localPath + "'",
            options );

        // For style-"url(...)"-links.
        textContent = Regex.Replace(
            textContent,
            string.Format( @"\(\s*{0}\s*\)", link ),
            "(" + localPath + ")",
            options );

        // Some checking.
        // 2007-07-27, Uwe Keim.
        // A link counts as "not replaced" when its URL differs from the local
        // name, the text did not change and the path was not seen before.
        // That case is deliberately a no-op; every other link is remembered.
        string absolutePath = linkInfo.AbsoluteUri.AbsolutePath;

        bool notReplaced =
            linkInfo.OriginalUrl != dlInfo.LocalFileName.Name &&
            textContentBefore == textContent &&
            !replacedLinks.ContainsKey( absolutePath );

        if ( !notReplaced )
        {
            // Remember.
            replacedLinks[absolutePath] = absolutePath;
        }
    }

    return textContent;
}
/// <summary>
/// Replace URIs inside a given HTML document that was previously
/// downloaded with the local URIs.
/// </summary>
/// <remarks>
/// This is a best-effort operation: any exception raised while parsing or
/// replacing is caught, its message is written to the console, and the text
/// as rewritten up to that point is returned.
/// </remarks>
/// <param name="textContent">The HTML text whose links are rewritten.</param>
/// <param name="uriInfo">The URI information of the document; it is handed
/// to the <c>ResourceParser</c> that extracts the links.</param>
/// <returns>Returns the content text with the replaced links.</returns>
public string ReplaceLinks(
    string textContent,
    UriResourceInformation uriInfo)
{
    const RegexOptions options =
        RegexOptions.IgnoreCase | RegexOptions.Multiline;

    try
    {
        ResourceParser parser =
            new ResourceParser(_settings, uriInfo, textContent);
        List<UriResourceInformation> linkInfos = parser.ExtractLinks();

        // For remembering duplicates.
        Dictionary<string, string> replacedLinks =
            new Dictionary<string, string>();

        foreach (UriResourceInformation linkInfo in linkInfos)
        {
            // Only links that are followed or that are resources get rewritten.
            if (!(linkInfo.WantFollowUri || linkInfo.IsResourceUri))
            {
                continue;
            }

            DownloadedResourceInformation dlInfo =
                new DownloadedResourceInformation(
                    linkInfo,
                    _settings.Options.DestinationFolderPath);

            if (string.IsNullOrEmpty(linkInfo.OriginalUrl))
            {
                continue;
            }

            string textContentBefore = textContent;

            // The original URL is matched literally, so escape it for the pattern.
            string link = Regex.Escape(linkInfo.OriginalUrl);

            // '$' introduces a substitution in a Regex.Replace replacement
            // string ($0, $&, $_ ...). Double it so that a local file name
            // containing '$' is inserted literally instead of being expanded.
            string localName =
                string.Format(@"{0}", dlInfo.LocalFileName)
                    .Replace("$", "$$");

            // Double-quoted attribute values.
            textContent = Regex.Replace(
                textContent,
                string.Format(@"""{0}""", link),
                "\"" + localName + "\"",
                options);

            // Single-quoted attribute values.
            textContent = Regex.Replace(
                textContent,
                string.Format(@"'{0}'", link),
                "'" + localName + "'",
                options);

            // For style-"url(...)"-links.
            textContent = Regex.Replace(
                textContent,
                string.Format(@"\(\s*{0}\s*\)", link),
                "(" + localName + ")",
                options);

            // Some checking.
            // 2007-07-27, Uwe Keim.
            // A link counts as "not replaced" when its URL differs from the
            // local name, the text did not change and the path was not seen
            // before. Raising an error for that case is disabled, so it is a
            // no-op; every other link is remembered.
            string absolutePath = linkInfo.AbsoluteUri.AbsolutePath;

            bool notReplaced =
                linkInfo.OriginalUrl != dlInfo.LocalFileName.Name &&
                textContentBefore == textContent &&
                !replacedLinks.ContainsKey(absolutePath);

            if (!notReplaced)
            {
                // Remember.
                replacedLinks[absolutePath] = absolutePath;
            }
        }
    }
    catch (Exception ex)
    {
        // Satwadhir written code.
        // Deliberately swallow the failure and fall through, so the caller
        // still receives the (possibly partially rewritten) text.
        Console.WriteLine(ex.Message);
    }

    return textContent;
}
// ------------------------------------------------------------------
#endregion

#region Private methods.
// ------------------------------------------------------------------

/// <summary>
/// Processes one single URI: depending on its link type the target is
/// downloaded either as a binary resource or as an HTML document, stored
/// locally and, for documents, every extracted link is processed
/// recursively with an increased depth.
/// </summary>
/// <param name="uriInfo">The URI info.</param>
/// <param name="depth">The depth.</param>
private void ProcessUrl(
    DownloadedResourceInformation uriInfo,
    int depth)
{
    Trace.WriteLine(
        string.Format(
            @"Processing URI '{0}', with depth {1}.",
            uriInfo.AbsoluteUri.AbsoluteUri,
            depth));

    string ext = DownloadedResourceInformation.CorrectFileExtension(
        DownloadedResourceInformation.TryExtractFileExtension(
            uriInfo.AbsoluteUri));

    // Configured link depth: only enforced for ".html" targets and only
    // when the configured maximum is not negative.
    if (ext == ".html" &&
        _settings.Options.MaximumLinkDepth >= 0 &&
        depth > _settings.Options.MaximumLinkDepth)
    {
        Trace.WriteLine(
            string.Format(
                @"Depth {1} exceeds maximum configured depth. Ending recursion " +
                @"at URI '{0}'.",
                uriInfo.AbsoluteUri.AbsoluteUri,
                depth));
        return;
    }

    // Hard recursion limit.
    if (depth > _maxDepth)
    {
        Trace.WriteLine(
            string.Format(
                @"Depth {1} exceeds maximum allowed recursion depth. " +
                @"Ending recursion at URI '{0}' to possible continue later.",
                uriInfo.AbsoluteUri.AbsoluteUri,
                depth));

        // Register this URI to start there later, but only when it has not
        // been processed yet; otherwise the run would never finish.
        if (_settings.HasDownloadedUri(uriInfo))
        {
            Trace.WriteLine(
                string.Format(
                    @"URI '{0}' was already downloaded. NOT continuing later.",
                    uriInfo.AbsoluteUri.AbsoluteUri));
        }
        else
        {
            _settings.AddDownloadedResourceInfo(uriInfo);

            Trace.WriteLine(
                string.Format(
                    @"Added URI '{0}' to continue later.",
                    uriInfo.AbsoluteUri.AbsoluteUri));
        }

        return;
    }

    // In asynchronous mode, check for a pending stop request.
    if (processAsyncBackgroundWorker != null &&
        processAsyncBackgroundWorker.CancellationPending)
    {
        throw new StopProcessingException();
    }

    // Notify event sinks about this URL.
    if (ProcessingUrl != null)
    {
        ProcessingUrlEventArgs args =
            new ProcessingUrlEventArgs(uriInfo, depth);

        ProcessingUrl(this, args);
    }

    if (!uriInfo.IsProcessableUri)
    {
        Trace.WriteLine(
            string.Format(
                @"URI '{0}' is not processable. Skipping.",
                uriInfo.AbsoluteUri.AbsoluteUri));
        return;
    }

    if (_settings.HasDownloadedUri(uriInfo))
    {
        Trace.WriteLine(
            string.Format(
                @"URI '{0}' was already downloaded. Skipping.",
                uriInfo.AbsoluteUri.AbsoluteUri));
        return;
    }

    Trace.WriteLine(
        string.Format(
            @"URI '{0}' was not already downloaded. Processing.",
            uriInfo.AbsoluteUri.AbsoluteUri));

    if (uriInfo.LinkType == UriType.Resource)
    {
        Trace.WriteLine(
            string.Format(
                @"Processing resource URI '{0}', with depth {1}.",
                uriInfo.AbsoluteUri.AbsoluteUri,
                depth));

        byte[] resourceBytes;

        ResourceDownloader.DownloadBinary(
            uriInfo.AbsoluteUri,
            out resourceBytes,
            _settings.Options);

        ResourceStorer binaryStorer = new ResourceStorer(_settings);
        binaryStorer.StoreBinary(resourceBytes, uriInfo);

        _settings.AddDownloadedResourceInfo(uriInfo);
        _settings.PersistDownloadedResourceInfo(uriInfo);
    }
    else
    {
        Trace.WriteLine(
            string.Format(
                @"Processing content URI '{0}', with depth {1}.",
                uriInfo.AbsoluteUri.AbsoluteUri,
                depth));

        string html;
        string htmlEncodingName;
        Encoding htmlEncoding;
        byte[] htmlBytes;

        ResourceDownloader.DownloadHtml(
            uriInfo.AbsoluteUri,
            out html,
            out htmlEncodingName,
            out htmlEncoding,
            out htmlBytes,
            _settings.Options);

        // Extract the links from the text as downloaded, before rewriting.
        ResourceParser linkParser =
            new ResourceParser(_settings, uriInfo, html);
        List<UriResourceInformation> childLinks = linkParser.ExtractLinks();

        ResourceRewriter linkRewriter = new ResourceRewriter(_settings);
        html = linkRewriter.ReplaceLinks(html, uriInfo);

        ResourceStorer htmlStorer = new ResourceStorer(_settings);
        htmlStorer.StoreHtml(html, htmlEncoding, uriInfo);

        // Add before parsing childs.
        _settings.AddDownloadedResourceInfo(uriInfo);

        foreach (UriResourceInformation childLink in childLinks)
        {
            DownloadedResourceInformation childInfo =
                new DownloadedResourceInformation(
                    childLink,
                    uriInfo.LocalFolderPath,
                    uriInfo.LocalBaseFolderPath);

            // Recurse. Do not return or break immediately if too deep,
            // because this would omit certain pages at this recursion
            // level.
            ProcessUrl(childInfo, depth + 1);
        }

        // Persist after completely parsed childs.
        _settings.PersistDownloadedResourceInfo(uriInfo);
    }

    Trace.WriteLine(
        string.Format(
            @"Finished processing URI '{0}'.",
            uriInfo.AbsoluteUri.AbsoluteUri));
}
/// <summary>
/// Replace URIs inside a given HTML document that was previously
/// downloaded with the local URIs.
/// </summary>
/// <param name="textContent">The HTML text whose links are rewritten.</param>
/// <param name="uriInfo">The URI information of the document; it is handed
/// to the <c>ResourceParser</c> that extracts the links.</param>
/// <returns>Returns the content text with the replaced links.</returns>
public string ReplaceLinks(
    string textContent,
    UriResourceInformation uriInfo)
{
    const RegexOptions options =
        RegexOptions.IgnoreCase | RegexOptions.Multiline;

    ResourceParser parser =
        new ResourceParser(_settings, uriInfo, textContent);
    List<UriResourceInformation> linkInfos = parser.ExtractLinks();

    // For remembering duplicates.
    Dictionary<string, string> replacedLinks =
        new Dictionary<string, string>();

    foreach (UriResourceInformation linkInfo in linkInfos)
    {
        // Only links that are followed or that are resources get rewritten.
        if (!(linkInfo.WantFollowUri || linkInfo.IsResourceUri))
        {
            continue;
        }

        DownloadedResourceInformation dlInfo =
            new DownloadedResourceInformation(
                linkInfo,
                _settings.Options.DestinationFolderPath);

        if (string.IsNullOrEmpty(linkInfo.OriginalUrl))
        {
            continue;
        }

        string textContentBefore = textContent;

        // The original URL is matched literally, so escape it for the pattern.
        string link = Regex.Escape(linkInfo.OriginalUrl);

        // '$' introduces a substitution in a Regex.Replace replacement string
        // ($0, $&, $_ ...). Double it so that a local file name containing
        // '$' is inserted literally instead of being expanded.
        string localPath =
            string.Format(@"Resources\{0}", dlInfo.LocalFileName)
                .Replace("$", "$$");

        // Double-quoted attribute values.
        textContent = Regex.Replace(
            textContent,
            string.Format(@"""{0}""", link),
            "\"" + localPath + "\"",
            options);

        // Single-quoted attribute values.
        textContent = Regex.Replace(
            textContent,
            string.Format(@"'{0}'", link),
            "'" + localPath + "'",
            options);

        // For style-"url(...)"-links.
        textContent = Regex.Replace(
            textContent,
            string.Format(@"\(\s*{0}\s*\)", link),
            "(" + localPath + ")",
            options);

        // Some checking.
        // 2007-07-27, Uwe Keim.
        // A link counts as "not replaced" when its URL differs from the local
        // name, the text did not change and the path was not seen before.
        // That case is deliberately a no-op; every other link is remembered.
        string absolutePath = linkInfo.AbsoluteUri.AbsolutePath;

        bool notReplaced =
            linkInfo.OriginalUrl != dlInfo.LocalFileName.Name &&
            textContentBefore == textContent &&
            !replacedLinks.ContainsKey(absolutePath);

        if (!notReplaced)
        {
            // Remember.
            replacedLinks[absolutePath] = absolutePath;
        }
    }

    return textContent;
}
// ------------------------------------------------------------------
#endregion

#region Private methods.
// ------------------------------------------------------------------

/// <summary>
/// Processes one single URI: depending on its link type the target is
/// downloaded either as a binary resource or as an HTML document, stored
/// locally and, for documents, every extracted link is processed
/// recursively with an increased depth.
/// </summary>
/// <param name="uriInfo">The URI info.</param>
/// <param name="depth">The depth.</param>
private void ProcessUrl(
    DownloadedResourceInformation uriInfo,
    int depth )
{
    Trace.WriteLine(
        string.Format(
            @"Processing URI '{0}', with depth {1}.",
            uriInfo.AbsoluteUri.AbsoluteUri,
            depth ) );

    // Configured link depth: only enforced when the configured maximum is
    // greater than zero.
    if ( _settings.Options.MaximumLinkDepth > 0 &&
        depth > _settings.Options.MaximumLinkDepth )
    {
        Trace.WriteLine(
            string.Format(
                @"Depth {1} exceeds maximum configured depth. Ending recursion " +
                @"at URI '{0}'.",
                uriInfo.AbsoluteUri.AbsoluteUri,
                depth ) );
        return;
    }

    // Hard recursion limit.
    if ( depth > _maxDepth )
    {
        Trace.WriteLine(
            string.Format(
                @"Depth {1} exceeds maximum allowed recursion depth. " +
                @"Ending recursion at URI '{0}' to possible continue later.",
                uriInfo.AbsoluteUri.AbsoluteUri,
                depth ) );

        // Register this URI to start there later, but only when it has not
        // been processed yet; otherwise the run would never finish.
        if ( _settings.HasDownloadedUri( uriInfo ) )
        {
            Trace.WriteLine(
                string.Format(
                    @"URI '{0}' was already downloaded. NOT continuing later.",
                    uriInfo.AbsoluteUri.AbsoluteUri ) );
        }
        else
        {
            _settings.AddDownloadedResourceInfo( uriInfo );

            Trace.WriteLine(
                string.Format(
                    @"Added URI '{0}' to continue later.",
                    uriInfo.AbsoluteUri.AbsoluteUri ) );
        }

        return;
    }

    // In asynchronous mode a pending stop request is looked up here, but
    // stopping is currently disabled: the throw below is commented out, so
    // this check has no effect.
    if ( processAsyncBackgroundWorker != null &&
        processAsyncBackgroundWorker.CancellationPending )
    {
        //throw new StopProcessingException();
    }

    // Notify event sinks about this URL.
    if ( ProcessingUrl != null )
    {
        ProcessingUrlEventArgs args =
            new ProcessingUrlEventArgs( uriInfo, depth );

        ProcessingUrl( this, args );
    }

    if ( !uriInfo.IsProcessableUri )
    {
        Trace.WriteLine(
            string.Format(
                @"URI '{0}' is not processable. Skipping.",
                uriInfo.AbsoluteUri.AbsoluteUri ) );
        return;
    }

    if ( _settings.HasDownloadedUri( uriInfo ) )
    {
        Trace.WriteLine(
            string.Format(
                @"URI '{0}' was already downloaded. Skipping.",
                uriInfo.AbsoluteUri.AbsoluteUri ) );
        return;
    }

    Trace.WriteLine(
        string.Format(
            @"URI '{0}' was not already downloaded. Processing.",
            uriInfo.AbsoluteUri.AbsoluteUri ) );

    if ( uriInfo.LinkType == UriType.Resource )
    {
        Trace.WriteLine(
            string.Format(
                @"Processing resource URI '{0}', with depth {1}.",
                uriInfo.AbsoluteUri.AbsoluteUri,
                depth ) );

        byte[] resourceBytes;

        ResourceDownloader.DownloadBinary(
            uriInfo.AbsoluteUri,
            out resourceBytes,
            _settings.Options );

        ResourceStorer binaryStorer = new ResourceStorer( _settings );
        binaryStorer.StoreBinary( resourceBytes, uriInfo );

        _settings.AddDownloadedResourceInfo( uriInfo );
        _settings.PersistDownloadedResourceInfo( uriInfo );
    }
    else
    {
        Trace.WriteLine(
            string.Format(
                @"Processing content URI '{0}', with depth {1}.",
                uriInfo.AbsoluteUri.AbsoluteUri,
                depth ) );

        string html;
        string htmlEncodingName;
        Encoding htmlEncoding;
        byte[] htmlBytes;

        ResourceDownloader.DownloadHtml(
            uriInfo.AbsoluteUri,
            out html,
            out htmlEncodingName,
            out htmlEncoding,
            out htmlBytes,
            _settings.Options );

        // Extract the links from the text as downloaded, before rewriting.
        ResourceParser linkParser =
            new ResourceParser( _settings, uriInfo, html );
        List<UriResourceInformation> childLinks = linkParser.ExtractLinks();

        ResourceRewriter linkRewriter = new ResourceRewriter( _settings );
        html = linkRewriter.ReplaceLinks( html, uriInfo );

        ResourceStorer htmlStorer = new ResourceStorer( _settings );
        htmlStorer.StoreHtml( html, htmlEncoding, uriInfo );

        // Add before parsing childs.
        _settings.AddDownloadedResourceInfo( uriInfo );

        foreach ( UriResourceInformation childLink in childLinks )
        {
            DownloadedResourceInformation childInfo =
                new DownloadedResourceInformation(
                    childLink,
                    uriInfo.LocalFolderPath,
                    uriInfo.LocalBaseFolderPath );

            // Recurse. Do not return or break immediately if too deep,
            // because this would omit certain pages at this recursion
            // level.
            ProcessUrl( childInfo, depth + 1 );
        }

        // Persist after completely parsed childs.
        _settings.PersistDownloadedResourceInfo( uriInfo );
    }

    Trace.WriteLine(
        string.Format(
            @"Finished processing URI '{0}'.",
            uriInfo.AbsoluteUri.AbsoluteUri ) );
}