/// <summary>
/// Initializes the event arguments describing a URL that is being processed.
/// </summary>
/// <param name="uriInfo">The resource information of the URL.</param>
/// <param name="depth">The depth value associated with the URL.</param>
internal ProcessingUrlEventArgs(
    DownloadedResourceInformation uriInfo,
    int depth)
{
    // Plain field initialization; the two assignments are independent.
    this.depth = depth;
    this.uriInfo = uriInfo;
}
/// <summary>
/// Checks whether a resource is already considered downloaded.
/// </summary>
/// <remarks>
/// The resource is looked up in the temporary list. A found entry counts
/// as downloaded when it was added by the current process, or when it was
/// added less than ten hours ago; otherwise the entry's
/// <c>FileExists</c> value decides.
/// </remarks>
/// <param name="uriInfo">The URI info to look up.</param>
/// <returns>
/// <c>true</c> if the resource is considered downloaded;
/// otherwise, <c>false</c>.
/// </returns>
public bool HasDownloadedUri(
    DownloadedResourceInformation uriInfo)
{
    // Search whether it exists in the list.
    int foundPosition =
        _temporaryDownloadedResourceInfos.IndexOf(uriInfo);

    if (foundPosition < 0)
    {
        return false;
    }

    // Found. Check various attributes.
    DownloadedResourceInformation foundInfo =
        _temporaryDownloadedResourceInfos[foundPosition];

    // Process.GetCurrentProcess() hands out a new disposable component on
    // every call; dispose it right away instead of leaking one per lookup.
    int currentProcessId;
    using (Process currentProcess = Process.GetCurrentProcess())
    {
        currentProcessId = currentProcess.Id;
    }

    if (foundInfo.AddedByProcessID == currentProcessId)
    {
        return true;
    }

    if (foundInfo.DateAdded.AddHours(10) > DateTime.Now)
    {
        return true;
    }

    return foundInfo.FileExists;
}
/// <summary>
/// Registers a downloaded resource in the temporary list. An entry that
/// compares equal to <paramref name="info"/> is replaced, so the new
/// entry always ends up at the end of the list.
/// </summary>
/// <param name="info">The resource information to register.</param>
public void AddDownloadedResourceInfo(
    DownloadedResourceInformation info)
{
    // Remove is a no-op when no equal entry is present (standard
    // collection semantics), so no prior Contains check is needed.
    _temporaryDownloadedResourceInfos.Remove(info);
    _temporaryDownloadedResourceInfos.Add(info);
}
/// <summary>
/// Adds a resource to the list of resources to continue processing from
/// and persists the state. An entry that compares equal to
/// <paramref name="resourceInfo"/> is replaced, so the new entry always
/// ends up at the end of the list.
/// </summary>
/// <param name="resourceInfo">The resource to continue from later.</param>
public void AddContinueDownloadedResourceInfos(
    DownloadedResourceInformation resourceInfo)
{
    // Remove is a no-op when no equal entry is present (standard
    // collection semantics), so no prior Contains check is needed.
    _continueDownloadedResourceInfos.Remove(resourceInfo);
    _continueDownloadedResourceInfos.Add(resourceInfo);

    Persist();
}
/// <summary>
/// Writes HTML text to its local file, using the given encoding.
/// Hyperlinks inside the text are not touched.
/// </summary>
/// <remarks>
/// An existing file is deleted first and a missing target directory is
/// created. <see cref="IOException"/> and
/// <see cref="UnauthorizedAccessException"/> are traced and ignored, so
/// the returned information does not guarantee that the file was written.
/// </remarks>
/// <param name="textContent">The text to write.</param>
/// <param name="encoding">The encoding used for writing.</param>
/// <param name="uriInfo">The URI information of the resource.</param>
/// <returns>The information about the stored data.</returns>
public DownloadedResourceInformation StoreHtml(
    string textContent,
    Encoding encoding,
    UriResourceInformation uriInfo)
{
    DownloadedResourceInformation stored =
        new DownloadedResourceInformation(
            uriInfo,
            _settings.Options.DestinationFolderPath);

    try
    {
        // Drop any previous copy of the file.
        if (stored.LocalFilePath.Exists)
        {
            stored.LocalFilePath.Delete();
        }

        // Make sure the target folder is there.
        bool folderMissing = !stored.LocalFilePath.Directory.Exists;
        if (folderMissing)
        {
            stored.LocalFilePath.Directory.Create();
        }

        string writingMessage = string.Format(
            @"Writing text content to file '{0}'.",
            stored.LocalFilePath);
        Trace.WriteLine(writingMessage);

        using (FileStream output = new FileStream(
            stored.LocalFilePath.FullName,
            FileMode.Create,
            FileAccess.Write))
        {
            using (StreamWriter writer = new StreamWriter(output, encoding))
            {
                writer.Write(textContent);
            }
        }
    }
    catch (IOException ioError)
    {
        // Best effort: trace and carry on.
        string ioMessage = string.Format(
            @"Ignoring IO exception while storing HTML file: '{0}'.",
            ioError.Message);
        Trace.WriteLine(ioMessage);
    }
    catch (UnauthorizedAccessException accessError)
    {
        // Best effort: trace and carry on.
        string accessMessage = string.Format(
            @"Ignoring exception while storing HTML file: '{0}'.",
            accessError.Message);
        Trace.WriteLine(accessMessage);
    }

    return stored;
}
/// <summary>
/// Stores a binary resource to the local file system.
/// </summary>
/// <remarks>
/// An existing file is deleted first. Nothing is written when
/// <paramref name="binaryContent"/> is <c>null</c> or empty.
/// <see cref="IOException"/> and
/// <see cref="UnauthorizedAccessException"/> are traced and ignored, so
/// the returned information does not guarantee that the file was written.
/// </remarks>
/// <param name="binaryContent">The bytes to write; may be <c>null</c>.</param>
/// <param name="uriInfo">The URI information of the resource.</param>
/// <returns>The information about the stored data.</returns>
public DownloadedResourceInformation StoreBinary(
    byte[] binaryContent,
    UriResourceInformation uriInfo)
{
    DownloadedResourceInformation result =
        new DownloadedResourceInformation(
            uriInfo,
            _settings.Options.DestinationFolderPath);

    try
    {
        if (result.LocalFilePath.Exists)
        {
            result.LocalFilePath.Delete();
        }

        if (binaryContent != null && binaryContent.Length > 0)
        {
            // Create the target folder when it is missing, as StoreHtml
            // does. Without this, OpenWrite fails with a
            // DirectoryNotFoundException (an IOException), which the
            // handler below swallows, and the resource is silently lost.
            if (!result.LocalFilePath.Directory.Exists)
            {
                result.LocalFilePath.Directory.Create();
            }

            Trace.WriteLine(
                string.Format(
                    @"Writing binary content to file '{0}'.",
                    result.LocalFilePath));

            using (FileStream s = result.LocalFilePath.OpenWrite())
            {
                s.Write(binaryContent, 0, binaryContent.Length);
            }
        }
    }
    catch (IOException x)
    {
        // Deliberate best effort: trace and carry on.
        Trace.WriteLine(
            string.Format(
                @"Ignoring IO exception while storing binary file: '{0}'.",
                x.Message));
    }
    catch (UnauthorizedAccessException x)
    {
        // Deliberate best effort: trace and carry on.
        Trace.WriteLine(
            string.Format(
                @"Ignoring exception while storing binary file: '{0}'.",
                x.Message));
    }

    return result;
}
/// <summary>
/// Removes and returns the first entry of the list of resources to
/// continue processing from. The state is persisted after a removal.
/// </summary>
/// <returns>The first entry, or <c>null</c> if the list is empty.</returns>
public DownloadedResourceInformation PopContinueDownloadedResourceInfos()
{
    if (_continueDownloadedResourceInfos.Count > 0)
    {
        DownloadedResourceInformation first =
            _continueDownloadedResourceInfos[0];

        _continueDownloadedResourceInfos.RemoveAt(0);
        Persist();

        return first;
    }

    // Nothing queued.
    return null;
}
/// <summary>
/// Persists information about a downloaded resource: the matching entry
/// of the temporary list is put into the persistent list (replacing an
/// equal entry there) and the state is stored.
/// </summary>
/// <param name="uriInfo">
/// The URI info; an equal entry must already be present in the temporary
/// list.
/// </param>
/// <exception cref="ArgumentOutOfRangeException">
/// No entry equal to <paramref name="uriInfo"/> is present in the
/// temporary list.
/// </exception>
public void PersistDownloadedResourceInfo(
    DownloadedResourceInformation uriInfo)
{
    int foundPosition =
        _temporaryDownloadedResourceInfos.IndexOf(uriInfo);

    // IndexOf yields a negative value for unknown entries. Fail with a
    // meaningful message instead of passing that value to the indexer.
    // The exception type is the one an out-of-range index produces.
    if (foundPosition < 0)
    {
        throw new ArgumentOutOfRangeException(
            "uriInfo",
            "The resource information must be added via AddDownloadedResourceInfo before it can be persisted.");
    }

    DownloadedResourceInformation foundInfo =
        _temporaryDownloadedResourceInfos[foundPosition];

    // --
    // Move over.

    if (_persistentDownloadedResourceInfos.Contains(foundInfo))
    {
        _persistentDownloadedResourceInfos.Remove(foundInfo);
    }

    _persistentDownloadedResourceInfos.Add(foundInfo);

    // And store.
    Persist();
}
/// <summary>
/// Replaces URIs inside a previously downloaded HTML document with
/// references to the local files.
/// </summary>
/// <remarks>
/// For every extracted link that is to be followed or is a resource, the
/// original URL is replaced (case-insensitively) where it appears in
/// double quotes, in single quotes, or in parentheses (style
/// <c>url(...)</c> links). The replacement is <c>Resources\</c> followed
/// by the local file name.
/// </remarks>
/// <param name="textContent">The HTML text to rewrite.</param>
/// <param name="uriInfo">The URI information of the document.</param>
/// <returns>The content text with the replaced links.</returns>
public string ReplaceLinks(
    string textContent,
    UriResourceInformation uriInfo)
{
    ResourceParser parser = new ResourceParser(
        _settings,
        uriInfo,
        textContent);

    List<UriResourceInformation> linkInfos = parser.ExtractLinks();

    // For remembering duplicates.
    Dictionary<string, string> replacedLinks =
        new Dictionary<string, string>();

    // --

    foreach (UriResourceInformation linkInfo in linkInfos)
    {
        if (linkInfo.WantFollowUri || linkInfo.IsResourceUri)
        {
            DownloadedResourceInformation dlInfo =
                new DownloadedResourceInformation(
                    linkInfo,
                    _settings.Options.DestinationFolderPath);

            if (!string.IsNullOrEmpty(linkInfo.OriginalUrl))
            {
                string textContentBefore = textContent;

                string link = Regex.Escape(linkInfo.OriginalUrl);

                // In a Regex replacement pattern "$" introduces a
                // substitution. Escape it as "$$" so that a local file
                // name containing "$" is inserted literally.
                string doubleQuoted = string.Format(
                    @"""Resources\{0}""",
                    dlInfo.LocalFileName).Replace("$", "$$");
                string singleQuoted = string.Format(
                    @"'Resources\{0}'",
                    dlInfo.LocalFileName).Replace("$", "$$");
                string parenthesized = string.Format(
                    @"(Resources\{0})",
                    dlInfo.LocalFileName).Replace("$", "$$");

                textContent = Regex.Replace(
                    textContent,
                    string.Format(@"""{0}""", link),
                    doubleQuoted,
                    RegexOptions.IgnoreCase | RegexOptions.Multiline);

                textContent = Regex.Replace(
                    textContent,
                    string.Format(@"'{0}'", link),
                    singleQuoted,
                    RegexOptions.IgnoreCase | RegexOptions.Multiline);

                // For style-"url(...)"-links.
                textContent = Regex.Replace(
                    textContent,
                    string.Format(@"\(\s*{0}\s*\)", link),
                    parenthesized,
                    RegexOptions.IgnoreCase | RegexOptions.Multiline);

                // Some checking.
                // 2007-07-27, Uwe Keim.
                // Remember the link unless it differs from the local
                // name, nothing was replaced, and it is not yet known.
                if (linkInfo.OriginalUrl == dlInfo.LocalFileName.Name ||
                    textContentBefore != textContent ||
                    replacedLinks.ContainsKey(linkInfo.AbsoluteUri.AbsolutePath))
                {
                    replacedLinks[linkInfo.AbsoluteUri.AbsolutePath] =
                        linkInfo.AbsoluteUri.AbsolutePath;
                }
            }
        }
    }

    // --

    return textContent;
}
/// <summary>
/// Runs the complete download synchronously. Returns only when there is
/// nothing left to continue from, or when an exception occurs.
/// </summary>
/// <remarks>
/// Work is done in several passes rather than one deep recursion: each
/// pass takes one entry from the settings' continue list and processes
/// it starting at depth zero, until that list is empty.
/// </remarks>
public void Process()
{
    // Derive the base URL from the configured download URI: no trailing
    // slashes and no query part.
    string originalString = _settings.Options.DownloadUri.OriginalString;
    string withoutTrailingSlashes = originalString.TrimEnd('/');
    string baseUrl = withoutTrailingSlashes.Split('?')[0];

    string absolutePath = _settings.Options.DownloadUri.AbsolutePath;
    if (absolutePath.IndexOf('/') >= 0 && absolutePath.Length > 1)
    {
        // Cut off everything from the last slash on.
        int lastSlash = baseUrl.LastIndexOf('/');
        baseUrl = baseUrl.Substring(0, lastSlash);
    }

    // The URI that is configured to be the start URI.
    Uri baseUri = new Uri(baseUrl, UriKind.Absolute);

    // The initial seed.
    DownloadedResourceInformation seedInfo =
        new DownloadedResourceInformation(
            _settings.Options,
            @"/",
            _settings.Options.DownloadUri,
            baseUri,
            _settings.Options.DestinationFolderPath,
            _settings.Options.DestinationFolderPath,
            UriType.Content);

    // Seed only when there is nothing to continue from.
    if (!_settings.HasContinueDownloadedResourceInfos)
    {
        _settings.AddContinueDownloadedResourceInfos(seedInfo);
    }

    // 2007-07-27, Uwe Keim:
    // Multiple passes avoid stack overflows, because the hierarchy of
    // downloadable pages can get very deep. The state is persisted, so
    // each pass sets up again at the previous position.
    int index;
    for (index = 0; _settings.HasContinueDownloadedResourceInfos; index++)
    {
        // Fetch one.
        DownloadedResourceInformation processInfo =
            _settings.PopContinueDownloadedResourceInfos();

        string startMessage = string.Format(
            @"{0}. loop: Starting processing URLs from '{1}'.",
            index + 1,
            processInfo.AbsoluteUri.AbsoluteUri);
        Trace.WriteLine(startMessage);

        // Process the URI; any continue URIs are picked up by a later pass.
        ProcessUrl(processInfo, 0);
    }

    string finishedMessage = string.Format(
        @"{0}. loop: Finished processing URLs from seed '{1}'.",
        index + 1,
        _settings.Options.DownloadUri);
    Trace.WriteLine(finishedMessage);
}
// ------------------------------------------------------------------
#endregion

#region Private methods.
// ------------------------------------------------------------------

/// <summary>
/// Processes one single URI: downloads and stores it when it has not been
/// downloaded yet and, for content URIs, recurses into the links found
/// in the downloaded document.
/// </summary>
/// <remarks>
/// Recursion ends without downloading when an ".html" URI exceeds the
/// configured maximum link depth. When <paramref name="depth"/> exceeds
/// the internal maximum recursion depth, the URI is queued in the
/// settings' continue list (unless it was already downloaded) so that
/// processing can continue from it later.
/// </remarks>
/// <param name="uriInfo">The URI info.</param>
/// <param name="depth">The current recursion depth.</param>
/// <exception cref="StopProcessingException">
/// The background worker reports a pending cancellation.
/// </exception>
private void ProcessUrl(
    DownloadedResourceInformation uriInfo,
    int depth)
{
    Trace.WriteLine(
        string.Format(
            @"Processing URI '{0}', with depth {1}.",
            uriInfo.AbsoluteUri.AbsoluteUri,
            depth));

    string ext = DownloadedResourceInformation.CorrectFileExtension(
        DownloadedResourceInformation.TryExtractFileExtension(
            uriInfo.AbsoluteUri));

    // Configured link depth limit (applies to ".html" URIs only).
    if (ext == ".html" &&
        _settings.Options.MaximumLinkDepth >= 0 &&
        depth > _settings.Options.MaximumLinkDepth)
    {
        Trace.WriteLine(
            string.Format(
                @"Depth {1} exceeds maximum configured depth. Ending recursion " +
                @"at URI '{0}'.",
                uriInfo.AbsoluteUri.AbsoluteUri,
                depth));
        return;
    }

    // Internal recursion limit.
    if (depth > _maxDepth)
    {
        Trace.WriteLine(
            string.Format(
                @"Depth {1} exceeds maximum allowed recursion depth. " +
                @"Ending recursion at URI '{0}' to possible continue later.",
                uriInfo.AbsoluteUri.AbsoluteUri,
                depth));

        // Add myself to start there later.
        // But only if not yet processed, otherwise we would never finish.
        if (_settings.HasDownloadedUri(uriInfo))
        {
            Trace.WriteLine(
                string.Format(
                    @"URI '{0}' was already downloaded. NOT continuing later.",
                    uriInfo.AbsoluteUri.AbsoluteUri));
        }
        else
        {
            // Queue in the continue list. Registering the URI as
            // downloaded here would mark it as done without ever
            // fetching it, and nothing would continue from it.
            _settings.AddContinueDownloadedResourceInfos(uriInfo);

            Trace.WriteLine(
                string.Format(
                    @"Added URI '{0}' to continue later.",
                    uriInfo.AbsoluteUri.AbsoluteUri));
        }

        return;
    }

    // If we are in asynchronous mode, periodically check for stops.
    if (processAsyncBackgroundWorker != null)
    {
        if (processAsyncBackgroundWorker.CancellationPending)
        {
            throw new StopProcessingException();
        }
    }

    // --

    // Notify event sinks about this URL.
    if (ProcessingUrl != null)
    {
        ProcessingUrlEventArgs e = new ProcessingUrlEventArgs(
            uriInfo,
            depth);

        ProcessingUrl(this, e);
    }

    // --

    if (!uriInfo.IsProcessableUri)
    {
        Trace.WriteLine(
            string.Format(
                @"URI '{0}' is not processable. Skipping.",
                uriInfo.AbsoluteUri.AbsoluteUri));
        return;
    }

    if (_settings.HasDownloadedUri(uriInfo))
    {
        Trace.WriteLine(
            string.Format(
                @"URI '{0}' was already downloaded. Skipping.",
                uriInfo.AbsoluteUri.AbsoluteUri));
        return;
    }

    Trace.WriteLine(
        string.Format(
            @"URI '{0}' was not already downloaded. Processing.",
            uriInfo.AbsoluteUri.AbsoluteUri));

    if (uriInfo.LinkType == UriType.Resource)
    {
        Trace.WriteLine(
            string.Format(
                @"Processing resource URI '{0}', with depth {1}.",
                uriInfo.AbsoluteUri.AbsoluteUri,
                depth));

        byte[] binaryContent;

        ResourceDownloader.DownloadBinary(
            uriInfo.AbsoluteUri,
            out binaryContent,
            _settings.Options);

        ResourceStorer storer = new ResourceStorer(_settings);

        storer.StoreBinary(
            binaryContent,
            uriInfo);

        _settings.AddDownloadedResourceInfo(uriInfo);
        _settings.PersistDownloadedResourceInfo(uriInfo);
    }
    else
    {
        Trace.WriteLine(
            string.Format(
                @"Processing content URI '{0}', with depth {1}.",
                uriInfo.AbsoluteUri.AbsoluteUri,
                depth));

        string textContent;
        string encodingName;
        Encoding encoding;
        byte[] binaryContent;

        ResourceDownloader.DownloadHtml(
            uriInfo.AbsoluteUri,
            out textContent,
            out encodingName,
            out encoding,
            out binaryContent,
            _settings.Options);

        // Extract the links before they get rewritten to local ones.
        ResourceParser parser = new ResourceParser(
            _settings,
            uriInfo,
            textContent);

        List<UriResourceInformation> linkInfos = parser.ExtractLinks();

        ResourceRewriter rewriter = new ResourceRewriter(_settings);
        textContent = rewriter.ReplaceLinks(
            textContent,
            uriInfo);

        ResourceStorer storer = new ResourceStorer(_settings);

        storer.StoreHtml(
            textContent,
            encoding,
            uriInfo);

        // Add before parsing children, so that links back to this URI
        // are recognized as already downloaded.
        _settings.AddDownloadedResourceInfo(uriInfo);

        foreach (UriResourceInformation linkInfo in linkInfos)
        {
            DownloadedResourceInformation dlInfo =
                new DownloadedResourceInformation(
                    linkInfo,
                    uriInfo.LocalFolderPath,
                    uriInfo.LocalBaseFolderPath);

            // Recurse.
            ProcessUrl(dlInfo, depth + 1);

            // Do not return or break immediately if too deep,
            // because this would omit certain pages at this
            // recursion level.
        }

        // Persist after completely parsed children.
        _settings.PersistDownloadedResourceInfo(uriInfo);
    }

    Trace.WriteLine(
        string.Format(
            @"Finished processing URI '{0}'.",
            uriInfo.AbsoluteUri.AbsoluteUri));
}