/// <summary>
/// Replace URIs inside a given HTML document that was previously
/// downloaded with the local URIs.
/// </summary>
/// <param name="textContent">The HTML text in which to rewrite links.</param>
/// <param name="uriInfo">Information about the URI the text was downloaded from.</param>
/// <returns>Returns the content text with the replaced links.</returns>
public string ReplaceLinks(
    string textContent,
    UriResourceInformation uriInfo)
{
    ResourceParser parser = new ResourceParser(_settings, uriInfo, textContent);
    List<UriResourceInformation> linkInfos = parser.ExtractLinks();

    // Remembers links that were already replaced (or deliberately skipped),
    // so duplicate occurrences do not trigger the failure check below.
    // NOTE: was a Dictionary<string,string> whose value was never read;
    // a set expresses the intent directly.
    HashSet<string> replacedLinks = new HashSet<string>();

    foreach (UriResourceInformation linkInfo in linkInfos)
    {
        // Only rewrite links we actually follow or download as resources.
        if (!linkInfo.WantFollowUri && !linkInfo.IsResourceUri)
        {
            continue;
        }

        DownloadedResourceInformation dlInfo =
            new DownloadedResourceInformation(
                linkInfo,
                _settings.Options.DestinationFolderPath);

        if (string.IsNullOrEmpty(linkInfo.OriginalUrl))
        {
            continue;
        }

        string textContentBefore = textContent;
        string link = Regex.Escape(linkInfo.OriginalUrl);

        // Replace double-quoted occurrences: "<url>".
        textContent = Regex.Replace(
            textContent,
            string.Format(@"""{0}""", link),
            string.Format(@"""{0}""", dlInfo.LocalFileName),
            RegexOptions.IgnoreCase | RegexOptions.Multiline);

        // Replace single-quoted occurrences: '<url>'.
        textContent = Regex.Replace(
            textContent,
            string.Format(@"'{0}'", link),
            string.Format(@"'{0}'", dlInfo.LocalFileName),
            RegexOptions.IgnoreCase | RegexOptions.Multiline);

        // For style-"url(...)"-links.
        textContent = Regex.Replace(
            textContent,
            string.Format(@"\(\s*{0}\s*\)", link),
            string.Format(@"({0})", dlInfo.LocalFileName),
            RegexOptions.IgnoreCase | RegexOptions.Multiline);

        // Some checking: if the local name differs from the original URL,
        // yet nothing changed and this link was not replaced in an earlier
        // iteration, the replacement silently failed.
        // 2007-07-27, Uwe Keim.
        if (linkInfo.OriginalUrl != dlInfo.LocalFileName.Name &&
            textContentBefore == textContent &&
            !replacedLinks.Contains(linkInfo.AbsoluteUri.AbsolutePath))
        {
            throw new ApplicationException(
                string.Format(
                    @"Failed to replace URI '{0}' with URI '{1}' in HTML text '{2}'.",
                    linkInfo.OriginalUrl,
                    dlInfo.LocalFileName,
                    textContent));
        }
        else
        {
            // Remember.
            replacedLinks.Add(linkInfo.AbsoluteUri.AbsolutePath);
        }
    }

    // --
    return textContent;
}
// ------------------------------------------------------------------
/// <summary>
/// Process one single URI with a document behind (i.e. no
/// resource URI): downloads the document, runs matching collector
/// requests, extracts links and recurses into them.
/// </summary>
/// <param name="uriInfo">The URI info.</param>
/// <param name="depth">The current recursion depth (0 = start URI).</param>
/// <param name="git">Collector requests, passed through unchanged to recursive calls.</param>
/// <param name="folo">Follower rules deciding which URIs are followed at which depth.</param>
private void ProcessUrl(
    DownloadedResourceInformation uriInfo,
    int depth,
    List<iCollector> git,
    List<iFollower> folo)
{
    Console.WriteLine(
        string.Format(
            @"Processing URI '{0}', with depth {1}.",
            uriInfo.AbsoluteUri.AbsoluteUri,
            depth));

    // Check whether the URI is followable according to the follower list.
    // The start URI (depth 0) is always followed.
    bool blnFollow = true;
    if (depth > 0)
    {
        blnFollow = folo.Any(
            f => f.depth == depth &&
                f.pattern.Matches(uriInfo.AbsoluteUri.AbsoluteUri).Count > 0);
    }

    if (_settings.Options.MaximumLinkDepth > -1 &&
        depth > _settings.Options.MaximumLinkDepth)
    {
        Console.WriteLine(
            string.Format(
                @"Depth {1} exceeds maximum configured depth. Ending recursion " +
                @"at URI '{0}'.",
                uriInfo.AbsoluteUri.AbsoluteUri,
                depth));
    }
    else if (!blnFollow)
    {
        // Fixed log message: the old text wrongly claimed a depth problem,
        // while the actual reason is that no follower rule matched.
        Console.WriteLine(
            string.Format(
                @"No follower rule matches URI '{0}' at depth {1}. Not following.",
                uriInfo.AbsoluteUri.AbsoluteUri,
                depth));

        //Fake our way into it.
        //_settings.AddDownloadedResourceInfo(uriInfo);
    }
    else if (depth > _maxDepth)
    {
        Console.WriteLine(
            string.Format(
                @"Depth {1} exceeds maximum allowed recursion depth. " +
                @"Ending recursion at URI '{0}' to possible continue later.",
                uriInfo.AbsoluteUri.AbsoluteUri,
                depth));

        // Add myself to start there later -
        // but only if not yet processed, otherwise we would never finish.
        if (_settings.HasDownloadedUri(uriInfo))
        {
            Console.WriteLine(
                string.Format(
                    @"URI '{0}' was already downloaded. NOT continuing later.",
                    uriInfo.AbsoluteUri.AbsoluteUri));
        }
        else
        {
            _settings.AddDownloadedResourceInfo(uriInfo);

            Console.WriteLine(
                string.Format(
                    @"Added URI '{0}' to continue later.",
                    uriInfo.AbsoluteUri.AbsoluteUri));
        }
    }
    else
    {
        // If we are in asynchronous mode, periodically check for stops.
        if (processAsyncBackgroundWorker != null &&
            processAsyncBackgroundWorker.CancellationPending)
        {
            throw new StopProcessingException();
        }

        // Notify event sinks about this URL.
        if (ProcessingUrl != null)
        {
            ProcessingUrlEventArgs e = new ProcessingUrlEventArgs(uriInfo, depth);
            ProcessingUrl(this, e);
        }

        if (!uriInfo.IsProcessableUri)
        {
            Console.WriteLine(
                string.Format(
                    @"URI '{0}' is not processable. Skipping.",
                    uriInfo.AbsoluteUri.AbsoluteUri));
        }
        else if (_settings.HasDownloadedUri(uriInfo))
        {
            Console.WriteLine(
                string.Format(
                    @"URI '{0}' was already downloaded. Skipping.",
                    uriInfo.AbsoluteUri.AbsoluteUri));
        }
        else
        {
            Console.WriteLine(
                string.Format(
                    @"URI '{0}' was not already downloaded. Processing.",
                    uriInfo.AbsoluteUri.AbsoluteUri));

            // Out-parameters for the downloader calls below.
            string textContent;
            string encodingName;
            Encoding encoding;
            byte[] binaryContent;

            switch (uriInfo.LinkType)
            {
                case UriType.Content:
                    Console.WriteLine(
                        string.Format(
                            @"Processing content URI '{0}', with depth {1}.",
                            uriInfo.AbsoluteUri.AbsoluteUri,
                            depth));

                    //Grab the page content.
                    ResourceDownloader.DownloadHtml(
                        uriInfo.AbsoluteUri,
                        out textContent,
                        out encodingName,
                        out encoding,
                        out binaryContent,
                        _settings.Options);

                    // Shared with the Form case (was duplicated inline).
                    ProcessTextContent(uriInfo, depth, git, folo, textContent, UriType.Content);
                    break;

                case UriType.Resource:
                    // Resource (IMG, JS, ...) downloading is intentionally
                    // disabled; act like we did it so the URI is not
                    // visited again.
                    _settings.AddDownloadedResourceInfo(uriInfo);
                    _settings.PersistDownloadedResourceInfo(uriInfo);
                    break;

                case UriType.Form:
                    Console.WriteLine(
                        string.Format(
                            @"Processing Form POST to URI '{0}', with depth {1}.",
                            uriInfo.AbsoluteUri.AbsoluteUri,
                            depth));

                    //Grab the Form response content.
                    ResourceDownloader.DownloadForm(
                        uriInfo.AbsoluteUri,
                        out textContent,
                        out encodingName,
                        out encoding,
                        out binaryContent,
                        _settings.Options);

                    // Shared with the Content case (was duplicated inline).
                    ProcessTextContent(uriInfo, depth, git, folo, textContent, UriType.Form);
                    break;

                default:
                    break;
            }

            Console.WriteLine(
                string.Format(
                    @"Finished processing URI '{0}'.",
                    uriInfo.AbsoluteUri.AbsoluteUri));
        }
    }
}

// ------------------------------------------------------------------
/// <summary>
/// Shared handling of downloaded text content for Content and Form
/// pages: runs the matching Git collector requests, extracts the
/// links, recurses into them and persists the results.
/// Extracted from ProcessUrl, where the two switch branches were
/// near-identical copies of each other.
/// </summary>
/// <param name="uriInfo">The URI the text was downloaded from.</param>
/// <param name="depth">The recursion depth of <paramref name="uriInfo"/>.</param>
/// <param name="git">Collector requests, passed through to recursive calls.</param>
/// <param name="folo">Follower rules, passed through to recursive calls.</param>
/// <param name="textContent">The downloaded HTML/response text.</param>
/// <param name="pageType">The page type to match collector requests against.</param>
private void ProcessTextContent(
    DownloadedResourceInformation uriInfo,
    int depth,
    List<iCollector> git,
    List<iFollower> folo,
    string textContent,
    UriType pageType)
{
    //Fire-up resource parser (A, FORMS, IMG) parser.
    ResourceParser parser = new ResourceParser(_settings, uriInfo, textContent);

    //Grab all the Git collector requests, that match the Uri
    //and were not parsed before.
    List<iCollector> req =
        (from g in _settings.Options.GitCollectionRequest
         where g.pageType == pageType &&
             uriInfo.AbsoluteUri.AbsoluteUri.Contains(g.pageName) &&
             !(from o in _settings.Parsings
               where o.pageType == g.pageType
               select o.source.AbsoluteUri.AbsoluteUri)
                 .Contains(uriInfo.AbsoluteUri.AbsoluteUri)
         select g.Clone()).ToList();

    //Have valid requests?
    if (req.Count > 0)
    {
        //Persist the collector results.
        _settings.PersistCollectorResultInfo(parser.ExtractCollectorRequest(req));
    }

    //Process link extraction.
    List<UriResourceInformation> linkInfos = parser.ExtractLinks();

    // Add before parsing childs, so the recursion terminates.
    _settings.AddDownloadedResourceInfo(uriInfo);

    foreach (UriResourceInformation linkInfo in linkInfos)
    {
        DownloadedResourceInformation dlInfo =
            new DownloadedResourceInformation(
                linkInfo,
                uriInfo.LocalFolderPath,
                uriInfo.LocalBaseFolderPath,
                linkInfo.Parent,
                depth + 1);

        // Recurse.
        // Do not return or break immediately if too deep, because this
        // would omit certain pages at this recursion level.
        ProcessUrl(dlInfo, depth + 1, git, folo);
    }

    // Persist after completely parsed childs.
    _settings.PersistDownloadedResourceInfo(uriInfo);
}