// ------------------------------------------------------------------

/// <summary>
/// Initializes a new parser instance by storing the given arguments
/// in the corresponding private fields.
/// </summary>
/// <param name="settings">The spider settings to store.</param>
/// <param name="uriInfo">The URI information to store.</param>
/// <param name="textContent">The text content to store.</param>
public ResourceParser(
    SpiderSettings settings,
    UriResourceInformation uriInfo,
    string textContent)
{
    // Plain field initialization; no validation is performed here.
    this._textContent = textContent;
    this._uriInfo = uriInfo;
    this._settings = settings;
}
// ------------------------------------------------------------------

/// <summary>
/// Creates the parser and remembers the settings, the URI information
/// and the text content it was given.
/// </summary>
/// <param name="settings">The settings kept in <c>_settings</c>.</param>
/// <param name="uriInfo">The URI info kept in <c>_uriInfo</c>.</param>
/// <param name="textContent">The text kept in <c>_textContent</c>.</param>
public ResourceParser(
    SpiderSettings settings,
    UriResourceInformation uriInfo,
    string textContent )
{
    // The arguments are stored as-is (no null checks, no copies).
    this._uriInfo = uriInfo;
    this._settings = settings;
    this._textContent = textContent;
}
/// <summary>
/// Initializes a new instance from existing URI information (handed to
/// the base-class copy constructor) plus two explicitly given folders.
/// </summary>
/// <param name="copyFrom">The URI information passed to the base constructor.</param>
/// <param name="folderPath">The folder stored as the local folder path.</param>
/// <param name="baseFolderPath">The folder stored as the local base folder path.</param>
public DownloadedResourceInformation(
    UriResourceInformation copyFrom,
    DirectoryInfo folderPath,
    DirectoryInfo baseFolderPath)
    : base(copyFrom)
{
    // Only the two folder fields are set by this overload.
    this._localBaseFolderPath = baseFolderPath;
    this._localFolderPath = folderPath;
}
/// <summary>
/// Copy constructor. Initializes a new instance of the
/// <see cref="UriResourceInformation"/> class with the field values
/// of another instance.
/// </summary>
/// <param name="copyFrom">The instance whose fields are copied.</param>
public UriResourceInformation(
    UriResourceInformation copyFrom )
{
    UriResourceInformation source = copyFrom;

    // Field values are assigned one-to-one; nothing is cloned here.
    _linkType = source._linkType;
    _absoluteUri = source._absoluteUri;
    _baseUri = source._baseUri;
    _relativeUri = source._relativeUri;
    _originalUrl = source._originalUrl;
    _options = source._options;
}
/// <summary>
/// Initializes a new instance of the
/// <see cref="UriResourceInformation"/> class as a field-by-field
/// copy of an existing instance.
/// </summary>
/// <param name="copyFrom">The instance to take all field values from.</param>
public UriResourceInformation(
    UriResourceInformation copyFrom)
{
    // Options and the URL as it was originally written.
    this._options = copyFrom._options;
    this._originalUrl = copyFrom._originalUrl;

    // The URI fields.
    this._absoluteUri = copyFrom._absoluteUri;
    this._baseUri = copyFrom._baseUri;
    this._relativeUri = copyFrom._relativeUri;

    // The link type.
    this._linkType = copyFrom._linkType;
}
/// <summary>
/// Writes the text of an HTML resource to its local file below the
/// configured destination folder. Hyperlinks inside the text are
/// written unchanged.
/// </summary>
/// <param name="textContent">The text to write.</param>
/// <param name="encoding">The encoding used by the writer.</param>
/// <param name="uriInfo">The URI information the local file is derived from.</param>
/// <returns>
/// The information object describing the local file. It is returned
/// even when writing failed, because I/O and access errors are only
/// traced.
/// </returns>
public DownloadedResourceInformation StoreHtml(
    string textContent,
    Encoding encoding,
    UriResourceInformation uriInfo)
{
    DownloadedResourceInformation stored =
        new DownloadedResourceInformation(
            uriInfo,
            _settings.Options.DestinationFolderPath);

    try
    {
        // Remove a previously stored copy first.
        if (stored.LocalFilePath.Exists)
        {
            stored.LocalFilePath.Delete();
        }

        // Make sure the folder that will contain the file exists.
        DirectoryInfo parentFolder = stored.LocalFilePath.Directory;
        if (!parentFolder.Exists)
        {
            parentFolder.Create();
        }

        string traceMessage = string.Format(
            @"Writing text content to file '{0}'.",
            stored.LocalFilePath);
        Trace.WriteLine(traceMessage);

        using (FileStream output = new FileStream(
            stored.LocalFilePath.FullName,
            FileMode.Create,
            FileAccess.Write))
        {
            using (StreamWriter writer = new StreamWriter(output, encoding))
            {
                writer.Write(textContent);
            }
        }
    }
    catch (IOException ioError)
    {
        // Best effort: the failure is traced, not propagated.
        string traceMessage = string.Format(
            @"Ignoring IO exception while storing HTML file: '{0}'.",
            ioError.Message);
        Trace.WriteLine(traceMessage);
    }
    catch (UnauthorizedAccessException accessError)
    {
        // Best effort: the failure is traced, not propagated.
        string traceMessage = string.Format(
            @"Ignoring exception while storing HTML file: '{0}'.",
            accessError.Message);
        Trace.WriteLine(traceMessage);
    }

    return stored;
}
/// <summary>
/// Detects <c>url(...)</c> references inside the value of a
/// <c>style</c> attribute.
/// </summary>
/// <param name="baseUri">The base URI handed to each created link and used for the same-site check.</param>
/// <param name="attributeName">Name of the attribute; only <c>style</c> (any casing) is inspected.</param>
/// <param name="attributeValue">The attribute value to scan.</param>
/// <returns>
/// The links found. The list is empty (never <c>null</c>) when the
/// attribute is not a style attribute, has no content, or contains
/// no acceptable URL.
/// </returns>
private List<UriResourceInformation> ExtractStyleUrls(
    Uri baseUri,
    string attributeName,
    string attributeValue)
{
    List<UriResourceInformation> result =
        new List<UriResourceInformation>();

    // Attribute names are identifiers, so compare ordinally. A
    // culture-sensitive comparison fails to match "STYLE" under
    // cultures such as tr-TR.
    if (!string.Equals(attributeName, @"style", StringComparison.OrdinalIgnoreCase))
    {
        return result;
    }

    if (attributeValue == null || attributeValue.Trim().Length == 0)
    {
        return result;
    }

    MatchCollection matches = Regex.Matches(
        attributeValue,
        @"url\s*\(\s*([^\)\s]+)\s*\)",
        RegexOptions.Singleline | RegexOptions.IgnoreCase);

    foreach (Match match in matches)
    {
        string url = match.Groups[1].Value;

        // Skip a malformed URL instead of letting a UriFormatException
        // abort the extraction of every other link.
        Uri parsedUrl;
        if (!Uri.TryCreate(url, UriKind.RelativeOrAbsolute, out parsedUrl))
        {
            continue;
        }

        UriResourceInformation ui =
            new UriResourceInformation(
                _settings.Options,
                url,
                parsedUrl,
                baseUri,
                UriType.Resource);

        bool isOnSameSite = ui.IsOnSameSite(baseUri);

        if ((isOnSameSite || !_settings.Options.StayOnSite) &&
            ui.IsProcessableUri)
        {
            result.Add(ui);
        }
    }

    return result;
}
/// <summary>
/// Stores a binary resource to the local file system.
/// </summary>
/// <param name="binaryContent">
/// The bytes to write. When <c>null</c> or empty, no file is written
/// (an already existing file is still deleted).
/// </param>
/// <param name="uriInfo">The URI information the local file is derived from.</param>
/// <returns>
/// The info about the stored data. It is returned even when writing
/// failed, because I/O and access errors are only traced.
/// </returns>
public DownloadedResourceInformation StoreBinary(
    byte[] binaryContent,
    UriResourceInformation uriInfo )
{
    DownloadedResourceInformation result =
        new DownloadedResourceInformation(
            uriInfo,
            _settings.Options.DestinationFolderPath );

    try
    {
        // Remove a previously stored copy first.
        if ( result.LocalFilePath.Exists )
        {
            result.LocalFilePath.Delete();
        }

        if ( binaryContent != null && binaryContent.Length > 0 )
        {
            // Create the containing folder when it is missing. Without
            // this, the write fails with a DirectoryNotFoundException
            // that the IOException handler below silently swallows.
            if ( !result.LocalFilePath.Directory.Exists )
            {
                result.LocalFilePath.Directory.Create();
            }

            Trace.WriteLine(
                string.Format(
                    @"Writing binary content to file '{0}'.",
                    result.LocalFilePath ) );

            using ( FileStream s = result.LocalFilePath.OpenWrite() )
            {
                s.Write( binaryContent, 0, binaryContent.Length );
            }
        }
    }
    catch ( IOException x )
    {
        // Best effort: the failure is traced, not propagated.
        Trace.WriteLine(
            string.Format(
                @"Ignoring IO exception while storing binary file: '{0}'.",
                x.Message ) );
    }
    catch ( UnauthorizedAccessException x )
    {
        // Best effort: the failure is traced, not propagated.
        Trace.WriteLine(
            string.Format(
                @"Ignoring exception while storing binary file: '{0}'.",
                x.Message ) );
    }

    return result;
}
/// <summary>
/// Stores a binary resource to the local file system.
/// </summary>
/// <param name="binaryContent">
/// The bytes to write. When <c>null</c> or empty, no file is written
/// (an already existing file is still deleted).
/// </param>
/// <param name="uriInfo">The URI information the local file is derived from.</param>
/// <returns>
/// The info about the stored data. It is returned even when writing
/// failed, because I/O and access errors are only traced.
/// </returns>
public DownloadedResourceInformation StoreBinary(
    byte[] binaryContent,
    UriResourceInformation uriInfo)
{
    DownloadedResourceInformation result =
        new DownloadedResourceInformation(
            uriInfo,
            _settings.Options.DestinationFolderPath);

    try
    {
        // Remove a previously stored copy first.
        if (result.LocalFilePath.Exists)
        {
            result.LocalFilePath.Delete();
        }

        if (binaryContent != null && binaryContent.Length > 0)
        {
            // Create the containing folder when it is missing. Without
            // this, the write fails with a DirectoryNotFoundException
            // that the IOException handler below silently swallows.
            if (!result.LocalFilePath.Directory.Exists)
            {
                result.LocalFilePath.Directory.Create();
            }

            Trace.WriteLine(
                string.Format(
                    @"Writing binary content to file '{0}'.",
                    result.LocalFilePath));

            using (FileStream s = result.LocalFilePath.OpenWrite())
            {
                s.Write(binaryContent, 0, binaryContent.Length);
            }
        }
    }
    catch (IOException x)
    {
        // Best effort: the failure is traced, not propagated.
        Trace.WriteLine(
            string.Format(
                @"Ignoring IO exception while storing binary file: '{0}'.",
                x.Message));
    }
    catch (UnauthorizedAccessException x)
    {
        // Best effort: the failure is traced, not propagated.
        Trace.WriteLine(
            string.Format(
                @"Ignoring exception while storing binary file: '{0}'.",
                x.Message));
    }

    return result;
}
/// <summary>
/// Initializes a new instance from existing URI information (handed to
/// the base-class copy constructor) and derives the local file path
/// and the local file name from it.
/// </summary>
/// <param name="copyFrom">The URI information to copy and to build the local file name from.</param>
/// <param name="baseFolderPath">The base folder the local file path is combined with.</param>
public DownloadedResourceInformation(
    UriResourceInformation copyFrom,
    DirectoryInfo baseFolderPath)
    : base(copyFrom)
{
    _localBaseFolderPath = baseFolderPath;

    // Full local path = base folder + the name produced by MakeLocalFileName.
    string baseFolder = baseFolderPath.FullName;
    string localName = MakeLocalFileName(
        copyFrom.AbsoluteUri,
        copyFrom.BaseUri,
        copyFrom.LinkType);
    string fullLocalPath = Path.Combine(baseFolder, localName);

    _localFilePath = new FileInfo(fullLocalPath);

    // Only the final name component of the path, without any folder part.
    string nameOnly = _localFilePath.Name;
    _localFileName = new FileInfo(nameOnly);
}
// ------------------------------------------------------------------

#endregion

#region Private methods.
// ------------------------------------------------------------------

/// <summary>
/// Reads the given XML reader to its end and collects the links found
/// in link attributes, in <c>style</c> attributes and, recursively,
/// in the content of comment nodes.
/// </summary>
/// <param name="xml">The reader to consume.</param>
/// <param name="uriInfo">The URI info of the document being read; supplies the base URIs.</param>
/// <returns>The collected links (never <c>null</c>).</returns>
private List<UriResourceInformation> DoExtractLinks(
    XmlReader xml,
    UriResourceInformation uriInfo)
{
    List<UriResourceInformation> links =
        new List<UriResourceInformation>();

    while (xml.Read())
    {
        switch (xml.NodeType)
        {
            // Added 2006-03-27: Inside comments, too.
            case XmlNodeType.Comment:
                // The comment text is parsed as a document of its own.
                // The child reader is disposed once it has been read.
                using (XmlReader childXml =
                    GetDocReader(xml.Value, uriInfo.BaseUri))
                {
                    links.AddRange(DoExtractLinks(childXml, uriInfo));
                }
                break;

            // A node element.
            case XmlNodeType.Element:
                string[] linkAttributeNames;
                UriType linkType;

                // Decide once per element; inside the attribute loop
                // below, xml.Name is the attribute name.
                bool isLinkElement = IsLinkElement(
                    xml.Name,
                    out linkAttributeNames,
                    out linkType);

                while (xml.MoveToNextAttribute())
                {
                    // Style attributes are inspected on every element.
                    links.AddRange(
                        ExtractStyleUrls(
                            uriInfo.BaseUriWithFolder,
                            xml.Name,
                            xml.Value));

                    if (!isLinkElement)
                    {
                        continue;
                    }

                    // If this is a link element, store the URLs to modify.
                    foreach (string a in linkAttributeNames)
                    {
                        // Attribute names are identifiers: compare
                        // ordinally, independent of the current culture.
                        if (!string.Equals(a, xml.Name, StringComparison.OrdinalIgnoreCase))
                        {
                            continue;
                        }

                        string url = xml.Value;

                        // Skip a malformed URL instead of letting a
                        // UriFormatException abort the whole document.
                        Uri parsedUrl;
                        if (!Uri.TryCreate(url, UriKind.RelativeOrAbsolute, out parsedUrl))
                        {
                            continue;
                        }

                        UriResourceInformation ui =
                            new UriResourceInformation(
                                _settings.Options,
                                url,
                                parsedUrl,
                                uriInfo.BaseUriWithFolder,
                                linkType);

                        bool isOnSameSite =
                            ui.IsOnSameSite(uriInfo.BaseUri);

                        if ((isOnSameSite || !_settings.Options.StayOnSite) &&
                            ui.IsProcessableUri)
                        {
                            links.Add(ui);
                        }
                    }
                }
                break;
        }
    }

    return links;
}
/// <summary>
/// Saves the text of an HTML resource into its local file below the
/// configured destination folder. The text is written as given; no
/// hyperlink replacement takes place.
/// </summary>
/// <param name="textContent">The text to write.</param>
/// <param name="encoding">The encoding used by the writer.</param>
/// <param name="uriInfo">The URI information the local file is derived from.</param>
/// <returns>
/// The information object describing the local file. It is returned
/// even when writing failed, because I/O and access errors are only
/// traced.
/// </returns>
public DownloadedResourceInformation StoreHtml(
    string textContent,
    Encoding encoding,
    UriResourceInformation uriInfo )
{
    DownloadedResourceInformation info =
        new DownloadedResourceInformation(
            uriInfo,
            _settings.Options.DestinationFolderPath );

    try
    {
        // Drop any earlier version of the file.
        if ( info.LocalFilePath.Exists )
        {
            info.LocalFilePath.Delete();
        }

        // Create the containing folder when it is missing.
        DirectoryInfo folder = info.LocalFilePath.Directory;
        if ( !folder.Exists )
        {
            folder.Create();
        }

        string message = string.Format(
            @"Writing text content to file '{0}'.",
            info.LocalFilePath );
        Trace.WriteLine( message );

        using ( FileStream stream = new FileStream(
            info.LocalFilePath.FullName,
            FileMode.Create,
            FileAccess.Write ) )
        {
            using ( StreamWriter writer = new StreamWriter( stream, encoding ) )
            {
                writer.Write( textContent );
            }
        }
    }
    catch ( IOException ioFailure )
    {
        // Best effort: the failure is traced, not propagated.
        string message = string.Format(
            @"Ignoring IO exception while storing HTML file: '{0}'.",
            ioFailure.Message );
        Trace.WriteLine( message );
    }
    catch ( UnauthorizedAccessException accessFailure )
    {
        // Best effort: the failure is traced, not propagated.
        string message = string.Format(
            @"Ignoring exception while storing HTML file: '{0}'.",
            accessFailure.Message );
        Trace.WriteLine( message );
    }

    return info;
}
/// <summary>
/// Replace URIs inside a given HTML document that was previously
/// downloaded with the local URIs (prefixed with <c>Resources\</c>).
/// </summary>
/// <param name="textContent">The HTML text whose links are rewritten.</param>
/// <param name="uriInfo">The URI information of the document; passed to the link parser.</param>
/// <returns>Returns the content text with the replaced links.</returns>
public string ReplaceLinks(
    string textContent,
    UriResourceInformation uriInfo )
{
    ResourceParser parser = new ResourceParser(
        _settings,
        uriInfo,
        textContent );

    List<UriResourceInformation> linkInfos = parser.ExtractLinks();

    // For remembering duplicates. Only used for bookkeeping below;
    // it does not influence the returned text.
    Dictionary<string, string> replacedLinks =
        new Dictionary<string, string>();

    const RegexOptions replaceOptions =
        RegexOptions.IgnoreCase | RegexOptions.Multiline;

    foreach ( UriResourceInformation linkInfo in linkInfos )
    {
        if ( !( linkInfo.WantFollowUri || linkInfo.IsResourceUri ) )
        {
            continue;
        }

        DownloadedResourceInformation dlInfo =
            new DownloadedResourceInformation(
                linkInfo,
                _settings.Options.DestinationFolderPath );

        if ( string.IsNullOrEmpty( linkInfo.OriginalUrl ) )
        {
            continue;
        }

        string textContentBefore = textContent;
        string link = Regex.Escape( linkInfo.OriginalUrl );

        // Regex.Replace treats '$' in the replacement text as the start
        // of a substitution ($1, $&, ...). Doubling it keeps a file
        // name that contains '$' literal.
        string doubleQuoted =
            string.Format( @"""Resources\{0}""", dlInfo.LocalFileName )
                .Replace( @"$", @"$$" );
        string singleQuoted =
            string.Format( @"'Resources\{0}'", dlInfo.LocalFileName )
                .Replace( @"$", @"$$" );
        string parenthesized =
            string.Format( @"(Resources\{0})", dlInfo.LocalFileName )
                .Replace( @"$", @"$$" );

        textContent = Regex.Replace(
            textContent,
            string.Format( @"""{0}""", link ),
            doubleQuoted,
            replaceOptions );

        textContent = Regex.Replace(
            textContent,
            string.Format( @"'{0}'", link ),
            singleQuoted,
            replaceOptions );

        // For style-"url(...)"-links.
        textContent = Regex.Replace(
            textContent,
            string.Format( @"\(\s*{0}\s*\)", link ),
            parenthesized,
            replaceOptions );

        // Some checking.
        // 2007-07-27, Uwe Keim.
        // A link that should have changed the text but did not, and was
        // not handled before, is deliberately ignored.
        bool failedToReplace =
            linkInfo.OriginalUrl != dlInfo.LocalFileName.Name &&
            textContentBefore == textContent &&
            !replacedLinks.ContainsKey( linkInfo.AbsoluteUri.AbsolutePath );

        if ( !failedToReplace )
        {
            // Remember.
            replacedLinks[linkInfo.AbsoluteUri.AbsolutePath] =
                linkInfo.AbsoluteUri.AbsolutePath;
        }
    }

    return textContent;
}
/// <summary>
/// Replace URIs inside a given HTML document that was previously
/// downloaded with the local URIs.
/// </summary>
/// <param name="textContent">The HTML text whose links are rewritten.</param>
/// <param name="uriInfo">The URI information of the document; passed to the link parser.</param>
/// <returns>
/// Returns the content text with the replaced links. When an exception
/// occurs, its message is written to the console and the text is
/// returned with the replacements made up to that point.
/// </returns>
public string ReplaceLinks(
    string textContent,
    UriResourceInformation uriInfo)
{
    try
    {
        ResourceParser parser = new ResourceParser(
            _settings,
            uriInfo,
            textContent);

        List<UriResourceInformation> linkInfos = parser.ExtractLinks();

        // For remembering duplicates. Only used for bookkeeping below;
        // it does not influence the returned text.
        Dictionary<string, string> replacedLinks =
            new Dictionary<string, string>();

        const RegexOptions replaceOptions =
            RegexOptions.IgnoreCase | RegexOptions.Multiline;

        foreach (UriResourceInformation linkInfo in linkInfos)
        {
            if (!(linkInfo.WantFollowUri || linkInfo.IsResourceUri))
            {
                continue;
            }

            DownloadedResourceInformation dlInfo =
                new DownloadedResourceInformation(
                    linkInfo,
                    _settings.Options.DestinationFolderPath);

            if (string.IsNullOrEmpty(linkInfo.OriginalUrl))
            {
                continue;
            }

            string textContentBefore = textContent;
            string link = Regex.Escape(linkInfo.OriginalUrl);

            // Regex.Replace treats '$' in the replacement text as the
            // start of a substitution ($1, $&, ...). Doubling it keeps
            // a file name that contains '$' literal.
            string doubleQuoted =
                string.Format(@"""{0}""", dlInfo.LocalFileName)
                    .Replace(@"$", @"$$");
            string singleQuoted =
                string.Format(@"'{0}'", dlInfo.LocalFileName)
                    .Replace(@"$", @"$$");
            string parenthesized =
                string.Format(@"({0})", dlInfo.LocalFileName)
                    .Replace(@"$", @"$$");

            textContent = Regex.Replace(
                textContent,
                string.Format(@"""{0}""", link),
                doubleQuoted,
                replaceOptions);

            textContent = Regex.Replace(
                textContent,
                string.Format(@"'{0}'", link),
                singleQuoted,
                replaceOptions);

            // For style-"url(...)"-links.
            textContent = Regex.Replace(
                textContent,
                string.Format(@"\(\s*{0}\s*\)", link),
                parenthesized,
                replaceOptions);

            // Some checking.
            // 2007-07-27, Uwe Keim.
            // A link that should have changed the text but did not, and
            // was not handled before, is deliberately ignored. The
            // leftover of the disabled exception, which formatted the
            // whole HTML text into a discarded string, was removed.
            bool failedToReplace =
                linkInfo.OriginalUrl != dlInfo.LocalFileName.Name &&
                textContentBefore == textContent &&
                !replacedLinks.ContainsKey(linkInfo.AbsoluteUri.AbsolutePath);

            if (!failedToReplace)
            {
                // Remember.
                replacedLinks[linkInfo.AbsoluteUri.AbsolutePath] =
                    linkInfo.AbsoluteUri.AbsolutePath;
            }
        }
    }
    catch (Exception ex)
    {
        // Best effort: report the problem and fall through to return
        // whatever has been replaced so far.
        Console.WriteLine(ex.Message);
    }

    return textContent;
}
/// <summary>
/// Replace URIs inside a given HTML document that was previously
/// downloaded with the local URIs (prefixed with <c>Resources\</c>).
/// </summary>
/// <param name="textContent">The HTML text whose links are rewritten.</param>
/// <param name="uriInfo">The URI information of the document; passed to the link parser.</param>
/// <returns>Returns the content text with the replaced links.</returns>
public string ReplaceLinks(
    string textContent,
    UriResourceInformation uriInfo)
{
    ResourceParser parser = new ResourceParser(
        _settings,
        uriInfo,
        textContent);

    List<UriResourceInformation> linkInfos = parser.ExtractLinks();

    // For remembering duplicates. Only used for bookkeeping below;
    // it does not influence the returned text.
    Dictionary<string, string> replacedLinks =
        new Dictionary<string, string>();

    const RegexOptions replaceOptions =
        RegexOptions.IgnoreCase | RegexOptions.Multiline;

    foreach (UriResourceInformation linkInfo in linkInfos)
    {
        if (!(linkInfo.WantFollowUri || linkInfo.IsResourceUri))
        {
            continue;
        }

        DownloadedResourceInformation dlInfo =
            new DownloadedResourceInformation(
                linkInfo,
                _settings.Options.DestinationFolderPath);

        if (string.IsNullOrEmpty(linkInfo.OriginalUrl))
        {
            continue;
        }

        string textContentBefore = textContent;
        string link = Regex.Escape(linkInfo.OriginalUrl);

        // Regex.Replace treats '$' in the replacement text as the start
        // of a substitution ($1, $&, ...). Doubling it keeps a file
        // name that contains '$' literal.
        string doubleQuoted =
            string.Format(@"""Resources\{0}""", dlInfo.LocalFileName)
                .Replace(@"$", @"$$");
        string singleQuoted =
            string.Format(@"'Resources\{0}'", dlInfo.LocalFileName)
                .Replace(@"$", @"$$");
        string parenthesized =
            string.Format(@"(Resources\{0})", dlInfo.LocalFileName)
                .Replace(@"$", @"$$");

        textContent = Regex.Replace(
            textContent,
            string.Format(@"""{0}""", link),
            doubleQuoted,
            replaceOptions);

        textContent = Regex.Replace(
            textContent,
            string.Format(@"'{0}'", link),
            singleQuoted,
            replaceOptions);

        // For style-"url(...)"-links.
        textContent = Regex.Replace(
            textContent,
            string.Format(@"\(\s*{0}\s*\)", link),
            parenthesized,
            replaceOptions);

        // Some checking.
        // 2007-07-27, Uwe Keim.
        // A link that should have changed the text but did not, and was
        // not handled before, is deliberately ignored.
        bool failedToReplace =
            linkInfo.OriginalUrl != dlInfo.LocalFileName.Name &&
            textContentBefore == textContent &&
            !replacedLinks.ContainsKey(linkInfo.AbsoluteUri.AbsolutePath);

        if (!failedToReplace)
        {
            // Remember.
            replacedLinks[linkInfo.AbsoluteUri.AbsolutePath] =
                linkInfo.AbsoluteUri.AbsolutePath;
        }
    }

    return textContent;
}
// ------------------------------------------------------------------

#endregion

#region Private methods.
// ------------------------------------------------------------------

/// <summary>
/// Reads the given XML reader to its end and collects the links found
/// in link attributes, in <c>style</c> attributes and, recursively,
/// in the content of comment nodes.
/// </summary>
/// <param name="xml">The reader to consume.</param>
/// <param name="uriInfo">The URI info of the document being read; supplies the base URIs.</param>
/// <returns>The collected links (never <c>null</c>).</returns>
private List<UriResourceInformation> DoExtractLinks(
    XmlReader xml,
    UriResourceInformation uriInfo )
{
    List<UriResourceInformation> links =
        new List<UriResourceInformation>();

    while ( xml.Read() )
    {
        switch ( xml.NodeType )
        {
            // Added 2006-03-27: Inside comments, too.
            case XmlNodeType.Comment:
                // The comment text is parsed as a document of its own.
                // The child reader is disposed once it has been read.
                using ( XmlReader childXml =
                    GetDocReader( xml.Value, uriInfo.BaseUri ) )
                {
                    links.AddRange( DoExtractLinks( childXml, uriInfo ) );
                }
                break;

            // A node element.
            case XmlNodeType.Element:
                string[] linkAttributeNames;
                UriType linkType;

                // Decide once per element; inside the attribute loop
                // below, xml.Name is the attribute name.
                bool isLinkElement = IsLinkElement(
                    xml.Name,
                    out linkAttributeNames,
                    out linkType );

                while ( xml.MoveToNextAttribute() )
                {
                    // Style attributes are inspected on every element.
                    links.AddRange(
                        ExtractStyleUrls(
                            uriInfo.BaseUriWithFolder,
                            xml.Name,
                            xml.Value ) );

                    if ( !isLinkElement )
                    {
                        continue;
                    }

                    // If this is a link element, store the URLs to modify.
                    foreach ( string a in linkAttributeNames )
                    {
                        // Attribute names are identifiers: compare
                        // ordinally, independent of the current culture.
                        if ( !string.Equals( a, xml.Name, StringComparison.OrdinalIgnoreCase ) )
                        {
                            continue;
                        }

                        string url = xml.Value;

                        // Skip a malformed URL instead of letting a
                        // UriFormatException abort the whole document.
                        Uri parsedUrl;
                        if ( !Uri.TryCreate( url, UriKind.RelativeOrAbsolute, out parsedUrl ) )
                        {
                            continue;
                        }

                        UriResourceInformation ui =
                            new UriResourceInformation(
                                _settings.Options,
                                url,
                                parsedUrl,
                                uriInfo.BaseUriWithFolder,
                                linkType );

                        bool isOnSameSite =
                            ui.IsOnSameSite( uriInfo.BaseUri );

                        if ( ( isOnSameSite || !_settings.Options.StayOnSite ) &&
                            ui.IsProcessableUri )
                        {
                            links.Add( ui );
                        }
                    }
                }
                break;
        }
    }

    return links;
}
/// <summary>
/// Detects <c>url(...)</c> references inside the value of a
/// <c>style</c> attribute.
/// </summary>
/// <param name="baseUri">The base URI handed to each created link and used for the same-site check.</param>
/// <param name="attributeName">Name of the attribute; only <c>style</c> (any casing) is inspected.</param>
/// <param name="attributeValue">The attribute value to scan.</param>
/// <returns>
/// The links found. The list is empty (never <c>null</c>) when the
/// attribute is not a style attribute, has no content, or contains
/// no acceptable URL.
/// </returns>
private List<UriResourceInformation> ExtractStyleUrls(
    Uri baseUri,
    string attributeName,
    string attributeValue )
{
    List<UriResourceInformation> result =
        new List<UriResourceInformation>();

    // Attribute names are identifiers, so compare ordinally. A
    // culture-sensitive comparison fails to match "STYLE" under
    // cultures such as tr-TR.
    if ( !string.Equals( attributeName, @"style", StringComparison.OrdinalIgnoreCase ) )
    {
        return result;
    }

    if ( attributeValue == null || attributeValue.Trim().Length == 0 )
    {
        return result;
    }

    MatchCollection matches = Regex.Matches(
        attributeValue,
        @"url\s*\(\s*([^\)\s]+)\s*\)",
        RegexOptions.Singleline | RegexOptions.IgnoreCase );

    foreach ( Match match in matches )
    {
        string url = match.Groups[1].Value;

        // Skip a malformed URL instead of letting a UriFormatException
        // abort the extraction of every other link.
        Uri parsedUrl;
        if ( !Uri.TryCreate( url, UriKind.RelativeOrAbsolute, out parsedUrl ) )
        {
            continue;
        }

        UriResourceInformation ui =
            new UriResourceInformation(
                _settings.Options,
                url,
                parsedUrl,
                baseUri,
                UriType.Resource );

        bool isOnSameSite = ui.IsOnSameSite( baseUri );

        if ( ( isOnSameSite || !_settings.Options.StayOnSite ) &&
            ui.IsProcessableUri )
        {
            result.Add( ui );
        }
    }

    return result;
}