/// <summary>
/// Stores a binary resource to the local file system.
/// </summary>
/// <param name="binaryContent">
/// The raw bytes to write. When <c>null</c> or empty, no file is written
/// (a previously existing file at the target path is still deleted).
/// </param>
/// <param name="uriInfo">The URI info the local file path is derived from.</param>
/// <returns>Return the info about the stored data.</returns>
/// <remarks>
/// Any exception raised while deleting or writing is traced and swallowed;
/// the result object is returned in either case.
/// </remarks>
public DownloadedResourceInfo StoreBinary(
    byte[] binaryContent,
    UriResourceInfo uriInfo)
{
    DownloadedResourceInfo result =
        new DownloadedResourceInfo(
            uriInfo,
            _settings.Options.DestinationFolderPath);

    try
    {
        if (result.LocalFilePath.Exists)
        {
            result.LocalFilePath.Delete();
        }

        if (binaryContent != null && binaryContent.Length > 0)
        {
            // Make sure the destination folder is there before opening the file.
            if (!result.LocalFilePath.Directory.Exists)
            {
                result.LocalFilePath.Directory.Create();
            }

            Trace.WriteLine($"Writing binary content to file '{result.LocalFilePath}'.");

            // Open the file exactly once. Calling FileInfo.Create() first would
            // return an open FileStream that nobody disposes, and the second
            // open for writing could then fail with a sharing violation.
            using (FileStream s = new FileStream(
                result.LocalFilePath.FullName,
                FileMode.Create,
                FileAccess.Write))
            {
                s.Write(binaryContent, 0, binaryContent.Length);
            }
        }
    }
    catch (Exception x)
    {
        // Best effort: a failed write must not abort the overall download.
        Trace.WriteLine($"Ignoring exception while storing binary file: '{ x.Message}'.");
    }

    return result;
}
/// <summary>
/// Copy constructor. Takes over the options, the original URL, the
/// relative, base and absolute URIs and the link type of another instance.
/// </summary>
/// <param name="copyFrom">The instance whose field values are copied.</param>
public UriResourceInfo(UriResourceInfo copyFrom)
{
    // The three URI forms.
    _absoluteUri = copyFrom._absoluteUri;
    _baseUri = copyFrom._baseUri;
    _relativeUri = copyFrom._relativeUri;

    // The URL text as originally found and the kind of link it is.
    _originalUrl = copyFrom._originalUrl;
    _linkType = copyFrom._linkType;

    // The options reference is taken over as-is.
    _options = copyFrom._options;
}
/// <summary>
/// Creates a parser for one text resource. The three arguments are
/// only stored in fields here.
/// </summary>
/// <param name="settings">The settings.</param>
/// <param name="uriInfo">The URI info.</param>
/// <param name="textContent">Content of the text.</param>
public ResourceParser(
    Settings settings,
    UriResourceInfo uriInfo,
    string textContent)
{
    // What to parse ...
    _textContent = textContent;
    _uriInfo = uriInfo;

    // ... and the settings to use.
    _settings = settings;
}
/// <summary>
/// Creates the info from an existing URI info plus explicit local folders.
/// This overload assigns the two folder fields only.
/// </summary>
/// <param name="copyFrom">The URI info handed to the base-class copy constructor.</param>
/// <param name="folderPath">The folder path.</param>
/// <param name="baseFolderPath">The base folder path.</param>
public DownloadedResourceInfo(
    UriResourceInfo copyFrom,
    DirectoryInfo folderPath,
    DirectoryInfo baseFolderPath)
    : base(copyFrom)
{
    _localBaseFolderPath = baseFolderPath;
    _localFolderPath = folderPath;
}
/// <summary>
/// Detects URLs in styles, i.e. <c>url(...)</c> references inside the
/// value of a <c>style</c> attribute.
/// </summary>
/// <param name="baseUri">The base URI.</param>
/// <param name="attributeName">
/// Name of the attribute. Anything other than "style" (compared
/// case-insensitively) yields an empty list.
/// </param>
/// <param name="attributeValue">The attribute value.</param>
/// <returns>
/// The processable URIs found; restricted to the same site when the
/// <c>StayOnSite</c> option is set. Never <c>null</c>.
/// </returns>
private List<UriResourceInfo> ExtractStyleUrls(
    Uri baseUri,
    string attributeName,
    string attributeValue)
{
    List<UriResourceInfo> result = new List<UriResourceInfo>();

    // Attribute names are identifiers, not linguistic text: compare ordinally.
    if (!string.Equals(attributeName, @"style", StringComparison.OrdinalIgnoreCase))
    {
        return result;
    }

    if (string.IsNullOrWhiteSpace(attributeValue))
    {
        return result;
    }

    MatchCollection matches =
        Regex.Matches(
            attributeValue,
            @"url\s*\(\s*([^\)\s]+)\s*\)",
            RegexOptions.Singleline | RegexOptions.IgnoreCase);

    foreach (Match match in matches)
    {
        // CSS allows url('x') and url("x"); the quotes are not part of the URL.
        string url = match.Groups[1].Value.Trim('\'', '"');
        if (url.Length == 0)
        {
            continue;
        }

        // A malformed URL inside a style must not abort the extraction
        // of all the other links, so skip it instead of throwing.
        Uri parsedUri;
        if (!Uri.TryCreate(url, UriKind.RelativeOrAbsolute, out parsedUri))
        {
            continue;
        }

        UriResourceInfo ui =
            new UriResourceInfo(
                _settings.Options,
                url,
                parsedUri,
                baseUri,
                UriType.Resource);

        bool isOnSameSite = ui.IsOnSameSite(baseUri);

        if ((isOnSameSite || !_settings.Options.StayOnSite) &&
            ui.IsProcessableUri)
        {
            result.Add(ui);
        }
    }

    return result;
}
/// <summary>
/// Creates the info from an existing URI info and derives the local file
/// path: the name produced by <c>MakeLocalFileName</c>, placed directly
/// inside the base folder.
/// </summary>
/// <param name="copyFrom">The URI info handed to the base-class copy constructor.</param>
/// <param name="baseFolderPath">The base folder path.</param>
public DownloadedResourceInfo(
    UriResourceInfo copyFrom,
    DirectoryInfo baseFolderPath)
    : base(copyFrom)
{
    _localBaseFolderPath = baseFolderPath;

    // Folder first, then the generated name.
    string baseFolder = baseFolderPath.FullName;
    string generatedName =
        MakeLocalFileName(
            copyFrom.AbsoluteUri,
            copyFrom.BaseUri,
            copyFrom.LinkType).Name;

    _localFilePath = new FileInfo(Path.Combine(baseFolder, generatedName));

    // Name-only variant of the path above.
    _localFileName = new FileInfo(_localFilePath.Name);
}
/// <summary>
/// Stores a HTML resource to the local file system.
/// Does no hyperlink replacement.
/// </summary>
/// <param name="textContent">The text to write.</param>
/// <param name="encoding">The encoding the text is written with.</param>
/// <param name="uriInfo">The URI info the local file path is derived from.</param>
/// <returns>Return the info about the stored data.</returns>
/// <remarks>
/// Exceptions raised while deleting or writing are traced and swallowed;
/// the result object is returned in either case.
/// </remarks>
public DownloadedResourceInfo StoreHtml(
    string textContent,
    Encoding encoding,
    UriResourceInfo uriInfo)
{
    DownloadedResourceInfo result =
        new DownloadedResourceInfo(
            uriInfo,
            _settings.Options.DestinationFolderPath);

    try
    {
        // Remove a leftover from an earlier run.
        if (result.LocalFilePath.Exists)
        {
            result.LocalFilePath.Delete();
        }

        // The destination folder has to exist before the file can be created.
        DirectoryInfo targetFolder = result.LocalFilePath.Directory;
        if (!targetFolder.Exists)
        {
            targetFolder.Create();
        }

        Trace.WriteLine($"Writing text content to file '{result.LocalFilePath}'.");

        using (FileStream fileStream = new FileStream(
            result.LocalFilePath.FullName,
            FileMode.Create,
            FileAccess.Write))
        {
            using (StreamWriter writer = new StreamWriter(fileStream, encoding))
            {
                writer.Write(textContent);
            }
        }
    }
    catch (Exception x)
    {
        Trace.WriteLine($"Ignoring IO exception while storing HTML file: '{x.Message}'.");
    }

    return result;
}
/// <summary>
/// Reads the given reader to its end and collects the links found on the way:
/// the link attributes of elements recognized by <c>IsLinkElement</c>,
/// <c>url(...)</c> references inside style attributes of any element, and,
/// recursively, whatever is found inside comments.
/// </summary>
/// <param name="xml">The XML.</param>
/// <param name="uriInfo">The URI info.</param>
/// <returns>
/// The processable links found; restricted to the same site when the
/// <c>StayOnSite</c> option is set. Never <c>null</c>.
/// </returns>
private List<UriResourceInfo> DoExtractLinks(
    XmlReader xml,
    UriResourceInfo uriInfo)
{
    List<UriResourceInfo> links = new List<UriResourceInfo>();

    while (xml.Read())
    {
        switch (xml.NodeType)
        {
            // Added 2016-10-16: Inside comments, too.
            case XmlNodeType.Comment:
                // The child reader is created here, so it is released here.
                using (XmlReader childXml = GetDocReader(xml.Value, uriInfo.BaseUri))
                {
                    links.AddRange(DoExtractLinks(childXml, uriInfo));
                }
                break;

            // A node element.
            case XmlNodeType.Element:
                string[] linkAttributeNames;
                UriType linkType;

                // If this is a link element, store the URLs to modify.
                if (IsLinkElement(
                    xml.Name,
                    out linkAttributeNames,
                    out linkType))
                {
                    while (xml.MoveToNextAttribute())
                    {
                        // Style attributes can occur on link elements as well.
                        links.AddRange(
                            ExtractStyleUrls(
                                uriInfo.BaseUriWithFolder,
                                xml.Name,
                                xml.Value));

                        foreach (string a in linkAttributeNames)
                        {
                            // Attribute names are identifiers: compare them
                            // independently of the current culture.
                            if (!string.Equals(a, xml.Name, StringComparison.OrdinalIgnoreCase))
                            {
                                continue;
                            }

                            string url = xml.Value;

                            // One malformed URL in the document must not abort
                            // the extraction of all other links; skip it.
                            Uri parsedUri;
                            if (!Uri.TryCreate(url, UriKind.RelativeOrAbsolute, out parsedUri))
                            {
                                continue;
                            }

                            UriResourceInfo ui =
                                new UriResourceInfo(
                                    _settings.Options,
                                    url,
                                    parsedUri,
                                    uriInfo.BaseUriWithFolder,
                                    linkType);

                            bool isOnSameSite = ui.IsOnSameSite(uriInfo.BaseUri);

                            if ((isOnSameSite || !_settings.Options.StayOnSite) &&
                                ui.IsProcessableUri)
                            {
                                links.Add(ui);
                            }
                        }
                    }
                }
                else
                {
                    // Also, look for style attributes.
                    while (xml.MoveToNextAttribute())
                    {
                        links.AddRange(
                            ExtractStyleUrls(
                                uriInfo.BaseUriWithFolder,
                                xml.Name,
                                xml.Value));
                    }
                }
                break;
        }
    }

    return links;
}
/// <summary>
/// Replace URIs inside a given HTML document that was previously
/// downloaded with the local URIs.
/// </summary>
/// <param name="textContent">The HTML text whose links are rewritten.</param>
/// <param name="uriInfo">The URI info of the document the text belongs to.</param>
/// <returns>Returns the content text with the replaced links.</returns>
/// <exception cref="ApplicationException">
/// A link that differs from its local file name could not be found in the
/// text in any of the supported notations and was not handled before.
/// </exception>
public string ReplaceLinks(
    string textContent,
    UriResourceInfo uriInfo)
{
    ResourceParser parser =
        new ResourceParser(
            _settings,
            uriInfo,
            textContent);

    List<UriResourceInfo> linkInfos = parser.ExtractLinks();

    // For remembering duplicates (absolute paths that were handled already).
    HashSet<string> replacedLinks = new HashSet<string>();

    foreach (UriResourceInfo linkInfo in linkInfos)
    {
        if (!(linkInfo.WantFollowUri || linkInfo.IsResourceUri))
        {
            continue;
        }

        DownloadedResourceInfo dlInfo =
            new DownloadedResourceInfo(
                linkInfo,
                _settings.Options.DestinationFolderPath);

        if (string.IsNullOrEmpty(linkInfo.OriginalUrl))
        {
            continue;
        }

        string textContentBefore = textContent;
        string link = Regex.Escape(linkInfo.OriginalUrl);

        // In a Regex.Replace replacement string "$" starts a substitution
        // ("$_", "$&", "$1", ...). The file name has to be inserted
        // literally, so every "$" is escaped as "$$".
        string localName =
            string.Format(@"{0}", dlInfo.LocalFileName).Replace(@"$", @"$$");

        // Links in double quotes.
        textContent = Regex.Replace(
            textContent,
            string.Format(@"""{0}""", link),
            string.Format(@"""{0}""", localName),
            RegexOptions.IgnoreCase | RegexOptions.Multiline);

        // Links in single quotes.
        textContent = Regex.Replace(
            textContent,
            string.Format(@"'{0}'", link),
            string.Format(@"'{0}'", localName),
            RegexOptions.IgnoreCase | RegexOptions.Multiline);

        // For style-"url(...)"-links.
        textContent = Regex.Replace(
            textContent,
            string.Format(@"\(\s*{0}\s*\)", link),
            string.Format(@"({0})", localName),
            RegexOptions.IgnoreCase | RegexOptions.Multiline);

        // Some checking.
        // 2016-10-16, Uwe Keim.
        string absolutePath = linkInfo.AbsoluteUri.AbsolutePath;

        if (linkInfo.OriginalUrl != dlInfo.LocalFileName.Name &&
            textContentBefore == textContent &&
            !replacedLinks.Contains(absolutePath))
        {
            throw new ApplicationException($"Failed to replace URI '{linkInfo.OriginalUrl}' with URI '{dlInfo.LocalFileName}' in HTML text '{textContent}'.");
        }

        // Remember.
        replacedLinks.Add(absolutePath);
    }

    return textContent;
}