Example #1
0
        // ------------------------------------------------------------------

        /// <summary>
        /// Creates a parser and remembers the values it was given.
        /// </summary>
        /// <param name="settings">The spider settings to store.</param>
        /// <param name="uriInfo">The URI information to store.</param>
        /// <param name="textContent">The text content to store.</param>
        public ResourceParser(SpiderSettings settings, UriResourceInformation uriInfo, string textContent)
        {
            // Plain field initialization; nothing is parsed at this point.
            _textContent = textContent;
            _uriInfo = uriInfo;
            _settings = settings;
        }
Example #2
0
		// ------------------------------------------------------------------

		/// <summary>
		/// Creates a parser and remembers the values it was given.
		/// </summary>
		/// <param name="settings">The spider settings to store.</param>
		/// <param name="uriInfo">The URI information to store.</param>
		/// <param name="textContent">The text content to store.</param>
		public ResourceParser( SpiderSettings settings, UriResourceInformation uriInfo, string textContent )
		{
			// Plain field initialization; nothing is parsed at this point.
			_textContent = textContent;
			_uriInfo = uriInfo;
			_settings = settings;
		}
 /// <summary>
 /// Creates the information object from existing URI information and
 /// the two local folders that belong to it.
 /// </summary>
 /// <param name="copyFrom">The URI information passed on to the base constructor.</param>
 /// <param name="folderPath">Stored as the local folder path.</param>
 /// <param name="baseFolderPath">Stored as the local base folder path.</param>
 public DownloadedResourceInformation(UriResourceInformation copyFrom, DirectoryInfo folderPath, DirectoryInfo baseFolderPath)
     : base(copyFrom)
 {
     _localBaseFolderPath = baseFolderPath;
     _localFolderPath = folderPath;
 }
		/// <summary>
		/// Copy constructor: takes over the options, the URL/URI values and
		/// the link type of another <see cref="UriResourceInformation"/>.
		/// </summary>
		/// <param name="copyFrom">The instance whose values are copied.</param>
		public UriResourceInformation( UriResourceInformation copyFrom )
		{
			// URL and URI values.
			_originalUrl = copyFrom._originalUrl;
			_absoluteUri = copyFrom._absoluteUri;
			_relativeUri = copyFrom._relativeUri;
			_baseUri = copyFrom._baseUri;

			// Remaining values.
			_linkType = copyFrom._linkType;
			_options = copyFrom._options;
		}
 /// <summary>
 /// Copy constructor: takes over the options, the URL/URI values and
 /// the link type of another <see cref="UriResourceInformation"/>.
 /// </summary>
 /// <param name="copyFrom">The instance whose values are copied.</param>
 public UriResourceInformation(UriResourceInformation copyFrom)
 {
     // URL and URI values.
     _originalUrl = copyFrom._originalUrl;
     _absoluteUri = copyFrom._absoluteUri;
     _relativeUri = copyFrom._relativeUri;
     _baseUri = copyFrom._baseUri;

     // Remaining values.
     _linkType = copyFrom._linkType;
     _options = copyFrom._options;
 }
Example #6
0
        /// <summary>
        /// Writes the text of an HTML resource to its local file below the
        /// configured destination folder. Hyperlinks inside the text are
        /// not replaced.
        /// </summary>
        /// <param name="textContent">The text to write.</param>
        /// <param name="encoding">The encoding used by the writer.</param>
        /// <param name="uriInfo">Describes the resource being stored.</param>
        /// <returns>
        /// The information about the stored data. It is returned even when
        /// an I/O or access error occurred; such errors are only traced.
        /// </returns>
        public DownloadedResourceInformation StoreHtml(
            string textContent,
            Encoding encoding,
            UriResourceInformation uriInfo)
        {
            DownloadedResourceInformation stored =
                new DownloadedResourceInformation(
                    uriInfo,
                    _settings.Options.DestinationFolderPath);

            try
            {
                // Remove any earlier copy of the file.
                if (stored.LocalFilePath.Exists)
                {
                    stored.LocalFilePath.Delete();
                }

                // The target folder must exist before the file is created.
                if (!stored.LocalFilePath.Directory.Exists)
                {
                    stored.LocalFilePath.Directory.Create();
                }

                Trace.WriteLine(
                    string.Format(
                        @"Writing text content to file '{0}'.",
                        stored.LocalFilePath));

                using (FileStream stream = new FileStream(
                           stored.LocalFilePath.FullName,
                           FileMode.Create,
                           FileAccess.Write))
                {
                    using (StreamWriter writer = new StreamWriter(stream, encoding))
                    {
                        writer.Write(textContent);
                    }
                }
            }
            catch (IOException ioError)
            {
                // Storing is best effort: trace and carry on.
                Trace.WriteLine(
                    string.Format(
                        @"Ignoring IO exception while storing HTML file: '{0}'.",
                        ioError.Message));
            }
            catch (UnauthorizedAccessException accessError)
            {
                Trace.WriteLine(
                    string.Format(
                        @"Ignoring exception while storing HTML file: '{0}'.",
                        accessError.Message));
            }

            return stored;
        }
Example #7
0
        /// <summary>
        /// Detects <c>url(...)</c> references inside the value of a
        /// <c>style</c> attribute.
        /// </summary>
        /// <param name="baseUri">
        /// The base URI handed to each created link and used for the
        /// same-site check.
        /// </param>
        /// <param name="attributeName">
        /// Name of the attribute. Anything other than "style" (compared
        /// ignoring case) yields an empty result.
        /// </param>
        /// <param name="attributeValue">The attribute value; may be null or blank.</param>
        /// <returns>
        /// The processable links found, restricted to the same site when
        /// the StayOnSite option is set. Never null.
        /// </returns>
        private List<UriResourceInformation> ExtractStyleUrls(
            Uri baseUri,
            string attributeName,
            string attributeValue)
        {
            List<UriResourceInformation> result =
                new List<UriResourceInformation>();

            // Attribute names are identifiers, not linguistic text, so they
            // are compared ordinally instead of with the current culture.
            if (!string.Equals(attributeName, @"style", StringComparison.OrdinalIgnoreCase) ||
                attributeValue == null ||
                attributeValue.Trim().Length == 0)
            {
                return result;
            }

            MatchCollection matches = Regex.Matches(
                attributeValue,
                @"url\s*\(\s*([^\)\s]+)\s*\)",
                RegexOptions.Singleline | RegexOptions.IgnoreCase);

            foreach (Match match in matches)
            {
                if (match == null || !match.Success)
                {
                    continue;
                }

                // CSS allows url('x') and url("x"); the pattern captures the
                // quotes as well, so strip them to get the actual URL.
                string url = match.Groups[1].Value.Trim('\'', '"');

                // Skip empty or malformed URLs instead of letting one bad
                // value abort the extraction with a UriFormatException.
                Uri parsedUri;
                if (url.Length == 0 ||
                    !Uri.TryCreate(url, UriKind.RelativeOrAbsolute, out parsedUri))
                {
                    continue;
                }

                UriResourceInformation ui =
                    new UriResourceInformation(
                        _settings.Options,
                        url,
                        parsedUri,
                        baseUri,
                        UriType.Resource);

                bool isOnSameSite = ui.IsOnSameSite(baseUri);

                if ((isOnSameSite || !_settings.Options.StayOnSite) &&
                    ui.IsProcessableUri)
                {
                    result.Add(ui);
                }
            }

            return result;
        }
Example #8
0
		/// <summary>
		/// Stores a binary resource to its local file below the configured
		/// destination folder.
		/// </summary>
		/// <param name="binaryContent">
		/// The bytes to write. When null or empty, no file is written (an
		/// already existing file is still deleted).
		/// </param>
		/// <param name="uriInfo">Describes the resource being stored.</param>
		/// <returns>
		/// The information about the stored data. It is returned even when
		/// an I/O or access error occurred; such errors are only traced.
		/// </returns>
		public DownloadedResourceInformation StoreBinary(
			byte[] binaryContent,
			UriResourceInformation uriInfo )
		{
			DownloadedResourceInformation result =
				new DownloadedResourceInformation(
				uriInfo,
				_settings.Options.DestinationFolderPath );

			try
			{
				// Remove any earlier copy; OpenWrite() below does not
				// truncate an existing file.
				if ( result.LocalFilePath.Exists )
				{
					result.LocalFilePath.Delete();
				}

				if ( binaryContent != null && binaryContent.Length > 0 )
				{
					// Without the target folder OpenWrite() throws a
					// DirectoryNotFoundException, which the catch below
					// would swallow, silently losing the file.
					if ( !result.LocalFilePath.Directory.Exists )
					{
						result.LocalFilePath.Directory.Create();
					}

					Trace.WriteLine(
						string.Format(
						@"Writing binary content to file '{0}'.",
						result.LocalFilePath ) );

					using ( FileStream s = result.LocalFilePath.OpenWrite() )
					{
						s.Write( binaryContent, 0, binaryContent.Length );
					}
				}
			}
			catch ( IOException x )
			{
				// Storing is best effort: trace and carry on.
				Trace.WriteLine(
					string.Format(
					@"Ignoring IO exception while storing binary file: '{0}'.",
					x.Message ) );
			}
			catch ( UnauthorizedAccessException x )
			{
				Trace.WriteLine(
					string.Format(
					@"Ignoring exception while storing binary file: '{0}'.",
					x.Message ) );
			}

			return result;
		}
Example #9
0
        /// <summary>
        /// Stores a binary resource to its local file below the configured
        /// destination folder.
        /// </summary>
        /// <param name="binaryContent">
        /// The bytes to write. When null or empty, no file is written (an
        /// already existing file is still deleted).
        /// </param>
        /// <param name="uriInfo">Describes the resource being stored.</param>
        /// <returns>
        /// The information about the stored data. It is returned even when
        /// an I/O or access error occurred; such errors are only traced.
        /// </returns>
        public DownloadedResourceInformation StoreBinary(
            byte[] binaryContent,
            UriResourceInformation uriInfo)
        {
            DownloadedResourceInformation result =
                new DownloadedResourceInformation(
                    uriInfo,
                    _settings.Options.DestinationFolderPath);

            try
            {
                // Remove any earlier copy; OpenWrite() below does not
                // truncate an existing file.
                if (result.LocalFilePath.Exists)
                {
                    result.LocalFilePath.Delete();
                }

                if (binaryContent != null && binaryContent.Length > 0)
                {
                    // Without the target folder OpenWrite() throws a
                    // DirectoryNotFoundException, which the catch below
                    // would swallow, silently losing the file.
                    if (!result.LocalFilePath.Directory.Exists)
                    {
                        result.LocalFilePath.Directory.Create();
                    }

                    Trace.WriteLine(
                        string.Format(
                            @"Writing binary content to file '{0}'.",
                            result.LocalFilePath));

                    using (FileStream s = result.LocalFilePath.OpenWrite())
                    {
                        s.Write(binaryContent, 0, binaryContent.Length);
                    }
                }
            }
            catch (IOException x)
            {
                // Storing is best effort: trace and carry on.
                Trace.WriteLine(
                    string.Format(
                        @"Ignoring IO exception while storing binary file: '{0}'.",
                        x.Message));
            }
            catch (UnauthorizedAccessException x)
            {
                Trace.WriteLine(
                    string.Format(
                        @"Ignoring exception while storing binary file: '{0}'.",
                        x.Message));
            }

            return result;
        }
        /// <summary>
        /// Creates the information object from existing URI information
        /// and derives the local file location inside the base folder.
        /// </summary>
        /// <param name="copyFrom">
        /// The URI information passed on to the base constructor; its
        /// absolute URI, base URI and link type feed the local file name.
        /// </param>
        /// <param name="baseFolderPath">The folder the local file path is combined with.</param>
        public DownloadedResourceInformation(UriResourceInformation copyFrom, DirectoryInfo baseFolderPath)
            : base(copyFrom)
        {
            _localBaseFolderPath = baseFolderPath;

            // Full local path = base folder + generated local file name.
            string baseFolder = baseFolderPath.FullName;
            string generatedName = MakeLocalFileName(
                copyFrom.AbsoluteUri,
                copyFrom.BaseUri,
                copyFrom.LinkType);
            string fullPath = Path.Combine(baseFolder, generatedName);

            _localFilePath = new FileInfo(fullPath);

            // Keep the bare file name (without folder) separately.
            _localFileName = new FileInfo(_localFilePath.Name);
        }
Example #11
0
        // ------------------------------------------------------------------
        #endregion

        #region Private methods.
        // ------------------------------------------------------------------

        /// <summary>
        /// Reads the given XML reader to its end and collects the links
        /// found in element attributes, in <c>style</c> attributes and,
        /// recursively, inside comments.
        /// </summary>
        /// <param name="xml">The reader to consume.</param>
        /// <param name="uriInfo">
        /// The URI information of the document being read; supplies the
        /// base URIs for the created links and the same-site check.
        /// </param>
        /// <returns>
        /// The processable links found, restricted to the same site when
        /// the StayOnSite option is set. Never null.
        /// </returns>
        private List<UriResourceInformation> DoExtractLinks(
            XmlReader xml,
            UriResourceInformation uriInfo)
        {
            List<UriResourceInformation> links =
                new List<UriResourceInformation>();

            while (xml.Read())
            {
                switch (xml.NodeType)
                {
                    // Added 2006-03-27: Inside comments, too.
                    case XmlNodeType.Comment:
                        // The nested reader is only needed for this one
                        // comment, so release it right afterwards.
                        using (XmlReader childXml =
                            GetDocReader(xml.Value, uriInfo.BaseUri))
                        {
                            links.AddRange(DoExtractLinks(childXml, uriInfo));
                        }
                        break;

                    // A node element.
                    case XmlNodeType.Element:
                        string[] linkAttributeNames;
                        UriType linkType;

                        // Must be asked before moving to the attributes,
                        // while xml.Name is still the element name.
                        bool isLinkElement = IsLinkElement(
                            xml.Name,
                            out linkAttributeNames,
                            out linkType);

                        while (xml.MoveToNextAttribute())
                        {
                            // Style attributes are inspected on every element.
                            links.AddRange(
                                ExtractStyleUrls(
                                    uriInfo.BaseUriWithFolder,
                                    xml.Name,
                                    xml.Value));

                            if (!isLinkElement)
                            {
                                continue;
                            }

                            // For link elements, store the URLs to modify.
                            foreach (string a in linkAttributeNames)
                            {
                                // Attribute names are identifiers: compare
                                // ordinally, not with the current culture.
                                if (!string.Equals(a, xml.Name, StringComparison.OrdinalIgnoreCase))
                                {
                                    continue;
                                }

                                string url = xml.Value;

                                // Skip malformed URLs instead of aborting
                                // the whole document with a
                                // UriFormatException.
                                Uri parsedUri;
                                if (!Uri.TryCreate(url, UriKind.RelativeOrAbsolute, out parsedUri))
                                {
                                    continue;
                                }

                                UriResourceInformation ui =
                                    new UriResourceInformation(
                                        _settings.Options,
                                        url,
                                        parsedUri,
                                        uriInfo.BaseUriWithFolder,
                                        linkType);

                                bool isOnSameSite =
                                    ui.IsOnSameSite(uriInfo.BaseUri);

                                if ((isOnSameSite ||
                                     !_settings.Options.StayOnSite) &&
                                    ui.IsProcessableUri)
                                {
                                    links.Add(ui);
                                }
                            }
                        }
                        break;
                }
            }

            return links;
        }
Example #12
0
		/// <summary>
		/// Writes the text of an HTML resource to its local file below the
		/// configured destination folder. Hyperlinks inside the text are
		/// not replaced.
		/// </summary>
		/// <param name="textContent">The text to write.</param>
		/// <param name="encoding">The encoding used by the writer.</param>
		/// <param name="uriInfo">Describes the resource being stored.</param>
		/// <returns>
		/// The information about the stored data. It is returned even when
		/// an I/O or access error occurred; such errors are only traced.
		/// </returns>
		public DownloadedResourceInformation StoreHtml(
			string textContent,
			Encoding encoding,
			UriResourceInformation uriInfo )
		{
			DownloadedResourceInformation stored =
				new DownloadedResourceInformation(
				uriInfo,
				_settings.Options.DestinationFolderPath );

			try
			{
				// Remove any earlier copy of the file.
				if ( stored.LocalFilePath.Exists )
				{
					stored.LocalFilePath.Delete();
				}

				// The target folder must exist before the file is created.
				if ( !stored.LocalFilePath.Directory.Exists )
				{
					stored.LocalFilePath.Directory.Create();
				}

				Trace.WriteLine(
					string.Format(
					@"Writing text content to file '{0}'.",
					stored.LocalFilePath ) );

				using ( FileStream stream = new FileStream(
					stored.LocalFilePath.FullName,
					FileMode.Create,
					FileAccess.Write ) )
				{
					using ( StreamWriter writer = new StreamWriter( stream, encoding ) )
					{
						writer.Write( textContent );
					}
				}
			}
			catch ( IOException ioError )
			{
				// Storing is best effort: trace and carry on.
				Trace.WriteLine(
					string.Format(
					@"Ignoring IO exception while storing HTML file: '{0}'.",
					ioError.Message ) );
			}
			catch ( UnauthorizedAccessException accessError )
			{
				Trace.WriteLine(
					string.Format(
					@"Ignoring exception while storing HTML file: '{0}'.",
					accessError.Message ) );
			}

			return stored;
		}
Example #13
0
        /// <summary>
        /// Replaces the URIs inside a previously downloaded HTML document
        /// with references to the local files below "Resources\".
        /// </summary>
        /// <param name="textContent">The HTML text to process.</param>
        /// <param name="uriInfo">The URI information of the document.</param>
        /// <returns>Returns the content text with the replaced links.</returns>
        public string ReplaceLinks(
            string textContent,
            UriResourceInformation uriInfo)
        {
            ResourceParser parser = new ResourceParser(
                _settings,
                uriInfo,
                textContent );

            List<UriResourceInformation> linkInfos =
                parser.ExtractLinks();

            // For remembering duplicates.
            Dictionary<string, string> replacedLinks =
                new Dictionary<string, string>();

            foreach ( UriResourceInformation linkInfo in linkInfos )
            {
                if ( !linkInfo.WantFollowUri && !linkInfo.IsResourceUri )
                {
                    continue;
                }

                DownloadedResourceInformation dlInfo =
                    new DownloadedResourceInformation(
                    linkInfo,
                    _settings.Options.DestinationFolderPath );

                if ( string.IsNullOrEmpty( linkInfo.OriginalUrl ) )
                {
                    continue;
                }

                string textContentBefore = textContent;

                string link =
                    Regex.Escape( linkInfo.OriginalUrl );

                // In a Regex replacement pattern '$' introduces a
                // substitution ($_, $&, $0, ...). Double it so a file name
                // containing '$' is inserted literally.
                string localName =
                    dlInfo.LocalFileName.ToString().Replace( @"$", @"$$" );

                // Double-quoted links.
                textContent = Regex.Replace(
                    textContent,
                    string.Format( @"""{0}""", link ),
                    string.Format( @"""Resources\{0}""", localName ),
                    RegexOptions.IgnoreCase | RegexOptions.Multiline );

                // Single-quoted links.
                textContent = Regex.Replace(
                    textContent,
                    string.Format( @"'{0}'", link ),
                    string.Format( @"'Resources\{0}'", localName ),
                    RegexOptions.IgnoreCase | RegexOptions.Multiline );

                // For style-"url(...)"-links.
                textContent = Regex.Replace(
                    textContent,
                    string.Format( @"\(\s*{0}\s*\)", link ),
                    string.Format( @"(Resources\{0})", localName ),
                    RegexOptions.IgnoreCase | RegexOptions.Multiline );

                // Some checking.
                // 2007-07-27, Uwe Keim.
                // A link counts as "not replaced" when it differs from the
                // local name, the text did not change and it was not
                // handled before. No action is taken in that case; all
                // other links are remembered.
                string absolutePath = linkInfo.AbsoluteUri.AbsolutePath;

                bool notReplaced =
                    linkInfo.OriginalUrl != dlInfo.LocalFileName.Name &&
                    textContentBefore == textContent &&
                    !replacedLinks.ContainsKey( absolutePath );

                if ( !notReplaced )
                {
                    // Remember.
                    replacedLinks[absolutePath] = absolutePath;
                }
            }

            return textContent;
        }
		/// <summary>
		/// Replaces the URIs inside a previously downloaded HTML document
		/// with the names of the local files.
		/// </summary>
		/// <param name="textContent">The HTML text to process.</param>
		/// <param name="uriInfo">The URI information of the document.</param>
		/// <returns>
		/// Returns the content text with the replaced links. When an
		/// exception occurs, its message is written to the console and the
		/// text as replaced so far is returned.
		/// </returns>
        public string ReplaceLinks(
            string textContent,
            UriResourceInformation uriInfo)
        {
            try
            {
                ResourceParser parser = new ResourceParser(
                    _settings,
                    uriInfo,
                    textContent);

                List<UriResourceInformation> linkInfos =
                    parser.ExtractLinks();

                // For remembering duplicates.
                Dictionary<string, string> replacedLinks =
                    new Dictionary<string, string>();

                foreach (UriResourceInformation linkInfo in linkInfos)
                {
                    if (!linkInfo.WantFollowUri && !linkInfo.IsResourceUri)
                    {
                        continue;
                    }

                    DownloadedResourceInformation dlInfo =
                        new DownloadedResourceInformation(
                        linkInfo,
                        _settings.Options.DestinationFolderPath);

                    if (string.IsNullOrEmpty(linkInfo.OriginalUrl))
                    {
                        continue;
                    }

                    string textContentBefore = textContent;

                    string link =
                        Regex.Escape(linkInfo.OriginalUrl);

                    // In a Regex replacement pattern '$' introduces a
                    // substitution ($_, $&, $0, ...). Double it so a file
                    // name containing '$' is inserted literally.
                    string localName =
                        dlInfo.LocalFileName.ToString().Replace(@"$", @"$$");

                    // Double-quoted links.
                    textContent = Regex.Replace(
                        textContent,
                        string.Format(@"""{0}""", link),
                        string.Format(@"""{0}""", localName),
                        RegexOptions.IgnoreCase | RegexOptions.Multiline);

                    // Single-quoted links.
                    textContent = Regex.Replace(
                        textContent,
                        string.Format(@"'{0}'", link),
                        string.Format(@"'{0}'", localName),
                        RegexOptions.IgnoreCase | RegexOptions.Multiline);

                    // For style-"url(...)"-links.
                    textContent = Regex.Replace(
                        textContent,
                        string.Format(@"\(\s*{0}\s*\)", link),
                        string.Format(@"({0})", localName),
                        RegexOptions.IgnoreCase | RegexOptions.Multiline);

                    // Some checking.
                    // 2007-07-27, Uwe Keim.
                    // A link counts as "not replaced" when it differs from
                    // the local name, the text did not change and it was
                    // not handled before. The exception once thrown for
                    // that case is disabled, so no action is taken; all
                    // other links are remembered.
                    string absolutePath = linkInfo.AbsoluteUri.AbsolutePath;

                    bool notReplaced =
                        linkInfo.OriginalUrl != dlInfo.LocalFileName.Name &&
                        textContentBefore == textContent &&
                        !replacedLinks.ContainsKey(absolutePath);

                    if (!notReplaced)
                    {
                        // Remember.
                        replacedLinks[absolutePath] = absolutePath;
                    }
                }
            }
            catch (Exception ex)
            {
                // Deliberately best effort: report the problem and return
                // whatever has been replaced so far.
                Console.WriteLine(ex.Message);
            }

            return textContent;
        }
Example #15
0
        /// <summary>
        /// Replaces the URIs inside a previously downloaded HTML document
        /// with references to the local files below "Resources\".
        /// </summary>
        /// <param name="textContent">The HTML text to process.</param>
        /// <param name="uriInfo">The URI information of the document.</param>
        /// <returns>Returns the content text with the replaced links.</returns>
        public string ReplaceLinks(
            string textContent,
            UriResourceInformation uriInfo)
        {
            ResourceParser parser = new ResourceParser(
                _settings,
                uriInfo,
                textContent);

            List<UriResourceInformation> linkInfos =
                parser.ExtractLinks();

            // For remembering duplicates.
            Dictionary<string, string> replacedLinks =
                new Dictionary<string, string>();

            foreach (UriResourceInformation linkInfo in linkInfos)
            {
                if (!linkInfo.WantFollowUri && !linkInfo.IsResourceUri)
                {
                    continue;
                }

                DownloadedResourceInformation dlInfo =
                    new DownloadedResourceInformation(
                        linkInfo,
                        _settings.Options.DestinationFolderPath);

                if (string.IsNullOrEmpty(linkInfo.OriginalUrl))
                {
                    continue;
                }

                string textContentBefore = textContent;

                string link =
                    Regex.Escape(linkInfo.OriginalUrl);

                // In a Regex replacement pattern '$' introduces a
                // substitution ($_, $&, $0, ...). Double it so a file name
                // containing '$' is inserted literally.
                string localName =
                    dlInfo.LocalFileName.ToString().Replace(@"$", @"$$");

                // Double-quoted links.
                textContent = Regex.Replace(
                    textContent,
                    string.Format(@"""{0}""", link),
                    string.Format(@"""Resources\{0}""", localName),
                    RegexOptions.IgnoreCase | RegexOptions.Multiline);

                // Single-quoted links.
                textContent = Regex.Replace(
                    textContent,
                    string.Format(@"'{0}'", link),
                    string.Format(@"'Resources\{0}'", localName),
                    RegexOptions.IgnoreCase | RegexOptions.Multiline);

                // For style-"url(...)"-links.
                textContent = Regex.Replace(
                    textContent,
                    string.Format(@"\(\s*{0}\s*\)", link),
                    string.Format(@"(Resources\{0})", localName),
                    RegexOptions.IgnoreCase | RegexOptions.Multiline);

                // Some checking.
                // 2007-07-27, Uwe Keim.
                // A link counts as "not replaced" when it differs from the
                // local name, the text did not change and it was not
                // handled before. No action is taken in that case; all
                // other links are remembered.
                string absolutePath = linkInfo.AbsoluteUri.AbsolutePath;

                bool notReplaced =
                    linkInfo.OriginalUrl != dlInfo.LocalFileName.Name &&
                    textContentBefore == textContent &&
                    !replacedLinks.ContainsKey(absolutePath);

                if (!notReplaced)
                {
                    // Remember.
                    replacedLinks[absolutePath] = absolutePath;
                }
            }

            return textContent;
        }
Example #16
0
		// ------------------------------------------------------------------
		#endregion

		#region Private methods.
		// ------------------------------------------------------------------

		/// <summary>
		/// Reads the given XML reader to its end and collects the links
		/// found in element attributes, in <c>style</c> attributes and,
		/// recursively, inside comments.
		/// </summary>
		/// <param name="xml">The reader to consume.</param>
		/// <param name="uriInfo">
		/// The URI information of the document being read; supplies the
		/// base URIs for the created links and the same-site check.
		/// </param>
		/// <returns>
		/// The processable links found, restricted to the same site when
		/// the StayOnSite option is set. Never null.
		/// </returns>
		private List<UriResourceInformation> DoExtractLinks(
			XmlReader xml,
			UriResourceInformation uriInfo )
		{
			List<UriResourceInformation> links =
				new List<UriResourceInformation>();

			while ( xml.Read() )
			{
				switch ( xml.NodeType )
				{
					// Added 2006-03-27: Inside comments, too.
					case XmlNodeType.Comment:
						// The nested reader is only needed for this one
						// comment, so release it right afterwards.
						using ( XmlReader childXml =
							GetDocReader( xml.Value, uriInfo.BaseUri ) )
						{
							links.AddRange( DoExtractLinks( childXml, uriInfo ) );
						}
						break;

					// A node element.
					case XmlNodeType.Element:
						string[] linkAttributeNames;
						UriType linkType;

						// Must be asked before moving to the attributes,
						// while xml.Name is still the element name.
						bool isLinkElement = IsLinkElement(
							xml.Name,
							out linkAttributeNames,
							out linkType );

						while ( xml.MoveToNextAttribute() )
						{
							// Style attributes are inspected on every element.
							links.AddRange(
								ExtractStyleUrls(
								uriInfo.BaseUriWithFolder,
								xml.Name,
								xml.Value ) );

							if ( !isLinkElement )
							{
								continue;
							}

							// For link elements, store the URLs to modify.
							foreach ( string a in linkAttributeNames )
							{
								// Attribute names are identifiers: compare
								// ordinally, not with the current culture.
								if ( !string.Equals( a, xml.Name, StringComparison.OrdinalIgnoreCase ) )
								{
									continue;
								}

								string url = xml.Value;

								// Skip malformed URLs instead of aborting
								// the whole document with a
								// UriFormatException.
								Uri parsedUri;
								if ( !Uri.TryCreate( url, UriKind.RelativeOrAbsolute, out parsedUri ) )
								{
									continue;
								}

								UriResourceInformation ui =
									new UriResourceInformation(
									_settings.Options,
									url,
									parsedUri,
									uriInfo.BaseUriWithFolder,
									linkType );

								bool isOnSameSite =
									ui.IsOnSameSite( uriInfo.BaseUri );

								if ( (isOnSameSite ||
									!_settings.Options.StayOnSite) &&
									ui.IsProcessableUri )
								{
									links.Add( ui );
								}
							}
						}
						break;
				}
			}

			return links;
		}
Example #17
0
		/// <summary>
		/// Detects <c>url(...)</c> references inside the value of a
		/// <c>style</c> attribute.
		/// </summary>
		/// <param name="baseUri">
		/// The base URI handed to each created link and used for the
		/// same-site check.
		/// </param>
		/// <param name="attributeName">
		/// Name of the attribute. Anything other than "style" (compared
		/// ignoring case) yields an empty result.
		/// </param>
		/// <param name="attributeValue">The attribute value; may be null or blank.</param>
		/// <returns>
		/// The processable links found, restricted to the same site when
		/// the StayOnSite option is set. Never null.
		/// </returns>
		private List<UriResourceInformation> ExtractStyleUrls(
			Uri baseUri,
			string attributeName,
			string attributeValue )
		{
			List<UriResourceInformation> result =
				new List<UriResourceInformation>();

			// Attribute names are identifiers, not linguistic text, so they
			// are compared ordinally instead of with the current culture.
			if ( !string.Equals( attributeName, @"style", StringComparison.OrdinalIgnoreCase ) ||
				attributeValue == null ||
				attributeValue.Trim().Length == 0 )
			{
				return result;
			}

			MatchCollection matches = Regex.Matches(
				attributeValue,
				@"url\s*\(\s*([^\)\s]+)\s*\)",
				RegexOptions.Singleline | RegexOptions.IgnoreCase );

			foreach ( Match match in matches )
			{
				if ( match == null || !match.Success )
				{
					continue;
				}

				// CSS allows url('x') and url("x"); the pattern captures the
				// quotes as well, so strip them to get the actual URL.
				string url = match.Groups[1].Value.Trim( '\'', '"' );

				// Skip empty or malformed URLs instead of letting one bad
				// value abort the extraction with a UriFormatException.
				Uri parsedUri;
				if ( url.Length == 0 ||
					!Uri.TryCreate( url, UriKind.RelativeOrAbsolute, out parsedUri ) )
				{
					continue;
				}

				UriResourceInformation ui =
					new UriResourceInformation(
					_settings.Options,
					url,
					parsedUri,
					baseUri,
					UriType.Resource );

				bool isOnSameSite = ui.IsOnSameSite( baseUri );

				if ( (isOnSameSite || !_settings.Options.StayOnSite) &&
					ui.IsProcessableUri )
				{
					result.Add( ui );
				}
			}

			return result;
		}