Exemplo n.º 1
0
        /// <summary>
        /// Fetches the resource at <paramref name="absoluteUri"/> and decodes it
        /// as text, handing back both the raw bytes and the decoded string.
        /// </summary>
        /// <param name="absoluteUri">The absolute URI to download.</param>
        /// <param name="textContent">Receives the decoded text with trailing NUL characters
        /// and surrounding whitespace removed; <c>null</c> when no bytes were downloaded.</param>
        /// <param name="encodingName">Receives the name returned by <c>DetectEncodingName</c>
        /// (which is called even when nothing was downloaded).</param>
        /// <param name="encoding">Receives the encoding used for decoding, or
        /// <see cref="Encoding.Default"/> when no bytes were downloaded.</param>
        /// <param name="binaryContent">Receives the bytes produced by <c>DownloadBinary</c>.</param>
        /// <param name="options">The downloader options, passed through to <c>DownloadBinary</c>.</param>
        public static void DownloadHtml(
            Uri absoluteUri,
            out string textContent,
            out string encodingName,
            out Encoding encoding,
            out byte[] binaryContent,
            WebSiteDownloaderOptions options)
        {
            DownloadBinary(absoluteUri, out binaryContent, options);
            encodingName = DetectEncodingName(binaryContent);

            Debug.WriteLine(
                string.Format(
                    @"Detected encoding '{0}' for remote HTML document from URL '{1}'.",
                    encodingName,
                    absoluteUri));

            bool nothingDownloaded = binaryContent == null || binaryContent.Length == 0;
            if (nothingDownloaded)
            {
                // Nothing to decode: report the platform default encoding and no text.
                encoding = Encoding.Default;
                textContent = null;
                return;
            }

            encoding = GetEncodingByName(encodingName);

            // Strip trailing NUL characters first, then ordinary surrounding whitespace.
            string decoded = encoding.GetString(binaryContent);
            textContent = decoded.TrimEnd('\0').Trim();
        }
Exemplo n.º 2
0
        }//End FirstAnalyze

        /// <summary>
        /// Starts an asynchronous download of <paramref name="WebUri"/> into
        /// <paramref name="DirPath"/> and blocks, polling every 100 ms, until the shared
        /// <c>finished</c> flag is set or 200 polls have elapsed.
        /// </summary>
        /// <param name="WebUri">Absolute URL of the site to download.</param>
        /// <param name="DirPath">Destination folder for the downloaded content.</param>
        /// <returns>
        /// The value of the shared <c>finished</c> flag after the polling loop; the loop
        /// only exits once that flag is <c>true</c> (either set elsewhere or set by the timeout).
        /// </returns>
        public static bool FirstAnalyze(String WebUri, String DirPath)
        {
            // Polling budget: give up after maxPolls iterations of pollIntervalMilliseconds each.
            const int maxPolls = 200;
            const int pollIntervalMilliseconds = 100;

            urlfinishedcount = 0;
            finished = false;

            WebSiteDownloaderOptions options =
                new WebSiteDownloaderOptions();

            options.DownloadUri =
               new Uri(WebUri);
            options.DestinationFolderPath =
                new DirectoryInfo(DirPath);

            WebSiteDownloader downloader =
                new WebSiteDownloader(options);

            downloader.ProcessingUrl +=
                new WebSiteDownloader.ProcessingUrlEventHandler(
                downloader_ProcessingUrl);

            downloader.ProcessCompleted +=
                new WebSiteDownloader.ProcessCompletedEventHandler(
                downloader_ProcessCompleted);

            downloader.ProcessAsync();

            while (true)
            {
                Thread.Sleep(pollIntervalMilliseconds);
                Console.WriteLine(@".");

                // The timeout bookkeeping is done under the same lock as the read of
                // 'finished', so the flag is never written here while unguarded.
                // NOTE(review): presumably the completion handler sets 'finished' under
                // this same lock object - confirm before changing what is locked on.
                lock (typeof(ScrapperDemoStart))
                {
                    urlfinishedcount += 1;

                    if (urlfinishedcount == maxPolls)
                    {
                        // Timed out: reset the counter and force the loop to end.
                        urlfinishedcount = 0;
                        finished = true;
                    }

                    if (finished)
                    {
                        break;
                    }
                }
            }

            Console.WriteLine(@"finished.");

            return finished;
        }//End FirstAnalyze
Exemplo n.º 3
0
        /// <summary>
        /// Restores the spider settings for the destination folder named in
        /// <paramref name="options"/> and attaches the options to them.
        /// </summary>
        /// <param name="options">The downloader options; must not be <c>null</c>.</param>
        /// <exception cref="System.ArgumentNullException">
        /// <paramref name="options"/> is <c>null</c>.
        /// </exception>
        public void SetOptions(WebSiteDownloaderOptions options)
        {
            // Fail with a descriptive exception instead of a NullReferenceException
            // raised while dereferencing the options below.
            if (options == null)
            {
                throw new System.ArgumentNullException("options");
            }

            Trace.WriteLine(
                string.Format(
                    @"Constructing WebSiteDownloader for URI '{0}', destination folder path '{1}'.",
                    options.DownloadUri,
                    options.DestinationFolderPath));

            _settings = SpiderSettings.Restore(options.DestinationFolderPath);

            _settings.Options = options;
        }
Exemplo n.º 4
0
		// ------------------------------------------------------------------

		/// <summary>
		/// Initializes a new instance of the <see cref="WebSiteDownloader"/>
		/// class: restores the spider settings for the destination folder named in
		/// <paramref name="options"/> and attaches the options to them.
		/// </summary>
		/// <param name="options">The downloader options; must not be <c>null</c>.</param>
		/// <exception cref="System.ArgumentNullException">
		/// <paramref name="options"/> is <c>null</c>.
		/// </exception>
		public WebSiteDownloader(
			WebSiteDownloaderOptions options )
		{
			// Fail with a descriptive exception instead of a NullReferenceException
			// raised while dereferencing the options below.
			if ( options == null )
			{
				throw new System.ArgumentNullException( "options" );
			}

			Trace.WriteLine(
				string.Format(
					@"Constructing WebSiteDownloader for URI '{0}', destination folder path '{1}'.",
					options.DownloadUri,
					options.DestinationFolderPath ) );

			_settings = SpiderSettings.Restore( options.DestinationFolderPath );

			_settings.Options = options;
		}
 /// <summary>
 /// Creates the resource information for a downloaded item, forwarding the URI
 /// details to the base class and remembering the local folders.
 /// </summary>
 /// <param name="options">The downloader options (forwarded to the base constructor).</param>
 /// <param name="originalUrl">The original URL (forwarded to the base constructor).</param>
 /// <param name="uri">The URI (forwarded to the base constructor).</param>
 /// <param name="baseUri">The base URI (forwarded to the base constructor).</param>
 /// <param name="folderPath">The local folder path stored for this resource.</param>
 /// <param name="baseFolderPath">The local base folder path stored for this resource.</param>
 /// <param name="linkType">Type of the link (forwarded to the base constructor).</param>
 public DownloadedResourceInformation(
     WebSiteDownloaderOptions options,
     string originalUrl,
     Uri uri,
     Uri baseUri,
     DirectoryInfo folderPath,
     DirectoryInfo baseFolderPath,
     UriType linkType)
     : base(options, originalUrl, uri, baseUri, linkType)
 {
     // Only the two local folder locations are kept here; everything else lives in the base.
     _localBaseFolderPath = baseFolderPath;
     _localFolderPath = folderPath;
 }
Exemplo n.º 6
0
        /// <summary>
        /// Starts an asynchronous download of <paramref name="url"/> into
        /// <paramref name="folderpath"/> and blocks, polling once per second,
        /// until the shared <c>finished</c> flag is observed as set.
        /// </summary>
        /// <param name="url">Absolute URL of the site to download.</param>
        /// <param name="folderpath">Destination folder for the downloaded content.</param>
        /// <returns>The value of the shared <c>finished</c> flag after the polling loop.</returns>
        public static bool RunTheFirstAnalyze(string url, string folderpath)
        {
            WebSiteDownloaderOptions settings = new WebSiteDownloaderOptions
            {
                DownloadUri = new Uri(url),
                DestinationFolderPath = new DirectoryInfo(folderpath)
            };

            WebSiteDownloader siteDownloader = new WebSiteDownloader(settings);
            siteDownloader.ProcessingUrl += downloader_ProcessingUrl;
            siteDownloader.ProcessCompleted += downloader_ProcessCompleted;
            siteDownloader.ProcessAsync();

            // Print a progress dot every second until the flag is seen set under the lock.
            bool done;
            do
            {
                Thread.Sleep(1000);
                Console.WriteLine(@".");

                lock (typeof(Program))
                {
                    done = finished;
                }
            }
            while (!done);

            Console.WriteLine(@"finished.");

            return finished;
        }
Exemplo n.º 7
0
        /// <summary>
        /// Sets the proxy of <paramref name="req"/> according to the proxy usage
        /// selected in <paramref name="options"/>.
        /// </summary>
        /// <param name="req">The request whose <c>Proxy</c> is assigned.</param>
        /// <param name="options">The options supplying <c>ProxyUsage</c> and <c>Proxy</c>.</param>
        private static void ApplyProxy(
            WebRequest req,
            WebSiteDownloaderOptions options)
        {
            var usage = options.ProxyUsage;

            if (usage == DownloadProxyUsage.NoProxy)
            {
                // Explicitly disable any proxy.
                req.Proxy = null;
            }
            else if (usage == DownloadProxyUsage.UseProxy)
            {
                // A custom proxy is expected to have been supplied.
                Debug.Assert(options.Proxy != null);
                req.Proxy = options.Proxy;
            }
            else
            {
                // DownloadProxyUsage.Default and any unrecognized value.
                req.Proxy = WebRequest.DefaultWebProxy;
            }
        }
Exemplo n.º 8
0
        // ------------------------------------------------------------------

        /// <summary>
        /// Initializes a new instance of the
        /// <see cref="UriResourceInformation"/> class, classifying the given URI
        /// as absolute, relative, or fragment-only and storing its absolute and
        /// relative forms accordingly.
        /// </summary>
        /// <param name="options">The options.</param>
        /// <param name="originalUrl">The original URL text as found in the document.</param>
        /// <param name="uri">The URI; its original string is passed through <c>CleanupUrl</c> before use.</param>
        /// <param name="baseUri">The base URI used to resolve non-absolute URIs.</param>
        /// <param name="linkType">Type of the link.</param>
        public UriResourceInformation(
            WebSiteDownloaderOptions options,
            string originalUrl,
            Uri uri,
            Uri baseUri,
            UriType linkType)
        {
            _options     = options;
            _originalUrl = originalUrl;
            _baseUri     = baseUri;

            uri = new Uri(CleanupUrl(uri.OriginalString), UriKind.RelativeOrAbsolute);

            if (Uri.IsWellFormedUriString(uri.OriginalString, UriKind.Absolute))
            {
                // Already absolute: there is no relative form.
                _absoluteUri = uri;
                _relativeUri = null;
            }
            else if (!Uri.IsWellFormedUriString(uri.OriginalString, UriKind.Relative) &&
                     originalUrl.StartsWith(@"#", StringComparison.Ordinal))
            {
                // Neither well-formed absolute nor relative, and the original text is a
                // fragment reference: keep it as-is with no absolute form. The '#' test is
                // ordinal because this is a syntactic check, not a linguistic comparison.
                _absoluteUri = null;
                _relativeUri = new Uri(originalUrl, UriKind.RelativeOrAbsolute);
            }
            else
            {
                // Well-formed relative URIs and everything else are resolved against the base.
                _absoluteUri = MakeAbsoluteUri(baseUri, uri);
                _relativeUri = uri;
            }

            _linkType = linkType;
        }
Exemplo n.º 9
0
		/// <summary>
		/// Demo entry point: downloads a fixed web site into a fixed local folder
		/// and prints a progress dot every second until the shared
		/// <c>finished</c> flag is observed as set.
		/// </summary>
		/// <param name="args">Command-line arguments (not used).</param>
		private static void Main( string[] args )
		{
			WebSiteDownloaderOptions settings = new WebSiteDownloaderOptions();
			settings.DownloadUri = new Uri( @"http://www.cadfolks.com" );
			settings.DestinationFolderPath = new DirectoryInfo( @"F:\demo" );

			WebSiteDownloader siteDownloader = new WebSiteDownloader( settings );
			siteDownloader.ProcessingUrl += downloader_ProcessingUrl;
			siteDownloader.ProcessCompleted += downloader_ProcessCompleted;
			siteDownloader.ProcessAsync();

			// Poll the flag under the lock; the first check happens after the first sleep.
			bool completed = false;
			while ( !completed )
			{
				Thread.Sleep( 1000 );
				Console.WriteLine( @"." );

				lock ( typeof( Program ) )
				{
					completed = finished;
				}
			}

			Console.WriteLine( @"finished." );
		}
Exemplo n.º 10
0
		// ------------------------------------------------------------------

		/// <summary>
		/// Initializes a new instance of the
		/// <see cref="UriResourceInformation"/> class, classifying the given URI
		/// as absolute, relative, or fragment-only and storing its absolute and
		/// relative forms accordingly.
		/// </summary>
		/// <param name="options">The options.</param>
		/// <param name="originalUrl">The original URL text as found in the document.</param>
		/// <param name="uri">The URI; its original string is passed through <c>CleanupUrl</c> before use.</param>
		/// <param name="baseUri">The base URI used to resolve non-absolute URIs.</param>
		/// <param name="linkType">Type of the link.</param>
		public UriResourceInformation(
			WebSiteDownloaderOptions options,
			string originalUrl,
			Uri uri,
			Uri baseUri,
			UriType linkType )
		{
			_options = options;
			_originalUrl = originalUrl;
			_baseUri = baseUri;

			uri = new Uri( CleanupUrl( uri.OriginalString ), UriKind.RelativeOrAbsolute );

			if ( Uri.IsWellFormedUriString( uri.OriginalString, UriKind.Absolute ) )
			{
				// Already absolute: there is no relative form.
				_absoluteUri = uri;
				_relativeUri = null;
			}
			else if ( Uri.IsWellFormedUriString( uri.OriginalString, UriKind.Relative ) ||
				!originalUrl.StartsWith( @"#", StringComparison.Ordinal ) )
			{
				// Well-formed relative URIs, and malformed ones that are not fragment
				// references, are resolved against the base URI. The '#' test is ordinal
				// because this is a syntactic check, not a linguistic comparison.
				_absoluteUri = MakeAbsoluteUri( baseUri, uri );
				_relativeUri = uri;
			}
			else
			{
				// Fragment-only reference: keep the original text, no absolute form.
				_absoluteUri = null;
				_relativeUri = new Uri( originalUrl, UriKind.RelativeOrAbsolute );
			}

			_linkType = linkType;
		}
        /// <summary>
        /// Runs the spider over the session activity range 2013-02-15 to 2013-02-18
        /// (link depth 3) and collects the resources whose URL contains
        /// <c>get_minutes.asp</c>. Makes no assertions.
        /// </summary>
        //[TestMethod]
        public void Run_Spider_Inflate_Check_Results_For_Minutes()
        {
            DateTime rangeStart = new DateTime(2013, 2, 15);
            DateTime rangeEnd = new DateTime(2013, 2, 18);

            WebSiteDownloaderOptions options = new WebSiteDownloaderOptions();
            options.DestinationFolderPath = new DirectoryInfo(dataDir);
            options.DestinationFileName = String.Format(
                "Session-Activity[{0}][{1}].state",
                rangeStart.Date.ToShortDateString().Replace("/", "-"),
                rangeEnd.Date.ToShortDateString().Replace("/", "-"));
            options.MaximumLinkDepth = 3;
            options.TargetSession = 28;
            options.DownloadUri = new Uri(
                String.Format(
                    @"http://www.legis.state.ak.us/basis/range_multi.asp?session={0}&Date1={1}&Date2={2}",
                    options.TargetSession,
                    rangeStart.Date.ToShortDateString(),
                    rangeEnd.Date.ToShortDateString()));

            WebSiteDownloader rslt = Spider.DownloadingProcessor(options);

            // Resources that point at committee minutes pages.
            var minutes = rslt.Resources
                .Where(m => m.AbsoluteUri.AbsoluteUri.Contains(@"get_minutes.asp"))
                .ToList();

            // Empty placeholder statement kept from the original (handy as a breakpoint anchor).
            string stop = @"";
        }
        /// <summary>
        /// Constructs a downloader for the saved session-activity state file,
        /// filters its parsings to those whose source URL matches the bill-text
        /// page, and asserts that at least one parsing exists.
        /// </summary>
        //[TestMethod]
        public void Run_Spider_On_Activity_Search_Results()
        {
            DateTime rangeStart = new DateTime(2013, 2, 15);
            DateTime rangeEnd = new DateTime(2013, 2, 18);

            WebSiteDownloaderOptions options = new WebSiteDownloaderOptions();
            options.DestinationFolderPath = new DirectoryInfo(dataDir);
            options.DestinationFileName = String.Format(
                "Session-Activity[{0}][{1}].state",
                rangeStart.Date.ToShortDateString().Replace("/", "-"),
                rangeEnd.Date.ToShortDateString().Replace("/", "-"));

            // Download queue engine.
            WebSiteDownloader downloader = new WebSiteDownloader(options);

            List<iCollector> coll = downloader.Parsings
                .Select(p => p)
                .ToList();

            // Pattern matching the bill copy page.
            Regex billTextPattern = new Regex("get[_]bill[_]text[.]asp");

            // Every parsing whose source URL matches the pattern at least once.
            List<iCollector> refined = coll
                .Where(el => billTextPattern.Matches(el.source.AbsoluteUri.AbsoluteUri).Count != 0)
                .ToList();

            // Note: the assertion is on the unfiltered list, as in the original.
            Assert.IsTrue(coll.Count > 0);
        }
Exemplo n.º 13
0
        // ------------------------------------------------------------------

        /// <summary>
        /// Issues an HTTP HEAD request for <paramref name="absoluteUri"/> and
        /// returns the content type of the response, remembering the answer in
        /// <c>_headPool</c> so each URI is requested at most once.
        /// </summary>
        /// <param name="absoluteUri">The absolute URI to query.</param>
        /// <param name="options">The options; used to configure the proxy via <c>ApplyProxy</c>.</param>
        /// <returns>
        /// The content type reported by the response (or the cached value), or
        /// <c>null</c> when the server answered with 404 or 500.
        /// </returns>
        public static string DownloadHead(
            Uri absoluteUri,
            WebSiteDownloaderOptions options)
        {
            try
            {
                // Serve repeated lookups from the pool.
                if (_headPool.ContainsKey(absoluteUri))
                {
                    return _headPool[absoluteUri];
                }

                Debug.WriteLine(
                    string.Format(
                        @"Reading HEAD from URL '{0}'.",
                        absoluteUri));

                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(absoluteUri);
                request.Method = @"HEAD";
                ApplyProxy(request, options);
                request.CachePolicy = new RequestCachePolicy(RequestCacheLevel.BypassCache);

                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    string contentType = response.ContentType;
                    _headPool[absoluteUri] = contentType;
                    return contentType;
                }
            }
            catch (WebException x)
            {
                // Only HTTP-level failures are candidates for being ignored.
                if (x.Status != WebExceptionStatus.ProtocolError)
                {
                    throw;
                }

                HttpWebResponse failed = (HttpWebResponse)x.Response;
                bool ignorable =
                    failed.StatusCode == HttpStatusCode.NotFound ||
                    failed.StatusCode == HttpStatusCode.InternalServerError;

                if (!ignorable)
                {
                    throw;
                }

                Trace.WriteLine(
                    string.Format(
                        @"Ignoring web exception: '{0}'.",
                        x.Message));
                return null;
            }
        }
        /// <summary>
        /// Runs the spider over the session activity range 2013-02-15 to 2013-02-18
        /// with explicit follow rules and collection requests configured, then
        /// asserts that at least one parsing was produced.
        /// </summary>
        //[TestMethod]
        public void Run_Spider_On_Activity_Parsing_Using_iFollower_And_Saving_Using_GitCollector()
        {
            DateTime rangeStart = new DateTime(2013, 2, 15);
            DateTime rangeEnd = new DateTime(2013, 2, 18);

            WebSiteDownloaderOptions options = new WebSiteDownloaderOptions();
            options.DestinationFolderPath = new DirectoryInfo(dataDir);
            options.DestinationFileName = String.Format(
                "Session-Activity[{0}][{1}].state",
                rangeStart.Date.ToShortDateString().Replace("/", "-"),
                rangeEnd.Date.ToShortDateString().Replace("/", "-"));
            options.MaximumLinkDepth = 3;
            options.TargetSession = 28;
            options.DownloadUri = new Uri(
                String.Format(
                    @"http://www.legis.state.ak.us/basis/range_multi.asp?session={0}&Date1={1}&Date2={2}",
                    options.TargetSession,
                    rangeStart.Date.ToShortDateString(),
                    rangeEnd.Date.ToShortDateString()));

            // Which pages to follow, and at which link depth.
            Follow[] followRules =
            {
                new Follow { depth = 1, pattern = new Regex("get_bill.asp") },
                new Follow { depth = 2, pattern = new Regex("get_fulltext.asp") },
                new Follow { depth = 2, pattern = new Regex("get_complete_bill.asp") },
                new Follow { depth = 2, pattern = new Regex("get_minutes.asp") },
                new Follow { depth = 3, pattern = new Regex("get_bill_text.asp") },
                new Follow { depth = 3, pattern = new Regex("get_single_minute.asp[?]ch") }
            };

            foreach (Follow rule in followRules)
            {
                options.UriFollower.Add(rule);
            }

            // Which content to serialize and save off.
            options.GitCollectionRequest.Add(
                new DocumentHistory() { pageName = "get_fulltext.asp", pageType = UriType.Form });
            options.GitCollectionRequest.Add(
                new DocumentActivity() { pageName = "get_complete_bill.asp", pageType = UriType.Form });
            options.GitCollectionRequest.Add(
                new DocumentMeeting() { pageName = "get_minutes.asp", pageType = UriType.Form });
            options.GitCollectionRequest.Add(
                new DocumentCopy() { pageName = "get_bill_text.asp", pageType = UriType.Content });
            options.GitCollectionRequest.Add(
                new DocumentCopy() { pageName = "get_single_minute.asp", pageType = UriType.Content });

            // Specialized content filter: one regular expression per named field.
            var fieldPatterns = new Dictionary<String, Regex>
            {
                { "Bill Name", new Regex(@"(?<=<b>BILL:</b>)[a-z,\s,\w,(,),"",',;,.]{5,800}", RegexOptions.IgnoreCase) },
                { "Title", new Regex(@"(?<=<b>TITLE:</b>)[a-z,\s,\w,(,),"",',\-,;,.]{5,800}", RegexOptions.IgnoreCase) },
                { "Short Title", new Regex(@"(?<=<b>SHORT TITLE:</b>)(.*)(?=</font>)", RegexOptions.IgnoreCase) },
                { "Status Date", new Regex(@"(?<=<b>STATUS DATE:</b>)[0-9,/\,\w\s]{5,50}", RegexOptions.IgnoreCase) },
                { "Current Status", new Regex(@"(?<=<b>CURRENT STATUS:</b>)[a-z,\s,&,\ ,(,)]{2,60}", RegexOptions.IgnoreCase) },
                { "Sponsors", new Regex(@"(?<=<b>SPONSOR[(]S[)]:</b>)[a-z,\s,\w,(,),.,;,""]{5,800}", RegexOptions.IgnoreCase) }
            };

            // Regex matching container for the bill page.
            options.GitCollectionRequest.Add(
                new DocumentKVP() { pageName = "get_bill.asp", pageType = UriType.Content, rvp = fieldPatterns });

            // Re-inflate the indexing.
            WebSiteDownloader rslt = Spider.DownloadingProcessor(options);

            Assert.IsTrue(rslt.Parsings.Count() > 0);
        }
        /// <summary>
        /// Runs the spider over the session activity range 2013-02-15 to 2013-02-18
        /// (link depth 4), creates one folder per matched bill under <c>dataDir</c>,
        /// gathers the parsings associated with each bill and passes them to
        /// <c>GitRepository.ProcessBill</c>.
        /// </summary>
        public void Run_Spider_Inflate_And_Save_Results_To_GitHub()
        {
            DateTime rangeStart = new DateTime(2013, 2, 15);
            DateTime rangeEnd = new DateTime(2013, 2, 18);

            WebSiteDownloaderOptions options = new WebSiteDownloaderOptions();
            options.DestinationFolderPath = new DirectoryInfo(dataDir);
            options.DestinationFileName = String.Format(
                "Session-Activity[{0}][{1}].state",
                rangeStart.Date.ToShortDateString().Replace("/", "-"),
                rangeEnd.Date.ToShortDateString().Replace("/", "-"));

            options.MaximumLinkDepth = 4;
            options.TargetSession = 28;
            options.DownloadUri = new Uri(
                String.Format(
                    @"http://www.legis.state.ak.us/basis/range_multi.asp?session={0}&Date1={1}&Date2={2}",
                    options.TargetSession,
                    rangeStart.Date.ToShortDateString(),
                    rangeEnd.Date.ToShortDateString()));

            WebSiteDownloader rslt = Spider.DownloadingProcessor(options);

            // Step 1 - select bill names.
            // The bill name (e.g. HB16) is needed so data can be saved in a folder of that name.

            // Static list of bill pages found at index 1.
            var billPages = rslt.Resources
                .Where(r => r.Index == 1 && (r.AbsoluteUri.AbsoluteUri.Contains(@"get_bill.asp")))
                .ToList();

            // Matches bill titles in the URI (HB16, SB12, ...).
            Regex billTitles = new Regex(@"(?<=[=])[H|R|S][B|C|R|J]{0,3}[0-9]{1,4}", RegexOptions.IgnoreCase);

            // One entry per bill page with at least one title match; the first match is the name.
            var bills = billPages
                .Select(page => new { page, matches = billTitles.Matches(page.AbsoluteUri.AbsoluteUri) })
                .Where(candidate => candidate.matches.Count > 0)
                .Select(candidate => new
                {
                    resource = candidate.page,
                    url = candidate.page.AbsoluteUri,
                    name = candidate.matches.Cast<Match>().FirstOrDefault()
                })
                .ToList();

            // Step 2 - build out the directory structure for the bill data.

            DirectoryInfo sessionFolder = new DirectoryInfo(String.Format(@"{0}/{1}", dataDir, 28));
            if (!sessionFolder.Exists)
            {
                sessionFolder.Create();
            }

            foreach (var entry in bills)
            {
                // One directory per bill.
                DirectoryInfo billFolder = new DirectoryInfo(String.Format(@"{0}/{1}/{2}", dataDir, 28, entry.name));
                if (!billFolder.Exists)
                {
                    billFolder.Create();
                }
            }

            // Step 3 - grab the associated bill data: name, title, long title,
            // minutes content, bill revisions, bill activity.

            foreach (var bill in bills)
            {
                // Document history, activity and key/value parsings for the bill page or its children.
                List<iCollector> meta = new List<iCollector>(
                    rslt.Parsings.Where(h =>
                        h.source.AbsoluteUri.AbsoluteUri == bill.url.AbsoluteUri
                        || h.source.Parent.AbsoluteUri == bill.url.AbsoluteUri));

                // Bill content: parsings whose parent is the full-text page of this bill.
                List<iCollector> revisions = new List<iCollector>(
                    rslt.Parsings.Where(d =>
                        d.source.Parent.AbsoluteUri
                            .Contains(String.Format(@"get_fulltext.asp?session={0}&bill={1}", 28, bill.name))));

                // Committee: minutes-page parsings whose parent resource mentions this bill.
                List<iCollector> committee = new List<iCollector>(
                    from res in rslt.Resources
                    join parsed in rslt.Parsings
                        on res.AbsoluteUri.AbsoluteUri equals parsed.source.Parent.AbsoluteUri
                    where parsed.source.AbsoluteUri.AbsoluteUri.Contains("get_minutes.asp")
                        && res.AbsoluteUri.AbsoluteUri.Contains(String.Format("{0}", bill.name))
                    select parsed);

                // Minutes transcript: single-minute parsings whose parent resource mentions this bill.
                List<iCollector> minutes = new List<iCollector>(
                    from res in rslt.Resources
                    join parsed in rslt.Parsings
                        on res.AbsoluteUri.AbsoluteUri equals parsed.source.Parent.AbsoluteUri
                    where parsed.source.AbsoluteUri.AbsoluteUri.Contains(@"get_single_minute.asp")
                        && res.AbsoluteUri.AbsoluteUri.Contains(String.Format("{0}", bill.name))
                    select parsed);

                // Step 4 - save off the data for this bill.

                String fileLoc = String.Format(@"{0}\{1}\", 28, bill.name);

                GitRepository gr = new GitRepository();
                gr.ProcessBill(
                    fileLoc,
                    new ParsedBill()
                    {
                        meta = meta,
                        minutes = minutes,
                        revisions = revisions,
                        committee = committee
                    });
            }

            Assert.IsTrue(true);
        }
Exemplo n.º 16
0
        /// <summary>
        /// Downloads the raw bytes of the resource at <paramref name="absoluteUri"/>.
        /// </summary>
        /// <param name="absoluteUri">The absolute URI to download.</param>
        /// <param name="binaryContent">
        /// Receives exactly the bytes read from the response stream, or
        /// <c>null</c> when the server answered with 404 or 500.
        /// </param>
        /// <param name="options">The options; used to configure the proxy via <c>ApplyProxy</c>.</param>
        /// <exception cref="WebException">
        /// Rethrown for every failure other than an HTTP 404/500 protocol error.
        /// </exception>
        public static void DownloadBinary(
            Uri absoluteUri,
            out byte[] binaryContent,
            WebSiteDownloaderOptions options)
        {
            Debug.WriteLine(
                string.Format(
                    @"Reading content from URL '{0}'.",
                    absoluteUri));

            try
            {
                HttpWebRequest req = (HttpWebRequest)WebRequest.Create(absoluteUri);
                ApplyProxy(req, options);

                // Bypass any cache for this request.
                RequestCachePolicy cp = new RequestCachePolicy(
                    RequestCacheLevel.BypassCache);
                req.CachePolicy = cp;

                using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
                using (Stream stream = resp.GetResponseStream())
                using (MemoryStream mem = new MemoryStream())
                {
                    byte[] blockBuffer = new byte[16384];
                    int    read;

                    while ((read = stream.Read(blockBuffer, 0, blockBuffer.Length)) > 0)
                    {
                        mem.Write(blockBuffer, 0, read);
                    }

                    // ToArray copies only the bytes actually written. GetBuffer would
                    // hand out the over-allocated internal buffer, i.e. the content
                    // followed by unused zero bytes.
                    binaryContent = mem.ToArray();
                }
            }
            catch (WebException x)
            {
                if (x.Status == WebExceptionStatus.ProtocolError)
                {
                    HttpWebResponse resp =
                        (HttpWebResponse)x.Response;

                    // Missing pages and server errors are tolerated; a missing
                    // response object falls through to the rethrow.
                    if (resp != null &&
                        (resp.StatusCode == HttpStatusCode.NotFound ||
                         resp.StatusCode == HttpStatusCode.InternalServerError))
                    {
                        Trace.WriteLine(
                            string.Format(
                                @"Ignoring web exception: '{0}'.",
                                x.Message));
                        binaryContent = null;
                    }
                    else
                    {
                        throw;
                    }
                }
                else
                {
                    throw;
                }
            }
        }
Exemplo n.º 17
0
		/// <summary>
		/// Sets the proxy of <paramref name="req"/> according to the proxy usage
		/// selected in <paramref name="options"/>.
		/// </summary>
		/// <param name="req">The request whose <c>Proxy</c> is assigned.</param>
		/// <param name="options">The options supplying <c>ProxyUsage</c> and <c>Proxy</c>.</param>
		private static void ApplyProxy(
			WebRequest req,
			WebSiteDownloaderOptions options )
		{
			IWebProxy chosen;

			switch ( options.ProxyUsage )
			{
				case DownloadProxyUsage.NoProxy:
					// Explicitly disable any proxy.
					chosen = null;
					break;

				case DownloadProxyUsage.UseProxy:
					// A custom proxy is expected to have been supplied.
					Debug.Assert( options.Proxy != null );
					chosen = options.Proxy;
					break;

				default:
					// DownloadProxyUsage.Default and any unrecognized value.
					chosen = WebRequest.DefaultWebProxy;
					break;
			}

			req.Proxy = chosen;
		}
Exemplo n.º 18
0
		/// <summary>
		/// Downloads the raw bytes of the resource at <paramref name="absoluteUri"/>.
		/// </summary>
		/// <param name="absoluteUri">The absolute URI to download.</param>
		/// <param name="binaryContent">
		/// Receives exactly the bytes read from the response stream, or
		/// <c>null</c> when the server answered with 404 or 500.
		/// </param>
		/// <param name="options">The options; used to configure the proxy via <c>ApplyProxy</c>.</param>
		/// <exception cref="WebException">
		/// Rethrown for every failure other than an HTTP 404/500 protocol error.
		/// </exception>
		public static void DownloadBinary(
			Uri absoluteUri,
			out byte[] binaryContent,
			WebSiteDownloaderOptions options )
		{
			Debug.WriteLine(
				string.Format(
				@"Reading content from URL '{0}'.",
				absoluteUri ) );

			try
			{
				HttpWebRequest req = (HttpWebRequest)WebRequest.Create( absoluteUri );
				ApplyProxy( req, options );

				// Bypass any cache for this request.
				RequestCachePolicy cp = new RequestCachePolicy(
					RequestCacheLevel.BypassCache );
				req.CachePolicy = cp;

				using ( HttpWebResponse resp = (HttpWebResponse)req.GetResponse() )
				using ( Stream stream = resp.GetResponseStream() )
				using ( MemoryStream mem = new MemoryStream() )
				{
					byte[] blockBuffer = new byte[16384];
					int read;

					while ( (read = stream.Read( blockBuffer, 0, blockBuffer.Length )) > 0 )
					{
						mem.Write( blockBuffer, 0, read );
					}

					// ToArray copies only the bytes actually written. GetBuffer would
					// hand out the over-allocated internal buffer, i.e. the content
					// followed by unused zero bytes.
					binaryContent = mem.ToArray();
				}
			}
			catch ( WebException x )
			{
				if ( x.Status == WebExceptionStatus.ProtocolError )
				{
					HttpWebResponse resp =
						(HttpWebResponse)x.Response;

					// Missing pages and server errors are tolerated; a missing
					// response object falls through to the rethrow.
					if ( resp != null &&
						( resp.StatusCode == HttpStatusCode.NotFound ||
						resp.StatusCode == HttpStatusCode.InternalServerError ) )
					{
						Trace.WriteLine(
							string.Format(
							@"Ignoring web exception: '{0}'.",
							x.Message ) );
						binaryContent = null;
					}
					else
					{
						throw;
					}
				}
				else
				{
					throw;
				}
			}
		}
Exemplo n.º 19
0
 // ------------------------------------------------------------------
 /// <summary>
 /// Creates a <see cref="WebSiteDownloader"/> and applies the given
 /// options by delegating to <c>SetOptions</c>.
 /// </summary>
 /// <param name="options">The downloader options to apply.</param>
 public WebSiteDownloader(WebSiteDownloaderOptions options)
 {
     // All initialization lives in SetOptions.
     SetOptions(options);
 }
Exemplo n.º 20
0
        // ------------------------------------------------------------------

        /// <summary>
        /// Creates a <see cref="WebSiteDownloader"/> and applies the given
        /// options by delegating to <c>SetOptions</c>.
        /// </summary>
        /// <param name="options">The downloader options to apply.</param>
        public WebSiteDownloader(WebSiteDownloaderOptions options)
        {
            // All initialization lives in SetOptions.
            SetOptions(options);
        }
Exemplo n.º 21
0
		/// <summary>
		/// Fetches the resource at <paramref name="absoluteUri"/> and decodes it
		/// as text, handing back both the raw bytes and the decoded string.
		/// </summary>
		/// <param name="absoluteUri">The absolute URI to download.</param>
		/// <param name="textContent">Receives the decoded text; <c>null</c> when no bytes were downloaded.</param>
		/// <param name="encodingName">Receives the name returned by <c>DetectEncodingName</c>
		/// (which is called even when nothing was downloaded).</param>
		/// <param name="encoding">Receives the encoding used for decoding, or
		/// <see cref="Encoding.Default"/> when no bytes were downloaded.</param>
		/// <param name="binaryContent">Receives the bytes produced by <c>DownloadBinary</c>.</param>
		/// <param name="options">The downloader options, passed through to <c>DownloadBinary</c>.</param>
		public static void DownloadHtml(
			Uri absoluteUri,
			out string textContent,
			out string encodingName,
			out Encoding encoding,
			out byte[] binaryContent,
			WebSiteDownloaderOptions options )
		{
			DownloadBinary( absoluteUri, out binaryContent, options );
			encodingName = DetectEncodingName( binaryContent );

			Debug.WriteLine(
				string.Format(
				@"Detected encoding '{0}' for remote HTML document from URL '{1}'.",
				encodingName,
				absoluteUri ) );

			// With no content, fall back to the default encoding and no text.
			bool isEmpty = binaryContent == null || binaryContent.Length == 0;

			encoding = isEmpty ? Encoding.Default : GetEncodingByName( encodingName );
			textContent = isEmpty ? null : encoding.GetString( binaryContent );
		}
        //[TestMethod]
        /// <summary>
        /// Runs the spider against the legislature "range_multi.asp" page for
        /// a fixed date range and asserts that at least one resource comes
        /// back. The [TestMethod] attribute above is commented out, so test
        /// runners do not pick this method up; it requests a live web site.
        /// </summary>
        public void Run_Spider_On_Activity_Parsing_Using_iFollower()
        {
            DateTime rangeStart = new DateTime(2013, 2, 15);
            DateTime rangeEnd = new DateTime(2013, 2, 18);

            // ToShortDateString() follows the current thread culture, so on a
            // non-US machine the query string and the file name would change
            // shape (e.g. "15.02.2013"). Format explicitly with the invariant
            // culture to always get the en-US style "2/15/2013".
            string startText = rangeStart.ToString(
                "M/d/yyyy", System.Globalization.CultureInfo.InvariantCulture);
            string endText = rangeEnd.ToString(
                "M/d/yyyy", System.Globalization.CultureInfo.InvariantCulture);

            WebSiteDownloaderOptions options =
               new WebSiteDownloaderOptions();
            options.DestinationFolderPath =
                new DirectoryInfo(dataDir);

            // '/' is swapped for '-' before the dates go into the file name.
            options.DestinationFileName = String.Format("Session-Activity[{0}][{1}].state",
                                            startText.Replace("/", "-"),
                                            endText.Replace("/", "-"));
            options.MaximumLinkDepth = 3;
            options.TargetSession = 28;
            options.DownloadUri =
                new Uri(String.Format(@"http://www.legis.state.ak.us/basis/range_multi.asp?session={0}&Date1={1}&Date2={2}",
                            options.TargetSession,
                            startText,
                            endText));

            // One follower entry per depth level 1..3.
            // NOTE(review): presumably each pattern selects the links that
            // are followed at that depth - confirm against the Follow type.
            options.UriFollower.Add(new Follow { depth = 1, pattern = new Regex("get_bill.asp") });
            options.UriFollower.Add(new Follow { depth = 2, pattern = new Regex("get_fulltext.asp") });
            options.UriFollower.Add(new Follow { depth = 3, pattern = new Regex("get_bill_text.asp") });

            // Run the spider with the options configured above.
            WebSiteDownloader rslt = Spider.DownloadingProcessor(options);

            Assert.IsTrue(rslt.Resources.Count() > 0);
        }
Exemplo n.º 23
0
		// ------------------------------------------------------------------

		/// <summary>
		/// Issues an HTTP HEAD request for the given URI and returns the
		/// content type reported by the response. Successful results are
		/// stored in <c>_headPool</c> and served from there on later calls
		/// for the same URI.
		/// </summary>
		/// <param name="absoluteUri">The absolute URI.</param>
		/// <param name="options">The options; passed on to
		/// <c>ApplyProxy</c>.</param>
		/// <returns>
		/// The content type of the response; <c>null</c> when the server
		/// answers with 404 (Not Found) or 500 (Internal Server Error); the
		/// literal string "Out OF Robot" when any other
		/// <see cref="WebException"/> occurs (its message is written to the
		/// console).
		/// </returns>
		public static string DownloadHead(
			Uri absoluteUri,
			WebSiteDownloaderOptions options )
		{
			// Single lookup instead of ContainsKey + indexer.
			string cachedContentType;
			if ( _headPool.TryGetValue( absoluteUri, out cachedContentType ) )
			{
				return cachedContentType;
			}

			try
			{
				Debug.WriteLine(
					string.Format(
					@"Reading HEAD from URL '{0}'.",
					absoluteUri ) );

				HttpWebRequest req =
					(HttpWebRequest)WebRequest.Create( absoluteUri );
				req.Method = @"HEAD";
				ApplyProxy( req, options );

				// Always ask the server; never answer from the HTTP cache.
				req.CachePolicy = new RequestCachePolicy(
					RequestCacheLevel.BypassCache );

				using ( HttpWebResponse resp =
					(HttpWebResponse)req.GetResponse() )
				{
					string contentType = resp.ContentType;
					_headPool[absoluteUri] = contentType;
					return contentType;
				}
			}
			catch ( WebException x )
			{
				// The response attached to the exception holds a connection
				// until it is closed. The exception is never rethrown from
				// here, so it is safe to dispose the response right away.
				// A using statement accepts a null resource.
				using ( HttpWebResponse resp = x.Response as HttpWebResponse )
				{
					bool ignorable =
						x.Status == WebExceptionStatus.ProtocolError &&
						resp != null &&
						( resp.StatusCode == HttpStatusCode.NotFound ||
						resp.StatusCode == HttpStatusCode.InternalServerError );

					if ( ignorable )
					{
						Trace.WriteLine(
							string.Format(
							@"Ignoring web exception: '{0}'.",
							x.Message ) );
						return null;
					}
				}

				// Every other web failure is deliberately swallowed as well:
				// report it on the console and return the sentinel value
				// that existing callers receive.
				Console.WriteLine( x.Message );
				return "Out OF Robot";
			}
		}
Exemplo n.º 24
0
 /// <summary>
 /// Copy constructor: initializes a new
 /// <see cref="UriResourceInformation"/> from an existing instance.
 /// Each field is taken over by plain assignment; nothing is cloned here.
 /// </summary>
 /// <param name="copyFrom">The instance whose field values are
 /// copied.</param>
 /// <exception cref="ArgumentNullException">
 /// <paramref name="copyFrom"/> is <c>null</c>.
 /// </exception>
 public UriResourceInformation(
     UriResourceInformation copyFrom)
 {
     // Fail with a descriptive exception instead of a bare
     // NullReferenceException on the first field access.
     if (copyFrom == null)
     {
         throw new ArgumentNullException("copyFrom");
     }

     _options = copyFrom._options;
     _originalUrl = copyFrom._originalUrl;
     _relativeUri = copyFrom._relativeUri;
     _baseUri = copyFrom._baseUri;
     _absoluteUri = copyFrom._absoluteUri;
     _linkType = copyFrom._linkType;
 }