/// <summary>
/// Downloads a HTML page, producing both the raw bytes and the decoded
/// text of the document.
/// </summary>
/// <param name="absoluteUri">The absolute URI to fetch.</param>
/// <param name="textContent">Decoded text of the page, or null when no content was received.</param>
/// <param name="encodingName">Name of the detected encoding.</param>
/// <param name="encoding">The encoding used to decode the bytes.</param>
/// <param name="binaryContent">Raw bytes of the page.</param>
/// <param name="options">The options.</param>
public static void DownloadHtml(
    Uri absoluteUri,
    out string textContent,
    out string encodingName,
    out Encoding encoding,
    out byte[] binaryContent,
    WebSiteDownloaderOptions options)
{
    DownloadBinary(absoluteUri, out binaryContent, options);

    encodingName = DetectEncodingName(binaryContent);
    Debug.WriteLine(
        string.Format(
            @"Detected encoding '{0}' for remote HTML document from URL '{1}'.",
            encodingName,
            absoluteUri));

    bool hasContent = binaryContent != null && binaryContent.Length > 0;
    if (!hasContent)
    {
        // Default.
        encoding = Encoding.Default;
        textContent = null;
        return;
    }

    encoding = GetEncodingByName(encodingName);

    // Strip trailing NUL padding (download buffer may be over-allocated)
    // and surrounding whitespace.
    textContent = encoding.GetString(binaryContent).TrimEnd('\0').Trim();
}
}//End FirstAnalyze

/// <summary>
/// Crawls the web site at <paramref name="WebUri"/> into the folder
/// <paramref name="DirPath"/> and blocks until the crawl is considered
/// finished, then returns the static 'finished' flag.
/// </summary>
/// <param name="WebUri">Absolute URL of the site to download.</param>
/// <param name="DirPath">Destination folder for the downloaded files.</param>
public static bool FirstAnalyze(String WebUri, String DirPath)
{
    // Reset the shared progress state before starting a new crawl.
    urlfinishedcount = 0;
    finished = false;

    WebSiteDownloaderOptions options = new WebSiteDownloaderOptions();

    //options.DownloadUri =
    //    new Uri(@"http://sudarshannews.com/");
    //options.DestinationFolderPath =
    //    new DirectoryInfo(@"C:\Users\SATWADHIR PAWAR\Desktop\scrap");

    options.DownloadUri = new Uri(WebUri);
    options.DestinationFolderPath = new DirectoryInfo(DirPath);

    WebSiteDownloader downloader = new WebSiteDownloader(options);

    downloader.ProcessingUrl +=
        new WebSiteDownloader.ProcessingUrlEventHandler(
            downloader_ProcessingUrl);
    downloader.ProcessCompleted +=
        new WebSiteDownloader.ProcessCompletedEventHandler(
            downloader_ProcessCompleted);

    // Kick off the crawl; completion is signalled via the static
    // 'finished' flag set by the ProcessCompleted handler.
    downloader.ProcessAsync();

    while (true)
    {
        // Poll every 100 ms, printing a dot per iteration as progress.
        Thread.Sleep(100);
        Console.WriteLine(@".");

        // NOTE(review): after 200 iterations (~20 s) 'finished' is forced
        // to true regardless of actual crawl progress — this counter acts
        // as a hard timeout, not a completion signal. Confirm intended.
        urlfinishedcount += 1;
        if (urlfinishedcount == 200)
        {
            urlfinishedcount = 0;
            finished = true;
        }

        // NOTE(review): locking on a Type object is an anti-pattern (the
        // lock target is publicly reachable); a private static readonly
        // object would be safer.
        lock (typeof(ScrapperDemoStart))
        {
            if (finished)
            {
                break;
            }
        }
    }

    Console.WriteLine(@"finished.");
    return finished;
}//End FirstAnalyze
/// <summary>
/// Applies the given options: restores any previously persisted spider
/// settings from the destination folder and attaches the options to them.
/// </summary>
/// <param name="options">The options.</param>
public void SetOptions(WebSiteDownloaderOptions options)
{
    // Fixed: the trace message previously said "Constructing
    // WebSiteDownloader" — copy-pasted from the constructor and
    // misleading when reading logs for a SetOptions call.
    Trace.WriteLine(
        string.Format(
            @"Setting options on WebSiteDownloader for URI '{0}', destination folder path '{1}'.",
            options.DownloadUri,
            options.DestinationFolderPath));

    // Re-inflate persisted crawl state, then bind the fresh options to it.
    _settings = SpiderSettings.Restore(options.DestinationFolderPath);
    _settings.Options = options;
}
// ------------------------------------------------------------------
/// <summary>
/// Initializes a new instance of the <see cref="WebSiteDownloader"/>
/// class: restores any persisted spider state from the destination
/// folder and binds the supplied options to it.
/// </summary>
/// <param name="options">The options.</param>
public WebSiteDownloader(WebSiteDownloaderOptions options)
{
    Trace.WriteLine(string.Format(
        @"Constructing WebSiteDownloader for URI '{0}', destination folder path '{1}'.",
        options.DownloadUri,
        options.DestinationFolderPath));

    _settings = SpiderSettings.Restore(options.DestinationFolderPath);
    _settings.Options = options;
}
/// <summary>
/// Initializes a new instance of the
/// <see cref="DownloadedResourceInformation"/> class, recording where
/// this resource is stored on disk in addition to the base URI data.
/// </summary>
/// <param name="options">The options.</param>
/// <param name="originalUrl">The original URL.</param>
/// <param name="uri">The URI.</param>
/// <param name="baseUri">The base URI.</param>
/// <param name="folderPath">Local folder this resource is saved to.</param>
/// <param name="baseFolderPath">Root folder of the whole download.</param>
/// <param name="linkType">Type of the link.</param>
public DownloadedResourceInformation(
    WebSiteDownloaderOptions options,
    string originalUrl,
    Uri uri,
    Uri baseUri,
    DirectoryInfo folderPath,
    DirectoryInfo baseFolderPath,
    UriType linkType)
    : base(options, originalUrl, uri, baseUri, linkType)
{
    // Remember the on-disk locations for this resource.
    _localBaseFolderPath = baseFolderPath;
    _localFolderPath = folderPath;
}
/// <summary>
/// Downloads the site at <paramref name="url"/> into
/// <paramref name="folderpath"/> and blocks until the static
/// 'finished' flag is raised by the completion handler.
/// </summary>
/// <param name="url">Absolute URL of the site to crawl.</param>
/// <param name="folderpath">Destination folder for downloaded files.</param>
public static bool RunTheFirstAnalyze(string url, string folderpath)
{
    WebSiteDownloaderOptions options = new WebSiteDownloaderOptions
    {
        DownloadUri = new Uri(url),
        DestinationFolderPath = new DirectoryInfo(folderpath)
    };

    WebSiteDownloader downloader = new WebSiteDownloader(options);
    downloader.ProcessingUrl +=
        new WebSiteDownloader.ProcessingUrlEventHandler(downloader_ProcessingUrl);
    downloader.ProcessCompleted +=
        new WebSiteDownloader.ProcessCompletedEventHandler(downloader_ProcessCompleted);

    downloader.ProcessAsync();

    // Poll once a second until the completion handler flips 'finished'.
    for (; ; )
    {
        Thread.Sleep(1000);
        Console.WriteLine(@".");

        // NOTE(review): locking on a Type object is an anti-pattern.
        lock (typeof(Program))
        {
            if (finished)
            {
                break;
            }
        }
    }

    Console.WriteLine(@"finished.");
    return finished;
}
/// <summary>
/// If a proxy is required, apply it to the request.
/// </summary>
/// <param name="req">The request to configure.</param>
/// <param name="options">The options.</param>
private static void ApplyProxy(
    WebRequest req,
    WebSiteDownloaderOptions options)
{
    if (options.ProxyUsage == DownloadProxyUsage.NoProxy)
    {
        // Explicitly bypass any proxy.
        req.Proxy = null;
    }
    else if (options.ProxyUsage == DownloadProxyUsage.UseProxy)
    {
        // Caller supplied a specific proxy.
        Debug.Assert(options.Proxy != null);
        req.Proxy = options.Proxy;
    }
    else
    {
        // DownloadProxyUsage.Default and any unknown value fall back to
        // the system default proxy.
        req.Proxy = WebRequest.DefaultWebProxy;
    }
}
// ------------------------------------------------------------------
/// <summary>
/// Initializes a new instance of the
/// <see cref="UriResourceInformation"/> class. Cleans up the raw URL
/// and classifies it as absolute, relative, or an in-page fragment,
/// resolving relative forms against <paramref name="baseUri"/>.
/// </summary>
/// <param name="options">The options.</param>
/// <param name="originalUrl">The original URL.</param>
/// <param name="uri">The URI.</param>
/// <param name="baseUri">The base URI.</param>
/// <param name="linkType">Type of the link.</param>
public UriResourceInformation(
    WebSiteDownloaderOptions options,
    string originalUrl,
    Uri uri,
    Uri baseUri,
    UriType linkType)
{
    _options = options;
    _originalUrl = originalUrl;
    _baseUri = baseUri;

    // Normalize the raw URL text before classification.
    uri = new Uri(CleanupUrl(uri.OriginalString), UriKind.RelativeOrAbsolute);

    if (Uri.IsWellFormedUriString(uri.OriginalString, UriKind.Absolute))
    {
        // Already absolute; no relative form is kept.
        _absoluteUri = uri;
        _relativeUri = null;
    }
    else if (Uri.IsWellFormedUriString(uri.OriginalString, UriKind.Relative))
    {
        // Well-formed relative URI: resolve against the base.
        _absoluteUri = MakeAbsoluteUri(baseUri, uri);
        _relativeUri = uri;
    }
    else
    {
        // Not well-formed as absolute or relative.
        if (originalUrl.StartsWith(@"#"))
        {
            // Pure in-page fragment: keep only a relative form.
            _absoluteUri = null;
            _relativeUri = new Uri(originalUrl, UriKind.RelativeOrAbsolute);
        }
        else
        {
            // Best effort: treat it as relative and resolve against base.
            _absoluteUri = MakeAbsoluteUri(baseUri, uri);
            _relativeUri = uri;
        }
    }

    _linkType = linkType;
}
/// <summary>
/// Console entry point: crawls www.cadfolks.com into F:\demo and
/// blocks until the completion handler raises the 'finished' flag.
/// </summary>
private static void Main(string[] args)
{
    WebSiteDownloaderOptions options = new WebSiteDownloaderOptions();
    options.DownloadUri = new Uri(@"http://www.cadfolks.com");
    options.DestinationFolderPath = new DirectoryInfo(@"F:\demo");

    WebSiteDownloader downloader = new WebSiteDownloader(options);
    downloader.ProcessingUrl +=
        new WebSiteDownloader.ProcessingUrlEventHandler(downloader_ProcessingUrl);
    downloader.ProcessCompleted +=
        new WebSiteDownloader.ProcessCompletedEventHandler(downloader_ProcessCompleted);

    downloader.ProcessAsync();

    // Poll once a second until the completion handler flips 'finished'.
    bool done = false;
    while (!done)
    {
        Thread.Sleep(1000);
        Console.WriteLine(@".");

        // NOTE(review): locking on a Type object is an anti-pattern.
        lock (typeof(Program))
        {
            done = finished;
        }
    }

    Console.WriteLine(@"finished.");
}
// ------------------------------------------------------------------
/// <summary>
/// Initializes a new instance of the
/// <see cref="UriResourceInformation"/> class. The cleaned-up URL is
/// classified as absolute, relative, or an in-page fragment; relative
/// forms are resolved against <paramref name="baseUri"/>.
/// </summary>
/// <param name="options">The options.</param>
/// <param name="originalUrl">The original URL.</param>
/// <param name="uri">The URI.</param>
/// <param name="baseUri">The base URI.</param>
/// <param name="linkType">Type of the link.</param>
public UriResourceInformation(
    WebSiteDownloaderOptions options,
    string originalUrl,
    Uri uri,
    Uri baseUri,
    UriType linkType )
{
    _options = options;
    _originalUrl = originalUrl;
    _baseUri = baseUri;

    // Clean up the raw URL text before deciding what kind of URI it is.
    uri = new Uri( CleanupUrl( uri.OriginalString ), UriKind.RelativeOrAbsolute );

    if ( Uri.IsWellFormedUriString( uri.OriginalString, UriKind.Absolute ) )
    {
        // Absolute URI: stored as-is, no relative counterpart.
        _absoluteUri = uri;
        _relativeUri = null;
    }
    else if ( Uri.IsWellFormedUriString( uri.OriginalString, UriKind.Relative ) )
    {
        // Relative URI: resolve against the base to get the absolute form.
        _absoluteUri = MakeAbsoluteUri( baseUri, uri );
        _relativeUri = uri;
    }
    else
    {
        // Neither well-formed absolute nor relative.
        if ( originalUrl.StartsWith( @"#" ) )
        {
            // In-page anchor: no absolute form exists.
            _absoluteUri = null;
            _relativeUri = new Uri( originalUrl, UriKind.RelativeOrAbsolute );
        }
        else
        {
            // Fall back to treating it as relative to the base URI.
            _absoluteUri = MakeAbsoluteUri( baseUri, uri );
            _relativeUri = uri;
        }
    }

    _linkType = linkType;
}
/// <summary>
/// Re-inflates a previously saved crawl state for the 2013-02-15 to
/// 2013-02-18 activity range and verifies that committee-minutes pages
/// were captured among the downloaded resources.
/// </summary>
//[TestMethod]
public void Run_Spider_Inflate_Check_Results_For_Minutes()
{
    DateTime Start = new DateTime(2013, 2, 15);
    DateTime End = new DateTime(2013, 2, 18);

    WebSiteDownloaderOptions options = new WebSiteDownloaderOptions();
    options.DestinationFolderPath = new DirectoryInfo(dataDir);
    options.DestinationFileName = String.Format("Session-Activity[{0}][{1}].state",
        Start.Date.ToShortDateString().Replace("/", "-"),
        End.Date.ToShortDateString().Replace("/", "-"));
    options.MaximumLinkDepth = 3;
    options.TargetSession = 28;
    options.DownloadUri = new Uri(String.Format(@"http://www.legis.state.ak.us/basis/range_multi.asp?session={0}&Date1={1}&Date2={2}",
        options.TargetSession, Start.Date.ToShortDateString(), End.Date.ToShortDateString()));

    WebSiteDownloader rslt = Spider.DownloadingProcessor(options);

    var minutes = (from m in rslt.Resources
                   where m.AbsoluteUri.AbsoluteUri.Contains(@"get_minutes.asp")
                   select m).ToList();

    // Fixed: the method previously ended with an unused local
    // ('string stop = @"";') used as a debugger breakpoint anchor and
    // made no assertion at all. Assert on the query result instead.
    Assert.IsTrue(minutes.Count > 0);
}
/// <summary>
/// Re-inflates the search results from a previously saved crawl state
/// and filters the parsings for bill-copy pages.
/// </summary>
//[TestMethod]
public void Run_Spider_On_Activity_Search_Results()
{
    DateTime Start = new DateTime(2013, 2, 15);
    DateTime End = new DateTime(2013, 2, 18);

    WebSiteDownloaderOptions options = new WebSiteDownloaderOptions();
    options.DestinationFolderPath = new DirectoryInfo(dataDir);
    options.DestinationFileName = String.Format("Session-Activity[{0}][{1}].state",
        Start.Date.ToShortDateString().Replace("/", "-"),
        End.Date.ToShortDateString().Replace("/", "-"));

    //Download que engine.
    WebSiteDownloader downloader = new WebSiteDownloader(options);

    List<iCollector> coll = (from p in downloader.Parsings
                             select p).ToList();

    //RegEx for matching bill copy.
    Regex r = new Regex("get[_]bill[_]text[.]asp");

    //Get all matches.
    List<iCollector> refined = (from el in coll
                                let matches = r.Matches(el.source.AbsoluteUri.AbsoluteUri)
                                where matches.Count != 0
                                select el).ToList();

    // NOTE(review): 'refined' is computed but never asserted — the
    // assertion below only checks the unfiltered 'coll'. Possibly the
    // intent was Assert.IsTrue(refined.Count() > 0); confirm.
    Assert.IsTrue(coll.Count() > 0);
}
// ------------------------------------------------------------------
/// <summary>
/// Issues an HTTP HEAD request for the given URI and returns the
/// server-reported content type, caching the result per URI.
/// Returns null when the server answers 404 or 500; any other web
/// failure propagates to the caller.
/// </summary>
/// <param name="absoluteUri">The absolute URI.</param>
/// <param name="options">The options.</param>
/// <returns>The content type, or null on 404/500.</returns>
public static string DownloadHead(
    Uri absoluteUri,
    WebSiteDownloaderOptions options)
{
    try
    {
        // Serve repeated lookups from the cache. Fixed: use a single
        // TryGetValue lookup instead of ContainsKey + indexer (double
        // dictionary lookup).
        string cachedContentType;
        if (_headPool.TryGetValue(absoluteUri, out cachedContentType))
        {
            return cachedContentType;
        }

        Debug.WriteLine(
            string.Format(
                @"Reading HEAD from URL '{0}'.",
                absoluteUri));

        HttpWebRequest req = (HttpWebRequest)WebRequest.Create(absoluteUri);
        req.Method = @"HEAD";
        ApplyProxy(req, options);

        // Always hit the origin server; never answer HEAD from the cache.
        RequestCachePolicy cp = new RequestCachePolicy(
            RequestCacheLevel.BypassCache);
        req.CachePolicy = cp;

        using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
        {
            _headPool[absoluteUri] = resp.ContentType;
            return resp.ContentType;
        }
    }
    catch (WebException x)
    {
        if (x.Status == WebExceptionStatus.ProtocolError)
        {
            HttpWebResponse resp = (HttpWebResponse)x.Response;
            if (resp.StatusCode == HttpStatusCode.NotFound ||
                resp.StatusCode == HttpStatusCode.InternalServerError)
            {
                // Missing or broken pages are expected during a crawl.
                Trace.WriteLine(
                    string.Format(
                        @"Ignoring web exception: '{0}'.",
                        x.Message));
                return null;
            }
        }

        // Anything else is unexpected; rethrow preserving the stack trace.
        throw;
    }
}
/// <summary>
/// End-to-end crawl of the activity search results: follows bill,
/// full-text, minutes and bill-copy pages via the configured follower
/// patterns, parses them with the registered collectors, and asserts
/// that at least one parsing was produced.
/// </summary>
//[TestMethod]
public void Run_Spider_On_Activity_Parsing_Using_iFollower_And_Saving_Using_GitCollector()
{
    DateTime Start = new DateTime(2013, 2, 15);
    DateTime End = new DateTime(2013, 2, 18);

    WebSiteDownloaderOptions options = new WebSiteDownloaderOptions();
    options.DestinationFolderPath = new DirectoryInfo(dataDir);
    options.DestinationFileName = String.Format("Session-Activity[{0}][{1}].state",
        Start.Date.ToShortDateString().Replace("/", "-"),
        End.Date.ToShortDateString().Replace("/", "-"));
    options.MaximumLinkDepth = 3;
    options.TargetSession = 28;
    options.DownloadUri = new Uri(String.Format(@"http://www.legis.state.ak.us/basis/range_multi.asp?session={0}&Date1={1}&Date2={2}",
        options.TargetSession, Start.Date.ToShortDateString(), End.Date.ToShortDateString()));

    //What pages to follow.
    // Each Follow pairs a URL pattern with the link depth at which it
    // may be followed.
    options.UriFollower.Add(new Follow { depth = 1, pattern = new Regex("get_bill.asp") });
    options.UriFollower.Add(new Follow { depth = 2, pattern = new Regex("get_fulltext.asp") });
    options.UriFollower.Add(new Follow { depth = 2, pattern = new Regex("get_complete_bill.asp") });
    options.UriFollower.Add(new Follow { depth = 2, pattern = new Regex("get_minutes.asp") });
    options.UriFollower.Add(new Follow { depth = 3, pattern = new Regex("get_bill_text.asp") });
    options.UriFollower.Add(new Follow { depth = 3, pattern = new Regex("get_single_minute.asp[?]ch") });

    //What content to serialize and save off.
    // Each collector is keyed by page name and the kind of page content
    // it consumes (form vs raw content).
    options.GitCollectionRequest.Add(new DocumentHistory() { pageName = "get_fulltext.asp", pageType = UriType.Form });
    options.GitCollectionRequest.Add(new DocumentActivity() { pageName = "get_complete_bill.asp", pageType = UriType.Form });
    options.GitCollectionRequest.Add(new DocumentMeeting() { pageName = "get_minutes.asp", pageType = UriType.Form });
    options.GitCollectionRequest.Add(new DocumentCopy() { pageName = "get_bill_text.asp", pageType = UriType.Content });
    options.GitCollectionRequest.Add(new DocumentCopy() { pageName = "get_single_minute.asp", pageType = UriType.Content });

    //Specialized content filter using regular expression matches.
    // Lookbehind patterns pull named fields out of the bill summary HTML.
    var d = new Dictionary<String, Regex>();
    d.Add("Bill Name", new Regex(@"(?<=<b>BILL:</b>)[a-z,\s,\w,(,),"",',;,.]{5,800}", RegexOptions.IgnoreCase));
    d.Add("Title", new Regex(@"(?<=<b>TITLE:</b>)[a-z,\s,\w,(,),"",',\-,;,.]{5,800}", RegexOptions.IgnoreCase));
    d.Add("Short Title", new Regex(@"(?<=<b>SHORT TITLE:</b>)(.*)(?=</font>)", RegexOptions.IgnoreCase));
    d.Add("Status Date", new Regex(@"(?<=<b>STATUS DATE:</b>)[0-9,/\,\w\s]{5,50}", RegexOptions.IgnoreCase));
    d.Add("Current Status", new Regex(@"(?<=<b>CURRENT STATUS:</b>)[a-z,\s,&,\ ,(,)]{2,60}", RegexOptions.IgnoreCase));
    d.Add("Sponsors", new Regex(@"(?<=<b>SPONSOR[(]S[)]:</b>)[a-z,\s,\w,(,),.,;,""]{5,800}", RegexOptions.IgnoreCase));

    //RegEx matching container.
    options.GitCollectionRequest.Add(new DocumentKVP() { pageName = "get_bill.asp", pageType = UriType.Content, rvp = d });

    //Re-inflate the indexing.
    WebSiteDownloader rslt = Spider.DownloadingProcessor(options);

    Assert.IsTrue(rslt.Parsings.Count() > 0);
}
/// <summary>
/// Re-inflates a saved crawl, identifies every bill in the results,
/// builds a per-bill directory structure, gathers each bill's parsed
/// metadata / revisions / minutes / committee records, and hands them
/// to <see cref="GitRepository"/> for persistence.
/// </summary>
public void Run_Spider_Inflate_And_Save_Results_To_GitHub()
{
    DateTime Start = new DateTime(2013, 2, 15);
    DateTime End = new DateTime(2013, 2, 18);

    WebSiteDownloaderOptions options = new WebSiteDownloaderOptions();
    options.DestinationFolderPath = new DirectoryInfo(dataDir);
    options.DestinationFileName = String.Format("Session-Activity[{0}][{1}].state",
        Start.Date.ToShortDateString().Replace("/", "-"),
        End.Date.ToShortDateString().Replace("/", "-"));
    options.MaximumLinkDepth = 4;
    options.TargetSession = 28;
    options.DownloadUri = new Uri(String.Format(@"http://www.legis.state.ak.us/basis/range_multi.asp?session={0}&Date1={1}&Date2={2}",
        options.TargetSession, Start.Date.ToShortDateString(), End.Date.ToShortDateString()));

    WebSiteDownloader rslt = Spider.DownloadingProcessor(options);

    /* 1. Select Bill Names
     * We need to know the bill name (HB16), so we can save data in a folder of the same name.
     */

    //Static List of Bills
    // Depth-1 resources whose URL is a bill page.
    var masterlist = (from r in rslt.Resources
                      where r.Index == 1 &&
                          (r.AbsoluteUri.AbsoluteUri.Contains(@"get_bill.asp"))
                      select r).ToList();

    //Match bill titles in the URI (HB16,SB12..)
    Regex billTitles = new Regex(@"(?<=[=])[H|R|S][B|C|R|J]{0,3}[0-9]{1,4}", RegexOptions.IgnoreCase);

    //Return a list of the first matches
    // Pair each bill resource with its URL and extracted bill name.
    var bills = (from b in masterlist
                 let matches = billTitles.Matches(b.AbsoluteUri.AbsoluteUri)
                 where matches.Count > 0
                 select new
                 {
                     resource = b,
                     url = b.AbsoluteUri,
                     name = matches.Cast<Match>().FirstOrDefault()
                 }).ToList();

    /* 2. Build out directory structure for bill data.
     * We have a list of bills, now where are we going to save the data?
     */
    DirectoryInfo session = new DirectoryInfo(String.Format(@"{0}/{1}", dataDir, 28));
    if (!session.Exists)
        session.Create();

    foreach (var item in bills)
    {
        //bill directory
        DirectoryInfo bill = new DirectoryInfo(String.Format(@"{0}/{1}/{2}", dataDir, 28, item.name));
        if (!bill.Exists)
            bill.Create();
    }

    /* 3. Associated bill data
     * Grab associated bill data. Name, Title, LongTitle,
     * Minutes Content, Bill Revisions, Bill Activity
     */
    foreach (var bill in bills)
    {
        //Results placeholders
        List<iCollector> meta = new List<iCollector>();
        List<iCollector> revisions = new List<iCollector>();
        List<iCollector> minutes = new List<iCollector>();
        List<iCollector> committee = new List<iCollector>();

        //Document history, activity and kvp..
        // Parsings whose source is the bill page itself or a direct child.
        meta.AddRange((from h in rslt.Parsings
                       where h.source.AbsoluteUri.AbsoluteUri == bill.url.AbsoluteUri ||
                           h.source.Parent.AbsoluteUri == bill.url.AbsoluteUri
                       select h).ToList());

        //Bill Content
        // Revisions hang off the bill's full-text page.
        revisions.AddRange((from d in rslt.Parsings
                            where d.source.Parent.AbsoluteUri
                                .Contains(String.Format(@"get_fulltext.asp?session={0}&bill={1}", 28, bill.name))
                            select d).ToList());

        //Committee
        // Join resources to parsings via the parent URI to find minutes
        // index pages that mention this bill.
        committee.AddRange((from d in rslt.Resources
                            join p in rslt.Parsings
                                on d.AbsoluteUri.AbsoluteUri equals p.source.Parent.AbsoluteUri
                            where p.source.AbsoluteUri.AbsoluteUri.Contains("get_minutes.asp") &&
                                d.AbsoluteUri.AbsoluteUri.Contains(String.Format("{0}", bill.name))
                            select p).ToList());

        //Minutes Transcript
        minutes.AddRange((from d in rslt.Resources
                          join p in rslt.Parsings
                              on d.AbsoluteUri.AbsoluteUri equals p.source.Parent.AbsoluteUri
                          where p.source.AbsoluteUri.AbsoluteUri
                              .Contains(@"get_single_minute.asp") &&
                              d.AbsoluteUri.AbsoluteUri
                              .Contains(String.Format("{0}", bill.name))
                          select p).ToList());

        /* 4. Start saving off the data
         * We have a list of bills, now where are we going to save the data?
         */
        String fileLoc = String.Format(@"{0}\{1}\", 28, bill.name);

        GitRepository gr = new GitRepository();
        gr.ProcessBill(fileLoc, new ParsedBill()
        {
            meta = meta,
            minutes = minutes,
            revisions = revisions,
            committee = committee
        });
    }

    // NOTE(review): this assertion is vacuous — it always passes.
    // Consider asserting on bills.Count or on the saved output instead.
    Assert.IsTrue(true);
}
/// <summary>
/// Downloads binary content from the given URI. On HTTP 404/500 the
/// exception is logged and <paramref name="binaryContent"/> is set to
/// null; any other failure propagates to the caller.
/// </summary>
/// <param name="absoluteUri">The absolute URI.</param>
/// <param name="binaryContent">Content of the binary, or null on 404/500.</param>
/// <param name="options">The options.</param>
public static void DownloadBinary(
    Uri absoluteUri,
    out byte[] binaryContent,
    WebSiteDownloaderOptions options)
{
    Debug.WriteLine(
        string.Format(
            @"Reading content from URL '{0}'.",
            absoluteUri));

    try
    {
        HttpWebRequest req = (HttpWebRequest)WebRequest.Create(absoluteUri);
        ApplyProxy(req, options);

        // Always hit the origin server; never serve from the cache.
        RequestCachePolicy cp = new RequestCachePolicy(
            RequestCacheLevel.BypassCache);
        req.CachePolicy = cp;

        using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
        using (Stream stream = resp.GetResponseStream())
        using (MemoryStream mem = new MemoryStream())
        {
            // Copy the response in 16 KiB chunks.
            int blockSize = 16384;
            byte[] blockBuffer = new byte[blockSize];

            int read;
            while ((read = stream.Read(blockBuffer, 0, blockSize)) > 0)
            {
                mem.Write(blockBuffer, 0, read);
            }

            // BUG FIX: GetBuffer() returns the stream's internal buffer,
            // which is usually larger than the written content and padded
            // with trailing zero bytes. ToArray() returns exactly the
            // bytes that were written.
            binaryContent = mem.ToArray();
        }
    }
    catch (WebException x)
    {
        if (x.Status == WebExceptionStatus.ProtocolError)
        {
            HttpWebResponse resp = (HttpWebResponse)x.Response;
            if (resp.StatusCode == HttpStatusCode.NotFound ||
                resp.StatusCode == HttpStatusCode.InternalServerError)
            {
                // Missing or broken pages are expected during a crawl.
                Trace.WriteLine(
                    string.Format(
                        @"Ignoring web exception: '{0}'.",
                        x.Message));
                binaryContent = null;
            }
            else
            {
                throw;
            }
        }
        else
        {
            throw;
        }
    }
}
/// <summary>
/// Applies the proxy configuration from the options to the given web
/// request.
/// </summary>
/// <param name="req">The request to configure.</param>
/// <param name="options">The options.</param>
private static void ApplyProxy(
    WebRequest req,
    WebSiteDownloaderOptions options)
{
    switch (options.ProxyUsage)
    {
        case DownloadProxyUsage.NoProxy:
            // Explicitly bypass any proxy.
            req.Proxy = null;
            break;

        case DownloadProxyUsage.UseProxy:
            // A specific proxy was configured by the caller.
            Debug.Assert(options.Proxy != null);
            req.Proxy = options.Proxy;
            break;

        case DownloadProxyUsage.Default:
        default:
            // Fall back to the system-wide default proxy.
            req.Proxy = WebRequest.DefaultWebProxy;
            break;
    }
}
/// <summary>
/// Downloads binary content from the given URI. On HTTP 404/500 the
/// exception is logged and <paramref name="binaryContent"/> is set to
/// null; any other failure propagates to the caller.
/// </summary>
/// <param name="absoluteUri">The absolute URI.</param>
/// <param name="binaryContent">Content of the binary, or null on 404/500.</param>
/// <param name="options">The options.</param>
public static void DownloadBinary(
    Uri absoluteUri,
    out byte[] binaryContent,
    WebSiteDownloaderOptions options)
{
    Debug.WriteLine(
        string.Format(
            @"Reading content from URL '{0}'.",
            absoluteUri));

    try
    {
        HttpWebRequest req = (HttpWebRequest)WebRequest.Create(absoluteUri);
        ApplyProxy(req, options);

        // Always hit the origin server; never serve from the cache.
        RequestCachePolicy cp = new RequestCachePolicy(
            RequestCacheLevel.BypassCache);
        req.CachePolicy = cp;

        using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
        using (Stream stream = resp.GetResponseStream())
        using (MemoryStream mem = new MemoryStream())
        {
            // Copy the response in 16 KiB chunks.
            int blockSize = 16384;
            byte[] blockBuffer = new byte[blockSize];

            int read;
            while ((read = stream.Read(blockBuffer, 0, blockSize)) > 0)
            {
                mem.Write(blockBuffer, 0, read);
            }

            // BUG FIX: GetBuffer() returns the stream's internal buffer,
            // which is usually larger than the written content and padded
            // with trailing zero bytes. ToArray() returns exactly the
            // bytes that were written.
            binaryContent = mem.ToArray();
        }
    }
    catch (WebException x)
    {
        if (x.Status == WebExceptionStatus.ProtocolError)
        {
            HttpWebResponse resp = (HttpWebResponse)x.Response;
            if (resp.StatusCode == HttpStatusCode.NotFound ||
                resp.StatusCode == HttpStatusCode.InternalServerError)
            {
                // Missing or broken pages are expected during a crawl.
                Trace.WriteLine(
                    string.Format(
                        @"Ignoring web exception: '{0}'.",
                        x.Message));
                binaryContent = null;
            }
            else
            {
                throw;
            }
        }
        else
        {
            throw;
        }
    }
}
// ------------------------------------------------------------------
/// <summary>
/// Initializes a new instance of the <see cref="WebSiteDownloader"/>
/// class, delegating all option handling to <c>SetOptions</c>.
/// </summary>
/// <param name="options">The options.</param>
public WebSiteDownloader(WebSiteDownloaderOptions options)
{
    SetOptions(options);
}
// ------------------------------------------------------------------
/// <summary>
/// Initializes a new instance of the <see cref="WebSiteDownloader"/>
/// class; option validation and state restoration happen inside
/// <c>SetOptions</c>.
/// </summary>
/// <param name="options">The options.</param>
public WebSiteDownloader(WebSiteDownloaderOptions options)
{
    SetOptions(options);
}
/// <summary>
/// Download a HTML page. Returns both the binary content,
/// as well as the textual representation of the HTML page.
/// </summary>
/// <param name="absoluteUri">The absolute URI.</param>
/// <param name="textContent">Decoded text of the page, or null when no content was received.</param>
/// <param name="encodingName">Name of the detected encoding.</param>
/// <param name="encoding">The encoding used to decode the bytes.</param>
/// <param name="binaryContent">Raw bytes of the page.</param>
/// <param name="options">The options.</param>
public static void DownloadHtml(
    Uri absoluteUri,
    out string textContent,
    out string encodingName,
    out Encoding encoding,
    out byte[] binaryContent,
    WebSiteDownloaderOptions options)
{
    DownloadBinary(absoluteUri, out binaryContent, options);

    encodingName = DetectEncodingName(binaryContent);
    Debug.WriteLine(
        string.Format(
            @"Detected encoding '{0}' for remote HTML document from URL '{1}'.",
            encodingName,
            absoluteUri));

    if (binaryContent != null && binaryContent.Length > 0)
    {
        encoding = GetEncodingByName(encodingName);

        // FIX: trim trailing NUL characters and surrounding whitespace.
        // The download buffer can be over-allocated and zero-padded
        // (MemoryStream.GetBuffer), which otherwise leaves '\0' bytes in
        // the decoded text; the other copy of this method already trims.
        textContent = encoding.GetString(binaryContent).TrimEnd('\0').Trim();
    }
    else
    {
        // Default.
        encoding = Encoding.Default;
        textContent = null;
    }
}
/// <summary>
/// Crawls the activity search results, following bill, full-text and
/// bill-copy pages via the configured follower patterns, then asserts
/// that resources were indexed.
/// </summary>
//[TestMethod]
public void Run_Spider_On_Activity_Parsing_Using_iFollower()
{
    DateTime Start = new DateTime(2013, 2, 15);
    DateTime End = new DateTime(2013, 2, 18);

    WebSiteDownloaderOptions options = new WebSiteDownloaderOptions
    {
        DestinationFolderPath = new DirectoryInfo(dataDir),
        DestinationFileName = String.Format("Session-Activity[{0}][{1}].state",
            Start.Date.ToShortDateString().Replace("/", "-"),
            End.Date.ToShortDateString().Replace("/", "-")),
        MaximumLinkDepth = 3,
        TargetSession = 28
    };

    // DownloadUri reads TargetSession back off the options, so it must be
    // assigned after the initializer above.
    options.DownloadUri = new Uri(String.Format(@"http://www.legis.state.ak.us/basis/range_multi.asp?session={0}&Date1={1}&Date2={2}",
        options.TargetSession, Start.Date.ToShortDateString(), End.Date.ToShortDateString()));

    // Which page patterns may be followed, and at what link depth.
    options.UriFollower.Add(new Follow { depth = 1, pattern = new Regex("get_bill.asp") });
    options.UriFollower.Add(new Follow { depth = 2, pattern = new Regex("get_fulltext.asp") });
    options.UriFollower.Add(new Follow { depth = 3, pattern = new Regex("get_bill_text.asp") });

    //Re-inflate the indexing.
    WebSiteDownloader rslt = Spider.DownloadingProcessor(options);

    Assert.IsTrue(rslt.Resources.Count() > 0);
}
// ------------------------------------------------------------------
/// <summary>
/// Issues an HTTP HEAD request for the given URI and returns the
/// server-reported content type, caching the result per URI.
/// Returns null when the server answers 404 or 500; any other web
/// failure propagates to the caller.
/// </summary>
/// <param name="absoluteUri">The absolute URI.</param>
/// <param name="options">The options.</param>
/// <returns>The content type, or null on 404/500.</returns>
public static string DownloadHead(
    Uri absoluteUri,
    WebSiteDownloaderOptions options)
{
    try
    {
        // Serve repeated lookups from the cache. Fixed: single
        // TryGetValue lookup instead of ContainsKey + indexer.
        string cachedContentType;
        if (_headPool.TryGetValue(absoluteUri, out cachedContentType))
        {
            return cachedContentType;
        }

        Debug.WriteLine(
            string.Format(
                @"Reading HEAD from URL '{0}'.",
                absoluteUri));

        HttpWebRequest req = (HttpWebRequest)WebRequest.Create(absoluteUri);
        req.Method = @"HEAD";
        ApplyProxy(req, options);

        // Always hit the origin server; never answer HEAD from the cache.
        RequestCachePolicy cp = new RequestCachePolicy(
            RequestCacheLevel.BypassCache);
        req.CachePolicy = cp;

        using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
        {
            _headPool[absoluteUri] = resp.ContentType;
            return resp.ContentType;
        }
    }
    catch (WebException x)
    {
        if (x.Status == WebExceptionStatus.ProtocolError)
        {
            HttpWebResponse resp = (HttpWebResponse)x.Response;
            if (resp.StatusCode == HttpStatusCode.NotFound ||
                resp.StatusCode == HttpStatusCode.InternalServerError)
            {
                // Missing or broken pages are expected during a crawl.
                Trace.WriteLine(
                    string.Format(
                        @"Ignoring web exception: '{0}'.",
                        x.Message));
                return null;
            }
        }

        // BUG FIX: the previous version wrapped 'throw;' in its own
        // try/catch that printed the message and swallowed the exception,
        // then fell through to returning the magic string "Out OF Robot"
        // as if it were a content type. Rethrow instead so callers see
        // real failures, preserving the original stack trace.
        throw;
    }
}
/// <summary>
/// Copy constructor: initializes a new
/// <see cref="UriResourceInformation"/> carrying the same state as
/// <paramref name="copyFrom"/>.
/// </summary>
/// <param name="copyFrom">The instance to copy from.</param>
public UriResourceInformation(UriResourceInformation copyFrom)
{
    // Field-by-field shallow copy.
    _options = copyFrom._options;
    _originalUrl = copyFrom._originalUrl;
    _baseUri = copyFrom._baseUri;
    _relativeUri = copyFrom._relativeUri;
    _absoluteUri = copyFrom._absoluteUri;
    _linkType = copyFrom._linkType;
}