Code Example #1
        // ------------------------------------------------------------------
        /// <summary>
        /// Initializes a new instance of the <see cref="WebSiteDownloader"/> 
        /// class.
        /// </summary>
        /// <param name="options">The options.</param>
        public WebSiteDownloader(
            WebSiteDownloaderOptions options)
        {
            Console.WriteLine(
                string.Format(
                    @"Constructing WebSiteDownloader for URI '{0}', destination folder path '{1}'.",
                    options.DownloadUri,
                    options.DestinationFolderPath));

            _settings = SpiderSettings.Restore(options.DestinationFolderPath, options.DestinationFileName);
            _settings.Options = options;
        }
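
For orientation, here is a minimal usage sketch of this constructor; the folder path, file name, and URI are placeholders, and the option assignments mirror those that appear in the test examples later in this listing.

        // Minimal usage sketch (placeholder folder, file name and URI).
        WebSiteDownloaderOptions options = new WebSiteDownloaderOptions();
        options.DestinationFolderPath = new DirectoryInfo(@"C:\temp\spider");   // placeholder path
        options.DestinationFileName = @"example.state";                         // placeholder file name
        options.MaximumLinkDepth = 1;
        options.DownloadUri = new Uri(@"http://www.example.com/");              // placeholder URI

        WebSiteDownloader downloader = new WebSiteDownloader(options);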
Code Example #2
        // ------------------------------------------------------------------
        /// <summary>
        /// Initializes a new instance of the 
        /// <see cref="UriResourceInformation"/> class.
        /// </summary>
        /// <param name="options">The options.</param>
        /// <param name="originalUrl">The original URL.</param>
        /// <param name="uri">The URI.</param>
        /// <param name="baseUri">The base URI.</param>
        /// <param name="linkType">Type of the link.</param>
        public UriResourceInformation(
            WebSiteDownloaderOptions options,
            string originalUrl,
            Uri uri,
            Uri baseUri,
            UriType linkType,
            Uri parentUri,
            Int32 index)
        {
            _options = options;
            _originalUrl = originalUrl;
            _baseUri = baseUri;
            _parentUri = parentUri;
            _index = index;

            uri = new Uri(CleanupUrl(uri.OriginalString), UriKind.RelativeOrAbsolute);

            if (Uri.IsWellFormedUriString(uri.OriginalString, UriKind.Absolute))
            {
                _absoluteUri = uri;
                _relativeUri = null;
            }
            else if (Uri.IsWellFormedUriString(uri.OriginalString, UriKind.Relative))
            {
                _absoluteUri = MakeAbsoluteUri(baseUri, uri);
                _relativeUri = uri;
            }
            else
            {
                if (originalUrl.StartsWith(@"#"))
                {
                    _absoluteUri = null;
                    _relativeUri = new Uri(originalUrl, UriKind.RelativeOrAbsolute);
                }
                else
                {
                    _absoluteUri = MakeAbsoluteUri(baseUri, uri);
                    _relativeUri = uri;
                }
            }

            _linkType = linkType;
        }
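
A hypothetical call to this constructor might look like the following; the URLs and the index value are placeholders, and `options` is assumed to be an already-populated WebSiteDownloaderOptions instance.

        // Sketch (placeholder values): wrapping a relative link found on a page.
        UriResourceInformation linkInfo = new UriResourceInformation(
            options,
            @"page2.html",                                       // original href text as found in the HTML
            new Uri(@"page2.html", UriKind.RelativeOrAbsolute),  // parsed form of that href
            new Uri(@"http://www.example.com/"),                 // base URI of the containing page
            UriType.Content,                                     // link points at regular content
            new Uri(@"http://www.example.com/index.html"),       // page on which the link was found
            0);                                                  // placeholder index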
Code Example #3
File: Spider.cs  Project: ragingsmurf/myLegis
        //private List<String> SessionActivity(DateTime Start, DateTime End)
        //{
        //    WebSiteDownloaderOptions options =
        //        new WebSiteDownloaderOptions();
        //    options.DestinationFolderPath =
        //        new DirectoryInfo(dataDir);
        //    options.DestinationFileName = String.Format("Session-Activity[{0}][{1}].state",
        //                                                Start.Date.ToShortDateString().Replace("/", "-"),
        //                                                End.Date.ToShortDateString().Replace("/", "-"));
        //    options.MaximumLinkDepth = 0;
        //    options.TargetSession = 28;
        //    options.DownloadUri =
        //        new Uri(String.Format(@"http://www.legis.state.ak.us/basis/range_multi.asp?session={0}&Date1={1}&Date2={2}",
        //            options.TargetSession,
        //            Start.Date.ToShortDateString(),
        //            End.Date.ToShortDateString()));
        //    //Get all bill links.
        //    options.GitCollectionRequest.Add(new DocumentHrefList()
        //    {
        //        pageName = "range_multi.asp",
        //        pageType = UriType.Content,
        //        pattern = new Regex(@"(?<=[=])[H|R|S][B|C|R|J]{0,3}[0-9]{1,4}", RegexOptions.IgnoreCase)
        //    });
        //    //Download que engine.
        //    WebSiteDownloader downloader = new WebSiteDownloader(options);
        //    downloader.ProcessingUrl +=
        //       new WebSiteDownloader.ProcessingUrlEventHandler(
        //       downloader_ProcessingUrl);
        //    downloader.ProcessCompleted +=
        //        new WebSiteDownloader.ProcessCompletedEventHandler(
        //        downloader_ProcessCompleted);
        //    downloader.ProcessAsync();
        //    while (true)
        //    {
        //        Thread.Sleep(1000);
        //        Console.WriteLine(@".");
        //        lock (typeof(Spider))
        //        {
        //            if (finished)
        //            {
        //                break;
        //            }
        //        }
        //    }
        //    Console.WriteLine(@"finished processing.");
        //    foreach (iCollector col in downloader.Parsings)
        //        Console.WriteLine(String.Format("Rule found for {0}", col.pageName));
        //    //Reset the exit.
        //    finished = false;
        //    //Grab saved targets.
        //    return ((DocumentHrefList)downloader.Parsings[0]).matches;
        //}
        public static WebSiteDownloader DownloadingProcessor(WebSiteDownloaderOptions options)
        {
            //Download queue engine.
            WebSiteDownloader downloader = new WebSiteDownloader(options);

            downloader.ProcessingUrl +=
               new WebSiteDownloader.ProcessingUrlEventHandler(
               downloader_ProcessingUrl);

            downloader.ProcessCompleted +=
                new WebSiteDownloader.ProcessCompletedEventHandler(
                downloader_ProcessCompleted);

            downloader.ProcessAsync();

            while (true)
            {
                Thread.Sleep(1000);
                Console.WriteLine(@".");

                lock (typeof(Spider))
                {
                    if (finished)
                    {
                        break;
                    }
                }
            }

            Console.WriteLine(@"finished processing.");

            foreach (iCollector col in downloader.Parsings)
                Console.WriteLine(String.Format("Rule found for {0}", col.pageName));

            finished = false;

            return downloader;
        }
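
DownloadingProcessor wires up two handlers (downloader_ProcessingUrl, downloader_ProcessCompleted) and polls a `finished` flag that are not shown in this listing. A sketch of what they might look like follows; the (sender, EventArgs) parameter lists are an assumption, since the real ProcessingUrlEventHandler and ProcessCompletedEventHandler delegate signatures are not part of this listing.

        // Sketch of the handlers and flag referenced above. The parameter lists are
        // assumed; the actual delegates may carry richer event arguments.
        private static bool finished;

        private static void downloader_ProcessingUrl(object sender, EventArgs e)
        {
            Console.WriteLine(@"Processing a URL...");
        }

        private static void downloader_ProcessCompleted(object sender, EventArgs e)
        {
            lock (typeof(Spider))
            {
                // Signals the polling loop in DownloadingProcessor to exit.
                finished = true;
            }
        }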
Code Example #4
        /// <summary>
        /// Downloads binary content from the given URI.
        /// </summary>
        /// <param name="absoluteUri">The absolute URI.</param>
        /// <param name="binaryContent">Content of the binary.</param>
        /// <param name="options">The options.</param>
        public static void DownloadBinary(
            Uri absoluteUri,
            out byte[] binaryContent,
            WebSiteDownloaderOptions options)
        {
            Debug.WriteLine(
                string.Format(
                @"Reading content from URL '{0}'.",
                absoluteUri));

            try
            {
                HttpWebRequest req = (HttpWebRequest)WebRequest.Create(absoluteUri);
                ApplyProxy(req, options);

                RequestCachePolicy cp = new RequestCachePolicy(
                    RequestCacheLevel.BypassCache);
                req.CachePolicy = cp;

                using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
                using (Stream stream = resp.GetResponseStream())
                using (MemoryStream mem = new MemoryStream())
                {
                    int blockSize = 16384;
                    byte[] blockBuffer = new byte[blockSize];
                    int read;

                    while ((read = stream.Read(blockBuffer, 0, blockSize)) > 0)
                    {
                        mem.Write(blockBuffer, 0, read);
                    }

                    mem.Seek(0, SeekOrigin.Begin);

                    // ToArray() returns only the bytes actually written; GetBuffer() would
                    // also include unused buffer capacity beyond the downloaded content.
                    binaryContent = mem.ToArray();
                }
            }
            catch (WebException x)
            {
                if (x.Status == WebExceptionStatus.ProtocolError)
                {
                    HttpWebResponse resp =
                        (HttpWebResponse)x.Response;

                    if (resp.StatusCode == HttpStatusCode.NotFound ||
                        resp.StatusCode == HttpStatusCode.InternalServerError ||
                        resp.StatusCode == HttpStatusCode.Forbidden)
                    {
                        Trace.WriteLine(
                            string.Format(
                            @"Ignoring web exception: '{0}'.",
                            x.Message));
                        binaryContent = null;
                    }
                    else
                    {
                        throw;
                    }
                }
                else
                {
                    throw;
                }
            }
        }
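
A usage sketch for DownloadBinary follows; the URI and output path are placeholders. Note that binaryContent comes back null when one of the "ignored" HTTP errors (403, 404, 500) occurred.

        // Usage sketch (placeholder URI and output path).
        byte[] content;
        WebSiteDownloaderOptions options = new WebSiteDownloaderOptions();

        DownloadBinary(new Uri(@"http://www.example.com/logo.png"), out content, options);

        // content is null only if an ignorable HTTP error was swallowed above.
        if (content != null)
        {
            File.WriteAllBytes(@"C:\temp\logo.png", content);
        }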
Code Example #5
        /// <summary>
        /// If a proxy is required, apply it to the request.
        /// </summary>
        /// <param name="req">The req.</param>
        /// <param name="options">The options.</param>
        private static void ApplyProxy(
            WebRequest req,
            WebSiteDownloaderOptions options)
        {
            switch (options.ProxyUsage)
            {
                default:
                case DownloadProxyUsage.Default:
                    req.Proxy = WebRequest.DefaultWebProxy;
                    break;

                case DownloadProxyUsage.NoProxy:
                    req.Proxy = null;
                    break;

                case DownloadProxyUsage.UseProxy:
                    Debug.Assert(options.Proxy != null);
                    req.Proxy = options.Proxy;
                    break;
            }
        }
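
The three DownloadProxyUsage modes handled above would be configured on the options object roughly as follows; the proxy address is a placeholder, and it is assumed that options.Proxy accepts a System.Net WebProxy/IWebProxy instance (as suggested by the req.Proxy assignment above).

        // Sketch of the three proxy modes (placeholder proxy address).
        WebSiteDownloaderOptions options = new WebSiteDownloaderOptions();

        options.ProxyUsage = DownloadProxyUsage.Default;    // use WebRequest.DefaultWebProxy
        // or:
        options.ProxyUsage = DownloadProxyUsage.NoProxy;    // no proxy at all
        // or:
        options.ProxyUsage = DownloadProxyUsage.UseProxy;   // explicit proxy
        options.Proxy = new WebProxy(@"http://proxy.example.com:8080");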
Code Example #6
        /// <summary>
        /// Downloads the response content of a FORM POST.
        /// </summary>
        /// <param name="absoluteUri">The absolute URI.</param>
        /// <param name="binaryContent">Content of the binary.</param>
        /// <param name="options">The options.</param>
        public static void DownloadPost(
            Uri absoluteUri,
            out byte[] binaryContent,
            WebSiteDownloaderOptions options)
        {
            Debug.WriteLine(
                string.Format(
                @"Reading content from FORM '{0}'.",
                absoluteUri));

            try
            {
                using (WebClient wc = new WebClient())
                {
                    binaryContent = wc.UploadData(absoluteUri, new byte[0]);
                }

                //HttpWebRequest req = (HttpWebRequest)WebRequest.Create(absoluteUri);

                //req.Method = @"POST";
                //req.ContentType = "application/x-www-form-urlencoded";
                //req.ContentLength = 1;

                ////Apply proxy settings.
                //ApplyProxy(req, options);

                //RequestCachePolicy cp = new RequestCachePolicy(
                //    RequestCacheLevel.BypassCache);
                //req.CachePolicy = cp;

                //using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
                //using (Stream stream = resp.GetResponseStream())
                //using (MemoryStream mem = new MemoryStream())
                //{
                //    int blockSize = 16384;
                //    byte[] blockBuffer = new byte[blockSize];
                //    int read;

                //    while ((read = stream.Read(blockBuffer, 0, blockSize)) > 0)
                //    {
                //        mem.Write(blockBuffer, 0, read);
                //    }

                //    mem.Seek(0, SeekOrigin.Begin);

                //    binaryContent = mem.GetBuffer();
                //}
            }
            catch (WebException x)
            {
                if (x.Status == WebExceptionStatus.ProtocolError)
                {
                    HttpWebResponse resp =
                        (HttpWebResponse)x.Response;

                    if (resp.StatusCode == HttpStatusCode.NotFound ||
                        resp.StatusCode == HttpStatusCode.InternalServerError ||
                        resp.StatusCode == HttpStatusCode.Forbidden)
                    {
                        Console.WriteLine(
                            string.Format(
                            @"Ignoring web exception: '{0}'.",
                            x.Message));
                        binaryContent = null;
                    }
                    else
                    {
                        throw;
                    }
                }
                else
                {
                    throw;
                }
            }
        }
Code Example #7
        /// <summary>
        /// Downloads an HTML page. Returns both the binary content
        /// and the textual representation of the HTML page.
        /// </summary>
        /// <param name="absoluteUri">The absolute URI.</param>
        /// <param name="textContent">Content of the text.</param>
        /// <param name="encodingName">Name of the encoding.</param>
        /// <param name="encoding">The encoding.</param>
        /// <param name="binaryContent">Content of the binary.</param>
        /// <param name="options">The options.</param>
        public static void DownloadHtml(
            Uri absoluteUri,
            out string textContent,
            out string encodingName,
            out Encoding encoding,
            out byte[] binaryContent,
            WebSiteDownloaderOptions options)
        {
            DownloadBinary(absoluteUri, out binaryContent, options);

            encodingName = DetectEncodingName(binaryContent);

            Debug.WriteLine(
                string.Format(
                @"Detected encoding '{0}' for remote HTML document from URL '{1}'.",
                encodingName,
                absoluteUri));

            if (binaryContent != null && binaryContent.Length > 0)
            {
                encoding = GetEncodingByName(encodingName);
                textContent = encoding.GetString(binaryContent);
            }
            else
            {
                // Default.
                encoding = Encoding.Default;
                textContent = null;
            }
        }
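
A usage sketch for DownloadHtml and its out parameters; the URI is a placeholder.

        // Usage sketch (placeholder URI): fetch a page and report the detected encoding.
        string textContent;
        string encodingName;
        Encoding encoding;
        byte[] binaryContent;
        WebSiteDownloaderOptions options = new WebSiteDownloaderOptions();

        DownloadHtml(
            new Uri(@"http://www.example.com/"),
            out textContent,
            out encodingName,
            out encoding,
            out binaryContent,
            options);

        if (textContent != null)
        {
            Console.WriteLine(@"Encoding '{0}', {1} characters.", encodingName, textContent.Length);
        }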
Code Example #8
        // ------------------------------------------------------------------
        /// <summary>
        /// Issues an HTTP HEAD request for the given URI and returns the
        /// reported content type (cached per URI).
        /// </summary>
        /// <param name="absoluteUri">The absolute URI.</param>
        /// <param name="options">The options.</param>
        /// <returns>The content type of the response, or null if the request
        /// failed with an ignorable HTTP error.</returns>
        public static string DownloadHead(
            Uri absoluteUri,
            WebSiteDownloaderOptions options)
        {
            try
            {
                if (_headPool.ContainsKey(absoluteUri))
                {
                    return _headPool[absoluteUri];
                }
                else
                {
                    Debug.WriteLine(
                        string.Format(
                        @"Reading HEAD from URL '{0}'.",
                        absoluteUri));

                    HttpWebRequest req =
                        (HttpWebRequest)WebRequest.Create(absoluteUri);
                    req.Method = @"HEAD";
                    ApplyProxy(req, options);

                    RequestCachePolicy cp = new RequestCachePolicy(
                        RequestCacheLevel.BypassCache);
                    req.CachePolicy = cp;

                    using (HttpWebResponse resp =
                        (HttpWebResponse)req.GetResponse())
                    {
                        _headPool[absoluteUri] = resp.ContentType;
                        return resp.ContentType;
                    }
                }
            }
            catch (WebException x)
            {
                if (x.Status == WebExceptionStatus.ProtocolError)
                {
                    HttpWebResponse resp =
                        (HttpWebResponse)x.Response;

                    if (resp.StatusCode == HttpStatusCode.NotFound ||
                        resp.StatusCode == HttpStatusCode.InternalServerError ||
                        resp.StatusCode == HttpStatusCode.Forbidden)
                    {
                        Trace.WriteLine(
                            string.Format(
                            @"Ignoring web exception: '{0}'.",
                            x.Message));
                        return null;
                    }
                    else
                    {
                        throw;
                    }
                }
                else
                {
                    throw;
                }
            }
        }
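
A usage sketch: check the content type via HEAD before deciding how (or whether) to download the body; the URI is a placeholder.

        // Usage sketch (placeholder URI).
        WebSiteDownloaderOptions options = new WebSiteDownloaderOptions();

        string contentType = DownloadHead(
            new Uri(@"http://www.example.com/page"), options);

        if (contentType != null &&
            contentType.StartsWith(@"text/html", StringComparison.OrdinalIgnoreCase))
        {
            // Looks like HTML; DownloadHtml could be used to fetch and decode it.
        }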
Code Example #9
        public void Run_Spider_Inflate_And_Save_Results_To_GitHub()
        {
            DateTime Start = new DateTime(2014, 2, 19);
            DateTime End = new DateTime(2015, 2, 21);

            WebSiteDownloaderOptions options =
             new WebSiteDownloaderOptions();
            options.DestinationFolderPath =
                new DirectoryInfo(dataDir);
            options.DestinationFileName = String.Format("Session-Activity[{0}][{1}].state",
                                            Start.Date.ToShortDateString().Replace("/", "-"),
                                            End.Date.ToShortDateString().Replace("/", "-"));

            options.MaximumLinkDepth = 3;
            options.TargetSession = 28;
            options.DownloadUri =
                new Uri(String.Format(@"http://www.legis.state.ak.us/basis/range_multi.asp?session={0}&Date1={1}&Date2={2}",
                            options.TargetSession,
                            Start.Date.ToShortDateString(),
                            End.Date.ToShortDateString()));

            WebSiteDownloader rslt = Spider.DownloadingProcessor(options);

            /*
            1. Select Bill Names
            * We need to know the bill name (HB16), so we can save data in a folder of the same name.
            */

            //Static List of Bills
            var masterlist = (from r in rslt.Resources
                              where r.Index == 1 && (r.AbsoluteUri.AbsoluteUri.Contains(@"get_bill.asp"))
                              select r).ToList();

            //Match bill titles in the URI (HB16,SB12..)
            Regex billTitles = new Regex(@"(?<=[=])[H|R|S][B|C|R|J]{0,3}[0-9]{1,4}", RegexOptions.IgnoreCase);

            //Return a list of the first matches
            var bills = (from b in masterlist
                         let matches = billTitles.Matches(b.AbsoluteUri.AbsoluteUri)
                         where matches.Count > 0
                         select new
                         {
                             resource = b,
                             url = b.AbsoluteUri,
                             name = matches.Cast<Match>().FirstOrDefault()
                         }).ToList();

            /*
            2. Build out directory structure for bill data.
            * We have a list of bills, now where are we going to save the data?
            */

            DirectoryInfo session = new DirectoryInfo(String.Format(@"{0}/{1}", dataDir, 28));
            if (!session.Exists)
                session.Create();

            foreach (var item in bills)
            {
                //bill directory
                DirectoryInfo bill = new DirectoryInfo(String.Format(@"{0}/{1}/{2}", dataDir, 28, item.name));
                if (!bill.Exists)
                    bill.Create();
            }

            /*
            3. Associated bill data
            *  Grab associated bill data. Name, Title, LongTitle,
             *  Minutes Content, Bill Revisions, Bill Activity
            */

            foreach (var bill in bills)
            {

                //Results placeholders
                List<iCollector> meta = new List<iCollector>();
                List<iCollector> revisions = new List<iCollector>();
                List<iCollector> minutes = new List<iCollector>();
                List<iCollector> committee = new List<iCollector>();

                //Document history, activity and kvp..
                meta.AddRange((from h in rslt.Parsings
                               where h.source.AbsoluteUri.AbsoluteUri == bill.url.AbsoluteUri
                                  || h.source.Parent.AbsoluteUri == bill.url.AbsoluteUri
                               select h).ToList());

                //Bill Content
                revisions.AddRange((from d in rslt.Parsings
                                    where d.source.Parent.AbsoluteUri
                                           .Contains(String.Format(@"get_fulltext.asp?session={0}&bill={1}", 28, bill.name))
                                    select d).ToList());

                //Committee Meetings
                committee.AddRange((from d in rslt.Resources
                                    join p in rslt.Parsings
                                    on d.AbsoluteUri.AbsoluteUri equals
                                              p.source.Parent.AbsoluteUri
                                    where p.source.AbsoluteUri.AbsoluteUri.Contains("get_minutes.asp")
                                    && d.AbsoluteUri.AbsoluteUri.Contains(String.Format("{0}", bill.name))
                                    select p).ToList());

                //Meeting Transcript (minutes)
                minutes.AddRange((from d in rslt.Resources
                                  join p in rslt.Parsings
                                  on d.AbsoluteUri.AbsoluteUri equals
                                     p.source.Parent.AbsoluteUri
                                  where p.source.AbsoluteUri.AbsoluteUri
                                         .Contains(@"get_single_minute.asp")
                                         && d.AbsoluteUri.AbsoluteUri
                                         .Contains(String.Format("{0}", bill.name))
                                  select p).ToList());

                /*
                4. Start saving off the data
                * Persist the collected parts for this bill through the git repository.
                */

                String fileLoc = String.Format(@"{0}\{1}\", 28, bill.name);

                GitRepository gr = new GitRepository();
                //Process bill parts
                gr.ProcessBill(fileLoc, new ParsedBill()
                {
                    meta = meta,
                    minutes = minutes,
                    revisions = revisions,
                    committee = committee
                });

            }

            Assert.IsTrue(true);
        }
Code Example #10
        /// <summary>
        /// Re-inflates the search results.
        /// </summary>
        //[TestMethod]
        public void Run_Spider_On_Activity_Search_Results()
        {
            DateTime Start = new DateTime(2013, 2, 15);
            DateTime End = new DateTime(2013, 2, 18);

            WebSiteDownloaderOptions options = new WebSiteDownloaderOptions();
            options.DestinationFolderPath = new DirectoryInfo(dataDir);
            options.DestinationFileName = String.Format("Session-Activity[{0}][{1}].state",
                                            Start.Date.ToShortDateString().Replace("/", "-"),
                                            End.Date.ToShortDateString().Replace("/", "-"));

            //Download queue engine.
            WebSiteDownloader downloader = new WebSiteDownloader(options);

            List<iCollector> coll = (from p in downloader.Parsings
                                     select p).ToList();

            //RegEx for matching bill copy.
            Regex r = new Regex("get[_]bill[_]text[.]asp");

            //Get all matches.
            List<iCollector> refined = (from el in coll
                                        let matches = r.Matches(el.source.AbsoluteUri.AbsoluteUri)
                                        where matches.Count != 0
                                        select el).ToList();

            Assert.IsTrue(coll.Count() > 0);
        }
Code Example #11
        /// <summary>
        /// Primary spider index and save to the result's .state file. 
        /// </summary>
        //[TestMethod]
        public void Run_Spider_On_Activity_Parsing_Using_iFollower_And_Saving_Using_GitCollector()
        {
            DateTime Start = new DateTime(2014, 2, 19);
            DateTime End = new DateTime(2015, 2, 21);

            WebSiteDownloaderOptions options =
               new WebSiteDownloaderOptions();
            options.DestinationFolderPath =
                new DirectoryInfo(dataDir);
            options.DestinationFileName = String.Format("Session-Activity[{0}][{1}].state",
                                            Start.Date.ToShortDateString().Replace("/", "-"),
                                            End.Date.ToShortDateString().Replace("/", "-"));
            options.MaximumLinkDepth = 3;
            options.TargetSession = 28;
            options.DownloadUri =
                new Uri(String.Format(@"http://www.legis.state.ak.us/basis/range_multi.asp?session={0}&Date1={1}&Date2={2}",
                            options.TargetSession,
                            Start.Date.ToShortDateString(),
                            End.Date.ToShortDateString()));

            //What pages to follow.
            options.UriFollower.Add(new Follow { depth = 1, pattern = new Regex("get_bill.asp") });
            options.UriFollower.Add(new Follow { depth = 2, pattern = new Regex("get_fulltext.asp") });
            options.UriFollower.Add(new Follow { depth = 2, pattern = new Regex("get_complete_bill.asp") });
            options.UriFollower.Add(new Follow { depth = 2, pattern = new Regex("get_minutes.asp") });
            options.UriFollower.Add(new Follow { depth = 3, pattern = new Regex("get_bill_text.asp") });
            options.UriFollower.Add(new Follow { depth = 3, pattern = new Regex("get_single_minute.asp[?]ch") });

            //What content to serialize and save off.
            options.GitCollectionRequest.Add(new DocumentHistory() { pageName = "get_fulltext.asp", pageType = UriType.Form });
            options.GitCollectionRequest.Add(new DocumentActivity() { pageName = "get_complete_bill.asp", pageType = UriType.Form });
            options.GitCollectionRequest.Add(new DocumentMeeting() { pageName = "get_minutes.asp", pageType = UriType.Form });
            options.GitCollectionRequest.Add(new DocumentCopy() { pageName = "get_bill_text.asp", pageType = UriType.Content });
            options.GitCollectionRequest.Add(new DocumentCopy() { pageName = "get_single_minute.asp", pageType = UriType.Content });

            //Specialized content filter using regular expression matches.
            var d = new Dictionary<String, Regex>();
            d.Add("Bill Name", new Regex(@"(?<=<b>BILL:</b>)[a-z,\s,\w,(,),"",',;,.]{5,800}", RegexOptions.IgnoreCase));
            d.Add("Title", new Regex(@"(?<=<b>TITLE:</b>)[a-z,\s,\w,(,),"",',\-,;,.]{5,5000}", RegexOptions.IgnoreCase));
            d.Add("Short Title", new Regex(@"(?<=<b>SHORT TITLE:</b>)(.*)(?=</font>)", RegexOptions.IgnoreCase));
            d.Add("Status Date", new Regex(@"(?<=<b>STATUS DATE:</b>)[0-9,/\,\w\s]{5,50}", RegexOptions.IgnoreCase));
            d.Add("Current Status", new Regex(@"(?<=<b>CURRENT STATUS:</b>)[a-z,\s,&,;,\ ,(,),/,0-9]{2,60}", RegexOptions.IgnoreCase));
            d.Add("Sponsors", new Regex(@"(?<=<b>SPONSOR[(]S[)]:</b>)[a-z,\s,\w,(,),.,;,""]{5,800}", RegexOptions.IgnoreCase));

            //RegEx matching container.
            options.GitCollectionRequest.Add(new DocumentKVP() { pageName = "get_bill.asp", pageType = UriType.Content, rvp = d });

            //Re-inflate the indexing.
            WebSiteDownloader rslt = Spider.DownloadingProcessor(options);

            Assert.IsTrue(rslt.Parsings.Count() > 0);
        }
Code Example #12
        //[TestMethod]
        public void Run_Spider_On_Activity_Parsing_Using_iFollower()
        {
            DateTime Start = new DateTime(2013, 2, 16);
            DateTime End = new DateTime(2013, 2, 21);

            WebSiteDownloaderOptions options =
               new WebSiteDownloaderOptions();
            options.DestinationFolderPath =
                new DirectoryInfo(dataDir);
            options.DestinationFileName = String.Format("Session-Activity[{0}][{1}].state",
                                            Start.Date.ToShortDateString().Replace("/", "-"),
                                            End.Date.ToShortDateString().Replace("/", "-"));
            options.MaximumLinkDepth = 3;
            options.TargetSession = 28;
            options.DownloadUri =
                new Uri(String.Format(@"http://www.legis.state.ak.us/basis/range_multi.asp?session={0}&Date1={1}&Date2={2}",
                            options.TargetSession,
                            Start.Date.ToShortDateString(),
                            End.Date.ToShortDateString()));

            options.UriFollower.Add(new Follow { depth = 1, pattern = new Regex("get_bill.asp") });
            options.UriFollower.Add(new Follow { depth = 2, pattern = new Regex("get_fulltext.asp") });
            options.UriFollower.Add(new Follow { depth = 3, pattern = new Regex("get_bill_text.asp") });

            //Re-inflate the indexing.
            WebSiteDownloader rslt = Spider.DownloadingProcessor(options);

            Assert.IsTrue(rslt.Resources.Count() > 0);
        }
Code Example #13
        /// <summary>
        /// Basic version of the spider, fewer spidering options are set.
        /// </summary>
        //[TestMethod]
        public void Run_Spider_On_Activity_Parsing_Run()
        {
            DateTime Start = new DateTime(2013, 2, 15);
            DateTime End = new DateTime(2013, 2, 18);

            WebSiteDownloaderOptions options =
               new WebSiteDownloaderOptions();
            options.DestinationFolderPath =
                new DirectoryInfo(dataDir);
            options.DestinationFileName = String.Format("Session-Activity[{0}][{1}].state",
                                            Start.Date.ToShortDateString().Replace("/", "-"),
                                            End.Date.ToShortDateString().Replace("/", "-"));
            options.MaximumLinkDepth = 1;
            options.TargetSession = 28;
            options.DownloadUri =
                new Uri(String.Format(@"http://www.legis.state.ak.us/basis/range_multi.asp?session={0}&Date1={1}&Date2={2}",
                            options.TargetSession,
                            Start.Date.ToShortDateString(),
                            End.Date.ToShortDateString()));

            //Create git repo hooks.
            options.GitCollectionRequest.Add(new DocumentHistory() { pageName = "get_fulltext.asp", pageType = UriType.Form });
            options.GitCollectionRequest.Add(new DocumentActivity() { pageName = "get_complete_bill.asp", pageType = UriType.Form });
            options.GitCollectionRequest.Add(new DocumentCopy() { pageName = "get_bill_text.asp", pageType = UriType.Content });
            options.GitCollectionRequest.Add(new DocumentCopy() { pageName = "get_single_minute.asp", pageType = UriType.Content });

            //Regular expression matches.
            var d = new Dictionary<String, Regex>();
            d.Add("Title", new Regex(@"(?<=<b>TITLE:</b>)[a-z,\s,\w,(,),"",',;,.]{5,800}", RegexOptions.IgnoreCase));
            d.Add("Short Title", new Regex(@"(?<=<b>SHORT TITLE:</b>)(.*)(?=</font>)", RegexOptions.IgnoreCase));
            d.Add("Status Date", new Regex(@"(?<=<b>STATUS DATE:</b>)[0-9,/\,\w\s]{5,50}", RegexOptions.IgnoreCase));
            d.Add("Current Status", new Regex(@"(?<=<b>CURRENT STATUS:</b>)[a-z,\s,\ ,(,)]{10,60}", RegexOptions.IgnoreCase));
            d.Add("Sponsors", new Regex(@"(?<=<b>SPONSOR[(]S[)]:</b>)[a-z,\s,\w,(,),.,;,""]{5,800}", RegexOptions.IgnoreCase));
            //Regex kvp matching container.
            options.GitCollectionRequest.Add(new DocumentKVP() { pageName = "get_bill.asp", pageType = UriType.Content, rvp = d });

            //Re-inflate the indexing.
            WebSiteDownloader rslt = Spider.DownloadingProcessor(options);

            Assert.IsTrue(rslt.Parsings.Count() > 0);
        }
Code Example #14
        //[TestMethod]
        public void Run_Spider_Inflate_Check_Results_For_Minutes()
        {
            DateTime Start = new DateTime(2013, 2, 16);
            DateTime End = new DateTime(2013, 2, 21);

            WebSiteDownloaderOptions options =
               new WebSiteDownloaderOptions();
            options.DestinationFolderPath =
                new DirectoryInfo(dataDir);
            options.DestinationFileName = String.Format("Session-Activity[{0}][{1}].state",
                                            Start.Date.ToShortDateString().Replace("/", "-"),
                                            End.Date.ToShortDateString().Replace("/", "-"));
            options.MaximumLinkDepth = 3;
            options.TargetSession = 28;
            options.DownloadUri =
                new Uri(String.Format(@"http://www.legis.state.ak.us/basis/range_multi.asp?session={0}&Date1={1}&Date2={2}",
                            options.TargetSession,
                            Start.Date.ToShortDateString(),
                            End.Date.ToShortDateString()));

            WebSiteDownloader rslt = Spider.DownloadingProcessor(options);

            var minutes = (from m in rslt.Resources
                           where m.AbsoluteUri.AbsoluteUri.Contains(@"get_minutes.asp")
                           select m).ToList();

            string stop = @"";
        }
Code Example #15
        /// <summary>
        /// Initializes a new instance of the 
        /// <see cref="UriResourceInformation"/> class.
        /// </summary>
        /// <param name="copyFrom">The instance to copy from.</param>
        public UriResourceInformation(
            UriResourceInformation copyFrom)
        {
            _options = copyFrom._options;
            _originalUrl = copyFrom._originalUrl;
            _relativeUri = copyFrom._relativeUri;
            _baseUri = copyFrom._baseUri;
            _absoluteUri = copyFrom._absoluteUri;
            _linkType = copyFrom._linkType;
            _parentUri = copyFrom._parentUri;
            _index = copyFrom._index; // Copy the index as well, matching the main constructor above.
        }