        /// <summary>
        /// Starts the spider to collect the list of all relevant URLs,
        /// then analyzes every page recognized as a record page.
        /// </summary>
        /// <param name="firstUrl">The URL the crawl starts from.</param>
        private void startSpider(string firstUrl)
        {
            // Crawl from firstUrl, restricted to REQUIRED_URL_BODY, up to NUMBER_OF_PAGES_LIMIT pages.
            WebSpider spider = new WebSpider(firstUrl, REQUIRED_URL_BODY, NUMBER_OF_PAGES_LIMIT);

            spider.Execute();

            StringBuilder builder = new StringBuilder();
            int           i       = 0;

            foreach (DictionaryEntry entry in spider.WebPages)
            {
                var page = ((System.Uri)entry.Key).ToString();
                // Keep only URLs that contain the required filter string.
                if (!page.Contains(REQUIRED_URL_BODY_WITHOUT))
                {
                    continue;
                }

                // URLs marked as record pages are parsed immediately.
                if (page.Contains(RECORD_PAGE_IDENTIFIER))
                {
                    analyzeContent(page);
                }

                builder.AppendLine(page);

                i++;
                if (i % 10 == 0)
                {
                    Console.Out.WriteLine(string.Format("Progress {0} %", Math.Round((i / (double)spider.WebPages.Count * 100)), 2));
                }
            }

            // Persist the list of visited URLs and report how many record pages were analyzed.
            File.WriteAllText(outputDirectory + LOG_NAME, builder.ToString());
            Console.Out.WriteLine("Processed URLs: " + collectionIndex);
        }
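The constants, fields, and the analyzeContent helper that startSpider relies on are not part of this snippet. Below is a minimal sketch of how the enclosing class might declare them; the member names come from the code above, but every value and the analyzeContent body are assumptions:

using System;
using System.Collections;
using System.IO;
using System.Text;

public class RecordCrawler
{
    // All values below are placeholders, not the originals.
    private const string REQUIRED_URL_BODY         = "example.org/archive";
    private const string REQUIRED_URL_BODY_WITHOUT = "/archive/";
    private const string RECORD_PAGE_IDENTIFIER    = "record.aspx?id=";
    private const int    NUMBER_OF_PAGES_LIMIT     = 5000;
    private const string LOG_NAME                  = "spider.log";

    private readonly string outputDirectory = @"C:\temp\";

    // Assumed to count the record pages processed so far.
    private int collectionIndex;

    // Placeholder for the record-page parser invoked by startSpider above.
    private void analyzeContent(string url)
    {
        collectionIndex++;
        // ... parse the record page here ...
    }

    // startSpider(string firstUrl) from the example above would be a member here.
}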
Example #2
        public RunSpider(string uri, string baseUri, int maxUri)
        {
            // Crawl from the start URI, restricted to baseUri, visiting at most maxUri pages.
            Spider = new WebSpider(uri, baseUri, maxUri);
            Spider.Execute();

            // Copy the crawled pages into a fixed-size array for indexed access.
            ICollection webPages = Spider.WebPages.Values;

            Pages = new WebPageState[webPages.Count];

            int index = 0;

            foreach (WebPageState webPage in webPages)
            {
                Pages[index++] = webPage;
            }
        }
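A hypothetical caller for the constructor above, assuming Spider and Pages are public members of RunSpider and that WebPageState exposes the crawled Uri (neither declaration is shown in the snippet):

// Hypothetical usage; member accessibility and the Uri property are assumptions.
RunSpider run = new RunSpider("http://example.org/", "http://example.org/", 200);

Console.WriteLine("Pages crawled: " + run.Pages.Length);

foreach (WebPageState state in run.Pages)
{
    Console.WriteLine(state.Uri);
}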