Example #1
        /// <summary>
        /// Crawls the site starting with the SeedUrl and stores the index when done.
        /// </summary>
        /// <param name="SeedUrl">The URL to start crawling from.</param>
        /// <param name="CrawlDoneCallBack">Callback invoked once the crawl has finished.</param>
        // Not sure yet whether to pass another function responsible for copying the IndexedPages
        // to the IndexCache, or just do that automatically.
        public async Task Crawl(String SeedUrl, FinishedCrawlAsync CrawlDoneCallBack)
        {
            // 0) Spin off a task or something
            // 1) Get the HTML of the seed page
            //      - put it in _content
            String html = await CrawlPage(SeedUrl);
            
            // 2) Parse URLs from the page
            //      - skip link discovery entirely if UrlDiscovery is false
            if (MvcIndexer.Configuration.UrlDiscovery)
            {
                List<String> urls = LinkParser.ParseLinks(html, SeedUrl);
                foreach (String url in urls)
                    index.AddLink(url);
            }
            
            index.AddLinks(Indexable.GetIndexable());

            #region add the seed page
            Page p = new Page(SeedUrl, html);
            
            await p.RunFilters(MvcIndexer.Configuration.Filters);
            p.StripHtml();

            index.AddLink(new Link()
            {
                Crawled = true,
                Page = p
            });
            #endregion
            // 3) Cycle through all URLs until everything has been crawled
            IEnumerable<Link> links = index.GetUncrawledLinks();
            Int32 blankcounter = 0;
            while (blankcounter < 5)
            {
                foreach (Link link in links)
                {
                    await CrawlPageAsync(link.Url);
                }
                links = index.GetUncrawledLinks();
                if (links.Count() == 0)
                {
                    // No uncrawled links yet; wait to give the index a chance to repopulate with more links.
                    blankcounter++;
                    await Task.Delay(10000);
                }
            }
            
            
            // 5) If the crawl type is continuous, slowly churn through the pages at some
            //    arbitrary rate, e.g. one page every 3 seconds.
            //    If the crawl type is scheduled, use a TaskFactory and burn through them.

            // 7) Call the callback function (not sure yet whether awaiting it here is right)
            if (CrawlDoneCallBack != null)
                await CrawlDoneCallBack(index);
        }
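
The FinishedCrawlAsync delegate and the calling code are not part of this example. Below is a minimal sketch of how the callback might be declared and how Crawl might be invoked; the SiteIndex, Crawler, and IndexCache names are placeholders invented for illustration, and the delegate signature is only assumed from the `await CrawlDoneCallBack(index)` call above.

using System.Threading.Tasks;

// Assumed callback shape: the crawler hands its finished index to the caller.
// "SiteIndex", "Crawler", and "IndexCache" are placeholders, not names from the original code.
public delegate Task FinishedCrawlAsync(SiteIndex index);

public static class CrawlerUsageSketch
{
    public static async Task RunAsync(Crawler crawler)
    {
        // Start at the seed URL and copy the finished index into a cache when the crawl completes.
        await crawler.Crawl("http://example.com/", async index =>
        {
            IndexCache.Populate(index); // hypothetical cache-population helper
            await Task.CompletedTask;
        });
    }
}

Passing the cache-copy step in as the callback, rather than doing it automatically inside Crawl, keeps the crawler unaware of how its results are consumed; that appears to be the option the comment on the method signature is weighing.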