/// <summary>
/// Crawls the site starting from the seed URL and stores the index when done.
/// </summary>
/// <param name="SeedUrl">The URL the crawl starts from.</param>
/// <param name="CrawlDoneCallBack">Invoked with the finished index once the crawl completes.</param>
// TODO: not sure yet whether the caller should pass a callback responsible for copying
// the IndexedPages to the IndexCache, or whether Crawl should do that automatically.
public async Task Crawl(String SeedUrl, FinishedCrawlAsync CrawlDoneCallBack)
{
    // TODO: consider spinning the whole crawl off onto a background task.

    // 1) Get the HTML of the seed page.
    String html = await CrawlPage(SeedUrl);

    // 2) Parse the URLs out of the seed page; discovered URLs are only
    //    indexed when UrlDiscovery is enabled.
    if (MvcIndexer.Configuration.UrlDiscovery)
    {
        List<String> urls = LinkParser.ParseLinks(html, SeedUrl);
        foreach (String url in urls)
            index.AddLink(url);
    }
    index.AddLinks(Indexable.GetIndexable());

    #region add the seed page
    Page p = new Page(SeedUrl, html);
    await p.RunFilters(MvcIndexer.Configuration.Filters);
    p.StripHtml();
    index.AddLink(new Link() { Crawled = true, Page = p });
    #endregion

    // 3) Cycle through the uncrawled links until everything has been crawled.
    //    Stop after five consecutive passes that find no uncrawled links.
    IEnumerable<Link> links = index.GetUncrawledLinks();
    Int32 blankcounter = 0;
    while (blankcounter < 5)
    {
        foreach (Link link in links)
        {
            await CrawlPageAsync(link.Url);
        }

        links = index.GetUncrawledLinks();
        if (links.Count() == 0)
        {
            blankcounter++;
            // Wait to give the index a chance to repopulate with more links.
            await Task.Delay(10000);
        }
        else
        {
            blankcounter = 0; // new links appeared, so reset the empty-pass counter
        }
    }

    // 4) TODO: if the crawl type is continuous, slowly churn through the links at
    //    some arbitrary rate (say a page every 3 seconds); if the crawl type is
    //    scheduled, use a TaskFactory and burn through them.

    // 5) Call the callback with the finished index.
    if (CrawlDoneCallBack != null)
        await CrawlDoneCallBack(index);
}
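
// The drain loop in step 3 assumes CrawlPageAsync adds newly discovered links to the
// index while a pass is in flight, so the loop only gives up after five consecutive
// passes that find nothing new. Below is a minimal, standalone sketch of that pattern
// (meant as its own console program, not part of this class): a plain in-memory list
// stands in for the index and Task.Delay stands in for the page fetch. Every name in
// it is illustrative, not part of MvcIndexer.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;

class DrainLoopSketch
{
    // Stand-in for the crawler's link index: a list of not-yet-crawled URLs that
    // concurrent producers (the page crawls themselves) could keep appending to.
    static readonly List<String> Uncrawled =
        new List<String> { "http://example.com/a", "http://example.com/b" };

    static async Task Main()
    {
        Int32 blankcounter = 0;
        while (blankcounter < 5)
        {
            // Snapshot the pending links, as index.GetUncrawledLinks() does.
            List<String> batch = Uncrawled.ToList();
            Uncrawled.Clear();

            foreach (String url in batch)
            {
                Console.WriteLine($"crawling {url}");
                await Task.Delay(100); // stand-in for CrawlPageAsync(url)
            }

            if (Uncrawled.Count == 0)
            {
                blankcounter++;         // one more consecutive empty pass
                await Task.Delay(1000); // give in-flight crawls a chance to add links
            }
            else
            {
                blankcounter = 0;       // new links appeared, keep draining
            }
        }
        Console.WriteLine("No new links after 5 empty passes; crawl complete.");
    }
}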
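
// On the open question above (should the caller pass a callback that copies the
// IndexedPages to the IndexCache, or should Crawl do it automatically): routing the
// copy through the callback keeps Crawl agnostic about the cache. A standalone,
// compilable sketch of that usage follows; FinishedCrawlAsync's assumed shape is
// inferred from `await CrawlDoneCallBack(index)`, and SiteIndex and IndexCache are
// hypothetical stand-ins, not MvcIndexer types.

using System;
using System.Threading.Tasks;

// Assumed shape of the callback delegate; the real FinishedCrawlAsync in MvcIndexer
// may differ, but it must be awaitable because Crawl awaits it.
public delegate Task FinishedCrawlAsync(SiteIndex index);

// Minimal stand-ins so the sketch compiles on its own; these are NOT MvcIndexer types.
public class SiteIndex { }

public static class IndexCache
{
    public static Task StoreAsync(SiteIndex index)
    {
        Console.WriteLine("finished index copied to the cache");
        return Task.CompletedTask;
    }
}

static class CallbackSketch
{
    // Stand-in for the real Crawl(String, FinishedCrawlAsync): the fetch/parse work
    // is elided, and the callback fires once with the finished index.
    static async Task Crawl(String seedUrl, FinishedCrawlAsync crawlDoneCallBack)
    {
        SiteIndex index = new SiteIndex(); // ...crawl work would happen here...
        if (crawlDoneCallBack != null)
            await crawlDoneCallBack(index);
    }

    static async Task Main()
    {
        // The caller decides what "done" means: here it copies the index into the
        // cache, which keeps Crawl itself agnostic about the IndexCache.
        await Crawl("http://example.com/", async (SiteIndex index) =>
        {
            await IndexCache.StoreAsync(index);
        });
    }
}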