/// <summary>
/// Initializes a spider from the supplied frontier, filter and callback, together with
/// an index obtained from <c>Index.CreateEmptyCopy</c>.
/// </summary>
/// <param name="frontier">Frontier stored on this spider.</param>
/// <param name="index">
/// Index handed to <c>Index.CreateEmptyCopy</c>; the spider keeps the returned instance
/// rather than this argument.
/// </param>
/// <param name="filter">Filter stored on this spider.</param>
/// <param name="callback">Delegate accepting an <c>Index</c>, stored on this spider.</param>
public Spider(
    Frontier frontier,
    Index index,
    Filtering.Filter filter,
    Action<Index> callback)
{
    this.frontier = frontier;
    this.filter = filter;
    this.callback = callback;

    // The spider does not hold on to the caller's index directly.
    this.index = Index.CreateEmptyCopy(index);
}
/// <summary>
/// Builds a spider: stores the frontier, filter and callback, and keeps the index
/// returned by <c>Index.CreateEmptyCopy</c> for the given <paramref name="index"/>.
/// </summary>
/// <param name="frontier">Frontier assigned to this spider.</param>
/// <param name="index">Index passed to <c>Index.CreateEmptyCopy</c>; not stored itself.</param>
/// <param name="filter">Filter assigned to this spider.</param>
/// <param name="callback">Delegate taking an <c>Index</c>, assigned to this spider.</param>
public Spider(Frontier frontier, Index index, Filtering.Filter filter, Action<Index> callback)
{
    // Stored is whatever CreateEmptyCopy returns, not the argument.
    this.index = Index.CreateEmptyCopy(index);

    this.callback = callback;
    this.filter = filter;
    this.frontier = frontier;
}
/// <summary>
/// Starts one thread per spider and blocks until every thread has finished.
/// </summary>
/// <remarks>
/// The number of spiders is <paramref name="pagecount"/> divided by <c>SPIDER_PAGE_COUNT</c>,
/// rounded up. Each spider receives a callback that merges the index it is given into
/// <paramref name="index"/> while holding a lock on <paramref name="index"/>.
/// </remarks>
/// <param name="frontier">Frontier passed to every spider.</param>
/// <param name="index">Index passed to every spider and used as the merge target and lock object.</param>
/// <param name="filter">Filter passed to every spider.</param>
/// <param name="pagecount">Page count used to derive how many spiders are started.</param>
public static void StartAndWait(Frontier frontier, Index index, Filtering.Filter filter, int pagecount)
{
    int count = (int)Math.Ceiling(pagecount / (double)SPIDER_PAGE_COUNT);
    Thread[] threads = new Thread[count];

    for (int i = 0; i < count; i++)
    {
        Spider sp = new Spider(frontier, index, filter, ind =>
        {
            // The colour change, the message and the colour reset are done under the same
            // lock as the merge, so callbacks running on different threads cannot
            // interleave their console state with one another.
            lock (index)
            {
                Console.ForegroundColor = ConsoleColor.Cyan;
                Console.WriteLine("Merging Index of {0}", ind.SiteCount);
                Console.ForegroundColor = ConsoleColor.Gray;

                index.MergeIn(ind);
            }
        });

        threads[i] = new Thread(() => sp.Run());
        threads[i].Start();
    }

    // Wait for every spider thread before returning.
    for (int i = 0; i < count; i++)
    {
        threads[i].Join();
    }
}
/// <summary>
/// Console entry point: crawls from a fixed seed URL, builds a ranker over the resulting
/// index, then answers queries read from standard input until an empty line or end of input.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    Console.WindowWidth += 50;

    frontier = new Frontier(new Exclusions());
    frontier.Add(new URL("http://en.wikipedia.org/wiki/Teenage_Mutant_Ninja_Turtles"));

    Filter filter = new DomainFilter("en.wikipedia.org")
        & new ExtentionFilter(false, "jpg", "jpeg", "gif", "png", "rar", "zip", "exe", "pdf");

    // Stopwatch measures elapsed time independently of wall-clock adjustments,
    // which DateTime.Now differences are subject to.
    System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();
    Crawler.StartAndWait(frontier, index, filter, 100);
    timer.Stop();
    double crawlSeconds = timer.Elapsed.TotalSeconds;
    Console.WriteLine("Crawler done in {0:0.00} sec ({1:0.00} pages per sec).", crawlSeconds, index.SiteCount / crawlSeconds);

    Console.WriteLine("Press any key to start querying.");
    Console.ReadKey(true);
    Console.WriteLine();

    timer.Restart();
    Ranker r = new Ranker(index, TrimmingStemmer.GetStemmer(PorterStemmer.StemTerm));
    timer.Stop();
    Console.WriteLine("Ranker created in {0:0.00} sec.", timer.Elapsed.TotalSeconds);

    while (true)
    {
        Console.WriteLine("Query for data below. Enter an empty string to quit.");
        Console.Write("Search for: ");
        string searchQuery = Console.ReadLine();

        // ReadLine returns null once input is exhausted (redirected input, Ctrl+Z);
        // treat that like an empty line instead of passing null on to the ranker.
        if (string.IsNullOrEmpty(searchQuery))
        {
            break;
        }

        timer.Restart();
        foreach (var doc in r.GetHits(searchQuery).OrderByDescending(x => x.Item2))
        {
            Console.WriteLine("Rank: {1:0.000000} for: {0}", doc.Item1.URL.Address, doc.Item2);
        }
        timer.Stop();

        Console.WriteLine("Query completed in {0:0.00} sec", timer.Elapsed.TotalSeconds);
        Console.WriteLine();
    }

    frontier.Kill();
}
/// <summary>
/// Creates a crawler for <paramref name="frontier"/> and sets up the HTTP client it holds:
/// connection pooling limits, request timeout and a fixed User-Agent header.
/// </summary>
/// <param name="frontier">Frontier stored on this crawler.</param>
public ThreadCrawler(Frontier frontier)
{
    const string userAgent = "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; WOW64; Trident/6.0)";

    _frontier = frontier;

    // Handler configuration: follow redirects, recycle pooled connections after 60 seconds,
    // drop idle ones after 5 minutes, and cap the connections opened per server.
    var handler = new SocketsHttpHandler();
    handler.AllowAutoRedirect = true;
    handler.PooledConnectionLifetime = TimeSpan.FromSeconds(60);
    handler.PooledConnectionIdleTimeout = TimeSpan.FromMinutes(5);
    handler.MaxConnectionsPerServer = _maxConnectionsPerServer;

    // Static, process-wide setting.
    // NOTE(review): confirm this actually influences a SocketsHttpHandler-based client.
    System.Net.ServicePointManager.SecurityProtocol = System.Net.SecurityProtocolType.Tls12;

    _httpClient = new HttpClient(handler);
    _httpClient.Timeout = TimeSpan.FromSeconds(_urlTimeout);
    _httpClient.DefaultRequestHeaders.Add("User-Agent", userAgent);
}
/// <summary>
/// Program entry point. Runs the crawler from a hard-coded seed URL, creates a ranker on
/// the index, and then serves interactive queries until the user enters an empty line
/// or standard input ends.
/// </summary>
/// <param name="args">Command-line arguments; ignored.</param>
static void Main(string[] args)
{
    Console.WindowWidth += 50;

    frontier = new Frontier(new Exclusions());
    frontier.Add(new URL("http://en.wikipedia.org/wiki/Teenage_Mutant_Ninja_Turtles"));
    Filter filter = new DomainFilter("en.wikipedia.org") & new ExtentionFilter(false, "jpg", "jpeg", "gif", "png", "rar", "zip", "exe", "pdf");

    // Each phase is timed with its own Stopwatch; unlike DateTime.Now subtraction this
    // is not affected by system clock changes while the phase is running.
    var crawlWatch = System.Diagnostics.Stopwatch.StartNew();
    Crawler.StartAndWait(frontier, index, filter, 100);
    double crawlSeconds = crawlWatch.Elapsed.TotalSeconds;
    Console.WriteLine("Crawler done in {0:0.00} sec ({1:0.00} pages per sec).", crawlSeconds, index.SiteCount / crawlSeconds);

    Console.WriteLine("Press any key to start querying.");
    Console.ReadKey(true);
    Console.WriteLine();

    var rankerWatch = System.Diagnostics.Stopwatch.StartNew();
    Ranker r = new Ranker(index, TrimmingStemmer.GetStemmer(PorterStemmer.StemTerm));
    Console.WriteLine("Ranker created in {0:0.00} sec.", rankerWatch.Elapsed.TotalSeconds);

    for (;;)
    {
        Console.WriteLine("Query for data below. Enter an empty string to quit.");
        Console.Write("Search for: ");

        // Console.ReadLine yields null when there is no more input; the original
        // comparison against "" let that null through to GetHits.
        string searchQuery = Console.ReadLine();
        if (string.IsNullOrEmpty(searchQuery))
        {
            break;
        }

        var queryWatch = System.Diagnostics.Stopwatch.StartNew();
        foreach (var doc in r.GetHits(searchQuery).OrderByDescending(x => x.Item2))
        {
            Console.WriteLine("Rank: {1:0.000000} for: {0}", doc.Item1.URL.Address, doc.Item2);
        }
        Console.WriteLine("Query completed in {0:0.00} sec", queryWatch.Elapsed.TotalSeconds);
        Console.WriteLine();
    }

    frontier.Kill();
}
/// <summary>
/// Entry point: creates a frontier seeded with the start URL, constructs both crawler
/// variants, and awaits <c>ThreadCrawler.SendAsync</c>.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static async System.Threading.Tasks.Task Main(string[] args)
{
    // Seed addresses. Only the first one is added to the frontier; the second is unused.
    string seedUrl = "https://www.easv.dk/";
    string alternateSeedUrl = "https://www.google.com/search?q=autonomous+agent";

    // Frontier shared by both crawlers.
    var sharedFrontier = new Frontier();
    sharedFrontier.AddUrl(seedUrl);

    // Sequential crawler: constructed, but its StartCrawling call is currently disabled.
    var sequentialCrawler = new Crawler(sharedFrontier);
    // sequentialCrawler.StartCrawling();

    // Parallel crawler using threads.
    var threadedCrawler = new ThreadCrawler(sharedFrontier);
    await threadedCrawler.SendAsync();
}
/// <summary>
/// Launches the spiders on their own threads and returns once all of them have completed.
/// </summary>
/// <remarks>
/// <c>ceil(pagecount / SPIDER_PAGE_COUNT)</c> spiders are created. The callback given to
/// each spider reports and merges the received index into <paramref name="index"/> under
/// a lock on <paramref name="index"/>.
/// </remarks>
/// <param name="frontier">Frontier handed to each spider.</param>
/// <param name="index">Target index for merges; also handed to each spider and used as the lock object.</param>
/// <param name="filter">Filter handed to each spider.</param>
/// <param name="pagecount">Page count from which the number of spiders is computed.</param>
public static void StartAndWait(Frontier frontier, Index index, Filtering.Filter filter, int pagecount)
{
    int spiderCount = (int)Math.Ceiling(pagecount / (double)SPIDER_PAGE_COUNT);
    Thread[] workers = new Thread[spiderCount];

    // Reports and merges a spider's index as one critical section: the console colour
    // switch, message and reset cannot be interleaved by another thread's merge callback.
    Action<Index> mergeIntoShared = partial =>
    {
        lock (index)
        {
            Console.ForegroundColor = ConsoleColor.Cyan;
            Console.WriteLine("Merging Index of {0}", partial.SiteCount);
            Console.ForegroundColor = ConsoleColor.Gray;
            index.MergeIn(partial);
        }
    };

    for (int i = 0; i < spiderCount; i++)
    {
        // A fresh local per iteration so each thread's lambda captures its own spider.
        Spider spider = new Spider(frontier, index, filter, mergeIntoShared);
        Thread worker = new Thread(() => spider.Run());
        workers[i] = worker;
        worker.Start();
    }

    foreach (Thread worker in workers)
    {
        worker.Join();
    }
}
/// <summary>
/// Creates a crawler that keeps a reference to the given frontier.
/// </summary>
/// <param name="frontier">Frontier stored in <c>_frontier</c>.</param>
public Crawler(Frontier frontier) => _frontier = frontier;
/// <summary>
/// Replaces the stored frontier with <paramref name="frontier"/>.
/// </summary>
/// <param name="frontier">Frontier to store in <c>_frontier</c>.</param>
public void SetFrontier(Frontier frontier) => _frontier = frontier;