static void Main(string[] args)
{
    var arguments = Parser.ParseArguments<CrawlerArguments>(args);

    var config = new XiaoyaCrawler.Config.CrawlerConfig
    {
        InitUrls = arguments.InitUrl.Split(","),
        UrlFileStore = new UrlFileStore(),
        UrlFrontierItemStore = new UrlFrontierItemStore(),
        LinkStore = new LinkStore(),
        PostingListStore = new PostingListStore(),
        InvertedIndexStore = new InvertedIndexStore(),
        FetchDirectory = arguments.FetchDir,
        LogDirectory = arguments.LogDir,
        MaxFetchingConcurrency = int.Parse(arguments.ThreadCount),
    };

    var urlFilters = new List<IUrlFilter>
    {
        new DomainUrlFilter(
            // URL pattern for pages in scope: bnu.edu.cn and selected oiegg.com pages
            @"^http(s)?://[\w\.\-_]*(bnu\.edu\.cn|oiegg.com($|/$|/(index|viewthread|forumdisplay).php))",
            // URL pattern to exclude: mirrors, internal services, and search/print/login/space pages
            @"(v6\.oiegg\.com)"
            + @"|http://[\w\.\-_]*(oiegg.com)"
            + @"|((cless|pb\.ss\.graduate|ipv6te)\.bnu\.edu\.cn)"
            + @"|brain\.bnu\.edu\.cn/mrbs"
            + @"|cogs\.bnu\.edu\.cn/index\.php\?action=page&pid=1[123]"
            + @"|532movie\.bnu\.edu\.cn/player"
            + @"|(/(search|print|login|space)[\./])"),
        new UrlNormalizer(),
    };

    var crawler = new Crawler(
        config,
        new SimpleUrlFrontier(config),
        new SimpleFetcher(config),
        new SimpleSimilarContentManager(config),
        urlFilters
    );

    crawler.StartAsync().GetAwaiter().GetResult();
}
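The entry point above assumes a CrawlerArguments type whose properties mirror the command-line options it reads (InitUrl, FetchDir, LogDir, ThreadCount). The following is a minimal sketch of such a type under that assumption; the actual class in the project may declare additional options or parser-specific attributes.

// Hypothetical sketch only: a plain argument holder matching the
// properties Main reads. The real CrawlerArguments may differ.
class CrawlerArguments
{
    // Comma-separated list of seed URLs (split on "," in Main)
    public string InitUrl { get; set; }

    // Directory where fetched pages are stored
    public string FetchDir { get; set; }

    // Directory where crawler logs are written
    public string LogDir { get; set; }

    // Maximum number of concurrent fetches; Main parses it with int.Parse
    public string ThreadCount { get; set; }
}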
public async Task TestCrawler()
{
    var config = new XiaoyaCrawler.Config.CrawlerConfig
    {
        InitUrls = new List<string>
        {
            "http://www.bnu.edu.cn",
        },
        UrlFileStore = new UrlFileStore(),
        UrlFrontierItemStore = new UrlFrontierItemStore(),
        LinkStore = new LinkStore(),
        FetchDirectory = fetchDir,
        LogDirectory = logDir,
        MaxFetchingConcurrency = 100,
    };

    var urlFilters = new List<IUrlFilter>
    {
        new DomainUrlFilter(@"bnu\.edu\.cn"),
    };

    var crawler = new Crawler(
        config,
        new SimpleUrlFrontier(config),
        new SimpleFetcher(config),
        new SimpleSimilarContentManager(config),
        urlFilters
    );

    // Run the crawler in the background for 10 seconds, then stop it.
    var task = Task.Run(() =>
    {
        crawler.StartAsync().GetAwaiter().GetResult();
    });

    Thread.Sleep(10000);

    await crawler.StopAsync();

    // Scan the crawler log and fail if any URL began crawling twice
    // before its previous crawl ended.
    lock (RuntimeLogger.ReadLock)
    {
        var urlLineNo = new Dictionary<string, int>();
        int lineNo = 0;
        foreach (var line in File.ReadLines(Path.Combine(logDir, "Crawler.log")))
        {
            lineNo++;
            if (!line.Contains(":") || line.Length <= line.IndexOf(":") + 2)
            {
                continue;
            }
            var url = line.Substring(line.IndexOf(":") + 2);
            if (line.StartsWith("Begin Crawl: "))
            {
                if (urlLineNo.ContainsKey(url) && urlLineNo[url] != -1)
                {
                    Assert.Fail("Duplicate Crawl: " + urlLineNo[url] + ":" + lineNo + " " + url);
                }
                else
                {
                    urlLineNo[url] = lineNo;
                }
            }
            else if (line.StartsWith("End Crawl: "))
            {
                urlLineNo[url] = -1;
            }
        }
    }
}