Code example #1
        static void Main(string[] args)
        {
            var arguments = Parser.ParseArguments<CrawlerArguments>(args);

            var config = new XiaoyaCrawler.Config.CrawlerConfig
            {
                InitUrls               = arguments.InitUrl.Split(','),
                UrlFileStore           = new UrlFileStore(),
                UrlFrontierItemStore   = new UrlFrontierItemStore(),
                LinkStore              = new LinkStore(),
                PostingListStore       = new PostingListStore(),
                InvertedIndexStore     = new InvertedIndexStore(),
                FetchDirectory         = arguments.FetchDir,
                LogDirectory           = arguments.LogDir,
                MaxFetchingConcurrency = int.Parse(arguments.ThreadCount),
            };

            // First argument: pattern URLs must match to be crawled;
            // second argument: pattern for URLs to exclude
            var urlFilters = new List<IUrlFilter>
            {
                new DomainUrlFilter(@"^http(s)?://[\w\.\-_]*(bnu\.edu\.cn|oiegg.com($|/$|/(index|viewthread|forumdisplay).php))",
                                    @"(v6\.oiegg\.com)"
                                    + @"|http://[\w\.\-_]*(oiegg.com)"
                                    + @"|((cless|pb\.ss\.graduate|ipv6te)\.bnu\.edu\.cn)"
                                    + @"|brain\.bnu\.edu\.cn/mrbs"
                                    + @"|cogs\.bnu\.edu\.cn/index\.php\?action=page&pid=1[123]"
                                    + @"|532movie\.bnu\.edu\.cn/player"
                                    + @"|(/(search|print|login|space)[\./])"),
                new UrlNormalizer(),
            };

            var crawler = new Crawler(
                config,
                new SimpleUrlFrontier(config),
                new SimpleFetcher(config),
                new SimpleSimilarContentManager(config),
                urlFilters
                );

            // Block until the crawl completes (no async Main here)
            crawler.StartAsync().GetAwaiter().GetResult();
        }
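
The Main method above assumes a CrawlerArguments type that backs the command-line options. Its definition is not shown in this example; a minimal sketch consistent with the usage above (InitUrl, FetchDir, LogDir, and ThreadCount, all consumed as strings) might look like this:

        // Hypothetical sketch only: the real CrawlerArguments definition is not
        // shown here, and any parsing attributes depend on the argument-parsing
        // library behind Parser.ParseArguments<T>.
        class CrawlerArguments
        {
            public string InitUrl     { get; set; } // comma-separated seed URLs
            public string FetchDir    { get; set; } // directory for fetched pages
            public string LogDir      { get; set; } // directory for log files
            public string ThreadCount { get; set; } // parsed with int.Parse in Main
        }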
Code example #2
        public async Task TestCrawler()
        {
            var config = new XiaoyaCrawler.Config.CrawlerConfig
            {
                InitUrls = new List<string>
                {
                    "http://www.bnu.edu.cn",
                },
                UrlFileStore           = new UrlFileStore(),
                UrlFrontierItemStore   = new UrlFrontierItemStore(),
                LinkStore              = new LinkStore(),
                FetchDirectory         = fetchDir,
                LogDirectory           = logDir,
                MaxFetchingConcurrency = 100,
            };

            var urlFilters = new List<IUrlFilter>
            {
                new DomainUrlFilter(@"bnu\.edu\.cn"),
            };

            var crawler = new Crawler(
                config,
                new SimpleUrlFrontier(config),
                new SimpleFetcher(config),
                new SimpleSimilarContentManager(config),
                urlFilters
                );

            // Run the crawler in the background, give it ten seconds to work,
            // then request a stop
            var task = Task.Run(() =>
            {
                crawler.StartAsync().GetAwaiter().GetResult();
            });

            await Task.Delay(10000);

            await crawler.StopAsync();

            // Verify from the log that no URL was crawled twice concurrently:
            // a URL may only begin a new crawl after its previous one ended
            lock (RuntimeLogger.ReadLock)
            {
                var urlLineNo = new Dictionary<string, int>();
                int lineNo    = 0;
                foreach (var line in File.ReadLines(Path.Combine(logDir, "Crawler.log")))
                {
                    lineNo++;
                    // Skip lines without a "<event>: <url>" payload
                    if (!line.Contains(":") || line.Length <= line.IndexOf(":") + 2)
                    {
                        continue;
                    }
                    var url = line.Substring(line.IndexOf(":") + 2);
                    if (line.StartsWith("Begin Crawl: "))
                    {
                        // A second "Begin Crawl" without an intervening "End Crawl" is a duplicate
                        if (urlLineNo.ContainsKey(url) && urlLineNo[url] != -1)
                        {
                            Assert.Fail("Duplicate Crawl: " + urlLineNo[url] + ":" + lineNo + " " + url);
                        }
                        else
                        {
                            urlLineNo[url] = lineNo;
                        }
                    }
                    else if (line.StartsWith("End Crawl: "))
                    {
                        // Mark the URL as finished; -1 lets it begin a new crawl later
                        urlLineNo[url] = -1;
                    }
                }
            }
        }
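
The assertion loop above implies that Crawler.log contains entries of the following shape (illustrative lines only; the actual URLs depend on the crawl):

        Begin Crawl: http://www.bnu.edu.cn
        End Crawl: http://www.bnu.edu.cn
        Begin Crawl: http://www.bnu.edu.cn

The test fails only when a "Begin Crawl" for a URL appears again before its matching "End Crawl", so the repeated crawl on the last line above would pass.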