public Spider(Frontier frontier, Index index, Filtering.Filter filter, Action<Index> callback)
 {
     this.frontier = frontier;
     this.index = Index.CreateEmptyCopy(index); // private, initially empty copy of the shared index
     this.filter = filter;
     this.callback = callback;                  // invoked with this spider's index so results can be merged
 }
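Read in isolation: each spider pulls from a shared frontier, indexes into its own empty copy of the index, and hands that copy to the callback. A minimal single-threaded sketch of driving one spider directly, assuming Index has a public parameterless constructor (the other calls mirror StartAndWait below):

     // Hypothetical direct usage; StartAndWait below runs the same pattern on several threads.
     var frontier = new Frontier(new Exclusions());
     frontier.Add(new URL("http://example.com/"));

     Index shared = new Index();  // assumption: parameterless constructor
     var spider = new Spider(frontier, shared, new DomainFilter("example.com"), ind =>
     {
         lock (shared) { shared.MergeIn(ind); }  // fold the spider's private index back in
     });
     spider.Run();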
        public static void StartAndWait(Frontier frontier, Index index, Filtering.Filter filter, int pagecount)
        {
            // One spider per SPIDER_PAGE_COUNT pages, rounded up.
            int count = (int)Math.Ceiling(pagecount / (double)SPIDER_PAGE_COUNT);

            Spider[] spiders = new Spider[count];
            Thread[] threads = new Thread[count];

            for (int i = 0; i < count; i++)
            {
                // Keep a local reference so the thread lambda binds to this iteration's spider.
                Spider sp = spiders[i] = new Spider(frontier, index, filter, ind =>
                {
                    Console.ForegroundColor = ConsoleColor.Cyan;
                    Console.WriteLine("Merging Index of {0}", ind.SiteCount);
                    Console.ForegroundColor = ConsoleColor.Gray;

                    lock (index) { index.MergeIn(ind); }
                });
                threads[i] = new Thread(() => sp.Run());
                threads[i].Start();
            }

            // Wait for every spider to finish before returning.
            for (int i = 0; i < count; i++)
            {
                threads[i].Join();
            }
        }
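One detail worth noting in StartAndWait: each iteration copies the new spider into the local sp before the thread lambda captures it. Capturing spiders[i] directly would read i after the loop has advanced it, since a for-loop variable is shared across iterations. A self-contained demo of the same per-iteration-copy pattern:

        using System;
        using System.Threading;

        class CaptureDemo
        {
            static void Main()
            {
                var threads = new Thread[3];
                for (int i = 0; i < 3; i++)
                {
                    int local = i;  // per-iteration copy, like 'sp' above
                    threads[i] = new Thread(() => Console.WriteLine(local));
                    threads[i].Start();
                }
                foreach (var t in threads) t.Join();  // prints 0, 1, 2 in some order
            }
        }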
Example #4
        static void Main(string[] args)
        {
            Console.WindowWidth += 50;

            frontier = new Frontier(new Exclusions());
            frontier.Add(new URL("http://en.wikipedia.org/wiki/Teenage_Mutant_Ninja_Turtles"));

            // Combine filters with &: restrict to en.wikipedia.org and apply an extension-based filter.
            Filter filter = new DomainFilter("en.wikipedia.org") & new ExtentionFilter(false, "jpg", "jpeg", "gif", "png", "rar", "zip", "exe", "pdf");

            DateTime start = DateTime.Now;

            Crawler.StartAndWait(frontier, index, filter, 100);

            DateTime end = DateTime.Now;

            Console.WriteLine("Crawler done in {0:0.00} sec ({1:0.00} pages per sec).", (end - start).TotalSeconds, index.SiteCount / (end - start).TotalSeconds);
            Console.WriteLine("Press any key to start querying.");
            Console.ReadKey(true);
            Console.WriteLine();

            start = DateTime.Now;
            Ranker r = new Ranker(index, TrimmingStemmer.GetStemmer(PorterStemmer.StemTerm));

            end = DateTime.Now;
            Console.WriteLine("Ranker created in {0:0.00} sec.", (end - start).TotalSeconds);

            string searchQuery = "";

            while (true)
            {
                Console.WriteLine("Query for data below. Enter an empty string to quit.");
                Console.Write("Search for: ");
                searchQuery = Console.ReadLine();
                if (string.IsNullOrEmpty(searchQuery))  // ReadLine() returns null on end of input
                {
                    break;
                }

                start = DateTime.Now;
                foreach (var doc in r.GetHits(searchQuery).OrderByDescending(x => x.Item2))
                {
                    Console.WriteLine("Rank: {1:0.000000} for: {0}", doc.Item1.URL.Address, doc.Item2);
                }
                end = DateTime.Now;

                Console.WriteLine("Query completed in {0:0.00} sec", (end - start).TotalSeconds);
                Console.WriteLine();
            }
            frontier.Kill();
        }
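The & operator between DomainFilter and ExtentionFilter implies Filter overloads it to combine conditions. A minimal sketch of how such an overload could look; AndFilter and Accepts are illustrative names, not the library's actual API:

        public abstract class Filter
        {
            public abstract bool Accepts(URL url);

            // Hypothetical combinator: both operands must accept the URL.
            public static Filter operator &(Filter a, Filter b) => new AndFilter(a, b);

            private sealed class AndFilter : Filter
            {
                private readonly Filter _a, _b;
                public AndFilter(Filter a, Filter b) { _a = a; _b = b; }
                public override bool Accepts(URL url) => _a.Accepts(url) && _b.Accepts(url);
            }
        }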
Example #5
        public ThreadCrawler(Frontier frontier)
        {
            _frontier = frontier;

            // setup http
            var socketsHandler = new SocketsHttpHandler
            {
                AllowAutoRedirect           = true,
                PooledConnectionLifetime    = TimeSpan.FromSeconds(60),
                PooledConnectionIdleTimeout = TimeSpan.FromMinutes(5),
                MaxConnectionsPerServer     = _maxConnectionsPerServer
            };

            // Note: on .NET Core, ServicePointManager does not affect SocketsHttpHandler;
            // TLS settings for this client belong on socketsHandler.SslOptions.
            System.Net.ServicePointManager.SecurityProtocol = System.Net.SecurityProtocolType.Tls12;
            _httpClient         = new HttpClient(socketsHandler);
            _httpClient.Timeout = TimeSpan.FromSeconds(_urlTimeout);
            _httpClient.DefaultRequestHeaders.Add("User-Agent", "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; WOW64; Trident/6.0)");
        }
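The constructor only wires up the pooled HttpClient; the actual fetching happens elsewhere (SendAsync in Example #7). A sketch of what a single fetch through this client could look like; FetchAsync is a hypothetical helper, not part of the class as shown, and assumes using System.Net.Http and System.Threading.Tasks:

        // Hypothetical helper built on the client configured above.
        private async Task<string> FetchAsync(string url)
        {
            try
            {
                using var response = await _httpClient.GetAsync(url);
                response.EnsureSuccessStatusCode();
                return await response.Content.ReadAsStringAsync();
            }
            catch (HttpRequestException) { return null; }   // network error or non-success status
            catch (TaskCanceledException) { return null; }  // request exceeded _urlTimeout
        }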
Example #7
        static async System.Threading.Tasks.Task Main(string[] args)
        {
            // Seed URLs; url2 is an alternative seed, left unused here
            string url  = "https://www.easv.dk/";
            string url2 = "https://www.google.com/search?q=autonomous+agent";

            // Initialize frontier and add url
            Frontier frontier = new Frontier();

            frontier.AddUrl(url);

            // Sequential crawler
            Crawler smallSpider = new Crawler(frontier);
            //smallSpider.StartCrawling();

            // Parallel crawler using threads
            ThreadCrawler giantHouseSpider = new ThreadCrawler(frontier);
            await giantHouseSpider.SendAsync();
        }
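Example #7 keeps both crawl styles side by side: Crawler walks the frontier sequentially (its StartCrawling call is left commented out), while ThreadCrawler fans requests out through the pooled HttpClient from Example #5. Because SendAsync is awaited, Main itself has to be declared async Task.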
Example #9
 public Crawler(Frontier frontier)
 {
     _frontier = frontier;
 }
Example #10
 public void SetFrontier(Frontier frontier)
 {
     _frontier = frontier;
 }
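Examples #9 and #10 show the two injection styles: the constructor fixes the frontier for the object's lifetime, while SetFrontier swaps it at runtime. A usage sketch, assuming both members live on the same Crawler class and that Frontier has the parameterless constructor seen in Example #7:

     // Hypothetical usage; reseed a crawl by swapping in a fresh frontier.
     var crawler = new Crawler(new Frontier());

     var fresh = new Frontier();
     fresh.AddUrl("https://www.easv.dk/");
     crawler.SetFrontier(fresh);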