Esempio n. 1
0
        /// <summary>
        /// Console entry point: seeds the frontier with a fixed start URL, runs the crawler,
        /// builds a <c>Ranker</c> over the resulting index, and then answers search queries
        /// read from standard input until an empty line (or end of input) is entered.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // Widen the console for long URLs, clamped to the largest width the screen allows
            // so the setter does not throw ArgumentOutOfRangeException on small displays.
            Console.WindowWidth = Math.Min(Console.WindowWidth + 50, Console.LargestWindowWidth);

            frontier = new Frontier(new Exclusions());
            frontier.Add(new URL("http://en.wikipedia.org/wiki/Teenage_Mutant_Ninja_Turtles"));

            // NOTE(review): presumably keeps the crawl on en.wikipedia.org and rejects the listed
            // file extensions — confirm against DomainFilter / ExtentionFilter.
            Filter filter = new DomainFilter("en.wikipedia.org") & new ExtentionFilter(false, "jpg", "jpeg", "gif", "png", "rar", "zip", "exe", "pdf");

            // Stopwatch is monotonic; DateTime.Now differences can be skewed by clock adjustments.
            var timer = System.Diagnostics.Stopwatch.StartNew();

            Crawler.StartAndWait(frontier, index, filter, 100);

            timer.Stop();
            double crawlSeconds = timer.Elapsed.TotalSeconds;

            Console.WriteLine("Crawler done in {0:0.00} sec ({1:0.00} pages per sec).", crawlSeconds, index.SiteCount / crawlSeconds);
            Console.WriteLine("Press any key to start querying.");
            Console.ReadKey(true);
            Console.WriteLine();

            timer.Restart();
            Ranker r = new Ranker(index, TrimmingStemmer.GetStemmer(PorterStemmer.StemTerm));
            timer.Stop();

            Console.WriteLine("Ranker created in {0:0.00} sec.", timer.Elapsed.TotalSeconds);

            while (true)
            {
                Console.WriteLine("Query for data below. Enter an empty string to quit.");
                Console.Write("Search for: ");
                string searchQuery = Console.ReadLine();

                // ReadLine returns null at end of input (Ctrl+Z, redirected stdin); treat that
                // the same as an empty line instead of passing null to the ranker.
                if (string.IsNullOrEmpty(searchQuery))
                {
                    break;
                }

                timer.Restart();
                // Print the hits, highest rank (Item2) first.
                foreach (var doc in r.GetHits(searchQuery).OrderByDescending(x => x.Item2))
                {
                    Console.WriteLine("Rank: {1:0.000000} for: {0}", doc.Item1.URL.Address, doc.Item2);
                }
                timer.Stop();

                Console.WriteLine("Query completed in {0:0.00} sec", timer.Elapsed.TotalSeconds);
                Console.WriteLine();
            }
            frontier.Kill();
        }
        /// <summary>
        /// Console entry point. Crawls from a fixed seed URL, reports crawl throughput, creates
        /// a <c>Ranker</c> for the populated index, and runs an interactive query loop that ends
        /// on an empty line or when standard input is exhausted.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // Grow the window by up to 50 columns; clamping to LargestWindowWidth avoids the
            // ArgumentOutOfRangeException the setter raises for widths the screen cannot show.
            int widenedWidth = Console.WindowWidth + 50;
            Console.WindowWidth = Math.Min(widenedWidth, Console.LargestWindowWidth);

            frontier = new Frontier(new Exclusions());
            frontier.Add(new URL("http://en.wikipedia.org/wiki/Teenage_Mutant_Ninja_Turtles"));

            // NOTE(review): looks like a domain restriction combined with an extension
            // blacklist — verify the meaning of the `false` flag on ExtentionFilter.
            Filter filter = new DomainFilter("en.wikipedia.org") & new ExtentionFilter(false, "jpg", "jpeg", "gif", "png", "rar", "zip", "exe", "pdf");

            // Measure elapsed time with a monotonic Stopwatch rather than wall-clock DateTime.Now.
            var stopwatch = System.Diagnostics.Stopwatch.StartNew();
            Crawler.StartAndWait(frontier, index, filter, 100);
            stopwatch.Stop();

            double crawlSeconds = stopwatch.Elapsed.TotalSeconds;
            Console.WriteLine("Crawler done in {0:0.00} sec ({1:0.00} pages per sec).", crawlSeconds, index.SiteCount / crawlSeconds);
            Console.WriteLine("Press any key to start querying.");
            Console.ReadKey(true);
            Console.WriteLine();

            stopwatch.Restart();
            Ranker r = new Ranker(index, TrimmingStemmer.GetStemmer(PorterStemmer.StemTerm));
            stopwatch.Stop();
            Console.WriteLine("Ranker created in {0:0.00} sec.", stopwatch.Elapsed.TotalSeconds);

            string searchQuery = "";
            while (true)
            {
                Console.WriteLine("Query for data below. Enter an empty string to quit.");
                Console.Write("Search for: ");
                searchQuery = Console.ReadLine();

                // A null result means end of input; quit instead of querying with null.
                if (string.IsNullOrEmpty(searchQuery))
                {
                    break;
                }

                stopwatch.Restart();
                // Hits are (document, rank) pairs; list them best rank first.
                foreach (var doc in r.GetHits(searchQuery).OrderByDescending(x => x.Item2))
                {
                    Console.WriteLine("Rank: {1:0.000000} for: {0}", doc.Item1.URL.Address, doc.Item2);
                }
                stopwatch.Stop();

                Console.WriteLine("Query completed in {0:0.00} sec", stopwatch.Elapsed.TotalSeconds);
                Console.WriteLine();
            }
            frontier.Kill();
        }
            /// <summary>
            /// Crawl loop: takes documents from the frontier, tries to add each one to the index,
            /// and for every newly added document queues its outgoing links that pass the filter.
            /// Stops after <c>SPIDER_PAGE_COUNT</c> documents or when the frontier returns null,
            /// then invokes the callback with the index.
            /// </summary>
            public void Run()
            {
                // Poll until the frontier hands out a first document.
                // NOTE(review): if frontier.Next() returns null without blocking, this loop
                // spins at full CPU while waiting — confirm the semantics of Next().
                Document doc = null;
                while (doc == null)
                {
                    doc = frontier.Next();
                }

                int count = 0;
                while (doc != null)
                {
                    // Links are only extracted when TryAddUrl succeeds.
                    if (index.TryAddUrl(doc))
                    {
                        // Materialise the links before touching the frontier.
                        var links = GetLinks(doc.URL, doc.HTML).ToArray();

                        foreach (var link in links)
                        {
                            if (filter.Allow(link))
                            {
                                frontier.Add(link);
                            }
                        }
                    }
                    Console.WriteLine("{0}", doc.URL);

                    // Every processed document counts towards the limit, whether or not
                    // TryAddUrl accepted it.
                    count++;
                    if (count == SPIDER_PAGE_COUNT)
                    {
                        break;
                    }
                    doc = frontier.Next();
                }

                callback(this.index);
            }