/**
 * Tests the fetch method when it tries to fetch with a very small timeout (1 ms).
 * Expected outcome: an invalid return code and a null resource content.
 */
public void test4()
{
    Console.WriteLine("Trying to fetch using small timeout: ");
    Console.WriteLine("Fetching the url: http://www.example-code.com/csharp/spider.asp with timeout =1ms");
    ResourceContent resource3 = new HttpResourceFetcher().fetch("http://www.example-code.com/csharp/spider.asp", 1, 0);
    Console.WriteLine("The returnCode of the request is: " + resource3.getReturnCode());
    // Bug fix: the original format string had no "{0}" placeholder, so the
    // content argument was silently ignored and never printed.
    Console.WriteLine("The content of the returned resource is:{0}", resource3.getResourceContent());
    if ((resource3.getReturnCode() == IN_VALID_CODE) && (resource3.getResourceContent() == null))
    {
        Console.WriteLine("Test4 has passed");
    }
    else
    {
        Console.WriteLine("Test4 has failed");
    }
}
/**
 * Tests the fetch method when it tries to fetch a url that does not exist.
 * Expected outcome: an invalid return code and a null resource content.
 */
public void test3()
{
    Console.WriteLine("Trying to fetch corrupted url");
    Console.WriteLine("Fetching the url :http://www.adamshobash.com");
    ResourceContent resource2 = new HttpResourceFetcher().fetch("http://www.adamshobash.com", 10000, 0);
    Console.WriteLine("The returnCode of the request is: " + resource2.getReturnCode());
    // Bug fix: the original format string had no "{0}" placeholder, so the
    // content argument was silently ignored and never printed.
    Console.WriteLine("The content of the returned resource is:{0}", resource2.getResourceContent());
    if ((resource2.getReturnCode() == IN_VALID_CODE) && (resource2.getResourceContent() == null))
    {
        Console.WriteLine("Test3 has passed");
    }
    else
    {
        Console.WriteLine("Test3 has failed");
    }
}
/**
 * Tests the fetch method using an ftp url. The fetcher only supports http,
 * so the return code should be invalid and the resource content null.
 */
public void test2()
{
    Console.WriteLine("Trying to fetch using ftp:");
    Console.WriteLine("Fetching the url:ftp://ftp.site.com/dir1/dir2/file.ext.");
    ResourceContent resource1 = new HttpResourceFetcher().fetch("ftp://ftp.site.com/dir1/dir2/file.ext.", 10000, 0);
    Console.WriteLine("The returnCode of the request is: {0} ", resource1.getReturnCode());
    Console.WriteLine("The content of the returned resource is:" + resource1.getResourceContent());
    if ((resource1.getReturnCode() == IN_VALID_CODE) && (resource1.getResourceContent() == null))
    {
        // Bug fix: the pass/fail messages said "Test1" — a copy-paste error
        // that made this test's output indistinguishable from test1's.
        Console.WriteLine("Test2 has passed");
    }
    else
    {
        Console.WriteLine("Test2 has failed");
    }
}
/**
 * Tests fetching a correct (reachable) page: fetches the url and prints the
 * returned content, passing when the return code is valid.
 */
public void test1()
{
    Console.WriteLine("Testing correct url:");
    Console.WriteLine("Fetching the url: http://www.example-code.com/csharp/spider.asp");
    ResourceContent fetchedPage = new HttpResourceFetcher().fetch("http://www.example-code.com/csharp/spider.asp", 10000, 0);
    Console.WriteLine("The returnCode of the request is: " + fetchedPage.getReturnCode());
    Console.WriteLine("Content of the fetched page:");
    Console.WriteLine(fetchedPage.getResourceContent());
    // A valid return code is the sole pass criterion for this test.
    string verdict = (fetchedPage.getReturnCode() == VALID_CODE) ? "Test1 has passed" : "Test1 has failed";
    Console.WriteLine(verdict);
}
/**
 * End-to-end crawl exercise: builds a url list from four fixed pages plus the
 * task's stored seed list, then for each fetchable url runs the full pipeline —
 * fetch, link extraction, filtering, ranking, frontier deployment,
 * classification and result storage — timing each stage with TimeSpan markers.
 */
public void Test2()
{
    List<String> urls = new List<string>();
    urls.Add("http://www.autonews.com/");
    urls.Add("http://www.geonius.com/www/");
    urls.Add("http://en.wikipedia.org/wiki/Main_Page");
    urls.Add("http://www.computerworld.com/");
    // Merge in the task-specific seed list from storage.
    List<string> seeds = StorageSystem.StorageSystem.getInstance().getSeedList(taskId);
    foreach (string seed in seeds)
    {
        urls.Add(seed);
    }
    List<Category> _categories = StorageSystem.StorageSystem.getInstance().getCategories(taskId);
    Constraints _constraints = StorageSystem.StorageSystem.getInstance().getRestrictions(taskId);
    // Bug fix: removed a second getSeedList(taskId) call whose result was discarded.
    Filter filter = new Filter("http://", _constraints);
    Categorizer categorizer = new Categorizer(_categories);
    Ranker ranker = new Ranker(categorizer);
    Extractor extractor = new Extractor();
    HttpResourceFetcher httpfetcher = new HttpResourceFetcher();
    foreach (String url in urls)
    {
        DateTime startTime = DateTime.Now;
        ResourceContent resource = null;
        if (httpfetcher.canFetch(url))
            resource = httpfetcher.fetch(url, 10000, 100);
        DateTime fetchEndTime = DateTime.Now;
        // Skip urls that could not be fetched or returned no content.
        if ((resource == null) || (resource.getResourceContent() == null))
            continue;
        /*** 0. fetching the link from the internet ***/
        TimeSpan fetchingTime = fetchEndTime - startTime;
        // Extract all the links in the page. (Bug fix: dropped a dead
        // "new List<LinkItem>()" that was immediately overwritten.)
        List<LinkItem> listOfLinks = extractor.extractLinks(resource.getResourceUrl(), resource.getResourceContent());
        RuntimeStatistics.addToExtractedUrls(listOfLinks.Count);
        DateTime extEndTime = DateTime.Now;
        /*** 1. Extracting the link from the request ***/
        TimeSpan extRequest = extEndTime - fetchEndTime;
        // Reset the dictionary in filter that holds urls seen in the same page.
        filter.resetDictionary();
        int filteredUrlsCount = 0;
        foreach (LinkItem item in listOfLinks)
        {
            // Filter each link and keep only links that can be crawled.
            List<String> links = new List<String>();
            links.Add(item.getLink());
            List<String> filteredLinks = filter.filterLinks(links);
            // If filteredLinks is not empty, rank it and feed it to the frontier.
            if (filteredLinks.Count > 0)
            {
                filteredUrlsCount++;
                Url url1 = new Url(filteredLinks[0], hashUrl(filteredLinks[0]), ranker.rankUrl(resource, item), item.getDomainUrl(), hashUrl(item.getDomainUrl()));
                deployLinksToFrontier(url1);
                RuntimeStatistics.addToFeedUrls(1);
            }
        }
        DateTime catStartTime = DateTime.Now;
        /*** 2. Ranking and deployment to the frontier ***/
        TimeSpan rankTotalRequest = catStartTime - extEndTime;
        // Ascribe the url to all the categories it belongs to.
        List<Result> classifiedResults = categorizer.classifyContent(resource.getResourceContent(), resource.getResourceUrl());
        if (classifiedResults.Count != 0)
            RuntimeStatistics.addToCrawledUrls(1);
        DateTime catEndTime = DateTime.Now;
        /*** 3. Classification of the current request ***/
        TimeSpan catTotalRequest = catEndTime - catStartTime;
        foreach (Result classifiedResult in classifiedResults)
        {
            Result result = new Result("0", classifiedResult.getUrl(), classifiedResult.getCategoryID(), resource.getRankOfUrl(), classifiedResult.getTrustMeter());
            deployResourceToStorage(result);
        }
        DateTime endTime = DateTime.Now;
        /*** 4. deployment to the database (result) ***/
        TimeSpan deployRequest = endTime - catEndTime;
        /*** 5. Total processing time ***/
        TimeSpan totalRequest = endTime - startTime;
    }
}
/**
 * End-to-end crawl exercise: builds a url list from four fixed pages plus the
 * task's stored seed list, then for each fetchable url runs the full pipeline —
 * fetch, link extraction, filtering, ranking, frontier deployment,
 * classification and result storage — timing each stage with TimeSpan markers.
 */
public void Test2()
{
    List<String> urls = new List<string>();
    urls.Add("http://www.autonews.com/");
    urls.Add("http://www.geonius.com/www/");
    urls.Add("http://en.wikipedia.org/wiki/Main_Page");
    urls.Add("http://www.computerworld.com/");
    // Merge in the task-specific seed list from storage.
    List<string> seeds = StorageSystem.StorageSystem.getInstance().getSeedList(taskId);
    foreach (string seed in seeds)
    {
        urls.Add(seed);
    }
    List<Category> _categories = StorageSystem.StorageSystem.getInstance().getCategories(taskId);
    Constraints _constraints = StorageSystem.StorageSystem.getInstance().getRestrictions(taskId);
    // Bug fix: removed a second getSeedList(taskId) call whose result was discarded.
    Filter filter = new Filter("http://", _constraints);
    Categorizer categorizer = new Categorizer(_categories);
    Ranker ranker = new Ranker(categorizer);
    Extractor extractor = new Extractor();
    HttpResourceFetcher httpfetcher = new HttpResourceFetcher();
    foreach (String url in urls)
    {
        DateTime startTime = DateTime.Now;
        ResourceContent resource = null;
        if (httpfetcher.canFetch(url))
        {
            resource = httpfetcher.fetch(url, 10000, 100);
        }
        DateTime fetchEndTime = DateTime.Now;
        // Skip urls that could not be fetched or returned no content.
        if ((resource == null) || (resource.getResourceContent() == null))
        {
            continue;
        }
        /*** 0. fetching the link from the internet ***/
        TimeSpan fetchingTime = fetchEndTime - startTime;
        // Extract all the links in the page. (Bug fix: dropped a dead
        // "new List<LinkItem>()" that was immediately overwritten.)
        List<LinkItem> listOfLinks = extractor.extractLinks(resource.getResourceUrl(), resource.getResourceContent());
        RuntimeStatistics.addToExtractedUrls(listOfLinks.Count);
        DateTime extEndTime = DateTime.Now;
        /*** 1. Extracting the link from the request ***/
        TimeSpan extRequest = extEndTime - fetchEndTime;
        // Reset the dictionary in filter that holds urls seen in the same page.
        filter.resetDictionary();
        int filteredUrlsCount = 0;
        foreach (LinkItem item in listOfLinks)
        {
            // Filter each link and keep only links that can be crawled.
            List<String> links = new List<String>();
            links.Add(item.getLink());
            List<String> filteredLinks = filter.filterLinks(links);
            // If filteredLinks is not empty, rank it and feed it to the frontier.
            if (filteredLinks.Count > 0)
            {
                filteredUrlsCount++;
                Url url1 = new Url(filteredLinks[0], hashUrl(filteredLinks[0]), ranker.rankUrl(resource, item), item.getDomainUrl(), hashUrl(item.getDomainUrl()));
                deployLinksToFrontier(url1);
                RuntimeStatistics.addToFeedUrls(1);
            }
        }
        DateTime catStartTime = DateTime.Now;
        /*** 2. Ranking and deployment to the frontier ***/
        TimeSpan rankTotalRequest = catStartTime - extEndTime;
        // Ascribe the url to all the categories it belongs to.
        List<Result> classifiedResults = categorizer.classifyContent(resource.getResourceContent(), resource.getResourceUrl());
        if (classifiedResults.Count != 0)
        {
            RuntimeStatistics.addToCrawledUrls(1);
        }
        DateTime catEndTime = DateTime.Now;
        /*** 3. Classification of the current request ***/
        TimeSpan catTotalRequest = catEndTime - catStartTime;
        foreach (Result classifiedResult in classifiedResults)
        {
            Result result = new Result("0", classifiedResult.getUrl(), classifiedResult.getCategoryID(), resource.getRankOfUrl(), classifiedResult.getTrustMeter());
            deployResourceToStorage(result);
        }
        DateTime endTime = DateTime.Now;
        /*** 4. deployment to the database (result) ***/
        TimeSpan deployRequest = endTime - catEndTime;
        /*** 5. Total processing time ***/
        TimeSpan totalRequest = endTime - startTime;
    }
}