/**
 * Tests the fetch method when it tries to fetch with a very small timeout (1 ms).
 * Expected outcome: an invalid return code and a null resource content.
 */
public void test4()
{
    Console.WriteLine("Trying to fetch using small timeout: ");
    Console.WriteLine("Fetching the url: http://www.example-code.com/csharp/spider.asp with timeout =1ms");
    ResourceContent resource3 = new HttpResourceFetcher().fetch("http://www.example-code.com/csharp/spider.asp", 1, 0);
    Console.WriteLine("The returnCode of the request is: " + resource3.getReturnCode());
    // Bug fix: the original format string had no "{0}" placeholder, so the
    // content argument was silently ignored and never printed.
    Console.WriteLine("The content of the returned resource is:{0}", resource3.getResourceContent());
    if ((resource3.getReturnCode() == IN_VALID_CODE) && (resource3.getResourceContent() == null))
    {
        Console.WriteLine("Test4 has passed");
    }
    else
    {
        Console.WriteLine("Test4 has failed");
    }
}
/**
 * Tests the fetch method when it tries to fetch a url that does not exist.
 * Expected outcome: an invalid return code and a null resource content.
 */
public void test3()
{
    Console.WriteLine("Trying to fetch corrupted url");
    Console.WriteLine("Fetching the url :http://www.adamshobash.com");
    ResourceContent resource2 = new HttpResourceFetcher().fetch("http://www.adamshobash.com", 10000, 0);
    Console.WriteLine("The returnCode of the request is: " + resource2.getReturnCode());
    // Bug fix: the original format string had no "{0}" placeholder, so the
    // content argument was silently ignored and never printed.
    Console.WriteLine("The content of the returned resource is:{0}", resource2.getResourceContent());
    if ((resource2.getReturnCode() == IN_VALID_CODE) && (resource2.getResourceContent() == null))
    {
        Console.WriteLine("Test3 has passed");
    }
    else
    {
        Console.WriteLine("Test3 has failed");
    }
}
/**
 * Tests the fetch method using an ftp url. The fetcher only supports http,
 * so the return code should be invalid and the resource content null.
 */
public void test2()
{
    Console.WriteLine("Trying to fetch using ftp:");
    Console.WriteLine("Fetching the url:ftp://ftp.site.com/dir1/dir2/file.ext.");
    ResourceContent resource1 = new HttpResourceFetcher().fetch("ftp://ftp.site.com/dir1/dir2/file.ext.", 10000, 0);
    Console.WriteLine("The returnCode of the request is: {0} ", resource1.getReturnCode());
    Console.WriteLine("The content of the returned resource is:" + resource1.getResourceContent());
    if ((resource1.getReturnCode() == IN_VALID_CODE) && (resource1.getResourceContent() == null))
    {
        // Bug fix: the pass/fail messages said "Test1" — a copy-paste error
        // that made this test's output indistinguishable from test1's.
        Console.WriteLine("Test2 has passed");
    }
    else
    {
        Console.WriteLine("Test2 has failed");
    }
}
/**
 * Tests fetching a correct (reachable) page: fetches the url and prints the
 * returned content, passing when the return code is valid.
 */
public void test1()
{
    Console.WriteLine("Testing correct url:");
    Console.WriteLine("Fetching the url: http://www.example-code.com/csharp/spider.asp");
    ResourceContent fetchedPage = new HttpResourceFetcher().fetch("http://www.example-code.com/csharp/spider.asp", 10000, 0);
    Console.WriteLine("The returnCode of the request is: " + fetchedPage.getReturnCode());
    Console.WriteLine("Content of the fetched page:");
    Console.WriteLine(fetchedPage.getResourceContent());
    // A valid return code is the sole pass criterion for this test.
    string verdict = (fetchedPage.getReturnCode() == VALID_CODE) ? "Test1 has passed" : "Test1 has failed";
    Console.WriteLine(verdict);
}
/**
 * End-to-end crawl exercise: builds a url list from four fixed pages plus the
 * task's stored seed list, then for each fetchable url runs the full pipeline —
 * fetch, link extraction, filtering, ranking, frontier deployment,
 * classification and result storage — timing each stage with TimeSpan markers.
 */
public void Test2()
{
    List<String> urls = new List<string>();
    urls.Add("http://www.autonews.com/");
    urls.Add("http://www.geonius.com/www/");
    urls.Add("http://en.wikipedia.org/wiki/Main_Page");
    urls.Add("http://www.computerworld.com/");
    // Merge in the task-specific seed list from storage.
    List<string> seeds = StorageSystem.StorageSystem.getInstance().getSeedList(taskId);
    foreach (string seed in seeds)
    {
        urls.Add(seed);
    }
    List<Category> _categories = StorageSystem.StorageSystem.getInstance().getCategories(taskId);
    Constraints _constraints = StorageSystem.StorageSystem.getInstance().getRestrictions(taskId);
    // Bug fix: removed a second getSeedList(taskId) call whose result was discarded.
    Filter filter = new Filter("http://", _constraints);
    Categorizer categorizer = new Categorizer(_categories);
    Ranker ranker = new Ranker(categorizer);
    Extractor extractor = new Extractor();
    HttpResourceFetcher httpfetcher = new HttpResourceFetcher();
    foreach (String url in urls)
    {
        DateTime startTime = DateTime.Now;
        ResourceContent resource = null;
        if (httpfetcher.canFetch(url))
            resource = httpfetcher.fetch(url, 10000, 100);
        DateTime fetchEndTime = DateTime.Now;
        // Skip urls that could not be fetched or returned no content.
        if ((resource == null) || (resource.getResourceContent() == null))
            continue;
        /*** 0. fetching the link from the internet ***/
        TimeSpan fetchingTime = fetchEndTime - startTime;
        // Extract all the links in the page. (Bug fix: dropped a dead
        // "new List<LinkItem>()" that was immediately overwritten.)
        List<LinkItem> listOfLinks = extractor.extractLinks(resource.getResourceUrl(), resource.getResourceContent());
        RuntimeStatistics.addToExtractedUrls(listOfLinks.Count);
        DateTime extEndTime = DateTime.Now;
        /*** 1. Extracting the link from the request ***/
        TimeSpan extRequest = extEndTime - fetchEndTime;
        // Reset the dictionary in filter that holds urls seen in the same page.
        filter.resetDictionary();
        int filteredUrlsCount = 0;
        foreach (LinkItem item in listOfLinks)
        {
            // Filter each link and keep only links that can be crawled.
            List<String> links = new List<String>();
            links.Add(item.getLink());
            List<String> filteredLinks = filter.filterLinks(links);
            // If filteredLinks is not empty, rank it and feed it to the frontier.
            if (filteredLinks.Count > 0)
            {
                filteredUrlsCount++;
                Url url1 = new Url(filteredLinks[0], hashUrl(filteredLinks[0]), ranker.rankUrl(resource, item), item.getDomainUrl(), hashUrl(item.getDomainUrl()));
                deployLinksToFrontier(url1);
                RuntimeStatistics.addToFeedUrls(1);
            }
        }
        DateTime catStartTime = DateTime.Now;
        /*** 2. Ranking and deployment to the frontier ***/
        TimeSpan rankTotalRequest = catStartTime - extEndTime;
        // Ascribe the url to all the categories it belongs to.
        List<Result> classifiedResults = categorizer.classifyContent(resource.getResourceContent(), resource.getResourceUrl());
        if (classifiedResults.Count != 0)
            RuntimeStatistics.addToCrawledUrls(1);
        DateTime catEndTime = DateTime.Now;
        /*** 3. Classification of the current request ***/
        TimeSpan catTotalRequest = catEndTime - catStartTime;
        foreach (Result classifiedResult in classifiedResults)
        {
            Result result = new Result("0", classifiedResult.getUrl(), classifiedResult.getCategoryID(), resource.getRankOfUrl(), classifiedResult.getTrustMeter());
            deployResourceToStorage(result);
        }
        DateTime endTime = DateTime.Now;
        /*** 4. deployment to the database (result) ***/
        TimeSpan deployRequest = endTime - catEndTime;
        /*** 5. Total processing time ***/
        TimeSpan totalRequest = endTime - startTime;
    }
}
/**
 * End-to-end crawl exercise: builds a url list from four fixed pages plus the
 * task's stored seed list, then for each fetchable url runs the full pipeline —
 * fetch, link extraction, filtering, ranking, frontier deployment,
 * classification and result storage — timing each stage with TimeSpan markers.
 */
public void Test2()
{
    List<String> urls = new List<string>();
    urls.Add("http://www.autonews.com/");
    urls.Add("http://www.geonius.com/www/");
    urls.Add("http://en.wikipedia.org/wiki/Main_Page");
    urls.Add("http://www.computerworld.com/");
    // Merge in the task-specific seed list from storage.
    List<string> seeds = StorageSystem.StorageSystem.getInstance().getSeedList(taskId);
    foreach (string seed in seeds)
    {
        urls.Add(seed);
    }
    List<Category> _categories = StorageSystem.StorageSystem.getInstance().getCategories(taskId);
    Constraints _constraints = StorageSystem.StorageSystem.getInstance().getRestrictions(taskId);
    // Bug fix: removed a second getSeedList(taskId) call whose result was discarded.
    Filter filter = new Filter("http://", _constraints);
    Categorizer categorizer = new Categorizer(_categories);
    Ranker ranker = new Ranker(categorizer);
    Extractor extractor = new Extractor();
    HttpResourceFetcher httpfetcher = new HttpResourceFetcher();
    foreach (String url in urls)
    {
        DateTime startTime = DateTime.Now;
        ResourceContent resource = null;
        if (httpfetcher.canFetch(url))
        {
            resource = httpfetcher.fetch(url, 10000, 100);
        }
        DateTime fetchEndTime = DateTime.Now;
        // Skip urls that could not be fetched or returned no content.
        if ((resource == null) || (resource.getResourceContent() == null))
        {
            continue;
        }
        /*** 0. fetching the link from the internet ***/
        TimeSpan fetchingTime = fetchEndTime - startTime;
        // Extract all the links in the page. (Bug fix: dropped a dead
        // "new List<LinkItem>()" that was immediately overwritten.)
        List<LinkItem> listOfLinks = extractor.extractLinks(resource.getResourceUrl(), resource.getResourceContent());
        RuntimeStatistics.addToExtractedUrls(listOfLinks.Count);
        DateTime extEndTime = DateTime.Now;
        /*** 1. Extracting the link from the request ***/
        TimeSpan extRequest = extEndTime - fetchEndTime;
        // Reset the dictionary in filter that holds urls seen in the same page.
        filter.resetDictionary();
        int filteredUrlsCount = 0;
        foreach (LinkItem item in listOfLinks)
        {
            // Filter each link and keep only links that can be crawled.
            List<String> links = new List<String>();
            links.Add(item.getLink());
            List<String> filteredLinks = filter.filterLinks(links);
            // If filteredLinks is not empty, rank it and feed it to the frontier.
            if (filteredLinks.Count > 0)
            {
                filteredUrlsCount++;
                Url url1 = new Url(filteredLinks[0], hashUrl(filteredLinks[0]), ranker.rankUrl(resource, item), item.getDomainUrl(), hashUrl(item.getDomainUrl()));
                deployLinksToFrontier(url1);
                RuntimeStatistics.addToFeedUrls(1);
            }
        }
        DateTime catStartTime = DateTime.Now;
        /*** 2. Ranking and deployment to the frontier ***/
        TimeSpan rankTotalRequest = catStartTime - extEndTime;
        // Ascribe the url to all the categories it belongs to.
        List<Result> classifiedResults = categorizer.classifyContent(resource.getResourceContent(), resource.getResourceUrl());
        if (classifiedResults.Count != 0)
        {
            RuntimeStatistics.addToCrawledUrls(1);
        }
        DateTime catEndTime = DateTime.Now;
        /*** 3. Classification of the current request ***/
        TimeSpan catTotalRequest = catEndTime - catStartTime;
        foreach (Result classifiedResult in classifiedResults)
        {
            Result result = new Result("0", classifiedResult.getUrl(), classifiedResult.getCategoryID(), resource.getRankOfUrl(), classifiedResult.getTrustMeter());
            deployResourceToStorage(result);
        }
        DateTime endTime = DateTime.Now;
        /*** 4. deployment to the database (result) ***/
        TimeSpan deployRequest = endTime - catEndTime;
        /*** 5. Total processing time ***/
        TimeSpan totalRequest = endTime - startTime;
    }
}