Exemplo n.º 1
0
        /**
         * it will invoke the worker to start working on the tasks - never returns
         */
        public void run()
        {
            int      requestNum = 0, timeoutCounter = 0;
            bool     needToTerminate = false;
            TimeSpan totalProcessTime;

            Thread.Sleep(10000);
            while (needToTerminate == false)
            {
                DateTime startTime = DateTime.Now;
                try
                {
                    //System.Console.WriteLine("-<>--------------------------------------------------------------------------");
                    SyncAccessor.getSlot(2, 1);
                    Url task = SyncAccessor.getFromQueue <Url>(_tasks, _timer);

                    //System.Console.WriteLine(" Start Working on : " + task.getUrl() + " ...");
                    ResourceContent content = _fetchers.fetchResource(task.getUrl());

                    if (content.isValid() != true)
                    {
                        timeoutCounter++;
                        //System.Console.WriteLine(" Fetch Failed Ignoring ... ");
                        continue;
                    }

                    //System.Console.WriteLine(" Fetched Successfully ... ");

                    ResourceContent modifiedContent = new ResourceContent(content.getResourceUrl(), content.getResourceType()
                                                                          , content.getResourceContent(), content.getReturnCode(), task.getRank());

                    DateTime startProcess = DateTime.Now;
                    _processors.processResource(modifiedContent);
                    DateTime endProcess = DateTime.Now;
                    totalProcessTime = endProcess - startProcess;
                    //System.Console.WriteLine(" URL Processed Successfully ... ");

                    System.Console.WriteLine(" URL Processed Successfully ... ");
                }
                catch (Exception e)
                {
                    //System.Console.WriteLine("[Exception Happened] " + e);
                    RuntimeStatistics.addToErrors(1);
                    continue;
                }
                DateTime endTime          = DateTime.Now;
                TimeSpan totalRequestTime = endTime - startTime;

                if (LogDebuggerControl.getInstance().enableTiming)
                {
                    // write request time to timing log file
                    StreamWriter sw = new
                                      StreamWriter("_DEBUG_INFO_TIMING@" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true);
                    sw.WriteLine(" TIMING FOR REQ - " + requestNum++ + " takes about " + totalRequestTime.TotalSeconds + " s, Processed At " + totalProcessTime.TotalSeconds + " s");
                    sw.Close();
                }
            }
        }
Exemplo n.º 2
0
 /**
  * This is a test method that checks if the class resourceContent works fine
  */
 public static void Test()
 {
     String rsrcurl = "www.adamsearch.com";
        ResourceType rsrcType = ResourceType.HtmlResource;
        String rsrcContent = "Please click the next buttom to start crawling !!";
        int rtrnCode = 200;
        ResourceContent htmlResource1 = new ResourceContent(rsrcurl, rsrcType, rsrcContent, rtrnCode,0);
        Console.WriteLine("The resource is : " + htmlResource1.isValid());
        Console.WriteLine("Get url: " + htmlResource1.getResourceUrl());
        Console.WriteLine("Get resourceType: " + htmlResource1.getResourceType());
        Console.WriteLine("Get resourceContent: " + htmlResource1.getResourceContent());
        Console.WriteLine("to string:\n" + htmlResource1);
 }
Exemplo n.º 3
0
        /**
         * This is a test method that checks if the class resourceContent works fine
         */
        public static void Test()
        {
            String          rsrcurl       = "www.adamsearch.com";
            ResourceType    rsrcType      = ResourceType.HtmlResource;
            String          rsrcContent   = "Please click the next buttom to start crawling !!";
            int             rtrnCode      = 200;
            ResourceContent htmlResource1 = new ResourceContent(rsrcurl, rsrcType, rsrcContent, rtrnCode, 0);

            Console.WriteLine("The resource is : " + htmlResource1.isValid());
            Console.WriteLine("Get url: " + htmlResource1.getResourceUrl());
            Console.WriteLine("Get resourceType: " + htmlResource1.getResourceType());
            Console.WriteLine("Get resourceContent: " + htmlResource1.getResourceContent());
            Console.WriteLine("to string:\n" + htmlResource1);
        }
        /**
         * This method tries to process the given content assuming that the given content
         * can be processed via this processor
         */
        public void process(ResourceContent resource)
        {
            DateTime startTime = DateTime.Now;

            List <LinkItem> listOfLinks;

            //extract all the links in page
            listOfLinks = extractor.extractLinks(resource.getResourceUrl(), resource.getResourceContent());
            RuntimeStatistics.addToExtractedUrls(listOfLinks.Count);

            DateTime extEndTime = DateTime.Now;

            /*** 1. Extracting the link from the request ***/
            TimeSpan extRequest = extEndTime - startTime;

            //reset the dictionary in filter that contains the urls from the same page
            filter.resetDictionary();
            int filteredUrlsCount = 0;

            foreach (LinkItem item in listOfLinks)
            {
                //Filter the links and return only links that can be crawled
                List <String> links = new List <String>();
                links.Add(item.getLink());
                List <String> filteredLinks = filter.filterLinks(links);

                //If filteredLinks is not empty
                if (filteredLinks.Count > 0)
                {
                    filteredUrlsCount++;
                    Url url = new Url(filteredLinks[0], hashUrl(filteredLinks[0]), ranker.rankUrl(resource, item),
                                      item.getDomainUrl(), hashUrl(item.getDomainUrl()));
                    deployLinksToFrontier(url);
                    RuntimeStatistics.addToFeedUrls(1);
                }
            }

            DateTime catStartTime = DateTime.Now;

            /*** 2. Ranking and deployment to the frontier ***/
            TimeSpan rankTotalRequest = catStartTime - extEndTime;

            //Ascribe the url to all the categories it is belonged to.
            List <Result> classifiedResults = categorizer.classifyContent(resource.getResourceContent(),
                                                                          resource.getResourceUrl());

            if (classifiedResults.Count != 0)
            {
                RuntimeStatistics.addToCrawledUrls(1);
            }
            DateTime catEndTime = DateTime.Now;

            /*** 3. Classification of the current request ***/
            TimeSpan catTotalRequest = catEndTime - catStartTime;

            //Save all the results to Storage
            foreach (Result classifiedResult in classifiedResults)
            {
                Result result = new Result("0", classifiedResult.getUrl(), classifiedResult.getCategoryID(),
                                           resource.getRankOfUrl(), classifiedResult.getTrustMeter());
                deployResourceToStorage(result);
            }

            DateTime endTime = DateTime.Now;
            /*** 4. deployment to the database (result) ***/
            TimeSpan deployRequest = endTime - catEndTime;

            /*** $. Total processing time ***/
            TimeSpan totalRequest = endTime - startTime;

            // write request time to timing log file
            if (LogDebuggerControl.getInstance().enableTiming)
            {
                StreamWriter sw = new
                                  StreamWriter("_DEBUG_INFO_PROCESSOR_TIMING@" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true);
                sw.WriteLine(" TIMING FOR REQ - [] ");
                sw.WriteLine(" - Extractor Time " + extRequest.TotalSeconds + " seconds ");
                sw.WriteLine(" - Ranker    Time " + extRequest.TotalSeconds + " seconds ");
                sw.WriteLine(" - Categori. Time " + catTotalRequest.TotalSeconds + " seconds ");
                sw.WriteLine(" - Deploy    Time " + deployRequest.TotalSeconds + " seconds ");
                sw.WriteLine(" - Total Timing " + totalRequest.TotalSeconds + " seconds ");
                sw.Close();
            }
        }
        /**
         * This method tries to process the given content assuming that the given content
         * can be processed via this processor
         */
        public void process(ResourceContent resource)
        {
            DateTime startTime = DateTime.Now;

            List<LinkItem> listOfLinks;
            //extract all the links in page
            listOfLinks = extractor.extractLinks(resource.getResourceUrl(), resource.getResourceContent());
            RuntimeStatistics.addToExtractedUrls(listOfLinks.Count);

            DateTime extEndTime = DateTime.Now;

            /*** 1. Extracting the link from the request ***/
            TimeSpan extRequest = extEndTime - startTime;

            //reset the dictionary in filter that contains the urls from the same page
            filter.resetDictionary();
            int filteredUrlsCount = 0;
            foreach(LinkItem item in listOfLinks )
            {
                //Filter the links and return only links that can be crawled
                List<String>links = new List<String>();
                links.Add(item.getLink());
                List<String> filteredLinks = filter.filterLinks(links);

                //If filteredLinks is not empty
                if (filteredLinks.Count > 0)
                {
                    filteredUrlsCount++;
                    Url url = new Url(filteredLinks[0], hashUrl(filteredLinks[0]), ranker.rankUrl(resource,item),
                                      item.getDomainUrl(), hashUrl(item.getDomainUrl()));
                    deployLinksToFrontier(url);
                    RuntimeStatistics.addToFeedUrls(1);

                }
            }

            DateTime catStartTime = DateTime.Now;

            /*** 2. Ranking and deployment to the frontier ***/
            TimeSpan rankTotalRequest = catStartTime - extEndTime;

            //Ascribe the url to all the categories it is belonged to.
            List<Result> classifiedResults = categorizer.classifyContent(resource.getResourceContent(),
                                                                            resource.getResourceUrl());
            if (classifiedResults.Count != 0) RuntimeStatistics.addToCrawledUrls(1);
            DateTime catEndTime = DateTime.Now;

            /*** 3. Classification of the current request ***/
            TimeSpan catTotalRequest = catEndTime - catStartTime;

            //Save all the results to Storage
            foreach (Result classifiedResult in classifiedResults)
            {
                Result result = new Result("0", classifiedResult.getUrl(),classifiedResult.getCategoryID(),
                            resource.getRankOfUrl(), classifiedResult.getTrustMeter());
                deployResourceToStorage(result);
            }

            DateTime endTime = DateTime.Now;
            /*** 4. deployment to the database (result) ***/
            TimeSpan deployRequest = endTime - catEndTime;

            /*** $. Total processing time ***/
            TimeSpan totalRequest = endTime - startTime;

            // write request time to timing log file
            if (LogDebuggerControl.getInstance().enableTiming)
            {
                StreamWriter sw = new
                        StreamWriter("_DEBUG_INFO_PROCESSOR_TIMING@" + System.Threading.Thread.CurrentThread.ManagedThreadId + ".txt", true);
                sw.WriteLine(" TIMING FOR REQ - [] ");
                sw.WriteLine(" - Extractor Time " + extRequest.TotalSeconds + " seconds ");
                sw.WriteLine(" - Ranker    Time " + extRequest.TotalSeconds + " seconds ");
                sw.WriteLine(" - Categori. Time " + catTotalRequest.TotalSeconds + " seconds ");
                sw.WriteLine(" - Deploy    Time " + deployRequest.TotalSeconds + " seconds ");
                sw.WriteLine(" - Total Timing " + totalRequest.TotalSeconds + " seconds ");
                sw.Close();
            }
        }