Exemple #1
0
        /// <summary>
        ///     Handles the Load event: creates the DAO, the managers and the cache, then invokes the WebPage, File and Image ID ValueChanged handlers.
        ///     Any exception raised while doing so is reported in a message box titled "Browser".
        /// </summary>
        /// <param name = "sender">The event source (not used).</param>
        /// <param name = "e">The event arguments (not used).</param>
        private void Main_Load(object sender, EventArgs e)
        {
            try
            {
                _arachnodeDAO = new ArachnodeDAO(_applicationSettings.ConnectionString, _applicationSettings, _webSettings, true, true);

                //construction order matters: an object must be created before it is handed to another constructor.
                //the ConsoleManager is consumed by the ActionManager and the RuleManager...
                _consoleManager     = new ConsoleManager <ArachnodeDAO>(_applicationSettings, _webSettings);
                _actionManager      = new ActionManager <ArachnodeDAO>(_applicationSettings, _webSettings, _consoleManager);
                _ruleManager        = new RuleManager <ArachnodeDAO>(_applicationSettings, _webSettings, _consoleManager);

                _memoryManager      = new MemoryManager <ArachnodeDAO>(_applicationSettings, _webSettings);
                _cacheManager       = new CacheManager <ArachnodeDAO>(_applicationSettings, _webSettings);
                _crawlerPeerManager = new CrawlerPeerManager <ArachnodeDAO>(_applicationSettings, _webSettings, null, _arachnodeDAO);

                //...and the Cache is consumed by the DiscoveryManager, after receiving the RuleManager created above.
                _cache            = new Cache <ArachnodeDAO>(_applicationSettings, _webSettings, null, _actionManager, _cacheManager, _crawlerPeerManager, _memoryManager, _ruleManager);
                _discoveryManager = new DiscoveryManager <ArachnodeDAO>(_applicationSettings, _webSettings, _cache, _actionManager, _cacheManager, _memoryManager, _ruleManager);

                nudWebPageID_ValueChanged(null, null);
                nudFileID_ValueChanged(null, null);
                nudImageID_ValueChanged(null, null);
            }
            catch (Exception exception)
            {
                MessageBox.Show(exception.Message + " ::" + exception.StackTrace, "Browser");
            }
        }
Exemple #2
0
        /// <summary>
        ///     Enforces the delay between web requests and the per-host daily request limit.
        ///     Blocks (polling every 10 milliseconds) until the configured number of milliseconds has elapsed since
        ///     Politeness.LastWebPageHttpWebRequestCompleted, restarts the daily window once it is more than one day old,
        ///     and disallows the request once completed plus canceled requests reach the daily maximum.
        /// </summary>
        /// <param name = "crawlRequest">The crawl request; its Politeness, OutputIsDisallowedReason and IsDisallowedReason are read and/or written.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO (not used by this rule).</param>
        /// <returns>
        ///     <c>true</c> if the daily limit has been reached; otherwise, <c>false</c>.
        /// </returns>
        public override bool IsDisallowed(CrawlRequest <TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
        {
            //a CrawlRequest with a Priority of exactly 1000001 skips the delay.
            //NOTE(review): such requests were previously described as content needed by a WebPage (e.g. an image, a file) carrying double.MaxValue - confirm which value is intended.
            bool honorsDelay = crawlRequest.Priority != 1000001;

            while (honorsDelay && DateTime.Now.Subtract(crawlRequest.Politeness.LastWebPageHttpWebRequestCompleted).TotalMilliseconds < _threadSleepTimeInMillisecondsBetweenWebRequests)
            {
                Thread.Sleep(10);
            }

            crawlRequest.OutputIsDisallowedReason = OutputIsDisallowedReason;

            var politeness = crawlRequest.Politeness;

            //start a new daily window; only the completed counter is reset, the canceled counter is left untouched.
            bool windowExpired = DateTime.Now - politeness.FirstHttpWebRequest > TimeSpan.FromDays(1);

            if (windowExpired)
            {
                politeness.FirstHttpWebRequest           = DateTime.Now;
                politeness.TotalHttpWebRequestsCompleted = 0;
            }

            bool limitReached = politeness.TotalHttpWebRequestsCompleted + politeness.TotalHttpWebRequestsCanceled >= _maximumNumberOfWebRequestsPerHostPerDay;

            if (!limitReached)
            {
                return false;
            }

            crawlRequest.IsDisallowedReason = "Too many HttpWebRequests per day.";

            return true;
        }
Exemple #3
0
        /// <summary>
        ///     Gets the source of a WebPage as text, from the database row when it holds the bytes, otherwise from the file on disk.
        /// </summary>
        /// <param name = "webPageAbsoluteUriOrID">The AbsoluteUri or ID of the WebPage, passed to <c>arachnodeDAO.GetWebPage</c>.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO used to look up the WebPage row.</param>
        /// <returns>The decoded source, or <c>null</c> when no WebPage row is found.</returns>
        /// <exception cref = "InvalidOperationException">ApplicationSettings.DownloadedWebPagesDirectory is null.</exception>
        /// <exception cref = "FileNotFoundException">The row holds no source and the file is not present on disk.</exception>
        public override string GetWebPageSource(string webPageAbsoluteUriOrID, IArachnodeDAO arachnodeDAO)
        {
            if (ApplicationSettings.DownloadedWebPagesDirectory == null)
            {
                throw new InvalidOperationException("_applicationSettings.DownloadedWebPagesDirectory is null.  This is usually the result of failing to initialize the Application configuration from the ArachnodeDAO.");
            }

            ArachnodeDataSet.WebPagesRow webPagesRow = arachnodeDAO.GetWebPage(webPageAbsoluteUriOrID);

            if (webPagesRow == null)
            {
                return(null);
            }

            //the source is stored in the database...
            if (webPagesRow.Source.Length != 0)
            {
                return(Encoding.GetEncoding(webPagesRow.CodePage).GetString(webPagesRow.Source));
            }

            //...or the database row is empty and the source is expected on disk.
            string discoveryPath = GetDiscoveryPath(ApplicationSettings.DownloadedWebPagesDirectory, webPagesRow.AbsoluteUri, webPagesRow.FullTextIndexType);

            if (!File.Exists(discoveryPath))
            {
                throw new FileNotFoundException("Could not find the WebPage Source in the database or on disk.", discoveryPath);
            }

            return(File.ReadAllText(discoveryPath, Encoding.GetEncoding(webPagesRow.CodePage)));
        }
Exemple #4
0
        /// <summary>
        ///     Gets the raw bytes of an Image, from the database row when it holds them, otherwise from the file on disk.
        /// </summary>
        /// <param name = "imageAbsoluteUriOrID">The AbsoluteUri or ID of the Image, passed to <c>arachnodeDAO.GetImage</c>.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO used to look up the Image row.</param>
        /// <returns>The image bytes, or <c>null</c> when no Image row is found.</returns>
        /// <exception cref = "InvalidOperationException">ApplicationSettings.DownloadedImagesDirectory is null.</exception>
        /// <exception cref = "FileNotFoundException">The row holds no source and the file is not present on disk.</exception>
        public override byte[] GetImageSource(string imageAbsoluteUriOrID, IArachnodeDAO arachnodeDAO)
        {
            if (ApplicationSettings.DownloadedImagesDirectory == null)
            {
                throw new InvalidOperationException("_applicationSettings.DownloadedImagesDirectory is null.  This is usually the result of failing to initialize the Application configuration from the ArachnodeDAO.");
            }

            ArachnodeDataSet.ImagesRow imagesRow = arachnodeDAO.GetImage(imageAbsoluteUriOrID);

            if (imagesRow == null)
            {
                return(null);
            }

            //the source is stored in the database...
            if (imagesRow.Source.Length != 0)
            {
                return(imagesRow.Source);
            }

            //...or the database row is empty and the source is expected on disk.
            string discoveryPath = GetDiscoveryPath(ApplicationSettings.DownloadedImagesDirectory, imagesRow.AbsoluteUri, imagesRow.FullTextIndexType);

            if (!File.Exists(discoveryPath))
            {
                throw new FileNotFoundException("Could not find the Image Source in the database or on disk.", discoveryPath);
            }

            return(File.ReadAllBytes(discoveryPath));
        }
Exemple #5
0
        /// <summary>
        ///     Disallows a crawl request whose MaximumDepth is below -1, equal to -1, or above the configured maximum crawl request depth.
        /// </summary>
        /// <param name = "crawlRequest">The crawl request; its OutputIsDisallowedReason and, when disallowed, IsDisallowedReason are set.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO (not used by this rule).</param>
        /// <returns>
        ///     <c>true</c> if the MaximumDepth is outside the accepted range; otherwise, <c>false</c>.
        /// </returns>
        public override bool IsDisallowed(CrawlRequest <TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
        {
            crawlRequest.OutputIsDisallowedReason = OutputIsDisallowedReason;

            var requestedDepth = crawlRequest.MaximumDepth;

            //the first matching check supplies the reason; no match means the depth is acceptable.
            string reason = null;

            if (requestedDepth < -1)
            {
                reason = "CrawlRequest.MaximumDepth cannot equal less than -1.";
            }
            else if (requestedDepth == -1)
            {
                reason = "CrawlRequest.MaximumDepth cannot equal -1.";
            }
            else if (requestedDepth > _maximumCrawlRequestDepth)
            {
                reason = "CrawlRequest.MaximumDepth cannot exceed " + _maximumCrawlRequestDepth + ".";
            }

            if (reason == null)
            {
                return false;
            }

            crawlRequest.IsDisallowedReason = reason;

            return true;
        }
 /// <summary>
 ///     Initializes the data manager: forwards the settings to the base constructor and stores the supplied collaborators.
 /// </summary>
 /// <param name = "applicationSettings">The application settings (passed to the base constructor).</param>
 /// <param name = "webSettings">The web settings (passed to the base constructor).</param>
 /// <param name = "actionManager">The action manager.</param>
 /// <param name = "dataTypeManager">The data type manager.</param>
 /// <param name = "discoveryManager">The discovery manager.</param>
 /// <param name = "ruleManager">The rule manager.</param>
 /// <param name = "arachnodeDAO">The arachnode DAO.</param>
 protected ADataManager(ApplicationSettings applicationSettings, WebSettings webSettings, ActionManager <TArachnodeDAO> actionManager, DataTypeManager <TArachnodeDAO> dataTypeManager, DiscoveryManager <TArachnodeDAO> discoveryManager, RuleManager <TArachnodeDAO> ruleManager, IArachnodeDAO arachnodeDAO) : base(applicationSettings, webSettings)
 {
     //stored in parameter order; the assignments are independent of each other.
     _actionManager    = actionManager;
     _dataTypeManager  = dataTypeManager;
     _discoveryManager = discoveryManager;
     _ruleManager      = ruleManager;
     _arachnodeDAO     = arachnodeDAO;
 }
        /// <summary>
        ///     Determines whether the specified crawl request is disallowed by the robots.txt of its host.
        ///     The disallowed paths are kept on crawlRequest.Politeness and are (re)loaded when missing or more than one day old,
        ///     by processing a dedicated robots.txt CrawlRequest on a separate Crawl.
        /// </summary>
        /// <param name = "crawlRequest">The crawl request; nothing is checked when its Crawl is null.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO, used to obtain the robots.txt Discovery from the Cache.</param>
        /// <returns>
        ///     <c>true</c> if the decoded AbsoluteUri of the request starts with a decoded disallowed path; otherwise, <c>false</c>.
        /// </returns>
        public override bool IsDisallowed(CrawlRequest <TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
        {
            //ANODET: When you add the multi-server caching, the robots.txt file will need to be sent to all other CachePeers.

            //if we're being called by the Engine prior to assigning to a Crawl there is nothing to check...
            if (crawlRequest.Crawl == null)
            {
                return(false);
            }

            string robotsDotTextAbsoluteUri = crawlRequest.Discovery.Uri.Scheme + Uri.SchemeDelimiter + crawlRequest.Discovery.Uri.Host + "/robots.txt";

            crawlRequest.OutputIsDisallowedReason = OutputIsDisallowedReason;

            if (!UserDefinedFunctions.IsDisallowedForAbsoluteUri(robotsDotTextAbsoluteUri, false, false))
            {
                //(re)load the disallowed paths when they have never been loaded or are more than one day old.
                if (crawlRequest.Politeness.DisallowedPaths == null || DateTime.Now.Subtract(crawlRequest.Politeness.DisallowedPathsSince) > TimeSpan.FromDays(1))
                {
                    CrawlRequest <TArachnodeDAO> robotsDotTextRequest = new CrawlRequest <TArachnodeDAO>(crawlRequest, crawlRequest.Crawl.Crawler.Cache.GetDiscovery(robotsDotTextAbsoluteUri, arachnodeDAO), 1, 1, (short)UriClassificationType.Host, (short)UriClassificationType.Host, double.MaxValue, RenderType.None, RenderType.None);
                    robotsDotTextRequest.Discovery.DiscoveryState = DiscoveryState.Undiscovered;
                    robotsDotTextRequest.Politeness = crawlRequest.Politeness;

                    Crawl <TArachnodeDAO> crawl = new Crawl <TArachnodeDAO>(crawlRequest.Crawl.Crawler.ApplicationSettings, crawlRequest.Crawl.Crawler.WebSettings, crawlRequest.Crawl.Crawler, crawlRequest.Crawl.Crawler.ActionManager, crawlRequest.Crawl.Crawler.ConsoleManager, crawlRequest.Crawl.Crawler.CookieManager, crawlRequest.Crawl.Crawler.CrawlRequestManager, crawlRequest.Crawl.Crawler.DataTypeManager, crawlRequest.Crawl.Crawler.DiscoveryManager, crawlRequest.Crawl.Crawler.EncodingManager, crawlRequest.Crawl.Crawler.HtmlManager, crawlRequest.Crawl.Crawler.PolitenessManager, crawlRequest.Crawl.Crawler.ProxyManager, crawlRequest.Crawl.Crawler.RuleManager, false);

                    robotsDotTextRequest.Crawl = crawl;

                    crawl.ProcessCrawlRequest(robotsDotTextRequest, false, false);

                    crawlRequest.Politeness.DisallowedPathsSince = DateTime.Now;

                    //The DataManager will not download the byte stream if ApplicationSettings.AssignFileAndImageDiscoveries is set to false.  This is by design.
                    if (robotsDotTextRequest.Data != null && robotsDotTextRequest.Data.Length == 0 && robotsDotTextRequest.WebClient.WebException == null)
                    {
                        string contentEncoding = robotsDotTextRequest.WebClient.HttpWebResponse.ContentEncoding.ToLowerInvariant();

                        //the download is for the robots.txt request, so it is given the robots.txt AbsoluteUri rather than the AbsoluteUri of the request being evaluated.
                        //NOTE(review): DownloadHttpData is not visible here - confirm how it uses its first argument.
                        robotsDotTextRequest.Data = robotsDotTextRequest.WebClient.DownloadHttpData(robotsDotTextAbsoluteUri, contentEncoding == "gzip", contentEncoding == "deflate", crawlRequest.Crawl.Crawler.CookieContainer);
                    }

                    SiteCrawler.Value.RobotsDotText robotsDotText = _robotsDotTextManager.ParseRobotsDotTextSource(new Uri(crawlRequest.Discovery.Uri.Scheme + Uri.SchemeDelimiter + crawlRequest.Discovery.Uri.Host), robotsDotTextRequest.Data);

                    crawlRequest.Politeness.CrawlDelayInMilliseconds = robotsDotText.CrawlDelay * 1000;
                    crawlRequest.Politeness.DisallowedPaths          = robotsDotText.DisallowedPaths;
                }

                //crawlRequest.Politeness has already been dereferenced above, so only the paths need a null check.
                if (crawlRequest.Politeness.DisallowedPaths != null)
                {
                    //decode the request's AbsoluteUri once; it does not change between paths.
                    string decodedAbsoluteUri = HttpUtility.UrlDecode(crawlRequest.Discovery.Uri.AbsoluteUri);

                    foreach (string disallowedPath in crawlRequest.Politeness.DisallowedPaths)
                    {
                        //a null path cannot be compared and an empty path would match every AbsoluteUri.
                        if (string.IsNullOrEmpty(disallowedPath))
                        {
                            continue;
                        }

                        //URIs are compared ordinally; a culture-sensitive comparison is not appropriate for them.
                        if (decodedAbsoluteUri.StartsWith(HttpUtility.UrlDecode(disallowedPath), StringComparison.Ordinal))
                        {
                            crawlRequest.IsDisallowedReason = "Prohibited by robots.txt.";
                            return(true);
                        }
                    }
                }
            }

            return(false);
        }
Exemple #8
0
        /// <summary>
        ///     Plugin template for a Discovery rule: this example never disallows a Discovery.
        /// </summary>
        /// <param name = "discovery">The discovery (not inspected by this example).</param>
        /// <param name = "arachnodeDAO">The arachnode DAO (not used by this example).</param>
        /// <returns>
        ///     Always <c>false</c>.
        /// </returns>
        public override bool IsDisallowed(Discovery <TArachnodeDAO> discovery, IArachnodeDAO arachnodeDAO)
        {
            //application specific logic belongs here, for example:
            //discovery.IsStorable = discovery.Uri.AbsoluteUri.ToLowerInvariant().Contains(".aspx");

            //a plugin could determine whether a Discovery is disallowed; this example does not make that determination.
            const bool isDisallowed = false;

            return isDisallowed;
        }
        /// <summary>
        ///     Sets Politeness.MaximumActiveHttpWebRequests to 2 when the crawl request has a Politeness; never disallows the request.
        /// </summary>
        /// <param name = "crawlRequest">The crawl request whose Politeness is updated.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO (not used by this rule).</param>
        /// <returns>
        ///     Always <c>false</c>.
        /// </returns>
        public override bool IsDisallowed(CrawlRequest <TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
        {
            var politeness = crawlRequest.Politeness;

            if (politeness == null)
            {
                return false;
            }

            politeness.MaximumActiveHttpWebRequests = 2;

            return false;
        }
        /// <summary>
        ///     Stores the supplied collaborators and creates a TArachnodeDAO instance from the connection string of the application settings.
        /// </summary>
        /// <param name = "applicationSettings">The application settings; also assigned to the created DAO.</param>
        /// <param name = "crawler">The crawler.</param>
        /// <param name = "crawlRequestManager">The crawl request manager.</param>
        internal DiscoveryProcessor(ApplicationSettings applicationSettings, Crawler <TArachnodeDAO> crawler, CrawlRequestManager <TArachnodeDAO> crawlRequestManager)
        {
            _applicationSettings = applicationSettings;
            _crawlRequestManager = crawlRequestManager;
            _crawler             = crawler;

            //TArachnodeDAO is created reflectively, using a constructor that accepts the connection string.
            object arachnodeDAO = Activator.CreateInstance(typeof(TArachnodeDAO), applicationSettings.ConnectionString);

            _arachnodeDAO = (TArachnodeDAO)arachnodeDAO;
            _arachnodeDAO.ApplicationSettings = applicationSettings;
        }
        /// <summary>
        ///     Disallows a crawl request whose DataType has a DiscoveryType of None.
        /// </summary>
        /// <param name = "crawlRequest">The crawl request; its OutputIsDisallowedReason and, when disallowed, IsDisallowedReason are set.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO (not used by this rule).</param>
        /// <returns>
        ///     <c>true</c> if the DiscoveryType is None; otherwise, <c>false</c>.
        /// </returns>
        public override bool IsDisallowed(CrawlRequest <TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
        {
            crawlRequest.OutputIsDisallowedReason = OutputIsDisallowedReason;

            if (crawlRequest.DataType.DiscoveryType != DiscoveryType.None)
            {
                return false;
            }

            //the ContentType is included in the reason.
            crawlRequest.IsDisallowedReason = "Disallowed by unassigned DataType.  (" + crawlRequest.DataType.ContentType + ")";

            return true;
        }
        /// <summary>
        ///     Disallows a crawl request whose response StatusCode is anything other than OK.
        /// </summary>
        /// <param name = "crawlRequest">The crawl request; its WebClient.HttpWebResponse is read without a null check.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO (not used by this rule).</param>
        /// <returns>
        ///     <c>true</c> if the StatusCode is not HttpStatusCode.OK; otherwise, <c>false</c>.
        /// </returns>
        public override bool IsDisallowed(CrawlRequest <TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
        {
            crawlRequest.OutputIsDisallowedReason = OutputIsDisallowedReason;

            var response = crawlRequest.WebClient.HttpWebResponse;

            if (response.StatusCode == HttpStatusCode.OK)
            {
                return false;
            }

            //the StatusCode is included in the reason.
            crawlRequest.IsDisallowedReason = "Disallowed by Status. (" + response.StatusCode + ")";

            return true;
        }
        /// <summary>
        ///     Disallows a crawl request whose response LastModified is more than the configured maximum number of hours before now.
        ///     A request without a WebClient or without an HttpWebResponse is never disallowed by this rule.
        /// </summary>
        /// <param name = "crawlRequest">The crawl request; when disallowed, the reason is written to crawlRequest.Discovery.IsDisallowedReason.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO (not used by this rule).</param>
        /// <returns>
        ///     <c>true</c> if the response is older than the maximum; otherwise, <c>false</c>.
        /// </returns>
        public override bool IsDisallowed(CrawlRequest <TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
        {
            if (crawlRequest.WebClient == null || crawlRequest.WebClient.HttpWebResponse == null)
            {
                return false;
            }

            var age = DateTime.Now.Subtract(crawlRequest.WebClient.HttpWebResponse.LastModified);

            if (age.TotalHours > _maximumTotalHoursOld)
            {
                //NOTE(review): the reason is stored on the Discovery rather than on the CrawlRequest, and OutputIsDisallowedReason is not assigned - confirm this is intended.
                crawlRequest.Discovery.IsDisallowedReason = "More than maximum total hours old.";

                return true;
            }

            return false;
        }
        /// <summary>
        ///     Disallows a crawl request whose response ContentLength exceeds the configured maximum content length in bytes.
        ///     A request without a WebClient or without an HttpWebResponse is never disallowed by this rule.
        /// </summary>
        /// <param name = "crawlRequest">The crawl request; its OutputIsDisallowedReason and, when disallowed, IsDisallowedReason are set.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO (not used by this rule).</param>
        /// <returns>
        ///     <c>true</c> if the ContentLength exceeds the maximum; otherwise, <c>false</c>.
        /// </returns>
        public override bool IsDisallowed(CrawlRequest <TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
        {
            crawlRequest.OutputIsDisallowedReason = OutputIsDisallowedReason;

            bool hasResponse = crawlRequest.WebClient != null && crawlRequest.WebClient.HttpWebResponse != null;

            if (hasResponse && crawlRequest.WebClient.HttpWebResponse.ContentLength > _maximumContentLengthInBytes)
            {
                crawlRequest.IsDisallowedReason = "Disallowed by ContentLength.";

                return true;
            }

            return false;
        }
Exemple #15
0
        /// <summary>
        ///     Evaluates the response headers with UserDefinedFunctions.IsDisallowedForResponseHeaders, inverting the outcome when the rule is negated.
        ///     IsDisallowedReason is assigned before the evaluation, whatever the outcome.
        /// </summary>
        /// <param name = "crawlRequest">The crawl request; its WebClient.HttpWebResponse.Headers are read without a null check.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO (not used by this rule).</param>
        /// <returns>
        ///     <c>true</c> if the specified crawl request is disallowed; otherwise, <c>false</c>.
        /// </returns>
        public override bool IsDisallowed(CrawlRequest <TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
        {
            crawlRequest.OutputIsDisallowedReason = OutputIsDisallowedReason;
            crawlRequest.IsDisallowedReason       = "Disallowed by ResponseHeaders.";

            string responseHeaders = crawlRequest.WebClient.HttpWebResponse.Headers.ToString();

            bool headersMatch;

            if (UserDefinedFunctions.IsDisallowedForResponseHeaders(responseHeaders, false))
            {
                headersMatch = true;
            }
            else
            {
                headersMatch = false;
            }

            //a negated rule disallows exactly what the headers check would have allowed.
            return _negateIsDisallowed ? !headersMatch : headersMatch;
        }
        /// <summary>
        ///     Disallows a crawl request whose ResponseUri is crawl restricted according to the DiscoveryManager.
        ///     When allowed, the current Discovery is managed as Discovered, replaced by the Discovery of the ResponseUri, and the new Discovery is managed as PreRequest.
        /// </summary>
        /// <param name = "crawlRequest">The crawl request; its Discovery may be replaced.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO, passed to the DiscoveryManager and the Cache.</param>
        /// <returns>
        ///     <c>true</c> if the ResponseUri is crawl restricted; otherwise, <c>false</c>.
        /// </returns>
        public override bool IsDisallowed(CrawlRequest <TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
        {
            crawlRequest.OutputIsDisallowedReason = OutputIsDisallowedReason;

            var crawler          = crawlRequest.Crawl.Crawler;
            var discoveryManager = crawler.DiscoveryManager;
            var responseUri      = crawlRequest.WebClient.HttpWebResponse.ResponseUri;

            if (discoveryManager.IsCrawlRestricted(crawlRequest, responseUri.AbsoluteUri))
            {
                crawlRequest.IsDisallowedReason = "Disallowed by ResponseUri. " + responseUri.AbsoluteUri;

                return true;
            }

            //statement order matters: finish with the current Discovery before switching to the one for the ResponseUri.
            discoveryManager.ManageDiscovery(crawlRequest, DiscoveryState.Discovered, arachnodeDAO);

            crawlRequest.Discovery = crawler.Cache.GetDiscovery(responseUri, arachnodeDAO);

            discoveryManager.ManageDiscovery(crawlRequest, DiscoveryState.PreRequest, arachnodeDAO);

            return false;
        }
Exemple #17
0
        /// <summary>
        ///     For WebPages only: evaluates the decoded HTML with UserDefinedFunctions.IsDisallowedForSource, inverting the outcome when the rule is negated.
        ///     Requests of any other DiscoveryType are never disallowed by this rule and are left untouched.
        /// </summary>
        /// <param name = "crawlRequest">The crawl request; for WebPages its OutputIsDisallowedReason and IsDisallowedReason are set before the evaluation.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO (not used by this rule).</param>
        /// <returns>
        ///     <c>true</c> if the specified crawl request is disallowed; otherwise, <c>false</c>.
        /// </returns>
        public override bool IsDisallowed(CrawlRequest <TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
        {
            if (crawlRequest.DataType.DiscoveryType != DiscoveryType.WebPage)
            {
                return false;
            }

            crawlRequest.OutputIsDisallowedReason = OutputIsDisallowedReason;
            crawlRequest.IsDisallowedReason       = "Disallowed by Source.";

            bool sourceMatches;

            if (UserDefinedFunctions.IsDisallowedForSource(crawlRequest.DecodedHtml, false))
            {
                sourceMatches = true;
            }
            else
            {
                sourceMatches = false;
            }

            //a negated rule disallows exactly what the source check would have allowed.
            return _negateIsDisallowed ? !sourceMatches : sourceMatches;
        }
Exemple #18
0
        /// <summary>
        ///     Replaces CrawlActions with a new dictionary and fills it with one ACrawlAction per row returned by <c>arachnodeDAO.GetCrawlActions()</c>, keyed by TypeName.
        /// </summary>
        /// <param name = "arachnodeDAO">The arachnode DAO that supplies the CrawlActions rows.</param>
        internal void LoadCrawlActions(IArachnodeDAO arachnodeDAO)
        {
            CrawlActions = new Dictionary <string, ACrawlAction <TArachnodeDAO> >();

            foreach (ArachnodeDataSet.CrawlActionsRow row in arachnodeDAO.GetCrawlActions())
            {
                //instantiate the action from its assembly and type name...
                ObjectHandle handle = Engine <TArachnodeDAO> .GetObjectHandle(row.AssemblyName, row.TypeName, _applicationSettings, _webSettings);

                var action = (ACrawlAction <TArachnodeDAO>)handle.Unwrap();

                //...and copy the row's values onto it.
                action.AssemblyName = row.AssemblyName;
                action.IsEnabled    = row.IsEnabled;
                action.Order        = row.Order;

                string crawlActionTypeID = row.CrawlActionTypeID.ToString();
                action.CrawlActionType   = (CrawlActionType)Enum.Parse(typeof(CrawlActionType), crawlActionTypeID);

                //Settings are only assigned when the row has a value for them.
                if (!row.IsSettingsNull())
                {
                    action.Settings = row.Settings;
                }

                action.TypeName = row.TypeName;

                CrawlActions.Add(action.TypeName, action);
            }
        }
Exemple #19
0
        /// <summary>
        ///     Gets the Discovery for an AbsoluteUri, using the cache key derived from it.
        /// </summary>
        /// <param name = "absoluteUri">The AbsoluteUri to get the Discovery for.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO.</param>
        /// <returns>The Discovery; its Uri is replaced when its AbsoluteUri differs from <paramref name = "absoluteUri" />.</returns>
        public Discovery <TArachnodeDAO> GetDiscovery(string absoluteUri, IArachnodeDAO arachnodeDAO)
        {
            string cacheKey = _cacheManager.GetCacheKey(absoluteUri);

            //there is a single lookup path today.  this is the place to branch on _memoryManager.HasDesiredMaximumMemoryUsageInMegabytesEverBeenMet should the lookup ever need to depend on memory usage.
            Discovery <TArachnodeDAO> discovery = GetDiscovery(absoluteUri, cacheKey, arachnodeDAO);

            //make the returned Discovery carry exactly the requested AbsoluteUri.
            if (discovery.Uri.AbsoluteUri != absoluteUri)
            {
                discovery.Uri = new Uri(absoluteUri);
            }

            return(discovery);
        }
Exemple #20
0
 /// <summary>
 ///     Reads the ArachnodeDAO property during initialization, then continues with the base class initialization.
 /// </summary>
 /// <param name = "e">The event arguments, forwarded to the base implementation.</param>
 protected override void OnInit(EventArgs e)
 {
     //populates the Application and Web settings...
     IArachnodeDAO arachnodeDAO = ArachnodeDAO;

     //an OnInit override is expected to call the base implementation so that registered Init handlers receive the event.
     //NOTE(review): assumes the base class is an ASP.NET control/page (System.Web.UI.Control.OnInit) - confirm.
     base.OnInit(e);
 }
        /// <summary>
        ///     Determines the Encoding of a non-rendered WebPage and assigns crawlRequest.Encoding, crawlRequest.Html and crawlRequest.DecodedHtml.
        ///     The charset of the 'Content-Type' response header is tried first, then DetermineEncoding, and finally UTF8.
        ///     Requests with a RenderType other than None, or that are not WebPages, are left untouched.
        /// </summary>
        /// <param name = "crawlRequest">The crawl request; its WebClient.HttpWebResponse and Data are read.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO, used to record the exceptions when every attempt to determine the Encoding fails.</param>
        public override void ProcessCrawlRequest(CrawlRequest <TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
        {
            //Rendering determines the Encoding...
            if (crawlRequest.RenderType != RenderType.None || crawlRequest.DataType.DiscoveryType != DiscoveryType.WebPage)
            {
                return;
            }

            //extract the charset parameter from a header such as: text/html; charset="utf-8"; other=value
            string charset           = null;
            string contentTypeHeader = crawlRequest.WebClient.HttpWebResponse.Headers["Content-Type"];

            if (contentTypeHeader != null)
            {
                foreach (string parameter in contentTypeHeader.Split(';'))
                {
                    string trimmedParameter = parameter.Trim();

                    if (trimmedParameter.StartsWith("charset=", StringComparison.OrdinalIgnoreCase))
                    {
                        //the value may be quoted, and 'utf8' is normalized to 'utf-8' before it is looked up.
                        charset = trimmedParameter.Substring("charset=".Length).Trim().Trim('"', '\'').Replace("utf8", "utf-8");

                        break;
                    }
                }
            }

            Encoding encoding    = null;
            string   decodedHtml = null;

            try
            {
                //first, try and get the Encoding from the 'Content-Type'...
                if (!string.IsNullOrEmpty(charset))
                {
                    encoding = Encoding.GetEncoding(charset);
                }
                else
                {
                    decodedHtml = DetermineEncoding(crawlRequest, out encoding);
                }
            }
            catch (Exception exception)
            {
                try
                {
                    //if there is an error, try and get the Encoding from the 'Charset'...
                    decodedHtml = DetermineEncoding(crawlRequest, out encoding);
                }
                catch (Exception exception2)
                {
                    //if there is an error, default to UTF8.
                    arachnodeDAO.InsertException(crawlRequest.Discovery.Uri.AbsoluteUri, null, exception, false);
                    arachnodeDAO.InsertException(crawlRequest.Discovery.Uri.AbsoluteUri, null, exception2, false);

                    encoding = Encoding.UTF8;
                }
            }

            if (encoding == null)
            {
                //no Encoding was assigned; default to UTF8 like the error path above rather than failing on a null reference.
                encoding = Encoding.UTF8;
            }

            crawlRequest.Encoding = encoding;

            //reuse the text from DetermineEncoding when it applies; otherwise decode the bytes exactly once.
            string html = encoding == Encoding.UTF8 && decodedHtml != null ? decodedHtml : encoding.GetString(crawlRequest.Data);

            crawlRequest.DecodedHtml = HttpUtility.HtmlDecode(html);
            crawlRequest.Html        = html;
        }
Exemple #22
0
 /// <summary>
 ///     Determines whether the specified crawl request is disallowed, by evaluating the Uri of its Discovery.
 /// </summary>
 /// <param name = "crawlRequest">The crawl request.</param>
 /// <param name = "arachnodeDAO">The arachnode DAO (not used by this overload).</param>
 /// <returns>
 ///     The result of the overload that takes the crawl request and a Uri.
 /// </returns>
 public override bool IsDisallowed(CrawlRequest <TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
 {
     var discoveryUri = crawlRequest.Discovery.Uri;

     return IsDisallowed(crawlRequest, discoveryUri);
 }
Exemple #23
0
        /// <summary>
        ///     Determines whether the specified discovery is disallowed by any enabled crawl rule of the given type.
        ///     Evaluation stops at the first rule that disallows the discovery or throws.
        /// </summary>
        /// <param name = "discovery">The discovery; its IsDisallowed is set to <c>true</c> when a rule disallows it.</param>
        /// <param name = "crawlRuleType">Type of the rule; selects the PreRequest, PreGet or PostRequest rules.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO, passed to the rules and used to record an exception thrown by a rule.</param>
        /// <returns>
        ///     <c>true</c> if a rule disallows the discovery or throws; otherwise (including any other rule type), <c>false</c>.
        /// </returns>
        public override bool IsDisallowed(Discovery <TArachnodeDAO> discovery, CrawlRuleType crawlRuleType, IArachnodeDAO arachnodeDAO)
        {
            //pick the rule lists for the requested type; the evaluation below is the same for every type.
            System.Collections.IEnumerable crawlRuleLists;

            switch (crawlRuleType)
            {
            case CrawlRuleType.PreRequest:
                crawlRuleLists = _preRequestCrawlRules.Values;
                break;

            case CrawlRuleType.PreGet:
                crawlRuleLists = _preGetCrawlRules.Values;
                break;

            case CrawlRuleType.PostRequest:
                crawlRuleLists = _postRequestCrawlRules.Values;
                break;

            default:
                //no rules are evaluated for any other type.
                return(false);
            }

            foreach (List <ACrawlRule <TArachnodeDAO> > crawlRules in crawlRuleLists)
            {
                foreach (ACrawlRule <TArachnodeDAO> crawlRule in crawlRules)
                {
                    try
                    {
                        if (crawlRule.IsEnabled && crawlRule.IsDisallowed(discovery, arachnodeDAO))
                        {
                            discovery.IsDisallowed = true;

                            return(true);
                        }
                    }
                    catch (Exception exception)
                    {
                        //a rule that throws is recorded and reported as disallowing; discovery.IsDisallowed is not changed on this path.
                        arachnodeDAO.InsertException(discovery.Uri.AbsoluteUri, discovery.Uri.AbsoluteUri, exception, false);

                        return(true);
                    }
                }
            }

            return(false);
        }
Exemple #24
0
 /// <summary>
 ///     Example action: resubmits the crawl request through the PolitenessManager when the ResponseUri ends with "503.html" or the StatusCode is ServiceUnavailable.
 ///     Nothing happens when the WebClient, the HttpWebResponse or the ResponseUri is null.
 /// </summary>
 /// <param name = "crawlRequest">The crawl request to inspect and possibly resubmit.</param>
 /// <param name = "arachnodeDAO">The arachnode DAO, passed to ResubmitCrawlRequest.</param>
 public override void PerformAction(CrawlRequest <TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
 {
     //as an example...
     if (crawlRequest.WebClient == null || crawlRequest.WebClient.HttpWebResponse == null || crawlRequest.WebClient.HttpWebResponse.ResponseUri == null)
     {
         return;
     }

     var response = crawlRequest.WebClient.HttpWebResponse;

     if (response.ResponseUri.AbsoluteUri.EndsWith("503.html") || response.StatusCode == HttpStatusCode.ServiceUnavailable)
     {
         crawlRequest.Crawl.Crawler.PolitenessManager.ResubmitCrawlRequest(crawlRequest, false, arachnodeDAO);
     }
 }
Exemple #25
0
 /// <summary>
 ///     Determines whether the specified discovery is disallowed by delegating to the
 ///     overload that takes the discovery together with its Uri.
 /// </summary>
 /// <param name = "discovery">The discovery to evaluate; its Uri is forwarded to the overload.</param>
 /// <param name = "arachnodeDAO">The arachnode DAO (not referenced by this overload).</param>
 /// <returns>
 ///     <c>true</c> if the specified discovery is disallowed; otherwise, <c>false</c>.
 /// </returns>
 public override bool IsDisallowed(Discovery <TArachnodeDAO> discovery, IArachnodeDAO arachnodeDAO)
 {
     //the decision is made entirely by the (discovery, uri) overload...
     return IsDisallowed(discovery, discovery.Uri);
 }
        /// <summary>
        ///     Processes a FilesRow after crawling: wraps the row in a CrawlRequest, rebuilds the response headers,
        ///     determines the DataType, optionally inserts the file, manages the file and finally performs the
        ///     PostRequest crawl actions before closing the managed discovery.
        /// </summary>
        /// <param name = "applicationSettings">The application settings; handed to the managers created here and consulted for the InsertFiles, InsertFileSource, ClassifyAbsoluteUris, ExtractFileMetaData, InsertFileMetaData and SaveDiscoveredFilesToDisk options.</param>
        /// <param name = "webSettings">The web settings; handed to the managers created here.</param>
        /// <param name = "crawler">The crawler; passed to the Crawl created for the request.</param>
        /// <param name = "filesRow">The files row; supplies the AbsoluteUri, ID, Source and ResponseHeaders.</param>
        /// <param name = "webClient">The web client assigned to the CrawlRequest; its HttpWebResponse headers are cleared and repopulated from the row.</param>
        /// <param name = "cache">The cache; passed to the CrawlRequestManager and PolitenessManager created here.</param>
        /// <param name = "actionManager">The action manager; passed to the Crawl and used to perform the PostRequest crawl actions.</param>
        /// <param name = "consoleManager">The console manager; passed to the CrawlRequestManager, ProxyManager and Crawl created here.</param>
        /// <param name = "crawlerPeerManager">The crawler peer manager (not referenced by this method).</param>
        /// <param name = "discoveryManager">The discovery manager; passed to the managers created here and used to close and dispose the managed discovery.</param>
        /// <param name = "fileManager">The file manager; used to manage the file.</param>
        /// <param name = "memoryManager">The memory manager (not referenced by this method).</param>
        /// <param name = "ruleManager">The rule manager; passed to the Crawl created for the request.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO; used to insert the file and passed to the crawl actions and discovery clean-up.</param>
        public static void ProcessFile(ApplicationSettings applicationSettings, WebSettings webSettings, Crawler <TArachnodeDAO> crawler, ArachnodeDataSet.FilesRow filesRow, WebClient <TArachnodeDAO> webClient, Cache <TArachnodeDAO> cache, ActionManager <TArachnodeDAO> actionManager, ConsoleManager <TArachnodeDAO> consoleManager, CrawlerPeerManager <TArachnodeDAO> crawlerPeerManager, DiscoveryManager <TArachnodeDAO> discoveryManager, FileManager <TArachnodeDAO> fileManager, MemoryManager <TArachnodeDAO> memoryManager, RuleManager <TArachnodeDAO> ruleManager, IArachnodeDAO arachnodeDAO)
        {
            //NOTE(review): cacheManager is never read below; kept because its constructor's side effects are not visible from here - confirm before removing...
            CacheManager <TArachnodeDAO> cacheManager = new CacheManager <TArachnodeDAO>(applicationSettings, webSettings);
            CookieManager cookieManager = new CookieManager();
            CrawlRequestManager <TArachnodeDAO> crawlRequestManager = new CrawlRequestManager <TArachnodeDAO>(applicationSettings, webSettings, cache, consoleManager, discoveryManager);
            DataTypeManager <TArachnodeDAO>     dataTypeManager     = new DataTypeManager <TArachnodeDAO>(applicationSettings, webSettings);
            EncodingManager <TArachnodeDAO>     encodingManager     = new EncodingManager <TArachnodeDAO>(applicationSettings, webSettings);
            PolitenessManager <TArachnodeDAO>   politenessManager   = new PolitenessManager <TArachnodeDAO>(applicationSettings, webSettings, cache);
            ProxyManager <TArachnodeDAO>        proxyManager        = new ProxyManager <TArachnodeDAO>(applicationSettings, webSettings, consoleManager);
            HtmlManager <TArachnodeDAO>         htmlManager         = new HtmlManager <TArachnodeDAO>(applicationSettings, webSettings, discoveryManager);
            Crawl <TArachnodeDAO> crawl = new Crawl <TArachnodeDAO>(applicationSettings, webSettings, crawler, actionManager, consoleManager, cookieManager, crawlRequestManager, dataTypeManager, discoveryManager, encodingManager, htmlManager, politenessManager, proxyManager, ruleManager, true);

            //create a CrawlRequest as this is what the internals of SiteCrawler.dll expect to operate on...
            CrawlRequest <TArachnodeDAO> crawlRequest = new CrawlRequest <TArachnodeDAO>(new Discovery <TArachnodeDAO>(filesRow.AbsoluteUri), 1, UriClassificationType.Host, UriClassificationType.Host, 0, RenderType.None, RenderType.None);

            crawlRequest.Crawl = crawl;
            crawlRequest.Discovery.DiscoveryType = DiscoveryType.File;
            crawlRequest.Discovery.ID            = filesRow.ID;
            crawlRequest.Data        = filesRow.Source;
            crawlRequest.ProcessData = true;
            crawlRequest.WebClient   = webClient;

            //start from an empty header collection so only the headers stored with the row remain...
            crawlRequest.WebClient.HttpWebResponse.Headers.Clear();

            //the separators never change, so build them once rather than once per header line...
            char[] lineSeparators      = "\r\n".ToCharArray();
            char[] nameValueSeparators = ":".ToCharArray();

            //parse the ResponseHeaders from the FilesRow.ResponseHeaders string...
            foreach (string responseHeader in filesRow.ResponseHeaders.Split(lineSeparators, StringSplitOptions.RemoveEmptyEntries))
            {
                string[] responseHeaderSplit = responseHeader.Split(nameValueSeparators);

                //only the name comes from the split; the value is extracted from the full header string by name...
                string name  = responseHeaderSplit[0];
                string value = UserDefinedFunctions.ExtractResponseHeader(filesRow.ResponseHeaders, name, true).Value;

                crawlRequest.WebClient.HttpWebResponse.Headers.Add(name, value);
            }

            //refresh the DataTypes in the DataTypeManager... (if necessary)...
            if (dataTypeManager.AllowedDataTypes.Count == 0)
            {
                dataTypeManager.RefreshDataTypes();
            }

            crawlRequest.DataType = dataTypeManager.DetermineDataType(crawlRequest);

            if (applicationSettings.InsertFiles)
            {
                //NOTE(review): Parent is not assigned in this method; presumably the CrawlRequest constructor sets it - confirm...
                crawlRequest.Discovery.ID = arachnodeDAO.InsertFile(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.WebClient.HttpWebResponse.Headers.ToString(), applicationSettings.InsertFileSource ? crawlRequest.Data : new byte[] { }, crawlRequest.DataType.FullTextIndexType, applicationSettings.ClassifyAbsoluteUris);
            }

            crawlRequest.ManagedDiscovery = fileManager.ManageFile(crawlRequest, crawlRequest.Discovery.ID.Value, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.Data, crawlRequest.DataType.FullTextIndexType, applicationSettings.ExtractFileMetaData, applicationSettings.InsertFileMetaData, applicationSettings.SaveDiscoveredFilesToDisk);

            actionManager.PerformCrawlActions(crawlRequest, CrawlActionType.PostRequest, arachnodeDAO);

            discoveryManager.CloseAndDisposeManagedDiscovery(crawlRequest, arachnodeDAO);
        }
 /// <summary>
 ///     Creates the WebPageManager; every argument is forwarded unchanged to the base constructor.
 /// </summary>
 /// <param name = "applicationSettings">The application settings.</param>
 /// <param name = "webSettings">The web settings.</param>
 /// <param name = "discoveryManager">The discovery manager.</param>
 /// <param name = "htmlManager">The HTML manager.</param>
 /// <param name = "arachnodeDAO">The arachnode DAO. Must be thread-safe.</param>
 public WebPageManager(
     ApplicationSettings applicationSettings,
     WebSettings webSettings,
     DiscoveryManager <TArachnodeDAO> discoveryManager,
     HtmlManager <TArachnodeDAO> htmlManager,
     IArachnodeDAO arachnodeDAO)
     : base(applicationSettings, webSettings, discoveryManager, htmlManager, arachnodeDAO)
 {
     //no additional initialization beyond the base constructor...
 }
        /**/

        /// <summary>
        ///     Example crawl action: remembers the Crawler of the first CrawlRequest it sees.
        /// </summary>
        /// <param name = "crawlRequest">The crawl request; its Crawl supplies the Crawler.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO (not referenced by this action).</param>
        public override void PerformAction(CrawlRequest <TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
        {
            //this is where inserting/updating your own data storage would go...

            //the crawler is captured once; later calls leave it untouched...
            if (_crawler != null)
            {
                return;
            }

            _crawler = crawlRequest.Crawl.Crawler;
        }
Exemple #29
0
 /// <summary>
 ///     Initializes a new instance of the <see cref = "AFileManager{TArachnodeDAO}" /> class.
 /// </summary>
 /// <param name = "applicationSettings">The application settings; forwarded to the base constructor.</param>
 /// <param name = "webSettings">The web settings; forwarded to the base constructor.</param>
 /// <param name = "discoveryManager">The discovery manager; stored for later use.</param>
 /// <param name = "arachnodeDAO">The arachnode DAO; stored for later use.</param>
 protected AFileManager(
     ApplicationSettings applicationSettings,
     WebSettings webSettings,
     DiscoveryManager <TArachnodeDAO> discoveryManager,
     IArachnodeDAO arachnodeDAO)
     : base(applicationSettings, webSettings)
 {
     //keep the collaborators that are not handed to the base class...
     _arachnodeDAO     = arachnodeDAO;
     _discoveryManager = discoveryManager;
 }
 /// <summary>
 ///     Determines whether the specified discovery is disallowed. This implementation never disallows.
 /// </summary>
 /// <param name = "discovery">The discovery (not inspected by this implementation).</param>
 /// <param name = "arachnodeDAO">The arachnode DAO (not referenced by this implementation).</param>
 /// <returns>
 ///     Always <c>false</c>.
 /// </returns>
 public override bool IsDisallowed(Discovery <TArachnodeDAO> discovery, IArachnodeDAO arachnodeDAO)
 {
     //no rule logic here: every discovery is allowed...
     return false;
 }