        // Crawls an entire website starting from the seed URL and collects any crawl errors in memory.
        public void WebsiteCrawl()
        {
            List<KrawlerException> errors = new List<KrawlerException>();
            string url = "https://giphy.com/";
            Uri uri = new Uri(url);

            Krawler krawler = new Krawler(uri);

            // Capture every crawl error the crawler reports instead of discarding it.
            krawler.KrawlContext.ErrorLogMethod = (LOGTYPE logType, string message, Exception ex) => errors.Add(new KrawlerException {
                ErrorType = logType, Ex = ex, Message = message
            });
            //krawler.KrawlContext.SaveHtmlMethod = (x, y) => SaveInS3("test1" + x, y);
            krawler.Krawl();
        }
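        // Called by the crawler after each batch of pages; signals it to stop once the
        // project's link limit is exceeded or a stop has been requested.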
        public static bool Batchcompleted(Krawler krawlerContext, string projectId, int crawledPagesCount)
        {
            try
            {
                // Check the crawl stats and limits stored for this project.
                KitsuneKrawlerStats statsDetails = MongoHelper.GetCrawlStatsDetails(projectId);
                if ((statsDetails.LinksLimit != 0 && crawledPagesCount > statsDetails.LinksLimit) || statsDetails.StopCrawl)
                {
                    krawlerContext.KrawlContext.Configuration.IsStopCrawlEnabled = true;
                    LimitCrossed = true;
                }
                return true;
            }
            catch (Exception)
            {
                // Treat a failed stats lookup as an unsuccessful batch check.
                return false;
            }
        }
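        // Processes a single page with ProcessUri instead of running a full crawl.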
        public void OnePageCrawl()
        {
            string url = "https://www.religarehealthinsurance.com/";
            Uri uri = new Uri(url);

            List<KrawlerException> errors = new List<KrawlerException>();
            Krawler krawler = new Krawler(uri);

            try
            {
                // Register the URI as a unique webpage before processing it.
                krawler.KrawlContext.Resources.UniqueWebpagesDictionary.TryAdd(uri.AbsoluteUri, new AssetDetails {
                    LinkUrl = uri.AbsoluteUri, PlaceHolder = "[Kitsune_" + uri.AbsoluteUri + "]"
                });
                krawler.KrawlContext.ErrorLogMethod = (LOGTYPE logType, string message, Exception ex) => errors.Add(new KrawlerException {
                    ErrorType = logType, Ex = ex, Message = message
                });
                krawler.ProcessUri(uri);
            }
            catch (Exception ex)
            {
                // Record the failure instead of swallowing it silently.
                errors.Add(new KrawlerException { Ex = ex, Message = ex.Message });
            }
        }
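        // Analyses a website for the given project: loads the project's custom_source_sync
        // configuration, wires up the crawl callbacks, and starts the crawl.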
        public static void AnalyseTheWebsite(string projectId, Uri uri, bool IsDeepCrawl)
        {
            try
            {
                string regexToIgnore = null;
                string regexToInclude = null;
                string ignoreLinkConversion = null;
                List<string> includeStaticAssetFolders = null;
                Log.Information("Getting project config");
                var result = APIHelper.GetProjectConfig(projectId);
                if (result != null)
                {
                    var customSourceSyncSettings = result["custom_source_sync"];
                    if (customSourceSyncSettings == null)
                    {
                        // No custom_source_sync settings configured for this project; crawl with defaults.
                        Log.Information("custom_source_sync settings not found for project : " + projectId);
                    }
                    else
                    {
                        Log.Information("ProjectConfig : " + JsonConvert.SerializeObject(customSourceSyncSettings));
                        try
                        {
                            ProjectCustomSourceSyncSettings excludeList = JsonConvert.DeserializeObject <ProjectCustomSourceSyncSettings>(customSourceSyncSettings.ToString());
                            if (!String.IsNullOrEmpty(excludeList.Exclude))
                            {
                                regexToIgnore = excludeList.Exclude;
                            }
                            if (!string.IsNullOrEmpty(excludeList.Include))
                            {
                                regexToInclude = excludeList.Include;
                            }
                            ignoreLinkConversion = excludeList.IgnoreLinkConversion;
                            includeStaticAssetFolders = excludeList.IncludeStaticAssets;
                            if (excludeList.IncludeStaticAssetApis != null)
                            {
                                var assetList = Utils.GetAllStaticAssetList(excludeList.IncludeStaticAssetApis);
                                if (assetList != null)
                                {
                                    if (includeStaticAssetFolders == null)
                                    {
                                        includeStaticAssetFolders = new List<string>();
                                    }
                                    includeStaticAssetFolders.AddRange(assetList);
                                }
                            }
                        }
                        catch (Exception ex)
                        {
                            Log.Error(ex, "ProjectCustomSourceSync : " + ex.Message);
                        }
                    }
                }

                Krawler krawler = new Krawler(uri);
                // Wire up the crawl callbacks: persist processed HTML to S3, push resource updates,
                // enforce batch limits, and log errors against this project.
                krawler.KrawlContext.ProcessedHtmlCallBackMethod = (filePath, htmlString) => SaveInS3(projectId, projectId + filePath, htmlString);
                krawler.KrawlContext.UpdatedResoucesCallBackMethod = (resourceObject) => UpdateAnalyseDetails(projectId, resourceObject);
                krawler.KrawlContext.BatchCompletedCallBackMethod = (crawledPagesCount) => Batchcompleted(krawler, projectId, crawledPagesCount);
                krawler.KrawlContext.ErrorLogMethod = (logType, message, ex) => LogError(logType, message, ex, projectId);
                krawler.KrawlContext.Configuration.MaxConcurrentThreads = 20;
                krawler.KrawlContext.Configuration.IsDeepCrawl = IsDeepCrawl;
                krawler.KrawlContext.Configuration.UserAgentString = EnvironmentConstants.ApplicationConfiguration.KitsuneUserAgent;

                // Apply the project's include/exclude filters and static asset folders to the crawl.
                if (!String.IsNullOrEmpty(ignoreLinkConversion))
                {
                    krawler.KrawlContext.Resources.IgnoreFileNameChangeRegex = new Regex(ignoreLinkConversion);
                }
                krawler.KrawlContext.Resources.ExcludeFilesRegex = regexToIgnore;
                krawler.KrawlContext.Resources.IncludeFilesRegex = regexToInclude;
                krawler.KrawlContext.Resources.IncludeStaticAssetList = includeStaticAssetFolders;
                krawler.Krawl();
            }
            catch (Exception ex)
            {
                Log.Error(ex, $"Error Analysing the Url for projectId : {projectId}");
                // Re-throw while preserving the original stack trace.
                throw;
            }
        }