/// <summary>
/// Runs a crawl against a fixed public URL, collecting every error the
/// crawler reports into a local list.
/// </summary>
public void WebsiteCrawl()
{
    var collectedErrors = new List<KrawlerException>();
    var crawler = new Krawler(new Uri("https://giphy.com/"));

    // Every error reported by the crawler is wrapped and appended to the local list.
    crawler.KrawlContext.ErrorLogMethod = (LOGTYPE logType, string message, Exception exception) =>
    {
        var entry = new KrawlerException
        {
            ErrorType = logType,
            Ex = exception,
            Message = message
        };
        collectedErrors.Add(entry);
    };

    // Saving crawled HTML is intentionally left disabled here:
    // crawler.KrawlContext.SaveHtmlMethod = (x, y) => SaveInS3("test1" + x, y);

    crawler.Krawl();
}
/// <summary>
/// Configures a <c>Krawler</c> for the given project and URI and runs the crawl.
/// </summary>
/// <remarks>
/// Before crawling, the project configuration is fetched through
/// <c>APIHelper.GetProjectConfig</c>. If it contains a <c>custom_source_sync</c>
/// section, that section supplies the exclude/include regexes, the
/// ignore-link-conversion regex and the static asset list. Failures while
/// reading that section are logged and the crawl continues with whatever
/// values had been read up to that point.
/// </remarks>
/// <param name="projectId">Project identifier; used to fetch the configuration and passed to every callback.</param>
/// <param name="uri">Root URI handed to the crawler.</param>
/// <param name="IsDeepCrawl">Value assigned to the crawler's <c>IsDeepCrawl</c> configuration flag.</param>
/// <exception cref="Exception">
/// Any exception raised outside the configuration-parsing step is logged and rethrown unchanged.
/// </exception>
public static void AnalyseTheWebsite(string projectId, Uri uri, bool IsDeepCrawl)
{
    try
    {
        string regexToIgnore = null;
        string regexToInclude = null;
        string ignore_link_conversion = null;
        List<string> include_static_asset_folder = null;

        Log.Information("Getting project config");
        var result = APIHelper.GetProjectConfig(projectId);
        if (result != null)
        {
            var customSourceSyncSettings = result["custom_source_sync"];

            // When the section is missing, the crawl simply proceeds with the defaults above.
            if (customSourceSyncSettings != null)
            {
                Log.Information("ProjectConfig : " + JsonConvert.SerializeObject(customSourceSyncSettings));
                try
                {
                    ProjectCustomSourceSyncSettings excludeList =
                        JsonConvert.DeserializeObject<ProjectCustomSourceSyncSettings>(customSourceSyncSettings.ToString());

                    if (!string.IsNullOrEmpty(excludeList.Exclude))
                    {
                        regexToIgnore = excludeList.Exclude;
                    }

                    if (!string.IsNullOrEmpty(excludeList.Include))
                    {
                        regexToInclude = excludeList.Include;
                    }

                    ignore_link_conversion = excludeList.IgnoreLinkConversion;
                    include_static_asset_folder = excludeList.IncludeStaticAssets;

                    if (excludeList.IncludeStaticAssetApis != null)
                    {
                        var assetList = Utils.GetAllStaticAssetList(excludeList.IncludeStaticAssetApis);
                        if (assetList != null)
                        {
                            // Merge API-provided assets into the configured list, creating it if absent.
                            if (include_static_asset_folder == null)
                            {
                                include_static_asset_folder = new List<string>();
                            }

                            include_static_asset_folder.AddRange(assetList);
                        }
                    }
                }
                catch (Exception ex)
                {
                    // Best-effort: a bad custom-source-sync section must not abort the crawl.
                    Log.Error("ProjectCustomSourceSync : " + ex.Message);
                }
            }
        }

        Krawler krawler = new Krawler(uri);
        var context = krawler.KrawlContext;

        // Wire the crawler callbacks to the project-specific handlers.
        context.ProcessedHtmlCallBackMethod = (filePath, htmlString) => SaveInS3(projectId, projectId + filePath, htmlString);
        context.UpdatedResoucesCallBackMethod = (resourceObject) => UpdateAnalyseDetails(projectId, resourceObject);
        context.BatchCompletedCallBackMethod = (CrawledPagesCount) => Batchcompleted(krawler, projectId, CrawledPagesCount);
        context.ErrorLogMethod = (x, y, z) => LogError(x, y, z, projectId);

        context.Configuration.MaxConcurrentThreads = 20;
        context.Configuration.IsDeepCrawl = IsDeepCrawl;
        context.Configuration.UserAgentString = EnvironmentConstants.ApplicationConfiguration.KitsuneUserAgent;

        if (!string.IsNullOrEmpty(ignore_link_conversion))
        {
            context.Resources.IgnoreFileNameChangeRegex = new Regex(ignore_link_conversion);
        }

        context.Resources.ExcludeFilesRegex = regexToIgnore;
        context.Resources.IncludeFilesRegex = regexToInclude;
        context.Resources.IncludeStaticAssetList = include_static_asset_folder;

        krawler.Krawl();
    }
    catch (Exception ex)
    {
        Log.Error(ex, $"Error Analysing the Url for projectId : {projectId}");

        // Rethrow with `throw;` so the original stack trace is preserved
        // (`throw ex;` would reset it to this frame).
        throw;
    }
}