/// <summary>
/// Downloads the HTML behind <paramref name="uri"/>, processes it, and notifies the
/// callbacks registered on <c>KrawlContext</c>.
/// </summary>
/// <param name="uri">Address of the page to process. Must not be null.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="uri"/> is null.</exception>
/// <exception cref="Exception">
/// Thrown when processing fails outside the individually guarded callback sections;
/// the original failure is available as the inner exception.
/// </exception>
public void ProcessUri(Uri uri)
{
    if (uri == null)
    {
        throw new ArgumentNullException(nameof(uri), "Error : Uri cannot be null.");
    }

    try
    {
        // Pass the interpolated text straight to the logger. Routing it through
        // String.Format would re-parse it as a format string and throw a
        // FormatException if the URL text ever contained '{' or '}'.
        KrawlContext.ErrorLogMethod(LOGTYPE.USERINFO, $"Processing Url:{uri.AbsoluteUri}", null);

        string htmlContent = DownloadHtmlAndUpdateResources(uri);
        if (htmlContent != null)
        {
            // Initialise the html document.
            KHtmlDocument htmlDocument = new KHtmlDocument();
            htmlDocument.LoadHtml(htmlContent);
            htmlDocument.IdentifyBaseTagAndSetValue();

            ProcessHtml(uri, htmlDocument);

            // Generate the local path and hand the processed html to the callback.
            // A failure here is logged and does not abort the rest of the method.
            if (KrawlContext.ProcessedHtmlCallBackMethod != null)
            {
                try
                {
                    string path = KrawlerUtility.GenerateHtmlLocalPath(uri);

                    AssetDetails linkMap;
                    if (KrawlContext.Resources.UniqueWebpagesDictionary.TryGetValue(uri.AbsoluteUri, out linkMap))
                    {
                        linkMap.NewUrl = path;
                        KrawlContext.Resources.UniqueWebpagesDictionary[uri.AbsoluteUri] = linkMap;
                    }

                    KrawlContext.ProcessedHtmlCallBackMethod(path, htmlDocument.DocumentNode.InnerHtml);
                }
                catch (Exception ex)
                {
                    KrawlContext.ErrorLogMethod(LOGTYPE.ERROR, "Error Message : Error Generating relative Url or in html callback function", ex);
                }
            }
        }

        // Report the updated resources even when no html was downloaded.
        try
        {
            KrawlContext.UpdatedResoucesCallBackMethod(KrawlContext.Resources);
        }
        catch (Exception ex)
        {
            KrawlContext.ErrorLogMethod(LOGTYPE.ERROR, $"Error Message : Error while updating the DB value of Url : {uri.AbsoluteUri}", ex);
        }

        KrawlContext.ErrorLogMethod(LOGTYPE.USERINFO, $"Processing Url : {uri.AbsoluteUri} completed.", null);
    }
    catch (Exception ex)
    {
        // The user-facing log entry carries no exception details; the wrapped
        // exception thrown below keeps them for the caller.
        KrawlContext.ErrorLogMethod(LOGTYPE.USERINFO, $"Something went wrong while processing : {uri.AbsoluteUri}.", null);
        throw new Exception($"Error while processing the Url : {uri.AbsoluteUri}", ex);
    }
}
/// <summary>
/// Calls <c>KrawlerUtility.GenerateFileLocalPath</c> for a stylesheet URL that carries
/// a query string. The result is not asserted on.
/// </summary>
public void KrawlerUtilityTest()
{
    Uri stylesheetUri = new Uri("http://13.127.86.77/abc.css?sccss=1&ver=4.9.5");
    var generatedPath = KrawlerUtility.GenerateFileLocalPath(stylesheetUri, ".css");
}
/// <summary>
/// Downloads the resource at <paramref name="uri"/>, optionally crawls it as CSS, and
/// passes the resulting bytes to <c>Context.DownloadedFileCallBackMethod</c>.
/// </summary>
/// <param name="asset">
/// Asset record for the resource; its <c>ResponseStatusCode</c> and <c>NewUrl</c> are
/// updated by this method.
/// </param>
/// <param name="type">Kind of resource; selects CSS crawling and the fallback file extension.</param>
/// <param name="uri">Address the resource is downloaded from.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="asset"/> is null.</exception>
/// <exception cref="ArgumentException">Thrown when <c>asset.LinkUrl</c> is null or empty.</exception>
/// <remarks>
/// Failures after argument validation are logged through <c>Context.ErrorLogMethod</c>
/// and are not rethrown.
/// </remarks>
public void ProcessTheResource(AssetDetails asset, FileType type, Uri uri)
{
    if (asset == null)
    {
        throw new ArgumentNullException(nameof(asset));
    }

    if (String.IsNullOrEmpty(asset.LinkUrl))
    {
        // ArgumentException(string) treats its argument as the message, so use the
        // (message, paramName) overload to report the offending parameter properly.
        throw new ArgumentException("Asset LinkUrl cannot be null or empty.", nameof(asset));
    }

    try
    {
        // Download the file.
        IRestResponse result = HttpRequest.HttpRequestWithReadAndWriteTimeOut(uri, Context.Configuration.ReadAndWriteTimeOut, Context.Configuration.UserAgentString);

        // Record the status on the asset before deciding whether to continue.
        asset.ResponseStatusCode = result.StatusCode;
        if (!result.StatusCode.Equals(HttpStatusCode.OK))
        {
            // Notify the callback of the failed download (no content), then bail out
            // through the catch block below so the failure is logged.
            Context.DownloadedFileCallBackMethod(asset, type, null, null);
            throw new Exception(String.Format("Error downloading the File : {0}, Status Code : {1}", asset.LinkUrl, result.StatusCode));
        }

        // Process the file.
        var byteArray = result.RawBytes;
        if (type.Equals(FileType.STYLE) && Context.Configuration.CrawlCssEnabled)
        {
            // Crawl the stylesheet and save the text returned by CrawlCss instead of the raw bytes.
            string cssText = CrawlCss(result.Content, uri);

            // NOTE(review): Encoding.Default is platform-dependent; confirm it matches
            // the charset consumers of the saved file expect.
            byteArray = Encoding.Default.GetBytes(cssText);
        }

        var extension = KrawlerUtility.GetExtensionFromContentType(uri.LocalPath, result.ContentType);
        if (String.IsNullOrEmpty(extension))
        {
            // Fall back to an extension implied by the resource type.
            if (type.Equals(FileType.SCRIPT))
            {
                extension = ".js";
            }
            else if (type.Equals(FileType.STYLE))
            {
                extension = ".css";
            }
        }

        string filePath = KrawlerUtility.GenerateFileLocalPath(uri, extension);
        if (String.IsNullOrEmpty(filePath))
        {
            filePath = uri.LocalPath;
            Context.ErrorLogMethod(LOGTYPE.INFORMATION, String.Format("file path generate was nulll or empty for url : {0}", uri.LocalPath), null);
        }

        // Save the file through the callback.
        asset.NewUrl = filePath;
        Context.DownloadedFileCallBackMethod(asset, type, byteArray, result.ContentType);
    }
    catch (Exception ex)
    {
        Context.ErrorLogMethod(LOGTYPE.INFORMATION, String.Format("Error downloading Url: {0}", asset.LinkUrl), ex);
    }
}