/// <summary>
/// Runs the HTML parser over the given document and then asks the parser to
/// identify the document's internal styles.
/// </summary>
/// <param name="uri">The URI handed to the parser together with the document; must not be null.</param>
/// <param name="htmlDocument">The HTML document to parse.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="uri"/> is null.</exception>
/// <remarks>
/// Exceptions raised while constructing or running the parser propagate to the caller
/// with their original stack trace. A failure while identifying internal styles is
/// only logged through <c>KrawlContext.ErrorLogMethod</c> and does not abort processing.
/// </remarks>
public void ProcessHtml(Uri uri, KHtmlDocument htmlDocument)
{
    if (uri == null)
    {
        throw new ArgumentNullException(nameof(uri), "Error : Uri cannot be null");
    }

    // Parse Html: the parser shares the crawl-wide resources, page queue and logger.
    KHtmlParser htmlParser = new KHtmlParser(uri, htmlDocument)
    {
        Resources = KrawlContext.Resources,
        UniqueWebPageQueue = KrawlContext.UniqueWebPageQueue,
        ErrorLogMethod = KrawlContext.ErrorLogMethod
    };
    htmlParser.Parse();

    // Parse Css
    // TODO Parse inner styles
    try
    {
        htmlParser.IdentifyInternalStyles();
    }
    catch (Exception ex)
    {
        // Best effort: a style-parsing failure must not fail the whole page.
        KrawlContext.ErrorLogMethod(LOGTYPE.ERROR, "Error while parsing inner styles", ex);
    }
}
/// <summary>
/// Downloads the page at <paramref name="uri"/>, resolves its favicon via
/// <c>KHtmlDocument.GetFaviconIcon</c>, downloads the icon and stores it in the
/// configured S3 source bucket under <c>projectId + faviconUri.LocalPath</c>.
/// </summary>
/// <param name="projectId">Project identifier used as the key prefix of the stored icon.</param>
/// <param name="uri">Address of the page whose favicon should be fetched.</param>
/// <returns>
/// The path <c>/{projectId}{faviconUri.LocalPath}</c> on success; <c>null</c> when any step
/// fails (the exception is logged and not rethrown).
/// </returns>
public static string GetFaviconUrl(string projectId, Uri uri)
{
    try
    {
        // WebClient is IDisposable; dispose it deterministically once both downloads are done.
        using (WebClient webClient = new WebClient())
        {
            // Get the favicon icon Url
            webClient.Encoding = System.Text.Encoding.Default;
            var originalHtml = webClient.DownloadString(uri);

            KHtmlDocument doc = new KHtmlDocument();
            doc.LoadHtml(originalHtml);
            var faviconUri = doc.GetFaviconIcon(uri);

            // Download the favicon icon and push it to S3
            Byte[] favicon = webClient.DownloadData(faviconUri);
            AmazonS3Helper.SaveTheFileInS3(
                EnvironmentConstants.ApplicationConfiguration.AWSS3Configuration.AWSAccessKey,
                EnvironmentConstants.ApplicationConfiguration.AWSS3Configuration.AWSSecretKey,
                projectId + faviconUri.LocalPath,
                favicon,
                EnvironmentConstants.ApplicationConfiguration.AWSBuckets.SourceBucket.Name);

            return $"/{projectId}{faviconUri.LocalPath}";
        }
    }
    catch (Exception ex)
    {
        // uri may itself be the null that caused the failure, so it must not be
        // dereferenced unconditionally inside the handler.
        Log.Error(ex, $"ProjectId:{projectId}, Message:Error while searching favicon icon for Url : {uri?.AbsoluteUri}");
        return null;
    }
}
/// <summary>
/// Processes a single URI: downloads its HTML, parses it through <c>ProcessHtml</c>,
/// hands the processed markup to the HTML callback (when one is registered) and
/// finally invokes the resources-updated callback.
/// </summary>
/// <param name="uri">The URI to process; must not be null.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="uri"/> is null.</exception>
/// <exception cref="Exception">
/// Thrown when downloading or parsing fails; the original exception is attached as the inner exception.
/// </exception>
/// <remarks>
/// Failures inside the HTML callback section and inside the resources-updated callback
/// are logged through <c>KrawlContext.ErrorLogMethod</c> and do not abort processing.
/// </remarks>
public void ProcessUri(Uri uri)
{
    if (uri == null)
    {
        throw new ArgumentNullException(nameof(uri), "Error : Uri cannot be null.");
    }

    try
    {
        // The interpolated string is passed as-is; running it through String.Format
        // again would treat any '{' or '}' in the URL text as a format placeholder.
        KrawlContext.ErrorLogMethod(LOGTYPE.USERINFO, $"Processing Url:{uri.AbsoluteUri}", null);

        string htmlContent = DownloadHtmlAndUpdateResources(uri);
        if (htmlContent != null)
        {
            // Initialise Html Document
            KHtmlDocument htmlDocument = new KHtmlDocument();
            htmlDocument.LoadHtml(htmlContent);
            htmlDocument.IdentifyBaseTagAndSetValue();

            ProcessHtml(uri, htmlDocument);

            // Generate relative Url and call html callback function
            if (KrawlContext.ProcessedHtmlCallBackMethod != null)
            {
                try
                {
                    string path = KrawlerUtility.GenerateHtmlLocalPath(uri);

                    // Record the generated local path on the page's entry, if the page is tracked.
                    AssetDetails linkMap;
                    if (KrawlContext.Resources.UniqueWebpagesDictionary.TryGetValue(uri.AbsoluteUri, out linkMap))
                    {
                        linkMap.NewUrl = path;
                        KrawlContext.Resources.UniqueWebpagesDictionary[uri.AbsoluteUri] = linkMap;
                    }

                    KrawlContext.ProcessedHtmlCallBackMethod(path, htmlDocument.DocumentNode.InnerHtml);
                }
                catch (Exception ex)
                {
                    KrawlContext.ErrorLogMethod(LOGTYPE.ERROR, "Error Message : Error Generating relative Url or in html callback function", ex);
                }
            }
        }

        // Resources updated method call: runs even when no HTML content was downloaded.
        try
        {
            KrawlContext.UpdatedResoucesCallBackMethod(KrawlContext.Resources);
        }
        catch (Exception ex)
        {
            KrawlContext.ErrorLogMethod(LOGTYPE.ERROR, $"Error Message : Error while updating the DB value of Url : {uri.AbsoluteUri}", ex);
        }

        KrawlContext.ErrorLogMethod(LOGTYPE.USERINFO, $"Processing Url : {uri.AbsoluteUri} completed.", null);
    }
    catch (Exception ex)
    {
        KrawlContext.ErrorLogMethod(LOGTYPE.USERINFO, $"Something went wrong while processing : {uri.AbsoluteUri}.", null);

        // Wrap with the URL for context; the original exception stays available as InnerException.
        throw new Exception($"Error while processing the Url : {uri.AbsoluteUri}", ex);
    }
}