コード例 #1
0
        /// <summary>
        /// Parses the given HTML document for <paramref name="uri"/> and then identifies its internal styles.
        /// </summary>
        /// <remarks>
        /// A failure while identifying internal styles is reported through
        /// <c>KrawlContext.ErrorLogMethod</c> and does not abort processing. A failure while
        /// constructing or running the parser propagates to the caller with its original stack trace.
        /// </remarks>
        /// <param name="uri">Uri of the page being processed; must not be null.</param>
        /// <param name="htmlDocument">Document handed to the <c>KHtmlParser</c>.</param>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="uri"/> is null.</exception>
        public void ProcessHtml(Uri uri, KHtmlDocument htmlDocument)
        {
            if (uri == null)
            {
                throw new ArgumentNullException(nameof(uri), "Error : Uri cannot be null");
            }

            #region Parse Html

            // The parser shares the crawl-wide resources, page queue and logger from KrawlContext.
            KHtmlParser htmlParser = new KHtmlParser(uri, htmlDocument)
            {
                Resources          = KrawlContext.Resources,
                UniqueWebPageQueue = KrawlContext.UniqueWebPageQueue,
                ErrorLogMethod     = KrawlContext.ErrorLogMethod
            };
            htmlParser.Parse();

            #endregion

            #region Parse Css

            // Best effort: a style-parsing failure is logged and must not fail the whole page.
            try
            {
                htmlParser.IdentifyInternalStyles();
            }
            catch (Exception ex)
            {
                KrawlContext.ErrorLogMethod(LOGTYPE.ERROR, "Error while parsing inner styles", ex);
            }

            #endregion
        }
コード例 #2
0
        /// <summary>
        /// Downloads the page at <paramref name="uri"/>, locates its favicon, copies the icon into the
        /// S3 source bucket under <c>projectId + faviconUri.LocalPath</c>, and returns the stored
        /// relative path.
        /// </summary>
        /// <param name="projectId">Project identifier used as the key prefix in S3 and in the returned path.</param>
        /// <param name="uri">Absolute Uri of the page to inspect.</param>
        /// <returns>
        /// <c>/{projectId}{faviconUri.LocalPath}</c> on success; <c>null</c> when no favicon is found
        /// or when any step fails (failures are logged through <c>Log.Error</c>).
        /// </returns>
        public static string GetFaviconUrl(string projectId, Uri uri)
        {
            try
            {
                // WebClient is IDisposable; the using block releases it on every exit path.
                using (WebClient webClient = new WebClient())
                {
                    #region Get the favicon Icon Url

                    webClient.Encoding = System.Text.Encoding.Default;
                    var originalHtml = webClient.DownloadString(uri);

                    KHtmlDocument doc = new KHtmlDocument();
                    doc.LoadHtml(originalHtml);
                    var faviconUri = doc.GetFaviconIcon(uri);

                    // NOTE(review): GetFaviconIcon presumably returns null when the page declares no
                    // icon - confirm. Treat that as "not present" rather than as a download error.
                    if (faviconUri == null)
                    {
                        return null;
                    }

                    #endregion

                    #region Download the Favicon Icon

                    Byte[] favicon = webClient.DownloadData(faviconUri);
                    AmazonS3Helper.SaveTheFileInS3(EnvironmentConstants.ApplicationConfiguration.AWSS3Configuration.AWSAccessKey, EnvironmentConstants.ApplicationConfiguration.AWSS3Configuration.AWSSecretKey,
                                                   projectId + faviconUri.LocalPath, favicon, EnvironmentConstants.ApplicationConfiguration.AWSBuckets.SourceBucket.Name);

                    #endregion

                    return $"/{projectId}{faviconUri.LocalPath}";
                }
            }
            catch (Exception ex)
            {
                // Uri.AbsoluteUri throws for a null or relative Uri; the handler must never throw itself,
                // so fall back to the original string (or null) in those cases.
                string url = (uri != null && uri.IsAbsoluteUri) ? uri.AbsoluteUri : uri?.OriginalString;
                Log.Error(ex, $"ProjectId:{projectId}, Message:Error while searching favicon icon for Url : {url}");
                return null;
            }
        }
コード例 #3
0
        /// <summary>
        /// Processes a single page: downloads its HTML, parses it via <see cref="ProcessHtml"/>,
        /// hands the processed markup to the HTML callback (when one is registered), and finally
        /// invokes the resources-updated callback.
        /// </summary>
        /// <remarks>
        /// Failures inside either callback are logged through <c>KrawlContext.ErrorLogMethod</c> and
        /// do not abort processing. Any other failure is logged and rethrown wrapped in an
        /// <see cref="Exception"/> whose <c>InnerException</c> is the original error.
        /// </remarks>
        /// <param name="uri">Absolute Uri of the page to process; must not be null.</param>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="uri"/> is null.</exception>
        /// <exception cref="Exception">Thrown when downloading or parsing the page fails.</exception>
        public void ProcessUri(Uri uri)
        {
            if (uri == null)
            {
                throw new ArgumentNullException(nameof(uri), "Error : Uri cannot be null.");
            }

            // Read once; every log message and the dictionary lookup below use the same value.
            string absoluteUrl = uri.AbsoluteUri;

            try
            {
                // No String.Format here: the message is already interpolated, and re-parsing it as a
                // format string would throw FormatException on any '{' or '}' in the text.
                KrawlContext.ErrorLogMethod(LOGTYPE.USERINFO, $"Processing Url:{absoluteUrl}", null);
                string htmlContent = DownloadHtmlAndUpdateResources(uri);

                if (htmlContent != null)
                {
                    #region Initialise Html Document

                    KHtmlDocument htmlDocument = new KHtmlDocument();
                    htmlDocument.LoadHtml(htmlContent);
                    htmlDocument.IdentifyBaseTagAndSetValue();

                    #endregion

                    ProcessHtml(uri, htmlDocument);

                    #region Generate relative Url and call html callback function

                    if (KrawlContext.ProcessedHtmlCallBackMethod != null)
                    {
                        try
                        {
                            string path = KrawlerUtility.GenerateHtmlLocalPath(uri);

                            AssetDetails linkMap;
                            if (KrawlContext.Resources.UniqueWebpagesDictionary.TryGetValue(absoluteUrl, out linkMap))
                            {
                                linkMap.NewUrl = path;
                                // Written back explicitly so the update also sticks if AssetDetails is a value type.
                                KrawlContext.Resources.UniqueWebpagesDictionary[absoluteUrl] = linkMap;
                            }

                            KrawlContext.ProcessedHtmlCallBackMethod(path, htmlDocument.DocumentNode.InnerHtml);
                        }
                        catch (Exception ex)
                        {
                            KrawlContext.ErrorLogMethod(LOGTYPE.ERROR, "Error Message : Error Generating relative Url or in html callback function", ex);
                        }
                    }

                    #endregion
                }

                #region Resources updated Method call

                // Runs even when no HTML was downloaded.
                try
                {
                    KrawlContext.UpdatedResoucesCallBackMethod(KrawlContext.Resources);
                }
                catch (Exception ex)
                {
                    KrawlContext.ErrorLogMethod(LOGTYPE.ERROR, $"Error Message : Error while updating the DB value of Url : {absoluteUrl}", ex);
                }

                #endregion

                KrawlContext.ErrorLogMethod(LOGTYPE.USERINFO, $"Processing Url : {absoluteUrl} completed.", null);
            }
            catch (Exception ex)
            {
                KrawlContext.ErrorLogMethod(LOGTYPE.USERINFO, $"Something went wrong while processing : {absoluteUrl}.", null);
                throw new Exception($"Error while processing the Url : {absoluteUrl}", ex);
            }
        }