/// <summary>
/// Builds the content summary for the web address supplied in the request body.
/// </summary>
/// <param name="website">Payload carrying the URL to inspect. May be null, in which case no download is attempted.</param>
/// <returns>
/// A <see cref="WebContentDetails"/> whose WebsiteURL echoes the requested URL; the same
/// instance is passed to DownloadWebsiteContent before being returned.
/// </returns>
public WebContentDetails GetWebsiteContent([FromBody] WebAddress website)
{
    // The bound argument can be null (presumably when the request has no usable body);
    // treat that like an empty URL instead of failing with a NullReferenceException.
    string requestedUrl = website == null ? null : website.websiteURL;

    WebContentDetails websiteContentCountResult = new WebContentDetails();
    websiteContentCountResult.WebsiteURL = requestedUrl;

    // DownloadWebsiteContent does nothing for a null or empty URL.
    DownloadWebsiteContent(requestedUrl, websiteContentCountResult);

    return websiteContentCountResult;
}
/// <summary>
/// Downloads the resource at <paramref name="websiteURL"/> into an .html file on disk and
/// hands that file to ProcessContent.
/// </summary>
/// <param name="websiteURL">Absolute URL of the page to download. Nothing happens when it is null or empty.</param>
/// <param name="websiteContentCountResult">Result object passed on to ProcessContent.</param>
/// <exception cref="System.ArgumentException">
/// <paramref name="websiteURL"/> is not an absolute URL, or it refers to a local or UNC file.
/// </exception>
private void DownloadWebsiteContent(string websiteURL, WebContentDetails websiteContentCountResult)
{
    if (string.IsNullOrEmpty(websiteURL))
    {
        return;
    }

    // WebClient also accepts file:// addresses and plain local paths. The URL is supplied by
    // the caller, so refuse anything that would make the server read its own file system.
    System.Uri address;
    if (!System.Uri.TryCreate(websiteURL, System.UriKind.Absolute, out address) || address.IsFile)
    {
        throw new System.ArgumentException("websiteURL must be an absolute, non-file URL.", "websiteURL");
    }

    // Keep only letters and digits so the URL can serve as a file name.
    string fileName = Regex.Replace(websiteURL, @"[^0-9a-zA-Z]+", "");

    // NOTE(review): the download directory is hard-coded to D:\ - confirm this drive exists
    // on every deployment target, or make it configurable.
    string path = @"D:\" + fileName + @".html";

    // DownloadFile creates the target file (overwriting an existing one), so it does not
    // need to be created beforehand.
    using (var webClient = new System.Net.WebClient())
    {
        webClient.DownloadFile(address, path);
    }

    ProcessContent(path, websiteContentCountResult);
}
/// <summary>
/// Reads the downloaded HTML file and records its image list and word count on the result.
/// </summary>
/// <param name="filePath">Path of the HTML file to read.</param>
/// <param name="websiteContentCountResult">
/// Result object whose ImageUrl, WordCount and TopWordCount members are assigned.
/// </param>
private void ProcessContent(string filePath, WebContentDetails websiteContentCountResult)
{
    string fileContent = File.ReadAllText(filePath);

    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(fileContent);

    // SelectSingleNode returns null when the document has no <body> element (for example an
    // empty or non-HTML response); treat that the same as an empty body instead of crashing.
    HtmlNode bodyHtml = doc.DocumentNode.SelectSingleNode("//body");
    string content = bodyHtml == null ? null : bodyHtml.InnerHtml;

    int wordCount = 0;
    List<string> imageUrlList = new List<string>();

    if (!string.IsNullOrEmpty(content))
    {
        ExtractImages(imageUrlList, content);
        wordCount = CountWords(content);
    }

    websiteContentCountResult.ImageUrl = imageUrlList;
    websiteContentCountResult.WordCount = wordCount;

    // topWordCountList is declared outside this method; presumably CountWords populates it -
    // verify against that method.
    websiteContentCountResult.TopWordCount = topWordCountList;
}