/// <summary>
/// Builds the master-model entries for <paramref name="resultNode"/> via
/// <c>VSMasterSupport.FillingVSMasterModel</c> and, when an element is supplied,
/// stamps each entry with that element's link target.
/// </summary>
/// <param name="resultNode">Node whose <c>NodeValue</c> is turned into model entries.</param>
/// <param name="charArrayList">Char arrays forwarded unchanged to <c>FillingVSMasterModel</c>.</param>
/// <param name="orderId">Order id forwarded unchanged to <c>FillingVSMasterModel</c>.</param>
/// <param name="IElenode">Optional element; its <c>href</c>/<c>src</c> attribute is copied to each entry's <c>Link</c>.</param>
/// <returns>
/// The populated list, or <c>null</c> when <c>NodeValue</c> is null/empty or
/// no entries were produced.
/// </returns>
public static List<VSMasterModel> FillVSModel(ResultNode resultNode, List<char[]> charArrayList, int orderId, IElement IElenode)
{
    if (string.IsNullOrEmpty(resultNode.NodeValue))
    {
        return null;
    }

    var producedModels = VSMasterSupport.FillingVSMasterModel(resultNode, charArrayList, orderId);
    if (producedModels == null || producedModels.Count <= 0)
    {
        return null;
    }

    // Checked in this order so that a usable "src" overrides a usable "href".
    var linkAttributes = new[] { "href", "src" };
    var filledModels = new List<VSMasterModel>();

    foreach (var model in producedModels)
    {
        if (IElenode != null)
        {
            foreach (var attributeName in linkAttributes)
            {
                // Values starting with "#" are not used as links.
                if (IElenode.HasAttribute(attributeName) && !IElenode.GetAttribute(attributeName).StartsWith("#"))
                {
                    model.Link = IElenode.GetAttribute(attributeName);
                }
            }
        }

        filledModels.Add(model);
    }

    return filledModels;
}
/// <summary>
/// Reads the HTML file at <paramref name="fileUrl"/>, parses it with AngleSharp,
/// collects <c>VSMasterModel</c> entries from the head tags and the body nodes,
/// and writes them as CSV to <c>Output.csv</c> when at least one entry was produced.
/// </summary>
/// <param name="fileUrl">Path of the HTML file to read.</param>
public static void ParseWithAngleSharp(string fileUrl)
{
    var vsMasterModel = new List<VSMasterModel>();
    var orderId = -1;

    // NOTE(review): the result was never used here; the call is kept only in
    // case it has side effects. Remove it if it is pure.
    GetVSMasterModelProperties();

    var tagName = string.Empty;

    // Build the punctuation list and expose each entry as a char array.
    AddPunctuationMarks();
    var charArrayList = lstPunctuation.Select(str => str.ToCharArray()).ToList();

    // Custom parser configuration.
    var config = Configuration.Default.WithJavaScript().WithCss();
    var parser = new HtmlParser(config);

    // Sample input: https://github.com/glienard/html-to-sql/blob/master/htmlexample.html
    var source = File.ReadAllText(fileUrl);
    var document = parser.Parse(source);
    var pageData = new Page();

    #region For All Page Data
    var robots = document.All.FirstOrDefault(x => x.LocalName == "meta" && x.GetAttribute("Name") == "robots") as IHtmlMetaElement;
    var content = robots?.Content;

    // A robots meta tag without a content attribute carries no directives.
    if (content != null)
    {
        if (content.IndexOf("NoFollow", StringComparison.OrdinalIgnoreCase) != -1)
        {
            pageData.NoFollow = true;
        }

        if (content.IndexOf("NoIndex", StringComparison.OrdinalIgnoreCase) != -1)
        {
            pageData.NoIndex = true;
        }

        if (content.IndexOf("NoArchive", StringComparison.OrdinalIgnoreCase) != -1)
        {
            pageData.NoArchive = true;
        }

        if (content.IndexOf("NoImageIndex", StringComparison.OrdinalIgnoreCase) != -1)
        {
            pageData.NoImageIndex = true;
        }

        if (content.IndexOf("NoSnippet", StringComparison.OrdinalIgnoreCase) != -1)
        {
            pageData.NoSnippet = true;
        }
    }
    #endregion

    #region For All Links Of Page
    // NOTE(review): lstLinks is populated but not consumed anywhere in this method.
    var linkTags = document.All.Where(x => x.Attributes["href"] != null || x.Attributes["src"] != null);
    var lstLinks = new List<Links>();
    foreach (var link in linkTags)
    {
        var hrefAttribute = link.Attributes["href"];
        var relAttribute = link.Attributes["rel"];

        lstLinks.Add(new Links
        {
            // The filter above guarantees that "src" exists when "href" does not.
            Url = hrefAttribute == null ? link.Attributes["src"].Value : hrefAttribute.Value,
            LinkType = link.LocalName,
            NotFollow = relAttribute?.Value == "nofollow" ? 1 : 0
        });
    }
    #endregion

    var html = document.All.FirstOrDefault(x => x.LocalName == "html");
    var lang = (html as IHtmlElement)?.Language;

    #region HEADER PARSE
    var head = document.All.FirstOrDefault(x => x.LocalName == "head");
    if (head != null)
    {
        // The head markup is re-parsed as its own document and every element of it is visited.
        var headerTotalTags = parser.Parse(head.OuterHtml).All;
        foreach (var tag in headerTotalTags)
        {
            if (skipItmes.Contains(tag.LocalName))
            {
                continue;
            }

            var newDoc = parser.ParseFragment(tag.TextContent.ToString(), null);
            var txtContent = newDoc.FirstOrDefault()?.TextContent;

            if (tag.LocalName == "meta")
            {
                var localName = Helper.FirstCharToUpper(tag.LocalName);
                txtContent = tag.GetAttribute("content");
                if (!string.IsNullOrEmpty(txtContent))
                {
                    tagName = string.Empty;
                    if (!tag.HasAttribute("name"))
                    {
                        continue;
                    }

                    // Only the description and keywords meta tags are turned into models.
                    var name = tag.GetAttribute("name");
                    if (name.Equals("description") || name.Equals("keywords"))
                    {
                        tagName = $"Tag{localName}{Helper.FirstCharToUpper(name)}";
                    }
                    else
                    {
                        txtContent = null;
                    }
                }
            }

            if (string.IsNullOrEmpty(txtContent))
            {
                continue;
            }

            if (tag.LocalName != "meta")
            {
                tagName = $"Tag{Helper.FirstCharToUpper(tag.LocalName)}";
            }

            var resultnode = new ResultNode { NodeName = tagName, NodeValue = txtContent };

            // The language may be missing or have no region part (e.g. "en"),
            // so Country is only set when a second segment exists.
            if (!string.IsNullOrEmpty(lang))
            {
                var langCountry = lang.Split('-');
                resultnode.Lang = langCountry[0];
                if (langCountry.Length > 1)
                {
                    resultnode.Country = langCountry[1];
                }
            }

            var vsModelCollection = VSMasterSupport.FillingVSMasterModel(resultnode, charArrayList, orderId);
            if (vsModelCollection != null && vsModelCollection.Count > 0)
            {
                // After the first non-empty result, subsequent header tags are passed order id 0.
                orderId = 0;
                foreach (var vsModel in vsModelCollection)
                {
                    vsMasterModel.Add(vsModel);
                }
            }
        }
    }
    #endregion

    #region BODY PARSE
    // "title" is skipped while walking the body. The shared list is only extended
    // once so repeated calls do not accumulate duplicates.
    // NOTE(review): the entry persists in the shared list, so later calls will
    // also skip "title" during the header pass - confirm whether that is intended.
    if (!skipItmes.Contains("title"))
    {
        skipItmes.Add("title");
    }

    var body = document.QuerySelector("body");
    if (body != null)
    {
        var language = (body as IHtmlElement)?.Language;
        if (language != null && language.Contains("-"))
        {
            // Use the split segments, not individual characters of the language string.
            var langCountry = language.Split('-');
            pageData.Lang = langCountry[0];
            pageData.Country = langCountry[1];
        }

        foreach (var node in body.ChildNodes)
        {
            var IElenode = node as IElement;

            // Skip non-element nodes that contain only whitespace.
            if (IElenode == null && string.IsNullOrWhiteSpace(node.TextContent))
            {
                continue;
            }

            if (skipItmes.Exists(x => x.Equals(node.NodeName, StringComparison.OrdinalIgnoreCase)))
            {
                continue;
            }

            ProcessNode(node, skipItmes, charArrayList, vsMasterModel, IElenode);
        }
    }
    #endregion

    if (vsMasterModel.Count > 0)
    {
        File.WriteAllText("Output.csv", ToCsv(vsMasterModel));
    }
}