Exemplo n.º 1
0
        /// <summary>
        /// Builds the <see cref="VSMasterModel"/> entries for a result node and, when an element is
        /// supplied, stamps every entry with that element's link (its "href" or "src" attribute).
        /// </summary>
        /// <param name="resultNode">Node to convert; nothing is produced when its <c>NodeValue</c> is null or empty.</param>
        /// <param name="charArrayList">Character arrays forwarded unchanged to <c>VSMasterSupport.FillingVSMasterModel</c>.</param>
        /// <param name="orderId">Order id forwarded unchanged to <c>VSMasterSupport.FillingVSMasterModel</c>.</param>
        /// <param name="IElenode">Optional element that supplies the link; may be <c>null</c>.</param>
        /// <returns>
        /// A new list holding the produced models, or <c>null</c> when the node value is empty or
        /// no models were produced (the null contract is kept for existing callers).
        /// </returns>
        public static List<VSMasterModel> FillVSModel(ResultNode resultNode, List<char[]> charArrayList, int orderId, IElement IElenode)
        {
            if (string.IsNullOrEmpty(resultNode.NodeValue))
            {
                return null;
            }

            var vsModelCollection = VSMasterSupport.FillingVSMasterModel(resultNode, charArrayList, orderId);
            if (vsModelCollection == null || vsModelCollection.Count <= 0)
            {
                return null;
            }

            // The element is the same for every model, so resolve the link once.
            // "src" is checked after "href" and therefore wins when both qualify.
            string link = null;
            if (IElenode != null)
            {
                foreach (var attributeName in new[] { "href", "src" })
                {
                    if (!IElenode.HasAttribute(attributeName))
                    {
                        continue;
                    }

                    var attributeValue = IElenode.GetAttribute(attributeName);

                    // Same-page fragment references ("#...") are not treated as links.
                    // Ordinal comparison: URLs are not linguistic text.
                    if (!attributeValue.StartsWith("#", System.StringComparison.Ordinal))
                    {
                        link = attributeValue;
                    }
                }
            }

            var vsModelList = new List<VSMasterModel>();
            foreach (var vsModel in vsModelCollection)
            {
                // Leave any existing Link untouched when the element offers no usable link.
                if (link != null)
                {
                    vsModel.Link = link;
                }

                vsModelList.Add(vsModel);
            }

            return vsModelList;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Parses the HTML file at <paramref name="fileUrl"/> with AngleSharp, collects robots
        /// directives and links, converts the text of the head and body tags into
        /// <see cref="VSMasterModel"/> entries and, when any were produced, writes them to
        /// "Output.csv" in the current working directory.
        /// </summary>
        /// <param name="fileUrl">Path of the HTML file to read (passed to <c>File.ReadAllText</c>).</param>
        public static void ParseWithAngleSharp(string fileUrl)
        {
            var vsMasterModel = new List<VSMasterModel>();
            var orderId       = -1;
            // NOTE(review): the result is never read in this method; the call is kept in case the
            // helper has side effects - confirm and remove if it is pure.
            var vsModelProperties = GetVSMasterModelProperties();
            var tagName           = string.Empty;

            // generating a list with punctuation mark
            AddPunctuationMarks();
            var charArrayList = lstPunctuation.Select(str => str.ToCharArray()).ToList();

            // We require a custom configuration, then create a parser that uses it.
            var config = Configuration.Default.WithJavaScript().WithCss();
            var parser = new HtmlParser(config);

            // https://github.com/glienard/html-to-sql/blob/master/htmlexample.html
            var source   = File.ReadAllText(fileUrl);
            var document = parser.Parse(source);
            var pageData = new Page();

            #region For All Page Data
            var robots     = document.All.FirstOrDefault(x => x.LocalName == "meta" && x.GetAttribute("Name") == "robots");
            var robotsMeta = robots as IHtmlMetaElement;
            // A robots tag without a content attribute carries no directives (and would
            // otherwise cause a null dereference below).
            var content = robotsMeta?.Content;
            if (content != null)
            {
                if (content.IndexOf("NoFollow", StringComparison.OrdinalIgnoreCase) != -1)
                {
                    pageData.NoFollow = true;
                }
                if (content.IndexOf("NoIndex", StringComparison.OrdinalIgnoreCase) != -1)
                {
                    pageData.NoIndex = true;
                }
                if (content.IndexOf("NoArchive", StringComparison.OrdinalIgnoreCase) != -1)
                {
                    pageData.NoArchive = true;
                }
                if (content.IndexOf("NoImageIndex", StringComparison.OrdinalIgnoreCase) != -1)
                {
                    pageData.NoImageIndex = true;
                }
                if (content.IndexOf("NoSnippet", StringComparison.OrdinalIgnoreCase) != -1)
                {
                    pageData.NoSnippet = true;
                }
            }
            #endregion

            #region For All Links Of Page
            var linkTags = document.All.Where(x => x.Attributes["href"] != null || x.Attributes["src"] != null);

            // NOTE(review): lstLinks is populated but not consumed anywhere in this method.
            var lstLinks = new List<Links>();
            foreach (var link in linkTags)
            {
                var hrefAttribute = link.Attributes["href"];
                var relAttribute  = link.Attributes["rel"];
                lstLinks.Add(new Links
                {
                    // The filter above guarantees that "src" exists whenever "href" does not.
                    Url       = hrefAttribute == null ? link.Attributes["src"].Value : hrefAttribute.Value,
                    LinkType  = link.LocalName,
                    NotFollow = relAttribute?.Value == "nofollow" ? 1 : 0
                });
            }
            #endregion

            var html = document.All.FirstOrDefault(x => x.LocalName == "html");
            var lang = (html as IHtmlElement)?.Language;

            #region HEADER PARSE
            var head            = document.All.FirstOrDefault(x => x.LocalName == "head");
            var headerTotalTags = parser.Parse(head.OuterHtml).All;
            foreach (var tag in headerTotalTags)
            {
                if (skipItmes.Contains(tag.LocalName))
                {
                    continue;
                }

                // Re-parse the tag's text so that embedded markup is reduced to plain text.
                var newDoc     = parser.ParseFragment(tag.TextContent, null);
                var txtContent = newDoc.FirstOrDefault()?.TextContent;

                if (tag.LocalName == "meta")
                {
                    // For meta tags the text of interest lives in the content attribute.
                    txtContent = tag.GetAttribute("content");
                    if (!string.IsNullOrEmpty(txtContent))
                    {
                        tagName = string.Empty;

                        if (!tag.HasAttribute("name"))
                        {
                            continue;
                        }

                        // Only the description and keywords meta tags are turned into models.
                        var name = tag.GetAttribute("name");
                        if (name.Equals("description") || name.Equals("keywords"))
                        {
                            tagName = $"Tag{Helper.FirstCharToUpper(tag.LocalName)}{Helper.FirstCharToUpper(name)}";
                        }
                        else
                        {
                            txtContent = null;
                        }
                    }
                }

                if (string.IsNullOrEmpty(txtContent))
                {
                    continue;
                }

                if (tag.LocalName != "meta")
                {
                    tagName = $"Tag{Helper.FirstCharToUpper(tag.LocalName)}";
                }

                var resultnode = new ResultNode {
                    NodeName = tagName, NodeValue = txtContent
                };

                // The language tag may lack a country part (e.g. "en") or be missing entirely;
                // only assign the parts that are actually present.
                if (!string.IsNullOrEmpty(lang))
                {
                    var langCountry = lang.Split('-');
                    resultnode.Lang = langCountry[0];
                    if (langCountry.Length > 1)
                    {
                        resultnode.Country = langCountry[1];
                    }
                }

                var vsModelCollection = VSMasterSupport.FillingVSMasterModel(resultnode, charArrayList, orderId);
                if (vsModelCollection != null && vsModelCollection.Count > 0)
                {
                    // Once the first models have been produced, later calls receive order id 0.
                    orderId = 0;
                    foreach (var vsModel in vsModelCollection)
                    {
                        vsMasterModel.Add(vsModel);
                    }
                }
            }
            #endregion

            #region BODY PARSE
            // The title was handled with the header, so it is skipped while walking the body.
            // skipItmes is shared state: add the entry only once so repeated calls do not grow it.
            // NOTE(review): because the entry persists, a later call also skips <title> in the
            // header pass - confirm whether that is intended.
            if (!skipItmes.Contains("title"))
            {
                skipItmes.Add("title");
            }
            tagName = string.Empty;

            var body = document.QuerySelector("body");
            // Guard against documents without a <body> element.
            if (body != null)
            {
                var language = (body as IHtmlElement)?.Language;
                if (language != null && language.Contains("-"))
                {
                    // Use the split parts, not the characters of the raw language string.
                    var langCountry = language.Split('-');
                    pageData.Lang    = langCountry[0];
                    pageData.Country = langCountry[1];
                }

                foreach (var node in body.ChildNodes)
                {
                    var IElenode = node as IElement;

                    // Ignore whitespace-only nodes that are not elements.
                    if (IElenode == null && string.IsNullOrWhiteSpace(node.TextContent))
                    {
                        continue;
                    }
                    if (skipItmes.Exists(x => x.Equals(node.NodeName, StringComparison.OrdinalIgnoreCase)))
                    {
                        continue;
                    }

                    ProcessNode(node, skipItmes, charArrayList, vsMasterModel, IElenode);
                }
            }
            #endregion

            if (vsMasterModel.Count > 0)
            {
                var csvString = ToCsv(vsMasterModel);
                File.WriteAllText("Output.csv", csvString);
            }
        }