Пример #1
0
        public static List <VSMasterModel> FillVSModel(ResultNode resultNode, List <char[]> charArrayList, int orderId, IElement IElenode)
        {
            var vsModelList = new List <VSMasterModel>();

            if (!string.IsNullOrEmpty(resultNode.NodeValue))
            {
                var vsModelCollection = VSMasterSupport.FillingVSMasterModel(resultNode, charArrayList, orderId);
                if (vsModelCollection != null && vsModelCollection.Count > 0)
                {
                    orderId = 0;
                    foreach (var vsModel in vsModelCollection)
                    {
                        if (IElenode != null)
                        {
                            if (IElenode.HasAttribute("href") && !IElenode.GetAttribute("href").StartsWith("#"))
                            {
                                vsModel.Link = IElenode.GetAttribute("href");
                            }
                            if (IElenode.HasAttribute("src") && !IElenode.GetAttribute("src").StartsWith("#"))
                            {
                                vsModel.Link = IElenode.GetAttribute("src");
                            }
                        }

                        vsModelList.Add(vsModel);
                    }

                    return(vsModelList);
                }
            }
            return(null);
        }
Пример #2
0
        public static void AddResultNodeToMasterModel(ResultNode node, ref List <VSMasterModel> vsMasterModel, List <char[]> charArrayList)
        {
            char resultchar;

            char.TryParse(node.NodeValue.Trim(), out resultchar);
            if (!char.IsPunctuation(resultchar))
            {
                vsMasterModel.AddRange(FillVSModel(node, charArrayList, orderId: 0, IElenode: null));
            }
        }
Пример #3
0
        public static List <VSMasterModel> FillingVSMasterModel(ResultNode resultnode, List <char[]> charArrayList, int orderId)
        {
            var vsModelList = new List <VSMasterModel>();

            // Model implimenting here...
            if (!string.IsNullOrEmpty(resultnode.NodeValue))
            {
                //var asciiEncoding = Encoding.Unicode;
                //var bytes = asciiEncoding.GetBytes(resultnode.NodeValue);
                //var txt = asciiEncoding.GetString(bytes);
                var words = resultnode.NodeValue.Split(new char[] { ' ' });

                foreach (var word in words)
                {
                    var vsModel      = new VSMasterModel();
                    var wordForModel = word;
                    for (int i = 0; i <= charArrayList.Count - 1; i++)
                    {
                        var newWord = word.Split(charArrayList[i]);
                        if (newWord.Length > 1)
                        {
                            if (lstPunctuation[i] == ".")
                            {
                                var startIndex         = 0;
                                var indexOfPunctuation = word.IndexOf(lstPunctuation[i]);
                                if (indexOfPunctuation != word.Length - 1)
                                {
                                    var nextWord = word[indexOfPunctuation + 1].ToStringCustom();
                                    if (!string.IsNullOrEmpty(nextWord))
                                    {
                                        vsModel.PunctuationMarkBefore = null;
                                        vsModel.PunctuationMarkAfter  = null;
                                        wordForModel = word;
                                        continue;
                                    }
                                }
                            }
                            if (newWord[1] == "") // PunctuationMarkBefore--> null
                            {
                                vsModel.PunctuationMarkBefore = null;
                                vsModel.PunctuationMarkAfter  = lstPunctuation[i].ToString();
                                wordForModel = newWord[0];
                                break;
                            }
                            else  // PunctuationMarkAfter--> null
                            {
                                vsModel.PunctuationMarkBefore = lstPunctuation[i].ToString();
                                vsModel.PunctuationMarkAfter  = null;
                                wordForModel = newWord[1];
                                break;
                            }
                        }
                    }

                    if (!string.IsNullOrEmpty(wordForModel))
                    {
                        vsModel.Word = wordForModel.Trim();
                        var charArray = wordForModel.ToCharArray();
                        if (char.IsUpper(charArray[0]))
                        {
                            vsModel.FirstLetterUppercase = AnkCommonFunctions.SetTrue;
                        }

                        var isAllUpper = Helper.IsAllUpper(wordForModel);
                        vsModel.AllInUpperCase = isAllUpper ? AnkCommonFunctions.SetTrue : AnkCommonFunctions.SetFalse;
                    }
                    else
                    {
                        continue;
                    }

                    if (vsModelProperties.Contains(resultnode.NodeName))
                    {
                        var value        = AnkCommonFunctions.SetTrue;
                        var propertyInfo = vsModel.GetType().GetProperty(resultnode.NodeName);
                        propertyInfo.SetValue(vsModel, value, null);
                    }
                    // for inherited Tags
                    if (resultnode.InheritedTags.Count > 0)
                    {
                        foreach (var inheritedTag in resultnode.InheritedTags)
                        {
                            if (vsModelProperties.Contains(inheritedTag))
                            {
                                var value        = AnkCommonFunctions.SetTrue;
                                var propertyInfo = vsModel.GetType().GetProperty(inheritedTag);
                                propertyInfo.SetValue(vsModel, value, null);
                            }
                        }
                    }

                    // set orderId...
                    vsModel.OrderId = OrderId++;
                    vsModel.Lang    = resultnode.Lang;
                    vsModel.Country = resultnode.Country;
                    vsModel.Link    = resultnode.Link;
                    vsModelList.Add(vsModel);
                }
            }
            return(vsModelList);
        }
Пример #4
0
        public static void ParseWithAngleSharp(string fileUrl)
        {
            var vsMasterModel = new List <VSMasterModel>();
            //var skipItmes = new List<string> { "body", "html", "head", "script", "#comment" };
            var orderId           = -1;
            var vsModelProperties = GetVSMasterModelProperties();
            var tagName           = string.Empty;

            // generating a list with punctuation mark
            AddPunctuationMarks();
            var charArrayList = lstPunctuation.Select(str => str.ToCharArray()).ToList();
            //We require a custom configuration
            var config = Configuration.Default.WithJavaScript().WithCss();
            //Let's create a new parser using this configuration
            //var parser = new HtmlParser(new HtmlParserOptions {IsEmbedded = true }, config);
            var parser = new HtmlParser(config);

            // https://github.com/glienard/html-to-sql/blob/master/htmlexample.html
            var source   = File.ReadAllText(fileUrl);
            var document = parser.Parse(source);
            var pageData = new Page();

            #region For All Page Data
            var robots = document.All.FirstOrDefault(x => x.LocalName == "meta" && x.GetAttribute("Name") == "robots");
            if ((robots as IHtmlMetaElement) != null)
            {
                var content = (robots as IHtmlMetaElement).Content;
                if (content.IndexOf("NoFollow", StringComparison.OrdinalIgnoreCase) != -1)
                {
                    pageData.NoFollow = true;
                }
                if (content.IndexOf("NoIndex", StringComparison.OrdinalIgnoreCase) != -1)
                {
                    pageData.NoIndex = true;
                }
                if (content.IndexOf("NoArchive", StringComparison.OrdinalIgnoreCase) != -1)
                {
                    pageData.NoArchive = true;
                }
                if (content.IndexOf("NoImageIndex", StringComparison.OrdinalIgnoreCase) != -1)
                {
                    pageData.NoImageIndex = true;
                }
                if (content.IndexOf("NoSnippet", StringComparison.OrdinalIgnoreCase) != -1)
                {
                    pageData.NoSnippet = true;
                }
            }
            #endregion

            #region For All Links Of Page
            var linkTags = document.All.Where(x => x.Attributes["href"] != null || x.Attributes["src"] != null);
            var frame    = document.QuerySelectorAll("frame");

            var lstLinks = new List <Links>();
            foreach (var link in linkTags)
            {
                var nofollow    = link.Attributes["rel"];
                var nofollowInt = nofollow?.Value == "nofollow" ? 1 : 0;
                lstLinks.Add(new Links
                {
                    Url       = link.Attributes["href"] == null ? link.Attributes["src"].Value : link.Attributes["href"].Value,
                    LinkType  = link.LocalName,
                    NotFollow = nofollowInt
                });
            }
            #endregion

            var html = document.All.FirstOrDefault(x => x.LocalName == "html");
            var lang = (html as IHtmlElement).Language;

            #region HEADER PARSE
            var heder           = document.All.Where(x => x.LocalName == "head");
            var headerTotalTags = parser.Parse(heder.FirstOrDefault().OuterHtml).All;
            foreach (var tag in headerTotalTags)
            {
                var outerHmtl = tag.OuterHtml;

                if (skipItmes.Contains(tag.LocalName))
                {
                    continue;
                }

                var newDoc     = parser.ParseFragment(tag.TextContent.ToString(), null);
                var txtContent = newDoc.FirstOrDefault().TextContent;

                if (tag.LocalName == "meta")
                {
                    var localName = Helper.FirstCharToUpper(tag.LocalName);
                    txtContent = tag.GetAttribute("content");
                    if (!string.IsNullOrEmpty(txtContent))
                    {
                        var criteria = false;
                        tagName = string.Empty;

                        if (!tag.HasAttribute("name"))
                        {
                            continue;
                        }
                        criteria = (tag.GetAttribute("name").Equals("description") || tag.GetAttribute("name").Equals("keywords"));
                        if (!criteria)
                        {
                            txtContent = null;
                        }
                        else
                        {
                            var name = tag.GetAttribute("name");
                            name    = Helper.FirstCharToUpper(name);
                            tagName = $"{"Tag"}{localName}{name}";
                        }
                    }
                }

                // Model for header implimenting here...
                if (!string.IsNullOrEmpty(txtContent))
                {
                    if (tag.LocalName != "meta")
                    {
                        var localName = Helper.FirstCharToUpper(tag.LocalName);
                        tagName = $"{"Tag"}{localName}";
                    }
                    var resultnode = new ResultNode {
                        NodeName = tagName, NodeValue = txtContent
                    };
                    var ielemnode = tag as IHtmlElement;
                    //var language1 =  ielemnode == null ? lang : (tag as IHtmlElement).Language;
                    //if (language1.Contains("-"))
                    //{
                    var langCountry = lang.Split('-');
                    resultnode.Lang    = langCountry[0].ToString();
                    resultnode.Country = langCountry[1].ToString();
                    //}
                    var vsModelCollection = VSMasterSupport.FillingVSMasterModel(resultnode, charArrayList, orderId);
                    if (vsModelCollection != null && vsModelCollection.Count > 0)
                    {
                        orderId = 0;
                        foreach (var vsModel in vsModelCollection)
                        {
                            vsMasterModel.Add(vsModel);
                        }
                    }
                }
            }
            #endregion

            #region BODY PARSE
            skipItmes.Add("title");
            tagName = string.Empty;

            var body     = document.QuerySelector("body");
            var language = (body as IHtmlElement).Language;
            if (language.Contains("-"))
            {
                var langCountry = language.Split('-');
                pageData.Lang    = language[0].ToString();
                pageData.Country = language[1].ToString();
            }

            var bodyChildNodes = body.ChildNodes;

            // new code
            foreach (var node in bodyChildNodes)
            {
                var vsModelCollection = new List <VSMasterModel>();
                var txtContent        = node.TextContent;
                var IElenode          = node as IElement;
                var resultNode        = new ResultNode();
                if (string.IsNullOrEmpty(txtContent.Trim()) && IElenode == null)
                {
                    continue;
                }
                if (skipItmes.FindIndex(x => x.Equals(node.NodeName, StringComparison.OrdinalIgnoreCase)) != -1)
                {
                    continue;
                }

                ProcessNode(node, skipItmes, charArrayList, vsMasterModel, IElenode);
            }
            #endregion

            if (vsMasterModel.Count > 0)
            {
                var words = vsMasterModel.Select(x => x.Word).ToList();
                var res   = string.Join(" ", words);

                var csvString = ToCsv(vsMasterModel);
                File.WriteAllText("Output.csv", csvString);
            }
        }
Пример #5
0
        public static ResultNode GetNodeWithValue1(INode node, IElement IElemNode, ref List <string> parentInheritedNode, string ParentLink)
        {
            var txtContent = node.TextContent;

            if (IElemNode != null)
            {
                txtContent = string.Join(" ", IElemNode.ChildNodes.OfType <IText>().Select(m => m.Text.Trim()));

                if (IElemNode.HasAttribute("label"))
                {
                    txtContent = IElemNode.GetAttribute("label");
                }

                if (string.IsNullOrEmpty(txtContent))
                {
                    if (IElemNode.HasAttribute("alt"))
                    {
                        txtContent = IElemNode.GetAttribute("alt");
                    }
                }
            }

            if (txtContent.Length == 1 && char.IsPunctuation(txtContent[0]))
            {
                return(null);
            }

            // TODO :- Need to handle with previous sibling...of lstPunctuation.
            var nextsib = node.NextSibling;

            if (nextsib != null)
            {
                var child = IElemNode == null ? null : IElemNode.ChildNodes;
                if (child == null || (child.Length == 0 || (child[child.Length - 1] as IHtmlElement) == null))
                {
                    var isPunctuation = lstPunctuation.Contains(nextsib.TextContent.Trim());
                    if (isPunctuation)
                    {
                        txtContent = $"{txtContent}{nextsib.TextContent.Trim()}";
                    }
                }
            }
            else if (node.ParentElement.LocalName != "body")
            {
                if (node == node.ParentElement.ChildNodes[node.ParentElement.ChildNodes.Length - 1])
                {
                    nextsib = node.ParentElement.NextSibling;
                    if (nextsib != null)
                    {
                        var isPunctuation = lstPunctuation.Contains(nextsib.TextContent.Trim());
                        if (isPunctuation)
                        {
                            txtContent = $"{txtContent}{nextsib.TextContent.Trim()}";
                        }
                    }
                }
            }
            // var prevSib = node.PreviousSibling;

            var nodeName = node.NodeName.ToLower();

            if (node.NodeName.Trim().Equals("SELECT") && IElemNode != null)
            {
                nodeName = IElemNode.TagName.ToLower();
            }

            var localName  = FirstCharToUpper(nodeName);
            var tagName    = $"{"Tag"}{localName}";
            var resultNode = new ResultNode {
                NodeName = tagName, NodeValue = txtContent
            };

            if (IElemNode != null && IElemNode.Style.Children.Count() > 0)
            {
                var txt = IElemNode.Style.CssText;
                resultNode.CssOrInheritedProperties = txt;
                // for now only adding display, need to add othe properties too
                if (IElemNode.Style.GetPropertyValue("display") == "none" || IElemNode.Style.GetPropertyValue("display") == "hidden")
                {
                    resultNode.InheritedTags.Add("TagHidden");
                }
            }

            if (parentInheritedNode != null)
            {
                resultNode.InheritedTags.UnionWith(parentInheritedNode);
            }

            if (parentInheritedNode == null)
            {
                parentInheritedNode = new List <string>();
            }
            parentInheritedNode.Add(tagName);

            if (resultNode.InheritedTags.Count > 0)
            {
                if (parentInheritedNode == null)
                {
                    parentInheritedNode = new List <string>();
                }
                parentInheritedNode.AddRange(resultNode.InheritedTags);
            }

            #region Set Language
            // TODO :- default language should be page language
            var language = IElemNode == null ? "en-gb" : (node as IHtmlElement).Language;
            if (language.Contains("-"))
            {
                var langCountry = language.Split('-');
                resultNode.Lang    = langCountry[0].ToString();
                resultNode.Country = langCountry[1].ToString();
            }
            #endregion

            #region Set Link
            var boolNofollow = false;
            if (IElemNode != null && IElemNode.HasAttribute("href") && !IElemNode.GetAttribute("href").StartsWith("#"))
            {
                resultNode.Link = IElemNode.GetAttribute("href");
                var nofollow = IElemNode.Attributes["rel"];
                boolNofollow = nofollow?.Value == "nofollow" ? true : false;
            }
            else if (IElemNode != null && IElemNode.HasAttribute("src") && !IElemNode.GetAttribute("src").StartsWith("#"))
            {
                resultNode.Link = IElemNode.GetAttribute("src");
                var nofollow = IElemNode.Attributes["rel"];
                boolNofollow = nofollow?.Value == "nofollow" ? true : false;
            }
            else
            {
                resultNode.Link = ParentLink;
            }

            #endregion

            if (boolNofollow)
            {
                resultNode.InheritedTags.Add("NoFollowLink");
            }

            return(resultNode);
        }
Пример #6
0
        public static void ProcessNode(INode Node, List <string> skipItmes, List <char[]> charArrayList, List <VSMasterModel> vsMasterModel, IElement IElemNode = null, HashSet <string> parentInheritedNode = null, string parentLink = null, IElement parentElement = null, string parentText = null, string parentLanguage = null)
        {
            var isParentNode = false;

            if (parentElement == null)
            {
                isParentNode = true;
            }
            if (Node.NodeName.ToLower() == "script")
            {
                var data = IElemNode.ChildNodes.OfType <IText>().Select(m => m.Text.Trim());
            }
            if (IElemNode != null)
            {
                if (parentElement == null)
                {
                    parentElement = IElemNode;
                }
                var textOfElement1 = string.Join("", IElemNode.ChildNodes.OfType <IText>().Select(m => m.Text.Trim()));
                if (string.IsNullOrEmpty(textOfElement1?.Trim()) && IElemNode.Children.Length == 0 && !(IElemNode.HasAttribute("label") || IElemNode.HasAttribute("alt")))
                {
                    return;
                }
            }

            var resultNode = GetNodeWithValue(Node, parentElement, ref parentInheritedNode, parentLink, parentText);

            if (resultNode?.NodeValue.Trim().ToLower() == "menu")
            {
            }
            if (resultNode != null && !isParentNode && !string.IsNullOrEmpty(resultNode.NodeValue?.Trim()) && IsContinueWord(resultNode.NodeValue, Node))
            {
                if (intermediateResultNode != null && !string.IsNullOrEmpty(intermediateResultNode.NodeValue.Trim()))
                {
                    intermediateResultNode.NodeValue = intermediateResultNode.NodeValue + resultNode.NodeValue;
                }
                else
                {
                    intermediateResultNode = resultNode;
                }
                resultNode = null;
            }

            if (IElemNode != null && (IElemNode.Children.Length > 0 || Node.ChildNodes.Length > 0))
            {
                if (resultNode.IsLabelValue) // is label value true, means this have some data in there alt or lable
                {
                    AddResultNodeToMasterModel(resultNode, ref vsMasterModel, charArrayList);
                }
                foreach (var INodeElem in Node.ChildNodes)
                {
                    //if ((INodeElem as IElement) == null) continue;
                    if (skipItmes.FindIndex(x => x.Equals(INodeElem.NodeName, StringComparison.OrdinalIgnoreCase)) != -1)
                    {
                        continue;
                    }
                    var lang = string.IsNullOrEmpty(resultNode.Lang) ? "" : $"{resultNode.Lang}-{resultNode.Country}";
                    ProcessNode(INodeElem, skipItmes, charArrayList, vsMasterModel, INodeElem as IElement, parentInheritedNode, resultNode.Link, parentElement, "", lang);
                }
            }
            else
            {
                if (resultNode != null && !string.IsNullOrEmpty(resultNode.NodeValue.Trim()) && intermediateResultNode == null)
                {
                    AddResultNodeToMasterModel(resultNode, ref vsMasterModel, charArrayList);
                }
                else if (intermediateResultNode != null && !string.IsNullOrEmpty(intermediateResultNode.NodeValue.Trim()) && intermediateResultNode != null && IsLastSibling(Node, parentElement))
                {
                    AddResultNodeToMasterModel(intermediateResultNode, ref vsMasterModel, charArrayList);
                    intermediateResultNode = null;
                    if (resultNode != null && !string.IsNullOrEmpty(resultNode.NodeValue?.Trim())) // add result node too, this can be a case when the sibling doesnt follows the continue word rule
                    {
                        AddResultNodeToMasterModel(resultNode, ref vsMasterModel, charArrayList);
                    }
                }
            }
        }