Example #1
0
 /// <summary>
 /// Parses a markup string into a <see cref="JObject"/> by repeatedly calling
 /// <c>ParseNode</c> until the navigator reports the end of the text.
 /// </summary>
 /// <remarks>
 /// Script blocks, style blocks and comments are removed from the input before
 /// parsing. Removal is case-insensitive and spans line breaks, so upper-case
 /// tags, multi-line comments and opening tags whose attributes wrap onto
 /// several lines are stripped as well.
 /// </remarks>
 /// <param name="html">The markup text to parse.</param>
 /// <returns>
 /// The root object filled by <c>ParseNode</c>, or <c>null</c> when the stripped
 /// and trimmed text does not start with <c>&lt;</c>.
 /// </returns>
 /// <exception cref="System.ArgumentNullException">
 /// <paramref name="html"/> is <c>null</c>.
 /// </exception>
 public JObject ParseDocument(string html)
 {
     // Regex.Replace would throw the same exception type for null input;
     // raising it here reports the caller's own parameter name.
     if (html == null)
     {
         throw new System.ArgumentNullException("html");
     }

     // Singleline makes '.' match '\n', so one lazy ".*?" covers content that
     // spans lines (previously "<!--.*?-->" and the opening-tag part could not).
     // IgnoreCase also removes <SCRIPT>/<STYLE> blocks, whose raw content could
     // otherwise reach ParseNode as if it were markup.
     html = System.Text.RegularExpressions.Regex.Replace(
         html,
         "<script.*?>.*?</script>|<style.*?>.*?</style>|<!--.*?-->",
         string.Empty,
         System.Text.RegularExpressions.RegexOptions.Singleline
             | System.Text.RegularExpressions.RegexOptions.IgnoreCase);

     TextNavigator sc = new TextNavigator(html);
     sc.TrimStart();

     // Nothing that looks like a tag at the start: report "no document".
     if (!sc.StartsWith("<"))
     {
         return null;
     }

     JObject obj = new JObject();
     while (!sc.IsAtEnd)
     {
         // NOTE(review): this loop relies on ParseNode advancing the navigator
         // on every call - confirm for input that has text after the last tag.
         this.ParseNode(sc, obj);
     }
     return obj;
 }
Example #2
0
        private KeyValuePair<string, JToken> ParseNode(TextNavigator tn, JObject parent)
        {
            tn.TrimStart();
            if (tn.StartValue == '<')
            {
                string name = null;
                JToken node = null;
                bool isComment = false;
                bool isDeclaration = false;

                #region "Start Tag: Name, Declaration, Comment"
                tn.Index = tn.Index + 1;
                int breakOff = 0;
                if (tn.StartsWith("?xml"))
                {
                    //Declaration
                    isDeclaration = true;
                    tn.Index += 4;
                }
                else if (tn.StartValue == '!')
                {
                    //Comment
                    isComment = true;
                    if (tn.Value[tn.Index + 1] == '-') breakOff = 1;
                    if (tn.Value[tn.Index + 2] == '-') breakOff = 2;
                    tn.Index += breakOff + 1;
                    tn.TrimStart();
                }
                else if (IsValidTagChar(tn.StartValue))
                {
                    //Name
                    name = ParseName(tn);
                    if (name != null) { node = new JObject(); }
                    else { throw new Exception("Invalid Node Name."); }
                }
                else
                {
                    throw new Exception("Invalid Node Name.");
                }
                #endregion

                if (node != null || isComment || isDeclaration)
                {
                    var attNames = new List<string>();
                    bool elementAtEnd = false;

                    if (name == "br") { elementAtEnd = true; node = null; }

                    #region "Attributes, Declaration, Comment"
                    //Declaration, Attributes, Comment

                    string comment = string.Empty;
                    string declVer = string.Empty;
                    string declEnc = string.Empty;
                    string declSta = string.Empty;
                    for (int i = tn.Index; i < tn.Value.Length; i++)
                    {
                        if (!isComment && !isDeclaration)
                        {
                            #region "Attributes"
                            //Attributes
                            if (tn.Value[i] != ' ')
                            {
                                if (tn.Value[i] == '>')
                                {
                                    tn.Index = i + 1;
                                    break;
                                }
                                else if (tn.Value[i] == '/' && tn.Value[i + 1] == '>')
                                {
                                    elementAtEnd = true;
                                    tn.Index += 2;
                                    break;
                                }
                                else if (IsValidTagChar(tn.Value[i]))
                                {
                                    JProperty att = ParseAttribute(tn.NewIndex(i));
                                    i = tn.Index - 1;
                                    if (att != null && node != null) { ((JObject)node).Add(att); attNames.Add(att.Name); }
                                }
                            }
                            #endregion
                        }
                        else
                        {
                            #region "Comment"
                            if (isComment)
                            {
                                //Comment
                                if ((breakOff == 2 && tn.Value[i] == '-' && tn.Value[i + 1] == '-' && tn.Value[i + 2] == '>') || (breakOff == 0 && tn.Value[i] == '>') || (breakOff == 1 && tn.Value[i] == '-' && tn.Value[i + 1] == '>'))
                                {
                                    //if (parent != null) parent.Add(new XComment(comment));
                                    tn.Index = i + breakOff + 1;
                                    break;
                                }
                                else
                                {
                                    comment += tn.Value[i];
                                }/**/
                            }
                            #endregion
                            #region"Declaration"
                            else if (isDeclaration)
                            {
                                //Declaration
                                if (tn.Value[i] == '?' && tn.Value[i + 1] == '>')
                                {
                                    //if (parent != null && parent is XDocument) this.SetDeclaration(declVer, declEnc, declSta, (XDocument)parent);
                                    tn.Index = i + 2;
                                    break;
                                }
                                else if (IsValidTagChar(tn.Value[i]))
                                {
                                    JProperty att = ParseAttribute(tn.NewIndex(i));
                                    i = tn.Index - 1;
                                    /* if (att != null)
                                     {
                                         if (att.Name.ToLower() == "version") { declVer = att.Value; }
                                         else if (att.Name.ToLower() == "encoding") { declEnc = att.Value; }
                                         else if (att.Name.ToLower() == "standalone") { declSta = att.Value; }
                                     }*/
                                }
                            }
                            #endregion
                        }
                    }

                    #endregion

                    if (name == "link") { elementAtEnd = true; }

                    ///Add to parent
                    if (node != null && parent != null)
                    {
                        this.AddItemToParent(node, name, parent);
                    }


                    if (node != null && elementAtEnd == false)
                    {
                        #region "Content & End Tag"
                        //Content & End Tag

                        string innerText = string.Empty;
                        for (int i = tn.Index; i < tn.Value.Length; i++)
                        {
                            if (tn.Value[i] == '<')
                            {
                                if (tn.Value[i + 1] == '/')
                                {
                                    #region "InnerText"
                                    //InnerText --> JValue
                                    if (innerText.Trim(new char[] { ' ', '\n', '\r', '\t' }).IsNullOrWhiteSpace() == false)
                                    {
                                        if (((JObject)node).Count == 0)
                                        {
                                            var newValue = new JValue(this.DecodeXml(innerText));
                                            foreach (JProperty elem in parent.Children<JProperty>())
                                            {
                                                if (elem.Name == name)
                                                {
                                                    if (elem.Value is JObject)
                                                    {
                                                        elem.Value = newValue;
                                                    }
                                                    else if (elem.Value is JArray)
                                                    {
                                                        var arr = (JArray)elem.Value;
                                                        int index = arr.IndexOf(node);
                                                        if (index >= 0)
                                                        {
                                                            arr.Insert(index, newValue);
                                                            arr.RemoveAt(index + 1);
                                                        }
                                                    }
                                                }
                                            }
                                            node = newValue;
                                        }
                                        else
                                        {
                                            if (node[CONTENT_PROPERTY_NAME] == null)
                                            {
                                                ((JObject)node).Add(new JProperty(CONTENT_PROPERTY_NAME, DecodeXml(innerText)));
                                            }
                                            else
                                            {
                                                ((JValue)node[CONTENT_PROPERTY_NAME]).Value = ((JValue)node[CONTENT_PROPERTY_NAME]).Value.ToString() + "\n" + DecodeXml(innerText);
                                            }
                                        }
                                        innerText = string.Empty;
                                    }
                                    #endregion

                                    #region "End Tag"

                                    //End Tag
                                    string endName = ParseName(tn.NewIndex(i + 2));
                                    if (endName != null)
                                    {
                                        if (endName == name)
                                        {
                                            //Correct actual end name
                                            tn.Index = i + 3 + name.Length;
                                            break;
                                        }
                                        else
                                        {
                                            //Other end name
                                            JObject pare = this.FindParent(node, endName);
                                            if (pare != null)
                                            {
                                                //Other end name relies to one parent --> move all, except attributes, to direct parent.
                                                if (node is JObject)
                                                {
                                                    this.MoveAllChildElementsToParent((JObject)node, parent, attNames);
                                                }
                                                tn.Index = i;
                                                break;
                                            }
                                            else
                                            {
                                                //Unknown end name --> ignore
                                                tn.Index = i + endName.ToString().Length + 2;
                                                i = tn.Index - 1;
                                            }
                                        }
                                    }
                                    else
                                    {
                                        throw new Exception("Invalid End Name.");
                                    }
                                    #endregion
                                }
                                else
                                {
                                    //Child Start Tag
                                    var child = this.ParseNode(tn.NewIndex(i), (JObject)node);

                                    if (child.Key == "br") { innerText += "\n"; }
                                    else if (child.Key != "a" && child.Key != "small")
                                    {
                                        if (innerText.IsNullOrWhiteSpace() == false)
                                        {
                                            JObject p = new JObject();
                                            p.Add(new JProperty(CONTENT_PROPERTY_NAME, innerText));

                                            innerText = string.Empty;
                                            this.AddItemToParent(p, "p", (JObject)node);
                                        }
                                    }
                                    i = tn.Index - 1;
                                }
                            }
                            else if (!(tn.Value[i] == ' ' && innerText == string.Empty))
                            {
                                //Inner Text
                                innerText += tn.Value[i];
                            }
                        }

                        #endregion
                    }

                }
                tn.TrimStart();
                return new KeyValuePair<string, JToken>(name, node);
            }