/// <summary>
/// Parses a markup string into a <see cref="JObject"/> tree by repeatedly calling
/// <c>ParseNode</c> on the top-level content.
/// </summary>
/// <remarks>
/// Before parsing, <c>&lt;script&gt;</c> blocks, <c>&lt;style&gt;</c> blocks and
/// <c>&lt;!-- --&gt;</c> comments are removed from the text.
/// </remarks>
/// <param name="html">The markup text to parse.</param>
/// <returns>
/// The populated root object, or <c>null</c> when the stripped text, after
/// <c>TrimStart()</c>, does not start with <c>&lt;</c>.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="html"/> is <c>null</c>.</exception>
public JObject ParseDocument(string html)
{
    // Same exception type Regex.Replace would raise for null input, but with the
    // caller-facing parameter name.
    if (html == null)
    {
        throw new ArgumentNullException("html");
    }

    // Singleline: '.' also matches '\n', so multi-line comments and opening tags whose
    //             attributes wrap across lines are stripped too (and the costly
    //             "(.|\n)*?" alternation is no longer needed).
    // IgnoreCase + CultureInvariant: tag names are matched regardless of letter case,
    //             independent of the current culture's casing rules.
    const System.Text.RegularExpressions.RegexOptions stripOptions =
        System.Text.RegularExpressions.RegexOptions.Singleline
        | System.Text.RegularExpressions.RegexOptions.IgnoreCase
        | System.Text.RegularExpressions.RegexOptions.CultureInvariant;

    html = System.Text.RegularExpressions.Regex.Replace(
        html,
        "<script.*?>.*?</script>|<style.*?>.*?</style>|<!--.*?-->",
        string.Empty,
        stripOptions);

    TextNavigator sc = new TextNavigator(html);
    sc.TrimStart();
    if (!sc.StartsWith("<"))
    {
        return null;
    }

    JObject obj = new JObject();
    while (!sc.IsAtEnd)
    {
        var indexBefore = sc.Index;
        var lengthBefore = sc.Value.Length;

        this.ParseNode(sc, obj);

        // Defensive: if the call left the navigator exactly where it was, the next call
        // would see the same state again, so stop instead of looping forever.
        if (sc.Index == indexBefore && sc.Value.Length == lengthBefore)
        {
            break;
        }
    }

    return obj;
}
private KeyValuePair<string, JToken> ParseNode(TextNavigator tn, JObject parent) { tn.TrimStart(); if (tn.StartValue == '<') { string name = null; JToken node = null; bool isComment = false; bool isDeclaration = false; #region "Start Tag: Name, Declaration, Comment" tn.Index = tn.Index + 1; int breakOff = 0; if (tn.StartsWith("?xml")) { //Declaration isDeclaration = true; tn.Index += 4; } else if (tn.StartValue == '!') { //Comment isComment = true; if (tn.Value[tn.Index + 1] == '-') breakOff = 1; if (tn.Value[tn.Index + 2] == '-') breakOff = 2; tn.Index += breakOff + 1; tn.TrimStart(); } else if (IsValidTagChar(tn.StartValue)) { //Name name = ParseName(tn); if (name != null) { node = new JObject(); } else { throw new Exception("Invalid Node Name."); } } else { throw new Exception("Invalid Node Name."); } #endregion if (node != null || isComment || isDeclaration) { var attNames = new List<string>(); bool elementAtEnd = false; if (name == "br") { elementAtEnd = true; node = null; } #region "Attributes, Declaration, Comment" //Declaration, Attributes, Comment string comment = string.Empty; string declVer = string.Empty; string declEnc = string.Empty; string declSta = string.Empty; for (int i = tn.Index; i < tn.Value.Length; i++) { if (!isComment && !isDeclaration) { #region "Attributes" //Attributes if (tn.Value[i] != ' ') { if (tn.Value[i] == '>') { tn.Index = i + 1; break; } else if (tn.Value[i] == '/' && tn.Value[i + 1] == '>') { elementAtEnd = true; tn.Index += 2; break; } else if (IsValidTagChar(tn.Value[i])) { JProperty att = ParseAttribute(tn.NewIndex(i)); i = tn.Index - 1; if (att != null && node != null) { ((JObject)node).Add(att); attNames.Add(att.Name); } } } #endregion } else { #region "Comment" if (isComment) { //Comment if ((breakOff == 2 && tn.Value[i] == '-' && tn.Value[i + 1] == '-' && tn.Value[i + 2] == '>') || (breakOff == 0 && tn.Value[i] == '>') || (breakOff == 1 && tn.Value[i] == '-' && tn.Value[i + 1] == '>')) { //if (parent != null) 
parent.Add(new XComment(comment)); tn.Index = i + breakOff + 1; break; } else { comment += tn.Value[i]; }/**/ } #endregion #region"Declaration" else if (isDeclaration) { //Declaration if (tn.Value[i] == '?' && tn.Value[i + 1] == '>') { //if (parent != null && parent is XDocument) this.SetDeclaration(declVer, declEnc, declSta, (XDocument)parent); tn.Index = i + 2; break; } else if (IsValidTagChar(tn.Value[i])) { JProperty att = ParseAttribute(tn.NewIndex(i)); i = tn.Index - 1; /* if (att != null) { if (att.Name.ToLower() == "version") { declVer = att.Value; } else if (att.Name.ToLower() == "encoding") { declEnc = att.Value; } else if (att.Name.ToLower() == "standalone") { declSta = att.Value; } }*/ } } #endregion } } #endregion if (name == "link") { elementAtEnd = true; } ///Add to parent if (node != null && parent != null) { this.AddItemToParent(node, name, parent); } if (node != null && elementAtEnd == false) { #region "Content & End Tag" //Content & End Tag string innerText = string.Empty; for (int i = tn.Index; i < tn.Value.Length; i++) { if (tn.Value[i] == '<') { if (tn.Value[i + 1] == '/') { #region "InnerText" //InnerText --> JValue if (innerText.Trim(new char[] { ' ', '\n', '\r', '\t' }).IsNullOrWhiteSpace() == false) { if (((JObject)node).Count == 0) { var newValue = new JValue(this.DecodeXml(innerText)); foreach (JProperty elem in parent.Children<JProperty>()) { if (elem.Name == name) { if (elem.Value is JObject) { elem.Value = newValue; } else if (elem.Value is JArray) { var arr = (JArray)elem.Value; int index = arr.IndexOf(node); if (index >= 0) { arr.Insert(index, newValue); arr.RemoveAt(index + 1); } } } } node = newValue; } else { if (node[CONTENT_PROPERTY_NAME] == null) { ((JObject)node).Add(new JProperty(CONTENT_PROPERTY_NAME, DecodeXml(innerText))); } else { ((JValue)node[CONTENT_PROPERTY_NAME]).Value = ((JValue)node[CONTENT_PROPERTY_NAME]).Value.ToString() + "\n" + DecodeXml(innerText); } } innerText = string.Empty; } #endregion #region "End Tag" 
//End Tag string endName = ParseName(tn.NewIndex(i + 2)); if (endName != null) { if (endName == name) { //Correct actual end name tn.Index = i + 3 + name.Length; break; } else { //Other end name JObject pare = this.FindParent(node, endName); if (pare != null) { //Other end name relies to one parent --> move all, except attributes, to direct parent. if (node is JObject) { this.MoveAllChildElementsToParent((JObject)node, parent, attNames); } tn.Index = i; break; } else { //Unknown end name --> ignore tn.Index = i + endName.ToString().Length + 2; i = tn.Index - 1; } } } else { throw new Exception("Invalid End Name."); } #endregion } else { //Child Start Tag var child = this.ParseNode(tn.NewIndex(i), (JObject)node); if (child.Key == "br") { innerText += "\n"; } else if (child.Key != "a" && child.Key != "small") { if (innerText.IsNullOrWhiteSpace() == false) { JObject p = new JObject(); p.Add(new JProperty(CONTENT_PROPERTY_NAME, innerText)); innerText = string.Empty; this.AddItemToParent(p, "p", (JObject)node); } } i = tn.Index - 1; } } else if (!(tn.Value[i] == ' ' && innerText == string.Empty)) { //Inner Text innerText += tn.Value[i]; } } #endregion } } tn.TrimStart(); return new KeyValuePair<string, JToken>(name, node); }