/// <summary>
/// Starts loading a URL on a new background thread, replacing any navigation
/// that is still in progress.
/// </summary>
/// <param name="_Url">
/// Absolute URL to load. A null or malformed value starts no navigation; it is
/// reported by setting <c>IsError</c> and <c>workingdownload</c> to true.
/// </param>
public void Navigate(string _Url)
{
    // TryCreate avoids using an exception for the expected "bad URL" case and
    // also rejects null, which the Uri constructor would throw on.
    Uri _uri;
    if (!Uri.TryCreate(_Url, UriKind.Absolute, out _uri))
    {
        IsError = true;
        workingdownload = true;
        // Throw Incorrect Url
        return;
    }

    // ThreadState is a flags enum, so comparing it to Running with == misses a
    // live thread that is blocked or has other flags set. IsAlive covers every
    // started-but-not-finished state.
    if (curNavigate != null && curNavigate.IsAlive)
    {
        // NOTE(review): Thread.Abort is only supported on .NET Framework; it
        // throws PlatformNotSupportedException on .NET Core / .NET 5+.
        curNavigate.Abort();

        // Wait for the old worker to actually finish so it cannot overwrite
        // the state that is reset below. Join blocks without burning CPU.
        curNavigate.Join();
    }

    IsError = false;
    workingUrl = _uri;
    workingPage = null;
    workingdownload = false;
    retry = 0;

    curNavigate = new Thread(new ThreadStart(DoNavigate));
    curNavigate.Start();
}
/// <summary>
/// Thread entry point that requests <c>workingUrl</c> and publishes the outcome
/// through the shared navigation state.
/// </summary>
/// <remarks>
/// A response whose content type contains "text/html" is read completely,
/// converted with <c>HtmlReader.GetDocument</c> and stored in <c>workingPage</c>.
/// Any other response is handed to <c>DownloadQueue</c> unread. Every failure
/// path sets both <c>IsError</c> and <c>workingdownload</c> to true. The method
/// re-runs itself when the response was disposed before its stream could be
/// opened, giving up once <c>retry</c> exceeds 5.
/// </remarks>
static void DoNavigate()
{
    if (++retry > 5)
    {
        // Retries exhausted: report it the same way the other failure paths
        // below do instead of returning without any signal.
        IsError = true;
        workingdownload = true;
        return;
    }

    HttpWebRequest.DefaultWebProxy = null;

    HttpWebResponse wr = null;
    try
    {
        // Creating the request is inside the try as well: a URI with a
        // non-HTTP scheme makes Create throw or the cast fail, and an
        // unhandled exception on this thread would take the process down.
        HttpWebRequest wq = (HttpWebRequest)HttpWebRequest.Create(workingUrl);
        wr = (HttpWebResponse)wq.GetResponse();
    }
    catch (Exception)
    {
        IsError = true;
        workingdownload = true;
        // Throw Unable to Connect
        return;
    }

    //If it is not a html page, try to download as file.
    // Media type names are case-insensitive, so compare without regard to case.
    string contentType = wr.ContentType ?? string.Empty;
    if (contentType.IndexOf("text/html", StringComparison.OrdinalIgnoreCase) < 0)
    {
        // The response is passed on still open; it is not disposed here.
        DownloadQueue.Enqueue(wr);
        return;
    }

    bool connectionLost = false;
    try
    {
        // From here on the response is consumed locally, so release it (and
        // its connection) once the page has been read.
        using (wr)
        {
            Stream wrs = null;
            try
            {
                wrs = wr.GetResponseStream();
            }
            catch (ObjectDisposedException)
            {
                connectionLost = true;
            }

            if (!connectionLost)
            {
                string html;
                using (StreamReader sr = new StreamReader(wrs))
                {
                    html = sr.ReadToEnd();
                }

                workingPage = HtmlReader.GetDocument(html);
                workingdownload = true;
            }
        }
    }
    catch (Exception)
    {
        IsError = true;
        workingdownload = true;
        // Error occured.
        return;
    }

    if (connectionLost)
    {
        // If the connection was disconnected, Do Again.
        // Retried only after the dead response has been released above.
        DoNavigate();
    }
}
/// <summary>
/// Tokenize an html document.
/// </summary>
/// <remarks>
/// Scans the string once and splits it into Text, Tag and Comment tokens, each
/// converted with <c>GetElement</c> and appended to the returned document in
/// source order. Quoted strings inside a tag are skipped over so that a
/// '&lt;' or '&gt;' inside an attribute value does not end the tag. Whatever
/// remains unterminated at the end of the input is emitted as Text.
/// </remarks>
/// <param name="HtmlString">A html document string</param>
/// <returns>Tokenized html document</returns>
public static Document Tokenize(string HtmlString)
{
    HtmlTag.Initialize();

    Document doc = new Document();
    TokenType type = TokenType.Text;     // What the scanner is currently inside of
    TokenType prevtype = TokenType.Text; // State to restore when a quoted string closes
    bool quote = false;                  //True:" , False:'
    int sp = 0;                          // Start index of the token being scanned

    for (int pos = 0; pos < HtmlString.Length; ++pos)
    {
        switch (HtmlString[pos])
        {
            case '<':
                // If it is in a string or comment, ignore
                if (type == TokenType.String || type == TokenType.Comment)
                {
                    break;
                }

                // If it is in a tag, actually previous one is not a tag. so change previous one as Text
                if (type == TokenType.Tag)
                {
                    type = TokenType.Text;
                    --pos; // Look at this '<' again, now in the Text state
                    break;
                }

                // Decide what, if anything, this '<' opens. Every look-ahead is
                // guarded with '<' so a '<' at the very end of the input falls
                // through to plain text instead of indexing past the string.
                TokenType opened;
                if (pos + 1 < HtmlString.Length
                    && IsTagStartPossibleCharacter(HtmlString[pos + 1]))
                {
                    //If it is possible to be a tag
                    opened = TokenType.Tag;
                }
                else if (pos + 4 < HtmlString.Length
                    && HtmlString[pos + 1] == '!'
                    && HtmlString[pos + 2] == '-'
                    && HtmlString[pos + 3] == '-')
                {
                    //If it is a comment
                    opened = TokenType.Comment;
                }
                else if (pos + 2 < HtmlString.Length
                    && (HtmlString[pos + 1] == '/' || HtmlString[pos + 1] == '!')
                    && IsTagStartPossibleCharacter(HtmlString[pos + 2]))
                {
                    // If it is possible to be an end or special tag
                    opened = TokenType.Tag;
                }
                else
                {
                    // Otherwise, Process as Text
                    type = TokenType.Text;
                    break;
                }

                // Add the text collected before the new token, if there is any.
                if (sp <= pos - 1)
                {
                    doc.Items.Add(GetElement(TokenType.Text, HtmlString.Substring(sp, pos - sp)));
                }

                type = opened;
                sp = pos;
                break;

            case '>':
                if (type == TokenType.Tag)
                {
                    doc.Items.Add(GetElement(TokenType.Tag, HtmlString.Substring(sp, pos - sp + 1)));
                    sp = pos + 1;
                    type = TokenType.Text;
                }
                else if (type == TokenType.Comment)
                {
                    // If it is the end of a comment
                    if (pos - 2 > 0 && HtmlString[pos - 2] == '-' && HtmlString[pos - 1] == '-')
                    {
                        doc.Items.Add(GetElement(TokenType.Comment, HtmlString.Substring(sp, pos - sp + 1)));
                        sp = pos + 1;
                        type = TokenType.Text;
                    }
                }
                break;

            case '\'':
            case '"':
                switch (type)
                {
                    case TokenType.String:
                        // Only the same kind of quote that opened the string closes it.
                        if ((quote && HtmlString[pos] == '\'') || (!quote && HtmlString[pos] == '"'))
                        {
                            break;
                        }
                        type = prevtype;
                        break;

                    case TokenType.Tag:
                        prevtype = type;
                        quote = (HtmlString[pos] == '"');
                        type = TokenType.String;
                        break;
                }
                break;
        }
    }

    // Whatever was not closed by the end of the input is kept as text.
    if (sp != HtmlString.Length)
    {
        doc.Items.Add(GetElement(TokenType.Text, HtmlString.Substring(sp, HtmlString.Length - sp)));
    }

    return doc;
}
/// <summary>
/// Writes a document back out as html text.
/// </summary>
/// <remarks>
/// Items are written in order. A Text item contributes its Name as-is. Special,
/// Object, Structure and Markup items are written as tags that differ only in
/// the line breaks placed around them. Items of any other type are skipped.
/// Attribute values are written between double quotes without any escaping.
/// </remarks>
/// <param name="doc">Document whose items are serialized</param>
/// <returns>The generated html string</returns>
public static string MakeHtml(Document doc)
{
    StringBuilder output = new StringBuilder();

    foreach (var element in doc.Items)
    {
        // Text placed before '<' ... and after '>' for this kind of element.
        string opening;
        string closing;

        switch (element.Type)
        {
            case Element.ElementType.Text:
                output.Append(element.Name);
                continue;

            case Element.ElementType.Special:
            case Element.ElementType.Object:
                opening = "<";
                closing = ">\n";
                break;

            case Element.ElementType.Structure:
                opening = "\n<";
                closing = ">\n";
                break;

            case Element.ElementType.Markup:
                opening = "<";
                closing = ">";
                break;

            default:
                continue;
        }

        output.Append(opening);
        if (!element.IsStartTag)
        {
            output.Append('/');
        }
        output.Append(element.Name);

        foreach (var attribute in element.Attributes.Items.Values)
        {
            output.Append(' ');
            output.Append(attribute.Name);

            // An attribute with an empty value is written as a bare name.
            if (attribute.Value != "")
            {
                output.Append("=\"");
                output.Append(attribute.Value);
                output.Append("\"");
            }
        }

        output.Append(closing);
    }

    return output.ToString();
}
/// <summary>
/// Parse a tokenized html document.
/// </summary>
/// <remarks>
/// Copies the items of <paramref name="html"/> into a new document, dropping
/// every item that the <c>HtmlTag.TagMap</c> entry of the innermost open
/// Structure element does not list among its Children. Special and Unknown
/// items are always dropped. End tags are kept only when they match the
/// innermost open element of their kind. Markup and Structure elements still
/// open at the end of the input get a generated end tag.
/// </remarks>
/// <param name="html">A tokenized html document</param>
/// <returns>Parsed html document</returns>
public static Document ParseHtml(Document html)
{
    Document doc = new Document();
    HtmlTag.Initialize();

    Stack<Element> stk = new Stack<Element>();                        // Open Structure elements
    Stack<HtmlTag.Entity> markup = new Stack<HtmlTag.Entity>();       // Open Markup tags

    foreach (Element i in html.Items)
    {
        Element top = stk.Count > 0 ? stk.Peek() : null;

        switch (i.Type)
        {
            case Element.ElementType.Special:
                break;

            case Element.ElementType.Structure:
                if (i.IsStartTag)
                {
                    if (top != null && !HtmlTag.TagMap[top.Name].Children.Contains(HtmlTag.TagMap[i.Name]))
                    {
                        break;
                    }

                    // NOTE(review): the entity's IsStartTag presumably marks tags
                    // that take an end tag; only those are tracked as open - confirm.
                    if (HtmlTag.TagMap[i.Name].IsStartTag)
                    {
                        stk.Push(i);
                    }
                    doc.Items.Add(i);
                }
                else
                {
                    if (!HtmlTag.TagMap[i.Name].IsStartTag)
                    {
                        break;
                    }

                    // An end tag is accepted only for the innermost open element.
                    if (top != null && top.Name == i.Name)
                    {
                        stk.Pop();
                        doc.Items.Add(i);
                    }
                }
                break;

            case Element.ElementType.Markup:
                if (i.IsStartTag)
                {
                    if (top != null && !HtmlTag.TagMap[top.Name].Children.Contains(HtmlTag.TagMap[i.Name]))
                    {
                        break;
                    }

                    markup.Push(HtmlTag.TagMap[i.Name]);
                    doc.Items.Add(i);
                }
                else
                {
                    // Check Count first: an end tag with no open markup tag must
                    // be dropped, not make Peek throw on the empty stack.
                    if (markup.Count > 0 && markup.Peek() == HtmlTag.TagMap[i.Name])
                    {
                        markup.Pop();
                        doc.Items.Add(i);
                    }
                }
                break;

            case Element.ElementType.Object:
                if (top != null && !HtmlTag.TagMap[top.Name].Children.Contains(HtmlTag.TagMap[i.Name]))
                {
                    break;
                }
                doc.Items.Add(i);
                break;

            case Element.ElementType.Text:
                // Text is allowed where the open element accepts "CDATA" children.
                if (top != null && !HtmlTag.TagMap[top.Name].Children.Contains(HtmlTag.TagMap["CDATA"]))
                {
                    break;
                }
                doc.Items.Add(i);
                break;

            case Element.ElementType.Unknown:
                break;
        }
    }

    //Auto Recovery for unmarked elements =_=;;
    // Close whatever is still open, innermost first: markup tags, then structure.
    while (markup.Count != 0)
    {
        HtmlTag.Entity unclosed = markup.Pop();
        doc.Items.Add(new Element(unclosed.Name, Element.ElementType.Markup, false));
    }

    while (stk.Count != 0)
    {
        Element unclosed = stk.Pop();
        doc.Items.Add(new Element(unclosed.Name, unclosed.Type, false));
    }

    return doc;
}