/// <summary>
/// Fills <c>LocationDictionary</c> (country -> state -> city -> link) from every
/// "colmask" div that <paramref name="parent"/> returns for the class filter.
/// </summary>
/// <remarks>
/// The tags of each column are handled in the order the filter returns them:
/// an anchor with a "name" attribute starts a new country, a "state_delimiter"
/// div starts a new state, and any other anchor is stored as a city together
/// with its "href". The current country and state carry over from one column
/// to the next.
/// </remarks>
/// <param name="parent">Tag that is queried for the "colmask" divs.</param>
private void ParseCountries(HtmlTag parent)
{
    List<HtmlTag> tagList = null;
    parent.FilterForChildrenByNameAndAttribute("div", new KeyValuePair<string, string>("class", "colmask"), out tagList);
    if (tagList == null)
        return;

    // Search specification: anchors by their "name"/"href" attributes (any value),
    // divs by class "state_delimiter".
    Dictionary<string, List<KeyValuePair<string, string>>> searchTagList = new Dictionary<string, List<KeyValuePair<string, string>>>();
    searchTagList["a"] = new List<KeyValuePair<string, string>>()
    {
        new KeyValuePair<string, string>("name", "*"),
        new KeyValuePair<string, string>("href", "*")
    };
    searchTagList["div"] = new List<KeyValuePair<string, string>>()
    {
        new KeyValuePair<string, string>("class", "state_delimiter")
    };

    string currentCountry = "";
    string currentState = "";

    foreach (HtmlTag child in tagList)
    {
        List<HtmlTag> stateList = null;
        child.FilterForChildrenByNameAndAttribute(searchTagList, out stateList);
        if (stateList == null)
            continue;

        foreach (HtmlTag stateChild in stateList)
        {
            if (stateChild.Name == "a")
            {
                string anchorName;
                if (stateChild.Attributes.TryGetValue("name", out anchorName))
                {
                    // Country: translate the section anchor into its display name.
                    // An unknown section keeps the raw anchor name instead of
                    // producing a null dictionary key.
                    if (anchorName == null)
                        anchorName = "";
                    string displayName;
                    if (SectionToName.TryGetValue(anchorName, out displayName) && displayName != null)
                        currentCountry = displayName;
                    else
                        currentCountry = anchorName;

                    // A state never spans two countries.
                    currentState = "";
                    LocationDictionary[currentCountry] = new Dictionary<string, Dictionary<string, string>>();
                }
                else
                {
                    // City/Entry
                    if (stateChild.Value == null)
                        continue; // no text to use as the city key

                    string entry = null;
                    stateChild.Attributes.TryGetValue("href", out entry);

                    // Cities listed before any state delimiter are filed under
                    // the current (possibly empty) state name.
                    Dictionary<string, Dictionary<string, string>> states = GetOrAddStates(currentCountry);
                    Dictionary<string, string> cities;
                    if (!states.TryGetValue(currentState, out cities))
                    {
                        cities = new Dictionary<string, string>();
                        states[currentState] = cities;
                    }
                    cities[stateChild.Value] = entry;
                }
            }
            else if (stateChild.Name == "div")
            {
                // State
                currentState = stateChild.Value ?? "";
                GetOrAddStates(currentCountry)[currentState] = new Dictionary<string, string>();
            }
        }
    }
}

/// <summary>
/// Returns the state table stored in <c>LocationDictionary</c> for
/// <paramref name="country"/>, registering an empty one when the country has
/// no entry yet.
/// </summary>
private Dictionary<string, Dictionary<string, string>> GetOrAddStates(string country)
{
    Dictionary<string, Dictionary<string, string>> states;
    if (!LocationDictionary.TryGetValue(country, out states))
    {
        states = new Dictionary<string, Dictionary<string, string>>();
        LocationDictionary[country] = states;
    }
    return states;
}
/// <summary>
/// Runs <paramref name="filter"/> against the first top-level node whose name
/// equals the filter's name.
/// </summary>
/// <param name="filter">Root of the filter to apply.</param>
/// <param name="parent">
/// Receives the root of the filtered result; null when no top-level node
/// carries the filter's name.
/// </param>
/// <returns>
/// The outcome of the recursive filter for the first matching node, or false
/// when no node matches.
/// </returns>
public bool FilterNodes(ParseFilter filter, out HtmlTag parent)
{
    parent = null;

    foreach (HtmlTag candidate in _nodes)
    {
        if (candidate.Name != filter.Name)
            continue;

        // A filter flagged as parent starts from a VanillaCopy of the node;
        // otherwise the results are gathered under a fresh, empty tag.
        if (filter.isParent())
        {
            parent = candidate.VanillaCopy(null);
        }
        else
        {
            parent = new HtmlTag();
        }

        // Only the first node with a matching name is ever examined.
        return _FilterNodes(ref filter, ref parent, candidate);
    }

    return false;
}
/// <summary>
/// Consumes tags through <c>MoveToNextTag</c> until one named like
/// <paramref name="parent"/> is reached; that tag is recorded as the opening
/// tag of <paramref name="parent"/>. Every other tag met on the way becomes a
/// child of <paramref name="parent"/>, inserted at the head of its child list.
/// </summary>
/// <remarks>
/// A closing tag starts a recursive call that resolves that child first.
/// Tags named "script" and comments are skipped.
/// NOTE(review): inserting at index 0 suggests tags are visited in reverse
/// document order - confirm against MoveToNextTag.
/// </remarks>
/// <param name="parent">Tag whose opening tag is being looked for.</param>
/// <returns>True when the opening tag was found; false when the input ran out first.</returns>
protected bool ParseTag( ref HtmlTag parent )
{
    while (!EOF())
    {
        // No further tag: the opening tag was never found.
        if (!MoveToNextTag())
            break;

        // Scratch position used to read the tag without moving _pos.
        int cursor = _pos;
        SkipWhitespace(ref cursor);

        bool isClosingTag = _html[_pos + 1] == '/' && !NextTagIsComment();
        if (isClosingTag)
        {
            // Closing tag of a nested child.
            parent.HadChildren = true;

            HtmlTag closedChild = new HtmlTag();
            closedChild.CloseTag_Start = _pos;
            closedChild.Level = parent.Level + 1;
            closedChild.Parent = parent;

            // Step over "</" and read the name.
            cursor += 2;
            SkipWhitespace(ref cursor);
            closedChild.Name = ParseTagName(ref cursor);

            // Position reached after the name and any whitespace.
            SkipWhitespace(ref cursor);
            closedChild.CloseTag_Close = cursor;

            // Scripts are ignored.
            if (closedChild.Name == "script")
                continue;

            // Resolve the child (and its own descendants) first.
            ParseTag(ref closedChild);

            // Keep whatever lies between the end of this closing tag and the
            // start of the child currently at the head of the list.
            cursor++;
            SkipWhitespace(ref cursor);
            if (parent.Children.Count > 0)
            {
                int headStart = parent.Children[0].OpenTag_Start;
                if (headStart != cursor)
                    parent.MiscellaneousItems.Insert(0, _html.Substring(cursor, (headStart - cursor)));
            }

            parent.Children.Insert(0, closedChild);
            continue;
        }

        if (NextTagIsComment())
            continue;

        // Opening tag: step over "<" and read the name.
        cursor++;
        string tagName = ParseTagName(ref cursor);
        if (tagName == "script")
            continue;
        SkipWhitespace(ref cursor);

        if (tagName == parent.Name)
        {
            // This is the opening tag we were looking for.
            parent.OpenTag_Start = _pos;
            ParseTagAttributes(ref parent, ref cursor);
            return true;
        }

        // Any other opening tag becomes a child without a closing tag of its own.
        parent.HadChildren = true;

        HtmlTag openChild = new HtmlTag();
        openChild.Name = tagName;
        openChild.Level = parent.Level + 1;
        openChild.OpenTag_Start = _pos;
        openChild.Parent = parent;
        ParseTagAttributes(ref openChild, ref cursor);

        if (openChild.TrailingSlash && parent.Children.Count > 0)
        {
            int headStart = parent.Children[0].OpenTag_Start;
            if (headStart != cursor)
                parent.MiscellaneousItems.Insert(0, _html.Substring(cursor, (headStart - cursor)));
        }

        parent.Children.Insert(0, openChild);
    }

    return false;
}
/// <summary>
/// Recursive worker of <c>FilterNodes</c>: copies the children of
/// <paramref name="current_parent"/> that <paramref name="filter"/> accepts
/// into <paramref name="new_parent"/>.
/// </summary>
/// <remarks>
/// When an accepted child's filter reports <c>isParent()</c> and that child
/// filters successfully, the child's copy replaces <paramref name="new_parent"/>,
/// <paramref name="filter"/> is replaced by the child's filter, and the search
/// stops.
/// NOTE(review): when <c>filter.AllChildren</c> is true the lookup is
/// short-circuited and the child filter stays null, so the following
/// <c>isParent()</c> call looks like a null dereference - confirm the
/// intended semantics.
/// </remarks>
/// <param name="filter">Filter for the current level; may be replaced (see remarks).</param>
/// <param name="new_parent">Tag collecting the accepted copies; may be replaced (see remarks).</param>
/// <param name="current_parent">Source tag whose children are examined.</param>
/// <returns>
/// False when the filter names acceptable children but none was collected, or
/// when a filter without acceptable children meets a tag that has children;
/// true otherwise.
/// </returns>
private bool _FilterNodes(ref ParseFilter filter, ref HtmlTag new_parent, HtmlTag current_parent)
{
    // A filter that accepts no children only matches a tag without children.
    if (!filter.AllChildren && filter.AcceptableChildren.Count == 0)
        return current_parent.Children.Count == 0;

    foreach (HtmlTag source in current_parent.Children)
    {
        ParseFilter childFilter = null;
        bool accepted = filter.AllChildren || filter.AcceptableChildren.TryGetValue(source.Name, out childFilter);
        if (!accepted)
            continue;

        if (childFilter.isParent())
        {
            // The child is the requested result root: restart the result there.
            new_parent = source.VanillaCopy(null);
            if (_FilterNodes(ref childFilter, ref new_parent, source))
            {
                filter = childFilter;
                return true;
            }
            continue;
        }

        HtmlTag copy = source.VanillaCopy(new_parent);
        if (!_FilterNodes(ref childFilter, ref copy, source))
            continue;

        // The recursion may have swapped childFilter for a deeper parent filter;
        // in that case its result is handed straight up.
        if (childFilter.isParent())
        {
            filter = childFilter;
            new_parent = copy;
            return true;
        }

        new_parent.Children.Add(copy);
    }

    // Succeeds unless children were required and none was collected.
    return filter.AcceptableChildren.Count == 0 || new_parent.Children.Count != 0;
}
/// <summary>
/// Consumes tags through <c>MoveToNextTag</c> until one named like
/// <paramref name="parent"/> is reached and records it as the opening tag of
/// <paramref name="parent"/>. Every other tag met on the way is inserted at
/// the head of <paramref name="parent"/>'s child list.
/// </summary>
/// <remarks>
/// A closing tag starts a recursive call that resolves that child before it is
/// attached. Tags named "script" and comments are skipped.
/// NOTE(review): inserting at index 0 suggests tags are visited in reverse
/// document order - confirm against MoveToNextTag.
/// </remarks>
/// <param name="parent">Tag whose opening tag is being looked for.</param>
/// <returns>True when the opening tag was found; false when the input ran out first.</returns>
protected bool ParseTag( ref HtmlTag parent )
{
    while (!EOF())
    {
        if (!MoveToNextTag())
            break;

        // Scratch position used to read the tag without moving _pos.
        int cursor = _pos;
        SkipWhitespace(ref cursor);

        bool isClosingTag = _html[_pos + 1] == '/' && !NextTagIsComment();
        if (isClosingTag)
        {
            // Closing tag of a nested child.
            parent.HadChildren = true;

            HtmlTag closedChild = new HtmlTag();
            closedChild.CloseTag_Start = _pos;
            closedChild.Parent = parent;

            // Step over "</" and read the name.
            cursor += 2;
            SkipWhitespace(ref cursor);
            closedChild.Name = ParseTagName(ref cursor);
            if (closedChild.Name == "script")
                continue;

            // Resolve the child before attaching it.
            ParseTag(ref closedChild);
            parent.Children.Insert(0, closedChild);
            continue;
        }

        if (NextTagIsComment())
            continue;

        // Opening tag: step over "<" and read the name.
        cursor++;
        string tagName = ParseTagName(ref cursor);
        if (tagName == "script")
            continue;
        SkipWhitespace(ref cursor);

        if (tagName == parent.Name)
        {
            // This is the opening tag we were looking for.
            parent.OpenTag_Start = _pos;
            ParseTagAttributes(ref parent, ref cursor);
            return true;
        }

        // Any other opening tag becomes a child without a closing tag of its own.
        parent.HadChildren = true;

        HtmlTag openChild = new HtmlTag();
        openChild.Name = tagName;
        openChild.OpenTag_Start = _pos;
        openChild.Parent = parent;
        ParseTagAttributes(ref openChild, ref cursor);
        parent.Children.Insert(0, openChild);
    }

    return false;
}
/// <summary>
/// JH: Converts the whole HTML buffer into <c>HtmlTag</c> nodes. Each
/// top-level node is inserted at the front of <c>_nodes</c>.
/// </summary>
/// <returns>Always true.</returns>
public bool ParseHTML()
{
    while (MoveToNextTag())
    {
        HtmlTag node = new HtmlTag();
        node.Parent = null;

        // Scratch position used to read the tag without moving _pos.
        int cursor = _pos;

        bool isClosingTag = _html[_pos + 1] == '/';
        if (!isClosingTag)
        {
            // Opening tag: read the name after "<", then the attributes.
            node.OpenTag_Start = _pos;
            cursor++;
            node.Name = ParseTagName(ref cursor);
            ParseTagAttributes(ref node, ref cursor);
        }
        else
        {
            // Closing tag: read the name after "</".
            node.CloseTag_Start = cursor;
            cursor += 2;
            node.Name = ParseTagName(ref cursor);

            // Resume from the start of the closing tag and look for the
            // matching opening tag; on success continue from that opening tag.
            _pos = node.CloseTag_Start;
            if (node.Name != null && ParseTag(ref node))
                _pos = node.OpenTag_Start;
        }

        _nodes.Insert(0, node);
    }

    return true;
}
/// <summary>
/// Parses the contents of an HTML tag. The current position should
/// be at the first character following the tag's opening less-than
/// character.
///
/// Note: We parse to the end of the tag even if this tag was not
/// requested by the caller. This ensures subsequent parsing takes
/// place after this tag.
/// </summary>
/// <remarks>
/// Sets <c>_scriptBegin</c> when the tag is an opening "script" tag that is
/// not closed by a trailing slash.
/// NOTE(review): the attribute loop only ends on '>' - its behaviour on a tag
/// that is never closed depends on Peek()/Move() at end of input; confirm.
/// </remarks>
/// <param name="name">Name of the tag the caller is requesting,
/// or "*" if caller is requesting all tags</param>
/// <param name="tag">Returns information on this tag if it's one
/// the caller is requesting</param>
/// <returns>True if data is being returned for a tag requested by
/// the caller or false otherwise</returns>
protected bool ParseTag(string name, ref HtmlTag tag)
{
    // Get name of this tag
    string s = ParseTagName();

    // Special handling. Tag names are markup tokens, so they are compared
    // ordinally (ignoring case); a culture-sensitive comparison can give
    // different answers depending on the current culture (e.g. Turkish 'i').
    bool doctype = String.Equals(s, "!DOCTYPE", StringComparison.OrdinalIgnoreCase);
    _scriptBegin = !doctype && String.Equals(s, "script", StringComparison.OrdinalIgnoreCase);

    // Is this a tag requested by caller?
    bool requested = false;
    if (name == "*" || String.Equals(s, name, StringComparison.OrdinalIgnoreCase))
    {
        // Yes, create new tag object
        tag = new HtmlTag();
        tag.Name = s;
        tag.Attributes = new Dictionary<string, string>();
        requested = true;
    }

    // Parse attributes
    SkipWhitespace();
    while (Peek() != '>')
    {
        if (Peek() == '/')
        {
            // Handle trailing forward slash (self-closing tag)
            if (requested)
            {
                tag.TrailingSlash = true;
            }
            Move();
            SkipWhitespace();

            // If this is a script tag, it was closed
            _scriptBegin = false;
        }
        else
        {
            // Parse attribute name (a DOCTYPE's tokens are read as values)
            s = (!doctype) ? ParseAttributeName() : ParseAttributeValue();
            SkipWhitespace();

            // Parse attribute value
            string value = String.Empty;
            if (Peek() == '=')
            {
                Move();
                SkipWhitespace();
                value = ParseAttributeValue();
                SkipWhitespace();
            }

            // Add attribute to collection if requested tag; a later
            // attribute replaces an earlier one with the same name
            if (requested)
            {
                tag.Attributes[s] = value;
            }
        }
    }

    // Skip over closing '>'
    Move();

    return requested;
}
/// <summary>
/// Appends to <paramref name="LastFiveEntriesSearched"/> the "href" of the
/// first grandchild of each of the first five children of
/// <paramref name="parent"/>.
/// </summary>
/// <remarks>
/// Children without children of their own, and first grandchildren without an
/// "href" attribute, contribute nothing.
/// </remarks>
/// <param name="LastFiveEntriesSearched">List that receives the links.</param>
/// <param name="parent">Tag whose leading children are inspected.</param>
private void FillLastFive(ref List<string> LastFiveEntriesSearched, HtmlTag parent)
{
    // Look at no more than five children.
    int limit = parent.Children.Count;
    if (limit >= 5)
        limit = 5;

    for (int position = 0; position < limit; position++)
    {
        HtmlTag row = parent.Children[position];
        if (row.Children.Count <= 0)
            continue;

        HtmlTag link = row.Children[0];
        if (link.Attributes.Count <= 0)
            continue;

        string address = "";
        if (link.Attributes.TryGetValue("href", out address))
            LastFiveEntriesSearched.Add(address);
    }
}
/// <summary>
/// Walks all descendants of this tag and adds to
/// <paramref name="parent"/>'s children every one whose name appears in
/// <paramref name="tag_list"/> and whose attributes satisfy the pair
/// registered for that name.
/// </summary>
/// <remarks>
/// Pair semantics: key "*" matches any attribute holding the given value;
/// value "*" matches any tag that has the given attribute; otherwise the exact
/// attribute/value pair must be present. Descendants of a matching tag are
/// searched as well.
/// </remarks>
/// <param name="tag_list">Tag name mapped to the attribute requirement.</param>
/// <param name="parent">Tag collecting the matches; nothing happens when it is null.</param>
public void FilterForChildrenByNameAndAttribute(Dictionary<string, KeyValuePair<string, string>> tag_list, ref HtmlTag parent)
{
    if (Children.Count == 0 || parent == null)
        return;

    for (int position = 0; position < Children.Count; position++)
    {
        HtmlTag candidate = Children[position];

        KeyValuePair<string, string> required = new KeyValuePair<string, string>();
        if (tag_list.TryGetValue(candidate.Name, out required))
        {
            bool matches;
            if (required.Key == "*")
            {
                matches = candidate.Attributes.ContainsValue(required.Value);
            }
            else if (required.Value == "*")
            {
                matches = candidate.Attributes.ContainsKey(required.Key);
            }
            else
            {
                matches = candidate.Attributes.Contains(required);
            }

            if (matches)
                parent.Children.Add(candidate);
        }

        // Descend regardless of whether the candidate itself matched.
        candidate.FilterForChildrenByNameAndAttribute(tag_list, ref parent);
    }
}
/// <summary>
/// Adds every descendant of this tag named <paramref name="name"/> to
/// <paramref name="parent"/>'s children, depth-first. Descendants of a
/// matching tag are searched as well.
/// </summary>
/// <param name="name">Tag name to look for.</param>
/// <param name="parent">
/// Tag collecting the matches; when null it is first replaced by a
/// <c>VanillaCopy</c> of this tag.
/// </param>
private void _FilterForChildrenByName(string name, ref HtmlTag parent)
{
    if (parent == null)
    {
        parent = VanillaCopy(null);
    }

    for (int position = 0; position < Children.Count; position++)
    {
        HtmlTag candidate = Children[position];

        if (candidate.Name == name)
        {
            parent.Children.Add(candidate);
        }

        // Keep searching below the candidate, matched or not.
        candidate._FilterForChildrenByName(name, ref parent);
    }
}
/// <summary>
/// Adds every descendant of this tag whose name is contained in
/// <paramref name="names"/> to <paramref name="parent"/>'s children,
/// depth-first. Descendants of a matching tag are searched as well.
/// </summary>
/// <param name="names">Tag names to look for.</param>
/// <param name="parent">Tag collecting the matches; nothing happens when it is null.</param>
public void FilterForChildrenByName(List<string> names, ref HtmlTag parent)
{
    if (Children.Count == 0 || parent == null)
        return;

    for (int position = 0; position < Children.Count; position++)
    {
        HtmlTag candidate = Children[position];

        if (names.Contains(candidate.Name))
        {
            parent.Children.Add(candidate);
        }

        // Keep searching below the candidate, matched or not.
        candidate.FilterForChildrenByName(names, ref parent);
    }
}
/// <summary>
/// Collects every descendant of this tag named <paramref name="name"/> under
/// a <c>VanillaCopy</c> of this tag.
/// </summary>
/// <param name="name">Tag name to look for.</param>
/// <param name="parent">
/// Receives the copy of this tag whose children are the matching descendants.
/// </param>
public void FilterForChildrenByName(string name, out HtmlTag parent)
{
    // The out value is assigned first so it can be handed on by reference.
    parent = VanillaCopy(null);

    _FilterForChildrenByName(name, ref parent);
}
/// <summary>
/// Parses the next tag that matches the specified tag name
/// </summary>
/// <param name="name">Name of the tags to parse ("*" = parse all tags)</param>
/// <param name="tag">Returns information on the next occurrence of the
/// specified tag or null if none found</param>
/// <returns>True if a tag was parsed or false if the end of the document
/// was reached</returns>
public bool ParseNext(string name, out HtmlTag tag)
{
    tag = null;

    // Nothing to do if no tag specified
    if (String.IsNullOrEmpty(name))
    {
        return false;
    }

    // Loop until match is found or there are no more tags
    while (MoveToNextTag())
    {
        // Skip opening '<'
        Move();

        // Examine first tag character
        char c = Peek();
        if (c == '!' && Peek(1) == '-' && Peek(2) == '-')
        {
            // Skip over comments. The terminator is markup, so it is searched
            // ordinally, like the script terminator below; a culture-sensitive
            // search may ignore or fold characters.
            const string endComment = "-->";
            _pos = _html.IndexOf(endComment, _pos, StringComparison.Ordinal);
            NormalizePosition();
            Move(endComment.Length);
        }
        else if (c == '/')
        {
            // Skip over closing tags
            _pos = _html.IndexOf('>', _pos);
            NormalizePosition();
            Move();
        }
        else
        {
            // Parse tag
            bool result = ParseTag(name, ref tag);

            // Because scripts may contain tag characters,
            // we need special handling to skip over
            // script contents
            if (_scriptBegin)
            {
                const string endScript = "</script";
                _pos = _html.IndexOf(endScript, _pos, StringComparison.OrdinalIgnoreCase);
                NormalizePosition();
                Move(endScript.Length);
                SkipWhitespace();
                if (Peek() == '>')
                {
                    Move();
                }
            }

            // Return true if requested tag was found
            if (result)
            {
                return true;
            }
        }
    }

    return false;
}
/// <summary>
/// Reads the attributes of the tag at <paramref name="index"/> up to its
/// closing '>' and stores them in <paramref name="tag"/>, then records the
/// text between that '>' and the next '&lt;' as the tag's value.
/// </summary>
/// <remarks>
/// A tag that is never closed (the buffer ends before '>') is tolerated: the
/// attributes read so far are kept, <c>OpenTag_Close</c> is set to the end of
/// the buffer and no value is recorded.
/// </remarks>
/// <param name="tag">Tag receiving the attributes, positions and value.</param>
/// <param name="index">
/// Position inside <c>_html</c> where attribute parsing starts; on return it
/// is one past the closing '>' (or at the end of the buffer for an
/// unterminated tag).
/// </param>
protected void ParseTagAttributes(ref HtmlTag tag, ref int index)
{
    while (index < _html.Length && _html[index] != '>')
    {
        if (_html[index] == '/')
        {
            // Handle trailing forward slash
            tag.TrailingSlash = true;
            index++;
            SkipWhitespace(ref index);
            tag.CloseTag_Close = index;
        }
        else
        {
            // Parse attribute name
            string attribute_name = ParseAttributeName(ref index);
            SkipWhitespace(ref index);

            // Parse attribute value
            string value = String.Empty;
            if (index < _html.Length && _html[index] == '=')
            {
                index++;
                SkipWhitespace(ref index);
                value = ParseAttributeValue(ref index);
                SkipWhitespace(ref index);
            }

            // A later attribute replaces an earlier one with the same name
            tag.Attributes[attribute_name] = value;
        }
    }

    tag.OpenTag_Close = index;

    // Unterminated tag: nothing follows, so there is no value to read.
    if (index >= _html.Length)
        return;

    // The value is the text between this tag's '>' and the next '<'.
    index++;
    int value_end = _html.IndexOf('<', index);
    if (value_end != -1)
        tag.Value = _html.Substring(index, value_end - index);
}
//title = <html><head><title>
//body = <html><body><div id="userbody">
/// <summary>
/// Downloads one entry page and checks that every keyword in
/// <c>Details_.Keywords_</c> occurs in its body (the div with id "userbody")
/// or in its title. A full match is counted and reported to <c>Parent_</c>.
/// </summary>
/// <remarks>
/// On the first keyword that is missing, a diagnostic file named after the
/// last path segment of <paramref name="entry_site"/> is written and the
/// method returns. Errors while examining a page are logged, not thrown.
/// NOTE(review): keywords are matched against lower-cased text, so they are
/// presumably stored in lower case - confirm.
/// NOTE(review): the diagnostic file name keeps the leading '/' of the last
/// path segment - confirm this is the intended location.
/// </remarks>
/// <param name="entry_site">Address of the entry page; null or empty does nothing.</param>
/// <param name="parent">Tag whose string form is reported for a matching entry.</param>
private void SearchEntry(string entry_site, HtmlTag parent)
{
    if (String.IsNullOrEmpty(entry_site))
        return;

    HtmlParser.HtmlParser parser = new HtmlParser.HtmlParser();
    if (!parser.ParseURL(entry_site, true, new string[] { "<br>" }))
        return;

    List<HtmlParser.HtmlTag> nodes = parser._nodes;

    // Collects the "userbody" divs found below the html node.
    HtmlTag new_parent = new HtmlTag();
    new_parent.Name = "Artificial Parent";

    Dictionary<string, KeyValuePair<string, string>> tag_list = new Dictionary<string, KeyValuePair<string, string>>();
    tag_list.Add("div", new KeyValuePair<string, string>("id", "userbody"));

    for (int i = 0; i < nodes.Count; i++)
    {
        if (nodes[i].Name == "html")
        {
            try
            {
                HtmlTag header_tag = null;
                nodes[i].FilterForChildrenByName("title", out header_tag);
                string title = header_tag.Children[0].Value;

                nodes[i].FilterForChildrenByNameAndAttribute(tag_list, ref new_parent);

                //Deleted by author, expired, etc.
                if (new_parent.Children.Count == 0)
                    continue;

                string body = new_parent.Children[0].Value;
                if (!String.IsNullOrEmpty(body))
                {
                    // Both texts are lower-cased so the title is matched the
                    // same way as the body.
                    body = body.ToLower();
                    string lowered_title = (title ?? String.Empty).ToLower();

                    foreach (string keyword in Details_.Keywords_)
                    {
                        if (!body.Contains(keyword) && !lowered_title.Contains(keyword))
                        {
                            int start = entry_site.LastIndexOf('/');
                            string output_file = entry_site.Substring(start, entry_site.Length - start);
                            output_file += ".xml";

                            // using guarantees the file is closed even when a
                            // write fails.
                            using (System.IO.StreamWriter test_xml = new System.IO.StreamWriter(output_file, false))
                            {
                                test_xml.WriteLine("Couldn't find: '" + keyword + "' in body: '" + body + "'");
                                test_xml.WriteLine(parser.ToString());
                            }
                            return;
                        }
                    }

                    matchingEntriesFound++;
                    Parent_.UpdateEntries(parent.ToString());
                }
            }
            catch (Exception error)
            {
                Logger.Instance.Log(error.ToString(), Details_.City_, LogType.ltError);
            }
        }

        entries_searched_++;
        Parent_.UpdateTotalSearched();
    }
}
/// <summary>
/// JH: Turns the complete HTML buffer into level-0 <c>HtmlTag</c> nodes; every
/// node found is inserted at the front of <c>_nodes</c>.
/// </summary>
/// <returns>Always true.</returns>
public bool ParseHTML()
{
    // One iteration per tag that MoveToNextTag positions us on.
    while (MoveToNextTag())
    {
        HtmlTag node = new HtmlTag { Parent = null, Level = 0 };

        // Scratch position for reading ahead; _pos itself stays on the tag.
        int cursor = _pos;

        if (_html[_pos + 1] != '/')
        {
            // Opening tag: name follows the '<', attributes follow the name.
            node.OpenTag_Start = _pos;
            cursor += 1;
            node.Name = ParseTagName(ref cursor);
            ParseTagAttributes(ref node, ref cursor);
        }
        else
        {
            // Closing tag: name follows the "</".
            node.CloseTag_Start = cursor;
            cursor += 2;
            node.Name = ParseTagName(ref cursor);

            // Go back to the start of the closing tag before searching.
            _pos = node.CloseTag_Start;

            // A valid closing tag is parsed until its opening tag is found;
            // parsing then continues from that opening tag.
            bool openingFound = node.Name != null && ParseTag(ref node);
            if (openingFound)
            {
                _pos = node.OpenTag_Start;
            }
        }

        // Register the node as a top-level tag.
        _nodes.Insert(0, node);
    }

    return true;
}
/// <summary>
/// Creates a new tag carrying this tag's name, value, trailing-slash flag and
/// three of its positions (<c>OpenTag_Start</c>, <c>OpenTag_Close</c>,
/// <c>CloseTag_Start</c>), attached to <paramref name="NewParent"/>.
/// </summary>
/// <remarks>
/// Children are not copied. The attribute collection is not duplicated: the
/// copy refers to the same <c>Attributes</c> instance as this tag.
/// </remarks>
/// <param name="NewParent">Parent assigned to the copy; may be null.</param>
/// <returns>The new tag.</returns>
public HtmlTag VanillaCopy(HtmlTag NewParent)
{
    return new HtmlTag
    {
        Attributes = Attributes,
        Name = Name,
        Parent = NewParent,
        TrailingSlash = TrailingSlash,
        Value = Value,
        OpenTag_Start = OpenTag_Start,
        OpenTag_Close = OpenTag_Close,
        CloseTag_Start = CloseTag_Start
    };
}
/// <summary>
/// Fills <c>SectionToName</c> from the first "jump_to_continents" div that
/// <paramref name="parent"/> returns: for each child with an "href", the href
/// without its first character becomes the key and the child's value the name.
/// </summary>
/// <remarks>
/// Children without an "href" (or with an empty one) are skipped; a key seen
/// again replaces the earlier name instead of throwing.
/// NOTE(review): dropping the first character presumably removes the '#' of a
/// fragment link - confirm against the page markup.
/// </remarks>
/// <param name="parent">Tag that is queried for the "jump_to_continents" div.</param>
private void ParseSectionNames(HtmlTag parent)
{
    List<HtmlTag> tagList = null;
    parent.FilterForChildrenByNameAndAttribute("div", new KeyValuePair<string, string>("class", "jump_to_continents"), out tagList);

    // Nothing to do when the div is missing (null or empty result).
    if (tagList == null || tagList.Count == 0)
        return;

    HtmlTag locationsStuff = tagList[0];
    foreach (HtmlTag child in locationsStuff.Children)
    {
        string key;
        if (!child.Attributes.TryGetValue("href", out key) || String.IsNullOrEmpty(key))
            continue;

        // Strip the leading character of the link to get the section key.
        SectionToName[key.Substring(1)] = child.Value;
    }
}
/// <summary>
/// Downloads the site list page and fills <c>LocationDictionary</c>
/// (area -> state -> city -> link) from its "jump_to_continents" and
/// "colmask" divs.
/// </summary>
/// <remarks>
/// Returns without changes when the page cannot be parsed. Each "colmask"
/// child of the filtered div list is replaced by the flat list of header,
/// state and link tags extracted from it.
/// </remarks>
public void DownloadLocations()
{
    string site = "http://www.craigslist.org/about/sites";

    HtmlParser.HtmlParser parser = new HtmlParser.HtmlParser();
    if (!parser.ParseURL(site, false, new string[] { }))
        return;

    List<HtmlParser.HtmlTag> nodes = parser._nodes;
    for (int i = 0; i < nodes.Count; i++)
    {
        if (nodes[i].Name != "html")
            continue;

        // All divs of the document, reduced by the two classes of interest.
        HtmlParser.HtmlTag new_parent = null;
        nodes[i].FilterForChildrenByName("div", out new_parent);

        List<string> class_names = new List<string>();
        class_names.Add("jump_to_continents");
        class_names.Add("colmask");
        Dictionary<string, List<string>> filter_for = new Dictionary<string, List<string>>();
        filter_for.Add("class", class_names);
        new_parent.FilterOutChildrenByAttribute(filter_for);

        Dictionary<string, string> abbr_to_area = new Dictionary<string, string>();
        for (int z = 0; z < new_parent.Children.Count; z++)
        {
            HtmlParser.HtmlTag Child = new_parent.Children[z];
            if (Child.Attributes.Contains(new KeyValuePair<string, string>("class", "jump_to_continents")))
            {
                ReadAreaAbbreviations(Child, abbr_to_area);
            }
            else if (Child.Attributes.Contains(new KeyValuePair<string, string>("class", "colmask")))
            {
                new_parent.Children[z] = ReadLocationColumn(Child, abbr_to_area);
            }
        }
    }
}

/// <summary>
/// Maps, for every child of <paramref name="continents"/>, the value of its
/// first attribute without the leading character to the child's text.
/// Children without attributes or with an empty first attribute are skipped.
/// </summary>
/// <remarks>
/// NOTE(review): the first attribute is presumably the link's href - the
/// enumeration order of a dictionary is not guaranteed; confirm and look the
/// attribute up by name.
/// </remarks>
private static void ReadAreaAbbreviations(HtmlParser.HtmlTag continents, Dictionary<string, string> abbr_to_area)
{
    for (int a = 0; a < continents.Children.Count; a++)
    {
        HtmlParser.HtmlTag link = continents.Children[a];
        if (link.Attributes.Count == 0)
            continue;

        string abbreviation = link.Attributes.ElementAt(0).Value;
        if (String.IsNullOrEmpty(abbreviation))
            continue;

        // Indexer instead of Add: a repeated abbreviation must not abort the download.
        abbr_to_area[abbreviation.Substring(1)] = link.Value;
    }
}

/// <summary>
/// Extracts the area headers (h1 "continent_header"), state delimiters (div
/// "state_delimiter") and city links (a with "href") of one column and files
/// them into <c>LocationDictionary</c>.
/// </summary>
/// <remarks>
/// Processing of the column stops at the first header whose area cannot be
/// resolved through <paramref name="abbr_to_area"/>. States and cities whose
/// area or state has no table yet are skipped rather than raising an error.
/// </remarks>
/// <returns>The tag holding the extracted header, state and link tags.</returns>
private HtmlParser.HtmlTag ReadLocationColumn(HtmlParser.HtmlTag column, Dictionary<string, string> abbr_to_area)
{
    HtmlParser.HtmlTag NewerChild = new HtmlParser.HtmlTag();

    Dictionary<string, KeyValuePair<string, string>> tag_list = new Dictionary<string, KeyValuePair<string, string>>();
    tag_list.Add("h1", new KeyValuePair<string, string>("class", "continent_header"));
    tag_list.Add("div", new KeyValuePair<string, string>("class", "state_delimiter"));
    tag_list.Add("a", new KeyValuePair<string, string>("href", "*"));
    column.FilterForChildrenByNameAndAttribute(tag_list, ref NewerChild);

    string current_area = "";
    string current_state = "";

    for (int position = 0; position < NewerChild.Children.Count; position++)
    {
        HtmlParser.HtmlTag temp_child = NewerChild.Children[position];
        switch (temp_child.Name)
        {
            /*area*/
            case "h1":
            {
                // The area abbreviation is the first attribute value of the
                // header's first child.
                string temp_area = null;
                if (temp_child.Children.Count > 0 && temp_child.Children[0].Attributes.Count > 0)
                    temp_area = temp_child.Children[0].Attributes.ElementAt(0).Value;

                // Unknown area: what follows cannot be filed, stop here.
                if (temp_area == null
                    || !abbr_to_area.TryGetValue(temp_area, out current_area)
                    || current_area == null)
                {
                    return NewerChild;
                }

                if (!LocationDictionary.ContainsKey(current_area))
                    LocationDictionary.Add(current_area, new Dictionary<string, Dictionary<string, string>>());
                break;
            }

            /*state*/
            case "div":
            {
                current_state = temp_child.Value ?? "Unspecified";

                // A state seen before any area has nowhere to go and is skipped.
                Dictionary<string, Dictionary<string, string>> states;
                if (LocationDictionary.TryGetValue(current_area, out states)
                    && !states.ContainsKey(current_state))
                {
                    states.Add(current_state, new Dictionary<string, string>());
                }
                break;
            }

            /*city*/
            case "a":
            {
                string city = temp_child.Value;
                string website;
                Dictionary<string, Dictionary<string, string>> states;
                Dictionary<string, string> cities;

                // The link is read by name: the order in which a dictionary
                // enumerates its entries is not guaranteed.
                if (city != null
                    && temp_child.Attributes.TryGetValue("href", out website)
                    && LocationDictionary.TryGetValue(current_area, out states)
                    && states.TryGetValue(current_state, out cities))
                {
                    cities[city] = website;
                }
                break;
            }
        }
    }

    return NewerChild;
}