/// <summary>
/// Saves an <see cref="HtmlNode"/> to the file at the specified path.
/// </summary>
/// <param name="path">The file to write to. It is opened with <see cref="FileMode.Create"/>, so an existing file is overwritten.</param>
/// <param name="node">The <see cref="HtmlNode"/> to save.</param>
/// <param name="encoding">The encoding to use when writing the file.</param>
/// <remarks>
/// <para>What gets written depends on the node:</para>
/// <para>- a node with a parent: the parent's content, which includes all siblings of <paramref name="node"/>;</para>
/// <para>- a parent-less node without children: the node itself;</para>
/// <para>- a parent-less document node: the document's content.</para>
/// <para>Any other parent-less node with children is not supported and is reported through <c>DebugBreakOrThrow</c>.</para>
/// <para>Null arguments are likewise reported through <c>DebugBreakOrThrow</c> with an <see cref="ArgumentNullException"/>.</para>
/// </remarks>
/// <seealso cref="HtmlNode"/>
/// <seealso cref="HtmlDocument"/>
public static void Save(string path, HtmlNode node, Encoding encoding)
{
    if (path == null || node == null)
    {
        // path is reported first when both are null, matching the order of the checks.
        string missingArgument = path == null ? "path" : "node";
        DebugBreakOrThrow("Figure out why " + missingArgument + " is null", new ArgumentNullException(missingArgument));
    }

    // Per the original author's note: only reachable when the caller is not another Save() overload.
    if (encoding == null)
    {
        DebugBreakOrThrow("Figure out why encoding is null.", new ArgumentNullException("encoding"));
    }

    using (FileStream fileStream = new FileStream(path, FileMode.Create, FileAccess.Write, FileShare.Read, 1024))
    using (StreamWriter streamWriter = new StreamWriter(fileStream, encoding))
    {
        // StreamWriter is a TextWriter, so it can be handed to the HtmlNode write methods.
        // node.WriteTo saves only the current node, which is enough when it has no children.
        // node.WriteContentTo saves only the node's children, so for a node with a parent
        // the parent is used instead; note that this writes all siblings as well.
        if (node.ParentNode != null)
        {
            node.ParentNode.WriteContentTo(streamWriter);
        }
        else if (node.HasChildNodes == false)
        {
            node.WriteTo(streamWriter);
        }
        else if (node.Name == HtmlNode.HtmlNodeTypeNameDocument)
        {
            node.WriteContentTo(streamWriter);
        }
        else
        {
            // TODO: Properly save parent-less node with children.
            DebugBreakOrThrow("Properly save parent-less node with children. Inspect 'node'.", new InvalidOperationException("Don't know how to save the node, and it's children!"));
        }

        streamWriter.Flush();
    }
}
/// <summary>
/// Recursively writes a filtered copy of <paramref name="node"/> and its descendants to <paramref name="writer"/>.
/// </summary>
/// <param name="node">The node to process.</param>
/// <param name="writer">Receives the filtered markup.</param>
/// <param name="removals">Receives one entry (description, tag name) for every node whose content was deleted.</param>
/// <param name="tagsWhiteList">Tag names whose begin and end tags are written to the output.</param>
/// <param name="tagsBlackListDeleteContent">Tag names whose child content is skipped entirely.</param>
/// <remarks>
/// Text nodes are written as they are. A non-text node that is not whitelisted has its own
/// tags dropped, but its children are still processed unless the node is blacklisted.
/// Tag names are compared case-insensitively using ordinal rules, so the result does not
/// depend on the current culture.
/// </remarks>
private static void GetSafeHtmlIter(HtmlNode node, TextWriter writer, List<string[]> removals, string[] tagsWhiteList, string[] tagsBlackListDeleteContent)
{
    bool isWhiteListed = false;
    bool deleteContent = false;

    if (node.NodeType == HtmlNodeType.Text)
    {
        node.WriteTo(writer);
    }
    else
    {
        foreach (string tagName in tagsWhiteList)
        {
            if (string.Equals(node.Name, tagName, StringComparison.OrdinalIgnoreCase))
            {
                isWhiteListed = true;
                break;
            }
        }
    }

    if (isWhiteListed)
    {
        WriteBeginTag(node, writer);
    }

    foreach (string tagName in tagsBlackListDeleteContent)
    {
        if (string.Equals(node.Name, tagName, StringComparison.OrdinalIgnoreCase))
        {
            deleteContent = true;
            removals.Add(new string[] { "Deleted tag and child content", node.Name });
            break;
        }
    }

    if (!deleteContent)
    {
        foreach (HtmlNode childNode in node.ChildNodes)
        {
            GetSafeHtmlIter(childNode, writer, removals, tagsWhiteList, tagsBlackListDeleteContent);
        }
    }

    // Close the tag only if it was opened above.
    if (isWhiteListed)
    {
        WriteEndTag(node, writer);
    }
}
/// <summary>
/// Converts an HtmlNode into a string, without a leading XML declaration.
/// </summary>
/// <param name="node">The HtmlNode to serialize.</param>
/// <returns>
/// The markup written by <c>node.WriteTo</c>. If that markup begins (after optional
/// whitespace) with an <c>&lt;?xml ... ?&gt;</c> declaration, everything up to and
/// including the closing <c>?&gt;</c> is removed.
/// </returns>
public static string Output(HtmlNode node)
{
    string output;
    using (StringWriter sw = new StringWriter())
    {
        node.WriteTo(sw);
        output = sw.ToString();
    }

    if (!string.IsNullOrEmpty(output))
    {
        const string declarationStart = "<?xml";

        // Only a declaration at the very beginning is a header; a "?>" that occurs
        // later in the markup is content and must not truncate the output.
        int start = 0;
        while (start < output.Length && char.IsWhiteSpace(output[start]))
        {
            start++;
        }

        if (string.CompareOrdinal(output, start, declarationStart, 0, declarationStart.Length) == 0)
        {
            int end = output.IndexOf("?>", start, StringComparison.Ordinal);
            if (end >= 0)
            {
                output = output.Substring(end + 2);
            }
        }
    }

    return output;
}
/// <summary>
/// Builds a BsonDocument describing a list node and the text of its list items.
/// </summary>
/// <param name="node_input">The node whose descendant <c>li</c> elements are collected.</param>
/// <returns>
/// A document with the fields <c>doc_id</c> (timestamp-based), <c>from_url</c>,
/// <c>from_html_type</c> ("ul"), <c>html_path</c>, <c>original_html</c> and <c>ul</c>,
/// an array holding the non-empty inner text of each <c>li</c>. The array is empty
/// when the node contains no <c>li</c> elements.
/// </returns>
/// <remarks>When <c>is_open_mongo</c> is set, the document is also inserted into the "web" collection.</remarks>
public static BsonDocument select_ul(HtmlNode node_input)
{
    // Read the clock once so both parts of the id come from the same instant.
    DateTime now = DateTime.Now;

    BsonDocument doc_result = new BsonDocument();
    doc_result.Add("doc_id", now.ToString("yyyyMMddHHmmss") + now.Millisecond.ToString());
    doc_result.Add("from_url", global_url);
    doc_result.Add("from_html_type", "ul");
    doc_result.Add("html_path", node_input.XPath);
    doc_result.Add("original_html", node_input.WriteTo());

    HtmlNodeCollection ul_nodes = node_input.SelectNodes(node_input.XPath + @"//li");
    BsonArray ul_array = new BsonArray();

    // SelectNodes yields null rather than an empty collection when nothing matches.
    if (ul_nodes != null)
    {
        foreach (HtmlNode node in ul_nodes)
        {
            if (!string.IsNullOrEmpty(node.InnerText))
            {
                ul_array.Add(node.InnerText);
            }
        }
    }

    doc_result.Add("ul", ul_array);

    if (is_open_mongo)
    {
        MongoHelper.insert_bson("web", doc_result);
    }

    return doc_result;
}
/// <summary>
/// Builds a BsonDocument describing a table node, expanding rowspan/colspan cells.
/// </summary>
/// <param name="node_input">The node whose descendant <c>tr</c> rows are read.</param>
/// <returns>
/// A document with the fields <c>doc_id</c> (timestamp-based), <c>from_url</c>,
/// <c>from_html_type</c> ("table"), <c>html_path</c>, <c>original_html</c>,
/// <c>header</c> (the filled cells of the first row) and one array per further
/// non-empty row, keyed "0", "1", ... by row index minus one.
/// </returns>
/// <remarks>
/// <para>The table is laid out on a fixed 500 x 500 scratch grid. Rows, columns and spans
/// that fall outside the grid are ignored.</para>
/// <para>A span attribute that is not a valid integer is treated as "no span".</para>
/// <para>When <c>is_open_mongo</c> is set, the document is also inserted into the "web" collection.</para>
/// </remarks>
public static BsonDocument select_table(HtmlNode node_input)
{
    const int grid_size = 500;

    // Sentinel for "nothing written here yet". A real cell whose text equals the
    // sentinel is indistinguishable from an empty slot.
    const string empty_cell = "X000000X";

    // Read the clock once so both parts of the id come from the same instant.
    DateTime now = DateTime.Now;

    BsonDocument doc_result = new BsonDocument();
    doc_result.Add("doc_id", now.ToString("yyyyMMddHHmmss") + now.Millisecond.ToString());
    doc_result.Add("from_url", global_url);
    doc_result.Add("from_html_type", "table");
    doc_result.Add("html_path", node_input.XPath);
    doc_result.Add("original_html", node_input.WriteTo());

    // NOTE(review): "//tr" and the "//td" / "//th" suffixes below are descendant
    // queries, so rows and cells of nested tables appear to be matched too - confirm intended.
    HtmlNodeCollection tr_nodes = node_input.SelectNodes(node_input.XPath + @"//tr");
    string[] cells = new string[] { @"//td", @"//th" };

    // Pre-fill the scratch grid with the sentinel.
    DataTable table = new DataTable();
    for (int i = 0; i < grid_size; i++)
    {
        table.Columns.Add("C" + i.ToString());
    }
    for (int i = 0; i < grid_size; i++)
    {
        DataRow row_new = table.NewRow();
        for (int j = 0; j < grid_size; j++)
        {
            row_new[j] = empty_cell;
        }
        table.Rows.Add(row_new);
    }

    // SelectNodes yields null rather than an empty collection when nothing matches.
    int row_count = tr_nodes == null ? 0 : Math.Min(tr_nodes.Count, grid_size);

    for (int i = 0; i < row_count; i++)
    {
        // td cells are placed first, then th cells; each pass restarts at column 0
        // and fills the first free slots of the row.
        foreach (string cell in cells)
        {
            HtmlNodeCollection td_nodes = node_input.SelectNodes(tr_nodes[i].XPath + cell);
            if (td_nodes == null)
            {
                continue;
            }

            int start = 0;
            for (int k = 0; k < td_nodes.Count; k++)
            {
                // Skip slots already taken by an earlier cell or by a span.
                while (start < grid_size && table.Rows[i][start].ToString() != empty_cell)
                {
                    start++;
                }
                if (start >= grid_size)
                {
                    break; // the row is full; remaining cells do not fit on the grid
                }

                string cell_text = td_nodes[k].InnerText;

                foreach (HtmlAttribute attr in td_nodes[k].Attributes)
                {
                    bool is_rowspan = string.Equals(attr.Name, "rowspan", StringComparison.OrdinalIgnoreCase);
                    bool is_colspan = string.Equals(attr.Name, "colspan", StringComparison.OrdinalIgnoreCase);
                    if (!is_rowspan && !is_colspan)
                    {
                        continue;
                    }

                    int span_count;
                    if (!int.TryParse(attr.Value, System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out span_count))
                    {
                        continue;
                    }

                    if (is_rowspan)
                    {
                        // Copy the text into the same column of the rows below.
                        for (int j = 1; j < span_count && i + j < grid_size; j++)
                        {
                            table.Rows[i + j][start] = cell_text;
                        }
                    }
                    else
                    {
                        // Copy the text into the following columns of this row.
                        for (int j = 1; j < span_count && start + j < grid_size; j++)
                        {
                            table.Rows[i][start + j] = cell_text;
                        }
                    }
                }

                table.Rows[i][start] = cell_text;
                start++;
            }
        }
    }

    // Add the table to the document: the first grid row becomes the header.
    BsonArray header_array = new BsonArray();
    for (int i = 0; i < grid_size; i++)
    {
        string header_value = table.Rows[0][i].ToString();
        if (header_value != empty_cell)
        {
            header_array.Add(header_value);
        }
    }
    doc_result.Add("header", header_array);

    // Every later row that has at least one filled cell becomes its own array.
    for (int i = 1; i < grid_size; i++)
    {
        BsonArray td_array = new BsonArray();
        for (int j = 0; j < grid_size; j++)
        {
            string cell_value = table.Rows[i][j].ToString();
            if (cell_value != empty_cell)
            {
                td_array.Add(cell_value);
            }
        }
        if (td_array.Count != 0)
        {
            doc_result.Add((i - 1).ToString(), td_array);
        }
    }

    if (is_open_mongo)
    {
        MongoHelper.insert_bson("web", doc_result);
    }

    return doc_result;
}
/// <summary>
/// Decodes the given HTML node into the corresponding web shape.
/// </summary>
/// <param name="htmlNode">The node to decode.</param>
/// <returns>The web shape produced for the node's tag name and serialized content.</returns>
/// <remarks>
/// <para>The node is serialized to text, wrapped in an <see cref="XmlReader"/> and handed,
/// together with the node's tag name, to the (tag name, reader) overload.</para>
/// <para>According to the original documentation, <c>null</c> is returned if the tag name
/// cannot be found or the content cannot be deserialized; that behaviour lives in the
/// overload and is not visible here.</para>
/// </remarks>
public static IWebShape DecodeWebShape(HtmlNode htmlNode)
{
    string shapeTag = htmlNode.Name;
    string markup = htmlNode.WriteTo();

    using (StringReader markupReader = new StringReader(markup))
    using (XmlReader xmlReader = XmlReader.Create(markupReader))
    {
        IWebShape shape = DecodeWebShape(shapeTag, xmlReader);
        return shape;
    }
}
/// <summary>
/// Appends the markup of the given node to the file named by <c>fileName</c>.
/// </summary>
/// <param name="_node">The node to write.</param>
/// <remarks>
/// The file is opened in append mode and written with <see cref="Encoding.Default"/>.
/// The writer is disposed even if writing the node throws.
/// </remarks>
public void saveNodeToFile(HtmlNode _node)
{
    Encoding locEncoding = Encoding.Default;

    // 'true' = append to the file instead of overwriting it.
    using (StreamWriter swriter = new StreamWriter(fileName, true, locEncoding))
    {
        _node.WriteTo(swriter);
    }
}