Beispiel #1
0
        /// <summary>
        /// Renders a single parsed node back into markup text.
        /// </summary>
        /// <param name="html">
        /// Source document text. Text and comment nodes are read from it using the
        /// node's <c>Index</c> and <c>Length</c>.
        /// </param>
        /// <param name="node">Node to render.</param>
        /// <returns>
        /// A start tag with its attributes for element nodes, an end tag for end-tag nodes,
        /// the result of <c>HtmlUtility.NormalizeText</c> for text nodes (nothing if it returns
        /// null), the raw source slice for comment nodes, and an empty string (after a debug
        /// assertion) for any other node type.
        /// </returns>
        public static string ToString(string html, HtmlNode node)
        {
            StringBuilder builder = new StringBuilder(128);

            switch (node.NodeType)
            {
            case HtmlNodeType.Element:
                builder.Append('<');
                builder.Append(HtmlTags.GetTagName(node.TagId));
                if (node.Attributes != null)
                {
                    foreach (AttributeItem item in node.Attributes)
                    {
                        string value = item.Value;

                        builder.Append(' ');
                        builder.Append(item.Name);

                        if (value == null)
                        {
                            // NOTE(review): assumes a null value means a valueless attribute
                            // (e.g. "disabled"); emit the bare name instead of throwing.
                            continue;
                        }

                        if (value.IndexOf('"') < 0)
                        {
                            // Common case: no double quote inside, so double quotes delimit safely.
                            builder.Append("=\"").Append(value).Append('"');
                        }
                        else if (value.IndexOf('\'') < 0)
                        {
                            // Value holds double quotes only: single quotes delimit safely.
                            builder.Append("='").Append(value).Append('\'');
                        }
                        else
                        {
                            // Both quote kinds are present; neither delimiter is safe as-is,
                            // so escape the double quotes and delimit with them.
                            builder.Append("=\"").Append(value.Replace("\"", "&quot;")).Append('"');
                        }
                    }
                }
                builder.Append('>');
                break;

            case HtmlNodeType.EndTag:
                builder.Append("</");
                builder.Append(HtmlTags.GetTagName(node.TagId));
                builder.Append('>');
                break;

            case HtmlNodeType.Text:
                string normalized = HtmlUtility.NormalizeText(html, node.Index, node.Length);
                if (normalized != null)
                {
                    builder.Append(normalized);
                }
                break;

            case HtmlNodeType.Comment:
                // Comments are copied verbatim from the source text.
                builder.Append(html.Substring(node.Index, node.Length));
                break;

            default:
                Debug.Assert(false, "Unexpected HtmlNodeType in ToString");
                break;
            }

            return builder.ToString();
        }
        /// <summary>
        /// Builds a compact textual signature of a run of parsed nodes.
        /// </summary>
        /// <remarks>
        /// The requested range is widened by one node on each side when such a node exists.
        /// Elements contribute <c>&lt;tag&gt;</c>, or <c>&lt;tag class=VALUE&gt;</c> when a class
        /// attribute is present (the value is written unquoted). End tags contribute
        /// <c>&lt;/tag&gt;</c>, text nodes contribute a single underscore, and every other
        /// node type contributes nothing.
        /// </remarks>
        /// <param name="startTag">Index of the first node of interest.</param>
        /// <param name="endTag">Index of the last node of interest.</param>
        /// <returns>The concatenated signature for the widened range.</returns>
        private string GetNodeStack(int startTag, int endTag)
        {
            // Include one node of context after the range, if there is one.
            if (endTag < parsedHtml.Nodes.Count - 1)
            {
                endTag++;
            }
            // Include one node of context before the range, if there is one.
            if (startTag > 0)
            {
                startTag--;
            }

            // Accumulate in a builder; repeated string concatenation in the loop is quadratic.
            System.Text.StringBuilder nodeStack = new System.Text.StringBuilder();

            for (int index = startTag; index <= endTag; index++)
            {
                HtmlNode node = GetNode(index);
                switch (node.NodeType)
                {
                case HtmlNodeType.Element:
                {
                    // Look the class attribute up once and reuse the result.
                    string classValue = node.GetAttributeValue(HtmlAttributeId.Class);

                    nodeStack.Append('<').Append(HtmlTags.GetTagName(node.TagId));
                    if (classValue != null)
                    {
                        nodeStack.Append(" class=").Append(classValue);
                    }
                    nodeStack.Append('>');
                    break;
                }

                case HtmlNodeType.EndTag:
                    nodeStack.Append("</").Append(HtmlTags.GetTagName(node.TagId)).Append('>');
                    break;

                case HtmlNodeType.Text:
                    // Text content is irrelevant to the signature; only its presence is recorded.
                    nodeStack.Append('_');
                    break;
                }
            }

            return nodeStack.ToString();
        }
Beispiel #3
0
        /// <summary>
        /// Walks the parsed nodes and writes one "category&lt;TAB&gt;url" line to a fixed output
        /// file for every list item whose class attribute is "title".
        /// </summary>
        /// <remarks>
        /// Inside such a list item, the href of an anchor (prefixed with the site root) becomes
        /// the url and the most recent text node becomes the category; the line is written when
        /// the closing <c>li</c> tag is reached.
        /// NOTE(review): category and url are not cleared between records, so an item lacking an
        /// anchor or text repeats the previous item's value — confirm whether that is intended.
        /// </remarks>
        /// <param name="html">Source document text that the nodes' Index/Length refer to.</param>
        /// <param name="nodes">Parsed nodes of <paramref name="html"/>, in document order.</param>
        /// <exception cref="System.ArgumentNullException"><paramref name="nodes"/> is null.</exception>
        public static void RootRightpath(string html, List <HtmlNode> nodes)
        {
            // Reject bad input before the output file is created/truncated.
            if (nodes == null)
            {
                throw new System.ArgumentNullException("nodes");
            }

            const string outputPath = @"D:\265.com\L1\RightL1Categories.txt";
            const string site       = @"http://www.265.com";

            bool   insideTitleItem = false;
            string url = "", category = "";

            // 'using' guarantees the writer is closed even if the loop throws.
            using (StreamWriter sw = new StreamWriter(outputPath))
            {
                for (int i = 0; i < nodes.Count; i++)
                {
                    HtmlNode node = nodes[i];

                    if (node.NodeType == HtmlNodeType.Element)
                    {
                        if (node.Attributes == null)
                        {
                            continue;
                        }

                        string tagName = HtmlTags.GetTagName(node.TagId);

                        // Ordinal, case-insensitive: ToLower() is culture-sensitive and would
                        // fail to match "LI" under Turkish casing rules.
                        if (string.Equals(tagName, "li", System.StringComparison.OrdinalIgnoreCase))
                        {
                            for (int k = 0; k < node.Attributes.Count; k++)
                            {
                                if (node.Attributes[k].Name == "class" && node.Attributes[k].Value == "title")
                                {
                                    insideTitleItem = true;
                                    break;
                                }
                            }
                        }

                        if (insideTitleItem && string.Equals(tagName, "a", System.StringComparison.OrdinalIgnoreCase))
                        {
                            // No early exit: if href appears more than once, the last one wins.
                            for (int k = 0; k < node.Attributes.Count; k++)
                            {
                                if (node.Attributes[k].Name == "href")
                                {
                                    url = site + node.Attributes[k].Value;
                                }
                            }
                        }
                    }
                    else if (node.NodeType == HtmlNodeType.Text)
                    {
                        if (insideTitleItem)
                        {
                            category = html.Substring(node.Index, node.Length);
                        }
                    }
                    else if (node.NodeType == HtmlNodeType.EndTag)
                    {
                        string tagName = HtmlTags.GetTagName(node.TagId);
                        if (insideTitleItem && string.Equals(tagName, "li", System.StringComparison.OrdinalIgnoreCase))
                        {
                            insideTitleItem = false;
                            sw.WriteLine("{0}\t{1}", category, url);
                            sw.Flush();
                        }
                    }
                }
            }
        }
Beispiel #4
0
        /// <summary>
        /// Walks the parsed nodes and writes "category&lt;TAB&gt;url" lines to
        /// <paramref name="filepath"/> for the links found in the listing sections.
        /// </summary>
        /// <remarks>
        /// A <c>div</c> with class "titleCS" arms category capture; the next text node becomes
        /// the category and switches to collecting. A <c>ul</c> with class "listUrl" also
        /// switches to collecting. While collecting, each anchor's href becomes the current url,
        /// each closing <c>li</c> writes a line, and a closing <c>ul</c> stops collecting.
        /// </remarks>
        /// <param name="html">Source document text that the nodes' Index/Length refer to.</param>
        /// <param name="nodes">Parsed nodes of <paramref name="html"/>, in document order.</param>
        /// <param name="filepath">Path of the output file; it is created or overwritten.</param>
        /// <exception cref="System.ArgumentNullException"><paramref name="nodes"/> is null.</exception>
        public static void Subpath(string html, List <HtmlNode> nodes, string filepath)
        {
            // Reject bad input before the output file is created/truncated.
            if (nodes == null)
            {
                throw new System.ArgumentNullException("nodes");
            }

            // Scanner states (the original used bare 0 / 1 / 2).
            const int Idle             = 0;
            const int AwaitingCategory = 1;
            const int Collecting       = 2;

            int    state = Idle;
            string category = "", url = "";

            // 'using' guarantees the writer is closed even if the loop throws.
            using (StreamWriter sw = new StreamWriter(filepath))
            {
                for (int i = 0; i < nodes.Count; i++)
                {
                    HtmlNode node = nodes[i];

                    if (node.NodeType == HtmlNodeType.Element)
                    {
                        if (node.Attributes == null)
                        {
                            continue;
                        }

                        string tagName = HtmlTags.GetTagName(node.TagId);

                        // Ordinal, case-insensitive comparisons: ToLower() is culture-sensitive
                        // and can mis-compare tag names containing 'I' under Turkish casing rules.
                        if (string.Equals(tagName, "div", System.StringComparison.OrdinalIgnoreCase))
                        {
                            for (int k = 0; k < node.Attributes.Count; k++)
                            {
                                if (node.Attributes[k].Name == "class" && node.Attributes[k].Value == "titleCS")
                                {
                                    state = AwaitingCategory;
                                }
                            }
                        }
                        else if (string.Equals(tagName, "ul", System.StringComparison.OrdinalIgnoreCase))
                        {
                            for (int k = 0; k < node.Attributes.Count; k++)
                            {
                                if (node.Attributes[k].Name == "class" && node.Attributes[k].Value == "listUrl")
                                {
                                    state = Collecting;
                                }
                            }
                        }
                        else if (state == Collecting && string.Equals(tagName, "a", System.StringComparison.OrdinalIgnoreCase))
                        {
                            // No early exit: if href appears more than once, the last one wins.
                            for (int k = 0; k < node.Attributes.Count; k++)
                            {
                                if (node.Attributes[k].Name == "href")
                                {
                                    url = node.Attributes[k].Value;
                                }
                            }
                        }
                    }
                    else if (node.NodeType == HtmlNodeType.Text)
                    {
                        if (state == AwaitingCategory)
                        {
                            // The first text node after the title div names the category.
                            category = html.Substring(node.Index, node.Length);
                            state    = Collecting;
                        }
                    }
                    else if (node.NodeType == HtmlNodeType.EndTag)
                    {
                        if (state != Collecting)
                        {
                            continue;
                        }

                        string tagName = HtmlTags.GetTagName(node.TagId);
                        if (string.Equals(tagName, "li", System.StringComparison.OrdinalIgnoreCase))
                        {
                            sw.WriteLine("{0}\t{1}", category, url);
                            sw.Flush();
                        }
                        else if (string.Equals(tagName, "ul", System.StringComparison.OrdinalIgnoreCase))
                        {
                            state = Idle;
                        }
                    }
                }
            }
        }