/// <summary>
        /// Walks <paramref name="node"/>, its following siblings and all of their descendants in
        /// document order, appending a description of every start tag to <c>parseResult</c>.
        /// </summary>
        /// <remarks>
        /// Every node is visited exactly once: the loop advances along the sibling chain and the
        /// recursion only descends into the first child, whose own call then covers the remaining
        /// children. Tags whose start position is already recorded in <c>start</c> are not
        /// reported again.
        /// </remarks>
        /// <param name="node">First node of the sibling chain to walk; <c>null</c> is a no-op.</param>
        private void paserData(INode node)
        {
            for (INode current = node; current != null; current = current.NextSibling)
            {
                ITag tag = getTag(current);

                if (tag != null && !tag.IsEndTag() && !start.Contains(tag.StartPosition))
                {
                    object oId    = tag.GetAttribute("ID");
                    object oName  = tag.GetAttribute("name");
                    object oClass = tag.GetAttribute("class");
                    parseResult += tag.TagName + ":\r\nID:" + oId + " Name:" + oName
                                   + " Class:" + oClass + " StartPosition:" + tag.StartPosition.ToString() + "\r\n";
                    start.Add(tag.StartPosition);
                }

                // Child nodes: descending into the first child is enough, because that call
                // walks the rest of the children through its own sibling loop.
                if (current.Children != null && current.Children.Count > 0)
                {
                    paserData(current.FirstChild);
                }
            }
        }
Пример #2
0
        /// <summary>
        /// Iterates the nodes returned by <c>GetChildren()</c> and returns the first tag whose
        /// <c>NAME</c> attribute equals <paramref name="name"/>.
        /// </summary>
        /// <param name="name">Value the tag's <c>NAME</c> attribute has to equal.</param>
        /// <returns>The first matching tag, or <c>null</c> when no tag matches.</returns>
        public virtual ITag SearchByName(System.String name)
        {
            ISimpleNodeIterator children = GetChildren();

            while (children.HasMoreNodes())
            {
                // Nodes that are not tags carry no attributes and are skipped.
                ITag candidate = children.NextNode() as ITag;
                if (candidate == null)
                {
                    continue;
                }

                System.String attributeValue = candidate.GetAttribute("NAME");
                if (attributeValue != null && attributeValue.Equals(name))
                {
                    return candidate;
                }
            }

            return null;
        }
Пример #3
0
        /// <summary>
        /// Parses <paramref name="htmlContent"/> with the <c>nodefilter</c> field and collects one
        /// string per matching tag.
        /// </summary>
        /// <remarks>
        /// When <c>needValue</c> is <c>"href"</c> the tag's attribute of that name is collected
        /// (which may be <c>null</c> if the tag lacks it); otherwise the text of the tag's first
        /// child is collected. Matches that are not tags, and tags without any child in text
        /// mode, are skipped instead of raising a <see cref="NullReferenceException"/>.
        /// </remarks>
        /// <param name="htmlContent">HTML markup to parse.</param>
        /// <returns>The collected values, in the order the matches were found.</returns>
        public List <String> start(String htmlContent)
        {
            Parser parser = new Parser();

            parser.InputHTML = htmlContent;
            NodeList      nodelist = parser.Parse(nodefilter);
            int           size     = nodelist.Size();
            List <String> results  = new List <String>(size);

            for (int i = 0; i < size; i++)
            {
                // The filter may match nodes that are not tags; only tags expose attributes.
                ITag tag = nodelist.ElementAt(i) as ITag;
                if (tag == null)
                {
                    continue;
                }

                if (needValue == "href")
                {
                    results.Add(tag.GetAttribute(needValue));
                    continue;
                }

                INode firstChild = tag.FirstChild;
                if (firstChild != null)
                {
                    results.Add(firstChild.GetText());
                }
            }
            return(results);
        }
Пример #4
0
        /// <summary>
        /// Filter predicate: decides whether a node satisfies every configured rule.
        /// </summary>
        /// <param name="node">Node to test.</param>
        /// <returns>
        /// <c>true</c> when <paramref name="node"/> is a tag with at least one attribute and, for
        /// every entry of <c>mRules</c>, the tag's attribute named by the entry's key equals the
        /// entry's value; <c>false</c> otherwise.
        /// </returns>
        public bool Accept(INode node)
        {
            ITag tagNode = node as ITag;
            if (tagNode == null)
            {
                return false;
            }

            if (tagNode.Attributes == null || tagNode.Attributes.Count <= 0)
            {
                return false;
            }

            // Count the rules that fail; the node is accepted only when none do.
            int mismatches = 0;
            foreach (var rule in mRules)
            {
                String actual = tagNode.GetAttribute(rule.Key);
                if (actual == rule.Value)
                {
                    continue;
                }
                mismatches++;
            }

            return mismatches == 0;
        }
Пример #5
0
 /// <summary>
 /// Filter predicate: accepts tags whose attribute named by the <c>key</c> field starts with
 /// the <c>value</c> field.
 /// </summary>
 /// <param name="node">Node to test.</param>
 /// <returns>
 /// <c>true</c> when <paramref name="node"/> is a tag that has the attribute and its value
 /// starts with <c>value</c>; <c>false</c> for non-tag nodes and for tags lacking the attribute.
 /// </returns>
 public bool Accept(INode node)
 {
     ITag tag = node as ITag;
     if (tag == null)
     {
         return(false);
     }

     // A tag without the attribute yields null; treat that as "no match" rather than
     // dereferencing it.
     string attributeValue = tag.GetAttribute(key);
     if (attributeValue == null)
     {
         return(false);
     }

     return(attributeValue.StartsWith(value));
 }
Пример #6
0
        /// <summary>
        /// Parses <paramref name="HtmlString"/>, takes the <paramref name="num"/>-th <c>dd</c>
        /// element and reads the <c>href</c> and <c>title</c> attributes of the tag obtained from
        /// the sibling that follows the element's first child.
        /// </summary>
        /// <param name="HtmlString">HTML markup to parse.</param>
        /// <param name="num">Zero-based index of the <c>dd</c> element to inspect.</param>
        /// <param name="href">Receives the <c>href</c> attribute, or an empty string when it cannot be read.</param>
        /// <param name="title">Receives the <c>title</c> attribute, or an empty string when it cannot be read.</param>
        public static void parseIndexHtml(string HtmlString, int num, out string href, out string title)
        {
            href  = "";
            title = "";

            Parser parser = Parser.CreateParser(HtmlString, "utf-8");

            // Only the tag name is matched.
            // NOTE(review): a stricter "dd inside dl.txt_box" filter used to be constructed here
            // and thrown away without being used; confirm whether it was meant to replace this one.
            NodeFilter filter = new TagNameFilter("dd");
            NodeList   nodes  = parser.Parse(filter);

            if (num < 0 || nodes.Size() <= num)
            {
                return;
            }

            INode firstChild = nodes[num].FirstChild;
            if (firstChild == null || firstChild.NextSibling == null)
            {
                return;
            }

            ITag tag = getTag(firstChild.NextSibling);
            if (tag == null)
            {
                return;
            }

            // Keep the "never null" contract established by the initial assignments above.
            href  = tag.GetAttribute("href") ?? "";
            title = tag.GetAttribute("title") ?? "";
        }
Пример #7
0
        /// <summary>
        /// Downloads a fixed qisuu.la index page, collects every tag carrying an <c>href</c>
        /// attribute beneath the <c>pc_list</c> matches, then downloads each linked page and
        /// writes the text found under the element with id <c>content1</c> to
        /// <c>books/&lt;title&gt;</c>, echoing each title to the console.
        /// </summary>
        /// <remarks>
        /// Links without a leading text child and pages without a <c>content1</c> element are
        /// skipped (the latter with a message on standard error) instead of aborting the run.
        /// </remarks>
        static void GetMenus()
        {
            const string baseUrl = "https://www.qisuu.la/du/23/23361/";

            string   str       = GetHtml(baseUrl, true);
            Lexer    lexer     = new Lexer(str);
            Parser   parser    = new Parser(lexer);
            NodeList htmlNodes = parser.Parse(new HasAttributeFilter("class", "pc_list"));

            List <INode> nodes = new List <INode>();

            // NOTE(review): the loop starts at 1, so the first "pc_list" match is ignored;
            // presumably intentional, confirm against the page layout.
            for (int i = 1; i < htmlNodes.Count; i++)
            {
                ForeachNode(htmlNodes[i], (node) => {
                    // Non-tag nodes (e.g. text) have no attributes and must not be dereferenced.
                    ITag tag = node as ITag;
                    if (tag != null && tag.GetAttribute("href") != null)
                    {
                        nodes.Add(node);
                    }
                });
            }

            // File.WriteAllText does not create missing directories.
            Directory.CreateDirectory("books");

            foreach (INode node in nodes)
            {
                ITag   tag  = (ITag)node; // only tags were added above
                string href = tag.GetAttribute("href");

                IText titleNode = null;
                if (node.Children != null && node.Children.Count > 0)
                {
                    titleNode = node.Children[0] as IText;
                }
                if (titleNode == null)
                {
                    continue;
                }
                string title = titleNode.GetText();

                str       = GetHtml(baseUrl + href, true);
                lexer     = new Lexer(str);
                parser    = new Parser(lexer);
                htmlNodes = parser.Parse(new HasAttributeFilter("id", "content1"));

                if (htmlNodes.Count == 0 || htmlNodes[0].Children == null)
                {
                    Console.Error.WriteLine("content1 not found: " + href);
                    continue;
                }

                NodeList contentNodes = htmlNodes[0].Children;
                System.Text.StringBuilder result = new System.Text.StringBuilder();
                for (int i = 0; i < contentNodes.Count; i++)
                {
                    INode txt = contentNodes[i] as INode;
                    if (txt is IText)
                    {
                        string line = txt.GetText();
                        line = line.Replace("&nbsp;", "").Trim();
                        if (line != "")
                        {
                            result.Append(line).Append("\n");
                        }
                    }
                }
                Console.WriteLine(title);
                File.WriteAllText("books/" + title, result.ToString());
            }
        }
Пример #8
0
        /// <summary>
        /// Runs a fixed wallhaven.cc search and converts the thumbnail address of every
        /// <c>lazyload</c> match into the corresponding full-size image address.
        /// </summary>
        /// <remarks>
        /// The thumbnail's <c>.jpg</c> extension is replaced by <c>.png</c> when the tag found at
        /// <c>Parent.Children[3].Children[2]</c> has class <c>png</c>, the thumbnail host prefix is
        /// swapped for the full-size one, and the file name is prefixed with <c>wallhaven-</c>.
        /// Matches without a <c>data-src</c> attribute are skipped.
        /// </remarks>
        /// <returns>The full-size image addresses, in match order (also echoed to the console).</returns>
        static List <string> GetUrls()
        {
            List <string> ret = new List <string>();
            string        url = "https://wallhaven.cc/search?";
            Dictionary <string, string> dic = new Dictionary <string, string>()
            {
                { "q", "明日方舟" },
                { "categories", "111" },
                { "purity", "100" },
                { "resolutions", "1280x800,1600x1000,1920x1200,2560x1600,3840x2400" },
                { "sorting", "relevance" },
                { "order", "desc" },
            };

            foreach (KeyValuePair <string, string> pair in dic)
            {
                url += $"{pair.Key}={WebUtility.UrlEncode(pair.Value)}&";
            }

            // WebClient is disposable; release it as soon as the page has been fetched.
            string html;
            using (WebClient myWebClient = new WebClient())
            {
                html = myWebClient.DownloadString(url);
            }

            Lexer    lexer     = new Lexer(html);
            Parser   parser    = new Parser(lexer);
            NodeList htmlNodes = parser.Parse(new HasAttributeFilter("class", "lazyload"));

            for (int i = 0; i < htmlNodes.Count; i++)
            {
                INode  node = htmlNodes[i];
                ITag   tag  = (ITag)node;
                string src  = tag.GetAttribute("data-src");
                if (src == null)
                {
                    continue;
                }

                ITag   png = (ITag)(node.Parent.Children[3].Children[2]);
                string val = png.GetAttribute("class");
                val = val == "png" ? val : "jpg";

                src = src.Replace(".jpg", $".{val}");
                src = src.Replace("https://th.wallhaven.cc/small/", "https://w.wallhaven.cc/full/");

                // Prefix only the last path segment. Inserting at the position after the last
                // slash also works when there is no slash (index 0) or the segment is empty.
                src = src.Insert(src.LastIndexOf('/') + 1, "wallhaven-");

                Console.WriteLine(src);
                ret.Add(src);
            }
            return(ret);
        }
Пример #9
0
        /// <summary>
        /// Fetches a fixed qisuu.la writer page and prints, for the first <c>listBox</c> match,
        /// its heading text followed by one group of lines per book entry: title, image source,
        /// shortened link, description, author, status, size and word count.
        /// </summary>
        /// <remarks>
        /// All values are read through fixed child positions; entries sit at the odd indices of
        /// the list node.
        /// </remarks>
        static void AuthorBooks()
        {
            string   pageHtml = GetHtml("https://www.qisuu.la/Writer/13.html", true);
            Parser   parser   = new Parser(new Lexer(pageHtml));
            NodeList matches  = parser.Parse(new HasAttributeFilter("class", "listBox"));
            INode    listBox  = matches[0];

            Console.WriteLine(listBox.Children[1].Children[1].Children[0].GetText());

            INode bookList = listBox.Children[3];

            int index = 1;
            while (index < bookList.Children.Count)
            {
                INode entry    = bookList.Children[index];
                ITag  infoTag  = (ITag)entry.Children[1];
                ITag  linkTag  = (ITag)entry.Children[3];
                ITag  descTag  = (ITag)entry.Children[5];

                string bookTitle  = linkTag.Children[1].GetText();
                ITag   coverTag   = (ITag)linkTag.Children[0];
                string bookImgSrc = coverTag.GetAttribute("src");
                string rawHref    = linkTag.GetAttribute("href");
                string bookHref   = rawHref.Replace("/Shtml", "").Replace(".html", "");
                string bookDesc   = descTag.Children[0].GetText();
                Console.WriteLine(bookTitle);
                Console.WriteLine(bookImgSrc);
                Console.WriteLine(bookHref);
                Console.WriteLine(bookDesc);

                // The author line is split across a text node and the text of a nested node.
                string authorLabel = infoTag.Children[0].GetText();
                string author      = authorLabel + infoTag.Children[1].Children[0].GetText();
                string status      = infoTag.Children[3].GetText();
                string size        = infoTag.Children[5].GetText();
                string words       = infoTag.Children[7].GetText();
                Console.WriteLine(author);
                Console.WriteLine(status);
                Console.WriteLine(size);
                Console.WriteLine(words);
                Console.WriteLine();

                index += 2;
            }
        }
Пример #10
0
 /// <summary>
 /// Parses the first-level page and returns the URL of the second-level page.
 /// </summary>
 /// <param name="url">Address handed to <c>GetNodeList</c> together with the <c>li.hui3</c> filter.</param>
 /// <returns>
 /// <c>http://www.cjmsa.gov.cn</c> followed by the <c>href</c> of the last child of the first
 /// matching list item, or the literal <c>"failed"</c> when any exception occurs.
 /// </returns>
 public static string GetUrl(string url)
 {
     try
     {
         // Filter: every <li> element whose class is "hui3".
         NodeFilter listItemFilter = new AndFilter(new TagNameFilter("li"), new HasAttributeFilter("class", "hui3"));
         NodeList   listItems      = GetNodeList(url, listItemFilter);

         // The first entry is the most recent one; its last child carries the link.
         ITag newestItem = listItems[0] as ITag;
         ITag linkTag    = newestItem.LastChild as ITag;

         return "http://www.cjmsa.gov.cn" + linkTag.GetAttribute("href");
     }
     catch (Exception ex)
     {
         Console.WriteLine("解析第一层出错" + ex.Message);
         return("failed");
     }
 }
Пример #11
0
 /// <summary>
 /// Copies the tag's <c>ID</c> and <c>NAME</c> attribute values into <c>m_strId</c> and
 /// <c>m_strName</c>.
 /// </summary>
 /// <param name="obTag">Tag whose attributes are read.</param>
 internal virtual void ConvertFromTag(ITag obTag)
 {
     m_strId   = obTag.GetAttribute("ID");
     m_strName = obTag.GetAttribute("NAME");
 }
Пример #12
0
        /// <summary>
        /// Downloads the eLong Shanghai attractions page, extracts the <c>TITLE</c> attribute of
        /// each entry under the first <c>div.zyjd-li</c> match, saves the names to
        /// <c>data1.xml</c> and exposes them, each followed by ';', through <c>ViewBag.Message</c>.
        /// </summary>
        /// <remarks>
        /// The response, its stream and the reader are released even when reading fails. Entries
        /// that lack the expected child tag or <c>TITLE</c> attribute are skipped, and a page
        /// without the expected container produces an XML document with an empty root.
        /// </remarks>
        /// <returns>The default view for this action.</returns>
        public ActionResult YiLong()
        {
            List <string> secnicNameList = new List <string>();

            string strBaidu = "http://trip.elong.com/shanghai/jingdian/";
            Uri    uri      = new Uri(strBaidu);

            httpWebRequest = (HttpWebRequest)WebRequest.Create(uri);
            // Send the request and obtain the response.
            httpWebResponse = (HttpWebResponse)httpWebRequest.GetResponse();

            string strResult;
            try
            {
                // Read the whole response body as UTF-8.
                using (Stream stream = httpWebResponse.GetResponseStream())
                using (StreamReader sr = new StreamReader(stream, Encoding.UTF8))
                {
                    strResult = sr.ReadToEnd();
                }
            }
            finally
            {
                httpWebResponse.Close();
            }

            // Parse the HTML with Winista.HtmlParser.
            Parser     parser    = Parser.CreateParser(strResult, "utf-8");
            NodeFilter filterDiv = new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "zyjd-li"));
            NodeList   nodeList  = parser.Parse(filterDiv);

            NodeList subNodeList = null;
            if (nodeList.Count > 0 && nodeList[0].Children != null && nodeList[0].Children.Count > 1)
            {
                subNodeList = nodeList[0].Children[1].Children;
            }

            if (subNodeList != null)
            {
                for (int i = 0; i < subNodeList.Count; i++)
                {
                    ITag tempTag = subNodeList[i] as ITag;
                    if (tempTag == null || tempTag.Children == null || tempTag.Children.Count == 0)
                    {
                        continue;
                    }

                    ITag titleTag = tempTag.Children[0] as ITag;
                    if (titleTag == null)
                    {
                        continue;
                    }

                    string scenicName = titleTag.GetAttribute("TITLE");
                    if (scenicName != null)
                    {
                        secnicNameList.Add(scenicName);
                    }
                }
            }

            StringBuilder title = new StringBuilder();
            foreach (string scenicName in secnicNameList)
            {
                title.Append(scenicName).Append(";");
            }

            // Build the XML document: declaration first, then the <scenic> root.
            xmlDoc = new XmlDocument();
            XmlDeclaration xmldecl = xmlDoc.CreateXmlDeclaration("1.0", "gb2312", null);
            xmlDoc.AppendChild(xmldecl);

            xmlElement = xmlDoc.CreateElement("", "scenic", "");
            xmlDoc.AppendChild(xmlElement);

            foreach (string scenicName in secnicNameList)
            {
                // <item><scenicname>name</scenicname></item>
                XmlElement xeItem = xmlDoc.CreateElement("item");
                XmlElement xeName = xmlDoc.CreateElement("scenicname");
                xeName.InnerText = scenicName;
                xeItem.AppendChild(xeName);
                xmlElement.AppendChild(xeItem);
            }

            // Persist the finished document.
            xmlDoc.Save(Server.MapPath("data1.xml"));

            ViewBag.Message = title.ToString();

            return(View());
        }
Пример #13
0
        /// <summary>
        /// Crawls the yododo.com question list: follows each link in the first list item of the
        /// area list (except "全部" and "中国"), reads the area list of the linked page, and hands
        /// every link whose text equals the name of a level-3 route class to <c>paserData</c>.
        /// Finally reports "success" through <c>Print</c>.
        /// </summary>
        /// <param name="context">HTTP context passed on to <c>Print</c>.</param>
        private void GrapBaiduMsg(HttpContext context)
        {
            string listPageHtml = GetHtmlStr("http://www.yododo.com/ask/list/");

            ClassLibrary.BLL.RouteClass          routeClassBll = new ClassLibrary.BLL.RouteClass();
            List <ClassLibrary.Model.RouteClass> routeClasses  = routeClassBll.GetModelList("classLevel = 3");

            // Filters shared by every page parsed below.
            NodeFilter filterUL = new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "miniarea-list clearfix"));
            NodeFilter filterLI = new TagNameFilter("li");
            NodeFilter filterA  = new NodeClassFilter(typeof(ATag));

            // Area list of the entry page -> its <li> items -> links of the first item.
            NodeList areaLists    = Parser.CreateParser(listPageHtml, "utf-8").Parse(filterUL);
            string   areaListHtml = areaLists[0].ToHtml();

            NodeList listItems     = Parser.CreateParser(areaListHtml, "utf-8").Parse(filterLI);
            string   firstItemHtml = listItems[0].ToHtml();

            NodeList continentLinks = Parser.CreateParser(firstItemHtml, "utf-8").Parse(filterA);

            for (int i = 0; i < continentLinks.Count; i++)
            {
                ITag   continentTag  = getTag(continentLinks[i]);
                string continentUrl  = "http://www.yododo.com" + continentTag.GetAttribute("href"); // per continent
                string continentName = continentTag.ToPlainTextString();

                if (continentName != "全部" && continentName != "中国")
                {
                    Parser   continentParser = Parser.CreateParser(GetHtmlStr(continentUrl), "utf-8");
                    NodeList continentLists  = continentParser.Parse(filterUL);
                    string   continentHtml   = continentLists[0].ToHtml();

                    NodeList areaLinks = Parser.CreateParser(continentHtml, "utf-8").Parse(filterA);
                    for (int j = 0; j < areaLinks.Count; j++)
                    {
                        ITag   areaTag  = getTag(areaLinks[j]);
                        string areaUrl  = "http://www.yododo.com" + areaTag.GetAttribute("href") + "s1"; // "s1" selects resolved questions
                        string areaName = areaTag.ToPlainTextString();

                        if (areaName != "全部")
                        {
                            ClassLibrary.Model.RouteClass match = routeClasses.Find(rc => rc.ClassName == areaName);
                            if (match != null)
                            {
                                paserData(areaLinks[j], areaUrl, match.ID);
                            }
                        }
                    }
                }
            }

            Print(context, "success");
        }
Пример #14
0
        /// <summary>
        /// Downloads <paramref name="url"/>, passes the HTML-decoded page to <c>download_pic</c>,
        /// then follows every <c>a</c> tag whose <c>href</c> is not yet in <c>links</c>: the href
        /// is recorded, logged to the console and to <c>e:/Photos/crawl_log.txt</c>, and the
        /// target under <c>http://taylorpictures.net/</c> is crawled recursively.
        /// </summary>
        /// <remarks>
        /// The response, stream and reader are released before recursing, so network resources
        /// are not held open for the depth of the crawl. Anchors without an <c>href</c> are
        /// skipped. Web failures print "404"; any other failure prints its message instead of
        /// being reported as "404". A non-OK status prints "Error".
        /// </remarks>
        /// <param name="url">Absolute address of the page to crawl.</param>
        static void download_url(string url)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

            request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393";
            request.Timeout   = 30000;
            try
            {
                string html;
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    if (response.StatusCode != HttpStatusCode.OK)
                    {
                        Console.WriteLine("Error");
                        return;
                    }

                    using (Stream s = response.GetResponseStream())
                    using (StreamReader sr = new StreamReader(s, Encoding.UTF8))
                    {
                        html = sr.ReadToEnd();
                    }
                }

                string encode = HttpUtility.HtmlDecode(html);
                download_pic(encode);

                Lexer      lexer      = new Lexer(encode);
                Parser     par        = new Parser(lexer);
                NodeFilter nodefilter = new TagNameFilter("a");
                NodeList   nodes      = par.ExtractAllNodesThatMatch(nodefilter);

                for (int i = 0; i < nodes.Count; i++)
                {
                    ITag tag = nodes[i] as ITag;
                    if (tag == null)
                    {
                        continue;
                    }

                    string href = tag.GetAttribute("href");
                    if (href == null || links.Contains(href))
                    {
                        continue;
                    }

                    // Record the link before descending so cyclic references are not followed again.
                    links.Add(href);

                    string target = "http://taylorpictures.net/" + href;
                    Console.WriteLine("accessing " + target);
                    using (FileStream fs = new FileStream(@"e:/Photos/crawl_log.txt", FileMode.Append))
                    {
                        byte[] bytes = Encoding.UTF8.GetBytes("accessing " + target + "\r\n");
                        fs.Write(bytes, 0, bytes.Length);
                    }
                    download_url(target);
                }
            }
            catch (WebException)
            {
                Console.WriteLine("404");
            }
            catch (Exception ex)
            {
                // Best effort: a failure on one page must not stop the crawl, but it should
                // not be reported as a missing page either.
                Console.WriteLine("Error: " + ex.Message);
            }
        }
Пример #15
0
        /// <summary>
        /// Runs a fixed site search for "唐家三少" and prints, for every result block
        /// (<c>class="result f s0"</c>), the shortened link target, the title text and the
        /// abstract text, followed by an empty line.
        /// </summary>
        /// <remarks>
        /// The keyword is URL-encoded because it is placed in a query string. Result blocks that
        /// lack the expected child positions are skipped, and path segments are only extracted
        /// when the link really has that many segments.
        /// </remarks>
        static void SearchBook()
        {
            string   query     = WebUtility.UrlEncode("唐家三少");
            string   url       = "http://zhannei.baidu.com/cse/site/?cc=qisuu.la&s=11735927224209550458&q=" + query;
            string   html      = GetHtml(url);
            Lexer    lexer     = new Lexer(html);
            Parser   parser    = new Parser(lexer);
            NodeList htmlNodes = parser.Parse(new HasAttributeFilter("class", "result f s0"));

            for (int i = 0; i < htmlNodes.Count; i++)
            {
                INode node = htmlNodes[i];
                if (node.Children == null || node.Children.Count < 2)
                {
                    continue;
                }

                INode nodeTitle = node.Children[1];
                if (nodeTitle.Children == null || nodeTitle.Children.Count < 2)
                {
                    continue;
                }

                INode  nodeATitle = nodeTitle.Children[1];
                ITag   aTag       = nodeATitle as ITag;
                object attribute  = aTag != null ? aTag.GetAttribute("href") : null;
                if (attribute != null)
                {
                    string href = attribute.ToString().Trim();

                    // Strip the scheme, then any "/?query" tail, then the host part.
                    href = Regex.Replace(href, "^http.*//", "");
                    int queryIdx = href.IndexOf("/?");
                    if (queryIdx != -1)
                    {
                        href = href.Substring(0, queryIdx);
                    }
                    int hostIdx = href.IndexOf("/");
                    if (hostIdx != -1)
                    {
                        href = href.Substring(hostIdx + 1);
                    }

                    href = href.Replace("Shtml", "");
                    href = href.Replace(".html", "");
                    if (href.Contains("du"))
                    {
                        string[] parts = href.Split('/');
                        if (parts.Length > 2)
                        {
                            href = parts[2];
                        }
                    }
                    if (href.Contains("Writer"))
                    {
                        string[] parts = href.Split('/');
                        if (parts.Length > 1)
                        {
                            href = parts[1];
                        }
                        Console.WriteLine("作者");
                    }
                    if (href != "")
                    {
                        Console.WriteLine($"href:{href}");
                    }
                }

                // Title: concatenation of the link's direct text children.
                string title = "";
                if (nodeATitle.Children != null)
                {
                    for (int j = 0; j < nodeATitle.Children.Count; j++)
                    {
                        INode tempNode = nodeATitle.Children[j];
                        if (tempNode is IText)
                        {
                            title += tempNode.GetText().Trim();
                        }
                    }
                }
                title = title.Trim();
                if (title != "")
                {
                    Console.WriteLine($"title:{title}");
                }

                // Abstract: text children of every tag with class "c-abstract" under the fourth child.
                if (node.Children.Count > 3)
                {
                    INode nodeDiv = node.Children[3];
                    ForeachNode(nodeDiv, (_node) =>
                    {
                        // Non-tag nodes carry no attributes.
                        ITag tag = _node as ITag;
                        if (tag == null)
                        {
                            return;
                        }

                        object _class = tag.GetAttribute("class");
                        if (_class == null || _class.ToString() != "c-abstract" || tag.Children == null)
                        {
                            return;
                        }

                        string desc = "";
                        for (int j = 0; j < tag.Children.Count; j++)
                        {
                            INode tempNode = tag.Children[j];
                            if (tempNode is IText)
                            {
                                desc += tempNode.GetText().Trim();
                            }
                        }
                        desc = desc.Trim();
                        if (desc != "")
                        {
                            Console.WriteLine($"desc:{desc}");
                        }
                    });
                }

                Console.WriteLine();
            }
        }