private string HtmlText(string sourceHtml)
        {
            hParser.Parser parser = hParser.Parser.CreateParser(sourceHtml.Replace(System.Environment.NewLine, ""), "utf-8");

            StringBuilder builderHead = new StringBuilder();
            StringBuilder builderBody = new StringBuilder();


            hParser.NodeFilter html  = new TagNameFilter("TR");
            hParser.INode      nodes = parser.Parse(html)[0];
            builderHead.Append(nodes.Children[0].ToHtml());
            hParser.INode body = nodes.Children[1];
            hParser.INode div  = body.Children[0];


            for (int i = 0; i < div.Children.Count; i++)
            {
                if (div.Children[i] is hParser.ITag)
                {
                    builderBody.Append(div.Children[i].ToHtml());
                }
            }

            StringBuilder builder = new StringBuilder();

            builder.Append("<html>");
            builder.Append(builderHead.ToString());
            builder.Append("<body>");
            builder.Append(string.Format("<{0}>", div.GetText()));
            builder.Append(builderBody.ToString());
            builder.Append("</div>");
            builder.Append("</body>");
            builder.Append("</html>");
            return(builder.ToString());
        }
Пример #2
0
 static void GetBlogLink(string htmlContent)
 {
     Lexer lexer = new Lexer(htmlContent);
     Parser parser = new Parser(lexer);
     NodeList articleList = parser.Parse(articleFilter);
     if (articleList.Count == 1)
     {
         NodeList candidateNodeList = articleList[0].Children.ExtractAllNodesThatMatch(wrapFilter, true);
         for (int i = 0; i < candidateNodeList.Count; i++)
         {
             NodeList linkNodeList = candidateNodeList[i].Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ATag)), false);
             if (linkNodeList.Count == 1)
             {
                 string blogLink = ((ATag)linkNodeList[0]).ExtractLink();
                 blogLinkList.Add(blogLink);
             }
             else
             {
                 Console.WriteLine("第" + i + "个条目中,判断链接出错!");
             }
         }
     }
     else
     {
         Console.WriteLine("获取包含日志列表出错!");
     }
 }
Пример #3
0
 /// <summary>
 /// 获取目标数据
 /// </summary>
 /// <param name="parser">目标html文件</param>
 /// <param name="tag">标签名称</param>
 /// <param name="attribute">标签里面的属性名称</param>
 /// <param name="attValue">属性的值</param>
 /// <returns>标签内的目标数据</returns>
 public static string getValue(string html, string tag, string attribute, string attValue)
 {
     Lexer lexer = new Lexer(html);
     Parser parser = new Parser(lexer);
     string value = string.Empty;
     NodeFilter nodeFilter = new TagNameFilter(tag);
     NodeList nodeList = parser.Parse(nodeFilter);
     for (int i = 0; i < nodeList.Count; i++)
     {
         INode node = nodeList[i];
         ITag tagNode = (node as ITag);
         if (tagNode.Attributes != null && tagNode.Attributes.Count > 0)
         {
             foreach (string key in tagNode.Attributes.Keys)
             {
                 if (key.Contains("<TAGNAME>"))
                 {
                     continue;
                 }
                 if (key.Contains(attribute))
                 {
                     if (tagNode.Attributes[key].ToString() == attValue)
                     {
                         value = tagNode.ToPlainTextString();
                         return value;
                     }
                 }
             }
         }
     }
     return null;
 }
Пример #4
0
        /// <summary>
        /// 增加一条数据
        /// </summary>
        public string Add(string scheduleID, string companyids, string historyids,DateTime time)
        {
            WebClientBLL bll = new WebClientBLL();
            string[] companyidArr = companyids.Split(',');
            string[] historyidArr = historyids.Split(',');
            int count = 0;
            if (companyidArr.Length == historyidArr.Length)
            {
                dal.Delete(scheduleID);
                for (int i = 0; i < companyidArr.Length; i++)
                {
                    string s = bll.GetOddsHistoryContent(historyidArr[i]);

                    Lexer lexer = new Lexer(s);
                    Parser parser = new Parser(lexer);
                    NodeList bodyNodes = parser.Parse(new TagNameFilter("HTML"))[0].Children.ExtractAllNodesThatMatch(new TagNameFilter("BODY"))[0].Children;
                    ITag table = bodyNodes.SearchFor(typeof(Winista.Text.HtmlParser.Tags.TableTag))[0] as ITag;

                    NodeList tableRows = table.Children.SearchFor(typeof(Winista.Text.HtmlParser.Tags.TableRow));

                    for (int f = 0; f < tableRows.Count; f++)
                    {
                        ITag row = tableRows[f] as ITag;
                        if (row.Attributes["ALIGN"].Equals("center") && row.Attributes["BGCOLOR"].Equals("#FFFFFF")){
                            Odds1x2History model = new Odds1x2History();
                            model.companyid = int.Parse(companyidArr[i]);
                            model.scheduleid = int.Parse(scheduleID);
                            model.home = float.Parse(row.Children[0].ToPlainTextString());
                            model.draw = float.Parse(row.Children[1].ToPlainTextString());
                            model.away = float.Parse(row.Children[2].ToPlainTextString());
                            this.FillOdds1x2History(model);
                            string[] t2 = row.Children[3].ToPlainTextString().Replace("showtime(", "").Replace(")", "").Split(',');
                            int yy = int.Parse(t2[0]);
                            int mm = int.Parse(t2[1].Remove(2));
                            int dd = int.Parse(t2[2]);
                            int hh = int.Parse(t2[3]);
                            int mi = int.Parse(t2[4]);
                            int ss = int.Parse(t2[5]);
                            model.time = new DateTime(yy, mm, dd, hh, mi, ss, DateTimeKind.Utc).AddHours(8d);
                            if (model.time > time)
                            {
                                continue;
                            }
                            dal.Add(model);
                            count++;
                        }
                    }
                }
            }
            JSONHelper json = new JSONHelper();
            json.success = true;
            json.totlalCount = count;
            return json.ToString();
        }
Пример #5
0
 static void GetStoryOfRevolution()
 {
     StreamReader reader = new StreamReader("catalogue.htm");
     Lexer lexer = new Lexer(reader.ReadToEnd());
     Parser parser = new Parser(lexer);
     HasAttributeFilter linkFilterByParent = new HasAttributeFilter("class", "row zhangjieUl");
     HasAttributeFilter linkFilterByClass = new HasAttributeFilter("class", "fontStyle2 colorStyleLink");
     AndFilter linkFilter = new AndFilter(new HasParentFilter(linkFilterByParent, true), linkFilterByClass);
     NodeList linkNodeList = parser.Parse(linkFilter);
     List<string> linkUrlList = new List<string>(linkNodeList.Size());
     List<string> chapterHtmlContentList = new List<string>(linkNodeList.Size());
     HttpWebRequest httpWebRequest;
     StreamReader chapterReader = null;
     for (int i = 0; i < linkNodeList.Size(); i++)
     {
         ATag linkNode = (ATag)linkNodeList[i];
         linkUrlList.Add(linkNode.Link);
         httpWebRequest = HttpWebRequest.CreateHttp("http://www.mlxiaoshuo.com" + linkUrlList[linkUrlList.Count - 1]);
         chapterReader = new StreamReader(new BufferedStream(httpWebRequest.GetResponse().GetResponseStream(), 4 * 200 * 1024));
         string chapterHtmlContent = chapterReader.ReadToEnd();
         chapterHtmlContentList.Add(chapterHtmlContent);
         Console.WriteLine("第" + (i + 1) + "个页面获取完毕!");
     }
     chapterReader.Close();
     HasAttributeFilter praghFilter = new HasAttributeFilter("class", "textP fontStyle2 colorStyleText");
     StreamWriter writer = new StreamWriter("革命逸事.txt");
     for (int i = 0; i < chapterHtmlContentList.Count; i++)
     {
         writer.WriteLine("第" + (i + 1) + "章");
         lexer = new Lexer(chapterHtmlContentList[i]);
         parser = new Parser(lexer);
         NodeList praghNodeList = parser.Parse(praghFilter);
         if (praghNodeList.Size() == 1)
         {
             for (int j = 0; j < praghNodeList[0].Children.Size(); j++)
             {
                 if (praghNodeList[0].Children[j].GetType().Equals(typeof(ParagraphTag)))
                 {
                     ParagraphTag praghTag = (ParagraphTag)praghNodeList[0].Children[j];
                     writer.WriteLine("    " + praghTag.StringText);
                 }
             }
             writer.WriteLine();
         }
         else
         {
             Console.WriteLine("第" + (i + 1) + "页中,判断段落的标准出错!");
         }
     }
     writer.Close();
 }
Пример #6
0
 static string GetBlogTitle(string htmlContent)
 {
     string result = "";
     Lexer lexer = new Lexer(htmlContent);
     Parser parser = new Parser(lexer);
     NodeList titleList = parser.Parse(titleFilter);
     if (titleList.Count == 1)
     {
         TitleTag titleTag = (TitleTag)titleList[0];
         result = titleTag.Title;
     }
     else
     {
         Console.WriteLine("获取标题信息出错!");
     }
     return result;
 }
Пример #7
0
        /// <summary>
        /// 获得列表
        /// </summary>
        /// <returns></returns>
        public List<OddsLiveMatch> GetMatchScrollOdds(string matchid,string urlparams)
        {
            List<OddsLiveMatch> liveMatchList = new List<OddsLiveMatch>();
            try
            {
                HttpHelper h = new HttpHelper();
                Cookie lng = new Cookie("lng", "2");
                lng.Domain = domain;
                h.CookieContainer.Add(lng);
                //string zoudi = h.GetHtml("https://" + domain + "/default.aspx" + urlparams);
                string zoudi = h.GetHtml(urlparams);
                if (!string.IsNullOrEmpty(zoudi))
                {
                    #region 分析网页html节点
                    Lexer lexer = new Lexer(zoudi);
                    Parser parser = new Parser(lexer);
                    NodeList bodyNodes = parser.Parse(new TagNameFilter("HTML"))[0].Children.ExtractAllNodesThatMatch(new TagNameFilter("BODY"))[0].Children;
                    ITag divNode = bodyNodes.ExtractAllNodesThatMatch(new TagNameFilter("FORM"))[0].Children.ExtractAllNodesThatMatch(new TagNameFilter("DIV"))[0] as ITag;
                    if (divNode.Attributes["ID"].Equals("PageBody"))
                    {
                        NodeList dataDivList = divNode.Children.SearchFor(typeof(Winista.Text.HtmlParser.Tags.Div));
                        if (dataDivList[0].ToPlainTextString() == "走地盤")
                        {
                            if (dataDivList[2].ToPlainTextString() == "全場賽果")
                            {
                                OddsLiveHistory liveHistory = new OddsLiveHistory();
                                liveHistory.matchid = matchid;
                                liveHistory.home = float.Parse(dataDivList[3].ToPlainTextString().Split(' ')[0]);
                                liveHistory.draw = float.Parse(dataDivList[5].ToPlainTextString().Split(' ')[0]);
                                liveHistory.away = float.Parse(dataDivList[7].ToPlainTextString().Split(' ')[0]);
                                liveHistory.time = DateTime.Now;
                                dal.AddHistory(liveHistory);
                            }
                        }
                    }
                    #endregion 分析网页html节点
                }
            }
            catch (Exception)
            {

            }
            return liveMatchList;
        }
        public void StartCrawl()//  private void BtnDownload_Click(object sender, RoutedEventArgs e)
        {
            this.parseResult = "";
            Uri uri = new Uri(this.TargetUri);

            #region

            //<N>基于Httphelper,这样下载会要求程序自己实现验证授权
            //HttpHelper httpHelper = new HttpHelper();
            //HttpItem rq = new HttpItem();
            //rq.URL = uri.AbsoluteUri;
            //HttpResult html = httpHelper.GetHtml(rq);
            //Debug.WriteLine(html.Html);

            //直接基于WebBrowser,授权是由用户手动实现的
            mshtml.IHTMLDocument2 doc2 = null;//(mshtml.IHTMLDocument2)webBox.Document;
            string html = string.Compare(this.IsOffline, "1", StringComparison.InvariantCultureIgnoreCase) == 0 ? s_htmlFake : doc2.body.innerHTML;
            Debug.WriteLine(html);
            #endregion

            #region 使用HtmlParser提取HTML
            Lexer              lexer    = new Lexer(html);
            hParser.Parser     parser   = new hParser.Parser(lexer);
            hParser.NodeFilter filter   = new NodeClassFilter(typeof(Winista.Text.HtmlParser.Tags.TableRow));
            NodeList           nodeList = parser.Parse(filter);
            if (nodeList.Count == 0)
            {
                MessageBox.Show("没有符合要求的节点");
            }
            else
            {
                for (int i = 0; i < nodeList.Count; i++)
                {
                    parserTR(nodeList[i]);
                }
                MessageBox.Show(parseResult);
            }

            /*  parseResult = HtmlText(html);
             * MessageBox.Show(parseResult);*/
            #endregion
        }
Пример #9
0
 //分析HtmlContents中给定索引条目的内容,提取信息
 private static void GetInfoFromHtml(int index)
 {
     //使用Winista.HtmlParser库解析HTML
     //建立HTML分析工具对象
     Lexer lexer = new Lexer(HtmlContents[index]);
     Parser parser = new Parser(lexer);
     //按属性的过滤器:两个参数分别代表要过滤的属性和属性值
     HasAttributeFilter nameFilter = new HasAttributeFilter("class", "lrg");
     HasAttributeFilter priceFilter = new HasAttributeFilter("class", "bld lrg red");
     //获得所有满足过滤条件的HTML节点
     NodeList nameList = parser.Parse(nameFilter);
     for (int j = 0; j < nameList.Size(); j++)
     {
         //确定节点nameList[j]为Span类型的标签;HttpUtility.HtmlDecode方法把HTML编码转为文本编码,使中文正常显示
         string name = HttpUtility.HtmlDecode(((Span)nameList[j]).StringText);
         //Parent表示该HTML节点的父节点
         //NextSobling表示该HTML节点的下一个兄弟节点
         //Children表示该HTML节点的所有孩子节点组成的集合
         //ExtractAllNodesThatMatch表示获取所有满足给定过滤器条件的节点,两个参数分别代表过滤器和是否进入孩子节点中迭代查找
         //注意:对Winista.HtmlParser来说,“空文本节点”也是一个节点(在IE的开发者工具中显示“空文本节点”,而Chrome则不显示);形似<del>内容</ del>在Children中会表达成三个节点
         NodeList priceList = nameList[j].Parent.Parent.NextSibling.NextSibling.Children.ExtractAllNodesThatMatch(priceFilter, true);
         if (priceList.Size() == 1)
         {
             string priceStr = ((Span)priceList[0]).StringText;
             double price = Double.Parse(priceStr.Substring(2, priceStr.Length - 2));
             TradeList.Add(new Commodity(name, price, "RMB"));
         }
         else
         {
             badRecordCount++;
         }
     }
     Console.WriteLine("第" + (index + 1) + "个页面处理完成!");
     //保存当前页面到本地文件
     //StreamWriter writer = new StreamWriter("searchresult"+i+".html");
     //writer.Write(s);
     //writer.Close();
 }
Пример #10
0
        /// <summary>
        /// 从网页版微博中获取微博信息
        /// </summary>
        /// <param name="fansList">保存爬得的粉丝数组</param>
        public void GetInfoFromHtml(List<Fan> fansList)
        {
            Lexer lexer = new Lexer(currentHtmlContent);
            Parser parser = new Parser(lexer);
            //获取包含每条微博的div标记列表
            NodeList fansNodeList = parser.Parse(fanFilter);
            for (int i = 0; i < fansNodeList.Size(); i++)
            {
                Fan fan = new Fan();
                //获取包含一个粉丝的<li>标记
                Bullet fanBullet = (Bullet)fansNodeList[i];

                #region 获取该粉丝头像
                NodeList fanPortraitNodeList = fanBullet.Children.ExtractAllNodesThatMatch(portraitFilter, true);
                if (fanPortraitNodeList.Size() == 1)
                {
                    Div fanPortraitDiv = (Div)fanPortraitNodeList[0];
                    NodeList imgNodeList = fanPortraitDiv.Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ImageTag)), true);
                    if (imgNodeList.Size() == 1)
                    {
                        ImageTag imgNode = (ImageTag)imgNodeList[0];
                        if (imgNode.Attributes.ContainsKey("SRC") && imgNode.Attributes.ContainsKey("ALT"))
                        {
                            string imgUrl = imgNode.GetAttribute("SRC");
                            string imgName = imgNode.GetAttribute("ALT");
                            fan.Name = imgName;
                            WebClient wc = new WebClient();//使用WebClient是因为下载用户头像不用登录cookie
                            wc.DownloadFileAsync(new Uri(imgUrl), @"portrait\" + imgName + ".jpg");
                            wc.DownloadFileCompleted += wc_DownloadFileCompleted;
                        }
                        else
                        {
                            Console.WriteLine("第" + i + "个粉丝中,<img>标记缺少必要的属性!");
                        }

                    }
                    else
                    {
                        Console.WriteLine("第" + i + "个粉丝中,获取img标记出错!");
                    }
                }
                else
                {
                    Console.WriteLine("第" + i + "个粉丝中,获取粉丝头像的标准出错!");
                }
                #endregion

                #region 获取该粉丝的关注数/粉丝数/微博数
                NodeList fanConnectNodeList = fanBullet.Children.ExtractAllNodesThatMatch(fanConnectFilter, true);
                if (fanConnectNodeList.Size() == 1)
                {
                    NodeList ATagList = fanConnectNodeList[0].Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ATag)), true);
                    if (ATagList.Size() == 3)
                    {
                        for (int j = 0; j < 3; j++)
                        {
                            ATag aTag = (ATag)ATagList[j];
                            switch (j)
                            {
                                case 0:
                                    if (aTag.Attributes.ContainsKey("HREF") && aTag.GetAttribute("HREF").Contains("follow"))
                                    {
                                        fan.FollowCount = Int32.Parse(aTag.StringText);
                                    }
                                    else
                                    {
                                        Console.WriteLine("第" + i + "个粉丝中,获取粉丝的关注数出错!");
                                    }
                                    break;
                                case 1:
                                    if (aTag.Attributes.ContainsKey("HREF") && aTag.GetAttribute("HREF").Contains("fans"))
                                    {
                                        fan.FansCount = Int32.Parse(aTag.StringText);
                                    }
                                    else
                                    {
                                        Console.WriteLine("第" + i + "个粉丝中,获取粉丝的粉丝数出错!");
                                    }
                                    break;
                                default:
                                    fan.FeedsCount = Int32.Parse(aTag.StringText);
                                    break;
                            }
                        }
                    }
                    else
                    {
                        Console.WriteLine("第" + i + "个粉丝中,获取粉丝关注数/粉丝数/微博数的数量出错!");
                    }
                }
                else
                {
                    Console.WriteLine("第" + i + "个粉丝中,获取粉丝关注数/粉丝数/微博数的标准出错!");
                }
                #endregion

                #region 获取该粉丝的简介信息
                NodeList fanInfoNodeList = fanBullet.Children.ExtractAllNodesThatMatch(fanInfoFilter, true);
                if (fanInfoNodeList.Size() == 1)
                {
                    //Console.WriteLine(fanInfoNodeList[0].Parent.ToHtml());
                    Div fanInfoDiv = (Div)fanInfoNodeList[0];
                    string intro = fanInfoDiv.StringText;
                    if (intro.Substring(0, 2).Equals("简介"))
                    {
                        fan.Introduction = intro.Substring(3, intro.Length - 3).Replace("\n", " ").Replace("\t", " ");
                    }
                }
                else
                {
                    if (fanInfoNodeList.Size() == 0)
                    {
                        fan.Introduction = "";
                    }
                    else
                    {
                        Console.WriteLine("第" + i + "个粉丝中,获取粉丝简介的标准出错!");
                    }
                }
                #endregion

                #region 获取该粉丝的UserID、地点和性别信息;校验该粉丝的用户名信息
                NodeList fanLocationNodeList = fanBullet.Children.ExtractAllNodesThatMatch(fanNameFilter, true);
                if (fanLocationNodeList.Size() == 1)
                {
                    //获取粉丝的UserID信息;校验该粉丝的用户名信息
                    NodeList aTagNodeList = fanLocationNodeList[0].Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ATag)), true);
                    if (aTagNodeList.Size() >= 1)
                    {
                        ATag nameNode = (ATag)aTagNodeList[0];
                        if (nameNode.Attributes.ContainsKey("USERCARD") && nameNode.Attributes.ContainsKey("HREF"))
                        {
                            //获取粉丝的UserID信息
                            string uidStr = nameNode.GetAttribute("USERCARD");
                            if (uidStr.Substring(0, 3).Equals("id="))
                            {
                                fan.UserID = uidStr.Substring(3, uidStr.Length - 3);
                            }

                            //获取粉丝的微博链接
                            string linkUrl = nameNode.GetAttribute("HREF");
                            fan.LinkURL = "http://www.weibo.com" + linkUrl;
                        }
                        else
                        {
                            Console.WriteLine("第" + i + "个粉丝中,包含用户id和链接的<a>标记中缺少必要的属性!");
                        }
                        //校验该粉丝的用户名信息
                        if (!nameNode.StringText.Equals(fan.Name))
                        {
                            Console.WriteLine("第" + i + "个粉丝中,用户名与用户头像文字描述不一致!");
                        }
                    }

                    //获取粉丝的性别和地点信息
                    NodeList locationNodeList = fanLocationNodeList[0].Children.ExtractAllNodesThatMatch(new HasAttributeFilter("class", "addr"), true);
                    if (locationNodeList.Size() == 1)
                    {
                        string locationStr = "";
                        for (int j = 0; j < locationNodeList[0].Children.Size(); j++)
                        {
                            INode node = locationNodeList[0].Children[j];
                            if (node.GetType().Equals(typeof(TextNode)))
                            {
                                TextNode tNode = (TextNode)node;
                                locationStr += tNode.ToPlainTextString();
                            }
                            if (node.GetType().Equals(typeof(TagNode)))
                            {
                                TagNode tNode = (TagNode)node;
                                if (tNode.Attributes.ContainsKey("CLASS"))
                                {
                                    if (tNode.GetAttribute("CLASS").Contains("female"))//必须先female,因为female中也含有male,如果male在前,则所有用户均符合该条件了= =
                                    {
                                        fan.Gender = "female";
                                    }
                                    else
                                    {
                                        if (tNode.GetAttribute("CLASS").Contains("male"))
                                        {
                                            fan.Gender = "male";
                                        }
                                        else
                                        {
                                            fan.Gender = "unknown";
                                            Console.WriteLine("第" + i + "个粉丝性别不明!");
                                        }
                                    }
                                }
                            }
                        }
                        fan.Location = locationStr.Trim();
                    }
                    else
                    {
                        Console.WriteLine("第" + i + "个粉丝中,获取粉丝地点的标准出错!");
                    }
                }
                else
                {
                    Console.WriteLine("第" + i + "个粉丝中,获取该粉丝的UserID、地点和性别信息的标准出错!");
                }
                #endregion

                #region 获取该粉丝关注用户的方式
                NodeList followMethodNodeList = fanBullet.Children.ExtractAllNodesThatMatch(followMethodFilter, true);
                if (followMethodNodeList.Size() == 1)
                {
                    NodeList methodNodeList = followMethodNodeList[0].Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ATag)));
                    if (methodNodeList.Size() == 1)
                    {
                        ATag methodNode = (ATag)methodNodeList[0];
                        fan.FollowMethod = methodNode.StringText.Trim();
                    }
                    else
                    {
                        Console.WriteLine("第" + i + "个粉丝中,获取该粉丝关注用户的方式的数量出错!");
                    }
                }
                else
                {
                    Console.WriteLine("第" + i + "个粉丝中,获取该粉丝关注用户的方式的标准出错!");
                }
                #endregion

                fansList.Add(fan);
            }
        }
Пример #11
0
 public void GetInfoFromHtml(int currentPage)
 {
     Lexer lexer = new Lexer(currentHtml);
     Parser parser = new Parser(lexer);
     NodeList poiHeadList = parser.Parse(poiListFilter);
     if (poiHeadList.Count == 1)
     {
         NodeList poiNodeList = poiHeadList[0].Children.ExtractAllNodesThatMatch(poiFilter, false);
         int numCount = 0;
         for (int i = 0; i < poiNodeList.Count; i++)
         {
             POI poi = new POI();
             DefinitionListBullet poiNode = (DefinitionListBullet)poiNodeList[i];
             if (poiNode.TagName.Equals("DD"))
             {
                 numCount++;
                 poi.Page = currentPage;
                 poi.Number = numCount;
                 #region 获取口味、环境和服务评分,以及获取星级
                 NodeList tasteNodeList = poiNode.Children.ExtractAllNodesThatMatch(tasteFilter, true);
                 NodeList environmentNodeList = poiNode.Children.ExtractAllNodesThatMatch(environmentFilter, true);
                 NodeList serviceNodeList = poiNode.Children.ExtractAllNodesThatMatch(serviceFilter, true);
                 if (tasteNodeList.Count == 1 && environmentNodeList.Count == 1 && serviceNodeList.Count == 1)
                 {
                     Span spanNode = (Span)tasteNodeList[0];
                     if (!spanNode.ToPlainTextString().Equals("-"))
                     {
                         poi.TasteRemark = Int32.Parse(spanNode.ToPlainTextString());
                     }
                     spanNode = (Span)environmentNodeList[0];
                     if (!spanNode.ToPlainTextString().Equals("-"))
                     {
                         poi.EnvironmentRemark = Int32.Parse(spanNode.ToPlainTextString());
                     }
                     spanNode = (Span)serviceNodeList[0];
                     if (!spanNode.ToPlainTextString().Equals("-"))
                     {
                         poi.ServiceRemark = Int32.Parse(spanNode.ToPlainTextString());
                     }
                     #region 获取星级
                     INode rankNodeOfParent = spanNode.Parent.NextSibling.NextSibling;
                     if (rankNodeOfParent.Children != null && rankNodeOfParent.Children.Count >= 1)
                     {
                         INode rankNodeCandidate = rankNodeOfParent.Children[0];
                         if (rankNodeCandidate.GetType().Equals(typeof(Span)))
                         {
                             Span rankNode = (Span)rankNodeCandidate;
                             string rank = rankNode.GetAttribute("TITLE");
                             if (rank.Contains("五"))
                             {
                                 poi.Rank = 5;
                             }
                             else
                             {
                                 if (rank.Contains("四"))
                                 {
                                     poi.Rank = 4;
                                 }
                                 else
                                 {
                                     if (rank.Contains("三"))
                                     {
                                         poi.Rank = 3;
                                     }
                                     else
                                     {
                                         if (rank.Contains("二"))
                                         {
                                             poi.Rank = 2;
                                         }
                                         else
                                         {
                                             if (rank.Contains("一"))
                                             {
                                                 poi.Rank = 1;
                                             }
                                         }
                                     }
                                 }
                             }
                         }
                     }
                     #endregion
                 }
                 else
                 {
                     Console.WriteLine("第" + i + "条POI中,判断口味、环境和服务的标准出错!");
                 }
                 #endregion
                 #region 获取平均消费
                 NodeList averageNodeList = poiNode.Children.ExtractAllNodesThatMatch(averageFilter, true);
                 if (averageNodeList.Count == 1)
                 {
                     INode averageNode = averageNodeList[0];
                     if (averageNode.NextSibling.NextSibling.GetType().Equals(typeof(TextNode)))
                     {
                         string cost = ((TextNode)averageNode.NextSibling.NextSibling).ToPlainTextString();
                         poi.AverageCost = Int32.Parse(cost);
                     }
                 }
                 else
                 {
                     Console.WriteLine("第" + i + "条POI中,判断平均消费的标准出错!");
                 }
                 #endregion
                 #region 获取点评数
                 NodeList commentNodeList = poiNode.Children.ExtractAllNodesThatMatch(commentFilter, true);
                 if (commentNodeList.Count == 1)
                 {
                     INode commentNode = commentNodeList[0];
                     if (commentNode.GetType().Equals(typeof(ATag)))
                     {
                         string commentNum = ((ATag)commentNode).StringText;
                         if (commentNum.Substring(commentNum.Length - 3, 3).Equals("封点评"))
                         {
                             commentNum = commentNum.Substring(0, commentNum.Length - 3);
                         }
                         poi.CommentCount = Int32.Parse(commentNum);
                     }
                 }
                 else
                 {
                     Console.WriteLine("第" + i + "条POI中,判断点评数的标准出错!");
                 }
                 #endregion
                 #region 获取店名
                 NodeList nameNodeList = poiNode.Children.ExtractAllNodesThatMatch(nameFilter, true);
                 if (nameNodeList.Count == 1)
                 {
                     INode nameNode = nameNodeList[0];
                     if (nameNode.GetType().Equals(typeof(ATag)))
                     {
                         poi.Name = ((ATag)nameNode).StringText;
                     }
                 }
                 else
                 {
                     Console.WriteLine("第" + i + "条POI中,判断店名的标准出错!");
                 }
                 #endregion
                 #region 获取地址和电话
                 NodeList addressNodeList = poiNode.Children.ExtractAllNodesThatMatch(addressFilter, true);
                 if (addressNodeList.Count == 1)
                 {
                     NodeList districtNodeList = addressNodeList[0].Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ATag)));
                     if (districtNodeList.Count == 1)
                     {
                         ATag districtTag = (ATag)districtNodeList[0];
                         string address = districtTag.ToPlainTextString();
                         if (districtTag.NextSibling.GetType().Equals(typeof(TextNode)))
                         {
                             TextNode detailAddressNode = (TextNode)districtTag.NextSibling;
                             string detailAddress = detailAddressNode.ToPlainTextString();
                             detailAddress = detailAddress.Trim();
                             string phoneStr = detailAddress.Substring(detailAddress.Length - 8, 8);
                             poi.Phone = phoneStr;
                             address += detailAddress.Substring(0, detailAddress.Length - 8);
                         }
                         char[] removeChrVector = { ' ', '\n', '\t' };
                         address = address.Trim(removeChrVector);
                         foreach (char c in removeChrVector)
                         {
                             address = address.Replace(c.ToString(), "");
                         }
                         poi.Address = address;
                     }
                     else
                     {
                         Console.WriteLine("第" + i + "条POI中,判断含地址的<a>标记的标准出错!");
                     }
                 }
                 else
                 {
                     Console.WriteLine("第" + i + "条POI中,判断地址的标准出错!");
                 }
                 #endregion
                 #region 获取标签
                 NodeList tagsNodeList = poiNode.Children.ExtractAllNodesThatMatch(tagsFilter, true);
                 if (tagsNodeList.Count == 1)
                 {
                     INode tagsNode = tagsNodeList[0];
                     if (tagsNode.Children != null)
                     {
                         for (int j = 0; j < tagsNode.Children.Count; j++)
                         {
                             INode node = tagsNode.Children[j];
                             if (node.GetType().Equals(typeof(ATag)))
                             {
                                 poi.Tags.Add(node.ToPlainTextString());
                             }
                         }
                     }
                 }
                 else
                 {
                     Console.WriteLine("第" + i + "条POI中,判断标签的标准出错!");
                 }
                 #endregion
                 poiList.Add(poi);
             }
         }
     }
     else
     {
         Console.WriteLine("获取POI列表出错");
     }
 }
Пример #12
0
        public void GetLinkForPage(string url)
        {
            Lexer lexer = new Lexer(GetHtml(url));
            Parser parse = new Parser(lexer);
            parse.Encoding = "gb2312";
            NodeFilter linkFilter = new LinkRegexFilter(@"^http\://item\.taobao\.com/item\.htm\?id\=\d+$");
            NodeFilter classFilter = new HasAttributeFilter("class", "EventCanSelect");
            AndFilter andFilter = new AndFilter(linkFilter, classFilter);
            NodeList result = parse.Parse(andFilter);

            int length = result.Count;
            for (int i = 0; i < length; i++)
            {
                ItemLink.Add(result[i]);
            }
        }
Пример #13
0
        public List<ATag> ParseCatelog(string html)
        {
            List<ATag> atags = new List<ATag>();

            Lexer lexer = new Lexer(html);
            Parser parser = new Parser(lexer);

            NodeFilter nav = new HasAttributeFilter("class", "fenlei_list");
            NodeList navNodes = parser.Parse(nav);

            NodeFilter catelog = new LinkRegexFilter(@"^\.\./product/index\.php\?cplm\=\-\d\d\d\-$");
            catelog = new HasChildFilter(catelog);
            NodeList catelogNodes = navNodes[0].Children.ExtractAllNodesThatMatch(catelog);

            if(catelogNodes==null){
                return atags;
            }

            int length = catelogNodes.Count;
            for (int i=0;i<length;i++)
            {
                INode node = catelogNodes[i];
                ATag a = node.Children[0] as ATag;
                atags.Add(a);
            }

            return atags;
        }
Пример #14
0
 /// <summary>
 /// 辅助函数:从HTML中获得max_id
 /// </summary>
 /// <param name="htmlContent">HTML文本</param>
 /// <returns></returns>
 private bool GetMaxIdFromHtml(string htmlContent)
 {
     Lexer lexer = new Lexer(htmlContent);
     Parser parser = new Parser(lexer);
     NodeList feedNodeList = parser.Parse(idFilter[(int)Type]);
     if (feedNodeList.Size() >= 1)
     {
         max_id = ((TagNode)feedNodeList[feedNodeList.Size() - 1]).GetAttribute("MID");
         return true;
     }
     else
     {
         return false;
     }
 }
Пример #15
0
 /// <summary>
 /// 辅助函数:从HTML中获得end_id
 /// </summary>
 /// <param name="htmlContent">HTML文本</param>
 /// <returns></returns>
 private bool GetEndIdFromHtml(string htmlContent)
 {
     Lexer lexer = new Lexer(htmlContent);
     Parser parser = new Parser(lexer);
     NodeList feedNodeList = parser.Parse(idFilter[(int)Type]);
     if (feedNodeList.Size() >= 1 && feedNodeList[0].GetType().Equals(typeof(Div)) && ((TagNode)feedNodeList[0]).Attributes.ContainsKey("MID"))
     {
         end_id = ((TagNode)feedNodeList[0]).GetAttribute("MID");
         return true;
     }
     else
     {
         return false;
     }
 }
Пример #16
0
        /// <summary>
        /// 从网页版微博中获取微博信息
        /// </summary>
        /// <param name="currentPage">爬得的微博所在页面序号</param>
        /// <param name="feedList">保存爬得的微博的数组</param>
        public void GetInfoFromHtml(int currentPage, List<Feed> feedList)
        {
            foreach (string htmlContent in htmlContentList)
            {
                Lexer lexer = new Lexer(htmlContent);
                Parser parser = new Parser(lexer);
                //获取包含每条微博的div标记列表
                NodeList feedNodeList = parser.Parse(feedFilter);
                for (int i = 0; i < feedNodeList.Size(); i++)
                {
                    serialNumber++;
                    Feed feed = new Feed();
                    feed.Page = currentPage;
                    feed.Number = serialNumber;
                    //类似微博转发的数量
                    int similarFeedCount = 0;

                    //取得第i条微博的div
                    TagNode feedDiv = (TagNode)feedNodeList[i];

                    //判断是否含有“还有X条对原微博的转发”
                    NodeList similarfeedCountNodeList = feedDiv.Children.ExtractAllNodesThatMatch(similarFeedCountFilter, true);
                    switch (similarfeedCountNodeList.Size())
                    {
                        case 1:
                            //说明存在“还有X条对原微博的转发”的div;此处看起来此HTML解析器不认<b>标记,而把其中包含的内容作为其下一个兄弟节点= =
                            similarFeedCount = Int32.Parse(((TextNode)(similarfeedCountNodeList[0].NextSibling)).ToPlainTextString());
                            break;
                        case 0:
                            //说明不存在“还有X条对原微博的转发”
                            similarFeedCount = 0;
                            break;
                        default:
                            Console.WriteLine("第" + i + "条微博中,判断是否含有类似微博转发的标准出错!");
                            break;
                    }

                    #region 获取微博作者
                    NodeList feedAuthorNodeList = feedDiv.Children.ExtractAllNodesThatMatch(feedAuthorFilter, true);
                    //在整个一条微博的范围(即一个feedDiv)内,满足feedAuthorFilter过滤器的div节点数量应该是本条微博作者加上转发类似微博的作者(如果有的话),所以是(1 + similarFeedCount)
                    if (feedAuthorNodeList.Size() == (1 + similarFeedCount))
                    {
                        ATag feedAuthorTag = (ATag)feedAuthorNodeList[0].Children[0];
                        string author = feedAuthorTag.GetAttribute("TITLE");
                        feed.Author = author;
                        //如果存在,则获取该作者的备注名
                        INode remarkNameNode = feedAuthorTag.NextSibling;
                        if (remarkNameNode.GetType().Equals(typeof(Span)))
                        {
                            string remarkName = ((Span)remarkNameNode).StringText;
                            //去掉前后括号
                            remarkName = remarkName.Substring(1, remarkName.Length - 2);
                            feed.RemarkName = remarkName;
                        }
                    }
                    else
                    {
                        //从首页爬取微博时,微博来自不同的被关注者,所以是有微博作者的;而从个人主页爬取微博时,由于所有微博作者都是该用户,所以是没有微博作者相关节点的
                        if (!user.NickName.Equals(""))
                        {
                            feed.Author = user.NickName;
                            feed.RemarkName = user.RemarkName;
                        }
                        else
                        {
                            Console.WriteLine("第" + i + "条微博中,判断微博作者的标准出错!");
                        }
                    }
                    #endregion

                    #region 获取转发微博
                    NodeList reFeedNodeList = feedDiv.Children.ExtractAllNodesThatMatch(reFeedFilter, true);
                    //转发微博;(1 + similarFeedCount)的理由和获取微博作者时相同
                    if (reFeedNodeList.Size() == (1 + similarFeedCount))
                    {
                        //获取转发微博的div
                        TagNode reFeedDiv = (TagNode)reFeedNodeList[0];
                        //先获取本次转发微博的相关信息
                        GetReFeedInfo(i, feed, reFeedDiv, feedDiv);
                        #region 考虑“还有X条对原微博的转发”的情况
                        if (similarFeedCount > 0)
                        {
                            NodeList similarFeedNodeList = feedDiv.Children.ExtractAllNodesThatMatch(similarFeedFilter, true);
                            if (similarFeedNodeList.Size() == similarFeedCount)
                            {
                                for (int j = 0; j < similarFeedCount; j++)
                                {
                                    feedList.Add(GetSimilarFeed(currentPage, i, (TagNode)similarFeedNodeList[j], feed.OriginalAuthor, feed.Content));
                                }
                            }
                            else
                            {
                                Console.WriteLine("第" + i + "条微博中,获取转发微博的数量出错!");
                            }
                        }
                        #endregion
                    }
                    else
                    {
                        if (reFeedNodeList.Size() == 0)
                        {
                            //获取本条微博内容作为微博内容
                            NodeList feedContentNodeList = feedDiv.Children.ExtractAllNodesThatMatch(feedContentFilter, true);
                            if (feedContentNodeList.Size() == 1)
                            {
                                feed.Content = GetContentFromChildren(feed, feedContentNodeList[0], false);
                                #region 由于存在某些情况,转发微博被删除后更不过滤不到reFeedDiv,所以需要再次检查是否存在已删除的转发微博
                                NodeList deletedFeedList = feedDiv.Children.ExtractAllNodesThatMatch(refeedDeletedFilter2, true);
                                if (deletedFeedList.Size() > 0)
                                {
                                    feed.OriginalAuthor = "Unknown";
                                    feed.ReFeedOrNot = true;
                                    feed.ReFeedReason = feed.Content;
                                    feed.Content = "微博已删除";
                                }
                                #endregion
                            }
                            else
                            {
                                Console.WriteLine("第" + i + "条微博中,判断微博内容的标准出错!");
                            }
                        }
                        else
                        {
                            Console.WriteLine("第" + i + "条微博中,判断转发微博的标准出错!");
                        }
                    }
                    #endregion

                    //获取包含微博发送地点的div
                    feed.Location = GetLocationInfo(i, feedDiv);
                    //获取包含微博“赞”数的标记
                    feed.LikeCount = GetFeedLikeInfo(i, feedDiv);
                    //获取包含微博转发数的链接标记
                    feed.ReFeedCount = GetFeedForwardCount(i, feedDiv);
                    //获取包含微博评论数的链接标记
                    feed.CommentCount = GetFeedCommentCount(i, feedDiv);
                    //获取包含微博发送时间的链接标记
                    feed.Time = GetFeedTimeInfo(i, feedDiv);
                    //获取包含微博发送设备的链接标记
                    feed.Device = GetFeedSendTypeInfo(i, feedDiv);

                    feedList.Add(feed);
                }
            }
        }
Пример #17
0
 public void InitPage()
 {
     Lexer lexer = new Lexer(GetHtml(urlBase));
     Parser parse = new Parser(lexer);
     parse.Encoding = "gb2312";
     NodeFilter linkFilter = new LinkRegexFilter(@"s\=\d+\#J_FilterTabBar");
     _linkResult = parse.Parse(linkFilter);
 }
        public void StartCrawl()//  private void BtnDownload_Click(object sender, RoutedEventArgs e)
        {
            List <ImportInvoiceDTO>      list         = new List <ImportInvoiceDTO>();
            List <hParser.Tags.TableRow> validRowList = new List <hParser.Tags.TableRow>();

            this.parseResult = "";
            Uri uri = new Uri(this.TargetUri);

            #region <N>基于Httphelper,这样下载会要求程序自己实现验证授权

            //<N>基于Httphelper,这样下载会要求程序自己实现验证授权
            //HttpHelper httpHelper = new HttpHelper();
            //HttpItem rq = new HttpItem();
            //rq.URL = uri.AbsoluteUri;
            //HttpResult html = httpHelper.GetHtml(rq);
            //Debug.WriteLine(html.Html);

            //直接基于WebBrowser,授权是由用户手动实现的
            mshtml.IHTMLDocument2 doc2 = null;//(mshtml.IHTMLDocument2)webBox.Document;
            string html = string.Compare(this.IsOffline, "1", StringComparison.InvariantCultureIgnoreCase) == 0 ? s_htmlFake : doc2.body.innerHTML;
            Debug.WriteLine(html);
            #endregion

            #region 使用HtmlParser提取HTML
            Lexer              lexer    = new Lexer(html);
            hParser.Parser     parser   = new hParser.Parser(lexer);
            hParser.NodeFilter filter   = new NodeClassFilter(typeof(Winista.Text.HtmlParser.Tags.TableRow));
            NodeList           nodeList = parser.Parse(filter);
            if (nodeList.Count == 0)
            {
                MessageBox.Show("没有符合要求的节点");
            }
            else
            {
                for (int i = 0; i < nodeList.Count; i++)
                {
                    //抓取一行
                    var tagTR = parserTR(nodeList[i]);

                    #region 充填有效行
                    if (tagTR != null)
                    {
                        validRowList.Add(tagTR);
                    }
                    #endregion
                }

                parserValidTR(validRowList, ref list);
#if DEBUG
                // MessageBox.Show(parseResult);
#endif
            }

            /*  parseResult = HtmlText(html);
             * MessageBox.Show(parseResult);*/
            #endregion

            #region  步

            if (list == null || list.Count == 0)
            {
                MessageBox.Show("该页面上没有检测到预期数据");
                return;
            }

            ImportInvoiceListDTO soap = new ImportInvoiceListDTO
            {
                List   = list,
                Result = new ImportInvoiceResultDTO
                {
                    Message = "CALLBACK",
                    Status  = 9
                }
            };
            //using (var factory = new ChannelFactory<ISyncImportInvoiceService>("*"))
            //{
            //    var chl = factory.CreateChannel();
            //    soap = chl.PullImportInvoices(soap);

            //    if (soap.Result.Status == 0)
            //    {
            //        //重试
            //        soap = chl.PullImportInvoices(soap);
            //    }
            //}

            //if (soap.Result.Status == -1)
            //{
            //    // 修改UI线程
            //      MessageBox.Show(soap.Result.Message);
            //}
            CallWS(soap);
            MessageBox.Show("本页已同步完成,请点击下一页继续同步");
            //FakeBusy();

            #endregion
        }
Пример #19
0
        public void ParseProducts(ATag a)
        {
            string html = GetHtml(a.Link.Replace("../", "http://rrxf.cn/"));

            Lexer lexer = new Lexer(html);
            Parser parser = new Parser(lexer);

            NodeFilter nav = new HasAttributeFilter("class", "photoyi");
            NodeList navNodes = parser.Parse(nav);

            if (navNodes == null)
                return;

            int length = navNodes.Count;
            for (int i = 0; i < length; i++)
            {
                ATag link = ParseProductUrl(navNodes[i].ToHtml());
                Console.WriteLine(link.Link);
                ParseProduct(link);
            }
        }
        public override IEnumerable <string> Process(string fileName)
        {
            System.Net.WebClient aWebClient = new System.Net.WebClient();
            aWebClient.Encoding = System.Text.Encoding.Default;
            string html = aWebClient.DownloadString(this.url);

            //string html = File.ReadAllText(fileName);
            Lexer lexer = new Lexer(html);

            Winista.Text.HtmlParser.Parser parser = new Winista.Text.HtmlParser.Parser(lexer);
            NodeList htmlNodes = parser.Parse(null);

            List <IsotopicAtom> atoms = new List <IsotopicAtom>();

            var          node = FindFirstNode(htmlNodes, "tbody");
            INode        nextNode;
            IsotopicAtom atom = null;

            while ((nextNode = FindFirstNode(node, "tr")) != null)
            {
                if (nextNode.Children != null)
                {
                    var tds = nextNode.Children.ExtractAllNodesThatMatch(new NameFilter("td"));
                    if (tds.Count == 1)
                    {
                        atom = null;
                    }

                    if (tds.Count >= 3 && tds[0].FirstChild != null)
                    {
                        var t1 = tds[0].FirstChild.GetText().Trim();
                        var t2 = tds[1].FirstChild.GetText().Trim();

                        if (Char.IsDigit(t1[0]) && Char.IsLetter(t2[0]))
                        {
                            atom = new IsotopicAtom();
                            atoms.Add(atom);

                            atom.Name = tds[1].FirstChild.GetText().Trim();
                            Peak p = new Peak();
                            p.Mz        = GetDouble(tds[3]);
                            p.Intensity = GetDouble(tds[4]);
                            atom.Isotopics.Add(p);
                        }
                        else if (atom != null)
                        {
                            var txt = tds[0].FirstChild.GetText().Trim();
                            if (txt.Length > 0 && Char.IsLetter(txt[0]))
                            {
                                tds.Remove(0);
                            }

                            Peak p = new Peak();
                            p.Mz        = GetDouble(tds[1]);
                            p.Intensity = GetDouble(tds[2]);
                            atom.Isotopics.Add(p);
                        }
                    }
                }

                node = nextNode.NextSibling;
                if (node == null)
                {
                    break;
                }
            }

            atoms.ForEach(m => m.Isotopics.RemoveAll(n => n.Intensity == 0.0));

            atoms.RemoveAll(m => m.Isotopics.Count == 0);

            var dic = atoms.ToDictionary(m => m.Name);

            var x = new IsotopicAtom();

            x.Name = "X";
            x.Isotopics.Add(new Peak(1, 0.9));
            x.Isotopics.Add(new Peak(2, 0.1));
            atoms.Insert(0, x);

            atoms.Add(AddHevayAtom("(H2)", "H", 1, dic));
            atoms.Add(AddHevayAtom("(C13)", "C", 1, dic));
            atoms.Add(AddHevayAtom("(N15)", "N", 1, dic));
            atoms.Add(AddHevayAtom("(O18)", "O", 2, dic));

            using (StreamWriter sw = new StreamWriter(fileName))
            {
                atoms.ForEach(m =>
                {
                    sw.WriteLine("{0}\t{1}", m.Name, m.Isotopics.Count);
                    m.Isotopics.ForEach(n => sw.WriteLine("{0:0.000000}\t{1:0.000000}", n.Mz, n.Intensity));
                    sw.WriteLine();
                });
            }
            return(new string[] { fileName });
        }
Пример #21
0
 /// <summary>
 /// 获取目标数据
 /// </summary>
 /// <param name="parser">目标html文件</param>
 /// <param name="tag">标签名称</param>
 /// <param name="attribute">标签里面的属性名称</param>
 /// <param name="attValue">属性的值</param>
 /// <returns>标签内的目标数据</returns>
 public static string getValue(string html, string tag, string attribute, string attValue)
 {
     Lexer lexer = new Lexer(html);
     Parser parser = new Parser(lexer);
     string value = string.Empty;
     NodeFilter nodeFilter = new HasAttributeFilter(attribute, attValue);
     NodeList nodeList = parser.Parse(nodeFilter);
     if (nodeList.Count >= 1)
     {
         ITag tagNode = (nodeList[0] as ITag);
         return tagNode.ToPlainTextString();
     }
     //for (int i = 0; i < nodeList.Count; i++)
     //{
     //    INode node = nodeList[i];
     //    ITag tagNode = (node as ITag);
     //    if (tagNode.Attributes != null && tagNode.Attributes.Count > 0)
     //    {
     //        foreach (string key in tagNode.Attributes.Keys)
     //        {
     //            if (key.Contains("<TAGNAME>"))
     //            {
     //                continue;
     //            }
     //            if (key.Contains(attribute))
     //            {
     //                if (tagNode.Attributes[key].ToString() == attValue)
     //                {
     //                    value = tagNode.ToPlainTextString();
     //                    return value;
     //                }
     //            }
     //        }
     //    }
     //}
     return null;
 }
Пример #22
0
        public ATag ParseProductUrl(string html)
        {
            Lexer lexer = new Lexer(html);
            Parser parser = new Parser(lexer);

            NodeFilter filter = new LinkRegexFilter(@"lookcp\.php\?cpid\=\d{0,}");
            NodeList alist = parser.Parse(filter);
            ATag a = alist[0] as ATag;
            a.Link = "http://rrxf.cn/product/" + a.Link;
            return a;
        }
Пример #23
0
        /// <summary>
        /// 获得列表
        /// </summary>
        /// <returns></returns>
        public List<OddsLiveMatch> GetScrollMatchList()
        {
            List<OddsLiveMatch> liveMatchList = new List<OddsLiveMatch>();
            try
            {
                HttpHelper h = new HttpHelper();
                Cookie lng = new Cookie("lng", "2");
                lng.Domain = domain;
                h.CookieContainer.Add(lng);
                string zoudi = h.GetHtml("https://" +domain+ "/default.aspx"+ zoudiUrl);
                if (!string.IsNullOrEmpty(zoudi))
                {
                    #region 分析网页html节点
                    Lexer lexer = new Lexer(zoudi);
                    Parser parser = new Parser(lexer);
                    NodeList bodyNodes = parser.Parse(new TagNameFilter("HTML"))[0].Children.ExtractAllNodesThatMatch(new TagNameFilter("BODY"))[0].Children;
                    ITag divNode = bodyNodes.ExtractAllNodesThatMatch(new TagNameFilter("FORM"))[0].Children.ExtractAllNodesThatMatch(new TagNameFilter("DIV"))[0] as ITag;
                    if (divNode.Attributes["ID"].Equals("PageBody"))
                    {
                        NodeList dataDivList = divNode.Children.SearchFor(typeof(Winista.Text.HtmlParser.Tags.Div));
                        if (dataDivList[0].ToPlainTextString() == "走地盤")
                        {
                            if (dataDivList[2].ToPlainTextString() == "全場賽果")
                            {
                                return liveMatchList;
                            }
                            for (int i = 0; i < dataDivList.Count; i++)
                            {
                                ITag div = dataDivList[i] as ITag;
                                if (div.Attributes["CLASS"] != null && div.Attributes["CLASS"].Equals("menuRow"))
                                {
                                    OddsLiveMatch oddsLive = new OddsLiveMatch();
                                    oddsLive.urlparams = (div.FirstChild as ITag).Attributes["HREF"].ToString();
                                    oddsLive.id = oddsLive.urlparams.Split('&')[0].Substring(4);
                                    oddsLive.time = DateTime.Now;
                                    oddsLive.name = div.ToPlainTextString();
                                    liveMatchList.Add(oddsLive);
                                }
                            }
                        }
                    }
                    #endregion 分析网页html节点
                }
            }
            catch (Exception)
            {

            }
            return liveMatchList;
        }
        private void CrawlCurrentPage(WebBrowser wb, bool isOffline, bool IsUnConfirmChecked, ref bool hasValidData)
        {
            mshtml.IHTMLDocument2 doc2 = isOffline ? null : (mshtml.IHTMLDocument2)wb.Document;
            string html = isOffline ? s_htmlFake : doc2.body.innerHTML;


            Debug.WriteLine(html);


            List <ImportInvoiceDTO>      list         = new List <ImportInvoiceDTO>();
            List <hParser.Tags.TableRow> validRowList = new List <hParser.Tags.TableRow>();

            //this.parseResult = "";

            #region  使用IHTMLDocument2提取HTML

            mshtml.HTMLTableClass table = IsUnConfirmChecked ? (mshtml.HTMLTableClass)doc2.all.item("example1", 0) : (mshtml.HTMLTableClass)doc2.all.item("example", 0);
            if (table == null)
            {
                hasValidData = false;
                //throw new InvalidOperationException("无效table");
                return;
            }
            mshtml.HTMLTableSectionClass tbody = (mshtml.HTMLTableSectionClass)table.lastChild;
            if (tbody == null)
            {
                hasValidData = false;
                //throw new InvalidOperationException("无效tbody");
                return;
            }

            var tbodyHtml = tbody.innerHTML;

            if (0 == string.Compare(tbody.innerText, "没找到记录", StringComparison.InvariantCultureIgnoreCase))
            {
                hasValidData = false;
                //throw new InvalidOperationException("无效tbody");
                return;
            }
            #region WPF WebBroswer交互源代码DOM元素总结

#if RESEARCH
            //HTMLDocument doc01 = wb.Document as HTMLDocument;
            ////IHTMLDocument2 doc02 = wb.Document as IHTMLDocument2;
            //Debug.WriteLine(doc01.body.innerHTML);


            ///读/写元素
            ///
            mshtml.IHTMLElement login_pass = (mshtml.IHTMLElement)doc2.all.item("login_pass", 0);
            mshtml.IHTMLElement password   = (mshtml.IHTMLElement)doc2.all.item("password", 0);
            password.setAttribute("value", "12345678");
            login_pass.setAttribute("style", "");

            mshtml.IHTMLElement login_pass1 = (mshtml.IHTMLElement)doc2.all.item("login_pass1", 0);
            mshtml.IHTMLElement password1   = (mshtml.IHTMLElement)doc2.all.item("password1", 0);
            login_pass1.setAttribute("style", "display:none;");
            //password1.setAttribute("style", "width:1px");

            //IHTMLElement item = doc01.getElementById("ptmm");
            //item.innerHTML = "<INPUT id=\"pwd\" class=\"login_input password\" type=\"text\" value=\"\" />";

            ////  doc01.body.insertAdjacentHTML(,);
            //MessageBox.Show(item.innerText);

            //wb.NavigateToString(doc01.body.innerHTML);

            /// Trigger event
            //点击确定按钮
            loginBT.click();


            /// script injection
            ///
            //Basic ds = new Basic();
            //wb.ObjectForScripting = ds;//该对象可由显示在WebBrowser控件中的网页所包含的脚本代码访问

            ///Levarage JS
            ///
            mshtml.IHTMLWindow2 win = (mshtml.IHTMLWindow2)doc2.parentWindow;
            win.execScript("Login('12345678', '', 1)", "javascript");
            return;
#endif


            #endregion

            #endregion
            #region 使用HtmlParser提取tbodyHtml
            Lexer              lexer    = new Lexer(tbodyHtml);
            hParser.Parser     parser   = new hParser.Parser(lexer);
            hParser.NodeFilter filter   = new NodeClassFilter(typeof(Winista.Text.HtmlParser.Tags.TableRow));
            NodeList           nodeList = parser.Parse(filter);
            if (nodeList.Count == 0)
            {
                hasValidData = false;
                MessageBox.Show("没有符合要求的节点");
            }
            else
            {
                for (int i = 0; i < nodeList.Count; i++)
                {
                    //抓取一行
                    var tagTR = parserTR(nodeList[i]);

                    #region 充填有效行
                    if (tagTR != null)
                    {
                        validRowList.Add(tagTR);
                    }
                    #endregion
                }

                parserValidTR(validRowList, IsUnConfirmChecked, ref list);
            }

            #endregion
            #region 使用HtmlParser提取HTML

            /* Lexer lexer = new Lexer(html);
             * hParser.Parser parser = new hParser.Parser(lexer);
             * hParser.NodeFilter filter = new NodeClassFilter(typeof(Winista.Text.HtmlParser.Tags.TableRow));
             * NodeList nodeList = parser.Parse(filter);
             * if (nodeList.Count == 0)
             *  MessageBox.Show("没有符合要求的节点");
             * else
             * {
             *  for (int i = 0; i < nodeList.Count; i++)
             *  {
             *      //抓取一行
             *      var tagTR = parserTR(nodeList[i]);
             *
             #region 充填有效行
             *      if (tagTR != null)
             *          validRowList.Add(tagTR);
             #endregion
             *
             *  }
             *
             *  parserValidTR(validRowList, ref list);
             *
             * }
             */
            #endregion

            #region 日志 & 导出 & 持久化

            if (list == null || list.Count == 0)
            {
                MessageBox.Show("该页面上没有检测到预期数据");
                hasValidData = false;
            }

            ImportInvoiceListDTO soap = new ImportInvoiceListDTO
            {
                List   = list,
                Result = new ImportInvoiceResultDTO
                {
                    Message = "CALLBACK",
                    Status  = 9
                }
            };

            Debug.Write(soap);
            #region Log
            if (this.IfLog == "1")
            {
                soap.List.ForEach(impinfo =>
                {
                    if (IsUnConfirmChecked)
                    {
                        LogHelper.WriteLog(typeof(WebBoxView), string.Format("发票代码{0} 发票号码{1} 开票日期{2} 销方税号{3} 金额{4} 税额{5} 来源{6} 发票状态{7} 勾选标志{8} 操作时间{9}", impinfo.InvoiceCode, impinfo.InvoiceNumber, impinfo.CreateDate, impinfo.SalesTaxNumber, impinfo.Amount, impinfo.Tax, impinfo.From, impinfo.Status, impinfo.SelectTag, impinfo.ChosenTime));
                    }
                    else
                    {
                        LogHelper.WriteLog(typeof(WebBoxView), string.Format("发票代码{0} 发票号码{1} 开票日期{2} 销方税号{3} 金额{4} 税额{5} 来源{6} 发票状态{7} 确认月份{8}", impinfo.InvoiceCode, impinfo.InvoiceNumber, impinfo.CreateDate, impinfo.SalesTaxNumber, impinfo.Amount, impinfo.Tax, impinfo.From, impinfo.Status, impinfo.SelectTag));
                    }
                });
            }
            #endregion
            if (this.IfCallWS == "1")
            {
                CallWS(soap);
            }
            Debug.Write("本页已同步完成,请点击下一页继续同步");
            //FakeBusy();

            #endregion
        }