/// <summary>
/// Re-assembles a trimmed HTML document from the first node matched by a "TR" tag-name filter.
/// </summary>
/// <param name="sourceHtml">HTML text to parse; line breaks are stripped before parsing.</param>
/// <returns>
/// A string of the form <c>&lt;html&gt;{first child}&lt;body&gt;&lt;{container text}&gt;{container's tag children}&lt;/div&gt;&lt;/body&gt;&lt;/html&gt;</c>.
/// </returns>
private string HtmlText(string sourceHtml)
        {
            string flattened = sourceHtml.Replace(System.Environment.NewLine, "");
            hParser.Parser parser = hParser.Parser.CreateParser(flattened, "utf-8");

            hParser.NodeFilter tagFilter = new TagNameFilter("TR");
            hParser.INode root = parser.Parse(tagFilter)[0];

            // First child is emitted verbatim; presumably the <head> element - TODO confirm.
            string headHtml = root.Children[0].ToHtml();

            // Second child's first child is the container whose tag children are kept.
            hParser.INode container = root.Children[1].Children[0];

            StringBuilder tagMarkup = new StringBuilder();
            for (int index = 0; index < container.Children.Count; index++)
            {
                hParser.INode child = container.Children[index];
                if (!(child is hParser.ITag))
                {
                    // Non-tag nodes (text, remarks) are dropped.
                    continue;
                }
                tagMarkup.Append(child.ToHtml());
            }

            // The container's opening tag is rebuilt from its raw text; the closing tag is fixed to </div>.
            string openingTag = "<" + container.GetText() + ">";

            return string.Concat(
                "<html>",
                headHtml,
                "<body>",
                openingTag,
                tagMarkup.ToString(),
                "</div>",
                "</body>",
                "</html>");
        }
Ejemplo n.º 2
0
 /// <summary>
 /// Returns the plain text of the first tag named <paramref name="tag"/> that has an
 /// attribute whose key contains <paramref name="attribute"/> and whose value equals
 /// <paramref name="attValue"/>.
 /// </summary>
 /// <param name="html">HTML source to search.</param>
 /// <param name="tag">Tag name to look for.</param>
 /// <param name="attribute">Attribute name; matched as a substring of the attribute key.</param>
 /// <param name="attValue">Exact attribute value required.</param>
 /// <returns>The plain text of the first matching tag, or <c>null</c> when nothing matches.</returns>
 public static string getValue(string html, string tag, string attribute, string attValue)
 {
     Lexer lexer = new Lexer(html);
     Parser parser = new Parser(lexer);
     NodeFilter nodeFilter = new TagNameFilter(tag);
     NodeList nodeList = parser.Parse(nodeFilter);
     for (int i = 0; i < nodeList.Count; i++)
     {
         // "as" yields null for non-tag nodes; skip those instead of dereferencing null.
         ITag tagNode = nodeList[i] as ITag;
         if (tagNode == null || tagNode.Attributes == null || tagNode.Attributes.Count == 0)
         {
             continue;
         }

         foreach (string key in tagNode.Attributes.Keys)
         {
             // Keys containing "<TAGNAME>" are skipped; presumably the parser's synthetic
             // entry holding the tag name rather than a real attribute - TODO confirm.
             if (key.Contains("<TAGNAME>") || !key.Contains(attribute))
             {
                 continue;
             }

             // An attribute stored without a value must not crash the search.
             object attributeValue = tagNode.Attributes[key];
             if (attributeValue != null && attributeValue.ToString() == attValue)
             {
                 return tagNode.ToPlainTextString();
             }
         }
     }
     return null;
 }
Ejemplo n.º 3
0
 /// <summary>
 /// Collects one link per entry from the article list found in <paramref name="htmlContent"/>
 /// and appends each to <c>blogLinkList</c>.
 /// </summary>
 /// <param name="htmlContent">HTML of the page to scan.</param>
 static void GetBlogLink(string htmlContent)
 {
     Parser parser = new Parser(new Lexer(htmlContent));
     NodeList articles = parser.Parse(articleFilter);

     // Exactly one article container is expected; anything else is reported and abandoned.
     if (articles.Count != 1)
     {
         Console.WriteLine("获取包含日志列表出错!");
         return;
     }

     NodeList entries = articles[0].Children.ExtractAllNodesThatMatch(wrapFilter, true);
     for (int entryIndex = 0; entryIndex < entries.Count; entryIndex++)
     {
         // Only direct children are searched (non-recursive) for anchor tags.
         NodeList anchors = entries[entryIndex].Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ATag)), false);
         if (anchors.Count != 1)
         {
             Console.WriteLine("第" + entryIndex + "个条目中,判断链接出错!");
             continue;
         }

         ATag anchor = (ATag)anchors[0];
         blogLinkList.Add(anchor.ExtractLink());
     }
 }
Ejemplo n.º 4
0
        /// <summary>
        /// Extracts product entries from a page: every node with <c>class="product"</c> yields
        /// one <c>Product</c> carrying its title, price, lazy-load image URL and optional promo.
        /// </summary>
        /// <param name="html">HTML of the product listing.</param>
        /// <returns>
        /// The products that could be parsed. Entries that fail to parse are skipped
        /// (best effort) and reported through <c>System.Diagnostics.Trace</c>.
        /// </returns>
        public static List<Product> LoadGoods(string html)
        {
            Lexer lexer = new Lexer(html);
            Parser parser = new Parser(lexer);
            NodeFilter filter = new HasAttributeFilter("class", "product");
            NodeList products = parser.ExtractAllNodesThatMatch(filter);

            List<Product> result = new List<Product>();
            for (int i = 0; i < products.Count; i++)
            {
                try
                {
                    ITag product = products[i] as ITag;

                    // name
                    NodeFilter nameFilter = new HasAttributeFilter("class", "product-title");
                    NodeList names = product.Children.ExtractAllNodesThatMatch(nameFilter, true);
                    ITag name = names[0] as ITag;
                    string pname = name.ToPlainTextString().Trim();

                    // price
                    NodeFilter priceFilter = new HasAttributeFilter("class", "product-price");
                    NodeList prices = product.Children.ExtractAllNodesThatMatch(priceFilter, true);
                    ITag price = prices[0] as ITag;
                    // The first 7 characters of the price text are discarded; presumably an
                    // encoded currency prefix - TODO confirm against the scraped markup.
                    string priceText = price.ToPlainTextString().Trim().Substring(7);
                    // Scraped text is machine data: parse it independently of the host culture.
                    decimal pprice = Decimal.Parse(priceText, System.Globalization.CultureInfo.InvariantCulture);

                    // img
                    NodeFilter imgFilter = new TagNameFilter("img");
                    NodeList imgs = product.Children.ExtractAllNodesThatMatch(imgFilter, true);
                    ITag img = imgs[0] as ITag;
                    string pimg = img.GetAttribute("DATA-KS-LAZYLOAD");

                    // promo (optional)
                    string ppromo = "";
                    NodeFilter promoFilter = new HasAttributeFilter("class", "promo");
                    NodeList promos = product.Children.ExtractAllNodesThatMatch(promoFilter, true);
                    if (promos.Count > 0)
                    {
                        ITag promo = promos[0] as ITag;
                        ppromo = promo.GetAttribute("data-promo");
                    }

                    Product p = new Product();
                    p.img = pimg;
                    p.name = pname;
                    p.price = pprice;
                    p.promo = ppromo;
                    result.Add(p);
                }
                catch (Exception ex)
                {
                    // Best effort: a malformed entry must not abort the whole listing,
                    // but the failure is recorded instead of being silently discarded.
                    System.Diagnostics.Trace.TraceWarning("LoadGoods: skipped product #{0}: {1}", i, ex.Message);
                }
            }

            return result;
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Replaces the stored 1x2 odds history of a schedule: deletes the existing rows,
        /// downloads the history page of every company and stores each parsed odds row
        /// whose timestamp is not later than <paramref name="time"/>.
        /// </summary>
        /// <param name="scheduleID">Schedule identifier (parsed as an integer).</param>
        /// <param name="companyids">Comma-separated company ids.</param>
        /// <param name="historyids">Comma-separated history ids, positionally paired with <paramref name="companyids"/>.</param>
        /// <param name="time">Upper bound; rows with a later timestamp are not stored.</param>
        /// <returns>A serialized <c>JSONHelper</c> with <c>success = true</c> and the number of rows stored.</returns>
        public string Add(string scheduleID, string companyids, string historyids,DateTime time)
        {
            WebClientBLL bll = new WebClientBLL();
            string[] companyidArr = companyids.Split(',');
            string[] historyidArr = historyids.Split(',');
            int count = 0;
            // Nothing is deleted or added unless both id lists pair up one-to-one.
            if (companyidArr.Length == historyidArr.Length)
            {
                dal.Delete(scheduleID);
                for (int i = 0; i < companyidArr.Length; i++)
                {
                    string s = bll.GetOddsHistoryContent(historyidArr[i]);

                    Lexer lexer = new Lexer(s);
                    Parser parser = new Parser(lexer);
                    NodeList bodyNodes = parser.Parse(new TagNameFilter("HTML"))[0].Children.ExtractAllNodesThatMatch(new TagNameFilter("BODY"))[0].Children;
                    ITag table = bodyNodes.SearchFor(typeof(Winista.Text.HtmlParser.Tags.TableTag))[0] as ITag;

                    NodeList tableRows = table.Children.SearchFor(typeof(Winista.Text.HtmlParser.Tags.TableRow));

                    for (int f = 0; f < tableRows.Count; f++)
                    {
                        ITag row = tableRows[f] as ITag;
                        if (row == null || row.Attributes == null)
                        {
                            continue;
                        }

                        // Only rows with align="center" and bgcolor="#FFFFFF" are processed. Rows missing
                        // either attribute are skipped instead of raising a NullReferenceException.
                        object align = row.Attributes["ALIGN"];
                        object bgColor = row.Attributes["BGCOLOR"];
                        if (align == null || bgColor == null || !align.Equals("center") || !bgColor.Equals("#FFFFFF"))
                        {
                            continue;
                        }

                        Odds1x2History model = new Odds1x2History();
                        model.companyid = int.Parse(companyidArr[i]);
                        model.scheduleid = int.Parse(scheduleID);
                        // Odds are scraped machine data: parse them independently of the host culture.
                        model.home = float.Parse(row.Children[0].ToPlainTextString(), System.Globalization.CultureInfo.InvariantCulture);
                        model.draw = float.Parse(row.Children[1].ToPlainTextString(), System.Globalization.CultureInfo.InvariantCulture);
                        model.away = float.Parse(row.Children[2].ToPlainTextString(), System.Globalization.CultureInfo.InvariantCulture);
                        this.FillOdds1x2History(model);

                        // The fourth cell holds a "showtime(y,m,d,h,mi,s)" call; strip the wrapper and split the arguments.
                        string[] t2 = row.Children[3].ToPlainTextString().Replace("showtime(", "").Replace(")", "").Split(',');
                        int yy = int.Parse(t2[0]);
                        // Only the first two characters of the month argument are used; presumably the
                        // argument looks like "08-1" (zero-based script month) - TODO confirm.
                        int mm = int.Parse(t2[1].Remove(2));
                        int dd = int.Parse(t2[2]);
                        int hh = int.Parse(t2[3]);
                        int mi = int.Parse(t2[4]);
                        int ss = int.Parse(t2[5]);
                        // Built as a UTC value and shifted by eight hours.
                        model.time = new DateTime(yy, mm, dd, hh, mi, ss, DateTimeKind.Utc).AddHours(8d);
                        if (model.time > time)
                        {
                            continue;
                        }
                        dal.Add(model);
                        count++;
                    }
                }
            }
            JSONHelper json = new JSONHelper();
            json.success = true;
            json.totlalCount = count;
            return json.ToString();
        }
Ejemplo n.º 6
0
 /// <summary>
 /// Reads the local catalogue page "catalogue.htm", downloads every chapter page it links
 /// to, and writes the paragraphs of each chapter to a text file.
 /// </summary>
 /// <remarks>
 /// Every reader, response and writer is disposed deterministically, and an empty
 /// catalogue no longer fails on a null chapter reader.
 /// </remarks>
 static void GetStoryOfRevolution()
 {
     string catalogueHtml;
     using (StreamReader reader = new StreamReader("catalogue.htm"))
     {
         catalogueHtml = reader.ReadToEnd();
     }

     Lexer lexer = new Lexer(catalogueHtml);
     Parser parser = new Parser(lexer);
     // A chapter link carries the link class and has an ancestor with the chapter-list class.
     HasAttributeFilter linkFilterByParent = new HasAttributeFilter("class", "row zhangjieUl");
     HasAttributeFilter linkFilterByClass = new HasAttributeFilter("class", "fontStyle2 colorStyleLink");
     AndFilter linkFilter = new AndFilter(new HasParentFilter(linkFilterByParent, true), linkFilterByClass);
     NodeList linkNodeList = parser.Parse(linkFilter);

     List<string> chapterHtmlContentList = new List<string>(linkNodeList.Size());
     for (int i = 0; i < linkNodeList.Size(); i++)
     {
         ATag linkNode = (ATag)linkNodeList[i];
         HttpWebRequest httpWebRequest = HttpWebRequest.CreateHttp("http://www.mlxiaoshuo.com" + linkNode.Link);
         // Dispose the response and reader of every chapter, not just the last one.
         using (WebResponse response = httpWebRequest.GetResponse())
         using (StreamReader chapterReader = new StreamReader(new BufferedStream(response.GetResponseStream(), 4 * 200 * 1024)))
         {
             chapterHtmlContentList.Add(chapterReader.ReadToEnd());
         }
         Console.WriteLine("第" + (i + 1) + "个页面获取完毕!");
     }

     HasAttributeFilter praghFilter = new HasAttributeFilter("class", "textP fontStyle2 colorStyleText");
     using (StreamWriter writer = new StreamWriter("革命逸事.txt"))
     {
         for (int i = 0; i < chapterHtmlContentList.Count; i++)
         {
             writer.WriteLine("第" + (i + 1) + "章");
             lexer = new Lexer(chapterHtmlContentList[i]);
             parser = new Parser(lexer);
             NodeList praghNodeList = parser.Parse(praghFilter);
             // Exactly one text container is expected per chapter page.
             if (praghNodeList.Size() == 1)
             {
                 NodeList children = praghNodeList[0].Children;
                 for (int j = 0; j < children.Size(); j++)
                 {
                     // Exact type match: only ParagraphTag instances are written out.
                     if (children[j].GetType().Equals(typeof(ParagraphTag)))
                     {
                         ParagraphTag praghTag = (ParagraphTag)children[j];
                         writer.WriteLine("    " + praghTag.StringText);
                     }
                 }
                 writer.WriteLine();
             }
             else
             {
                 Console.WriteLine("第" + (i + 1) + "页中,判断段落的标准出错!");
             }
         }
     }
 }
Ejemplo n.º 7
0
        /// <summary>
        /// Downloads a job page and extracts its description, publication date, contact
        /// e-mail and phone number into a <c>Job</c>.
        /// </summary>
        /// <param name="url">Address of the job detail page.</param>
        /// <returns>
        /// The populated job, or an empty <c>Job</c> when the page has no node with
        /// <c>class="d_left"</c>.
        /// </returns>
        public Job GetJobInfoParser(string url)
        {
            Job jobinfo = new Job();

            Parser parser = new Parser(new HttpProtocol(new Uri(url)));

            NodeFilter detail = new HasAttributeFilter("class", "d_left");

            NodeList nodeDetail = parser.ExtractAllNodesThatMatch(detail);
            if (nodeDetail == null || nodeDetail.Count == 0)
            {
                return jobinfo;
            }

            string description = GetDetailString(nodeDetail);

            // Publication date: fall back to "now" unless the page states a parsable date.
            // TryParse resets its out argument on failure, so parse into a scratch variable
            // to keep the fallback intact.
            DateTime dt = DateTime.Now;
            Match m = Regex.Match(description, @"发布时间:(?<date>\d\d\d\d-\d{1,2}\-\d{1,2} \d{1,2}\:\d{1,2})");
            DateTime publishedOn;
            if (m.Success && m.Groups["date"].Success && DateTime.TryParse(m.Groups["date"].Value, out publishedOn))
            {
                dt = publishedOn;
            }

            string email = string.Empty;
            Match emailMatch = Regex.Match(description, @"([a-zA-Z0-9_\-\.]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?)");
            if (emailMatch.Success)
            {
                email = emailMatch.Value;
            }

            // 11-digit mobile number starting with 13, 15 or 18. The previous pattern put '|'
            // inside its character classes, so it also accepted a literal pipe as a digit.
            Match telMatch = Regex.Match(description, @"1[358][0-9]\d{8}");
            if (telMatch.Success)
            {
                jobinfo.tel = telMatch.Value;
            }

            jobinfo.category_id = Catalog.id;
            // NOTE(review): no title is extracted from the page, so it is stored empty.
            jobinfo.title = string.Empty;
            jobinfo.description = description;
            jobinfo.created_on = dt;
            jobinfo.is_active = true;
            jobinfo.city_id = Catalog.city.id;
            jobinfo.sp1010url = url;
            jobinfo.poster_email = email;

            return jobinfo;
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Reads a saved page and builds one <c>PlayTime</c> per node with <c>CLASS="px14"</c>.
        /// </summary>
        /// <param name="xmlFile">
        /// Path of the file to read; the first run of four digits in the path is used as the cinema id.
        /// </param>
        /// <returns>The parsed play times, or <c>null</c> when the page contains no matching node.</returns>
        public List<PlayTime> getPlayTimes(string xmlFile)
        {
            // Cinema id comes from the file path, not from the page content.
            string cinemaID = Regex.Match(xmlFile, @"\d\d\d\d").Value;
            List<PlayTime> playTimes = new List<PlayTime>();
            string html = File.ReadAllText(xmlFile);

            Parser playParser = new Parser(new Lexer(html));
            NodeList playTimeList = playParser.ExtractAllNodesThatMatch(new HasAttributeFilter("CLASS", "px14"));
            if (playTimeList.Count < 1)
            {
                return null;
            }

            // Characters stripped from both ends of the release-date text.
            char[] releaseMarkers = { '上', '映' };
            for (int index = 0; index < playTimeList.Count; index++)
            {
                PlayTime entry = new PlayTime();
                ITag playTag = playTimeList[index] as ITag;
                ITag idTag = playTag.FirstChild as ITag;
                if (idTag.Attributes != null)
                {
                    // Movie id: prefer a six-digit run in the link, otherwise accept five digits.
                    string href = idTag.Attributes["HREF"].ToString();
                    Match idMatch = Regex.Match(href, @"\d\d\d\d\d\d");
                    if (!idMatch.Success)
                    {
                        idMatch = Regex.Match(href, @"\d\d\d\d\d");
                    }
                    if (idMatch.Success)
                    {
                        entry.MovieID = int.Parse(idMatch.Value);
                    }
                }

                // The date text lives two siblings after the matched node.
                string rawTime = playTag.NextSibling.NextSibling.ToPlainTextString();
                entry.Playtime = DateTime.Parse(rawTime.Trim(releaseMarkers));
                entry.CinemaID = int.Parse(cinemaID);
                entry.PlayState = true;

                playTimes.Add(entry);
            }
            return playTimes;
        }
Ejemplo n.º 9
0
 /// <summary>
 /// Extracts the page title from <paramref name="htmlContent"/> using <c>titleFilter</c>.
 /// </summary>
 /// <param name="htmlContent">HTML of the page.</param>
 /// <returns>
 /// The title when exactly one node matches; otherwise an empty string (an error line is
 /// written to the console).
 /// </returns>
 static string GetBlogTitle(string htmlContent)
 {
     Parser parser = new Parser(new Lexer(htmlContent));
     NodeList matches = parser.Parse(titleFilter);

     if (matches.Count != 1)
     {
         Console.WriteLine("获取标题信息出错!");
         return "";
     }

     return ((TitleTag)matches[0]).Title;
 }
Ejemplo n.º 10
0
        /// <summary>
        /// Downloads the live-odds page at <paramref name="urlparams"/> and, when it has the
        /// expected layout, stores one <c>OddsLiveHistory</c> row for <paramref name="matchid"/>.
        /// </summary>
        /// <param name="matchid">Identifier stored on the history row.</param>
        /// <param name="urlparams">Address passed as-is to the HTTP helper.</param>
        /// <returns>A list that is never filled by this method, so it is always empty.</returns>
        public List<OddsLiveMatch> GetMatchScrollOdds(string matchid,string urlparams)
        {
            List<OddsLiveMatch> liveMatchList = new List<OddsLiveMatch>();
            try
            {
                HttpHelper h = new HttpHelper();
                Cookie lng = new Cookie("lng", "2");
                lng.Domain = domain;
                h.CookieContainer.Add(lng);

                string zoudi = h.GetHtml(urlparams);
                if (string.IsNullOrEmpty(zoudi))
                {
                    return liveMatchList;
                }

                // Walk html > body > form > first div.
                Parser parser = new Parser(new Lexer(zoudi));
                NodeList htmlNodes = parser.Parse(new TagNameFilter("HTML"));
                NodeList bodyNodes = htmlNodes[0].Children.ExtractAllNodesThatMatch(new TagNameFilter("BODY"))[0].Children;
                NodeList formNodes = bodyNodes.ExtractAllNodesThatMatch(new TagNameFilter("FORM"));
                ITag divNode = formNodes[0].Children.ExtractAllNodesThatMatch(new TagNameFilter("DIV"))[0] as ITag;
                if (!divNode.Attributes["ID"].Equals("PageBody"))
                {
                    return liveMatchList;
                }

                // The page is only used when the expected captions sit at positions 0 and 2.
                NodeList dataDivList = divNode.Children.SearchFor(typeof(Winista.Text.HtmlParser.Tags.Div));
                bool hasExpectedCaptions = dataDivList[0].ToPlainTextString() == "走地盤"
                    && dataDivList[2].ToPlainTextString() == "全場賽果";
                if (hasExpectedCaptions)
                {
                    // Each odds cell holds "<value> <rest>"; only the part before the first space is parsed.
                    OddsLiveHistory liveHistory = new OddsLiveHistory();
                    liveHistory.matchid = matchid;
                    liveHistory.home = float.Parse(dataDivList[3].ToPlainTextString().Split(' ')[0]);
                    liveHistory.draw = float.Parse(dataDivList[5].ToPlainTextString().Split(' ')[0]);
                    liveHistory.away = float.Parse(dataDivList[7].ToPlainTextString().Split(' ')[0]);
                    liveHistory.time = DateTime.Now;
                    dal.AddHistory(liveHistory);
                }
            }
            catch (Exception)
            {
                // NOTE(review): every failure (network, layout change, parse error) is
                // swallowed here, so callers cannot tell a failure from "no data".
            }
            return liveMatchList;
        }
Ejemplo n.º 11
0
 /// <summary>
 /// Extracts up to five facility descriptions of a cinema from its page.
 /// </summary>
 /// <remarks>
 /// For every node with <c>CLASS="c_000"</c> the CLASS attribute of its previous sibling
 /// selects which output receives the plain text of the node's parent.
 /// </remarks>
 /// <param name="html">HTML of the cinema page.</param>
 /// <param name="dining">Text for sibling class "ico_cside1 mr12"; empty when absent.</param>
 /// <param name="park">Text for sibling class "ico_cside2 mr12"; empty when absent.</param>
 /// <param name="gameCenter">Text for sibling class "ico_cside3 mr12"; empty when absent.</param>
 /// <param name="intro3D">Text for sibling class "ico_cside5 mr12"; empty when absent.</param>
 /// <param name="introVIP">Text for sibling class "ico_cside7 mr12"; empty when absent.</param>
 /// <returns>Always <c>true</c>.</returns>
 public bool getCinemaFive(string html, out string dining, out string park, out string gameCenter, out string intro3D, out string introVIP)
 {
     dining = string.Empty;
     park = string.Empty;
     gameCenter = string.Empty;
     intro3D = string.Empty;
     introVIP = string.Empty;
     Lexer lexer = new Lexer(html);
     Parser parser = new Parser(lexer);
     NodeFilter nodeFilter = new HasAttributeFilter("CLASS", "c_000");
     NodeList nodeList = parser.ExtractAllNodesThatMatch(nodeFilter);
     for (int i = 0; i < nodeList.Count; i++)
     {
         INode node = nodeList[i];
         ITag tagPar = (node.Parent as ITag);
         ITag tagSib = (node.PreviousSibling as ITag);
         // The previous sibling may be missing or a non-tag node (e.g. text), and the
         // parent may not be a tag; such nodes cannot be classified, so skip them
         // instead of dereferencing null.
         if (tagSib == null || tagPar == null || tagSib.Attributes == null)
         {
             continue;
         }

         object siblingClass = tagSib.Attributes["CLASS"];
         if (siblingClass == null)
         {
             continue;
         }

         switch (siblingClass.ToString())
         {
             case "ico_cside1 mr12":
                 dining = tagPar.ToPlainTextString();
                 break;
             case "ico_cside2 mr12":
                 park = tagPar.ToPlainTextString();
                 break;
             case "ico_cside3 mr12":
                 gameCenter = tagPar.ToPlainTextString();
                 break;
             case "ico_cside5 mr12":
                 intro3D = tagPar.ToPlainTextString();
                 break;
             case "ico_cside7 mr12":
                 introVIP = tagPar.ToPlainTextString();
                 break;
         }
     }
     return true;
 }
        /// <summary>
        /// Obtains the page HTML (the canned offline sample when <c>IsOffline</c> is "1"),
        /// extracts every table row and feeds each one to <c>parserTR</c>, then shows the
        /// accumulated result in a message box.
        /// </summary>
        public void StartCrawl()
        {
            this.parseResult = "";
            // Constructed for its validation side effect only; the value itself is not used.
            Uri uri = new Uri(this.TargetUri);

            #region
            // The page is meant to come from the WebBrowser control, where the user performs
            // any authorization by hand (an HttpHelper download would need to implement it).
            // NOTE(review): doc2 is never assigned a document, so the online branch below
            // dereferences null.
            mshtml.IHTMLDocument2 doc2 = null;
            bool offline = string.Compare(this.IsOffline, "1", StringComparison.InvariantCultureIgnoreCase) == 0;
            string html;
            if (offline)
            {
                html = s_htmlFake;
            }
            else
            {
                html = doc2.body.innerHTML;
            }
            Debug.WriteLine(html);
            #endregion

            #region Extract table rows with HtmlParser
            hParser.Parser parser = new hParser.Parser(new Lexer(html));
            hParser.NodeFilter rowFilter = new NodeClassFilter(typeof(Winista.Text.HtmlParser.Tags.TableRow));
            NodeList rows = parser.Parse(rowFilter);
            if (rows.Count == 0)
            {
                MessageBox.Show("没有符合要求的节点");
                return;
            }

            for (int rowIndex = 0; rowIndex < rows.Count; rowIndex++)
            {
                parserTR(rows[rowIndex]);
            }
            MessageBox.Show(parseResult);
            #endregion
        }
Ejemplo n.º 13
0
 /// <summary>
 /// Finds the single node whose <paramref name="attribute"/> equals
 /// <paramref name="attValue"/> and returns the value of its
 /// <paramref name="attributeV"/> attribute.
 /// </summary>
 /// <param name="html">HTML source to search.</param>
 /// <param name="tag">Not used by this method.</param>
 /// <param name="attribute">Name of the attribute that identifies the node.</param>
 /// <param name="attValue">Value the identifying attribute must have.</param>
 /// <param name="attributeV">Key of the attribute whose value is returned.</param>
 /// <returns>
 /// The attribute value, or <c>null</c> when the number of matching nodes is not exactly
 /// one, the match is not a tag, or the tag does not carry <paramref name="attributeV"/>.
 /// </returns>
 public static string getAttValue(string html, string tag, string attribute, string attValue, string attributeV)
 {
     Lexer lexer = new Lexer(html);
     Parser parser = new Parser(lexer);
     NodeFilter nodeFilter = new HasAttributeFilter(attribute, attValue);
     NodeList nodeList = parser.ExtractAllNodesThatMatch(nodeFilter);
     if (nodeList.Count != 1)
     {
         return null;
     }

     ITag tagNode = nodeList[0] as ITag;
     if (tagNode == null || tagNode.Attributes == null)
     {
         return null;
     }

     // A missing attribute yields null instead of a NullReferenceException,
     // consistent with the "not found" result above.
     object requested = tagNode.Attributes[attributeV];
     return requested == null ? null : requested.ToString();
 }
Ejemplo n.º 14
0
 /// <summary>
 /// Downloads the page for <paramref name="id"/>, follows its first "/girl/…/" link to the
 /// personal page and hands every image matched by <c>BeautyFlowImgFilter</c> to
 /// <c>GetPicUrlsFromBeautyPersonalPage</c>.
 /// </summary>
 /// <remarks>
 /// Best effort: any exception (network failure, unexpected markup) ends the call silently.
 /// Responses and readers are disposed on every path.
 /// </remarks>
 /// <param name="id">Numeric id appended to <c>BeautyFlowBaseUrl</c>.</param>
 static void BeautyFlow(int id)
 {
     try
     {
         string htmlContent = "";
         string url = BeautyFlowBaseUrl + id + "/";
         HttpWebRequest httpWebRequest = HttpWebRequest.CreateHttp(url);
         httpWebRequest.Method = "GET";
         using (HttpWebResponse httpWebResponse = (HttpWebResponse)httpWebRequest.GetResponse())
         {
             if (httpWebResponse.StatusCode.Equals(HttpStatusCode.OK))
             {
                 using (StreamReader reader = new StreamReader(new BufferedStream(httpWebResponse.GetResponseStream(), 4 * 200 * 1024)))
                 {
                     htmlContent = reader.ReadToEnd();
                 }
             }
         }

         if (htmlContent.Equals(""))
         {
             Console.WriteLine("得到的HTML为空!");
             return;
         }

         Console.WriteLine("第一个html读取完成!");
         // The link is the text from "/girl/" up to and including the next '/'.
         // URLs are not linguistic text, so search ordinally.
         int startIndex = htmlContent.IndexOf("/girl/", StringComparison.Ordinal);
         if (startIndex < 0)
         {
             // No personal-page link: nothing to download (previously surfaced as a swallowed exception).
             return;
         }
         int endIndex = htmlContent.IndexOf("/", startIndex + 6, StringComparison.Ordinal) + 1;
         string beautyMorePicturesLink = "http://curator.im" + htmlContent.Substring(startIndex, endIndex - startIndex);

         string htmlContentTwo = "";
         httpWebRequest = HttpWebRequest.CreateHttp(beautyMorePicturesLink);
         httpWebRequest.Method = "GET";
         using (HttpWebResponse httpWebResponse = (HttpWebResponse)httpWebRequest.GetResponse())
         {
             if (httpWebResponse.StatusCode.Equals(HttpStatusCode.OK))
             {
                 using (StreamReader reader = new StreamReader(new BufferedStream(httpWebResponse.GetResponseStream(), 4 * 200 * 1024)))
                 {
                     htmlContentTwo = reader.ReadToEnd();
                 }
             }
         }
         Console.WriteLine("第二个html读取完成!");

         Lexer lexer = new Lexer(htmlContentTwo);
         Parser parser = new Parser(lexer);
         parser.AnalyzePage();
         NodeList divList = parser.ExtractAllNodesThatMatch(BeautyNameFilter);
         if (divList.Count != 1)
         {
             Console.WriteLine("获取正妹名称出错! id=" + id);
             Console.Read();
             return;
         }

         // The name is the text before " |". It is not used further, but a heading without
         // '|' still aborts the download (Substring throws), exactly as before.
         string beautyName = divList[0].ToPlainTextString();
         endIndex = beautyName.IndexOf('|') - 1;
         beautyName = beautyName.Substring(0, endIndex);

         // Re-analyze the page so a second extraction can start again.
         parser.AnalyzePage();
         divList = parser.ExtractAllNodesThatMatch(BeautyFlowImgFilter);
         for (int i = 0; i < divList.Count; i++)
         {
             ImageTag imgNode = (ImageTag)divList[i];
             GetPicUrlsFromBeautyPersonalPage(imgNode, i, 2);
         }
     }
     catch (Exception)
     {
         // Deliberately ignored: a failing id must not stop the caller.
         // NOTE(review): consider reporting non-404 failures instead of hiding them.
     }
 }
Ejemplo n.º 15
0
 /// <summary>
 /// Downloads the page for <paramref name="date"/> and hands every matching image to
 /// <c>GetPicUrlsFromBeautyPersonalPage</c>.
 /// </summary>
 /// <param name="date">Date segment appended to <c>oneDayOneBeautyBaseUrl</c>.</param>
 /// <returns>
 /// The number of images found; 0 when the page is empty or the request fails with a
 /// <see cref="WebException"/>.
 /// </returns>
 static int OneDayOneBeauty(string date)
 {
     try
     {
         string htmlContent = "";
         string url = oneDayOneBeautyBaseUrl + date + "/";
         HttpWebRequest httpWebRequest = HttpWebRequest.CreateHttp(url);
         httpWebRequest.Method = "GET";
         // Dispose the response and reader on every path, including exceptions.
         using (HttpWebResponse httpWebResponse = (HttpWebResponse)httpWebRequest.GetResponse())
         {
             if (httpWebResponse.StatusCode.Equals(HttpStatusCode.OK))
             {
                 using (StreamReader reader = new StreamReader(new BufferedStream(httpWebResponse.GetResponseStream(), 4 * 200 * 1024)))
                 {
                     htmlContent = reader.ReadToEnd();
                 }
             }
         }

         if (htmlContent.Equals(""))
         {
             Console.WriteLine("得到的HTML为空!");
             return 0;
         }

         Lexer lexer = new Lexer(htmlContent);
         Parser parser = new Parser(lexer);
         parser.AnalyzePage();
         NodeList divList = parser.ExtractAllNodesThatMatch(OneDayOneBeautyImgFilter);
         if (divList.Count == 0)
         {
             // Fall back to the second filter when the first one finds nothing.
             parser.AnalyzePage();
             divList = parser.ExtractAllNodesThatMatch(OneDayOneBeautyImgFilter2);
         }
         for (int i = 0; i < divList.Count; i++)
         {
             ImageTag imgNode = (ImageTag)divList[i];
             GetPicUrlsFromBeautyPersonalPage(imgNode, i, 1);
         }
         return divList.Count;
     }
     catch (WebException e)
     {
         // Response is null when no HTTP response was received (timeout, DNS failure,
         // connection refused), so it must not be dereferenced unconditionally.
         HttpWebResponse errorResponse = e.Response as HttpWebResponse;
         if (errorResponse == null)
         {
             Console.WriteLine("访问网页出错!" + e.Message);
             return 0;
         }

         using (errorResponse)
         {
             if (errorResponse.StatusCode.Equals(HttpStatusCode.NotFound))
             {
                 Console.WriteLine("网页未找到!");
             }
             else
             {
                 Console.WriteLine("访问网页出错!状态码:" + errorResponse.StatusCode);
             }
         }
         return 0;
     }
 }
Ejemplo n.º 16
0
        /// <summary>
        /// Reads the cinema rating from the single node with <c>CLASS="point ml12 px18"</c>.
        /// </summary>
        /// <param name="html">HTML of the cinema page.</param>
        /// <returns>
        /// The text of the node's first and last child joined and parsed as a float, or
        /// <c>7.0</c> when the number of matching nodes is not exactly one.
        /// </returns>
        public float getCinemaGrade(string html)
        {
            Parser parser = new Parser(new Lexer(html));
            NodeList matches = parser.ExtractAllNodesThatMatch(new HasAttributeFilter("CLASS", "point ml12 px18"));

            // Default rating when the page does not contain exactly one rating node.
            if (matches.Count != 1)
            {
                return 7.0f;
            }

            INode ratingNode = matches[0];
            ITag leadingPart = ratingNode.FirstChild as ITag;
            ITag trailingPart = ratingNode.LastChild as ITag;

            // The rating is split over two child tags; join their text before parsing.
            string gradeText = leadingPart.ToPlainTextString() + trailingPart.ToPlainTextString();
            return float.Parse(gradeText);
        }
Ejemplo n.º 17
0
        /// <summary>
        /// Downloads the product page behind <paramref name="a"/> and parses its title,
        /// photos, description and color attribute.
        /// </summary>
        /// <param name="a">Anchor whose <c>Link</c> addresses the product page.</param>
        public void ParseProduct(ATag a)
        {
            Parser parser = new Parser(new Lexer(GetHtml(a.Link)));

            // Everything is parsed from the node(s) with id="productyou".
            NodeList nodes = parser.ExtractAllNodesThatMatch(new HasAttributeFilter("id", "productyou"));

            ParseProductTitle(nodes);
            ParseProductShowPhoto(nodes);
            ParseProductDemoPhoto(nodes);
            ParsePorductDescribe(nodes);

            // NOTE(review): the class="chans" extraction result is never used.
            nodes.ExtractAllNodesThatMatch(new HasAttributeFilter("class", "chans"), true);

            NodeList attributeNodes = nodes.ExtractAllNodesThatMatch(new HasAttributeFilter("class", "cph"), true);

            for (int index = 0, total = attributeNodes.Count; index < total; index++)
            {
                // The first child holds the attribute label.
                INode label = attributeNodes[index].Children[0];
                bool isColorAttribute = Regex.Match(label.ToPlainTextString(), @"^\s{0,}颜色", RegexOptions.IgnoreCase).Success;
                if (isColorAttribute)
                {
                    ParseProductColors(label);
                }
                Console.WriteLine();
            }
        }
Ejemplo n.º 18
0
        /// <summary>
        /// Downloads a product listing page and parses every product it links to.
        /// </summary>
        /// <param name="a">
        /// Anchor of the listing page; a "../" in its link is replaced with the site root
        /// "http://rrxf.cn/".
        /// </param>
        public void ParseProducts(ATag a)
        {
            string listingUrl = a.Link.Replace("../", "http://rrxf.cn/");
            Parser parser = new Parser(new Lexer(GetHtml(listingUrl)));

            // Each node with class="photoyi" wraps one product link.
            NodeList navNodes = parser.Parse(new HasAttributeFilter("class", "photoyi"));
            if (navNodes == null)
            {
                return;
            }

            for (int index = 0, total = navNodes.Count; index < total; index++)
            {
                ATag productLink = ParseProductUrl(navNodes[index].ToHtml());
                Console.WriteLine(productLink.Link);
                ParseProduct(productLink);
            }
        }
Ejemplo n.º 19
0
		/// <summary>Builds a parser that reads its input from an in-memory HTML string.</summary>
		/// <param name="html">The string containing the HTML to parse. Must not be <c>null</c>.</param>
		/// <param name="charset"><em>Optional</em>. The character set encoding handed to the
		/// underlying <c>Page</c>. If <c>null</c>, the default character set is used.</param>
		/// <returns>A parser with the <c>html</c> string as input.</returns>
		/// <exception cref="System.ArgumentException"><paramref name="html"/> is <c>null</c>.</exception>
		public static Parser CreateParser(System.String html, System.String charset)
		{
			if (html == null)
			{
				throw new System.ArgumentException("Html cannot be null");
			}

			// string -> Page -> Lexer -> Parser
			Page page = new Page(html, charset);
			Lexer lexer = new Lexer(page);
			return new Parser(lexer);
		}
Ejemplo n.º 20
0
 /// <summary>
 /// Points this instance at <paramref name="url"/>: a new parser is built over the
 /// content returned by <c>getHtml</c> and the node traversal state is reset.
 /// Any exception is logged and swallowed.
 /// </summary>
 /// <param name="url">Value passed to <c>getHtml</c> and stored as the current URL.</param>
 public virtual void Navigate(string url)
 {
     try
     {
         // Built first: if fetching or parser construction throws, no field has been changed yet.
         m_parser = new Parser(new Lexer(getHtml(url, null)), null);

         // Discard the traversal state that belonged to the previous page.
         m_nodestack.Clear();
         m_nodelist = null;
         m_nodeenum = null;
         m_node = null;

         m_url = url;
     }
     catch (Exception ex)
     {
         log.Error("Navigate: " + url, ex);
     }
 }
Ejemplo n.º 21
0
        /// <summary>
        /// Looks up the identifier of a paper by name. The HTML returned by
        /// <c>_HttpUtil.getPaperIDHTML</c> is searched for an <c>&lt;a target="_blank"&gt;</c>
        /// element that has a child accepted by <c>PaperFilter</c>; the identifier is the
        /// last path segment of that link's <c>href</c>, cut at the first '.'.
        /// </summary>
        /// <param name="paper_name">Name of the paper to look up.</param>
        /// <returns>The extracted identifier, or <c>null</c> when nothing usable was found.</returns>
        protected string getPaperID(string paper_name)
        {
            string searchHtml = _HttpUtil.getPaperIDHTML(paper_name);
            if (string.IsNullOrEmpty(searchHtml))
            {
                return null;
            }

            Parser parser = new Parser(new Lexer(searchHtml));

            TagNameFilter anchorFilter = new TagNameFilter("A");
            HasAttributeFilter targetFilter = new HasAttributeFilter("target", "_blank");
            HasChildFilter paperChildFilter = new HasChildFilter(new PaperFilter(paper_name));
            AndFilter paperLinkFilter = new AndFilter(new AndFilter(anchorFilter, targetFilter), paperChildFilter);

            NodeList matches = parser.ExtractAllNodesThatMatch(paperLinkFilter);
            if (matches == null || matches.Count <= 0)
            {
                // Paper not found
                return null;
            }

            // TODO: several papers may match; only the first hit is used.
            ITag link = matches[0] as ITag;
            if (link == null)
            {
                return null;
            }

            string href = link.GetAttribute("href");
            if (string.IsNullOrEmpty(href))
            {
                return null;
            }

            // Last '/'-separated segment, then everything before its first '.'.
            string[] segments = href.Split('/');
            string lastSegment = segments[segments.Length - 1];
            return lastSegment.Split('.')[0];
        }
Ejemplo n.º 22
0
        /// <summary>
        /// Finds the first link in <paramref name="html"/> that matches
        /// <c>lookcp.php?cpid=...</c> and rewrites its <c>Link</c> to an absolute URL.
        /// </summary>
        /// <param name="html">HTML fragment expected to contain the product link.</param>
        /// <returns>
        /// The first matching link tag, with "http://rrxf.cn/product/" prepended to its
        /// <c>Link</c>.
        /// </returns>
        public ATag ParseProductUrl(string html)
        {
            Parser parser = new Parser(new Lexer(html));

            NodeFilter productLinkFilter = new LinkRegexFilter(@"lookcp\.php\?cpid\=\d{0,}");
            NodeList matches = parser.Parse(productLinkFilter);

            // No fallback here: a first match that is not an ATag fails on the next line.
            ATag productLink = matches[0] as ATag;
            productLink.Link = string.Concat("http://rrxf.cn/product/", productLink.Link);
            return productLink;
        }
Ejemplo n.º 23
0
        /// <summary>
        /// Downloads the page at <c>Config.Uri</c>, turns every table row with class
        /// "even" or "odd" into a <c>Card</c>, serializes the resulting list to
        /// <c>Config.CardsXml</c> and finally downloads one ".jpg" per card into
        /// <c>Config.ImagePath</c>.
        /// </summary>
        /// <param name="notifier">
        /// Optional progress sink; when <c>null</c> no progress is reported.
        /// </param>
        /// <remarks>
        /// NOTE(review): the three progress-message literals appear mis-encoded in this
        /// copy of the source. They are reproduced unchanged because they are runtime text.
        /// </remarks>
        public void GetFromWeb(IGetFromWebNotify notifier)
        {
            Directory.CreateDirectory(Config.ImagePath);

            // One client serves both the page download and the image downloads; the
            // using block guarantees it is disposed even when a download throws.
            using (WebClient webClient = new WebClient())
            {
                if (notifier != null)
                {
                    notifier.Notity(String.Format("���� {0}", Config.Uri), 0.0f);
                }
                webClient.Encoding = Encoding.UTF8;
                String strHtml = webClient.DownloadString(Config.Uri);

                if (notifier != null)
                {
                    notifier.Notity("����html�ĵ�", 0.0f);
                }
                Lexer lexer = new Lexer(strHtml);
                Parser parser = new Parser(lexer);
                AndFilter andFilter = new AndFilter(new NodeClassFilter(typeof(TableRow)), new OrFilter(new HasAttributeFilter("class", "even"), new HasAttributeFilter("class", "odd")));
                NodeList htmlNodes = parser.ExtractAllNodesThatMatch(andFilter);

                // NOTE(review): lock (this) is kept because other members may rely on the
                // same monitor; a private lock object would be preferable if none do.
                lock (this)
                {
                    m_Cards = new List<Card>();
                    foreach (INode node in htmlNodes.ToNodeArray())
                    {
                        // Index of the current non-text child, i.e. the column number.
                        int iFiledIndex = 0;
                        Card card = new Card();
                        foreach (INode subNode in node.Children.ToNodeArray())
                        {
                            // Text nodes between the cells do not count as columns.
                            if (subNode is TextNode)
                            {
                                continue;
                            }

                            switch (iFiledIndex)
                            {
                                case 0:
                                    card.ID = Convert.ToInt32(subNode.FirstChild.GetText());
                                    card.ImagePath = Path.Combine(Config.ImagePath, card.ID.ToString() + ".jpg");
                                    break;
                                case 1:
                                    card.Name = subNode.FirstChild.FirstChild.GetText();
                                    break;
                                case 2:
                                    StringHelper.FillCardLongInfo(subNode.FirstChild.GetText(), card);
                                    break;
                                case 3:
                                    // The mana-cost cell may be empty.
                                    if (subNode.FirstChild != null)
                                    {
                                        card.ManaCost = subNode.FirstChild.GetText();
                                    }
                                    else
                                    {
                                        card.ManaCost = String.Empty;
                                    }
                                    break;
                                case 4:
                                    card.Rare = subNode.FirstChild.GetText();
                                    break;
                            }

                            iFiledIndex++;
                        }
                        m_Cards.Add(card);
                    }
                }

                // FileMode.CreateNew fails if the file already exists; kept as in the original.
                // The using block closes the stream even when serialization throws.
                XmlSerializer s = new XmlSerializer(typeof(List<Card>));
                using (FileStream fstream = new FileStream(Config.CardsXml, FileMode.CreateNew))
                {
                    s.Serialize(fstream, m_Cards);
                }

                foreach (Card card in m_Cards)
                {
                    if (notifier != null)
                    {
                        notifier.Notity(String.Format("��ȡ��Ƭ\"{0}\"��Ϣ", card.Name), 1.0f / m_Cards.Count);
                    }
                    // NOTE(review): Path.Combine is used to build a URI here; kept as in the
                    // original - confirm Config.BaseImageUri ends with a separator.
                    webClient.DownloadFile(Path.Combine(Config.BaseImageUri, card.ID.ToString() + ".jpg"), card.ImagePath);
                }
            }
        }
Ejemplo n.º 24
0
 /// <summary>
 /// Helper: reads <c>max_id</c> from an HTML document.
 /// </summary>
 /// <param name="htmlContent">HTML text to scan.</param>
 /// <returns>
 /// <c>true</c> when at least one node matched and <c>max_id</c> was set from the
 /// "MID" attribute of the last match; otherwise <c>false</c>.
 /// </returns>
 private bool GetMaxIdFromHtml(string htmlContent)
 {
     Parser parser = new Parser(new Lexer(htmlContent));
     NodeList matches = parser.Parse(idFilter[(int)Type]);

     int matchCount = matches.Size();
     if (matchCount < 1)
     {
         return false;
     }

     // The last matching node supplies the id.
     TagNode lastMatch = (TagNode)matches[matchCount - 1];
     max_id = lastMatch.GetAttribute("MID");
     return true;
 }
Ejemplo n.º 25
0
        /// <summary>
        /// Parses the fan list contained in <c>currentHtmlContent</c> and appends one
        /// <c>Fan</c> per node matched by <c>fanFilter</c> to <paramref name="fansList"/>.
        /// </summary>
        /// <remarks>
        /// When a part of a fan's markup does not have the expected shape, a message is
        /// written to the console, the corresponding fields are left unset and the fan
        /// is still added to the list.
        /// </remarks>
        /// <param name="fansList">List that receives the crawled fans.</param>
        public void GetInfoFromHtml(List<Fan> fansList)
        {
            Lexer lexer = new Lexer(currentHtmlContent);
            Parser parser = new Parser(lexer);
            // One node per fan; each is cast to Bullet (an <li> tag) below.
            NodeList fansNodeList = parser.Parse(fanFilter);
            for (int i = 0; i < fansNodeList.Size(); i++)
            {
                Fan fan = new Fan();
                Bullet fanBullet = (Bullet)fansNodeList[i];

                #region Portrait image (its ALT text also supplies the fan's name)
                NodeList fanPortraitNodeList = fanBullet.Children.ExtractAllNodesThatMatch(portraitFilter, true);
                if (fanPortraitNodeList.Size() == 1)
                {
                    Div fanPortraitDiv = (Div)fanPortraitNodeList[0];
                    NodeList imgNodeList = fanPortraitDiv.Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ImageTag)), true);
                    if (imgNodeList.Size() == 1)
                    {
                        ImageTag imgNode = (ImageTag)imgNodeList[0];
                        if (imgNode.Attributes.ContainsKey("SRC") && imgNode.Attributes.ContainsKey("ALT"))
                        {
                            string imgUrl = imgNode.GetAttribute("SRC");
                            string imgName = imgNode.GetAttribute("ALT");
                            fan.Name = imgName;
                            // WebClient is used because downloading a portrait needs no login cookie.
                            WebClient wc = new WebClient();
                            // Subscribe before starting the download so the completion
                            // event cannot fire before the handler is attached.
                            wc.DownloadFileCompleted += wc_DownloadFileCompleted;
                            wc.DownloadFileAsync(new Uri(imgUrl), @"portrait\" + imgName + ".jpg");
                        }
                        else
                        {
                            Console.WriteLine("第" + i + "个粉丝中,<img>标记缺少必要的属性!");
                        }
                    }
                    else
                    {
                        Console.WriteLine("第" + i + "个粉丝中,获取img标记出错!");
                    }
                }
                else
                {
                    Console.WriteLine("第" + i + "个粉丝中,获取粉丝头像的标准出错!");
                }
                #endregion

                #region Follow count / fan count / feed count
                NodeList fanConnectNodeList = fanBullet.Children.ExtractAllNodesThatMatch(fanConnectFilter, true);
                if (fanConnectNodeList.Size() == 1)
                {
                    NodeList ATagList = fanConnectNodeList[0].Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ATag)), true);
                    if (ATagList.Size() == 3)
                    {
                        // The three links are read positionally: follows, fans, feeds.
                        for (int j = 0; j < 3; j++)
                        {
                            ATag aTag = (ATag)ATagList[j];
                            switch (j)
                            {
                                case 0:
                                    if (aTag.Attributes.ContainsKey("HREF") && aTag.GetAttribute("HREF").Contains("follow"))
                                    {
                                        fan.FollowCount = Int32.Parse(aTag.StringText);
                                    }
                                    else
                                    {
                                        Console.WriteLine("第" + i + "个粉丝中,获取粉丝的关注数出错!");
                                    }
                                    break;
                                case 1:
                                    if (aTag.Attributes.ContainsKey("HREF") && aTag.GetAttribute("HREF").Contains("fans"))
                                    {
                                        fan.FansCount = Int32.Parse(aTag.StringText);
                                    }
                                    else
                                    {
                                        Console.WriteLine("第" + i + "个粉丝中,获取粉丝的粉丝数出错!");
                                    }
                                    break;
                                default:
                                    fan.FeedsCount = Int32.Parse(aTag.StringText);
                                    break;
                            }
                        }
                    }
                    else
                    {
                        Console.WriteLine("第" + i + "个粉丝中,获取粉丝关注数/粉丝数/微博数的数量出错!");
                    }
                }
                else
                {
                    Console.WriteLine("第" + i + "个粉丝中,获取粉丝关注数/粉丝数/微博数的标准出错!");
                }
                #endregion

                #region Introduction text
                NodeList fanInfoNodeList = fanBullet.Children.ExtractAllNodesThatMatch(fanInfoFilter, true);
                if (fanInfoNodeList.Size() == 1)
                {
                    Div fanInfoDiv = (Div)fanInfoNodeList[0];
                    string intro = fanInfoDiv.StringText;
                    // StartsWith replaces Substring(0, 2), which threw on text shorter
                    // than two characters.
                    if (intro != null && intro.StartsWith("简介", StringComparison.Ordinal))
                    {
                        // The first three characters (the "简介" label plus one separator)
                        // are dropped; a text with nothing after them yields "".
                        string introBody = intro.Length > 3 ? intro.Substring(3) : "";
                        fan.Introduction = introBody.Replace("\n", " ").Replace("\t", " ");
                    }
                }
                else if (fanInfoNodeList.Size() == 0)
                {
                    fan.Introduction = "";
                }
                else
                {
                    Console.WriteLine("第" + i + "个粉丝中,获取粉丝简介的标准出错!");
                }
                #endregion

                #region UserID, location and gender; cross-check of the user name
                NodeList fanLocationNodeList = fanBullet.Children.ExtractAllNodesThatMatch(fanNameFilter, true);
                if (fanLocationNodeList.Size() == 1)
                {
                    // UserID and profile link come from the first <a> tag.
                    NodeList aTagNodeList = fanLocationNodeList[0].Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ATag)), true);
                    if (aTagNodeList.Size() >= 1)
                    {
                        ATag nameNode = (ATag)aTagNodeList[0];
                        if (nameNode.Attributes.ContainsKey("USERCARD") && nameNode.Attributes.ContainsKey("HREF"))
                        {
                            // USERCARD has the form "id=<user id>".
                            string uidStr = nameNode.GetAttribute("USERCARD");
                            // StartsWith replaces Substring(0, 3), which threw on values
                            // shorter than three characters.
                            if (uidStr != null && uidStr.StartsWith("id=", StringComparison.Ordinal))
                            {
                                fan.UserID = uidStr.Substring(3);
                            }

                            string linkUrl = nameNode.GetAttribute("HREF");
                            fan.LinkURL = "http://www.weibo.com" + linkUrl;
                        }
                        else
                        {
                            Console.WriteLine("第" + i + "个粉丝中,包含用户id和链接的<a>标记中缺少必要的属性!");
                        }
                        // The link text should equal the name taken from the portrait's ALT text.
                        if (!nameNode.StringText.Equals(fan.Name))
                        {
                            Console.WriteLine("第" + i + "个粉丝中,用户名与用户头像文字描述不一致!");
                        }
                    }

                    // Gender and location come from the element with class="addr".
                    NodeList locationNodeList = fanLocationNodeList[0].Children.ExtractAllNodesThatMatch(new HasAttributeFilter("class", "addr"), true);
                    if (locationNodeList.Size() == 1)
                    {
                        string locationStr = "";
                        for (int j = 0; j < locationNodeList[0].Children.Size(); j++)
                        {
                            INode node = locationNodeList[0].Children[j];
                            // Text children make up the location string.
                            if (node.GetType().Equals(typeof(TextNode)))
                            {
                                TextNode tNode = (TextNode)node;
                                locationStr += tNode.ToPlainTextString();
                            }
                            // A plain TagNode child encodes the gender in its CLASS attribute.
                            if (node.GetType().Equals(typeof(TagNode)))
                            {
                                TagNode tNode = (TagNode)node;
                                if (tNode.Attributes.ContainsKey("CLASS"))
                                {
                                    string cssClass = tNode.GetAttribute("CLASS");
                                    // "female" must be tested first because it contains "male".
                                    if (cssClass.Contains("female"))
                                    {
                                        fan.Gender = "female";
                                    }
                                    else if (cssClass.Contains("male"))
                                    {
                                        fan.Gender = "male";
                                    }
                                    else
                                    {
                                        fan.Gender = "unknown";
                                        Console.WriteLine("第" + i + "个粉丝性别不明!");
                                    }
                                }
                            }
                        }
                        fan.Location = locationStr.Trim();
                    }
                    else
                    {
                        Console.WriteLine("第" + i + "个粉丝中,获取粉丝地点的标准出错!");
                    }
                }
                else
                {
                    Console.WriteLine("第" + i + "个粉丝中,获取该粉丝的UserID、地点和性别信息的标准出错!");
                }
                #endregion

                #region How this fan followed the user
                NodeList followMethodNodeList = fanBullet.Children.ExtractAllNodesThatMatch(followMethodFilter, true);
                if (followMethodNodeList.Size() == 1)
                {
                    NodeList methodNodeList = followMethodNodeList[0].Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ATag)));
                    if (methodNodeList.Size() == 1)
                    {
                        ATag methodNode = (ATag)methodNodeList[0];
                        fan.FollowMethod = methodNode.StringText.Trim();
                    }
                    else
                    {
                        Console.WriteLine("第" + i + "个粉丝中,获取该粉丝关注用户的方式的数量出错!");
                    }
                }
                else
                {
                    Console.WriteLine("第" + i + "个粉丝中,获取该粉丝关注用户的方式的标准出错!");
                }
                #endregion

                fansList.Add(fan);
            }
        }
Ejemplo n.º 26
0
 /// <summary>
 /// Parses the POI list contained in <c>currentHtml</c> and appends one <c>POI</c>
 /// per "DD" entry to <c>poiList</c>.
 /// </summary>
 /// <remarks>
 /// When a part of an entry does not have the expected shape, a message is written
 /// to the console, the corresponding fields are left unset and the entry is still
 /// added to <c>poiList</c>.
 /// </remarks>
 /// <param name="currentPage">Page number stored in each POI's <c>Page</c>.</param>
 public void GetInfoFromHtml(int currentPage)
 {
     Lexer lexer = new Lexer(currentHtml);
     Parser parser = new Parser(lexer);
     NodeList poiHeadList = parser.Parse(poiListFilter);
     if (poiHeadList.Count != 1)
     {
         Console.WriteLine("获取POI列表出错");
         return;
     }

     NodeList poiNodeList = poiHeadList[0].Children.ExtractAllNodesThatMatch(poiFilter, false);
     int numCount = 0;
     for (int i = 0; i < poiNodeList.Count; i++)
     {
         DefinitionListBullet poiNode = (DefinitionListBullet)poiNodeList[i];
         // Only "DD" entries describe a POI.
         if (!poiNode.TagName.Equals("DD"))
         {
             continue;
         }

         numCount++;
         POI poi = new POI();
         poi.Page = currentPage;
         poi.Number = numCount;

         #region Taste / environment / service scores and star rank
         NodeList tasteNodeList = poiNode.Children.ExtractAllNodesThatMatch(tasteFilter, true);
         NodeList environmentNodeList = poiNode.Children.ExtractAllNodesThatMatch(environmentFilter, true);
         NodeList serviceNodeList = poiNode.Children.ExtractAllNodesThatMatch(serviceFilter, true);
         if (tasteNodeList.Count == 1 && environmentNodeList.Count == 1 && serviceNodeList.Count == 1)
         {
             // A score of "-" means "no value" and leaves the field unset.
             Span spanNode = (Span)tasteNodeList[0];
             string score = spanNode.ToPlainTextString();
             if (!score.Equals("-"))
             {
                 poi.TasteRemark = Int32.Parse(score);
             }
             spanNode = (Span)environmentNodeList[0];
             score = spanNode.ToPlainTextString();
             if (!score.Equals("-"))
             {
                 poi.EnvironmentRemark = Int32.Parse(score);
             }
             spanNode = (Span)serviceNodeList[0];
             score = spanNode.ToPlainTextString();
             if (!score.Equals("-"))
             {
                 poi.ServiceRemark = Int32.Parse(score);
             }

             #region Star rank
             // spanNode is now the service span; the rank container is looked up two
             // siblings after its parent. Each hop is null-checked so that a missing
             // sibling means "no rank" instead of a NullReferenceException.
             INode scoresParent = spanNode.Parent;
             INode afterScores = scoresParent == null ? null : scoresParent.NextSibling;
             INode rankNodeOfParent = afterScores == null ? null : afterScores.NextSibling;
             if (rankNodeOfParent != null && rankNodeOfParent.Children != null && rankNodeOfParent.Children.Count >= 1)
             {
                 INode rankNodeCandidate = rankNodeOfParent.Children[0];
                 if (rankNodeCandidate.GetType().Equals(typeof(Span)))
                 {
                     Span rankNode = (Span)rankNodeCandidate;
                     string rank = rankNode.GetAttribute("TITLE");
                     // The TITLE text contains the star count as a Chinese numeral.
                     if (rank != null)
                     {
                         if (rank.Contains("五"))
                         {
                             poi.Rank = 5;
                         }
                         else if (rank.Contains("四"))
                         {
                             poi.Rank = 4;
                         }
                         else if (rank.Contains("三"))
                         {
                             poi.Rank = 3;
                         }
                         else if (rank.Contains("二"))
                         {
                             poi.Rank = 2;
                         }
                         else if (rank.Contains("一"))
                         {
                             poi.Rank = 1;
                         }
                     }
                 }
             }
             #endregion
         }
         else
         {
             Console.WriteLine("第" + i + "条POI中,判断口味、环境和服务的标准出错!");
         }
         #endregion

         #region Average cost
         NodeList averageNodeList = poiNode.Children.ExtractAllNodesThatMatch(averageFilter, true);
         if (averageNodeList.Count == 1)
         {
             // The amount is read from the text node two siblings after the match.
             INode averageNode = averageNodeList[0];
             INode afterAverage = averageNode.NextSibling;
             INode costNode = afterAverage == null ? null : afterAverage.NextSibling;
             if (costNode != null && costNode.GetType().Equals(typeof(TextNode)))
             {
                 string cost = ((TextNode)costNode).ToPlainTextString();
                 poi.AverageCost = Int32.Parse(cost);
             }
         }
         else
         {
             Console.WriteLine("第" + i + "条POI中,判断平均消费的标准出错!");
         }
         #endregion

         #region Comment count
         NodeList commentNodeList = poiNode.Children.ExtractAllNodesThatMatch(commentFilter, true);
         if (commentNodeList.Count == 1)
         {
             INode commentNode = commentNodeList[0];
             if (commentNode.GetType().Equals(typeof(ATag)))
             {
                 string commentNum = ((ATag)commentNode).StringText;
                 // EndsWith replaces Substring(Length - 3, 3), which threw on text
                 // shorter than three characters.
                 if (commentNum.EndsWith("封点评", StringComparison.Ordinal))
                 {
                     commentNum = commentNum.Substring(0, commentNum.Length - 3);
                 }
                 poi.CommentCount = Int32.Parse(commentNum);
             }
         }
         else
         {
             Console.WriteLine("第" + i + "条POI中,判断点评数的标准出错!");
         }
         #endregion

         #region Shop name
         NodeList nameNodeList = poiNode.Children.ExtractAllNodesThatMatch(nameFilter, true);
         if (nameNodeList.Count == 1)
         {
             INode nameNode = nameNodeList[0];
             if (nameNode.GetType().Equals(typeof(ATag)))
             {
                 poi.Name = ((ATag)nameNode).StringText;
             }
         }
         else
         {
             Console.WriteLine("第" + i + "条POI中,判断店名的标准出错!");
         }
         #endregion

         #region Address and phone
         NodeList addressNodeList = poiNode.Children.ExtractAllNodesThatMatch(addressFilter, true);
         if (addressNodeList.Count == 1)
         {
             NodeList districtNodeList = addressNodeList[0].Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ATag)));
             if (districtNodeList.Count == 1)
             {
                 ATag districtTag = (ATag)districtNodeList[0];
                 string address = districtTag.ToPlainTextString();
                 INode detailNode = districtTag.NextSibling;
                 if (detailNode != null && detailNode.GetType().Equals(typeof(TextNode)))
                 {
                     // The last 8 characters of the trimmed text are taken as the phone
                     // number; everything before them is the detailed address.
                     const int phoneLength = 8;
                     string detailAddress = ((TextNode)detailNode).ToPlainTextString().Trim();
                     if (detailAddress.Length >= phoneLength)
                     {
                         poi.Phone = detailAddress.Substring(detailAddress.Length - phoneLength, phoneLength);
                         address += detailAddress.Substring(0, detailAddress.Length - phoneLength);
                     }
                     else
                     {
                         // Too short to hold a phone number: keep it all as address text
                         // instead of throwing from Substring.
                         address += detailAddress;
                     }
                 }
                 // Strip every space, newline and tab from the address.
                 char[] removeChrVector = { ' ', '\n', '\t' };
                 foreach (char c in removeChrVector)
                 {
                     address = address.Replace(c.ToString(), "");
                 }
                 poi.Address = address;
             }
             else
             {
                 Console.WriteLine("第" + i + "条POI中,判断含地址的<a>标记的标准出错!");
             }
         }
         else
         {
             Console.WriteLine("第" + i + "条POI中,判断地址的标准出错!");
         }
         #endregion

         #region Tags
         NodeList tagsNodeList = poiNode.Children.ExtractAllNodesThatMatch(tagsFilter, true);
         if (tagsNodeList.Count == 1)
         {
             INode tagsNode = tagsNodeList[0];
             if (tagsNode.Children != null)
             {
                 // Every direct <a> child contributes one tag.
                 for (int j = 0; j < tagsNode.Children.Count; j++)
                 {
                     INode node = tagsNode.Children[j];
                     if (node.GetType().Equals(typeof(ATag)))
                     {
                         poi.Tags.Add(node.ToPlainTextString());
                     }
                 }
             }
         }
         else
         {
             Console.WriteLine("第" + i + "条POI中,判断标签的标准出错!");
         }
         #endregion

         poiList.Add(poi);
     }
 }
Ejemplo n.º 27
0
 /// <summary>
 /// Collects the plain text of every tag in <paramref name="html"/> whose attribute
 /// <paramref name="attribute"/> has the value <paramref name="attValue"/>.
 /// </summary>
 /// <param name="html">HTML text to search.</param>
 /// <param name="tag">
 /// Not used by this method: matching is done on the attribute only. The parameter
 /// is kept so existing callers keep compiling.
 /// </param>
 /// <param name="attribute">Name of the attribute to match.</param>
 /// <param name="attValue">Required value of that attribute.</param>
 /// <returns>The plain text of each matching tag, in document order; empty when nothing matches.</returns>
 public static List<string> getValues(string html, string tag, string attribute, string attValue)
 {
     List<string> values = new List<string>();

     Lexer lexer = new Lexer(html);
     Parser parser = new Parser(lexer);
     NodeFilter nodeFilter = new HasAttributeFilter(attribute, attValue);
     NodeList nodeList = parser.ExtractAllNodesThatMatch(nodeFilter);
     if (nodeList == null)
     {
         return values;
     }

     for (int i = 0; i < nodeList.Count; i++)
     {
         // "as" yields null for a node that is not a tag; skip it instead of
         // dereferencing null.
         ITag tagNode = nodeList[i] as ITag;
         if (tagNode != null && tagNode.Attributes != null && tagNode.Attributes.Count > 0)
         {
             values.Add(tagNode.ToPlainTextString());
         }
     }
     return values;
 }
Ejemplo n.º 28
0
        /// <summary>
        /// Extracts the category links from the first element with
        /// <c>class="fenlei_list"</c> in <paramref name="html"/>.
        /// </summary>
        /// <param name="html">HTML of the page that contains the category list.</param>
        /// <returns>
        /// One <c>ATag</c> per direct child of the list that contains a link matching
        /// <c>../product/index.php?cplm=-NNN-</c> and whose first child is that link tag.
        /// Empty when the list element or matching children are missing; never contains
        /// <c>null</c> entries.
        /// </returns>
        public List<ATag> ParseCatelog(string html)
        {
            List<ATag> atags = new List<ATag>();

            Lexer lexer = new Lexer(html);
            Parser parser = new Parser(lexer);

            NodeFilter nav = new HasAttributeFilter("class", "fenlei_list");
            NodeList navNodes = parser.Parse(nav);

            // No category list on the page (or an empty one): nothing to extract.
            if (navNodes == null || navNodes.Count == 0 || navNodes[0].Children == null)
            {
                return atags;
            }

            // Keep the children of the list that themselves have a matching link child.
            NodeFilter catelog = new LinkRegexFilter(@"^\.\./product/index\.php\?cplm\=\-\d\d\d\-$");
            catelog = new HasChildFilter(catelog);
            NodeList catelogNodes = navNodes[0].Children.ExtractAllNodesThatMatch(catelog);

            if (catelogNodes == null)
            {
                return atags;
            }

            int length = catelogNodes.Count;
            for (int i = 0; i < length; i++)
            {
                NodeList children = catelogNodes[i].Children;
                if (children == null || children.Count == 0)
                {
                    continue;
                }

                // Only the first child is considered, as before; a first child that is
                // not a link tag is skipped rather than added as null.
                ATag a = children[0] as ATag;
                if (a != null)
                {
                    atags.Add(a);
                }
            }

            return atags;
        }
        /// <summary>
        /// Downloads the isotope table from <c>this.url</c>, parses its rows into
        /// <c>IsotopicAtom</c> entries and writes them to <paramref name="fileName"/>.
        /// </summary>
        /// <remarks>
        /// A row whose first cell starts with a digit and whose second cell starts with
        /// a letter begins a new atom; later rows add further peaks to that atom until a
        /// single-cell row ends it. Rows with too few cells for the values they should
        /// carry are skipped.
        /// </remarks>
        /// <param name="fileName">Path of the text file to write.</param>
        /// <returns>A one-element sequence containing <paramref name="fileName"/>.</returns>
        public override IEnumerable<string> Process(string fileName)
        {
            string html;
            // The client is only needed for this one download; dispose it right after.
            using (System.Net.WebClient aWebClient = new System.Net.WebClient())
            {
                aWebClient.Encoding = System.Text.Encoding.Default;
                html = aWebClient.DownloadString(this.url);
            }

            Lexer lexer = new Lexer(html);

            Winista.Text.HtmlParser.Parser parser = new Winista.Text.HtmlParser.Parser(lexer);
            NodeList htmlNodes = parser.Parse(null);

            List<IsotopicAtom> atoms = new List<IsotopicAtom>();

            var node = FindFirstNode(htmlNodes, "tbody");
            INode nextNode;
            // Atom that continuation rows are currently attached to; null when none is open.
            IsotopicAtom atom = null;

            while ((nextNode = FindFirstNode(node, "tr")) != null)
            {
                if (nextNode.Children != null)
                {
                    var tds = nextNode.Children.ExtractAllNodesThatMatch(new NameFilter("td"));
                    // A single-cell row closes the current atom.
                    if (tds.Count == 1)
                    {
                        atom = null;
                    }

                    if (tds.Count >= 3 && tds[0].FirstChild != null)
                    {
                        string t1 = tds[0].FirstChild.GetText().Trim();
                        string t2 = tds[1].FirstChild == null ? string.Empty : tds[1].FirstChild.GetText().Trim();

                        // Empty cells cannot start an atom; testing the length first
                        // avoids indexing into an empty string.
                        bool startsNewAtom = t1.Length > 0 && t2.Length > 0
                                             && Char.IsDigit(t1[0]) && Char.IsLetter(t2[0]);

                        if (startsNewAtom)
                        {
                            // The first row of an atom carries mass and abundance in cells 3 and 4.
                            if (tds.Count >= 5)
                            {
                                atom = new IsotopicAtom();
                                atoms.Add(atom);

                                atom.Name = t2;
                                Peak p = new Peak();
                                p.Mz        = GetDouble(tds[3]);
                                p.Intensity = GetDouble(tds[4]);
                                atom.Isotopics.Add(p);
                            }
                            else
                            {
                                // Malformed header row: do not attach the following rows
                                // to the previous atom.
                                atom = null;
                            }
                        }
                        else if (atom != null)
                        {
                            // Some continuation rows repeat a symbol cell; drop it so
                            // mass and abundance are always at index 1 and 2.
                            if (t1.Length > 0 && Char.IsLetter(t1[0]))
                            {
                                tds.Remove(0);
                            }

                            if (tds.Count >= 3)
                            {
                                Peak p = new Peak();
                                p.Mz        = GetDouble(tds[1]);
                                p.Intensity = GetDouble(tds[2]);
                                atom.Isotopics.Add(p);
                            }
                        }
                    }
                }

                node = nextNode.NextSibling;
                if (node == null)
                {
                    break;
                }
            }

            // Drop zero-abundance peaks, then atoms left without any peak.
            atoms.ForEach(m => m.Isotopics.RemoveAll(n => n.Intensity == 0.0));

            atoms.RemoveAll(m => m.Isotopics.Count == 0);

            var dic = atoms.ToDictionary(m => m.Name);

            // Extra entry "X" with two fixed peaks, placed first in the output.
            var x = new IsotopicAtom();

            x.Name = "X";
            x.Isotopics.Add(new Peak(1, 0.9));
            x.Isotopics.Add(new Peak(2, 0.1));
            atoms.Insert(0, x);

            atoms.Add(AddHevayAtom("(H2)", "H", 1, dic));
            atoms.Add(AddHevayAtom("(C13)", "C", 1, dic));
            atoms.Add(AddHevayAtom("(N15)", "N", 1, dic));
            atoms.Add(AddHevayAtom("(O18)", "O", 2, dic));

            // Output format: "<name>\t<peak count>", one "<mz>\t<intensity>" line per
            // peak, then a blank line.
            using (StreamWriter sw = new StreamWriter(fileName))
            {
                atoms.ForEach(m =>
                {
                    sw.WriteLine("{0}\t{1}", m.Name, m.Isotopics.Count);
                    m.Isotopics.ForEach(n => sw.WriteLine("{0:0.000000}\t{1:0.000000}", n.Mz, n.Intensity));
                    sw.WriteLine();
                });
            }
            return(new string[] { fileName });
        }
Ejemplo n.º 30
0
        /// <summary>
        /// Releases the references held by this instance and marks it as disposed.
        /// </summary>
        /// <param name="Disposing">
        /// <c>true</c> to drop the managed references (parser, node list, current node,
        /// enumerator and node stack); <c>false</c> to only mark the instance as disposed.
        /// </param>
        protected void Dispose(bool Disposing)
        {
            if (!IsDisposed && Disposing)
            {
                // Managed state: the references are cleared, the collections themselves
                // are not emptied.
                m_nodestack = null;
                m_nodeenum = null;
                m_node = null;
                m_nodelist = null;
                m_parser = null;
            }
            // No unmanaged resources are released here.

            IsDisposed = true;
        }
        /// <summary>
        /// Extracts the invoice rows from the page currently loaded in <paramref name="wb"/>
        /// (or from the built-in sample HTML when offline), optionally logs them, and optionally
        /// forwards them to the web service.
        /// </summary>
        /// <param name="wb">Browser control whose document is read. Not accessed when <paramref name="isOffline"/> is true.</param>
        /// <param name="isOffline">When true, <c>s_htmlFake</c> is parsed instead of the live document.</param>
        /// <param name="IsUnConfirmChecked">
        /// Selects the table element to read ("example1" when true, "example" otherwise), is passed on to
        /// <c>parserValidTR</c>, and selects which log line format is written.
        /// </param>
        /// <param name="hasValidData">Set to false whenever no usable rows are found; this method never sets it to true.</param>
        private void CrawlCurrentPage(WebBrowser wb, bool isOffline, bool IsUnConfirmChecked, ref bool hasValidData)
        {
            List <ImportInvoiceDTO>      list         = new List <ImportInvoiceDTO>();
            List <hParser.Tags.TableRow> validRowList = new List <hParser.Tags.TableRow>();

            // HTML fragment handed to HtmlParser below.
            string tbodyHtml;

            if (isOffline)
            {
                // There is no live document in offline mode; dereferencing it used to throw a
                // NullReferenceException. Parse the canned sample page directly instead.
                tbodyHtml = s_htmlFake;
                Debug.WriteLine(tbodyHtml);
            }
            else
            {
                #region Extract the table body through IHTMLDocument2

                mshtml.IHTMLDocument2 doc2 = (mshtml.IHTMLDocument2)wb.Document;
                if (doc2 == null || doc2.body == null)
                {
                    // Nothing has been loaded into the browser yet.
                    hasValidData = false;
                    return;
                }

                Debug.WriteLine(doc2.body.innerHTML);

                mshtml.HTMLTableClass table = (mshtml.HTMLTableClass)doc2.all.item(IsUnConfirmChecked ? "example1" : "example", 0);
                if (table == null)
                {
                    hasValidData = false;
                    return;
                }

                // The rows are read from the table's last child node.
                mshtml.HTMLTableSectionClass tbody = (mshtml.HTMLTableSectionClass)table.lastChild;
                if (tbody == null)
                {
                    hasValidData = false;
                    return;
                }

                tbodyHtml = tbody.innerHTML;

                // The page renders this literal text when the query produced no records.
                if (0 == string.Compare(tbody.innerText, "没找到记录", StringComparison.InvariantCultureIgnoreCase))
                {
                    hasValidData = false;
                    return;
                }

                #region Notes on driving DOM elements through the WPF WebBrowser

#if RESEARCH
                //HTMLDocument doc01 = wb.Document as HTMLDocument;
                ////IHTMLDocument2 doc02 = wb.Document as IHTMLDocument2;
                //Debug.WriteLine(doc01.body.innerHTML);


                /// Read / write elements
                ///
                mshtml.IHTMLElement login_pass = (mshtml.IHTMLElement)doc2.all.item("login_pass", 0);
                mshtml.IHTMLElement password   = (mshtml.IHTMLElement)doc2.all.item("password", 0);
                password.setAttribute("value", "12345678");
                login_pass.setAttribute("style", "");

                mshtml.IHTMLElement login_pass1 = (mshtml.IHTMLElement)doc2.all.item("login_pass1", 0);
                mshtml.IHTMLElement password1   = (mshtml.IHTMLElement)doc2.all.item("password1", 0);
                login_pass1.setAttribute("style", "display:none;");
                //password1.setAttribute("style", "width:1px");

                //IHTMLElement item = doc01.getElementById("ptmm");
                //item.innerHTML = "<INPUT id=\"pwd\" class=\"login_input password\" type=\"text\" value=\"\" />";

                ////  doc01.body.insertAdjacentHTML(,);
                //MessageBox.Show(item.innerText);

                //wb.NavigateToString(doc01.body.innerHTML);

                /// Trigger event
                // Click the confirm button
                loginBT.click();


                /// script injection
                ///
                //Basic ds = new Basic();
                //wb.ObjectForScripting = ds;// this object is reachable from script code in the page shown by the WebBrowser control

                /// Leverage JS
                ///
                mshtml.IHTMLWindow2 win = (mshtml.IHTMLWindow2)doc2.parentWindow;
                win.execScript("Login('12345678', '', 1)", "javascript");
                return;
#endif

                #endregion

                #endregion
            }

            #region Extract the table rows with HtmlParser

            Lexer              lexer    = new Lexer(tbodyHtml);
            hParser.Parser     parser   = new hParser.Parser(lexer);
            hParser.NodeFilter filter   = new NodeClassFilter(typeof(Winista.Text.HtmlParser.Tags.TableRow));
            NodeList           nodeList = parser.Parse(filter);
            if (nodeList.Count == 0)
            {
                hasValidData = false;
                MessageBox.Show("没有符合要求的节点");
            }
            else
            {
                for (int i = 0; i < nodeList.Count; i++)
                {
                    // Parse one row; rows for which parserTR returns null are dropped.
                    var tagTR = parserTR(nodeList[i]);
                    if (tagTR != null)
                    {
                        validRowList.Add(tagTR);
                    }
                }

                parserValidTR(validRowList, IsUnConfirmChecked, ref list);
            }

            #endregion

            #region Log, export and persist

            if (list == null || list.Count == 0)
            {
                MessageBox.Show("该页面上没有检测到预期数据");
                hasValidData = false;
                // NOTE(review): execution deliberately continues, so an empty list is still
                // logged and passed to CallWS below -- confirm this is intended.
            }

            ImportInvoiceListDTO soap = new ImportInvoiceListDTO
            {
                List   = list,
                Result = new ImportInvoiceResultDTO
                {
                    Message = "CALLBACK",
                    Status  = 9
                }
            };

            Debug.Write(soap);

            #region Log
            if (this.IfLog == "1")
            {
                soap.List.ForEach(impinfo =>
                {
                    if (IsUnConfirmChecked)
                    {
                        LogHelper.WriteLog(typeof(WebBoxView), string.Format("发票代码{0} 发票号码{1} 开票日期{2} 销方税号{3} 金额{4} 税额{5} 来源{6} 发票状态{7} 勾选标志{8} 操作时间{9}", impinfo.InvoiceCode, impinfo.InvoiceNumber, impinfo.CreateDate, impinfo.SalesTaxNumber, impinfo.Amount, impinfo.Tax, impinfo.From, impinfo.Status, impinfo.SelectTag, impinfo.ChosenTime));
                    }
                    else
                    {
                        LogHelper.WriteLog(typeof(WebBoxView), string.Format("发票代码{0} 发票号码{1} 开票日期{2} 销方税号{3} 金额{4} 税额{5} 来源{6} 发票状态{7} 确认月份{8}", impinfo.InvoiceCode, impinfo.InvoiceNumber, impinfo.CreateDate, impinfo.SalesTaxNumber, impinfo.Amount, impinfo.Tax, impinfo.From, impinfo.Status, impinfo.SelectTag));
                    }
                });
            }
            #endregion

            if (this.IfCallWS == "1")
            {
                CallWS(soap);
            }
            Debug.Write("本页已同步完成,请点击下一页继续同步");
            //FakeBusy();

            #endregion
        }
Ejemplo n.º 32
0
        /// <summary>
        /// Downloads the in-play page and returns one <see cref="OddsLiveMatch"/> per
        /// "menuRow" div found inside the "PageBody" container.
        /// </summary>
        /// <returns>
        /// The matches found. The list is empty when the page cannot be fetched, does not have the
        /// expected layout, or its third div reads "全場賽果". Failures are traced, never thrown.
        /// </returns>
        public List<OddsLiveMatch> GetScrollMatchList()
        {
            List<OddsLiveMatch> liveMatchList = new List<OddsLiveMatch>();
            try
            {
                HttpHelper h = new HttpHelper();
                Cookie lng = new Cookie("lng", "2");
                lng.Domain = domain;
                h.CookieContainer.Add(lng);
                string zoudi = h.GetHtml("https://" + domain + "/default.aspx" + zoudiUrl);
                if (string.IsNullOrEmpty(zoudi))
                {
                    return liveMatchList;
                }

                #region Walk the page: HTML > BODY > FORM > first DIV
                Lexer lexer = new Lexer(zoudi);
                Parser parser = new Parser(lexer);
                NodeList bodyNodes = parser.Parse(new TagNameFilter("HTML"))[0].Children.ExtractAllNodesThatMatch(new TagNameFilter("BODY"))[0].Children;
                ITag divNode = bodyNodes.ExtractAllNodesThatMatch(new TagNameFilter("FORM"))[0].Children.ExtractAllNodesThatMatch(new TagNameFilter("DIV"))[0] as ITag;

                // Comparing from the literal side keeps a missing ID attribute from throwing.
                if (divNode == null || divNode.Children == null || !"PageBody".Equals(divNode.Attributes["ID"]))
                {
                    return liveMatchList;
                }

                NodeList dataDivList = divNode.Children.SearchFor(typeof(Winista.Text.HtmlParser.Tags.Div));

                // The layout check reads entries 0 and 2, so fewer than three divs cannot match.
                // (Previously this case only produced an empty result via a swallowed exception.)
                if (dataDivList.Count < 3
                    || dataDivList[0].ToPlainTextString() != "走地盤"
                    || dataDivList[2].ToPlainTextString() == "全場賽果")
                {
                    return liveMatchList;
                }

                for (int i = 0; i < dataDivList.Count; i++)
                {
                    ITag div = dataDivList[i] as ITag;
                    if (div == null || !"menuRow".Equals(div.Attributes["CLASS"]))
                    {
                        continue;
                    }

                    // The row's first child is expected to be the link carrying the query string.
                    ITag link = div.FirstChild as ITag;
                    object href = link == null ? null : link.Attributes["HREF"];
                    if (href == null)
                    {
                        continue; // malformed row: skip it instead of aborting the whole scan
                    }

                    string urlparams = href.ToString();
                    string firstParam = urlparams.Split('&')[0];
                    if (firstParam.Length < 4)
                    {
                        continue; // too short to carry the 4-character prefix that is stripped below
                    }

                    OddsLiveMatch oddsLive = new OddsLiveMatch();
                    oddsLive.urlparams = urlparams;
                    oddsLive.id = firstParam.Substring(4);
                    oddsLive.time = DateTime.Now;
                    oddsLive.name = div.ToPlainTextString();
                    liveMatchList.Add(oddsLive);
                }
                #endregion
            }
            catch (Exception ex)
            {
                // Best effort: callers still get whatever was collected, but the failure is
                // no longer silent.
                System.Diagnostics.Trace.TraceError("GetScrollMatchList failed: " + ex);
            }
            return liveMatchList;
        }
Ejemplo n.º 33
0
 /// <summary>
 /// Helper: reads the end_id out of an HTML document.
 /// </summary>
 /// <param name="htmlContent">HTML text to parse.</param>
 /// <returns>
 /// true when the first node matched by the filter for the current Type is exactly a Div
 /// carrying a "MID" attribute (its value is then stored in end_id); false otherwise.
 /// </returns>
 private bool GetEndIdFromHtml(string htmlContent)
 {
     Parser htmlParser = new Parser(new Lexer(htmlContent));
     NodeList matches = htmlParser.Parse(idFilter[(int)Type]);

     // Only the first match counts, and it must be a plain Div (exact type, not a subclass).
     if (matches.Size() < 1 || !matches[0].GetType().Equals(typeof(Div)))
     {
         return false;
     }

     TagNode firstTag = (TagNode)matches[0];
     if (!firstTag.Attributes.ContainsKey("MID"))
     {
         return false;
     }

     end_id = firstTag.GetAttribute("MID");
     return true;
 }
Ejemplo n.º 34
0
        /// <summary>
        /// Fetches the reference page for <paramref name="paper_id"/> and returns the plain text of
        /// every tag directly inside the div whose id is "export_container".
        /// </summary>
        /// <param name="paper_id">Passed unchanged to <c>_HttpUtil.getPaperReferenceHTML</c>.</param>
        /// <returns>
        /// One string per child tag, with line breaks replaced by spaces and everything up to and
        /// including the first ']' removed; or null when the page is empty or contains no
        /// "export_container" div. An "export_container" without children yields an empty list.
        /// </returns>
        protected ArrayList getPaperReferenceByID(ArrayList paper_id)
        {
            string html_page = _HttpUtil.getPaperReferenceHTML(paper_id);

            if (string.IsNullOrEmpty(html_page))
            {
                return null;
            }

            Parser p = new Parser(new Lexer(html_page));

            TagNameFilter tag_f = new TagNameFilter("div");
            HasAttributeFilter attr_f = new HasAttributeFilter("id", "export_container");

            AndFilter af = new AndFilter(tag_f, attr_f);

            NodeList childs = p.ExtractAllNodesThatMatch(af);

            if (childs == null || childs.Count <= 0)
            {
                return null;
            }

            NodeList ref_childs = childs[0].Children;
            ArrayList ref_list = new ArrayList();

            // Children is null for a tag that has no child nodes; treat that as "no references"
            // rather than failing with a NullReferenceException.
            int ref_count = ref_childs == null ? 0 : ref_childs.Count;

            for (int i = 0; i < ref_count; ++i)
            {
                ITag tag = ref_childs[i] as ITag;
                if (tag == null)
                {
                    continue; // text and remark nodes between the entries are ignored
                }

                string str = tag.ToPlainTextString();

                str = str.Replace('\r', ' ').Replace('\n', ' ');

                // Drop the leading "[n]" marker. When there is no ']', IndexOf yields -1 and the
                // whole string is kept.
                str = str.Substring(str.IndexOf(']') + 1);

                ref_list.Add(str);
            }

            if (_Progressable != null)
            {
                _Progressable.onFinish(ref_list);
            }

            return ref_list;
        }
        /// <summary>
        /// Parses the table rows out of the page HTML, converts the valid ones into invoice DTOs
        /// and passes them to <c>CallWS</c>.
        /// </summary>
        /// <remarks>
        /// Formerly the BtnDownload_Click handler. The browser document is not wired up in this
        /// method (the assignment from <c>webBox.Document</c> is commented out), so only offline
        /// mode, which parses <c>s_htmlFake</c>, can actually run.
        /// </remarks>
        /// <exception cref="InvalidOperationException">
        /// Thrown when <c>IsOffline</c> is not "1", because no browser document is available to read.
        /// </exception>
        public void StartCrawl()//  private void BtnDownload_Click(object sender, RoutedEventArgs e)
        {
            List <ImportInvoiceDTO>      list         = new List <ImportInvoiceDTO>();
            List <hParser.Tags.TableRow> validRowList = new List <hParser.Tags.TableRow>();

            this.parseResult = "";

            // The value is not used afterwards; constructing it is kept because it throws for a
            // null or malformed TargetUri before any work is done.
            Uri uri = new Uri(this.TargetUri);

            #region Obtain the page HTML

            // Downloading through HttpHelper was dropped because the program would then have to
            // handle authentication itself; with the WebBrowser the user logs in manually.
            bool offline = string.Compare(this.IsOffline, "1", StringComparison.InvariantCultureIgnoreCase) == 0;

            mshtml.IHTMLDocument2 doc2 = null;//(mshtml.IHTMLDocument2)webBox.Document;
            if (!offline && (doc2 == null || doc2.body == null))
            {
                // Fail with a clear message instead of a NullReferenceException on doc2.body.
                throw new InvalidOperationException("StartCrawl has no browser document to read; only offline mode (IsOffline == \"1\") is available.");
            }

            string html = offline ? s_htmlFake : doc2.body.innerHTML;
            Debug.WriteLine(html);

            #endregion

            #region Extract the table rows with HtmlParser

            Lexer              lexer    = new Lexer(html);
            hParser.Parser     parser   = new hParser.Parser(lexer);
            hParser.NodeFilter filter   = new NodeClassFilter(typeof(Winista.Text.HtmlParser.Tags.TableRow));
            NodeList           nodeList = parser.Parse(filter);
            if (nodeList.Count == 0)
            {
                MessageBox.Show("没有符合要求的节点");
            }
            else
            {
                for (int i = 0; i < nodeList.Count; i++)
                {
                    // Parse one row; rows for which parserTR returns null are dropped.
                    var tagTR = parserTR(nodeList[i]);
                    if (tagTR != null)
                    {
                        validRowList.Add(tagTR);
                    }
                }

                parserValidTR(validRowList, ref list);
#if DEBUG
                // MessageBox.Show(parseResult);
#endif
            }

            #endregion

            #region Synchronise

            if (list == null || list.Count == 0)
            {
                MessageBox.Show("该页面上没有检测到预期数据");
                return;
            }

            ImportInvoiceListDTO soap = new ImportInvoiceListDTO
            {
                List   = list,
                Result = new ImportInvoiceResultDTO
                {
                    Message = "CALLBACK",
                    Status  = 9
                }
            };

            CallWS(soap);
            MessageBox.Show("本页已同步完成,请点击下一页继续同步");
            //FakeBusy();

            #endregion
        }