Esempio n. 1
0
 /// <summary>
 /// Scans every tag named <paramref name="tag"/> in <paramref name="html"/> and returns the value of
 /// attribute <paramref name="attributeV"/> from the first tag that carries an attribute whose name
 /// contains <paramref name="attribute"/> and whose value equals <paramref name="attValue"/>.
 /// </summary>
 /// <param name="html">HTML text to parse.</param>
 /// <param name="tag">Tag name handed to <c>TagNameFilter</c>.</param>
 /// <param name="attribute">Substring that the attribute key must contain.</param>
 /// <param name="attValue">Exact value the matching attribute must have.</param>
 /// <param name="attributeV">Key of the attribute whose value is returned.</param>
 /// <returns>The requested attribute value, or <c>null</c> when no tag qualifies.</returns>
 public static string getAttValue(string html, string tag, string attribute, string attValue,string attributeV)
 {
     Lexer lexer = new Lexer(html);
     Parser parser = new Parser(lexer);
     NodeFilter nodeFilter = new TagNameFilter(tag);
     NodeList nodeList = parser.ExtractAllNodesThatMatch(nodeFilter);
     for (int i = 0; i < nodeList.Count; i++)
     {
         // "as" yields null for non-tag nodes; skip them instead of dereferencing null.
         ITag tagNode = nodeList[i] as ITag;
         if (tagNode == null || tagNode.Attributes == null || tagNode.Attributes.Count == 0)
         {
             continue;
         }

         foreach (string key in tagNode.Attributes.Keys)
         {
             // The attribute table also holds a pseudo-entry for the tag name itself; ignore it.
             if (key.Contains("<TAGNAME>"))
             {
                 continue;
             }
             if (!key.Contains(attribute))
             {
                 continue;
             }

             object current = tagNode.Attributes[key];
             if (current == null || current.ToString() != attValue)
             {
                 continue;
             }

             // The tag matches; only return when it actually has the requested attribute,
             // otherwise keep looking rather than throwing NullReferenceException.
             object wanted = tagNode.Attributes[attributeV];
             if (wanted != null)
             {
                 return wanted.ToString();
             }
         }
     }
     return null;
 }
Esempio n. 2
0
        /// <summary>
        /// Extracts product entries from an HTML page. Every node with <c>class="product"</c> is
        /// inspected for a title (<c>class="product-title"</c>), a price (<c>class="product-price"</c>),
        /// an <c>img</c> tag and an optional promo node (<c>class="promo"</c>).
        /// </summary>
        /// <param name="html">HTML text to parse.</param>
        /// <returns>
        /// The products that could be read completely. Entries lacking a title, price or image,
        /// or whose price text cannot be parsed, are skipped.
        /// </returns>
        public static List<Product> LoadGoods(string html)
        {
            // Number of leading characters dropped from the price text before parsing.
            // NOTE(review): presumably a fixed label/currency prefix on the page - confirm against the markup.
            const int pricePrefixLength = 7;

            Lexer lexer = new Lexer(html);
            Parser parser = new Parser(lexer);
            NodeFilter filter = new HasAttributeFilter("class", "product");
            NodeList products = parser.ExtractAllNodesThatMatch(filter);

            List<Product> result = new List<Product>();
            for (int i = 0; i < products.Count; i++)
            {
                try
                {
                    ITag product = products[i] as ITag;
                    if (product == null || product.Children == null)
                    {
                        continue;
                    }

                    // Mandatory parts: title, price and image.
                    ITag name = FirstMatchingTag(product.Children, new HasAttributeFilter("class", "product-title"));
                    ITag price = FirstMatchingTag(product.Children, new HasAttributeFilter("class", "product-price"));
                    ITag img = FirstMatchingTag(product.Children, new TagNameFilter("img"));
                    if (name == null || price == null || img == null)
                    {
                        continue;
                    }

                    // Price: strip the prefix, then parse with the same (current-culture) rules as Decimal.Parse.
                    string priceText = price.ToPlainTextString().Trim();
                    decimal parsedPrice;
                    if (priceText.Length <= pricePrefixLength
                        || !Decimal.TryParse(priceText.Substring(pricePrefixLength), out parsedPrice))
                    {
                        continue;
                    }

                    // Promo is optional and defaults to an empty string.
                    ITag promo = FirstMatchingTag(product.Children, new HasAttributeFilter("class", "promo"));

                    Product p = new Product();
                    p.img = img.GetAttribute("DATA-KS-LAZYLOAD");
                    p.name = name.ToPlainTextString().Trim();
                    p.price = parsedPrice;
                    p.promo = promo != null ? promo.GetAttribute("data-promo") : "";
                    result.Add(p);
                }
                catch (Exception ex)
                {
                    // Parsing stays best-effort (one broken entry must not abort the page),
                    // but the failure is traced instead of being swallowed silently.
                    System.Diagnostics.Debug.WriteLine("LoadGoods: skipped product " + i + ": " + ex.Message);
                }
            }

            return result;
        }

        /// <summary>
        /// Returns the first node under <paramref name="nodes"/> (searched recursively) that matches
        /// <paramref name="filter"/> and is a tag, or <c>null</c> when there is none.
        /// </summary>
        private static ITag FirstMatchingTag(NodeList nodes, NodeFilter filter)
        {
            NodeList matches = nodes.ExtractAllNodesThatMatch(filter, true);
            return matches.Count > 0 ? matches[0] as ITag : null;
        }
Esempio n. 3
0
        /// <summary>
        /// Downloads the job detail page at <paramref name="url"/> and builds a <c>Job</c> from the
        /// node(s) with <c>class="d_left"</c>: description text, publish time, first e-mail address and
        /// first phone number found in that text.
        /// </summary>
        /// <param name="url">Address of the job detail page.</param>
        /// <returns>
        /// The populated job. When the page has no <c>d_left</c> node an untouched <c>new Job()</c> is
        /// returned. The title is left empty here.
        /// </returns>
        public Job GetJobInfoParser(string url)
        {
            Job jobinfo = new Job();

            Parser parser = new Parser(new HttpProtocol(new Uri(url)));
            NodeFilter detail = new HasAttributeFilter("class", "d_left");
            NodeList nodeDetail = parser.ExtractAllNodesThatMatch(detail);
            if (nodeDetail == null || nodeDetail.Count == 0)
            {
                return jobinfo;
            }

            string description = GetDetailString(nodeDetail);

            // Publish time: default to "now" and only replace it when the page value really parses.
            // DateTime.TryParse overwrites its out argument with DateTime.MinValue on failure, so the
            // result goes into a scratch variable first.
            DateTime createdOn = DateTime.Now;
            Match m = Regex.Match(description, @"发布时间:(?<date>\d\d\d\d-\d{1,2}\-\d{1,2} \d{1,2}\:\d{1,2})");
            DateTime published;
            if (m.Success && m.Groups["date"].Success && DateTime.TryParse(m.Groups["date"].Value, out published))
            {
                createdOn = published;
            }

            string email = string.Empty;
            Match emailMatch = Regex.Match(description, @"([a-zA-Z0-9_\-\.]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?)");
            if (emailMatch.Success)
            {
                email = emailMatch.Value;
            }

            // 11-digit number starting with 13, 15 or 18. The previous pattern wrote the character
            // classes as [3|5|8], which also accepted a literal '|' as the second character.
            Match telMatch = Regex.Match(description, @"1[358][0-9]\d{8}");
            if (telMatch.Success)
            {
                jobinfo.tel = telMatch.Value;
            }

            jobinfo.category_id = Catalog.id;
            jobinfo.title = string.Empty;
            jobinfo.description = description;
            jobinfo.created_on = createdOn;
            jobinfo.is_active = true;
            jobinfo.city_id = Catalog.city.id;
            jobinfo.sp1010url = url;
            jobinfo.poster_email = email;

            return jobinfo;
        }
        /// <summary>
        /// Reads the HTML file at <paramref name="xmlFile"/> and builds one <c>PlayTime</c> per node
        /// with <c>CLASS="px14"</c>.
        /// </summary>
        /// <param name="xmlFile">
        /// Path of the file to read. The first run of four digits in this path string is used as the
        /// cinema id.
        /// </param>
        /// <returns>
        /// The play times found, or <c>null</c> when the page contains no <c>px14</c> node. Entries
        /// whose markup lacks the expected time node, or whose time text does not parse, are skipped.
        /// </returns>
        /// <exception cref="FormatException">
        /// The path contains no four-digit cinema id although play-time nodes were found.
        /// </exception>
        public List<PlayTime> getPlayTimes(string xmlFile)
        {
            Match match = Regex.Match(xmlFile, @"\d\d\d\d");
            string html = File.ReadAllText(xmlFile);

            Lexer lexer = new Lexer(html);
            Parser playParser = new Parser(lexer);
            NodeFilter playFilter = new HasAttributeFilter("CLASS", "px14");
            NodeList playTimeList = playParser.ExtractAllNodesThatMatch(playFilter);
            if (playTimeList.Count < 1)
            {
                return null;
            }

            // Parse the cinema id once instead of once per entry.
            if (!match.Success)
            {
                throw new FormatException("No four-digit cinema id found in path: " + xmlFile);
            }
            int cinemaId = int.Parse(match.Value);

            // Characters trimmed from both ends of the time text before parsing.
            char[] label = { '上', '映' };
            List<PlayTime> playTimes = new List<PlayTime>();
            for (int i = 0; i < playTimeList.Count; i++)
            {
                ITag playTag = playTimeList[i] as ITag;
                if (playTag == null)
                {
                    continue;
                }

                // The time text lives two siblings after the matched node.
                INode timeNode = playTag.NextSibling != null ? playTag.NextSibling.NextSibling : null;
                if (timeNode == null)
                {
                    continue;
                }
                DateTime startsAt;
                if (!DateTime.TryParse(timeNode.ToPlainTextString().Trim(label), out startsAt))
                {
                    continue;
                }

                PlayTime playTime = new PlayTime();

                // Movie id: prefer a six-digit run in the link, fall back to a five-digit one.
                // MovieID keeps its default value when the link is missing or has no such run.
                ITag idTag = playTag.FirstChild as ITag;
                object href = (idTag != null && idTag.Attributes != null) ? idTag.Attributes["HREF"] : null;
                if (href != null)
                {
                    string strID = href.ToString();
                    Match idMatch = Regex.Match(strID, @"\d\d\d\d\d\d");
                    if (!idMatch.Success)
                    {
                        idMatch = Regex.Match(strID, @"\d\d\d\d\d");
                    }
                    if (idMatch.Success)
                    {
                        playTime.MovieID = int.Parse(idMatch.Value);
                    }
                }

                playTime.Playtime = startsAt;
                playTime.CinemaID = cinemaId;
                playTime.PlayState = true;
                playTimes.Add(playTime);
            }
            return playTimes;
        }
 /// <summary>
 /// Collects five descriptive texts from a cinema page. For every node with <c>CLASS="c_000"</c>
 /// the CLASS of its previous sibling decides which output receives the plain text of the node's
 /// parent tag.
 /// </summary>
 /// <param name="html">HTML text to parse.</param>
 /// <param name="dining">Text for sibling class <c>ico_cside1 mr12</c>; empty when absent.</param>
 /// <param name="park">Text for sibling class <c>ico_cside2 mr12</c>; empty when absent.</param>
 /// <param name="gameCenter">Text for sibling class <c>ico_cside3 mr12</c>; empty when absent.</param>
 /// <param name="intro3D">Text for sibling class <c>ico_cside5 mr12</c>; empty when absent.</param>
 /// <param name="introVIP">Text for sibling class <c>ico_cside7 mr12</c>; empty when absent.</param>
 /// <returns>Always <c>true</c>.</returns>
 public bool getCinemaFive(string html, out string dining, out string park, out string gameCenter, out string intro3D, out string introVIP)
 {
     dining = string.Empty;
     park = string.Empty;
     gameCenter = string.Empty;
     intro3D = string.Empty;
     introVIP = string.Empty;

     Lexer lexer = new Lexer(html);
     Parser parser = new Parser(lexer);
     NodeFilter nodeFilter = new HasAttributeFilter("CLASS", "c_000");
     NodeList nodeList = parser.ExtractAllNodesThatMatch(nodeFilter);
     for (int i = 0; i < nodeList.Count; i++)
     {
         INode node = nodeList[i];
         ITag tagPar = node.Parent as ITag;
         ITag tagSib = node.PreviousSibling as ITag;

         // The previous sibling may be missing or a non-tag node (the "as" cast then yields null);
         // such entries cannot be classified, so skip them instead of dereferencing null.
         if (tagPar == null || tagSib == null || tagSib.Attributes == null)
         {
             continue;
         }
         object iconClass = tagSib.Attributes["CLASS"];
         if (iconClass == null)
         {
             continue;
         }

         switch (iconClass.ToString())
         {
             case "ico_cside1 mr12":
                 dining = tagPar.ToPlainTextString();
                 break;
             case "ico_cside2 mr12":
                 park = tagPar.ToPlainTextString();
                 break;
             case "ico_cside3 mr12":
                 gameCenter = tagPar.ToPlainTextString();
                 break;
             case "ico_cside5 mr12":
                 intro3D = tagPar.ToPlainTextString();
                 break;
             case "ico_cside7 mr12":
                 introVIP = tagPar.ToPlainTextString();
                 break;
         }
     }
     return true;
 }
Esempio n. 6
0
 /// <summary>
 /// Finds the single node in <paramref name="html"/> whose attribute <paramref name="attribute"/>
 /// equals <paramref name="attValue"/> and returns the value of its attribute
 /// <paramref name="attributeV"/>.
 /// </summary>
 /// <param name="html">HTML text to parse.</param>
 /// <param name="tag">Not used by this implementation; kept for signature compatibility.</param>
 /// <param name="attribute">Attribute name handed to <c>HasAttributeFilter</c>.</param>
 /// <param name="attValue">Attribute value handed to <c>HasAttributeFilter</c>.</param>
 /// <param name="attributeV">Key of the attribute whose value is returned.</param>
 /// <returns>
 /// The attribute value, or <c>null</c> when the filter does not match exactly one node, the node
 /// is not a tag, or the tag lacks <paramref name="attributeV"/>.
 /// </returns>
 public static string getAttValue(string html, string tag, string attribute, string attValue, string attributeV)
 {
     Lexer lexer = new Lexer(html);
     Parser parser = new Parser(lexer);
     NodeFilter nodeFilter = new HasAttributeFilter(attribute, attValue);
     NodeList nodeList = parser.ExtractAllNodesThatMatch(nodeFilter);

     // Only an unambiguous (exactly one) match is accepted.
     if (nodeList.Count != 1)
     {
         return null;
     }

     ITag tagNode = nodeList[0] as ITag;
     if (tagNode == null || tagNode.Attributes == null)
     {
         return null;
     }

     // A missing attribute yields null instead of a NullReferenceException.
     object wanted = tagNode.Attributes[attributeV];
     return wanted == null ? null : wanted.ToString();
 }
Esempio n. 7
0
 /// <summary>
 /// Returns the plain text of every tag in <paramref name="html"/> whose attribute
 /// <paramref name="attribute"/> equals <paramref name="attValue"/>.
 /// </summary>
 /// <param name="html">HTML text to parse.</param>
 /// <param name="tag">Not used by this implementation; kept for signature compatibility.</param>
 /// <param name="attribute">Attribute name handed to <c>HasAttributeFilter</c>.</param>
 /// <param name="attValue">Attribute value handed to <c>HasAttributeFilter</c>.</param>
 /// <returns>The texts in document order; empty when nothing matches.</returns>
 public static List<string> getValues(string html, string tag, string attribute, string attValue)
 {
     Lexer lexer = new Lexer(html);
     Parser parser = new Parser(lexer);
     List<string> values = new List<string>();
     NodeFilter nodeFilter = new HasAttributeFilter(attribute, attValue);
     NodeList nodeList = parser.ExtractAllNodesThatMatch(nodeFilter);
     for (int i = 0; i < nodeList.Count; i++)
     {
         // "as" yields null for non-tag nodes; skip them rather than dereferencing null.
         ITag tagNode = nodeList[i] as ITag;
         if (tagNode == null)
         {
             continue;
         }
         if (tagNode.Attributes != null && tagNode.Attributes.Count > 0)
         {
             values.Add(tagNode.ToPlainTextString());
         }
     }
     return values;
 }
Esempio n. 8
0
        /// <summary>
        /// Downloads the card list page at <c>Config.Uri</c>, rebuilds <c>m_Cards</c> from its table rows
        /// (rows of class "even" or "odd"), serializes the list to <c>Config.CardsXml</c> and then
        /// downloads one image per card into <c>Config.ImagePath</c>.
        /// </summary>
        /// <param name="notifier">Optional progress sink; may be <c>null</c>.</param>
        /// <remarks>
        /// NOTE(review): the progress message literals look mis-encoded in this file; they are kept
        /// byte-for-byte because they are runtime strings.
        /// NOTE(review): the XML file is opened with <c>FileMode.CreateNew</c>, which throws when the
        /// file already exists - confirm that this is intended.
        /// </remarks>
        public void GetFromWeb(IGetFromWebNotify notifier)
        {
            Directory.CreateDirectory(Config.ImagePath);

            // WebClient is disposable; the using block releases it even when a download throws.
            using (WebClient webClient = new WebClient())
            {
                if (notifier != null)
                {
                    notifier.Notity(String.Format("���� {0}", Config.Uri), 0.0f);
                }
                webClient.Encoding = Encoding.UTF8;
                String strHtml = webClient.DownloadString(Config.Uri);

                if (notifier != null)
                {
                    notifier.Notity("����html�ĵ�", 0.0f);
                }
                Lexer lexer = new Lexer(strHtml);
                Parser parser = new Parser(lexer);
                AndFilter andFilter = new AndFilter(new NodeClassFilter(typeof(TableRow)), new OrFilter(new HasAttributeFilter("class", "even"), new HasAttributeFilter("class", "odd")));
                NodeList htmlNodes = parser.ExtractAllNodesThatMatch(andFilter);
                lock (this)
                {
                    m_Cards = new List<Card>();
                    foreach (INode node in htmlNodes.ToNodeArray())
                    {
                        // Index of the current non-text child; each index maps to one card field.
                        int iFiledIndex = 0;
                        Card card = new Card();
                        foreach (INode subNode in node.Children.ToNodeArray())
                        {
                            if (subNode is TextNode)
                            {
                                continue;
                            }

                            switch (iFiledIndex)
                            {
                                case 0:
                                    card.ID = Convert.ToInt32(subNode.FirstChild.GetText());
                                    card.ImagePath = Path.Combine(Config.ImagePath, card.ID.ToString() + ".jpg");
                                    break;
                                case 1:
                                    card.Name = subNode.FirstChild.FirstChild.GetText();
                                    break;
                                case 2:
                                    StringHelper.FillCardLongInfo(subNode.FirstChild.GetText(), card);
                                    break;
                                case 3:
                                    // This cell may be empty.
                                    if (subNode.FirstChild != null)
                                    {
                                        card.ManaCost = subNode.FirstChild.GetText();
                                    }
                                    else
                                    {
                                        card.ManaCost = String.Empty;
                                    }
                                    break;
                                case 4:
                                    card.Rare = subNode.FirstChild.GetText();
                                    break;
                            }

                            iFiledIndex++;
                        }
                        m_Cards.Add(card);
                    }
                }

                // The using block closes the stream even when Serialize throws.
                XmlSerializer s = new XmlSerializer(typeof(List<Card>));
                using (FileStream fstream = new FileStream(Config.CardsXml, FileMode.CreateNew))
                {
                    s.Serialize(fstream, m_Cards);
                }

                foreach (Card card in m_Cards)
                {
                    if (notifier != null)
                    {
                        notifier.Notity(String.Format("��ȡ��Ƭ\"{0}\"��Ϣ", card.Name), 1.0f / m_Cards.Count);
                    }
                    webClient.DownloadFile(Path.Combine(Config.BaseImageUri, card.ID.ToString() + ".jpg"), card.ImagePath);
                }
            }
        }
Esempio n. 9
0
 /// <summary>
 /// Downloads the page for <paramref name="date"/> (base URL + date + "/"), extracts the image
 /// nodes selected by <c>OneDayOneBeautyImgFilter</c> (falling back to
 /// <c>OneDayOneBeautyImgFilter2</c> when the first filter finds nothing) and passes each one to
 /// <c>GetPicUrlsFromBeautyPersonalPage</c>.
 /// </summary>
 /// <param name="date">Date fragment appended to the base URL.</param>
 /// <returns>Number of image nodes processed; 0 when the page is empty or the request failed.</returns>
 static int OneDayOneBeauty(string date)
 {
     try
     {
         string htmlContent = "";
         string url = oneDayOneBeautyBaseUrl + date + "/";
         HttpWebRequest httpWebRequest = HttpWebRequest.CreateHttp(url);
         httpWebRequest.Method = "GET";

         // using blocks release the response and reader on every path, including a non-OK
         // status and exceptions thrown while reading.
         using (HttpWebResponse httpWebResponse = (HttpWebResponse)httpWebRequest.GetResponse())
         {
             if (httpWebResponse.StatusCode.Equals(HttpStatusCode.OK))
             {
                 using (StreamReader reader = new StreamReader(new BufferedStream(httpWebResponse.GetResponseStream(), 4 * 200 * 1024)))
                 {
                     htmlContent = reader.ReadToEnd();
                 }
             }
         }

         if (htmlContent.Equals(""))
         {
             Console.WriteLine("得到的HTML为空!");
             return 0;
         }

         Lexer lexer = new Lexer(htmlContent);
         Parser parser = new Parser(lexer);
         parser.AnalyzePage();
         NodeList divList = parser.ExtractAllNodesThatMatch(OneDayOneBeautyImgFilter);
         if (divList.Count == 0)
         {
             // Nothing found with the primary filter; retry with the alternative one.
             parser.AnalyzePage();
             divList = parser.ExtractAllNodesThatMatch(OneDayOneBeautyImgFilter2);
         }
         for (int i = 0; i < divList.Count; i++)
         {
             ImageTag imgNode = (ImageTag)divList[i];
             // Changed on 2014-05-16 to follow the page structure.
             GetPicUrlsFromBeautyPersonalPage(imgNode, i, 1);
         }
         return divList.Count;
     }
     catch (WebException e)
     {
         // Response is null when the failure happened before any HTTP response arrived
         // (DNS failure, timeout, refused connection ...).
         HttpWebResponse errorResponse = e.Response as HttpWebResponse;
         if (errorResponse == null)
         {
             Console.WriteLine("访问网页出错!" + e.Message);
             return 0;
         }

         using (errorResponse)
         {
             if (errorResponse.StatusCode.Equals(HttpStatusCode.NotFound))
             {
                 Console.WriteLine("网页未找到!");
             }
             else
             {
                 Console.WriteLine("访问网页出错!状态码:" + errorResponse.StatusCode);
             }
         }
         return 0;
     }
 }
Esempio n. 10
0
 /// <summary>
 /// Processes one entry of the flow: downloads the entry page for <paramref name="id"/>, follows
 /// the first "/girl/.../" link on it, and passes every image node selected by
 /// <c>BeautyFlowImgFilter</c> on that second page to <c>GetPicUrlsFromBeautyPersonalPage</c>.
 /// </summary>
 /// <param name="id">Entry id appended to the base URL.</param>
 /// <remarks>
 /// Failures are reported on the console and end processing of this entry; no exception leaves
 /// the method.
 /// </remarks>
 static void BeautyFlow(int id)
 {
     try
     {
         string htmlContent = DownloadBeautyFlowHtml(BeautyFlowBaseUrl + id + "/");
         if (htmlContent.Equals(""))
         {
             Console.WriteLine("得到的HTML为空!");
             return;
         }
         Console.WriteLine("第一个html读取完成!");

         // Locate "/girl/<something>/" - the link to the personal page. Both IndexOf results are
         // checked so a page without the link is reported instead of failing inside Substring.
         int startIndex = htmlContent.IndexOf("/girl/");
         int slashIndex = startIndex < 0 ? -1 : htmlContent.IndexOf("/", startIndex + 6);
         if (slashIndex < 0)
         {
             Console.WriteLine("未找到正妹个人页面链接! id=" + id);
             return;
         }
         int endIndex = slashIndex + 1;
         string beautyMorePicturesLink = "http://curator.im" + htmlContent.Substring(startIndex, endIndex - startIndex);

         string htmlContentTwo = DownloadBeautyFlowHtml(beautyMorePicturesLink);
         Console.WriteLine("第二个html读取完成!");

         Lexer lexer = new Lexer(htmlContentTwo);
         Parser parser = new Parser(lexer);
         parser.AnalyzePage();
         NodeList divList = parser.ExtractAllNodesThatMatch(BeautyNameFilter);
         if (divList.Count != 1)
         {
             Console.WriteLine("获取正妹名称出错! id=" + id);
             Console.Read();
             return;
         }

         // The name is the text before " |". As before, a title without that separator ends
         // processing of this entry - now with a message instead of a swallowed exception.
         string beautyName = divList[0].ToPlainTextString();
         int nameLength = beautyName.IndexOf('|') - 1;
         if (nameLength < 0)
         {
             Console.WriteLine("获取正妹名称出错! id=" + id);
             return;
         }
         beautyName = beautyName.Substring(0, nameLength);

         parser.AnalyzePage();
         divList = parser.ExtractAllNodesThatMatch(BeautyFlowImgFilter);
         for (int i = 0; i < divList.Count; i++)
         {
             ImageTag imgNode = (ImageTag)divList[i];
             GetPicUrlsFromBeautyPersonalPage(imgNode, i, 2);
         }
     }
     catch (Exception ex)
     {
         // Stay best-effort (one failing entry must not stop the caller), but say what happened.
         Console.WriteLine("处理出错! id=" + id + " " + ex.Message);
     }
 }

 /// <summary>
 /// Issues a GET request for <paramref name="url"/> and returns the response body, or an empty
 /// string when the status is not 200 OK. The response and reader are always released.
 /// </summary>
 private static string DownloadBeautyFlowHtml(string url)
 {
     HttpWebRequest request = HttpWebRequest.CreateHttp(url);
     request.Method = "GET";
     using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
     {
         if (!response.StatusCode.Equals(HttpStatusCode.OK))
         {
             return "";
         }
         using (StreamReader reader = new StreamReader(new BufferedStream(response.GetResponseStream(), 4 * 200 * 1024)))
         {
             return reader.ReadToEnd();
         }
     }
 }
Esempio n. 11
0
        /// <summary>
        /// Crawls list page number <paramref name="idx"/> (<c>index{idx}.html</c> under
        /// <c>Catalog.sp1010</c>): every node with <c>class="Linklist"</c> that is a link and is not
        /// yet known (<c>IsExistJob</c>) is fetched with <c>GetJobInfoParser</c>, printed to the
        /// console and stored with <c>InsertJobInfo</c>.
        /// </summary>
        /// <param name="idx">Index of the list page.</param>
        /// <remarks>
        /// Extracting the link list is attempted up to five times. A failure to open the page ends
        /// the method immediately.
        /// </remarks>
        public void SpiderCurrentPage(int idx)
        {
            ParserConf.GetConfiguration().RootPath = AppDomain.CurrentDomain.BaseDirectory;
            string url = Catalog.sp1010 + string.Format("index{0}.html", idx);
            NodeList nodeList = null;
            int count = 0;
            bool sign = true;   // true while another extraction attempt is needed

            while (sign && count < 5)
            {
                SpiderEventLog.WriteSourceLog("Spider " + url, url, EventLogEntryType.Information);

                Parser parser;
                try
                {
                    parser = new Parser(new HttpProtocol(new Uri(url)));
                }
                catch (Exception ex)
                {
                    SpiderEventLog.WriteWarningLog("获取列表页面数据错误:" + url + "\r\n" + ex.ToString());
                    return;
                }

                sign = false;

                try
                {
                    NodeFilter filter = new HasAttributeFilter("class", "Linklist");
                    nodeList = parser.ExtractAllNodesThatMatch(filter);
                }
                catch (Exception ex)
                {
                    SpiderEventLog.WriteWarningLog("获取列表页面数据错误:" + url + "\r\n" + ex.ToString());
                    sign = true;
                }

                count++;
            }

            if (nodeList == null)
            {
                return;
            }

            int length = nodeList.Count;
            for (int i = 0; i < length; i++)
            {
                // The class filter can match any tag; only anchors carry a link, so skip the rest
                // instead of dereferencing the null that "as" returns.
                ATag node = nodeList[i] as ATag;
                if (node == null)
                {
                    continue;
                }

                if (IsExistJob(node.Link))
                {
                    SpiderEventLog.WriteLog(string.Format("职务 [{0}] 已存在",node.LinkText));
                    continue;
                }

                Job jobinfo = GetJobInfoParser(node.Link);
                // Strip HTML entities (&...;) from the link text to form the title.
                jobinfo.title = Regex.Replace(node.LinkText,"&[^&;]{0,};", "",RegexOptions.IgnoreCase);

                ConsoleColor color = Console.ForegroundColor;
                Console.ForegroundColor = ConsoleColor.Red;
                try
                {
                    Console.WriteLine("=".PadLeft(120,'='));
                    Console.WriteLine("title:{0}", jobinfo.title);
                    Console.WriteLine("url:{0}", jobinfo.sp1010url);
                    Console.WriteLine("tel:{0}", jobinfo.tel);
                    Console.WriteLine("email:{0}", jobinfo.poster_email);
                    Console.WriteLine("desc:{0}", jobinfo.description);
                    Console.WriteLine("=".PadLeft(120,'='));
                }
                finally
                {
                    // Restore the caller's colour even when writing fails.
                    Console.ForegroundColor = color;
                }

                InsertJobInfo(jobinfo);
            }
        }
Esempio n. 12
0
        /// <summary>
        /// Reads the cinema grade from the single node with <c>CLASS="point ml12 px18"</c>: the plain
        /// text of its first and last child tags is concatenated and parsed as a float.
        /// </summary>
        /// <param name="html">HTML text to parse.</param>
        /// <returns>
        /// The parsed grade, or the default 7.0 when the node is not found exactly once, its first or
        /// last child is not a tag, or the combined text is not a number.
        /// </returns>
        public float getCinemaGrade(string html)
        {
            const float defaultGrade = 7.0f;

            Lexer lexer = new Lexer(html);
            Parser parser = new Parser(lexer);
            NodeFilter nodeFilter = new HasAttributeFilter("CLASS", "point ml12 px18");
            NodeList nodeList = parser.ExtractAllNodesThatMatch(nodeFilter);
            if (nodeList.Count != 1)
            {
                return defaultGrade;
            }

            INode node = nodeList[0];
            ITag tagLeft = node.FirstChild as ITag;
            ITag tagRight = node.LastChild as ITag;
            if (tagLeft == null || tagRight == null)
            {
                // Children missing or plain text nodes: "as" returned null.
                return defaultGrade;
            }

            // NOTE(review): presumably the first child holds the integer part and the last child the
            // fractional part - confirm against the page markup.
            string strGrade = tagLeft.ToPlainTextString() + tagRight.ToPlainTextString();

            // TryParse applies the same current-culture rules as float.Parse but reports failure
            // through its return value instead of throwing.
            float grade;
            return float.TryParse(strGrade, out grade) ? grade : defaultGrade;
        }
Esempio n. 13
0
 /// <summary>
 /// Processes every <c>*.htm</c> file in the hard-coded source directory. The n-th file enumerated
 /// uses the n-th set of four class filters; the nodes selected by each filter are handed to
 /// <c>GetSubtitleFromHtml</c> together with the file name (without extension) plus a
 /// per-commentary suffix.
 /// </summary>
 /// <remarks>
 /// Only eight filter sets are defined. Files beyond the eighth are not processed and a message is
 /// written to the console.
 /// NOTE(review): the file-to-filter-set pairing relies on the order in which the directory
 /// enumerates files - confirm that this order is stable for the input folder.
 /// </remarks>
 static void GetSubtitleHtmlFromFile()
 {
     // One row per input file; the columns are the CSS classes of the four commentary tracks,
     // in the same order as the suffixes below.
     string[][] classNames = new string[][]
     {
         new string[] { "xl29", "xl31", "xl32", "xl33" },
         new string[] { "xl25", "xl26", "xl27", "xl28" },
         new string[] { "xl27", "xl28", "xl29", "xl30" },
         new string[] { "xl27", "xl28", "xl29", "xl31" },   // last column is widened below
         new string[] { "xl27", "xl28", "xl29", "xl30" },
         new string[] { "xl33", "xl32", "xl30", "xl28" },
         new string[] { "xl29", "xl30", "xl31", "xl32" },
         new string[] { "xl28", "xl24", "xl30", "xl31" }
     };

     // Suffixes appended to the file name for the four tracks (colour notes from the original
     // code: red = cast, yellow = director/writers, blue = effects crew, green = production team).
     string[] suffixes = new string[] { "_演员解说", "_导演编剧解说", "_特技制作组解说", "_幕后制作团队解说" };

     List<List<NodeFilter>> filters = new List<List<NodeFilter>>();
     foreach (string[] row in classNames)
     {
         List<NodeFilter> set = new List<NodeFilter>();
         foreach (string className in row)
         {
             set.Add(new HasAttributeFilter("class", className));
         }
         filters.Add(set);
     }
     // The fourth track of the fourth file uses either of two classes.
     filters[3][3] = new OrFilter(new HasAttributeFilter("class", "xl31"), new HasAttributeFilter("class", "xl30"));

     DirectoryInfo directory = new DirectoryInfo(@"D:\Download\魔戒三部曲电影导演评论字幕\mht");
     int count = 0;
     foreach (FileInfo file in directory.GetFiles("*.htm"))
     {
         if (count >= filters.Count)
         {
             // No filter set is defined for this file; stop instead of indexing out of range.
             System.Console.WriteLine("没有为文件 " + file.Name + " 配置过滤器, 已停止处理");
             break;
         }

         string htmlContent;
         using (StreamReader reader = new StreamReader(file.FullName))
         {
             htmlContent = reader.ReadToEnd();
         }

         string fileName = file.Name.Substring(0, file.Name.IndexOf('.'));
         Lexer lexer = new Lexer(htmlContent);
         Parser parser = new Parser(lexer);
         for (int track = 0; track < suffixes.Length; track++)
         {
             // The parser must be reset before it is reused for another extraction.
             if (track > 0)
             {
                 parser.Reset();
             }
             NodeList trackNodes = parser.ExtractAllNodesThatMatch(filters[count][track]);
             GetSubtitleFromHtml(trackNodes, fileName + suffixes[track]);
         }
         count++;
     }
 }
Esempio n. 14
0
        /// <summary>
        /// Turns the detail nodes into plain text: renders them as HTML, runs <c>ReplaceNode</c> for
        /// script and anchor tags, substitutes a few fixed strings, removes the
        /// "信息编号:&lt;digits&gt;" marker and returns the plain text of the first <c>Div</c> in
        /// the result.
        /// </summary>
        /// <param name="nodeDetail">Nodes that make up the detail section.</param>
        /// <returns>Plain text of the first div, or an empty string when there is none.</returns>
        private string GetDetailString(NodeList nodeDetail)
        {
            string markup = nodeDetail.ToHtml();
            ReplaceNode(ref markup, nodeDetail, typeof(ScriptTag));
            ReplaceNode(ref markup, nodeDetail, typeof(ATag));

            // Fixed textual substitutions, applied in this order.
            markup = markup.Replace("<br />", "\r\n");
            markup = markup.Replace("&nbsp;", "");
            markup = markup.Replace("&raquo;", "");
            markup = markup.Replace("1010兼职网", "135free.com");
            markup = markup.Replace("所在地:", "");
            markup = Regex.Replace(markup, @"信息编号:\d{0,}", "");

            // Re-parse the cleaned markup and take the text of its first div.
            Parser cleanedParser = new Parser(new Lexer(markup));
            NodeList divs = cleanedParser.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(Div)));
            return divs.Count > 0 ? divs[0].ToPlainTextString() : "";
        }
Esempio n. 15
0
        /// <summary>
        /// Looks up the id of a paper. The search page returned by <c>_HttpUtil.getPaperIDHTML</c> is
        /// scanned for an <c>A</c> tag with <c>target="_blank"</c> that has a child accepted by
        /// <c>PaperFilter</c>; the id is the last path segment of that link's <c>href</c>, cut at
        /// its first dot.
        /// </summary>
        /// <param name="paper_name">Name of the paper to look up.</param>
        /// <returns>The id, or <c>null</c> when the page is empty or no usable link is found.</returns>
        protected string getPaperID(string paper_name)
        {
            string page = _HttpUtil.getPaperIDHTML(paper_name);
            if (string.IsNullOrEmpty(page))
            {
                return null;
            }

            Parser pageParser = new Parser(new Lexer(page));

            // anchor AND target=_blank AND has-a-child-matching-the-paper
            AndFilter blankAnchor = new AndFilter(new TagNameFilter("A"), new HasAttributeFilter("target", "_blank"));
            AndFilter paperLink = new AndFilter(blankAnchor, new HasChildFilter(new PaperFilter(paper_name)));

            NodeList matches = pageParser.ExtractAllNodesThatMatch(paperLink);
            if (matches == null || matches.Count <= 0)
            {
                // Paper not found.
                return null;
            }

            // TODO: several papers may match; only the first one is used.
            ITag anchor = matches[0] as ITag;
            if (anchor == null)
            {
                return null;
            }

            string href = anchor.GetAttribute("href");
            if (string.IsNullOrEmpty(href))
            {
                return null;
            }

            // Text after the last '/', up to (not including) its first '.'.
            string lastSegment = href.Substring(href.LastIndexOf('/') + 1);
            int dot = lastSegment.IndexOf('.');
            return dot < 0 ? lastSegment : lastSegment.Substring(0, dot);
        }
Esempio n. 16
0
        /// <summary>
        /// Downloads the product page that <paramref name="a"/> links to and parses it: title, show
        /// photos, demo photos and description are read from the node with <c>id="productyou"</c>,
        /// and every <c>class="cph"</c> attribute node whose first child starts with "颜色" is passed
        /// to <c>ParseProductColors</c>.
        /// </summary>
        /// <param name="a">Link to the product page.</param>
        public void ParseProduct(ATag a)
        {
            string html = GetHtml(a.Link);

            Lexer lexer = new Lexer(html);
            Parser parser = new Parser(lexer);

            NodeFilter productArea = new HasAttributeFilter("id", "productyou");
            NodeList nodes = parser.ExtractAllNodesThatMatch(productArea);

            ParseProductTitle(nodes);
            ParseProductShowPhoto(nodes);
            ParseProductDemoPhoto(nodes);
            ParsePorductDescribe(nodes);

            NodeFilter productAttributes = new HasAttributeFilter("class", "cph");
            NodeList productAttributeNodes = nodes.ExtractAllNodesThatMatch(productAttributes, true);

            int length = productAttributeNodes.Count;
            for (int i = 0; i < length; i++)
            {
                // An attribute node without children has nothing to inspect; guard instead of
                // dereferencing a null/empty child list.
                NodeList children = productAttributeNodes[i].Children;
                if (children != null && children.Count > 0)
                {
                    INode n = children[0];
                    string t = n.ToPlainTextString();
                    if (Regex.Match(t, @"^\s{0,}颜色", RegexOptions.IgnoreCase).Success)
                    {
                        ParseProductColors(n);
                    }
                }
                Console.WriteLine();
            }
        }
Esempio n. 17
0
        /// <summary>
        /// Helper: reads the user's details out of the HTML and stores them on <c>user</c>.
        /// </summary>
        /// <remarks>
        /// Each field is located with its own <c>class</c>-attribute filter. For the
        /// single-valued fields (nickname, remark name, link, self introduction, profile),
        /// a diagnostic line is written to the console when the match criterion is not met
        /// and the field is left as it was. <c>user.Tags</c> is always assigned.
        /// </remarks>
        /// <param name="currentUserHtml">HTML text containing the Weibo user's information.</param>
        private void GetUserInfoFromHtml(string currentUserHtml)
        {
            // Set up the filters used below.
            HasAttributeFilter nickNameFilter = new HasAttributeFilter("class", "name");
            HasAttributeFilter remarkNameFilter = new HasAttributeFilter("class", "CH");
            HasAttributeFilter linkUrlFilter = new HasAttributeFilter("class", "pf_lin S_link1");
            HasAttributeFilter selfIntroFilter = new HasAttributeFilter("class", "pf_intro bsp");
            HasAttributeFilter tagsFilter = new HasAttributeFilter("class", "S_func1");
            HasAttributeFilter profileFilter = new HasAttributeFilter("class", "tags");

            Lexer lexer = new Lexer(currentUserHtml);
            Parser parser = new Parser(lexer);

            // Nickname. The node type is verified before casting, matching the other
            // sections, so an unexpected element reports an error instead of throwing.
            NodeList nickNameNodeList = parser.ExtractAllNodesThatMatch(nickNameFilter);

            if (nickNameNodeList.Size() == 1 && nickNameNodeList[0].GetType().Equals(typeof(Span)))
            {
                user.NickName = ((Span)nickNameNodeList[0]).ToPlainTextString();
            }
            else
            {
                Console.WriteLine("判断微博名的标准出错!");
            }
            // Note: to reuse the parser, Reset must be called after one extraction has
            // finished and before the next one starts; otherwise it fails.
            parser.Reset();

            // Remark name.
            NodeList remarkNameNodeList = parser.ExtractAllNodesThatMatch(remarkNameFilter);

            if (remarkNameNodeList.Size() == 1 && remarkNameNodeList[0].GetType().Equals(typeof(Span)))
            {
                string str = ((Span)remarkNameNodeList[0]).ToPlainTextString();
                // Strip the leading and trailing bracket. Text shorter than two
                // characters has nothing between brackets, so it yields an empty name
                // rather than an out-of-range Substring call.
                user.RemarkName = str.Length >= 2 ? str.Substring(1, str.Length - 2) : string.Empty;
            }
            else
            {
                Console.WriteLine("判断微博备注名称的标准出错!");
            }
            parser.Reset();

            // Weibo link address.
            NodeList linkUrlNodeList = parser.ExtractAllNodesThatMatch(linkUrlFilter);
            if (linkUrlNodeList.Size() == 1 && linkUrlNodeList[0].GetType().Equals(typeof(ATag)))
            {
                user.LinkURL = ((ATag)linkUrlNodeList[0]).StringText;
            }
            else
            {
                Console.WriteLine("判断微博链接地址的标准出错!");
            }
            parser.Reset();

            // Self introduction: taken from the TITLE attribute of the second child,
            // provided that child exists and is a Span.
            NodeList selfIntroNodeList = parser.ExtractAllNodesThatMatch(selfIntroFilter);
            INode selfIntroNode = null;
            if (selfIntroNodeList.Size() == 1)
            {
                NodeList introChildren = selfIntroNodeList[0].Children;
                if (introChildren != null && introChildren.Size() > 1)
                {
                    selfIntroNode = introChildren[1];
                }
            }
            if (selfIntroNode != null && selfIntroNode.GetType().Equals(typeof(Span)))
            {
                user.SelfIntroduction = ((Span)selfIntroNode).GetAttribute("TITLE");
            }
            else
            {
                Console.WriteLine("判断自我描述的标准出错!");
            }
            parser.Reset();

            // Tags: the text of every matching Span, each followed by one space.
            NodeList tagsNodeList = parser.ExtractAllNodesThatMatch(tagsFilter);
            string str2 = "";
            for (int i = 0; i < tagsNodeList.Size(); i++)
            {
                if (tagsNodeList[i].GetType().Equals(typeof(Span)))
                {
                    str2 += ((Span)tagsNodeList[i]).ToPlainTextString() + " ";
                }
            }
            user.Tags = str2;
            parser.Reset();

            // Profile attributes.
            NodeList profileNodeList = parser.ExtractAllNodesThatMatch(profileFilter);
            if (profileNodeList.Size() == 1)
            {
                // The useful information lives in <a> tags, so collect those first and
                // then decide per tag whether to read its text or a title attribute.
                NodeClassFilter aTagFilter = new NodeClassFilter(typeof(ATag));
                NodeList profileChildren = profileNodeList[0].Children;
                if (profileChildren != null)
                {
                    NodeList profileList = profileChildren.ExtractAllNodesThatMatch(aTagFilter, true);
                    for (int j = 0; j < profileList.Size(); j++)
                    {
                        ATag aTag = (ATag)profileList[j];
                        if (aTag.Attributes.Contains("TITLE"))
                        {
                            user.Profile += aTag.GetAttribute("TITLE") + " ";
                        }
                        else if (aTag.Attributes.Contains("NODE-TYPE") && "infoSlide".Equals(aTag.GetAttribute("NODE-TYPE")))
                        {
                            // A node with node-type="infoSlide" marks the end of the attributes.
                            // The comparison is written constant-first so a null value cannot throw.
                            break;
                        }
                        else
                        {
                            // An <a> without children has no first child to inspect.
                            NodeList aChildren = aTag.Children;
                            INode firstChild = (aChildren != null && aChildren.Size() > 0) ? aChildren[0] : null;
                            if (firstChild != null && firstChild.GetType().Equals(typeof(TagNode)))
                            {
                                // Case where the <a> wraps a child tag (e.g. <em>): use its title.
                                user.Profile += ((TagNode)firstChild).GetAttribute("TITLE") + " ";
                            }
                            else
                            {
                                // Otherwise use the text contained in the <a> tag directly.
                                user.Profile += aTag.StringText + " ";
                            }
                        }
                    }
                }
            }
            else
            {
                Console.WriteLine("判断用户属性信息的标准出错!");
            }
        }
Esempio n. 18
0
        /// <summary>
        /// Requests the reference page for <paramref name="paper_id"/> through
        /// <c>_HttpUtil.getPaperReferenceHTML</c> and collects the text of each reference entry
        /// found inside the <c>div</c> with <c>id="export_container"</c>.
        /// </summary>
        /// <param name="paper_id">Identifier(s) passed unchanged to <c>_HttpUtil.getPaperReferenceHTML</c>.</param>
        /// <returns>
        /// A list with one string per tag child of the container: line breaks are replaced by
        /// spaces and everything up to and including the first <c>]</c> is removed. The list is
        /// empty when the container has no tag children. Returns <c>null</c> when no HTML was
        /// obtained or the container is missing; in those cases <c>_Progressable.onFinish</c>
        /// is not invoked.
        /// </returns>
        protected ArrayList getPaperReferenceByID(ArrayList paper_id)
        {
            string html_page = _HttpUtil.getPaperReferenceHTML(paper_id);

            if (string.IsNullOrEmpty(html_page))
            {
                return null;
            }

            Parser p = new Parser(new Lexer(html_page));

            TagNameFilter tag_f = new TagNameFilter("div");
            HasAttributeFilter attr_f = new HasAttributeFilter("id", "export_container");

            AndFilter af = new AndFilter(tag_f, attr_f);

            NodeList childs = p.ExtractAllNodesThatMatch(af);

            if (childs == null || childs.Count <= 0)
            {
                return null;
            }

            // Only the first matching container is used.
            INode node = childs[0];

            NodeList ref_childs = node.Children;
            ArrayList ref_list = new ArrayList();

            // An empty container may expose no child list at all; treat that the same
            // as "zero references" instead of dereferencing null.
            int child_count = (ref_childs == null) ? 0 : ref_childs.Count;

            for (int i = 0; i < child_count; ++i)
            {
                // Text and remark nodes between the entries are ignored.
                ITag tag = ref_childs[i] as ITag;
                if (tag == null)
                {
                    continue;
                }

                string str = tag.ToPlainTextString();

                str = str.Replace('\r', ' ').Replace('\n', ' ');

                // Drop the leading "[n]" marker. When there is no ']' IndexOf yields -1,
                // so Substring(0) keeps the whole text.
                str = str.Substring(str.IndexOf(']') + 1);

                ref_list.Add(str);
            }

            if (_Progressable != null)
            {
                _Progressable.onFinish(ref_list);
            }

            return ref_list;
        }
Esempio n. 19
0
 /// <summary>
 /// Reads a saved showtime page from disk and builds one <c>Model.Play</c> for every
 /// screening element (<c>_TYPE="expiry"</c>) found under each movie element
 /// (<c>METHOD="mdShowtime"</c>).
 /// </summary>
 /// <param name="xmlFile">
 /// Path of the file to read. The first run of four digits in this path is used as the cinema ID.
 /// </param>
 /// <returns>
 /// The screenings found (possibly an empty list), or <c>null</c> when the page contains no
 /// movie element at all. Screenings whose TIME attribute is missing, blank or shorter than the
 /// 10-character prefix that gets stripped are skipped. A missing or unparsable price is stored as 0.
 /// </returns>
 /// <exception cref="System.FormatException">
 /// A screening exists but the cinema ID, the movie ID or the SHOWTIMEID value is not a valid integer.
 /// </exception>
 public List<Model.Play> getPlays(string xmlFile)
 {
     Match strCinema = Regex.Match(xmlFile, @"\d\d\d\d");
     string cinemaID = strCinema.Value;
     string html = File.ReadAllText(xmlFile);
     List<Model.Play> plays = new List<Model.Play>();
     Lexer lexer = new Lexer(html);
     Parser parser = new Parser(lexer);
     // Locate the nodes that make up the movie list.
     string listAttribute = "METHOD";
     string listAttValue = "mdShowtime";
     NodeFilter movieListFilter = new HasAttributeFilter(listAttribute, listAttValue);
     NodeList movieNodeList = parser.ExtractAllNodesThatMatch(movieListFilter);
     if (movieNodeList.Count < 1)
     {
         return null;
     }

     for (int i = 0; i < movieNodeList.Count; i++)
     {
         // Re-parse the HTML of this single movie so the searches below stay inside it.
         INode node = movieNodeList[i];
         string movieHtml = node.ToHtml();
         Lexer movieLexer = new Lexer(movieHtml);
         Parser movieParser = new Parser(movieLexer);

         // Movie ID: digits taken from the HREF of the first CLASS="c_000" element.
         NodeFilter idFilter = new HasAttributeFilter("CLASS", "c_000");
         NodeList idNodes = movieParser.ExtractAllNodesThatMatch(idFilter);
         string strID = string.Empty;
         if (idNodes.Count >= 1)
         {
             ITag idTag = (idNodes[0] as ITag);
             object hrefValue = (idTag != null && idTag.Attributes != null) ? idTag.Attributes["HREF"] : null;
             if (hrefValue != null)
             {
                 string str = hrefValue.ToString();
                 // Prefer a six-digit run anywhere in the link; fall back to five digits.
                 Match match = Regex.Match(str, @"\d\d\d\d\d\d");
                 if (match.Success)
                 {
                     strID = match.Value;
                 }
                 else
                 {
                     Match ma = Regex.Match(str, @"\d\d\d\d\d");
                     if (ma.Success)
                     {
                         strID = ma.Value;
                     }
                 }
             }
         }

         // Screening list of this movie (parsed with a fresh parser over the same HTML).
         Lexer lexer2 = new Lexer(movieHtml);
         Parser movieParser2 = new Parser(lexer2);
         NodeFilter playFilter = new HasAttributeFilter("_TYPE", "expiry");
         NodeList playNodes = movieParser2.ExtractAllNodesThatMatch(playFilter);
         for (int j = 0; j < playNodes.Count; j++)
         {
             ITag playTag = (playNodes[j] as ITag);
             if (playTag == null || playTag.Attributes == null)
             {
                 continue;
             }

             Model.Play play = new Model.Play();
             // Identifiers stay strict: a malformed one raises instead of being guessed.
             play.CinemaID = int.Parse(cinemaID);
             play.MovieID = int.Parse(strID);
             play.PlayID = int.Parse(playTag.Attributes["SHOWTIMEID"].ToString());

             // The absence check has to happen on the attribute itself and the blank
             // check after trimming; otherwise a missing or whitespace-only TIME throws
             // before the screening can be skipped.
             object timeValue = playTag.Attributes["TIME"];
             string strTime = (timeValue == null) ? string.Empty : timeValue.ToString().Trim();
             if (strTime.Length < 10)
             {
                 continue;
             }
             // Drop the first 10 characters (presumably a date prefix — confirm against the page).
             play.PlayName = strTime.Remove(0, 10).Trim();

             // Price text sits at FirstChild -> NextSibling -> FirstChild -> NextSibling.
             // Walk it step by step so a screening without that structure gets price 0.
             INode priceNode = playTag.FirstChild;
             if (priceNode != null)
             {
                 priceNode = priceNode.NextSibling;
             }
             if (priceNode != null)
             {
                 priceNode = priceNode.FirstChild;
             }
             if (priceNode != null)
             {
                 priceNode = priceNode.NextSibling;
             }

             string strPrice = (priceNode == null) ? null : priceNode.ToPlainTextString();
             float price = 0f;
             if (strPrice != null)
             {
                 // Trim before testing for blank so whitespace-only text cannot reach Remove.
                 strPrice = strPrice.Trim();
                 if (strPrice.Length > 0)
                 {
                     // The first character is dropped before parsing (presumably a currency
                     // symbol — confirm). Invariant culture keeps '.' as the decimal separator
                     // regardless of the machine's regional settings.
                     if (!float.TryParse(
                             strPrice.Remove(0, 1),
                             System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
                             System.Globalization.CultureInfo.InvariantCulture,
                             out price))
                     {
                         price = 0f;
                     }
                 }
             }
             play.Price = price;

             plays.Add(play);
         }
     }
     return plays;
 }