Exemplo n.º 1
0
        public void GetInfoFromHtml(int currentPage)
        {
            Lexer    lexer       = new Lexer(currentHtml);
            Parser   parser      = new Parser(lexer);
            NodeList poiHeadList = parser.Parse(poiListFilter);

            if (poiHeadList.Count == 1)
            {
                NodeList poiNodeList = poiHeadList[0].Children.ExtractAllNodesThatMatch(poiFilter, false);
                int      numCount    = 0;
                for (int i = 0; i < poiNodeList.Count; i++)
                {
                    POI poi = new POI();
                    DefinitionListBullet poiNode = (DefinitionListBullet)poiNodeList[i];
                    if (poiNode.TagName.Equals("DD"))
                    {
                        numCount++;
                        poi.Page   = currentPage;
                        poi.Number = numCount;
                        #region 获取口味、环境和服务评分,以及获取星级
                        NodeList tasteNodeList       = poiNode.Children.ExtractAllNodesThatMatch(tasteFilter, true);
                        NodeList environmentNodeList = poiNode.Children.ExtractAllNodesThatMatch(environmentFilter, true);
                        NodeList serviceNodeList     = poiNode.Children.ExtractAllNodesThatMatch(serviceFilter, true);
                        if (tasteNodeList.Count == 1 && environmentNodeList.Count == 1 && serviceNodeList.Count == 1)
                        {
                            Span spanNode = (Span)tasteNodeList[0];
                            if (!spanNode.ToPlainTextString().Equals("-"))
                            {
                                poi.TasteRemark = Int32.Parse(spanNode.ToPlainTextString());
                            }
                            spanNode = (Span)environmentNodeList[0];
                            if (!spanNode.ToPlainTextString().Equals("-"))
                            {
                                poi.EnvironmentRemark = Int32.Parse(spanNode.ToPlainTextString());
                            }
                            spanNode = (Span)serviceNodeList[0];
                            if (!spanNode.ToPlainTextString().Equals("-"))
                            {
                                poi.ServiceRemark = Int32.Parse(spanNode.ToPlainTextString());
                            }
                            #region 获取星级
                            INode rankNodeOfParent = spanNode.Parent.NextSibling.NextSibling;
                            if (rankNodeOfParent.Children != null && rankNodeOfParent.Children.Count >= 1)
                            {
                                INode rankNodeCandidate = rankNodeOfParent.Children[0];
                                if (rankNodeCandidate.GetType().Equals(typeof(Span)))
                                {
                                    Span   rankNode = (Span)rankNodeCandidate;
                                    string rank     = rankNode.GetAttribute("TITLE");
                                    if (rank.Contains("五"))
                                    {
                                        poi.Rank = 5;
                                    }
                                    else
                                    {
                                        if (rank.Contains("四"))
                                        {
                                            poi.Rank = 4;
                                        }
                                        else
                                        {
                                            if (rank.Contains("三"))
                                            {
                                                poi.Rank = 3;
                                            }
                                            else
                                            {
                                                if (rank.Contains("二"))
                                                {
                                                    poi.Rank = 2;
                                                }
                                                else
                                                {
                                                    if (rank.Contains("一"))
                                                    {
                                                        poi.Rank = 1;
                                                    }
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                            #endregion
                        }
                        else
                        {
                            Console.WriteLine("第" + i + "条POI中,判断口味、环境和服务的标准出错!");
                        }
                        #endregion
                        #region 获取平均消费
                        NodeList averageNodeList = poiNode.Children.ExtractAllNodesThatMatch(averageFilter, true);
                        if (averageNodeList.Count == 1)
                        {
                            INode averageNode = averageNodeList[0];
                            if (averageNode.NextSibling.NextSibling.GetType().Equals(typeof(TextNode)))
                            {
                                string cost = ((TextNode)averageNode.NextSibling.NextSibling).ToPlainTextString();
                                poi.AverageCost = Int32.Parse(cost);
                            }
                        }
                        else
                        {
                            Console.WriteLine("第" + i + "条POI中,判断平均消费的标准出错!");
                        }
                        #endregion
                        #region 获取点评数
                        NodeList commentNodeList = poiNode.Children.ExtractAllNodesThatMatch(commentFilter, true);
                        if (commentNodeList.Count == 1)
                        {
                            INode commentNode = commentNodeList[0];
                            if (commentNode.GetType().Equals(typeof(ATag)))
                            {
                                string commentNum = ((ATag)commentNode).StringText;
                                if (commentNum.Substring(commentNum.Length - 3, 3).Equals("封点评"))
                                {
                                    commentNum = commentNum.Substring(0, commentNum.Length - 3);
                                }
                                poi.CommentCount = Int32.Parse(commentNum);
                            }
                        }
                        else
                        {
                            Console.WriteLine("第" + i + "条POI中,判断点评数的标准出错!");
                        }
                        #endregion
                        #region 获取店名
                        NodeList nameNodeList = poiNode.Children.ExtractAllNodesThatMatch(nameFilter, true);
                        if (nameNodeList.Count == 1)
                        {
                            INode nameNode = nameNodeList[0];
                            if (nameNode.GetType().Equals(typeof(ATag)))
                            {
                                poi.Name = ((ATag)nameNode).StringText;
                            }
                        }
                        else
                        {
                            Console.WriteLine("第" + i + "条POI中,判断店名的标准出错!");
                        }
                        #endregion
                        #region 获取地址和电话
                        NodeList addressNodeList = poiNode.Children.ExtractAllNodesThatMatch(addressFilter, true);
                        if (addressNodeList.Count == 1)
                        {
                            NodeList districtNodeList = addressNodeList[0].Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ATag)));
                            if (districtNodeList.Count == 1)
                            {
                                ATag   districtTag = (ATag)districtNodeList[0];
                                string address     = districtTag.ToPlainTextString();
                                if (districtTag.NextSibling.GetType().Equals(typeof(TextNode)))
                                {
                                    TextNode detailAddressNode = (TextNode)districtTag.NextSibling;
                                    string   detailAddress     = detailAddressNode.ToPlainTextString();
                                    detailAddress = detailAddress.Trim();
                                    string phoneStr = detailAddress.Substring(detailAddress.Length - 8, 8);
                                    poi.Phone = phoneStr;
                                    address  += detailAddress.Substring(0, detailAddress.Length - 8);
                                }
                                char[] removeChrVector = { ' ', '\n', '\t' };
                                address = address.Trim(removeChrVector);
                                foreach (char c in removeChrVector)
                                {
                                    address = address.Replace(c.ToString(), "");
                                }
                                poi.Address = address;
                            }
                            else
                            {
                                Console.WriteLine("第" + i + "条POI中,判断含地址的<a>标记的标准出错!");
                            }
                        }
                        else
                        {
                            Console.WriteLine("第" + i + "条POI中,判断地址的标准出错!");
                        }
                        #endregion
                        #region 获取标签
                        NodeList tagsNodeList = poiNode.Children.ExtractAllNodesThatMatch(tagsFilter, true);
                        if (tagsNodeList.Count == 1)
                        {
                            INode tagsNode = tagsNodeList[0];
                            if (tagsNode.Children != null)
                            {
                                for (int j = 0; j < tagsNode.Children.Count; j++)
                                {
                                    INode node = tagsNode.Children[j];
                                    if (node.GetType().Equals(typeof(ATag)))
                                    {
                                        poi.Tags.Add(node.ToPlainTextString());
                                    }
                                }
                            }
                        }
                        else
                        {
                            Console.WriteLine("第" + i + "条POI中,判断标签的标准出错!");
                        }
                        #endregion
                        poiList.Add(poi);
                    }
                }
            }
            else
            {
                Console.WriteLine("获取POI列表出错");
            }
        }
Exemplo n.º 2
0
 public void GetInfoFromHtml(int currentPage)
 {
     Lexer lexer = new Lexer(currentHtml);
     Parser parser = new Parser(lexer);
     NodeList poiHeadList = parser.Parse(poiListFilter);
     if (poiHeadList.Count == 1)
     {
         NodeList poiNodeList = poiHeadList[0].Children.ExtractAllNodesThatMatch(poiFilter, false);
         int numCount = 0;
         for (int i = 0; i < poiNodeList.Count; i++)
         {
             POI poi = new POI();
             DefinitionListBullet poiNode = (DefinitionListBullet)poiNodeList[i];
             if (poiNode.TagName.Equals("DD"))
             {
                 numCount++;
                 poi.Page = currentPage;
                 poi.Number = numCount;
                 #region 获取口味、环境和服务评分,以及获取星级
                 NodeList tasteNodeList = poiNode.Children.ExtractAllNodesThatMatch(tasteFilter, true);
                 NodeList environmentNodeList = poiNode.Children.ExtractAllNodesThatMatch(environmentFilter, true);
                 NodeList serviceNodeList = poiNode.Children.ExtractAllNodesThatMatch(serviceFilter, true);
                 if (tasteNodeList.Count == 1 && environmentNodeList.Count == 1 && serviceNodeList.Count == 1)
                 {
                     Span spanNode = (Span)tasteNodeList[0];
                     if (!spanNode.ToPlainTextString().Equals("-"))
                     {
                         poi.TasteRemark = Int32.Parse(spanNode.ToPlainTextString());
                     }
                     spanNode = (Span)environmentNodeList[0];
                     if (!spanNode.ToPlainTextString().Equals("-"))
                     {
                         poi.EnvironmentRemark = Int32.Parse(spanNode.ToPlainTextString());
                     }
                     spanNode = (Span)serviceNodeList[0];
                     if (!spanNode.ToPlainTextString().Equals("-"))
                     {
                         poi.ServiceRemark = Int32.Parse(spanNode.ToPlainTextString());
                     }
                     #region 获取星级
                     INode rankNodeOfParent = spanNode.Parent.NextSibling.NextSibling;
                     if (rankNodeOfParent.Children != null && rankNodeOfParent.Children.Count >= 1)
                     {
                         INode rankNodeCandidate = rankNodeOfParent.Children[0];
                         if (rankNodeCandidate.GetType().Equals(typeof(Span)))
                         {
                             Span rankNode = (Span)rankNodeCandidate;
                             string rank = rankNode.GetAttribute("TITLE");
                             if (rank.Contains("五"))
                             {
                                 poi.Rank = 5;
                             }
                             else
                             {
                                 if (rank.Contains("四"))
                                 {
                                     poi.Rank = 4;
                                 }
                                 else
                                 {
                                     if (rank.Contains("三"))
                                     {
                                         poi.Rank = 3;
                                     }
                                     else
                                     {
                                         if (rank.Contains("二"))
                                         {
                                             poi.Rank = 2;
                                         }
                                         else
                                         {
                                             if (rank.Contains("一"))
                                             {
                                                 poi.Rank = 1;
                                             }
                                         }
                                     }
                                 }
                             }
                         }
                     }
                     #endregion
                 }
                 else
                 {
                     Console.WriteLine("第" + i + "条POI中,判断口味、环境和服务的标准出错!");
                 }
                 #endregion
                 #region 获取平均消费
                 NodeList averageNodeList = poiNode.Children.ExtractAllNodesThatMatch(averageFilter, true);
                 if (averageNodeList.Count == 1)
                 {
                     INode averageNode = averageNodeList[0];
                     if (averageNode.NextSibling.NextSibling.GetType().Equals(typeof(TextNode)))
                     {
                         string cost = ((TextNode)averageNode.NextSibling.NextSibling).ToPlainTextString();
                         poi.AverageCost = Int32.Parse(cost);
                     }
                 }
                 else
                 {
                     Console.WriteLine("第" + i + "条POI中,判断平均消费的标准出错!");
                 }
                 #endregion
                 #region 获取点评数
                 NodeList commentNodeList = poiNode.Children.ExtractAllNodesThatMatch(commentFilter, true);
                 if (commentNodeList.Count == 1)
                 {
                     INode commentNode = commentNodeList[0];
                     if (commentNode.GetType().Equals(typeof(ATag)))
                     {
                         string commentNum = ((ATag)commentNode).StringText;
                         if (commentNum.Substring(commentNum.Length - 3, 3).Equals("封点评"))
                         {
                             commentNum = commentNum.Substring(0, commentNum.Length - 3);
                         }
                         poi.CommentCount = Int32.Parse(commentNum);
                     }
                 }
                 else
                 {
                     Console.WriteLine("第" + i + "条POI中,判断点评数的标准出错!");
                 }
                 #endregion
                 #region 获取店名
                 NodeList nameNodeList = poiNode.Children.ExtractAllNodesThatMatch(nameFilter, true);
                 if (nameNodeList.Count == 1)
                 {
                     INode nameNode = nameNodeList[0];
                     if (nameNode.GetType().Equals(typeof(ATag)))
                     {
                         poi.Name = ((ATag)nameNode).StringText;
                     }
                 }
                 else
                 {
                     Console.WriteLine("第" + i + "条POI中,判断店名的标准出错!");
                 }
                 #endregion
                 #region 获取地址和电话
                 NodeList addressNodeList = poiNode.Children.ExtractAllNodesThatMatch(addressFilter, true);
                 if (addressNodeList.Count == 1)
                 {
                     NodeList districtNodeList = addressNodeList[0].Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ATag)));
                     if (districtNodeList.Count == 1)
                     {
                         ATag districtTag = (ATag)districtNodeList[0];
                         string address = districtTag.ToPlainTextString();
                         if (districtTag.NextSibling.GetType().Equals(typeof(TextNode)))
                         {
                             TextNode detailAddressNode = (TextNode)districtTag.NextSibling;
                             string detailAddress = detailAddressNode.ToPlainTextString();
                             detailAddress = detailAddress.Trim();
                             string phoneStr = detailAddress.Substring(detailAddress.Length - 8, 8);
                             poi.Phone = phoneStr;
                             address += detailAddress.Substring(0, detailAddress.Length - 8);
                         }
                         char[] removeChrVector = { ' ', '\n', '\t' };
                         address = address.Trim(removeChrVector);
                         foreach (char c in removeChrVector)
                         {
                             address = address.Replace(c.ToString(), "");
                         }
                         poi.Address = address;
                     }
                     else
                     {
                         Console.WriteLine("第" + i + "条POI中,判断含地址的<a>标记的标准出错!");
                     }
                 }
                 else
                 {
                     Console.WriteLine("第" + i + "条POI中,判断地址的标准出错!");
                 }
                 #endregion
                 #region 获取标签
                 NodeList tagsNodeList = poiNode.Children.ExtractAllNodesThatMatch(tagsFilter, true);
                 if (tagsNodeList.Count == 1)
                 {
                     INode tagsNode = tagsNodeList[0];
                     if (tagsNode.Children != null)
                     {
                         for (int j = 0; j < tagsNode.Children.Count; j++)
                         {
                             INode node = tagsNode.Children[j];
                             if (node.GetType().Equals(typeof(ATag)))
                             {
                                 poi.Tags.Add(node.ToPlainTextString());
                             }
                         }
                     }
                 }
                 else
                 {
                     Console.WriteLine("第" + i + "条POI中,判断标签的标准出错!");
                 }
                 #endregion
                 poiList.Add(poi);
             }
         }
     }
     else
     {
         Console.WriteLine("获取POI列表出错");
     }
 }