Exemple #1
0
        /// <summary>
        /// Sniffs a list page and every detail page it links to, then moves on to the
        /// next list page and repeats until a stop condition is reached.
        /// </summary>
        /// <remarks>
        /// The walk stops, after sniffing the current list page, as soon as one of the
        /// following holds: the sniff did not succeed, the number of processed pages equals
        /// the page count reported by the sniffer context, or the page body is equal to the
        /// body of the previously processed list page.
        /// </remarks>
        /// <param name="listPage">The list page to start from.</param>
        public void ParseDetailPage(ListPage listPage)
        {
            int      currentIndex   = this.SnifferContext.GetStartPageIndex(listPage);
            int      pageLimit      = SnifferContext.GetSnifferPageCount(listPage);
            int      pagesProcessed = 0;
            ListPage previousPage   = null;

            for (;;)
            {
                listPage.Sniffer();

                // Stop conditions, evaluated in the same order as a short-circuit OR.
                if (!listPage.Succeed)
                {
                    return;
                }
                if (pagesProcessed == pageLimit)
                {
                    return;
                }
                if (previousPage != null && previousPage.PageBody == listPage.PageBody)
                {
                    return;
                }

                OnPageIndexChange(listPage.PageUrl);

                // Sniff each detail page linked from the current list page.
                foreach (UrlItem link in listPage.SubPageUrlResults)
                {
                    DetailPageConfiguration detailConf = (DetailPageConfiguration)listPage.ListPageConfiguration.SubPageConfiguration;
                    DetailPage page = new DetailPage(listPage, detailConf)
                    {
                        PageIndex = detailConf.PageStartIndex,
                        PageName  = link.Title,
                        PageUrl   = link.Url
                    };
                    page.Sniffer();
                    OnDetailPageParseDone(page);
                }

                // NOTE(review): the index advances by (step - 1); presumably the remaining
                // increment is applied by ReplacePageIndex / the query format - confirm.
                currentIndex += listPage.ListPageConfiguration.PageIndexStep - 1;
                pagesProcessed++;

                // Build the next list page from the same parent and configuration.
                ListPage nextPage = new ListPage(listPage.Parent, listPage.ListPageConfiguration)
                {
                    PageName = listPage.PageName,
                    PageUrl  = listPage.PageUrl
                };

                if (listPage.ListPageConfiguration.PageMethod != PageMethod.Get)
                {
                    nextPage.PageQuery = string.Format(nextPage.PageQuery, currentIndex);
                }
                else
                {
                    ReplacePageIndex(nextPage, currentIndex);
                }

                previousPage = listPage;
                listPage     = nextPage;
            }
        }
Exemple #2
0
        /// <summary>
        /// Loads a list page through <c>GetListPage</c>, takes the sub-page URL at
        /// <paramref name="urlIndex"/> and returns the detail page built from it after sniffing it.
        /// </summary>
        /// <param name="fileName">Forwarded to <c>GetListPage</c>.</param>
        /// <param name="rootPageName">Forwarded to <c>GetListPage</c>.</param>
        /// <param name="firstIndex">Forwarded to <c>GetListPage</c>.</param>
        /// <param name="pageIndex">Forwarded to <c>GetListPage</c>.</param>
        /// <param name="urlIndex">Index into the list page's <c>SubPageUrlResults</c>.</param>
        /// <returns>The detail page, after <c>Sniffer()</c> has been called on it.</returns>
        public static DetailPage GetDetailPage(string fileName, string rootPageName, int firstIndex, int pageIndex, int urlIndex)
        {
            ListPage listPage = GetListPage(fileName, rootPageName, firstIndex, pageIndex);
            UrlItem  link     = listPage.SubPageUrlResults[urlIndex];

            DetailPageConfiguration conf = (DetailPageConfiguration)listPage.ListPageConfiguration.SubPageConfiguration;

            DetailPage result = new DetailPage(listPage, conf)
            {
                PageIndex = conf.PageStartIndex,
                PageName  = link.Title,
                PageUrl   = link.Url
            };
            result.Sniffer();

            return result;
        }
Exemple #3
0
        /// <summary>
        /// Builds a <see cref="DetailPageConfiguration"/> from a page XML node, falling back to a
        /// shared "CommonConfig" definition for anything the page node does not specify.
        /// </summary>
        /// <remarks>
        /// When <paramref name="parentConfNode"/> has a "Config" attribute, the CommonConfig element
        /// with that Name is looked up under the parent of <paramref name="parentConfNode"/>, and its
        /// "SnifferPage" child (if any) is used as the common configuration node.
        /// Attributes on the page node win over attributes on the common node. SnifferItem elements
        /// are read from the page node first and then from the common node; an item whose ItemName is
        /// already present is skipped. Nested "SnifferPage" elements of both nodes are converted
        /// recursively and added as sub-page configurations.
        /// </remarks>
        /// <param name="parent">Parent configuration passed to the DetailPageConfiguration constructor.</param>
        /// <param name="parentConfNode">Node whose optional "Config" attribute names the CommonConfig to inherit from.</param>
        /// <param name="pageConfNode">The page node to read; must carry a "PageName" attribute.</param>
        /// <returns>The populated configuration.</returns>
        /// <exception cref="InvalidOperationException">
        /// The "Config" attribute names a CommonConfig element that does not exist.
        /// </exception>
        private static DetailPageConfiguration CreateDetailPageConfiguration(PageConfiguration parent, XmlNode parentConfNode, XmlNode pageConfNode)
        {
            XmlNode      commonConfigNode = null;
            XmlAttribute configAttribute  = parentConfNode.Attributes["Config"];

            if (configAttribute != null)
            {
                XmlNode commonListConfigNode = parentConfNode.ParentNode.SelectSingleNode(string.Format("CommonConfig[@Name='{0}']", configAttribute.Value));
                if (commonListConfigNode == null)
                {
                    // Previously this surfaced as a bare NullReferenceException on the next line.
                    throw new InvalidOperationException(string.Format("CommonConfig '{0}' referenced by the Config attribute was not found.", configAttribute.Value));
                }
                commonConfigNode = commonListConfigNode.SelectSingleNode("SnifferPage");
            }

            DetailPageConfiguration detailPageConf = new DetailPageConfiguration(parent);

            detailPageConf.PageName = pageConfNode.Attributes["PageName"].Value;

            // Each property is only assigned when the attribute exists on the page node or,
            // failing that, on the common node; otherwise the configuration keeps its default.
            string value = ReadOwnOrCommonAttribute(pageConfNode, commonConfigNode, "PageUrl");
            if (value != null)
            {
                detailPageConf.PageUrl = value;
            }

            value = ReadOwnOrCommonAttribute(pageConfNode, commonConfigNode, "PageType");
            if (value != null)
            {
                detailPageConf.PageType = (PageType)Enum.Parse(typeof(PageType), value, false);
            }

            value = ReadOwnOrCommonAttribute(pageConfNode, commonConfigNode, "Encoding");
            if (value != null)
            {
                detailPageConf.Encoding = Encoding.GetEncoding(value);
            }

            // Paging settings
            value = ReadOwnOrCommonAttribute(pageConfNode, commonConfigNode, "PageQuery");
            if (value != null)
            {
                detailPageConf.PageQuery = value;
            }

            value = ReadOwnOrCommonAttribute(pageConfNode, commonConfigNode, "ReplacePageQuery");
            if (value != null)
            {
                detailPageConf.ReplacePageQuery = value;
            }

            value = ReadOwnOrCommonAttribute(pageConfNode, commonConfigNode, "PageIndexFormat");
            if (value != null)
            {
                detailPageConf.PageIndexFormat = value;
            }

            value = ReadOwnOrCommonAttribute(pageConfNode, commonConfigNode, "PageStartIndex");
            if (value != null)
            {
                detailPageConf.PageStartIndex = int.Parse(value);
            }

            value = ReadOwnOrCommonAttribute(pageConfNode, commonConfigNode, "PageIndexSeed");
            if (value != null)
            {
                detailPageConf.PageIndexSeed = int.Parse(value);
            }

            value = ReadOwnOrCommonAttribute(pageConfNode, commonConfigNode, "PageIndexStep");
            if (value != null)
            {
                detailPageConf.PageIndexStep = int.Parse(value);
            }

            value = ReadOwnOrCommonAttribute(pageConfNode, commonConfigNode, "EndPageDetermineRegex");
            if (value != null)
            {
                detailPageConf.EndPageDetermineRegex = value;
            }

            value = ReadOwnOrCommonAttribute(pageConfNode, commonConfigNode, "PageMethod");
            if (value != null)
            {
                detailPageConf.PageMethod = (PageMethod)Enum.Parse(typeof(PageMethod), value);
            }

            // SnifferSubPageUrlItem is a child ELEMENT. The fallback used to be gated on an
            // attribute of the same name, so the common definition was effectively never inherited.
            XmlNode snifferUrlItemNode = pageConfNode.SelectSingleNode("SnifferSubPageUrlItem");
            if (snifferUrlItemNode == null && commonConfigNode != null)
            {
                snifferUrlItemNode = commonConfigNode.SelectSingleNode("SnifferSubPageUrlItem");
            }
            if (snifferUrlItemNode != null)
            {
                detailPageConf.SnifferSubPageUrlItem = CreateSnifferUrlItem(snifferUrlItemNode);
            }

            // Page-level items come first so they take precedence over common items of the same name.
            List<XmlNodeList> itemNodeLists = new List<XmlNodeList>();
            itemNodeLists.Add(pageConfNode.SelectNodes("SnifferItem"));
            if (commonConfigNode != null)
            {
                itemNodeLists.Add(commonConfigNode.SelectNodes("SnifferItem"));
            }

            foreach (XmlNodeList itemNodes in itemNodeLists)
            {
                if (itemNodes == null)
                {
                    continue;
                }

                foreach (XmlNode itemNode in itemNodes)
                {
                    string itemName      = itemNode.Attributes["ItemName"].Value;
                    bool   alreadyExists = false;

                    foreach (SnifferItem existing in detailPageConf.SnifferItems)
                    {
                        if (existing.ItemName == itemName)
                        {
                            alreadyExists = true;
                            break;
                        }
                    }

                    if (!alreadyExists)
                    {
                        detailPageConf.SnifferItems.Add(ParseDetailSnifferItem(itemNode));
                    }
                }
            }

            // Nested pages declared on the page node, then those declared on the common node.
            foreach (XmlNode subPageNode in pageConfNode.SelectNodes("SnifferPage"))
            {
                detailPageConf.SubPageConfigurations.Add(CreateDetailPageConfiguration(detailPageConf, pageConfNode, subPageNode));
            }

            if (commonConfigNode != null)
            {
                foreach (XmlNode subPageNode in commonConfigNode.SelectNodes("SnifferPage"))
                {
                    detailPageConf.SubPageConfigurations.Add(CreateDetailPageConfiguration(detailPageConf, pageConfNode, subPageNode));
                }
            }

            return detailPageConf;
        }

        /// <summary>
        /// Returns the value of the named attribute from <paramref name="node"/>, or from
        /// <paramref name="commonNode"/> when the first node lacks it; null when neither has it.
        /// </summary>
        private static string ReadOwnOrCommonAttribute(XmlNode node, XmlNode commonNode, string attributeName)
        {
            XmlAttribute attribute = node.Attributes[attributeName];

            if (attribute == null && commonNode != null)
            {
                attribute = commonNode.Attributes[attributeName];
            }

            return attribute == null ? null : attribute.Value;
        }

        /// <summary>
        /// Creates a <see cref="SnifferItem"/> from a SnifferItem XML node, assigning only the
        /// attributes and child elements that are present.
        /// </summary>
        private static SnifferItem ParseDetailSnifferItem(XmlNode itemNode)
        {
            SnifferItem item = new SnifferItem();

            item.ItemName = itemNode.Attributes["ItemName"].Value;

            string value = ReadOwnOrCommonAttribute(itemNode, null, "SaveImage");
            if (value != null)
            {
                item.SaveImage = bool.Parse(value);
            }

            value = ReadOwnOrCommonAttribute(itemNode, null, "SaveImagesPath");
            if (value != null)
            {
                item.SaveImagesPath = value;
            }

            value = ReadOwnOrCommonAttribute(itemNode, null, "ImageUrlPath");
            if (value != null)
            {
                item.ImageUrlPath = value;
            }

            value = ReadOwnOrCommonAttribute(itemNode, null, "IsClearHTML");
            if (value != null)
            {
                item.IsClearHTML = bool.Parse(value);
            }

            value = ReadOwnOrCommonAttribute(itemNode, null, "IsUrl");
            if (value != null)
            {
                item.IsUrl = bool.Parse(value);
            }

            value = ReadOwnOrCommonAttribute(itemNode, null, "UrlToAbs");
            if (value != null)
            {
                item.UrlToAbs = bool.Parse(value);
            }

            value = ReadOwnOrCommonAttribute(itemNode, null, "Separator");
            if (value != null)
            {
                item.Separator = value;
            }

            value = ReadOwnOrCommonAttribute(itemNode, null, "ClearAElement");
            if (value != null)
            {
                item.ClearAElement = bool.Parse(value);
            }

            value = ReadOwnOrCommonAttribute(itemNode, null, "MutiPage");
            if (value != null)
            {
                item.MutiPage = bool.Parse(value);
            }

            value = ReadOwnOrCommonAttribute(itemNode, null, "MutiPageSeparator");
            if (value != null)
            {
                item.MutiPageSeparator = value;
            }

            XmlNode clearRegexNode = itemNode.SelectSingleNode("ClearRegexString");
            if (clearRegexNode != null)
            {
                item.ClearRegexString = clearRegexNode.InnerText;
            }

            XmlNode defaultValueNode = itemNode.SelectSingleNode("DefaultValue");
            if (defaultValueNode != null)
            {
                item.DefaultValue = defaultValueNode.InnerText;
            }

            XmlNode regexNode = itemNode.SelectSingleNode("RegexString");
            if (regexNode != null)
            {
                item.RegexString = CreateRegexString(regexNode);
            }

            return item;
        }
Exemple #4
0
 /// <summary>
 /// Creates a detail page from a parent page and a detail page configuration.
 /// Both arguments are passed unchanged to the base constructor; no other work is done here.
 /// </summary>
 /// <param name="parent">The parent page, forwarded to the base constructor.</param>
 /// <param name="configuration">The detail page configuration, forwarded to the base constructor.</param>
 public DetailPage(PageBase parent, DetailPageConfiguration configuration) : base(parent, configuration)
 {
 }
Exemple #5
0
 /// <summary>
 /// Creates a detail page from a detail page configuration only.
 /// The argument is passed unchanged to the base constructor; no other work is done here.
 /// </summary>
 /// <param name="configuration">The detail page configuration, forwarded to the base constructor.</param>
 public DetailPage(DetailPageConfiguration configuration) : base(configuration)
 {
 }
Exemple #6
0
 /// <summary>
 /// Appends <paramref name="conf"/> and then, recursively, each of its sub-page
 /// configurations to <paramref name="confis"/>, so a configuration always appears
 /// before its descendants.
 /// </summary>
 /// <param name="confis">The list that receives the configurations.</param>
 /// <param name="conf">The configuration to add together with its sub-page configurations.</param>
 private void AddDetailPageConfiguration(List <DetailPageConfiguration> confis, DetailPageConfiguration conf)
 {
     // The configuration itself goes in first...
     confis.Add(conf);

     // ...followed by each sub-page configuration and its own descendants.
     foreach (DetailPageConfiguration subPageConf in conf.SubPageConfigurations)
     {
         AddDetailPageConfiguration(confis, subPageConf);
     }
 }