예제 #1
0
        /// <summary>
        /// Called when one detail page has finished processing: copies the page's
        /// result items into a new row and appends it to the first table of <c>Data</c>.
        /// </summary>
        /// <param name="detailPage">The detail page that has just been processed.</param>
        void SnifferThread_DetailPageParseDone(DetailPage detailPage)
        {
            if (Data.Tables.Count == 0)
            {
                // No table yet: create the table structure from the page configuration first.
                Data.Tables.Add(detailPage.DetailPageConfiguration.CreateDataTable());
            }

            DataTable table = Data.Tables[0];
            DataRow   row   = table.NewRow();

            // The previous implementation scanned the freshly created (still empty) row
            // for data and then unconditionally forced the flag to true, so the scan had
            // no effect. The row is always filled and appended, exactly as before.
            AddValueToRow(row, detailPage);
            table.Rows.Add(row);
        }
예제 #2
0
 /// <summary>
 /// Invokes <c>DetailPageParseDone</c> for the given page when a handler is attached.
 /// </summary>
 /// <param name="detailPage">The detail page passed on to the handler(s).</param>
 private void OnDetailPageParseDone(DetailPage detailPage)
 {
     // Read the delegate once: checking the member and then invoking it again
     // could fail if the last handler is detached between the two reads.
     var handler = DetailPageParseDone;

     if (handler != null)
     {
         handler(detailPage);
     }
 }
예제 #3
0
        /// <summary>
        /// Sniffs this detail page and, when a page query is configured, continues
        /// with the next content page (which in turn continues recursively).
        /// </summary>
        /// <returns>
        /// The result of <c>InlineSniffer()</c> for this page. The outcome of the
        /// following pages does not influence the return value.
        /// </returns>
        public override bool Sniffer()
        {
            bool sniffed = InlineSniffer();

            if (!sniffed)
            {
                return false;
            }

            // Content-page paging starts here; without a page query there is nothing to follow.
            if (string.IsNullOrEmpty(this.DetailPageConfiguration.PageQuery))
            {
                return sniffed;
            }

            // Walk up the parent chain to the DetailPage whose parent is a ListPage.
            DetailPage rootDetailPage = this;
            ListPage   listPage       = rootDetailPage.Parent as ListPage;

            while (listPage == null)
            {
                rootDetailPage = rootDetailPage.Parent as DetailPage;

                if (rootDetailPage == null)
                {
                    // No ListPage ancestor exists. Previously this dereferenced null and
                    // threw a NullReferenceException; the next page cannot be built, so
                    // report the result of this page only.
                    return sniffed;
                }

                listPage = rootDetailPage.Parent as ListPage;
            }

            DetailPage detailPage = new DetailPage(rootDetailPage, (DetailPageConfiguration)listPage.ListPageConfiguration.SubPageConfiguration);

            detailPage.PageName = this.PageName;
            detailPage.PageUrl  = this.PageUrl;

            int pageIndex = this.PageIndex + this.DetailPageConfiguration.PageIndexStep;

            ReplacePageIndex(detailPage, pageIndex);

            // Identical content means the last page has already been reached.
            if (detailPage.PageBody == this.PageBody)
            {
                return sniffed;
            }

            // Otherwise compare the result of the end-page detection regex on both pages.
            if (!string.IsNullOrEmpty(this.Configuration.EndPageDetermineRegex))
            {
                // Named form of the flag set previously written as (RegexOptions)25.
                const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.Singleline;

                Match currentMatch = Regex.Match(this.PageBody, this.Configuration.EndPageDetermineRegex, options);
                Match nextMatch    = Regex.Match(detailPage.PageBody, detailPage.Configuration.EndPageDetermineRegex, options);

                if (currentMatch.Value == nextMatch.Value)
                {
                    return sniffed;
                }
            }

            detailPage.Sniffer();

            return sniffed;
        }
예제 #4
0
        /// <summary>
        /// Copies every result item of <paramref name="page"/> into <paramref name="row"/>,
        /// using the item key as the column name.
        /// </summary>
        /// <param name="row">The row that receives the values.</param>
        /// <param name="page">The detail page whose <c>ResultItems</c> are copied.</param>
        void AddValueToRow(DataRow row, DetailPage page)
        {
            foreach (string columnName in page.ResultItems.Keys)
            {
                object itemValue = page.ResultItems[columnName];
                row[columnName] = itemValue;
            }
        }
예제 #5
0
        /// <summary>
        /// Sniffs a list page and every detail page it links to, then moves on to the
        /// following list pages until one of the stop conditions is met.
        /// </summary>
        /// <param name="listPage">The first list page to process.</param>
        public void ParseDetailPage(ListPage listPage)
        {
            ListPage backListPage        = null;
            int      pageIndex           = this.SnifferContext.GetStartPageIndex(listPage);
            int      snifferContextCount = SnifferContext.GetSnifferPageCount(listPage);
            int      donePageCount       = 0;

            // The loop is left only through the break below.
            while (true)
            {
                listPage.Sniffer();

                // Stop when the page failed, when the requested number of pages has been
                // processed, or when the page body equals the previous page's body.
                if (!listPage.Succeed || donePageCount == snifferContextCount || (backListPage != null && backListPage.PageBody == listPage.PageBody))
                {
                    break;
                }

                OnPageIndexChange(listPage.PageUrl);

                foreach (UrlItem urlItem in listPage.SubPageUrlResults)
                {
                    DetailPageConfiguration detailPageConf = (DetailPageConfiguration)listPage.ListPageConfiguration.SubPageConfiguration;
                    DetailPage detailPage = new DetailPage(listPage, detailPageConf);
                    detailPage.PageIndex = detailPageConf.PageStartIndex;
                    detailPage.PageName  = urlItem.Title;
                    detailPage.PageUrl   = urlItem.Url;
                    detailPage.Sniffer();
                    OnDetailPageParseDone(detailPage);
                }

                // Advance by the full configured step. The previous "step - 1" left the
                // index unchanged for a step of 1, so the same page was requested again
                // and the loop ended on the duplicate-body check.
                pageIndex = pageIndex + listPage.ListPageConfiguration.PageIndexStep;
                donePageCount++;

                ListPage newListPage = new ListPage(listPage.Parent, listPage.ListPageConfiguration);
                newListPage.PageName = listPage.PageName;
                newListPage.PageUrl  = listPage.PageUrl;

                if (listPage.ListPageConfiguration.PageMethod == PageMethod.Get)
                {
                    ReplacePageIndex(newListPage, pageIndex);
                }
                else
                {
                    // Non-GET paging: the page index is formatted into the page query instead.
                    newListPage.PageQuery = string.Format(newListPage.PageQuery, pageIndex);
                }

                backListPage = listPage;
                listPage     = newListPage;
            }
        }
예제 #6
0
        /// <summary>
        /// Loads a list page via <c>GetListPage</c>, picks one of its sub-page URLs
        /// and returns the sniffed detail page for that URL.
        /// </summary>
        /// <param name="fileName">Passed through to <c>GetListPage</c>.</param>
        /// <param name="rootPageName">Passed through to <c>GetListPage</c>.</param>
        /// <param name="firstIndex">Passed through to <c>GetListPage</c>.</param>
        /// <param name="pageIndex">Passed through to <c>GetListPage</c>.</param>
        /// <param name="urlIndex">Index into the list page's <c>SubPageUrlResults</c>.</param>
        /// <returns>The detail page after <c>Sniffer()</c> has been called on it.</returns>
        public static DetailPage GetDetailPage(string fileName, string rootPageName, int firstIndex, int pageIndex, int urlIndex)
        {
            ListPage owner = GetListPage(fileName, rootPageName, firstIndex, pageIndex);
            UrlItem  link  = owner.SubPageUrlResults[urlIndex];

            DetailPageConfiguration conf = (DetailPageConfiguration)owner.ListPageConfiguration.SubPageConfiguration;

            DetailPage result = new DetailPage(owner, conf);
            result.PageIndex = conf.PageStartIndex;
            result.PageName  = link.Title;
            result.PageUrl   = link.Url;

            result.Sniffer();
            return result;
        }
예제 #7
0
        /// <summary>
        /// Stores <paramref name="pageIndex"/> on the page and rewrites the page URL:
        /// the part matched by the configured <c>PageQuery</c> pattern is replaced with
        /// <c>ReplacePageQuery</c> formatted with the (optionally formatted) index.
        /// </summary>
        /// <param name="detailPage">The page whose index and URL are updated.</param>
        /// <param name="pageIndex">The new page index.</param>
        private void ReplacePageIndex(DetailPage detailPage, int pageIndex)
        {
            detailPage.PageIndex = pageIndex;

            // Use the configured numeric format when there is one, the default format otherwise.
            string indexText = string.IsNullOrEmpty(detailPage.DetailPageConfiguration.PageIndexFormat)
                ? pageIndex.ToString()
                : pageIndex.ToString(detailPage.DetailPageConfiguration.PageIndexFormat);

            string replacement = string.Format(detailPage.DetailPageConfiguration.ReplacePageQuery, indexText);

            detailPage.PageUrl = Regex.Replace(detailPage.PageUrl, detailPage.DetailPageConfiguration.PageQuery, replacement);
        }
예제 #8
0
        /// <summary>
        /// Sniffs the current page in three steps: extracts every configured item from
        /// the page body into <c>ResultItems</c>, merges those results into the parent
        /// detail page (when the parent is a <c>DetailPage</c>), and sniffs the
        /// configured sub pages.
        /// </summary>
        /// <returns>
        /// <c>false</c> when <c>Succeed</c> is false (nothing is extracted);
        /// otherwise <c>true</c>. <c>_done</c> is set to true in both cases.
        /// </returns>
        private bool InlineSniffer()
        {
            if (!this.Succeed)
            {
                _done = true;
                return false;
            }

            // Named form of the flag set previously written as (RegexOptions)25.
            const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.Singleline;

            // Extract the configured items.
            foreach (SnifferItem item in this.DetailPageConfiguration.SnifferItems)
            {
                // Items that are not multi-page are only collected on the start page.
                if (this.PageIndex > this.DetailPageConfiguration.PageStartIndex && !item.MutiPage)
                {
                    continue;
                }

                if (item.RegexString == null)
                {
                    this.ResultItems.Add(item.ItemName, item.DefaultValue);
                    continue;
                }

                MatchCollection matchs = Regex.Matches(this.PageBody, item.RegexString.Expression, options);

                if (matchs.Count == 0)
                {
                    this.ResultItems.Add(item.ItemName, item.DefaultValue);
                    continue;
                }

                StringBuilder sb = new StringBuilder();

                foreach (Match match in matchs)
                {
                    // Separate consecutive values; nothing is inserted while the buffer is still empty.
                    if (sb.Length > 0)
                    {
                        sb.Append(item.Separator);
                    }

                    string value = match.Groups[item.RegexString.ValueGroupIndex].Value;

                    if (string.IsNullOrEmpty(value))
                    {
                        value = item.DefaultValue;
                    }
                    else
                    {
                        // Clean up rubbish.
                        ClearRubbish(item, ref value);

                        // Convert URLs inside the content to absolute paths.
                        if (item.UrlToAbs)
                        {
                            UrlToAbs(item, ref value);
                        }

                        // The value itself is a URL: convert it to an absolute path.
                        if (item.IsUrl)
                        {
                            value = FileUtil.GetAbsUrl(value, this.SubPageBaseUrl);
                        }

                        // Save images.
                        if (item.SaveImage)
                        {
                            SaveImages(item, ref value);
                        }

                        // Remove A elements.
                        if (item.ClearAElement)
                        {
                            ClearAElement(item, ref value);
                        }

                        // Remove HTML markup.
                        if (item.IsClearHTML)
                        {
                            value = ClearHTML(value);
                        }
                    }

                    sb.Append(value);
                }

                this.ResultItems.Add(item.ItemName, sb.ToString().Trim());
            }

            // Merge the fields of this page into the parent detail page.
            DetailPage parentPage = this.Parent as DetailPage;

            if (parentPage != null)
            {
                foreach (string key in this.ResultItems.Keys)
                {
                    string value = (string)this.ResultItems[key];

                    // The lookup must be made on the parent. The previous code tested this
                    // page's own collection, which is always true for its own keys, so new
                    // keys were never added to the parent.
                    if (!parentPage.ResultItems.Contains(key))
                    {
                        parentPage.ResultItems.Add(key, value);
                        continue;
                    }

                    string parentPageValue = (string)parentPage.ResultItems[key];

                    if (parentPageValue == value)
                    {
                        continue;
                    }

                    // Find the item definition that supplies the multi-page separator:
                    // first in the parent's configuration, then in this page's own one.
                    SnifferItem definition = null;

                    foreach (SnifferItem candidate in parentPage.DetailPageConfiguration.SnifferItems)
                    {
                        if (candidate.ItemName == key)
                        {
                            definition = candidate;
                            break;
                        }
                    }

                    if (definition == null)
                    {
                        foreach (SnifferItem candidate in this.DetailPageConfiguration.SnifferItems)
                        {
                            if (candidate.ItemName == key)
                            {
                                definition = candidate;
                                break;
                            }
                        }
                    }

                    // Without a definition the values are joined directly instead of
                    // failing on a null reference.
                    string separator = definition != null ? definition.MutiPageSeparator : string.Empty;

                    parentPage.ResultItems[key] = parentPageValue + separator + value;
                }
            }

            // Sniff the sub pages.
            foreach (DetailPageConfiguration conf in this.DetailPageConfiguration.SubPageConfigurations)
            {
                MatchCollection matches = Regex.Matches(this.PageBody, conf.SnifferSubPageUrlItem.Expression, options);

                foreach (Match match in matches)
                {
                    if (string.IsNullOrEmpty(match.Value))
                    {
                        continue;
                    }

                    DetailPage detailPage = new DetailPage(this, conf);
                    detailPage.PageName = match.Groups[conf.SnifferSubPageUrlItem.TitleGroupIndex].Value;

                    string url = match.Groups[conf.SnifferSubPageUrlItem.UrlGroupIndex].Value;

                    if (!string.IsNullOrEmpty(conf.SnifferSubPageUrlItem.UrlFormat))
                    {
                        url = string.Format(conf.SnifferSubPageUrlItem.UrlFormat, url);
                    }

                    detailPage.PageUrl = FileUtil.GetAbsUrl(url, this.SubPageBaseUrl);

                    detailPage.Sniffer();

                    this.SubPages.Add(detailPage);
                }
            }

            _done = true;
            return true;
        }