Example #1
0
 /// <summary>
 /// Invokes <c>FirstPageParseDone</c> with the given page when a handler is attached.
 /// </summary>
 /// <param name="firstPage">Page handed to the handler.</param>
 private void OnFirstPageParseDone(ListPage firstPage)
 {
     // No handler attached: nothing to notify.
     if (FirstPageParseDone == null)
     {
         return;
     }

     FirstPageParseDone(firstPage);
 }
Example #2
0
 /// <summary>
 /// Invokes <c>CategoryParseDone</c> with the given page when a handler is attached.
 /// </summary>
 /// <param name="listPage">Page handed to the handler.</param>
 private void OnCategoryParseDone(ListPage listPage)
 {
     // No handler attached: nothing to notify.
     if (CategoryParseDone == null)
     {
         return;
     }

     CategoryParseDone(listPage);
 }
Example #3
0
 /// <summary>
 /// Invokes <c>ListPageParseDone</c> with the given page when a handler is attached.
 /// </summary>
 /// <param name="listPage">Page handed to the handler.</param>
 private void OnListPageParseDone(ListPage listPage)
 {
     // No handler attached: nothing to notify.
     if (ListPageParseDone == null)
     {
         return;
     }

     ListPageParseDone(listPage);
 }
Example #4
0
        /// <summary>
        /// Sniffs this detail page and, when a page query is configured, continues with the next content page.
        /// </summary>
        /// <remarks>
        /// The next page is built from the nearest <see cref="ListPage"/> ancestor's sub-page configuration.
        /// Paging stops when the next page has the same body as this one, or when
        /// <c>EndPageDetermineRegex</c> is set and yields the same match value on both pages.
        /// </remarks>
        /// <returns>
        /// The result of <c>InlineSniffer()</c> for this page. The outcome of sniffing any
        /// following page does not change the returned value.
        /// </returns>
        public override bool Sniffer()
        {
            bool sniffed = InlineSniffer();

            if (!sniffed)
            {
                return false;
            }

            // Content-page paging starts here; without a page query there is nothing to follow.
            if (string.IsNullOrEmpty(this.DetailPageConfiguration.PageQuery))
            {
                return sniffed;
            }

            // Walk up the parent chain until the owning list page is found.
            DetailPage rootDetailPage = this;
            ListPage   listPage       = rootDetailPage.Parent as ListPage;

            while (listPage == null)
            {
                rootDetailPage = rootDetailPage.Parent as DetailPage;

                if (rootDetailPage == null)
                {
                    // The chain ended without reaching a ListPage, so the next page cannot
                    // be configured. Previously this dereferenced null and threw.
                    return sniffed;
                }

                listPage = rootDetailPage.Parent as ListPage;
            }

            DetailPage detailPage = new DetailPage(rootDetailPage, (DetailPageConfiguration)listPage.ListPageConfiguration.SubPageConfiguration);

            detailPage.PageName = this.PageName;
            detailPage.PageUrl  = this.PageUrl;

            int pageIndex = this.PageIndex + this.DetailPageConfiguration.PageIndexStep;

            ReplacePageIndex(detailPage, pageIndex);

            // Identical content means the last page has already been reached.
            if (detailPage.PageBody == this.PageBody)
            {
                return sniffed;
            }

            // Otherwise compare what the end-page regex captures on both pages.
            if (!string.IsNullOrEmpty(this.Configuration.EndPageDetermineRegex))
            {
                // Same flags the numeric value 25 encodes (1 | 8 | 16).
                RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.Singleline;

                Match match1 = Regex.Match(this.PageBody, this.Configuration.EndPageDetermineRegex, options);
                Match match2 = Regex.Match(detailPage.PageBody, detailPage.Configuration.EndPageDetermineRegex, options);
                if (match1.Value == match2.Value)
                {
                    return sniffed;
                }
            }

            detailPage.Sniffer();

            return sniffed;
        }
Example #5
0
        /// <summary>
        /// Sniffs a list page and every detail page it links to, then repeats with the following list pages.
        /// </summary>
        /// <remarks>
        /// The loop ends when a list page does not succeed, when the number of processed pages
        /// equals the count reported by <c>GetSnifferPageCount</c>, or when a list page has the
        /// same body as the one before it.
        /// </remarks>
        /// <param name="listPage">List page to start from.</param>
        public void ParseDetailPage(ListPage listPage)
        {
            ListPage previousPage = null;
            int      pageIndex    = this.SnifferContext.GetStartPageIndex(listPage);
            int      pageLimit    = SnifferContext.GetSnifferPageCount(listPage);
            int      pagesDone    = 0;

            while (true)
            {
                listPage.Sniffer();

                // Any of these means the category has no more data to collect.
                if (!listPage.Succeed)
                {
                    break;
                }

                if (pagesDone == pageLimit)
                {
                    break;
                }

                if (previousPage != null && previousPage.PageBody == listPage.PageBody)
                {
                    break;
                }

                OnPageIndexChange(listPage.PageUrl);

                foreach (UrlItem link in listPage.SubPageUrlResults)
                {
                    DetailPageConfiguration detailConf = (DetailPageConfiguration)listPage.ListPageConfiguration.SubPageConfiguration;
                    DetailPage child = new DetailPage(listPage, detailConf);
                    child.PageIndex = detailConf.PageStartIndex;
                    child.PageName  = link.Title;
                    child.PageUrl   = link.Url;
                    child.Sniffer();
                    OnDetailPageParseDone(child);
                }

                // NOTE(review): the index advances by PageIndexStep - 1, not PageIndexStep — confirm this is intended.
                pageIndex += listPage.ListPageConfiguration.PageIndexStep - 1;
                pagesDone++;

                // Build the next list page from the same parent and configuration.
                ListPage nextPage = new ListPage(listPage.Parent, listPage.ListPageConfiguration);
                nextPage.PageName = listPage.PageName;
                nextPage.PageUrl  = listPage.PageUrl;

                if (listPage.ListPageConfiguration.PageMethod != PageMethod.Get)
                {
                    nextPage.PageQuery = string.Format(nextPage.PageQuery, pageIndex);
                }
                else
                {
                    ReplacePageIndex(nextPage, pageIndex);
                }

                previousPage = listPage;
                listPage     = nextPage;
            }
        }
Example #6
0
        /// <summary>
        /// Takes first pages from <c>SnifferContext.FirstPages</c> one at a time and processes each,
        /// until the list is empty or an exception occurs.
        /// </summary>
        /// <remarks>
        /// This is a loop rather than a self-call: the recursive form added one stack frame per
        /// first page, so the stack depth grew with the size of the queue. As before, the first
        /// exception is logged and ends processing for this worker.
        /// </remarks>
        private void _start()
        {
            try
            {
                while (true)
                {
                    ListPage firstPage;

                    // Dequeue under the lock so that two workers never take the same page.
                    lock (SnifferContext.FirstPages)
                    {
                        if (SnifferContext.FirstPages.Count == 0)
                        {
                            return;
                        }

                        firstPage = SnifferContext.FirstPages[0];
                        SnifferContext.FirstPages.Remove(firstPage);
                    }

                    // When starting beyond page 1, point the first page at the configured start index.
                    if (firstPage.ListPageConfiguration.SubPageConfiguration.PageType == PageType.DetailPage && this.SnifferContext.GetStartPageIndex(firstPage) > 1)
                    {
                        int startPageIndex = this.SnifferContext.GetStartPageIndex(firstPage);

                        if (firstPage.ListPageConfiguration.PageMethod == PageMethod.Get)
                        {
                            ReplacePageIndex(firstPage, startPageIndex);
                        }
                        else
                        {
                            firstPage.PageQuery = string.Format(firstPage.PageQuery, startPageIndex);
                        }
                    }

                    firstPage.Sniffer();

                    if (firstPage.SubPageUrlResults.Count > 0 && firstPage.ListPageConfiguration.SubPageConfiguration.PageType == PageType.DetailPage)
                    {
                        ParseDetailPage(firstPage);
                        OnListPageParseDone(firstPage);
                    }
                    else
                    {
                        ParseListPage(firstPage);
                    }

                    OnFirstPageParseDone(firstPage);
                }
            }
            catch (System.Exception e)
            {
                InfoSniffer.LogManager.WriteLog(string.Format("<error><thread>{0}</thread><message>{1}</message></error>", this.ThreadIndex, e.Message));
            }
        }
Example #7
0
        /// <summary>
        /// Loads a list page via <c>GetListPage</c> and sniffs one of the detail pages it links to.
        /// </summary>
        /// <param name="fileName">Passed through to <c>GetListPage</c>.</param>
        /// <param name="rootPageName">Passed through to <c>GetListPage</c>.</param>
        /// <param name="firstIndex">Passed through to <c>GetListPage</c>.</param>
        /// <param name="pageIndex">Passed through to <c>GetListPage</c>.</param>
        /// <param name="urlIndex">Index into the list page's <c>SubPageUrlResults</c>.</param>
        /// <returns>The detail page, after <c>Sniffer()</c> has been called on it.</returns>
        public static DetailPage GetDetailPage(string fileName, string rootPageName, int firstIndex, int pageIndex, int urlIndex)
        {
            ListPage owner = GetListPage(fileName, rootPageName, firstIndex, pageIndex);
            UrlItem  link  = owner.SubPageUrlResults[urlIndex];

            DetailPageConfiguration conf = (DetailPageConfiguration)owner.ListPageConfiguration.SubPageConfiguration;

            // Same setup a crawl performs for each link of a list page.
            DetailPage result = new DetailPage(owner, conf);
            result.PageIndex = conf.PageStartIndex;
            result.PageName  = link.Title;
            result.PageUrl   = link.Url;
            result.Sniffer();

            return result;
        }
Example #8
0
        /// <summary>
        /// Rewrites the page-index part of a list page's URL so that it points at <paramref name="pageIndex"/>.
        /// </summary>
        /// <param name="listPage">Page whose <c>PageUrl</c> is updated in place.</param>
        /// <param name="pageIndex">Index to insert; formatted with <c>PageIndexFormat</c> when one is configured.</param>
        private void ReplacePageIndex(ListPage listPage, int pageIndex)
        {
            string indexFormat = listPage.ListPageConfiguration.PageIndexFormat;

            string indexText = string.IsNullOrEmpty(indexFormat)
                ? pageIndex.ToString()
                : pageIndex.ToString(indexFormat);

            // ReplacePageQuery is a format template; PageQuery is the regex locating the part to replace.
            string replacement = string.Format(listPage.ListPageConfiguration.ReplacePageQuery, indexText);

            listPage.PageUrl = Regex.Replace(listPage.PageUrl, listPage.ListPageConfiguration.PageQuery, replacement);
        }
Example #9
0
        /// <summary>
        /// Handles completion of one list page: writes <c>Data</c> to XML, passes it to the root
        /// configuration's plugin when there is one, clears <c>Data</c>, and logs the completion.
        /// </summary>
        /// <param name="listPage">The list page that has finished.</param>
        void SnifferThread_ListPageParseDone(ListPage listPage)
        {
            string saveDir = listPage.SavePath;

            if (!Directory.Exists(saveDir))
            {
                Directory.CreateDirectory(saveDir);
            }

            string xmlPath = listPage.SavePathAndFileName;

            // Export to an XML file.
            Data.WriteXml(xmlPath);

            // Find the root configuration among the ancestors; it may carry a plugin.
            RootPageConfiguration root = null;

            for (PageConfiguration conf = listPage.Configuration.Parent; conf != null; conf = conf.Parent)
            {
                root = conf as RootPageConfiguration;
                if (root != null)
                {
                    break;
                }
            }

            if (root != null && root.Plugin != null)
            {
                root.Plugin.Receive(Data, xmlPath);
            }

            // Clear the collected data.
            if (Data.Tables.Count > 0)
            {
                Data.Tables.Clear();
            }

            string escapedUrl = listPage.PageUrl.Replace("&", "&amp;");
            string doneTime   = DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss.fff");

            InfoSniffer.LogManager.WriteLog(string.Format("<donepage><thread>{0}</thread><page>{1}</page><donetime>{2}</donetime></donepage>", this.ThreadIndex, escapedUrl, doneTime));
        }
Example #10
0
        /// <summary>
        /// Opens the configuration file for <paramref name="fileName"/> and builds the first-level
        /// list pages of the named root page.
        /// </summary>
        /// <remarks>
        /// When the root configuration has <c>IsSniffer</c> set, the root page is sniffed and one
        /// page is created per resulting URL. Otherwise one page is created per entry in
        /// <c>SubPageConfigurations</c>.
        /// </remarks>
        /// <param name="fileName">Configuration file name without the ".xml" extension; appended to <c>AppDataPath</c>.</param>
        /// <param name="rootPageName">Name of the root page configuration to load.</param>
        /// <returns>
        /// The first-level pages (possibly empty), or <c>null</c> when no root page configuration
        /// named <paramref name="rootPageName"/> is found.
        /// </returns>
        public static List <ListPage> GetAllFirstPages(string fileName, string rootPageName)
        {
            // Plain concatenation: AppDataPath must not be part of a format string, because
            // braces in the path would be read as format placeholders.
            SnifferConfig.OpenSnfFile(AppDataPath + fileName + ".xml");
            RootPageConfiguration rootPageConf = SnifferConfig.GetRootPageConfiguration(rootPageName);

            if (rootPageConf == null)
            {
                return(null);
            }

            ListPage        rootPage      = new ListPage((ListPageConfiguration)rootPageConf);
            List <ListPage> allFirstPages = new List <ListPage>();

            if (rootPageConf.IsSniffer)
            {
                rootPage.Sniffer();

                // A failed or empty sniff produces no URLs, so the result list stays empty.
                foreach (UrlItem urlItem in rootPage.SubPageUrlResults)
                {
                    ListPage page = new ListPage(rootPage, (ListPageConfiguration)rootPage.ListPageConfiguration.SubPageConfiguration);
                    page.PageName = urlItem.Title;
                    page.PageUrl  = urlItem.Url;
                    allFirstPages.Add(page);
                }
            }
            else
            {
                foreach (ListPageConfiguration firstPageConfi in rootPageConf.SubPageConfigurations)
                {
                    allFirstPages.Add(new ListPage(rootPage, firstPageConfi));
                }
            }

            return(allFirstPages);
        }
Example #11
0
        /// <summary>
        /// Picks one first-level list page, points it at the requested page index when needed, and sniffs it.
        /// </summary>
        /// <param name="fileName">Passed through to <c>GetAllFirstPages</c>.</param>
        /// <param name="rootPageName">Passed through to <c>GetAllFirstPages</c>.</param>
        /// <param name="firstIndex">Index into the list returned by <c>GetAllFirstPages</c>.</param>
        /// <param name="pageIndex">Page index to load; values of 1 or less leave the page as built.</param>
        /// <returns>The selected list page, after <c>Sniffer()</c> has been called on it.</returns>
        public static ListPage GetListPage(string fileName, string rootPageName, int firstIndex, int pageIndex)
        {
            ListPage target = GetAllFirstPages(fileName, rootPageName)[firstIndex];

            // The index only needs replacing for pages beyond the first whose sub-pages are detail pages.
            bool needsPaging = target.ListPageConfiguration.SubPageConfiguration.PageType == PageType.DetailPage && pageIndex > 1;

            if (needsPaging)
            {
                if (target.ListPageConfiguration.PageMethod != PageMethod.Get)
                {
                    target.PageQuery = string.Format(target.PageQuery, pageIndex);
                }
                else
                {
                    ReplacePageIndex(target, pageIndex);
                }
            }

            target.Sniffer();

            return target;
        }
Example #12
0
        /// <summary>
        /// Handler for the completion of one root (first-level) page.
        /// </summary>
        /// <remarks>
        /// The whole body is commented out, so this handler currently does nothing.
        /// The disabled code is kept below for reference.
        /// </remarks>
        /// <param name="firstPage">The first-level page that has finished.</param>
        void SnifferThread_FirstPageParseDone(ListPage firstPage)
        {
            ////PagePath = firstPage.Parent.PageName + "\\" + firstPage.PageName + "\\" + PagePath;
            //PagePath = firstPage.Parent.PageName.Trim() + "\\" + PagePath;
            //int int1 = PagePath.LastIndexOf("\\");

            //string dir = string.Format("{0}\\Data\\{1}\\{2}", Application.StartupPath, DateTime.Now.ToShortDateString(), PagePath.Substring(0, int1));
            //string fileName = PagePath.Substring(int1 + 1);

            //if (!Directory.Exists(dir))
            //    Directory.CreateDirectory(dir);

            //string dirAndFileName = string.Format("{0}\\{1}.xml", dir, fileName);

            ////Export to an XML file
            //Data.WriteXml(dirAndFileName);

            ////If the export target is Excel format, convert the file
            ////if (SnifferContext.DocumentFormat == DocumentFormat.Xls)
            ////{
            ////    FileUtil.XmlFileToExcelFile(dirAndFileName);

            ////    //Delete the XML file
            ////    if (File.Exists(dirAndFileName))
            ////        File.Delete(dirAndFileName);
            ////}

            ////Reset PagePath
            //PagePath = string.Empty;

            ////Clear the data
            //if (Data.Tables.Count > 0)
            //    Data.Tables.Clear();

            ////Clear the displayed results
            //SnifferContext.ClearState(this.ThreadIndex);
        }
Example #13
0
        /// <summary>
        /// Sniffs a list page and descends into each of the sub list pages it links to.
        /// </summary>
        /// <remarks>
        /// For each linked sub page that yields URLs: if its sub-pages are detail pages, they are
        /// collected with <c>ParseDetailPage</c> and <c>OnListPageParseDone</c> is called;
        /// otherwise this method recurses into it. If a sub page yields no URLs and the current
        /// page's configuration has <c>For</c> set (and the current page has a body), the link is
        /// retried using the current page's own configuration.
        /// </remarks>
        /// <param name="listPage">List page to sniff and descend from.</param>
        public void ParseListPage(ListPage listPage)
        {
            listPage.Sniffer();

            // An empty result set simply skips the loop; no separate count check is needed.
            foreach (UrlItem urlItem in listPage.SubPageUrlResults)
            {
                ListPage subListPage = new ListPage(listPage, (ListPageConfiguration)listPage.ListPageConfiguration.SubPageConfiguration);
                subListPage.PageName = urlItem.Title;
                subListPage.PageUrl  = urlItem.Url;
                subListPage.Sniffer();

                if (subListPage.SubPageUrlResults.Count > 0)
                {
                    if (subListPage.ListPageConfiguration.SubPageConfiguration.PageType == PageType.DetailPage)
                    {
                        ParseDetailPage(subListPage);
                        OnListPageParseDone(subListPage);
                    }
                    else
                    {
                        ParseListPage(subListPage);
                    }
                }
                else if (listPage.ListPageConfiguration.For && !string.IsNullOrEmpty(listPage.PageBody))
                {
                    // Retry the same link with the current page's configuration instead of the sub-page one.
                    subListPage          = new ListPage(listPage, (ListPageConfiguration)listPage.ListPageConfiguration);
                    subListPage.PageName = urlItem.Title;
                    subListPage.PageUrl  = urlItem.Url;

                    ParseListPage(subListPage);
                }
            }
        }
Example #14
0
        /// <summary>
        /// Downloads the images referenced in <paramref name="value"/> and rewrites each reference
        /// to the location the image was saved to.
        /// </summary>
        /// <remarks>
        /// When <c>item.IsUrl</c> is set, the whole value is treated as one image URL; otherwise
        /// image URLs are taken from <c>src</c> attributes of <c>img</c> tags. Failures for a
        /// single image are logged and do not stop the remaining images.
        /// </remarks>
        /// <param name="item">Sniffer item supplying <c>IsUrl</c>, <c>SaveImagesPath</c> and <c>ImageUrlPath</c>.</param>
        /// <param name="value">Text to scan; updated in place with the rewritten image references.</param>
        public void SaveImages(SnifferItem item, ref string value)
        {
            value = value.Replace("wallpaper:", "http:");

            // Same flags the numeric value 25 encodes (1 | 8 | 16).
            RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.Singleline;

            MatchCollection imgMatchs;

            if (item.IsUrl)
            {
                imgMatchs = Regex.Matches(value, "^.*$", options);
            }
            else
            {
                imgMatchs = Regex.Matches(value, "(?<=<img[^>]*?src=[\"']?)(\\.|/|http)[^\"' >]*", options);
            }

            // The owning list page is either the parent or the grandparent.
            ListPage parentPage = this.Parent as ListPage;

            if (parentPage == null)
            {
                parentPage = this.Parent.Parent as ListPage;
            }

            string saveImagesPath = string.Empty;

            // Formats as year, a backslash, then month.
            string dtPath = DateTime.Now.ToString("yyyy\\\\MM");

            if (!string.IsNullOrEmpty(item.SaveImagesPath))
            {
                saveImagesPath = string.Format(item.SaveImagesPath, dtPath);
            }
            else
            {
                // Default: a folder named after the list page's save file, minus its extension.
                saveImagesPath = parentPage.SavePathAndFileName.Substring(0, parentPage.SavePathAndFileName.LastIndexOf(".")) + "\\";
            }

            if (!saveImagesPath.EndsWith("\\"))
            {
                saveImagesPath += "\\";
            }

            if (!Directory.Exists(saveImagesPath))
            {
                Directory.CreateDirectory(saveImagesPath);
            }

            foreach (Match match in imgMatchs)
            {
                // LastIndexOf returns -1 when there is no '/', so +1 gives 0 and the whole value
                // is kept; no separate "not found" branch is needed.
                string fileName = match.Value.Substring(match.Value.LastIndexOf("/") + 1);

                // Drop any query string.
                if (fileName.Contains("?"))
                {
                    fileName = fileName.Substring(0, fileName.IndexOf("?"));
                }

                fileName = HttpUtility.UrlDecode(fileName);
                try
                {
                    // The name comes from a remote page and may decode to something containing
                    // directory separators. Keep only the last segment so the file is written
                    // inside saveImagesPath.
                    fileName = Path.GetFileName(fileName);

                    string absUrl = FileUtil.GetAbsUrl(match.Value, this.SubPageBaseUrl);

                    Stream stream = FileUtil.GetPageStream(absUrl, this.SubPageBaseUrl);
                    if (stream != null)
                    {
                        if (this.Upload)
                        {
                            // The stream is handed to UploadFile on another thread, so it is not disposed here.
                            Thread thread = new Thread(new ParameterizedThreadStart(UploadFile));
                            thread.Start(new object[] { stream, saveImagesPath, fileName, "image/jpeg" });
                        }
                        else
                        {
                            // Release the download stream once the file is written, even if saving throws.
                            using (stream)
                            {
                                FileUtil.StreamSaveToFile(stream, saveImagesPath + fileName);
                            }
                        }
                    }

                    // Point the reference at the public image URL when configured, else at the local path.
                    string newReference;
                    if (!string.IsNullOrEmpty(item.ImageUrlPath))
                    {
                        string imageUrlPath = string.Format(item.ImageUrlPath, dtPath.Replace("\\", "/"));
                        if (!imageUrlPath.EndsWith("/"))
                        {
                            imageUrlPath += "/";
                        }
                        newReference = imageUrlPath + fileName;
                    }
                    else
                    {
                        newReference = saveImagesPath + fileName;
                    }

                    value = value.Replace(match.Value, newReference);
                }
                catch (System.Exception e)
                {
                    // Best effort: log the failure and continue with the next image.
                    LogManager.WriteLog(string.Format("<saveimgerr><img>{0}</img><error>{1}</error></saveimgerr>", match.Value, e.Message));
                }
            }
        }
Example #15
0
 /// <summary>
 /// Returns <c>SnifferPageCount</c>. The <paramref name="listPage"/> argument is not used.
 /// </summary>
 /// <param name="listPage">Ignored by this implementation.</param>
 /// <returns>The current value of <c>SnifferPageCount</c>.</returns>
 public int GetSnifferPageCount(ListPage listPage)
 {
     int pageCount = this.SnifferPageCount;
     return pageCount;
 }
Example #16
0
 /// <summary>
 /// Returns <c>StartPageIndex</c>. The <paramref name="listPage"/> argument is not used.
 /// </summary>
 /// <param name="listPage">Ignored by this implementation.</param>
 /// <returns>The current value of <c>StartPageIndex</c>.</returns>
 public int GetStartPageIndex(ListPage listPage)
 {
     int startIndex = this.StartPageIndex;
     return startIndex;
 }
Example #17
0
 /// <summary>
 /// Handler for the completion of a category list search. The body is empty, so nothing
 /// happens here yet; the original note says results are meant to be saved at this point.
 /// </summary>
 /// <param name="listPage">The list page that has finished.</param>
 void SnifferThread_CategoryParseDone(ListPage listPage)
 {
 }