Example #1
0
 /// <summary>
 /// Strips unwanted markup ("rubbish" such as JavaScript) from a captured value.
 /// </summary>
 /// <param name="item">Item configuration; when its <c>ClearRegexString</c> is non-empty it is applied first.</param>
 /// <param name="value">Text to clean; overwritten with the cleaned text.</param>
 public void ClearRubbish(SnifferItem item, ref string value)
 {
     // Named form of the numeric option value 25: IgnoreCase (1) | Compiled (8) | Singleline (16).
     const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.Singleline;

     // Item-specific clean-up pattern comes before the built-in passes.
     string customPattern = item.ClearRegexString;
     if (!string.IsNullOrEmpty(customPattern))
     {
         value = Regex.Replace(value, customPattern, string.Empty, options);
     }

     // Built-in passes as { pattern, replacement }, applied strictly in this order.
     string[][] passes = new string[][]
     {
         // Whole <script> elements.
         new string[] { "<script[^>]*?>(.*?)</script>", string.Empty },
         // onclick="..." attributes (the rest of the element is kept).
         new string[] { "(<[^>]*? )(onclick=['\"].*?['\"])(( [^>]*?)?>.*?</[^>]*?>)", "$1$3" },
         // ="javascript:..." / ="vbscript:..." attribute values.
         new string[] { "(<[^>]*?)=(['\"](java|vb)script:.*?['\"])(( [^>]*?)?>.*?</[^>]*?>)", "$1$4" },
         // HTML comments.
         new string[] { "<!--.*?-->", "" }
     };

     foreach (string[] pass in passes)
     {
         value = Regex.Replace(value, pass[0], pass[1], options);
     }
 }
Example #2
0
        /// <summary>
        /// Removes anchor elements from the value while keeping the content they wrap.
        /// </summary>
        /// <param name="item">Item configuration; not read by this method.</param>
        /// <param name="value">Markup to process; overwritten with the result.</param>
        public void ClearAElement(SnifferItem item, ref string value)
        {
            // Named form of the numeric option value 25: IgnoreCase (1) | Compiled (8) | Singleline (16).
            const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.Singleline;

            // "<a" must be followed by whitespace, so only anchors that carry attributes are matched;
            // each match is replaced by its inner content (group 1).
            const string anchorPattern = "<a\\s[^>]*?>(.*?)</a>";
            const string innerContent = "$1";

            string unwrapped = Regex.Replace(value, anchorPattern, innerContent, options);
            value = unwrapped;
        }
Example #3
0
        /// <summary>
        /// Rewrites link targets inside the value to absolute URLs.
        /// </summary>
        /// <remarks>
        /// The targets are the <c>href</c> value of <c>&lt;a&gt;</c> tags and the <c>src</c> value of
        /// <c>&lt;img&gt;</c> tags that start with <c>.</c>, <c>/</c> or <c>http</c>. Each one is passed to
        /// <c>FileUtil.GetAbsUrl</c> together with <c>SubPageBaseUrl</c> and replaced, at its own position,
        /// by the returned URL. Text outside those attribute values is never modified, so a URL that
        /// happens to be a substring of another URL can no longer corrupt it.
        /// </remarks>
        /// <param name="item">Item configuration; not read by this method.</param>
        /// <param name="value">Markup to process; overwritten with the result. Null or empty is left as is.</param>
        public void UrlToAbs(SnifferItem item, ref string value)
        {
            if (string.IsNullOrEmpty(value))
            {
                return;
            }

            // Named form of the numeric option value 25: IgnoreCase (1) | Compiled (8) | Singleline (16).
            const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.Singleline;
            const string linkPattern = "((?<=<a[^>]*?href=[\"']?)(\\.|/|http)[^\"' >]*)|((?<=<img[^>]*?src=[\"']?)(\\.|/|http)[^\"' >]*)";

            string baseUrl = this.SubPageBaseUrl;

            // Resolve every distinct URL only once, however often it occurs.
            Dictionary<string, string> resolved = new Dictionary<string, string>(StringComparer.Ordinal);

            value = Regex.Replace(
                value,
                linkPattern,
                delegate(Match match)
                {
                    string absUrl;
                    if (!resolved.TryGetValue(match.Value, out absUrl))
                    {
                        absUrl = FileUtil.GetAbsUrl(match.Value, baseUrl);
                        resolved.Add(match.Value, absUrl);
                    }
                    return absUrl;
                },
                options);
        }
Example #4
0
        /// <summary>
        /// Builds the configuration of a detail page (and, recursively, of its sub pages) from XML.
        /// </summary>
        /// <remarks>
        /// When <paramref name="parentConfNode"/> has a <c>Config</c> attribute, the <c>SnifferPage</c> child of the
        /// sibling <c>CommonConfig</c> element with that <c>Name</c> acts as a fallback: page settings, the
        /// <c>SnifferSubPageUrlItem</c> element, <c>SnifferItem</c> elements and nested <c>SnifferPage</c> elements that
        /// the page node does not define itself are taken from it. If the referenced <c>CommonConfig</c> (or its
        /// <c>SnifferPage</c> child) does not exist, only the page's own settings are used.
        /// </remarks>
        /// <param name="parent">Configuration handed to the <c>DetailPageConfiguration</c> constructor.</param>
        /// <param name="parentConfNode">XML node of the parent; may carry the <c>Config</c> attribute.</param>
        /// <param name="pageConfNode">XML node of the page; must have a <c>PageName</c> attribute.</param>
        /// <returns>The populated configuration.</returns>
        private static DetailPageConfiguration CreateDetailPageConfiguration(PageConfiguration parent, XmlNode parentConfNode, XmlNode pageConfNode)
        {
            XmlNode commonConfigNode = null;

            XmlAttribute commonConfigRef = parentConfNode.Attributes["Config"];
            if (commonConfigRef != null && parentConfNode.ParentNode != null)
            {
                XmlNode commonListConfigNode = parentConfNode.ParentNode.SelectSingleNode(string.Format("CommonConfig[@Name='{0}']", commonConfigRef.Value));

                // A dangling reference must not crash the loader; the page then simply has no fallback.
                if (commonListConfigNode != null)
                {
                    commonConfigNode = commonListConfigNode.SelectSingleNode("SnifferPage");
                }
            }

            DetailPageConfiguration detailPageConf = new DetailPageConfiguration(parent);

            detailPageConf.PageName = pageConfNode.Attributes["PageName"].Value;

            // Each setting below is only assigned when the page or the common configuration defines it;
            // otherwise the value set by the DetailPageConfiguration constructor is kept.
            string pageUrl = GetInheritedAttributeValue(pageConfNode, commonConfigNode, "PageUrl");
            if (pageUrl != null)
            {
                detailPageConf.PageUrl = pageUrl;
            }

            string pageType = GetInheritedAttributeValue(pageConfNode, commonConfigNode, "PageType");
            if (pageType != null)
            {
                detailPageConf.PageType = (PageType)Enum.Parse(typeof(PageType), pageType, false);
            }

            string encodingName = GetInheritedAttributeValue(pageConfNode, commonConfigNode, "Encoding");
            if (encodingName != null)
            {
                detailPageConf.Encoding = Encoding.GetEncoding(encodingName);
            }

            // Paging
            string pageQuery = GetInheritedAttributeValue(pageConfNode, commonConfigNode, "PageQuery");
            if (pageQuery != null)
            {
                detailPageConf.PageQuery = pageQuery;
            }

            string replacePageQuery = GetInheritedAttributeValue(pageConfNode, commonConfigNode, "ReplacePageQuery");
            if (replacePageQuery != null)
            {
                detailPageConf.ReplacePageQuery = replacePageQuery;
            }

            string pageIndexFormat = GetInheritedAttributeValue(pageConfNode, commonConfigNode, "PageIndexFormat");
            if (pageIndexFormat != null)
            {
                detailPageConf.PageIndexFormat = pageIndexFormat;
            }

            string pageStartIndex = GetInheritedAttributeValue(pageConfNode, commonConfigNode, "PageStartIndex");
            if (pageStartIndex != null)
            {
                detailPageConf.PageStartIndex = int.Parse(pageStartIndex);
            }

            string pageIndexSeed = GetInheritedAttributeValue(pageConfNode, commonConfigNode, "PageIndexSeed");
            if (pageIndexSeed != null)
            {
                detailPageConf.PageIndexSeed = int.Parse(pageIndexSeed);
            }

            string pageIndexStep = GetInheritedAttributeValue(pageConfNode, commonConfigNode, "PageIndexStep");
            if (pageIndexStep != null)
            {
                detailPageConf.PageIndexStep = int.Parse(pageIndexStep);
            }

            string endPageDetermineRegex = GetInheritedAttributeValue(pageConfNode, commonConfigNode, "EndPageDetermineRegex");
            if (endPageDetermineRegex != null)
            {
                detailPageConf.EndPageDetermineRegex = endPageDetermineRegex;
            }

            string pageMethod = GetInheritedAttributeValue(pageConfNode, commonConfigNode, "PageMethod");
            if (pageMethod != null)
            {
                detailPageConf.PageMethod = (PageMethod)Enum.Parse(typeof(PageMethod), pageMethod);
            }

            // SnifferSubPageUrlItem is a child element, so the fallback must look for the element in the
            // common configuration (the earlier attribute test made the fallback effectively unreachable).
            XmlNode snifferUrlItemNode = pageConfNode.SelectSingleNode("SnifferSubPageUrlItem");
            if (snifferUrlItemNode == null && commonConfigNode != null)
            {
                snifferUrlItemNode = commonConfigNode.SelectSingleNode("SnifferSubPageUrlItem");
            }
            if (snifferUrlItemNode != null)
            {
                detailPageConf.SnifferSubPageUrlItem = CreateSnifferUrlItem(snifferUrlItemNode);
            }

            // Page items first, then common items; an item name that is already present is not added again,
            // so page-level definitions win over common ones.
            AddMissingSnifferItems(detailPageConf, pageConfNode.SelectNodes("SnifferItem"));
            if (commonConfigNode != null)
            {
                AddMissingSnifferItems(detailPageConf, commonConfigNode.SelectNodes("SnifferItem"));
            }

            // Sub pages declared on the page itself, followed by those of the common configuration.
            foreach (XmlNode subPageNode in pageConfNode.SelectNodes("SnifferPage"))
            {
                detailPageConf.SubPageConfigurations.Add(CreateDetailPageConfiguration(detailPageConf, pageConfNode, subPageNode));
            }

            if (commonConfigNode != null)
            {
                foreach (XmlNode subPageNode in commonConfigNode.SelectNodes("SnifferPage"))
                {
                    detailPageConf.SubPageConfigurations.Add(CreateDetailPageConfiguration(detailPageConf, pageConfNode, subPageNode));
                }
            }

            return(detailPageConf);
        }

        /// <summary>
        /// Returns the value of an attribute from <paramref name="node"/>, falling back to
        /// <paramref name="fallbackNode"/> (which may be null); null when neither defines it.
        /// </summary>
        private static string GetInheritedAttributeValue(XmlNode node, XmlNode fallbackNode, string attributeName)
        {
            XmlAttribute own = node.Attributes[attributeName];
            if (own != null)
            {
                return own.Value;
            }

            if (fallbackNode != null)
            {
                XmlAttribute inherited = fallbackNode.Attributes[attributeName];
                if (inherited != null)
                {
                    return inherited.Value;
                }
            }

            return null;
        }

        /// <summary>
        /// Adds a sniffer item for every node whose <c>ItemName</c> is not yet present in the configuration.
        /// </summary>
        private static void AddMissingSnifferItems(DetailPageConfiguration detailPageConf, XmlNodeList itemNodes)
        {
            if (itemNodes == null)
            {
                return;
            }

            foreach (XmlNode itemNode in itemNodes)
            {
                string itemName = itemNode.Attributes["ItemName"].Value;

                bool alreadyDefined = false;
                foreach (SnifferItem existing in detailPageConf.SnifferItems)
                {
                    if (existing.ItemName == itemName)
                    {
                        alreadyDefined = true;
                        break;
                    }
                }

                if (!alreadyDefined)
                {
                    detailPageConf.SnifferItems.Add(ReadSnifferItem(itemNode));
                }
            }
        }

        /// <summary>
        /// Creates a sniffer item from a <c>SnifferItem</c> XML node; optional attributes and child
        /// elements that are absent leave the item's defaults untouched.
        /// </summary>
        private static SnifferItem ReadSnifferItem(XmlNode itemNode)
        {
            SnifferItem item = new SnifferItem();

            item.ItemName = itemNode.Attributes["ItemName"].Value;

            string saveImage = GetInheritedAttributeValue(itemNode, null, "SaveImage");
            if (saveImage != null)
            {
                item.SaveImage = bool.Parse(saveImage);
            }

            string saveImagesPath = GetInheritedAttributeValue(itemNode, null, "SaveImagesPath");
            if (saveImagesPath != null)
            {
                item.SaveImagesPath = saveImagesPath;
            }

            string imageUrlPath = GetInheritedAttributeValue(itemNode, null, "ImageUrlPath");
            if (imageUrlPath != null)
            {
                item.ImageUrlPath = imageUrlPath;
            }

            string isClearHtml = GetInheritedAttributeValue(itemNode, null, "IsClearHTML");
            if (isClearHtml != null)
            {
                item.IsClearHTML = bool.Parse(isClearHtml);
            }

            string isUrl = GetInheritedAttributeValue(itemNode, null, "IsUrl");
            if (isUrl != null)
            {
                item.IsUrl = bool.Parse(isUrl);
            }

            string urlToAbs = GetInheritedAttributeValue(itemNode, null, "UrlToAbs");
            if (urlToAbs != null)
            {
                item.UrlToAbs = bool.Parse(urlToAbs);
            }

            string separator = GetInheritedAttributeValue(itemNode, null, "Separator");
            if (separator != null)
            {
                item.Separator = separator;
            }

            string clearAElement = GetInheritedAttributeValue(itemNode, null, "ClearAElement");
            if (clearAElement != null)
            {
                item.ClearAElement = bool.Parse(clearAElement);
            }

            string mutiPage = GetInheritedAttributeValue(itemNode, null, "MutiPage");
            if (mutiPage != null)
            {
                item.MutiPage = bool.Parse(mutiPage);
            }

            string mutiPageSeparator = GetInheritedAttributeValue(itemNode, null, "MutiPageSeparator");
            if (mutiPageSeparator != null)
            {
                item.MutiPageSeparator = mutiPageSeparator;
            }

            XmlNode ndClearRegexString = itemNode.SelectSingleNode("ClearRegexString");
            if (ndClearRegexString != null)
            {
                item.ClearRegexString = ndClearRegexString.InnerText;
            }

            XmlNode ndDefaultValue = itemNode.SelectSingleNode("DefaultValue");
            if (ndDefaultValue != null)
            {
                item.DefaultValue = ndDefaultValue.InnerText;
            }

            XmlNode ndRegexString = itemNode.SelectSingleNode("RegexString");
            if (ndRegexString != null)
            {
                item.RegexString = CreateRegexString(ndRegexString);
            }

            return item;
        }
Example #5
0
        /// <summary>
        /// Downloads the images referenced by the value and rewrites the references to the saved location.
        /// </summary>
        /// <remarks>
        /// When <c>item.IsUrl</c> is set the whole value is treated as one image URL; otherwise the <c>src</c>
        /// values of <c>&lt;img&gt;</c> tags are used. Images go to <c>item.SaveImagesPath</c> (formatted with the
        /// current year\month) or, when that is empty, to a folder derived from the owning list page's
        /// <c>SavePathAndFileName</c>. A failure for one image is logged and does not stop the others.
        /// </remarks>
        /// <param name="item">Item configuration (<c>IsUrl</c>, <c>SaveImagesPath</c>, <c>ImageUrlPath</c>).</param>
        /// <param name="value">Text containing the image reference(s); overwritten with the rewritten text.</param>
        /// <exception cref="InvalidOperationException">
        /// <c>item.SaveImagesPath</c> is empty and neither the parent nor the grandparent page is a <c>ListPage</c>.
        /// </exception>
        public void SaveImages(SnifferItem item, ref string value)
        {
            if (string.IsNullOrEmpty(value))
            {
                return;
            }

            // Named form of the numeric option value 25: IgnoreCase (1) | Compiled (8) | Singleline (16).
            const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.Singleline;

            value = value.Replace("wallpaper:", "http:");

            MatchCollection imgMatchs;
            if (item.IsUrl)
            {
                imgMatchs = Regex.Matches(value, "^.*$", options);
            }
            else
            {
                imgMatchs = Regex.Matches(value, "(?<=<img[^>]*?src=[\"']?)(\\.|/|http)[^\"' >]*", options);
            }

            // The format string escapes the backslash, producing e.g. "2024\05".
            string dtPath = DateTime.Now.ToString("yyyy\\\\MM");

            string saveImagesPath;
            if (!string.IsNullOrEmpty(item.SaveImagesPath))
            {
                saveImagesPath = string.Format(item.SaveImagesPath, dtPath);
            }
            else
            {
                // The owning list page is only needed (and only required to exist) for this fallback.
                ListPage parentPage = this.Parent as ListPage;
                if (parentPage == null && this.Parent != null)
                {
                    parentPage = this.Parent.Parent as ListPage;
                }
                if (parentPage == null)
                {
                    throw new InvalidOperationException("Cannot determine where to save images: SaveImagesPath is not configured and no ListPage parent was found.");
                }

                saveImagesPath = parentPage.SavePathAndFileName.Substring(0, parentPage.SavePathAndFileName.LastIndexOf(".")) + "\\";
            }

            if (!saveImagesPath.EndsWith("\\"))
            {
                saveImagesPath += "\\";
            }

            if (!Directory.Exists(saveImagesPath))
            {
                Directory.CreateDirectory(saveImagesPath);
            }

            foreach (Match match in imgMatchs)
            {
                // File name = text after the last '/', without any query string. When there is no '/',
                // LastIndexOf returns -1 and the whole reference is used.
                string fileName = match.Value.Substring(match.Value.LastIndexOf('/') + 1);

                int queryStart = fileName.IndexOf('?');
                if (queryStart >= 0)
                {
                    fileName = fileName.Substring(0, queryStart);
                }

                fileName = HttpUtility.UrlDecode(fileName);
                try
                {
                    string absUrl = FileUtil.GetAbsUrl(match.Value, this.SubPageBaseUrl);

                    Stream stream = FileUtil.GetPageStream(absUrl, this.SubPageBaseUrl);
                    if (stream != null)
                    {
                        if (this.Upload)
                        {
                            // The upload thread takes over the stream, so it must not be disposed here.
                            Thread thread = new Thread(new ParameterizedThreadStart(UploadFile));
                            thread.Start(new object[] { stream, saveImagesPath, fileName, "image/jpeg" });
                        }
                        else
                        {
                            // Release the download stream even if saving fails. Disposing is harmless
                            // should StreamSaveToFile already close it (not visible from here).
                            using (stream)
                            {
                                FileUtil.StreamSaveToFile(stream, saveImagesPath + fileName);
                            }
                        }
                    }

                    string newReference;
                    if (!string.IsNullOrEmpty(item.ImageUrlPath))
                    {
                        string imageUrlPath = string.Format(item.ImageUrlPath, dtPath.Replace("\\", "/"));
                        if (!imageUrlPath.EndsWith("/"))
                        {
                            imageUrlPath += "/";
                        }
                        newReference = imageUrlPath + fileName;
                    }
                    else
                    {
                        newReference = saveImagesPath + fileName;
                    }

                    // NOTE(review): this replaces every textual occurrence of the reference in the value,
                    // not just the matched position.
                    value = value.Replace(match.Value, newReference);
                }
                catch (System.Exception e)
                {
                    // Best effort: log the failing image and carry on with the next one.
                    LogManager.WriteLog(string.Format("<saveimgerr><img>{0}</img><error>{1}</error></saveimgerr>", match.Value, e.Message));
                }
            }
        }
Example #6
0
        /// <summary>
        /// Extracts the configured items from the page body, merges them into the parent detail page
        /// (if any) and then sniffs the configured sub pages.
        /// </summary>
        /// <returns>
        /// <c>false</c> when <c>Succeed</c> is false (nothing is extracted); otherwise <c>true</c>.
        /// <c>_done</c> is set to true in both cases.
        /// </returns>
        private bool InlineSniffer()
        {
            if (!this.Succeed)
            {
                _done = true;
                return(false);
            }

            // Named form of the numeric option value 25: IgnoreCase (1) | Compiled (8) | Singleline (16).
            const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.Singleline;

            // Collect the items
            foreach (SnifferItem item in this.DetailPageConfiguration.SnifferItems)
            {
                // Items that are not multi-page are only collected up to the start page;
                // on later pages they are skipped.
                if (this.PageIndex > this.DetailPageConfiguration.PageStartIndex && !item.MutiPage)
                {
                    continue;
                }

                if (item.RegexString == null)
                {
                    this.ResultItems.Add(item.ItemName, item.DefaultValue);
                    continue;
                }

                MatchCollection matchs = Regex.Matches(this.PageBody, item.RegexString.Expression, options);
                if (matchs.Count == 0)
                {
                    this.ResultItems.Add(item.ItemName, item.DefaultValue);
                    continue;
                }

                StringBuilder sb = new StringBuilder();
                foreach (Match match in matchs)
                {
                    if (matchs.Count > 1 && sb.Length > 0)
                    {
                        sb.Append(item.Separator);
                    }

                    string value = match.Groups[item.RegexString.ValueGroupIndex].Value;

                    if (string.IsNullOrEmpty(value))
                    {
                        value = item.DefaultValue;
                    }
                    else
                    {
                        // Remove scripts and other rubbish
                        ClearRubbish(item, ref value);
                        // Turn URLs inside the content into absolute URLs
                        if (item.UrlToAbs)
                        {
                            UrlToAbs(item, ref value);
                        }
                        // The captured value itself is a URL: make it absolute
                        if (item.IsUrl)
                        {
                            value = FileUtil.GetAbsUrl(value, this.SubPageBaseUrl);
                        }
                        // Save images
                        if (item.SaveImage)
                        {
                            SaveImages(item, ref value);
                        }
                        // Remove A elements
                        if (item.ClearAElement)
                        {
                            ClearAElement(item, ref value);
                        }
                        // Remove HTML markup
                        if (item.IsClearHTML)
                        {
                            value = ClearHTML(value);
                        }
                    }

                    sb.Append(value);
                }
                this.ResultItems.Add(item.ItemName, sb.ToString().Trim());
            }

            // Merge the collected fields into the parent detail page
            DetailPage parentPage = this.Parent as DetailPage;

            if (parentPage != null)
            {
                foreach (string key in this.ResultItems.Keys)
                {
                    string value = (string)this.ResultItems[key];

                    // The test has to be made against the parent: keys it does not have yet are added,
                    // keys it already has are extended.
                    if (!parentPage.ResultItems.Contains(key))
                    {
                        parentPage.ResultItems.Add(key, value);
                        continue;
                    }

                    string parentPageValue = (string)parentPage.ResultItems[key];
                    if (parentPageValue == value)
                    {
                        continue;
                    }

                    // The separator comes from the parent's item definition; when the parent does not
                    // define the item, the values are joined without a separator.
                    string separator = null;
                    foreach (SnifferItem itm in parentPage.DetailPageConfiguration.SnifferItems)
                    {
                        if (itm.ItemName == key)
                        {
                            separator = itm.MutiPageSeparator;
                            break;
                        }
                    }

                    parentPage.ResultItems[key] = parentPageValue + separator + value;
                }
            }

            // Sniff the sub pages
            foreach (DetailPageConfiguration conf in this.DetailPageConfiguration.SubPageConfigurations)
            {
                // Without a URL item there is no way to locate the sub pages of this configuration.
                if (conf.SnifferSubPageUrlItem == null)
                {
                    continue;
                }

                MatchCollection matches = Regex.Matches(this.PageBody, conf.SnifferSubPageUrlItem.Expression, options);

                foreach (Match match in matches)
                {
                    if (string.IsNullOrEmpty(match.Value))
                    {
                        continue;
                    }

                    DetailPage detailPage = new DetailPage(this, conf);
                    detailPage.PageName = match.Groups[conf.SnifferSubPageUrlItem.TitleGroupIndex].Value;

                    string url = match.Groups[conf.SnifferSubPageUrlItem.UrlGroupIndex].Value;

                    if (!string.IsNullOrEmpty(conf.SnifferSubPageUrlItem.UrlFormat))
                    {
                        url = string.Format(conf.SnifferSubPageUrlItem.UrlFormat, url);
                    }

                    detailPage.PageUrl = FileUtil.GetAbsUrl(url, this.SubPageBaseUrl);

                    detailPage.Sniffer();

                    this.SubPages.Add(detailPage);
                }
            }

            _done = true;
            return(true);
        }