/// <summary>
/// Removes unwanted markup from a captured value: the item's own clean-up pattern (when set),
/// script blocks, inline onclick handlers, javascript:/vbscript: attribute values and HTML comments.
/// </summary>
/// <param name="item">Sniffer item; its <c>ClearRegexString</c>, when non-empty, is stripped first.</param>
/// <param name="value">Text to clean. It is replaced in place with the cleaned text.</param>
public void ClearRubbish(SnifferItem item, ref string value)
{
    // Nothing to clean; also avoids Regex.Replace throwing on a null input.
    if (string.IsNullOrEmpty(value))
    {
        return;
    }

    // Named equivalent of the former numeric cast (RegexOptions)25 = 1 | 8 | 16.
    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.Singleline;

    // Item-specific clean-up pattern, applied before the built-in rules.
    if (!string.IsNullOrEmpty(item.ClearRegexString))
    {
        value = Regex.Replace(value, item.ClearRegexString, string.Empty, options);
    }

    // Whole <script ...>...</script> blocks, including their content.
    value = Regex.Replace(value, "<script[^>]*?>(.*?)</script>", string.Empty, options);

    // Drops the onclick="..." attribute (group 2) and keeps the rest of the element (groups 1 and 3).
    value = Regex.Replace(value, "(<[^>]*? )(onclick=['\"].*?['\"])(( [^>]*?)?>.*?</[^>]*?>)", "$1$3", options);

    // Drops ="javascript:..." / ="vbscript:..." (the '=' and group 2) and keeps groups 1 and 4.
    value = Regex.Replace(value, "(<[^>]*?)=(['\"](java|vb)script:.*?['\"])(( [^>]*?)?>.*?</[^>]*?>)", "$1$4", options);

    // HTML comments.
    value = Regex.Replace(value, "<!--.*?-->", "", options);
}
/// <summary>
/// Unwraps anchor elements: every <c>&lt;a ...&gt;inner&lt;/a&gt;</c> is replaced by its inner content.
/// </summary>
/// <param name="item">Not read by this method.</param>
/// <param name="value">Text to process. It is replaced in place with the result.</param>
public void ClearAElement(SnifferItem item, ref string value)
{
    // Nothing to unwrap; also avoids Regex.Replace throwing on a null input.
    if (string.IsNullOrEmpty(value))
    {
        return;
    }

    // Named equivalent of the former numeric cast (RegexOptions)25 = 1 | 8 | 16.
    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.Singleline;

    // "$1" is the text between the opening and closing tag.
    value = Regex.Replace(value, "<a\\s[^>]*?>(.*?)</a>", "$1", options);
}
/// <summary>
/// Rewrites the URLs found in <c>href</c> attributes of anchors and <c>src</c> attributes of images
/// to absolute URLs, resolved against <c>SubPageBaseUrl</c> via <c>FileUtil.GetAbsUrl</c>.
/// </summary>
/// <param name="item">Not read by this method.</param>
/// <param name="value">Markup to process. It is replaced in place with the rewritten markup.</param>
public void UrlToAbs(SnifferItem item, ref string value)
{
    if (string.IsNullOrEmpty(value))
    {
        return;
    }

    // Named equivalent of the former numeric cast (RegexOptions)25 = 1 | 8 | 16.
    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.Singleline;

    // Each match is rewritten at its own position. The earlier implementation collected
    // distinct URLs (case-insensitively) and ran a case-sensitive string.Replace over the whole
    // text, which skipped URLs differing only in case and corrupted longer URLs that merely
    // contained a shorter matched URL as a substring.
    MatchEvaluator toAbsolute = delegate(Match match)
    {
        return FileUtil.GetAbsUrl(match.Value, this.SubPageBaseUrl);
    };

    value = Regex.Replace(
        value,
        "((?<=<a[^>]*?href=[\"']?)(\\.|/|http)[^\"' >]*)|((?<=<img[^>]*?src=[\"']?)(\\.|/|http)[^\"' >]*)",
        toAbsolute,
        options);
}
/// <summary>
/// Builds a <c>DetailPageConfiguration</c> from a page XML node, recursing into nested
/// <c>SnifferPage</c> nodes. When <paramref name="parentConfNode"/> carries a <c>Config</c> attribute,
/// the <c>SnifferPage</c> child of the sibling <c>CommonConfig</c> with that name supplies fallback
/// values: attributes and the sub-page URL item are taken from the page node first, items from
/// the shared node are added only when no item with the same name exists yet, and sub pages
/// from both nodes are appended.
/// </summary>
/// <param name="parent">Configuration passed to the new configuration's constructor.</param>
/// <param name="parentConfNode">Node whose optional <c>Config</c> attribute names the shared configuration.</param>
/// <param name="pageConfNode">Node describing the page; must have a <c>PageName</c> attribute.</param>
/// <returns>The populated configuration.</returns>
private static DetailPageConfiguration CreateDetailPageConfiguration(PageConfiguration parent, XmlNode parentConfNode, XmlNode pageConfNode)
{
    // Locate the shared configuration, if one is referenced. A reference to a CommonConfig
    // that does not exist is treated as "no shared configuration" instead of crashing.
    XmlNode commonConfigNode = null;
    if (parentConfNode.Attributes["Config"] != null)
    {
        XmlNode commonListConfigNode = parentConfNode.ParentNode.SelectSingleNode(
            string.Format("CommonConfig[@Name='{0}']", parentConfNode.Attributes["Config"].Value));
        if (commonListConfigNode != null)
        {
            commonConfigNode = commonListConfigNode.SelectSingleNode("SnifferPage");
        }
    }

    DetailPageConfiguration detailPageConf = new DetailPageConfiguration(parent);
    detailPageConf.PageName = pageConfNode.Attributes["PageName"].Value;

    // Each property below is assigned only when the attribute is present on the page node
    // or, failing that, on the shared node; otherwise the property keeps its initial value.
    string text;

    text = ReadInheritedAttribute(pageConfNode, commonConfigNode, "PageUrl");
    if (text != null)
    {
        detailPageConf.PageUrl = text;
    }

    text = ReadInheritedAttribute(pageConfNode, commonConfigNode, "PageType");
    if (text != null)
    {
        detailPageConf.PageType = (PageType)Enum.Parse(typeof(PageType), text, false);
    }

    text = ReadInheritedAttribute(pageConfNode, commonConfigNode, "Encoding");
    if (text != null)
    {
        detailPageConf.Encoding = Encoding.GetEncoding(text);
    }

    // Paging settings.
    text = ReadInheritedAttribute(pageConfNode, commonConfigNode, "PageQuery");
    if (text != null)
    {
        detailPageConf.PageQuery = text;
    }

    text = ReadInheritedAttribute(pageConfNode, commonConfigNode, "ReplacePageQuery");
    if (text != null)
    {
        detailPageConf.ReplacePageQuery = text;
    }

    text = ReadInheritedAttribute(pageConfNode, commonConfigNode, "PageIndexFormat");
    if (text != null)
    {
        detailPageConf.PageIndexFormat = text;
    }

    text = ReadInheritedAttribute(pageConfNode, commonConfigNode, "PageStartIndex");
    if (text != null)
    {
        detailPageConf.PageStartIndex = int.Parse(text);
    }

    text = ReadInheritedAttribute(pageConfNode, commonConfigNode, "PageIndexSeed");
    if (text != null)
    {
        detailPageConf.PageIndexSeed = int.Parse(text);
    }

    text = ReadInheritedAttribute(pageConfNode, commonConfigNode, "PageIndexStep");
    if (text != null)
    {
        detailPageConf.PageIndexStep = int.Parse(text);
    }

    text = ReadInheritedAttribute(pageConfNode, commonConfigNode, "EndPageDetermineRegex");
    if (text != null)
    {
        detailPageConf.EndPageDetermineRegex = text;
    }

    text = ReadInheritedAttribute(pageConfNode, commonConfigNode, "PageMethod");
    if (text != null)
    {
        detailPageConf.PageMethod = (PageMethod)Enum.Parse(typeof(PageMethod), text);
    }

    // Sub-page URL item: a child element of the page node, else of the shared node.
    // The shared fallback used to be gated on an *attribute* named SnifferSubPageUrlItem,
    // although the value is read from a child *element*; the element lookup alone decides now.
    XmlNode snifferUrlItemNode = pageConfNode.SelectSingleNode("SnifferSubPageUrlItem");
    if (snifferUrlItemNode == null && commonConfigNode != null)
    {
        snifferUrlItemNode = commonConfigNode.SelectSingleNode("SnifferSubPageUrlItem");
    }
    if (snifferUrlItemNode != null)
    {
        detailPageConf.SnifferSubPageUrlItem = CreateSnifferUrlItem(snifferUrlItemNode);
    }

    // Page-level items are added first so they take precedence over shared items of the same name.
    AppendDetailSnifferItems(detailPageConf, pageConfNode);
    if (commonConfigNode != null)
    {
        AppendDetailSnifferItems(detailPageConf, commonConfigNode);
    }

    // Nested pages declared on the page node, then those declared on the shared node.
    // In both cases the current page node is passed as the nested page's parent node.
    XmlNodeList subPageNodes = pageConfNode.SelectNodes("SnifferPage");
    foreach (XmlNode subPageNode in subPageNodes)
    {
        detailPageConf.SubPageConfigurations.Add(CreateDetailPageConfiguration(detailPageConf, pageConfNode, subPageNode));
    }
    if (commonConfigNode != null)
    {
        subPageNodes = commonConfigNode.SelectNodes("SnifferPage");
        foreach (XmlNode subPageNode in subPageNodes)
        {
            detailPageConf.SubPageConfigurations.Add(CreateDetailPageConfiguration(detailPageConf, pageConfNode, subPageNode));
        }
    }

    return(detailPageConf);
}

/// <summary>
/// Returns the value of the named attribute on <paramref name="node"/>, or null when the node,
/// its attribute collection or the attribute itself is missing.
/// </summary>
private static string ReadOptionalAttribute(XmlNode node, string name)
{
    if (node == null || node.Attributes == null)
    {
        return null;
    }
    XmlAttribute attribute = node.Attributes[name];
    return attribute == null ? null : attribute.Value;
}

/// <summary>
/// Returns the named attribute from the page node, falling back to the shared node;
/// null when neither defines it.
/// </summary>
private static string ReadInheritedAttribute(XmlNode pageConfNode, XmlNode commonConfigNode, string name)
{
    string pageValue = ReadOptionalAttribute(pageConfNode, name);
    if (pageValue != null)
    {
        return pageValue;
    }
    return ReadOptionalAttribute(commonConfigNode, name);
}

/// <summary>
/// Adds one sniffer item per <c>SnifferItem</c> child of <paramref name="sourceNode"/>, skipping
/// any whose <c>ItemName</c> is already present in the configuration.
/// </summary>
private static void AppendDetailSnifferItems(DetailPageConfiguration detailPageConf, XmlNode sourceNode)
{
    XmlNodeList itemNodes = sourceNode.SelectNodes("SnifferItem");
    if (itemNodes == null)
    {
        return;
    }

    foreach (XmlNode itemNode in itemNodes)
    {
        string itemName = itemNode.Attributes["ItemName"].Value;

        bool alreadyDefined = false;
        foreach (SnifferItem existing in detailPageConf.SnifferItems)
        {
            if (existing.ItemName == itemName)
            {
                alreadyDefined = true;
                break;
            }
        }

        if (!alreadyDefined)
        {
            detailPageConf.SnifferItems.Add(BuildDetailSnifferItem(itemNode, itemName));
        }
    }
}

/// <summary>
/// Creates a sniffer item from a <c>SnifferItem</c> node: optional attributes are copied when
/// present, and the <c>ClearRegexString</c>, <c>DefaultValue</c> and <c>RegexString</c> child elements
/// are read when they exist.
/// </summary>
private static SnifferItem BuildDetailSnifferItem(XmlNode itemNode, string itemName)
{
    SnifferItem item = new SnifferItem();
    item.ItemName = itemName;

    string text;

    text = ReadOptionalAttribute(itemNode, "SaveImage");
    if (text != null)
    {
        item.SaveImage = bool.Parse(text);
    }

    text = ReadOptionalAttribute(itemNode, "SaveImagesPath");
    if (text != null)
    {
        item.SaveImagesPath = text;
    }

    text = ReadOptionalAttribute(itemNode, "ImageUrlPath");
    if (text != null)
    {
        item.ImageUrlPath = text;
    }

    text = ReadOptionalAttribute(itemNode, "IsClearHTML");
    if (text != null)
    {
        item.IsClearHTML = bool.Parse(text);
    }

    text = ReadOptionalAttribute(itemNode, "IsUrl");
    if (text != null)
    {
        item.IsUrl = bool.Parse(text);
    }

    text = ReadOptionalAttribute(itemNode, "UrlToAbs");
    if (text != null)
    {
        item.UrlToAbs = bool.Parse(text);
    }

    text = ReadOptionalAttribute(itemNode, "Separator");
    if (text != null)
    {
        item.Separator = text;
    }

    text = ReadOptionalAttribute(itemNode, "ClearAElement");
    if (text != null)
    {
        item.ClearAElement = bool.Parse(text);
    }

    text = ReadOptionalAttribute(itemNode, "MutiPage");
    if (text != null)
    {
        item.MutiPage = bool.Parse(text);
    }

    text = ReadOptionalAttribute(itemNode, "MutiPageSeparator");
    if (text != null)
    {
        item.MutiPageSeparator = text;
    }

    XmlNode ndClearRegexString = itemNode.SelectSingleNode("ClearRegexString");
    if (ndClearRegexString != null)
    {
        item.ClearRegexString = ndClearRegexString.InnerText;
    }

    XmlNode ndDefaultValue = itemNode.SelectSingleNode("DefaultValue");
    if (ndDefaultValue != null)
    {
        item.DefaultValue = ndDefaultValue.InnerText;
    }

    XmlNode ndRegexString = itemNode.SelectSingleNode("RegexString");
    if (ndRegexString != null)
    {
        item.RegexString = CreateRegexString(ndRegexString);
    }

    return item;
}
/// <summary>
/// Fetches the images referenced by a captured value, stores (or uploads) them, and rewrites
/// the value so each image URL points at the stored copy.
/// </summary>
/// <param name="item">
/// Sniffer item. <c>IsUrl</c> selects whether the whole value is one image URL or markup with
/// <c>img src</c> attributes; <c>SaveImagesPath</c> and <c>ImageUrlPath</c> are format strings that
/// receive the current year/month.
/// </param>
/// <param name="value">Captured text. It is replaced in place with the rewritten text.</param>
/// <exception cref="InvalidOperationException">
/// <c>item.SaveImagesPath</c> is empty and no <c>ListPage</c> ancestor is available to derive a folder from.
/// </exception>
public void SaveImages(SnifferItem item, ref string value)
{
    // Named equivalent of the former numeric cast (RegexOptions)25 = 1 | 8 | 16.
    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.Singleline;

    value = value.Replace("wallpaper:", "http:");

    MatchCollection imgMatchs;
    if (item.IsUrl)
    {
        // The whole value is treated as a single URL.
        imgMatchs = Regex.Matches(value, "^.*$", options);
    }
    else
    {
        imgMatchs = Regex.Matches(value, "(?<=<img[^>]*?src=[\"']?)(\\.|/|http)[^\"' >]*", options);
    }

    // Formats as e.g. 2024\05 (the format string escapes a literal backslash).
    string dtPath = DateTime.Now.ToString("yyyy\\\\MM");

    string saveImagesPath;
    if (!string.IsNullOrEmpty(item.SaveImagesPath))
    {
        saveImagesPath = string.Format(item.SaveImagesPath, dtPath);
    }
    else
    {
        // The owning list page is only needed on this branch, so it is resolved here;
        // a missing parent no longer fails when an explicit save path is configured.
        ListPage parentPage = this.Parent as ListPage;
        if (parentPage == null && this.Parent != null)
        {
            parentPage = this.Parent.Parent as ListPage;
        }
        if (parentPage == null)
        {
            throw new InvalidOperationException("SaveImages needs item.SaveImagesPath or a ListPage ancestor to determine the image folder.");
        }

        // Folder named after the list page's output file, minus its extension (if it has one).
        string listFile = parentPage.SavePathAndFileName;
        int dotIndex = listFile.LastIndexOf('.');
        saveImagesPath = (dotIndex >= 0 ? listFile.Substring(0, dotIndex) : listFile) + "\\";
    }

    if (!saveImagesPath.EndsWith("\\", StringComparison.Ordinal))
    {
        saveImagesPath += "\\";
    }

    // CreateDirectory is a no-op when the directory already exists.
    Directory.CreateDirectory(saveImagesPath);

    foreach (Match match in imgMatchs)
    {
        string imageUrl = match.Value;

        // LastIndexOf yields -1 when there is no '/', so "+ 1" gives 0 and the whole URL is kept.
        string fileName = imageUrl.Substring(imageUrl.LastIndexOf('/') + 1);

        // Drop any query string.
        int queryIndex = fileName.IndexOf('?');
        if (queryIndex >= 0)
        {
            fileName = fileName.Substring(0, queryIndex);
        }
        fileName = HttpUtility.UrlDecode(fileName);

        try
        {
            string absUrl = FileUtil.GetAbsUrl(imageUrl, this.SubPageBaseUrl);
            Stream stream = FileUtil.GetPageStream(absUrl, this.SubPageBaseUrl);
            if (stream != null)
            {
                if (this.Upload)
                {
                    // The stream is handed to the upload thread, so it is not closed here.
                    Thread thread = new Thread(new ParameterizedThreadStart(UploadFile));
                    thread.Start(new object[] { stream, saveImagesPath, fileName, "image/jpeg" });
                }
                else
                {
                    try
                    {
                        FileUtil.StreamSaveToFile(stream, saveImagesPath + fileName);
                    }
                    finally
                    {
                        // Closing an already-closed stream is harmless, so this is safe
                        // whether or not StreamSaveToFile closes it itself.
                        stream.Close();
                    }
                }
            }

            // The value is rewritten even when no stream was returned.
            string replacement;
            if (!string.IsNullOrEmpty(item.ImageUrlPath))
            {
                string imageUrlPath = string.Format(item.ImageUrlPath, dtPath.Replace("\\", "/"));
                if (!imageUrlPath.EndsWith("/", StringComparison.Ordinal))
                {
                    imageUrlPath += "/";
                }
                replacement = imageUrlPath + fileName;
            }
            else
            {
                replacement = saveImagesPath + fileName;
            }

            value = value.Replace(imageUrl, replacement);
        }
        catch (System.Exception e)
        {
            // Best effort: a failure on one image is logged and the remaining images are still processed.
            LogManager.WriteLog(string.Format("<saveimgerr><img>{0}</img><error>{1}</error></saveimgerr>", match.Value, e.Message));
        }
    }
}
/// <summary>
/// Extracts every configured item from <c>PageBody</c> into <c>ResultItems</c>, merges the results
/// into the parent page when the parent is a <c>DetailPage</c>, and then sniffs each configured sub page.
/// </summary>
/// <returns>
/// <c>false</c> when <c>Succeed</c> is false (nothing is extracted); otherwise <c>true</c>.
/// <c>_done</c> is set to true in both cases.
/// </returns>
private bool InlineSniffer()
{
    // Named equivalent of the former numeric cast (RegexOptions)25 = 1 | 8 | 16.
    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.Singleline;

    if (!this.Succeed)
    {
        _done = true;
        return(false);
    }

    // Capture items.
    foreach (SnifferItem item in this.DetailPageConfiguration.SnifferItems)
    {
        // Presumably: an item that is not multi-page content is not captured again
        // once the current page index is past the start page.
        if (this.PageIndex > this.DetailPageConfiguration.PageStartIndex && !item.MutiPage)
        {
            continue;
        }

        if (item.RegexString == null)
        {
            this.ResultItems.Add(item.ItemName, item.DefaultValue);
            continue;
        }

        MatchCollection matchs = Regex.Matches(this.PageBody, item.RegexString.Expression, options);
        if (matchs.Count == 0)
        {
            this.ResultItems.Add(item.ItemName, item.DefaultValue);
            continue;
        }

        StringBuilder sb = new StringBuilder();
        foreach (Match match in matchs)
        {
            // Separator goes between values only, never before the first one.
            if (sb.Length > 0)
            {
                sb.Append(item.Separator);
            }

            string value = match.Groups[item.RegexString.ValueGroupIndex].Value;
            if (string.IsNullOrEmpty(value))
            {
                value = item.DefaultValue;
            }
            else
            {
                // Strip scripts, comments and similar clutter.
                ClearRubbish(item, ref value);

                // Make URLs inside the content absolute.
                if (item.UrlToAbs)
                {
                    UrlToAbs(item, ref value);
                }

                // The captured value itself is a URL: make it absolute.
                if (item.IsUrl)
                {
                    value = FileUtil.GetAbsUrl(value, this.SubPageBaseUrl);
                }

                // Save referenced images.
                if (item.SaveImage)
                {
                    SaveImages(item, ref value);
                }

                // Unwrap anchor elements.
                if (item.ClearAElement)
                {
                    ClearAElement(item, ref value);
                }

                // Strip HTML markup.
                if (item.IsClearHTML)
                {
                    value = ClearHTML(value);
                }
            }

            sb.Append(value);
        }

        this.ResultItems.Add(item.ItemName, sb.ToString().Trim());
    }

    // Merge this page's fields into the parent detail page.
    DetailPage parentPage = this.Parent as DetailPage;
    if (parentPage != null)
    {
        foreach (string key in this.ResultItems.Keys)
        {
            string value = (string)this.ResultItems[key];

            // The membership test must look at the parent's table. Testing this page's own
            // table (as before) was always true, so new keys were never added to the parent.
            if (parentPage.ResultItems.Contains(key))
            {
                string parentPageValue = (string)parentPage.ResultItems[key];
                if (parentPageValue != value)
                {
                    // Separator comes from the parent's item of the same name; none when the
                    // parent has no such item.
                    string separator = string.Empty;
                    foreach (SnifferItem itm in parentPage.DetailPageConfiguration.SnifferItems)
                    {
                        if (itm.ItemName == key)
                        {
                            separator = itm.MutiPageSeparator;
                            break;
                        }
                    }
                    parentPage.ResultItems[key] = parentPageValue + separator + value;
                }
            }
            else
            {
                parentPage.ResultItems.Add(key, value);
            }
        }
    }

    // Capture sub pages.
    foreach (DetailPageConfiguration conf in this.DetailPageConfiguration.SubPageConfigurations)
    {
        // A sub-page configuration without a URL item cannot be located in the page body.
        if (conf.SnifferSubPageUrlItem == null)
        {
            continue;
        }

        MatchCollection matches = Regex.Matches(this.PageBody, conf.SnifferSubPageUrlItem.Expression, options);
        foreach (Match match in matches)
        {
            if (string.IsNullOrEmpty(match.Value))
            {
                continue;
            }

            DetailPage detailPage = new DetailPage(this, conf);
            detailPage.PageName = match.Groups[conf.SnifferSubPageUrlItem.TitleGroupIndex].Value;

            string url = match.Groups[conf.SnifferSubPageUrlItem.UrlGroupIndex].Value;
            if (!string.IsNullOrEmpty(conf.SnifferSubPageUrlItem.UrlFormat))
            {
                url = string.Format(conf.SnifferSubPageUrlItem.UrlFormat, url);
            }

            detailPage.PageUrl = FileUtil.GetAbsUrl(url, this.SubPageBaseUrl);
            detailPage.Sniffer();
            this.SubPages.Add(detailPage);
        }
    }

    _done = true;
    return(true);
}