/// <summary>
/// Parses a detail page: extracts the cleaned chapter markup and the link to the next section.
/// </summary>
/// <param name="documentNode">Root node of the parsed detail page.</param>
/// <returns>
/// A table holding <c>CollectionFieldName.ExContent</c> when the <c>div</c> with id
/// <c>content</c> exists, and <c>CollectionFieldName.NextUrl</c> when a "下一节" anchor
/// with a usable <c>href</c> is found.
/// </returns>
private Hashtable parseDetailPage2(HtmlNode documentNode)
{
    Hashtable returndata = new Hashtable();

    HtmlNode contentNode = documentNode.SelectSingleNode("//div[@id='content']");
    if (contentNode != null)
    {
        string content = contentNode.InnerHtml;
        content = HTMLUtil.RemoveHtmlContent(content, "div", "style", "script");
        content = HTMLUtil.RemoveHtmlTag(content, "p", "img", "br");

        // Drop line breaks/tabs and the site watermarks. The original ran the
        // line-break, tab and "feisuz" replacements twice; one pass is kept.
        content = content.Replace("\r\n", "")
                         .Replace("\t", "")
                         .Replace("手机请访问::feisuz", "")
                         .Replace("feisuz", "")
                         .Replace("作者的话:", "")
                         .Replace("新书,求收藏求推荐", "")
                         .Replace("本书红薯网首发,请勿转载!", "");

        returndata.Add(CollectionFieldName.ExContent, content);
    }

    // The "next section" anchor is identified by its exact link text.
    HtmlNode nextLink = documentNode
        .SelectNodes("//*[@id='content']/div[@class='text']/a")
        ?.FirstOrDefault(x => x.InnerText == "下一节");
    if (nextLink != null)
    {
        string url = nextLink.GetAttributeValue("href", "");

        // "#" is a placeholder href, not a real target.
        if (!string.IsNullOrEmpty(url) && url != "#")
        {
            returndata.Add(CollectionFieldName.NextUrl, url);
        }
    }

    return returndata;
}
/// <summary>
/// Parses a detail page: extracts the chapter markup and derives length, intro and price from it.
/// </summary>
/// <param name="documentNode">Root node of the parsed detail page.</param>
/// <returns>
/// A table with the chapter fields; it is empty when the <c>div</c> with id
/// <c>ccontent</c> is not present.
/// </returns>
private Hashtable parseDetailPage1(HtmlNode documentNode)
{
    Hashtable returndata = new Hashtable();

    HtmlNode contentNode = documentNode.SelectSingleNode("//div[@id=\"ccontent\"]");
    if (contentNode == null)
    {
        return returndata;
    }

    string content = HTMLUtil.RemoveHtmlContent(contentNode.InnerHtml, "div", "style", "script", "a");
    content = content.Replace("\r\n", "")
                     .Replace("\t", "")
                     .Replace("領域文學首發地址httP://www.lingyu.org<br><br>", "")
                     .Replace("領域文學首發地址www.lingyu.org<br><br> <br><br>", "")
                     .Replace("请记住本书首发域名:http://www.lingyu.org 领域文学手机版阅读网址: m.lingyu.org", "");

    // Remove links back to the source site. The dots are escaped so that they match
    // literal dots only (the original pattern let '.' match any character).
    content = Regex.Replace(content, @"http://www\.lingyu\.org/\w+/\d+/\d+/\d+\.html", "");
    returndata.Add(CollectionFieldName.Chap_Content, content);

    // Strip markup and filler so that the length reflects the readable text.
    string plainText = HTMLUtil.RemoveHtmlTag(content)
        .Replace(" ", "")
        .Replace("領域文學首發地址httP://www.lingyu.org<br><br>", "")
        .Replace("領域文學首發地址www.lingyu.org<br><br> <br><br>", "")
        .Replace("请记住本书首发域名:http://www.lingyu.org 领域文学手机版阅读网址: m.lingyu.org", "");

    if (!string.IsNullOrEmpty(plainText))
    {
        returndata.Add(CollectionFieldName.Chap_ContentLen, plainText.Length);

        // Intro is the first 40 characters, with an ellipsis when truncated.
        string intro = plainText.Length > 40
            ? plainText.Substring(0, 40) + "..."
            : plainText;
        returndata.Add(CollectionFieldName.Chap_Intro, intro);

        // 3 per full 500 characters of text, capped at 15.
        int price = (plainText.Length / 500) * 3;
        if (price > 15)
        {
            price = 15;
        }
        returndata.Add(CollectionFieldName.Chap_Pirce, price);
    }

    returndata.Add(CollectionFieldName.Chap_Status, ChapterStatus.ChapterStatus_OnLine);
    returndata.Add(CollectionFieldName.Chap_ChapterType, ChapterType.ChapterType_Free);
    return returndata;
}
/// <summary>
/// Parses a detail page: extracts the chapter markup and derives length, intro and price from it.
/// </summary>
/// <param name="documentNode">Root node of the parsed detail page.</param>
/// <returns>
/// A table with the chapter fields; it is empty when the <c>div</c> with id
/// <c>content</c> is not present.
/// </returns>
private Hashtable parseDetailPage1(HtmlNode documentNode)
{
    Hashtable returndata = new Hashtable();

    HtmlNode contentNode = documentNode.SelectSingleNode("//div[@id='content']");
    if (contentNode == null)
    {
        return returndata;
    }

    string content = HTMLUtil.RemoveHtmlContent(contentNode.InnerHtml, "div", "style", "script", "center", "span");
    content = content.Replace("\r\n", "")
                     .Replace("\t", "")
                     .Replace("全本小说网欢迎您!WWW.YZNN.COM T1706231537", "")
                     .Replace("F606121", "")
                     .Replace("全本小说网欢迎您!WWW.YZNN.COM", "");
    returndata.Add(CollectionFieldName.Chap_Content, content);

    // Strip markup and spaces so that the length reflects the readable text.
    string plainText = HTMLUtil.RemoveHtmlTag(content).Replace(" ", "");
    if (!string.IsNullOrEmpty(plainText))
    {
        returndata.Add(CollectionFieldName.Chap_ContentLen, plainText.Length);

        // Intro is the first 40 characters, with an ellipsis when truncated.
        string intro = plainText.Length > 40
            ? plainText.Substring(0, 40) + "..."
            : plainText;
        returndata.Add(CollectionFieldName.Chap_Intro, intro);

        // 5 per full 1000 characters of the stored markup, clamped to the range 5..15.
        int price = (content.Length / 1000) * 5;
        if (price == 0)
        {
            price = 5;
        }
        if (price > 15)
        {
            price = 15;
        }
        returndata.Add(CollectionFieldName.Chap_Pirce, price);
    }

    returndata.Add(CollectionFieldName.Chap_Status, ChapterStatus.ChapterStatus_OnLine);
    returndata.Add(CollectionFieldName.Chap_ChapterType, ChapterType.ChapterType_Free);
    return returndata;
}
/// <summary>
/// Parses a detail page: extracts the chapter markup, derives length, intro and price
/// from it, and picks up the link to the next section.
/// </summary>
/// <param name="documentNode">Root node of the parsed detail page.</param>
/// <returns>
/// A table with the chapter fields (when the <c>div</c> with id <c>content</c> exists)
/// and <c>CollectionFieldName.NextUrl</c> (when a "下一节" anchor with a usable
/// <c>href</c> is found).
/// </returns>
private Hashtable parseDetailPage1(HtmlNode documentNode)
{
    Hashtable returndata = new Hashtable();

    HtmlNode contentNode = documentNode.SelectSingleNode("//div[@id='content']");
    if (contentNode != null)
    {
        string content = contentNode.InnerHtml;
        content = HTMLUtil.RemoveHtmlContent(content, "div", "style", "script");
        content = HTMLUtil.RemoveHtmlTag(content, "p", "img", "br");
        content = content.Replace("\r\n", "").Replace("\t", "");
        returndata.Add(CollectionFieldName.Chap_Content, content);

        // Strip the remaining markup and spaces so that the length reflects the readable text.
        string plainText = HTMLUtil.RemoveHtmlTag(content).Replace(" ", "");
        if (!string.IsNullOrEmpty(plainText))
        {
            returndata.Add(CollectionFieldName.Chap_ContentLen, plainText.Length);

            // Intro is the first 40 characters, with an ellipsis when truncated.
            string intro = plainText.Length > 40
                ? plainText.Substring(0, 40) + "..."
                : plainText;
            returndata.Add(CollectionFieldName.Chap_Intro, intro);

            // 5 per full 1000 characters of the stored markup, clamped to the range 5..15.
            int price = (content.Length / 1000) * 5;
            if (price == 0)
            {
                price = 5;
            }
            if (price > 15)
            {
                price = 15;
            }
            returndata.Add(CollectionFieldName.Chap_Pirce, price);
        }

        returndata.Add(CollectionFieldName.Chap_Status, ChapterStatus.ChapterStatus_OnLine);
        returndata.Add(CollectionFieldName.Chap_ChapterType, ChapterType.ChapterType_Free);
    }

    // The "next section" anchor is identified by its exact link text.
    HtmlNode nextLink = documentNode
        .SelectNodes("//*[@id='content']/div[@class='text']/a")
        ?.FirstOrDefault(x => x.InnerText == "下一节");
    if (nextLink != null)
    {
        string url = nextLink.GetAttributeValue("href", "");

        // "#" is a placeholder href, not a real target.
        if (!string.IsNullOrEmpty(url) && url != "#")
        {
            returndata.Add(CollectionFieldName.NextUrl, url);
        }
    }

    return returndata;
}
/// <summary>
/// Cleans an HTML fragment in place: the fragment is parsed, passed through
/// <c>HTMLUtil.RemoveHtmlTag</c>, and the resulting markup is written back to
/// <paramref name="Htmlstring"/>.
/// </summary>
/// <param name="Htmlstring">
/// The HTML to clean; replaced with the cleaned markup. A null or empty value is left untouched.
/// </param>
private void dealHtmlstring(ref string Htmlstring)
{
    // Nothing to clean, and HtmlDocument.LoadHtml rejects null.
    if (string.IsNullOrEmpty(Htmlstring))
    {
        return;
    }

    HtmlDocument document = new HtmlDocument();
    document.LoadHtml(Htmlstring);
    HtmlNode htmlNode = document.DocumentNode;

    // Clean redundant tags.
    // NOTE(review): "vedio" looks like a typo for "video"; left unchanged because the
    // tag list is runtime behavior — confirm against HTMLUtil.RemoveHtmlTag before editing.
    htmlNode.InnerHtml = HTMLUtil.RemoveHtmlTag(htmlNode.InnerHtml, "p", "vedio", "img", "br");

    // The original never assigned the result to the ref parameter, so callers
    // always got their input back unchanged.
    Htmlstring = htmlNode.InnerHtml;
}
/// <summary>
/// Runs an HTML string through <c>HTMLUtil.RemoveHtmlTag</c> and
/// <c>HTMLUtil.ClearImgTag</c>, then deletes every inline <c>style="..."</c> attribute.
/// </summary>
/// <param name="htmlString">The HTML to clean; null or empty input is returned as is.</param>
/// <returns>The cleaned HTML.</returns>
public string DeepClear(string htmlString = "")
{
    if (string.IsNullOrEmpty(htmlString))
    {
        return htmlString;
    }

    string withoutTags = HTMLUtil.RemoveHtmlTag(htmlString, "p", "img", "br");
    string withoutImgNoise = HTMLUtil.ClearImgTag(withoutTags);

    // Lazy match so that each style attribute is removed on its own.
    return Regex.Replace(withoutImgNoise, @"(style="".*?"")", "");
}
/// <summary>
/// Parses a detail page: extracts the chapter markup and derives length, intro and price from it.
/// </summary>
/// <param name="documentNode">Root node of the parsed detail page.</param>
/// <returns>
/// A table with the chapter fields; it is empty when the
/// <c>panel-body content-body content-ext</c> container is not present.
/// </returns>
private Hashtable parseDetailPage1(HtmlNode documentNode)
{
    Hashtable returndata = new Hashtable();

    HtmlNode contentNode = documentNode.SelectSingleNode("//div[@class=\"panel-body content-body content-ext\"]");
    if (contentNode == null)
    {
        return returndata;
    }

    string content = HTMLUtil.RemoveHtmlContent(contentNode.InnerHtml, "div", "style", "script");
    content = content.Replace("\r\n", "")
                     .Replace("\t", "")
                     .Replace("手机请访问::feisuz", "")
                     .Replace("feisuz", "")
                     .Replace("作者的话:", "")
                     .Replace("新书,求收藏求推荐", "")
                     .Replace("本书红薯网首发,请勿转载!", "");
    returndata.Add(CollectionFieldName.Chap_Content, content);

    // Strip markup and filler so that the length reflects the readable text.
    string plainText = HTMLUtil.RemoveHtmlTag(content)
        .Replace(" ", "")
        .Replace("feisuz", "")
        .Replace("作者的话:", "")
        .Replace("新书,求收藏求推荐", "")
        .Replace("本书红薯网首发,请勿转载!", "");

    if (!string.IsNullOrEmpty(plainText))
    {
        returndata.Add(CollectionFieldName.Chap_ContentLen, plainText.Length);

        // Intro is the first 40 characters, with an ellipsis when truncated.
        string intro = plainText.Length > 40
            ? plainText.Substring(0, 40) + "..."
            : plainText;
        returndata.Add(CollectionFieldName.Chap_Intro, intro);

        // 5 per full 1000 characters of the stored markup, clamped to the range 5..15.
        int price = (content.Length / 1000) * 5;
        if (price == 0)
        {
            price = 5;
        }
        if (price > 15)
        {
            price = 15;
        }
        returndata.Add(CollectionFieldName.Chap_Pirce, price);
    }

    returndata.Add(CollectionFieldName.Chap_Status, ChapterStatus.ChapterStatus_OnLine);
    returndata.Add(CollectionFieldName.Chap_ChapterType, ChapterType.ChapterType_Free);
    return returndata;
}
/// <summary>
/// Parses a detail page: extracts the chapter markup and derives length and intro from it.
/// </summary>
/// <param name="documentNode">Root node of the parsed detail page.</param>
/// <returns>
/// A table with the chapter fields; it is empty when the <c>shuneirong</c> container
/// is not present. No price is produced by this parser.
/// </returns>
private Hashtable parseDetailPage1(HtmlNode documentNode)
{
    Hashtable returndata = new Hashtable();

    HtmlNode contentNode = documentNode.SelectSingleNode("//div[@class='readout']/div[@class='shuneirong']");
    if (contentNode == null)
    {
        return returndata;
    }

    string content = HTMLUtil.RemoveHtmlContent(contentNode.InnerHtml, "div", "style", "script", "center", "span");
    content = content.Replace("\r\n", "")
                     .Replace("\t", "")
                     .Replace("免费小说", "")
                     .Replace("biquge5200.com", "")
                     .Replace("biquge5200", "")
                     .Replace("笔趣阁", "")
                     .Replace("http://", "")
                     .Replace("本书红薯网首发,请勿转载!", "");
    returndata.Add(CollectionFieldName.Chap_Content, content);

    // Strip markup and spaces so that the length reflects the readable text.
    string plainText = HTMLUtil.RemoveHtmlTag(content).Replace(" ", "");
    if (!string.IsNullOrEmpty(plainText))
    {
        returndata.Add(CollectionFieldName.Chap_ContentLen, plainText.Length);

        // Intro is the first 150 characters, with an ellipsis when truncated.
        string intro = plainText.Length > 150
            ? plainText.Substring(0, 150) + "..."
            : plainText;
        returndata.Add(CollectionFieldName.Chap_Intro, intro);
    }

    returndata.Add(CollectionFieldName.Chap_Status, ChapterStatus.ChapterStatus_OnLine);
    returndata.Add(CollectionFieldName.Chap_ChapterType, ChapterType.ChapterType_Free);
    return returndata;
}
/// <summary>
/// Rewrites the <c>src</c> of every image found by the <c>//img</c> query to the value
/// returned by <c>HTMLUtil.GetFullURL</c> (relative image paths become full paths).
/// </summary>
/// <param name="documentNode">Node on which the <c>//img</c> query is run.</param>
/// <param name="InternalRealUrl">URL passed to <c>HTMLUtil.GetFullURL</c> as the base.</param>
protected static void ReplaceIncorrectImageSrc(HtmlNode documentNode, string InternalRealUrl)
{
    HtmlNodeCollection images = documentNode.SelectNodes("//img");
    if (images == null)
    {
        return;
    }

    foreach (HtmlNode image in images)
    {
        string src = image.GetAttributeValue("src", "");

        // An image without a src has nothing to resolve. Resolving "" would attach a
        // src built from the base URL alone to an image that never had one.
        if (string.IsNullOrEmpty(src))
        {
            continue;
        }

        image.SetAttributeValue("src", HTMLUtil.GetFullURL(InternalRealUrl, src));
    }
}
/// <summary>
/// Consumes the parsed chapter-catalog data: turns every valid item into a chapter on
/// the collection model and remembers the URL of the next catalog page.
/// </summary>
/// <param name="hashtable">
/// Parsed list-page data. <c>CollectionFieldName.Items</c> is read as a list of item
/// tables and <c>CollectionFieldName.Pages</c> as a list of page URLs; either may be absent.
/// </param>
private void parseListPage(Hashtable hashtable)
{
    // Hashtable's indexer returns null for a missing key, so a single lookup per
    // entry replaces the ContainsKey + indexer pair.
    List<Hashtable> detaiList = (List<Hashtable>)hashtable[CollectionFieldName.Items];
    List<string> nextPages = (List<string>)hashtable[CollectionFieldName.Pages];

    // Chapter ids already added during this call, used to detect duplicates.
    Hashtable seenIds = new Hashtable();

    if (detaiList != null)
    {
        foreach (Hashtable item in detaiList)
        {
            string url = (string)item[CollectionFieldName.Url];
            if (!HTMLUtil.IsCorrect(url))
            {
                continue;
            }

            url = GetFullURL(url);
            item[CollectionFieldName.Url] = url;

            wxchapter chapter = new wxchapter();
            fillWxChapterModel(chapter, item, _CollectionModel.CollectionId);

            // Skip repeated chapters so that a duplicated entry cannot make the crawl loop forever.
            if (seenIds.ContainsKey(chapter.Id))
            {
                continue;
            }
            seenIds.Add(chapter.Id, chapter.Id);

            // Store the chapter.
            this._CollectionModel.chapterList.Add(chapter);
        }
    }

    // Each entry overwrites the previous one, so the last page URL wins;
    // the field stays null when there is no next page.
    this.currentUrl_ChapterList = null;
    if (nextPages != null)
    {
        foreach (string pageUrl in nextPages)
        {
            this.currentUrl_ChapterList = GetFullURL(pageUrl);
        }
    }
}
/// <summary>
/// Parses the start page of a book: title, cover image, category tag, page URL and
/// the numeric id taken from the URL.
/// </summary>
/// <param name="documentNode">Root node of the parsed start page.</param>
/// <returns>
/// A table whose <c>CollectionFieldName.BookInfo</c> entry holds the table of book fields.
/// </returns>
private Hashtable parseStartupPage(HtmlNode documentNode)
{
    Hashtable ht = new Hashtable();

    HtmlNode titleNode = documentNode.SelectSingleNode("//meta[@property='og:title']");
    if (titleNode != null)
    {
        ht.Add(CollectionFieldName.Novel_Name, titleNode.GetAttributeValue("content", ""));
    }

    HtmlNode coverNode = documentNode.SelectSingleNode("//div[@class='pic text-center']/img");
    if (coverNode != null)
    {
        string imgUrl = HTMLUtil.GetFullURL(InternalRealUrl, coverNode.GetAttributeValue("src", ""));
        ht.Add(CollectionFieldName.Novel_CoverImgs, imgUrl);
    }

    HtmlNode categoryNode = documentNode.SelectSingleNode("//meta[@property='og:novel:category']");
    if (categoryNode != null)
    {
        ht.Add(CollectionFieldName.Novel_Tag, categoryNode.GetAttributeValue("content", ""));
    }

    ht.Add(CollectionFieldName.Url, InternalRealUrl);

    // The book id is the number right before ".html". The dot is escaped so that it
    // matches a literal dot only, and the capture group is read directly.
    Match idMatch = Regex.Match(InternalRealUrl, @"/(\d+)\.html");
    if (idMatch.Success)
    {
        ht.Add(CollectionFieldName.Novel_UniqueFlag, idMatch.Groups[1].Value);
    }

    Hashtable returndata = new Hashtable();
    returndata.Add(CollectionFieldName.BookInfo, ht);
    return returndata;
}
/// <summary>
/// Fixes placeholder images: for every image found by the <c>//img</c> query that
/// carries a non-empty placeholder attribute, the value is passed through
/// <c>HTMLUtil.GetFullURL</c>, stored in <c>src</c>, and the placeholder attribute is removed.
/// </summary>
/// <param name="documentNode">Node on which the <c>//img</c> query is run.</param>
/// <param name="InternalRealUrl">URL passed to <c>HTMLUtil.GetFullURL</c> as the base.</param>
/// <param name="PlaceholderAttribute">Name of the attribute holding the image URL.</param>
protected static void ReplacePlaceholderImageSrc(HtmlNode documentNode, string InternalRealUrl, string PlaceholderAttribute)
{
    HtmlNodeCollection images = documentNode.SelectNodes("//img");
    if (images == null)
    {
        return;
    }

    foreach (HtmlNode image in images)
    {
        string placeholderUrl = image.GetAttributeValue(PlaceholderAttribute, "");
        if (string.IsNullOrEmpty(placeholderUrl))
        {
            // No placeholder on this image; leave it untouched.
            continue;
        }

        string resolved = HTMLUtil.GetFullURL(InternalRealUrl, placeholderUrl);
        image.SetAttributeValue("src", resolved);
        image.Attributes.Remove(PlaceholderAttribute);
    }
}
/// <summary>
/// Parses a detail page of a paid chapter: extracts the chapter markup and derives
/// length and price from it.
/// </summary>
/// <param name="documentNode">Root node of the parsed detail page.</param>
/// <returns>
/// A table with the chapter fields; it is empty when no <c>div</c> with class
/// <c>content</c> is present.
/// </returns>
private Hashtable parseDetailPage2(HtmlNode documentNode)
{
    Hashtable returndata = new Hashtable();

    HtmlNode contentNode = documentNode.SelectSingleNode("//div[@class='content']");
    if (contentNode == null)
    {
        return returndata;
    }

    // Remove style/script elements BEFORE stripping tags, as the sibling parsers do.
    // NOTE(review): this assumes RemoveHtmlTag strips markup but keeps inner text; if so,
    // the original order left nothing for RemoveHtmlContent to find and let CSS/JS text
    // into the chapter — confirm against HTMLUtil.
    string content = HTMLUtil.RemoveHtmlContent(contentNode.InnerHtml, "style", "script");
    content = HTMLUtil.RemoveHtmlTag(content, "p", "img", "br");
    returndata.Add(CollectionFieldName.Chap_Content, content);

    // Length and price are only reported when the node has any text at all.
    if (!string.IsNullOrEmpty(contentNode.InnerText))
    {
        // 5 per full 1000 characters of the stored markup, clamped to the range 5..15.
        int price = (content.Length / 1000) * 5;
        if (price == 0)
        {
            price = 5;
        }
        if (price > 15)
        {
            price = 15;
        }

        returndata.Add(CollectionFieldName.Chap_ContentLen, content.Length);
        returndata.Add(CollectionFieldName.Chap_Pirce, price);
    }

    returndata.Add(CollectionFieldName.Chap_Status, ChapterStatus.ChapterStatus_OnLine);
    returndata.Add(CollectionFieldName.Chap_ChapterType, ChapterType.ChapterType_Pay);
    return returndata;
}
/// <summary>
/// Parses a detail page of a free chapter: extracts the chapter markup and derives
/// length and intro from the node text.
/// </summary>
/// <param name="documentNode">Root node of the parsed detail page.</param>
/// <returns>
/// A table with the chapter fields; it is empty when no <c>div</c> with class
/// <c>cDetail</c> is present.
/// </returns>
private Hashtable parseDetailPage1(HtmlNode documentNode)
{
    Hashtable returndata = new Hashtable();

    HtmlNode contentNode = documentNode.SelectSingleNode("//div[@class='cDetail']");
    if (contentNode == null)
    {
        return returndata;
    }

    // Remove style/script elements BEFORE stripping tags, as the sibling parsers do.
    // NOTE(review): this assumes RemoveHtmlTag strips markup but keeps inner text; if so,
    // the original order left nothing for RemoveHtmlContent to find and let CSS/JS text
    // into the chapter — confirm against HTMLUtil.
    string content = HTMLUtil.RemoveHtmlContent(contentNode.InnerHtml, "style", "script");
    content = HTMLUtil.RemoveHtmlTag(content, "p", "img", "br");
    returndata.Add(CollectionFieldName.Chap_Content, content);

    string plainText = contentNode.InnerText;
    if (!string.IsNullOrEmpty(plainText))
    {
        returndata.Add(CollectionFieldName.Chap_ContentLen, plainText.Length);

        // Intro is the first 40 characters, with an ellipsis when truncated.
        string intro = plainText.Length > 40
            ? plainText.Substring(0, 40) + "..."
            : plainText;
        returndata.Add(CollectionFieldName.Chap_Intro, intro);
    }

    returndata.Add(CollectionFieldName.Chap_Status, ChapterStatus.ChapterStatus_OnLine);
    returndata.Add(CollectionFieldName.Chap_ChapterType, ChapterType.ChapterType_Free);
    return returndata;
}
/// <summary>
/// Parses a detail page: extracts the chapter markup, lower-cases it, removes site
/// watermarks and domain names, and derives length, intro and price from it.
/// </summary>
/// <param name="documentNode">Root node of the parsed detail page.</param>
/// <returns>
/// A table with the chapter fields; it is empty when no <c>div</c> with class
/// <c>articleCon</c> is present.
/// </returns>
private Hashtable parseDetailPage1(HtmlNode documentNode)
{
    Hashtable returndata = new Hashtable();

    HtmlNode contentNode = documentNode.SelectSingleNode("//div[@class='articleCon']");
    if (contentNode == null)
    {
        return returndata;
    }

    string content = HTMLUtil.RemoveHtmlContent(contentNode.InnerHtml, "div", "style", "script", "dt", "a");

    // Everything below works on lower-cased text, so every literal must be lower-case too.
    content = content.ToLower();

    // Longer watermarks go first so that the shorter replacements cannot break them up.
    // The two mixed-case literals of the original ("紫You阁 WwW.ZiyouGE.com",
    // "WWw.ZiyoUgE.com") could never match after ToLower(); they are lower-cased here.
    content = content.Replace("\r\n", "")
                     .Replace("\t", "")
                     .Replace("本站访问地址http://www.ziyouge.com 任意搜索引擎内输入:紫幽阁 即可访问!", "")
                     .Replace("紫you阁 www.ziyouge.com", "")
                     .Replace("http://www.ziyouge.com", "")
                     .Replace("www.ziyouge.com", "")
                     .Replace("紫幽阁", "")
                     .Replace("wanben.me", "")
                     .Replace("ziyouge.com", "")
                     .Replace("ziyouge", "")
                     .Replace("http://", "")
                     .Replace("http", "")
                     .Replace("品书网", "")
                     .Replace("www.vodtw.com", "")
                     .Replace("本书来自", "")
                     .Replace("/html/book/19/19092/", "");

    // Remove domain names with a regex.
    // NOTE(review): the lookahead (?=.{3,255}$) comes from a whole-string validator; used
    // in a search it only lets matches start within the last 255 characters before the
    // end of the text. Left unchanged because dropping it would also strip dotted tokens
    // such as "1.5" everywhere — decide on the intended scope.
    string domainPattern = @"(?=.{3,255}$)[a-zA-Z0-9][-a-zA-Z0-9]{0,62}(\.[a-zA-Z0-9][-a-zA-Z0-9]{0,62})+";
    content = Regex.Replace(content, domainPattern, "");
    returndata.Add(CollectionFieldName.Chap_Content, content);

    // Strip markup and filler so that the length reflects the readable text.
    string plainText = HTMLUtil.RemoveHtmlTag(content)
        .Replace(" ", "")
        .Replace("feisuz", "")
        .Replace("作者的话:", "")
        .Replace("新书,求收藏求推荐", "")
        .Replace("本书红薯网首发,请勿转载!", "");

    if (!string.IsNullOrEmpty(plainText))
    {
        returndata.Add(CollectionFieldName.Chap_ContentLen, plainText.Length);

        // Intro is the first 40 characters, with an ellipsis when truncated.
        string intro = plainText.Length > 40
            ? plainText.Substring(0, 40) + "..."
            : plainText;
        returndata.Add(CollectionFieldName.Chap_Intro, intro);

        // 5 per full 1000 characters of the stored markup, clamped to the range 5..15.
        int price = (content.Length / 1000) * 5;
        if (price == 0)
        {
            price = 5;
        }
        if (price > 15)
        {
            price = 15;
        }
        returndata.Add(CollectionFieldName.Chap_Pirce, price);
    }

    returndata.Add(CollectionFieldName.Chap_Status, ChapterStatus.ChapterStatus_OnLine);
    returndata.Add(CollectionFieldName.Chap_ChapterType, ChapterType.ChapterType_Free);
    return returndata;
}
/// <summary>
/// Parses the start page of a book: title, cover image, category tag, status, word
/// count, catalog URL, introduction and the numeric id taken from the URL.
/// </summary>
/// <param name="documentNode">Root node of the parsed start page.</param>
/// <returns>
/// A table whose <c>CollectionFieldName.BookInfo</c> entry holds the table of book fields.
/// </returns>
private Hashtable parseStartupPage(HtmlNode documentNode)
{
    Hashtable ht = new Hashtable();

    HtmlNode titleNode = documentNode.SelectSingleNode("//span[@class='book_name']");
    if (titleNode != null)
    {
        ht.Add(CollectionFieldName.Novel_Name, titleNode.InnerText);
    }

    HtmlNode infoNode = documentNode.SelectSingleNode("//div[@class='books_bar clear']");
    if (infoNode != null)
    {
        string imgUrl = infoNode.SelectSingleNode("div[@class='lr_list']/img")?.GetAttributeValue("src", "");
        ht.Add(CollectionFieldName.Novel_CoverImgs, imgUrl);

        // The list item may be missing; the original dereferenced the null text and
        // threw NullReferenceException. A null tag is now stored, like a missing cover.
        string tag = infoNode.SelectSingleNode("//ul[@class='book_list']/li[4]")?.InnerText;
        ht.Add(CollectionFieldName.Novel_Tag, tag?.Replace("类别:", ""));

        string statusName = infoNode.SelectSingleNode("//ul[@class='book_list']/li[3]")?.InnerText;
        bool finished = !string.IsNullOrEmpty(statusName) && statusName.Contains("完结");
        if (finished)
        {
            ht.Add(CollectionFieldName.Novel_Status, BookStatus.BookStatus_Finish);
        }
        else
        {
            ht.Add(CollectionFieldName.Novel_Status, BookStatus.BookStatus_Update);
        }

        string ContentLenStr = infoNode.SelectSingleNode("//ul[@class='book_list']/li[2]")?.InnerText;
        if (!string.IsNullOrEmpty(ContentLenStr))
        {
            int ContentLen = 0;

            // NOTE(review): the '.' in the first alternative is unescaped, so it also accepts
            // other separators such as ','. Pattern kept as is — confirm the site's format.
            Match numberMatch = Regex.Match(ContentLenStr, @"(\d+.\d+)|(\d+)");

            // Parse culture-independently and without throwing: the original used the
            // current culture and threw FormatException for a decimal without a unit.
            double number;
            if (numberMatch.Success
                && double.TryParse(
                    numberMatch.Value,
                    System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
                    System.Globalization.CultureInfo.InvariantCulture,
                    out number))
            {
                if (ContentLenStr.Contains("万"))
                {
                    // 万 = ten thousand.
                    ContentLen = (int)(number * 10000);
                }
                else if (ContentLenStr.Contains("千"))
                {
                    // 千 = thousand.
                    ContentLen = (int)(number * 1000);
                }
                else
                {
                    ContentLen = (int)number;
                }
            }

            ht.Add(CollectionFieldName.Novel_ContentLen, ContentLen);
        }
    }

    HtmlNode catalogLink = documentNode.SelectSingleNode("//a[@class='more_link']");
    if (catalogLink != null)
    {
        string url = HTMLUtil.GetFullURL(InternalRealUrl, catalogLink.GetAttributeValue("href", ""));
        ht.Add(CollectionFieldName.Url, url);
    }

    HtmlNode descriptionNode = documentNode.SelectSingleNode("//div[@id='divDescription']");
    if (descriptionNode != null)
    {
        ht.Add(CollectionFieldName.Novel_Intr, descriptionNode.InnerHtml);
    }

    // The book id is the number in ".../book/<id>.html". The dot is escaped so that it
    // matches a literal dot only, and the capture group is read directly.
    Match idMatch = Regex.Match(InternalRealUrl, @"/book/(\d+)\.html");
    if (idMatch.Success)
    {
        ht.Add(CollectionFieldName.Novel_UniqueFlag, idMatch.Groups[1].Value);
    }

    Hashtable returndata = new Hashtable();
    returndata.Add(CollectionFieldName.BookInfo, ht);
    return returndata;
}
/// <summary>
/// Parses a detail page: extracts the chapter markup, removes site watermarks, and
/// derives length, intro and price from it.
/// </summary>
/// <param name="documentNode">Root node of the parsed detail page.</param>
/// <returns>
/// A table with the chapter fields; it is empty when the <c>div</c> with id
/// <c>content</c> is not present.
/// </returns>
private Hashtable parseDetailPage1(HtmlNode documentNode)
{
    Hashtable returndata = new Hashtable();

    HtmlNode contentNode = documentNode.SelectSingleNode("//div[@id=\"content\"]");
    if (contentNode == null)
    {
        return returndata;
    }

    string content = HTMLUtil.RemoveHtmlContent(contentNode.InnerHtml, "div", "style", "script", "a");

    // NOTE(review): the Trim().TrimEnd(')') in the middle of the chain trims the whole
    // text, so a chapter that legitimately ends with ')' loses it. Presumably it is meant
    // to drop the closing bracket of the site banner — confirm before changing.
    content = content.Replace("\r\n", "")
                     .Replace("\t", "")
                     .Replace("手机请访问::feisuz", "")
                     .Replace("feisuz", "")
                     .Replace("作者的话:", "")
                     .Replace("新书,求收藏求推荐", "")
                     .Replace("本书红薯网首发,请勿转载!", "")
                     .Replace("老铁!还在找\"美艳冥妻\"免费小说?", "")
                     .Replace(" (www.yikanxiaoshuo.com = ", "").Trim().TrimEnd(')')
                     .Replace("<br> <br> 百度直接搜索: \"易看小说\" 看免费小说,没毛病!<br>", "")
                     .Replace("老铁!还在找\"绝望游戏\"免费小说?", "")
                     .Replace("百度直接搜索: \"易看小说\" 看免费小说,没毛病!", "")
                     .Replace("\"易看小说\"", "")
                     .Replace("易看小说", "")
                     .Replace("免费小说", "")
                     .Replace("(更快免费阅读加微信:jxxs9966)", "")
                     .Replace("jxxs9966", "");
    returndata.Add(CollectionFieldName.Chap_Content, content);

    // Strip markup and spaces so that the length reflects the readable text.
    string plainText = HTMLUtil.RemoveHtmlTag(content).Replace(" ", "");
    if (!string.IsNullOrEmpty(plainText))
    {
        returndata.Add(CollectionFieldName.Chap_ContentLen, plainText.Length);

        // Intro is the first 40 characters, with an ellipsis when truncated.
        string intro = plainText.Length > 40
            ? plainText.Substring(0, 40) + "..."
            : plainText;
        returndata.Add(CollectionFieldName.Chap_Intro, intro);

        // 5 per full 1000 characters of the stored markup, clamped to the range 5..15.
        int price = (content.Length / 1000) * 5;
        if (price == 0)
        {
            price = 5;
        }
        if (price > 15)
        {
            price = 15;
        }
        returndata.Add(CollectionFieldName.Chap_Pirce, price);
    }

    returndata.Add(CollectionFieldName.Chap_Status, ChapterStatus.ChapterStatus_OnLine);
    returndata.Add(CollectionFieldName.Chap_ChapterType, ChapterType.ChapterType_Free);
    return returndata;
}
/// <summary>
/// Parses a detail page: extracts the chapter text, lower-cases it, removes site
/// watermarks, domain names and all tags, and derives length, intro and price from it.
/// </summary>
/// <param name="documentNode">Root node of the parsed detail page.</param>
/// <returns>
/// A table with the chapter fields; it is empty when the <c>div</c> with id
/// <c>booktext</c> is not present.
/// </returns>
private Hashtable parseDetailPage1(HtmlNode documentNode)
{
    Hashtable returndata = new Hashtable();

    HtmlNode contentNode = documentNode.SelectSingleNode("//div[@id='booktext']");
    if (contentNode == null)
    {
        return returndata;
    }

    string content = HTMLUtil.RemoveHtmlContent(contentNode.InnerHtml, "div", "style", "script", "dt", "a");
    content = content.ToLower();
    content = content.Replace("\r\n", "").Replace("\t", "");

    // Site banners, in both their spaced and compact spellings, then the leftovers.
    content = content.Replace("【快速查找本站请百度搜索: 书包520】", "")
                     .Replace("【本站域名更改为“ www.shubao520.net ” ,或者在百度搜索: 书包520】", "");
    content = content.Replace("【本站域名更改为“www.shubao520.net”,或者在百度搜索:书包520】", "")
                     .Replace("【快速查找本站请百度搜索:书包520】", "")
                     .Replace("书包520", "")
                     .Replace("www.shubao520.net", "")
                     .Replace("百度", "")
                     .Replace("搜索", "")
                     .Replace("域名", "");

    // Remove domain names with a regex.
    // NOTE(review): the lookahead (?=.{3,255}$) comes from a whole-string validator; used
    // in a search it only lets matches start within the last 255 characters before the
    // end of the text. Left unchanged — decide on the intended scope.
    string domainPattern = @"(?=.{3,255}$)[a-zA-Z0-9][-a-zA-Z0-9]{0,62}(\.[a-zA-Z0-9][-a-zA-Z0-9]{0,62})+";
    content = Regex.Replace(content, domainPattern, "");

    // Strip the remaining tags one at a time. The original greedy "<.+>" ran from the
    // first '<' to the last '>' on a line and so deleted the chapter text between tags.
    content = Regex.Replace(content, @"<[^>]+>", "");
    returndata.Add(CollectionFieldName.Chap_Content, content);

    // Strip filler so that the length reflects the readable text.
    string plainText = HTMLUtil.RemoveHtmlTag(content)
        .Replace(" ", "")
        .Replace("feisuz", "")
        .Replace("作者的话:", "")
        .Replace("新书,求收藏求推荐", "")
        .Replace("本书红薯网首发,请勿转载!", "");

    if (!string.IsNullOrEmpty(plainText))
    {
        returndata.Add(CollectionFieldName.Chap_ContentLen, plainText.Length);

        // Intro is the first 40 characters, with an ellipsis when truncated.
        string intro = plainText.Length > 40
            ? plainText.Substring(0, 40) + "..."
            : plainText;
        returndata.Add(CollectionFieldName.Chap_Intro, intro);

        // 5 per full 1000 characters of readable text, clamped to the range 5..15.
        int price = (plainText.Length / 1000) * 5;
        if (price == 0)
        {
            price = 5;
        }
        if (price > 15)
        {
            price = 15;
        }
        returndata.Add(CollectionFieldName.Chap_Pirce, price);
    }

    returndata.Add(CollectionFieldName.Chap_Status, ChapterStatus.ChapterStatus_OnLine);
    returndata.Add(CollectionFieldName.Chap_ChapterType, ChapterType.ChapterType_Free);
    return returndata;
}
/// <summary>
/// Searches Allmusic for the requested string and, when the site answers with a
/// result list, collects the entries found on it.
/// </summary>
/// <param name="searchBy">Whether to search for albums or artists.</param>
/// <param name="searchStr">The text to search for.</param>
/// <returns>
/// <c>false</c> when the request returned no content; otherwise <c>true</c>. On a
/// result-list page the entries are stored in the code/value lists (artist search) or
/// the album list (album search) and the "multiple" flag is set.
/// </returns>
public bool FindInfo(SearchBy searchBy, string searchStr)
{
    _searchby = searchBy;
    HTMLUtil util = new HTMLUtil();

    string strPostData;
    if (SearchBy.Albums == searchBy)
    {
        strPostData = string.Format(ALBUMSEARCH, HttpUtility.UrlEncode(searchStr));
    }
    else
    {
        searchStr = SwitchArtist(searchStr);
        strPostData = string.Format(ARTISTSEARCH, HttpUtility.UrlEncode(searchStr));
    }

    string strHTML = PostHTTP(MAINURL + URLPROGRAM, strPostData);

    // Treat a null response like an empty one instead of throwing on .Length.
    if (string.IsNullOrEmpty(strHTML))
    {
        return false;
    }

    _htmlCode = strHTML; // save the html content...

    const RegexOptions options = RegexOptions.IgnoreCase
                                 | RegexOptions.Multiline
                                 | RegexOptions.IgnorePatternWhitespace
                                 | RegexOptions.Compiled;

    Regex multiples = new Regex(@"\sSearch\sResults\sfor:", options);
    if (!multiples.IsMatch(strHTML))
    {
        // found the right one
        return true;
    }

    // Compare the enum directly instead of going through its string name.
    string pattern = "";
    if (searchBy == SearchBy.Artists)
    {
        pattern = @"<tr.*?>\s*?.*?<td\s*?class=""relevance\stext-center"">\s*?.*\s*?.*</td>" +
                  @"\s*?.*<td.*\s*?.*</td>\s*?.*<td>.*<a.*href=""(?<code>.*?)"">(?<name>.*)</a>.*</td>" +
                  @"\s*?.*<td>(?<detail>.*)</td>\s*?.*<td>(?<detail2>.*)</td>";
    }
    else if (searchBy == SearchBy.Albums)
    {
        pattern = @"<tr.*?>\s*?.*?<td\s*?class=""relevance\stext-center"">\s*?.*\s*?.*</td>" +
                  @"\s*?.*<td.*\s*?.*</td>\s*?.*<td>.*<a.*href=""(?<code>.*?)"">(?<name>.*)</a>.*</td>" +
                  @"\s*?.*<td>(?<detail>.*)</td>\s*?.*<td>.*</td>\s*?.*<td>(?<detail2>.*)</td>";
    }

    // An empty pattern matches at every position and would add one blank entry per
    // character, so the scan only runs for a known search type.
    if (pattern.Length > 0)
    {
        Regex itemsFoundFromSite = new Regex(pattern, options);
        for (Match m = itemsFoundFromSite.Match(strHTML); m.Success; m = m.NextMatch())
        {
            string code = m.Groups["code"].ToString();
            string name = m.Groups["name"].ToString();
            string detail = m.Groups["detail"].ToString();
            string detail2 = m.Groups["detail2"].ToString();

            util.RemoveTags(ref name);
            util.ConvertHTMLToAnsi(name, out name);

            util.RemoveTags(ref detail);
            util.ConvertHTMLToAnsi(detail, out detail);

            util.RemoveTags(ref detail2);
            util.ConvertHTMLToAnsi(detail2, out detail2);

            if (SearchBy.Artists == searchBy)
            {
                // Append only the details that exist. The original appended " - " first
                // and then tested the length, so the name-only branch was unreachable
                // and empty details left dangling separators.
                string label = name;
                if (!string.IsNullOrEmpty(detail))
                {
                    label += " - " + detail;
                }
                if (!string.IsNullOrEmpty(detail2))
                {
                    label += " - " + detail2;
                }

                _codes.Add(code);
                _values.Add(label);
            }
            else
            {
                MusicAlbumInfo albumInfo = new MusicAlbumInfo();
                albumInfo.AlbumURL = code;
                albumInfo.Artist = detail;
                albumInfo.Title = name;
                albumInfo.DateOfRelease = detail2;
                _albumList.Add(albumInfo);
            }
        }
    }

    _multiple = true;
    return true;
}
/// <summary>
/// Replaces line breaks with the HTML line-break tag, cleans the text and substitutes
/// HTML tags with their ASCII equivalent. The work is delegated to <c>HTMLUtil.sanitize</c>.
/// </summary>
/// <param name="texto">HTML text.</param>
/// <returns>The value returned by <c>HTMLUtil.sanitize</c> for <paramref name="texto"/>.</returns>
public static string sanitize(string texto)
{
    string sanitized = HTMLUtil.sanitize(texto);
    return sanitized;
}
/// <summary>
/// Parses the start page of a book: title, cover image, category tag, page URL and
/// the numeric id taken from the URL.
/// </summary>
/// <param name="documentNode">Root node of the parsed start page.</param>
/// <returns>
/// A table whose <c>CollectionFieldName.BookInfo</c> entry holds the table of book fields.
/// </returns>
private Hashtable parseStartupPage(HtmlNode documentNode)
{
    Hashtable ht = new Hashtable();

    HtmlNode titleNode = documentNode.SelectSingleNode("//meta[@property='og:title']");
    if (titleNode != null)
    {
        ht.Add(CollectionFieldName.Novel_Name, titleNode.GetAttributeValue("content", ""));
    }

    HtmlNode coverNode = documentNode.SelectSingleNode("//div[@class='pic text-center']/img");
    if (coverNode != null)
    {
        string imgUrl = HTMLUtil.GetFullURL(InternalRealUrl, coverNode.GetAttributeValue("src", ""));
        ht.Add(CollectionFieldName.Novel_CoverImgs, imgUrl);
    }

    HtmlNode categoryNode = documentNode.SelectSingleNode("//meta[@property='og:novel:category']");
    if (categoryNode != null)
    {
        ht.Add(CollectionFieldName.Novel_Tag, categoryNode.GetAttributeValue("content", ""));
    }

    // Status, word count and introduction are not collected by this parser; the
    // start page URL itself is used as the book URL.
    ht.Add(CollectionFieldName.Url, InternalRealUrl);

    // The book id is the number right before ".html". The dot is escaped so that it
    // matches a literal dot only, and the capture group is read directly.
    Match idMatch = Regex.Match(InternalRealUrl, @"/(\d+)\.html");
    if (idMatch.Success)
    {
        ht.Add(CollectionFieldName.Novel_UniqueFlag, idMatch.Groups[1].Value);
    }

    Hashtable returndata = new Hashtable();
    returndata.Add(CollectionFieldName.BookInfo, ht);
    return returndata;
}
/// <summary>
/// Gets the complete URL for an address by passing it, together with the base URL of
/// this instance, to <c>HTMLUtil.GetFullURL</c>.
/// </summary>
/// <param name="relativeURL">The address to complete.</param>
/// <returns>The value returned by <c>HTMLUtil.GetFullURL</c>.</returns>
protected string GetFullURL(string relativeURL)
{
    string fullUrl = HTMLUtil.GetFullURL(baseUrl, relativeURL);
    return fullUrl;
}
/// <summary>
/// Parse the artist detail page returned from the Allmusic scraper and fill the
/// artist fields (name, born, years active, genres, styles, moods, instruments,
/// picture URL, biography and the discography lists).
/// </summary>
/// <param name="strHTML">HTML of the artist detail page.</param>
/// <returns>
/// <c>false</c> when the artist name cannot be found in the page; otherwise <c>true</c>.
/// All other fields are optional and are left untouched when their section is missing.
/// </returns>
public bool Parse(string strHTML)
{
    HTMLUtil util = new HTMLUtil();
    string strHTMLLow = strHTML.ToLower();

    // Artist name - the only mandatory field.
    if (!FindPattern(@"<h1.*class=""title"">(.*)</h1>", strHTML))
    {
        return false;
    }
    _strArtistName = _match.Groups[1].Value;

    // Born
    if (FindPattern(@"<h3>.*Born.*</h3>\s*?<p>(.*)</p>", strHTML))
    {
        string strValue = _match.Groups[1].Value;
        util.RemoveTags(ref strValue);
        util.ConvertHTMLToAnsi(strValue, out _strBorn);
        _strBorn = _strBorn.Trim();
    }

    // Years Active. Built in a local so that parsing a second page does not append
    // to the value left over from a previous call.
    if (FindPattern(@"(<span.*?class=""active"">(.*?)</span>)", strHTML))
    {
        string years = "";
        while (_match.Success)
        {
            years += string.Format("{0}s, ", _match.Groups[2].Value);
            _match = _match.NextMatch();
        }
        _strYearsActive = years.Trim(new[] { ' ', ',' });
    }

    // Genre
    if (FindPattern(@"<div.*?id=""genre-style"">\s*?.*?\s*?<h3>.*?Genres.*?</h3>\s*?.*?(<p>(.*?)</p>)", strHTML))
    {
        _strGenres = JoinArtistMatches(util);
    }

    // Style / Mood / Instruments are <li> lists between a heading and an end marker.
    // Each section uses the <li> pattern explicitly; previously the Instruments section
    // reused whatever pattern an earlier section had left behind.
    string section = ReadArtistListSection(strHTML, strHTMLLow, "<h3>styles</h3>", "<!--end genre/styles-->", util);
    if (section != null)
    {
        _strStyles = section;
    }

    section = ReadArtistListSection(strHTML, strHTMLLow, "<h3>moods</h3>", "</div>", util);
    if (section != null)
    {
        _strTones = section;
    }

    section = ReadArtistListSection(strHTML, strHTMLLow, "<h3>instruments</h3>", "</div>", util);
    if (section != null)
    {
        _strInstruments = section;
    }

    // Picture URL
    if (FindPattern(@"<div.*?class=""image"">\s*?.*<img.*id=""artist_image"".*?src=\""(.*?)\""", strHTML))
    {
        _strArtistPictureURL = _match.Groups[1].Value;
    }

    // AMG biography lives on a linked sub page.
    if (FindPattern(@"<td.*?class=""tab_off""><a.*?href=""(.*?)"">.*?Biography.*?</a>", strHTML))
    {
        try
        {
            string contentinfo = AllmusicSiteScraper.GetHTTP(_match.Groups[1].Value);
            int begIndex = contentinfo.IndexOf("<!--Begin Biography -->");
            int endIndex = contentinfo.IndexOf("</div>", begIndex + 2);
            if (begIndex != -1 && endIndex != -1 &&
                FindPattern(@"<p.*?class=""text"">(.*?)</p>", contentinfo))
            {
                string data = _match.Groups[1].Value;
                util.RemoveTags(ref data);
                util.ConvertHTMLToAnsi(data, out data);
                _strAMGBiography = data.Trim();
            }
        }
        catch (Exception)
        {
            // Best effort: a failing biography sub page must not fail the whole parse.
        }
    }

    // Discography: the main page lists albums, the other lists are sub pages of it.
    if (FindPattern(@"<td.*class=""tab_off""><a.*?href=""(.*?)"">.*Discography.*</a>", strHTML))
    {
        string discographyUrl = _match.Groups[1].Value;

        LoadArtistDiscographyPage(discographyUrl, util, info => _discographyAlbum.Add(info));
        LoadArtistDiscographyPage(discographyUrl + "/compilations", util, info => _discographyCompilations.Add(info));
        LoadArtistDiscographyPage(discographyUrl + "/singles-eps", util, info => _discographySingles.Add(info));
        // DVDs/videos and "other" both end up in the misc list, as before.
        LoadArtistDiscographyPage(discographyUrl + "/dvds-videos", util, info => _discographyMisc.Add(info));
        LoadArtistDiscographyPage(discographyUrl + "/other", util, info => _discographyMisc.Add(info));
    }

    _bLoaded = true;
    return _bLoaded;
}

/// <summary>
/// Joins group 2 of the current match and all following matches into a comma
/// separated list, strips tags and converts HTML entities.
/// Expects <c>_match</c> to hold a successful match (set by <c>FindPattern</c>).
/// </summary>
private string JoinArtistMatches(HTMLUtil util)
{
    string data = "";
    while (_match.Success)
    {
        data += string.Format("{0}, ", _match.Groups[2].Value);
        _match = _match.NextMatch();
    }

    string converted;
    util.RemoveTags(ref data);
    util.ConvertHTMLToAnsi(data, out converted);
    return converted.Trim(new[] { ' ', ',' });
}

/// <summary>
/// Reads the &lt;li&gt; items found between <paramref name="startMarker"/> and
/// <paramref name="endMarker"/>. Markers are searched in the lower-cased page,
/// the items are taken from the original page.
/// </summary>
/// <returns>The comma separated items, or <c>null</c> when the section or its items are missing.</returns>
private string ReadArtistListSection(string html, string htmlLower, string startMarker, string endMarker, HTMLUtil util)
{
    int begIndex = htmlLower.IndexOf(startMarker, StringComparison.Ordinal);
    if (begIndex == -1)
    {
        return null;
    }

    int endIndex = htmlLower.IndexOf(endMarker, begIndex + 2, StringComparison.Ordinal);
    if (endIndex == -1)
    {
        return null;
    }

    string contentInfo = html.Substring(begIndex, endIndex - begIndex);
    return FindPattern(@"(<li>(.*?)</li>)", contentInfo) ? JoinArtistMatches(util) : null;
}

/// <summary>
/// Downloads one discography page and passes every row found on it to
/// <paramref name="addEntry"/> as { year, album title, label }.
/// Download and parsing errors are ignored so one broken page does not stop the others.
/// </summary>
private void LoadArtistDiscographyPage(string pageUrl, HTMLUtil util, System.Action<string[]> addEntry)
{
    const string rowPattern =
        @"sorted.*? cell"">(?<year>.*?)</td>\s*?.*?</td>\s*.*?<a.*?"">(?<album>.*?)" +
        @"</a>.*?</td>\s*.*?</td>\s*.*?"">(?<label>.*?)</td>";

    try
    {
        string contentinfo = AllmusicSiteScraper.GetHTTP(pageUrl);
        if (!FindPattern(rowPattern, contentinfo))
        {
            return;
        }

        while (_match.Success)
        {
            string year = _match.Groups["year"].Value;
            string albumTitle = _match.Groups["album"].Value;
            string label = _match.Groups["label"].Value;

            util.RemoveTags(ref year);
            util.ConvertHTMLToAnsi(year, out year);
            util.RemoveTags(ref albumTitle);
            util.ConvertHTMLToAnsi(albumTitle, out albumTitle);
            util.RemoveTags(ref label);
            util.ConvertHTMLToAnsi(label, out label);

            try
            {
                addEntry(new[] { year.Trim(), albumTitle.Trim(), label.Trim() });
            }
            catch (Exception)
            {
                // Skip a row that cannot be stored and continue with the next one.
            }

            _match = _match.NextMatch();
        }
    }
    catch (Exception)
    {
        // Best effort: the page is optional.
    }
}
/// <summary>
/// Searches allmusic.com for an album and fills <c>_albumList</c> with the results.
/// When the response already is an album page, that single album is the result;
/// otherwise the rows of the result table are converted into album entries and sorted.
/// </summary>
/// <param name="strAlbum">Album title to search for.</param>
/// <param name="artistName">Artist name, passed to the result sorter.</param>
/// <param name="releaseYear">Release year, passed to the result sorter.</param>
/// <returns><c>true</c> when an album page or a result table was found; otherwise <c>false</c>.</returns>
public bool FindAlbuminfo(string strAlbum, string artistName, int releaseYear)
{
    const string searchUrl = "http://www.allmusic.com/cg/amg.dll";
    const string hrefMarker = "<a href=\"";

    _albumList.Clear();

    // Request has the form
    // http://www.allmusic.com/cg/amg.dll?P=amg&SQL=escapolygy&OPT1=2
    HTMLUtil util = new HTMLUtil();
    string postData = String.Format("P=amg&SQL={0}&OPT1=2", HttpUtility.UrlEncode(strAlbum));
    string html = PostHTTP(searchUrl, postData);
    if (html.Length == 0)
    {
        return false;
    }

    // The response may already be the album page itself.
    MusicAlbumInfo directHit = new MusicAlbumInfo();
    directHit.AlbumURL = "http://www.allmusic.com/cg/amg.dll?" + postData;
    if (directHit.Parse(html))
    {
        _albumList.Add(directHit);
        return true;
    }

    // Otherwise locate the <table> that carries the result list.
    string htmlLow = html.ToLower();
    int tableStart = htmlLow.IndexOf("id=\"expansiontable1\"");
    if (tableStart < 0)
    {
        return false;
    }
    tableStart = htmlLow.LastIndexOf("<table", tableStart);
    if (tableStart < 0)
    {
        return false;
    }

    HTMLTable table = new HTMLTable();
    table.Parse(html.Substring(tableStart));

    // Row 0 is skipped.
    for (int rowIndex = 1; rowIndex < table.Rows; ++rowIndex)
    {
        HTMLTable.HTMLRow row = table.GetRow(rowIndex);
        string displayName = "";
        string albumTitle = "";
        string artist = "";

        for (int col = 0; col < row.Columns; ++col)
        {
            string cell = row.GetColumValue(col);
            switch (col)
            {
                case 1:
                    if (cell.Length != 0)
                    {
                        displayName = "(" + cell + ")";
                    }
                    break;

                case 2:
                    artist = cell;
                    util.RemoveTags(ref artist);
                    if (!cell.Equals(" "))
                    {
                        displayName = "- " + artist + " " + displayName;
                    }
                    break;

                case 4:
                    string plainTitle = cell;
                    util.RemoveTags(ref plainTitle);
                    displayName = plainTitle + " " + displayName;
                    albumTitle = plainTitle;

                    int hrefStart = cell.IndexOf(hrefMarker);
                    if (hrefStart < 0)
                    {
                        break;
                    }
                    hrefStart += hrefMarker.Length;
                    int hrefEnd = cell.IndexOf("\">", hrefStart);
                    if (hrefEnd < 0)
                    {
                        break;
                    }

                    if (albumTitle.Length == 0)
                    {
                        albumTitle = displayName;
                    }

                    // full album url:
                    // http://www.allmusic.com/cg/amg.dll?p=amg&token=&sql=10:66jieal64xs7
                    string relativeUrl = cell.Substring(hrefStart, hrefEnd - hrefStart);
                    string albumUrl = "http://www.allmusic.com" + relativeUrl;

                    MusicAlbumInfo found = new MusicAlbumInfo();
                    string displayNameAnsi;
                    util.ConvertHTMLToAnsi(displayName, out displayNameAnsi);
                    found.Title2 = displayNameAnsi;
                    found.AlbumURL = util.ConvertHTMLToAnsi(albumUrl);
                    found.Artist = util.ConvertHTMLToAnsi(artist);
                    found.Title = util.ConvertHTMLToAnsi(albumTitle);
                    _albumList.Add(found);
                    break;
            }
        }
    }

    // now sort
    _albumList.Sort(new AlbumSort(strAlbum, artistName, releaseYear));
    return true;
}
/// <summary>
/// Parses an Allmusic album page and fills the album fields (cover URL, review,
/// artist, title, rating, release year, genre, styles, moods and the song list).
/// Fields without a value are set to the localized string 416 at the end.
/// </summary>
/// <param name="html">HTML of the album page.</param>
/// <returns>Always <c>true</c>.</returns>
public bool Parse(string html)
{
    _songs.Clear();
    HTMLUtil util = new HTMLUtil();
    string strHtmlLow = html.ToLower();

    // Cover URL
    if (FindPattern(@"<!--Begin.*?Album.*?Photo-->\s*?.*?<img.*?src=\""(.*?)\""", html))
    {
        _strImageURL = _match.Groups[1].Value;
    }

    // Review, stored on a linked sub page.
    if (FindPattern(@"<td.*?class=""tab_off""><a.*?href=""(.*?)"">.*?Review.*?</a>", html))
    {
        try
        {
            string contentinfo = AllmusicSiteScraper.GetHTTP(_match.Groups[1].Value);
            if (FindPattern(@"<p.*?class=""author"">.*\s*?.*?<p.*?class=""text"">(.*?)</p>", contentinfo))
            {
                string data = _match.Groups[1].Value;
                util.RemoveTags(ref data);
                util.ConvertHTMLToAnsi(data, out data);
                _strReview = data.Trim();
            }
        }
        catch (Exception)
        {
            // Best effort: a failing review page must not fail the whole parse.
        }
    }

    // Artist
    if (FindPattern(@"<h3.*?artist</h3>\s*?.*?<a.*"">(.*)</a>", html))
    {
        _artist = _match.Groups[1].Value;
        util.RemoveTags(ref _artist);
    }

    // Album title
    if (FindPattern(@"<h3.*?album</h3>\s*?.*?<p>(.*)</P>", html))
    {
        _strTitle = _match.Groups[1].Value;
        util.RemoveTags(ref _strTitle);
    }

    // Rating: a single digit at index 26 of the rating image URL. The length is
    // checked first, because Substring used to throw on a shorter URL.
    if (FindPattern(@"<h3.*?rating</h3>\s*?.*?src=""(.*?)""", html))
    {
        string strRating = _match.Groups[1].Value;
        util.RemoveTags(ref strRating);
        int rating;
        if (strRating != null && strRating.Length > 26 &&
            Int32.TryParse(strRating.Substring(26, 1), out rating))
        {
            _iRating = rating;
        }
    }

    // Release date: reduce something like "1998 (release)" or "12 feb 2003" to the year.
    if (FindPattern(@"<h3.*?release.*?date</h3>\s*?.*?<p>(.*)</P>", html))
    {
        _strDateOfRelease = _match.Groups[1].Value;
        util.RemoveTags(ref _strDateOfRelease);
        _strDateOfRelease = ExtractAlbumYear(_strDateOfRelease, "19");
        _strDateOfRelease = ExtractAlbumYear(_strDateOfRelease, "20");
    }

    // Genre / Styles / Moods are <li> lists that end at the next </div>.
    string section = ReadAlbumListSection(html, strHtmlLow, "<h3>genre</h3>", util);
    if (section != null)
    {
        _strGenre = section;
    }

    section = ReadAlbumListSection(html, strHtmlLow, "<h3>style</h3>", util);
    if (section != null)
    {
        _strStyles = section;
    }

    section = ReadAlbumListSection(html, strHtmlLow, "<h3>moods</h3>", util);
    if (section != null)
    {
        _strTones = section;
    }

    // Songs
    int begIndex = strHtmlLow.IndexOf("<!-- tracks table -->", StringComparison.Ordinal);
    int endIndex = begIndex == -1
                       ? -1
                       : strHtmlLow.IndexOf("<!-- end tracks table -->", begIndex + 2, StringComparison.Ordinal);
    if (begIndex != -1 && endIndex != -1)
    {
        string contentInfo = html.Substring(begIndex, endIndex - begIndex);
        string pattern = @"<tr.*class=""visible"".*?\s*?<td.*</td>\s*?.*<td.*</td>\s*?.*<td.*?>(?<track>.*)</td>" +
                         @"\s*?.*<td.*</td>\s*?.*<td.*?>(?<title>.*)</td>\s*?.*?<td.*?>\s*?.*</td>\s*?.*?<td.*?>(?<duration>.*)</td>";
        if (FindPattern(pattern, contentInfo))
        {
            while (_match.Success)
            {
                // Track number; stays 0 when it is not a number.
                int iTrack;
                Int32.TryParse(_match.Groups["track"].Value, out iTrack);

                // Song title
                string strTitle = _match.Groups["title"].Value;
                util.RemoveTags(ref strTitle);
                util.ConvertHTMLToAnsi(strTitle, out strTitle);

                // Duration "m:ss" in seconds. Seconds are only read when the minutes
                // are valid, matching the previous parse-and-swallow behaviour.
                int iDuration = 0;
                string strDuration = _match.Groups["duration"].Value;
                int iPos = strDuration.IndexOf(":");
                if (iPos >= 0)
                {
                    int iMin;
                    int iSec = 0;
                    if (Int32.TryParse(strDuration.Substring(0, iPos), out iMin))
                    {
                        Int32.TryParse(strDuration.Substring(iPos + 1), out iSec);
                    }
                    iDuration = iMin * 60 + iSec;
                }

                MusicSong newSong = new MusicSong();
                newSong.Track = iTrack;
                newSong.SongName = strTitle;
                newSong.Duration = iDuration;
                _songs.Add(newSong);

                _match = _match.NextMatch();
            }
        }
    }

    // Set to "Not available" if no value from web
    if (_artist.Length == 0)
    {
        _artist = GUILocalizeStrings.Get(416);
    }
    if (_strDateOfRelease.Length == 0)
    {
        _strDateOfRelease = GUILocalizeStrings.Get(416);
    }
    if (_strGenre.Length == 0)
    {
        _strGenre = GUILocalizeStrings.Get(416);
    }
    if (_strTones.Length == 0)
    {
        _strTones = GUILocalizeStrings.Get(416);
    }
    if (_strStyles.Length == 0)
    {
        _strStyles = GUILocalizeStrings.Get(416);
    }
    if (_strTitle.Length == 0)
    {
        _strTitle = GUILocalizeStrings.Get(416);
    }
    if (_strTitle2.Length == 0)
    {
        _strTitle2 = _strTitle;
    }

    Loaded = true;
    return true;
}

/// <summary>
/// Returns the first four-digit year starting with <paramref name="centuryPrefix"/>
/// ("19" or "20") found at the first or second occurrence of the prefix, or the
/// unchanged text when neither occurrence is followed by two digits.
/// </summary>
private static string ExtractAlbumYear(string text, string centuryPrefix)
{
    int pos = text.IndexOf(centuryPrefix, StringComparison.Ordinal);
    for (int attempt = 0; attempt < 2 && pos > -1; attempt++)
    {
        // Strictly greater: index pos + 3 must exist before it is read.
        if (text.Length > pos + 3 && Char.IsDigit(text[pos + 2]) && Char.IsDigit(text[pos + 3]))
        {
            return text.Substring(pos, 4);
        }
        pos = text.IndexOf(centuryPrefix, pos + 2, StringComparison.Ordinal);
    }
    return text;
}

/// <summary>
/// Reads the &lt;li&gt; items between <paramref name="startMarker"/> and the next
/// "&lt;/div&gt;". Markers are searched in the lower-cased page, the items are taken
/// from the original page. Relies on <c>FindPattern</c> leaving its match in <c>_match</c>.
/// </summary>
/// <returns>The comma separated items, or <c>null</c> when the section or its items are missing.</returns>
private string ReadAlbumListSection(string html, string htmlLower, string startMarker, HTMLUtil util)
{
    int begIndex = htmlLower.IndexOf(startMarker, StringComparison.Ordinal);
    if (begIndex == -1)
    {
        return null;
    }

    int endIndex = htmlLower.IndexOf("</div>", begIndex + 2, StringComparison.Ordinal);
    if (endIndex == -1)
    {
        return null;
    }

    string contentInfo = html.Substring(begIndex, endIndex - begIndex);
    if (!FindPattern(@"(<li>(.*?)</li>)", contentInfo))
    {
        return null;
    }

    string data = "";
    while (_match.Success)
    {
        data += string.Format("{0}, ", _match.Groups[2].Value);
        _match = _match.NextMatch();
    }

    string converted;
    util.RemoveTags(ref data);
    util.ConvertHTMLToAnsi(data, out converted);
    return converted.Trim(new[] { ' ', ',' });
}
/// <summary>
/// Downloads an IMDB person page and fills an <c>IMDBActor</c> with id, name, photo,
/// birth/death data, biography and filmography (as actor, writer and director).
/// The page is walked with the marker strings returned by
/// <c>VdbParserStringActorDetails()</c>; the trailing comments show the marker
/// each index is expected to hold.
/// </summary>
/// <param name="url">Person URL and title.</param>
/// <param name="actor">Receives the actor; always a new instance, even on failure.</param>
/// <returns>
/// <c>true</c> when the page was downloaded and processed; <c>false</c> when the marker
/// table is missing or has the wrong size, the page is empty, or an exception occurred.
/// </returns>
public bool GetActorDetails(IMDBUrl url, out IMDBActor actor)
{
    actor = new IMDBActor();

    string[] tokens = VdbParserStringActorDetails();
    if (tokens == null || tokens.Length != 46)
    {
        return false;
    }

    try
    {
        string absoluteUri;
        string pageHtml = GetPage(url.URL, "utf-8", out absoluteUri);
        if (string.IsNullOrEmpty(pageHtml))
        {
            return false;
        }

        // IMDB actor id: the 9 characters starting at the last "nm" in the URL.
        try
        {
            int idStart = url.URL.LastIndexOf("nm");
            actor.IMDBActorID = url.URL.Substring(idStart, 9).Replace("/", string.Empty);
        }
        catch (Exception) { }

        HTMLParser parser = new HTMLParser(pageHtml);
        string thumbUrl = string.Empty;
        string text = string.Empty;
        string text2 = string.Empty;

        // Actor name
        if (parser.skipToEndOf(tokens[0]) &&       // <title>
            parser.extractTo(tokens[1], ref text)) // - IMDb</title>
        {
            text = new HTMLUtil().ConvertHTMLToAnsi(text);
            text = Util.Utils.RemoveParenthesis(text).Trim();
            actor.Name = HttpUtility.HtmlDecode(text.Trim());
        }
        if (actor.Name == string.Empty)
        {
            actor.Name = url.Title;
        }

        // Photo: parsed inside its own block, then the full content is restored.
        string fullContent = parser.Content;
        string photoBlock = string.Empty;
        if (parser.skipToStartOf(tokens[2]) &&           // <td id="img_primary"
            parser.extractTo(tokens[3], ref photoBlock)) // </td>
        {
            parser.Content = photoBlock;
            if (parser.skipToEndOf(tokens[4]) &&           // <img src="
                parser.extractTo(tokens[5], ref thumbUrl)) // "
            {
                actor.ThumbnailUrl = thumbUrl;
            }
            parser.Content = fullContent;
        }

        // Birth date
        if (parser.skipToEndOf(tokens[6]) &&          // >Born:</h4>
            parser.skipToEndOf(tokens[7]) &&          // birth_monthday=
            parser.skipToEndOf(tokens[8]) &&          // >
            parser.extractTo(tokens[9], ref text) &&  // <
            parser.skipToEndOf(tokens[10]) &&         // year=
            parser.extractTo(tokens[11], ref text2))  // "
        {
            actor.DateOfBirth = text + " " + text2;
        }

        // Death date
        if (parser.skipToEndOf(tokens[12]) &&         // >Died:</h4>
            parser.skipToEndOf(tokens[13]) &&         // death_monthday="
            parser.skipToEndOf(tokens[14]) &&         // >
            parser.extractTo(tokens[15], ref text) && // <
            parser.skipToEndOf(tokens[16]) &&         // death_date="
            parser.extractTo(tokens[17], ref text2))  // "
        {
            actor.DateOfDeath = text + " " + text2;
        }

        parser.resetPosition();

        // Birth place
        if (parser.skipToEndOf(tokens[18]) &&        // birth_place=
            parser.skipToEndOf(tokens[19]) &&        // >
            parser.extractTo(tokens[20], ref text))  // <
        {
            actor.PlaceOfBirth = HttpUtility.HtmlDecode(text);
        }

        // Death place
        if (parser.skipToEndOf(tokens[21]) &&        // death_place=
            parser.skipToEndOf(tokens[22]) &&        // >
            parser.extractTo(tokens[23], ref text))  // <
        {
            actor.PlaceOfDeath = HttpUtility.HtmlDecode(text);
        }

        // Mini biography
        parser.resetPosition();
        if (parser.skipToEndOf(tokens[24]) &&        // <td id="overview-top">
            parser.skipToEndOf(tokens[25]) &&        // <p>
            parser.extractTo(tokens[26], ref text))  // See full bio</a>
        {
            text = new HTMLUtil().ConvertHTMLToAnsi(text);
            actor.MiniBiography = Util.Utils.stripHTMLtags(text);
            actor.MiniBiography = actor.MiniBiography.Replace(tokens[45], string.Empty).Trim(); // See full bio »
            actor.MiniBiography = HttpUtility.HtmlDecode(actor.MiniBiography); // Remove HTML entities like ½

            if (actor.MiniBiography != string.Empty)
            {
                // Complete biography from the "bio" sub page.
                string bioUrl = absoluteUri;
                bioUrl += bioUrl.EndsWith(tokens[27]) // /
                              ? tokens[29]            // bio
                              : tokens[28];           // /bio

                string bioHtml = GetPage(bioUrl, "utf-8", out absoluteUri);
                if (!string.IsNullOrEmpty(bioHtml))
                {
                    HTMLParser bioParser = new HTMLParser(bioHtml);
                    if (bioParser.skipToEndOf(tokens[30]) &&       // <h5>Mini Biography</h5>
                        bioParser.skipToEndOf(tokens[31]) &&       // <div class="wikipedia_bio">
                        bioParser.extractTo(tokens[32], ref text)) // </div>
                    {
                        text = new HTMLUtil().ConvertHTMLToAnsi(text);
                        text = Regex.Replace(text, @"</h5>\s<h5>", "\n\r");
                        text = Regex.Replace(text, @"<h5>", "\n\r\n\r");
                        text = Regex.Replace(text, @"</h5>", ":\n\r");
                        actor.Biography = Util.Utils.stripHTMLtags(text).Trim();
                        actor.Biography = HttpUtility.HtmlDecode(actor.Biography);
                    }
                    else
                    {
                        bioParser.resetPosition();
                        if (bioParser.skipToEndOf(tokens[33]) &&       // <h5>Mini Biography</h5>
                            bioParser.extractTo(tokens[34], ref text)) // </p>
                        {
                            text = new HTMLUtil().ConvertHTMLToAnsi(text);
                            actor.Biography = Util.Utils.stripHTMLtags(text).Trim();
                            actor.Biography = HttpUtility.HtmlDecode(actor.Biography);
                        }
                    }
                }
            }
        }

        // Find out in which roles (director, writer, actor/actress) the person is listed.
        parser.resetPosition();
        HTMLParser dirParser = new HTMLParser(); // HTML body for Director
        HTMLParser wriParser = new HTMLParser(); // HTML body for Writers

        bool isDirectorPass = parser.skipToEndOf(tokens[35]) && // name="Director">Director</a>
                              parser.skipToEndOf(tokens[36]);   // </div>
        if (isDirectorPass)
        {
            dirParser.Content = parser.Content;
        }

        parser.resetPosition();
        bool isWriterPass = parser.skipToEndOf(tokens[37]) && // name="Writer">Writer</a>
                            parser.skipToEndOf(tokens[38]);   // </div>
        if (isWriterPass)
        {
            wriParser.Content = parser.Content;
        }

        parser.resetPosition();
        bool isActorPass = parser.skipToEndOf(tokens[39]) || // name="Actress">Actress</a>
                           parser.skipToEndOf(tokens[40]);   // name="Actor">Actor</a>

        // Filmography as actor
        if (isActorPass)
        {
            GetActorMovies(actor, parser, false, false);
        }

        // Filmography as writer
        if (isWriterPass)
        {
            parser = wriParser;
            parser.resetPosition();
            if (parser.skipToEndOf(tokens[41]) && // name="Writer">Writer</a>
                parser.skipToEndOf(tokens[42]))   // </div>
            {
                GetActorMovies(actor, parser, false, true);
            }
        }

        // Filmography as director
        if (isDirectorPass)
        {
            parser = dirParser;
            parser.resetPosition();
            if (parser.skipToEndOf(tokens[43]) && // name="Director">Director</a>
                parser.skipToEndOf(tokens[44]))   // </div>
            {
                GetActorMovies(actor, parser, true, false);
            }
        }

        // Sort the collected filmography
        if (actor.Count > 0)
        {
            actor.SortActorMoviesByYear();
        }

        return true;
    }
    catch (Exception ex)
    {
        Log.Error("IMDB.GetActorDetails({0} exception:{1} {2} {3}", url.URL, ex.Message, ex.Source, ex.StackTrace);
    }

    return false;
}
/// <summary>
/// Parses the content of a chapter detail page.
/// </summary>
/// <param name="documentNode">Root node of the parsed detail page.</param>
/// <returns>
/// A table with the cleaned chapter content, its length, a 40-character intro,
/// the price, the status and the chapter type; empty when the page has no
/// <c>div#content</c> node.
/// </returns>
private Hashtable parseDetailPage1(HtmlNode documentNode)
{
    Hashtable returndata = new Hashtable();

    HtmlNode tempNode = documentNode.SelectSingleNode("//div[@id='content']");
    if (tempNode == null)
    {
        return returndata;
    }

    string tempString = tempNode.InnerHtml;
    tempString = HTMLUtil.RemoveHtmlContent(tempString, "div", "style", "script", "dt", "a");

    // Everything is lower-cased first, so every needle below has to be lower case
    // as well. Longer needles come before the shorter ones they contain.
    tempString = tempString.ToLower();
    tempString = tempString.Replace("\r\n", "").Replace("\t", "")
        .Replace("本站访问地址http://www.ziyouge.com 任意搜索引擎内输入:紫幽阁 即可访问!", "")
        .Replace("http://www.ziyouge.com", "")
        // Lower-case forms of the former "紫You阁 WwW.ZiyouGE.com" / "WWw.ZiyoUgE.com"
        // needles, which could never match the lower-cased text.
        .Replace("紫you阁 www.ziyouge.com", "")
        .Replace("www.ziyouge.com", "")
        .Replace("紫幽阁", "")
        .Replace("wanben.me", "")
        .Replace("ziyouge.com", "")
        .Replace("ziyouge", "")
        .Replace("http://", "")
        .Replace("http", "")
        .Replace("品书网", "")
        .Replace("www.vodtw.com", "")
        .Replace("本书来自", "")
        .Replace("/html/book/19/19092/", "")
        .Replace("大家想继续看我的书,可以加我微信gdy3208新书出了,我会第一时间发动态通知大家!", "")
        .Replace("本站重要通知:请使用本站的免费小说app,无广告、破防盗版、更新快,会员同步书架,请关注微信公众号 appxsyd (按住三秒复制) 下载免费阅读器!!", "")
        .Replace("本站重要通知: 请使用本站的免费小说app,无广告、破防盗版、更新快,会员同步书架,请关注微信公众号 gegegengxin (按住三秒复制)下载免费阅读器!!", "")
        .Replace("本站重要通知:", "")
        .Replace("请使用本站的免费小说", "")
        // "appxsyd" has to go before "app", otherwise only "xsyd" would be left behind.
        .Replace("appxsyd", "")
        .Replace("app", "")
        .Replace("无广告、破防盗版、更新快,会员同步书架", "")
        .Replace("请关注微信公众号", "")
        .Replace("gegegengxin", "")
        .Replace("(按住三秒复制)", "")
        .Replace("(按住三秒复制)", "")
        .Replace("下载免费阅读器", "");

    // Remove domain names.
    string pattern = @"(?=.{3,255}$)[a-zA-Z0-9][-a-zA-Z0-9]{0,62}(\.[a-zA-Z0-9][-a-zA-Z0-9]{0,62})+";
    tempString = Regex.Replace(tempString, pattern, "");

    // Remove the remaining tags one by one. The greedy "<.+>" used before removed
    // everything between the first "<" and the last ">" of a line, text included.
    string pattern1 = @"<[^>]+>";
    tempString = Regex.Replace(tempString, pattern1, "");
    returndata.Add(CollectionFieldName.Chap_Content, tempString);

    // Strip characters that should not count towards the content length.
    string tempInnerText = HTMLUtil.RemoveHtmlTag(tempString).Replace(" ", "").Replace("feisuz", "")
        .Replace("作者的话:", "").Replace("新书,求收藏求推荐", "").Replace("本书红薯网首发,请勿转载!", "");
    if (!string.IsNullOrEmpty(tempInnerText))
    {
        returndata.Add(CollectionFieldName.Chap_ContentLen, tempInnerText.Length);

        string into = tempInnerText.Length > 40
                          ? tempInnerText.Substring(0, 40) + "..."
                          : tempInnerText;
        returndata.Add(CollectionFieldName.Chap_Intro, into);

        // 5 per full 1000 characters of content, at least 5 and at most 15.
        int price = (tempString.Length / 1000) * 5;
        if (price == 0)
        {
            price = 5;
        }
        if (price > 15)
        {
            price = 15;
        }
        returndata.Add(CollectionFieldName.Chap_Pirce, price);
    }

    returndata.Add(CollectionFieldName.Chap_Status, ChapterStatus.ChapterStatus_OnLine);
    returndata.Add(CollectionFieldName.Chap_ChapterType, ChapterType.ChapterType_Free);
    return returndata;
}
/// <summary>
/// Parses the content of a chapter detail page.
/// </summary>
/// <param name="documentNode">Root node of the parsed detail page.</param>
/// <returns>
/// A table with the cleaned chapter content, its length, a 40-character intro,
/// the price, the status and the chapter type; empty when the page has no
/// <c>div.messagecontent</c> node.
/// </returns>
private Hashtable parseDetailPage1(HtmlNode documentNode)
{
    Hashtable returndata = new Hashtable();

    HtmlNode tempNode = documentNode.SelectSingleNode("//div[@class='messagecontent']");
    if (tempNode == null)
    {
        return returndata;
    }

    string tempString = tempNode.InnerHtml;
    tempString = HTMLUtil.RemoveHtmlContent(tempString, "div", "style", "script");
    tempString = tempString.Replace("\r\n", "").Replace("\t", "");

    // Remove site URLs, most specific form first. The path is matched with the
    // character class [\w/]+, which accepts the same text as the former (\w+|/)+
    // but cannot backtrack exponentially when a long path is not followed by ".html".
    string pattern = @"www\.[a-zA-Z0-9]+\.(?:com|cn|net|org)/[\w/]+\.html";
    tempString = Regex.Replace(tempString, pattern, "");
    string pattern1 = @"www\.[a-zA-Z0-9]+\.(?:com|cn|net|org)/[\w/]+";
    tempString = Regex.Replace(tempString, pattern1, "");
    string pattern2 = @"www\.[a-zA-Z0-9]+\.(?:com|cn|net|org)/?";
    tempString = Regex.Replace(tempString, pattern2, "");
    returndata.Add(CollectionFieldName.Chap_Content, tempString);

    // Strip characters that should not count towards the content length.
    string tempInnerText = HTMLUtil.RemoveHtmlTag(tempString).Replace(" ", "");
    if (!string.IsNullOrEmpty(tempInnerText))
    {
        returndata.Add(CollectionFieldName.Chap_ContentLen, tempInnerText.Length);

        string into = tempInnerText.Length > 40
                          ? tempInnerText.Substring(0, 40) + "..."
                          : tempInnerText;
        returndata.Add(CollectionFieldName.Chap_Intro, into);

        // 5 per full 1000 characters of content, at least 5 and at most 15.
        int price = (tempString.Length / 1000) * 5;
        if (price == 0)
        {
            price = 5;
        }
        if (price > 15)
        {
            price = 15;
        }
        returndata.Add(CollectionFieldName.Chap_Pirce, price);
    }

    returndata.Add(CollectionFieldName.Chap_Status, ChapterStatus.ChapterStatus_OnLine);
    returndata.Add(CollectionFieldName.Chap_ChapterType, ChapterType.ChapterType_Free);
    return returndata;
}
/// <summary>
/// Restores the HTML formatting of a document that was decoded as ASCII.
/// </summary>
/// <param name="texto">HTML text.</param>
/// <returns>The value returned by <c>HTMLUtil.decodeHTML</c> for <paramref name="texto"/>.</returns>
public static string decodeHTML(string texto)
{
    string decoded = HTMLUtil.decodeHTML(texto);
    return decoded;
}
/// <summary>
/// Looks up a person on IMDB and adds the resulting URLs to <c>_elements</c>.
/// When the response is a person page rather than the search page, that page is the
/// only result. Otherwise the "popular names" table and then the "exact matches"
/// table (or, failing that, the "approx matches" table) are read.
/// The page is walked with the marker strings returned by <c>VdbParserStringActor()</c>;
/// the trailing comments show the marker each index is expected to hold.
/// </summary>
/// <param name="strURL">Search URL.</param>
private void FindIMDBActor(string strURL)
{
    string[] vdbParserStr = VdbParserStringActor();
    if (vdbParserStr == null || vdbParserStr.Length != 29)
    {
        return;
    }

    try
    {
        string absoluteUri;
        // UTF-8 have problem with special country chars, default IMDB enc is used
        string strBody = GetPage(strURL, "utf-8", out absoluteUri);
        string value = string.Empty;
        HTMLParser parser = new HTMLParser(strBody);

        // A title other than the search page title means we landed on the person page.
        if (parser.skipToEndOf(vdbParserStr[0]) &&                // <title>
            parser.extractTo(vdbParserStr[1], ref value) &&       // </title>
            !value.ToLowerInvariant().Equals(vdbParserStr[2]))    // imdb name search
        {
            value = new HTMLUtil().ConvertHTMLToAnsi(value);
            value = Util.Utils.RemoveParenthesis(value).Trim();
            _elements.Add(new IMDBUrl(absoluteUri, value, "IMDB"));
            return;
        }

        parser.resetPosition();

        // The marker table has 29 entries (checked above), so index 29 does not exist.
        // Both result tables end their rows with the same marker, stored at index 14.
        string rowEndToken = vdbParserStr[14]; // </tr>

        string popularBody = string.Empty;
        if (parser.skipToStartOfNoCase(vdbParserStr[3]))          // Popular names
        {
            parser.skipToEndOf(vdbParserStr[4]);                  // <table>
            parser.extractTo(vdbParserStr[5], ref popularBody);   // </table>
            AddImdbActorUrls(new HTMLParser(popularBody), vdbParserStr, 6, rowEndToken);
        }

        string exactBody = string.Empty;
        parser = new HTMLParser(strBody);
        if (parser.skipToStartOfNoCase(vdbParserStr[15]))         // Exact Matches
        {
            parser.skipToEndOf(vdbParserStr[16]);                 // <table>
            parser.extractTo(vdbParserStr[17], ref exactBody);    // </table>
        }
        else if (parser.skipToStartOfNoCase(vdbParserStr[18]))    // Approx Matches
        {
            parser.skipToEndOf(vdbParserStr[19]);                 // <table>
            parser.extractTo(vdbParserStr[20], ref exactBody);    // </table>
        }
        else
        {
            return;
        }

        AddImdbActorUrls(new HTMLParser(exactBody), vdbParserStr, 21, rowEndToken);
    }
    catch (Exception ex)
    {
        Log.Error("exception for imdb lookup of {0} err:{1} stack:{2}", strURL, ex.Message, ex.StackTrace);
    }
}

/// <summary>
/// Reads every person row of one result table and adds it to <c>_elements</c>.
/// The eight markers starting at <paramref name="firstToken"/> are used in order;
/// <paramref name="rowEndToken"/> closes a row.
/// </summary>
private void AddImdbActorUrls(HTMLParser parser, string[] tokens, int firstToken, string rowEndToken)
{
    // Declared outside the loop: a failed extract keeps the value of the previous row,
    // exactly as the inline loops did.
    string url = string.Empty;
    string name = string.Empty;
    string role = string.Empty;

    while (parser.skipToStartOf(tokens[firstToken]))              // href="/name/
    {
        parser.skipToEndOf(tokens[firstToken + 1]);               // href="
        parser.extractTo(tokens[firstToken + 2], ref url);        // "
        parser.skipToEndOf(tokens[firstToken + 3]);               // Image()).src='/rg/find-name-
        parser.skipToEndOf(tokens[firstToken + 4]);               // ';">
        parser.extractTo(tokens[firstToken + 5], ref name);       // </a>
        parser.skipToEndOf(tokens[firstToken + 6]);               // <small>(
        parser.extractTo(tokens[firstToken + 7], ref role);       // ,

        if (role != string.Empty)
        {
            name += " - " + role;
        }
        name = new HTMLUtil().ConvertHTMLToAnsi(name);
        name = Util.Utils.RemoveParenthesis(name).Trim();

        _elements.Add(new IMDBUrl("http://www.imdb.com" + url, name, "IMDB"));
        parser.skipToEndOf(rowEndToken);                          // </tr>
    }
}