Ejemplos de código de HTMLUtil en C# (CSharp)

Ejemplo n.º 1

0

Mostrar archivo

        /// <summary>
        /// Parses a detail page: extracts the cleaned body of the <c>div#content</c>
        /// element and, when present, the URL of the "下一节" (next section) link.
        /// </summary>
        /// <param name="documentNode">Root node of the parsed page.</param>
        /// <returns>
        /// A table holding <c>CollectionFieldName.ExContent</c> when the content element
        /// exists and <c>CollectionFieldName.NextUrl</c> when a usable next-section link
        /// exists; empty when neither is found.
        /// </returns>
        private Hashtable parseDetailPage2(HtmlNode documentNode)
        {
            Hashtable returndata = new Hashtable();

            HtmlNode contentNode = documentNode.SelectSingleNode("//div[@id='content']");
            if (contentNode != null)
            {
                string content = contentNode.InnerHtml;
                content = HTMLUtil.RemoveHtmlContent(content, "div", "style", "script");
                content = HTMLUtil.RemoveHtmlTag(content, "p", "img", "br");

                // Tabs are removed before CRLF so that a CR and LF separated only by tabs
                // still collapse; the previous code got the same effect by running both
                // replacements twice. The longer watermark is removed before the bare
                // "feisuz" token so that its prefix does not survive.
                content = content.Replace("\t", "")
                          .Replace("\r\n", "")
                          .Replace("手机请访问：:feisuz", "")
                          .Replace("feisuz", "")
                          .Replace("作者的话:", "")
                          .Replace("新书，求收藏求推荐", "")
                          .Replace("本书红薯网首发,请勿转载!", "");
                returndata.Add(CollectionFieldName.ExContent, content);
            }

            // SelectNodes yields null when nothing matches, hence the null-conditional call.
            HtmlNode nextLink = documentNode.SelectNodes("//*[@id='content']/div[@class='text']/a")?.FirstOrDefault(x => x.InnerText == "下一节");
            if (nextLink != null)
            {
                string url = nextLink.GetAttributeValue("href", "");

                // "#" is treated as a placeholder link, not a real next page.
                if (!string.IsNullOrEmpty(url) && url != "#")
                {
                    returndata.Add(CollectionFieldName.NextUrl, url);
                }
            }
            return returndata;
        }

Ejemplo n.º 2

0

Mostrar archivo

Archivo: web_qqkanshu.cs Proyecto: change008/boruinoveltools

        /// <summary>
        /// Parses a chapter detail page: extracts the cleaned chapter HTML from
        /// <c>div#ccontent</c> and derives its text length, a short intro and a price.
        /// </summary>
        /// <param name="documentNode">Root node of the parsed page.</param>
        /// <returns>
        /// A table with the chapter fields (<c>Chap_Content</c>, <c>Chap_Status</c>,
        /// <c>Chap_ChapterType</c>, and, when the text is non-empty, <c>Chap_ContentLen</c>,
        /// <c>Chap_Intro</c>, <c>Chap_Pirce</c>); empty when the content element is missing.
        /// </returns>
        private Hashtable parseDetailPage1(HtmlNode documentNode)
        {
            Hashtable returndata = new Hashtable();

            HtmlNode contentNode = documentNode.SelectSingleNode("//div[@id=\"ccontent\"]");
            if (contentNode == null)
            {
                return returndata;
            }

            string content = HTMLUtil.RemoveHtmlContent(contentNode.InnerHtml, "div", "style", "script", "a");
            content = content.Replace("\r\n", "")
                      .Replace("\t", "")
                      .Replace("領域文學首發地址httP://ｗｗｗ.ｌｉｎｇｙｕ.ｏｒｇ<br><br>", "")
                      .Replace("領域文學首發地址ｗｗｗ.ｌｉｎｇｙｕ.ｏｒｇ<br><br>　　<br><br>", "")
                      .Replace("请记住本书首发域名：http://www.lingyu.org&nbsp;&nbsp;领域文学手机版阅读网址： m.lingyu.org", "");

            // Dots are escaped so that only literal lingyu.org chapter URLs are removed;
            // an unescaped '.' would match any character in those positions.
            string pattern = @"http://www\.lingyu\.org/\w+/\d+/\d+/\d+\.html";
            content = Regex.Replace(content, pattern, "");
            returndata.Add(CollectionFieldName.Chap_Content, content);

            // Plain text used only to measure length and build the intro.
            string plainText = HTMLUtil.RemoveHtmlTag(content)
                               .Replace("&nbsp;", "")
                               .Replace("領域文學首發地址httP://ｗｗｗ.ｌｉｎｇｙｕ.ｏｒｇ<br><br>", "")
                               .Replace("領域文學首發地址ｗｗｗ.ｌｉｎｇｙｕ.ｏｒｇ<br><br>　　<br><br>", "")
                               .Replace("请记住本书首发域名：http://www.lingyu.org&nbsp;&nbsp;领域文学手机版阅读网址： m.lingyu.org", "");

            if (!string.IsNullOrEmpty(plainText))
            {
                returndata.Add(CollectionFieldName.Chap_ContentLen, plainText.Length);

                // Intro is the first 40 characters, with an ellipsis when truncated.
                string intro = plainText.Length > 40
                               ? plainText.Substring(0, 40) + "..."
                               : plainText;
                returndata.Add(CollectionFieldName.Chap_Intro, intro);

                // 3 units per full 500 characters of text, capped at 15.
                int price = (plainText.Length / 500) * 3;
                if (price > 15)
                {
                    price = 15;
                }
                returndata.Add(CollectionFieldName.Chap_Pirce, price);
            }

            returndata.Add(CollectionFieldName.Chap_Status, ChapterStatus.ChapterStatus_OnLine);
            returndata.Add(CollectionFieldName.Chap_ChapterType, ChapterType.ChapterType_Free);
            return returndata;
        }

Ejemplo n.º 3

0

Mostrar archivo

        /// <summary>
        /// Parses a chapter detail page: extracts the cleaned chapter HTML from
        /// <c>div#content</c> and derives its text length, a short intro and a price.
        /// </summary>
        /// <param name="documentNode">Root node of the parsed page.</param>
        /// <returns>
        /// A table with the chapter fields (<c>Chap_Content</c>, <c>Chap_Status</c>,
        /// <c>Chap_ChapterType</c>, and, when the text is non-empty, <c>Chap_ContentLen</c>,
        /// <c>Chap_Intro</c>, <c>Chap_Pirce</c>); empty when the content element is missing.
        /// </returns>
        private Hashtable parseDetailPage1(HtmlNode documentNode)
        {
            Hashtable returndata = new Hashtable();

            HtmlNode contentNode = documentNode.SelectSingleNode("//div[@id='content']");
            if (contentNode == null)
            {
                return returndata;
            }

            string content = HTMLUtil.RemoveHtmlContent(contentNode.InnerHtml, "div", "style", "script", "center", "span");

            // The longest watermark variant is removed first so that the shorter
            // variant does not leave its trailing token behind.
            content = content.Replace("\r\n", "")
                      .Replace("\t", "")
                      .Replace("全本小说网欢迎您！WWW.YZNN.COM T1706231537", "")
                      .Replace("F606121", "")
                      .Replace("全本小说网欢迎您！WWW.YZNN.COM", "");
            returndata.Add(CollectionFieldName.Chap_Content, content);

            // Plain text used only to measure length and build the intro.
            string plainText = HTMLUtil.RemoveHtmlTag(content).Replace("&nbsp;", "");
            if (!string.IsNullOrEmpty(plainText))
            {
                returndata.Add(CollectionFieldName.Chap_ContentLen, plainText.Length);

                // Intro is the first 40 characters, with an ellipsis when truncated.
                string intro = plainText.Length > 40
                               ? plainText.Substring(0, 40) + "..."
                               : plainText;
                returndata.Add(CollectionFieldName.Chap_Intro, intro);

                // 5 units per full 1000 characters, clamped to the range 5..15.
                // NOTE(review): the price is based on the HTML length while Chap_ContentLen
                // uses the plain-text length; confirm this is intended.
                int price = (content.Length / 1000) * 5;
                if (price == 0)
                {
                    price = 5;
                }
                if (price > 15)
                {
                    price = 15;
                }
                returndata.Add(CollectionFieldName.Chap_Pirce, price);
            }

            returndata.Add(CollectionFieldName.Chap_Status, ChapterStatus.ChapterStatus_OnLine);
            returndata.Add(CollectionFieldName.Chap_ChapterType, ChapterType.ChapterType_Free);
            return returndata;
        }

Ejemplo n.º 4

0

Mostrar archivo

        /// <summary>
        /// Parses a chapter detail page: extracts the cleaned chapter HTML from
        /// <c>div#content</c>, derives its text length, intro and price, and picks up
        /// the URL of the "下一节" (next section) link when present.
        /// </summary>
        /// <param name="documentNode">Root node of the parsed page.</param>
        /// <returns>
        /// A table with the chapter fields found on the page and, when a usable link
        /// exists, <c>CollectionFieldName.NextUrl</c>; empty when nothing is found.
        /// </returns>
        private Hashtable parseDetailPage1(HtmlNode documentNode)
        {
            Hashtable returndata = new Hashtable();

            HtmlNode contentNode = documentNode.SelectSingleNode("//div[@id='content']");
            if (contentNode != null)
            {
                string content = HTMLUtil.RemoveHtmlContent(contentNode.InnerHtml, "div", "style", "script");
                content = HTMLUtil.RemoveHtmlTag(content, "p", "img", "br");
                content = content.Replace("\r\n", "").Replace("\t", "");
                returndata.Add(CollectionFieldName.Chap_Content, content);

                // Plain text used only to measure length and build the intro.
                string plainText = HTMLUtil.RemoveHtmlTag(content).Replace("&nbsp;", "");
                if (!string.IsNullOrEmpty(plainText))
                {
                    returndata.Add(CollectionFieldName.Chap_ContentLen, plainText.Length);

                    // Intro is the first 40 characters, with an ellipsis when truncated.
                    string intro = plainText.Length > 40
                                   ? plainText.Substring(0, 40) + "..."
                                   : plainText;
                    returndata.Add(CollectionFieldName.Chap_Intro, intro);

                    // 5 units per full 1000 characters, clamped to the range 5..15.
                    // NOTE(review): the price is based on the HTML length while
                    // Chap_ContentLen uses the plain-text length; confirm this is intended.
                    int price = (content.Length / 1000) * 5;
                    if (price == 0)
                    {
                        price = 5;
                    }
                    if (price > 15)
                    {
                        price = 15;
                    }
                    returndata.Add(CollectionFieldName.Chap_Pirce, price);
                }

                returndata.Add(CollectionFieldName.Chap_Status, ChapterStatus.ChapterStatus_OnLine);
                returndata.Add(CollectionFieldName.Chap_ChapterType, ChapterType.ChapterType_Free);
            }

            // SelectNodes yields null when nothing matches, hence the null-conditional call.
            HtmlNode nextLink = documentNode.SelectNodes("//*[@id='content']/div[@class='text']/a")?.FirstOrDefault(x => x.InnerText == "下一节");
            if (nextLink != null)
            {
                string url = nextLink.GetAttributeValue("href", "");

                // "#" is treated as a placeholder link, not a real next page.
                if (!string.IsNullOrEmpty(url) && url != "#")
                {
                    returndata.Add(CollectionFieldName.NextUrl, url);
                }
            }
            return returndata;
        }

Ejemplo n.º 5

0

Mostrar archivo

        /// <summary>
        /// Strips surplus markup from an HTML fragment and writes the cleaned markup
        /// back through <paramref name="Htmlstring"/>.
        /// </summary>
        /// <param name="Htmlstring">
        /// HTML to clean; replaced with the cleaned markup. Null or empty input is left untouched.
        /// </param>
        private void dealHtmlstring(ref string Htmlstring)
        {
            // LoadHtml rejects null, and there is nothing to clean in an empty string.
            if (string.IsNullOrEmpty(Htmlstring))
            {
                return;
            }

            HtmlDocument document = new HtmlDocument();
            document.LoadHtml(Htmlstring);
            HtmlNode htmlNode = document.DocumentNode;

            // Remove superfluous tags.
            // NOTE(review): "vedio" looks like a typo for "video"; it is left unchanged
            // because it is passed to HTMLUtil at runtime. Confirm and fix at the source.
            htmlNode.InnerHtml = HTMLUtil.RemoveHtmlTag(htmlNode.InnerHtml, "p", "vedio", "img", "br");

            // Previously the cleaned document was discarded and the ref parameter was
            // never assigned, which made the whole method a no-op for the caller.
            Htmlstring = htmlNode.InnerHtml;
        }

Ejemplo n.º 6

0

Mostrar archivo

Archivo: CoreController.cs Proyecto: change008/myplat

 /// <summary>
 /// Cleans an HTML fragment by running it through <c>HTMLUtil.RemoveHtmlTag</c> and
 /// <c>HTMLUtil.ClearImgTag</c>, then deleting double-quoted <c>style="..."</c> attributes.
 /// </summary>
 /// <param name="htmlString">HTML to clean; null or empty input is returned as-is.</param>
 /// <returns>The cleaned HTML.</returns>
 public string DeepClear(string htmlString = "")
 {
     // Nothing to clean: hand the input back untouched.
     if (string.IsNullOrEmpty(htmlString))
     {
         return htmlString;
     }

     string withoutTags   = HTMLUtil.RemoveHtmlTag(htmlString, "p", "img", "br");
     string withoutImages = HTMLUtil.ClearImgTag(withoutTags);

     // Lazy match so that each style attribute is removed individually.
     return Regex.Replace(withoutImages, @"(style="".*?"")", "");
 }

Ejemplo n.º 7

0

Mostrar archivo

Archivo: web_shumanwu.cs Proyecto: change008/boruinoveltools

        /// <summary>
        /// Parses a chapter detail page: extracts the cleaned chapter HTML from the
        /// <c>panel-body content-body content-ext</c> element and derives its text
        /// length, a short intro and a price.
        /// </summary>
        /// <param name="documentNode">Root node of the parsed page.</param>
        /// <returns>
        /// A table with the chapter fields (<c>Chap_Content</c>, <c>Chap_Status</c>,
        /// <c>Chap_ChapterType</c>, and, when the text is non-empty, <c>Chap_ContentLen</c>,
        /// <c>Chap_Intro</c>, <c>Chap_Pirce</c>); empty when the content element is missing.
        /// </returns>
        private Hashtable parseDetailPage1(HtmlNode documentNode)
        {
            Hashtable returndata = new Hashtable();

            HtmlNode contentNode = documentNode.SelectSingleNode("//div[@class=\"panel-body content-body content-ext\"]");
            if (contentNode == null)
            {
                return returndata;
            }

            string content = HTMLUtil.RemoveHtmlContent(contentNode.InnerHtml, "div", "style", "script");

            // The longer watermark is removed before the bare "feisuz" token so that
            // its prefix does not survive.
            content = content.Replace("\r\n", "")
                      .Replace("\t", "")
                      .Replace("手机请访问：:feisuz", "")
                      .Replace("feisuz", "")
                      .Replace("作者的话:", "")
                      .Replace("新书，求收藏求推荐", "")
                      .Replace("本书红薯网首发,请勿转载!", "");
            returndata.Add(CollectionFieldName.Chap_Content, content);

            // Plain text used only to measure length and build the intro.
            string plainText = HTMLUtil.RemoveHtmlTag(content)
                               .Replace("&nbsp;", "")
                               .Replace("feisuz", "")
                               .Replace("作者的话:", "")
                               .Replace("新书，求收藏求推荐", "")
                               .Replace("本书红薯网首发,请勿转载!", "");
            if (!string.IsNullOrEmpty(plainText))
            {
                returndata.Add(CollectionFieldName.Chap_ContentLen, plainText.Length);

                // Intro is the first 40 characters, with an ellipsis when truncated.
                string intro = plainText.Length > 40
                               ? plainText.Substring(0, 40) + "..."
                               : plainText;
                returndata.Add(CollectionFieldName.Chap_Intro, intro);

                // 5 units per full 1000 characters, clamped to the range 5..15.
                // NOTE(review): the price is based on the HTML length while Chap_ContentLen
                // uses the plain-text length; confirm this is intended.
                int price = (content.Length / 1000) * 5;
                if (price == 0)
                {
                    price = 5;
                }
                if (price > 15)
                {
                    price = 15;
                }
                returndata.Add(CollectionFieldName.Chap_Pirce, price);
            }

            returndata.Add(CollectionFieldName.Chap_Status, ChapterStatus.ChapterStatus_OnLine);
            returndata.Add(CollectionFieldName.Chap_ChapterType, ChapterType.ChapterType_Free);
            return returndata;
        }

Ejemplo n.º 8

0

Mostrar archivo

        /// <summary>
        /// Parses a chapter detail page: extracts the cleaned chapter HTML from
        /// <c>div.readout/div.shuneirong</c> and derives its text length and a short intro.
        /// </summary>
        /// <param name="documentNode">Root node of the parsed page.</param>
        /// <returns>
        /// A table with the chapter fields (<c>Chap_Content</c>, <c>Chap_Status</c>,
        /// <c>Chap_ChapterType</c>, and, when the text is non-empty, <c>Chap_ContentLen</c>
        /// and <c>Chap_Intro</c>); empty when the content element is missing.
        /// </returns>
        private Hashtable parseDetailPage1(HtmlNode documentNode)
        {
            Hashtable returndata = new Hashtable();

            HtmlNode contentNode = documentNode.SelectSingleNode("//div[@class='readout']/div[@class='shuneirong']");
            if (contentNode == null)
            {
                return returndata;
            }

            string content = HTMLUtil.RemoveHtmlContent(contentNode.InnerHtml, "div", "style", "script", "center", "span");

            // The full domain is removed before the bare site name so that no ".com"
            // fragment is left behind.
            content = content.Replace("\r\n", "")
                      .Replace("\t", "")
                      .Replace("免费小说", "")
                      .Replace("biquge5200.com", "")
                      .Replace("biquge5200", "")
                      .Replace("笔趣阁", "")
                      .Replace("http://", "")
                      .Replace("本书红薯网首发,请勿转载!", "");
            returndata.Add(CollectionFieldName.Chap_Content, content);

            // Plain text used only to measure length and build the intro.
            string plainText = HTMLUtil.RemoveHtmlTag(content).Replace("&nbsp;", "");
            if (!string.IsNullOrEmpty(plainText))
            {
                returndata.Add(CollectionFieldName.Chap_ContentLen, plainText.Length);

                // Intro is the first 150 characters, with an ellipsis when truncated.
                string intro = plainText.Length > 150
                               ? plainText.Substring(0, 150) + "..."
                               : plainText;
                returndata.Add(CollectionFieldName.Chap_Intro, intro);
            }

            returndata.Add(CollectionFieldName.Chap_Status, ChapterStatus.ChapterStatus_OnLine);
            returndata.Add(CollectionFieldName.Chap_ChapterType, ChapterType.ChapterType_Free);
            return returndata;
        }

Ejemplo n.º 9

0

Mostrar archivo

        /// <summary>
        /// Rewrites the <c>src</c> of every <c>img</c> element in the document from a
        /// relative path to a full URL.
        /// </summary>
        /// <param name="documentNode">Node whose document is searched for <c>img</c> elements.</param>
        /// <param name="InternalRealUrl">URL of the page, passed to <c>HTMLUtil.GetFullURL</c> as the base.</param>
        protected static void ReplaceIncorrectImageSrc(HtmlNode documentNode, string InternalRealUrl)
        {
            // SelectNodes yields null, not an empty collection, when nothing matches.
            HtmlNodeCollection imgNode = documentNode.SelectNodes("//img");
            if (imgNode == null)
            {
                return;
            }

            foreach (HtmlNode item in imgNode)
            {
                string url = item.GetAttributeValue("src", "");

                // Images without a src are skipped (as ReplacePlaceholderImageSrc does for
                // its attribute) instead of receiving a src built from an empty path.
                if (string.IsNullOrEmpty(url))
                {
                    continue;
                }
                item.SetAttributeValue("src", HTMLUtil.GetFullURL(InternalRealUrl, url));
            }
        }

Ejemplo n.º 10

0

Mostrar archivo

        /// <summary>
        /// Consumes the parsed chapter-list data: appends every valid, not-yet-seen
        /// chapter to the collection model and records the next list page to fetch.
        /// </summary>
        /// <param name="hashtable">
        /// Parse result; <c>CollectionFieldName.Items</c> is read as the chapter entries and
        /// <c>CollectionFieldName.Pages</c> as the follow-up page URLs. Either may be absent.
        /// </param>
        private void parseListPage(Hashtable hashtable)
        {
            // Hashtable's indexer returns null for a missing key, so a separate
            // ContainsKey probe is unnecessary.
            List <Hashtable> detaiList = (List <Hashtable>)hashtable[CollectionFieldName.Items];
            List <string>    nextPages = (List <string>)hashtable[CollectionFieldName.Pages];

            // Ids already added in this pass, used to drop duplicate chapters.
            HashSet <object> seenIds = new HashSet <object>();

            if (detaiList != null)
            {
                foreach (Hashtable cm in detaiList)
                {
                    string url = (string)cm[CollectionFieldName.Url];
                    if (!HTMLUtil.IsCorrect(url))
                    {
                        continue;
                    }
                    url = GetFullURL(url);
                    cm[CollectionFieldName.Url] = url;

                    wxchapter rm = new wxchapter();
                    fillWxChapterModel(rm, cm, _CollectionModel.CollectionId);

                    // Guard against a repeated chapter causing an endless crawl loop;
                    // Add returns false when the id is already present.
                    if (!seenIds.Add(rm.Id))
                    {
                        continue;
                    }

                    // Store the chapter.
                    this._CollectionModel.chapterList.Add(rm);
                }
            }

            // Null signals that there is no further list page.
            this.currentUrl_ChapterList = null;

            if (nextPages != null)
            {
                // Every entry overwrites the previous one, so the last URL wins.
                foreach (string url in nextPages)
                {
                    this.currentUrl_ChapterList = GetFullURL(url);
                }
            }
        }

Ejemplo n.º 11

0

Mostrar archivo

Archivo: web_jingcaiyuedu.cs Proyecto: change008/boruinoveltools

        /// <summary>
        /// Parses the start (book) page: collects the novel's title, cover image URL,
        /// category tag, page URL and numeric id.
        /// </summary>
        /// <param name="documentNode">Root node of the parsed page.</param>
        /// <returns>
        /// A table whose <c>CollectionFieldName.BookInfo</c> entry holds the collected
        /// book fields; fields whose source element is missing are omitted.
        /// </returns>
        private Hashtable parseStartupPage(HtmlNode documentNode)
        {
            Hashtable ht = new Hashtable();

            HtmlNode titleNode = documentNode.SelectSingleNode("//meta[@property='og:title']");
            if (titleNode != null)
            {
                ht.Add(CollectionFieldName.Novel_Name, titleNode.GetAttributeValue("content", ""));
            }

            HtmlNode coverNode = documentNode.SelectSingleNode("//div[@class='pic text-center']/img");
            if (coverNode != null)
            {
                // The cover path is resolved against the page URL.
                var imgUrl = HTMLUtil.GetFullURL(InternalRealUrl, coverNode.GetAttributeValue("src", ""));
                ht.Add(CollectionFieldName.Novel_CoverImgs, imgUrl);
            }

            HtmlNode categoryNode = documentNode.SelectSingleNode("//meta[@property='og:novel:category']");
            if (categoryNode != null)
            {
                ht.Add(CollectionFieldName.Novel_Tag, categoryNode.GetAttributeValue("content", ""));
            }

            ht.Add(CollectionFieldName.Url, InternalRealUrl);

            // The id is the digit run right before ".html"; the dot is escaped so that
            // it matches a literal '.' instead of any character.
            Match idMatch = Regex.Match(InternalRealUrl, @"/(\d+)\.html");
            if (idMatch.Success)
            {
                ht.Add(CollectionFieldName.Novel_UniqueFlag, idMatch.Groups[1].Value);
            }

            Hashtable returndata = new Hashtable();
            returndata.Add(CollectionFieldName.BookInfo, ht);
            return returndata;
        }

Ejemplo n.º 12

0

Mostrar archivo

        /// <summary>
        /// Replaces placeholder image paths: for every <c>img</c> element that carries
        /// the placeholder attribute, sets <c>src</c> to the full URL built from that
        /// attribute's value and then removes the attribute.
        /// </summary>
        /// <param name="documentNode">Node whose document is searched for <c>img</c> elements.</param>
        /// <param name="InternalRealUrl">URL of the page, passed to <c>HTMLUtil.GetFullURL</c> as the base.</param>
        /// <param name="PlaceholderAttribute">Name of the attribute that holds the image path.</param>
        protected static void ReplacePlaceholderImageSrc(HtmlNode documentNode, string InternalRealUrl, string PlaceholderAttribute)
        {
            HtmlNodeCollection images = documentNode.SelectNodes("//img");

            // SelectNodes yields null, not an empty collection, when nothing matches.
            if (images != null)
            {
                foreach (HtmlNode image in images)
                {
                    string placeholderUrl = image.GetAttributeValue(PlaceholderAttribute, "");
                    if (string.IsNullOrEmpty(placeholderUrl))
                    {
                        // No placeholder on this image: leave it untouched.
                        continue;
                    }

                    image.SetAttributeValue("src", HTMLUtil.GetFullURL(InternalRealUrl, placeholderUrl));
                    image.Attributes.Remove(PlaceholderAttribute);
                }
            }
        }

Ejemplo n.º 13

0

Mostrar archivo

        /// <summary>
        /// Parses a paid chapter detail page: extracts the cleaned chapter HTML from
        /// <c>div.content</c> and derives its length and price.
        /// </summary>
        /// <param name="documentNode">Root node of the parsed page.</param>
        /// <returns>
        /// A table with the chapter fields (<c>Chap_Content</c>, <c>Chap_Status</c>,
        /// <c>Chap_ChapterType</c>, and, when the element has text, <c>Chap_ContentLen</c>
        /// and <c>Chap_Pirce</c>); empty when the content element is missing.
        /// </returns>
        private Hashtable parseDetailPage2(HtmlNode documentNode)
        {
            Hashtable returndata = new Hashtable();

            HtmlNode contentNode = documentNode.SelectSingleNode("//div[@class='content']");
            if (contentNode == null)
            {
                return returndata;
            }

            // Whole-element removal now runs before tag stripping, the order used by the
            // other detail-page parsers. Presumably RemoveHtmlContent needs the
            // <style>/<script> tags still present in order to find them, so the reverse
            // order would leave their text in the chapter; confirm against HTMLUtil.
            string content = HTMLUtil.RemoveHtmlContent(contentNode.InnerHtml, "style", "script");
            content = HTMLUtil.RemoveHtmlTag(content, "p", "img", "br");
            returndata.Add(CollectionFieldName.Chap_Content, content);

            // Length and price are only recorded when the element has any text at all.
            if (!string.IsNullOrEmpty(contentNode.InnerText))
            {
                // 5 units per full 1000 characters of cleaned HTML, clamped to 5..15.
                int price = (content.Length / 1000) * 5;
                if (price == 0)
                {
                    price = 5;
                }
                if (price > 15)
                {
                    price = 15;
                }
                returndata.Add(CollectionFieldName.Chap_ContentLen, content.Length);
                returndata.Add(CollectionFieldName.Chap_Pirce, price);
            }

            returndata.Add(CollectionFieldName.Chap_Status, ChapterStatus.ChapterStatus_OnLine);
            returndata.Add(CollectionFieldName.Chap_ChapterType, ChapterType.ChapterType_Pay);
            return returndata;
        }

Ejemplo n.º 14

0

Mostrar archivo

        /// <summary>
        /// Parses a free chapter detail page: extracts the cleaned chapter HTML from
        /// <c>div.cDetail</c> and derives its text length and a short intro.
        /// </summary>
        /// <param name="documentNode">Root node of the parsed page.</param>
        /// <returns>
        /// A table with the chapter fields (<c>Chap_Content</c>, <c>Chap_Status</c>,
        /// <c>Chap_ChapterType</c>, and, when the element has text, <c>Chap_ContentLen</c>
        /// and <c>Chap_Intro</c>); empty when the content element is missing.
        /// </returns>
        private Hashtable parseDetailPage1(HtmlNode documentNode)
        {
            Hashtable returndata = new Hashtable();

            HtmlNode contentNode = documentNode.SelectSingleNode("//div[@class='cDetail']");
            if (contentNode == null)
            {
                return returndata;
            }

            // Whole-element removal now runs before tag stripping, the order used by the
            // other detail-page parsers. Presumably RemoveHtmlContent needs the
            // <style>/<script> tags still present in order to find them, so the reverse
            // order would leave their text in the chapter; confirm against HTMLUtil.
            string content = HTMLUtil.RemoveHtmlContent(contentNode.InnerHtml, "style", "script");
            content = HTMLUtil.RemoveHtmlTag(content, "p", "img", "br");
            returndata.Add(CollectionFieldName.Chap_Content, content);

            // Length and intro come from the element's own text, not from the cleaned HTML.
            string plainText = contentNode.InnerText;
            if (!string.IsNullOrEmpty(plainText))
            {
                returndata.Add(CollectionFieldName.Chap_ContentLen, plainText.Length);

                // Intro is the first 40 characters, with an ellipsis when truncated.
                string intro = plainText.Length > 40
                               ? plainText.Substring(0, 40) + "..."
                               : plainText;
                returndata.Add(CollectionFieldName.Chap_Intro, intro);
            }

            returndata.Add(CollectionFieldName.Chap_Status, ChapterStatus.ChapterStatus_OnLine);
            returndata.Add(CollectionFieldName.Chap_ChapterType, ChapterType.ChapterType_Free);
            return returndata;
        }

Ejemplo n.º 15

0

Mostrar archivo

Archivo: web_wanben.cs Proyecto: change008/noveltools

        /// <summary>
        /// Parses a chapter detail page: extracts the lower-cased, cleaned chapter HTML
        /// from <c>div.articleCon</c>, removes site watermarks and domain names, and
        /// derives its text length, a short intro and a price.
        /// </summary>
        /// <param name="documentNode">Root node of the parsed page.</param>
        /// <returns>
        /// A table with the chapter fields (<c>Chap_Content</c>, <c>Chap_Status</c>,
        /// <c>Chap_ChapterType</c>, and, when the text is non-empty, <c>Chap_ContentLen</c>,
        /// <c>Chap_Intro</c>, <c>Chap_Pirce</c>); empty when the content element is missing.
        /// </returns>
        private Hashtable parseDetailPage1(HtmlNode documentNode)
        {
            Hashtable returndata = new Hashtable();

            HtmlNode contentNode = documentNode.SelectSingleNode("//div[@class='articleCon']");
            if (contentNode == null)
            {
                return returndata;
            }

            string content = HTMLUtil.RemoveHtmlContent(contentNode.InnerHtml, "div", "style", "script", "dt", "a");

            // Everything below operates on lower-cased text, so every marker must be
            // lower case as well or it can never match.
            content = content.ToLower();
            content = content.Replace("\r\n", "")
                      .Replace("\t", "")
                      .Replace("本站访问地址http://www.ziyouge.com 任意搜索引擎内输入:紫幽阁 即可访问!", "")
                      .Replace("http://www.ziyouge.com", "")
                      // Replaces the old "WWw.ZiyoUgE.com" marker, which could not match
                      // lower-cased text; it sits before the bare domain so that no "www."
                      // prefix is left behind.
                      .Replace("www.ziyouge.com", "")
                      .Replace("紫幽阁", "")
                      .Replace("wanben.me", "")
                      .Replace("ziyouge.com", "")
                      .Replace("ziyouge", "")
                      .Replace("http://", "")
                      .Replace("http", "")
                      // The marker gets the same ToLower() as the content so that its
                      // mixed-case and full-width letters line up with the text.
                      .Replace("紫Ｙou阁 ＷwＷ.ZiyouＧＥ.com".ToLower(), "")
                      .Replace("品书网", "")
                      .Replace("www.vodtw.com", "")
                      .Replace("本书来自", "")
                      .Replace("/html/book/19/19092/", "");

            // Strip anything that looks like a domain name.
            // NOTE(review): the (?=.{3,255}$) lookahead only succeeds when 3 to 255
            // characters remain before the end of the string, so domains earlier in a
            // long chapter are presumably left in place. Confirm whether that is intended
            // before changing it, because the rest of the pattern also matches text
            // such as "3.14".
            string pattern = @"(?=.{3,255}$)[a-zA-Z0-9][-a-zA-Z0-9]{0,62}(\.[a-zA-Z0-9][-a-zA-Z0-9]{0,62})+";
            content = Regex.Replace(content, pattern, "");

            returndata.Add(CollectionFieldName.Chap_Content, content);

            // Plain text used only to measure length and build the intro.
            string plainText = HTMLUtil.RemoveHtmlTag(content)
                               .Replace("&nbsp;", "")
                               .Replace("feisuz", "")
                               .Replace("作者的话:", "")
                               .Replace("新书，求收藏求推荐", "")
                               .Replace("本书红薯网首发,请勿转载!", "");
            if (!string.IsNullOrEmpty(plainText))
            {
                returndata.Add(CollectionFieldName.Chap_ContentLen, plainText.Length);

                // Intro is the first 40 characters, with an ellipsis when truncated.
                string intro = plainText.Length > 40
                               ? plainText.Substring(0, 40) + "..."
                               : plainText;
                returndata.Add(CollectionFieldName.Chap_Intro, intro);

                // 5 units per full 1000 characters, clamped to the range 5..15.
                // NOTE(review): the price is based on the HTML length while Chap_ContentLen
                // uses the plain-text length; confirm this is intended.
                int price = (content.Length / 1000) * 5;
                if (price == 0)
                {
                    price = 5;
                }
                if (price > 15)
                {
                    price = 15;
                }
                returndata.Add(CollectionFieldName.Chap_Pirce, price);
            }

            returndata.Add(CollectionFieldName.Chap_Status, ChapterStatus.ChapterStatus_OnLine);
            returndata.Add(CollectionFieldName.Chap_ChapterType, ChapterType.ChapterType_Free);
            return returndata;
        }

Ejemplo n.º 16

0

Mostrar archivo

        /// <summary>
        /// Parses the book's start (landing) page and collects the novel metadata found on it.
        /// </summary>
        /// <param name="documentNode">Root node of the parsed HTML page.</param>
        /// <returns>
        /// A table with a single <c>CollectionFieldName.BookInfo</c> entry whose value is a
        /// nested table holding every field that could be extracted from the page.
        /// </returns>
        private Hashtable parseStartupPage(HtmlNode documentNode)
        {
            Hashtable ht = new Hashtable();

            // Book title.
            HtmlNode linkNodes = documentNode.SelectSingleNode("//span[@class='book_name']");
            if (linkNodes != null)
            {
                ht.Add(CollectionFieldName.Novel_Name, linkNodes.InnerText);
            }

            // Cover image, category, serialisation status and word count.
            linkNodes = documentNode.SelectSingleNode("//div[@class='books_bar clear']");
            if (linkNodes != null)
            {
                var imgUrl = linkNodes.SelectSingleNode("div[@class='lr_list']/img")?.GetAttributeValue("src", "");
                ht.Add(CollectionFieldName.Novel_CoverImgs, imgUrl);

                // NOTE(review): the three expressions below start with "//", which in XPath
                // selects from the document root rather than from linkNodes - confirm intent.
                var tag = linkNodes.SelectSingleNode("//ul[@class='book_list']/li[4]")?.InnerText;
                // The <li> may be missing, so tag can be null; "?." avoids the
                // NullReferenceException and stores null, like the cover image above.
                ht.Add(CollectionFieldName.Novel_Tag, tag?.Replace("类别：", ""));

                var statusName = linkNodes.SelectSingleNode("//ul[@class='book_list']/li[3]")?.InnerText;
                if (!string.IsNullOrEmpty(statusName) && statusName.Contains("完结"))
                {
                    ht.Add(CollectionFieldName.Novel_Status, BookStatus.BookStatus_Finish);
                }
                else
                {
                    ht.Add(CollectionFieldName.Novel_Status, BookStatus.BookStatus_Update);
                }

                var ContentLenStr = linkNodes.SelectSingleNode("//ul[@class='book_list']/li[2]")?.InnerText;
                if (!string.IsNullOrEmpty(ContentLenStr))
                {
                    int ContentLen = 0;

                    // NOTE(review): the '.' is deliberately left unescaped so that values
                    // written with a thousands separator (e.g. "12,345") are still captured.
                    Match lenMatch = Regex.Match(ContentLenStr, @"(\d+.\d+)|(\d+)");
                    double number;

                    // Scraped text is not locale-formatted, so parse with the invariant
                    // culture; TryParse leaves the length at 0 instead of throwing.
                    if (lenMatch.Success &&
                        double.TryParse(lenMatch.Value,
                                        System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
                                        System.Globalization.CultureInfo.InvariantCulture,
                                        out number))
                    {
                        // "万" = ten thousand, "千" = thousand.
                        double factor = 1;
                        if (ContentLenStr.Contains("万"))
                        {
                            factor = 10000;
                        }
                        else if (ContentLenStr.Contains("千"))
                        {
                            factor = 1000;
                        }
                        ContentLen = (int)Math.Min(number * factor, int.MaxValue);
                    }
                    ht.Add(CollectionFieldName.Novel_ContentLen, ContentLen);
                }
            }

            // Link to the chapter list.
            linkNodes = documentNode.SelectSingleNode("//a[@class='more_link']");
            if (linkNodes != null)
            {
                var url = linkNodes.GetAttributeValue("href", "");
                url = HTMLUtil.GetFullURL(InternalRealUrl, url);
                ht.Add(CollectionFieldName.Url, url);
            }

            // Book description (kept as HTML).
            linkNodes = documentNode.SelectSingleNode("//div[@id='divDescription']");
            if (linkNodes != null)
            {
                ht.Add(CollectionFieldName.Novel_Intr, linkNodes.InnerHtml);
            }

            // The numeric id in the page URL serves as the book's unique flag.
            Match idMatch = Regex.Match(InternalRealUrl, @"/book/(\d+)\.html");
            if (idMatch.Success)
            {
                ht.Add(CollectionFieldName.Novel_UniqueFlag, idMatch.Groups[1].Value);
            }

            Hashtable returndata = new Hashtable();
            returndata.Add(CollectionFieldName.BookInfo, ht);
            return returndata;
        }

Ejemplo n.º 17

0

Mostrar archivo

        /// <summary>
        /// Parses a chapter detail page: takes the chapter body from the "content" div,
        /// removes a fixed list of advertising snippets, and derives the chapter's length,
        /// intro, price, status and type.
        /// </summary>
        /// <param name="documentNode">Root node of the parsed HTML page.</param>
        /// <returns>
        /// Table of chapter fields; empty when the page has no "content" div.
        /// </returns>
        private Hashtable parseDetailPage1(HtmlNode documentNode)
        {
            Hashtable result      = new Hashtable();
            HtmlNode  contentNode = documentNode.SelectSingleNode("//div[@id=\"content\"]");

            if (contentNode == null)
            {
                return result;
            }

            // Snippets removed, in this order, before the trim step.
            string[] junkBeforeTrim =
            {
                "\r\n",
                "\t",
                "手机请访问：:feisuz",
                "feisuz",
                "作者的话:",
                "新书，求收藏求推荐",
                "本书红薯网首发,请勿转载!",
                "老铁!还在找\"美艳冥妻\"免费小说?",
                "&nbsp;&nbsp;&nbsp;&nbsp;(www.yikanxiaoshuo.com = "
            };

            // Snippets removed, in this order, after the trim step.
            string[] junkAfterTrim =
            {
                "<br>&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;百度直接搜索: \"易看小说\" 看免费小说,没毛病!<br>",
                "老铁!还在找\"绝望游戏\"免费小说?",
                "百度直接搜索: \"易看小说\" 看免费小说,没毛病!",
                "\"易看小说\"",
                "易看小说",
                "免费小说",
                "(更快免费阅读加微信：jxxs9966)",
                "jxxs9966"
            };

            string html = HTMLUtil.RemoveHtmlContent(contentNode.InnerHtml, "div", "style", "script", "a");
            foreach (string junk in junkBeforeTrim)
            {
                html = html.Replace(junk, "");
            }

            // Drops surrounding whitespace and any closing parentheses left at the very end.
            html = html.Trim().TrimEnd(')');

            foreach (string junk in junkAfterTrim)
            {
                html = html.Replace(junk, "");
            }
            result.Add(CollectionFieldName.Chap_Content, html);

            // Tag-free text without non-breaking spaces; used for the length and the intro.
            string plainText = HTMLUtil.RemoveHtmlTag(html).Replace("&nbsp;", "");
            if (!string.IsNullOrEmpty(plainText))
            {
                result.Add(CollectionFieldName.Chap_ContentLen, plainText.Length);

                // Intro is the first 40 characters followed by an ellipsis, or the whole text.
                string intro = plainText.Length > 40
                    ? plainText.Substring(0, 40) + "..."
                    : plainText;
                result.Add(CollectionFieldName.Chap_Intro, intro);

                // 5 per full 1000 characters of the HTML content, kept within 5..15.
                int price = (html.Length / 1000) * 5;
                price = price == 0 ? 5 : price;
                price = price > 15 ? 15 : price;
                result.Add(CollectionFieldName.Chap_Pirce, price);
            }

            result.Add(CollectionFieldName.Chap_Status, ChapterStatus.ChapterStatus_OnLine);
            result.Add(CollectionFieldName.Chap_ChapterType, ChapterType.ChapterType_Free);
            return result;
        }

Ejemplo n.º 18

0

Mostrar archivo

Archivo: web_shubao520.cs Proyecto: change008/noveltools

        /// <summary>
        /// Parses a chapter detail page: takes the chapter body from the "booktext" div,
        /// lower-cases it, removes site watermarks and domain-like text, and derives the
        /// chapter's length, intro, price, status and type.
        /// </summary>
        /// <param name="documentNode">Root node of the parsed HTML page.</param>
        /// <returns>
        /// Table of chapter fields; empty when the page has no "booktext" div.
        /// </returns>
        private Hashtable parseDetailPage1(HtmlNode documentNode)
        {
            Hashtable result   = new Hashtable();
            HtmlNode  bodyNode = documentNode.SelectSingleNode("//div[@id='booktext']");

            if (bodyNode == null)
            {
                return result;
            }

            // Line breaks, tabs and site watermarks, removed in exactly this order.
            string[] watermarks =
            {
                "\r\n",
                "\t",
                "【快速查找本站请百度搜索:&nbsp;书包520】",
                "【本站域名更改为“&nbsp;www.shubao520.net&nbsp;”&nbsp;,或者在百度搜索:&nbsp;书包520】",
                "【本站域名更改为“www.shubao520.net”,或者在百度搜索:书包520】",
                "【快速查找本站请百度搜索:书包520】",
                "书包520",
                "www.shubao520.net",
                "百度",
                "搜索",
                "域名"
            };

            string html = HTMLUtil.RemoveHtmlContent(bodyNode.InnerHtml, "div", "style", "script", "dt", "a");
            html = html.ToLower();
            foreach (string mark in watermarks)
            {
                html = html.Replace(mark, "");
            }

            // Strip domain-like text.
            // NOTE(review): the leading lookahead is anchored to the end of the whole string,
            // so it appears only hosts starting within the last 255 characters can match -
            // confirm whether that is intended.
            html = Regex.Replace(html, @"(?=.{3,255}$)[a-zA-Z0-9][-a-zA-Z0-9]{0,62}(\.[a-zA-Z0-9][-a-zA-Z0-9]{0,62})+", "");

            // Strip HTML-escaped tags; the match is greedy, from the first "&lt;" to the last "&gt;".
            html = Regex.Replace(html, @"&lt;.+&gt;", "");

            result.Add(CollectionFieldName.Chap_Content, html);

            // Tag-free text with filler removed; used for the length, intro and price.
            string plainText = HTMLUtil.RemoveHtmlTag(html);
            plainText = plainText.Replace("&nbsp;", "");
            plainText = plainText.Replace("feisuz", "");
            plainText = plainText.Replace("作者的话:", "");
            plainText = plainText.Replace("新书，求收藏求推荐", "");
            plainText = plainText.Replace("本书红薯网首发,请勿转载!", "");

            if (!string.IsNullOrEmpty(plainText))
            {
                result.Add(CollectionFieldName.Chap_ContentLen, plainText.Length);

                // Intro is the first 40 characters followed by an ellipsis, or the whole text.
                string intro = plainText.Length > 40
                    ? plainText.Substring(0, 40) + "..."
                    : plainText;
                result.Add(CollectionFieldName.Chap_Intro, intro);

                // 5 per full 1000 characters of plain text, kept within 5..15.
                int price = (plainText.Length / 1000) * 5;
                price = price == 0 ? 5 : price;
                price = price > 15 ? 15 : price;
                result.Add(CollectionFieldName.Chap_Pirce, price);
            }

            result.Add(CollectionFieldName.Chap_Status, ChapterStatus.ChapterStatus_OnLine);
            result.Add(CollectionFieldName.Chap_ChapterType, ChapterType.ChapterType_Free);
            return result;
        }

Ejemplo n.º 19

0

Mostrar archivo

        /// <summary>
        /// Searches Allmusic for the requested string and records the hits it finds.
        /// </summary>
        /// <param name="searchBy">Whether to search for artists or albums.</param>
        /// <param name="searchStr">Text to search for; it is URL-encoded before being posted.</param>
        /// <returns>
        /// <c>false</c> when the site returned an empty response; otherwise <c>true</c>,
        /// whether or not the response contained a result list.
        /// </returns>
        public bool FindInfo(SearchBy searchBy, string searchStr)
        {
            _searchby = searchBy;
            HTMLUtil util = new HTMLUtil();
            string   strPostData;

            if (SearchBy.Albums == searchBy)
            {
                strPostData = string.Format(ALBUMSEARCH, HttpUtility.UrlEncode(searchStr));
            }
            else
            {
                searchStr   = SwitchArtist(searchStr);
                strPostData = string.Format(ARTISTSEARCH, HttpUtility.UrlEncode(searchStr));
            }

            string strHTML = PostHTTP(MAINURL + URLPROGRAM, strPostData);

            if (strHTML.Length == 0)
            {
                return false;
            }

            _htmlCode = strHTML; // keep the raw page for later use

            Regex multiples = new Regex(
                @"\sSearch\sResults\sfor:",
                RegexOptions.IgnoreCase
                | RegexOptions.Multiline
                | RegexOptions.IgnorePatternWhitespace
                | RegexOptions.Compiled
                );

            if (!multiples.IsMatch(strHTML))
            {
                // No result list on the page (the original comment reads "found the right
                // one"); the page itself stays available through _htmlCode.
                return true;
            }

            // One pattern per result-table layout; the album table has one extra column
            // between the two captured detail cells.
            string pattern = "";
            if (SearchBy.Artists == searchBy)
            {
                pattern = @"<tr.*?>\s*?.*?<td\s*?class=""relevance\stext-center"">\s*?.*\s*?.*</td>" +
                          @"\s*?.*<td.*\s*?.*</td>\s*?.*<td>.*<a.*href=""(?<code>.*?)"">(?<name>.*)</a>.*</td>" +
                          @"\s*?.*<td>(?<detail>.*)</td>\s*?.*<td>(?<detail2>.*)</td>";
            }
            else if (SearchBy.Albums == searchBy)
            {
                pattern = @"<tr.*?>\s*?.*?<td\s*?class=""relevance\stext-center"">\s*?.*\s*?.*</td>" +
                          @"\s*?.*<td.*\s*?.*</td>\s*?.*<td>.*<a.*href=""(?<code>.*?)"">(?<name>.*)</a>.*</td>" +
                          @"\s*?.*<td>(?<detail>.*)</td>\s*?.*<td>.*</td>\s*?.*<td>(?<detail2>.*)</td>";
            }

            // An empty pattern would match at every position of the page and flood the
            // result lists with blank entries, so only scan when a layout was selected.
            if (pattern.Length > 0)
            {
                Regex itemsFoundFromSite = new Regex(
                    pattern,
                    RegexOptions.IgnoreCase
                    | RegexOptions.Multiline
                    | RegexOptions.IgnorePatternWhitespace
                    | RegexOptions.Compiled
                    );

                for (Match m = itemsFoundFromSite.Match(strHTML); m.Success; m = m.NextMatch())
                {
                    string code    = m.Groups["code"].Value;
                    string name    = m.Groups["name"].Value;
                    string detail  = m.Groups["detail"].Value;
                    string detail2 = m.Groups["detail2"].Value;

                    util.RemoveTags(ref name);
                    util.ConvertHTMLToAnsi(name, out name);

                    util.RemoveTags(ref detail);
                    util.ConvertHTMLToAnsi(detail, out detail);

                    util.RemoveTags(ref detail2);
                    util.ConvertHTMLToAnsi(detail2, out detail2);

                    if (SearchBy.Artists == searchBy)
                    {
                        // Append only the detail parts that are present. Previously the
                        // separator was appended before the emptiness check, so the
                        // "name only" case could never be reached.
                        string label = name;
                        if (!string.IsNullOrEmpty(detail))
                        {
                            label += " - " + detail;
                        }
                        if (!string.IsNullOrEmpty(detail2))
                        {
                            label += " - " + detail2;
                        }
                        _codes.Add(code);
                        _values.Add(label);
                    }
                    else
                    {
                        MusicAlbumInfo albumInfo = new MusicAlbumInfo();
                        albumInfo.AlbumURL      = code;
                        albumInfo.Artist        = detail;
                        albumInfo.Title         = name;
                        albumInfo.DateOfRelease = detail2;
                        _albumList.Add(albumInfo);
                    }
                }
            }

            _multiple = true;
            return true;
        }

Ejemplo n.º 20

0

Mostrar archivo

Archivo: basicHTML.cs Proyecto: drualcman/Library

 /// <summary>
 /// Cleans an HTML text by delegating to <c>HTMLUtil.sanitize</c>. Per the original
 /// description, line breaks are replaced with the HTML break tag and HTML tags are
 /// replaced with their ASCII equivalent.
 /// </summary>
 /// <param name="texto">HTML text to clean.</param>
 /// <returns>The value returned by <c>HTMLUtil.sanitize</c>.</returns>
 public static string sanitize(string texto)
 {
     string cleaned = HTMLUtil.sanitize(texto);
     return cleaned;
 }

Ejemplo n.º 21

0

Mostrar archivo

        /// <summary>
        /// Parses the book's start page, reading the title and category from the page's
        /// "og:" meta tags and the cover from the cover image element.
        /// </summary>
        /// <param name="documentNode">Root node of the parsed HTML page.</param>
        /// <returns>
        /// A table with a single <c>CollectionFieldName.BookInfo</c> entry whose value is a
        /// nested table holding the extracted fields.
        /// </returns>
        private Hashtable parseStartupPage(HtmlNode documentNode)
        {
            Hashtable bookInfo = new Hashtable();

            // Title.
            HtmlNode titleMeta = documentNode.SelectSingleNode("//meta[@property='og:title']");
            if (titleMeta != null)
            {
                bookInfo.Add(CollectionFieldName.Novel_Name, titleMeta.GetAttributeValue("content", ""));
            }

            // Cover image, resolved against the page URL.
            HtmlNode coverImage = documentNode.SelectSingleNode("//div[@class='pic text-center']/img");
            if (coverImage != null)
            {
                string coverSrc = coverImage.GetAttributeValue("src", "");
                bookInfo.Add(CollectionFieldName.Novel_CoverImgs, HTMLUtil.GetFullURL(InternalRealUrl, coverSrc));
            }

            // Category.
            HtmlNode categoryMeta = documentNode.SelectSingleNode("//meta[@property='og:novel:category']");
            if (categoryMeta != null)
            {
                bookInfo.Add(CollectionFieldName.Novel_Tag, categoryMeta.GetAttributeValue("content", ""));
            }

            // Status, word count, chapter-list link and description are not extracted here;
            // the page URL itself is stored as the book URL.
            bookInfo.Add(CollectionFieldName.Url, InternalRealUrl);

            // The number just before ".html" in the page URL serves as the unique flag.
            Match idMatch = Regex.Match(InternalRealUrl, @"/(\d+).html");
            if (idMatch.Success)
            {
                bookInfo.Add(CollectionFieldName.Novel_UniqueFlag, idMatch.Groups[1].Value);
            }

            Hashtable wrapper = new Hashtable();
            wrapper.Add(CollectionFieldName.BookInfo, bookInfo);
            return wrapper;
        }

Ejemplo n.º 22

0

Mostrar archivo

 /// <summary>
 /// Gets the full URL for the given address by passing it, together with the class's
 /// <c>baseUrl</c> member, to <c>HTMLUtil.GetFullURL</c>.
 /// </summary>
 /// <param name="relativeURL">The address to resolve.</param>
 /// <returns>The value returned by <c>HTMLUtil.GetFullURL</c>.</returns>
 protected string GetFullURL(string relativeURL)
 {
     string fullUrl = HTMLUtil.GetFullURL(baseUrl, relativeURL);
     return fullUrl;
 }

Ejemplo n.º 23

0

Mostrar archivo

Archivo: MusicArtistInfo.cs Proyecto: MustafaUzumcuCom/MediaPortal-1

        /// <summary>
        /// Parses the artist detail page returned by the Allmusic scraper and fills this
        /// object's artist fields and discography lists.
        /// </summary>
        /// <param name="strHTML">HTML of the artist detail page.</param>
        /// <returns>
        /// <c>false</c> when the artist name heading cannot be found; otherwise <c>true</c>.
        /// Every other section is optional and is skipped when it is missing.
        /// </returns>
        public bool Parse(string strHTML)
        {
            HTMLUtil util = new HTMLUtil();

            // Lower-cased copy used only to locate section markers regardless of case; the
            // indices found in it are applied to the original text.
            string strHTMLLow = strHTML.ToLowerInvariant();

            // Get the Artist Name
            string pattern = @"<h1.*class=""title"">(.*)</h1>";

            if (!FindPattern(pattern, strHTML))
            {
                return false;
            }

            _strArtistName = _match.Groups[1].Value;

            // Born
            pattern = @"<h3>.*Born.*</h3>\s*?<p>(.*)</p>";
            if (FindPattern(pattern, strHTML))
            {
                string strValue = _match.Groups[1].Value;
                util.RemoveTags(ref strValue);
                util.ConvertHTMLToAnsi(strValue, out _strBorn);
                _strBorn = _strBorn.Trim();
            }

            // Years Active
            pattern = @"(<span.*?class=""active"">(.*?)</span>)";
            if (FindPattern(pattern, strHTML))
            {
                while (_match.Success)
                {
                    _strYearsActive += string.Format("{0}s, ", _match.Groups[2].Value);
                    _match           = _match.NextMatch();
                }
                _strYearsActive = _strYearsActive.Trim(new[] { ' ', ',' });
            }

            // Genre
            pattern = @"<div.*?id=""genre-style"">\s*?.*?\s*?<h3>.*?Genres.*?</h3>\s*?.*?(<p>(.*?)</p>)";
            if (FindPattern(pattern, strHTML))
            {
                _strGenres = JoinMatchedItems(util);
            }

            // Style, Mood and Instruments are all <li> lists that follow a heading. Each
            // call uses its own list pattern, so Instruments no longer depends on which of
            // the earlier sections happened to be present on the page.
            string styles = ExtractListSection(util, strHTML, strHTMLLow, "<h3>styles</h3>", "<!--end genre/styles-->");
            if (styles != null)
            {
                _strStyles = styles;
            }

            string moods = ExtractListSection(util, strHTML, strHTMLLow, "<h3>moods</h3>", "</div>");
            if (moods != null)
            {
                _strTones = moods;
            }

            string instruments = ExtractListSection(util, strHTML, strHTMLLow, "<h3>instruments</h3>", "</div>");
            if (instruments != null)
            {
                _strInstruments = instruments;
            }

            // picture URL
            pattern = @"<div.*?class=""image"">\s*?.*<img.*id=""artist_image"".*?src=\""(.*?)\""";
            if (FindPattern(pattern, strHTML))
            {
                _strArtistPictureURL = _match.Groups[1].Value;
            }

            // parse AMG BIOGRAPHY
            pattern = @"<td.*?class=""tab_off""><a.*?href=""(.*?)"">.*?Biography.*?</a>";
            if (FindPattern(pattern, strHTML))
            {
                try
                {
                    string contentinfo = AllmusicSiteScraper.GetHTTP(_match.Groups[1].Value);

                    // The markers only confirm that the page has a biography block; the
                    // text pattern below is run against the whole page.
                    int begIndex = contentinfo.IndexOf("<!--Begin Biography -->", StringComparison.Ordinal);
                    int endIndex = begIndex == -1
                        ? -1
                        : contentinfo.IndexOf("</div>", begIndex + 2, StringComparison.Ordinal);

                    if (begIndex != -1 && endIndex != -1)
                    {
                        pattern = @"<p.*?class=""text"">(.*?)</p>";
                        if (FindPattern(pattern, contentinfo))
                        {
                            string data = _match.Groups[1].Value;
                            util.RemoveTags(ref data);
                            util.ConvertHTMLToAnsi(data, out data);
                            _strAMGBiography = data.Trim();
                        }
                    }
                }
                catch (Exception)
                {
                    // Best effort: the biography is optional, so a failed download or
                    // parse leaves it unset.
                }
            }

            // discography (albums) and the sub pages that hang off the same link
            pattern = @"<td.*class=""tab_off""><a.*?href=""(.*?)"">.*Discography.*</a>";
            if (FindPattern(pattern, strHTML))
            {
                // Capture the link first: the helper reuses _match for the table rows.
                string discographyUrl = _match.Groups[1].Value;

                LoadDiscographyPage(util, discographyUrl, row => _discographyAlbum.Add(row));
                LoadDiscographyPage(util, discographyUrl + "/compilations", row => _discographyCompilations.Add(row));
                LoadDiscographyPage(util, discographyUrl + "/singles-eps", row => _discographySingles.Add(row));

                // NOTE(review): DVDs/videos are stored in the same list as "other", as in
                // the original code - confirm there is no dedicated list for them.
                LoadDiscographyPage(util, discographyUrl + "/dvds-videos", row => _discographyMisc.Add(row));
                LoadDiscographyPage(util, discographyUrl + "/other", row => _discographyMisc.Add(row));
            }

            _bLoaded = true;
            return _bLoaded;
        }

        /// <summary>
        /// Joins capture group 2 of the current <c>_match</c> and of every following match
        /// with ", ", passes the result through the utility's tag removal and HTML
        /// conversion, and trims surrounding spaces and commas.
        /// </summary>
        /// <param name="util">Utility used to remove tags and convert the HTML text.</param>
        /// <returns>The cleaned, comma-separated list.</returns>
        private string JoinMatchedItems(HTMLUtil util)
        {
            string data = "";
            while (_match.Success)
            {
                data  += string.Format("{0}, ", _match.Groups[2].Value);
                _match = _match.NextMatch();
            }

            util.RemoveTags(ref data);

            string converted;
            util.ConvertHTMLToAnsi(data, out converted);
            return converted.Trim(new[] { ' ', ',' });
        }

        /// <summary>
        /// Reads the &lt;li&gt; items found between two markers of the page as a
        /// comma-separated list.
        /// </summary>
        /// <param name="util">Utility used to remove tags and convert the HTML text.</param>
        /// <param name="html">The page in its original casing.</param>
        /// <param name="htmlLower">Lower-cased copy of <paramref name="html"/> used to find the markers.</param>
        /// <param name="startMarker">Lower-case text that opens the section.</param>
        /// <param name="endMarker">Lower-case text that closes the section.</param>
        /// <returns>
        /// The list, or <c>null</c> when a marker is missing or the section has no items.
        /// </returns>
        private string ExtractListSection(HTMLUtil util, string html, string htmlLower,
                                          string startMarker, string endMarker)
        {
            int begIndex = htmlLower.IndexOf(startMarker, StringComparison.Ordinal);
            if (begIndex == -1)
            {
                return null;
            }

            int endIndex = htmlLower.IndexOf(endMarker, begIndex + 2, StringComparison.Ordinal);
            if (endIndex == -1)
            {
                return null;
            }

            string contentInfo = html.Substring(begIndex, endIndex - begIndex);
            if (!FindPattern(@"(<li>(.*?)</li>)", contentInfo))
            {
                return null;
            }
            return JoinMatchedItems(util);
        }

        /// <summary>
        /// Downloads one discography page and hands every table row found on it to
        /// <paramref name="addEntry"/> as a { year, album title, label } array.
        /// </summary>
        /// <param name="util">Utility used to remove tags and convert the HTML text.</param>
        /// <param name="pageUrl">Address of the discography page to download.</param>
        /// <param name="addEntry">Receives each parsed row.</param>
        private void LoadDiscographyPage(HTMLUtil util, string pageUrl, Action<string[]> addEntry)
        {
            const string rowPattern = @"sorted.*? cell"">(?<year>.*?)</td>\s*?.*?</td>\s*.*?<a.*?"">(?<album>.*?)" +
                                      @"</a>.*?</td>\s*.*?</td>\s*.*?"">(?<label>.*?)</td>";

            try
            {
                string contentinfo = AllmusicSiteScraper.GetHTTP(pageUrl);
                if (!FindPattern(rowPattern, contentinfo))
                {
                    return;
                }

                while (_match.Success)
                {
                    string year       = _match.Groups["year"].Value;
                    string albumTitle = _match.Groups["album"].Value;
                    string label      = _match.Groups["label"].Value;

                    util.RemoveTags(ref year);
                    util.ConvertHTMLToAnsi(year, out year);
                    util.RemoveTags(ref albumTitle);
                    util.ConvertHTMLToAnsi(albumTitle, out albumTitle);
                    util.RemoveTags(ref label);
                    util.ConvertHTMLToAnsi(label, out label);

                    try
                    {
                        string[] dAlbumInfo = { year.Trim(), albumTitle.Trim(), label.Trim() };
                        addEntry(dAlbumInfo);
                    }
                    catch
                    {
                        // Skip a row that cannot be stored and continue with the next one.
                    }

                    _match = _match.NextMatch();
                }
            }
            catch (Exception)
            {
                // Best effort: a sub page that cannot be downloaded or parsed simply
                // contributes no entries.
            }
        }

Ejemplo n.º 24

0

Mostrar archivo

Archivo: MusicInfoScraper.cs Proyecto: MustafaUzumcuCom/MediaPortal-1

        /// <summary>
        /// Queries allmusic.com for an album and fills the album list with the results.
        /// </summary>
        /// <remarks>
        /// The response is first handed to <c>MusicAlbumInfo.Parse</c>; when that succeeds the
        /// response is treated as a single album page. Otherwise the HTML table that carries
        /// <c>id="expansiontable1"</c> is parsed as a list of search hits.
        /// </remarks>
        /// <param name="strAlbum">Album title to search for; it is URL-encoded into the request.</param>
        /// <param name="artistName">Artist name, passed on to <c>AlbumSort</c> for ordering the hits.</param>
        /// <param name="releaseYear">Release year, passed on to <c>AlbumSort</c> for ordering the hits.</param>
        /// <returns>
        /// <c>true</c> when a single album was parsed or a result table was found (even if no row
        /// produced a hit); <c>false</c> when the response is empty or contains no result table.
        /// </returns>
        public bool FindAlbuminfo(string strAlbum, string artistName, int releaseYear)
        {
            _albumList.Clear();

            // make request
            // type is
            // http://www.allmusic.com/cg/amg.dll?P=amg&SQL=escapolygy&OPT1=2

            HTMLUtil util     = new HTMLUtil();
            string   postData = String.Format("P=amg&SQL={0}&OPT1=2", HttpUtility.UrlEncode(strAlbum));

            string html = PostHTTP("http://www.allmusic.com/cg/amg.dll", postData);

            // Treat a null response like an empty one instead of throwing.
            if (string.IsNullOrEmpty(html))
            {
                return(false);
            }

            // check if this is an album
            MusicAlbumInfo newAlbum = new MusicAlbumInfo();

            newAlbum.AlbumURL = "http://www.allmusic.com/cg/amg.dll?" + postData;
            if (newAlbum.Parse(html))
            {
                _albumList.Add(newAlbum);
                return(true);
            }

            // Locate the result table. Ordinal, case-insensitive search on the original HTML
            // avoids the culture-dependent lower-casing (e.g. Turkish 'I' -> dotless 'i').
            int startOfTable = html.IndexOf("id=\"expansiontable1\"", StringComparison.OrdinalIgnoreCase);

            if (startOfTable < 0)
            {
                return(false);
            }
            startOfTable = html.LastIndexOf("<table", startOfTable, StringComparison.OrdinalIgnoreCase);
            if (startOfTable < 0)
            {
                return(false);
            }

            HTMLTable table = new HTMLTable();

            table.Parse(html.Substring(startOfTable));

            const string hrefMarker = "<a href=\"";

            // Row 0 is skipped (presumably the table header -- confirm against the page layout).
            for (int i = 1; i < table.Rows; ++i)
            {
                HTMLTable.HTMLRow row          = table.GetRow(i);
                string            albumName    = "";
                string            nameOfAlbum  = "";
                string            nameOfArtist = "";
                for (int iCol = 0; iCol < row.Columns; ++iCol)
                {
                    string column = row.GetColumValue(iCol);
                    if (iCol == 1 && (column.Length != 0))
                    {
                        albumName = "(" + column + ")";
                    }
                    if (iCol == 2)
                    {
                        nameOfArtist = column;
                        util.RemoveTags(ref nameOfArtist);
                        if (!column.Equals("&nbsp;"))
                        {
                            albumName = String.Format("- {0} {1}", nameOfArtist, albumName);
                        }
                    }
                    if (iCol == 4)
                    {
                        string tempAlbum = column;
                        util.RemoveTags(ref tempAlbum);
                        albumName   = String.Format("{0} {1}", tempAlbum, albumName);
                        nameOfAlbum = tempAlbum;

                        // Only rows whose title cell links to an album page become hits.
                        int hrefStart = column.IndexOf(hrefMarker, StringComparison.Ordinal);
                        if (hrefStart >= 0)
                        {
                            hrefStart += hrefMarker.Length;
                            int hrefEnd = column.IndexOf("\">", hrefStart, StringComparison.Ordinal);
                            if (hrefEnd >= 0)
                            {
                                if (nameOfAlbum.Length == 0)
                                {
                                    nameOfAlbum = albumName;
                                }

                                // full album url:
                                // http://www.allmusic.com/cg/amg.dll?p=amg&token=&sql=10:66jieal64xs7
                                string url      = column.Substring(hrefStart, hrefEnd - hrefStart);
                                string albumUrl = String.Format("http://www.allmusic.com{0}", url);
                                string albumNameStripped;
                                util.ConvertHTMLToAnsi(albumName, out albumNameStripped);

                                MusicAlbumInfo newAlbumInfo = new MusicAlbumInfo();
                                newAlbumInfo.Title2   = albumNameStripped;
                                newAlbumInfo.AlbumURL = util.ConvertHTMLToAnsi(albumUrl);
                                newAlbumInfo.Artist   = util.ConvertHTMLToAnsi(nameOfArtist);
                                newAlbumInfo.Title    = util.ConvertHTMLToAnsi(nameOfAlbum);
                                _albumList.Add(newAlbumInfo);
                            }
                        }
                    }
                }
            }

            // now sort
            _albumList.Sort(new AlbumSort(strAlbum, artistName, releaseYear));
            return(true);
        }

Ejemplo n.º 25

0

Mostrar archivo

Archivo: MusicAlbumInfo.cs Proyecto: MustafaUzumcuCom/MediaPortal-1

        /// <summary>
        /// Parses an allmusic.com album page and fills this instance with cover URL, review,
        /// artist, title, rating, release year, genre, styles, moods and the track list.
        /// </summary>
        /// <remarks>
        /// Every section is optional: a section whose markup is not found leaves the matching
        /// field untouched. Text fields that are still empty at the end are set to the
        /// localized string 416.
        /// </remarks>
        /// <param name="html">Raw HTML of the album page.</param>
        /// <returns>Always <c>true</c>; <c>Loaded</c> is set before returning.</returns>
        public bool Parse(string html)
        {
            _songs.Clear();
            HTMLUtil util = new HTMLUtil();

            // Lower-cased copy used only for locating section markers. Invariant casing keeps
            // "</div>" findable under cultures with special 'I' handling, and the copy has the
            // same length as html so indexes are interchangeable.
            string strHtmlLow = html.ToLowerInvariant();

            //	Extract Cover URL
            string pattern = @"<!--Begin.*?Album.*?Photo-->\s*?.*?<img.*?src=\""(.*?)\""";

            if (FindPattern(pattern, html))
            {
                _strImageURL = _match.Groups[1].Value;
            }

            //	Extract Review
            pattern = @"<td.*?class=""tab_off""><a.*?href=""(.*?)"">.*?Review.*?</a>";
            if (FindPattern(pattern, html))
            {
                try
                {
                    string contentinfo = AllmusicSiteScraper.GetHTTP(_match.Groups[1].Value);
                    pattern = @"<p.*?class=""author"">.*\s*?.*?<p.*?class=""text"">(.*?)</p>";
                    if (FindPattern(pattern, contentinfo))
                    {
                        string data = _match.Groups[1].Value;
                        util.RemoveTags(ref data);
                        util.ConvertHTMLToAnsi(data, out data);
                        _strReview = data.Trim();
                    }
                }
                catch (Exception)
                {
                    // Best effort: the review lives on a second page; a failed download or
                    // parse must not abort parsing of the remaining album data.
                }
            }

            //	Extract Artist
            pattern = @"<h3.*?artist</h3>\s*?.*?<a.*"">(.*)</a>";
            if (FindPattern(pattern, html))
            {
                _artist = _match.Groups[1].Value;
                util.RemoveTags(ref _artist);
            }

            //	Extract Album
            pattern = @"<h3.*?album</h3>\s*?.*?<p>(.*)</P>";
            if (FindPattern(pattern, html))
            {
                _strTitle = _match.Groups[1].Value;
                util.RemoveTags(ref _strTitle);
            }

            // Extract Rating
            pattern = @"<h3.*?rating</h3>\s*?.*?src=""(.*?)""";
            if (FindPattern(pattern, html))
            {
                string strRating = _match.Groups[1].Value;
                util.RemoveTags(ref strRating);

                // The rating digit is read from offset 26 of the image URL. Guard the length so
                // a shorter URL no longer throws, and keep the old rating when it is not a digit.
                int rating;
                if (strRating != null && strRating.Length > 26 &&
                    int.TryParse(strRating.Substring(26, 1), out rating))
                {
                    _iRating = rating;
                }
            }

            //	Release Date
            pattern = @"<h3.*?release.*?date</h3>\s*?.*?<p>(.*)</P>";
            if (FindPattern(pattern, html))
            {
                _strDateOfRelease = _match.Groups[1].Value;
                util.RemoveTags(ref _strDateOfRelease);

                //	extract the year out of something like "1998 (release)" or "12 feb 2003"
                _strDateOfRelease = ExtractYear(_strDateOfRelease, "19", 2);
                _strDateOfRelease = ExtractYear(_strDateOfRelease, "20", 1);
            }

            // Extract Genre
            string genre = ExtractListItems(html, strHtmlLow, "<h3>genre</h3>", util);
            if (genre != null)
            {
                _strGenre = genre;
            }

            // Extract Styles
            string styles = ExtractListItems(html, strHtmlLow, "<h3>style</h3>", util);
            if (styles != null)
            {
                _strStyles = styles;
            }

            // Extract Moods
            string moods = ExtractListItems(html, strHtmlLow, "<h3>moods</h3>", util);
            if (moods != null)
            {
                _strTones = moods;
            }

            // Extract Songs
            int tracksStart = strHtmlLow.IndexOf("<!-- tracks table -->", StringComparison.Ordinal);
            int tracksEnd   = tracksStart == -1
                                  ? -1
                                  : strHtmlLow.IndexOf("<!-- end tracks table -->", tracksStart + 2, StringComparison.Ordinal);
            if (tracksStart != -1 && tracksEnd != -1)
            {
                string contentInfo = html.Substring(tracksStart, tracksEnd - tracksStart);
                pattern = @"<tr.*class=""visible"".*?\s*?<td.*</td>\s*?.*<td.*</td>\s*?.*<td.*?>(?<track>.*)</td>" +
                          @"\s*?.*<td.*</td>\s*?.*<td.*?>(?<title>.*)</td>\s*?.*?<td.*?>\s*?.*</td>\s*?.*?<td.*?>(?<duration>.*)</td>";

                if (FindPattern(pattern, contentInfo))
                {
                    while (_match.Success)
                    {
                        //	Tracknumber (0 when the cell is not a number)
                        int iTrack;
                        int.TryParse(_match.Groups["track"].Value, out iTrack);

                        // Song Title
                        string strTitle = _match.Groups["title"].Value;
                        util.RemoveTags(ref strTitle);
                        util.ConvertHTMLToAnsi(strTitle, out strTitle);

                        //	Duration, given as "m:ss" and stored in seconds
                        int    iDuration   = 0;
                        string strDuration = _match.Groups["duration"].Value;
                        int    iPos        = strDuration.IndexOf(':');
                        if (iPos >= 0)
                        {
                            int iMin = 0, iSec = 0;

                            // Seconds are only parsed when the minutes parsed, matching the
                            // previous behaviour where a bad minute value aborted both parses.
                            if (int.TryParse(strDuration.Substring(0, iPos), out iMin))
                            {
                                int.TryParse(strDuration.Substring(iPos + 1), out iSec);
                            }
                            iDuration = iMin * 60 + iSec;
                        }

                        //	Create new song object
                        MusicSong newSong = new MusicSong();
                        newSong.Track    = iTrack;
                        newSong.SongName = strTitle;
                        newSong.Duration = iDuration;
                        _songs.Add(newSong);

                        _match = _match.NextMatch();
                    }
                }
            }

            //	Set to "Not available" if no value from web
            if (_artist.Length == 0)
            {
                _artist = GUILocalizeStrings.Get(416);
            }
            if (_strDateOfRelease.Length == 0)
            {
                _strDateOfRelease = GUILocalizeStrings.Get(416);
            }
            if (_strGenre.Length == 0)
            {
                _strGenre = GUILocalizeStrings.Get(416);
            }
            if (_strTones.Length == 0)
            {
                _strTones = GUILocalizeStrings.Get(416);
            }
            if (_strStyles.Length == 0)
            {
                _strStyles = GUILocalizeStrings.Get(416);
            }
            if (_strTitle.Length == 0)
            {
                _strTitle = GUILocalizeStrings.Get(416);
            }

            if (_strTitle2.Length == 0)
            {
                _strTitle2 = _strTitle;
            }

            Loaded = true;
            return(true);
        }

        /// <summary>
        /// Collects the <c>&lt;li&gt;</c> entries between <paramref name="heading"/> and the next
        /// <c>&lt;/div&gt;</c> into a comma separated, tag-free string.
        /// </summary>
        /// <param name="html">Original page HTML; the entries are read from here.</param>
        /// <param name="htmlLower">Lower-cased copy of <paramref name="html"/> used to locate the section.</param>
        /// <param name="heading">Lower-case heading markup that starts the section.</param>
        /// <param name="util">Helper used to strip tags and decode HTML entities.</param>
        /// <returns>The joined entries, or <c>null</c> when the section or its entries are missing.</returns>
        private string ExtractListItems(string html, string htmlLower, string heading, HTMLUtil util)
        {
            int begIndex = htmlLower.IndexOf(heading, StringComparison.Ordinal);
            if (begIndex == -1)
            {
                return(null);
            }

            int endIndex = htmlLower.IndexOf("</div>", begIndex + 2, StringComparison.Ordinal);
            if (endIndex == -1)
            {
                return(null);
            }

            string contentInfo = html.Substring(begIndex, endIndex - begIndex);
            if (!FindPattern(@"(<li>(.*?)</li>)", contentInfo))
            {
                return(null);
            }

            string data = "";
            while (_match.Success)
            {
                data  += string.Format("{0}, ", _match.Groups[2].Value);
                _match = _match.NextMatch();
            }
            util.RemoveTags(ref data);

            string converted;
            util.ConvertHTMLToAnsi(data, out converted);
            return(converted.Trim(new[] { ' ', ',' }));
        }

        /// <summary>
        /// Reduces <paramref name="text"/> to a four-character year that starts with
        /// <paramref name="centuryPrefix"/>, trying the first occurrence of the prefix and,
        /// if that is not followed by two digits, one further occurrence.
        /// </summary>
        /// <param name="text">Release date text, e.g. "12 feb 2003".</param>
        /// <param name="centuryPrefix">Two-character prefix, "19" or "20".</param>
        /// <param name="retryOffset">Offset from the first hit at which the second search starts.</param>
        /// <returns>The year, or <paramref name="text"/> unchanged when none is found.</returns>
        private static string ExtractYear(string text, string centuryPrefix, int retryOffset)
        {
            if (string.IsNullOrEmpty(text))
            {
                return(text);
            }

            int pos = text.IndexOf(centuryPrefix, StringComparison.Ordinal);
            if (pos >= 0 && !HasTwoDigitsAfterPrefix(text, pos))
            {
                pos = text.IndexOf(centuryPrefix, pos + retryOffset, StringComparison.Ordinal);
            }

            if (pos >= 0 && HasTwoDigitsAfterPrefix(text, pos))
            {
                return(text.Substring(pos, 4));
            }
            return(text);
        }

        /// <summary>
        /// Tells whether the two characters following a two-character prefix at
        /// <paramref name="pos"/> exist and are both digits.
        /// </summary>
        private static bool HasTwoDigitsAfterPrefix(string text, int pos)
        {
            // Strictly greater: index pos + 3 is read below, so it must be inside the string.
            return(text.Length > pos + 3 && Char.IsDigit(text[pos + 2]) && Char.IsDigit(text[pos + 3]));
        }

Ejemplo n.º 26

0

Mostrar archivo

        /// <summary>
        /// Downloads a person's page and fills an <c>IMDBActor</c> with id, name, photo, birth and
        /// death data, biography and filmography (as actor, writer and director).
        /// </summary>
        /// <remarks>
        /// All HTML markers come from <c>VdbParserStringActorDetails()</c>; the trailing comment
        /// on each parser call shows the marker that index is expected to hold. The sections are
        /// read with a single forward-moving parser, so the order of the calls and of the
        /// <c>resetPosition()</c> calls matters.
        /// </remarks>
        /// <param name="url">Page to read; <c>url.URL</c> is fetched and <c>url.Title</c> is the fallback name.</param>
        /// <param name="actor">Receives a new <c>IMDBActor</c>; it is assigned even when the method fails.</param>
        /// <returns>
        /// <c>true</c> when the page was downloaded and processed; <c>false</c> when the marker
        /// table is missing or does not have exactly 46 entries, when the page is null or empty,
        /// or when an exception was thrown (it is logged).
        /// </returns>
        public bool GetActorDetails(IMDBUrl url, out IMDBActor actor)
        {
            actor = new IMDBActor();

            string[] vdbParserStr = VdbParserStringActorDetails();

            // Indexes 0..45 are used below, so anything but 46 entries cannot be processed.
            if (vdbParserStr == null || vdbParserStr.Length != 46)
            {
                return(false);
            }

            try
            {
                string absoluteUri;
                string strBody = GetPage(url.URL, "utf-8", out absoluteUri);

                if (strBody == null)
                {
                    return(false);
                }

                if (strBody.Length == 0)
                {
                    return(false);
                }

                #region Actor imdb id

                // IMDBActorID: the 9 characters starting at the last "nm" in the URL, minus any '/'.
                // A URL without "nm" or with too few characters makes Substring throw; the id is
                // then simply left unset.
                try
                {
                    int    pos = url.URL.LastIndexOf("nm");
                    string id  = url.URL.Substring(pos, 9).Replace("/", string.Empty);
                    actor.IMDBActorID = id;
                }
                catch (Exception) { }

                #endregion

                HTMLParser parser   = new HTMLParser(strBody);
                string     strThumb = string.Empty;
                string     value    = string.Empty;
                string     value2   = string.Empty;

                #region Actor name

                // Actor name
                if ((parser.skipToEndOf(vdbParserStr[0])) &&        // <title>
                    (parser.extractTo(vdbParserStr[1], ref value))) // - IMDb</title>
                {
                    value      = new HTMLUtil().ConvertHTMLToAnsi(value);
                    value      = Util.Utils.RemoveParenthesis(value).Trim();
                    actor.Name = HttpUtility.HtmlDecode(value.Trim());
                }

                // Fall back to the title supplied with the URL when no name was extracted.
                if (actor.Name == string.Empty)
                {
                    actor.Name = url.Title;
                }

                #endregion

                // Photo
                // Keep the full page text so it can be restored after the parser has been
                // pointed at the photo block only.
                string parserTxt  = parser.Content;
                string photoBlock = string.Empty;

                #region Actor photo

                if (parser.skipToStartOf(vdbParserStr[2]) &&             // <td id="img_primary"
                    (parser.extractTo(vdbParserStr[3], ref photoBlock))) // </td>
                {
                    // Search for the image only inside the extracted block.
                    parser.Content = photoBlock;

                    if ((parser.skipToEndOf(vdbParserStr[4])) &&           // <img src="
                        (parser.extractTo(vdbParserStr[5], ref strThumb))) // "
                    {
                        actor.ThumbnailUrl = strThumb;
                    }
                    parser.Content = parserTxt;
                }

                #endregion

                #region Actor birth date

                // Birth date: two extracted parts joined by a space.
                if ((parser.skipToEndOf(vdbParserStr[6])) &&          // >Born:</h4>
                    (parser.skipToEndOf(vdbParserStr[7])) &&          // birth_monthday=
                    (parser.skipToEndOf(vdbParserStr[8])) &&          // >
                    (parser.extractTo(vdbParserStr[9], ref value)) && // <
                    (parser.skipToEndOf(vdbParserStr[10])) &&         // year=
                    (parser.extractTo(vdbParserStr[11], ref value2))) // "

                {
                    actor.DateOfBirth = value + " " + value2;
                }

                #endregion

                #region Actor death date

                // Death date: same two-part layout as the birth date.
                if ((parser.skipToEndOf(vdbParserStr[12])) &&          // >Died:</h4>
                    (parser.skipToEndOf(vdbParserStr[13])) &&          // death_monthday="
                    (parser.skipToEndOf(vdbParserStr[14])) &&          // >
                    (parser.extractTo(vdbParserStr[15], ref value)) && // <
                    (parser.skipToEndOf(vdbParserStr[16])) &&          // death_date="
                    (parser.extractTo(vdbParserStr[17], ref value2)))  // "
                {
                    actor.DateOfDeath = value + " " + value2;
                }

                #endregion

                // Start over from the top of the page before looking for the places.
                parser.resetPosition();

                #region Actor birth place

                // Birth place
                if ((parser.skipToEndOf(vdbParserStr[18])) &&        // birth_place=
                    (parser.skipToEndOf(vdbParserStr[19])) &&        // >
                    (parser.extractTo(vdbParserStr[20], ref value))) // <
                {
                    actor.PlaceOfBirth = HttpUtility.HtmlDecode(value);
                }

                #endregion

                #region Actor death place

                // Death place
                if ((parser.skipToEndOf(vdbParserStr[21])) &&        // death_place=
                    (parser.skipToEndOf(vdbParserStr[22])) &&        // >
                    (parser.extractTo(vdbParserStr[23], ref value))) // <
                {
                    actor.PlaceOfDeath = HttpUtility.HtmlDecode(value);
                }

                #endregion

                // Mini biography
                parser.resetPosition();

                #region Actor biography

                if ((parser.skipToEndOf(vdbParserStr[24])) &&        // <td id="overview-top">
                    (parser.skipToEndOf(vdbParserStr[25])) &&        // <p>
                    (parser.extractTo(vdbParserStr[26], ref value))) // See full bio</a>
                {
                    value = new HTMLUtil().ConvertHTMLToAnsi(value);
                    actor.MiniBiography = Util.Utils.stripHTMLtags(value);
                    actor.MiniBiography = actor.MiniBiography.Replace(vdbParserStr[45], string.Empty).Trim(); // See full bio »
                    actor.MiniBiography = HttpUtility.HtmlDecode(actor.MiniBiography);                        // Remove HTML entities like &#189;

                    // The full biography page is only requested when a mini biography exists.
                    if (actor.MiniBiography != string.Empty)
                    {
                        // get complete biography
                        // The bio URL is built from the address reported by GetPage for the main page.
                        string bioURL = absoluteUri;

                        if (!bioURL.EndsWith(vdbParserStr[27])) // /
                        {
                            bioURL += vdbParserStr[28];         // /bio
                        }
                        else
                        {
                            bioURL += vdbParserStr[29]; // bio
                        }

                        string strBioBody = GetPage(bioURL, "utf-8", out absoluteUri);

                        if (!string.IsNullOrEmpty(strBioBody))
                        {
                            HTMLParser parser1 = new HTMLParser(strBioBody);

                            // First layout: the biography sits inside a dedicated div.
                            if (parser1.skipToEndOf(vdbParserStr[30]) &&        // <h5>Mini Biography</h5>
                                parser1.skipToEndOf(vdbParserStr[31]) &&        // <div class="wikipedia_bio">
                                parser1.extractTo(vdbParserStr[32], ref value)) // </div>
                            {
                                // Turn the h5 sub-headings into line breaks before stripping the tags.
                                value           = new HTMLUtil().ConvertHTMLToAnsi(value);
                                value           = Regex.Replace(value, @"</h5>\s<h5>", "\n\r");
                                value           = Regex.Replace(value, @"<h5>", "\n\r\n\r");
                                value           = Regex.Replace(value, @"</h5>", ":\n\r");
                                actor.Biography = Util.Utils.stripHTMLtags(value).Trim();
                                actor.Biography = HttpUtility.HtmlDecode(actor.Biography);
                            }
                            else
                            {
                                // Second layout: take the text from the heading up to the closing marker.
                                parser1.resetPosition();

                                if (parser1.skipToEndOf(vdbParserStr[33]) &&        // <h5>Mini Biography</h5>
                                    parser1.extractTo(vdbParserStr[34], ref value)) // </p>
                                {
                                    value           = new HTMLUtil().ConvertHTMLToAnsi(value);
                                    actor.Biography = Util.Utils.stripHTMLtags(value).Trim();
                                    actor.Biography = HttpUtility.HtmlDecode(actor.Biography);
                                }
                            }
                        }
                    }
                }

                #endregion

                // Person is movie director or an actor/actress
                bool isActorPass    = false;
                bool isDirectorPass = false;
                bool isWriterPass   = false;

                parser.resetPosition();

                HTMLParser dirParser = new HTMLParser(); // HTML body for Director
                HTMLParser wriParser = new HTMLParser(); // HTML body for Writers

                #region Check person role in movie (actor, director or writer)

                // Each role is probed from the top of the page. For director and writer the page
                // content is copied into a separate parser that is used for that role later on.
                if ((parser.skipToEndOf(vdbParserStr[35])) && // name="Director">Director</a>
                    (parser.skipToEndOf(vdbParserStr[36])))   // </div>
                {
                    isDirectorPass    = true;
                    dirParser.Content = parser.Content;
                }

                parser.resetPosition();

                if ((parser.skipToEndOf(vdbParserStr[37])) && // name="Writer">Writer</a>
                    (parser.skipToEndOf(vdbParserStr[38])))   // </div>
                {
                    isWriterPass      = true;
                    wriParser.Content = parser.Content;
                }

                parser.resetPosition();

                // The parser is left positioned after the actress/actor marker; the actor
                // filmography call below receives it in that state.
                if (parser.skipToEndOf(vdbParserStr[39]) || // name="Actress">Actress</a>
                    parser.skipToEndOf(vdbParserStr[40]))   // name="Actor">Actor</a>
                {
                    isActorPass = true;
                }

                #endregion

                #region Get movies for every role

                // Get filmography Actor
                if (isActorPass)
                {
                    GetActorMovies(actor, parser, false, false);
                }

                // Get filmography for writers
                if (isWriterPass)
                {
                    parser = wriParser;
                    parser.resetPosition();

                    if ((parser.skipToEndOf(vdbParserStr[41])) && // name="Writer">Writer</a>
                        (parser.skipToEndOf(vdbParserStr[42])))   // </div>
                    {
                        GetActorMovies(actor, parser, false, true);
                    }
                }

                // Get filmography Director
                if (isDirectorPass)
                {
                    parser = dirParser;
                    parser.resetPosition();

                    if (parser.skipToEndOf(vdbParserStr[43]) && // name="Director">Director</a>
                        parser.skipToEndOf(vdbParserStr[44]))   // </div>
                    {
                        GetActorMovies(actor, parser, true, false);
                    }
                }

                #endregion

                // Add filmography
                if (actor.Count > 0)
                {
                    actor.SortActorMoviesByYear();
                }

                return(true);
            }
            catch (Exception ex)
            {
                // Any failure while downloading or parsing is logged and reported as false.
                Log.Error("IMDB.GetActorDetails({0} exception:{1} {2} {3}", url.URL, ex.Message, ex.Source, ex.StackTrace);
            }
            return(false);
        }

Ejemplo n.º 27

0

Mostrar archivo

Archivo: web_23us.cs Proyecto: change008/noveltools

        /// <summary>
        /// Parses a chapter detail page: cleans the HTML of the <c>div#content</c> element and
        /// returns the chapter fields derived from it.
        /// </summary>
        /// <param name="documentNode">Root node of the downloaded page.</param>
        /// <returns>
        /// A table that is empty when the page has no content element. Otherwise it holds the
        /// cleaned content, the status and the chapter type, plus text length, intro and price
        /// when the content has any text left after removing the tags.
        /// </returns>
        private Hashtable parseDetailPage1(HtmlNode documentNode)
        {
            Hashtable returndata = new Hashtable();

            HtmlNode contentNode = documentNode.SelectSingleNode("//div[@id='content']");
            if (contentNode == null)
            {
                return(returndata);
            }

            string content = HTMLUtil.RemoveHtmlContent(contentNode.InnerHtml, "div", "style", "script", "dt", "a");
            content = content.ToLower();
            //content = HTMLUtil.RemoveHtmlTag(content, "p", "img", "br");
            content = content.Replace("\r\n", "").Replace("\t", "");

            // Site advertisements and watermarks, removed in this order (longer phrases first,
            // then the fragments they are made of).
            string[] junkFragments =
            {
                "本站访问地址http://www.ziyouge.com 任意搜索引擎内输入:紫幽阁 即可访问!",
                "http://www.ziyouge.com",
                "紫幽阁",
                "wanben.me",
                "ziyouge.com",
                "ziyouge",
                "http://",
                "http",
                "紫Ｙou阁 ＷwＷ.ZiyouＧＥ.com",
                "WWw.ZiyoUgE.com",
                "品书网",
                "www.vodtw.com",
                "本书来自",
                "/html/book/19/19092/",
                "大家想继续看我的书，可以加我微信gdy3208新书出了，我会第一时间发动态通知大家！",
                "本站重要通知:请使用本站的免费小说app,无广告、破防盗版、更新快,会员同步书架,请关注微信公众号 appxsyd (按住三秒复制) 下载免费阅读器!!",
                "本站重要通知: 请使用本站的免费小说app,无广告、破防盗版、更新快,会员同步书架,请关注微信公众号 gegegengxin (按住三秒复制)下载免费阅读器!!",
                "本站重要通知:",
                "请使用本站的免费小说",
                "app",
                "无广告、破防盗版、更新快,会员同步书架",
                "请关注微信公众号",
                "appxsyd",
                "gegegengxin",
                "(按住三秒复制)",
                "(按住三秒复制)",
                "下载免费阅读器",
            };
            foreach (string fragment in junkFragments)
            {
                // The content was lower-cased above, so a mixed-case fragment could never match
                // as written; lower-case the fragment the same way before replacing.
                content = content.Replace(fragment.ToLower(), "");
            }

            // Remove domain names.
            // NOTE(review): the leading lookahead is anchored to the end of the input, so this
            // looks like it only matches within the last 255 characters -- confirm the intent.
            string domainPattern = @"(?=.{3,255}$)[a-zA-Z0-9][-a-zA-Z0-9]{0,62}(\.[a-zA-Z0-9][-a-zA-Z0-9]{0,62})+";
            content = Regex.Replace(content, domainPattern, "");

            // Remove escaped tags. Lazy quantifier: the greedy form deleted everything between
            // the first "&lt;" and the last "&gt;", including the text in between.
            string escapedTagPattern = @"&lt;.+?&gt;";
            content = Regex.Replace(content, escapedTagPattern, "");

            returndata.Add(CollectionFieldName.Chap_Content, content);

            // Strip the remaining markup and filler; this text is only used for length and intro.
            string plainText = HTMLUtil.RemoveHtmlTag(content).Replace("&nbsp;", "").Replace("feisuz", "")
                               .Replace("作者的话:", "").Replace("新书，求收藏求推荐", "").Replace("本书红薯网首发,请勿转载!", "");
            if (!string.IsNullOrEmpty(plainText))
            {
                returndata.Add(CollectionFieldName.Chap_ContentLen, plainText.Length);

                // Intro: the first 40 characters, with an ellipsis when the text is longer.
                string intro = plainText.Length > 40 ? plainText.Substring(0, 40) + "..." : plainText;
                returndata.Add(CollectionFieldName.Chap_Intro, intro);

                // Price: 5 per full 1000 characters of cleaned HTML, kept within 5..15.
                int price = (content.Length / 1000) * 5;
                if (price == 0)
                {
                    price = 5;
                }
                if (price > 15)
                {
                    price = 15;
                }
                returndata.Add(CollectionFieldName.Chap_Pirce, price);
            }
            returndata.Add(CollectionFieldName.Chap_Status, ChapterStatus.ChapterStatus_OnLine);
            returndata.Add(CollectionFieldName.Chap_ChapterType, ChapterType.ChapterType_Free);
            return(returndata);
        }

Ejemplo n.º 28

0

Mostrar archivo

        /// <summary>
        /// Parses the content of a chapter detail page.
        /// </summary>
        /// <param name="documentNode">
        /// Root node of the parsed page. The chapter body is read from the first
        /// <c>div</c> whose class attribute is <c>messagecontent</c>.
        /// </param>
        /// <returns>
        /// A table of collected fields keyed by <c>CollectionFieldName</c> members.
        /// The table is empty when the content node is not found. Content length,
        /// intro and price are only added when the tag-stripped text is non-empty.
        /// </returns>
        private Hashtable parseDetailPage1(HtmlNode documentNode)
        {
            Hashtable returndata = new Hashtable();

            HtmlNode contentNode = documentNode.SelectSingleNode("//div[@class='messagecontent']");
            if (contentNode == null)
            {
                return returndata;
            }

            string content = contentNode.InnerHtml;
            content = HTMLUtil.RemoveHtmlContent(content, "div", "style", "script");
            content = content.Replace("\r\n", "").Replace("\t", "");

            // Strip embedded site addresses, most specific form first:
            // full page URL, then URL with a path, then the bare host.
            // The path part is written as the character class [\w/]+ instead of the
            // nested quantifier (\w+|/)+ : both accept the same strings, but the
            // nested form backtracks exponentially when a long run of word
            // characters is not followed by ".html".
            // NOTE(review): \w is Unicode-aware in .NET, so the path part may also
            // consume CJK text that directly follows the slash - unchanged from the
            // original patterns; confirm this is acceptable.
            string pageUrlPattern = @"www\.[a-zA-Z0-9]+\.(?:com|cn|net|org)/[\w/]+\.html";
            content = Regex.Replace(content, pageUrlPattern, "");

            string pathUrlPattern = @"www\.[a-zA-Z0-9]+\.(?:com|cn|net|org)/[\w/]+";
            content = Regex.Replace(content, pathUrlPattern, "");

            string hostPattern = @"www\.[a-zA-Z0-9]+\.(?:com|cn|net|org)/?";
            content = Regex.Replace(content, hostPattern, "");

            returndata.Add(CollectionFieldName.Chap_Content, content);

            // Remove markup and non-breaking-space entities; this text is what the
            // stored content length and the intro are based on.
            string plainText = HTMLUtil.RemoveHtmlTag(content).Replace("&nbsp;", "");
            if (!string.IsNullOrEmpty(plainText))
            {
                returndata.Add(CollectionFieldName.Chap_ContentLen, plainText.Length);

                // Intro is the first 40 characters, with an ellipsis when truncated.
                string intro = plainText.Length > 40
                    ? plainText.Substring(0, 40) + "..."
                    : plainText;
                returndata.Add(CollectionFieldName.Chap_Intro, intro);

                // 5 per full 1000 characters of the cleaned HTML (not of the plain
                // text), clamped to the range 5..15.
                int price = (content.Length / 1000) * 5;
                if (price == 0)
                {
                    price = 5;
                }
                if (price > 15)
                {
                    price = 15;
                }
                returndata.Add(CollectionFieldName.Chap_Pirce, price);
            }

            returndata.Add(CollectionFieldName.Chap_Status, ChapterStatus.ChapterStatus_OnLine);
            returndata.Add(CollectionFieldName.Chap_ChapterType, ChapterType.ChapterType_Free);
            return returndata;
        }

Ejemplo n.º 29

0

Mostrar archivo

Archivo: basicHTML.cs Proyecto: drualcman/Library

 /// <summary>
 /// Restores the HTML formatting of a document that was decoded as ASCII.
 /// </summary>
 /// <remarks>
 /// Thin wrapper: the call is forwarded unchanged to <c>HTMLUtil.decodeHTML</c>.
 /// </remarks>
 /// <param name="texto">HTML text.</param>
 /// <returns>Whatever <c>HTMLUtil.decodeHTML</c> returns for <paramref name="texto"/>.</returns>
 public static string decodeHTML(string texto) => HTMLUtil.decodeHTML(texto);

Ejemplo n.º 30

0

Mostrar archivo

        /// <summary>
        /// Downloads an IMDB name-search page and appends every actor hit found on
        /// it to <c>_elements</c>.
        /// </summary>
        /// <remarks>
        /// The page is scanned with the marker strings returned by
        /// <c>VdbParserStringActor()</c>; nothing is done unless exactly 29 markers
        /// are available. If the page title differs from the "name search" marker
        /// the page is treated as a direct hit and a single entry is added.
        /// Otherwise the "popular names" table (when present) and then either the
        /// "exact matches" or the "approximate matches" table are parsed row by row.
        /// Any exception is logged and swallowed.
        /// </remarks>
        /// <param name="strURL">Address of the IMDB search page to fetch.</param>
        private void FindIMDBActor(string strURL)
        {
            string[] vdbParserStr = VdbParserStringActor();

            if (vdbParserStr == null || vdbParserStr.Length != 29)
            {
                return;
            }

            try
            {
                string absoluteUri;
                // NOTE(review): the previous comment here said UTF-8 has problems with
                // special national characters and that IMDB's default encoding is used,
                // yet the call requests "utf-8" - confirm which one is intended.
                string     strBody = GetPage(strURL, "utf-8", out absoluteUri);
                string     value   = string.Empty;
                HTMLParser parser  = new HTMLParser(strBody);

                // Direct hit: the title is not the generic search-results title.
                if ((parser.skipToEndOf(vdbParserStr[0])) &&           // <title>
                    (parser.extractTo(vdbParserStr[1], ref value)) &&  // </title>
                    !value.ToLowerInvariant().Equals(vdbParserStr[2])) // imdb name search
                {
                    value = new HTMLUtil().ConvertHTMLToAnsi(value);
                    value = Util.Utils.RemoveParenthesis(value).Trim();
                    IMDBUrl oneUrl = new IMDBUrl(absoluteUri, value, "IMDB");
                    _elements.Add(oneUrl);
                    return;
                }

                parser.resetPosition();

                if (parser.skipToStartOfNoCase(vdbParserStr[3]))        // Popular names
                {
                    string popularBody = string.Empty;
                    parser.skipToEndOf(vdbParserStr[4]);                // <table>
                    parser.extractTo(vdbParserStr[5], ref popularBody); // </table>

                    AddIMDBActorMatches(new HTMLParser(popularBody), vdbParserStr, 6, 14);
                }

                // Start over on the full page for the exact / approximate section.
                parser = new HTMLParser(strBody);
                string exactBody = string.Empty;

                if (parser.skipToStartOfNoCase(vdbParserStr[15]))      // Exact Matches
                {
                    parser.skipToEndOf(vdbParserStr[16]);              // <table>
                    parser.extractTo(vdbParserStr[17], ref exactBody); // </table>
                }
                else if (parser.skipToStartOfNoCase(vdbParserStr[18])) // Approx Matches
                {
                    parser.skipToEndOf(vdbParserStr[19]);              // <table>
                    parser.extractTo(vdbParserStr[20], ref exactBody); // </table>
                }
                else
                {
                    return;
                }

                // The row terminator used to be read from index 29, which is out of
                // range for the 29-element array enforced above: the resulting
                // IndexOutOfRangeException was swallowed by the catch below and cut
                // the loop short after the first match. Index 14 is the "</tr>"
                // marker already used for the popular-names rows.
                AddIMDBActorMatches(new HTMLParser(exactBody), vdbParserStr, 21, 14);
            }
            catch (Exception ex)
            {
                Log.Error("exception for imdb lookup of {0} err:{1} stack:{2}", strURL, ex.Message, ex.StackTrace);
            }
        }

        /// <summary>
        /// Walks the rows of one IMDB result table and adds an entry to
        /// <c>_elements</c> for each actor link found.
        /// </summary>
        /// <param name="parser">Parser positioned at the start of the table body.</param>
        /// <param name="markers">Marker strings used to locate the fields of a row.</param>
        /// <param name="firstMarker">
        /// Index of the first of eight consecutive row markers (link start, href
        /// start, href end, two skip markers, name end, role start, role end).
        /// </param>
        /// <param name="rowEndMarker">Index of the marker that ends a table row.</param>
        private void AddIMDBActorMatches(HTMLParser parser, string[] markers, int firstMarker, int rowEndMarker)
        {
            // Declared outside the loop on purpose: extractTo receives them by ref, so
            // values carry over between rows exactly as in the original inline loops.
            string url  = string.Empty;
            string name = string.Empty;
            string role = string.Empty;

            while (parser.skipToStartOf(markers[firstMarker]))         // href="/name/
            {
                parser.skipToEndOf(markers[firstMarker + 1]);          // href="
                parser.extractTo(markers[firstMarker + 2], ref url);   // "
                parser.skipToEndOf(markers[firstMarker + 3]);          // Image()).src='/rg/find-name-
                parser.skipToEndOf(markers[firstMarker + 4]);          // ';">
                parser.extractTo(markers[firstMarker + 5], ref name);  // </a>
                parser.skipToEndOf(markers[firstMarker + 6]);          // <small>(
                parser.extractTo(markers[firstMarker + 7], ref role);  // ,

                if (role != string.Empty)
                {
                    name += " - " + role;
                }

                name = new HTMLUtil().ConvertHTMLToAnsi(name);
                name = Util.Utils.RemoveParenthesis(name).Trim();
                IMDBUrl newUrl = new IMDBUrl("http://www.imdb.com" + url, name, "IMDB");
                _elements.Add(newUrl);
                parser.skipToEndOf(markers[rowEndMarker]);             // </tr>
            }
        }

Ejemplos de HTMLUtil en C# (CSharp)