Exemple #1
0
        /// <summary>
        /// Builds the product description HTML from a product detail page.
        /// </summary>
        /// <param name="docHtml">HTML of the product page.</param>
        /// <param name="version">
        /// Page layout selector: 0 for regular products; any other value (1 by convention) for books.
        /// </param>
        /// <returns>
        /// The cleaned description HTML. If an exception occurs it is logged via
        /// <c>FileHelper.WriteException</c> and whatever had been assembled up to that point
        /// (possibly an empty string) is returned.
        /// </returns>
        public static string GetProDesc(string docHtml, int version)
        {
            // Matches <a ... href="...">inner</a>; replacing with "$2" keeps only the inner content.
            const string anchorPattern = "<a[^>]*href=[\"']([^'\"]+?)[\"'][^>]*>(.*?)</a>";

            string area = "";

            try
            {
                // NOTE(review): ClearTrn presumably strips tab/CR/LF so the single-line regexes below
                // can match across what were originally line breaks - confirm against RegexHelper.
                docHtml = RegexHelper.ClearTrn(docHtml);

                if (version == 0)
                {
                    // Specification table first (added 2012-02-29, shy), when the page has one.
                    string pt = HtmlCls.GetHtmlByCss(docHtml, "Ptable").FirstOrDefault();
                    if (!string.IsNullOrEmpty(pt))
                    {
                        area += pt;
                    }

                    // Then the main description block.
                    area += HtmlCls.GetHtmlByCss(docHtml, "content").FirstOrDefault();
                }
                else
                {
                    // Book pages: concatenate every "m m1" block.
                    var list = HtmlCls.GetHtmlByCss(docHtml, "m m1");
                    area = string.Join("", list.ToArray());

                    // Drop the first "list-h" block from the collected markup.
                    string listH = HtmlCls.GetHtmlByCss(area, "list-h").FirstOrDefault();
                    if (!string.IsNullOrEmpty(listH))
                    {
                        area = area.Replace(listH, "");
                    }

                    // Remove the "other works by this author" section.
                    listH = HtmlCls.GetHtmlById(area, "related-works");
                    if (!string.IsNullOrEmpty(listH))
                    {
                        area = area.Replace(listH, "");
                    }

                    // Prepend the book information: the first nine <li> entries of the summary,
                    // each wrapped in a <div>, with links reduced to their text.
                    // Guarded like the lookups above: a page without a summary element must not
                    // abort the method before the clean-up steps below have run.
                    string sum = HtmlCls.GetHtmlById(docHtml, "summary");
                    if (!string.IsNullOrEmpty(sum))
                    {
                        var sumList = RegexHelper.Matches(sum, "<li[^>]*>(.*?)</li>").Take(9);
                        sum = string.Join("", sumList.Select(s => "<div>" + s + "</div>").ToArray());
                        sum = Regex.Replace(sum, anchorPattern, "$2");
                    }

                    area = sum + area;
                }

                // Remove the authorization notice (the first element carrying color="red").
                string red = HtmlCls.GetHtmlByAttr(area, "color=\"red\"").FirstOrDefault();
                if (!string.IsNullOrEmpty(red))
                {
                    area = area.Replace(red, "");
                }

                // Avoid a CSS clash with the host page's own "content" class.
                area = area.Replace("class=\"content\"", "");
                // Replace links with their inner content.
                area = Regex.Replace(area, anchorPattern, "$2");
                // Strip inline style attributes.
                area = Regex.Replace(area, "\\sstyle=(['\"])[^'\"]+?\\1", "");
                // Strip script elements.
                area = Regex.Replace(area, "<script[^>]*>(.*?)</script>", "");
                // Turn numbered src attributes (e.g. src2=) into plain src= so the images display.
                area = Regex.Replace(area, "src\\d=", "src=");
                // Replace the source site's brand name with a neutral "this store" wording.
                area = Regex.Replace(area, "京东商城|京东", "本商城");
            }
            catch (Exception ex)
            {
                // Best effort: log the failure and fall through to return what was built so far.
                FileHelper.WriteException(ex);
            }

            return area;
        }