Ejemplo n.º 1
0
        /// <summary>
        /// Converts the scraped book information held by this instance into a
        /// <c>TygModel.书名表</c> entity. The category named by <c>类别</c> is looked up
        /// in the shared cache context and created (and saved) when it is missing.
        /// </summary>
        /// <returns>The populated book entity, linked to its category.</returns>
        public override TygModel.书名表 Convert()
        {
            // NOTE(review): the TryParse result is ignored, so an unparsable 更新 leaves
            // updateTime at DateTime.MinValue — confirm the storage column accepts that value.
            DateTime updateTime;
            DateTime.TryParse(更新, out updateTime);

            string categoryName = 类别.Trim();
            var db = Skybot.Cache.RecordsCacheManager.Instance.Tygdb;

            // A single FirstOrDefault replaces the former Count() + FirstOrDefault() pair,
            // which issued two queries for the same row.
            TygModel.分类表 classItem = db.分类表.Where(p => p.分类名称.Trim() == categoryName).FirstOrDefault();

            if (classItem == null)
            {
                // The category does not exist yet: create and persist it.
                classItem = new TygModel.分类表()
                {
                    分类标识 = categoryName,
                    分类名称 = categoryName,
                    分类说明 = categoryName,
                    备注   = "来自 yankuaikan.com",
                    通用分类 = categoryName
                };
                db.AddTo分类表(classItem);
                db.SaveChanges();
            }

            TygModel.书名表 book = new TygModel.书名表()
            {
                分类表      = classItem,
                分类标识     = classItem.分类标识,
                分类表ID    = classItem.ID,
                GUID     = Guid.NewGuid(),
                采集用的URL1 = 小说目录URL,
                采集用的URL2 = 小说简介URL,
                创建时间     = DateTime.Now,
                最新章节     = 最新章节,
                作者名称     = 作者,
                说明       = "",
                // Strip the Chinese book-title marks around the name.
                书名       = 小说名称.Replace("》", "").Replace("《", ""),
                最后更新时间   = updateTime,
                完本       = 状态.Trim() == "完成",
                配图       = ResolveCoverImageUrl(),
            };

            return(book);
        }

        /// <summary>
        /// Downloads the book's introduction page and returns the cover image URL found in it.
        /// </summary>
        /// <returns>
        /// The <c>src</c> of the first image wrapped in a link inside the element with id
        /// <c>content</c> when it points at <c>http://tu.yankuai.com</c>; otherwise
        /// <c>/images/noimg.jpg</c>.
        /// </returns>
        private string ResolveCoverImageUrl()
        {
            const string placeholder = "/images/noimg.jpg";

            if (小说简介URL == null)
            {
                return(placeholder);
            }

            HtmlAgilityPack.HtmlDocument dom = new HtmlAgilityPack.HtmlDocument();
            dom.LoadHtml(小说简介URL.GetWeb());

            // GetElementbyId returns null when the page has no such element; without this
            // guard a null root would be handed to the analyser.
            HtmlAgilityPack.HtmlNode listContent = dom.GetElementbyId("content");
            if (listContent == null)
            {
                return(placeholder);
            }

            // Collect the candidate elements below the content node.
            List <PossiblyResultElement> possiblyResultElements = new List <PossiblyResultElement>();
            SingleListPageAnalyse.AnalyseMaxATagNearest(listContent, possiblyResultElements, 0, new PossiblyResultElement()
            {
                ParentPossiblyResult = null,
                CurrnetHtmlElement   = listContent,
                LayerIndex           = -1,
                ContainTagNum        = 0
            });

            try
            {
                foreach (PossiblyResultElement candidate in possiblyResultElements)
                {
                    HtmlAgilityPack.HtmlNode anchor = candidate.CurrnetHtmlElement;
                    if (anchor.Name != "a" || !anchor.HasChildNodes)
                    {
                        continue;
                    }

                    // Pick the actual <img> child. The first child is often a whitespace
                    // text node, so indexing ChildNodes[0] could miss the image entirely.
                    HtmlAgilityPack.HtmlNode image = anchor.ChildNodes.FirstOrDefault(p => p.Name == "img");
                    if (image == null)
                    {
                        continue;
                    }

                    // Only the first link that wraps an image is considered.
                    var src = image.Attributes["src"];
                    if (src == null)
                    {
                        return(placeholder);
                    }

                    string imgurl = src.Value;
                    return(imgurl.Trim().Contains("http://tu.yankuai.com") ? imgurl : placeholder);
                }
            }
            catch (Exception ex)
            {
                // Best effort: a malformed page must not prevent the book from being converted.
                System.Diagnostics.Debug.WriteLine(DateTime.Now + ex.Message + "|||||" + ex.StackTrace);
            }

            return(placeholder);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Parses the HTML of a search-result page and returns the links found in it.
        /// </summary>
        /// <param name="htmlContent">HTML content of the search page.</param>
        /// <param name="IsIndex">
        /// When <c>true</c> (the default) only links whose title contains <c>KeyWord</c>
        /// followed by a chapter-index marker (e.g. "章节列表", "txt", "目录") are returned;
        /// when <c>false</c> every qualifying link is returned.
        /// </param>
        /// <returns>
        /// The matching links, or an empty list when the HTML cannot be converted to XML.
        /// </returns>
        public List <ListPageContentUrl> AnalyseListUrls(string htmlContent, bool IsIndex = true)
        {
            List <ListPageContentUrl> pageListUrls = new List <ListPageContentUrl>();

            // Element tags handed to FiltrateHTML during normalisation.
            var elementTags = new string[] { "p", "span", "strong", "font", "h1", "tbody", "o:p", "dd", "tr", "table" };
            XDocument xdom = null;

            // Convert the HTML into an XDocument.
            try
            {
                HtmlDocument htmlDom = new HtmlDocument();
                // First pass: normalise the raw HTML.
                htmlDom.LoadHtml(htmlContent);
                htmlContent = Tong.HtmlToXML.HTMLConvert(htmlDom.DocumentNode.OuterHtml);
                // Second pass: reload the filtered markup so that open tags get closed.
                htmlDom.LoadHtml(htmlContent.FiltrateHTML(elementTags));
                htmlContent = Tong.HtmlToXML.HTMLConvert(htmlDom.DocumentNode.OuterHtml);

                xdom = XDocument.Parse("<data>" + htmlContent + "</data>");
            }
            catch
            {
                // Best effort: markup that cannot be converted yields no links.
                return(pageListUrls);
            }

            // Collect the candidate elements of the document.
            List <PossiblyResultElement> possiblyResultElements = new List <PossiblyResultElement>();
            SingleListPageAnalyse.AnalyseMaxATagNearest(xdom.Root, possiblyResultElements, 0, new PossiblyResultElement()
            {
                ParentPossiblyResult = null,
                CurrentElement       = xdom.Root,
                LayerIndex           = -1,
                ContainTagNum        = 0
            });

            // Page extensions a link must end with to be considered.
            string[] linkExtensions = { ".html", ".aspx", ".asp", ".htm", ".php", ".shtml" };
            // Text that must follow KeyWord in the title of a chapter-index link, e.g.
            // "仙逆txt下载", "仙逆全文阅读下载", "仙逆免费章节列表".
            // "最新章节列表" is not listed separately because "最新章节" already covers it.
            string[] indexTitleMarkers = { "章节列表", "txt", "TXT", "全文阅读", "最新章节", "目录", "列表" };

            foreach (PossiblyResultElement candidate in possiblyResultElements)
            {
                var element = candidate.CurrentElement;
                if (element.Name != "a")
                {
                    continue;
                }

                var hrefAttribute = element.Attribute("href");
                if (hrefAttribute == null)
                {
                    continue;
                }

                string href = hrefAttribute.Value;
                if (href.Length <= 5 || !href.Contains("http") || href.Contains("baidu"))
                {
                    continue;
                }

                // URLs are not linguistic text, so compare the extension ordinally.
                if (!linkExtensions.Any(extension => href.EndsWith(extension, StringComparison.OrdinalIgnoreCase)))
                {
                    continue;
                }

                // "Contains http" does not guarantee an absolute URI (e.g. "/go?u=http://x/a.html").
                // Skip such links instead of letting new Uri(...) throw and abort the whole result.
                Uri url;
                if (!Uri.TryCreate(href, UriKind.Absolute, out url))
                {
                    continue;
                }

                // Titles are converted to simplified Chinese before matching.
                string title = element.Value.WordTraditionalToSimple();

                // In index mode keep only links whose title looks like a chapter list.
                if (IsIndex && !indexTitleMarkers.Any(marker => title.Contains(KeyWord + marker)))
                {
                    continue;
                }

                pageListUrls.Add(new ListPageContentUrl()
                {
                    Url   = url,
                    Title = title,
                });
            }

            return(pageListUrls);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Converts the scraped book information held by this instance into a
        /// <c>TygModel.书名表</c> entity. The category named by <c>类别</c> is looked up and
        /// created when missing; when an introduction URL is available the page is downloaded
        /// to fill in the last-update time, the description and the cover image.
        /// </summary>
        /// <returns>The populated book entity.</returns>
        public override TygModel.书名表 Convert()
        {
            const string placeholderImage = "/images/noimg.gif";

            // Fallback update time used when the introduction page provides none.
            DateTime updateTime = new DateTime(2000, 1, 1);

            TygModel.书名表 book = null;

            using (TygModel.Entities tygdb = new TygModel.Entities())
            {
                string categoryName = 类别.Trim();

                // A single FirstOrDefault replaces the former Count() + FirstOrDefault() pair.
                TygModel.分类表 classItem = tygdb.分类表.Where(p => p.分类名称.Trim() == categoryName).FirstOrDefault();

                if (classItem == null)
                {
                    // The category does not exist yet: create and persist it. A failure is
                    // only reported; the unsaved category object is still used below.
                    try
                    {
                        classItem = new TygModel.分类表()
                        {
                            分类标识 = categoryName,
                            分类名称 = categoryName,
                            分类说明 = categoryName,
                            备注   = "来自 86zw.com",
                            通用分类 = categoryName
                        };
                        tygdb.AddTo分类表(classItem);
                        tygdb.SaveChanges();
                    }
                    catch (Exception ex)
                    {
                        System.Diagnostics.UDPGroup.SendStrGB2312(ex.Message + (ex.StackTrace ?? ""));
                    }
                }

                book = new TygModel.书名表()
                {
                    分类表      = classItem,
                    分类标识     = classItem.分类标识,
                    分类表ID    = classItem.ID,
                    GUID     = Guid.NewGuid(),
                    采集用的URL1 = 小说目录URL,
                    采集用的URL2 = 小说简介URL,
                    创建时间     = DateTime.Now,
                    最新章节     = 最新章节,
                    作者名称     = 作者,
                    说明       = "",
                    // Strip the Chinese book-title marks around the name.
                    书名       = 小说名称.Replace("》", "").Replace("《", ""),
                    最后更新时间   = updateTime,
                    完本       = 状态.Trim() == "完结",
                    配图       = placeholderImage,
                };

                // Enrich the book from its introduction page, when there is one.
                if (小说简介URL != null)
                {
                    System.Diagnostics.UDPGroup.SendStrGB2312("获取配图:" + 小说简介URL);

                    HtmlAgilityPack.HtmlDocument dom = new HtmlAgilityPack.HtmlDocument();
                    dom.LoadHtml(小说简介URL.GetWeb());

                    // Fixed XPath of the node holding the update date (e.g. 2011-11-18).
                    HtmlAgilityPack.HtmlNode node = dom.DocumentNode.SelectSingleNode("/html[1]/body[1]/div[2]/div[2]/div[3]/div[2]/div[1]/ul[1]/li[6]");
                    if (node != null && DateTime.TryParse(node.InnerText, out updateTime))
                    {
                        book.最后更新时间 = updateTime;
                    }

                    // Description: markup longer than 4000 characters is replaced by
                    // ForMatText(InnerText, 0, 3800).
                    HtmlAgilityPack.HtmlNode summary = dom.GetElementbyId("CrbsSum");
                    if (summary != null)
                    {
                        book.说明 = summary.InnerHtml.Length > 4000 ? new Tong.TongUse().ForMatText(summary.InnerText, 0, 3800) : summary.InnerHtml;
                    }

                    // Cover image.
                    HtmlAgilityPack.HtmlNode listContent = dom.GetElementbyId("CrbtlBookImg");
                    if (listContent != null)
                    {
                        List <PossiblyResultElement> possiblyResultElements = new List <PossiblyResultElement>();
                        SingleListPageAnalyse.AnalyseMaxATagNearest(listContent, possiblyResultElements, 0, new PossiblyResultElement()
                        {
                            ParentPossiblyResult = null,
                            CurrnetHtmlElement   = listContent,
                            LayerIndex           = -1,
                            ContainTagNum        = 0
                        });

                        try
                        {
                            PossiblyResultElement firstImage = possiblyResultElements.FirstOrDefault(p => p.CurrnetHtmlElement.Name == "img");

                            // Attributes["src"] is null when the attribute is missing; check it
                            // explicitly instead of relying on a caught NullReferenceException.
                            var src = firstImage == null ? null : firstImage.CurrnetHtmlElement.Attributes["src"];
                            if (src != null)
                            {
                                string imgurl = src.Value;

                                // Ordinal, case-insensitive placeholder detection. ToLower() is
                                // culture-sensitive, and a differently-cased placeholder used to
                                // be stored verbatim because the later check is case-sensitive.
                                if (imgurl.IndexOf("images/noimg.gif", StringComparison.OrdinalIgnoreCase) >= 0)
                                {
                                    imgurl = placeholderImage;
                                }
                                else
                                {
                                    try
                                    {
                                        // Resolve a relative src against the introduction page URL.
                                        imgurl = new Uri(new Uri(小说简介URL), imgurl).ToString();
                                    }
                                    catch (Exception ex)
                                    {
                                        System.Diagnostics.Debug.WriteLine(DateTime.Now + ex.Message + "|||||" + ex.StackTrace);
                                        imgurl = placeholderImage;
                                    }
                                }

                                System.Diagnostics.UDPGroup.SendStrGB2312("获取配图:" + 小说简介URL + "  完成" + imgurl);

                                book.配图 = imgurl.Trim().Contains(placeholderImage) ? placeholderImage : imgurl;
                            }
                        }
                        catch (Exception ex)
                        {
                            // Best effort: keep the placeholder image when the page is malformed.
                            System.Diagnostics.Debug.WriteLine(DateTime.Now + ex.Message + "|||||" + ex.StackTrace);
                        }
                    }
                }
            }

            return(book);
        }
Ejemplo n.º 4
0
    /// <summary>
    /// Reads the book list rendered inside <c>listdiv</c>, then for every listed book either
    /// refreshes the last-update time of the matching stored books or converts the entry
    /// into a new book and adds it through the shared cache context.
    /// </summary>
    void 获取中文更新()
    {
        HtmlAgilityPack.HtmlDocument dom = new HtmlAgilityPack.HtmlDocument();
        // dom.LoadHtml("http://www.xkzw.org/xkph_2.htm".GetWeb());
        dom.LoadHtml(listdiv.InnerHtml);

        // Collect the candidate elements of the document.
        List <PossiblyResultElement> possiblyResultElements = new List <PossiblyResultElement>();
        SingleListPageAnalyse.AnalyseMaxATagNearest(dom.DocumentNode, possiblyResultElements, 0, new PossiblyResultElement()
        {
            ParentPossiblyResult = null,
            CurrnetHtmlElement   = dom.DocumentNode,
            LayerIndex           = -1,
            ContainTagNum        = 0
        });

        // Every <li> except the first one (presumably the header row — confirm against the
        // page). Skip(1) is safe on an empty list, unlike the former ElementAt(0), which
        // threw when the markup contained no <li> at all.
        var rows = possiblyResultElements.Where(p => p.CurrnetHtmlElement.Name == "li").Skip(1);

        List <Skybot.Collections.Sites.BookInfo86zw_com> list = new List <Skybot.Collections.Sites.BookInfo86zw_com>();

        foreach (var row in rows)
        {
            if (!row.CurrnetHtmlElement.HasChildNodes)
            {
                continue;
            }

            // Expected row layout:
            // <span class="fl">[东方玄幻]</span>
            // <span class="sm"><a href="/xkzw3226/" target="_blank">一等家丁</a></span>
            // <span class="zj"><a href="/xkzw3226/5162956.html" title="第一五七三章 药水"
            //     target="_blank">第一五七三章 药水</a></span>
            // <span class="zz">纯情犀利哥</span> <span class="zs">1608193</span>
            // <span class="sj">2013-05-12</span> <span class="zt">连载</span>
            var els = row.CurrnetHtmlElement.SelectNodes("span");

            // SelectNodes returns null when nothing matches; index 6 is read below, so at
            // least seven spans are required. Malformed rows are skipped rather than
            // aborting the whole update.
            if (els == null || els.Count < 7)
            {
                continue;
            }

            var titleLink = els[1].Element("a");
            var href      = titleLink == null ? null : titleLink.Attributes["href"];
            if (href == null)
            {
                continue;
            }

            Skybot.Collections.Sites.BookInfo86zw_com bookITEM = new Skybot.Collections.Sites.BookInfo86zw_com();
            bookITEM.类别      = els[0].InnerText.Replace("[", "").Replace("]", "").Trim();
            bookITEM.小说名称    = titleLink.InnerText.Replace("\r\n", "").Trim();
            // NOTE(review): hrefs like "/xkzw3226/" produce a double slash after the host here;
            // left unchanged because the stored URL format may be relied on elsewhere.
            bookITEM.小说目录URL = "http://www.xkzw.org/" + href.Value;
            bookITEM.最新章节    = els[2].InnerText;
            bookITEM.作者      = els[3].InnerText;
            bookITEM.更新      = DateTime.Now.ToString();
            bookITEM.采集URL   = bookITEM.小说目录URL;
            bookITEM.状态      = els[6].InnerText;
            bookITEM.小说简介URL = null;
            list.Add(bookITEM);
        }

        // The former ToLookup over the whole book table was never read and has been removed;
        // it only loaded every stored book into memory.
        using (TygModel.Entities tygdb = new TygModel.Entities())
        {
            // Update existing books or add new ones.
            foreach (var item in list)
            {
                string key = item.小说名称 + "|" + item.作者;

                // Materialise once instead of running Count() and then enumerating again.
                var matches = tygdb.书名表.Where(p => p.书名.Replace("》", "").Replace("《", "").Trim() + "|" + p.作者名称 == key).ToList();

                if (matches.Count > 0)
                {
                    foreach (var bookItem in matches)
                    {
                        bookItem.最后更新时间 = DateTime.Now;
                    }
                }
                else
                {
                    var bok = item.Convert();

                    // New books are added through the shared cache context, not tygdb.
                    Skybot.Cache.RecordsCacheManager.Instance.Tygdb.AddTo书名表(bok);
                }
            }

            tygdb.SaveChanges();
        }

        // Saved after tygdb has been disposed, matching the original order of operations;
        // the using block replaces the former explicit Close()/Dispose() calls.
        Skybot.Cache.RecordsCacheManager.Instance.Tygdb.SaveChanges();
    }