Esempio n. 1
0
        /// <summary>
        /// Recursively sums the length of the cleaned inner text found under <paramref name="Node"/>.
        /// </summary>
        /// <remarks>
        /// Nodes whose XPath contains "/script", "/style" or "/meta" contribute nothing.
        /// Nodes whose name contains '#' (presumably HtmlAgilityPack's "#text"-style nodes — confirm)
        /// are measured directly via <c>HTMLCleaner.GetCleanInnerText</c>; every other node is
        /// measured by descending into its children. Children carrying a "cutting" attribute are skipped.
        /// </remarks>
        /// <param name="Node">The node whose text should be measured.</param>
        /// <returns>The total number of characters of cleaned text under the node.</returns>
        internal int CountTextofNode(HtmlNode Node)
        {
            // Anything located inside script/style/meta markup is not counted as text.
            string path = Node.XPath;
            bool insideIgnoredMarkup = path.Contains("/script") || path.Contains("/style") || path.Contains("/meta");
            if (insideIgnoredMarkup)
            {
                return 0;
            }

            // A '#'-named node that is not marked "cutting" is measured directly.
            bool isHashNamed = Node.Name.Contains("#");
            if (isHashNamed && !Node.Attributes.Contains("cutting"))
            {
                return HTMLCleaner.GetCleanInnerText(Node).Length;
            }

            // Nothing to descend into.
            if (Node.ChildNodes.Count == 0)
            {
                return 0;
            }

            int total = 0;
            foreach (HtmlNode child in Node.ChildNodes)
            {
                // Children marked with the "cutting" attribute are excluded from the count.
                if (!child.Attributes.Contains("cutting"))
                {
                    total += child.Name.Contains("#")
                        ? HTMLCleaner.GetCleanInnerText(child).Length
                        : CountTextofNode(child);
                }
            }

            return total;
        }
Esempio n. 2
0
        /// <summary>
        /// Fills in missing fields of <paramref name="BaseArticle"/> (content, title, publish date,
        /// media name, author) by evaluating the XPath expressions described by
        /// <paramref name="Pattern"/> against the detail-page HTML.
        /// </summary>
        /// <param name="Html">Raw HTML of the article detail page.</param>
        /// <param name="Pattern">JSON text that deserializes into an <c>XpathPattern</c>.</param>
        /// <param name="Url">URL of the page; passed to content cleaning and used in log messages.</param>
        /// <param name="BaseArticle">Article to enrich. Content, title, media name and author are only
        /// written when currently empty; the publish date is replaced only under the rules below.</param>
        /// <returns>
        /// <c>false</c> when <paramref name="Html"/> or <paramref name="Pattern"/> is blank, or when
        /// <paramref name="Pattern"/> cannot be deserialized into a pattern object; otherwise <c>true</c>.
        /// Failures while extracting an individual field are logged and do not change the result.
        /// </returns>
        public static bool ParseItem(string Html, string Pattern, string Url, ref Article BaseArticle)
        {
            // Input validation.
            if (string.IsNullOrWhiteSpace(Html) || string.IsNullOrWhiteSpace(Pattern))
            {
                return false;
            }

            // Check that Pattern has the expected format.
            XpathPattern xpathPattern = null;

            try
            {
                xpathPattern = JsonConvert.DeserializeObject<XpathPattern>(Pattern);
            }
            catch (Exception ex)
            {
                Logger.Error(string.Format("Pattern 的格式不符合 Xpath Parser 的定义,请检查!Url:{0}, Pattern:{1}.", Url, Pattern), ex);
            }

            // Without a usable pattern there is nothing to extract. This also covers a JSON "null"
            // payload, which deserializes to null without throwing. Continuing would dereference null.
            if (xpathPattern == null)
            {
                return false;
            }

            HtmlNode itempagenode = HtmlUtility.getSafeHtmlRootNode(Html, true, true);

            // Extract the article body.
            if (string.IsNullOrEmpty(BaseArticle.HtmlContent) && !string.IsNullOrWhiteSpace(xpathPattern.ItemContentXPath))
            {
                try
                {
                    BaseArticle.HtmlContent = HTMLCleaner.CleanContent(itempagenode.SelectNodes(xpathPattern.ItemContentXPath), Url, true);
                    BaseArticle.Content     = HTMLCleaner.CleanHTML(BaseArticle.HtmlContent, false);
                }
                catch (Exception ex)
                {
                    Logger.Error(string.Format("从详情页解析正文出错,Url:{0}, Pattern:{1}.", Url, xpathPattern.ItemContentXPath), ex);
                }
            }

            // Resolve the title.
            if (string.IsNullOrEmpty(BaseArticle.Title) && !string.IsNullOrWhiteSpace(xpathPattern.ItemTitleXPath))
            {
                try
                {
                    BaseArticle.Title = TextCleaner.FullClean(HTMLCleaner.GetCleanInnerText(itempagenode.SelectSingleNode(xpathPattern.ItemTitleXPath)));
                }
                catch (Exception ex)
                {
                    Logger.Error(string.Format("从详情页解析标题出错,Url:{0}, Pattern:{1}.", Url, xpathPattern.ItemTitleXPath), ex);
                }
            }

            // Resolve the publish date.
            if (!string.IsNullOrWhiteSpace(xpathPattern.ItemDateXPath))
            {
                try
                {
                    DateTime Pubdate = DateTimeParser.Parser(HTMLCleaner.GetCleanInnerText(itempagenode.SelectSingleNode(xpathPattern.ItemDateXPath)));

                    if (BaseArticle.PubDate <= DateTime.MinValue.AddYears(1) && Pubdate.Year > 2000) // existing publish date is too old (effectively unset)
                    {
                        BaseArticle.PubDate = Pubdate;
                    }
                    else if (BaseArticle.PubDate.Hour == 0 && BaseArticle.PubDate.Minute == 0 && (Pubdate.Hour != 0 || Pubdate.Minute != 0) && Pubdate.Year > 2000) // existing publish date has no hour/minute part
                    {
                        BaseArticle.PubDate = Pubdate;
                    }
                    else if (Pubdate.Year > 2000 && (Pubdate.Hour != 0 || Pubdate.Minute != 0) && (BaseArticle.PubDate - Pubdate) > new TimeSpan(0, 1, 59) && BaseArticle.PubDate >= DateTime.Now.AddMinutes(-10)) // existing date is within the last 10 minutes yet differs noticeably from the page's date
                    {
                        BaseArticle.PubDate = Pubdate;
                    }
                }
                catch (Exception ex)
                {
                    // Report the date XPath with a date-specific message (previously a copy of the title/content one).
                    Logger.Error(string.Format("从详情页解析时间出错,Url:{0}, Pattern:{1}.", Url, xpathPattern.ItemDateXPath), ex);
                }
            }

            // Resolve the media name.
            if (string.IsNullOrEmpty(BaseArticle.MediaName) && !string.IsNullOrWhiteSpace(xpathPattern.ItemMediaNameXPath))
            {
                try
                {
                    BaseArticle.MediaName = TextCleaner.FullClean(HTMLCleaner.GetCleanInnerText(itempagenode.SelectSingleNode(xpathPattern.ItemMediaNameXPath)));
                }
                catch (Exception ex)
                {
                    Logger.Error(string.Format("从详情页解析媒体出错,Url:{0}, Pattern:{1}.", Url, xpathPattern.ItemMediaNameXPath), ex);
                }
            }

            // Resolve the author.
            if (string.IsNullOrEmpty(BaseArticle.Author) && !string.IsNullOrWhiteSpace(xpathPattern.ItemAuthorXPath))
            {
                try
                {
                    BaseArticle.Author = TextCleaner.FullClean(HTMLCleaner.GetCleanInnerText(itempagenode.SelectSingleNode(xpathPattern.ItemAuthorXPath)));
                }
                catch (Exception ex)
                {
                    Logger.Error(string.Format("从详情页解析作者出错,Url:{0}, Pattern:{1}.", Url, xpathPattern.ItemAuthorXPath), ex);
                }
            }

            return true;
        }