Beispiel #1
0
 /// <summary>
 /// Verifies that the content container is not the forum's permission-denied notice.
 /// </summary>
 /// <param name="body">Selection whose combined text is compared against the notice.</param>
 /// <param name="request">Request that is attached to the exception when access is denied.</param>
 /// <exception cref="PermissionDeniedException">
 /// The text of <paramref name="body"/> is exactly the permission-denied notice.
 /// </exception>
 private static void EnsurePermissionAllowed(CQ body, MillRequest request)
 {
     const string deniedNotice = "无权查看本主题";

     // The comparison is exact: no trimming or partial matching is applied.
     var accessGranted = body.Text() != deniedNotice;
     if (accessGranted)
     {
         return;
     }

     throw new PermissionDeniedException(request);
 }
Beispiel #2
0
        /// <summary>
        /// Parses the HTML of a thread page into a <see cref="ForumThread"/>.
        /// </summary>
        /// <param name="request">Request carrying the page URL and its HTML content.</param>
        /// <returns>
        /// A result holding the parsed thread. <c>NextPageUrl</c> is <c>null</c> when the current
        /// page index is 1; otherwise it is whatever <c>GetNextThreadPageUrl</c> returns.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="request"/> is null or its HTML content is null/whitespace.
        /// </exception>
        /// <exception cref="ProcessFaultException">
        /// The content container cannot be located, or the number of info bars does not match
        /// the number of reply messages.
        /// </exception>
        public MillResult<ForumThread> ProcessThreadPage(MillRequest request)
        {
            var hasContent = request != null && !string.IsNullOrWhiteSpace(request.HtmlContent);
            if (!hasContent)
            {
                throw new ArgumentNullException("request");
            }

            var document = CQ.CreateDocument(request.HtmlContent);
            EnsureSignedIn(document, request);

            // The content container is expected to be an attribute-less div.
            var container = document.Select("div.wrap:first > div:eq(1)").FirstElement();
            if (container == null || container.HasAttributes)
            {
                throw new ProcessFaultException(request, "无法定位包含内容的div元素");
            }

            var containerCq = container.Cq();
            EnsurePermissionAllowed(containerCq, request);

            var pageIndex   = GetThreadPageCurrentIndex(request, document);
            var onFirstPage = pageIndex == 1;

            var thread = new ForumThread
            {
                Url = request.Url,
                Id  = request.Url.GetThreadId(),
                CurrentPageIndex = pageIndex
            };

            if (onFirstPage)
            {
                ProcessFirstPost(request, containerCq, thread, document);
            }

            // On the first page the first div.message is the opening post, which was handled
            // above, so it is excluded from the reply set.
            var messageSelector = onFirstPage ? "div.message:gt(0)" : "div.message";
            var infobars        = containerCq.Find("div.infobar");
            var messages        = containerCq.Find(messageSelector);

            if (infobars.Length != messages.Length)
            {
                throw new ProcessFaultException(request, "infobar和message元素个数不一致。");
            }

            ProcessReplies(request, infobars, messages, thread);

            string followUpUrl = null;
            if (!onFirstPage)
            {
                followUpUrl = GetNextThreadPageUrl(request, document);
            }

            var result = new MillResult<ForumThread>
            {
                Url = request.Url,
                HumanReadableDescription = request.HumanReadableDescription,
                NextPageUrl = followUpUrl,
                Result = thread
            };

            return result;
        }
Beispiel #3
0
        /// <summary>
        /// Returns the <c>href</c> of the anchor selected by <c>Prev("a")</c> relative to the
        /// current-page marker (<c>span.paging &gt; span.s1</c>).
        /// </summary>
        /// <param name="request">Request attached to the exception when the marker is missing.</param>
        /// <param name="root">Parsed document of the thread page.</param>
        /// <returns>The anchor's <c>href</c>, or <c>null</c> when no such anchor exists.</returns>
        /// <exception cref="ProcessFaultException">The paging marker cannot be found.</exception>
        /// <remarks>
        /// NOTE(review): the "next" page is taken from the anchor BEFORE the marker; presumably
        /// thread pages are walked towards page 1 - confirm against the caller's crawl order.
        /// </remarks>
        private static string GetNextThreadPageUrl(MillRequest request, CQ root)
        {
            var marker = root.Select("span.paging > span.s1").FirstElement();
            if (marker == null)
            {
                throw new ProcessFaultException(request, "找不到分页元素。");
            }

            var link = marker.Cq().Prev("a").FirstElement();
            if (link == null)
            {
                return null;
            }

            return link["href"];
        }
Beispiel #4
0
        /// <summary>
        /// Returns the <c>href</c> of the anchor selected by <c>Next("a")</c> relative to the
        /// current-page marker (<c>span.paging &gt; span.s1</c>) of a forum listing page.
        /// </summary>
        /// <param name="root">Parsed document of the forum page.</param>
        /// <param name="request">Request attached to the exception when the marker is missing.</param>
        /// <returns>The anchor's <c>href</c>, or <c>null</c> when no such anchor exists.</returns>
        /// <exception cref="ProcessFaultException">The paging marker cannot be found.</exception>
        private static string GetNextForumPageUrl(CQ root, MillRequest request)
        {
            var marker = root.Select("span.paging > span.s1").FirstOrDefault();
            if (marker == null)
            {
                throw new ProcessFaultException(request, "找不到分页元素。");
            }

            var followingLink = marker.Cq().Next("a").FirstOrDefault();
            if (followingLink == null)
            {
                return null;
            }

            return followingLink["href"];
        }
Beispiel #5
0
        /// <summary>
        /// Determines the index of the thread page being processed.
        /// </summary>
        /// <param name="request">Request whose URL is tested with <c>MatchPageIndex()</c>.</param>
        /// <param name="root">Parsed document of the thread page.</param>
        /// <returns>
        /// 1 when the URL does not match a page index or the page has no <c>span.paging</c>
        /// element; otherwise the number parsed from the single <c>span.s1</c> marker inside the
        /// first paging element, after every "##" has been removed from its text.
        /// </returns>
        private static int GetThreadPageCurrentIndex(MillRequest request, CQ root)
        {
            if (request.Url.MatchPageIndex())
            {
                var paging = root.Select("span.paging:first").FirstOrDefault();
                if (paging != null)
                {
                    // Single() requires exactly one marker inside the paging element.
                    var marker = paging.Cq().Find("span.s1").Single();
                    var digits = marker.InnerText.Replace("##", "");

                    return int.Parse(digits);
                }
            }

            return 1;
        }
Beispiel #6
0
        /// <summary>
        /// Builds one <see cref="Post"/> per (info bar, message) pair and appends them to
        /// <c>thread.Posts</c>.
        /// </summary>
        /// <param name="request">Request attached to any <see cref="ProcessFaultException"/>.</param>
        /// <param name="infobarNodes">Info bar elements, one per reply.</param>
        /// <param name="messageNodes">Message elements, paired positionally with the info bars.</param>
        /// <param name="thread">Thread that receives the parsed replies.</param>
        /// <exception cref="ProcessFaultException">
        /// The floor anchor or the author element of a reply cannot be located.
        /// </exception>
        /// <remarks>
        /// All replies are parsed before any of them is added, so a failure while parsing one
        /// reply does not leave <c>thread.Posts</c> partially filled.
        /// </remarks>
        private static void ProcessReplies(MillRequest request, CQ infobarNodes, CQ messageNodes, ForumThread thread)
        {
            if (infobarNodes.Length <= 0)
            {
                return;
            }

            var replies = infobarNodes
                .Zip(messageNodes, (infobarNode, messageNode) => new { infobarNode, messageNode })
                .Select((x, index) =>
                {
                    var post = new Post
                    {
                        ThreadId    = thread.Id,
                        HtmlContent = HttpUtility.HtmlDecode(x.messageNode.InnerHTML)
                    };

                    // The anchor inside <b> carries both the post id (href) and the floor number (text).
                    var orderAnchor = x.infobarNode.Cq().Find("b a").FirstElement();
                    if (orderAnchor == null)
                    {
                        throw new ProcessFaultException(request, string.Format("第{0}个回复无法定位楼层锚元素。", index));
                    }
                    post.Id    = orderAnchor["href"].GetPostId();
                    post.Order = int.Parse(orderAnchor.InnerText.Replace("#", ""));

                    // The text node right after the <b> tells whether the author is anonymous.
                    var nextTextNode = orderAnchor.ParentNode.NextSibling;
                    if (nextTextNode == null || nextTextNode.NodeType != NodeType.TEXT_NODE)
                    {
                        throw new ProcessFaultException(request, string.Format("第{0}个回复无法定位作者元素。", index));
                    }

                    var text = nextTextNode.Cq().Text();
                    if (text.Trim().EndsWith("匿名"))
                    {
                        post.UserName = "匿名";
                    }
                    else
                    {
                        // A named author is read from the element following the <b>; report a
                        // parse fault instead of dereferencing null when that element is missing.
                        var authorElement = orderAnchor.ParentNode.NextElementSibling;
                        if (authorElement == null)
                        {
                            throw new ProcessFaultException(request, string.Format("第{0}个回复无法定位作者元素。", index));
                        }
                        post.UserName = authorElement.Cq().Text();
                    }

                    post.CreateDate   = DateTime.Parse(x.infobarNode.Cq().Find("span.nf:first").Text());
                    post.ModifyDate   = GetModifyDate(x.messageNode.Cq(), post.UserName);
                    var ratings       = x.messageNode.GetRatings();
                    post.PositiveRate = ratings.Item1;
                    post.NegativeRate = ratings.Item2;

                    return(post);
                })
                .ToList();

            thread.Posts.AddRange(replies);
        }
Beispiel #7
0
        /// <summary>
        /// Parses the HTML of a forum listing page into one <see cref="ThreadHeader"/> per
        /// listed thread.
        /// </summary>
        /// <param name="request">Request carrying the page URL and its HTML content.</param>
        /// <returns>
        /// A result holding the parsed headers, the request's URL and human-readable
        /// description, and the next page URL obtained from <c>GetNextForumPageUrl</c>.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="request"/> is null or its HTML content is null/whitespace.
        /// </exception>
        /// <exception cref="ProcessFaultException">
        /// The title and author element counts differ, or no such elements exist.
        /// </exception>
        public MillResult<List<ThreadHeader>> ProcessForumPage(MillRequest request)
        {
            if (request == null || string.IsNullOrWhiteSpace(request.HtmlContent))
            {
                throw new ArgumentNullException("request");
            }

            var root = CQ.CreateDocument(request.HtmlContent);

            EnsureSignedIn(root, request);

            // Titles and authors are paired by position, so the counts must agree.
            var titles  = root.Select("span.title");
            var authors = root.Select("span.author");

            if (titles.Length != authors.Length)
            {
                throw new ProcessFaultException(request, "forum page title和author元素个数不一样。");
            }
            if (titles.Length == 0)
            {
                throw new ProcessFaultException(request, "forum page 没有找到title和author元素。");
            }

            var headers = new List<ThreadHeader>(titles.Length);

            for (var i = 0; i < titles.Length; i++)
            {
                headers.Add(GetThreadHeader(titles[i], authors[i], request, i));
            }

            return new MillResult<List<ThreadHeader>>
            {
                Url = request.Url,
                // Carry the description over, as ProcessThreadPage does, so it is not lost
                // when a caller replaces its own placeholder result with this one.
                HumanReadableDescription = request.HumanReadableDescription,
                NextPageUrl = GetNextForumPageUrl(root, request),
                Result = headers
            };
        }
Beispiel #8
0
        /// <summary>
        /// Verifies that the page was rendered for a signed-in session.
        /// </summary>
        /// <param name="root">Parsed document of the page.</param>
        /// <param name="request">Request attached to the exception that is thrown.</param>
        /// <exception cref="NotSignedInException">
        /// The page has no <c>div#footer</c>, or the footer contains an anchor whose text is
        /// exactly the register or the sign-in label.
        /// </exception>
        private static void EnsureSignedIn(CQ root, MillRequest request)
        {
            var footer   = root.Select("div#footer").FirstOrDefault();
            var signedIn = footer != null;

            if (signedIn)
            {
                var footerLinks = footer.Cq().Find("a");

                // Either label appearing in the footer marks the page as a guest view.
                var showsGuestLink = footerLinks.Any(a => a.Cq().Text() == "注册") ||
                                     footerLinks.Any(a => a.Cq().Text() == "登陆");

                signedIn = !showsGuestLink;
            }

            if (signedIn)
            {
                return;
            }

            throw new NotSignedInException(request);
        }
Beispiel #9
0
        /// <summary>
        /// Builds a <see cref="ThreadHeader"/> from one title element and its author element.
        /// </summary>
        /// <param name="titleNode">Element whose first anchor supplies the thread URL and title.</param>
        /// <param name="authorNode">
        /// Element whose text, once "[" and "]" are removed, is split on '/'; when that yields
        /// exactly four parts, part 0 is the user name and part 1 the reply count.
        /// </param>
        /// <param name="request">Request attached to any <see cref="ProcessFaultException"/>.</param>
        /// <param name="i">Position of the pair on the page, used only in error messages.</param>
        /// <returns>
        /// The header. <c>ReplyCount</c> stays -1 and <c>UserName</c> stays unset when the author
        /// text is blank or does not have four parts; <c>ReplyCount</c> also stays -1 when
        /// part 1 is not an integer.
        /// </returns>
        /// <exception cref="ProcessFaultException">
        /// The title element has no anchor, or the thread id extracted from the URL is 0.
        /// </exception>
        private static ThreadHeader GetThreadHeader(IDomObject titleNode, IDomObject authorNode, MillRequest request, int i)
        {
            var anchor = titleNode.Cq().Find("a:first").FirstOrDefault();
            if (anchor == null)
            {
                throw new ProcessFaultException(request, string.Format("第{0}个title元素里面没有a元素。", i));
            }

            var href  = anchor["href"];
            var title = anchor.Cq().Text().Trim();
            var id    = href.GetThreadId();
            if (id == 0)
            {
                throw new ProcessFaultException(request, string.Format("无法从第{0}个thread url中获取thread id。", i));
            }

            var header = new ThreadHeader
            {
                Id         = id,
                Url        = href,
                Title      = title,
                ReplyCount = -1
            };

            var authorText = authorNode.Cq().Text();
            if (!string.IsNullOrWhiteSpace(authorText))
            {
                var parts = authorText.Replace("[", "").Replace("]", "").Split('/');
                if (parts.Length == 4)
                {
                    header.UserName = parts[0];

                    int parsedCount;
                    if (int.TryParse(parts[1], out parsedCount))
                    {
                        header.ReplyCount = parsedCount;
                    }
                }
            }

            return header;
        }
Beispiel #10
0
        /// <summary>
        /// Extracts the opening post from the first page of a thread, sets the thread's title
        /// and user name, and appends the post (with order 1) to <c>thread.Posts</c>.
        /// </summary>
        /// <param name="request">Request attached to any <see cref="ProcessFaultException"/>.</param>
        /// <param name="bodyCq">Content container of the thread page.</param>
        /// <param name="thread">Thread being populated.</param>
        /// <param name="root">Whole document, used as a fallback when looking for the pid anchor.</param>
        /// <exception cref="ProcessFaultException">
        /// The title, date, author, message or pid anchor cannot be located, or the pid anchor
        /// has no <c>href</c>.
        /// </exception>
        private static void ProcessFirstPost(MillRequest request, CQ bodyCq, ForumThread thread, CQ root)
        {
            var titleNode = bodyCq.Find("b:first").FirstOrDefault();

            if (titleNode == null)
            {
                throw new ProcessFaultException(request, "无法定位主题的title元素");
            }
            thread.Title = titleNode.Cq().Text();

            // The creation date is read from the text node two siblings after the title element.
            var dateNode = titleNode.NextSibling;

            if (dateNode == null)
            {
                throw new ProcessFaultException(request, "无法定位发帖日期元素");
            }
            dateNode = dateNode.NextSibling;
            if (dateNode == null || dateNode.NodeType != NodeType.TEXT_NODE)
            {
                throw new ProcessFaultException(request, "无法定位发帖日期元素");
            }
            var createDate = DateTime.Parse(dateNode.NodeValue.Replace("时间:", "").Trim());

            // The author literal is, in turn, the text node two siblings after the date node.
            var authorLiteralNode = dateNode.NextSibling;

            if (authorLiteralNode == null)
            {
                throw new ProcessFaultException(request, "无法定位主题作者元素");
            }
            authorLiteralNode = authorLiteralNode.NextSibling;
            if (authorLiteralNode == null || authorLiteralNode.NodeType != NodeType.TEXT_NODE)
            {
                throw new ProcessFaultException(request, "无法定位主题作者元素");
            }
            var author = authorLiteralNode.NodeValue;

            // Any author text other than the two forms below leaves thread.UserName untouched.
            if (author == "作者:匿名")
            {
                // NOTE(review): replies store the anonymous author as "匿名", whereas this
                // literal is "******" - confirm which value is intended before changing it.
                thread.UserName = "******";
            }
            else if (author == "作者:")
            {
                var userNameAnchor = authorLiteralNode.NextElementSibling;
                // OrdinalIgnoreCase already ignores case, so no culture-sensitive ToUpper() is needed.
                if (userNameAnchor == null || !"A".Equals(userNameAnchor.NodeName, StringComparison.OrdinalIgnoreCase))
                {
                    throw new ProcessFaultException(request, "无法定位主题作者元素");
                }
                thread.UserName = userNameAnchor.Cq().Text().Trim();
            }

            var messageNode = bodyCq.Find("div.message:first").FirstElement();

            if (messageNode == null)
            {
                throw new ProcessFaultException(request, "无法定位主题内容元素");
            }

            var firstPostHtmlContent = HttpUtility.HtmlDecode(messageNode.InnerHTML);
            var modifyDate           = GetModifyDate(messageNode.Cq(), thread.UserName);
            var pidAnchor            = messageNode.Cq().NextAll("a").FirstElement();

            if (pidAnchor == null || pidAnchor["href"] == null)
            {
                // Fall back to the first anchor in the document that contains the quote label.
                pidAnchor = root.Select("a:contains('引用')").FirstElement();
            }
            // The fallback anchor needs an href as well; without one there is no pid to read.
            if (pidAnchor == null || pidAnchor["href"] == null)
            {
                throw new ProcessFaultException(request, "无法定位主题中包含pid的锚元素,无法获取pid");
            }
            var pid     = pidAnchor["href"].GetPostId();
            var ratings = messageNode.GetRatings();
            var post    = new Post
            {
                Id           = pid,
                ThreadId     = thread.Id,
                UserName     = thread.UserName,
                Title        = thread.Title,
                Order        = 1,
                HtmlContent  = firstPostHtmlContent,
                CreateDate   = createDate,
                ModifyDate   = modifyDate,
                PositiveRate = ratings.Item1,
                NegativeRate = ratings.Item2,
            };

            thread.Posts.Add(post);
        }
Beispiel #11
0
 /// <summary>
 /// Creates the exception for <paramref name="request"/> with an empty message and no
 /// inner exception.
 /// </summary>
 /// <param name="request">Request that is forwarded to the base constructor.</param>
 public PermissionDeniedException(MillRequest request)
     : base(request, string.Empty, null)
 {
 }
Beispiel #12
0
 /// <summary>
 /// Creates the exception, forwarding all three arguments to the base constructor.
 /// </summary>
 /// <param name="request">Request being processed when the fault occurred.</param>
 /// <param name="message">Description of the fault.</param>
 /// <param name="e">Optional inner exception; defaults to <c>null</c>.</param>
 public ProcessFaultException(MillRequest request, string message, Exception e = null)
     : base(request, message, e)
 {
 }
Beispiel #13
0
 /// <summary>
 /// Creates the exception for <paramref name="request"/> with an empty message and no
 /// inner exception.
 /// </summary>
 /// <param name="request">Request that is forwarded to the base constructor.</param>
 public NotSignedInException(MillRequest request)
     : base(request, string.Empty, null)
 {
 }
Beispiel #14
0
 /// <summary>
 /// Passes <paramref name="message"/> and <paramref name="e"/> to the base constructor and
 /// stores <paramref name="request"/> in <c>Request</c>.
 /// </summary>
 /// <param name="request">Request associated with the failure.</param>
 /// <param name="message">Exception message.</param>
 /// <param name="e">Inner exception forwarded to the base constructor.</param>
 protected PageMillException(MillRequest request, string message, Exception e)
     : base(message, e)
 {
     Request = request;
 }
Beispiel #15
0
 /// <summary>
 /// Runs the page processor that matches <typeparamref name="T"/> and converts any failure
 /// into a <see cref="MillStatus"/> instead of letting the exception escape.
 /// </summary>
 /// <typeparam name="T">
 /// <c>List&lt;ThreadHeader&gt;</c> selects <c>ProcessForumPage</c>; <c>ForumThread</c> selects
 /// <c>ProcessThreadPage</c>. Any other type is reported as <c>MillStatus.FormatError</c>.
 /// </typeparam>
 /// <param name="pageProcessor">Processor that parses the page.</param>
 /// <param name="request">Request carrying the page URL and its HTML content.</param>
 /// <param name="result">
 /// The processor's result on success; on failure a placeholder that carries only the
 /// request's URL and human-readable description.
 /// </param>
 /// <returns>
 /// <c>Success</c>, <c>NotSignedIn</c>, <c>PermissionDenied</c>, or <c>FormatError</c> for
 /// every other failure.
 /// </returns>
 /// <exception cref="ArgumentNullException"><paramref name="request"/> is null.</exception>
 public static MillStatus TryProcessPage<T>(this IPageProcessor pageProcessor, MillRequest request, out MillResult<T> result)
     where T : class
 {
     // The placeholder and the catch-all handler below both read from the request, so a
     // null request is rejected up front rather than surfacing as a NullReferenceException.
     if (request == null)
     {
         throw new ArgumentNullException("request");
     }

     result = new MillResult<T>
     {
         Url = request.Url,
         HumanReadableDescription = request.HumanReadableDescription
     };
     try
     {
         if (typeof(T) == typeof(List<ThreadHeader>))
         {
             result = pageProcessor.ProcessForumPage(request) as MillResult<T>;
         }
         else if (typeof(T) == typeof(ForumThread))
         {
             result = pageProcessor.ProcessThreadPage(request) as MillResult<T>;
         }
         else
         {
             throw new NotSupportedException();
         }
         return(MillStatus.Success);
     }
     catch (NotSignedInException ne)
     {
         Logger.Info("解析网页时发现未登录,URL: {0}\r\n 内容:{1}", ne.Request.Url, ne.Request.HtmlContent);
         return(MillStatus.NotSignedIn);
     }
     catch (PermissionDeniedException pde)
     {
         Logger.Info("解析网页时发现没有权限,URL: {0}\r\n 内容:{1}", pde.Request.Url, pde.Request.HtmlContent);
         return(MillStatus.PermissionDenied);
     }
     catch (ProcessFaultException pfe)
     {
         Logger.Info("解析网页时发生错误,错误信息:{0}\r\n URL: {1}\r\n 内容:{2}\r\n 内部异常:{3}\r\n", pfe.Message, pfe.Request.Url,
                     pfe.Request.HtmlContent, pfe.InnerException);
         return(MillStatus.FormatError);
     }
     catch (Exception e)
     {
         // Everything else (including NotSupportedException for an unsupported T) is
         // logged and reported as a format error.
         Logger.Info("Error URL:{0}\r\n{1}\r\n{2}", request.Url, e, request.HtmlContent);
         return(MillStatus.FormatError);
     }
 }