/// <summary>
/// Create a new tag node.
/// Note that the attributes vector contains at least one element,
/// which is the tag name (standalone attribute) at position zero.
/// This can be used to decide which type of node to create, or
/// gate other processing that may be appropriate.
/// </summary>
/// <remarks>
/// The tag name is upper-cased and, unless it starts with "/", looked up in
/// <c>mBlastocyst</c> (a single trailing "/" is stripped first). A registered
/// prototype is cloned and configured; otherwise, or when that attempt throws,
/// a clone of <c>TagPrototype</c> is used, and if cloning that fails too a
/// plain <c>TagNode</c> is constructed.
/// </remarks>
/// <param name="page">The page the node is on.</param>
/// <param name="start">The beginning position of the tag.</param>
/// <param name="end">The ending position of the tag.</param>
/// <param name="attributes">The attributes contained in this tag.</param>
/// <returns>A tag node comprising the indicated characters from the page.</returns>
public virtual ITag CreateTagNode(Page page, int start, int end, System.Collections.ArrayList attributes)
{
    ITag ret = null;

    if (0 != attributes.Count)
    {
        TagAttribute attribute = (TagAttribute) attributes[0];
        System.String id = attribute.GetName();
        if (null != id)
        {
            try
            {
                // Tag names are identifiers, not linguistic text: use invariant
                // casing and ordinal comparisons rather than culture-aware ones.
                id = id.ToUpperInvariant();
                if (!id.StartsWith("/", System.StringComparison.Ordinal))
                {
                    if (id.EndsWith("/", System.StringComparison.Ordinal))
                    {
                        // Drop the trailing slash before the prototype lookup.
                        id = id.Substring(0, id.Length - 1);
                    }

                    ITag prototype = (ITag) mBlastocyst[id];
                    if (null != prototype)
                    {
                        ret = (ITag) prototype.Clone();
                        ret.Page = page;
                        ret.StartPosition = start;
                        ret.EndPosition = end;
                        ret.AttributesEx = attributes;
                    }
                }
            }
            catch
            {
                // Default to creating a generic one. Discard any clone that was
                // only partially configured so the fallback below really runs.
                ret = null;
            }
        }
    }

    if (null == ret)
    {
        // Generate a generic node.
        try
        {
            ret = (ITag) TagPrototype.Clone();
            ret.Page = page;
            ret.StartPosition = start;
            ret.EndPosition = end;
            ret.AttributesEx = attributes;
        }
        catch
        {
            // Cloning or configuring the generic prototype failed; build one directly.
            ret = new TagNode(page, start, end, attributes);
        }
    }

    return ret;
}
/// <summary>
/// Given the root node of a "similar repost" entry, builds and fills in a Feed for it.
/// </summary>
/// <remarks>
/// Increments the <c>serialNumber</c> counter and assigns the new value to the feed.
/// When the author or repost-reason filter does not match exactly one node, a
/// diagnostic line is written to the console and the corresponding field is left unset.
/// </remarks>
/// <param name="currentPage">Index of the page the feed is on.</param>
/// <param name="i">Sequence number of the feed within its page (used in diagnostics and passed to the helper extractors).</param>
/// <param name="similarFeedDiv">The div tag containing the similar repost.</param>
/// <param name="originalAuthor">Author of the original (reposted) feed.</param>
/// <param name="feedContent">Content of the original (reposted) feed.</param>
/// <returns>The populated Feed instance.</returns>
private Feed GetSimilarFeed(int currentPage, int i, TagNode similarFeedDiv, string originalAuthor, string feedContent)
{
    Feed feed = new Feed();
    serialNumber++;
    feed.Page = currentPage;
    feed.Number = serialNumber;
    feed.ReFeedOrNot = true;

    #region Author of the similar repost
    NodeList feedAuthorNodeList = similarFeedDiv.Children.ExtractAllNodesThatMatch(feedAuthorFilter, true);
    if (feedAuthorNodeList.Size() == 1)
    {
        ATag feedAuthorTag = (ATag)feedAuthorNodeList[0].Children[1];
        feed.Author = feedAuthorTag.GetAttribute("TITLE");

        // If present, the node right after the author link holds the author's remark name.
        // The link may have no following sibling at all, so guard against null.
        INode remarkNameNode = feedAuthorTag.NextSibling;
        if (remarkNameNode != null && remarkNameNode.GetType().Equals(typeof(Span)))
        {
            string remarkName = ((Span)remarkNameNode).StringText;
            // Strip the surrounding brackets; skip text too short to contain both.
            if (remarkName != null && remarkName.Length >= 2)
            {
                feed.RemarkName = remarkName.Substring(1, remarkName.Length - 2);
            }
        }
    }
    else
    {
        Console.WriteLine("第" + i + "条微博中,判断转发类似微博的作者的标准出错!");
    }
    #endregion

    #region Repost reason of the similar repost
    NodeList reFeedReasonNodeList = similarFeedDiv.Children.ExtractAllNodesThatMatch(feedContentFilter, true);
    if (reFeedReasonNodeList.Size() == 1)
    {
        feed.ReFeedReason = GetContentFromChildren(feed, reFeedReasonNodeList[0], false);
    }
    else
    {
        Console.WriteLine("第" + i + "条微博中,判断转发类似微博的转发理由的标准出错!");
    }
    #endregion

    // This is a similar repost, so the original feed's author and content come from the parameters.
    feed.OriginalAuthor = originalAuthor;
    feed.Content = feedContent;

    // Location the similar repost was sent from.
    feed.Location = GetLocationInfo(i, similarFeedDiv);
    // Like count.
    feed.LikeCount = GetFeedLikeInfo(i, similarFeedDiv);
    // Repost (forward) count.
    feed.ReFeedCount = GetFeedForwardCount(i, similarFeedDiv);
    // Comment count.
    feed.CommentCount = GetFeedCommentCount(i, similarFeedDiv);
    // Time the similar repost was sent.
    feed.Time = GetFeedTimeInfo(i, similarFeedDiv);
    // How (from which client/device) the similar repost was sent.
    feed.Device = GetFeedSendTypeInfo(i, similarFeedDiv);

    return feed;
}
/// <summary>
/// Initializes a tag modelled on an existing one: the new tag is constructed
/// from the given tag's page, begin and end offsets and attribute list, and
/// is then assigned the supplied scanner.
/// </summary>
/// <param name="tag">The tag to emulate.</param>
/// <param name="scanner">The scanner for this tag.</param>
public TagNode(TagNode tag, TagScanner scanner)
    : this(tag.Page, tag.TagBegin, tag.TagEnd, tag.AttributesEx)
{
    ThisScanner = scanner;
}
/// <summary>
/// Given the root nodes of a repost, fills in the repost-related fields of a feed:
/// the original author, the original content, and this feed's own text as the repost reason.
/// </summary>
/// <remarks>
/// When a filter does not match as expected, a diagnostic line is written to the
/// console and the corresponding field is left unset. The exception is a deleted
/// original feed, for which placeholder author and content values are stored.
/// </remarks>
/// <param name="i">Sequence number of the feed within its page (used in diagnostics).</param>
/// <param name="feed">The Feed instance that receives the extracted values.</param>
/// <param name="reFeedDiv">The div tag containing the reposted (original) feed.</param>
/// <param name="feedDiv">The div tag containing this feed itself.</param>
private void GetReFeedInfo(int i, Feed feed, TagNode reFeedDiv, TagNode feedDiv)
{
    // Mark this feed as a repost.
    feed.ReFeedOrNot = true;
    // Tracks whether the "reposted feed has been deleted" case was detected.
    bool reFeedIsDeleted = false;

    #region Original feed author
    NodeList reFeedOriginalAuthorNodeList = reFeedDiv.Children.ExtractAllNodesThatMatch(reFeedAuthorFilter, true);
    if (reFeedOriginalAuthorNodeList.Size() == 1)
    {
        INode reFeedOriginalAuthorNode = reFeedOriginalAuthorNodeList[0];
        // The position of the author link inside the matched node is not fixed (empty text
        // nodes sometimes appear), so scan the children and pick the first link tag.
        // Guard against a matched node that has no child list.
        if (reFeedOriginalAuthorNode.Children != null)
        {
            for (int j = 0; j < reFeedOriginalAuthorNode.Children.Size(); j++)
            {
                INode reFeedOriginalAuthorCandidate = reFeedOriginalAuthorNode.Children[j];
                if (reFeedOriginalAuthorCandidate.GetType().Equals(typeof(ATag)))
                {
                    feed.OriginalAuthor = ((ATag)reFeedOriginalAuthorCandidate).GetAttribute("TITLE");
                    break;
                }
            }
        }
    }
    else
    {
        NodeList deletedFeedList = reFeedDiv.Children.ExtractAllNodesThatMatch(refeedDeletedFilter1, true);
        if (deletedFeedList.Size() > 0)
        {
            reFeedIsDeleted = true;
            feed.OriginalAuthor = "Unknown";
        }
        else
        {
            Console.WriteLine("第" + i + "条微博中,判断转发微博作者的标准出错!");
        }
    }
    #endregion

    #region Original feed content
    NodeList reFeedContentNodeList = reFeedDiv.Children.ExtractAllNodesThatMatch(reFeedContentFilter, true);
    if (reFeedContentNodeList.Size() == 1)
    {
        // It is unclear what node type an <em> element is parsed as, so the matched node is
        // handed to the helper, which walks its children itself. On personal home pages
        // there appears to be no em node at all.
        feed.Content = GetContentFromChildren(feed, reFeedContentNodeList[0], false);
    }
    else
    {
        if (reFeedIsDeleted)
        {
            feed.Content = "微博已删除";
        }
        else
        {
            Console.WriteLine("第" + i + "条微博中,判断转发微博内容的标准出错!");
        }
    }
    #endregion

    #region This feed's own content, used as the repost reason
    NodeList reFeedReasonNodeList = feedDiv.Children.ExtractAllNodesThatMatch(feedContentFilter, true);
    // Note: when the feed also lists "X more reposts of the original", the number of matches
    // is expected to be (1 + similarFeedCount); the first match is this feed's own text.
    // Still verify that at least one match exists before indexing.
    if (reFeedReasonNodeList.Size() > 0)
    {
        feed.ReFeedReason = GetContentFromChildren(feed, reFeedReasonNodeList[0], false);
    }
    else
    {
        Console.WriteLine("第" + i + "条微博中,判断转发微博转发理由的标准出错!");
    }
    #endregion
}