/// <summary> Create a new tag node.
		/// Note that the attributes vector contains at least one element,
		/// which is the tag name (standalone attribute) at position zero.
		/// This can be used to decide which type of node to create, or
		/// gate other processing that may be appropriate.
		/// The tag name is upper-cased and, unless it starts with "/",
		/// stripped of one trailing "/" and looked up in the registered
		/// prototypes. A matching prototype is cloned and populated; otherwise
		/// the generic tag prototype is cloned, and if that fails as well a
		/// plain <c>TagNode</c> is constructed.
		/// </summary>
		/// <param name="page">The page the node is on.
		/// </param>
		/// <param name="start">The beginning position of the tag.
		/// </param>
		/// <param name="end">The ending position of the tag.
		/// </param>
		/// <param name="attributes">The attributes contained in this tag.
		/// </param>
		/// <returns> A tag node comprising the indicated characters from the page.
		/// </returns>
		public virtual ITag CreateTagNode(Page page, int start, int end, System.Collections.ArrayList attributes)
		{
			ITag ret = null;

			if (0 != attributes.Count)
			{
				TagAttribute attribute = (TagAttribute) attributes[0];
				System.String id = attribute.GetName();
				if (null != id)
				{
					try
					{
						// Tag names are markup identifiers, not linguistic text:
						// use culture-independent casing and ordinal comparisons.
						id = id.ToUpperInvariant();
						if (!id.StartsWith("/", System.StringComparison.Ordinal))
						{
							// Drop the trailing slash of a self-closing name such as "BR/".
							if (id.EndsWith("/", System.StringComparison.Ordinal))
							{
								id = id.Substring(0, id.Length - 1);
							}

							ITag prototype = (ITag) mBlastocyst[id];
							if (null != prototype)
							{
								// Populate a local first and publish it only once every
								// assignment succeeded, so a failure part-way through
								// cannot leak a half-initialised node to the caller.
								ITag candidate = (ITag) prototype.Clone();
								candidate.Page = page;
								candidate.StartPosition = start;
								candidate.EndPosition = end;
								candidate.AttributesEx = attributes;
								ret = candidate;
							}
						}
					}
					catch (System.Exception)
					{
						// Deliberate best effort: ret is still null here, so the
						// generic node is created below.
					}
				}
			}

			if (null == ret)
			{
				// generate a generic node
				try
				{
					ret = (ITag) TagPrototype.Clone();
					ret.Page = page;
					ret.StartPosition = start;
					ret.EndPosition = end;
					ret.AttributesEx = attributes;
				}
				catch (System.Exception)
				{
					// Last resort: build the node directly rather than from a prototype.
					ret = new TagNode(page, start, end, attributes);
				}
			}

			return ret;
		}
// Example no. 2 (snippet separator left over from extraction; counter: 0)
        /// <summary>
        /// Given the root node, fills in the information of a "similar repost" weibo.
        /// </summary>
        /// <param name="currentPage">Index of the page the weibo is on.</param>
        /// <param name="i">Serial number of the weibo within its page (used in diagnostics).</param>
        /// <param name="similarFeedDiv">The div tag that contains the similar weibo.</param>
        /// <param name="originalAuthor">Original author of the reposted weibo.</param>
        /// <param name="feedContent">Content of the reposted weibo.</param>
        /// <returns>A populated <c>Feed</c> instance.</returns>
        private Feed GetSimilarFeed(int currentPage, int i, TagNode similarFeedDiv, string originalAuthor, string feedContent)
        {
            Feed feed = new Feed();
            serialNumber++;
            feed.Page = currentPage;
            feed.Number = serialNumber;

            feed.ReFeedOrNot = true;

            #region Author of the similar repost
            NodeList feedAuthorNodeList = similarFeedDiv.Children.ExtractAllNodesThatMatch(feedAuthorFilter, true);
            if (feedAuthorNodeList.Size() == 1)
            {
                ATag feedAuthorTag = (ATag)feedAuthorNodeList[0].Children[1];
                feed.Author = feedAuthorTag.GetAttribute("TITLE");

                // If present, the node right after the author link carries the
                // author's remark name. The link may be the last child, in which
                // case there is no sibling to inspect.
                INode remarkNameNode = feedAuthorTag.NextSibling;
                if (remarkNameNode != null && remarkNameNode.GetType().Equals(typeof(Span)))
                {
                    string remarkName = ((Span)remarkNameNode).StringText;
                    // Strip the surrounding brackets; anything shorter than the
                    // two brackets themselves cannot hold a name.
                    if (remarkName != null && remarkName.Length >= 2)
                    {
                        feed.RemarkName = remarkName.Substring(1, remarkName.Length - 2);
                    }
                }
            }
            else
            {
                Console.WriteLine("第" + i + "条微博中,判断转发类似微博的作者的标准出错!");
            }
            #endregion

            #region Repost reason of the similar repost
            NodeList reFeedReasonNodeList = similarFeedDiv.Children.ExtractAllNodesThatMatch(feedContentFilter, true);
            if (reFeedReasonNodeList.Size() == 1)
            {
                feed.ReFeedReason = GetContentFromChildren(feed, reFeedReasonNodeList[0], false);
            }
            else
            {
                Console.WriteLine("第" + i + "条微博中,判断转发类似微博的转发理由的标准出错!");
            }
            #endregion

            // This is a similar repost, so the original weibo's author and content
            // are supplied by the caller rather than parsed here.
            feed.OriginalAuthor = originalAuthor;
            feed.Content = feedContent;

            // Location the similar repost was sent from
            feed.Location = GetLocationInfo(i, similarFeedDiv);
            // Number of likes
            feed.LikeCount = GetFeedLikeInfo(i, similarFeedDiv);
            // Number of reposts
            feed.ReFeedCount = GetFeedForwardCount(i, similarFeedDiv);
            // Number of comments
            feed.CommentCount = GetFeedCommentCount(i, similarFeedDiv);
            // Time the similar repost was sent
            feed.Time = GetFeedTimeInfo(i, similarFeedDiv);
            // How (from what client/device) the similar repost was sent
            feed.Device = GetFeedSendTypeInfo(i, similarFeedDiv);

            return feed;
        }
		/// <summary>
		/// Create a tag that mirrors an existing one: the new node takes the
		/// page, begin/end positions and attributes of <paramref name="tag"/>.
		/// </summary>
		/// <param name="tag">The tag to emulate.</param>
		/// <param name="scanner">The scanner to associate with the new tag.</param>
		public TagNode(TagNode tag, TagScanner scanner)
			: this(tag.Page, tag.TagBegin, tag.TagEnd, tag.AttributesEx)
		{
			this.ThisScanner = scanner;
		}
// Example no. 4 (snippet separator left over from extraction; counter: 0)
        /// <summary>
        /// Given the root nodes, fills in the repost-related information of a weibo:
        /// the original author, the original content, and this weibo's own text
        /// as the repost reason.
        /// </summary>
        /// <param name="i">Serial number of the weibo within its page (used in diagnostics).</param>
        /// <param name="feed">The <c>Feed</c> instance that stores this weibo.</param>
        /// <param name="reFeedDiv">The div tag that contains the reposted (original) weibo.</param>
        /// <param name="feedDiv">The div tag that contains this weibo.</param>
        private void GetReFeedInfo(int i, Feed feed, TagNode reFeedDiv, TagNode feedDiv)
        {
            // Mark the feed as a repost.
            feed.ReFeedOrNot = true;
            // Tracks whether the "reposted weibo has been deleted" case was hit.
            bool reFeedIsDeleted = false;

            #region Original weibo author
            NodeList reFeedOriginalAuthorNodeList = reFeedDiv.Children.ExtractAllNodesThatMatch(reFeedAuthorFilter, true);
            if (reFeedOriginalAuthorNodeList.Size() == 1)
            {
                INode reFeedOriginalAuthorNode = reFeedOriginalAuthorNodeList[0];
                // The link tag holding the original author is not at a fixed position
                // inside the matched div (empty text nodes sometimes appear), so scan
                // the children and take the first one that is exactly an ATag.
                for (int j = 0; j < reFeedOriginalAuthorNode.Children.Size(); j++)
                {
                    INode reFeedOriginalAuthorCandidate = reFeedOriginalAuthorNode.Children[j];
                    if (reFeedOriginalAuthorCandidate.GetType().Equals(typeof(ATag)))
                    {
                        feed.OriginalAuthor = ((ATag)reFeedOriginalAuthorCandidate).GetAttribute("TITLE");
                        break;
                    }
                }
            }
            else
            {
                NodeList deletedFeedList = reFeedDiv.Children.ExtractAllNodesThatMatch(refeedDeletedFilter1, true);
                if (deletedFeedList.Size() > 0)
                {
                    reFeedIsDeleted = true;
                    feed.OriginalAuthor = "Unknown";
                }
                else
                {
                    Console.WriteLine("第" + i + "条微博中,判断转发微博作者的标准出错!");
                }
            }
            #endregion

            #region Original weibo content
            NodeList reFeedContentNodeList = reFeedDiv.Children.ExtractAllNodesThatMatch(reFeedContentFilter, true);
            if (reFeedContentNodeList.Size() == 1)
            {
                // The node type of the matched element is not known for certain
                // (an em node on some pages, apparently absent on personal home
                // pages), so hand the node over and let the helper walk its children.
                feed.Content = GetContentFromChildren(feed, reFeedContentNodeList[0], false);
            }
            else
            {
                if (reFeedIsDeleted)
                {
                    feed.Content = "微博已删除";
                }
                else
                {
                    Console.WriteLine("第" + i + "条微博中,判断转发微博内容的标准出错!");
                }
            }
            #endregion

            #region This weibo's content, used as the repost reason
            NodeList reFeedReasonNodeList = feedDiv.Children.ExtractAllNodesThatMatch(feedContentFilter, true);
            // Note: when the weibo also carries "X more reposts of the original weibo",
            // the list holds (1 + similarFeedCount) entries; the first one (index 0) is
            // the one that belongs to this weibo, so only "at least one" is required.
            if (reFeedReasonNodeList.Size() > 0)
            {
                feed.ReFeedReason = GetContentFromChildren(feed, reFeedReasonNodeList[0], false);
            }
            else
            {
                // Report the miss like the other sections instead of indexing an
                // empty list. (Message: "criterion for the repost reason failed".)
                Console.WriteLine("第" + i + "条微博中,判断转发理由的标准出错!");
            }
            #endregion
        }