		/// <summary> Construct an HtmlPage to be built by the given parser.</summary>
		/// <param name="parser">The parser used to build the page.
		/// </param>
		public HtmlPage(Parser parser):base(true)
		{
			title = "";
			nodesInBody = new NodeList();
			tables = new NodeList();
			m_Images = new NodeList();
		}
		/// <summary> Create an abstract node with the page positions given.
		/// Remember the page and the start and end cursor positions.
		/// </summary>
		/// <param name="page">The page this tag was read from.
		/// </param>
		/// <param name="start">The starting offset of this node within the page.
		/// </param>
		/// <param name="end">The ending offset of this node within the page.
		/// </param>
		public AbstractNode(Page page, int start, int end)
		{
			mPage = page;
			nodeBegin = start;
			nodeEnd = end;
			parent = null;
			children = null;
		}
Example #3
        public NodeList GetListUrl(string url)
        {
            Parser parser = ParserHelp.GetParser(url);
            NodeFilter filter = new HasAttributeFilter("class", "list_title");

            // Extract every node whose class attribute is "list_title".
            return parser.ExtractAllNodesThatMatch(filter);
        }
		/// <summary> Search given node and pick up any objects of given type.</summary>
		/// <param name="node">The node to search.
		/// </param>
		/// <param name="type">The class to search for.
		/// </param>
		/// <returns> A node array with the matching nodes.
		/// </returns>
		public static INode[] FindTypeInNode(INode node, System.Type type)
		{
			INodeFilter filter;
			NodeList ret;
			
			ret = new NodeList();
			filter = new NodeClassFilter(type);
			node.CollectInto(ret, filter);
			
			return (ret.ToNodeArray());
		}
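
A short usage sketch for FindTypeInNode, reusing only names that appear in these snippets (ParserHelp.GetParser is the helper shown above; the URL is a placeholder):

    Parser parser = ParserHelp.GetParser("http://example.com/");
    NodeList topLevel = parser.Parse(null);                        // null filter: every top level node
    for (int i = 0; i < topLevel.Count; i++)
    {
        // Pick up all ImageTag nodes nested anywhere under this top level node.
        INode[] images = FindTypeInNode(topLevel[i], typeof(ImageTag));
        Console.WriteLine(images.Length);
    }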
		/// <summary> Scan for script.
		/// Accumulates text from the page, until &lt;/[a-zA-Z] is encountered.
		/// </summary>
		/// <param name="tag">The tag this scanner is responsible for.
		/// </param>
		/// <param name="lexer">The source of CDATA.
		/// </param>
		/// <param name="stack">The parse stack, <em>not used</em>.
		/// </param>
		public override ITag Scan(ITag tag, Lexer lexer, NodeList stack)
		{
			System.String language;
			System.String code;
			INode content;
			int position;
			INode node;
			TagAttribute attribute;
			System.Collections.ArrayList vector;
			
			if (tag is ScriptTag)
			{
				language = ((ScriptTag) tag).Language;
				if ((null != language) && (language.ToUpper().Equals("JScript.Encode".ToUpper()) || language.ToUpper().Equals("VBScript.Encode".ToUpper())))
				{
					code = ScriptDecoder.Decode(lexer.Page, lexer.Cursor);
					((ScriptTag) tag).ScriptCode = code;
				}
			}
			content = lexer.ParseCDATA(!STRICT);
			position = lexer.Position;
			node = lexer.NextNode(false);
			if (null != node)
				if (!(node is ITag) || !(((ITag) node).IsEndTag() && ((ITag) node).TagName.Equals(tag.Ids[0])))
				{
					lexer.Position = position;
					node = null;
				}
			
			// build new end tag if required
			if (null == node)
			{
				attribute = new TagAttribute("/script", null);
				vector = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10));
				vector.Add(attribute);
				node = lexer.NodeFactory.CreateTagNode(lexer.Page, position, position, vector);
			}
			tag.SetEndTag((ITag) node);
			if (null != content)
			{
				tag.Children = new NodeList(content);
				content.Parent = tag;
			}
			node.Parent = tag;
			tag.DoSemanticAction();
			
			return (tag);
		}
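
The branch above synthesizes a virtual end tag when the input never closes the script. A minimal sketch of that effect, assuming Lexer accepts a raw HTML string the way the Navigate snippet further down constructs it (the markup is a placeholder):

    // The input deliberately omits </script>.
    Parser parser = new Parser(new Lexer("<html><body><script>var x = 1;</body></html>"), null);
    NodeList scripts = parser.Parse(new TagNameFilter("SCRIPT"));
    if (scripts.Count > 0)
        Console.WriteLine(scripts.AsHtml());   // the scanner has attached a generated end tag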
Example #6
        public void GetContartInfo(NodeList list, ref Job info)
        {
            // The first node holds the raw description block.
            string miaoshu = list[0].ToString();
            if (string.IsNullOrEmpty(miaoshu))
            {
                return;
            }

            // Strip literal "\t" sequences and all whitespace before matching.
            miaoshu = Regex.Replace(miaoshu, @"(\\t|\s)", "");

            Match company = Regex.Match(miaoshu, @"Txt\(4903\[108\,12\]\,4935\[110\,16\]\)\:\\n(?<company>\w*)\\n...End", RegexOptions.Multiline);
            if (company.Success)
            {
                // Read the named group; company.Value alone would include the whole matched prefix.
                info.company = company.Groups["company"].Value;
            }
        }
		/// <summary> Get the next node.</summary>
		/// <returns> The next node in the HTML stream, or null if there are no more nodes.
		/// </returns>
		/// <exception cref="ParserException">If an unrecoverable error occurs.
		/// </exception>
		public virtual INode NextNode()
		{
			ITag tag;
			IScanner scanner;
			NodeList stack;
			INode ret;
			
			try
			{
				ret = mLexer.NextNode();
				if (null != ret)
				{
					// kick off recursion for the top level node
					if (ret is ITag)
					{
						tag = (ITag) ret;
						if (!tag.IsEndTag())
						{
							// now recurse if there is a scanner for this type of tag
							scanner = tag.ThisScanner;
							if (null != scanner)
							{
								stack = new NodeList();
								ret = scanner.Scan(tag, mLexer, stack);
							}
						}
					}
				}
			}
			catch (ParserException pe)
			{
				throw pe; // no need to wrap an existing ParserException
			}
			catch (System.Exception e)
			{
				System.Text.StringBuilder msgBuffer = new System.Text.StringBuilder();
				msgBuffer.Append("Unexpected Exception occurred while reading ");
				msgBuffer.Append(mLexer.Page.Url);
				msgBuffer.Append(", in nextNode");
				// TODO: appendLineDetails (msgBuffer);
				ParserException ex = new ParserException(msgBuffer.ToString(), e);
				mFeedback.Error(msgBuffer.ToString(), ex);
				throw ex;
			}
			
			return (ret);
		}
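
A minimal pump loop for this NextNode(), assuming the iterator is obtained through Parser.Elements() as the other methods in this collection do (the HTML string is a placeholder):

    Parser parser = new Parser(new Lexer("<html><body><a href=\"/x\">x</a></body></html>"), null);
    for (INodeIterator e = parser.Elements(); e.HasMoreNodes(); )
    {
        INode node = e.NextNode();               // composite tags come back fully scanned
        Console.WriteLine(node.ToPlainTextString());
    }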
Example #8
        /// <summary>
        /// Convert relative img paths in the HTML into complete, absolute image URLs.
        /// </summary>
        /// <param name="html">The HTML content to convert.</param>
        /// <param name="url">The site root to prepend, e.g. http://www.z01.com</param>
        /// <returns>The HTML with image URLs rewritten.</returns>
        public string ConvertImgUrl(string html, string url)
        {
            if (string.IsNullOrEmpty(html) || string.IsNullOrEmpty(url))
            {
                return(html);
            }
            HtmlPage page = GetPage("<html><body>" + html + "</body></html>");

            Winista.Text.HtmlParser.Util.NodeList nodes = page.Body.ExtractAllNodesThatMatch(new TagNameFilter("IMG"), true);
            for (int i = 0; i < nodes.Count; i++)
            {
                ImageTag image = (ImageTag)nodes[i];
                if (!image.ImageURL.ToLower().Contains("://"))
                {
                    image.ImageURL = url.TrimEnd('/') + ("/" + image.ImageURL.TrimStart('/'));
                }
            }
            return(page.Body.ToHtml());
        }
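
A quick usage sketch for ConvertImgUrl (the fragment and site root are illustrative values only):

    string fragment = "<p><img src=\"/upload/logo.gif\" /><img src=\"http://cdn.example.com/a.png\" /></p>";
    string fixedHtml = ConvertImgUrl(fragment, "http://www.z01.com/");
    // The relative src becomes http://www.z01.com/upload/logo.gif;
    // the absolute one already contains "://" and is left untouched.
    Console.WriteLine(fixedHtml);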
Example #9
        public Job GetDetail(string url)
        {
            Job info = new Job();

            Parser parser = ParserHelp.GetParser(url);

            NodeFilter miaoShu = new HasAttributeFilter("id", "miaoshu");
            NodeFilter mainBox = new HasAttributeFilter("class", "mainBox");
            NodeFilter orfilter = new OrFilter(miaoShu, mainBox);

            NodeList list = new NodeList();
            list = parser.Parse(orfilter);
            if (list == null || list.Count < 2)
            {
                return info;
            }

            GetMiaoShu(list, ref info);
            GetContartInfo(list, ref info);

            return info;
        }
		/// <summary> Scan for style definitions.
		/// Accumulates text from the page, until &lt;/[a-zA-Z] is encountered.
		/// </summary>
		/// <param name="tag">The tag this scanner is responsible for.
		/// </param>
		/// <param name="lexer">The source of CDATA.
		/// </param>
		/// <param name="stack">The parse stack, <em>not used</em>.
		/// </param>
		public override ITag Scan(ITag tag, Lexer lexer, NodeList stack)
		{
			INode content;
			int position;
			INode node;
			TagAttribute attribute;
			System.Collections.ArrayList vector;
			
			content = lexer.ParseCDATA();
			position = lexer.Position;
			node = lexer.NextNode(false);
			if (null != node)
				if (!(node is ITag) || !(((ITag) node).IsEndTag() && ((ITag) node).TagName.Equals(tag.Ids[0])))
				{
					lexer.Position = position;
					node = null;
				}
			
			// build new end tag if required
			if (null == node)
			{
				attribute = new TagAttribute("/style", null);
				vector = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10));
				vector.Add(attribute);
				node = lexer.NodeFactory.CreateTagNode(lexer.Page, position, position, vector);
			}
			tag.SetEndTag((ITag) node);
			if (null != content)
			{
				tag.Children = new NodeList(content);
				content.Parent = tag;
			}
			node.Parent = tag;
			tag.DoSemanticAction();
			
			return (tag);
		}
Example #11
 static void GetSubtitleFromHtml(NodeList nodeList, string subtitleType)
 {
     Console.WriteLine(subtitleType);
     StreamWriter writer = new StreamWriter(subtitleType + ".txt");
     for (int i = 0; i < nodeList.Count; i++)
     {
         INode currentNode = (INode)nodeList[i];
         while (currentNode.NextSibling != null && !currentNode.NextSibling.GetType().Equals(typeof(TableColumn)))
         {
             currentNode = currentNode.NextSibling;
         }
         if (currentNode.NextSibling != null)
         {
             TableColumn contentNode = (TableColumn)currentNode.NextSibling;
             string content = "";
             NodeList childrenNode = contentNode.Children;
             if (childrenNode != null)
             {
                 for (int j = 0; j < childrenNode.Count; j++)
                 {
                     if (childrenNode[j].GetText().Equals("br"))
                     {
                         writer.WriteLine(content);
                         content = "";
                         continue;
                     }
                     string tmpStr = HttpUtility.HtmlDecode(childrenNode[j].ToPlainTextString());
                     tmpStr = tmpStr.Trim();
                     content += tmpStr;
                 }
                 //TableColumn speakerNode=(TableColumn)colorCell.NextSibling.NextSibling;
                 writer.WriteLine(content);
             }
         }
     }
     writer.Close();
 }
		/// <summary> Collect this node and its child nodes (if-applicable) into the collectionList parameter, provided the node
		/// satisfies the filtering criteria.<P>
		/// 
		/// This mechanism allows powerful filtering code to be written very easily,
		/// without bothering about collection of embedded tags separately.
		/// e.g. when we try to get all the links on a page, it is not possible to
		/// get it at the top-level, as many tags (like form tags), can contain
		/// links embedded in them. We could get the links out by checking if the
		/// current node is a <see cref="CompositeTag"></see>, and going through its children.
		/// So this method provides a convenient way to do this.<P>
		/// 
		/// Using collectInto(), programs get a lot shorter. Now, the code to
		/// extract all links from a page would look like:
		/// <pre>
		/// NodeList collectionList = new NodeList();
		/// NodeFilter filter = new TagNameFilter ("A");
		/// for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
		/// e.nextNode().collectInto(collectionList, filter);
		/// </pre>
		/// Thus, collectionList will hold all the link nodes, irrespective of how
		/// deep the links are embedded.<P>
		/// 
		/// Another way to accomplish the same objective is:
		/// <pre>
		/// NodeList collectionList = new NodeList();
		/// NodeFilter filter = new TagClassFilter (LinkTag.class);
		/// for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
		/// e.nextNode().collectInto(collectionList, filter);
		/// </pre>
		/// This is slightly less specific because the LinkTag class may be
		/// registered for more than one node name, e.g. &lt;LINK&gt; tags too.
		/// </summary>
		/// <param name="list">The node list to collect acceptable nodes into.
		/// </param>
		/// <param name="filter">The filter to determine which nodes are retained.
		/// </param>
		public virtual void CollectInto(NodeList list, INodeFilter filter)
		{
			if (filter.Accept(this))
			{
				list.Add(this);
			}
		}
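
The <pre> examples in the summary above keep the original Java-style spelling. Under the names this port uses elsewhere in these snippets, the same link-collecting loop would look roughly like this (ParserHelp.GetParser is the helper from the earlier snippets, the URL is a placeholder, and TagNameFilter is assumed to satisfy INodeFilter as the other calls suggest):

    Parser parser = ParserHelp.GetParser("http://example.com/");
    NodeList collectionList = new NodeList();
    INodeFilter filter = new TagNameFilter("A");
    for (INodeIterator e = parser.Elements(); e.HasMoreNodes(); )
        e.NextNode().CollectInto(collectionList, filter);
    // collectionList now holds every link node, however deeply it is nested.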
Example #13
		/// <summary> Filter the list with the given filter.</summary>
		/// <param name="filter">The filter to use.
		/// </param>
		/// <param name="recursive">If <code>true<code> digs into the children recursively.
		/// </param>
		/// <returns> A new node array containing the nodes accepted by the filter.
		/// This is a linear list and preserves the nested structure of the returned
		/// nodes only.
		/// </returns>
		public virtual NodeList ExtractAllNodesThatMatch(NodeFilter filter, bool recursive)
		{
			INode node;
			NodeList children;
			NodeList ret;
			
			ret = new NodeList();
			for (int i = 0; i < m_iSize; i++)
			{
				node = nodeData[i];
				if (filter.Accept(node))
					ret.Add(node);
				if (recursive)
				{
					children = node.Children;
					if (null != children)
						ret.Add(children.ExtractAllNodesThatMatch(filter, recursive));
				}
			}
			
			return (ret);
		}
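
A short sketch of the recursive flag, reusing types from the snippets above (the URL is a placeholder and ParserHelp.GetParser is the helper shown earlier):

    NodeList topLevel = ParserHelp.GetParser("http://example.com/").Parse(null);
    NodeFilter images = new NodeClassFilter(typeof(ImageTag));
    NodeList everywhere = topLevel.ExtractAllNodesThatMatch(images, true);    // also digs into each node's children
    NodeList directOnly = topLevel.ExtractAllNodesThatMatch(images, false);   // checks only the listed nodes
    Console.WriteLine("{0} vs {1}", everywhere.Count, directOnly.Count);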
Example #14
        private void ParsePorductDescribe(NodeList nodes)
        {
            NodeFilter miao = new HasAttributeFilter("class", "miao");
            NodeList miaoArea = nodes.ExtractAllNodesThatMatch(miao, true);

            NodeFilter pictures = new NodeClassFilter(typeof(ImageTag));
            NodeList pictureNodes = miaoArea.ExtractAllNodesThatMatch(pictures, true);

            DownloadPictures(pictureNodes);

            string miaoshu = miaoArea.AsHtml();
            miaoshu = Regex.Replace(miaoshu, @"http\://(www\.|)rrxf\.cn/", pictureURL + "/", RegexOptions.IgnoreCase);
            miaoshu = Regex.Replace(miaoshu, @"(pic|bigpic)/", "$1_", RegexOptions.IgnoreCase);
            miaoshu = miaoshu.Replace("-", "_");

            Console.WriteLine(miaoshu);
        }
Example #15
        public virtual bool NodePushRangeChildren()
        {
            NodeList nl = m_node.Children;
            nl = nl.ExtractAllNodesThatMatch(AndFilter.TrueFilter,true);
            if (nl.Count > 0)
            {
                m_nodestack.Push(m_node);
                m_nodestack.Push(m_nodeenum);
                m_nodestack.Push(m_nodelist);

                m_nodelist = nl;
                m_nodeenum = m_nodelist.Elements();
                m_node = null;
                return true;
            }
            else
                return false;
        }
Example #16
        public virtual bool NodePushRangeAll()
        {
            NodeList nl = m_parser.Parse(AndFilter.TrueFilter);
            if (nl.Count > 0)
            {
                m_nodestack.Push(m_node);
                m_nodestack.Push(m_nodeenum);
                m_nodestack.Push(m_nodelist);

                m_nodelist = nl;
                m_nodeenum = m_nodelist.Elements();
                m_node = null;
                return true;
            }
            else
                return false;
        }
Example #17
 public virtual void Navigate(string url)
 {
     try
     {
         m_parser = new Parser(new Lexer(getHtml(url,null)),null);
         m_nodestack.Clear();
         m_node = null;
         m_nodeenum = null;
         m_nodelist = null;
         m_url = url;
         //m_parser.InputHTML = getHtml(url, null);
         //m_parser.URL = url;
         //m_parser.AnalyzePage();
     }
     catch (Exception e)
     {
         log.Error("Navigate: "+url, e);
     }
 }
Example #18
		/// <summary> Extract all nodes matching the given filter.</summary>
		/// <param name="filter">The filter to be applied to the nodes.
		/// </param>
		/// <throws>  ParserException If a parse error occurs. </throws>
		/// <returns> A list of nodes matching the filter criteria,
		/// i.e. for which the filter's accept method
		/// returned <code>true</code>.
		/// </returns>
		public virtual NodeList ExtractAllNodesThatMatch(INodeFilter filter)
		{
			INodeIterator e;
			NodeList ret;
			
			ret = new NodeList();
			for (e = Elements(); e.HasMoreNodes(); )
				e.NextNode().CollectInto(ret, filter);			
			return (ret);
		}
Example #19
		/// <summary> Parse the given resource, using the filter provided.
		/// This can be used to extract information from specific nodes.
		/// When used with a <code>null</code> filter it returns an
		/// entire page which can then be modified and converted back to HTML
		/// (Note: the synthesis use-case is not handled very well; the parser
		/// is more often used to extract information from a web page).
		/// <p>For example, to replace the entire contents of the HEAD with a
		/// single TITLE tag you could do this:
		/// <pre>
		/// NodeList nl = parser.parse (null); // here is your two node list
		/// NodeList heads = nl.extractAllNodesThatMatch (new TagNameFilter ("HEAD"))
		/// if (heads.size () > 0) // there may not be a HEAD tag
		/// {
		/// Head head = heads.elementAt (0); // there should be only one
		/// head.removeAll (); // clean out the contents
		/// Tag title = new TitleTag ();
		/// title.setTagName ("title");
		/// title.setChildren (new NodeList (new TextNode ("The New Title")));
		/// Tag title_end = new TitleTag ();
		/// title_end.setTagName ("/title");
		/// title.setEndTag (title_end);
		/// head.add (title);
		/// }
		/// System.out.println (nl.toHtml ()); // output the modified HTML
		/// </pre>
		/// </p>
		/// </summary>
		/// <returns> The list of matching nodes (for a <code>null</code>
		/// filter this is all the top level nodes).
		/// </returns>
		/// <param name="filter">The filter to apply to the parsed nodes,
		/// or <code>null</code> to retrieve all the top level nodes.
		/// </param>
		/// <throws>  ParserException If a parsing error occurs. </throws>
		public virtual NodeList Parse(INodeFilter filter)
		{
			INodeIterator e;
			INode node;
			NodeList ret;
			
			ret = new NodeList();
			for (e = Elements(); e.HasMoreNodes(); )
			{
				node = e.NextNode();
				if (null != filter)
					node.CollectInto(ret, filter);
				else
					ret.Add(node);
			}
			
			return (ret);
		}
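
A C# rendering of the first half of the example above, restricted to calls that appear elsewhere in these snippets (the HEAD-rewriting half is omitted because this port's TitleTag surface is not shown here; ParserHelp.GetParser and the URL are placeholders as before):

    Parser parser = ParserHelp.GetParser("http://example.com/");
    NodeList nl = parser.Parse(null);                                         // the whole page as top level nodes
    NodeList heads = nl.ExtractAllNodesThatMatch(new TagNameFilter("HEAD"), true);
    if (heads.Count > 0)                                                      // there may not be a HEAD tag
        Console.WriteLine(heads.AsHtml());                                    // round-trip the HEAD back to HTML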
Example #20
        private static void ParseProductTitle(NodeList nodes)
        {
            NodeFilter title = new HasAttributeFilter("class", "prouductx");
            NodeList titleNodes = nodes.ExtractAllNodesThatMatch(title, true);

            Console.WriteLine(titleNodes[0].ToPlainTextString());
        }
Example #21
        private void ParseProductShowPhoto(NodeList nodes)
        {
            NodeFilter show = new HasAttributeFilter("class", "Picture220");
            NodeList showNodes = nodes.ExtractAllNodesThatMatch(show, true);
            ImageTag showTag = showNodes[0] as ImageTag;
            showTag.ImageURL = showTag.ImageURL.Replace("../../", "http://rrxf.cn/");

            Console.WriteLine(showTag.ImageURL);
            DownloadPicture(showTag.ImageURL);
        }
Example #22
 private void ParseProductDemoPhoto(NodeList nodes)
 {
     NodeFilter photo = new HasAttributeFilter("class", "Picture40");
     NodeList photoNodes = nodes.ExtractAllNodesThatMatch(photo, true);
     DownloadPictures(photoNodes);
 }
Example #23
		/// <summary> Add another node list to this one.</summary>
		/// <param name="list">The list to add.
		/// </param>
		public virtual void Add(NodeList list)
		{
			for (int i = 0; i < list.m_iSize; i++)
				Add(list.nodeData[i]);
		}
Example #24
 public virtual bool NodePopRange()
 {   
     m_nodelist = (NodeList)m_nodestack.Pop();
     m_nodeenum = (ISimpleNodeIterator)m_nodestack.Pop();
     m_node = (INode)m_nodestack.Pop();
     return true;
 }
Example #25
			public SimpleNodeIterator(NodeList enclosingInstance)
			{
				InitBlock(enclosingInstance);
			}
Example #26
        public virtual bool NodePushRangeByName(string elementname)
        {
            NodeList nl = m_parser.Parse(new HasAttributeFilter("name", elementname));
            if (nl.Count > 0)
            {
                m_nodestack.Push(m_node);
                m_nodestack.Push(m_nodeenum);
                m_nodestack.Push(m_nodelist);

                m_nodelist = nl;
                m_nodeenum = m_nodelist.Elements();
                m_node = null;
                return true;
            }
            else
                return false;
        }
Example #27
			private void  InitBlock(NodeList enclosingInstance)
			{
				this.m_enclosingInstance = enclosingInstance;
			}
Example #28
        public virtual bool NodePushRangeScripts()
        {
            NodeList nl = m_parser.Parse(new TagNameFilter("SCRIPT"));
            if (nl.Count > 0)
            {
                m_nodestack.Push(m_node);
                m_nodestack.Push(m_nodeenum);
                m_nodestack.Push(m_nodelist);

                m_nodelist = nl;
                m_nodeenum = m_nodelist.Elements();
                m_node = null;
                return true;
            }
            else
                return false;
        }
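
The NodePushRange*/NodePopRange methods form a small navigation stack: a push saves the current range and switches to a new working set, a pop restores the previous one. A hypothetical helper on the same class illustrating the intended pairing (member names are taken from the snippets above; HasMoreNodes()/NextNode() on ISimpleNodeIterator is an assumption mirrored from INodeIterator):

    public virtual void PrintScriptText()
    {
        if (NodePushRangeScripts())                    // push the current range, switch to the SCRIPT tags
        {
            while (m_nodeenum.HasMoreNodes())          // walk the pushed range
            {
                m_node = m_nodeenum.NextNode();
                Console.WriteLine(m_node.ToPlainTextString());
            }
            NodePopRange();                            // restore the previous range
        }
    }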
Example #29
        private void DownloadPictures(NodeList photoNodes)
        {
            List<ImageTag> photos = new List<ImageTag>();

            int length = photoNodes.Count;
            for (int i = 0; i < length; i++)
            {
                ImageTag imgTag = photoNodes[i] as ImageTag;
                imgTag.ImageURL = imgTag.ImageURL.Replace("../../", "http://rrxf.cn/");
                Console.WriteLine(imgTag.ImageURL);

                photos.Add(imgTag);

                DownloadPicture(imgTag.ImageURL);
            }
        }
Example #30
        protected void Dispose(bool Disposing)
        {
            if (!IsDisposed)
            {
                if (Disposing)
                {
                    // Release managed resources.
                    m_parser = null;
                    //if (m_nodelist != null) m_nodelist.Clear();
                    m_nodelist = null;
                    m_node = null;
                    m_nodeenum = null;
                    //m_nodestack.Clear();
                    m_nodestack = null;
                }
                // Release unmanaged resources (none are held here).
            }
            IsDisposed = true;
        }
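
The protected overload above is one half of the standard dispose pattern; a minimal sketch of the public half that usually accompanies it (whether this class declares a finalizer is not shown, so GC.SuppressFinalize is included defensively):

    public void Dispose()
    {
        Dispose(true);                 // release managed state via the overload above
        GC.SuppressFinalize(this);     // nothing left for a finalizer to do
    }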
		/// <summary> Scan the tag.
		/// For this implementation, the only operation is to perform the tag's
		/// semantic action.
		/// </summary>
		/// <param name="tag">The tag to scan.
		/// </param>
		/// <param name="lexer">Provides html page access.
		/// </param>
		/// <param name="stack">The parse stack. May contain pending tags that enclose
		/// this tag.
		/// </param>
		/// <returns> The resultant tag (may be unchanged).
		/// </returns>
		public virtual ITag Scan(ITag tag, Winista.Text.HtmlParser.Lex.Lexer lexer, NodeList stack)
		{
			tag.DoSemanticAction();
			
			return (tag);
		}