Example #1
        /// <summary> Finds all text nodes, however deeply embedded they might be,
        /// and returns them. The text nodes retain links to their parents, so
        /// further navigation is possible.
        /// </summary>
        /// <param name="searchText">The text to search for.
        /// </param>
        /// <returns> The array of text nodes found, collected recursively.
        /// </returns>
        public virtual IText[] DigupStringNode(System.String searchText)
        {
            NodeList nodeList    = SearchFor(searchText);
            NodeList stringNodes = new NodeList();

            for (int i = 0; i < nodeList.Size(); i++)
            {
                INode node = nodeList.ElementAt(i);
                if (node is IText)
                {
                    stringNodes.Add(node);
                }
                else if (node is CompositeTag)
                {
                    CompositeTag ctag  = (CompositeTag)node;
                    IText[]      nodes = ctag.DigupStringNode(searchText);
                    for (int j = 0; j < nodes.Length; j++)
                    {
                        stringNodes.Add(nodes[j]);
                    }
                }
            }
            IText[] stringNode = new IText[stringNodes.Size()];
            for (int i = 0; i < stringNode.Length; i++)
            {
                stringNode[i] = (IText)stringNodes.ElementAt(i);
            }
            return(stringNode);
        }
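A minimal usage sketch (the parser setup mirrors Example #2 below; the HTML string, the TagNameFilter, and the cast are illustrative, and DigupStringNode is assumed to be defined on CompositeTag as shown above):

        Parser parser = new Parser();
        parser.InputHTML = "<div><p>hello <b>world</b></p></div>";
        NodeList divs = parser.Parse(new TagNameFilter("div"));
        if (divs.Size() > 0 && divs.ElementAt(0) is CompositeTag)
        {
            CompositeTag div = (CompositeTag)divs.ElementAt(0);
            // Collect every text node containing "world", however deeply nested.
            IText[] matches = div.DigupStringNode("world");
            for (int i = 0; i < matches.Length; i++)
            {
                Console.WriteLine(matches[i].GetText());
            }
        }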
Example #2
        public List <String> start(String htmlContent)
        {
            Parser parser = new Parser();

            parser.InputHTML = htmlContent;
            NodeList      nodelist = parser.Parse(nodefilter);
            int           size     = nodelist.Size();
            List <String> results  = new List <String>();

            for (int i = 0; i < size; i++)
            {
                INode node = nodelist.ElementAt(i);
                if (node is ITag)   // only tag nodes carry the attributes and children read below
                {
                    ITag tag = (ITag)node;
                    if (needValue == "href")
                    {
                        results.Add(tag.GetAttribute(needValue));
                    }
                    else
                    {
                        results.Add(tag.FirstChild.GetText());
                    }
                }
            }
            return(results);
        }
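The method above reads two fields that are not shown: nodefilter (which tags to parse) and needValue (which attribute to read). A usage sketch, assuming the surrounding class wires them up as an anchor/href extractor (the host class name and field setup are hypothetical):

        // Hypothetical host class configured with:
        //   nodefilter = new TagNameFilter("a");  needValue = "href";
        var extractor = new LinkExtractor();
        List<String> hrefs = extractor.start("<a href=\"http://example.com\">home</a>");
        // hrefs[0] == "http://example.com"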
Example #3
        public System.Collections.Specialized.StringCollection Extract()
        {
            String strPattern = "([A-Za-z0-9](([_\\.\\-]?[a-zA-Z0-9]+)*)@([A-Za-z0-9]+)(([\\.\\-]?[a-zA-Z0-9]+)*)\\.([A-Za-z]{2,}))";

            System.Collections.Specialized.StringCollection collAddresses = new System.Collections.Specialized.StringCollection();
            INodeFilter filter = new RegexFilter(strPattern, true);
            NodeList    nodes  = m_obParser.Parse(filter);

            if (null != nodes &&
                0 != nodes.Size())
            {
                RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Multiline;
                Regex        obRegex = new Regex(strPattern, options);
                for (Int32 i = 0; i < nodes.Size(); i++)
                {
                    INode  obNode  = nodes.ElementAt(i);
                    String strText = obNode.GetText();

                    Match m = obRegex.Match(strText);
                    while (m.Success)
                    {
                        collAddresses.Add(m.Groups[0].Value);
                        // Advance to the next match.
                        m = m.NextMatch();
                    }
                }
            }

            return(collAddresses);
        }
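The Match/NextMatch loop above is the standard pattern for walking every occurrence of a regex. The same loop standalone, with a simplified (illustrative, not the production) e-mail pattern:

        string text = "contact alice@example.com or bob@example.org";
        Regex  re   = new Regex(@"[A-Za-z0-9._-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}");
        for (Match m = re.Match(text); m.Success; m = m.NextMatch())
        {
            Console.WriteLine(m.Value);   // prints alice@example.com, then bob@example.org
        }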
Example #4
        private List <Centroid> initialCentroidRandom(int k, NodeList <BuildAction> observations)
        {
            List <Centroid> centroids = new List <Centroid>();

            while (k > 0)
            {
                // rand.Next's upper bound is exclusive, so pass Count (not Count - 1)
                // to keep the last observation reachable.
                centroids.Add(new Centroid(observations.ElementAt(rand.Next(observations.Count))));
                k--;
            }
            return(centroids);
        }
Example #5
        public static void RemoveMeaninglessNodes(this NodeList list)
        {
            // Iterate backwards: removing while walking forwards would shift the
            // next node into the current index and skip it.
            for (int i = list.Count - 1; i >= 0; i--)
            {
                INode node = list.ElementAt(i);

                if (node is TextNode && node.ToPlainTextStringEx().Trim().Equals(""))
                {
                    list.Remove(node);
                }
            }
        }
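A usage sketch (hypothetical HTML; the whitespace-only text nodes between the tr tags are exactly the "meaningless" nodes this extension removes):

        Parser parser = new Parser();
        parser.InputHTML = "<table>\n  <tr><td>a</td></tr>\n  <tr><td>b</td></tr>\n</table>";
        NodeList tables = parser.Parse(new TagNameFilter("table"));
        if (tables.Count > 0)
        {
            NodeList children = tables.ElementAt(0).Children;
            children.RemoveMeaninglessNodes();   // drops the "\n  " text nodes between rows
        }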
Example #6
        public RJOutline(String docs)
        {
            HTMLParser p = HTMLParser.GetByHTML(docs);

            NodeList nodes = p.GetFirstNode("id", "work_outline").Children;

            nodes.KeepAllNodesThatMatch(new TagNameFilter("tr"));

            for (int i = 0; i < nodes.Count; i++)
            {
                INode node = nodes.ElementAt(i);

                if (node != null)
                {
                    node.Children.RemoveMeaninglessNodes();
                    this.data.Add(node.FirstChild.ToPlainTextStringEx().Trim(), node.LastChild.ToDividedTextString(" ").TrimAll());
                }
            }
        }
Example #7
        private void DeleteExecute(object obj)
        {
            int index = NodeList.IndexOf(selectValue);

            NodeList.Remove(selectValue);
            if (NodeList.Count <= 0)
            {
                return;
            }

            if (index > 0)
            {
                SelectValue = NodeList.ElementAt(--index);
            }
            else
            {
                SelectValue = NodeList.ElementAt(0);
            }
        }
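The selection rule above (prefer the item before the deleted one, otherwise the first survivor) in isolation, as a sketch against a plain list (names illustrative):

        List<string> items    = new List<string> { "a", "b", "c" };
        string       selected = "b";
        int          index    = items.IndexOf(selected);

        items.Remove(selected);
        if (items.Count > 0)
        {
            // Prefer the predecessor of the deleted item; fall back to the first.
            selected = index > 0 ? items[index - 1] : items[0];   // here: "a"
        }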
Example #8
 /// <summary>
 /// Second page type: the data is formatted as text.
 /// </summary>
 /// <returns></returns>
 public static string[] GetDataFromText(string link)
 {
     // Array holding the parsed data items.
     string[] tempData = new string[6];
     try
     {
         // Second-pass filter: the <div class="nrl"> element.
         NodeFilter secondFilter   = new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "nrl"));
         NodeList   secondNodeList = GetNodeList(link, secondFilter);
         // Children of the target div.
         secondNodeList = secondNodeList.ElementAt(0).Children;
         // Parse out the individual data items.
         TableTag targetTable = (TableTag)secondNodeList[0];
         // All rows of the table.
         TableRow[] tr = targetTable.Rows;
         // Title.
         tempData[0] = tr[0].ToPlainTextString().Trim();
         // Article source and update time.
         string temp   = tr[1].ToPlainTextString().Trim();
         int    index1 = temp.IndexOf(':');
         int    index2 = temp.LastIndexOf(':');
         tempData[1] = temp.Substring(index1 + 1, 3);
         temp        = temp.Substring(index2 + 1).Trim();
         int year, month, date;
         year        = temp.LastIndexOf('年');
         month       = temp.LastIndexOf('月');
         date        = temp.LastIndexOf('日');
         // Rebuild "yyyy年MM月dd日" as "yyyy-MM-dd".
         tempData[2] = temp.Substring(0, year) + "-" + temp.Substring(year + 1, month - year - 1)
                       + "-" + temp.Substring(month + 1, date - month - 1);
         // Page URL.
         tempData[3] = link;
         // Water-level table.
         TableColumn[] tc = tr[3].Columns;
         tempData[4] = tc[0].ChildAt(2).ToHtml();
     }
     catch (Exception ex)
     {
         Console.WriteLine("Second-level parsing failed: " + ex.Message);
         tempData[5] = "failed";
     }
     return(tempData);
 }
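The three LastIndexOf calls and the substring arithmetic above are fragile; a capture-group regex expresses the same "yyyy年MM月dd日" to "yyyy-MM-dd" rewrite more directly (a standalone sketch, not part of the original code):

     string raw = "2012年3月15日";
     Match  dm  = Regex.Match(raw, @"(\d{4})年(\d{1,2})月(\d{1,2})日");
     string iso = dm.Success
         ? dm.Groups[1].Value + "-" + dm.Groups[2].Value + "-" + dm.Groups[3].Value
         : "";
     // iso == "2012-3-15"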
Example #9
        /// <summary>
        /// Gets every hyperlink from the HTML (which must be wrapped in &lt;html&gt;),
        /// automatically filtering out invalid URLs such as "javascript:;".
        /// </summary>
        /// <param name="html">The HTML code to filter.</param>
        /// <param name="model">Link filter settings: the prefix/suffix to wrap around each link, the strings a link must contain, and an optional regex.</param>
        /// <returns></returns>
        public string GetAllLink(string html, M_Collect_ListFilter model)
        {
            string   list     = "";
            NodeList nodeList = GetTagList(html, "A");

            for (int i = 0; i < nodeList.Size(); i++)
            {
                ATag   link = (ATag)nodeList.ElementAt(i);
                string href = link.GetAttribute("href");
                if (string.IsNullOrEmpty(href) || href.ToLower().IndexOf("javascript") > -1)
                {
                    continue;
                }
                // The link must contain one of the specified (pipe-separated) strings.
                if (!string.IsNullOrEmpty(model.CharContain) && !href.Contains(model.CharContain))
                {
                    bool flag = false;
                    foreach (string start in model.CharContain.Split("|".ToCharArray(), StringSplitOptions.RemoveEmptyEntries))
                    {
                        if (href.Contains(start))
                        {
                            flag = true; break;
                        }
                    }
                    if (flag == false)
                    {
                        continue;
                    }
                }
                if (!string.IsNullOrEmpty(model.CharRegex) && !Regex.IsMatch(href, model.CharRegex))
                {
                    continue;
                }
                list += (model.FillStart + link.GetAttribute("href") + model.FillEnd) + "\n";
            }
            return(list);
        }
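A usage sketch, assuming M_Collect_ListFilter exposes the four members read above (CharContain, CharRegex, FillStart, FillEnd); the instance names and values are illustrative:

        M_Collect_ListFilter model = new M_Collect_ListFilter();
        model.CharContain = "news|article";   // keep links containing either word
        model.CharRegex   = "";               // no extra regex constraint
        model.FillStart   = "<li>";
        model.FillEnd     = "</li>";
        // collector is an instance of the class defining GetAllLink (hypothetical).
        string links = collector.GetAllLink("<a href=\"/news/1.html\">n</a><a href=\"javascript:;\">x</a>", model);
        // links == "<li>/news/1.html</li>\n"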
Example #11
        public INode GetFirstNode(String attribute, String regex)
        {
            NodeList list = GetNodes(attribute, regex);

            // Guard against an empty result; ElementAt(0) on an empty list would throw.
            return((null != list && list.Size() > 0) ? list.ElementAt(0) : null);
        }
Example #12
        /// <summary> Collect the children.
        /// <p>An initial test is performed for an empty XML tag, in which case
        /// the start tag and end tag of the returned tag are the same and it has
        /// no children.<p>
        /// If it's not an empty XML tag, the lexer is repeatedly asked for
        /// subsequent nodes until an end tag is found or a node is encountered
        /// that matches the tag ender set or end tag ender set.
        /// In the latter case, a virtual end tag is created.
        /// Each node found that is not the end tag is added to
        /// the list of children. The end tag is special and not a child.<p>
        /// Nodes that also have a CompositeTagScanner as their scanner are
        /// recursed into, which provides the nested structure of an HTML page.
        /// This method operates in two possible modes, depending on a private boolean.
        /// It can recurse on the JVM stack, which has caused some overflow problems
        /// in the past, or it can use the supplied stack argument to nest scanning
        /// of child tags within itself. The former is left as an option in the code,
        /// mostly to help subsequent modifiers visualize what the internal nesting
        /// is doing.
        /// </summary>
        /// <param name="tag">The tag this scanner is responsible for.
        /// </param>
        /// <param name="lexer">The source of subsequent nodes.
        /// </param>
        /// <param name="stack">The parse stack. May contain pending tags that enclose
        /// this tag.
        /// </param>
        /// <returns> The resultant tag (may be unchanged).
        /// </returns>
        public override ITag Scan(ITag tag, Lexer lexer, NodeList stack)
        {
            INode node;
            ITag  next;

            System.String name;
            IScanner      scanner;
            ITag          ret;

            ret = tag;

            if (ret.EmptyXmlTag)
            {
                ret.SetEndTag(ret);
            }
            else
            {
                do
                {
                    node = lexer.NextNode(false);
                    if (null != node)
                    {
                        if (node is ITag)
                        {
                            next = (ITag)node;
                            name = next.TagName;
                            // check for normal end tag
                            if (next.IsEndTag() && name.Equals(ret.TagName))
                            {
                                ret.SetEndTag(next);
                                node = null;
                            }
                            else if (IsTagToBeEndedFor(ret, next))
                            // check DTD
                            {
                                // backup one node. insert a virtual end tag later
                                lexer.Position = next.StartPosition;
                                node           = null;
                            }
                            else if (!next.IsEndTag())
                            {
                                // now recurse if there is a scanner for this type of tag
                                scanner = next.ThisScanner;
                                if (null != scanner)
                                {
                                    if (mUseJVMStack)
                                    {
                                        // JVM stack recursion
                                        node = scanner.Scan(next, lexer, stack);
                                        AddChild(ret, node);
                                    }
                                    else
                                    {
                                        // fake recursion:
                                        if (scanner == this)
                                        {
                                            if (next.EmptyXmlTag)
                                            {
                                                next.SetEndTag(next);
                                                FinishTag(next, lexer);
                                                AddChild(ret, next);
                                            }
                                            else
                                            {
                                                stack.Add(ret);
                                                ret = next;
                                            }
                                        }
                                        else
                                        {
                                            // normal recursion if switching scanners
                                            node = scanner.Scan(next, lexer, stack);
                                            AddChild(ret, node);
                                        }
                                    }
                                }
                                else
                                {
                                    AddChild(ret, next);
                                }
                            }
                            else
                            {
                                if (!mUseJVMStack && !mLeaveEnds)
                                {
                                    // Since all non-end tags are consumed by the
                                    // previous clause, we're here because we have an
                                    // end tag with no opening tag... this could be bad.
                                    // There are two cases...
                                    // 1) The tag hasn't been registered, in which case
                                    // we just add it as a simple child, like its
                                    // opening tag.
                                    // 2) There may be an opening tag further up the
                                    // parse stack that needs closing.
                                    // So, we ask the factory for a node like this one
                                    // (since end tags never have scanners) and see
                                    // if its scanner is a composite tag scanner.
                                    // If it is, we walk up the parse stack looking for
                                    // something that needs this end tag to finish it.
                                    // If there is something, we close off all the tags
                                    // walked over and continue on as if nothing
                                    // happened.
                                    System.Collections.ArrayList attributes = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10));
                                    attributes.Add(new TagAttribute(name, null));
                                    ITag opener = lexer.NodeFactory.CreateTagNode(lexer.Page, next.StartPosition, next.EndPosition, attributes);

                                    scanner = opener.ThisScanner;
                                    if ((null != scanner) && (scanner == this))
                                    {
                                        // uh-oh
                                        int index = -1;
                                        for (int i = stack.Size() - 1; (-1 == index) && (i >= 0); i--)
                                        {
                                            // Short circuit here: assume everything on the stack has this as its scanner;
                                            // we'll need to stop if either of those conditions isn't met.
                                            ITag boffo = (ITag)stack.ElementAt(i);
                                            if (name.Equals(boffo.TagName))
                                            {
                                                index = i;
                                            }
                                            else if (IsTagToBeEndedFor(boffo, next))
                                            {
                                                // check DTD
                                                index = i;
                                            }
                                        }
                                        if (-1 != index)
                                        {
                                            // finish off the current one first
                                            FinishTag(ret, lexer);
                                            AddChild((ITag)stack.ElementAt(stack.Size() - 1), ret);
                                            for (int i = stack.Size() - 1; i > index; i--)
                                            {
                                                ITag fred = (ITag)stack.Remove(i);
                                                FinishTag(fred, lexer);
                                                AddChild((ITag)stack.ElementAt(i - 1), fred);
                                            }
                                            ret  = (ITag)stack.Remove(index);
                                            node = null;
                                        }
                                        else
                                        {
                                            AddChild(ret, next);                                             // default behaviour
                                        }
                                    }
                                    else
                                    {
                                        AddChild(ret, next);                                         // default behaviour
                                    }
                                }
                                else
                                {
                                    AddChild(ret, next);
                                }
                            }
                        }
                        else
                        {
                            AddChild(ret, node);
                            node.DoSemanticAction();
                        }
                    }

                    if (!mUseJVMStack)
                    {
                        // handle coming out of fake recursion
                        if (null == node)
                        {
                            int depth = stack.Size();
                            if (0 != depth)
                            {
                                node = stack.ElementAt(depth - 1);
                                if (node is ITag)
                                {
                                    ITag precursor = (ITag)node;
                                    scanner = precursor.ThisScanner;
                                    if (scanner == this)
                                    {
                                        stack.Remove(depth - 1);
                                        FinishTag(ret, lexer);
                                        AddChild(precursor, ret);
                                        ret = precursor;
                                    }
                                    else
                                    {
                                        node = null;                                         // normal recursion
                                    }
                                }
                                else
                                {
                                    node = null;                                     // normal recursion
                                }
                            }
                        }
                    }
                }while (null != node);
            }

            FinishTag(ret, lexer);

            return(ret);
        }
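The "fake recursion" branch above trades call-stack recursion for the explicit stack parameter, which is what protects deeply nested documents from stack overflow. The same transformation in miniature, on a generic tree (the Node type with a Children collection is hypothetical, not this library's API):

        void VisitIteratively(Node root)
        {
            var stack = new Stack<Node>();   // System.Collections.Generic
            stack.Push(root);
            while (stack.Count > 0)
            {
                Node current = stack.Pop();
                // ... process current here ...
                foreach (Node child in current.Children)
                {
                    stack.Push(child);   // deferred instead of recursed into
                }
            }
        }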
Example #13
        /// <summary>
        /// Function name: ItemRetrival_2
        /// Purpose: extracts the post URLs, titles and times from a post-list page.
        /// Parameter string url: the URL of the post-list page.
        /// Parameter ref Encoding encode: receives the page's character encoding.
        /// Parameters ref List&lt;string&gt; listUrl, listTitle, listTime: receive the extracted items.
        /// </summary>
        /// <param name="url"></param>
        /// <param name="encode"></param>
        /// <param name="listurl"></param>
        /// <param name="listtitle"></param>
        /// <param name="listtime"></param>
        public static void ItemRetrival_2(string url, ref Encoding encode, ref List <string> listUrl, ref List <string> listTitle,
                                          ref List <string> listTime)
        {
            // Fetch the page source.
            string rawtext = GetDataFromUrl(url);
            // Strip the irrelevant style, select, script and comment blocks.
            string reg1    = @"<style[\s\S]+?/style>|<select[\s\S]+?/select>|<script[\s\S]+?/script>|<\!\-\-[\s\S]*?\-\->";

            rawtext = new Regex(reg1, RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(rawtext, "");
            // The steps below extract the post time, URL and title from the list page,
            // using htmlparser to locate the target li elements.
            Lexer      lexer     = new Lexer(rawtext);
            Parser     parser    = new Parser(lexer);
            NodeFilter filter    = new TagNameFilter("li");   // parse out the li elements
            NodeList   htmlNodes = parser.Parse(filter);
            // Drop entries that carry no time stamp.
            Regex f2 = new Regex(@"\d\d:\d\d");

            for (int i = htmlNodes.Count - 1; i >= 0; i--)
            {
                if (!f2.IsMatch(htmlNodes[i].ToHtml()))
                {
                    htmlNodes.Remove(i);
                }
            }
            // Extract the post times.
            RegexFilter rf = new RegexFilter(@"\d\d:\d\d");

            for (int i = 0; i < htmlNodes.Count; i++)
            {
                Lexer    lexerTmp  = new Lexer(htmlNodes[i].ToHtml());
                Parser   parserTmp = new Parser(lexerTmp);
                NodeList tmp       = parserTmp.Parse(rf);
                if (tmp.Count > 0)
                {
                    for (int j = 0; j < tmp.Count; j++)
                    {
                        string temp = tmp[j].ToHtml();
                        ModifyRawText(ref temp);
                        listTime.Add(temp);
                    }
                }
            }


            // Extract the post URLs and titles.
            string     atagAssist = htmlNodes.ToHtml();
            Lexer      lex3       = new Lexer(atagAssist);
            Parser     par3       = new Parser(lex3);
            NodeFilter filter3    = new TagNameFilter("a");
            NodeList   atagNodes  = par3.Parse(filter3);
            string     urlpart    = new Regex(@"http://.*?(?=/)").Match(url).Value;

            for (int i = 0; i < atagNodes.Count; i++)
            {
                ATag   link  = (ATag)atagNodes.ElementAt(i);
                string temp1 = link.GetAttribute("href");
                string temp2 = link.StringText;

                if (temp1 != null && !temp1.StartsWith("http"))   // a relative URL: prepend the site root to make it absolute
                {
                    temp1 = urlpart + temp1;
                }
                ModifyRawText(ref temp2);
                listUrl.Add(temp1);
                listTitle.Add(temp2);
            }
        }
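The string concatenation above only handles site-root-relative hrefs; System.Uri resolves any relative reference, including "../" paths, against a base URL (a sketch of that alternative):

        Uri baseUri  = new Uri("http://example.com/forum/list.html");
        Uri resolved = new Uri(baseUri, "../post/123.html");
        // resolved.AbsoluteUri == "http://example.com/post/123.html"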
Example #14
        /// <summary>
        /// Function name: ItemRetrival_1
        /// Purpose: extracts the post URLs, titles and times from a post-list page.
        /// Parameter string url: the URL of the post-list page.
        /// Parameter ref Encoding encode: receives the page's character encoding.
        /// Parameters ref List&lt;string&gt; listUrl, listTitle, listTime: receive the extracted items.
        /// </summary>
        /// <param name="url"></param>
        /// <param name="encode"></param>
        /// <param name="listurl"></param>
        /// <param name="listtitle"></param>
        /// <param name="listtime"></param>
        public static void ItemRetrival_1(string url, ref Encoding encode, ref List <string> listUrl, ref List <string> listTitle,
                                          ref List <string> listTime)
        {
            // Fetch the page source.
            string rawtext = GetDataFromUrl(url);
            // Strip the irrelevant style, select, script and comment blocks.
            string reg1 = @"<style[\s\S]+?/style>|<select[\s\S]+?/select>|<script[\s\S]+?/script>|<\!\-\-[\s\S]*?\-\->";

            rawtext = new Regex(reg1, RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(rawtext, "");


            // Use htmlparser to pull the target tables out of the source.

            Lexer lexer = new Lexer(rawtext);
            // Parse out the table elements.
            Parser     parser    = new Parser(lexer);
            NodeFilter filter    = new TagNameFilter("table");
            NodeList   htmlNodes = parser.Parse(filter);
            // Drop nested tables.
            Regex f1 = new Regex(@"<table.*?>");

            for (int i = htmlNodes.Count - 1; i >= 0; i--)
            {
                MatchCollection myCollection = f1.Matches(htmlNodes[i].ToHtml());
                if (myCollection.Count > 1)
                {
                    htmlNodes.Remove(i);
                }
            }

            // Drop tables with no time stamp; such tables are treated as invalid.
            Regex f2 = new Regex(@"\d\d:\d\d");

            for (int i = htmlNodes.Count - 1; i >= 0; i--)
            {
                if (!f2.IsMatch(htmlNodes[i].ToHtml()))
                {
                    htmlNodes.Remove(i);
                }
            }



            // Parse the three kinds of target information out of what remains.

            string     final      = htmlNodes.ToHtml();
            Lexer      lex2       = new Lexer(final);
            Parser     par2       = new Parser(lex2);
            NodeFilter filter2    = new TagNameFilter("tr");
            NodeList   finalNodes = par2.Parse(filter2);
            // Extract the post times.
            RegexFilter rf = new RegexFilter(@"\d\d:\d\d");

            for (int i = 0; i < finalNodes.Count; i++)
            {
                Lexer    lexerTmp  = new Lexer(finalNodes[i].ToHtml());
                Parser   parserTmp = new Parser(lexerTmp);
                NodeList tmp       = parserTmp.Parse(rf);
                if (tmp.Count > 0)
                {
                    for (int j = 0; j < tmp.Count; j++)
                    {
                        string temp = tmp[j].ToHtml();
                        ModifyRawText(ref temp);
                        listTime.Add(temp);
                    }
                }
            }
            // Extract the post URLs and titles.
            string     atagAssist = finalNodes.ToHtml();
            Lexer      lex3       = new Lexer(atagAssist);
            Parser     par3       = new Parser(lex3);
            NodeFilter filter3    = new TagNameFilter("a");
            NodeList   atagNodes  = par3.Parse(filter3);
            string     urlpart    = new Regex(@"http://.*?(?=/)").Match(url).Value;

            for (int i = 0; i < atagNodes.Count; i++)
            {
                ATag   link  = (ATag)atagNodes.ElementAt(i);
                string temp1 = link.GetAttribute("href");
                string temp2 = link.StringText;

                if (temp1 != null && !temp1.StartsWith("http"))   // a relative URL: prepend the site root to make it absolute
                {
                    temp1 = urlpart + temp1;
                }
                ModifyRawText(ref temp2);
                listUrl.Add(temp1);
                listTitle.Add(temp2);
            }
        }