/// <summary> Finds text nodes, however deeply embedded they might be, and
/// returns them. Each text node retains links to its parents, so further
/// navigation is possible.
/// </summary>
/// <param name="searchText">The text to search for.</param>
/// <returns> The list of text nodes found (recursively).</returns>
public virtual IText[] DigupStringNode(System.String searchText)
{
    NodeList nodeList = SearchFor(searchText);
    NodeList stringNodes = new NodeList();
    for (int i = 0; i < nodeList.Size(); i++)
    {
        INode node = nodeList.ElementAt(i);
        if (node is IText)
        {
            stringNodes.Add(node);
        }
        else if (node is CompositeTag)
        {
            // Recurse into composite tags to collect nested text matches.
            CompositeTag ctag = (CompositeTag)node;
            IText[] nodes = ctag.DigupStringNode(searchText);
            for (int j = 0; j < nodes.Length; j++)
            {
                stringNodes.Add(nodes[j]);
            }
        }
    }
    IText[] stringNode = new IText[stringNodes.Size()];
    for (int i = 0; i < stringNode.Length; i++)
    {
        stringNode[i] = (IText)stringNodes.ElementAt(i);
    }
    return stringNode;
}
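// Usage sketch (a hypothetical helper, not part of the class above): print the
// text of every node beneath a given CompositeTag that contains "price".
// GetText() is used here as in the other samples in this listing.
private static void PrintMatches(CompositeTag root)
{
    IText[] hits = root.DigupStringNode("price");
    for (int i = 0; i < hits.Length; i++)
    {
        System.Console.WriteLine(hits[i].GetText());
    }
}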
public List<String> start(String htmlContent)
{
    Parser parser = new Parser();
    parser.InputHTML = htmlContent;
    NodeList nodelist = parser.Parse(nodefilter);
    int size = nodelist.Size();
    List<String> results = new List<String>();
    for (int i = 0; i < size; i++)
    {
        INode node = nodelist.ElementAt(i);
        // Every element is an INode, so the original test "node is INode" was
        // always true; the cast below needs an ITag, so test for that instead.
        if (node is ITag)
        {
            ITag tag = (ITag)node;
            if (needValue == "href")
            {
                results.Add(tag.GetAttribute(needValue));
            }
            else
            {
                results.Add(tag.FirstChild.GetText());
            }
        }
    }
    return results;
}
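// Standalone sketch of the same idea (the HTML string is illustrative; the
// Parser/TagNameFilter APIs are those used elsewhere in this listing): collect
// every href on a page without the helper's hidden nodefilter/needValue state.
Parser p = new Parser();
p.InputHTML = "<html><body><a href=\"http://example.com\">x</a></body></html>";
NodeFilter anchorFilter = new TagNameFilter("A");
NodeList anchors = p.Parse(anchorFilter);
for (int i = 0; i < anchors.Size(); i++)
{
    ITag a = (ITag)anchors.ElementAt(i);
    System.Console.WriteLine(a.GetAttribute("href"));
}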
public System.Collections.Specialized.StringCollection Extract()
{
    String strPattern = "([A-Za-z0-9](([_\\.\\-]?[a-zA-Z0-9]+)*)@([A-Za-z0-9]+)(([\\.\\-]?[a-zA-Z0-9]+)*)\\.([A-Za-z]{2,}))";
    System.Collections.Specialized.StringCollection collAddresses = new System.Collections.Specialized.StringCollection();
    INodeFilter filter = new RegexFilter(strPattern, true);
    NodeList nodes = m_obParser.Parse(filter);
    if (null != nodes && 0 != nodes.Size())
    {
        RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Multiline;
        Regex obRegex = new Regex(strPattern, options);
        for (Int32 i = 0; i < nodes.Size(); i++)
        {
            INode obNode = nodes.ElementAt(i);
            String strText = obNode.GetText();
            Match m = obRegex.Match(strText);
            while (m.Success)
            {
                collAddresses.Add(m.Groups[0].Value);
                // Advance to the next match.
                m = m.NextMatch();
            }
        }
    }
    return collAddresses;
}
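// Usage sketch (hypothetical wiring; the original only shows the Extract body,
// so constructing m_obParser from raw HTML is an assumption based on the
// Parser API used in the other samples here):
// m_obParser = new Parser();
// m_obParser.InputHTML = "<p>Contact alice@example.com or bob@example.org</p>";
// StringCollection addresses = Extract();  // both addresses collected
// The two-pass design above first narrows the node set with a RegexFilter,
// then re-runs the same pattern to enumerate every address within each node.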
private List<Centroid> initialCentroidRandom(int k, NodeList<BuildAction> observations)
{
    List<Centroid> centroids = new List<Centroid>();
    while (k > 0)
    {
        // Random.Next(max) is exclusive of max, so pass Count (not Count - 1)
        // or the last observation can never be chosen. Note this samples with
        // replacement, so duplicate centroids are possible.
        centroids.Add(new Centroid(observations.ElementAt(rand.Next(observations.Count))));
        k--;
    }
    return centroids;
}
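// A sampling-without-replacement variant (a sketch, not in the original;
// requires System.Linq): shuffle the candidate indices and take the first k,
// so no observation is picked twice when observations.Count >= k.
private List<Centroid> initialCentroidDistinct(int k, NodeList<BuildAction> observations)
{
    var indices = Enumerable.Range(0, observations.Count)
                            .OrderBy(_ => rand.Next())
                            .Take(k);
    var centroids = new List<Centroid>();
    foreach (int i in indices)
    {
        centroids.Add(new Centroid(observations.ElementAt(i)));
    }
    return centroids;
}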
public static void RemoveMeaninglessNodes(this NodeList list)
{
    // Iterate backwards: removing while walking forward would skip the node
    // that slides into the freed slot.
    for (int i = list.Count - 1; i >= 0; i--)
    {
        INode node = list.ElementAt(i);
        if (node is TextNode && node.ToPlainTextStringEx().Trim().Equals(""))
        {
            list.Remove(node);
        }
    }
}
public RJOutline(String docs)
{
    HTMLParser p = HTMLParser.GetByHTML(docs);
    NodeList nodes = p.GetFirstNode("id", "work_outline").Children;
    nodes.KeepAllNodesThatMatch(new TagNameFilter("tr"));
    for (int i = 0; i < nodes.Count; i++)
    {
        INode node = nodes.ElementAt(i);
        if (node != null)
        {
            node.Children.RemoveMeaninglessNodes();
            this.data.Add(node.FirstChild.ToPlainTextStringEx().Trim(),
                          node.LastChild.ToDividedTextString(" ").TrimAll());
        }
    }
}
private void DeleteExecute(object obj)
{
    int index = NodeList.IndexOf(selectValue);
    NodeList.Remove(selectValue);
    if (NodeList.Count <= 0)
    {
        return;
    }
    // Select the previous node when possible, otherwise fall back to the first.
    if (index > 0)
    {
        SelectValue = NodeList.ElementAt(--index);
    }
    else
    {
        SelectValue = NodeList.ElementAt(0);
    }
}
/// <summary>
/// Second page type: the data is formatted as plain text.
/// </summary>
/// <returns></returns>
public static string[] GetDataFromText(string link)
{
    // Array holding the parsed data items.
    string[] tempData = new string[6];
    try
    {
        // Second-pass filter: a div tag whose class attribute is "nrl".
        NodeFilter secondFilter = new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "nrl"));
        NodeList secondNodeList = GetNodeList(link, secondFilter);
        // Children of the target div.
        secondNodeList = secondNodeList.ElementAt(0).Children;
        // Parse out the required data items.
        TableTag targetTable = (TableTag)secondNodeList[0];
        // All rows of the table.
        TableRow[] tr = targetTable.Rows;
        // Title.
        tempData[0] = tr[0].ToPlainTextString().Trim();
        // Article source and update time.
        string temp = tr[1].ToPlainTextString().Trim();
        int index1 = temp.IndexOf(':');
        int index2 = temp.LastIndexOf(':');
        tempData[1] = temp.Substring(index1 + 1, 3);
        temp = temp.Substring(index2 + 1).Trim();
        // Convert a date like "2013年5月7日" to "2013-5-7" (the original was
        // missing the separator between month and day).
        int year, month, date;
        year = temp.LastIndexOf('年');
        month = temp.LastIndexOf('月');
        date = temp.LastIndexOf('日');
        tempData[2] = temp.Substring(0, year) + "-"
                    + temp.Substring(year + 1, month - year - 1) + "-"
                    + temp.Substring(month + 1, date - month - 1);
        // Page URL.
        tempData[3] = link;
        // Water-level table.
        TableColumn[] tc = tr[3].Columns;
        tempData[4] = tc[0].ChildAt(2).ToHtml();
    }
    catch (Exception ex)
    {
        Console.WriteLine("Second-level parse failed: " + ex.Message);
        tempData[5] = "failed";
    }
    return tempData;
}
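// Worked example of the date conversion above (illustrative input):
// temp = "2013年5月7日"  → year = 4, month = 6, date = 8
// Substring(0, 4) = "2013", Substring(5, 1) = "5", Substring(7, 1) = "7"
// → tempData[2] = "2013-5-7"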
/// <summary>
/// Extracts all hyperlinks from the given HTML (which must be wrapped in <html>),
/// automatically skipping invalid URLs such as "javascript:;".
/// </summary>
/// <param name="html">The HTML to scan.</param>
/// <param name="model">Filter settings: text to prepend/append to each link,
/// required substrings (pipe-separated), and an optional regex the link must match.</param>
/// <returns></returns>
public string GetAllLink(string html, M_Collect_ListFilter model)
{
    string list = "";
    NodeList nodeList = GetTagList(html, "A");
    for (int i = 0; i < nodeList.Size(); i++)
    {
        ATag link = (ATag)nodeList.ElementAt(i);
        string href = link.GetAttribute("href");
        if (string.IsNullOrEmpty(href) || href.ToLower().IndexOf("javascript") > -1)
        {
            continue;
        }
        // The link must contain at least one of the required (pipe-separated) substrings.
        if (!string.IsNullOrEmpty(model.CharContain) && !href.Contains(model.CharContain))
        {
            bool flag = false;
            foreach (string start in model.CharContain.Split("|".ToCharArray(), StringSplitOptions.RemoveEmptyEntries))
            {
                if (href.Contains(start))
                {
                    flag = true;
                    break;
                }
            }
            if (flag == false)
            {
                continue;
            }
        }
        if (!string.IsNullOrEmpty(model.CharRegex) && !Regex.IsMatch(href, model.CharRegex))
        {
            continue;
        }
        list += (model.FillStart + link.GetAttribute("href") + model.FillEnd) + "\n";
    }
    return list;
}
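// Usage sketch (the construction of M_Collect_ListFilter is an assumption;
// only the fields referenced above — CharContain, CharRegex, FillStart,
// FillEnd — are known from the original):
// var model = new M_Collect_ListFilter();
// model.CharContain = "article|news";   // keep links containing either token
// model.CharRegex = @"\d{4}";           // and matching this pattern
// model.FillStart = "<li>"; model.FillEnd = "</li>";
// string links = GetAllLink(htmlSource, model);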
public INode GetFirstNode(String attribute, String regex)
{
    NodeList list = GetNodes(attribute, regex);
    // Guard against an empty result; ElementAt(0) on an empty list would throw.
    if (list == null || list.Count == 0)
    {
        return null;
    }
    return list.ElementAt(0);
}
/// <summary> Collect the children.
/// An initial test is performed for an empty XML tag, in which case
/// the start tag and end tag of the returned tag are the same and it has
/// no children.
/// If it's not an empty XML tag, the lexer is repeatedly asked for
/// subsequent nodes until an end tag is found or a node is encountered
/// that matches the tag ender set or end tag ender set.
/// In the latter case, a virtual end tag is created.
/// Each node found that is not the end tag is added to
/// the list of children. The end tag is special and not a child.
/// Nodes that also have a CompositeTagScanner as their scanner are
/// recursed into, which provides the nested structure of an HTML page.
/// This method operates in two possible modes, depending on a private boolean.
/// It can recurse on the JVM stack, which has caused some overflow problems
/// in the past, or it can use the supplied stack argument to nest scanning
/// of child tags within itself. The former is left as an option in the code,
/// mostly to help subsequent modifiers visualize what the internal nesting
/// is doing.
/// </summary>
/// <param name="tag">The tag this scanner is responsible for.</param>
/// <param name="lexer">The source of subsequent nodes.</param>
/// <param name="stack">The parse stack. May contain pending tags that enclose
/// this tag.</param>
/// <returns> The resultant tag (may be unchanged).</returns>
public override ITag Scan(ITag tag, Lexer lexer, NodeList stack)
{
    INode node;
    ITag next;
    System.String name;
    IScanner scanner;
    ITag ret;

    ret = tag;
    if (ret.EmptyXmlTag)
    {
        ret.SetEndTag(ret);
    }
    else
    {
        do
        {
            node = lexer.NextNode(false);
            if (null != node)
            {
                if (node is ITag)
                {
                    next = (ITag)node;
                    name = next.TagName;
                    // check for normal end tag
                    if (next.IsEndTag() && name.Equals(ret.TagName))
                    {
                        ret.SetEndTag(next);
                        node = null;
                    }
                    else if (IsTagToBeEndedFor(ret, next)) // check DTD
                    {
                        // backup one node. insert a virtual end tag later
                        lexer.Position = next.StartPosition;
                        node = null;
                    }
                    else if (!next.IsEndTag())
                    {
                        // now recurse if there is a scanner for this type of tag
                        scanner = next.ThisScanner;
                        if (null != scanner)
                        {
                            if (mUseJVMStack)
                            {
                                // JVM stack recursion
                                node = scanner.Scan(next, lexer, stack);
                                AddChild(ret, node);
                            }
                            else
                            {
                                // fake recursion:
                                if (scanner == this)
                                {
                                    if (next.EmptyXmlTag)
                                    {
                                        next.SetEndTag(next);
                                        FinishTag(next, lexer);
                                        AddChild(ret, next);
                                    }
                                    else
                                    {
                                        stack.Add(ret);
                                        ret = next;
                                    }
                                }
                                else
                                {
                                    // normal recursion if switching scanners
                                    node = scanner.Scan(next, lexer, stack);
                                    AddChild(ret, node);
                                }
                            }
                        }
                        else
                        {
                            AddChild(ret, next);
                        }
                    }
                    else
                    {
                        if (!mUseJVMStack && !mLeaveEnds)
                        {
                            // Since all non-end tags are consumed by the
                            // previous clause, we're here because we have an
                            // end tag with no opening tag... this could be bad.
                            // There are two cases...
                            // 1) The tag hasn't been registered, in which case
                            //    we just add it as a simple child, like its
                            //    opening tag.
                            // 2) There may be an opening tag further up the
                            //    parse stack that needs closing.
                            // So, we ask the factory for a node like this one
                            // (since end tags never have scanners) and see
                            // if its scanner is a composite tag scanner.
                            // If it is, we walk up the parse stack looking for
                            // something that needs this end tag to finish it.
                            // If there is something, we close off all the tags
                            // walked over and continue on as if nothing
                            // happened.
                            System.Collections.ArrayList attributes = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10));
                            attributes.Add(new TagAttribute(name, null));
                            ITag opener = lexer.NodeFactory.CreateTagNode(lexer.Page, next.StartPosition, next.EndPosition, attributes);
                            scanner = opener.ThisScanner;
                            if ((null != scanner) && (scanner == this))
                            {
                                // uh-oh
                                int index = -1;
                                for (int i = stack.Size() - 1; (-1 == index) && (i >= 0); i--)
                                {
                                    // short circuit here... assume everything on the stack
                                    // has this as its scanner; we'll need to stop if either
                                    // of those conditions isn't met
                                    ITag boffo = (ITag)stack.ElementAt(i);
                                    if (name.Equals(boffo.TagName))
                                    {
                                        index = i;
                                    }
                                    else if (IsTagToBeEndedFor(boffo, next)) // check DTD
                                    {
                                        index = i;
                                    }
                                }
                                if (-1 != index)
                                {
                                    // finish off the current one first
                                    FinishTag(ret, lexer);
                                    AddChild((ITag)stack.ElementAt(stack.Size() - 1), ret);
                                    for (int i = stack.Size() - 1; i > index; i--)
                                    {
                                        ITag fred = (ITag)stack.Remove(i);
                                        FinishTag(fred, lexer);
                                        AddChild((ITag)stack.ElementAt(i - 1), fred);
                                    }
                                    ret = (ITag)stack.Remove(index);
                                    node = null;
                                }
                                else
                                {
                                    AddChild(ret, next); // default behaviour
                                }
                            }
                            else
                            {
                                AddChild(ret, next); // default behaviour
                            }
                        }
                        else
                        {
                            AddChild(ret, next);
                        }
                    }
                }
                else
                {
                    AddChild(ret, node);
                    node.DoSemanticAction();
                }
            }
            if (!mUseJVMStack)
            {
                // handle coming out of fake recursion
                if (null == node)
                {
                    int depth = stack.Size();
                    if (0 != depth)
                    {
                        node = stack.ElementAt(depth - 1);
                        if (node is ITag)
                        {
                            ITag precursor = (ITag)node;
                            scanner = precursor.ThisScanner;
                            if (scanner == this)
                            {
                                stack.Remove(depth - 1);
                                FinishTag(ret, lexer);
                                AddChild(precursor, ret);
                                ret = precursor;
                            }
                            else
                            {
                                node = null; // normal recursion
                            }
                        }
                        else
                        {
                            node = null; // normal recursion
                        }
                    }
                }
            }
        } while (null != node);
    }
    FinishTag(ret, lexer);
    return ret;
}
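// Illustration (an assumption about intent, not from the source): the
// stack-unwinding branch above is what lets malformed input such as
//     <b>bold <i>both</b> italic?</i>
// parse sensibly. When </b> arrives while <i> is still open, the scanner
// walks up the parse stack, finds the pending <b>, closes the <i> into it,
// and resumes as if the nesting had been well-formed.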
/// <summary>
/// Method: ItemRetrival_2
/// Purpose: extract the URL, title, and post time of each entry on a post-list page.
/// Parameters: string url — the URL of the post-list page;
/// ref Encoding encode — used to receive the page's character encoding;
/// ref List<string> listUrl, listTitle, listTime — receive the extracted items.
/// </summary>
/// <param name="url"></param>
/// <param name="encode"></param>
/// <param name="listUrl"></param>
/// <param name="listTitle"></param>
/// <param name="listTime"></param>
public static void ItemRetrival_2(string url, ref Encoding encode, ref List<string> listUrl, ref List<string> listTitle, ref List<string> listTime)
{
    // Fetch the page source.
    string rawtext = GetDataFromUrl(url);
    // Strip irrelevant style, select, script, and comment blocks.
    string reg1 = @"<style[\s\S]+?/style>|<select[\s\S]+?/select>|<script[\s\S]+?/script>|<\!\-\-[\s\S]*?\-\->";
    rawtext = new Regex(reg1, RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(rawtext, "");
    // Extract the post time, post URL, and post title from the page.
    // Use htmlparser to collect the target li elements.
    Lexer lexer = new Lexer(rawtext);
    Parser parser = new Parser(lexer);
    NodeFilter filter = new TagNameFilter("li");
    NodeList htmlNodes = parser.Parse(filter);
    // Drop entries that contain no time stamp.
    Regex f2 = new Regex(@"\d\d:\d\d");
    for (int i = htmlNodes.Count - 1; i >= 0; i--)
    {
        if (!f2.IsMatch(htmlNodes[i].ToHtml()))
        {
            htmlNodes.Remove(i);
        }
    }
    // Extract the post times.
    RegexFilter rf = new RegexFilter(@"\d\d:\d\d");
    string final = htmlNodes.ToHtml();
    for (int i = 0; i < htmlNodes.Count; i++)
    {
        Lexer lexerTmp = new Lexer(htmlNodes[i].ToHtml());
        Parser parserTmp = new Parser(lexerTmp);
        NodeList tmp = parserTmp.Parse(rf);
        if (tmp.Count > 0)
        {
            for (int j = 0; j < tmp.Count; j++)
            {
                string temp = tmp[j].ToHtml();
                ModifyRawText(ref temp);
                listTime.Add(temp);
            }
        }
    }
    // Extract the post URLs and titles.
    string atagAssist = htmlNodes.ToHtml();
    Lexer lex3 = new Lexer(atagAssist);
    Parser par3 = new Parser(lex3);
    NodeFilter filter3 = new TagNameFilter("a");
    NodeList atagNodes = par3.Parse(filter3);
    for (int i = 0; i < atagNodes.Count; i++)
    {
        string urlpart = new Regex(@"http://.*?(?=/)").Match(url).Value;
        ATag link = (ATag)atagNodes.ElementAt(i);
        string temp1 = link.GetAttribute("href");
        string temp2 = link.StringText;
        // If the extracted URL is relative, prepend the domain to make it absolute.
        if (temp1 != null && !new Regex("http").IsMatch(temp1))
        {
            temp1 = urlpart + temp1;
        }
        ModifyRawText(ref temp2);
        listUrl.Add(temp1);
        listTitle.Add(temp2);
    }
}
/// <summary>
/// Method: ItemRetrival_1
/// Purpose: extract the URL, title, and post time of each entry on a post-list page.
/// Parameters: string url — the URL of the post-list page;
/// ref Encoding encode — used to receive the page's character encoding;
/// ref List<string> listUrl, listTitle, listTime — receive the extracted items.
/// </summary>
/// <param name="url"></param>
/// <param name="encode"></param>
/// <param name="listUrl"></param>
/// <param name="listTitle"></param>
/// <param name="listTime"></param>
public static void ItemRetrival_1(string url, ref Encoding encode, ref List<string> listUrl, ref List<string> listTitle, ref List<string> listTime)
{
    // Fetch the page source.
    string rawtext = GetDataFromUrl(url);
    // Strip irrelevant style, select, script, and comment blocks.
    string reg1 = @"<style[\s\S]+?/style>|<select[\s\S]+?/select>|<script[\s\S]+?/script>|<\!\-\-[\s\S]*?\-\->";
    rawtext = new Regex(reg1, RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(rawtext, "");
    // Use htmlparser to extract the target tables from the source.
    Lexer lexer = new Lexer(rawtext);
    Parser parser = new Parser(lexer);
    NodeFilter filter = new TagNameFilter("table");
    NodeList htmlNodes = parser.Parse(filter);
    // Remove nested tables.
    Regex f1 = new Regex(@"<table.*?>");
    for (int i = htmlNodes.Count - 1; i >= 0; i--)
    {
        MatchCollection myCollection = f1.Matches(htmlNodes[i].ToHtml());
        if (myCollection.Count > 1)
        {
            htmlNodes.Remove(i);
        }
    }
    // Remove tables with no time stamp; such tables are considered invalid.
    Regex f2 = new Regex(@"\d\d:\d\d");
    for (int i = htmlNodes.Count - 1; i >= 0; i--)
    {
        if (!f2.IsMatch(htmlNodes[i].ToHtml()))
        {
            htmlNodes.Remove(i);
        }
    }
    // Parse out the three kinds of target information below.
    string final = htmlNodes.ToHtml();
    Lexer lex2 = new Lexer(final);
    Parser par2 = new Parser(lex2);
    NodeFilter filter2 = new TagNameFilter("tr");
    NodeList finalNodes = par2.Parse(filter2);
    // Extract the post times.
    RegexFilter rf = new RegexFilter(@"\d\d:\d\d");
    for (int i = 0; i < finalNodes.Count; i++)
    {
        Lexer lexerTmp = new Lexer(finalNodes[i].ToHtml());
        Parser parserTmp = new Parser(lexerTmp);
        NodeList tmp = parserTmp.Parse(rf);
        if (tmp.Count > 0)
        {
            for (int j = 0; j < tmp.Count; j++)
            {
                string temp = tmp[j].ToHtml();
                ModifyRawText(ref temp);
                listTime.Add(temp);
            }
        }
    }
    // Extract the post URLs and titles.
    string atagAssist = finalNodes.ToHtml();
    Lexer lex3 = new Lexer(atagAssist);
    Parser par3 = new Parser(lex3);
    NodeFilter filter3 = new TagNameFilter("a");
    NodeList atagNodes = par3.Parse(filter3);
    string urlpart = new Regex(@"http://.*?(?=/)").Match(url).Value;
    for (int i = 0; i < atagNodes.Count; i++)
    {
        ATag link = (ATag)atagNodes.ElementAt(i);
        string temp1 = link.GetAttribute("href");
        string temp2 = link.StringText;
        // If the extracted URL is relative, prepend the domain to make it absolute.
        if (temp1 != null && !new Regex("http").IsMatch(temp1))
        {
            temp1 = urlpart + temp1;
        }
        ModifyRawText(ref temp2);
        listUrl.Add(temp1);
        listTitle.Add(temp2);
    }
}
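// Usage sketch (the list URL is illustrative; GetDataFromUrl and ModifyRawText
// are helpers of the enclosing class and are assumed to be available):
// Encoding enc = Encoding.UTF8;
// List<string> urls = new List<string>(), titles = new List<string>(), times = new List<string>();
// ItemRetrival_1("http://bbs.example.com/list.aspx", ref enc, ref urls, ref titles, ref times);
// for (int i = 0; i < urls.Count; i++)
//     Console.WriteLine(times[i] + " " + titles[i] + " -> " + urls[i]);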