/// <summary>
/// Assigns <c>Original</c> on every word in <paramref name="wordList"/> to whatever
/// <c>CrawlingHelper.GetIfExist</c> returns for <paramref name="nowNode"/> and that word's <c>Tag</c>.
/// </summary>
/// <param name="wordList">Words to update in place.</param>
/// <param name="nowNode">HTML node handed to <c>CrawlingHelper.GetIfExist</c> for each word.</param>
public static void SetOriginalWords(List<Word> wordList, HtmlNode nowNode)
{
    wordList.ForEach(entry => entry.Original = CrawlingHelper.GetIfExist(nowNode, entry.Tag));
}
/// <summary>
/// Recursively mirrors the child nodes of <paramref name="newDoc"/> as <c>TreeData</c> entries
/// added to <paramref name="parentNode"/>'s <c>Children</c>.
/// </summary>
/// <param name="parentNode">Tree entry that receives one child per retained HTML child node.</param>
/// <param name="newDoc">HTML node whose <c>ChildNodes</c> are walked.</param>
public static void SetTreeUsingNextNode(TreeData parentNode, HtmlNode newDoc)
{
    foreach (HtmlNode child in newDoc.ChildNodes)
    {
        if (CrawlingHelper.IsText(child.Name))
        {
            // Text nodes that are blank after trimming contribute nothing to the tree.
            if (child.InnerText.Trim() == "")
            {
                continue;
            }

            // A non-blank text child also extends the parent's label.
            // NOTE(review): this appends the inner text of the whole parent (newDoc), not just
            // this text child, so a parent with several non-blank text children gets the same
            // text appended several times. Confirm whether that is intended.
            parentNode.Text += " " + WebUtility.HtmlDecode(newDoc.InnerText);
        }

        int attributeCount = child.Attributes.Count;
        int childCount = child.ChildNodes.Count;
        bool hasAttributes = attributeCount != 0;

        // Nodes without attributes are labelled and tagged by their bare name; otherwise the
        // attribute text/tag produced by CrawlingHelper is appended to the name.
        string label = hasAttributes
            ? child.Name + CrawlingHelper.ConvertAttrToText(child)
            : child.Name;
        string tagName = hasAttributes
            ? child.Name + CrawlingHelper.ConvertAttrToTag(child)
            : child.Name;

        TreeData branch = new TreeData
        {
            Text = WebUtility.HtmlDecode(label),
            Tag = new NodeTag(tagName, attributeCount, childCount)
        };

        bool isLeaf = childCount == 0;
        if (isLeaf)
        {
            // Leaves show their own inner text right after the label.
            branch.Text += WebUtility.HtmlDecode(" " + child.InnerText);
        }

        parentNode.Children.Add(branch);

        if (!isLeaf)
        {
            SetTreeUsingNextNode(branch, child);
        }
    }
}
/// <summary>
/// Builds one row of encoded strings for every node that <c>CrawlingHelper.GetResults</c>
/// returns for <paramref name="xPath"/>, skipping nodes for which an "except" word is reported.
/// </summary>
/// <param name="html">Document passed to <c>CrawlingHelper.GetResults</c>.</param>
/// <param name="wordList">
/// Words that are re-populated in place for each node; their <c>Encoded</c> values start each row.
/// </param>
/// <param name="otherInfoList">
/// Additional crawl descriptors; the <c>Encoded</c> values of the words they yield are appended to each row.
/// </param>
/// <param name="xPath">XPath expression used to select the nodes.</param>
/// <returns>
/// One list of encoded strings per accepted node; an empty list when
/// <c>CrawlingHelper.GetResults</c> returns <c>null</c>.
/// </returns>
public static List<List<string>> GetEncodedWords(
    HtmlDocument html,
    List<Word> wordList,
    List<CrawlingInfo> otherInfoList,
    string xPath)
{
    List<List<string>> encodedRows = new List<List<string>>();

    HtmlNodeCollection matches = CrawlingHelper.GetResults(html, xPath);
    if (matches == null)
    {
        return encodedRows;
    }

    foreach (HtmlNode match in matches)
    {
        SetOriginalWords(wordList, match);
        SetEncodedWords(wordList);
        if (IsExistExceptWord(wordList))
        {
            continue;
        }

        List<string> extraEncoded = new List<string>();
        bool rejected = false;

        // NOTE(review): the arguments given to CrawlingOne do not involve the current node, so
        // the same calls repeat for every node. Confirm whether CrawlingOne is stateful before
        // considering hoisting it out of the outer loop.
        foreach (CrawlingInfo info in otherInfoList)
        {
            var crawled = CrawlingHelper.CrawlingOne(info.CrawlingPointer, info.UrlOption.ToString());
            if (IsExistExceptWord(crawled))
            {
                rejected = true;
                break;
            }

            foreach (Word crawledWord in crawled)
            {
                extraEncoded.Add(crawledWord.Encoded);
            }
        }

        if (!rejected)
        {
            IEnumerable<string> primaryEncoded = wordList.Select(w => w.Encoded);
            encodedRows.Add(primaryEncoded.Concat(extraEncoded).ToList());
        }
    }

    return encodedRows;
}
/// <summary>
/// Recursively flattens <paramref name="nowNode"/> and its descendants into
/// <paramref name="outList"/>: one entry for the node's inner text (when not blank) and one
/// entry per attribute, each passed to <c>AddListIfNotExist</c>.
/// </summary>
/// <param name="outList">List the entries are offered to via <c>AddListIfNotExist</c>.</param>
/// <param name="nowNode">Node currently being visited.</param>
/// <param name="currentXPath">
/// Path accumulated so far; extended with <c>CrawlingHelper.GetOneXPath</c> of each child on recursion.
/// </param>
public static void SetListUsingNode(
    List<TreeData> outList,
    HtmlNode nowNode,
    string currentXPath)
{
    // Entry for the node's own text, tagged with the node name.
    // NOTE(review): the inner text is stored as-is, whereas attribute values below are
    // HTML-decoded. Confirm the difference is intended.
    if (nowNode.InnerText.Trim() != "")
    {
        AddListIfNotExist(outList, new TreeData
        {
            Text = nowNode.InnerText,
            Tag = new NodeTag(
                CrawlingHelper.GetAllXPath(nowNode),
                nowNode.Attributes.Count,
                nowNode.ChildNodes.Count,
                nowNode.Name,
                currentXPath)
        });
    }

    // One entry per attribute, tagged with the attribute name.
    foreach (var attribute in nowNode.Attributes)
    {
        AddListIfNotExist(outList, new TreeData
        {
            Text = WebUtility.HtmlDecode(attribute.Value),
            Tag = new NodeTag(
                CrawlingHelper.GetAllXPath(nowNode),
                nowNode.Attributes.Count,
                nowNode.ChildNodes.Count,
                attribute.Name,
                currentXPath)
        });
    }

    // Descend into every child with its own path segment appended.
    foreach (HtmlNode child in nowNode.ChildNodes)
    {
        string childXPath = currentXPath + CrawlingHelper.GetOneXPath(child);
        SetListUsingNode(outList, child, childXPath);
    }
}