Пример #1
0
 /// <summary>
 /// Assigns <c>Original</c> on every word in <paramref name="wordList"/> to the value
 /// <c>CrawlingHelper.GetIfExist</c> returns for <paramref name="nowNode"/> and the word's <c>Tag</c>.
 /// </summary>
 /// <param name="wordList">Words updated in place.</param>
 /// <param name="nowNode">HTML node handed to the lookup for each word.</param>
 public static void SetOriginalWords(List<Word> wordList, HtmlNode nowNode)
 {
     wordList.ForEach(entry => entry.Original = CrawlingHelper.GetIfExist(nowNode, entry.Tag));
 }
Пример #2
0
        /// <summary>
        /// Recursively mirrors the child nodes of <paramref name="newDoc"/> as <c>TreeData</c>
        /// entries added to <paramref name="parentNode"/>'s <c>Children</c>.
        /// </summary>
        /// <param name="parentNode">Tree entry that receives the generated children; its <c>Text</c> may be extended.</param>
        /// <param name="newDoc">HTML node whose <c>ChildNodes</c> are walked.</param>
        public static void SetTreeUsingNextNode(TreeData parentNode, HtmlNode newDoc)
        {
            foreach (HtmlNode child in newDoc.ChildNodes)
            {
                if (CrawlingHelper.IsText(child.Name))
                {
                    // Blank text children produce no tree entry at all.
                    if (child.InnerText.Trim() == "")
                    {
                        continue;
                    }

                    // A non-blank text child appends the decoded InnerText of the whole
                    // HTML parent (newDoc, not the child) to the parent entry's text.
                    // NOTE(review): this runs once per non-blank text child, so the same
                    // text can be appended several times - confirm that is intended.
                    parentNode.Text += " " + WebUtility.HtmlDecode(newDoc.InnerText);
                }

                // The entry label and tag name are the node name, extended with the
                // attribute text / attribute tag when the node carries attributes.
                bool hasAttributes = child.Attributes.Count != 0;
                TreeData entry = new TreeData
                {
                    Text = WebUtility.HtmlDecode(
                        hasAttributes
                            ? child.Name + CrawlingHelper.ConvertAttrToText(child)
                            : child.Name),
                    Tag = new NodeTag(
                        hasAttributes
                            ? child.Name + CrawlingHelper.ConvertAttrToTag(child)
                            : child.Name,
                        child.Attributes.Count,
                        child.ChildNodes.Count)
                };

                bool isLeaf = child.ChildNodes.Count == 0;
                if (isLeaf)
                {
                    // Leaves carry their own decoded inner text after the label.
                    entry.Text += WebUtility.HtmlDecode(" " + child.InnerText);
                }

                parentNode.Children.Add(entry);

                if (!isLeaf)
                {
                    // Descend only after the entry has been attached to its parent.
                    SetTreeUsingNextNode(entry, child);
                }
            }
        }
Пример #3
0
        /// <summary>
        /// Builds one row of encoded strings for each node that
        /// <c>CrawlingHelper.GetResults</c> returns for <paramref name="xPath"/>.
        /// </summary>
        /// <param name="html">Document passed to <c>CrawlingHelper.GetResults</c>.</param>
        /// <param name="wordList">Words refreshed per node via <c>SetOriginalWords</c> and <c>SetEncodedWords</c>; their <c>Encoded</c> values start each row.</param>
        /// <param name="otherInfoList">Extra crawl descriptions; the <c>Encoded</c> values of the words each one yields are appended to the row.</param>
        /// <param name="xPath">Query passed to <c>CrawlingHelper.GetResults</c>.</param>
        /// <returns>
        /// The collected rows; an empty list when <c>GetResults</c> returns null. A node contributes
        /// no row when <c>IsExistExceptWord</c> is true for <paramref name="wordList"/> or for any extra crawl result.
        /// </returns>
        public static List<List<string>> GetEncodedWords(
            HtmlDocument html,
            List<Word> wordList,
            List<CrawlingInfo> otherInfoList,
            string xPath)
        {
            HtmlNodeCollection matches = CrawlingHelper.GetResults(html, xPath);
            List<List<string>> rows = new List<List<string>>();

            if (matches != null)
            {
                foreach (HtmlNode nowNode in matches)
                {
                    // wordList is shared across iterations and overwritten for each node.
                    SetOriginalWords(wordList, nowNode);
                    SetEncodedWords(wordList);

                    if (IsExistExceptWord(wordList))
                    {
                        continue;
                    }

                    List<string> extraColumns = new List<string>();
                    bool rejected = false;

                    foreach (CrawlingInfo info in otherInfoList)
                    {
                        var extraWords = CrawlingHelper.CrawlingOne(info.CrawlingPointer, info.UrlOption.ToString());
                        if (IsExistExceptWord(extraWords))
                        {
                            // One excluded extra result drops the whole row.
                            rejected = true;
                            break;
                        }

                        foreach (Word extra in extraWords)
                        {
                            extraColumns.Add(extra.Encoded);
                        }
                    }

                    if (!rejected)
                    {
                        // Row layout: encoded words from wordList first, then the extras.
                        List<string> row = new List<string>();
                        foreach (Word word in wordList)
                        {
                            row.Add(word.Encoded);
                        }

                        row.AddRange(extraColumns);
                        rows.Add(row);
                    }
                }
            }

            return rows;
        }
Пример #4
0
        /// <summary>
        /// Walks <paramref name="nowNode"/> and its descendants depth-first, handing a
        /// <c>TreeData</c> entry to <c>AddListIfNotExist</c> for the node's text (when it is
        /// not blank) and for each of its attributes.
        /// </summary>
        /// <param name="outList">List passed to <c>AddListIfNotExist</c> together with every generated entry.</param>
        /// <param name="nowNode">Node currently being processed.</param>
        /// <param name="currentXPath">Path accumulated so far; each child's <c>CrawlingHelper.GetOneXPath</c> result is appended when recursing.</param>
        public static void SetListUsingNode(
            List<TreeData> outList,
            HtmlNode nowNode,
            string currentXPath)
        {
            // Entry for the element's own text, skipped when it is empty or whitespace only.
            if (nowNode.InnerText.Trim().Length != 0)
            {
                // NOTE(review): the text is stored as-is here, whereas attribute values
                // below go through HtmlDecode - confirm the difference is intended.
                TreeData textEntry = new TreeData
                {
                    Text = nowNode.InnerText,
                    Tag = new NodeTag(
                        CrawlingHelper.GetAllXPath(nowNode),
                        nowNode.Attributes.Count,
                        nowNode.ChildNodes.Count,
                        nowNode.Name,
                        currentXPath)
                };
                AddListIfNotExist(outList, textEntry);
            }

            // One entry per attribute: the decoded value as text, the attribute name in the tag.
            foreach (var attribute in nowNode.Attributes)
            {
                TreeData attributeEntry = new TreeData
                {
                    Text = WebUtility.HtmlDecode(attribute.Value),
                    Tag = new NodeTag(
                        CrawlingHelper.GetAllXPath(nowNode),
                        nowNode.Attributes.Count,
                        nowNode.ChildNodes.Count,
                        attribute.Name,
                        currentXPath)
                };
                AddListIfNotExist(outList, attributeEntry);
            }

            // Recurse into the children, extending the accumulated path by each child's step.
            foreach (HtmlNode child in nowNode.ChildNodes)
            {
                string childPath = currentXPath + CrawlingHelper.GetOneXPath(child);
                SetListUsingNode(outList, child, childPath);
            }
        }