Beispiel #1
0
        public static IEnumerable <WordOccurance> ProcessString(string Text, bool IsStopWord)
        {
            List <WordOccurance> WordOccurList = new List <WordOccurance>();
            var    SplitString = Text.Split(" ");
            string WordPattern = @"\b[^\d\W]+\b";

            foreach (var ss in SplitString)
            {
                if (Regex.IsMatch(ss, WordPattern))
                {
                    if (IsStopWord)
                    {
                        if (!(StopWord.ListStopWord().Any(x => x.SWord.Equals(ss.ToLower()))))
                        {
                            if (WordOccurList.FirstOrDefault(x => x.Word.Equals(ss.ToString())) == null)
                            {
                                WordOccurList.Add(new WordOccurance()
                                {
                                    Word = ss.ToString(), Count = 1
                                });
                            }
                            else
                            {
                                WordOccurList.FirstOrDefault(x => x.Word.Equals(ss.ToString())).Count += 1;
                            }
                        }
                    }
                    else
                    {
                        if (WordOccurList.FirstOrDefault(x => x.Word.Equals(ss.ToString())) == null)
                        {
                            WordOccurList.Add(new WordOccurance()
                            {
                                Word = ss.ToString(), Count = 1
                            });
                        }
                        else
                        {
                            WordOccurList.FirstOrDefault(x => x.Word.Equals(ss.ToString())).Count += 1;
                        }
                    }
                }
            }

            return(WordOccurList);
        }
Beispiel #2
0
        public static IEnumerable <WordOccurance> ProcessHTMLDoc(string PlainHTML, bool IsStopWord)
        {
            List <WordOccurance> WordOccurList = new List <WordOccurance>();
            HtmlDocument         HtmlDoc       = new HtmlDocument();

            HtmlDoc.LoadHtml(PlainHTML);
            var    TagPResult  = HtmlDoc.DocumentNode.SelectNodes("//p");
            string WordPattern = @"\b[^\d\W]+\b";

            if (TagPResult != null)
            {
                foreach (var result in TagPResult)
                {
                    var SplitString = result.InnerText.Split(" ");

                    foreach (var ss in SplitString)
                    {
                        if (!(ss.ToString() == ""))
                        {
                            if (Regex.IsMatch(ss, WordPattern))
                            {
                                if (!IsStopWord)
                                {
                                    if (WordOccurList.FirstOrDefault(x => x.Word.Equals(ss.ToString())) == null)
                                    {
                                        WordOccurList.Add(new WordOccurance()
                                        {
                                            Word = ss.ToString(), Count = 1
                                        });
                                    }
                                    else
                                    {
                                        WordOccurList.FirstOrDefault(x => x.Word.Equals(ss.ToString())).Count += 1;
                                    }
                                }
                                else
                                {
                                    if (!(StopWord.ListStopWord().Any(x => x.SWord.Equals(ss.ToLower()))))
                                    {
                                        if (WordOccurList.FirstOrDefault(x => x.Word.Equals(ss.ToString())) == null)
                                        {
                                            WordOccurList.Add(new WordOccurance()
                                            {
                                                Word = ss.ToString(), Count = 1
                                            });
                                        }
                                        else
                                        {
                                            WordOccurList.FirstOrDefault(x => x.Word.Equals(ss.ToString())).Count += 1;
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }

            for (int i = 0; i < 6; i++)
            {
                string NodePath;
                if (i == 0)
                {
                    NodePath = "//h";
                }
                else
                {
                    NodePath = "//h" + i;
                }

                var TagHResult = HtmlDoc.DocumentNode.SelectNodes(NodePath);

                if (TagHResult != null)
                {
                    foreach (var result in TagHResult)
                    {
                        var SplitString = result.InnerText.Split(" ");

                        foreach (var ss in SplitString)
                        {
                            if (!(ss.ToString() == ""))
                            {
                                if (Regex.IsMatch(ss, WordPattern))
                                {
                                    if (!IsStopWord)
                                    {
                                        if (WordOccurList.FirstOrDefault(x => x.Word.Equals(ss.ToString())) == null)
                                        {
                                            WordOccurList.Add(new WordOccurance()
                                            {
                                                Word = ss.ToString(), Count = 1
                                            });
                                        }
                                        else
                                        {
                                            WordOccurList.FirstOrDefault(x => x.Word.Equals(ss.ToString())).Count += 1;
                                        }
                                    }
                                    else
                                    {
                                        if (!(StopWord.ListStopWord().Any(x => x.SWord.Equals(ss.ToLower()))))
                                        {
                                            if (WordOccurList.FirstOrDefault(x => x.Word.Equals(ss.ToString())) == null)
                                            {
                                                WordOccurList.Add(new WordOccurance()
                                                {
                                                    Word = ss.ToString(), Count = 1
                                                });
                                            }
                                            else
                                            {
                                                WordOccurList.FirstOrDefault(x => x.Word.Equals(ss.ToString())).Count += 1;
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }

            var TagTableResult = HtmlDoc.DocumentNode.SelectNodes("//table");

            if (TagTableResult != null)
            {
                foreach (var result in TagTableResult)
                {
                    var SplitString = result.InnerText.Split(" ");

                    foreach (var ss in SplitString)
                    {
                        if (!(ss.ToString() == ""))
                        {
                            if (Regex.IsMatch(ss, WordPattern))
                            {
                                if (!IsStopWord)
                                {
                                    if (WordOccurList.FirstOrDefault(x => x.Word.Equals(ss.ToString())) == null)
                                    {
                                        WordOccurList.Add(new WordOccurance()
                                        {
                                            Word = ss.ToString(), Count = 1
                                        });
                                    }
                                    else
                                    {
                                        WordOccurList.FirstOrDefault(x => x.Word.Equals(ss.ToString())).Count += 1;
                                    }
                                }
                                else
                                {
                                    if (!(StopWord.ListStopWord().Any(x => x.SWord.Equals(ss.ToLower()))))
                                    {
                                        if (WordOccurList.FirstOrDefault(x => x.Word.Equals(ss.ToString())) == null)
                                        {
                                            WordOccurList.Add(new WordOccurance()
                                            {
                                                Word = ss.ToString(), Count = 1
                                            });
                                        }
                                        else
                                        {
                                            WordOccurList.FirstOrDefault(x => x.Word.Equals(ss.ToString())).Count += 1;
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }



            return(WordOccurList);
        }