Example #1
        static void Main(string[] args)
        {
            LanguagePrebuilt language = args != null && args.Length > 0 ? getLanguage(args[0]) : getLanguage();

            ILemmatizer lemmatizer = new LemmatizerPrebuiltCompact(language);

            if (args == null || args.Length <= 1)
            {
                Console.WriteLine("Batch-processing all files contained in the subfolder 'lemma-source' into 'lemma-output'");

                string[] fileList = FileTraverser.getFileList();

                foreach (string file in fileList)
                {
                    processFile(file, lemmatizer);
                }
            }
            else if (args.Length == 2)
            {
                Logger.logError("Missing argument");
            }
            else if (args.Length == 3)
            {
                processFile(args[1], lemmatizer, args[2]);
            }
            else
            {
                Logger.logError("Argument count mismatch, expected max 3 received {0}", args.Length.ToString());
            }
        }
Example #2
        /// <summary>
        ///  Given a list of phrases, return the best matching phrase based on an input phrase.
        /// </summary>
        /// <returns>List of 2 best matches</returns>
        public static List <string> GetMatchingPhrase(string inputPhrase, List <string> phrases)
        {
            if (phrases == null || phrases.Count == 0)
            {
                return(null);
            }

            // Lemmatize and remove stop-words from phrases
            var lmtz             = new LemmatizerPrebuiltCompact(LanguagePrebuilt.English);
            var lemmaListPhrases = Lemmatize(phrases, lmtz);

            for (var i = 0; i < lemmaListPhrases.Count; i++)
            {
                var phrase = lemmaListPhrases[i];
                lemmaListPhrases[i] = RemoveStopWords(phrase);
            }

            // Lemmatize and remove stop-words from inputPhrase
            var lemmaInputPhrase = string.Join(" ", LemmatizePhrase(lmtz, inputPhrase));

            lemmaInputPhrase = RemoveStopWords(lemmaInputPhrase);
            lemmaInputPhrase = SubstituteWords(lemmaInputPhrase);   // "your" => "my"

            // find the best match
            var matches        = BestSetMatch(lemmaInputPhrase, lemmaListPhrases);
            var matchedPhrases = matches.Count == 0 ? new List <string> {
                ""
            } : matches.Take(2).Select(idx => phrases[idx]).ToList();

            // paraphrase
            //var paraphrasedResult = Paraphrase(matchedPhrases[0]);

            return(matchedPhrases);
        }
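
A hypothetical call to the method above, assuming the helpers it references (Lemmatize, LemmatizePhrase, RemoveStopWords, SubstituteWords, BestSetMatch) are defined in the same class; the phrases themselves are placeholders:

            // Hypothetical usage, not from the original source.
            var phrases = new List <string> { "How do I change my password?", "Where is my order?" };
            List <string> best = GetMatchingPhrase("I forgot your password", phrases);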
Example #3
        static void  Main(string [] args)
        {
            ILemmatizer  lmtz = new  LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.Czech);
            using (StreamWriter tw = new StreamWriter(File.OpenWrite("ExampleFile.txt")))
            {
                //Dispose the writer so the output is actually flushed to disk
                Output(((Lemmatizer)lmtz).RootNode, tw, 0, false);
            }
        }
Example #4
        private DocumentVector vectorizeDocument(String htmlResult)
        {
            // Get term vector
            var lmtz           = new LemmatizerPrebuiltCompact(LanguagePrebuilt.English);
            var documentVector = from s in _splitRegex.Split(htmlResult)
                                 where !String.IsNullOrWhiteSpace(s)
                                 let canonical = s.ToLower()
                                 where !_stopWords.Contains(canonical) && canonical.Length > 1
                                 select lmtz.Lemmatize(s);

            return(new DocumentVector(documentVector));
        }
Example #5
        void Lemmatize(ref Story story)
        {
            ILemmatizer lmtz = new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.English);

            story.words = story.text.Split(
                new char[] { ' ', '\t', '\n', ',', '/', '\\', '?', '!', '<', '>', '\'', '|', ':', ';', ')', '(', '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' },
                StringSplitOptions.RemoveEmptyEntries);

            story.freq = new int[story.words.Length];

            for (int j = 0; j < story.words.Length; j++)
            {
                LemmatizeOne(lmtz, ref story.words[j]);
            }
        }
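
Several examples in this listing (Examples #5, #6, #9 and #13) call a LemmatizeOne helper that is never shown. A minimal sketch of what such a helper might look like, assuming it is just a thin wrapper over ILemmatizer.Lemmatize (the lower-casing mirrors what other examples here do before lemmatizing):

        // Hypothetical helpers, not part of the original listing.
        static string LemmatizeOne(ILemmatizer lmtz, string word)
        {
            // Other examples in this listing lower-case before lemmatizing, so do the same here.
            return lmtz.Lemmatize(word.ToLower());
        }

        // In-place variant matching the ref call in Example #5.
        static void LemmatizeOne(ILemmatizer lmtz, ref string word)
        {
            word = lmtz.Lemmatize(word.ToLower());
        }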
Example #6
        public string send2(string text)
        {
            string[] exampleWords = text.Split(
                new char[] { ' ', ',', '.', ')', '(' }, StringSplitOptions.RemoveEmptyEntries);

            ILemmatizer lmtz = new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.English);

            StringBuilder sb = new StringBuilder();

            foreach (string word in exampleWords)
            {
                sb.Append(LemmatizeOne(lmtz, word) + " ");
            }
            return(sb.ToString());
        }
Example #7
        public LanguageSupport(string language)
        {
            switch (language)
            {
            case "English":
                Lemmatizer = new LemmaSharp.LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.English);
                Stemmer    = new EnglishStemmer();
                break;

            case "Russian":
                Lemmatizer = new LemmaSharp.LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.Russian);
                Stemmer    = new RussianStemmer();
                break;

            default:
                throw new NotSupportedException("Language " + language + " is not supported.");
            }
        }
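
A hypothetical use of the constructor above, assuming Lemmatizer and Stemmer are accessible members of LanguageSupport:

            // Hypothetical usage, not from the original source.
            var support = new LanguageSupport("English");
            string lemma = support.Lemmatizer.Lemmatize("words");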
Example #8
        private string Preprocess(string term, LemmatizerPrebuiltCompact lemmatizer)
        {
            string result;

            char[] termCharArray = term.ToCharArray();
            //Keep only letters, digits and whitespace
            termCharArray = Array.FindAll <char>(termCharArray, (ch => (char.IsLetterOrDigit(ch) || char.IsWhiteSpace(ch))));
            //Then reduce the term to its letters only
            string input = new string(termCharArray.Where(char.IsLetter).ToArray());

            //string input = new string(termCharArray);
            //Remove newline character from a term
            input = input.Trim(new char[] { '\n' });
            //Make all words lowercase
            input = input.ToLower();
            //Lemmatize word
            result = lemmatizer.Lemmatize(input);

            return(result);
        }
Example #9
        public string send(string text)
        {
            string[] exampleWords = text.Split(
                new char[] { ' ', ',', '.', ')', '(' }, StringSplitOptions.RemoveEmptyEntries);

            ILemmatizer lmtz = new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.English);

            StringBuilder sb = new StringBuilder();

            foreach (string word in exampleWords)
            {
                sb.Append(LemmatizeOne(lmtz, word) + " ");
            }

            string finalstring = sb.ToString();


            var jarRoot         = @"E:\stanford-postagger-full-2015-12-09\stanford-postagger-full-2015-12-09";
            var modelsDirectory = jarRoot + @"\models";
            // Loading POS Tagger
            var tagger = new MaxentTagger(modelsDirectory + @"\wsj-0-18-bidirectional-nodistsim.tagger");

            // Text for tagging
            StringBuilder str = new StringBuilder();

            var sentences = MaxentTagger.tokenizeText(new java.io.StringReader(finalstring)).toArray();

            foreach (ArrayList sentence in sentences)
            {
                var    taggedSentence = tagger.tagSentence(sentence);
                string sent           = SentenceUtils.listToString(taggedSentence, false);

                String[] tokens = sent.Split(' ');
                for (int i = 0; i < tokens.Length; i++)
                {
                    if (tokens[i].Contains("/VB"))
                    {
                        str.Append(tokens[i] + " ");
                    }
                }
            }
            return(str.ToString());
        }
Example #10
    static string getArticleBody(string url)
    {
        try
        {
            HtmlAgilityPack.HtmlWeb      web = new HtmlWeb();
            HtmlAgilityPack.HtmlDocument doc = web.Load(url);
            Console.WriteLine(url);
            string text    = "";
            string stemmed = "";
            foreach (HtmlNode node in doc.DocumentNode.SelectNodes("//p/text()"))
            {
                text += Regex.Replace(node.InnerText, @"[^\w\s]", " ").ToLower();
            }

            text = Services.StopwordTool.RemoveStopwords(text);

            //Get just the list of words
            string[] toStemSplit = text.Split(
                new char[] { ' ', ',', '.', ')', '(' }, StringSplitOptions.RemoveEmptyEntries);

            //Load the Lemmatizer for English
            ILemmatizer lmtz = new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.English);

            foreach (string word in toStemSplit)
            {
                //Put the word in lower case;
                string wordLower = word.ToLower();
                //Lemmatize the word to get the stem
                string lemma = lmtz.Lemmatize(wordLower);
                //Add it to the output
                stemmed += lemma + " ";
            }

            //Console.WriteLine("The stemmed article\n\n\n" + stemmed);
            return(stemmed);
        }
        catch
        {
            return("ERROR");
        }
    }
Example #11
        static public List <Word> EqualWords(string[] str)
        {
            ILemmatizer lmtz  = new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.Russian);
            var words = (from s in str.AsParallel()
                         where s.Length > 2
                         group s by lmtz.Lemmatize(s.ToLower()) into d
                         let count = d.Count()
                         select new
                         {
                             num = count,
                             word = d.GroupBy(p => p.ToLower()).Where(p => count > 1),
                             num1 = d.GroupBy(p => p.ToLower()).Where(p => count > 1).Count()
                         }).OrderByDescending(p => p.num);
            List <Word> wordsList = new List <Word>();

            foreach (var y in words)
            {
                foreach (var s in y.word)
                {
                    wordsList.Add(new Word(s.Key, y.num, y.num1));
                }
            }
            return(wordsList);
        }
Example #12
        public LemmaSharp.LemmatizerPrebuiltCompact LemmaGenChoice(string LemmatizerDropdownSelection)
        {
            LemmaSharp.LemmatizerPrebuiltCompact Lemmatizer = new LemmatizerPrebuiltCompact(LanguagePrebuilt.English);

            switch (LemmatizerDropdownSelection)
            {
            case "Беларуская (Bulgarian)":
                Lemmatizer = new LemmaSharp.LemmatizerPrebuiltCompact(LanguagePrebuilt.Bulgarian); break;

            case "čeština (Czech)":
                Lemmatizer = new LemmatizerPrebuiltCompact(LanguagePrebuilt.Czech); break;

            case "English":
                Lemmatizer = new LemmatizerPrebuiltCompact(LanguagePrebuilt.English); break;

            case "Eesti (Estonian)":
                Lemmatizer = new LemmatizerPrebuiltCompact(LanguagePrebuilt.Estonian); break;

            case "فارسی (Persian)":
                Lemmatizer = new LemmatizerPrebuiltCompact(LanguagePrebuilt.Persian); break;

            case "français (French)":
                Lemmatizer = new LemmatizerPrebuiltCompact(LanguagePrebuilt.French); break;

            case "Magyar (Hungarian)":
                Lemmatizer = new LemmatizerPrebuiltCompact(LanguagePrebuilt.Hungarian); break;

            case "Македонски (Macedonian)":
                Lemmatizer = new LemmatizerPrebuiltCompact(LanguagePrebuilt.Macedonian); break;

            case "polski (Polish)":
                Lemmatizer = new LemmatizerPrebuiltCompact(LanguagePrebuilt.Polish); break;

            case "Română (Romanian)":
                Lemmatizer = new LemmatizerPrebuiltCompact(LanguagePrebuilt.Romanian); break;

            case "Pyccĸий (Russian)":
                Lemmatizer = new LemmatizerPrebuiltCompact(LanguagePrebuilt.Russian); break;

            case "Slovenčina (Slovak)":
                Lemmatizer = new LemmatizerPrebuiltCompact(LanguagePrebuilt.Slovak); break;

            case "Slovene":
                Lemmatizer = new LemmatizerPrebuiltCompact(LanguagePrebuilt.Slovene); break;

            case "Srpski / Српски (Serbian)":
                Lemmatizer = new LemmatizerPrebuiltCompact(LanguagePrebuilt.Serbian); break;

            case "Українська (Ukranian)":
                Lemmatizer = new LemmatizerPrebuiltCompact(LanguagePrebuilt.Ukrainian); break;

            case "EnglishMT":
                Lemmatizer = new LemmatizerPrebuiltCompact(LanguagePrebuilt.EnglishMT); break;

            case "françaisMT":
                Lemmatizer = new LemmatizerPrebuiltCompact(LanguagePrebuilt.FrenchMT); break;

            case "Deutsch (German)":
                Lemmatizer = new LemmatizerPrebuiltCompact(LanguagePrebuilt.German); break;

            case "italiano (Italian)":
                Lemmatizer = new LemmatizerPrebuiltCompact(LanguagePrebuilt.Italian); break;

            case "Español (Spanish)":
                Lemmatizer = new LemmatizerPrebuiltCompact(LanguagePrebuilt.Spanish); break;
            }

            return(Lemmatizer);
        }
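
The switch above repeats the same constructor call for every dropdown label. A sketch (not from the original source) of a more compact alternative that keeps the label-to-language mapping in a dictionary; only a few entries are shown and the labels are assumed to match the original cases:

        // Alternative sketch, not from the original source.
        private static readonly Dictionary <string, LanguagePrebuilt> DropdownLanguages =
            new Dictionary <string, LanguagePrebuilt>
        {
            { "English", LanguagePrebuilt.English },
            { "čeština (Czech)", LanguagePrebuilt.Czech },
            { "Deutsch (German)", LanguagePrebuilt.German },
            // ...remaining labels exactly as in the switch above
        };

        public LemmaSharp.LemmatizerPrebuiltCompact LemmaGenChoiceFromTable(string LemmatizerDropdownSelection)
        {
            LanguagePrebuilt language;

            if (!DropdownLanguages.TryGetValue(LemmatizerDropdownSelection, out language))
            {
                language = LanguagePrebuilt.English; // same fallback as the switch version
            }
            return new LemmatizerPrebuiltCompact(language);
        }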
        /// <summary>
        /// Only taking tokens of at least 3 chars.
        /// </summary>
        /// <param name="text"></param>
        /// <param name="threeshold"></param>
        /// <returns></returns>
        private static Dictionary <string, int> Tokenize(string text, int threeshold, string language)
        {
            Dictionary <string, int> WordCount = new Dictionary <string, int>();
            ILemmatizer lmtz = null;

            switch (language)
            {
            case "eng":
                lmtz = new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.English);
                break;

            case "fra":
                lmtz = new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.French);
                break;
            }

            text = text.Replace("\r\n", " ");
            Dictionary <string, int> entities = NlpHelper.GetNamedEntititesForText(text);

            LogHelper.Log("entities:" + entities.Count.ToString());
            string[] words = text.Split(new char[] { ' ', ',', '.', ')', '(' }, StringSplitOptions.RemoveEmptyEntries);

            for (int i = 0; i < words.Length; i++)
            {
                var word      = words[i].ToLowerInvariant();
                var LeftWord  = (i > 0) ? words[i - 1].ToLowerInvariant() : string.Empty;
                var RightWord = (i < (words.Length - 1)) ? words[i + 1].ToLowerInvariant() : string.Empty;
                if (word.Length < 3) //avoid unnecessary lemmatization
                {
                    continue;
                }

                string LeftBiGramKey  = string.Concat(LeftWord, " ", word);
                string RightBiGramKey = string.Concat(word, " ", RightWord);
                string TriGramKey     = string.Concat(LeftWord, " ", word, " ", RightWord);
                string NamedEntity    = null;

                if (entities.ContainsKey(word.ToLowerInvariant()))
                {
                    if (entities[word.ToLowerInvariant()] != 2)
                    {
                        NamedEntity = word;
                    }
                }
                else if (entities.ContainsKey(LeftBiGramKey))
                {
                    if (entities[LeftBiGramKey] != 2)
                    {
                        NamedEntity = string.Concat(LeftWord, " ", word);
                    }
                }
                else if (entities.ContainsKey(RightBiGramKey))
                {
                    if (entities[RightBiGramKey] != 2)
                    {
                        NamedEntity = string.Concat(word, " ", RightWord);
                    }
                }
                else if (entities.ContainsKey(TriGramKey))
                {
                    if (entities[TriGramKey] != 2)
                    {
                        NamedEntity = string.Concat(LeftWord, " ", word, " ", RightWord);
                    }
                }

                if (NamedEntity != null)
                {
                    if (!WordCount.ContainsKey(NamedEntity))
                    {
                        WordCount.Add(NamedEntity, 1);
                    }
                    else
                    {
                        WordCount[NamedEntity]++;
                    }
                }
                else
                {
                    string lemma = (lmtz != null) ? LemmatizeOne(lmtz, word) : word;

                    if (lemma.Length < 3) //ignore lemma of less than 3 characters
                    {
                        continue;
                    }

                    if (!WordCount.ContainsKey(lemma))
                    {
                        WordCount.Add(lemma, 1);
                    }
                    else
                    {
                        WordCount[lemma]++;
                    }
                }
            }
            Dictionary <string, int> ElligibleWords = WordCount.Where(
                w => w.Value >= threeshold).Select(w => new { w.Key, w.Value }).ToDictionary(w => w.Key, w => w.Value);

            return(ElligibleWords);
        }
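
A hypothetical call to the method above; articleText is a placeholder for whatever text the caller has loaded:

            // Hypothetical usage, not from the original source: keep terms occurring at least twice.
            Dictionary <string, int> counts = Tokenize(articleText, 2, "eng");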
Example #14
        static void Main(string[] args)
        {
            //Console.WriteLine("Input link: ");
            //String s = Console.ReadLine();
            //Console.WriteLine("Input Directory to save: ");
            //String d = Console.ReadLine();

            String startupPath = System.IO.Directory.GetCurrentDirectory();

            //startupPath = Path.Combine(startupPath, d);


            dsFiles = Path.Combine(startupPath, "Data");//Path to source files.
            //String dict = Path.Combine(startupPath,"LDict.txt");//Dictionary of lemmas

            /* STEP 1. PARSE HTML
             * ParserWorker<String[]> parser = new ParserWorker<String[]>(new NekdoParser());
             *
             * parser.SetSettings(new NekdoSettings(1,100));
             * parser.OnNewData += NewData;
             * parser.OnComplete += Complete;
             * dir = new DirectoryInfo(startupPath);
             * try{
             *  dir.Create();
             * }
             * catch(IOException){
             *  Console.WriteLine("This directory has already exist. Continue work with this directory");
             * }
             * parser.Start();
             * while(parser.IsActive()){//awaiting parser...
             *
             * }
             *
             *
             *
             * CreateIndexF(parser.GetUrls());
             */


            //STEP 2 STEMMING

            /*
             * TrainDataParser TDP = new TrainDataParser();
             *
             *
             * Lemmatization(TDP);
             *
             * Console.WriteLine("");
             */
            //STEP 3 CREATING INDEX.
            String indexFileP = Path.Combine(startupPath, "Indexer", "inventIndex.txt");

            Console.WriteLine("===STEP 3 ===");

            IndexBuilder builder = new IndexBuilder();

            Console.WriteLine("Source: {0} ", builder.Source);
            Console.WriteLine("Dest: {0}", indexFileP);


            LinkedDictionary <String, IndexEntry> indexer = builder.ReadData();//INDEX


            // UNCOMMENT FOR VECTOR RETRIEVAL (STEP 5)

            foreach (KeyValuePair <String, IndexEntry> p in indexer)
            {
                Double I = Math.Round(100.0 / p.Value.Ids.Count, 5);
                p.Value.IDF = I;//Math.Log(100.0/p.Value.Ids.Count, 10.0);

                foreach (Double prob in p.Value.Probs)
                {
                    p.Value.Weights.Add(prob * I); //tf(t,d)*idf(t,D) = tf-idf(t,d,D)
                }
                //String data = p.Key +" : "+ p.Value;
                //__CreateIFile(indexFileP, data);//read Data from indexer to file.
            }

            Console.WriteLine("Done.");



            IStemmer         stem   = new RussianStemmer();                                    //STEMMER
            BoolSyntaxParser bp     = new BoolSyntaxParser();                                  //PARSER OF BOOLEAN EXPRESSIONS
            ILemmatizer      lemmer = new LemmatizerPrebuiltCompact(LanguagePrebuilt.Russian); //LEMMATIZER.


            //STEP 4. BOOLEAN SEARCH BY(indexer)

            /*
             * while(true){
             *  Console.WriteLine("Input search str...");
             *  String ui = Console.ReadLine();
             *
             *  String[] u = ui.ToLower().Replace('ё','е').Split(new Char[]{' ' , ',', '.', ';', '-', ':','?','!','\"'},StringSplitOptions.RemoveEmptyEntries);
             *  LinkedStack<String> ui_w =  bp.GetInput(u);//GET EXPRESSION IN POLISH NOTATION
             *
             *  String[] ui_wa = ui_w.ToArray();//SAVE IT INTO ARRAY
             *  foreach(String it2 in ui_wa){
             *      Console.WriteLine(it2);
             *  }
             *  SimpleTextCrawler.Structures.LinkedList<Int32> idsOf = __GetIds(lemmer, indexer, ui_wa);
             *  __FindLinks(idsOf);
             *
             * }*/


            //STEP 5 Vector SEARCH BY(indexer).

            ArrayHeap <HeapEntry> PQ = new ArrayHeap <HeapEntry>(x => x.Relevance);//HEAP SORT.

            Console.WriteLine("VECTOR SEARCH...\n");
            while (true)
            {
                PQ.Clear();
                Console.WriteLine("Input search str...");
                String   ui    = Console.ReadLine();
                Double[] score = new Double[101];
                //Double[] lengths = new Double[101];//ST_C
                Double[] lengths = builder.GetLens();//ST_UC
                Double   q_w     = 0.0;
                String[] u       = ui.ToLower().Replace('ё', 'е').Split(new Char[] { ' ', ',', '.', ';', '-', ':', '?', '!', '\"' }, StringSplitOptions.RemoveEmptyEntries);
                foreach (String t in u)
                {
                    IndexEntry te;
                    if (indexer.TryGetValue(lemmer.Lemmatize(t), out te))
                    {
                        q_w += te.IDF * te.IDF;
                        Int32 i = 1;
                        foreach (Int32 id in te.Ids)
                        {
                            score[id] += te.Weights[i];
                            //lengths[id] += te.Probs[i]*te.Probs[i];//ST_C
                            i++;
                        }
                    }
                }
                q_w = Math.Sqrt(q_w);
                if (q_w == 0.0)
                {
                    Console.WriteLine("NOT FOUND");
                }
                else
                {
                    for (Int32 k = 1; k < 101; k++)
                    {
                        if (lengths[k - 1] == 0) //ST_C
                        {
                            continue;            //ST_C
                        }
                        //lengths[k] = lengths[k] > 0 ? Math.Sqrt(lengths[k]) : 1;//ST_C
                        //score[k] = score[k]/(lengths[k]*q_w);//ST_C
                        score[k] = score[k] / (lengths[k - 1] * q_w);// 0 /1 => 0.
                        if (score[k] == 0.0)
                        {
                            continue;
                        }
                        PQ.Add(new HeapEntry()
                        {
                            Relevance = 1d / score[k], Id = k
                        });                                                      //ASC ORDER
                    }
                    SimpleTextCrawler.Structures.LinkedList <Int32> docIds = new SimpleTextCrawler.Structures.LinkedList <Int32>();
                    Int32 KM = 5;
                    while (!PQ.IsEmpty() && KM > 0)
                    {
                        HeapEntry et = PQ.DeleteMin();
                        Console.WriteLine("{0} : {1} ", et.Id, 1d / et.Relevance);
                        docIds.Add(et.Id);
                        KM--;
                    }
                    Console.WriteLine("");
                    __FindLinksV(docIds);
                }
            }
        }
Example #15
        private Keywords getKeywords(string data, int count)
        {
            string paragraph = data;// "Simple computers are small enough to fit into mobile devices, and mobile computers can be powered by small batteries. Personal computers in their various forms are icons of the Information Age and are what most people think of as “computers.” However, the embedded computers found in many devices from MP3 players to fighter aircraft and from toys to industrial robots are the most numerous.";

            paragraph = paragraph.ToLower();
            string[] words = paragraph.Split(new char[] { ' ', ',', '.', '(', ')', '[', ']', '“', '”', '"', '\n', '!' }, StringSplitOptions.RemoveEmptyEntries);

            string[]      swords = words.Where(x => !stopWordTest(x)).ToArray();
            List <string> lwords = new List <string>();
            ILemmatizer   lemm   = new LemmatizerPrebuiltCompact(LanguagePrebuilt.English);

            foreach (string word in swords)
            {
                if (word.Length == 1)
                {
                    continue;
                }
                if (word.Length <= 3)
                {
                    //Console.WriteLine(word);
                    lwords.Add(word.ToLower());
                }
                else
                {
                    lwords.Add(lemm.Lemmatize(word));
                }
            }
            List <string> fwords = new List <string>();

            fwords = lwords.Where(x => !commonWordTest(x)).ToList();
            //remove keyword
            //
            string sptr = textBox1.Text;

            sptr = sptr.ToLower();
            // foreach (string sp in fwords)
            //   if (sp==sptr) fwords.Remove(sp);
            //
            for (int i = 0; i < fwords.Count; i++)
            {
                if (fwords[i].Equals(sptr))
                {
                    fwords.Remove(fwords[i]);
                }
            }

            Dictionary <string, int> finallist = new Dictionary <string, int>();
            var cwords = fwords.GroupBy(i => i);

            foreach (var w in cwords)
            {
                if (w.Count() > count)
                {
                    finallist.Add(w.Key, w.Count());
                    textBox2.AppendText(w.Key + ":  " + w.Count() + "\n");
                    Console.WriteLine("{0} {1}", w.Key, w.Count());
                }
            }

            Keywords keys = new Keywords();

            for (int i = 0; i < fwords.Count; i++)
            {
                if (finallist.ContainsKey(fwords[i]))
                {
                    keys.addOcc(fwords[i], i);
                }
            }
            keys.words.Sort(sortWordsCount);
            return(keys);
        }
Example #16
 public Normalizer()
 {
     lemmatizer = new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.English);
 }
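
Example #16 only shows the constructor. A minimal sketch of a method the class might pair with it, assuming the lemmatizer field set above; the method name, split characters, and use of System.Linq are assumptions:

     // Hypothetical sketch, not from the original source.
     public string Normalize(string text)
     {
         string[] parts = text.Split(new char[] { ' ', ',', '.', ')', '(' }, StringSplitOptions.RemoveEmptyEntries);
         return string.Join(" ", parts.Select(w => lemmatizer.Lemmatize(w.ToLower())));
     }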
Example #17
        private Dictionary <string, int> GetTermsDictionary(HtmlDocument doc)
        {
            Dictionary <string, int> dict = new Dictionary <string, int>();

            //var paragraphs = doc.DocumentNode.SelectNodes("//div[contains(@id, 'mw-content-text')]");
            //Get the contents
            var paragraphs = doc.DocumentNode.SelectNodes("//p");

            var contents       = doc.DocumentNode.SelectNodes("//div[contains(@class, 'mw-parser-output')]/p");
            var orderedLists   = doc.DocumentNode.SelectNodes("//div[contains(@class, 'mw-parser-output')]/ol/li");
            var unorderedLists = doc.DocumentNode.SelectNodes("//div[contains(@class, 'mw-parser-output')]/ul/li");
            //int count = 1;
            var lemmatizer = new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.English);

            if (contents != null)
            {
                foreach (var content in contents)
                {
                    //Console.WriteLine(content.InnerText);
                    string[] terms = content.InnerText.Split(new char[] { ' ', ',' }, StringSplitOptions.RemoveEmptyEntries);
                    foreach (var term in terms)
                    {
                        string input = Preprocess(term, lemmatizer);
                        //Don't count on stopwords
                        if (stopwords.Contains(input))
                        {
                            continue;
                        }
                        if (dict.ContainsKey(input))
                        {
                            dict[input] += 1;
                        }
                        else if (input.Length > 14)
                        {
                            continue;
                        }
                        else if (!string.IsNullOrEmpty(input))
                        {
                            dict.Add(input, 1);
                        }
                    }
                }
            }

            if (orderedLists != null)
            {
                foreach (var content in orderedLists)
                {
                    //Console.WriteLine(content.InnerText);
                    string[] terms = content.InnerText.Split(new char[] { ' ', ',' }, StringSplitOptions.RemoveEmptyEntries);
                    foreach (var term in terms)
                    {
                        string input = Preprocess(term, lemmatizer);
                        //Don't count on stopwords
                        if (stopwords.Contains(input))
                        {
                            continue;
                        }
                        if (dict.ContainsKey(input))
                        {
                            dict[input] += 1;
                        }
                        else if (input.Length > 14)
                        {
                            continue;
                        }
                        else if (!string.IsNullOrEmpty(input))
                        {
                            dict.Add(input, 1);
                        }
                    }
                }
            }

            if (unorderedLists != null)
            {
                foreach (var content in unorderedLists)
                {
                    //Console.WriteLine(content.InnerText);
                    string[] terms = content.InnerText.Split(new char[] { ' ', ',' }, StringSplitOptions.RemoveEmptyEntries);
                    foreach (var term in terms)
                    {
                        string input = Preprocess(term, lemmatizer);
                        //Don't count on stopwords
                        if (stopwords.Contains(input))
                        {
                            continue;
                        }
                        if (dict.ContainsKey(input))
                        {
                            dict[input] += 1;
                        }
                        else if (input.Length > 14)
                        {
                            continue;
                        }
                        else if (!string.IsNullOrEmpty(input))
                        {
                            dict.Add(input, 1);
                        }
                    }
                }
            }

            return(dict);
        }