Example #1
0
        /// <summary>
        /// Entry point. With zero or one argument every file returned by
        /// <c>FileTraverser.getFileList()</c> is processed; with three arguments a single
        /// file is processed. Any other argument count is reported as an error.
        /// </summary>
        /// <param name="args">
        /// Command-line arguments. <c>args[0]</c>, when present, is handed to <c>getLanguage</c>;
        /// <c>args[1]</c> and <c>args[2]</c> are forwarded to <c>processFile</c>
        /// (presumably input and output paths - confirm against <c>processFile</c>).
        /// A null array is treated like an empty one.
        /// </param>
        static void Main(string[] args)
        {
            // Compute the count once, null-safely, so every branch below agrees with the
            // null check used when selecting the language.
            int argCount = args == null ? 0 : args.Length;

            LanguagePrebuilt language = argCount > 0 ? getLanguage(args[0]) : getLanguage();

            ILemmatizer lemmatizer = new LemmatizerPrebuiltCompact(language);

            if (argCount == 0 || argCount == 1)
            {
                Console.WriteLine("Batch-processing all files contained in the subfolder 'lemma-source' into 'lemma-output'");

                string[] fileList = FileTraverser.getFileList();

                foreach (string file in fileList)
                {
                    processFile(file, lemmatizer);
                }
            }
            else if (argCount == 2)
            {
                // Two arguments are rejected: single-file mode needs both args[1] and args[2].
                Logger.logError("Missing argument");
            }
            else if (argCount == 3)
            {
                processFile(args[1], lemmatizer, args[2]);
            }
            else
            {
                Logger.logError("Argument count mismatch, expected max 3 received {0}", argCount.ToString());
            }
        }
 /// <summary>
 /// Splits the given text with <c>_splitRegex</c>, drops blank tokens, stop words and
 /// single-character tokens, lemmatizes what remains with the English model and wraps
 /// the resulting sequence in a <c>DocumentVector</c>.
 /// </summary>
 /// <param name="htmlResult">Text to vectorize.</param>
 /// <returns>A <c>DocumentVector</c> built from the lemmatized terms.</returns>
 private DocumentVector vectorizeDocument(String htmlResult)
 {
     LemmatizerPrebuiltCompact englishLemmatizer = new LemmatizerPrebuiltCompact(LanguagePrebuilt.English);

     // The sequence stays lazy: it is only enumerated by whoever consumes it.
     var terms = _splitRegex.Split(htmlResult)
         .Where(token => !String.IsNullOrWhiteSpace(token))
         .Where(token =>
         {
             // Stop-word and length filtering work on the lower-cased form.
             string lowered = token.ToLower();
             return !_stopWords.Contains(lowered) && lowered.Length > 1;
         })
         // NOTE(review): the lemmatizer is fed the original-case token, not the
         // lower-cased one used for filtering - confirm this is intended.
         .Select(token => englishLemmatizer.Lemmatize(token));

     return new DocumentVector(terms);
 }
Example #3
0
        /// <summary>
        /// Creates the prebuilt compact lemmatizer that corresponds to a language label.
        /// </summary>
        /// <param name="LemmatizerDropdownSelection">
        /// Label to resolve. It must match one of the case labels below exactly;
        /// any other value (including null) falls back to English.
        /// </param>
        /// <returns>A new lemmatizer for the resolved language.</returns>
        public LemmaSharp.LemmatizerPrebuiltCompact LemmaGenChoice(string LemmatizerDropdownSelection)
        {
            // Resolve the language first so that exactly one lemmatizer is constructed,
            // instead of building an English one up front and then discarding it.
            LanguagePrebuilt language;

            switch (LemmatizerDropdownSelection)
            {
            case "Беларуская (Bulgarian)":
                language = LanguagePrebuilt.Bulgarian; break;

            case "čeština (Czech)":
                language = LanguagePrebuilt.Czech; break;

            case "English":
                language = LanguagePrebuilt.English; break;

            case "Eesti (Estonian)":
                language = LanguagePrebuilt.Estonian; break;

            case "فارسی (Persian)":
                language = LanguagePrebuilt.Persian; break;

            case "français (French)":
                language = LanguagePrebuilt.French; break;

            case "Magyar (Hungarian)":
                language = LanguagePrebuilt.Hungarian; break;

            case "Македонски (Macedonian)":
                language = LanguagePrebuilt.Macedonian; break;

            case "polski (Polish)":
                language = LanguagePrebuilt.Polish; break;

            case "Română (Romanian)":
                language = LanguagePrebuilt.Romanian; break;

            case "Pyccĸий (Russian)":
                language = LanguagePrebuilt.Russian; break;

            case "Slovenčina (Slovak)":
                language = LanguagePrebuilt.Slovak; break;

            case "Slovene":
                language = LanguagePrebuilt.Slovene; break;

            case "Srpski / Српски (Serbian)":
                language = LanguagePrebuilt.Serbian; break;

            case "Українська (Ukranian)":
                language = LanguagePrebuilt.Ukrainian; break;

            case "EnglishMT":
                language = LanguagePrebuilt.EnglishMT; break;

            case "françaisMT":
                language = LanguagePrebuilt.FrenchMT; break;

            case "Deutsch (German)":
                language = LanguagePrebuilt.German; break;

            case "italiano (Italian)":
                language = LanguagePrebuilt.Italian; break;

            case "Español (Spanish)":
                language = LanguagePrebuilt.Spanish; break;

            default:
                // Unrecognized labels keep the previous behavior: English.
                language = LanguagePrebuilt.English; break;
            }

            return new LemmatizerPrebuiltCompact(language);
        }
        /// <summary>
        /// Counts the terms found in <paramref name="text"/> and returns those that occur at
        /// least <paramref name="threeshold"/> times. Tokens shorter than 3 characters, and
        /// lemmas shorter than 3 characters, are ignored. A token that matches a named entity
        /// (alone, or joined with its left and/or right neighbour) is counted under the entity
        /// text instead of its lemma.
        /// </summary>
        /// <param name="text">Text to tokenize. Null is treated as an empty string.</param>
        /// <param name="threeshold">Minimum number of occurrences a term needs to be returned.</param>
        /// <param name="language">
        /// "eng" selects the English lemmatizer and "fra" the French one; with any other value
        /// tokens are counted without lemmatization.
        /// </param>
        /// <returns>A dictionary mapping each retained term to its occurrence count.</returns>
        private static Dictionary<string, int> Tokenize(string text, int threeshold,string language)
        {
            ILemmatizer lmtz = null;
            switch (language)
            {
                case "eng":
                    lmtz = new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.English);
                    break;
                case "fra":
                    lmtz = new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.French);
                    break;
            }

            text = (text ?? string.Empty).Replace("\r\n", " ");
            Dictionary<string, int> entities = NlpHelper.GetNamedEntititesForText(text);
            LogHelper.Log("entities:" + entities.Count.ToString());

            // Lone '\n', '\r' and '\t' are separators too; otherwise words divided only by a
            // Unix line break or a tab would be fused into a single token.
            string[] words = text.Split(
                new char[] { ' ', ',', '.', ')', '(', '\n', '\r', '\t' },
                StringSplitOptions.RemoveEmptyEntries);

            Dictionary<string, int> wordCount = new Dictionary<string, int>();

            for (int i = 0; i < words.Length; i++)
            {
                string word = words[i].ToLowerInvariant();
                if (word.Length < 3) // avoid unnecessary lemmatization
                {
                    continue;
                }

                // Neighbours are empty strings at the edges of the text.
                string leftWord = (i > 0) ? words[i - 1].ToLowerInvariant() : string.Empty;
                string rightWord = (i < words.Length - 1) ? words[i + 1].ToLowerInvariant() : string.Empty;

                string leftBiGramKey = string.Concat(leftWord, " ", word);
                string rightBiGramKey = string.Concat(word, " ", rightWord);
                string triGramKey = string.Concat(leftWord, " ", word, " ", rightWord);

                // Only the first matching key is considered. A match whose value is 2 is not
                // treated as an entity (meaning of 2 is defined by NlpHelper - confirm there).
                string namedEntity = null;
                int entityKind;
                if (entities.TryGetValue(word, out entityKind))
                {
                    if (entityKind != 2)
                    {
                        namedEntity = word;
                    }
                }
                else if (entities.TryGetValue(leftBiGramKey, out entityKind))
                {
                    if (entityKind != 2)
                    {
                        namedEntity = leftBiGramKey;
                    }
                }
                else if (entities.TryGetValue(rightBiGramKey, out entityKind))
                {
                    if (entityKind != 2)
                    {
                        namedEntity = rightBiGramKey;
                    }
                }
                else if (entities.TryGetValue(triGramKey, out entityKind))
                {
                    if (entityKind != 2)
                    {
                        namedEntity = triGramKey;
                    }
                }

                string countKey = namedEntity;
                if (countKey == null)
                {
                    countKey = (lmtz != null) ? LemmatizeOne(lmtz, word) : word;

                    if (countKey.Length < 3) // ignore lemma of less than 3 characters
                    {
                        continue;
                    }
                }

                // A missing key leaves 'seen' at 0, so this both inserts and increments.
                int seen;
                wordCount.TryGetValue(countKey, out seen);
                wordCount[countKey] = seen + 1;
            }

            return wordCount
                .Where(entry => entry.Value >= threeshold)
                .ToDictionary(entry => entry.Key, entry => entry.Value);
        }
Example #5
0
        /// <summary>
        /// Extracts the frequent terms of <paramref name="data"/>: the text is lower-cased and
        /// split, stop words and single-character tokens are dropped, tokens longer than
        /// 3 characters are lemmatized (English), common words and the term currently in
        /// <c>textBox1</c> are removed, and every remaining term that occurs more than
        /// <paramref name="count"/> times is recorded. Each recorded term is also appended to
        /// <c>textBox2</c> and written to the console.
        /// </summary>
        /// <param name="data">Text to analyse.</param>
        /// <param name="count">A term is kept only when it occurs strictly more than this many times.</param>
        /// <returns>
        /// A <c>Keywords</c> instance that received one <c>addOcc</c> call per occurrence of each
        /// kept term (with its index in the filtered word list), sorted with <c>sortWordsCount</c>.
        /// </returns>
        private Keywords getKeywords(string data, int count)
        {
            string paragraph = data.ToLower();
            string[] words = paragraph.Split(new char[] { ' ', ',', '.', '(', ')', '[', ']', '“', '”', '"', '\n', '!' }, StringSplitOptions.RemoveEmptyEntries);

            string[] swords = words.Where(x => !stopWordTest(x)).ToArray();
            List<string> lwords = new List<string>();
            ILemmatizer lemm = new LemmatizerPrebuiltCompact(LanguagePrebuilt.English);
            foreach (string word in swords)
            {
                if (word.Length == 1)
                {
                    continue;
                }

                // Short words are kept verbatim (already lower-cased above); longer ones are lemmatized.
                lwords.Add(word.Length <= 3 ? word : lemm.Lemmatize(word));
            }

            List<string> fwords = lwords.Where(x => !commonWordTest(x)).ToList();

            // Remove every occurrence of the search term. RemoveAll is used because removing
            // inside a forward index loop skips the element that shifts into the freed slot,
            // which left consecutive duplicates behind.
            string sptr = textBox1.Text.ToLower();
            fwords.RemoveAll(w => w == sptr);

            Dictionary<string, int> finallist = new Dictionary<string, int>();
            foreach (var group in fwords.GroupBy(w => w))
            {
                int occurrences = group.Count();
                if (occurrences > count)
                {
                    finallist.Add(group.Key, occurrences);
                    textBox2.AppendText(group.Key + ":  " + occurrences + "\n");
                    Console.WriteLine("{0} {1}", group.Key, occurrences);
                }
            }

            Keywords keys = new Keywords();
            for (int i = 0; i < fwords.Count; i++)
            {
                if (finallist.ContainsKey(fwords[i]))
                {
                    keys.addOcc(fwords[i], i);
                }
            }
            keys.words.Sort(sortWordsCount);
            return keys;
        }