        private string NormalizeAndCreateAlphabet(string[] words, ILemmatizer lemmatizer, IStemmer stemmer, List<string> alphabet)
        {
            if (words == null) throw new ArgumentNullException(nameof(words));
            if (lemmatizer == null) throw new ArgumentNullException(nameof(lemmatizer));
            if (stemmer == null) throw new ArgumentNullException(nameof(stemmer));
            if (alphabet == null) throw new ArgumentNullException(nameof(alphabet));

            var preprocessedText = new StringBuilder();

            for (var i = 0; i < words.Length; i++)
            {
                var newWord = stemmer.Stem(lemmatizer.Lemmatize(words[i].ToLower())); // each word is lemmatized first, then stemmed
                words[i] = newWord;
                preprocessedText.Append(newWord);
                preprocessedText.Append(' ');

                if (!alphabet.Contains(newWord))
                {
                    alphabet.Add(newWord);            //creating alphabet of words in text
                }
            }

            alphabet.Sort();                          //the alphabet is sorted so another method can use it to convert the words in the text to numbers

            return(preprocessedText.ToString());
        }
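Because the returned alphabet is sorted, the word-to-number conversion mentioned in the final comment can be a plain binary search. A minimal sketch of that follow-up step, assuming it sits in the same class (the method name WordsToNumbers is invented for illustration):

        private static int[] WordsToNumbers(string[] normalizedWords, List<string> sortedAlphabet)
        {
            // Every entry of normalizedWords was added to the alphabet above, so BinarySearch
            // always finds it and returns its index in the sorted list.
            var numbers = new int[normalizedWords.Length];
            for (var i = 0; i < normalizedWords.Length; i++)
            {
                numbers[i] = sortedAlphabet.BinarySearch(normalizedWords[i]);
            }
            return numbers;
        }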
Example #2
 public TrainDataParser(String path)
 {
     this._stemmer = new RussianStemmer(); // Russian stemmer
     this._dPath   = path;
     this._s       = Path.Combine(System.IO.Directory.GetCurrentDirectory(), "LData");
     this.lm       = new LemmatizerPrebuiltCompact(LanguagePrebuilt.Russian); // the Russian prebuilt lemmatizer is used here
 }
Example #3
        private static string LemmatizeWord(ILemmatizer lmtz, string word)
        {
            string wordLower = word.ToLower();
            string lemma     = lmtz.Lemmatize(wordLower);

            return(lemma);
        }
Example #4
 public UploadFileHandler(
     ReaderAppContext dbContext,
     IAuthorizationService authService,
     IMapper mapper,
     IUserFileStore userFilesStore,
     IFileProcessedWordsCache filesWordsCache,
     ILemmatizer lemmatizer,
     IUserDictionary userDictionary,
     PipeBuilder pipeBuilder) : base(dbContext, authService, mapper) => (_userFilesStore, _filesWordsCache, _lemmatizer, _userDictionary, _pipeBuilder) = (userFilesStore, filesWordsCache, lemmatizer, userDictionary, pipeBuilder);
Example #5
        /// <summary>
        /// Initializes the current instance.
        /// </summary>
        /// <param name="lemmatizer">A lemmatizer.</param>
        /// <param name="listeners">An array of evaluation listeners.</param>
        public LemmatizerEvaluator(ILemmatizer lemmatizer, params IEvaluationMonitor <LemmaSample>[] listeners) : base(listeners)
        {
            if (lemmatizer == null)
            {
                throw new ArgumentNullException(nameof(lemmatizer));
            }

            this.lemmatizer = lemmatizer;
        }
Example #6
        private void LoadLemmatizer()
        {
            if (!alreadyLoadLemmatizer)
            {
                lmtz = new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.English);

                alreadyLoadLemmatizer = true;
            }
        }
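The flag-guarded load above is not thread-safe. If LoadLemmatizer can be reached from several threads, a Lazy<T> field gives the same deferred construction without the flag; a hedged alternative sketch (field and method names are invented):

        // Thread-safe deferred construction: the prebuilt English model is loaded on first access only.
        private static readonly Lazy<ILemmatizer> lazyLmtz =
            new Lazy<ILemmatizer>(() => new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.English));

        private static string LemmatizeLazily(string word) => lazyLmtz.Value.Lemmatize(word.ToLower());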
Example #7
        /// <summary>
        /// Initializes a new instance of the <see cref="LemmatizerAnalyzer"/> class.
        /// </summary>
        /// <param name="lemmatizer">The lemmatizer.</param>
        /// <exception cref="System.ArgumentNullException">
        /// <paramref name="lemmatizer"/>
        /// </exception>
        public LemmatizerAnalyzer(ILemmatizer lemmatizer)
        {
            if (lemmatizer == null)
            {
                throw new ArgumentNullException("lemmatizer");
            }

            this.lemmatizer = lemmatizer;
        }
Example #8
        private static string[] LemmatizePhrase(ILemmatizer lmtz, string phrase)
        {
            var words = phrase.Split(
                new char[] { ' ', ',', '.', ')', '(' }, StringSplitOptions.RemoveEmptyEntries);

            for (var i = 0; i < words.Length; i++)
            {
                words[i] = LemmatizeWord(lmtz, words[i]);
            }

            return(words);
        }
Example #9
        /// <summary>
        /// Lemmatizes each word of every phrase.
        /// </summary>
        /// <param name="lstPhrases">Phrases to lemmatize.</param>
        /// <param name="lmtz">Lemmatizer applied to each word.</param>
        /// <returns>The phrases with each word replaced by its lemma.</returns>
        private static List <string> Lemmatize(List <string> lstPhrases, ILemmatizer lmtz)
        {
            var lemmaListPhrases = new List <string[]>();

            foreach (var phrase in lstPhrases)
            {
                var words = phrase.Split(
                    new char[] { ' ', ',', '.', ')', '(' }, StringSplitOptions.RemoveEmptyEntries);
                lemmaListPhrases.Add(words);
            }

            foreach (var arrOfWords in lemmaListPhrases)
            {
                for (var i = 0; i < arrOfWords.Length; i++)
                {
                    arrOfWords[i] = LemmatizeWord(lmtz, arrOfWords[i]);
                }
            }

            return(lemmaListPhrases.Select(arr => string.Join(" ", arr)).ToList());
        }
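A hedged usage sketch for the helper above, assuming it is called from inside the same class with the prebuilt English LemmaSharp model:

        ILemmatizer lmtz = new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.English);
        var phrases = new List<string> { "running shoes", "cats and dogs" };
        List<string> lemmatized = Lemmatize(phrases, lmtz);
        // Each result is one space-joined string of lemmas per input phrase; the punctuation
        // used as separators (commas, periods, parentheses) is dropped along the way.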
Example #10
        /// <summary>
        /// Processes a single file specified by the path parameter. If targetPath is specified, the result is written to that location.
        /// If targetPath is not specified, the result path is built by replacing 'lemma-source' with 'lemma-output' in the original file path.
        /// </summary>
        /// <param name="path">Source file location.</param>
        /// <param name="lemmatizer">Lemmatizer from lemmagen that should process the file.</param>
        /// <param name="targetPath">Target file location and name.</param>
        private static void processFile(string path, ILemmatizer lemmatizer, string targetPath = null)
        {
            Console.WriteLine("Processing file {0}", new FileInfo(path).Name);

            string fileContent = readFile(path);

            if (fileContent == null)
            {
                return;
            }

            string[] wordList = prepareFileContent(fileContent);

            List <string> resultList = new List <string>();

            foreach (string word in wordList)
            {
                resultList.Add(processWord(word, lemmatizer));
            }

            writeOutputFile(resultList.ToArray(), path, targetPath);
        }
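A hedged call sketch for the method above; the paths are invented, and the lemma-source to lemma-output substitution is the behaviour described in the summary:

        ILemmatizer lemmatizer = new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.English);

        // Without targetPath the output lands in ...\lemma-output\article.txt (derived from the source path).
        processFile(@"C:\corpus\lemma-source\article.txt", lemmatizer);

        // An explicit targetPath overrides the derived location.
        processFile(@"C:\corpus\lemma-source\article.txt", lemmatizer, @"C:\corpus\out\article.lemmas.txt");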
Example #11
        public IDictionary<string, int> MakeDictionary(string text, ILemmatizer lemmatizer)
        {
            var dictionary = InitDictionary();
            var words = text.Split(_splitters, StringSplitOptions.RemoveEmptyEntries);

            foreach (var word in words)
            {
                int tmp;
                if (Int32.TryParse(word, out tmp))
                {
                    continue;
                }

                string lowerWord = lemmatizer.Lemmatize(word.ToLower().Trim());

                if (!dictionary.ContainsKey(lowerWord))
                    dictionary[lowerWord] = 1;
                else
                    dictionary[lowerWord] += 1;
            }

            return dictionary;
        }
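A hedged usage sketch; the containing class name is invented here, and _splitters and InitDictionary are assumed to be defined elsewhere in that class:

        ILemmatizer lemmatizer = new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.English);
        var counter = new WordCounter();   // hypothetical class that hosts MakeDictionary
        IDictionary<string, int> counts = counter.MakeDictionary("Cats chase a cat while 42 dogs watch", lemmatizer);
        // Purely numeric tokens such as "42" are skipped; the rest are lowercased and
        // lemmatized before counting, so "Cats" and "cat" should end up under one key.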
Example #12
        public async static Task <IEnumerable <LemmaGroup> > GroupLemmas(IEnumerable <string> words, ILemmatizer lemmatizer)
        {
            var map = new Dictionary <string, LemmaGroup>();

            foreach (var word in words)
            {
                var lemma = await lemmatizer.Lemmatize(word);

                if (!map.ContainsKey(lemma))
                {
                    map.Add(lemma, new LemmaGroup(lemma));
                }

                map[lemma].AddToGroup(word);
            }

            return(map.Values);
        }
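A hedged usage sketch; note that the ILemmatizer in this snippet is a different, awaitable interface (its Lemmatize returns a Task), so the call has to be awaited:

        IEnumerable<LemmaGroup> groups = await GroupLemmas(new[] { "running", "ran", "runs" }, lemmatizer);
        foreach (var group in groups)
        {
            // Each LemmaGroup holds one lemma plus the original word forms added via AddToGroup.
            Console.WriteLine(group);
        }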
Example #13
 /// <summary>
 /// this needs 32 bit version of project
 /// You can change it in project properties
 /// </summary>
 public AOTLemmatizer()
 {
     lemmatizerRu = new LemmatizerRussian();
     lemmatizerRu.LoadDictionariesRegistry();
 }
Example #14
        /// <summary>
        /// Only counts tokens of at least 3 characters; named-entity n-grams reported by NlpHelper are kept as single tokens.
        /// </summary>
        /// <param name="text">Text to tokenize.</param>
        /// <param name="threshold">Minimum number of occurrences a token needs in order to be kept.</param>
        /// <param name="language">"eng" or "fra"; any other value skips lemmatization.</param>
        /// <returns>Counts of the tokens that occur at least <paramref name="threshold"/> times.</returns>
        private static Dictionary <string, int> Tokenize(string text, int threshold, string language)
        {
            Dictionary <string, int> WordCount = new Dictionary <string, int>();
            ILemmatizer lmtz = null;

            switch (language)
            {
            case "eng":
                lmtz = new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.English);
                break;

            case "fra":
                lmtz = new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.French);
                break;
            }

            text = text.Replace("\r\n", " ");
            Dictionary <string, int> entities = NlpHelper.GetNamedEntititesForText(text);

            LogHelper.Log("entities:" + entities.Count.ToString());
            string[] words = text.Split(new char[] { ' ', ',', '.', ')', '(' }, StringSplitOptions.RemoveEmptyEntries);

            for (int i = 0; i < words.Length; i++)
            {
                var word      = words[i].ToLowerInvariant();
                var LeftWord  = (i > 0) ? words[i - 1].ToLowerInvariant() : string.Empty;
                var RightWord = (i < (words.Length - 1)) ? words[i + 1].ToLowerInvariant() : string.Empty;
                if (word.Length < 3) //avoid unnecessary lemmatization
                {
                    continue;
                }

                string LeftBiGramKey  = string.Concat(LeftWord, " ", word);
                string RightBiGramKey = string.Concat(word, " ", RightWord);
                string TriGramKey     = string.Concat(LeftWord, " ", word, " ", RightWord);
                string NamedEntity    = null;

                if (entities.ContainsKey(word.ToLowerInvariant()))
                {
                    if (entities[word.ToLowerInvariant()] != 2)
                    {
                        NamedEntity = word;
                    }
                }
                else if (entities.ContainsKey(LeftBiGramKey))
                {
                    if (entities[LeftBiGramKey] != 2)
                    {
                        NamedEntity = string.Concat(LeftWord, " ", word);
                    }
                }
                else if (entities.ContainsKey(RightBiGramKey))
                {
                    if (entities[RightBiGramKey] != 2)
                    {
                        NamedEntity = string.Concat(word, " ", RightWord);
                    }
                }
                else if (entities.ContainsKey(TriGramKey))
                {
                    if (entities[TriGramKey] != 2)
                    {
                        NamedEntity = string.Concat(LeftWord, " ", word, " ", RightWord);
                    }
                }

                if (NamedEntity != null)
                {
                    if (!WordCount.ContainsKey(NamedEntity))
                    {
                        WordCount.Add(NamedEntity, 1);
                    }
                    else
                    {
                        WordCount[NamedEntity]++;
                    }
                }
                else
                {
                    string lemma = (lmtz != null) ? LemmatizeOne(lmtz, word) : word;

                    if (lemma.Length < 3) //ignore lemma of less than 3 characters
                    {
                        continue;
                    }

                    if (!WordCount.ContainsKey(lemma))
                    {
                        WordCount.Add(lemma, 1);
                    }
                    else
                    {
                        WordCount[lemma]++;
                    }
                }
            }
            Dictionary <string, int> eligibleWords = WordCount.Where(
                w => w.Value >= threshold).ToDictionary(w => w.Key, w => w.Value);

            return(eligibleWords);
        }
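A hedged call sketch: count lemmatized unigrams (and any named-entity n-grams that NlpHelper reports) in an English text and keep those occurring at least twice; the input file name is invented:

        string articleText = File.ReadAllText("article.txt");
        Dictionary<string, int> frequentTerms = Tokenize(articleText, 2, "eng");
        // Language codes other than "eng"/"fra" fall through the switch, so tokens
        // are then counted without lemmatization.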
Example #15
 public TextUtil()
 {
     _lmtz = new LemmatizerPrebuiltCompact(LanguagePrebuilt.English);
 }
Example #16
 /// <summary>
 /// Converts a single word to lowercase and lemmatizes it
 /// </summary>
 /// <param name="word">Word to normalize.</param>
 /// <param name="lemmatizer">Lemmatizer used to produce the lemma.</param>
 /// <returns>The lowercased lemma.</returns>
 private static string processWord(string word, ILemmatizer lemmatizer)
 {
     return lemmatizer.Lemmatize(word.ToLower());
 }
Example #17
        /// <summary>
        /// Processes a single file specified by the path parameter. If targetPath is specified, the result is written to that location.
        /// If targetPath is not specified, the result path is built by replacing 'lemma-source' with 'lemma-output' in the original file path.
        /// </summary>
        /// <param name="path">Source file location.</param>
        /// <param name="lemmatizer">Lemmatizer from lemmagen that should process the file.</param>
        /// <param name="targetPath">Target file location and name.</param>
        private static void processFile(string path, ILemmatizer lemmatizer, string targetPath = null)
        {
            Console.WriteLine("Processing file {0}", new FileInfo(path).Name);

            string fileContent = readFile(path);

            if (fileContent == null)
                return;

            string[] wordList = prepareFileContent(fileContent);

            List<string> resultList = new List<string>();

            foreach(string word in wordList)
            {
                resultList.Add(processWord(word, lemmatizer));
            }

            writeOutputFile(resultList.ToArray(), path, targetPath);
        }
Example #18
        //Compute the IDs of matching documents by evaluating the boolean expression over per-term bit vectors.
        private static SimpleTextCrawler.Structures.LinkedList <Int32> __GetIds(ILemmatizer lemmer, LinkedDictionary <String, IndexEntry> indexer, String[] expr)
        {
            SimpleTextCrawler.Structures.LinkedList <Int32> IDS = new SimpleTextCrawler.Structures.LinkedList <Int32>();
            LinkedStack <Boolean[]> V = new LinkedStack <Boolean[]>();
            Int32 i = 0;

            while (i < expr.Length)
            {
                if (__isUnOperator(expr[i]))
                {
                    if (V.IsEmpty())
                    {
                        Console.WriteLine("Error in Expression");
                        return(IDS);
                    }
                    Boolean[] vi = V.Top();
                    V.Pop();
                    for (Int32 j = 1; j < 101; j++)
                    {
                        vi[j] = !(vi[j]);
                    }
                    V.Push(vi);
                }
                else if (__isOperator(expr[i]))
                {
                    if (V.Count < 2)
                    {
                        Console.WriteLine("Error in Expression");
                        return(IDS);
                    }
                    Boolean[] o1 = V.Top();
                    V.Pop();
                    Boolean[] o2 = V.Top();
                    V.Pop();
                    Boolean[] r = new Boolean[101];
                    switch (expr[i])
                    {
                    case "and": {
                        for (Int32 j = 1; j < 101; j++)
                        {
                            r[j] = o1[j] && o2[j];
                        }
                        break;
                    }

                    case "or": {
                        for (Int32 j = 1; j < 101; j++)
                        {
                            r[j] = o1[j] || o2[j];
                        }
                        break;
                    }

                    default: {    // any other binary operator token is treated like "and"
                        for (Int32 j = 1; j < 101; j++)
                        {
                            r[j] = o1[j] && o2[j];
                        }
                        break;
                    }
                    }
                    V.Push(r);
                }
                else
                {
                    //Console.WriteLine("Lemma: "+lemmer.Lemmatize(expr[i]));
                    V.Push(__GetBVector(indexer, lemmer.Lemmatize(expr[i])));
                    //Console.WriteLine("added");
                }
                i++;
            }
            if (V.IsEmpty())
            {
                Console.WriteLine("Error in Expression");
                return(IDS);
            }
            Boolean[] r_v = V.Top();
            V.Pop();
            for (Int32 d = 1; d < 101; d++)
            {
                if (r_v[d])
                {
                    IDS.Add(d);
                }
            }
            return(IDS);
        }
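The expression array is consumed in postfix (reverse Polish) order: ordinary tokens push the 100-slot document bit vector of their lemma, unary operators negate the top vector, and the binary operators "and"/"or" pop two vectors and push the combination. A hedged query sketch (lemmer and indexer are built elsewhere in this project):

        // Documents (IDs 1..100) whose index entries contain the lemmas of both query words.
        String[] expr = { "cats", "dogs", "and" };
        SimpleTextCrawler.Structures.LinkedList <Int32> ids = __GetIds(lemmer, indexer, expr);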
Example #19
 public static void Register(Language language, ILemmatizer lemmatizer) => _perLanguage[language] = lemmatizer;
Example #20
 public GetWordInfoHandler(
     ITranslationProvider translationProvider,
     IWordDefinitionProvider definitionProvider,
     ILemmatizer lemmatizer) => (_translationProvider, _definitionProvider, _lemmatizer) = (translationProvider, definitionProvider, lemmatizer);
Example #21
 public LemmaGen()
 {
     _lemmatizer = new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.English);
 }
Example #22
 public LemmaFilter(ILemmatizer lemmatizer) => _lemmatizer = lemmatizer;
Example #23
 public Morpho()
 {
     lemmatizerRus = new LemmatizerRussian();
     al = new ArrayList();
 }
Example #24
 /// <summary>
 /// Converts a single word to lowercase and lemmatizes it
 /// </summary>
 /// <param name="word">Word to normalize.</param>
 /// <param name="lemmatizer">Lemmatizer used to produce the lemma.</param>
 /// <returns>The lowercased lemma.</returns>
 private static string processWord(string word, ILemmatizer lemmatizer)
 {
     return(lemmatizer.Lemmatize(word.ToLower()));
 }
Example #25
 public Index(ILemmatizer lemmatizer, string filename = "index.json")
 {
     this.Lemmatizer = lemmatizer;
     this.filename = filename;
     LoadFromJson(filename);
 }
Example #26
 public LemmaGen(LemmaSharp.LanguagePrebuilt language)
 {
     _lemmatizer = new LemmatizerPrebuiltCompact(language);
 }
Example #27
        /// <summary>
        /// Initializes a new instance of the <see cref="LemmatizerAnalyzer"/> class.
        /// </summary>
        /// <param name="lemmatizer">The lemmatizer.</param>
        /// <exception cref="System.ArgumentNullException">
        /// <paramref name="lemmatizer"/>
        /// </exception>
        public LemmatizerAnalyzer(ILemmatizer lemmatizer) {
            if (lemmatizer == null)
                throw new ArgumentNullException("lemmatizer");

            this.lemmatizer = lemmatizer;
        }
Example #28
 public LemmatizerImplementation()
 {
     lemmatizer = new LemmaSharp.Classes.Lemmatizer(System.IO.File.OpenRead(
                                                        "Lemmas/full7z-mlteast-en.lem")
                                                    );
 }
Example #29
 public Morpher(ILemmatizer lemmatizer)
 {
     this.lemmatizer = lemmatizer;
 }
Example #30
File: Program.cs Project: svn2github/seman
        static void Main(string[] args)
        {
            var rmlPath = System.Environment.GetEnvironmentVariable("RML");

            Console.WriteLine("For test LemmatizerNET you need Lemmatizer dictionaries (RML)");
            Console.Write("\tRML directory (" + rmlPath + "): ");

            if (string.IsNullOrEmpty(rmlPath))
            {
                var newRmlPath = Console.ReadLine();
                if (!string.IsNullOrEmpty(newRmlPath))
                {
                    rmlPath = newRmlPath;
                }
            }

            //Console.Write("Select language 'R'-Russian, 'G'-German, 'E'-English (default - R): ");
            var           langStr = "R"; // Console.ReadLine().ToUpper(CultureInfo.InvariantCulture);
            MorphLanguage lang;

            switch (langStr)
            {
            case "":
            case "R":
                lang = MorphLanguage.Russian;
                break;

            case "G":
                lang = MorphLanguage.German;
                break;

            case "E":
                lang = MorphLanguage.English;
                break;

            default:
                Console.WriteLine("Wrong selection. Using default language Russian");
                lang = MorphLanguage.Russian;
                break;
            }
            ILemmatizer lem = LemmatizerFactory.Create(lang);
            string      rgt = "";

            try
            {
                StreamReader r = new StreamReader(rmlPath + @"\Dicts\Morph\" + langStr.ToLower() + "gramtab.tab", Encoding.GetEncoding(1251));
                rgt = r.ReadToEnd(); r.Close();
            }
            catch (Exception)
            {
                // gramtab could not be read; the grammeme decoding below will just produce empty strings.
            }
            try
            {
                var manager = FileManager.GetFileManager(rmlPath);
                lem.LoadDictionariesRegistry(manager);
            }
            catch (IOException e)
            {
                Console.WriteLine("Unable to load dictionaries due to the following:\r\n\t");
                Console.WriteLine(e.Message);
                return;
            }
            while (true)
            {
                Console.Write("\nInput word (q - exit): ");
                var word = Console.ReadLine().Replace("\"", "").Replace("'", "").Trim();
                if (word.Equals("q", StringComparison.InvariantCultureIgnoreCase))
                {
                    return;
                }
                //Allows decoding gram codes
                if (word.ToLower().Contains("g") || word.Contains("\t") || word.Contains("\a")) //m_gramcodes = 0x0322630c "абажай"
                {
                    string gc = Regex.Match(word, "[А-Яа-яёЁ]+").Groups[0].Value;
                    string r  = "";
                    for (int i = 0; i < gc.Length / 2; i++)
                    {
                        Console.WriteLine(Regex.Match(rgt, "^" + gc.Substring(2 * i, 2) + "[^а-яА-яЕё]*(.*)", RegexOptions.Multiline).Groups[1].Value.Replace("\r", ""));
                    }
                    Console.WriteLine("");
                    continue;
                }
                var paradigmList = lem.CreateParadigmCollectionFromForm(word, false, false);
                if (paradigmList.Count == 0)
                {
                    try //Allows decoding grammemes when a number is entered instead of a word
                    {
                        string[] g = Grammems;
                        if (word.StartsWith("f"))
                        {
                            word = word.Substring(1); g = Flags;
                        }                                                                  //decode the flags
                        UInt64 gr = Convert.ToUInt64(word);
                        for (int i = g.Length - 1; i > -1; i--)
                        {
                            if (((1uL << i) & gr) > 0)
                            {
                                Console.Write(g[i] + ",");
                            }
                        }
                        Console.WriteLine("");
                    }
                    catch (Exception)
                    {
                    }

                    Console.WriteLine("Paradigms not found");
                    continue;
                }
                string ancodes = "";
                for (var i = 0; i < paradigmList.Count; i++)
                {
                    var paradigm = paradigmList[i];

                    Console.Write("Paradigm: ");
                    rmlPath = paradigm.SrcAncode;
                    Console.WriteLine(paradigm.Norm);
                    int k = paradigm.GetAccent(0);
                    k = paradigm.SrcAccentedVowel;
                    Console.Write("\tFounded: ");
                    Console.WriteLine(paradigm.Founded);
                    Console.Write("\tParadigmID: ");
                    Console.WriteLine(paradigm.ParadigmID);
                    Console.Write("\tAccentModelNo: ");
                    Console.WriteLine(paradigm.AccentModelNo);
                    Console.WriteLine("=====");
                    Console.WriteLine("$type_grm = " + (paradigm.TypeAncode == "??" ? "" : Regex.Match(rgt, "^" + paradigm.TypeAncode + "[^а-яА-яёЁ]*([^\r]*)", RegexOptions.Multiline).Groups[1].Value));
                    ancodes += paradigm.TypeAncode;
                    for (var j = 0; j < paradigm.Count; j++)
                    {
                        ancodes += paradigm.GetAncode(j);
                        Console.Write("\t\t");
                        Console.Write(paradigm.GetAccent(j) == 255 ? paradigm.GetForm(j) : paradigm.GetForm(j).Insert(paradigm.GetAccent(j) + 1, "'"));
                        Console.Write("\t");
                        Console.WriteLine(Regex.Match(rgt, "^" + paradigm.GetAncode(j) + "[^а-яА-яЕё]*(.*)", RegexOptions.Multiline).Groups[1].Value.Replace("\r", ""));
                    }
                }
            }
        }