/// <summary>
/// Normalizes every word (lemmatize, then stem) in place, builds a
/// space-separated normalized text, and extends <paramref name="alphabet"/>
/// with any previously unseen normalized words.
/// </summary>
/// <param name="words">Words to normalize; each entry is replaced by its normalized form.</param>
/// <param name="lemmatizer">Lemmatizer applied first to the lowercased word.</param>
/// <param name="stemmer">Stemmer applied to the lemmatized word.</param>
/// <param name="alphabet">Accumulator of distinct normalized words; sorted on exit so
/// downstream code can convert words to numeric indices.</param>
/// <returns>The normalized words joined by single spaces (with a trailing space).</returns>
/// <exception cref="ArgumentNullException">Any argument is null.</exception>
private string NormalizeAndCreateAlphabet(string[] words, ILemmatizer lemmatizer, IStemmer stemmer, List<string> alphabet)
{
    // Specific exception types instead of the bare Exception the old code threw;
    // ArgumentNullException derives from Exception, so existing catch blocks still work.
    if (words == null) throw new ArgumentNullException(nameof(words));
    if (lemmatizer == null) throw new ArgumentNullException(nameof(lemmatizer));
    if (stemmer == null) throw new ArgumentNullException(nameof(stemmer));
    if (alphabet == null) throw new ArgumentNullException(nameof(alphabet)); // was unchecked and would NRE below

    var preprocessedText = new StringBuilder();
    for (var i = 0; i < words.Length; i++)
    {
        // Firstly the words are lemmatized, then stemmed.
        var newWord = stemmer.Stem(lemmatizer.Lemmatize(words[i].ToLower()));
        words[i] = newWord;
        preprocessedText.Append(newWord);
        preprocessedText.Append(' ');

        // Contains expresses the intent directly (the old Find(...) == null scanned too).
        if (!alphabet.Contains(newWord))
        {
            alphabet.Add(newWord); // creating alphabet of words in text
        }
    }

    // Alphabet is sorted so other methods can use it to convert words to numbers in text.
    alphabet.Sort();
    return preprocessedText.ToString();
}
/// <summary>
/// Creates a parser for Russian training data located at <paramref name="path"/>.
/// </summary>
/// <param name="path">Root directory/file of the training data.</param>
public TrainDataParser(String path)
{
    this._dPath = path;
    // Working directory "LData" next to the executable.
    this._s = Path.Combine(System.IO.Directory.GetCurrentDirectory(), "LData");
    // Russian stemmer and the prebuilt Russian lemmatizer are used.
    this._stemmer = new RussianStemmer();
    this.lm = new LemmatizerPrebuiltCompact(LanguagePrebuilt.Russian);
}
/// <summary>Lowercases <paramref name="word"/> and returns its lemma.</summary>
private static string LemmatizeWord(ILemmatizer lmtz, string word)
{
    return lmtz.Lemmatize(word.ToLower());
}
/// <summary>
/// Wires up the upload-file handler with its storage, cache and
/// text-processing collaborators; base handles db/auth/mapping.
/// </summary>
public UploadFileHandler(
    ReaderAppContext dbContext,
    IAuthorizationService authService,
    IMapper mapper,
    IUserFileStore userFilesStore,
    IFileProcessedWordsCache filesWordsCache,
    ILemmatizer lemmatizer,
    IUserDictionary userDictionary,
    PipeBuilder pipeBuilder)
    : base(dbContext, authService, mapper)
{
    _userFilesStore = userFilesStore;
    _filesWordsCache = filesWordsCache;
    _lemmatizer = lemmatizer;
    _userDictionary = userDictionary;
    _pipeBuilder = pipeBuilder;
}
/// <summary>
/// Initializes the current instance.
/// </summary>
/// <param name="lemmatizer">A lemmatizer.</param>
/// <param name="listeners">An array of evaluation listeners.</param>
/// <exception cref="ArgumentNullException"><paramref name="lemmatizer"/> is null.</exception>
public LemmatizerEvaluator(ILemmatizer lemmatizer, params IEvaluationMonitor<LemmaSample>[] listeners) : base(listeners)
{
    // Same exception and parameter name as before, expressed as a throw expression.
    this.lemmatizer = lemmatizer ?? throw new ArgumentNullException(nameof(lemmatizer));
}
/// <summary>Lazily creates the prebuilt English lemmatizer on first call; later calls are no-ops.</summary>
private void LoadLemmatizer()
{
    if (alreadyLoadLemmatizer)
    {
        return; // already initialized
    }
    lmtz = new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.English);
    alreadyLoadLemmatizer = true;
}
/// <summary>
/// Initializes a new instance of the <see cref="LemmatizerAnalyzer"/> class.
/// </summary>
/// <param name="lemmatizer">The lemmatizer.</param>
/// <exception cref="System.ArgumentNullException">
/// <paramref name="lemmatizer"/>
/// </exception>
public LemmatizerAnalyzer(ILemmatizer lemmatizer)
{
    // nameof yields the same "lemmatizer" string the old literal passed.
    this.lemmatizer = lemmatizer ?? throw new ArgumentNullException(nameof(lemmatizer));
}
/// <summary>
/// Splits <paramref name="phrase"/> on common delimiters and lemmatizes each token.
/// </summary>
/// <returns>The lemmatized tokens, empty entries removed.</returns>
private static string[] LemmatizePhrase(ILemmatizer lmtz, string phrase)
{
    var separators = new char[] { ' ', ',', '.', ')', '(' };
    return phrase
        .Split(separators, StringSplitOptions.RemoveEmptyEntries)
        .Select(word => LemmatizeWord(lmtz, word))
        .ToArray();
}
/// <summary>
/// Lemmatize the phrases.
/// </summary>
/// <returns>Each phrase split on common delimiters, every token lemmatized,
/// then re-joined with single spaces.</returns>
private static List<string> Lemmatize(List<string> lstPhrases, ILemmatizer lmtz)
{
    var separators = new char[] { ' ', ',', '.', ')', '(' };
    return lstPhrases
        .Select(phrase => phrase.Split(separators, StringSplitOptions.RemoveEmptyEntries))
        .Select(tokens => string.Join(" ", tokens.Select(t => LemmatizeWord(lmtz, t))))
        .ToList();
}
/// <summary>
/// Processes single file specified in the path parameter. If targetPath is specified,
/// the result will be written in that location. If targetPath is not specified, result
/// path will be built by replacing 'lemma-source' by 'lemma-output' in the original file path.
/// </summary>
/// <param name="path">file location</param>
/// <param name="lemmatizer">Lemmatizer from lemmagen that should process the file</param>
/// <param name="targetPath">target file location and name</param>
private static void processFile(string path, ILemmatizer lemmatizer, string targetPath = null)
{
    Console.WriteLine("Processing file {0}", new FileInfo(path).Name);

    string fileContent = readFile(path);
    if (fileContent == null)
    {
        return; // unreadable file: nothing to do
    }

    // Lemmatize every word into a pre-sized result array.
    string[] wordList = prepareFileContent(fileContent);
    var results = new string[wordList.Length];
    for (var i = 0; i < wordList.Length; i++)
    {
        results[i] = processWord(wordList[i], lemmatizer);
    }
    writeOutputFile(results, path, targetPath);
}
/// <summary>
/// Builds a word-frequency dictionary from <paramref name="text"/>.
/// Tokens are split on the configured separators; tokens that parse as
/// integers are skipped; the rest are lowercased, trimmed and lemmatized
/// before being counted.
/// </summary>
/// <param name="text">Input text.</param>
/// <param name="lemmatizer">Lemmatizer used to normalize each token.</param>
/// <returns>Map from lemma to occurrence count, seeded by InitDictionary().</returns>
public IDictionary<string, int> MakeDictionary(string text, ILemmatizer lemmatizer)
{
    var dictionary = InitDictionary();
    var words = text.Split(_splitters, StringSplitOptions.RemoveEmptyEntries);
    foreach (var word in words)
    {
        // Skip tokens that are plain integers.
        int tmp;
        if (Int32.TryParse(word, out tmp))
        {
            continue;
        }
        string lowerWord = lemmatizer.Lemmatize(word.ToLower().Trim());
        // Single hash lookup instead of ContainsKey + indexer (double lookup);
        // TryGetValue leaves count at 0 for a missing key.
        int count;
        dictionary.TryGetValue(lowerWord, out count);
        dictionary[lowerWord] = count + 1;
    }
    return dictionary;
}
/// <summary>
/// Groups words by their lemma.
/// </summary>
/// <param name="words">Words to group.</param>
/// <param name="lemmatizer">Lemmatizer whose Lemmatize is awaited per word.</param>
/// <returns>One <see cref="LemmaGroup"/> per distinct lemma, each containing the words that map to it.</returns>
public async static Task<IEnumerable<LemmaGroup>> GroupLemmas(IEnumerable<string> words, ILemmatizer lemmatizer)
{
    var map = new Dictionary<string, LemmaGroup>();
    foreach (var word in words)
    {
        var lemma = await lemmatizer.Lemmatize(word);
        // Single lookup instead of ContainsKey + Add + indexer (three lookups).
        LemmaGroup group;
        if (!map.TryGetValue(lemma, out group))
        {
            group = new LemmaGroup(lemma);
            map.Add(lemma, group);
        }
        group.AddToGroup(word);
    }
    return map.Values;
}
/// <summary>
/// this needs 32 bit version of project
/// You can change it in project properties
/// </summary>
public AOTLemmatizer()
{
    // Russian AOT lemmatizer; dictionary location is taken from the registry
    // (LoadDictionariesRegistry) — presumably the RML installation; confirm on target machines.
    lemmatizerRu = new LemmatizerRussian();
    lemmatizerRu.LoadDictionariesRegistry();
}
/// <summary>
/// Only taking tokens of at least 3 chars.
/// Tokenizes <paramref name="text"/> into named entities and lemmas and counts
/// their occurrences, then keeps only tokens whose count reaches the threshold.
/// </summary>
/// <param name="text">Input text.</param>
/// <param name="threeshold">Minimum occurrence count a token needs to be kept.</param>
/// <param name="language">"eng" or "fra" enables lemmatization; any other value counts raw words.</param>
/// <returns>Token -&gt; count, restricted to counts &gt;= <paramref name="threeshold"/>.</returns>
private static Dictionary<string, int> Tokenize(string text, int threeshold, string language)
{
    Dictionary<string, int> WordCount = new Dictionary<string, int>();

    ILemmatizer lmtz = null;
    switch (language)
    {
        case "eng":
            lmtz = new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.English);
            break;
        case "fra":
            lmtz = new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.French);
            break;
    }

    text = text.Replace("\r\n", " ");

    // Named entities detected over the full text.
    // NOTE(review): entries whose value is 2 are skipped below — confirm the
    // meaning of that flag against NlpHelper.GetNamedEntititesForText.
    Dictionary<string, int> entities = NlpHelper.GetNamedEntititesForText(text);
    LogHelper.Log("entities:" + entities.Count.ToString());

    string[] words = text.Split(new char[] { ' ', ',', '.', ')', '(' }, StringSplitOptions.RemoveEmptyEntries);
    for (int i = 0; i < words.Length; i++)
    {
        var word = words[i].ToLowerInvariant();
        var LeftWord = (i > 0) ? words[i - 1].ToLowerInvariant() : string.Empty;
        var RightWord = (i < (words.Length - 1)) ? words[i + 1].ToLowerInvariant() : string.Empty;

        if (word.Length < 3) //avoid unnecessary lemmatization
        {
            continue;
        }

        string LeftBiGramKey = string.Concat(LeftWord, " ", word);
        string RightBiGramKey = string.Concat(word, " ", RightWord);
        string TriGramKey = string.Concat(LeftWord, " ", word, " ", RightWord);

        // Pick the first matching entity form around the current word; a flag of 2
        // suppresses the match. `word` is already lowercase, so the old repeated
        // ToLowerInvariant calls were redundant; TryGetValue replaces the old
        // ContainsKey + indexer double lookups.
        string NamedEntity = null;
        int entityFlag;
        if (entities.TryGetValue(word, out entityFlag))
        {
            if (entityFlag != 2)
            {
                NamedEntity = word;
            }
        }
        else if (entities.TryGetValue(LeftBiGramKey, out entityFlag))
        {
            if (entityFlag != 2)
            {
                NamedEntity = LeftBiGramKey;
            }
        }
        else if (entities.TryGetValue(RightBiGramKey, out entityFlag))
        {
            if (entityFlag != 2)
            {
                NamedEntity = RightBiGramKey;
            }
        }
        else if (entities.TryGetValue(TriGramKey, out entityFlag))
        {
            if (entityFlag != 2)
            {
                NamedEntity = TriGramKey;
            }
        }

        // Count either the named entity or the (possibly lemmatized) word.
        string token;
        if (NamedEntity != null)
        {
            token = NamedEntity;
        }
        else
        {
            string lemma = (lmtz != null) ? LemmatizeOne(lmtz, word) : word;
            if (lemma.Length < 3) //ignore lemma of less than 3 characters
            {
                continue;
            }
            token = lemma;
        }

        // Single-lookup counting instead of ContainsKey + indexer.
        int count;
        WordCount.TryGetValue(token, out count);
        WordCount[token] = count + 1;
    }

    // Keep only tokens meeting the threshold; the old anonymous-type projection
    // before ToDictionary was redundant.
    Dictionary<string, int> ElligibleWords = WordCount
        .Where(w => w.Value >= threeshold)
        .ToDictionary(w => w.Key, w => w.Value);
    return ElligibleWords;
}
/// <summary>Creates the text utility backed by the prebuilt compact English lemmatizer.</summary>
public TextUtil()
{
    _lmtz = new LemmatizerPrebuiltCompact(LanguagePrebuilt.English);
}
/// <summary>
/// Converts single word to lowercase and lemmatizes
/// </summary>
/// <param name="word">Word to normalize.</param>
/// <param name="lemmatizer">Lemmatizer applied to the lowercased word.</param>
/// <returns>The lemma of the lowercased word.</returns>
private static string processWord(string word, ILemmatizer lemmatizer)
    => lemmatizer.Lemmatize(word.ToLower());
/// <summary>
/// Processes single file specified in the path parameter. If targetPath is specified,
/// the result will be written in that location. If targetPath is not specified, result
/// path will be built by replacing 'lemma-source' by 'lemma-output' in the original file path.
/// </summary>
/// <param name="path">file location</param>
/// <param name="lemmatizer">Lemmatizer from lemmagen that should process the file</param>
/// <param name="targetPath">target file location and name</param>
private static void processFile(string path, ILemmatizer lemmatizer, string targetPath = null)
{
    Console.WriteLine("Processing file {0}", new FileInfo(path).Name);

    string fileContent = readFile(path);
    if (fileContent == null)
    {
        return; // nothing readable — skip this file
    }

    string[] wordList = prepareFileContent(fileContent);
    // Pre-sized list; each word is lemmatized in order.
    var resultList = new List<string>(wordList.Length);
    for (var i = 0; i < wordList.Length; i++)
    {
        resultList.Add(processWord(wordList[i], lemmatizer));
    }
    writeOutputFile(resultList.ToArray(), path, targetPath);
}
//COMPUTE BY VECTORS AND OPERATIONS IDS OF DOCUMENTS.
//
// Evaluates a boolean query given in postfix (RPN) order over per-term document
// bit-vectors. Each vector is a Boolean[101]; only indices 1..100 are used,
// i.e. document ids 1..100 — NOTE(review): collection size looks hard-coded to
// 100 documents, confirm. Operands are term vectors fetched via __GetBVector
// after lemmatizing the term; a unary operator complements, "and"/"or" combine,
// and any other binary operator falls back to "and". On a malformed expression
// an error is printed and the ids collected so far (empty) are returned.
private static SimpleTextCrawler.Structures.LinkedList<Int32> __GetIds(ILemmatizer lemmer, LinkedDictionary<String, IndexEntry> indexer, String[] expr)
{
    SimpleTextCrawler.Structures.LinkedList<Int32> IDS = new SimpleTextCrawler.Structures.LinkedList<Int32>();
    // Evaluation stack of document bit-vectors.
    LinkedStack<Boolean[]> V = new LinkedStack<Boolean[]>();
    Int32 i = 0;
    while (i < expr.Length)
    {
        if (__isUnOperator(expr[i]))
        {
            // Unary operator: complement the top vector in place.
            if (V.IsEmpty())
            {
                Console.WriteLine("Error in Expression");
                return (IDS);
            }
            Boolean[] vi = V.Top();
            V.Pop();
            for (Int32 j = 1; j < 101; j++)
            {
                vi[j] = !(vi[j]);
            }
            V.Push(vi);
        }
        else if (__isOperator(expr[i]))
        {
            // Binary operator: pop two operands, combine element-wise, push result.
            if (V.Count < 2)
            {
                Console.WriteLine("Error in Expression");
                return (IDS);
            }
            Boolean[] o1 = V.Top();
            V.Pop();
            Boolean[] o2 = V.Top();
            V.Pop();
            Boolean[] r = new Boolean[101];
            switch (expr[i])
            {
                case "and":
                {
                    for (Int32 j = 1; j < 101; j++)
                    {
                        r[j] = o1[j] && o2[j];
                    }
                    break;
                }
                case "or":
                {
                    for (Int32 j = 1; j < 101; j++)
                    {
                        r[j] = o1[j] || o2[j];
                    }
                    break;
                }
                default:
                {
                    // Unrecognized binary operator: treated the same as "and".
                    for (Int32 j = 1; j < 101; j++)
                    {
                        r[j] = o1[j] && o2[j];
                    }
                    break;
                }
            }
            V.Push(r);
        }
        else
        {
            // Operand: lemmatize the term and push its document bit-vector.
            V.Push(__GetBVector(indexer, lemmer.Lemmatize(expr[i])));
        }
        i++;
    }
    if (V.IsEmpty())
    {
        Console.WriteLine("Error in Expression");
        return (IDS);
    }
    // The final vector on the stack marks the matching documents; collect their ids.
    Boolean[] r_v = V.Top();
    V.Pop();
    for (Int32 d = 1; d < 101; d++)
    {
        if (r_v[d])
        {
            IDS.Add(d);
        }
    }
    return (IDS);
}
/// <summary>Registers (or replaces) the lemmatizer used for <paramref name="language"/>.</summary>
public static void Register(Language language, ILemmatizer lemmatizer)
{
    _perLanguage[language] = lemmatizer;
}
/// <summary>Wires up the handler's translation, definition and lemmatization providers.</summary>
public GetWordInfoHandler(
    ITranslationProvider translationProvider,
    IWordDefinitionProvider definitionProvider,
    ILemmatizer lemmatizer)
{
    _translationProvider = translationProvider;
    _definitionProvider = definitionProvider;
    _lemmatizer = lemmatizer;
}
/// <summary>Creates a wrapper around the prebuilt compact English lemmatizer.</summary>
public LemmaGen()
{
    _lemmatizer = new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.English);
}
/// <summary>Creates a filter that maps tokens through <paramref name="lemmatizer"/>.</summary>
public LemmaFilter(ILemmatizer lemmatizer)
{
    _lemmatizer = lemmatizer;
}
public Morpho()
{
    // Russian AOT lemmatizer; dictionaries are presumably loaded elsewhere —
    // no LoadDictionaries* call here, confirm against the callers.
    lemmatizerRus = new LemmatizerRussian();
    // NOTE(review): non-generic ArrayList — consider List<T> if the field's type can change.
    al = new ArrayList();
}
/// <summary>
/// Converts single word to lowercase and lemmatizes
/// </summary>
/// <param name="word">Word to normalize.</param>
/// <param name="lemmatizer">Lemmatizer applied to the lowercased word.</param>
/// <returns>The lemma of the lowercased word.</returns>
private static string processWord(string word, ILemmatizer lemmatizer)
{
    var lowered = word.ToLower();
    return lemmatizer.Lemmatize(lowered);
}
/// <summary>
/// Creates the index and immediately loads any previously persisted state.
/// </summary>
/// <param name="lemmatizer">Lemmatizer used by the index.</param>
/// <param name="filename">JSON file the index is persisted to.</param>
public Index(ILemmatizer lemmatizer, string filename = "index.json")
{
    this.Lemmatizer = lemmatizer;
    this.filename = filename;
    // NOTE(review): I/O in a constructor — LoadFromJson runs at construction time.
    LoadFromJson(filename);
}
/// <summary>Creates a wrapper around the prebuilt compact lemmatizer for the given language.</summary>
/// <param name="language">Prebuilt LemmaSharp language to load.</param>
public LemmaGen(LemmaSharp.LanguagePrebuilt language)
{
    _lemmatizer = new LemmatizerPrebuiltCompact(language);
}
/// <summary>
/// Initializes a new instance of the <see cref="LemmatizerAnalyzer"/> class.
/// </summary>
/// <param name="lemmatizer">The lemmatizer.</param>
/// <exception cref="System.ArgumentNullException">
/// <paramref name="lemmatizer"/>
/// </exception>
public LemmatizerAnalyzer(ILemmatizer lemmatizer)
{
    if (lemmatizer is null)
    {
        // nameof produces the same "lemmatizer" string the old literal did.
        throw new ArgumentNullException(nameof(lemmatizer));
    }
    this.lemmatizer = lemmatizer;
}
/// <summary>
/// Loads the English LemmaSharp model from <c>Lemmas/full7z-mlteast-en.lem</c>.
/// </summary>
public LemmatizerImplementation()
{
    // Dispose the FileStream once the model is read; the old code leaked the
    // stream returned by File.OpenRead. Assumes the Lemmatizer(Stream) ctor
    // fully consumes the stream during construction — confirm with LemmaSharp docs.
    using (var stream = System.IO.File.OpenRead("Lemmas/full7z-mlteast-en.lem"))
    {
        lemmatizer = new LemmaSharp.Classes.Lemmatizer(stream);
    }
}
/// <summary>Creates a morpher backed by the given lemmatizer.</summary>
public Morpher(ILemmatizer lemmatizer) => this.lemmatizer = lemmatizer;
/// <summary>
/// Interactive console for testing LemmatizerNET: loads RML dictionaries
/// (path from the RML environment variable or user input) and then, in a loop,
/// prints the morphological paradigms of each entered word. Special inputs:
/// "q" exits; words containing 'g', a tab or '\a' are treated as gramcode
/// strings to decode; a plain number (optionally prefixed with 'f') is decoded
/// as a grammeme/flag bitmask.
/// </summary>
static void Main(string[] args)
{
    var rmlPath = System.Environment.GetEnvironmentVariable("RML");
    Console.WriteLine("For test LemmatizerNET you need Lemmatizer dictionaries (RML)");
    Console.Write("\tRML directory (" + rmlPath + "): ");
    if (string.IsNullOrEmpty(rmlPath))
    {
        var newRmlPath = Console.ReadLine();
        if (!string.IsNullOrEmpty(newRmlPath))
        {
            rmlPath = newRmlPath;
        }
    }
    // Language selection is hard-wired to Russian; the prompt is kept for reference.
    //Console.Write("Select language 'R'-Russian, 'G'-German, 'E'-English (default - R): ");
    var langStr = "R"; // Console.ReadLine().ToUpper(CultureInfo.InvariantCulture);
    MorphLanguage lang;
    switch (langStr)
    {
        case "":
        case "R":
            lang = MorphLanguage.Russian;
            break;
        case "G":
            lang = MorphLanguage.German;
            break;
        case "E":
            lang = MorphLanguage.English;
            break;
        default:
            Console.WriteLine("Wrong selection. Using default language Russian");
            lang = MorphLanguage.Russian;
            break;
    }
    ILemmatizer lem = LemmatizerFactory.Create(lang);
    // Read the gramtab table (windows-1251 encoded) used to decode ancodes below;
    // a failure is deliberately ignored and decoding just yields empty matches.
    string rgt = "";
    try
    {
        StreamReader r = new StreamReader(rmlPath + @"\Dicts\Morph\" + langStr.ToLower() + "gramtab.tab", Encoding.GetEncoding(1251));
        rgt = r.ReadToEnd();
        r.Close();
    }
    catch (Exception e)
    {
    }
    try
    {
        var manager = FileManager.GetFileManager(rmlPath);
        lem.LoadDictionariesRegistry(manager);
    }
    catch (IOException e)
    {
        Console.WriteLine("Unable to load dictionaries due to the following:\r\n\t");
        Console.WriteLine(e.Message);
        return;
    }
    while (true)
    {
        Console.Write("\nInput word (q - exit): ");
        var word = Console.ReadLine().Replace("\"", "").Replace("'", "").Trim();
        if (word.Equals("q", StringComparison.InvariantCultureIgnoreCase))
        {
            return;
        }
        // Allows decoding gramcodes: each Cyrillic 2-char pair is looked up in gramtab.
        if (word.ToLower().Contains("g") || word.Contains("\t") || word.Contains("\a")) // e.g. m_gramcodes = 0x0322630c "абажай"
        {
            string gc = Regex.Match(word, "[А-Яа-яёЁ]+").Groups[0].Value;
            string r = "";
            for (int i = 0; i < gc.Length / 2; i++)
            {
                Console.WriteLine(Regex.Match(rgt, "^" + gc.Substring(2 * i, 2) + "[^а-яА-яЕё]*(.*)", RegexOptions.Multiline).Groups[1].Value.Replace("\r", ""));
            }
            Console.WriteLine("");
            continue;
        }
        var paradigmList = lem.CreateParadigmCollectionFromForm(word, false, false);
        if (paradigmList.Count == 0)
        {
            try // Allows decoding grammemes when a number was entered instead of a word.
            {
                string[] g = Grammems;
                if (word.StartsWith("f"))
                {
                    word = word.Substring(1);
                    g = Flags;
                } // decode flags instead of grammemes
                UInt64 gr = Convert.ToUInt64(word);
                // Print the name of every bit set in the mask, highest bit first.
                for (int i = g.Length - 1; i > -1; i--)
                {
                    if (((1uL << i) & gr) > 0)
                    {
                        Console.Write(g[i] + ",");
                    }
                }
                Console.WriteLine("");
            }
            catch (Exception)
            {
            }
            Console.WriteLine("Paradigms not found");
            continue;
        }
        string ancodes = "";
        for (var i = 0; i < paradigmList.Count; i++)
        {
            var paradigm = paradigmList[i];
            Console.Write("Paradigm: ");
            // NOTE(review): rmlPath is clobbered with the paradigm's ancode here —
            // looks like debugging leftover; it is only used as a regex source below.
            rmlPath = paradigm.SrcAncode;
            Console.WriteLine(paradigm.Norm);
            // NOTE(review): k is assigned twice and never read — debugging leftover.
            int k = paradigm.GetAccent(0);
            k = paradigm.SrcAccentedVowel;
            Console.Write("\tFounded: ");
            Console.WriteLine(paradigm.Founded);
            Console.Write("\tParadigmID: ");
            Console.WriteLine(paradigm.ParadigmID);
            Console.Write("\tAccentModelNo: ");
            Console.WriteLine(paradigm.AccentModelNo);
            Console.WriteLine("=====");
            // Decode the paradigm's type ancode against the gramtab text.
            Console.WriteLine("$type_grm = " + (paradigm.TypeAncode == "??" ? "" : Regex.Match(rgt, "^" + paradigm.TypeAncode + "[^а-яА-яёЁ]*([^\r]*)", RegexOptions.Multiline).Groups[1].Value));
            ancodes += paradigm.TypeAncode;
            for (var j = 0; j < paradigm.Count; j++)
            {
                ancodes += paradigm.GetAncode(j);
                Console.Write("\t\t");
                // Accent 255 means "no accent"; otherwise mark the accented vowel with '.
                Console.Write(paradigm.GetAccent(j) == 255 ? paradigm.GetForm(j) : paradigm.GetForm(j).Insert(paradigm.GetAccent(j) + 1, "'"));
                Console.Write("\t");
                Console.WriteLine(Regex.Match(rgt, "^" + paradigm.GetAncode(j) + "[^а-яА-яЕё]*(.*)", RegexOptions.Multiline).Groups[1].Value.Replace("\r", ""));
            }
        }
    }
}