/// <summary>
/// Loads the lemmatizer engine on first use and caches its handle in <c>_hEngine</c>.
/// </summary>
/// <remarks>
/// The dictionary file "lemmatizer.db" is looked up in the process's current directory.
/// The returned handle is stored unchecked; while it is still <see cref="IntPtr.Zero"/>
/// every call attempts the load again.
/// </remarks>
private void LoadIfNeed()
{
    if (_hEngine == IntPtr.Zero)
    {
        string dictionaryPath = Path.Combine(Environment.CurrentDirectory, "lemmatizer.db");
        _hEngine = LemmatizatorEngine.sol_LoadLemmatizatorW(dictionaryPath, LemmatizatorEngine.LEME_DEFAULT);
    }
}
/// <summary>
/// Feeds every lemma of a native lemma list through <c>AddNormalWordToList</c>,
/// releases the native list, and returns the accumulated set as a list.
/// </summary>
/// <param name="hList">Handle of the native lemma list; ownership passes to this method, which frees it.</param>
/// <param name="buffer">Scratch buffer that receives each lemma; its capacity is passed to the engine as the buffer size.</param>
/// <param name="result">Set that accumulates the lemmas; it may already contain entries.</param>
/// <returns>A new list with the contents of <paramref name="result"/>.</returns>
private static List<string> AddLemmasToResult(IntPtr hList, StringBuilder buffer, HashSet<string> result)
{
    // Nothing to enumerate or free when the engine returned no list.
    if (hList == IntPtr.Zero)
    {
        return result.ToList();
    }

    try
    {
        int countLemmas = LemmatizatorEngine.sol_CountLemmas(hList);
        for (int i = 0; i < countLemmas; i++)
        {
            LemmatizatorEngine.sol_GetLemmaStringW(hList, i, buffer, buffer.Capacity);
            AddNormalWordToList(buffer, result);
        }
    }
    finally
    {
        // The list is native memory; without this call every lookup leaks it.
        LemmatizatorEngine.sol_DeleteLemmas(hList);
    }

    return result.ToList();
}
/// <summary>
/// Lemmatizes a whole phrase and returns the lemmas collected from it.
/// </summary>
/// <param name="sentence">Phrase handed to the engine.</param>
/// <param name="separator">Separator character passed to the engine; a space by default.</param>
/// <returns>
/// The collected lemmas without duplicates, or <c>null</c> if any exception was thrown
/// while loading the engine or lemmatizing.
/// </returns>
public List<string> GetSentenceLemmas(string sentence, char separator = ' ')
{
    // A parameterless StringBuilder has a capacity of 16, and that capacity is what the
    // engine receives as the buffer size, so longer lemmas could not fit.
    // 256 is presumably far above any real lemma length - adjust if the engine documents a limit.
    const int LemmaBufferCapacity = 256;

    try
    {
        LoadIfNeed();

        var result = new HashSet<string>();
        var buffer = new StringBuilder(LemmaBufferCapacity);

        IntPtr hList = LemmatizatorEngine.sol_LemmatizePhraseW(
            _hEngine, sentence, LemmatizatorEngine.LEME_DEFAULT, separator);

        return AddLemmasToResult(hList, buffer, result);
    }
    catch (Exception)
    {
        // Existing contract: failures are reported to the caller as null.
        return null;
    }
}
/// <summary>
/// Returns the lemmas of a single word: the engine's primary lemma plus every
/// entry of the engine's lemma list for that word.
/// </summary>
/// <param name="word">Word handed to the engine.</param>
/// <returns>
/// The collected lemmas without duplicates, or <c>null</c> if any exception was thrown
/// while loading the engine or lemmatizing.
/// </returns>
public List<string> GetWordLemmas(string word)
{
    // A parameterless StringBuilder has a capacity of 16, and that capacity is what the
    // engine receives as the buffer size, so longer lemmas could not fit.
    // 256 is presumably far above any real lemma length - adjust if the engine documents a limit.
    const int LemmaBufferCapacity = 256;

    try
    {
        LoadIfNeed();

        var result = new HashSet<string>();
        var buffer = new StringBuilder(LemmaBufferCapacity);

        // Primary lemma first, then the full list of alternatives.
        LemmatizatorEngine.sol_GetLemmaW(_hEngine, word, buffer, buffer.Capacity);
        AddNormalWordToList(buffer, result);

        IntPtr hList = LemmatizatorEngine.sol_GetLemmasW(_hEngine, word);
        return AddLemmasToResult(hList, buffer, result);
    }
    catch (Exception)
    {
        // Existing contract: failures are reported to the caller as null.
        return null;
    }
}
/// <summary>
/// Console sample: loads the lemmatizer dictionary, prints the lemma of one word,
/// prints every lemma of a second word, then releases the native objects.
/// </summary>
static void Main(string[] args)
{
    // Size, in characters, of the buffer handed to the native lemma getters.
    const int BufferChars = 32;

    // http://www.solarix.ru/api/en/sol_LoadLemmatizator.shtml
    // Alternative, loading the dictionary by bare file name:
    // IntPtr hEngine = LemmatizatorEngine.sol_LoadLemmatizatorW("lemmatizer.db",0);
    IntPtr engine = LemmatizatorEngine.sol_LoadLemmatizatorW(
        "..\\..\\..\\..\\..\\..\\..\\..\\bin-windows64\\lemmatizer.db", 0);

    if (engine == IntPtr.Zero)
    {
        Console.WriteLine("Error");
        return;
    }

    var text = new System.Text.StringBuilder();
    text.EnsureCapacity(BufferChars);

    // http://www.solarix.ru/api/en/sol_GetLemma.shtml
    LemmatizatorEngine.sol_GetLemmaW(engine, "галактическими", text, BufferChars);
    Console.WriteLine("lemma={0}\n", text.ToString());

    // http://www.solarix.ru/api/en/sol_GetLemmas.shtml
    IntPtr list = LemmatizatorEngine.sol_GetLemmasW(engine, "роем");
    if (list != IntPtr.Zero)
    {
        // http://www.solarix.ru/api/en/sol_CountLemmas.shtml
        int total = LemmatizatorEngine.sol_CountLemmas(list);

        int index = 0;
        while (index < total)
        {
            // Empty the buffer before it is refilled with the next lemma.
            text.Length = 0;

            // http://www.solarix.ru/api/en/sol_GetLemmaString.shtml
            LemmatizatorEngine.sol_GetLemmaStringW(list, index, text, BufferChars);
            Console.WriteLine("lemma[{0}]={1}", index, text.ToString());

            index++;
        }

        // http://www.solarix.ru/api/en/sol_DeleteLemmas.shtml
        LemmatizatorEngine.sol_DeleteLemmas(list);
    }

    // http://www.solarix.ru/api/en/sol_DeleteLemmatizator.shtml
    LemmatizatorEngine.sol_DeleteLemmatizator(engine);
}
/// <summary>
/// Turns <c>text</c> into a list of unique words with occurrence counts.
/// </summary>
/// <remarks>
/// Steps, in order: split <c>text</c> into runs of characters accepted by <c>isGood</c>;
/// optionally replace each word by its lemma; lower-case; collapse duplicates while
/// recording counts in <c>wordsCount</c>; optionally drop integer tokens; optionally
/// drop blacklisted words. <c>wordsTotal</c> is the token count before collapsing and
/// <c>wordsUnique</c> the count left at the end. Progress is reported as 17 after
/// tokenizing and as 17 + i * 83 / count after each lemmatized word.
/// </remarks>
public void process()
{
    // --- tokenize: a word is a maximal run of "good" characters ---
    bool insideWord = false;
    string pending = "";
    for (int pos = 0; pos < text.Length; pos++)
    {
        char ch = text[pos];
        bool good = isGood(ch);

        if (good)
        {
            // Starts a new word or extends the current one.
            pending = pending + ch;
        }
        else if (insideWord)
        {
            // The run just ended: emit the word and start over.
            wordsList.Add(pending);
            pending = "";
        }

        insideWord = good;
    }

    // Emit the trailing word if the text ended in the middle of one.
    bool tailUsable = true;
    foreach (char ch in pending)
    {
        if (!isGood(ch))
        {
            tailUsable = false;
        }
    }
    if (pending == "")
    {
        tailUsable = false;
    }
    if (tailUsable)
    {
        wordsList.Add(pending);
    }

    if (OnProgressUpdate != null)
    {
        OnProgressUpdate(17);
    }
    wordsTotal = wordsList.Count;

    // --- lemmatize: replace each word with the lemma returned by the engine ---
    if (lemmatize)
    {
        for (int w = 0; w < wordsList.Count; w++)
        {
            System.Text.StringBuilder lemmaBuffer = new System.Text.StringBuilder();
            lemmaBuffer.EnsureCapacity(32);
            LemmatizatorEngine.sol_GetLemmaW(hEngine, wordsList[w], lemmaBuffer, 32);
            wordsList[w] = lemmaBuffer.ToString();

            if (OnProgressUpdate != null)
            {
                OnProgressUpdate(17 + w * 83 / wordsList.Count);
            }
        }
    }

    // --- lower-case every word ---
    for (int w = 0; w < wordsList.Count; w++)
    {
        wordsList[w] = wordsList[w].ToLower();
    }

    // --- count: keep the first occurrence, remove later duplicates, record the tally ---
    for (int head = 0; head < wordsList.Count; head++)
    {
        int occurrences = 1;
        int probe = head + 1;
        while (probe < wordsList.Count)
        {
            if (wordsList[head] == wordsList[probe])
            {
                occurrences++;
                // Removal shifts the next candidate into this slot, so do not advance.
                wordsList.RemoveAt(probe);
            }
            else
            {
                probe++;
            }
        }
        wordsCount.Add(occurrences);
    }

    // --- drop tokens that parse as an int, together with their counts ---
    if (deletenumbers)
    {
        int idx = 0;
        while (idx < wordsList.Count)
        {
            int parsed;
            if (int.TryParse(wordsList[idx], out parsed))
            {
                wordsList.RemoveAt(idx);
                wordsCount.RemoveAt(idx);
            }
            else
            {
                idx++;
            }
        }
    }

    // --- drop blacklisted words, together with their counts ---
    if (blacklistEnabled)
    {
        int idx = 0;
        while (idx < wordsList.Count)
        {
            bool banned = false;
            for (int b = 0; b < blacklist.Count && !banned; b++)
            {
                banned = wordsList[idx] == blacklist[b];
            }

            if (banned)
            {
                wordsList.RemoveAt(idx);
                wordsCount.RemoveAt(idx);
            }
            else
            {
                idx++;
            }
        }
    }

    // --- final tally of distinct words that survived the filters ---
    wordsUnique = wordsList.Count;
}
/// <summary>
/// Creates a processor and attaches the lemmatizer dictionary.
/// </summary>
/// <remarks>
/// The dictionary is loaded by the relative name "lemmatizer.db" and the resulting
/// handle is stored in <c>hEngine</c> without being checked.
/// </remarks>
public Processor()
{
    const string DictionaryFile = "lemmatizer.db";
    const int LoadFlags = 0;

    hEngine = LemmatizatorEngine.sol_LoadLemmatizatorW(DictionaryFile, LoadFlags);
}