/// <summary>
/// Console demo: splits a fixed English sentence into words and hands each word,
/// together with a prebuilt English lemmatizer, to <c>LemmatizeOne</c>
/// (defined elsewhere; presumably it prints the word/lemma pair).
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    const string exampleSentence =
        "On the other hand, inflectional paradigms, " +
        "or lists of inflected forms of typical words (such as sing, sang, " +
        "sung, sings, singing, singer, singers, song, songs, songstress, " +
        "songstresses in English) need to be analyzed according to criteria " +
        "for uncovering the underlying lexical stem.";

    // Spaces and the punctuation that occurs in the sentence act as word boundaries.
    char[] delimiters = { ' ', ',', '.', ')', '(' };
    string[] exampleWords = exampleSentence.Split(delimiters, StringSplitOptions.RemoveEmptyEntries);

    ILemmatizer englishLemmatizer = new LemmatizerPrebuiltFull(LanguagePrebuilt.English);

    Console.ForegroundColor = ConsoleColor.Green;
    Console.WriteLine("Example sentence lemmatized");
    Console.WriteLine(" WORD ==> LEMMA");

    for (int index = 0; index < exampleWords.Length; index++)
    {
        LemmatizeOne(englishLemmatizer, exampleWords[index]);
    }

    Console.ForegroundColor = ConsoleColor.White;
    Console.WriteLine("Press any key to continue...");
    Console.ReadLine();
}
/// <summary>
/// Records the vocabulary of one text in the glossary of <paramref name="system"/>.
/// The text is split on punctuation and spaces, each word is lower-cased and lemmatized
/// with the prebuilt Russian model, stop words are dropped, and every distinct remaining
/// lemma is either added to the glossary with <c>WordValue = 1</c> or, if the glossary
/// already has an entry for it, has that entry's <c>WordValue</c> incremented by one.
/// </summary>
/// <param name="s">Text whose words are added to the glossary.</param>
/// <param name="system">System collection that owns the glossary entries.</param>
private void AdditionToGlossary(string s, SystemCollection system)
{
    ILemmatizer lmtz = new LemmatizerPrebuiltFull(LanguagePrebuilt.Russian);
    char[] separators = { '.', '?', '!', '(', ')', ',', ':', ';', ' ' };

    // Distinct lemmas in order of first appearance, stop words removed.
    List<string> words = new List<string>();
    HashSet<string> seen = new HashSet<string>();
    foreach (string rawWord in s.Split(separators, StringSplitOptions.RemoveEmptyEntries))
    {
        string lemma = lmtz.Lemmatize(rawWord.ToLower());
        if (stopWords.Contains(lemma))
        {
            continue;
        }

        if (seen.Add(lemma))
        {
            words.Add(lemma);
        }
    }

    if (words.Count == 0)
    {
        return;
    }

    // Load this system's glossary once. The lemmas are distinct, so entries created
    // below can never match a later lemma and the snapshot stays valid for the whole loop.
    var existing = (from p in model.GlossarySystems
                    where p.SystemCollectionId == system.Id
                    select p).ToList();

    foreach (string word in words)
    {
        // First matching entry wins, as in a front-to-back scan of the query result.
        var match = existing.Find(entry => entry.WordGlossary == word);
        if (match != null)
        {
            GlossarySystem tracked = model.GlossarySystems.Find(match.Id);
            tracked.WordValue++;
        }
        else
        {
            GlossarySystem created = new GlossarySystem();
            created.WordGlossary = word;
            created.SystemCollectionId = system.Id;
            created.WordValue = 1;
            created.SystemCollection = system;
            model.GlossarySystems.Add(created);
        }
    }

    // Persist all additions and increments together instead of once per word.
    model.SaveChanges();
}
/// <summary>
/// Builds a TF-IDF style feature vector for <paramref name="s"/> against the glossary
/// of the current <c>systemCollection</c>. The text is split on punctuation and spaces,
/// lower-cased, lemmatized with the prebuilt Russian model and stripped of stop words.
/// For every glossary entry the result holds
/// <c>(occurrences / wordCount) * ln(1 + size / WordValue)</c>, where <c>size</c> is the
/// number of <c>TypeDemands</c> rows belonging to the system collection.
/// </summary>
/// <param name="s">Text to vectorize.</param>
/// <returns>
/// One value per glossary entry, in the order the glossary query returns them.
/// All values are 0 when the text contains no words after stop-word removal.
/// </returns>
private List<Double> token(string s)
{
    ILemmatizer lmtz = new LemmatizerPrebuiltFull(LanguagePrebuilt.Russian);
    char[] separators = { '.', '?', '!', '(', ')', ',', ':', ';', ' ' };

    List<string> wordsInText = new List<string>();
    foreach (string rawWord in s.Split(separators, StringSplitOptions.RemoveEmptyEntries))
    {
        string lemma = lmtz.Lemmatize(rawWord.ToLower());
        if (!stopWords.Contains(lemma))
        {
            wordsInText.Add(lemma);
        }
    }

    int size = (from p in model.TypeDemands
                where p.GroupDemand.SystemCollectionId == systemCollection.Id
                select p).Count();
    var gos = (from p in model.GlossarySystems
               where p.SystemCollectionId == systemCollection.Id
               select p).ToList();

    List<Double> tfidf = new List<double>(gos.Count);
    foreach (var entry in gos)
    {
        int occurrences = 0;
        foreach (string word in wordsInText)
        {
            if (word == entry.WordGlossary)
            {
                occurrences++;
            }
        }

        // An empty text would otherwise give 0/0 = NaN for every component.
        double termFrequency = wordsInText.Count == 0
            ? 0.0
            : (double)occurrences / wordsInText.Count;

        // Divide in floating point: an integer division here truncates the ratio
        // (and collapses it to 0 whenever WordValue exceeds size).
        double inverseFrequency = Math.Log(1 + (double)size / entry.WordValue);

        tfidf.Add(termFrequency * inverseFrequency);
    }

    return tfidf;
}
/// <summary>
/// Executes one full run: installs a prebuilt Russian lemmatizer, resets the output file
/// and link queue, processes queued URLs one at a time until <c>IsTimeToStop</c> reports
/// true or the queue is empty, then writes the index, computes IDF and prints the elapsed
/// time in milliseconds.
/// </summary>
/// <returns>A task that completes when the run has finished.</returns>
public async Task Run()
{
    Lemmatizer = new LemmatizerPrebuiltFull(LanguagePrebuilt.Russian);

    var stopwatch = Stopwatch.StartNew();
    ClearFileAndInitializeQueue();

    while (true)
    {
        // The stop condition is checked before a link is taken off the queue.
        if (IsTimeToStop())
        {
            break;
        }

        if (!LinksQueue.TryDequeue(out var nextUrl))
        {
            break;
        }

        await ReadNewPage(nextUrl);
    }

    WriteIndexToFile();
    CountIDF();

    stopwatch.Stop();
    Console.WriteLine($"\n{stopwatch.ElapsedMilliseconds}");
}
/// <summary>
/// Creates a lemmatizer backed by the prebuilt full English model.
/// </summary>
/// <returns>A new <c>LemmatizerPrebuiltFull</c> for English, typed as <c>Lemmatizer</c>.</returns>
private static Lemmatizer CreatePreBuiltLemmatizer()
{
    return new LemmatizerPrebuiltFull(LanguagePrebuilt.English);
}
/// <summary>
/// Lemmatizes every word of a search key with the prebuilt Slovene model and returns
/// the lemmas as a run of single-quoted, comma-terminated items, e.g. <c>'a','b',</c>.
/// </summary>
/// <param name="iskalni_kljuc">Search key text to lemmatize.</param>
/// <returns>
/// The quoted lemmas concatenated in match order, with a trailing comma after the last
/// one, or an empty string when the key contains no matching word.
/// </returns>
public string LemmatizeTwo(string iskalni_kljuc)
{
    // A word is at least two characters long, starts and ends with a letter, and may
    // contain '-', ''' or digits inside as long as the next character is a letter.
    var wordPattern = new Regex(@"([^\W_\d]([^\W_\d]|[-'\d](?=[^\W_\d|]))*[^\W_\d])", RegexOptions.IgnorePatternWhitespace);
    ILemmatizer sloveneLemmatizer = new LemmatizerPrebuiltFull(LemmaSharp.LanguagePrebuilt.Slovene);

    var quotedLemmas = new System.Text.StringBuilder();
    foreach (Match wordMatch in wordPattern.Matches(iskalni_kljuc))
    {
        string lowered = wordMatch.Groups[1].Value.ToLower();
        quotedLemmas.Append("'")
                    .Append(sloveneLemmatizer.Lemmatize(lowered))
                    .Append("',");
    }

    return quotedLemmas.ToString();
}
/// <summary>
/// Lemmatizes the words of a document with the prebuilt Slovene model, stores the
/// per-document occurrence count of every lemma in table <c>tf</c>, and then rebuilds
/// table <c>df</c> from <c>tf</c> (one row per lemma holding the number of <c>tf</c> rows
/// that mention it).
/// </summary>
/// <param name="teme">Document identifier written to the <c>Dokument</c> column.</param>
/// <param name="vsebina">Document text to lemmatize.</param>
/// <exception cref="ArgumentNullException"><paramref name="teme"/> is null.</exception>
public void Lematiziraj(string teme, string vsebina)
{
    if (teme == null)
    {
        throw new ArgumentNullException(nameof(teme));
    }

    // NOTE(review): the credentials are hard-coded in source; they should come from configuration.
    using (MySqlConnection connection = new MySqlConnection("server=localhost;user id=keko;password=keko;database=feri;"))
    {
        connection.Open();

        ILemmatizer lmtz = new LemmatizerPrebuiltFull(LemmaSharp.LanguagePrebuilt.Slovene);
        Regex pattern = new Regex(@"([^\W_\d]([^\W_\d]|[-'\d](?=[^\W_\d|]))*[^\W_\d])", RegexOptions.IgnorePatternWhitespace);

        // leme_string is a member shared with other code; whatever it already holds is
        // kept in front of the new lemmas, and it is reset once the list has been taken.
        System.Text.StringBuilder joined = new System.Text.StringBuilder(leme_string);
        foreach (Match m in pattern.Matches(vsebina))
        {
            joined.Append(lmtz.Lemmatize(m.Groups[1].Value.ToLower())).Append(',');
        }

        // Empty entries (including the one after the trailing comma) are dropped here
        // instead of relying on the empty string being the last distinct element.
        string[] leme = joined.ToString().Split(new[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
        leme_string = "";
        besede.Clear();

        // Term frequencies: one row per distinct lemma, in order of first appearance.
        const string insertTfSql = "INSERT INTO tf(Lema,St_pojavitev,Dokument)VALUES (@lema,@st_pojavitev,@dokument)";
        foreach (var occurrences in leme.GroupBy(lema => lema))
        {
            using (MySqlCommand cmd = new MySqlCommand(insertTfSql, connection))
            {
                cmd.Parameters.AddWithValue("@lema", occurrences.Key);
                cmd.Parameters.AddWithValue("@st_pojavitev", occurrences.Count());
                cmd.Parameters.AddWithValue("@dokument", teme);
                try
                {
                    cmd.ExecuteNonQuery();
                }
                catch (Exception ex)
                {
                    // Inserts stay best-effort (a failed row does not stop the others),
                    // but the failure is now reported instead of being discarded.
                    System.Diagnostics.Trace.TraceWarning(
                        "Insert into tf failed for lemma '{0}' in document '{1}': {2}",
                        occurrences.Key, teme, ex.Message);
                }
            }
        }

        // Document frequencies: read them completely before df is modified.
        List<int> C = new List<int>();
        List<string> Lema = new List<string>();
        using (MySqlCommand cmd1 = new MySqlCommand("SELECT COUNT(*) as C, Lema FROM tf GROUP BY Lema ORDER BY C Desc", connection))
        using (MySqlDataReader reader = cmd1.ExecuteReader())
        {
            while (reader.Read())
            {
                C.Add(reader.GetInt32(0));
                Lema.Add(reader.GetString(1));
            }
        }

        using (MySqlCommand cmd2 = new MySqlCommand("TRUNCATE TABLE df", connection))
        {
            cmd2.ExecuteNonQuery();
        }

        const string insertDfSql = "INSERT INTO df(Lema,DF)VALUES (@lema,@DF)";
        for (int i = 0; i < Lema.Count; i++)
        {
            using (MySqlCommand cmd3 = new MySqlCommand(insertDfSql, connection))
            {
                cmd3.Parameters.AddWithValue("@lema", Lema[i]);
                cmd3.Parameters.AddWithValue("@DF", C[i]);
                cmd3.ExecuteNonQuery();
            }
        }
    }
}
/// <summary>
/// Trains a Russian lemmatizer with lemma/word-form pairs taken from the database:
/// reads the distinct word forms, validates them in parallel with <c>ChecWord</c>,
/// lets a pool of <c>LemmaTrainerWorker</c> instances prepare the pairs, and feeds
/// every pair to <c>LemmatizerPrebuiltFull.AddExample</c>.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    String connectionString = "Persist Security Info=False;Integrated Security=true;Initial Catalog=;server=server28";

    // Read the distinct word forms from the database.
    List<String> freshWordForms = new List<String>();
    string queryString = "SELECT DISTINCT AttribValue FROM Attrib72 WHERE AttribValue IS NOT NULL;";
    using (SqlConnection conn = new SqlConnection(connectionString))
    using (SqlCommand command = new SqlCommand(queryString, conn))
    {
        conn.Open();
        using (SqlDataReader reader = command.ExecuteReader())
        {
            Console.WriteLine("Read word form = ###");
            fnum = 0;
            while (reader.Read())
            {
                freshWordForms.Add((String)reader[0]);
                fnum++;
                Console.Write("\r{0}", fnum);
            }
        }
    }

    // Check the applicability of every word form.
    // NOTE(review): ChecWord is defined elsewhere; presumably it fills wordForms — confirm.
    reg = new Regex(positivePattern);
    pnum = 0;
    processed = 0;
    Console.WriteLine("\nWord validation\nProcessed ### from ### - added ###");
    Parallel.ForEach<String>(freshWordForms, ChecWord);

    // Prepare the lemma/word-form pairs with a fixed number of workers.
    // An older commented-out variant that ran one ItemList query per word form on a
    // single connection was removed from here; it is available in version control.
    Console.WriteLine("\nPrepare pair lemma-wordform");
    int numberOfWorkers = 10;
    ThreadController controller = new ThreadController();
    for (int i = 0; i < numberOfWorkers; i++)
    {
        controller.addWorker(new LemmaTrainerWorker(connectionString));
    }
    controller.setData(wordForms);
    controller.executeWorks();
    pairs = controller.getResult();

    // Feed the pairs to the lemmatizer as training examples.
    // NOTE(review): nothing after AddExample rebuilds or saves the model in this method —
    // confirm whether a further step is missing.
    LemmatizerPrebuiltFull lemmatizer = new LemmatizerPrebuiltFull(LanguagePrebuilt.Russian);
    Console.WriteLine("\nLearning...");
    foreach (Pair pair in pairs)
    {
        lemmatizer.AddExample(pair.WordForm, pair.Lemma);
    }

    Console.ReadLine();
}