Пример #1
0
        /// <summary>
        /// Console demo: tokenizes a fixed English sentence and hands every token,
        /// together with an English prebuilt lemmatizer, to <c>LemmatizeOne</c>.
        /// </summary>
        /// <param name="args">Command-line arguments; not read.</param>
        static void Main(string[] args)
        {
            // Characters that separate words in the sample sentence.
            char[] delimiters = { ' ', ',', '.', ')', '(' };

            string exampleSentence =
                "On the other hand, inflectional paradigms, " +
                "or lists of inflected forms of typical words (such as sing, sang, " +
                "sung, sings, singing, singer, singers, song, songs, songstress, " +
                "songstresses in English) need to be analyzed according to criteria " +
                "for uncovering the underlying lexical stem.";

            string[] tokens = exampleSentence.Split(delimiters, StringSplitOptions.RemoveEmptyEntries);
            ILemmatizer lemmatizer = new LemmatizerPrebuiltFull(LanguagePrebuilt.English);

            Console.ForegroundColor = ConsoleColor.Green;
            Console.WriteLine("Example sentence lemmatized");
            Console.WriteLine("        WORD ==> LEMMA");

            for (int index = 0; index < tokens.Length; index++)
            {
                LemmatizeOne(lemmatizer, tokens[index]);
            }

            Console.ForegroundColor = ConsoleColor.White;
            Console.WriteLine("Press any key to continue...");

            // Blocks until the user presses Enter; the line read is discarded.
            Console.ReadLine();
        }
Пример #2
0
        /// <summary>
        /// Updates the glossary of <paramref name="system"/> with the words of <paramref name="s"/>.
        /// The text is split on punctuation/space, lower-cased, lemmatized (Russian prebuilt model),
        /// filtered against <c>stopWords</c> and de-duplicated. For each remaining word an existing
        /// glossary row gets its <c>WordValue</c> incremented; otherwise a new row with
        /// <c>WordValue = 1</c> is added.
        /// </summary>
        /// <param name="s">Text to process. Null or empty text is a no-op.</param>
        /// <param name="system">System collection whose glossary is updated.</param>
        private void AdditionToGlossary(string s, SystemCollection system)
        {
            if (string.IsNullOrEmpty(s))
            {
                return;
            }

            // NOTE(review): the prebuilt model is constructed on every call; consider caching it
            // in a field if this method is called often.
            ILemmatizer lmtz = new LemmatizerPrebuiltFull(LanguagePrebuilt.Russian);
            char[] separators = { '.', '?', '!', '(', ')', ',', ':', ';', ' ' };

            // Single pass: lemmatize, drop stop words, keep the first occurrence of each lemma.
            List<string> words = new List<string>();
            HashSet<string> seen = new HashSet<string>();
            foreach (string rawWord in s.Split(separators, StringSplitOptions.RemoveEmptyEntries))
            {
                string lemma = lmtz.Lemmatize(rawWord.ToLower());
                if (stopWords.Contains(lemma))
                {
                    continue;
                }
                if (seen.Add(lemma))
                {
                    words.Add(lemma);
                }
            }

            if (words.Count == 0)
            {
                return;
            }

            // The glossary of this system is loop-invariant: the words are distinct, so a row added
            // below can never match a later word. Query it once instead of once per word.
            var existingEntries = (from p in model.GlossarySystems
                                   where p.SystemCollectionId == system.Id
                                   select p).ToList();

            foreach (string word in words)
            {
                var match = existingEntries.FirstOrDefault(entry => entry.WordGlossary == word);
                if (match != null)
                {
                    // Find resolves to the context-tracked instance for this key, so the
                    // increment is persisted regardless of how the query above tracked it.
                    GlossarySystem tracked = model.GlossarySystems.Find(match.Id);
                    tracked.WordValue++;
                }
                else
                {
                    GlossarySystem created = new GlossarySystem();
                    created.WordGlossary       = word;
                    created.SystemCollectionId = system.Id;
                    created.WordValue          = 1;
                    created.SystemCollection   = system;
                    model.GlossarySystems.Add(created);
                }
            }

            // Persist all additions and increments in one round trip instead of one per word.
            model.SaveChanges();
        }
Пример #3
0
        /// <summary>
        /// Builds a TF-IDF style vector for <paramref name="s"/> over the glossary of
        /// <c>systemCollection</c>. The text is split on punctuation/space, lower-cased, lemmatized
        /// (Russian prebuilt model) and filtered against <c>stopWords</c>. For every glossary row,
        /// in query order, the weight is
        /// <c>(occurrences / totalWords) * ln(1 + size / WordValue)</c>, where <c>size</c> is the
        /// number of <c>TypeDemands</c> rows belonging to the system collection.
        /// </summary>
        /// <param name="s">Text to vectorize. Null is treated as empty text.</param>
        /// <returns>
        /// One weight per glossary row. The weight is 0 when the word does not occur, when the
        /// text has no usable words, or when the row's <c>WordValue</c> is not positive.
        /// </returns>
        private List<Double> token(string s)
        {
            ILemmatizer lmtz = new LemmatizerPrebuiltFull(LanguagePrebuilt.Russian);
            char[] separators = { '.', '?', '!', '(', ')', ',', ':', ';', ' ' };

            // Count each surviving lemma once up front instead of rescanning the word list
            // for every glossary row.
            Dictionary<string, int> occurrencesByLemma = new Dictionary<string, int>();
            int totalWords = 0;
            foreach (string rawWord in (s ?? string.Empty).Split(separators, StringSplitOptions.RemoveEmptyEntries))
            {
                string lemma = lmtz.Lemmatize(rawWord.ToLower());
                if (stopWords.Contains(lemma))
                {
                    continue;
                }

                totalWords++;
                if (lemma == null)
                {
                    // Counts toward the total but cannot be a dictionary key.
                    continue;
                }

                int seenSoFar;
                occurrencesByLemma.TryGetValue(lemma, out seenSoFar);
                occurrencesByLemma[lemma] = seenSoFar + 1;
            }

            int size = (from p in model.TypeDemands
                        where p.GroupDemand.SystemCollectionId == systemCollection.Id
                        select p).Count();

            var gos = (from p in model.GlossarySystems
                       where p.SystemCollectionId == systemCollection.Id
                       select p).ToList();

            List<Double> tfidf = new List<double>(gos.Count);
            foreach (var entry in gos)
            {
                double weight = 0;
                int occurrences;

                // Guards: an empty text used to yield 0/0 = NaN for every entry, and a zero
                // WordValue used to divide by zero.
                if (totalWords > 0
                    && entry.WordValue > 0
                    && entry.WordGlossary != null
                    && occurrencesByLemma.TryGetValue(entry.WordGlossary, out occurrences))
                {
                    double termFrequency = (double)occurrences / totalWords;

                    // Cast before dividing: with integral operands the ratio was truncated.
                    double inverseFrequency = Math.Log(1 + (double)size / entry.WordValue);
                    weight = termFrequency * inverseFrequency;
                }

                tfidf.Add(weight);
            }

            return tfidf;
        }
Пример #4
0
        /// <summary>
        /// Runs one pass: assigns a Russian prebuilt lemmatizer to <c>Lemmatizer</c>, calls
        /// <c>ClearFileAndInitializeQueue</c>, then awaits <c>ReadNewPage</c> for each URL taken
        /// from <c>LinksQueue</c> until <c>IsTimeToStop</c> returns true or the queue yields
        /// nothing. Afterwards it calls <c>WriteIndexToFile</c> and <c>CountIDF</c> and prints
        /// the elapsed milliseconds.
        /// </summary>
        /// <returns>A task that completes when the pass has finished.</returns>
        public async Task Run()
        {
            Lemmatizer = new LemmatizerPrebuiltFull(LanguagePrebuilt.Russian);

            // Timing starts after the lemmatizer has been constructed.
            var stopwatch = Stopwatch.StartNew();

            ClearFileAndInitializeQueue();

            while (true)
            {
                // The stop condition is checked first; nothing is dequeued once it holds.
                if (IsTimeToStop())
                {
                    break;
                }

                if (!LinksQueue.TryDequeue(out var nextUrl))
                {
                    break;
                }

                await ReadNewPage(nextUrl);
            }

            WriteIndexToFile();
            CountIDF();

            stopwatch.Stop();
            Console.WriteLine($"\n{stopwatch.ElapsedMilliseconds}");
        }
Пример #5
0
        /// <summary>
        /// Creates a new lemmatizer from the full prebuilt English model.
        /// </summary>
        /// <returns>A freshly constructed <c>LemmatizerPrebuiltFull</c> for English.</returns>
        private static Lemmatizer CreatePreBuiltLemmatizer()
        {
            return new LemmatizerPrebuiltFull(LanguagePrebuilt.English);
        }
Пример #6
0
 /// <summary>
 /// Extracts the words of <paramref name="iskalni_kljuc"/> with a regular expression,
 /// lower-cases and lemmatizes each one (Slovene prebuilt model) and returns them as a
 /// single string in which every lemma is wrapped in single quotes and followed by a comma,
 /// e.g. <c>'a','b',</c> (note the trailing comma).
 /// </summary>
 /// <param name="iskalni_kljuc">Search text. Null or empty text yields an empty string.</param>
 /// <returns>The quoted, comma-terminated lemmas, or an empty string when no word matches.</returns>
 public string LemmatizeTwo(string iskalni_kljuc)
 {
     if (string.IsNullOrEmpty(iskalni_kljuc))
     {
         return "";
     }

     Regex pattern = new Regex(@"([^\W_\d]([^\W_\d]|[-'\d](?=[^\W_\d|]))*[^\W_\d])", RegexOptions.IgnorePatternWhitespace);
     ILemmatizer lmtz = new LemmatizerPrebuiltFull(LemmaSharp.LanguagePrebuilt.Slovene);

     // StringBuilder avoids re-copying the whole result for every matched word.
     System.Text.StringBuilder leme_kljuc = new System.Text.StringBuilder();
     foreach (Match m in pattern.Matches(iskalni_kljuc))
     {
         string lemma = lmtz.Lemmatize(m.Groups[1].Value.ToLower());

         // NOTE(review): the pattern allows an apostrophe inside a word, so a lemma may contain
         // a single quote. If the result is spliced into SQL, the caller must escape it or use
         // parameters - confirm against the call sites.
         leme_kljuc.Append('\'').Append(lemma).Append("',");
     }

     return leme_kljuc.ToString();
 }
Пример #7
0
        /// <summary>
        /// Lemmatizes the words of <paramref name="vsebina"/> (Slovene prebuilt model), stores one
        /// row per distinct lemma with its occurrence count for document <paramref name="teme"/>
        /// in table <c>tf</c>, and then rebuilds table <c>df</c> from scratch with, for every
        /// lemma, the number of <c>tf</c> rows that contain it.
        /// </summary>
        /// <param name="teme">Document identifier written to <c>tf.Dokument</c>.</param>
        /// <param name="vsebina">Document text to lemmatize.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="teme"/> or <paramref name="vsebina"/> is null.
        /// </exception>
        public void Lematiziraj(string teme, string vsebina)
        {
            if (teme == null)
            {
                throw new ArgumentNullException("teme");
            }
            if (vsebina == null)
            {
                throw new ArgumentNullException("vsebina");
            }

            ILemmatizer lmtz = new LemmatizerPrebuiltFull(LemmaSharp.LanguagePrebuilt.Slovene);
            Regex pattern = new Regex(@"([^\W_\d]([^\W_\d]|[-'\d](?=[^\W_\d|]))*[^\W_\d])", RegexOptions.IgnorePatternWhitespace);

            // Occurrence count per lemma, plus first-seen order so rows are inserted in the
            // same order as before.
            List<string> lemmaOrder = new List<string>();
            Dictionary<string, int> lemmaCounts = new Dictionary<string, int>();

            // Anything already accumulated in the shared leme_string field is counted too,
            // as it was when the field was appended to directly.
            if (!string.IsNullOrEmpty(leme_string))
            {
                foreach (string pending in leme_string.Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries))
                {
                    CountLemmaOccurrence(pending, lemmaOrder, lemmaCounts);
                }
            }

            foreach (Match m in pattern.Matches(vsebina))
            {
                CountLemmaOccurrence(lmtz.Lemmatize(m.Groups[1].Value.ToLower()), lemmaOrder, lemmaCounts);
            }

            // Reset the shared scratch state, as the method always did.
            leme_string = "";
            besede.Clear();

            // NOTE(review): server and credentials are hard-coded in source; they should come
            // from configuration.
            using (MySqlConnection connection = new MySqlConnection("server=localhost;user id=keko;password=keko;database=feri;"))
            {
                connection.Open();

                string sql = "INSERT INTO tf(Lema,St_pojavitev,Dokument)VALUES (@lema,@st_pojavitev,@dokument)";
                foreach (string lemma in lemmaOrder)
                {
                    using (MySqlCommand cmd = new MySqlCommand(sql, connection))
                    {
                        cmd.Parameters.AddWithValue("@lema", lemma);
                        cmd.Parameters.AddWithValue("@st_pojavitev", lemmaCounts[lemma]);
                        cmd.Parameters.AddWithValue("@dokument", teme);
                        try
                        {
                            cmd.ExecuteNonQuery();
                        }
                        catch (MySqlException ex)
                        {
                            // Still best-effort (the remaining lemmas are inserted), but the
                            // failure is no longer silently discarded.
                            System.Diagnostics.Trace.TraceWarning(
                                "Insert into tf failed for lemma '{0}' in document '{1}': {2}",
                                lemma, teme, ex.Message);
                        }
                    }
                }

                // Read the per-lemma row counts fully before modifying df; the reader must be
                // closed before another command runs on this connection.
                List<KeyValuePair<string, int>> documentFrequencies = new List<KeyValuePair<string, int>>();
                using (MySqlCommand select = new MySqlCommand("SELECT COUNT(*) as C, Lema FROM tf GROUP BY Lema ORDER BY C Desc", connection))
                using (MySqlDataReader reader = select.ExecuteReader())
                {
                    while (reader.Read())
                    {
                        documentFrequencies.Add(new KeyValuePair<string, int>(reader.GetString(1), reader.GetInt32(0)));
                    }
                }

                using (MySqlCommand truncate = new MySqlCommand("TRUNCATE TABLE df", connection))
                {
                    truncate.ExecuteNonQuery();
                }

                string sql1 = "INSERT INTO df(Lema,DF)VALUES (@lema,@DF)";
                foreach (KeyValuePair<string, int> frequency in documentFrequencies)
                {
                    using (MySqlCommand insert = new MySqlCommand(sql1, connection))
                    {
                        insert.Parameters.AddWithValue("@lema", frequency.Key);
                        insert.Parameters.AddWithValue("@DF", frequency.Value);
                        insert.ExecuteNonQuery();
                    }
                }
            }
        }

        /// <summary>
        /// Records one occurrence of <paramref name="lemma"/>; null or empty lemmas are ignored.
        /// </summary>
        private static void CountLemmaOccurrence(string lemma, List<string> order, Dictionary<string, int> counts)
        {
            if (string.IsNullOrEmpty(lemma))
            {
                return;
            }

            int current;
            if (counts.TryGetValue(lemma, out current))
            {
                counts[lemma] = current + 1;
            }
            else
            {
                counts[lemma] = 1;
                order.Add(lemma);
            }
        }
Пример #8
0
        /// <summary>
        /// Console tool that reads the distinct word forms from table <c>Attrib72</c>, runs
        /// <c>ChecWord</c> over them in parallel, lets ten <c>LemmaTrainerWorker</c> instances
        /// produce lemma/word-form pairs from <c>wordForms</c>, and feeds every pair to a
        /// Russian prebuilt lemmatizer via <c>AddExample</c>.
        /// </summary>
        /// <param name="args">Command-line arguments; not read.</param>
        static void Main(string[] args)
        {
            String connectionString = "Persist Security Info=False;Integrated Security=true;Initial Catalog=;server=server28";

            List<String> freshWordForms = new List<String>();

            // Read the word forms from the database into freshWordForms.
            string queryString = "SELECT DISTINCT AttribValue FROM Attrib72 WHERE AttribValue IS NOT NULL;";
            using (SqlConnection conn = new SqlConnection(connectionString))
            using (SqlCommand command = new SqlCommand(queryString, conn))
            {
                conn.Open();

                // Command and reader are now disposed deterministically, not only through the
                // connection being closed.
                using (SqlDataReader reader = command.ExecuteReader())
                {
                    Console.WriteLine("Read word form = ###");

                    fnum = 0;
                    while (reader.Read())
                    {
                        freshWordForms.Add((String)reader[0]);
                        fnum++;
                        Console.Write("\r{0}", fnum);
                    }
                }
            }

            // Check the applicability of every word form.
            // presumably ChecWord fills wordForms with the accepted forms - verify against ChecWord
            reg = new Regex(positivePattern);
            pnum = 0;
            processed = 0;

            Console.WriteLine("\nWord validation\nProcessed ### from ### - added ###");
            Parallel.ForEach<String>(freshWordForms, ChecWord);

            // Lemmatize: distribute the word forms over the workers and collect the pairs.
            Console.WriteLine("\nPrepare pair lemma-wordform");

            const int numberOfWorkers = 10;
            ThreadController thrConrl = new ThreadController();

            for (int i = 0; i < numberOfWorkers; i++)
            {
                thrConrl.addWorker(new LemmaTrainerWorker(connectionString));
            }
            thrConrl.setData(wordForms);
            thrConrl.executeWorks();
            pairs = thrConrl.getResult();

            // The former single-threaded pair preparation (kept as commented-out code, building
            // SQL by string concatenation) was removed; it is available in version control.

            LemmatizerPrebuiltFull lemmatizer = new LemmatizerPrebuiltFull(LanguagePrebuilt.Russian);

            Console.WriteLine("\nLearning...");

            // foreach casts each element to Pair, exactly like the explicit cast it replaces.
            foreach (Pair pair in pairs)
            {
                lemmatizer.AddExample(pair.WordForm, pair.Lemma);
            }

            // NOTE(review): the lemmatizer is neither rebuilt, saved nor used after the examples
            // are added - confirm whether a build/serialize step is missing here.

            Console.ReadLine();
        }