Пример #1
0
        /// <summary>
        ///     Recursively matches suffix candidates against the start of <paramref name="restOfWord"/>,
        ///     appending each accepted candidate to <paramref name="word"/>, and stores a copy of
        ///     <paramref name="word"/> in <paramref name="words"/> once nothing is left to match.
        /// </summary>
        /// <param name="word">Word under construction; suffixes are added and removed in place.</param>
        /// <param name="restOfWord">Text that still has to be matched.</param>
        /// <param name="words">Receives a copy of every word whose suffix chain consumed all the text.</param>
        /// <param name="checkTransition">
        ///     When true, a candidate is skipped unless the morphotactics report a transition from the id
        ///     of the word's last morpheme to the candidate's id.
        /// </param>
        private void GetPossibleWords(Word word, string restOfWord, IList <Word> words, bool checkTransition)
        {
            // Nothing left to consume: the current suffix chain is a complete result.
            if (restOfWord.Length == 0)
            {
                words.Add(Word.CopyOf(word));
                return;
            }

            // An empty candidate list simply means the loop below never runs.
            IList <KeyValuePair <string, Suffix> > candidates = GetPossibleFirstSuffixes(restOfWord);

            foreach (var candidate in candidates)
            {
                // Original author's note (translated): "We made a change here; no problems so far."
                var transitionExists = _lang.Morphotactics.HasTransition(word.Last.Morpheme.Id, candidate.Value.Id);
                if (checkTransition && !transitionExists)
                {
                    continue;
                }

                // Apply the candidate, recurse on the text after its key, then undo the change so the
                // next candidate starts from the same word.
                word.AddSuffix(candidate.Value);
                GetPossibleWords(word, restOfWord.Substring(candidate.Key.Length), words, checkTransition);
                word.RemoveLastSuffix();
            }
        }
Пример #2
0
        /// <summary>
        ///     Creates and returns a word with the specified "morpheme id" sequence.
        /// </summary>
        /// <param name="morphemes">
        ///     Morpheme identifiers: the first one is looked up as the root, every following one as a
        ///     suffix that is appended in order. Validated by <c>StringExtensions.ThrowIfNullAny</c>.
        /// </param>
        /// <returns>
        ///     The assembled word, or <c>null</c> when no identifier is supplied or when the root or any
        ///     suffix lookup returns <c>null</c>.
        /// </returns>
        public Word Generate(params string[] morphemes)
        {
            StringExtensions.ThrowIfNullAny(morphemes);

            // A params call without arguments produces an empty array; without this guard the root
            // lookup below would fail with an IndexOutOfRangeException.
            if (morphemes.Length == 0)
            {
                return null;
            }

            var root = GetRoot(morphemes[0]);

            if (root == null)
            {
                return null;
            }

            var word = new Word(root);

            // Everything after the root is treated as a suffix id; one unknown id aborts the generation.
            for (var i = 1; i < morphemes.Length; i++)
            {
                var suffix = GetSuffix(morphemes[i]);
                if (suffix == null)
                {
                    return null;
                }
                word.AddSuffix(suffix);
            }

            return word;
        }
Пример #3
0
        // Builds a word from the first root whose surface is rootWord and returns the result of
        // Word.AddSuffix when it is called with the suffix looked up by suffixId and the language Tr.
        public bool TestStrictGeneration(string rootWord, string suffixId)
        {
            var candidate = new Word(Tr.GetRootsHavingSurface(rootWord).First());
            var requestedSuffix = Tr.GetSuffix(suffixId);

            return candidate.AddSuffix(requestedSuffix, Tr);
        }
Пример #4
0
        // Creates a word on top of the first root with the given surface, then passes the suffix
        // looked up by suffixId together with the language Tr to Word.AddSuffix and returns its result.
        public bool TestStrictGeneration(string rootWord, string suffixId)
        {
            var firstRoot = Tr.GetRootsHavingSurface(rootWord).First();
            var added = new Word(firstRoot).AddSuffix(Tr.GetSuffix(suffixId), Tr);

            return added;
        }
Пример #5
0
        // Snapshots a fresh word built on the first "gel" root, calls AddSuffix with the language
        // supplied, and then requires the word and the snapshot to compare equal in both directions.
        public void TestStrictGeneration2()
        {
            var original = new Word(Tr.GetRootsHavingSurface("gel").First());
            var snapshot = Word.CopyOf(original);

            // The return value of AddSuffix is intentionally not inspected here, as in the original test.
            original.AddSuffix(Tr.GetSuffix("IC_COGUL_lAr"), Tr);

            Assert.True(snapshot.Equals(original));
            Assert.True(original.Equals(snapshot));
        }
Пример #6
0
        // Verifies that a word built on the first "gel" root still equals a copy taken beforehand
        // after AddSuffix has been called with the "IC_COGUL_lAr" suffix and the language Tr.
        public void TestStrictGeneration2()
        {
            var gelRoot = Tr.GetRootsHavingSurface("gel").First();
            var subject = new Word(gelRoot);
            var before = Word.CopyOf(subject);
            var pluralSuffix = Tr.GetSuffix("IC_COGUL_lAr");

            subject.AddSuffix(pluralSuffix, Tr);

            // Equality is checked in both directions.
            Assert.True(before.Equals(subject));
            Assert.True(subject.Equals(before));
        }
Пример #7
0
        /// <summary>
        ///     Recursively extends <paramref name="word"/> with every suffix that
        ///     <c>FindPossibleSuffixes</c> returns for its last morpheme and the remaining text, and adds
        ///     a copy of <paramref name="word"/> to <paramref name="words"/> when the text is used up
        ///     and the morphotactics report the last morpheme as terminal.
        /// </summary>
        /// <param name="word">Word under construction; suffixes are added and removed in place.</param>
        /// <param name="restOfWord">Text that still has to be matched.</param>
        /// <param name="words">Receives a copy of every accepted word.</param>
        private void GetPossibleWords(Word word, string restOfWord, IList <Word> words)
        {
            // Explore the longer chains first so results keep their original order in the list.
            foreach (var match in FindPossibleSuffixes(word.Last.Morpheme, restOfWord))
            {
                word.AddSuffix(match.Morpheme);
                GetPossibleWords(word, restOfWord.Substring(match.Surface.Length), words);
                word.RemoveLastSuffix();
            }

            if (restOfWord.Length != 0)
            {
                return;
            }

            if (!_lang.Morphotactics.IsTerminal(word.Last.Morpheme))
            {
                return;
            }

            // Store a copy: the caller keeps mutating this same instance.
            words.Add(Word.CopyOf(word));
        }
Пример #8
0
        // Builds a word on the first root with the given surface, appends a fixed chain of six
        // suffixes and returns the resulting surface. The word is stored in the `word` member, which is
        // declared outside this method.
        public string LerimdekilerdenTest(string rootWord)
        {
            Root root = tr.GetRootsHavingSurface(rootWord).First();
            word = new Word(root);

            // Suffix ids in the exact order they are appended.
            var suffixChain = new[]
            {
                "IC_COGUL_lAr",
                "IC_SAHIPLIK_BEN_(U)m",
                "IC_HAL_BULUNMA_DA",
                "IC_AITLIK_ki",
                "IC_COGUL_lAr",
                "IC_HAL_AYRILMA_DAn"
            };

            foreach (var suffixId in suffixChain)
            {
                word.AddSuffix(tr.GetSuffix(suffixId));
            }

            return word.GetSurface();
        }
Пример #9
0
        // Builds a word on the first root with the given surface, appends a fixed chain of six
        // suffixes in order and returns the surface of the result.
        public string TestGeneration(string rootWord)
        {
            var word = new Word(Tr.GetRootsHavingSurface(rootWord).First());

            // Suffix ids in the exact order they are appended.
            var suffixChain = new[]
            {
                "IC_COGUL_lAr",
                "IC_SAHIPLIK_BEN_(U)m",
                "IC_HAL_BULUNMA_DA",
                "IC_AITLIK_ki",
                "IC_COGUL_lAr",
                "IC_HAL_AYRILMA_DAn"
            };

            foreach (var suffixId in suffixChain)
            {
                word.AddSuffix(Tr.GetSuffix(suffixId));
            }

            return word.GetSurface();
        }
Пример #10
0
        // Generates a surface form: starts from the first root matching rootWord and appends six
        // suffixes, looked up by id, one after another.
        public string TestGeneration(string rootWord)
        {
            string[] orderedSuffixIds =
            {
                "IC_COGUL_lAr",
                "IC_SAHIPLIK_BEN_(U)m",
                "IC_HAL_BULUNMA_DA",
                "IC_AITLIK_ki",
                "IC_COGUL_lAr",
                "IC_HAL_AYRILMA_DAn"
            };

            var firstRoot = Tr.GetRootsHavingSurface(rootWord).First();
            var generated = new Word(firstRoot);

            for (var position = 0; position < orderedSuffixIds.Length; position++)
            {
                generated.AddSuffix(Tr.GetSuffix(orderedSuffixIds[position]));
            }

            return generated.GetSurface();
        }
Пример #11
0
        // Builds a six-suffix word on the first "kitap" root and checks that Word.CopyOf returns a
        // distinct instance with the same surface that compares equal to the source.
        public void TestCopyOf()
        {
            var word = new Word(Tr.GetRootsHavingSurface("kitap").First());

            // Suffix ids in the exact order they are appended.
            var suffixChain = new[]
            {
                "IC_COGUL_lAr",
                "IC_SAHIPLIK_BEN_(U)m",
                "IC_HAL_BULUNMA_DA",
                "IC_AITLIK_ki",
                "IC_COGUL_lAr",
                "IC_HAL_AYRILMA_DAn"
            };

            foreach (var suffixId in suffixChain)
            {
                word.AddSuffix(Tr.GetSuffix(suffixId));
            }

            var copy = Word.CopyOf(word);

            // Basic Equals sanity checks: equal to itself, not equal to null.
            Assert.True(word.Equals(word));
            Assert.False(word.Equals(null));

            // The copy is a different object with the same content.
            Assert.AreNotSame(word, copy);
            Assert.AreEqual(word.GetSurface(), copy.GetSurface());
            Assert.True(word.Equals(copy));
        }
Пример #12
0
        // Exercises Word.CopyOf: the copy of a word carrying six suffixes must be a separate instance,
        // expose the same surface and compare equal to the source word.
        public void TestCopyOf()
        {
            string[] orderedSuffixIds =
            {
                "IC_COGUL_lAr",
                "IC_SAHIPLIK_BEN_(U)m",
                "IC_HAL_BULUNMA_DA",
                "IC_AITLIK_ki",
                "IC_COGUL_lAr",
                "IC_HAL_AYRILMA_DAn"
            };

            var kitapRoot = Tr.GetRootsHavingSurface("kitap").First();
            var source = new Word(kitapRoot);

            for (var position = 0; position < orderedSuffixIds.Length; position++)
            {
                source.AddSuffix(Tr.GetSuffix(orderedSuffixIds[position]));
            }

            var duplicate = Word.CopyOf(source);

            // Equals must hold against itself and fail against null.
            Assert.True(source.Equals(source));
            Assert.False(source.Equals(null));

            // Different reference, same surface, equal by value.
            Assert.AreNotSame(source, duplicate);
            Assert.AreEqual(source.GetSurface(), duplicate.GetSurface());
            Assert.True(source.Equals(duplicate));
        }
Пример #13
0
        /// <summary>
        ///     Recursively matches suffix candidates against the start of <paramref name="restOfWord"/>,
        ///     appending each accepted candidate to <paramref name="word"/>, and stores a new
        ///     <c>Word</c> constructed from <paramref name="word"/> in <paramref name="words"/> once
        ///     nothing is left to match.
        /// </summary>
        /// <param name="word">Word under construction; suffixes are added and removed in place.</param>
        /// <param name="restOfWord">Text that still has to be matched.</param>
        /// <param name="words">Receives one new word per suffix chain that consumed all the text.</param>
        /// <param name="checkTransition">
        ///     When true, a candidate is skipped unless the morphotactics report a transition from the id
        ///     of the word's last morpheme to the candidate's id.
        /// </param>
        private void GetPossibleWords(Word word, string restOfWord, IList<Word> words, bool checkTransition)
        {
            // Nothing left to consume: the current suffix chain is a complete result.
            if (restOfWord.Length == 0)
            {
                words.Add(new Word(word));
                return;
            }

            // An empty candidate list simply means the loop below never runs.
            IList<KeyValuePair<string, Suffix>> candidates = GetPossibleFirstSuffixes(restOfWord);

            foreach (var candidate in candidates)
            {
                // Original author's note (translated): "We made a change here; no problems so far."
                var transitionExists = _lang.Morphotactics.HasTransition(word.Last.Morpheme.Id, candidate.Value.Id);
                if (checkTransition && !transitionExists)
                {
                    continue;
                }

                // Apply the candidate, recurse on the text after its key, then undo the change so the
                // next candidate starts from the same word.
                word.AddSuffix(candidate.Value);
                GetPossibleWords(word, restOfWord.Substring(candidate.Key.Length), words, checkTransition);
                word.RemoveLastSuffix();
            }
        }
Пример #14
0
        /// <summary>
        ///     Demo entry point: builds a word on the first root with surface "gel", tries to add the
        ///     suffix "IC_COGUL_lAr" while passing the language to AddSuffix, prints the outcome and
        ///     finally calls <c>Test()</c>.
        /// </summary>
        private static void Main()
        {
            // The commented-out scratch experiments that used to precede this code (corpus resolution
            // dumps to local files, benchmark calls, an analysis/stemming demo) never executed and
            // are no longer kept here.
            var turkish = Language.Turkish;
            var root = turkish.GetRootsHavingSurface("gel").First();
            var word = new Word(root);

            var suffixAccepted = word.AddSuffix(turkish.GetSuffix("IC_COGUL_lAr"), turkish);
            if (!suffixAccepted)
            {
                Console.WriteLine("Adding the suffix IC_COGUL_lAr after a verb is not valid!");
                // Per the original author's note, this prints just "gel".
                Console.WriteLine(word.GetSurface());
            }

            Console.WriteLine(root);
            Console.WriteLine(word);
            Console.WriteLine(word.GetSurface());

            Test();
        }
Пример #15
0
        /// <summary>
        ///     Console demo: creates a word from the first "gel" root, calls AddSuffix with the
        ///     "IC_COGUL_lAr" suffix and the language, reports a rejected suffix, prints the root and the
        ///     word, and then runs <c>Test()</c>.
        /// </summary>
        private static void Main()
        {
            // Earlier scratch code that lived here only as comments (file-based corpus comparisons,
            // benchmark invocations, a sample analysis loop) had no runtime effect and was dropped.
            var language = Language.Turkish;
            var gelRoot = language.GetRootsHavingSurface("gel").First();
            var demoWord = new Word(gelRoot);
            var pluralSuffix = language.GetSuffix("IC_COGUL_lAr");

            if (!demoWord.AddSuffix(pluralSuffix, language))
            {
                Console.WriteLine("Adding the suffix IC_COGUL_lAr after a verb is not valid!");
                // Per the original author's note, this prints just "gel".
                Console.WriteLine(demoWord.GetSurface());
            }

            Console.WriteLine(gelRoot);
            Console.WriteLine(demoWord);
            Console.WriteLine(demoWord.GetSurface());

            Test();
        }