Exemplo n.º 1
0
        private static void GitHubReadmeExamples()
        {
            // Analyze a Turkish surface form and print every morphological solution.
            var turkish = LanguageFactory.Create(LanguageType.Turkish);

            foreach (var solution in turkish.Analyze("yolsuzu"))
            {
                Console.WriteLine("\t{0}", solution);
                Console.WriteLine("\toriginal:{0} stem:{1} root:{2}\n",
                                  solution.GetSurface(),
                                  solution.GetStem().GetSurface(),
                                  solution.Root); //Stemming
            }

            //Method 1: Specify the ids of the morphemes that constitute the word
            var word1 = turkish.Generate("kitap/ISIM", "IC_COGUL_lAr", "IC_SAHIPLIK_BEN_(U)m",
                                         "IC_HAL_BULUNMA_DA", "IC_AITLIK_ki", "IC_COGUL_lAr", "IC_HAL_AYRILMA_DAn");

            //Method 2: Specify the string representation of the analysis of the word.
            var word2 = turkish.GetWord("kitap/ISIM IC_COGUL_lAr IC_SAHIPLIK_BEN_(U)m");

            Console.WriteLine(word1.GetSurface());
            Console.WriteLine(word2.GetSurface());
        }
        public void Create_ShouldCreateLanguageModel(Db db, [Content] DbItem item)
        {
            // Arrange: make the fake item the current Sitecore context item.
            Context.Item = db.GetItem(item.ID);

            // Act: build a language model from the current context language.
            var language = LanguageFactory.Create(Context.Language);

            // Assert: a model exists and its two-letter code matches the context language name.
            language.Should().NotBeNull();
            language.TwoLetterCode.Should().BeEquivalentTo(Context.Language.Name);
        }
Exemplo n.º 3
0
        private static void AnalysisAndStemming()
        {
            // Print the lexical form of the root of every analysis of "ehemmiyetsiz".
            var turkish = LanguageFactory.Create(LanguageType.Turkish);

            foreach (var solution in turkish.Analyze("ehemmiyetsiz"))
            {
                Console.WriteLine("\t{0}", solution.Root.LexicalForm);
            }
        }
Exemplo n.º 4
0
        private static void Generation()
        {
            // Show the intermediate surface produced after each generation phase
            // for the first analysis of "suyu".
            var turkish       = LanguageFactory.Create(LanguageType.Turkish);
            var firstSolution = turkish.Analyze("suyu")[0];

            foreach (var surface in firstSolution.GetSurfacesAfterEachPhase())
            {
                Console.WriteLine(surface);
            }
        }
Exemplo n.º 5
0
        /// <summary>
        /// Reads every *.txt file under raw_texts/, cleans it, builds its combined
        /// 2-gram + 3-gram frequency table, and appends one Document per file.
        /// Grams whose corpus-wide frequency is at least 50 are collected into
        /// <paramref name="classifierGrams"/>.
        /// </summary>
        /// <param name="documents">Output list; one entry is appended per input file.</param>
        /// <param name="classifierGrams">Output set of grams with corpus frequency >= 50.</param>
        public static void prepareDocuments(List <Document> documents, HashSet <string> classifierGrams)
        {
            var filenames = from fullFilename
                            in Directory.EnumerateFiles("raw_texts/", "*.txt", SearchOption.AllDirectories)
                            select Path.GetFullPath(fullFilename);

            List <string> list = filenames.ToList();
            Language      tr   = LanguageFactory.Create(LanguageType.Turkish);

            char[] seperators = { ' ', '\n', '\t', '\r', '\0' };
            Dictionary <string, int>         allGrams = new Dictionary <string, int>();
            List <Dictionary <string, int> > grams    = new List <Dictionary <string, int> >();
            object gate = new object(); // guards grams/documents: List<T>.Add is not thread-safe

            // 'using' guarantees the stop-word reader is closed (the original leaked it).
            using (StreamReader sr2 = new StreamReader("stop_words.txt", Encoding.GetEncoding("ISO-8859-9")))
            {
                string[] stopWords = sr2.ReadToEnd().Split('\n');

                Parallel.ForEach(list, i =>
                {
                    // NOTE(review): sr2 is shared across threads here, exactly as in the
                    // original code; confirm PreProcessing.editFile uses it thread-safely.
                    using (StreamReader sr = new StreamReader(i, Encoding.GetEncoding("ISO-8859-9")))
                    {
                        string cleanData = PreProcessing.editFile(i, sr, sr2, tr, seperators, stopWords);
                        // merge the 2-grams and the 3-grams into one frequency table
                        var gram = CreateGramsFrequencies.make2Gram(cleanData)
                                   .Concat(CreateGramsFrequencies.make3gram(cleanData))
                                   .ToDictionary(e => e.Key, e => e.Value);

                        // The original called List<T>.Add concurrently without
                        // synchronization, which corrupts the lists; lock fixes the race.
                        lock (gate)
                        {
                            grams.Add(gram);
                            documents.Add(new Document(Path.GetFileNameWithoutExtension(i), Directory.GetParent(i).Name, gram));
                        }
                    }
                });
            }

            // Sum the per-document frequencies into a corpus-wide table.
            foreach (var documentGrams in grams)
            {
                foreach (KeyValuePair <string, int> entry in documentGrams)
                {
                    int count;
                    if (allGrams.TryGetValue(entry.Key, out count)) // single lookup instead of ContainsKey + indexer
                    {
                        allGrams[entry.Key] = count + entry.Value;
                    }
                    else
                    {
                        allGrams.Add(entry.Key, entry.Value);
                    }
                }
            }

            // Grams frequent enough (>= 50) become classifier features.
            // HashSet<T>.Add is a no-op for duplicates, so no Contains pre-check is needed.
            foreach (KeyValuePair <string, int> gram in allGrams)
            {
                if (gram.Value >= 50)
                {
                    classifierGrams.Add(gram.Key);
                }
            }
        }
Exemplo n.º 6
0
            // Strips inflectional suffixes: analyzes each word of the text and, when
            // an analysis exists, keeps only the stem, joining the stems with '_'.
            string CekimEkleriniTemizle(string metin)
            {
                string[] kelimeler = metin.Split(' ');                             // every word of the text
                Language tr        = LanguageFactory.Create(LanguageType.Turkish); // Nuve library
                var      sonhali   = new StringBuilder();                          // avoids O(n^2) string concatenation in the loop

                for (int i = 0; i < kelimeler.Length; i++)
                {
                    IList <Word> kokler = tr.Analyze(kelimeler[i]);        // morphological analysis of the word
                    if (kokler.Count > 0)                                  // at least one solution found
                    {
                        // take the word's stem (GetStem) and its surface string (GetSurface)
                        sonhali.Append(kokler[0].GetStem().GetSurface()).Append('_');
                    }
                }
                return(sonhali.ToString());
            }
Exemplo n.º 7
0
        /// <summary>
        /// For every analysis of every input word, swaps in the root whose surface
        /// is <paramref name="root"/> and records "originalSurface\tnewSurface".
        /// </summary>
        public static string[] ReplaceRoots(string root, string[] words)
        {
            Language turkish  = LanguageFactory.Create(LanguageType.Turkish);
            var      replaced = new List <string>();

            foreach (string word in words)
            {
                foreach (Word solution in turkish.Analyze(word))
                {
                    string before = solution.GetSurface();
                    solution.Root = turkish.GetRootsHavingSurface(root).First();
                    replaced.Add(before + "\t" + solution.GetSurface());
                }
            }

            return(replaced.ToArray());
        }
        /// <summary>
        /// Counts word frequencies from DosyaKelimeler into sozluk.  Words with a
        /// morphological analysis are counted under their stem (the part before '/');
        /// unanalyzable words are counted verbatim.
        /// </summary>
        private void KelimeKokleriniBul()
        {
            Language tr = LanguageFactory.Create(LanguageType.Turkish);

            foreach (string gelen in DosyaKelimeler)
            {
                IList <Word> words = tr.Analyze(gelen);

                // Use the stem when an analysis exists, otherwise the raw word.
                // (The original recomputed GetStem().ToString().Split('/')[0] three
                // times and tracked a redundant 'girdi' flag; the counting logic is
                // identical here.)
                string key = words.Count != 0
                    ? words[0].GetStem().ToString().Split('/')[0]
                    : gelen;

                if (sozluk.ContainsKey(key))
                {
                    sozluk[key]++;
                }
                else
                {
                    sozluk.Add(key, 1);
                }
            }
        }
Exemplo n.º 9
0
        public void Test()
        {
            // A mutable copy of the stock Turkish language, extended with a new
            // compound-verb root entry.
            var stockTr    = LanguageFactory.Create(LanguageType.Turkish);
            var extendedTr = MutableLanguage.CopyFrom(stockTr);

            var entry = new RootEntry(
                lex: "başa gel",
                pos: "FIIL",
                surfaces: new[] { "başa gel" },
                labels: new[] { "cverb" },
                rules: Enumerable.Empty <string>());

            Assert.True(extendedTr.TryAdd(entry));

            // The stock language cannot analyze the new word...
            Assert.AreEqual(0, stockTr.Analyze("başa gelen").Count);

            // ...but the extended copy can, producing the expected analysis string.
            var solutions = extendedTr.Analyze("başa gelen");

            Assert.AreEqual(1, solutions.Count);
            Assert.AreEqual("başa gel/FIIL FIILIMSI_SIFAT_(y)An", solutions[0].Analysis);
        }
Exemplo n.º 10
0
 // Returns the Language model for the current Sitecore context language.
 public static Language GetActive()
 {
     var active = Context.Language;

     return LanguageFactory.Create(active);
 }
        /// <summary>
        /// Tokenizes the file at <paramref name="path"/> (windows-1254 encoded),
        /// stems each token with the Turkish analyzer (keeping the LONGEST stem of
        /// all analyses), accumulates corpus-wide and per-document term frequencies,
        /// and records the two highest in-document frequencies for this document.
        /// </summary>
        /// <param name="path">Path of the document to tokenize.</param>
        /// <param name="classIndex">Class label index recorded for this document.</param>
        public void tokenize(string path, int classIndex)
        {
            Language tr = LanguageFactory.Create(LanguageType.Turkish);
            string   line;

            string[] rawWords;
            string[] token;
            string   analyzMax  = "";
            int      analyzMaxL = Int32.MinValue;
            string   temp;
            int      tempL;
            int      docMax  = Int32.MinValue;
            int      docMax2 = Int32.MinValue;
            int      analyzL;
            int      tokenL;
            Dictionary <string, int> document = new Dictionary <string, int>();

            // 'using' disposes the reader; the original never closed it (leak).
            using (StreamReader r = new StreamReader(path, Encoding.GetEncoding("windows-1254")))
            {
                while (!r.EndOfStream) // tokenizer section
                {
                    line = r.ReadLine();
                    // Split the text into pieces according to the "magic" regex.
                    rawWords = Regex.Split(line, @"((\)('|’)\w+)|\W('|’)|('|’)\W|^('|’)|$('|’)|\d+('|’)\w+|\d+\w+|\d+[^a-zA-Z ]+\w+|\w+\d+|\d+|(\)|\())|[^\w('|’)]", RegexOptions.ExplicitCapture | RegexOptions.Compiled);
                    foreach (string w in rawWords)
                    {
                        // Drop empty strings and unwanted cases such as bare numbers.
                        if (w != "" && Regex.IsMatch(w, @"\D\w", RegexOptions.Compiled))
                        {
                            analyzMaxL = Int32.MinValue;
                            analyzMax  = "";
                            // More "magic" regex: strip apostrophe suffixes from the token.
                            token    = Regex.Split(w, @"(\W*)('|’)(\w+|\W+)", RegexOptions.ExplicitCapture | RegexOptions.Compiled);
                            token[0] = token[0].ToLower();
                            IList <Word> solutions = tr.Analyze(token[0]); // morphological analysis
                            foreach (var solution in solutions)
                            {
                                temp  = solution.GetStem().GetSurface(); // stemming
                                tempL = temp.Length;
                                // The closer a stem is to the bare root, the less document-
                                // specific it tends to be, so the LONGEST stem is kept.
                                if (tempL > analyzMaxL)
                                {
                                    analyzMaxL = tempL;
                                    analyzMax  = temp;
                                }
                            }
                            analyzL = analyzMax.Length;
                            tokenL  = token[0].Length;
                            // Filter out stop words; count either the stem (when an
                            // analysis exists) or the raw token.
                            if (analyzMax != "")
                            {
                                if (analyzL > 2 && !stopWords.Contains(analyzMax))
                                {
                                    if (corpus.ContainsKey(analyzMax))
                                    {
                                        corpus[analyzMax]++;
                                    }
                                    else
                                    {
                                        corpus.Add(analyzMax, 1);
                                        wordIndex.Add(analyzMax, Index);
                                        Index++;
                                    }
                                    if (document.ContainsKey(analyzMax))
                                    {
                                        document[analyzMax]++;
                                    }
                                    else
                                    {
                                        document.Add(analyzMax, 1);
                                    }
                                }
                            }
                            else
                            {
                                if (token[0].Length > 2 && !stopWords.Contains(token[0]))
                                {
                                    if (corpus.ContainsKey(token[0]))
                                    {
                                        corpus[token[0]]++;
                                    }
                                    else
                                    {
                                        corpus.Add(token[0], 1);
                                        wordIndex.Add(token[0], Index);
                                        Index++;
                                    }
                                    if (document.ContainsKey(token[0]))
                                    {
                                        document[token[0]]++;
                                    }
                                    else
                                    {
                                        document.Add(token[0], 1);
                                    }
                                }
                            }
                            // Track the top two in-document frequencies (docMax, docMax2).
                            if (analyzL > 2 && !stopWords.Contains(analyzMax))
                            {
                                if (document[analyzMax] > docMax)
                                {
                                    docMax2 = docMax;
                                    docMax  = document[analyzMax];
                                }
                                else if (document[analyzMax] > docMax2)
                                {
                                    docMax2 = document[analyzMax];
                                }
                                else
                                {
                                    if (tokenL > 2 && !stopWords.Contains(token[0]) && analyzMax == "")
                                    {
                                        if (document[token[0]] > docMax)
                                        {
                                            docMax2 = docMax;
                                            docMax  = document[token[0]];
                                        }
                                        else if (document[token[0]] > docMax2)
                                        {
                                            docMax2 = document[token[0]];
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
            DocumentMax.Add(new int[] { docMax, docMax2 });
            DocumentClassIndex.Add(classIndex);
            DocumentWordFreq.Add(document);
        }