/// <summary>
/// Recursively tries to split <paramref name="restOfWord"/> into a sequence of suffixes.
/// Each time the remaining text is fully consumed, a copy of <paramref name="word"/>
/// is appended to <paramref name="words"/>.
/// </summary>
/// <param name="word">Word built so far; every suffix added here is removed again after the recursive call.</param>
/// <param name="restOfWord">Part of the surface that has not been matched yet.</param>
/// <param name="words">Receives a copy of each completed candidate.</param>
/// <param name="checkTransition">
/// When true, a candidate suffix is skipped if the morphotactics report no transition
/// from the word's last morpheme to it.
/// </param>
private void GetPossibleWords(Word word, string restOfWord, IList<Word> words, bool checkTransition)
{
    if (restOfWord.Length == 0)
    {
        words.Add(Word.CopyOf(word));
        return;
    }

    IList<KeyValuePair<string, Suffix>> candidates = GetPossibleFirstSuffixes(restOfWord);

    // An empty candidate list simply yields no iterations.
    foreach (var candidate in candidates)
    {
        // We made a change here; no problems so far.
        var hasTransition = _lang.Morphotactics.HasTransition(word.Last.Morpheme.Id, candidate.Value.Id);
        if (checkTransition && !hasTransition)
        {
            continue;
        }

        word.AddSuffix(candidate.Value);
        var remaining = restOfWord.Remove(0, candidate.Key.Length);
        GetPossibleWords(word, remaining, words, checkTransition);
        word.RemoveLastSuffix();
    }
}
/// <summary>
/// Creates and returns a word with the specified "morpheme id" sequence.
/// </summary>
/// <param name="morphemes">
/// Morpheme identifiers: the first one is looked up as the root, the remaining ones
/// are looked up as suffixes and added in the given order.
/// </param>
/// <returns>
/// The assembled word, or <c>null</c> when no identifier is supplied or when the root
/// or any suffix lookup returns <c>null</c>.
/// </returns>
public Word Generate(params string[] morphemes)
{
    StringExtensions.ThrowIfNullAny(morphemes);

    // A call without arguments produces an empty array; report it as "no word"
    // instead of letting morphemes[0] throw IndexOutOfRangeException.
    if (morphemes.Length == 0)
    {
        return null;
    }

    var root = GetRoot(morphemes[0]);
    if (root == null)
    {
        return null;
    }

    var word = new Word(root);
    for (var i = 1; i < morphemes.Length; i++)
    {
        var suffix = GetSuffix(morphemes[i]);
        if (suffix == null)
        {
            return null;
        }

        word.AddSuffix(suffix);
    }

    return word;
}
/// <summary>
/// Builds a word from the first root whose surface is <paramref name="rootWord"/> and
/// returns the result of the two-argument <c>AddSuffix</c> call for the suffix
/// identified by <paramref name="suffixId"/>.
/// </summary>
public bool TestStrictGeneration(string rootWord, string suffixId)
{
    var word = new Word(Tr.GetRootsHavingSurface(rootWord).First());
    return word.AddSuffix(Tr.GetSuffix(suffixId), Tr);
}
/// <summary>
/// Looks up the first root with surface <paramref name="rootWord"/>, wraps it in a word and
/// reports whether adding the suffix <paramref name="suffixId"/> through the
/// language-aware <c>AddSuffix</c> overload succeeded.
/// </summary>
public bool TestStrictGeneration(string rootWord, string suffixId)
{
    var firstRoot = Tr.GetRootsHavingSurface(rootWord).First();
    var candidate = new Word(firstRoot);
    var requestedSuffix = Tr.GetSuffix(suffixId);

    var added = candidate.AddSuffix(requestedSuffix, Tr);
    return added;
}
/// <summary>
/// Takes a copy of the word built on root "gel", attempts to add IC_COGUL_lAr through the
/// language-aware <c>AddSuffix</c> overload, and asserts that the word still equals the copy.
/// </summary>
public void TestStrictGeneration2()
{
    var original = new Word(Tr.GetRootsHavingSurface("gel").First());
    var snapshot = Word.CopyOf(original);

    // The assertions below require this call to leave the word equal to its earlier copy.
    original.AddSuffix(Tr.GetSuffix("IC_COGUL_lAr"), Tr);

    // Equality is checked in both directions.
    Assert.True(snapshot.Equals(original));
    Assert.True(original.Equals(snapshot));
}
/// <summary>
/// Recursively extends <paramref name="word"/> with every suffix returned by
/// <c>FindPossibleSuffixes</c> for its last morpheme and the remaining text. After all
/// extensions have been explored, a copy of the word is recorded if the remaining text is
/// empty and the last morpheme is terminal according to the morphotactics.
/// </summary>
/// <param name="word">Word built so far; each added suffix is removed again after its recursive call.</param>
/// <param name="restOfWord">Part of the surface that has not been matched yet.</param>
/// <param name="words">Receives a copy of each accepted candidate.</param>
private void GetPossibleWords(Word word, string restOfWord, IList<Word> words)
{
    foreach (var candidate in FindPossibleSuffixes(word.Last.Morpheme, restOfWord))
    {
        word.AddSuffix(candidate.Morpheme);
        var remaining = restOfWord.Remove(0, candidate.Surface.Length);
        GetPossibleWords(word, remaining, words);
        word.RemoveLastSuffix();
    }

    // Deeper results are collected first, so the current word is appended after them.
    if (restOfWord.Length != 0)
    {
        return;
    }

    if (!_lang.Morphotactics.IsTerminal(word.Last.Morpheme))
    {
        return;
    }

    words.Add(Word.CopyOf(word));
}
/// <summary>
/// Builds a word from the first root with surface <paramref name="rootWord"/>, appends a
/// fixed chain of six suffixes and returns the resulting surface form.
/// </summary>
public string LerimdekilerdenTest(string rootWord)
{
    // Suffix ids in the order they are attached to the root.
    string[] suffixIds =
    {
        "IC_COGUL_lAr",
        "IC_SAHIPLIK_BEN_(U)m",
        "IC_HAL_BULUNMA_DA",
        "IC_AITLIK_ki",
        "IC_COGUL_lAr",
        "IC_HAL_AYRILMA_DAn"
    };

    Root firstRoot = tr.GetRootsHavingSurface(rootWord).First();

    // 'word' is not declared in this method; the assignment targets a member declared elsewhere.
    word = new Word(firstRoot);
    foreach (var suffixId in suffixIds)
    {
        word.AddSuffix(tr.GetSuffix(suffixId));
    }

    return word.GetSurface();
}
/// <summary>
/// Builds a word from the first root with surface <paramref name="rootWord"/>, appends a
/// fixed chain of six suffixes and returns the resulting surface form.
/// </summary>
public string TestGeneration(string rootWord)
{
    // Suffix ids in the order they are attached to the root.
    string[] suffixIds =
    {
        "IC_COGUL_lAr",
        "IC_SAHIPLIK_BEN_(U)m",
        "IC_HAL_BULUNMA_DA",
        "IC_AITLIK_ki",
        "IC_COGUL_lAr",
        "IC_HAL_AYRILMA_DAn"
    };

    var generated = new Word(Tr.GetRootsHavingSurface(rootWord).First());
    foreach (var suffixId in suffixIds)
    {
        generated.AddSuffix(Tr.GetSuffix(suffixId));
    }

    return generated.GetSurface();
}
/// <summary>
/// Generates a surface form by attaching six fixed suffixes, in order, to the first root
/// whose surface is <paramref name="rootWord"/>.
/// </summary>
public string TestGeneration(string rootWord)
{
    var firstRoot = Tr.GetRootsHavingSurface(rootWord).First();
    var result = new Word(firstRoot);

    var orderedIds = new[]
    {
        "IC_COGUL_lAr",
        "IC_SAHIPLIK_BEN_(U)m",
        "IC_HAL_BULUNMA_DA",
        "IC_AITLIK_ki",
        "IC_COGUL_lAr",
        "IC_HAL_AYRILMA_DAn"
    };

    for (var i = 0; i < orderedIds.Length; i++)
    {
        result.AddSuffix(Tr.GetSuffix(orderedIds[i]));
    }

    return result.GetSurface();
}
/// <summary>
/// Builds a six-suffix word on the root "kitap", copies it with <c>Word.CopyOf</c> and
/// asserts that the copy is a distinct instance that is equal to the original and has the
/// same surface.
/// </summary>
public void TestCopyOf()
{
    // Suffix ids in the order they are attached to the root.
    string[] suffixIds =
    {
        "IC_COGUL_lAr",
        "IC_SAHIPLIK_BEN_(U)m",
        "IC_HAL_BULUNMA_DA",
        "IC_AITLIK_ki",
        "IC_COGUL_lAr",
        "IC_HAL_AYRILMA_DAn"
    };

    var original = new Word(Tr.GetRootsHavingSurface("kitap").First());
    foreach (var suffixId in suffixIds)
    {
        original.AddSuffix(Tr.GetSuffix(suffixId));
    }

    var duplicate = Word.CopyOf(original);

    // Equals must hold for the instance itself and must reject null.
    Assert.True(original.Equals(original));
    Assert.False(original.Equals(null));

    // The copy is a different object with the same surface and equal content.
    Assert.AreNotSame(original, duplicate);
    Assert.AreEqual(original.GetSurface(), duplicate.GetSurface());
    Assert.True(original.Equals(duplicate));
}
/// <summary>
/// Recursively tries to split <paramref name="restOfWord"/> into a sequence of suffixes.
/// Each time the remaining text is fully consumed, a new <c>Word</c> constructed from
/// <paramref name="word"/> is appended to <paramref name="words"/>.
/// </summary>
/// <param name="word">Word built so far; every suffix added here is removed again after the recursive call.</param>
/// <param name="restOfWord">Part of the surface that has not been matched yet.</param>
/// <param name="words">Receives each completed candidate.</param>
/// <param name="checkTransition">
/// When true, a candidate suffix is skipped if the morphotactics report no transition
/// from the word's last morpheme to it.
/// </param>
private void GetPossibleWords(Word word, string restOfWord, IList<Word> words, bool checkTransition)
{
    if (restOfWord.Length == 0)
    {
        words.Add(new Word(word));
        return;
    }

    IList<KeyValuePair<string, Suffix>> firstSuffixes = GetPossibleFirstSuffixes(restOfWord);

    // An empty candidate list simply yields no iterations.
    foreach (var entry in firstSuffixes)
    {
        // We made a change here; no problems so far.
        var transitionExists = _lang.Morphotactics.HasTransition(word.Last.Morpheme.Id, entry.Value.Id);
        if (checkTransition && !transitionExists)
        {
            continue;
        }

        word.AddSuffix(entry.Value);
        var unmatched = restOfWord.Remove(0, entry.Key.Length);
        GetPossibleWords(word, unmatched, words, checkTransition);
        word.RemoveLastSuffix();
    }
}
/// <summary>
/// Console entry point: attempts to add the suffix IC_COGUL_lAr to the first root with
/// surface "gel" through the language-aware <c>AddSuffix</c> overload, prints the outcome
/// and then calls <c>Test()</c>.
/// </summary>
private static void Main()
{
    // This method previously carried a large amount of commented-out scratch code:
    // splitting a unigram corpus into resolved/unresolved tokens, diffing those results
    // against Zemberek output, dumping tokenised paragraphs, Benchmarker runs and an
    // Analyze("deneme") stemming demo. None of it was executed; only the live statements
    // are kept below.
    var turkish = Language.Turkish;
    var gelRoot = turkish.GetRootsHavingSurface("gel").First();
    var word = new Word(gelRoot);

    var pluralSuffix = turkish.GetSuffix("IC_COGUL_lAr");
    var accepted = word.AddSuffix(pluralSuffix, turkish);
    if (!accepted)
    {
        Console.WriteLine("Adding the suffix IC_COGUL_lAr after a verb is not valid!");
        Console.WriteLine(word.GetSurface()); // original author's note: prints just "gel"
    }

    Console.WriteLine(gelRoot);
    Console.WriteLine(word);
    Console.WriteLine(word.GetSurface());
    Test();
}
/// <summary>
/// Console entry point: builds a word on the first root with surface "gel", tries to attach
/// IC_COGUL_lAr through the language-aware <c>AddSuffix</c> overload, writes the results to
/// the console and finally calls <c>Test()</c>.
/// </summary>
private static void Main()
{
    // Commented-out experiments formerly kept here (corpus resolution split, comparison
    // with Zemberek results, tokenised text dumps, benchmark calls and an Analyze("deneme")
    // demo) were never executed and are summarised by this note.
    var language = Language.Turkish;
    var verbRoot = language.GetRootsHavingSurface("gel").First();
    var verbWord = new Word(verbRoot);

    var suffixAdded = verbWord.AddSuffix(language.GetSuffix("IC_COGUL_lAr"), language);
    if (!suffixAdded)
    {
        Console.WriteLine("Adding the suffix IC_COGUL_lAr after a verb is not valid!");

        // Original author's note: prints just "gel".
        Console.WriteLine(verbWord.GetSurface());
    }

    Console.WriteLine(verbRoot);
    Console.WriteLine(verbWord);
    Console.WriteLine(verbWord.GetSurface());
    Test();
}