Code Example #1
 public void NormalizeVerbToken(CoNLLToken token)
 {
     if (token.Lemma.Contains("%aux"))
     {
         token.Lemma = token.Lemma.Replace("%aux", string.Empty);
     }
     if (token.Lemma.Contains("%passiv"))
     {
         token.Lemma = token.Lemma.Replace("%passiv", string.Empty);
     }
 }
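
A minimal usage sketch of the normalization above. Gold lemmas of auxiliary and passive verbs carry "%aux"/"%passiv" markers (as in the TIGER corpus), which NormalizeVerbToken strips so they can be compared with predicted lemmas. The concrete lemma value and the object-initializer construction of CoNLLToken are illustrative assumptions:

 // Illustrative only: a gold lemma such as "sein%aux" or "werden%passiv" loses its marker.
 CoNLLToken token = new CoNLLToken() { Lemma = "werden%passiv" };
 NormalizeVerbToken(token);
 // token.Lemma is now "werden"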
Code Example #2
File: Morphy.cs Project: Liebeck/IWNLP.Lemmatizer
 public void ProcessSentence(CoNLLSentence sentence)
 {
     string[] tokenArray = sentence.Tokens.Select(x => x.Form).ToArray();
     for (int i = 0; i < sentence.Tokens.Count; i++)
     {
         CoNLLToken token = sentence.Tokens[i];
         if (token.POS == "NN")
         {
             if (morphyDictionary.ContainsKey(token.Form))
             {
                 token.PredictedLemmas = morphyDictionary[token.Form];
             }
             //else if (morphyDictionary.ContainsKey(token.Form.ToLower())) // adding a lower case comparison worsens the results
             //{
             //    token.PredictedLemmas = new List<string>();
             //    token.PredictedLemmas = morphyDictionary[token.Form.ToLower()];
             //}
         }
         else if (token.POS == "ADJA" || token.POS == "ADJD")
         {
             if (morphyDictionary.ContainsKey(token.Form))
             {
                 token.PredictedLemmas = morphyDictionary[token.Form];
             }
             else if (morphyDictionary.ContainsKey(token.Form.ToLower()))
             {
                 token.PredictedLemmas = morphyDictionary[token.Form.ToLower()];
             }
         }
         else if (token.POS.StartsWith("V"))
         {
             if (morphyDictionary.ContainsKey(token.Form))
             {
                 token.PredictedLemmas = morphyDictionary[token.Form];
             }
             else if (morphyDictionary.ContainsKey(token.Form.ToLower()))
             {
                 token.PredictedLemmas = morphyDictionary[token.Form.ToLower()];
             }
         }
     }
 }
Code Example #3
 public void AddLookup(PartOfSpeech pos, CoNLLToken token)
 {
     // Classify a token whose predicted lemma did not match the gold lemma:
     // missing (no candidate), wrong (a single candidate), or ambiguous (several candidates).
     if (token.PredictedLemmas == null || token.PredictedLemmas.Count == 0)
     {
         this.AddMissingLookup(pos, token.Form, token.Lemma, token.PredictedLemmas);
     }
     else if (token.PredictedLemmas.Count == 1)
     {
         this.AddWrongLookup(pos, token.Form, token.Lemma, token.PredictedLemmas);
     }
     else
     {
         this.AddAmbiguousLookup(pos, token.Form, token.Lemma, token.PredictedLemmas);
     }
 }
Code Example #4
        //protected bool IsExactMatchOrGuess(String goldLemma, String form, List<String> lemmas)
        //{
        //    if (lemmas == null || lemmas.Count == 0)
        //    {
        //        return (form == goldLemma);
        //    }
        //    if (lemmas.Count > 1)
        //    {
        //        return false;
        //    }
        //    else
        //    {
        //        return goldLemma.ToLower() == lemmas[0].ToLower();
        //    }
        //}
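
        // NOTE: the IsLowerCaseExactMatch helper used throughout the evaluation below is not part
        // of these examples. The following is only a sketch of a plausible implementation, inferred
        // from its name and from the commented-out IsExactMatchOrGuess above; it is not the
        // project's actual code.
        //protected bool IsLowerCaseExactMatch(String goldLemma, List<String> lemmas)
        //{
        //    if (lemmas == null || lemmas.Count == 0)
        //    {
        //        return false;
        //    }
        //    // accept the prediction if any candidate equals the gold lemma, ignoring case
        //    return lemmas.Any(lemma => lemma.ToLower() == goldLemma.ToLower());
        //}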



        // Evaluate a primary resource (path) and, whenever it yields no lemma or more than one
        // candidate for a token, fall back to the predictions of a second resource (path2).
        // Both serialized corpora must contain the same sentences in the same order,
        // since tokens are matched by position (sentences2[i].Tokens[j]).
        public DetailedLookupResults EvaluateTwoResources(string path, string path2, string comment)
        {
            Console.WriteLine(comment);
            List<CoNLLSentence> sentences  = XMLSerializer.Deserialize<List<CoNLLSentence>>(path);
            List<CoNLLSentence> sentences2 = XMLSerializer.Deserialize<List<CoNLLSentence>>(path2);

            DetailedLookupResults result = new DetailedLookupResults()
            {
                TotalNounCount      = sentences.SelectMany(x => x.Tokens).Count(x => x.POS == "NN" && !x.Lemma.Contains("|") && !x.Lemma.Contains("_") && (x.Lemma != "unknown" && x.Form != "unknown")),
                TotalVerbCount      = sentences.SelectMany(x => x.Tokens).Count(x => x.POS.StartsWith("V")),
                // the "NULL" check skips tokens in the HDT corpus whose lemma is annotated as "NULL"
                TotalAdjectiveCount = sentences.SelectMany(x => x.Tokens).Count(x => (x.POS == "ADJA" || x.POS == "ADJD") && (x.Lemma != "NULL" && x.Form != "NULL"))
            };

            for (int i = 0; i < sentences.Count; i++)
            {
                CoNLLSentence sentence = sentences[i];
                for (int j = 0; j < sentence.Tokens.Count; j++)
                {
                    CoNLLToken token = sentence.Tokens[j];
                    if (token.POS == "NN")
                    {
                        if (token.Lemma.Contains("|") || token.Lemma.Contains("_") || (token.Lemma == "unknown" && token.Form != "unknown"))
                        {
                            continue;
                        }
                        if (token.PredictedLemmas != null && token.PredictedLemmas.Count > 0)
                        {
                            if (token.PredictedLemmas.Count == 1)
                            {
                                if (IsLowerCaseExactMatch(token.Lemma, token.PredictedLemmas))
                                {
                                    result.NounsCorrectlyLemmatizedCount++;
                                }
                            }
                            else // if more than one lemma is found, compare with the second resource
                            {
                                if (IsLowerCaseExactMatch(token.Lemma, sentences2[i].Tokens[j].PredictedLemmas))
                                {
                                    result.NounsCorrectlyLemmatizedCount++;
                                }
                            }
                        }
                        else // if no lemma is found, compare with the second resource
                        {
                            if (IsLowerCaseExactMatch(token.Lemma, sentences2[i].Tokens[j].PredictedLemmas))
                            {
                                result.NounsCorrectlyLemmatizedCount++;
                            }
                        }
                    }
                    else if (token.POS.StartsWith("V"))
                    {
                        NormalizeVerbToken(token);
                        if (token.PredictedLemmas != null && token.PredictedLemmas.Count > 0)
                        {
                            if (token.PredictedLemmas.Count == 1)
                            {
                                if (IsLowerCaseExactMatch(token.Lemma, token.PredictedLemmas))
                                {
                                    result.VerbsCorrectlyLemmatizedCount++;
                                }
                            }
                            else // if more than one lemma is found, compare with the second resource
                            {
                                if (IsLowerCaseExactMatch(token.Lemma, sentences2[i].Tokens[j].PredictedLemmas))
                                {
                                    result.VerbsCorrectlyLemmatizedCount++;
                                }
                            }
                        }
                        else // if no lemma is found, compare with the second resource
                        {
                            if (IsLowerCaseExactMatch(token.Lemma, sentences2[i].Tokens[j].PredictedLemmas))
                            {
                                result.VerbsCorrectlyLemmatizedCount++;
                            }
                        }
                    }
                    else if (token.POS == "ADJA" || token.POS == "ADJD")
                    {
                        if (token.Lemma == "NULL" && token.Form != "NULL") // ~ 2000 adjectives in the HDT corpus have "NULL" as lemma. Ignore them for the evaluation
                        {
                            continue;
                        }
                        if (token.PredictedLemmas != null && token.PredictedLemmas.Count > 0)
                        {
                            if (token.PredictedLemmas.Count == 1)
                            {
                                if (IsLowerCaseExactMatch(token.Lemma, token.PredictedLemmas))
                                {
                                    result.AdjectivesCorrectlyLemmatizedCount++;
                                }
                            }
                            else // if more than one lemma is found, compare with the second resource
                            {
                                if (IsLowerCaseExactMatch(token.Lemma, sentences2[i].Tokens[j].PredictedLemmas))
                                {
                                    result.AdjectivesCorrectlyLemmatizedCount++;
                                }
                            }
                        }
                        else // if no lemma is found, compare with the second resource
                        {
                            if (IsLowerCaseExactMatch(token.Lemma, sentences2[i].Tokens[j].PredictedLemmas))
                            {
                                result.AdjectivesCorrectlyLemmatizedCount++;
                            }
                        }
                    }
                }
            }
            Console.WriteLine(result.ToString());
            return result;
        }
Code Example #5
        // Evaluate a single lemmatization resource against the gold lemmas and collect the
        // missing / wrong / ambiguous lookups per part of speech.
        public DetailedLookupResults Evaluate(string path, string comment)
        {
            Console.WriteLine(comment);
            List<CoNLLSentence> sentences = XMLSerializer.Deserialize<List<CoNLLSentence>>(path);

            DetailedLookupResults result = new DetailedLookupResults()
            {
                TotalNounCount      = sentences.SelectMany(x => x.Tokens).Count(x => x.POS == "NN" && !x.Lemma.Contains("|") && !x.Lemma.Contains("_") && (x.Lemma != "unknown" && x.Form != "unknown")),
                TotalVerbCount      = sentences.SelectMany(x => x.Tokens).Count(x => x.POS.StartsWith("V")),
                // the "NULL" check skips tokens in the HDT corpus whose lemma is annotated as "NULL"
                TotalAdjectiveCount = sentences.SelectMany(x => x.Tokens).Count(x => (x.POS == "ADJA" || x.POS == "ADJD") && (x.Lemma != "NULL" && x.Form != "NULL"))
            };

            for (int i = 0; i < sentences.Count; i++)
            {
                CoNLLSentence sentence = sentences[i];
                for (int j = 0; j < sentence.Tokens.Count; j++)
                {
                    CoNLLToken token = sentence.Tokens[j];
                    if (token.POS == "NN")
                    {
                        if (token.Lemma.Contains("|") || token.Lemma.Contains("_") || (token.Lemma == "unknown" && token.Form != "unknown"))
                        {
                            continue;
                        }
                        if (IsLowerCaseExactMatch(token.Lemma, token.PredictedLemmas))
                        {
                            result.NounsCorrectlyLemmatizedCount++;
                        }
                        else
                        {
                            result.AddLookup(PartOfSpeech.Noun, token);
                        }
                    }
                    else if (token.POS.StartsWith("V"))
                    {
                        NormalizeVerbToken(token);
                        if (IsLowerCaseExactMatch(token.Lemma, token.PredictedLemmas))
                        {
                            result.VerbsCorrectlyLemmatizedCount++;
                        }
                        else
                        {
                            result.AddLookup(PartOfSpeech.Verb, token);
                        }
                    }
                    else if (token.POS == "ADJA" || token.POS == "ADJD")
                    {
                        if (token.Lemma == "NULL" && token.Form != "NULL")  // ~ 2000 adjectives in the HDT corpus have "NULL" as lemma. Ignore them for the evaluation
                        {
                            continue;
                        }
                        if (IsLowerCaseExactMatch(token.Lemma, token.PredictedLemmas))
                        {
                            result.AdjectivesCorrectlyLemmatizedCount++;
                        }
                        else
                        {
                            result.AddLookup(PartOfSpeech.Adjective, token);
                        }
                        //else
                        //{
                        //    if (token.PredictedLemmas != null && token.PredictedLemmas.Count == 1)
                        //    {
                        //        String key = String.Format("{0}->{1} != {2}", token.Form, token.PredictedLemmas[0], token.Lemma);
                        //        if (!wrongMappings.ContainsKey(key))
                        //        {
                        //            wrongMappings.Add(key, 0);
                        //        }
                        //        wrongMappings[key] = wrongMappings[key] + 1;
                        //    }
                        //}
                    }
                }
            }

            //var wrongMappingSorted = wrongMappings.OrderByDescending(x => x.Value);

            //foreach (var entry in wrongMappingSorted.Take(100))
            //{
            //    Console.WriteLine(entry.Key + ": " + entry.Value);
            //}

            Console.WriteLine(result.ToString());
            return result;
        }
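
A hypothetical call sketch for the two evaluation entry points. The file names, the comment strings, and the implicit receiver are illustrative assumptions; the project's actual driver code is not shown in these examples:

        // Hypothetical paths to serialized, lemmatized corpora.
        DetailedLookupResults single   = Evaluate("tiger_iwnlp.xml", "IWNLP on TIGER");
        DetailedLookupResults combined = EvaluateTwoResources("tiger_iwnlp.xml", "tiger_morphy.xml",
                                                               "IWNLP with Morphy as fallback");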
Code Example #6
 public static void ProcessSentence(CoNLLSentence sentence, Lemmatizer iwnlp)
 {
     string[] tokenArray = sentence.Tokens.Select(x => x.Form).ToArray();
     //is2.data.SentenceData09 sentenceMateTools = mateToolsWrapper.TagSentenceLemmatizerAndPOS(tokenArray, true);
     for (int i = 0; i < sentence.Tokens.Count; i++)
     {
         CoNLLToken token = sentence.Tokens[i];
         if (token.POS == "NN")
         {
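             // Lookup cascade for nouns: an exact noun entry, then an entry tagged POS.X, then an
             // adjectival-declension entry, and finally a combined Noun/X lookup whose final bool
             // argument presumably enables case-insensitive matching.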
             List<POS> pos = new List<POS>()
             {
                 POS.Noun, POS.X
             };
             if (iwnlp.ContainsEntry(token.Form, POS.Noun))
             {
                 token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.Noun);
             }
             else if (iwnlp.ContainsEntry(token.Form, POS.X))
             {
                 token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.X);
             }
             else if (iwnlp.ContainsEntry(token.Form, POS.AdjectivalDeclension))
             {
                 token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.AdjectivalDeclension);
             }
             else if (iwnlp.ContainsEntry(token.Form, pos, true))
             {
                 token.PredictedLemmas = iwnlp.GetLemmas(token.Form, pos, true);
             }
         }
         else
         {
             if (token.POS == "ADJA" || token.POS == "ADJD")
             {
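                 // Adjectives fall back progressively: adjective entries (exact, then with the bool
                 // flag that presumably enables case-insensitive matching), adjectival declension,
                 // and finally noun, X, and verb entries.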
                 if (iwnlp.ContainsEntry(token.Form, POS.Adjective))
                 {
                     token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.Adjective);
                 }
                 else if (iwnlp.ContainsEntry(token.Form, POS.Adjective, true))
                 {
                     token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.Adjective, true);
                 }
                 else if (iwnlp.ContainsEntry(token.Form, POS.AdjectivalDeclension))
                 {
                     token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.AdjectivalDeclension);
                 }
                 else if (iwnlp.ContainsEntry(token.Form, POS.AdjectivalDeclension, true))
                 {
                     token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.AdjectivalDeclension, true);
                 }
                 else if (iwnlp.ContainsEntry(token.Form, POS.Noun, true))
                 {
                     token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.Noun, true);
                 }
                 else if (iwnlp.ContainsEntry(token.Form, POS.X, true))
                 {
                     token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.X, true);
                 }
                 else if (iwnlp.ContainsEntry(token.Form, POS.Verb, true))
                 {
                     token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.Verb, true);
                 }
                 //else if (iwnlp.ContainsEntry(token.Form,true))
                 //{
                 //    token.PredictedLemmas = iwnlp.GetLemmas(token.Form, true);
                 //}
             }
             else if (token.POS.StartsWith("V"))
             {
                 if (iwnlp.ContainsEntry(token.Form, POS.Verb, true))
                 {
                     token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.Verb, true);
                 }
                 // test
                 //else if (iwnlp.ContainsEntry(token.Form, true))
                 //{
                 //    token.PredictedLemmas = iwnlp.GetLemmas(token.Form, true);
                 //}
             }
         }
     }
 }