Example #1
        /// <summary>
        /// Dependency parsing (the neural-network model requires the JVM options -Xms1g -Xmx1g -Xmn512m).
        /// Internally this uses NeuralNetworkDependencyParser; you can call NeuralNetworkDependencyParser.compute(sentence) directly,
        /// or use the MaxEnt-based dependency parser via MaxEntDependencyParser.compute(sentence).
        /// "A High-Performance Dependency Parser Based on Neural Networks":
        /// https://www.hankcs.com/nlp/parsing/neural-network-based-dependency-parser.html
        /// "Implementing a Maximum-Entropy Dependency Parser":
        /// https://www.hankcs.com/nlp/parsing/to-achieve-the-maximum-entropy-of-the-dependency-parser.html
        /// "A Java Implementation of a Chinese Dependency Parser Based on CRF Sequence Labeling":
        /// https://www.hankcs.com/nlp/parsing/crf-sequence-annotation-chinese-dependency-parser-implementation-based-on-java.html
        /// </summary>
        public void DependencyParser()
        {
            CoNLLSentence sentence = HanLP.parseDependency("徐先生还具体帮助他确定了把画雄鹰、松鼠和麻雀作为主攻目标。");

            Console.WriteLine(sentence);
            // The parse result can be iterated conveniently
            foreach (CoNLLWord word in sentence)
            {
                Console.WriteLine($"{word.LEMMA} --({word.DEPREL})--> {word.HEAD.LEMMA}\n");
            }
            // You can also get the underlying array and traverse it in any order, e.g. in reverse
            CoNLLWord[] wordArray = sentence.getWordArray();
            for (int i = wordArray.Length - 1; i >= 0; i--)
            {
                CoNLLWord word = wordArray[i];
                Console.WriteLine($"{word.LEMMA} --({word.DEPREL})-->{word.HEAD.LEMMA}\n");
            }
            // You can also walk a subtree: follow the HEAD links from any node all the way up to the virtual root
            CoNLLWord head = wordArray[12];

            while ((head = head.HEAD) != null)
            {
                if (head == CoNLLWord.ROOT)
                {
                    Console.WriteLine(head.LEMMA);
                }
                else
                {
                    Console.WriteLine($"{head.LEMMA} --({head.DEPREL})--> ");
                }
            }
        }
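
The demo above only walks upward along HEAD links. Going the other way (collecting a node's direct dependents) is a simple scan over the word array; a minimal sketch using only the CoNLLWord API shown above (the helper name is illustrative):

        // Sketch: collect the direct dependents of a given head word by scanning
        // the array returned by sentence.getWordArray(). Uses reference comparison,
        // matching the ROOT check in the demo above.
        static List<CoNLLWord> GetDependents(CoNLLWord[] wordArray, CoNLLWord head)
        {
            List<CoNLLWord> dependents = new List<CoNLLWord>();
            foreach (CoNLLWord word in wordArray)
            {
                if (word.HEAD == head)
                {
                    dependents.Add(word);
                }
            }
            return dependents;
        }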
Example #2
        //static void LemmatizeMate(List<CoNLLSentence> corpus, String exportPath, MateTools mateTools)
        //{
        //    int count = corpus.Count;
        //    for (int i = 0; i < count; i++)
        //    {
        //        CoNLLSentence sentence = corpus[i];
        //        mateTools.ProcessSentence(sentence);
        //        Console.WriteLine(i);
        //    }
        //    XMLSerializer.Serialize<List<CoNLLSentence>>(corpus, exportPath);

        //}

        static void LemmatizeTreeTagger(List<CoNLLSentence> corpus, string exportPath, TreeTagger treeTagger)
        {
            int count = corpus.Count;

            for (int j = 0; j < count; j++)
            {
                CoNLLSentence sentence = corpus[j];
                treeTagger.ProcessSentence(sentence);
                Console.WriteLine(j);
            }
            XMLSerializer.Serialize<List<CoNLLSentence>>(corpus, exportPath);
        }
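
A typical call site loads a corpus first and then runs it through this method. A minimal sketch, assuming the ReadFile method from Example #13 lives in a class named CoNLLFileReader (the class name and paths are illustrative):

        // Hypothetical driver: read a corpus, lemmatize it with TreeTagger, persist the result.
        static void RunTreeTagger()
        {
            List<CoNLLSentence> corpus = new CoNLLFileReader().ReadFile("corpus.conll", Corpus.Tiger);
            LemmatizeTreeTagger(corpus, "corpus_treetagger.xml", new TreeTagger());
        }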
Example #3
        public void ProcessSentence(CoNLLSentence sentence)
        {
            string[] tokenArray = sentence.Tokens.Select(x => x.Form).ToArray();
            is2.data.SentenceData09 sentenceMateTools = mateToolsWrapper.TagSentenceLemmatizerAndPOS(tokenArray, true);
            for (int i = 0; i < sentence.Tokens.Count; i++)
            {
                String mateToolsLemma = sentenceMateTools.plemmas[i + 1]; // plemmas[0] is the ROOT node, so token i maps to index i + 1
                sentence.Tokens[i].PredictedLemmas = new List<string> { mateToolsLemma };
            }

        }
Example #4
        static void LemmatizeMorphy(List<CoNLLSentence> corpus, string exportPath)
        {
            Morphy morphy = new Morphy();

            morphy.InitMorphy(AppSettingsWrapper.MorphyCSV);
            int count = corpus.Count;

            for (int i = 0; i < count; i++)
            {
                CoNLLSentence sentence = corpus[i];
                morphy.ProcessSentence(sentence);
                //Console.WriteLine(i);
            }
            XMLSerializer.Serialize<List<CoNLLSentence>>(corpus, exportPath);
        }
Example #5
        static void LemmatizeIWNLP(List<CoNLLSentence> corpus, string exportPath)
        {
            Lemmatizer IWNLP = new Lemmatizer();

            IWNLP.Load(AppSettingsWrapper.IWNLPPath);

            int count = corpus.Count;

            for (int i = 0; i < count; i++)
            {
                CoNLLSentence sentence = corpus[i];
                IWNLPSentenceProcessor.ProcessSentence(sentence, IWNLP);
                //Console.WriteLine(i);
            }
            XMLSerializer.Serialize<List<CoNLLSentence>>(corpus, exportPath);
        }
Example #6
 public void ProcessSentence(CoNLLSentence sentence)
 {
     for (int i = 0; i < sentence.Tokens.Count; i++)
     {
         CoNLLToken token = sentence.Tokens[i];
         if (token.POS == "NN")
         {
             if (morphyDictionary.ContainsKey(token.Form))
             {
                 token.PredictedLemmas = morphyDictionary[token.Form];
             }
             //else if (morphyDictionary.ContainsKey(token.Form.ToLower())) // adding a lower case comparison worsens the results
             //{
             //    token.PredictedLemmas = morphyDictionary[token.Form.ToLower()];
             //}
         }
         else if (token.POS == "ADJA" || token.POS == "ADJD")
         {
             if (morphyDictionary.ContainsKey(token.Form))
             {
                 token.PredictedLemmas = morphyDictionary[token.Form];
             }
             else if (morphyDictionary.ContainsKey(token.Form.ToLower()))
             {
                 token.PredictedLemmas = morphyDictionary[token.Form.ToLower()];
             }
         }
         else if (token.POS.StartsWith("V"))
         {
             if (morphyDictionary.ContainsKey(token.Form))
             {
                 token.PredictedLemmas = morphyDictionary[token.Form];
             }
             else if (morphyDictionary.ContainsKey(token.Form.ToLower()))
             {
                 token.PredictedLemmas = morphyDictionary[token.Form.ToLower()];
             }
         }
     }
 }
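
The morphyDictionary consulted above maps a surface form to its candidate lemmas. A minimal sketch of how InitMorphy (called in Example #4) might populate it; the two-column form;lemma CSV layout and the delimiter are assumptions, not the documented Morphy export format:

        // Sketch only: build the form -> lemmas dictionary from a CSV file.
        private Dictionary<string, List<string>> morphyDictionary = new Dictionary<string, List<string>>();

        public void InitMorphy(string csvPath)
        {
            foreach (string line in System.IO.File.ReadLines(csvPath))
            {
                string[] parts = line.Split(';'); // assumed delimiter
                if (parts.Length < 2)
                {
                    continue;
                }
                string form = parts[0], lemma = parts[1];
                if (!morphyDictionary.ContainsKey(form))
                {
                    morphyDictionary[form] = new List<string>();
                }
                if (!morphyDictionary[form].Contains(lemma))
                {
                    morphyDictionary[form].Add(lemma);
                }
            }
        }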
Example #8
        public void ProcessSentence(CoNLLSentence sentence)
        {
            StringBuilder tokenString = new StringBuilder();
            foreach (CoNLLToken token in sentence.Tokens)
            {
                tokenString.AppendLine(token.Form);
            }

            System.IO.File.WriteAllText(inputPath, tokenString.ToString(), Encoding.UTF8);

            //german-utf8.par input.txt output.txt -token -lemma 
            Process treeTaggerProcess = new Process()
            {
                StartInfo = new ProcessStartInfo()
                {
                    FileName = AppSettingsWrapper.TreeTagger.TreeTaggerExePath,
                    Arguments = String.Format("\"{0}\" \"{1}\" \"{2}\" -token -lemma",
                            AppSettingsWrapper.TreeTagger.TreeTaggerGermanPath,
                            inputPath,
                            outputPath),
                    RedirectStandardOutput = true,
                    UseShellExecute = false,
                    CreateNoWindow = true
                }
            };
            treeTaggerProcess.Start();

            // The new process runs asynchronously; wait until it exits before reading its output.
            treeTaggerProcess.WaitForExit();

            String[] allLinesOutputFile = System.IO.File.ReadAllLines(outputPath, Encoding.UTF8);
            for (int i = 0; i < sentence.Tokens.Count; i++)
            {
                String[] line = allLinesOutputFile[i].Split(new String[] { "\t" }, StringSplitOptions.RemoveEmptyEntries);
                if (line.Length == 3 && line[2] != "<unknown>")
                {
                    sentence.Tokens[i].PredictedLemmas = line[2].Split(new String[] { "|" }, StringSplitOptions.RemoveEmptyEntries).ToList();
                    // TreeTagger can return multiple lemmas separated by "|",
                    // e.g. the form "Stiften" (NN) yields "Stift|Stiften".
                }
            }
        }
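
Note that the loop above assumes TreeTagger writes exactly one output line per input token. If the tool re-tokenizes, allLinesOutputFile can be shorter than the token list and the indexer will throw; a defensive variant would clamp the bound, e.g.:

        // Defensive sketch: never read past the end of the TreeTagger output.
        int lineCount = Math.Min(sentence.Tokens.Count, allLinesOutputFile.Length);
        for (int i = 0; i < lineCount; i++)
        {
            // ... same per-line parsing as above ...
        }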
Example #10
        //protected bool IsExactMatchOrGuess(String goldLemma, String form, List<String> lemmas)
        //{
        //    if (lemmas == null || lemmas.Count == 0)
        //    {
        //        return (form == goldLemma);
        //    }
        //    if (lemmas.Count > 1)
        //    {
        //        return false;
        //    }
        //    else
        //    {
        //        return goldLemma.ToLower() == lemmas[0].ToLower();
        //    }
        //}



        public DetailedLookupResults EvaluateTwoResources(string path, string path2, string comment)
        {
            Console.WriteLine(comment);
            List<CoNLLSentence> sentences  = XMLSerializer.Deserialize<List<CoNLLSentence>>(path);
            List<CoNLLSentence> sentences2 = XMLSerializer.Deserialize<List<CoNLLSentence>>(path2);

            DetailedLookupResults result = new DetailedLookupResults()
            {
                TotalNounCount      = sentences.SelectMany(x => x.Tokens).Count(x => x.POS == "NN" && !x.Lemma.Contains("|") && !x.Lemma.Contains("_") && (x.Lemma != "unknown" && x.Form != "unknown")),
                TotalVerbCount      = sentences.SelectMany(x => x.Tokens).Count(x => x.POS.StartsWith("V")),
                // the last two conditions exclude tokens in the HDT corpus that have the lemma "NULL"
                TotalAdjectiveCount = sentences.SelectMany(x => x.Tokens).Count(x => (x.POS == "ADJA" || x.POS == "ADJD") && (x.Lemma != "NULL" && x.Form != "NULL"))
            };

            for (int i = 0; i < sentences.Count; i++)
            {
                CoNLLSentence sentence = sentences[i];
                for (int j = 0; j < sentence.Tokens.Count; j++)
                {
                    CoNLLToken token = sentence.Tokens[j];
                    if (token.POS == "NN")
                    {
                        if (token.Lemma.Contains("|") || token.Lemma.Contains("_") || (token.Lemma == "unknown" && token.Form != "unknown"))
                        {
                            continue;
                        }
                        if (!(token.PredictedLemmas == null || token.PredictedLemmas.Count == 0))
                        {
                            if (token.PredictedLemmas.Count == 1)
                            {
                                if (IsLowerCaseExactMatch(token.Lemma, token.PredictedLemmas))
                                {
                                    result.NounsCorrectlyLemmatizedCount++;
                                }
                            }
                            else // if more than one lemma is found, compare with the second resource
                            {
                                if (IsLowerCaseExactMatch(token.Lemma, sentences2[i].Tokens[j].PredictedLemmas))
                                {
                                    result.NounsCorrectlyLemmatizedCount++;
                                }
                            }
                        }
                        else // if no lemma is found, compare with the second resource
                        {
                            if (IsLowerCaseExactMatch(token.Lemma, sentences2[i].Tokens[j].PredictedLemmas))
                            {
                                result.NounsCorrectlyLemmatizedCount++;
                            }
                        }
                    }
                    else if (token.POS.StartsWith("V"))
                    {
                        NormalizeVerbToken(token);
                        if (!(token.PredictedLemmas == null || token.PredictedLemmas.Count == 0))
                        {
                            if (token.PredictedLemmas.Count == 1)
                            {
                                if (IsLowerCaseExactMatch(token.Lemma, token.PredictedLemmas))
                                {
                                    result.VerbsCorrectlyLemmatizedCount++;
                                }
                            }
                            else // if more than one lemma is found, compare with the second resource
                            {
                                if (IsLowerCaseExactMatch(token.Lemma, sentences2[i].Tokens[j].PredictedLemmas))
                                {
                                    result.VerbsCorrectlyLemmatizedCount++;
                                }
                            }
                        }
                        else // if no lemma is found, compare with the second resource
                        {
                            if (IsLowerCaseExactMatch(token.Lemma, sentences2[i].Tokens[j].PredictedLemmas))
                            {
                                result.VerbsCorrectlyLemmatizedCount++;
                            }
                        }
                    }
                    else if (token.POS == "ADJA" || token.POS == "ADJD")
                    {
                        if (token.Lemma == "NULL" && token.Form != "NULL") // ~ 2000 adjectives in the HDT corpus have "NULL" as lemma. Ignore them for the evaluation
                        {
                            continue;
                        }
                        if (!(token.PredictedLemmas == null || token.PredictedLemmas.Count == 0))
                        {
                            if (token.PredictedLemmas.Count == 1)
                            {
                                if (IsLowerCaseExactMatch(token.Lemma, token.PredictedLemmas))
                                {
                                    result.AdjectivesCorrectlyLemmatizedCount++;
                                }
                            }
                            else // if more than one lemma is found, compare with the second resource
                            {
                                if (IsLowerCaseExactMatch(token.Lemma, sentences2[i].Tokens[j].PredictedLemmas))
                                {
                                    result.AdjectivesCorrectlyLemmatizedCount++;
                                }
                            }
                        }
                        else // if no lemma is found, compare with the second resource
                        {
                            if (IsLowerCaseExactMatch(token.Lemma, sentences2[i].Tokens[j].PredictedLemmas))
                            {
                                result.AdjectivesCorrectlyLemmatizedCount++;
                            }
                        }
                    }
                }
            }
            Console.WriteLine(result.ToString());
            return result;
        }
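
The noun, verb, and adjective branches above repeat the same fallback pattern. A hedged refactoring sketch of that shared check (the helper name is illustrative):

        // Sketch: shared logic of the three POS branches above. The first resource
        // wins only when it proposes exactly one lemma; otherwise (no prediction,
        // or an ambiguous one) the second resource's prediction is consulted.
        private bool MatchesWithFallback(CoNLLToken token, CoNLLToken secondResourceToken)
        {
            if (token.PredictedLemmas != null && token.PredictedLemmas.Count == 1)
            {
                return IsLowerCaseExactMatch(token.Lemma, token.PredictedLemmas);
            }
            return IsLowerCaseExactMatch(token.Lemma, secondResourceToken.PredictedLemmas);
        }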
Example #11
        public DetailedLookupResults Evaluate(string path, string comment)
        {
            Console.WriteLine(comment);
            List<CoNLLSentence> sentences = XMLSerializer.Deserialize<List<CoNLLSentence>>(path);

            DetailedLookupResults result = new DetailedLookupResults()
            {
                TotalNounCount      = sentences.SelectMany(x => x.Tokens).Count(x => x.POS == "NN" && !x.Lemma.Contains("|") && !x.Lemma.Contains("_") && (x.Lemma != "unknown" && x.Form != "unknown")),
                TotalVerbCount      = sentences.SelectMany(x => x.Tokens).Count(x => x.POS.StartsWith("V")),
                // the last two conditions exclude tokens in the HDT corpus that have the lemma "NULL"
                TotalAdjectiveCount = sentences.SelectMany(x => x.Tokens).Count(x => (x.POS == "ADJA" || x.POS == "ADJD") && (x.Lemma != "NULL" && x.Form != "NULL"))
            };

            for (int i = 0; i < sentences.Count; i++)
            {
                CoNLLSentence sentence = sentences[i];
                for (int j = 0; j < sentence.Tokens.Count; j++)
                {
                    CoNLLToken token = sentence.Tokens[j];
                    if (token.POS == "NN")
                    {
                        if (token.Lemma.Contains("|") || token.Lemma.Contains("_") || (token.Lemma == "unknown" && token.Form != "unknown"))
                        {
                            continue;
                        }
                        if (IsLowerCaseExactMatch(token.Lemma, token.PredictedLemmas))
                        {
                            result.NounsCorrectlyLemmatizedCount++;
                        }
                        else
                        {
                            result.AddLookup(PartOfSpeech.Noun, token);
                        }
                    }
                    else if (token.POS.StartsWith("V"))
                    {
                        NormalizeVerbToken(token);
                        if (IsLowerCaseExactMatch(token.Lemma, token.PredictedLemmas))
                        {
                            result.VerbsCorrectlyLemmatizedCount++;
                        }
                        else
                        {
                            result.AddLookup(PartOfSpeech.Verb, token);
                        }
                    }
                    else if (token.POS == "ADJA" || token.POS == "ADJD")
                    {
                        if (token.Lemma == "NULL" && token.Form != "NULL")  // ~ 2000 adjectives in the HDT corpus have "NULL" as lemma. Ignore them for the evaluation
                        {
                            continue;
                        }
                        if (IsLowerCaseExactMatch(token.Lemma, token.PredictedLemmas))
                        {
                            result.AdjectivesCorrectlyLemmatizedCount++;
                        }
                        else
                        {
                            result.AddLookup(PartOfSpeech.Adjective, token);
                        }
                        //else
                        //{
                        //    if (token.PredictedLemmas != null && token.PredictedLemmas.Count == 1)
                        //    {
                        //        String key = String.Format("{0}->{1} != {2}", token.Form, token.PredictedLemmas[0], token.Lemma);
                        //        if (!wrongMappings.ContainsKey(key))
                        //        {
                        //            wrongMappings.Add(key, 0);
                        //        }
                        //        wrongMappings[key] = wrongMappings[key] + 1;
                        //    }
                        //}
                    }
                }
            }

            //var wrongMappingSorted = wrongMappings.OrderByDescending(x => x.Value);

            //foreach (var entry in wrongMappingSorted.Take(100))
            //{
            //    Console.WriteLine(entry.Key + ": " + entry.Value);
            //}

            Console.WriteLine(result.ToString());
            return result;
        }
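
DetailedLookupResults itself is not shown in these examples. Given the counters filled above, a per-class accuracy would follow directly; a minimal sketch of one such property (the property name is illustrative, the counter names come from the code above):

        // Sketch: noun lemmatization accuracy from the counters used above.
        public double NounAccuracy
        {
            get { return TotalNounCount == 0 ? 0.0 : (double)NounsCorrectlyLemmatizedCount / TotalNounCount; }
        }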
Example #12
 public static void ProcessSentence(CoNLLSentence sentence, Lemmatizer iwnlp)
 {
     string[] tokenArray = sentence.Tokens.Select(x => x.Form).ToArray();
     //is2.data.SentenceData09 sentenceMateTools = mateToolsWrapper.TagSentenceLemmatizerAndPOS(tokenArray, true);
     for (int i = 0; i < sentence.Tokens.Count; i++)
     {
         CoNLLToken token = sentence.Tokens[i];
         if (token.POS == "NN")
         {
             List<POS> pos = new List<POS>() { POS.Noun, POS.X };
             if (iwnlp.ContainsEntry(token.Form, POS.Noun))
             {
                 token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.Noun);
             }
             else if (iwnlp.ContainsEntry(token.Form, POS.X))
             {
                 token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.X);
             }
             else if (iwnlp.ContainsEntry(token.Form, POS.AdjectivalDeclension))
             {
                 token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.AdjectivalDeclension);
             }
          else if (iwnlp.ContainsEntry(token.Form, pos, true))
          {
              token.PredictedLemmas = iwnlp.GetLemmas(token.Form, pos, true);
          }
         }
         else
         {
             if (token.POS == "ADJA" || token.POS == "ADJD")
             {
                 if (iwnlp.ContainsEntry(token.Form, POS.Adjective))
                 {
                     token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.Adjective);
                 }
                 else if (iwnlp.ContainsEntry(token.Form, POS.Adjective, true))
                 {
                     token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.Adjective, true);
                 }
                 else if (iwnlp.ContainsEntry(token.Form, POS.Noun, true))
                 {
                     token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.Noun, true);
                 }
                 else if (iwnlp.ContainsEntry(token.Form, POS.X, true))
                 {
                     token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.X, true);
                 }
                 else if (iwnlp.ContainsEntry(token.Form, POS.Verb, true))
                 {
                     token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.Verb, true);
                 }
                 //else if (iwnlp.ContainsEntry(token.Form,true)) 
                 //{
                 //    token.PredictedLemmas = iwnlp.GetLemmas(token.Form, true);
                 //}
             }
             else if (token.POS.StartsWith("V"))
             {
                 if (iwnlp.ContainsEntry(token.Form, POS.Verb, true))
                 {
                     token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.Verb, true);
                 }
                  // test
                 //else if (iwnlp.ContainsEntry(token.Form, true))
                 //{
                 //    token.PredictedLemmas = iwnlp.GetLemmas(token.Form, true);
                 //}
             }
         }
     }
 }
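
Usage mirrors Example #5: load the IWNLP dump once and reuse the Lemmatizer for every sentence. A short sketch ('corpus' stands for a sentence list as produced by ReadFile in Example #13):

        // Sketch: wire the IWNLP lemmatizer to the processor above, as in Example #5.
        Lemmatizer iwnlp = new Lemmatizer();
        iwnlp.Load(AppSettingsWrapper.IWNLPPath);
        foreach (CoNLLSentence sentence in corpus)
        {
            IWNLPSentenceProcessor.ProcessSentence(sentence, iwnlp);
        }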
Example #13
        public List<CoNLLSentence> ReadFile(string path, Corpus corpus)
        {
            System.IO.StreamReader file = new System.IO.StreamReader(path);
            List<CoNLLSentence> sentences = new List<CoNLLSentence>();

            int sentenceNumber = 1;
            CoNLLSentence nextSentence = new CoNLLSentence();

            nextSentence.ID = sentenceNumber++;
            nextSentence.Tokens = new List<CoNLLToken>();
            string line;

            while ((line = file.ReadLine()) != null)
            {
                if (line.StartsWith("#"))
                {
                    continue;
                }
                if (string.IsNullOrEmpty(line))
                {
                    sentences.Add(nextSentence);
                    nextSentence = new CoNLLSentence();
                    nextSentence.ID = sentenceNumber++;
                    nextSentence.Tokens = new List<CoNLLToken>();
                }
                else
                {
                    string[] values = line.Split(new string[] { "\t" }, StringSplitOptions.RemoveEmptyEntries);
                    if (corpus == Corpus.Tiger || corpus == Corpus.HDT)
                    {
                        nextSentence.Tokens.Add(new CoNLLToken()
                        {
                            ID    = values[0],
                            Form  = values[1],
                            Lemma = values[2],
                            POS   = values[4],
                        });
                    }
                    else
                    {
                        string lemma = values[6];
                        if (lemma == "#refl")
                        {
                            lemma = string.Empty;
                        }
                        if (lemma.Contains("#"))
                        {
                            lemma = lemma.Replace("#", string.Empty);
                        }
                        nextSentence.Tokens.Add(new CoNLLToken()
                        {
                            ID    = values[2],
                            Form  = values[3],
                            Lemma = lemma,
                            POS   = values[4],
                        });
                    }
                }
            }
            return sentences;
        }
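
The lemma cleanup in the else branch is easy to sanity-check in isolation; a tiny self-contained sketch of the same two rules (the separable-verb example is illustrative):

        // Sketch: the two lemma normalization rules from the else branch above.
        static string NormalizeLemma(string lemma)
        {
            if (lemma == "#refl") // reflexive marker: drop the lemma entirely
            {
                return string.Empty;
            }
            return lemma.Replace("#", string.Empty); // strip "#" markers, e.g. "an#fangen" -> "anfangen"
        }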
Example #15
 public static void ProcessSentence(CoNLLSentence sentence, Lemmatizer iwnlp)
 {
     string[] tokenArray = sentence.Tokens.Select(x => x.Form).ToArray();
     //is2.data.SentenceData09 sentenceMateTools = mateToolsWrapper.TagSentenceLemmatizerAndPOS(tokenArray, true);
     for (int i = 0; i < sentence.Tokens.Count; i++)
     {
         CoNLLToken token = sentence.Tokens[i];
         if (token.POS == "NN")
         {
              List<POS> pos = new List<POS>() { POS.Noun, POS.X };
             if (iwnlp.ContainsEntry(token.Form, POS.Noun))
             {
                 token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.Noun);
             }
             else if (iwnlp.ContainsEntry(token.Form, POS.X))
             {
                 token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.X);
             }
             else if (iwnlp.ContainsEntry(token.Form, POS.AdjectivalDeclension))
             {
                 token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.AdjectivalDeclension);
             }
             else if (iwnlp.ContainsEntry(token.Form, pos, true))
             {
                 token.PredictedLemmas = iwnlp.GetLemmas(token.Form, pos, true);
             }
         }
         else
         {
             if (token.POS == "ADJA" || token.POS == "ADJD")
             {
                 if (iwnlp.ContainsEntry(token.Form, POS.Adjective))
                 {
                     token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.Adjective);
                 }
                 else if (iwnlp.ContainsEntry(token.Form, POS.Adjective, true))
                 {
                     token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.Adjective, true);
                 }
                 else if (iwnlp.ContainsEntry(token.Form, POS.AdjectivalDeclension))
                 {
                     token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.AdjectivalDeclension);
                 }
                 else if (iwnlp.ContainsEntry(token.Form, POS.AdjectivalDeclension, true))
                 {
                     token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.AdjectivalDeclension, true);
                 }
                 else if (iwnlp.ContainsEntry(token.Form, POS.Noun, true))
                 {
                     token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.Noun, true);
                 }
                 else if (iwnlp.ContainsEntry(token.Form, POS.X, true))
                 {
                     token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.X, true);
                 }
                 else if (iwnlp.ContainsEntry(token.Form, POS.Verb, true))
                 {
                     token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.Verb, true);
                 }
                 //else if (iwnlp.ContainsEntry(token.Form,true))
                 //{
                 //    token.PredictedLemmas = iwnlp.GetLemmas(token.Form, true);
                 //}
             }
             else if (token.POS.StartsWith("V"))
             {
                 if (iwnlp.ContainsEntry(token.Form, POS.Verb, true))
                 {
                     token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.Verb, true);
                 }
                 // test
                 //else if (iwnlp.ContainsEntry(token.Form, true))
                 //{
                 //    token.PredictedLemmas = iwnlp.GetLemmas(token.Form, true);
                 //}
             }
         }
     }
 }