コード例 #1
0
        public void LemmatizeTest()
        {
            Lemmatizer lemmatizer = new Lemmatizer();

            // Parallel arrays: input word, expected lemma, and optional POS tag.
            // A null tag means the single-argument Lemmatize overload is used.
            string[] words  = { "کتاب‌ها", "آتشفشان", "می‌روم", "گفته شده است", "مردم" };
            string[] lemmas = { "کتاب", "آتشفشان", "رفت#رو", "گفت#گو", "مردم" };
            string[] tags   = { null, null, null, null, "N" };

            for (int i = 0; i < words.Length; i++)
            {
                string actual = tags[i] == null
                    ? lemmatizer.Lemmatize(words[i])
                    : lemmatizer.Lemmatize(words[i], tags[i]);
                Assert.AreEqual(lemmas[i], actual, "Failed to lematize of '" + words[i] + "' word");
            }
        }
コード例 #2
0
ファイル: Program.cs プロジェクト: quangfox/LemmaGenerator
        private static void EnrichLemmatizer(Lemmatizer lemmatizer, string enricherFilePath)
        {
            // Read every (word, lemma, frequency) entry from the enrichment file
            // and delegate to the overload that feeds them into the lemmatizer.
            var reader = new EnricherFileReader(enricherFilePath);
            EnrichLemmatizer(lemmatizer, reader.ReadAllLemmaEntries());
        }
コード例 #3
0
        public void LemmatizeTest()
        {
            Lemmatizer lemmatizer = new Lemmatizer();

            // Inputs, expected lemmas and optional POS tags kept as parallel
            // arrays; a null tag selects the single-argument Lemmatize overload.
            string[] inputs =
            {
                "کتاب‌ها", "آتشفشان", "می‌روم", "گفته شده است", "نچشیده است", "مردم", "اجتماعی"
            };
            string[] expecteds =
            {
                "کتاب", "آتشفشان", "رفت#رو", "گفت#گو", "چشید#چش", "مردم", "اجتماعی"
            };
            string[] pos =
            {
                null, null, null, null, null, "N", "AJ"
            };

            for (int i = 0; i < inputs.Length; i++)
            {
                string word = inputs[i];
                string tag  = pos[i];
                string actual = tag == null
                    ? lemmatizer.Lemmatize(word)
                    : lemmatizer.Lemmatize(word, tag);
                Assert.AreEqual(expecteds[i], actual, "Failed to lematize of '" + word + "' word");
            }
        }
コード例 #4
0
ファイル: LemmatizerTest.cs プロジェクト: nvdnkpr/NHazm
        public void LemmatizeTest()
        {
            var lemmatizer = new Lemmatizer();

            // Each case asserts directly on the lemmatizer output; the failure
            // message echoes the word that failed to lemmatize.
            var input = "کتاب‌ها";
            Assert.AreEqual("کتاب", lemmatizer.Lemmatize(input),
                "Failed to lematize of '" + input + "' word");

            input = "آتشفشان";
            Assert.AreEqual("آتشفشان", lemmatizer.Lemmatize(input),
                "Failed to lematize of '" + input + "' word");

            input = "می‌روم";
            Assert.AreEqual("رفت#رو", lemmatizer.Lemmatize(input),
                "Failed to lematize of '" + input + "' word");

            input = "گفته شده است";
            Assert.AreEqual("گفت#گو", lemmatizer.Lemmatize(input),
                "Failed to lematize of '" + input + "' word");

            // POS-tagged overload: "مردم" is disambiguated as a noun.
            input = "مردم";
            Assert.AreEqual("مردم", lemmatizer.Lemmatize(input, "N"),
                "Failed to lematize of '" + input + "' word");
        }
コード例 #5
0
 // Static initializer: loads the English and Russian lemmatizer models once
 // for the lifetime of the process.
 static LemmatizationHandler()
 {
     // NOTE(review): the streams are kept open in fields rather than disposed —
     // presumably Lemmatizer reads from them lazily; confirm before closing them.
     enStream     = File.OpenRead(EnLemmatizerpath);
     ruStream     = File.OpenRead(RuLemmatizerpath);
     enLemmatizer = new Lemmatizer(enStream);
     ruLemmatizer = new Lemmatizer(ruStream);
 }
コード例 #6
0
ファイル: Program.cs プロジェクト: stuartd/LemmaGenerator
        static void Main(string[] args)
        {
            // Builds an enriched lemmatizer: loads the base .lem model, feeds it
            // every enrichment file found under Input/, and serializes the result
            // into the Output/ folder with a "-modified" suffix.
            var currentDirectory   = Environment.CurrentDirectory + "/../../";
            var lemmatizerFilePath = currentDirectory + "../Test/Data/full7z-mlteast-en.lem";

            var fileName       = Path.GetFileNameWithoutExtension(lemmatizerFilePath) + "-modified";
            var extension      = Path.GetExtension(lemmatizerFilePath);
            var outputFilePath = string.Format("{0}Output/{1}{2}", currentDirectory, fileName, extension);

            var enricherFilePaths = Directory.GetFiles(currentDirectory + "Input/");

            using (var stream = File.OpenRead(lemmatizerFilePath))
            {
                // Create base lemmatizer with data in the base source file.
                var lemmatizer = new Lemmatizer(stream);

                // Then enrich the lemmatizer with every other data file.
                foreach (var filePath in enricherFilePaths)
                {
                    EnrichLemmatizerWithDataFile(lemmatizer, filePath);
                }

                // Persist lemmatizer in the output file.
                Console.WriteLine("Writing output file...");
                using (var oStream = File.Create(outputFilePath))
                {
                    lemmatizer.Serialize(oStream, true, Lemmatizer.Compression.Lzma, true);
                }
                // Fixed typo in user-facing message: "Outuput" -> "Output".
                Console.WriteLine("Output file written at {0}", outputFilePath);
            }

            Console.WriteLine("OK");
            Console.ReadKey();
        }
コード例 #7
0
        public void LemmatizeTest()
        {
            Lemmatizer lemmatizer = new Lemmatizer();

            // Test words, their expected lemmas, and optional POS tags
            // (a null tag selects the single-argument overload).
            string[] inputs =
            {
                "کتاب‌ها", "آتشفشان", "می‌روم", "گفته شده است", "نچشیده است", "مردم", "اجتماعی"
            };
            string[] expecteds =
            {
                "کتاب", "آتشفشان", "رفت#رو", "گفت#گو", "چشید#چش", "مردم", "اجتماعی"
            };
            string[] tags =
            {
                null, null, null, null, null, "N", "AJ"
            };

            int i = 0;
            while (i < inputs.Length)
            {
                string word = inputs[i];
                string actual = tags[i] == null
                    ? lemmatizer.Lemmatize(word)
                    : lemmatizer.Lemmatize(word, tags[i]);
                Assert.AreEqual(expecteds[i], actual, "Failed to lematize of '" + word + "' word");
                i++;
            }
        }
コード例 #8
0
ファイル: wordmap.cs プロジェクト: qa1/wordmap
        // Counts word occurrences, lemmatizing words of length >= 5 first so that
        // inflected variants are folded into a single dictionary entry.
        public Dictionary <string, int> CalcWordCount(string[] words)
        {
            Dictionary<string, int> wordCount = new Dictionary<string, int>();
            Lemmatizer lemmatize = new Lemmatizer();

            foreach (string word in words)
            {
                // Short words are counted verbatim; longer ones are lemmatized.
                string lemmWord = word.Length >= 5 ? lemmatize.Lemmatize(word) : word;

                // Single-lookup increment instead of ContainsKey + indexer
                // (which performed two hash lookups per word).
                int count;
                wordCount.TryGetValue(lemmWord, out count);
                wordCount[lemmWord] = count + 1;
            }

            return wordCount;
        }
コード例 #9
0
        // Loads the Russian lemmatizer model from the bundled .lem file once,
        // closing the file as soon as the model has been constructed.
        static Lemmatizator()
        {
            using (var modelStream = new FileStream(@"full7z-mlteast-ru.lem", FileMode.Open))
            {
                _lemmatizer = new Lemmatizer(modelStream);
            }
        }
コード例 #10
0
ファイル: Program.cs プロジェクト: quangfox/LemmaGenerator
 // Adds every (word, lemma, frequency) tuple to the lemmatizer; the frequency
 // component (Item3) is not needed for enrichment.
 private static void EnrichLemmatizer(Lemmatizer lemmatizer, IEnumerable <Tuple <string, string, int> > wordsAndLemmaToAdd)
 {
     foreach (var entry in wordsAndLemmaToAdd)
     {
         AddExampleOrException(lemmatizer, entry.Item1, entry.Item2);
     }
 }
コード例 #11
0
 // Wires up the collaborators used for parse-based phrasal-verb detection:
 // a treebank parser, a lemmatizer, a tokenizer and a POS tagger. The values
 // are stored as-is; no validation or copying is performed here.
 public ParseBasedPhrasalVerbDetector(EnglishTreebankParser parser, Lemmatizer lemmatizer,
                                      EnglishMaximumEntropyTokenizer tokenizer, EnglishMaximumEntropyPosTagger tagger)
 {
     this.parser     = parser;
     this.lemmatizer = lemmatizer;
     this.tokenizer  = tokenizer;
     this.tagger     = tagger;
 }
コード例 #12
0
        // Verifies Lemmatizer.Conjugations(): for a "past#present" stem pair it
        // must generate the complete set of conjugated surface forms. Entries
        // commented out inside the expected arrays are surface forms identical
        // to one listed earlier in the array, so they are expected only once.
        public void ConjugationsTest()
        {
            Lemmatizer lemmatizer = new Lemmatizer();

            string input;

            string[] expected, actual;

            // Case 1: stem pair "خورد#خور".
            input    = "خورد#خور";
            expected = new string[] {
                "خوردم", "خوردی", "خورد", "خوردیم", "خوردید", "خوردند",
                "نخوردم", "نخوردی", "نخورد", "نخوردیم", "نخوردید", "نخوردند",
                "خورم", "خوری", /*"خورد",*/ "خوریم", "خورید", "خورند",
                "نخورم", "نخوری", /*"نخورد",*/ "نخوریم", "نخورید", "نخورند",
                "می‌خوردم", "می‌خوردی", /*"می‌خورد",*/ "می‌خوردیم", "می‌خوردید", "می‌خوردند",
                "نمی‌خوردم", "نمی‌خوردی", "نمی‌خورد", "نمی‌خوردیم", "نمی‌خوردید", "نمی‌خوردند",
                "خورده‌ام", "خورده‌ای", "خورده", "خورده‌ایم", "خورده‌اید", "خورده‌اند",
                "نخورده‌ام", "نخورده‌ای", "نخورده", "نخورده‌ایم", "نخورده‌اید", "نخورده‌اند",
                "می‌خورم", "می‌خوری", "می‌خورد", "می‌خوریم", "می‌خورید", "می‌خورند",
                "نمی‌خورم", "نمی‌خوری", /*"نمی‌خورد",*/ "نمی‌خوریم", "نمی‌خورید", "نمی‌خورند",
                "بخورم", "بخوری", "بخورد", "بخوریم", "بخورید", "بخورند",
                "بخور", "نخور"
            };
            actual = lemmatizer.Conjugations(input).ToArray();
            // First assert equal cardinality, then set membership. The inner
            // AreEqual is reached only when a form is missing, and since
            // actual does not contain expected[i] it necessarily fails with a
            // message naming the mismatched pair.
            Assert.AreEqual(expected.Length, actual.Length, "Failed to generate conjugations of '" + input + "' verb");
            for (int i = 0; i < expected.Length; i++)
            {
                if (!actual.Contains(expected[i]))
                {
                    Assert.AreEqual(expected[i], actual[i], "Failed to generate conjugations of '" + input + "' verb");
                }
            }

            // Case 2: vowel-initial stem pair "آورد#آور", whose negative and
            // subjunctive/imperative forms take the "نیـ"/"بیـ" prefixes.
            input    = "آورد#آور";
            expected = new string[] {
                "آوردم", "آوردی", "آورد", "آوردیم", "آوردید", "آوردند",
                "نیاوردم", "نیاوردی", "نیاورد", "نیاوردیم", "نیاوردید", "نیاوردند",
                "آورم", "آوری", /*"آورد",*/ "آوریم", "آورید", "آورند",
                "نیاورم", "نیاوری", /*"نیاورد",*/ "نیاوریم", "نیاورید", "نیاورند",
                "می‌آوردم", "می‌آوردی", /*"می‌آورد",*/ "می‌آوردیم", "می‌آوردید", "می‌آوردند",
                "نمی‌آوردم", "نمی‌آوردی", "نمی‌آورد", "نمی‌آوردیم", "نمی‌آوردید", "نمی‌آوردند",
                "آورده‌ام", "آورده‌ای", "آورده", "آورده‌ایم", "آورده‌اید", "آورده‌اند",
                "نیاورده‌ام", "نیاورده‌ای", "نیاورده", "نیاورده‌ایم", "نیاورده‌اید", "نیاورده‌اند",
                "می‌آورم", "می‌آوری", "می‌آورد", "می‌آوریم", "می‌آورید", "می‌آورند",
                "نمی‌آورم", "نمی‌آوری", /*"نمی‌آورد",*/ "نمی‌آوریم", "نمی‌آورید", "نمی‌آورند",
                "بیاورم", "بیاوری", "بیاورد", "بیاوریم", "بیاورید", "بیاورند",
                "بیاور", "نیاور"
            };
            actual = lemmatizer.Conjugations(input).ToArray();
            Assert.AreEqual(expected.Length, actual.Length, "Failed to generate conjugations of '" + input + "' verb");
            for (int i = 0; i < expected.Length; i++)
            {
                if (!actual.Contains(expected[i]))
                {
                    Assert.AreEqual(expected[i], actual[i], "Failed to generate conjugations of '" + input + "' verb");
                }
            }
        }
コード例 #13
0
        // Sets up an English stemmer and a lemmatizer whose model file
        // ("lemmatizer-en.lem") is loaded from the application base directory.
        public WordNormalizer()
        {
            stemmer = new EnglishStemmer();

            var path   = Path.Combine(ApplicationPath.BaseDirectory, "lemmatizer-en.lem");
            // NOTE(review): the stream is never disposed — presumably the
            // Lemmatizer reads from it lazily; confirm before adding a using.
            var stream = File.OpenRead(path);

            lemmatizer = new Lemmatizer(stream);
        }
コード例 #14
0
        // Builds the vocabulary of lemmatized terms whose corpus frequency reaches
        // 'vocabularyThreshold'. Also returns, via 'lemmalizeWords', the per-document
        // lemma lists (one inner list per input sentence, in input order).
        private static List <string> GetVocabulary(string[] sentences, out List <List <string> > lemmalizeWords, int vocabularyThreshold)
        {
            lemmalizeWords = new List <List <string> >();
            List <string> vocabulary = new List <string>();
            Dictionary <string, int> tFrequency = new Dictionary <string, int>();

            // NOTE(review): the model stream is deliberately left open, matching the
            // original code, in case the Lemmatizer reads from it lazily.
            var stream     = File.OpenRead(path);
            var lemmatizer = new Lemmatizer(stream);

            foreach (var doc in sentences)
            {
                List <string> lemmalizeWord = new List <string>();

                foreach (string part in Tokenize(doc))
                {
                    // Strip non-alphanumeric characters, drop stopwords, lemmatize.
                    string stripped = Regex.Replace(part, "[^a-zA-Z0-9]", "");
                    string filtered = StopwordTool.RemoveStopwords(stripped);
                    string stem     = lemmatizer.Lemmatize(filtered);
                    lemmalizeWord.Add(stem);

                    if (stem.Length > 0)
                    {
                        // BUG FIX: new terms were seeded with 0, so every term's
                        // frequency was undercounted by one and single-occurrence
                        // terms could never reach a threshold of 1. Count every
                        // occurrence with a single-lookup increment.
                        int count;
                        tFrequency.TryGetValue(stem, out count);
                        tFrequency[stem] = count + 1;
                    }
                }
                lemmalizeWords.Add(lemmalizeWord);
            }

            // Keep only terms that reach the frequency threshold.
            // (Unused locals from the original — filterLine, stemmedDoc, docIndex —
            // were removed; they were written but never read.)
            foreach (var item in tFrequency)
            {
                if (item.Value >= vocabularyThreshold)
                {
                    vocabulary.Add(item.Key);
                }
            }

            return vocabulary;
        }
コード例 #15
0
        // Builds a lemmatizer from the bundled English model file; the stream is
        // closed as soon as the model has been constructed.
        private static Lemmatizer CreateLemmatizerFromFile()
        {
            var currentDirectory = Directory.GetCurrentDirectory();
            // Path.Combine instead of hand-rolled "{0}/{1}/{2}" formatting.
            var dataFilePath     = Path.Combine(currentDirectory, "../../Data/Custom", "english.lem");

            using (var stream = File.OpenRead(dataFilePath))
            {
                return new Lemmatizer(stream);
            }
        }
コード例 #16
0
 // Swaps the process-wide lemmatizer according to the checkbox state: a
 // word-list-backed lemmatizer when checked, a no-op implementation otherwise.
 private void lemmatizeWordsChkbox_CheckedChanged(object sender, EventArgs e)
 {
     if (!this.lemmatizeWordsChkbox.Checked)
     {
         Lemmatizer.setInstance(new NullLemmatizer());
         return;
     }
     Lemmatizer.setInstance(new DefaultLemmatizer(Conf.LEMMATIZATION_WORDS_PATH));
 }
コード例 #17
0
ファイル: Application.cs プロジェクト: jasongorman/Conceptual
        // Loads the English lemmatizer model that sits next to the executing assembly.
        private static void LoadLemmatizer()
        {
            string path   = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location);
            var    stream = File.OpenRead(path + @"\full7z-mlteast-en.lem");

            // Terrible fudge to suppress an unhandled deserialization exception
            // message printed by LemmaSharp (an empty try..catch didn't work).
            // BUG FIX: restore stdout in a finally block so an exception in the
            // Lemmatizer constructor no longer leaves the console silenced, and
            // dispose the null writer instead of leaking it.
            TextWriter output = Console.Out;
            var nullWriter = new StreamWriter(Stream.Null);
            Console.SetOut(nullWriter);
            try
            {
                _lemma = new Lemmatizer(stream);
            }
            finally
            {
                Console.SetOut(output);
                nullWriter.Dispose();
            }
        }
コード例 #18
0
        // Verifies Lemmatizer.Conjugations(): a "past#present" stem pair must
        // yield the full set of conjugated surface forms. Commented-out entries
        // in the expected arrays are homographs of a form already listed, so
        // each distinct surface form is expected exactly once.
        public void ConjugationsTest()
        {
            Lemmatizer lemmatizer = new Lemmatizer();

            string input;
            string[] expected, actual;

            // Case 1: stem pair "خورد#خور".
            input = "خورد#خور";
            expected = new string[] {
                "خوردم", "خوردی", "خورد", "خوردیم", "خوردید", "خوردند",
                "نخوردم", "نخوردی", "نخورد", "نخوردیم", "نخوردید", "نخوردند",
                "خورم", "خوری", /*"خورد",*/ "خوریم", "خورید", "خورند",
                "نخورم", "نخوری", /*"نخورد",*/ "نخوریم", "نخورید", "نخورند",
                "می‌خوردم", "می‌خوردی", /*"می‌خورد",*/ "می‌خوردیم", "می‌خوردید", "می‌خوردند",
                "نمی‌خوردم", "نمی‌خوردی", "نمی‌خورد", "نمی‌خوردیم", "نمی‌خوردید", "نمی‌خوردند",
                "خورده‌ام", "خورده‌ای", "خورده", "خورده‌ایم", "خورده‌اید", "خورده‌اند",
                "نخورده‌ام", "نخورده‌ای", "نخورده", "نخورده‌ایم", "نخورده‌اید", "نخورده‌اند",
                "می‌خورم", "می‌خوری", "می‌خورد", "می‌خوریم", "می‌خورید", "می‌خورند",
                "نمی‌خورم", "نمی‌خوری", /*"نمی‌خورد",*/ "نمی‌خوریم", "نمی‌خورید", "نمی‌خورند",
                "بخورم", "بخوری", "بخورد", "بخوریم", "بخورید", "بخورند",
                "بخور", "نخور"
            };
            actual = lemmatizer.Conjugations(input).ToArray();
            // Cardinality first, then membership; the inner AreEqual is reached
            // only for a missing form and then necessarily fails, producing a
            // message that names the mismatched pair.
            Assert.AreEqual(expected.Length, actual.Length, "Failed to generate conjugations of '" + input + "' verb");
            for (int i = 0; i < expected.Length; i++)
            {
                if (!actual.Contains(expected[i]))
                    Assert.AreEqual(expected[i], actual[i], "Failed to generate conjugations of '" + input + "' verb");
            }

            // Case 2: vowel-initial stem pair "آورد#آور" whose negative and
            // subjunctive/imperative forms take the "نیـ"/"بیـ" prefixes.
            input = "آورد#آور";
            expected = new string[] {
                "آوردم", "آوردی", "آورد", "آوردیم", "آوردید", "آوردند",
                "نیاوردم", "نیاوردی", "نیاورد", "نیاوردیم", "نیاوردید", "نیاوردند",
                "آورم", "آوری", /*"آورد",*/ "آوریم", "آورید", "آورند",
                "نیاورم", "نیاوری", /*"نیاورد",*/ "نیاوریم", "نیاورید", "نیاورند",
                "می‌آوردم", "می‌آوردی", /*"می‌آورد",*/ "می‌آوردیم", "می‌آوردید", "می‌آوردند",
                "نمی‌آوردم", "نمی‌آوردی", "نمی‌آورد", "نمی‌آوردیم", "نمی‌آوردید", "نمی‌آوردند",
                "آورده‌ام", "آورده‌ای", "آورده", "آورده‌ایم", "آورده‌اید", "آورده‌اند",
                "نیاورده‌ام", "نیاورده‌ای", "نیاورده", "نیاورده‌ایم", "نیاورده‌اید", "نیاورده‌اند",
                "می‌آورم", "می‌آوری", "می‌آورد", "می‌آوریم", "می‌آورید", "می‌آورند",
                "نمی‌آورم", "نمی‌آوری", /*"نمی‌آورد",*/ "نمی‌آوریم", "نمی‌آورید", "نمی‌آورند",
                "بیاورم", "بیاوری", "بیاورد", "بیاوریم", "بیاورید", "بیاورند",
                "بیاور", "نیاور"
            };
            actual = lemmatizer.Conjugations(input).ToArray();
            Assert.AreEqual(expected.Length, actual.Length, "Failed to generate conjugations of '" + input + "' verb");
            for (int i = 0; i < expected.Length; i++)
            {
                if (!actual.Contains(expected[i]))
                    Assert.AreEqual(expected[i], actual[i], "Failed to generate conjugations of '" + input + "' verb");
            }
        }
コード例 #19
0
 // Loads the (optional) lemmatizer model from the given serializer; the
 // tagger-model loading is currently disabled (kept below for reference).
 public void LoadModels(BinarySerializer taggerModelSer, BinarySerializer lemmatizerModelSer)
 {
     /*Utils.ThrowException(taggerModelSer == null ? new ArgumentNullException("taggerModelSer") : null);
      * mLogger.Debug("Load", "Nalagam model za označevanje ...");
      * mSuffixTree = new PatriciaTree(taggerModelSer);
      * mFeatureSpace = Utils.LoadDictionary<string, int>(taggerModelSer);
      * mModel = new MaximumEntropyClassifierFast<string>(taggerModelSer);*/
     if (lemmatizerModelSer != null)
     {
         // Log message is Slovene: "Loading lemmatization model ...".
         mLogger.Debug("Load", "Nalagam model za lematizacijo ...");
         // The consider-tags flag is read first, then the model is constructed
         // from the remainder of the serialized stream — keep this order.
         mConsiderTags = lemmatizerModelSer.ReadBool();
         mLemmatizer   = new Lemmatizer(lemmatizerModelSer);
     }
 }
コード例 #20
0
        // Lexicon-based sentiment classification: sums the signed tone values of
        // known sentiment words, flipping the sign when the preceding token is a
        // negation (directly, or via an intensifier that is itself preceded by a
        // negation). Returns 1 (positive), -1 (negative) or 0 (neutral/no signal).
        public int Classify(string text)
        {
            Lemmatizer.Stopwords = _stopwords;
            var vector = Lemmatizer.LemmatizeCurrentText(text);
            int tone = 0, toneCount = 0;

            for (int i = 0; i < vector.Count; i++)
            {
                if (!_sentilex.ContainsKey(vector[i]) && !_specificLexicon.ContainsKey(vector[i]))
                {
                    continue;
                }

                // The specific lexicon overrides the general sentiment lexicon.
                int basictone = (_specificLexicon.ContainsKey(vector[i])) ? _specificLexicon[vector[i]] : _sentilex[vector[i]];

                // BUG FIX: 'sign' is now recomputed per sentiment word — it used
                // to stick at -1 for the rest of the text after one negation.
                // BUG FIX: the index guards use '>= 0' (was '> 0'), so a
                // negation/intensifier at position 0 is no longer ignored.
                int sign = 1;
                if (i - 1 >= 0)
                {
                    if (_negations.Contains(vector[i - 1]))
                    {
                        sign = -1;
                    }
                    if (_gains.Contains(vector[i - 1]))
                    {
                        if (i - 2 >= 0 && _negations.Contains(vector[i - 2]))
                        {
                            sign = -1;
                        }
                    }
                }
                tone += basictone * sign;
                toneCount++;
            }

            if (toneCount == 0)
            {
                return 0;
            }

            // Average tone with a +/-0.3 neutrality band.
            var doubletone = tone / (toneCount + 0.0);
            if (doubletone > 0.3)
            {
                return 1;
            }
            if (doubletone < -0.3)
            {
                return -1;
            }
            return 0;
        }
コード例 #21
0
        // Lemmatizes the Cyrillic text of every crawled HTML document and writes
        // the result as a .txt file into the lemmatization result folder.
        static async Task Main(string[] args)
        {
            var documentsFolderPath = PathConstants.CrawlerResultPath;

            var resultFolderPath = PathConstants.LemmatizationResultPath;

            if (!Directory.Exists(resultFolderPath))
            {
                Directory.CreateDirectory(resultFolderPath);
            }

            var regex = new Regex(@"[\p{IsCyrillic}]+");

            // PERF: construct the mystem-backed lemmatizer once, not per file
            // (it was previously re-created inside the loop for every document).
            var lemmatizer = new Lemmatizer("./mystem/mystem.exe");

            foreach (var file in Directory.EnumerateFiles(documentsFolderPath, "*.html"))
            {
                using (var fs = new FileStream(file, FileMode.Open))
                {
                    var document = new HtmlDocument();
                    document.Load(fs);

                    var rootNode = document.DocumentNode;

                    if (rootNode == null)
                    {
                        continue;
                    }

                    // IsNullOrWhiteSpace subsumes the former IsNullOrEmpty check.
                    var tokens = regex.Matches(rootNode.InnerText)
                                 .Select(x => x.ToString())
                                 .Where(x => !string.IsNullOrWhiteSpace(x));

                    var tokenized = string.Join(" ", tokens);

                    var lemmas = lemmatizer.LemmatizeText(tokenized)
                                 .Trim()
                                 .Replace("   ", " ");

                    var outputPath = Path.Combine(resultFolderPath, file
                                                  .Replace(documentsFolderPath + "\\", "")
                                                  .Replace(".html", ".txt"));

                    // BUG FIX: FileMode.Create truncates an existing file;
                    // OpenOrCreate left stale trailing bytes whenever the new
                    // content was shorter than the previous run's output.
                    using (var stream = new FileStream(outputPath, FileMode.Create))
                    {
                        var buffer = Encoding.UTF8.GetBytes(lemmas);
                        await stream.WriteAsync(buffer, 0, buffer.Length);
                    }
                }
            }
        }
コード例 #22
0
ファイル: Program.cs プロジェクト: Liebeck/IWNLP.Lemmatizer
        // Lemmatizes each sentence of the corpus with IWNLP, then serializes the
        // annotated corpus to the given export path.
        static void LemmatizeIWNLP(List<CoNLLSentence> corpus, String exportPath)
        {
            Lemmatizer lemmatizer = new Lemmatizer();
            lemmatizer.Load(AppSettingsWrapper.IWNLPPath);

            for (int index = 0; index < corpus.Count; index++)
            {
                IWNLPSentenceProcessor.ProcessSentence(corpus[index], lemmatizer);
            }

            XMLSerializer.Serialize<List<CoNLLSentence>>(corpus, exportPath);
        }
コード例 #23
0
ファイル: Program.cs プロジェクト: Liebeck/IWNLP.Lemmatizer
        // Loads the IWNLP dictionary once, runs every sentence through the
        // sentence processor, then persists the corpus as XML.
        static void LemmatizeIWNLP(List <CoNLLSentence> corpus, string exportPath)
        {
            Lemmatizer iwnlp = new Lemmatizer();
            iwnlp.Load(AppSettingsWrapper.IWNLPPath);

            int i = 0;
            while (i < corpus.Count)
            {
                IWNLPSentenceProcessor.ProcessSentence(corpus[i], iwnlp);
                i++;
            }

            XMLSerializer.Serialize <List <CoNLLSentence> >(corpus, exportPath);
        }
コード例 #24
0
        // Loads the four keyword dictionaries (trimmed, lower-cased, one word per
        // line) and constructs the data provider and lemmatizer.
        public QueryIndependentFeatures()
        {
            Console.WriteLine("Initializing QueryIndependentFeatures");
            const string buhWordsFilepath           = "Dictionaries/BuhWords.txt";
            const string taxWordsFilepath           = "Dictionaries/TaxWords.txt";
            const string controlSystemWordsFilepath = "Dictionaries/ControlSystemWords.txt";
            const string formWordsFilepath          = "Dictionaries/FormWords.txt";

            // Shared loader for the four identical read/trim/lower pipelines.
            // ToLowerInvariant avoids culture-sensitive casing surprises (CA1304)
            // that plain ToLower() exhibits under e.g. the Turkish locale.
            Func<string, List<string>> loadWords = filePath =>
                File.ReadAllLines(filePath).Select(line => line.Trim().ToLowerInvariant()).ToList();

            buhWords           = loadWords(buhWordsFilepath);
            taxWords           = loadWords(taxWordsFilepath);
            controlSystemWords = loadWords(controlSystemWordsFilepath);
            formWords          = loadWords(formWordsFilepath);

            buhonlineDataProvider = new BuhOnlineDataProvider();
            lemmatizer            = new Lemmatizer();
        }
コード例 #25
0
ファイル: Program.cs プロジェクト: quangfox/LemmaGenerator
        // Ensures the lemmatizer maps 'word' to 'lemma': first by example (which
        // may let it derive a rule), and if that is not enough, by a hard exception.
        private static void AddExampleOrException(Lemmatizer lemmatizer, string word, string lemma)
        {
            // Already produces the right lemma: nothing to do.
            if (lemmatizer.Lemmatize(word) == lemma)
            {
                return;
            }

            // Teach by example.
            lemmatizer.AddExample(word, lemma);

            // Still wrong after the example --> pin the mapping as an exception.
            if (lemmatizer.Lemmatize(word) != lemma)
            {
                Console.WriteLine("Added lemma exception: {0} -> {1}", word, lemma);
                lemmatizer.AddException(word, lemma);
            }
        }
コード例 #26
0
        /// <summary>
        /// Initializes a new instance of the <see cref="MorphologyModel"/> class.
        /// </summary>
        /// <param name="nGramm">The N-gram model.</param>
        /// <param name="entModel">The ambiguity-class (entropy) model.</param>
        /// <param name="folder">Folder used to store the model files; recreated from scratch.</param>
        /// <param name="lemmaFile">Lemmatizer data file, copied into <paramref name="folder"/>.</param>
        public MorphologyModel(TagNGramm nGramm, IEntropyClassModel entModel,
                               string folder, string lemmaFile)
        {
            this.nGramm   = nGramm;
            this.entClass = entModel;
            this.folder   = folder;
            // Recreate the model folder from scratch so the model owns a clean copy
            // of its files; any previous contents are deleted recursively.
            if (Directory.Exists(folder))
            {
                Directory.Delete(folder, true);
            }
            Directory.CreateDirectory(folder);
            this.lemmaFile = folder + "\\" + lemmaFile;
            File.Copy(lemmaFile, this.lemmaFile);
            // NOTE(review): the stream is not disposed here — presumably the
            // Lemmatizer keeps reading from it; confirm before adding a using.
            FileStream fs = File.OpenRead(this.lemmaFile);

            lemmatizer = new Lemmatizer(fs);
            Initialize();
        }
コード例 #27
0
ファイル: wordmap.cs プロジェクト: qa1/wordmap
        // Removes (lemmatized) prepositional-verb entries from the word-count map
        // and returns the same, mutated, dictionary.
        public Dictionary <string, int> deletePrepVerb(List <string> prepVerbList, Dictionary <string, int> wordCount)
        {
            Lemmatizer lemmatize = new Lemmatizer();

            foreach (string element in prepVerbList)
            {
                // Mirrors CalcWordCount: only words of length >= 5 are lemmatized.
                string lemElement = element.Length >= 5 ? lemmatize.Lemmatize(element) : element;

                // Dictionary.Remove is already a no-op for missing keys, so the
                // former ContainsKey pre-check was a redundant second lookup.
                wordCount.Remove(lemElement);
            }

            return wordCount;
        }
コード例 #28
0
        // Lemmatizes the input text and records its unigrams and bigrams under
        // the given emotion label.
        public void parse(string inputString, int emotion)
        {
            this.emotion = emotion;

            // (Removed an unused Lemmatizer instance and an unused, undisposed
            // StringReader — LemmatizeCurrentText is a static call on the type.)
            var vector = Lemmatizer.LemmatizeCurrentText(inputString);
            string prevWord = null;

            foreach (var str in vector)
            {
                // Unigram.
                addGramm(str);
                // Bigram formed with the previous token.
                if (prevWord != null)
                {
                    addGramm(prevWord + " " + str);
                }
                prevWord = str;
            }
        }
コード例 #29
0
ファイル: Program.cs プロジェクト: Rooniey/KSR
        // Tokenizes the articles carrying 'label' (special characters replaced,
        // stop words removed, tokens lemmatized) and returns the 'termCount' most
        // frequent terms with their counts, ordered by descending count.
        private static Dictionary <string, int> GetMostFrequentTermsForLabel(List <LabeledArticle> articles, string label, int termCount = 20, string[] stopList = null)
        {
            List <TokenizedArticle> tokenizedArticles = new List <TokenizedArticle>();

            // (Commented-out dead code referring to 'allTokenizedArticles' removed.)
            foreach (var article in articles)
            {
                if (article.Label != label)
                {
                    continue;
                }

                var body           = TextUtility.ReplaceSpecialCharacters(article.Article.Body);
                var processedWords = StopWordsFilterProcessor.Process(Tokenizer.TokenizeWords(body), stopList);
                processedWords     = Lemmatizer.Process(processedWords);
                tokenizedArticles.Add(new TokenizedArticle(article, processedWords));
            }

            Dictionary <string, int> countDictionary = new Dictionary <string, int>();

            foreach (var tokenizedArticle in tokenizedArticles)
            {
                foreach (var token in tokenizedArticle.Tokens)
                {
                    // Single-lookup increment (was ContainsKey + indexer, i.e.
                    // two hash lookups per token).
                    int count;
                    countDictionary.TryGetValue(token.Word, out count);
                    countDictionary[token.Word] = count + 1;
                }
            }

            return countDictionary
                   .OrderByDescending(pair => pair.Value)
                   .Take(termCount)
                   .ToDictionary(pair => pair.Key, pair => pair.Value);
        }
コード例 #30
0
ファイル: Program.cs プロジェクト: stuartd/LemmaGenerator
        /// <summary>
        /// Teaches the lemmatizer the (word, lemma) pair: if the current model
        /// already yields the expected lemma, nothing is done; otherwise the pair
        /// is added as a training example, and if the model still fails it is
        /// registered as a hard exception.
        /// </summary>
        private static void AddExampleOrException(Lemmatizer lemmatizer, string word, string lemma)
        {
            // Already correct -> nothing to teach.
            if (lemmatizer.Lemmatize(word) == lemma)
            {
                return;
            }

            // Let the lemmatizer try to deduce a new rule from this example.
            lemmatizer.AddExample(word, lemma);

            // Rule deduction failed as well -> pin the pair as an explicit exception.
            if (lemmatizer.Lemmatize(word) != lemma)
            {
                Console.WriteLine("Added lemma exception: {0} -> {1}", word, lemma);
                lemmatizer.AddException(word, lemma);
            }
        }
コード例 #31
0
ファイル: Program.cs プロジェクト: quangfox/LemmaGenerator
        /// <summary>
        /// Loads a serialized lemmatizer, enriches it with each enricher file,
        /// and persists the result (LZMA-compressed) to the output path.
        /// </summary>
        /// <param name="lemmatizerFilePath">Path of the serialized lemmatizer to load.</param>
        /// <param name="outputFilePath">Path the enriched lemmatizer is written to.</param>
        /// <param name="enricherFilePaths">Lemma files merged into the lemmatizer.</param>
        private static void EnrichLemmatizerFile(string lemmatizerFilePath, string outputFilePath,
                                                 IEnumerable <string> enricherFilePaths)
        {
            using (var stream = File.OpenRead(lemmatizerFilePath))
            {
                var lemmatizer = new Lemmatizer(stream);
                // enrich lemmatizer with every other file
                foreach (var filePath in enricherFilePaths)
                {
                    EnrichLemmatizer(lemmatizer, filePath);
                }

                // persist lemmatizer in output file
                Console.WriteLine("Writing output file...");
                using (var oStream = File.Create(outputFilePath))
                {
                    lemmatizer.Serialize(oStream, true, Lemmatizer.Compression.Lzma, true);
                }
                // Fixed message typo: "Outuput" -> "Output".
                Console.WriteLine("Output file written at {0}", outputFilePath);
            }
        }
コード例 #32
0
        /// <summary>
        /// Demo endpoint: lemmatizes a hard-coded Russian text word by word and
        /// returns the lemmatized string as the single response element.
        /// </summary>
        /// <returns>A one-element sequence containing the lemmatized text.</returns>
        public ActionResult <IEnumerable <string> > Get()
        {
            var sb = new StringBuilder();
            // NOTE(review): hard-coded local path — should come from configuration.
            var stream = new FileStream(@"C:\Users\D.Pugach\Downloads\full7z-mlteast-ru.lem", FileMode.Open);

            using (stream) {
                var allText    = "Вазомоторный ринит что это такое Среди многочисленных видов ринита (насморка) эта патология занимает особое место, поскольку этиология ее возникновения до сих пор до конца не изучена. Вазомоторный ринит, чаще всего поражающий людей старше 20 лет, является заболеванием, которое может протекать в виде";
                var lemmatizer = new Lemmatizer(stream);

                // Removed a whole-text Lemmatize(allText.ToLower()) call whose
                // return value was discarded; lemmatization happens per word below.
                foreach (var word in allText.Split(' '))
                {
                    sb.Append(lemmatizer.Lemmatize(word)).Append(" ");
                }
                Console.WriteLine(sb.ToString());
            }

            return(new string[] { sb.ToString() });
        }
コード例 #33
0
        /// <summary>
        /// Builds transaction records from the input text: reads sentences,
        /// lemmatizes every word, collects the distinct vocabulary, and emits one
        /// Record per word keyed by sentence number and vocabulary index.
        /// </summary>
        /// <param name="inpReader">Source of the input text.</param>
        /// <param name="RelationName">Name stored for the generated relation.</param>
        public TextGenerator(IFileReader inpReader, string RelationName)
        {
            this.RelationName = RelationName;
            Sentences         = new StringParser(inpReader).GetSentances;

            WordsAll = new List <string>();
            // Dispose the model stream when lemmatization is done (it was leaked
            // before). NOTE(review): assumes Lemmatizer reads the stream eagerly
            // and does not keep it — confirm against the lemmatizer library.
            using (var stream = File.OpenRead(AppDomain.CurrentDomain.BaseDirectory + @"\\full7z-mlteast-ru.lem"))
            {
                var lemmatizer = new Lemmatizer(stream);

                foreach (var Sentence in Sentences)
                {
                    for (int i = 0; i < Sentence.Count; i++)
                    {
                        // Replace each word with its lemma and record it.
                        Sentence[i] = lemmatizer.Lemmatize(Sentence[i]);
                        WordsAll.Add(Sentence[i]);
                    }
                }
            }
            WordsAll = WordsAll.Distinct(new PartialComparer()).ToList();

            var SentenceNum = 0;
            foreach (var Sentence in Sentences)
            {
                foreach (var Word in Sentence)
                {
                    recList.Add(new Record()
                    {
                        TransactId = SentenceNum, ItemId = WordsAll.FindIndex(a => a == Word), Item = Word
                    });
                }
                SentenceNum++;
            }
        }
コード例 #34
0
        /// <summary>
        /// Initializes a new instance of the <see cref="MorphologyModel"/> class
        /// from serialized state (custom deserialization constructor): restores
        /// scalar/collection fields from <paramref name="si"/> and reloads the
        /// n-gram, entropy-class and lemmatizer models from files on disk.
        /// </summary>
        /// <param name="si">Serialization info holding the persisted state.</param>
        /// <param name="context">Streaming context (unused here).</param>
        protected MorphologyModel(SerializationInfo si, StreamingContext context)
        {
            this.folder      = si.GetString("folder");
            this.groups      = (List <TagGroup>)si.GetValue("groups", typeof(List <TagGroup>));
            this.punctuation = (List <string>)si.GetValue("punctuation", typeof(List <string>));
            // The model files live inside the persisted "folder" path.
            using (FileStream fs1 = File.Open(string.Format("{0}/{1}", folder, "nGramm.mdl"), FileMode.Open))
                using (FileStream fs2 = File.Open(string.Format("{0}/{1}", folder, "entClass.mdl"), FileMode.Open))
                {
                    this.nGramm   = Serializer.Deserialize <TagNGramm>(fs1);
                    this.entClass = new DawgEntropyClassModel();
                    entClass.Load(fs2);
                }
            this.sentenceDelimiters = (List <string>)si.GetValue("delimiters", typeof(List <string>));
            this.lemmaFile          = si.GetString("lemmaFile");
            // NOTE(review): fs is never disposed — presumably the Lemmatizer takes
            // over (or eagerly consumes) the stream; confirm ownership before
            // wrapping it in a using.
            FileStream fs = File.Open(this.lemmaFile, FileMode.Open);

            this.lemmatizer      = new Lemmatizer(fs);
            this.sentencePattern = (Regex)si.GetValue("sentencePattern", typeof(Regex));
            this.lexemPattern    = (Regex)si.GetValue("lexemPattern", typeof(Regex));
            // NOTE(review): key "minLenght" (sic) must match the serializing side.
            this.minLengh        = si.GetInt32("minLenght");
            serviceTags          = (Dictionary <string, Tag>)si.GetValue("serviceTags",
                                                                         typeof(Dictionary <string, Tag>));
        }
コード例 #35
0
ファイル: TextMiningUtils.cs プロジェクト: viidea/latino
 /// <summary>
 /// Resolves the stop-word list and stemming component for a language.
 /// Unsupported languages raise ArgumentNotSupportedException; Estonian has no
 /// stop-word list, so null is returned for it.
 /// </summary>
 public static void GetLanguageTools(Language language, out Set<string>.ReadOnly stopWords, out IStemmer stemmer)
 {
     // Step 1: stop-word list (also validates that the language is supported).
     switch (language)
     {
         case Language.Bulgarian:  stopWords = StopWords.BulgarianStopWords;   break;
         case Language.Czech:      stopWords = StopWords.CzechStopWords;       break;
         case Language.Danish:     stopWords = StopWords.DanishStopWords;      break;
         case Language.Dutch:      stopWords = StopWords.DutchStopWords;       break;
         case Language.English:    stopWords = StopWords.EnglishStopWords;     break;
         case Language.Estonian:   stopWords = null;                           break; // *** stop words are missing
         case Language.Finnish:    stopWords = StopWords.FinnishStopWords;     break;
         case Language.French:     stopWords = StopWords.FrenchStopWords;      break;
         case Language.German:     stopWords = StopWords.GermanStopWords;      break;
         case Language.Hungarian:  stopWords = StopWords.HungarianStopWords;   break;
         case Language.Italian:    stopWords = StopWords.ItalianStopWords;     break;
         case Language.Norwegian:  stopWords = StopWords.NorwegianStopWords;   break;
         case Language.Portuguese: stopWords = StopWords.PortugueseStopWords;  break;
         case Language.Romanian:   stopWords = StopWords.RomanianStopWords;    break;
         case Language.Russian:    stopWords = StopWords.RussianStopWords;     break;
         case Language.Serbian:    stopWords = StopWords.SerbianStopWordsLatin; break;
         case Language.Slovene:    stopWords = StopWords.SloveneStopWords;     break;
         case Language.Spanish:    stopWords = StopWords.SpanishStopWords;     break;
         case Language.Swedish:    stopWords = StopWords.SwedishStopWords;     break;
         default:
             throw new ArgumentNotSupportedException("language");
     }
     // Step 2: stemming component — these languages use the lemmatizer,
     // all other supported languages use the Snowball stemmer.
     switch (language)
     {
         case Language.Bulgarian:
         case Language.Czech:
         case Language.Estonian:
         case Language.Hungarian:
         case Language.Romanian:
         case Language.Serbian:
         case Language.Slovene:
             stemmer = new Lemmatizer(language);
             break;
         default:
             stemmer = new Stemmer(language);
             break;
     }
 }
コード例 #36
0
 /// <summary>
 /// Assigns predicted lemmas to the tokens of a sentence using the IWNLP
 /// lemmatizer. Nouns ("NN"), adjectives ("ADJA"/"ADJD") and verbs ("V*") are
 /// looked up against a fixed fallback order of part-of-speech dictionaries;
 /// tokens with no match keep their PredictedLemmas unset.
 /// </summary>
 /// <param name="sentence">Sentence whose tokens receive PredictedLemmas.</param>
 /// <param name="iwnlp">IWNLP lemmatizer used for the dictionary lookups.</param>
 public static void ProcessSentence(CoNLLSentence sentence, Lemmatizer iwnlp)
 {
     // Removed the unused tokenArray local (it only fed the commented-out
     // MateTools call) and the surrounding dead commented code.
     for (int i = 0; i < sentence.Tokens.Count; i++)
     {
         CoNLLToken token = sentence.Tokens[i];
         if (token.POS == "NN")
         {
             // Fallback order: Noun, X, AdjectivalDeclension, then a combined
             // Noun/X lookup with the relaxed flag (third argument true).
             List<POS> pos = new List<POS>() { POS.Noun, POS.X };
             if (iwnlp.ContainsEntry(token.Form, POS.Noun))
             {
                 token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.Noun);
             }
             else if (iwnlp.ContainsEntry(token.Form, POS.X))
             {
                 token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.X);
             }
             else if (iwnlp.ContainsEntry(token.Form, POS.AdjectivalDeclension))
             {
                 token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.AdjectivalDeclension);
             }
             else if (iwnlp.ContainsEntry(token.Form, pos, true))
             {
                 token.PredictedLemmas = iwnlp.GetLemmas(token.Form, pos, true);
             }
         }
         else if (token.POS == "ADJA" || token.POS == "ADJD")
         {
             // Fallback order: exact adjective, then relaxed adjective, noun,
             // X, and finally verb lookups.
             if (iwnlp.ContainsEntry(token.Form, POS.Adjective))
             {
                 token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.Adjective);
             }
             else if (iwnlp.ContainsEntry(token.Form, POS.Adjective, true))
             {
                 token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.Adjective, true);
             }
             else if (iwnlp.ContainsEntry(token.Form, POS.Noun, true))
             {
                 token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.Noun, true);
             }
             else if (iwnlp.ContainsEntry(token.Form, POS.X, true))
             {
                 token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.X, true);
             }
             else if (iwnlp.ContainsEntry(token.Form, POS.Verb, true))
             {
                 token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.Verb, true);
             }
         }
         else if (token.POS.StartsWith("V"))
         {
             if (iwnlp.ContainsEntry(token.Form, POS.Verb, true))
             {
                 token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.Verb, true);
             }
         }
     }
 }