コード例 #1
0
        public void LemmatizeTest()
        {
            Lemmatizer lemmatizer = new Lemmatizer();

            // Parallel arrays: input word, expected lemma, and optional POS tag.
            // A null tag means the single-argument Lemmatize overload is used.
            string[] words  = { "کتاب‌ها", "آتشفشان", "می‌روم", "گفته شده است", "مردم" };
            string[] lemmas = { "کتاب", "آتشفشان", "رفت#رو", "گفت#گو", "مردم" };
            string[] tags   = { null, null, null, null, "N" };

            for (int i = 0; i < words.Length; i++)
            {
                string actual = tags[i] == null
                    ? lemmatizer.Lemmatize(words[i])
                    : lemmatizer.Lemmatize(words[i], tags[i]);
                Assert.AreEqual(lemmas[i], actual, "Failed to lematize of '" + words[i] + "' word");
            }
        }
コード例 #2
0
ファイル: Program.cs プロジェクト: quangfox/LemmaGenerator
        private static void EnrichLemmatizer(Lemmatizer lemmatizer, string enricherFilePath)
        {
            // Read every (word, lemma, frequency) entry from the enrichment file
            // and delegate to the overload that feeds them into the lemmatizer.
            var reader = new EnricherFileReader(enricherFilePath);
            EnrichLemmatizer(lemmatizer, reader.ReadAllLemmaEntries());
        }
コード例 #3
0
        public void LemmatizeTest()
        {
            Lemmatizer lemmatizer = new Lemmatizer();

            // Inputs, expected lemmas and optional POS tags kept as parallel
            // arrays; a null tag selects the single-argument Lemmatize overload.
            string[] inputs =
            {
                "کتاب‌ها", "آتشفشان", "می‌روم", "گفته شده است", "نچشیده است", "مردم", "اجتماعی"
            };
            string[] expecteds =
            {
                "کتاب", "آتشفشان", "رفت#رو", "گفت#گو", "چشید#چش", "مردم", "اجتماعی"
            };
            string[] pos =
            {
                null, null, null, null, null, "N", "AJ"
            };

            for (int i = 0; i < inputs.Length; i++)
            {
                string word = inputs[i];
                string tag  = pos[i];
                string actual = tag == null
                    ? lemmatizer.Lemmatize(word)
                    : lemmatizer.Lemmatize(word, tag);
                Assert.AreEqual(expecteds[i], actual, "Failed to lematize of '" + word + "' word");
            }
        }
コード例 #4
0
ファイル: LemmatizerTest.cs プロジェクト: nvdnkpr/NHazm
        public void LemmatizeTest()
        {
            var lemmatizer = new Lemmatizer();

            // Each case asserts directly on the lemmatizer output; the failure
            // message echoes the word that failed to lemmatize.
            var input = "کتاب‌ها";
            Assert.AreEqual("کتاب", lemmatizer.Lemmatize(input),
                "Failed to lematize of '" + input + "' word");

            input = "آتشفشان";
            Assert.AreEqual("آتشفشان", lemmatizer.Lemmatize(input),
                "Failed to lematize of '" + input + "' word");

            input = "می‌روم";
            Assert.AreEqual("رفت#رو", lemmatizer.Lemmatize(input),
                "Failed to lematize of '" + input + "' word");

            input = "گفته شده است";
            Assert.AreEqual("گفت#گو", lemmatizer.Lemmatize(input),
                "Failed to lematize of '" + input + "' word");

            // POS-tagged overload: "مردم" is disambiguated as a noun.
            input = "مردم";
            Assert.AreEqual("مردم", lemmatizer.Lemmatize(input, "N"),
                "Failed to lematize of '" + input + "' word");
        }
コード例 #5
0
 // Static initializer: loads the English and Russian lemmatizer models once
 // for the lifetime of the process.
 static LemmatizationHandler()
 {
     // NOTE(review): the streams are kept open in fields rather than disposed —
     // presumably Lemmatizer reads from them lazily; confirm before closing them.
     enStream     = File.OpenRead(EnLemmatizerpath);
     ruStream     = File.OpenRead(RuLemmatizerpath);
     enLemmatizer = new Lemmatizer(enStream);
     ruLemmatizer = new Lemmatizer(ruStream);
 }
コード例 #6
0
ファイル: Program.cs プロジェクト: stuartd/LemmaGenerator
        static void Main(string[] args)
        {
            // Builds an enriched lemmatizer: loads the base .lem model, feeds it
            // every enrichment file found under Input/, and serializes the result
            // into the Output/ folder with a "-modified" suffix.
            var currentDirectory   = Environment.CurrentDirectory + "/../../";
            var lemmatizerFilePath = currentDirectory + "../Test/Data/full7z-mlteast-en.lem";

            var fileName       = Path.GetFileNameWithoutExtension(lemmatizerFilePath) + "-modified";
            var extension      = Path.GetExtension(lemmatizerFilePath);
            var outputFilePath = string.Format("{0}Output/{1}{2}", currentDirectory, fileName, extension);

            var enricherFilePaths = Directory.GetFiles(currentDirectory + "Input/");

            using (var stream = File.OpenRead(lemmatizerFilePath))
            {
                // Create base lemmatizer with data in the base source file.
                var lemmatizer = new Lemmatizer(stream);

                // Then enrich the lemmatizer with every other data file.
                foreach (var filePath in enricherFilePaths)
                {
                    EnrichLemmatizerWithDataFile(lemmatizer, filePath);
                }

                // Persist lemmatizer in the output file.
                Console.WriteLine("Writing output file...");
                using (var oStream = File.Create(outputFilePath))
                {
                    lemmatizer.Serialize(oStream, true, Lemmatizer.Compression.Lzma, true);
                }
                // Fixed typo in user-facing message: "Outuput" -> "Output".
                Console.WriteLine("Output file written at {0}", outputFilePath);
            }

            Console.WriteLine("OK");
            Console.ReadKey();
        }
コード例 #7
0
        public void LemmatizeTest()
        {
            Lemmatizer lemmatizer = new Lemmatizer();

            // Test words, their expected lemmas, and optional POS tags
            // (a null tag selects the single-argument overload).
            string[] inputs =
            {
                "کتاب‌ها", "آتشفشان", "می‌روم", "گفته شده است", "نچشیده است", "مردم", "اجتماعی"
            };
            string[] expecteds =
            {
                "کتاب", "آتشفشان", "رفت#رو", "گفت#گو", "چشید#چش", "مردم", "اجتماعی"
            };
            string[] tags =
            {
                null, null, null, null, null, "N", "AJ"
            };

            int i = 0;
            while (i < inputs.Length)
            {
                string word = inputs[i];
                string actual = tags[i] == null
                    ? lemmatizer.Lemmatize(word)
                    : lemmatizer.Lemmatize(word, tags[i]);
                Assert.AreEqual(expecteds[i], actual, "Failed to lematize of '" + word + "' word");
                i++;
            }
        }
コード例 #8
0
ファイル: wordmap.cs プロジェクト: qa1/wordmap
        // Counts word occurrences, lemmatizing words of length >= 5 first so that
        // inflected variants are folded into a single dictionary entry.
        public Dictionary <string, int> CalcWordCount(string[] words)
        {
            Dictionary<string, int> wordCount = new Dictionary<string, int>();
            Lemmatizer lemmatize = new Lemmatizer();

            foreach (string word in words)
            {
                // Short words are counted verbatim; longer ones are lemmatized.
                string lemmWord = word.Length >= 5 ? lemmatize.Lemmatize(word) : word;

                // Single-lookup increment instead of ContainsKey + indexer
                // (which performed two hash lookups per word).
                int count;
                wordCount.TryGetValue(lemmWord, out count);
                wordCount[lemmWord] = count + 1;
            }

            return wordCount;
        }
コード例 #9
0
        // Loads the Russian lemmatizer model from the bundled .lem file once,
        // closing the file as soon as the model has been constructed.
        static Lemmatizator()
        {
            using (var modelStream = new FileStream(@"full7z-mlteast-ru.lem", FileMode.Open))
            {
                _lemmatizer = new Lemmatizer(modelStream);
            }
        }
コード例 #10
0
ファイル: Program.cs プロジェクト: quangfox/LemmaGenerator
 // Adds every (word, lemma, frequency) tuple to the lemmatizer; the frequency
 // component (Item3) is not needed for enrichment.
 private static void EnrichLemmatizer(Lemmatizer lemmatizer, IEnumerable <Tuple <string, string, int> > wordsAndLemmaToAdd)
 {
     foreach (var entry in wordsAndLemmaToAdd)
     {
         AddExampleOrException(lemmatizer, entry.Item1, entry.Item2);
     }
 }
コード例 #11
0
 // Wires up the collaborators used for parse-based phrasal-verb detection:
 // a treebank parser, a lemmatizer, a tokenizer and a POS tagger. The values
 // are stored as-is; no validation or copying is performed here.
 public ParseBasedPhrasalVerbDetector(EnglishTreebankParser parser, Lemmatizer lemmatizer,
                                      EnglishMaximumEntropyTokenizer tokenizer, EnglishMaximumEntropyPosTagger tagger)
 {
     this.parser     = parser;
     this.lemmatizer = lemmatizer;
     this.tokenizer  = tokenizer;
     this.tagger     = tagger;
 }
コード例 #12
0
        // Verifies Lemmatizer.Conjugations(): for a "past#present" stem pair it
        // must generate the complete set of conjugated surface forms. Entries
        // commented out inside the expected arrays are surface forms identical
        // to one listed earlier in the array, so they are expected only once.
        public void ConjugationsTest()
        {
            Lemmatizer lemmatizer = new Lemmatizer();

            string input;

            string[] expected, actual;

            // Case 1: stem pair "خورد#خور".
            input    = "خورد#خور";
            expected = new string[] {
                "خوردم", "خوردی", "خورد", "خوردیم", "خوردید", "خوردند",
                "نخوردم", "نخوردی", "نخورد", "نخوردیم", "نخوردید", "نخوردند",
                "خورم", "خوری", /*"خورد",*/ "خوریم", "خورید", "خورند",
                "نخورم", "نخوری", /*"نخورد",*/ "نخوریم", "نخورید", "نخورند",
                "می‌خوردم", "می‌خوردی", /*"می‌خورد",*/ "می‌خوردیم", "می‌خوردید", "می‌خوردند",
                "نمی‌خوردم", "نمی‌خوردی", "نمی‌خورد", "نمی‌خوردیم", "نمی‌خوردید", "نمی‌خوردند",
                "خورده‌ام", "خورده‌ای", "خورده", "خورده‌ایم", "خورده‌اید", "خورده‌اند",
                "نخورده‌ام", "نخورده‌ای", "نخورده", "نخورده‌ایم", "نخورده‌اید", "نخورده‌اند",
                "می‌خورم", "می‌خوری", "می‌خورد", "می‌خوریم", "می‌خورید", "می‌خورند",
                "نمی‌خورم", "نمی‌خوری", /*"نمی‌خورد",*/ "نمی‌خوریم", "نمی‌خورید", "نمی‌خورند",
                "بخورم", "بخوری", "بخورد", "بخوریم", "بخورید", "بخورند",
                "بخور", "نخور"
            };
            actual = lemmatizer.Conjugations(input).ToArray();
            // First assert equal cardinality, then set membership. The inner
            // AreEqual is reached only when a form is missing, and since
            // actual does not contain expected[i] it necessarily fails with a
            // message naming the mismatched pair.
            Assert.AreEqual(expected.Length, actual.Length, "Failed to generate conjugations of '" + input + "' verb");
            for (int i = 0; i < expected.Length; i++)
            {
                if (!actual.Contains(expected[i]))
                {
                    Assert.AreEqual(expected[i], actual[i], "Failed to generate conjugations of '" + input + "' verb");
                }
            }

            // Case 2: vowel-initial stem pair "آورد#آور", whose negative and
            // subjunctive/imperative forms take the "نیـ"/"بیـ" prefixes.
            input    = "آورد#آور";
            expected = new string[] {
                "آوردم", "آوردی", "آورد", "آوردیم", "آوردید", "آوردند",
                "نیاوردم", "نیاوردی", "نیاورد", "نیاوردیم", "نیاوردید", "نیاوردند",
                "آورم", "آوری", /*"آورد",*/ "آوریم", "آورید", "آورند",
                "نیاورم", "نیاوری", /*"نیاورد",*/ "نیاوریم", "نیاورید", "نیاورند",
                "می‌آوردم", "می‌آوردی", /*"می‌آورد",*/ "می‌آوردیم", "می‌آوردید", "می‌آوردند",
                "نمی‌آوردم", "نمی‌آوردی", "نمی‌آورد", "نمی‌آوردیم", "نمی‌آوردید", "نمی‌آوردند",
                "آورده‌ام", "آورده‌ای", "آورده", "آورده‌ایم", "آورده‌اید", "آورده‌اند",
                "نیاورده‌ام", "نیاورده‌ای", "نیاورده", "نیاورده‌ایم", "نیاورده‌اید", "نیاورده‌اند",
                "می‌آورم", "می‌آوری", "می‌آورد", "می‌آوریم", "می‌آورید", "می‌آورند",
                "نمی‌آورم", "نمی‌آوری", /*"نمی‌آورد",*/ "نمی‌آوریم", "نمی‌آورید", "نمی‌آورند",
                "بیاورم", "بیاوری", "بیاورد", "بیاوریم", "بیاورید", "بیاورند",
                "بیاور", "نیاور"
            };
            actual = lemmatizer.Conjugations(input).ToArray();
            Assert.AreEqual(expected.Length, actual.Length, "Failed to generate conjugations of '" + input + "' verb");
            for (int i = 0; i < expected.Length; i++)
            {
                if (!actual.Contains(expected[i]))
                {
                    Assert.AreEqual(expected[i], actual[i], "Failed to generate conjugations of '" + input + "' verb");
                }
            }
        }
コード例 #13
0
        // Sets up an English stemmer and a lemmatizer whose model file
        // ("lemmatizer-en.lem") is loaded from the application base directory.
        public WordNormalizer()
        {
            stemmer = new EnglishStemmer();

            var path   = Path.Combine(ApplicationPath.BaseDirectory, "lemmatizer-en.lem");
            // NOTE(review): the stream is never disposed — presumably the
            // Lemmatizer reads from it lazily; confirm before adding a using.
            var stream = File.OpenRead(path);

            lemmatizer = new Lemmatizer(stream);
        }
コード例 #14
0
        // Builds the vocabulary of lemmatized terms whose corpus frequency reaches
        // 'vocabularyThreshold'. Also returns, via 'lemmalizeWords', the per-document
        // lemma lists (one inner list per input sentence, in input order).
        private static List <string> GetVocabulary(string[] sentences, out List <List <string> > lemmalizeWords, int vocabularyThreshold)
        {
            lemmalizeWords = new List <List <string> >();
            List <string> vocabulary = new List <string>();
            Dictionary <string, int> tFrequency = new Dictionary <string, int>();

            // NOTE(review): the model stream is deliberately left open, matching the
            // original code, in case the Lemmatizer reads from it lazily.
            var stream     = File.OpenRead(path);
            var lemmatizer = new Lemmatizer(stream);

            foreach (var doc in sentences)
            {
                List <string> lemmalizeWord = new List <string>();

                foreach (string part in Tokenize(doc))
                {
                    // Strip non-alphanumeric characters, drop stopwords, lemmatize.
                    string stripped = Regex.Replace(part, "[^a-zA-Z0-9]", "");
                    string filtered = StopwordTool.RemoveStopwords(stripped);
                    string stem     = lemmatizer.Lemmatize(filtered);
                    lemmalizeWord.Add(stem);

                    if (stem.Length > 0)
                    {
                        // BUG FIX: new terms were seeded with 0, so every term's
                        // frequency was undercounted by one and single-occurrence
                        // terms could never reach a threshold of 1. Count every
                        // occurrence with a single-lookup increment.
                        int count;
                        tFrequency.TryGetValue(stem, out count);
                        tFrequency[stem] = count + 1;
                    }
                }
                lemmalizeWords.Add(lemmalizeWord);
            }

            // Keep only terms that reach the frequency threshold.
            // (Unused locals from the original — filterLine, stemmedDoc, docIndex —
            // were removed; they were written but never read.)
            foreach (var item in tFrequency)
            {
                if (item.Value >= vocabularyThreshold)
                {
                    vocabulary.Add(item.Key);
                }
            }

            return vocabulary;
        }
コード例 #15
0
        // Builds a lemmatizer from the bundled English model file; the stream is
        // closed as soon as the model has been constructed.
        private static Lemmatizer CreateLemmatizerFromFile()
        {
            var currentDirectory = Directory.GetCurrentDirectory();
            // Path.Combine instead of hand-rolled "{0}/{1}/{2}" formatting.
            var dataFilePath     = Path.Combine(currentDirectory, "../../Data/Custom", "english.lem");

            using (var stream = File.OpenRead(dataFilePath))
            {
                return new Lemmatizer(stream);
            }
        }
コード例 #16
0
 // Swaps the process-wide lemmatizer according to the checkbox state: a
 // word-list-backed lemmatizer when checked, a no-op implementation otherwise.
 private void lemmatizeWordsChkbox_CheckedChanged(object sender, EventArgs e)
 {
     if (!this.lemmatizeWordsChkbox.Checked)
     {
         Lemmatizer.setInstance(new NullLemmatizer());
         return;
     }
     Lemmatizer.setInstance(new DefaultLemmatizer(Conf.LEMMATIZATION_WORDS_PATH));
 }
コード例 #17
0
ファイル: Application.cs プロジェクト: jasongorman/Conceptual
        // Loads the English lemmatizer model that sits next to the executing assembly.
        private static void LoadLemmatizer()
        {
            string path   = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location);
            var    stream = File.OpenRead(path + @"\full7z-mlteast-en.lem");

            // Terrible fudge to suppress an unhandled deserialization exception
            // message printed by LemmaSharp (an empty try..catch didn't work).
            // BUG FIX: restore stdout in a finally block so an exception in the
            // Lemmatizer constructor no longer leaves the console silenced, and
            // dispose the null writer instead of leaking it.
            TextWriter output = Console.Out;
            var nullWriter = new StreamWriter(Stream.Null);
            Console.SetOut(nullWriter);
            try
            {
                _lemma = new Lemmatizer(stream);
            }
            finally
            {
                Console.SetOut(output);
                nullWriter.Dispose();
            }
        }
コード例 #18
0
        // Verifies Lemmatizer.Conjugations(): a "past#present" stem pair must
        // yield the full set of conjugated surface forms. Commented-out entries
        // in the expected arrays are homographs of a form already listed, so
        // each distinct surface form is expected exactly once.
        public void ConjugationsTest()
        {
            Lemmatizer lemmatizer = new Lemmatizer();

            string input;
            string[] expected, actual;

            // Case 1: stem pair "خورد#خور".
            input = "خورد#خور";
            expected = new string[] {
                "خوردم", "خوردی", "خورد", "خوردیم", "خوردید", "خوردند",
                "نخوردم", "نخوردی", "نخورد", "نخوردیم", "نخوردید", "نخوردند",
                "خورم", "خوری", /*"خورد",*/ "خوریم", "خورید", "خورند",
                "نخورم", "نخوری", /*"نخورد",*/ "نخوریم", "نخورید", "نخورند",
                "می‌خوردم", "می‌خوردی", /*"می‌خورد",*/ "می‌خوردیم", "می‌خوردید", "می‌خوردند",
                "نمی‌خوردم", "نمی‌خوردی", "نمی‌خورد", "نمی‌خوردیم", "نمی‌خوردید", "نمی‌خوردند",
                "خورده‌ام", "خورده‌ای", "خورده", "خورده‌ایم", "خورده‌اید", "خورده‌اند",
                "نخورده‌ام", "نخورده‌ای", "نخورده", "نخورده‌ایم", "نخورده‌اید", "نخورده‌اند",
                "می‌خورم", "می‌خوری", "می‌خورد", "می‌خوریم", "می‌خورید", "می‌خورند",
                "نمی‌خورم", "نمی‌خوری", /*"نمی‌خورد",*/ "نمی‌خوریم", "نمی‌خورید", "نمی‌خورند",
                "بخورم", "بخوری", "بخورد", "بخوریم", "بخورید", "بخورند",
                "بخور", "نخور"
            };
            actual = lemmatizer.Conjugations(input).ToArray();
            // Cardinality first, then membership; the inner AreEqual is reached
            // only for a missing form and then necessarily fails, producing a
            // message that names the mismatched pair.
            Assert.AreEqual(expected.Length, actual.Length, "Failed to generate conjugations of '" + input + "' verb");
            for (int i = 0; i < expected.Length; i++)
            {
                if (!actual.Contains(expected[i]))
                    Assert.AreEqual(expected[i], actual[i], "Failed to generate conjugations of '" + input + "' verb");
            }

            // Case 2: vowel-initial stem pair "آورد#آور" whose negative and
            // subjunctive/imperative forms take the "نیـ"/"بیـ" prefixes.
            input = "آورد#آور";
            expected = new string[] {
                "آوردم", "آوردی", "آورد", "آوردیم", "آوردید", "آوردند",
                "نیاوردم", "نیاوردی", "نیاورد", "نیاوردیم", "نیاوردید", "نیاوردند",
                "آورم", "آوری", /*"آورد",*/ "آوریم", "آورید", "آورند",
                "نیاورم", "نیاوری", /*"نیاورد",*/ "نیاوریم", "نیاورید", "نیاورند",
                "می‌آوردم", "می‌آوردی", /*"می‌آورد",*/ "می‌آوردیم", "می‌آوردید", "می‌آوردند",
                "نمی‌آوردم", "نمی‌آوردی", "نمی‌آورد", "نمی‌آوردیم", "نمی‌آوردید", "نمی‌آوردند",
                "آورده‌ام", "آورده‌ای", "آورده", "آورده‌ایم", "آورده‌اید", "آورده‌اند",
                "نیاورده‌ام", "نیاورده‌ای", "نیاورده", "نیاورده‌ایم", "نیاورده‌اید", "نیاورده‌اند",
                "می‌آورم", "می‌آوری", "می‌آورد", "می‌آوریم", "می‌آورید", "می‌آورند",
                "نمی‌آورم", "نمی‌آوری", /*"نمی‌آورد",*/ "نمی‌آوریم", "نمی‌آورید", "نمی‌آورند",
                "بیاورم", "بیاوری", "بیاورد", "بیاوریم", "بیاورید", "بیاورند",
                "بیاور", "نیاور"
            };
            actual = lemmatizer.Conjugations(input).ToArray();
            Assert.AreEqual(expected.Length, actual.Length, "Failed to generate conjugations of '" + input + "' verb");
            for (int i = 0; i < expected.Length; i++)
            {
                if (!actual.Contains(expected[i]))
                    Assert.AreEqual(expected[i], actual[i], "Failed to generate conjugations of '" + input + "' verb");
            }
        }
コード例 #19
0
 // Loads the (optional) lemmatizer model from the given serializer; the
 // tagger-model loading is currently disabled (kept below for reference).
 public void LoadModels(BinarySerializer taggerModelSer, BinarySerializer lemmatizerModelSer)
 {
     /*Utils.ThrowException(taggerModelSer == null ? new ArgumentNullException("taggerModelSer") : null);
      * mLogger.Debug("Load", "Nalagam model za označevanje ...");
      * mSuffixTree = new PatriciaTree(taggerModelSer);
      * mFeatureSpace = Utils.LoadDictionary<string, int>(taggerModelSer);
      * mModel = new MaximumEntropyClassifierFast<string>(taggerModelSer);*/
     if (lemmatizerModelSer != null)
     {
         // Log message is Slovene: "Loading lemmatization model ...".
         mLogger.Debug("Load", "Nalagam model za lematizacijo ...");
         // The consider-tags flag is read first, then the model is constructed
         // from the remainder of the serialized stream — keep this order.
         mConsiderTags = lemmatizerModelSer.ReadBool();
         mLemmatizer   = new Lemmatizer(lemmatizerModelSer);
     }
 }
コード例 #20
0
        // Lexicon-based sentiment classification: sums the signed tone values of
        // known sentiment words, flipping the sign when the preceding token is a
        // negation (directly, or via an intensifier that is itself preceded by a
        // negation). Returns 1 (positive), -1 (negative) or 0 (neutral/no signal).
        public int Classify(string text)
        {
            Lemmatizer.Stopwords = _stopwords;
            var vector = Lemmatizer.LemmatizeCurrentText(text);
            int tone = 0, toneCount = 0;

            for (int i = 0; i < vector.Count; i++)
            {
                if (!_sentilex.ContainsKey(vector[i]) && !_specificLexicon.ContainsKey(vector[i]))
                {
                    continue;
                }

                // The specific lexicon overrides the general sentiment lexicon.
                int basictone = (_specificLexicon.ContainsKey(vector[i])) ? _specificLexicon[vector[i]] : _sentilex[vector[i]];

                // BUG FIX: 'sign' is now recomputed per sentiment word — it used
                // to stick at -1 for the rest of the text after one negation.
                // BUG FIX: the index guards use '>= 0' (was '> 0'), so a
                // negation/intensifier at position 0 is no longer ignored.
                int sign = 1;
                if (i - 1 >= 0)
                {
                    if (_negations.Contains(vector[i - 1]))
                    {
                        sign = -1;
                    }
                    if (_gains.Contains(vector[i - 1]))
                    {
                        if (i - 2 >= 0 && _negations.Contains(vector[i - 2]))
                        {
                            sign = -1;
                        }
                    }
                }
                tone += basictone * sign;
                toneCount++;
            }

            if (toneCount == 0)
            {
                return 0;
            }

            // Average tone with a +/-0.3 neutrality band.
            var doubletone = tone / (toneCount + 0.0);
            if (doubletone > 0.3)
            {
                return 1;
            }
            if (doubletone < -0.3)
            {
                return -1;
            }
            return 0;
        }
コード例 #21
0
        // Lemmatizes the Cyrillic text of every crawled HTML document and writes
        // the result as a .txt file into the lemmatization result folder.
        static async Task Main(string[] args)
        {
            var documentsFolderPath = PathConstants.CrawlerResultPath;

            var resultFolderPath = PathConstants.LemmatizationResultPath;

            if (!Directory.Exists(resultFolderPath))
            {
                Directory.CreateDirectory(resultFolderPath);
            }

            var regex = new Regex(@"[\p{IsCyrillic}]+");

            // PERF: construct the mystem-backed lemmatizer once, not per file
            // (it was previously re-created inside the loop for every document).
            var lemmatizer = new Lemmatizer("./mystem/mystem.exe");

            foreach (var file in Directory.EnumerateFiles(documentsFolderPath, "*.html"))
            {
                using (var fs = new FileStream(file, FileMode.Open))
                {
                    var document = new HtmlDocument();
                    document.Load(fs);

                    var rootNode = document.DocumentNode;

                    if (rootNode == null)
                    {
                        continue;
                    }

                    // IsNullOrWhiteSpace subsumes the former IsNullOrEmpty check.
                    var tokens = regex.Matches(rootNode.InnerText)
                                 .Select(x => x.ToString())
                                 .Where(x => !string.IsNullOrWhiteSpace(x));

                    var tokenized = string.Join(" ", tokens);

                    var lemmas = lemmatizer.LemmatizeText(tokenized)
                                 .Trim()
                                 .Replace("   ", " ");

                    var outputPath = Path.Combine(resultFolderPath, file
                                                  .Replace(documentsFolderPath + "\\", "")
                                                  .Replace(".html", ".txt"));

                    // BUG FIX: FileMode.Create truncates an existing file;
                    // OpenOrCreate left stale trailing bytes whenever the new
                    // content was shorter than the previous run's output.
                    using (var stream = new FileStream(outputPath, FileMode.Create))
                    {
                        var buffer = Encoding.UTF8.GetBytes(lemmas);
                        await stream.WriteAsync(buffer, 0, buffer.Length);
                    }
                }
            }
        }
コード例 #22
0
ファイル: Program.cs プロジェクト: Liebeck/IWNLP.Lemmatizer
        // Lemmatizes each sentence of the corpus with IWNLP, then serializes the
        // annotated corpus to the given export path.
        static void LemmatizeIWNLP(List<CoNLLSentence> corpus, String exportPath)
        {
            Lemmatizer lemmatizer = new Lemmatizer();
            lemmatizer.Load(AppSettingsWrapper.IWNLPPath);

            for (int index = 0; index < corpus.Count; index++)
            {
                IWNLPSentenceProcessor.ProcessSentence(corpus[index], lemmatizer);
            }

            XMLSerializer.Serialize<List<CoNLLSentence>>(corpus, exportPath);
        }
コード例 #23
0
ファイル: Program.cs プロジェクト: Liebeck/IWNLP.Lemmatizer
        // Loads the IWNLP dictionary once, runs every sentence through the
        // sentence processor, then persists the corpus as XML.
        static void LemmatizeIWNLP(List <CoNLLSentence> corpus, string exportPath)
        {
            Lemmatizer iwnlp = new Lemmatizer();
            iwnlp.Load(AppSettingsWrapper.IWNLPPath);

            int i = 0;
            while (i < corpus.Count)
            {
                IWNLPSentenceProcessor.ProcessSentence(corpus[i], iwnlp);
                i++;
            }

            XMLSerializer.Serialize <List <CoNLLSentence> >(corpus, exportPath);
        }
コード例 #24
0
        // Loads the four keyword dictionaries (trimmed, lower-cased, one word per
        // line) and constructs the data provider and lemmatizer.
        public QueryIndependentFeatures()
        {
            Console.WriteLine("Initializing QueryIndependentFeatures");
            const string buhWordsFilepath           = "Dictionaries/BuhWords.txt";
            const string taxWordsFilepath           = "Dictionaries/TaxWords.txt";
            const string controlSystemWordsFilepath = "Dictionaries/ControlSystemWords.txt";
            const string formWordsFilepath          = "Dictionaries/FormWords.txt";

            // Shared loader for the four identical read/trim/lower pipelines.
            // ToLowerInvariant avoids culture-sensitive casing surprises (CA1304)
            // that plain ToLower() exhibits under e.g. the Turkish locale.
            Func<string, List<string>> loadWords = filePath =>
                File.ReadAllLines(filePath).Select(line => line.Trim().ToLowerInvariant()).ToList();

            buhWords           = loadWords(buhWordsFilepath);
            taxWords           = loadWords(taxWordsFilepath);
            controlSystemWords = loadWords(controlSystemWordsFilepath);
            formWords          = loadWords(formWordsFilepath);

            buhonlineDataProvider = new BuhOnlineDataProvider();
            lemmatizer            = new Lemmatizer();
        }
コード例 #25
0
ファイル: Program.cs プロジェクト: quangfox/LemmaGenerator
        // Ensures the lemmatizer maps 'word' to 'lemma': first by example (which
        // may let it derive a rule), and if that is not enough, by a hard exception.
        private static void AddExampleOrException(Lemmatizer lemmatizer, string word, string lemma)
        {
            // Already produces the right lemma: nothing to do.
            if (lemmatizer.Lemmatize(word) == lemma)
            {
                return;
            }

            // Teach by example.
            lemmatizer.AddExample(word, lemma);

            // Still wrong after the example --> pin the mapping as an exception.
            if (lemmatizer.Lemmatize(word) != lemma)
            {
                Console.WriteLine("Added lemma exception: {0} -> {1}", word, lemma);
                lemmatizer.AddException(word, lemma);
            }
        }
コード例 #26
0
        /// <summary>
        /// Initializes a new instance of the <see cref="MorphologyModel"/> class.
        /// </summary>
        /// <param name="nGramm">The N-gram model.</param>
        /// <param name="entModel">The ambiguity-class (entropy) model.</param>
        /// <param name="folder">Folder used to store the model files; recreated from scratch.</param>
        /// <param name="lemmaFile">Lemmatizer data file, copied into <paramref name="folder"/>.</param>
        public MorphologyModel(TagNGramm nGramm, IEntropyClassModel entModel,
                               string folder, string lemmaFile)
        {
            this.nGramm   = nGramm;
            this.entClass = entModel;
            this.folder   = folder;
            // Recreate the model folder from scratch so the model owns a clean copy
            // of its files; any previous contents are deleted recursively.
            if (Directory.Exists(folder))
            {
                Directory.Delete(folder, true);
            }
            Directory.CreateDirectory(folder);
            this.lemmaFile = folder + "\\" + lemmaFile;
            File.Copy(lemmaFile, this.lemmaFile);
            // NOTE(review): the stream is not disposed here — presumably the
            // Lemmatizer keeps reading from it; confirm before adding a using.
            FileStream fs = File.OpenRead(this.lemmaFile);

            lemmatizer = new Lemmatizer(fs);
            Initialize();
        }
コード例 #27
0
ファイル: wordmap.cs プロジェクト: qa1/wordmap
        // Removes (lemmatized) prepositional-verb entries from the word-count map
        // and returns the same, mutated, dictionary.
        public Dictionary <string, int> deletePrepVerb(List <string> prepVerbList, Dictionary <string, int> wordCount)
        {
            Lemmatizer lemmatize = new Lemmatizer();

            foreach (string element in prepVerbList)
            {
                // Mirrors CalcWordCount: only words of length >= 5 are lemmatized.
                string lemElement = element.Length >= 5 ? lemmatize.Lemmatize(element) : element;

                // Dictionary.Remove is already a no-op for missing keys, so the
                // former ContainsKey pre-check was a redundant second lookup.
                wordCount.Remove(lemElement);
            }

            return wordCount;
        }
コード例 #28
0
        // Lemmatizes the input text and records its unigrams and bigrams under
        // the given emotion label.
        public void parse(string inputString, int emotion)
        {
            this.emotion = emotion;

            // (Removed an unused Lemmatizer instance and an unused, undisposed
            // StringReader — LemmatizeCurrentText is a static call on the type.)
            var vector = Lemmatizer.LemmatizeCurrentText(inputString);
            string prevWord = null;

            foreach (var str in vector)
            {
                // Unigram.
                addGramm(str);
                // Bigram formed with the previous token.
                if (prevWord != null)
                {
                    addGramm(prevWord + " " + str);
                }
                prevWord = str;
            }
        }
コード例 #29
0
ファイル: Program.cs プロジェクト: Rooniey/KSR
        // Tokenizes the articles carrying 'label' (special characters replaced,
        // stop words removed, tokens lemmatized) and returns the 'termCount' most
        // frequent terms with their counts, ordered by descending count.
        private static Dictionary <string, int> GetMostFrequentTermsForLabel(List <LabeledArticle> articles, string label, int termCount = 20, string[] stopList = null)
        {
            List <TokenizedArticle> tokenizedArticles = new List <TokenizedArticle>();

            // (Commented-out dead code referring to 'allTokenizedArticles' removed.)
            foreach (var article in articles)
            {
                if (article.Label != label)
                {
                    continue;
                }

                var body           = TextUtility.ReplaceSpecialCharacters(article.Article.Body);
                var processedWords = StopWordsFilterProcessor.Process(Tokenizer.TokenizeWords(body), stopList);
                processedWords     = Lemmatizer.Process(processedWords);
                tokenizedArticles.Add(new TokenizedArticle(article, processedWords));
            }

            Dictionary <string, int> countDictionary = new Dictionary <string, int>();

            foreach (var tokenizedArticle in tokenizedArticles)
            {
                foreach (var token in tokenizedArticle.Tokens)
                {
                    // Single-lookup increment (was ContainsKey + indexer, i.e.
                    // two hash lookups per token).
                    int count;
                    countDictionary.TryGetValue(token.Word, out count);
                    countDictionary[token.Word] = count + 1;
                }
            }

            return countDictionary
                   .OrderByDescending(pair => pair.Value)
                   .Take(termCount)
                   .ToDictionary(pair => pair.Key, pair => pair.Value);
        }
コード例 #30
0
ファイル: Program.cs プロジェクト: stuartd/LemmaGenerator
        /// <summary>
        /// Teaches the lemmatizer the (word, lemma) pair: if the current model
        /// already yields the expected lemma, nothing is done; otherwise the pair
        /// is added as a training example, and if the model still fails it is
        /// registered as a hard exception.
        /// </summary>
        private static void AddExampleOrException(Lemmatizer lemmatizer, string word, string lemma)
        {
            // Already correct -> nothing to teach.
            if (lemmatizer.Lemmatize(word) == lemma)
            {
                return;
            }

            // Let the lemmatizer try to deduce a new rule from this example.
            lemmatizer.AddExample(word, lemma);

            // Rule deduction failed as well -> pin the pair as an explicit exception.
            if (lemmatizer.Lemmatize(word) != lemma)
            {
                Console.WriteLine("Added lemma exception: {0} -> {1}", word, lemma);
                lemmatizer.AddException(word, lemma);
            }
        }
コード例 #31
0
ファイル: Program.cs プロジェクト: quangfox/LemmaGenerator
        /// <summary>
        /// Loads a serialized lemmatizer, enriches it with each enricher file,
        /// and persists the result (LZMA-compressed) to the output path.
        /// </summary>
        /// <param name="lemmatizerFilePath">Path of the serialized lemmatizer to load.</param>
        /// <param name="outputFilePath">Path the enriched lemmatizer is written to.</param>
        /// <param name="enricherFilePaths">Lemma files merged into the lemmatizer.</param>
        private static void EnrichLemmatizerFile(string lemmatizerFilePath, string outputFilePath,
                                                 IEnumerable <string> enricherFilePaths)
        {
            using (var stream = File.OpenRead(lemmatizerFilePath))
            {
                var lemmatizer = new Lemmatizer(stream);
                // enrich lemmatizer with every other file
                foreach (var filePath in enricherFilePaths)
                {
                    EnrichLemmatizer(lemmatizer, filePath);
                }

                // persist lemmatizer in output file
                Console.WriteLine("Writing output file...");
                using (var oStream = File.Create(outputFilePath))
                {
                    lemmatizer.Serialize(oStream, true, Lemmatizer.Compression.Lzma, true);
                }
                // Fixed message typo: "Outuput" -> "Output".
                Console.WriteLine("Output file written at {0}", outputFilePath);
            }
        }
コード例 #32
0
        /// <summary>
        /// Demo endpoint: lemmatizes a hard-coded Russian text word by word and
        /// returns the lemmatized string as the single response element.
        /// </summary>
        /// <returns>A one-element sequence containing the lemmatized text.</returns>
        public ActionResult <IEnumerable <string> > Get()
        {
            var sb = new StringBuilder();
            // NOTE(review): hard-coded local path — should come from configuration.
            var stream = new FileStream(@"C:\Users\D.Pugach\Downloads\full7z-mlteast-ru.lem", FileMode.Open);

            using (stream) {
                var allText    = "Вазомоторный ринит что это такое Среди многочисленных видов ринита (насморка) эта патология занимает особое место, поскольку этиология ее возникновения до сих пор до конца не изучена. Вазомоторный ринит, чаще всего поражающий людей старше 20 лет, является заболеванием, которое может протекать в виде";
                var lemmatizer = new Lemmatizer(stream);

                // Removed a whole-text Lemmatize(allText.ToLower()) call whose
                // return value was discarded; lemmatization happens per word below.
                foreach (var word in allText.Split(' '))
                {
                    sb.Append(lemmatizer.Lemmatize(word)).Append(" ");
                }
                Console.WriteLine(sb.ToString());
            }

            return(new string[] { sb.ToString() });
        }
コード例 #33
0
        /// <summary>
        /// Builds transaction records from the input text: reads sentences,
        /// lemmatizes every word, collects the distinct vocabulary, and emits one
        /// Record per word keyed by sentence number and vocabulary index.
        /// </summary>
        /// <param name="inpReader">Source of the input text.</param>
        /// <param name="RelationName">Name stored for the generated relation.</param>
        public TextGenerator(IFileReader inpReader, string RelationName)
        {
            this.RelationName = RelationName;
            Sentences         = new StringParser(inpReader).GetSentances;

            WordsAll = new List <string>();
            // Dispose the model stream when lemmatization is done (it was leaked
            // before). NOTE(review): assumes Lemmatizer reads the stream eagerly
            // and does not keep it — confirm against the lemmatizer library.
            using (var stream = File.OpenRead(AppDomain.CurrentDomain.BaseDirectory + @"\\full7z-mlteast-ru.lem"))
            {
                var lemmatizer = new Lemmatizer(stream);

                foreach (var Sentence in Sentences)
                {
                    for (int i = 0; i < Sentence.Count; i++)
                    {
                        // Replace each word with its lemma and record it.
                        Sentence[i] = lemmatizer.Lemmatize(Sentence[i]);
                        WordsAll.Add(Sentence[i]);
                    }
                }
            }
            WordsAll = WordsAll.Distinct(new PartialComparer()).ToList();

            var SentenceNum = 0;
            foreach (var Sentence in Sentences)
            {
                foreach (var Word in Sentence)
                {
                    recList.Add(new Record()
                    {
                        TransactId = SentenceNum, ItemId = WordsAll.FindIndex(a => a == Word), Item = Word
                    });
                }
                SentenceNum++;
            }
        }
コード例 #34
0
        /// <summary>
        /// Initializes a new instance of the <see cref="MorphologyModel"/> class
        /// from serialized state (custom deserialization constructor): restores
        /// scalar/collection fields from <paramref name="si"/> and reloads the
        /// n-gram, entropy-class and lemmatizer models from files on disk.
        /// </summary>
        /// <param name="si">Serialization info holding the persisted state.</param>
        /// <param name="context">Streaming context (unused here).</param>
        protected MorphologyModel(SerializationInfo si, StreamingContext context)
        {
            this.folder      = si.GetString("folder");
            this.groups      = (List <TagGroup>)si.GetValue("groups", typeof(List <TagGroup>));
            this.punctuation = (List <string>)si.GetValue("punctuation", typeof(List <string>));
            // The model files live inside the persisted "folder" path.
            using (FileStream fs1 = File.Open(string.Format("{0}/{1}", folder, "nGramm.mdl"), FileMode.Open))
                using (FileStream fs2 = File.Open(string.Format("{0}/{1}", folder, "entClass.mdl"), FileMode.Open))
                {
                    this.nGramm   = Serializer.Deserialize <TagNGramm>(fs1);
                    this.entClass = new DawgEntropyClassModel();
                    entClass.Load(fs2);
                }
            this.sentenceDelimiters = (List <string>)si.GetValue("delimiters", typeof(List <string>));
            this.lemmaFile          = si.GetString("lemmaFile");
            // NOTE(review): fs is never disposed — presumably the Lemmatizer takes
            // over (or eagerly consumes) the stream; confirm ownership before
            // wrapping it in a using.
            FileStream fs = File.Open(this.lemmaFile, FileMode.Open);

            this.lemmatizer      = new Lemmatizer(fs);
            this.sentencePattern = (Regex)si.GetValue("sentencePattern", typeof(Regex));
            this.lexemPattern    = (Regex)si.GetValue("lexemPattern", typeof(Regex));
            // NOTE(review): key "minLenght" (sic) must match the serializing side.
            this.minLengh        = si.GetInt32("minLenght");
            serviceTags          = (Dictionary <string, Tag>)si.GetValue("serviceTags",
                                                                         typeof(Dictionary <string, Tag>));
        }
コード例 #35
0
ファイル: TextMiningUtils.cs プロジェクト: viidea/latino
 /// <summary>
 /// Resolves the stop-word list and stemming component for a language.
 /// Unsupported languages raise ArgumentNotSupportedException; Estonian has no
 /// stop-word list, so null is returned for it.
 /// </summary>
 public static void GetLanguageTools(Language language, out Set<string>.ReadOnly stopWords, out IStemmer stemmer)
 {
     // Step 1: stop-word list (also validates that the language is supported).
     switch (language)
     {
         case Language.Bulgarian:  stopWords = StopWords.BulgarianStopWords;   break;
         case Language.Czech:      stopWords = StopWords.CzechStopWords;       break;
         case Language.Danish:     stopWords = StopWords.DanishStopWords;      break;
         case Language.Dutch:      stopWords = StopWords.DutchStopWords;       break;
         case Language.English:    stopWords = StopWords.EnglishStopWords;     break;
         case Language.Estonian:   stopWords = null;                           break; // *** stop words are missing
         case Language.Finnish:    stopWords = StopWords.FinnishStopWords;     break;
         case Language.French:     stopWords = StopWords.FrenchStopWords;      break;
         case Language.German:     stopWords = StopWords.GermanStopWords;      break;
         case Language.Hungarian:  stopWords = StopWords.HungarianStopWords;   break;
         case Language.Italian:    stopWords = StopWords.ItalianStopWords;     break;
         case Language.Norwegian:  stopWords = StopWords.NorwegianStopWords;   break;
         case Language.Portuguese: stopWords = StopWords.PortugueseStopWords;  break;
         case Language.Romanian:   stopWords = StopWords.RomanianStopWords;    break;
         case Language.Russian:    stopWords = StopWords.RussianStopWords;     break;
         case Language.Serbian:    stopWords = StopWords.SerbianStopWordsLatin; break;
         case Language.Slovene:    stopWords = StopWords.SloveneStopWords;     break;
         case Language.Spanish:    stopWords = StopWords.SpanishStopWords;     break;
         case Language.Swedish:    stopWords = StopWords.SwedishStopWords;     break;
         default:
             throw new ArgumentNotSupportedException("language");
     }
     // Step 2: stemming component — these languages use the lemmatizer,
     // all other supported languages use the Snowball stemmer.
     switch (language)
     {
         case Language.Bulgarian:
         case Language.Czech:
         case Language.Estonian:
         case Language.Hungarian:
         case Language.Romanian:
         case Language.Serbian:
         case Language.Slovene:
             stemmer = new Lemmatizer(language);
             break;
         default:
             stemmer = new Stemmer(language);
             break;
     }
 }
コード例 #36
0
 /// <summary>
 /// Assigns predicted lemmas to the tokens of a sentence using the IWNLP
 /// lemmatizer. Nouns ("NN"), adjectives ("ADJA"/"ADJD") and verbs ("V*") are
 /// looked up against a fixed fallback order of part-of-speech dictionaries;
 /// tokens with no match keep their PredictedLemmas unset.
 /// </summary>
 /// <param name="sentence">Sentence whose tokens receive PredictedLemmas.</param>
 /// <param name="iwnlp">IWNLP lemmatizer used for the dictionary lookups.</param>
 public static void ProcessSentence(CoNLLSentence sentence, Lemmatizer iwnlp)
 {
     // Removed the unused tokenArray local (it only fed the commented-out
     // MateTools call) and the surrounding dead commented code.
     for (int i = 0; i < sentence.Tokens.Count; i++)
     {
         CoNLLToken token = sentence.Tokens[i];
         if (token.POS == "NN")
         {
             // Fallback order: Noun, X, AdjectivalDeclension, then a combined
             // Noun/X lookup with the relaxed flag (third argument true).
             List<POS> pos = new List<POS>() { POS.Noun, POS.X };
             if (iwnlp.ContainsEntry(token.Form, POS.Noun))
             {
                 token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.Noun);
             }
             else if (iwnlp.ContainsEntry(token.Form, POS.X))
             {
                 token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.X);
             }
             else if (iwnlp.ContainsEntry(token.Form, POS.AdjectivalDeclension))
             {
                 token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.AdjectivalDeclension);
             }
             else if (iwnlp.ContainsEntry(token.Form, pos, true))
             {
                 token.PredictedLemmas = iwnlp.GetLemmas(token.Form, pos, true);
             }
         }
         else if (token.POS == "ADJA" || token.POS == "ADJD")
         {
             // Fallback order: exact adjective, then relaxed adjective, noun,
             // X, and finally verb lookups.
             if (iwnlp.ContainsEntry(token.Form, POS.Adjective))
             {
                 token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.Adjective);
             }
             else if (iwnlp.ContainsEntry(token.Form, POS.Adjective, true))
             {
                 token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.Adjective, true);
             }
             else if (iwnlp.ContainsEntry(token.Form, POS.Noun, true))
             {
                 token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.Noun, true);
             }
             else if (iwnlp.ContainsEntry(token.Form, POS.X, true))
             {
                 token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.X, true);
             }
             else if (iwnlp.ContainsEntry(token.Form, POS.Verb, true))
             {
                 token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.Verb, true);
             }
         }
         else if (token.POS.StartsWith("V"))
         {
             if (iwnlp.ContainsEntry(token.Form, POS.Verb, true))
             {
                 token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.Verb, true);
             }
         }
     }
 }