Exemplo n.º 1
0
        public void LemmatizeTest()
        {
            // Verify lemmatization of several Persian words/phrases; the last
            // case supplies an explicit POS tag ("N" = noun).
            Lemmatizer lemmatizer = new Lemmatizer();

            string[] inputs    = { "کتاب‌ها", "آتشفشان", "می‌روم", "گفته شده است", "مردم" };
            string[] expecteds = { "کتاب", "آتشفشان", "رفت#رو", "گفت#گو", "مردم" };
            string[] tags      = { null, null, null, null, "N" };

            for (int i = 0; i < inputs.Length; i++)
            {
                string actual = tags[i] == null
                    ? lemmatizer.Lemmatize(inputs[i])
                    : lemmatizer.Lemmatize(inputs[i], tags[i]);
                Assert.AreEqual(expecteds[i], actual, "Failed to lematize of '" + inputs[i] + "' word");
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Reads all lemma entries from the given enricher file and feeds them
        /// into the supplied lemmatizer.
        /// </summary>
        private static void EnrichLemmatizer(Lemmatizer lemmatizer, string enricherFilePath)
        {
            var entries = new EnricherFileReader(enricherFilePath).ReadAllLemmaEntries();
            EnrichLemmatizer(lemmatizer, entries);
        }
Exemplo n.º 3
0
        public void LemmatizeTest()
        {
            // Run the lemmatizer over several Persian inputs; the last two
            // entries pass an explicit POS tag ("N" noun, "AJ" adjective).
            Lemmatizer lemmatizer = new Lemmatizer();

            string[] inputs    = { "کتاب‌ها", "آتشفشان", "می‌روم", "گفته شده است", "نچشیده است", "مردم", "اجتماعی" };
            string[] expecteds = { "کتاب", "آتشفشان", "رفت#رو", "گفت#گو", "چشید#چش", "مردم", "اجتماعی" };
            string[] tags      = { null, null, null, null, null, "N", "AJ" };

            for (int i = 0; i < inputs.Length; i++)
            {
                string actual = tags[i] == null
                    ? lemmatizer.Lemmatize(inputs[i])
                    : lemmatizer.Lemmatize(inputs[i], tags[i]);
                Assert.AreEqual(expecteds[i], actual, "Failed to lematize of '" + inputs[i] + "' word");
            }
        }
Exemplo n.º 4
0
        /// <summary>
        /// Checks lemmatization of a handful of Persian words/phrases; the last
        /// case passes an explicit POS tag ("N" = noun) to the overload.
        /// </summary>
        public void LemmatizeTest()
        {
            Lemmatizer lemmatizer = new Lemmatizer();

            string input, expected, actual;

            // Plural noun reduced to its singular lemma.
            input = "کتاب‌ها";
            expected = "کتاب";
            actual = lemmatizer.Lemmatize(input);
            Assert.AreEqual(expected, actual, "Failed to lematize of '" + input + "' word");

            // Already a lemma: expected to pass through unchanged.
            input = "آتشفشان";
            expected = "آتشفشان";
            actual = lemmatizer.Lemmatize(input);
            Assert.AreEqual(expected, actual, "Failed to lematize of '" + input + "' word");

            // Verb forms lemmatize to "past-stem#present-stem".
            input = "می‌روم";
            expected = "رفت#رو";
            actual = lemmatizer.Lemmatize(input);
            Assert.AreEqual(expected, actual, "Failed to lematize of '" + input + "' word");

            // Multi-word (compound passive) verb phrase.
            input = "گفته شده است";
            expected = "گفت#گو";
            actual = lemmatizer.Lemmatize(input);
            Assert.AreEqual(expected, actual, "Failed to lematize of '" + input + "' word");

            // Ambiguous word: the "N" POS tag selects the noun reading.
            input = "مردم";
            expected = "مردم";
            actual = lemmatizer.Lemmatize(input, "N");
            Assert.AreEqual(expected, actual, "Failed to lematize of '" + input + "' word");
        }
Exemplo n.º 5
0
 /// <summary>
 /// Static initializer: loads the English and Russian lemmatizer models
 /// once for the lifetime of the process.
 /// </summary>
 /// <remarks>
 /// NOTE(review): the streams are stored in fields and deliberately left
 /// open — presumably the Lemmatizer reads from them after construction;
 /// confirm before disposing them here.
 /// </remarks>
 static LemmatizationHandler()
 {
     enStream     = File.OpenRead(EnLemmatizerpath);
     ruStream     = File.OpenRead(RuLemmatizerpath);
     enLemmatizer = new Lemmatizer(enStream);
     ruLemmatizer = new Lemmatizer(ruStream);
 }
Exemplo n.º 6
0
        /// <summary>
        /// Builds an enriched lemmatizer: loads the base .lem data file,
        /// applies every enricher file found in the Input/ directory, then
        /// serializes the result under Output/.
        /// </summary>
        static void Main(string[] args)
        {
            // Project root, two levels above the build output directory.
            // Path.Combine normalizes separators instead of raw string concat.
            var currentDirectory   = Path.Combine(Environment.CurrentDirectory, "..", "..");
            var lemmatizerFilePath = Path.Combine(currentDirectory, "..", "Test", "Data", "full7z-mlteast-en.lem");

            var fileName       = Path.GetFileNameWithoutExtension(lemmatizerFilePath) + "-modified";
            var extension      = Path.GetExtension(lemmatizerFilePath);
            var outputFilePath = Path.Combine(currentDirectory, "Output", fileName + extension);

            var enricherFilePaths = Directory.GetFiles(Path.Combine(currentDirectory, "Input"));

            using (var stream = File.OpenRead(lemmatizerFilePath))
            {
                // create base lemmatizer with data in the base source file
                var lemmatizer = new Lemmatizer(stream);

                // then, enrich lemmatizer with every other file
                foreach (var filePath in enricherFilePaths)
                {
                    EnrichLemmatizerWithDataFile(lemmatizer, filePath);
                }

                // persist lemmatizer in output file
                Console.WriteLine("Writing output file...");
                using (var oStream = File.Create(outputFilePath))
                {
                    lemmatizer.Serialize(oStream, true, Lemmatizer.Compression.Lzma, true);
                }
                // Fixed user-facing typo: "Outuput" -> "Output".
                Console.WriteLine("Output file written at {0}", outputFilePath);
            }

            Console.WriteLine("OK");
            Console.ReadKey();
        }
Exemplo n.º 7
0
        /// <summary>
        /// Data-driven lemmatization check: parallel lists of inputs, expected
        /// lemmas, and optional POS tags (null means use the tag-less overload).
        /// </summary>
        public void LemmatizeTest()
        {
            Lemmatizer lemmatizer = new Lemmatizer();

            string input, expected, actual, p;

            List<string> inputs = new List<string>() {
                "کتاب‌ها", "آتشفشان", "می‌روم", "گفته شده است", "نچشیده است", "مردم", "اجتماعی"
            };
            // Verb lemmas use the "past-stem#present-stem" convention.
            List<string> expecteds = new List<string>() {
                "کتاب", "آتشفشان", "رفت#رو", "گفت#گو", "چشید#چش", "مردم", "اجتماعی"
            };
            // POS tags: "N" noun, "AJ" adjective; null = no tag supplied.
            List<string> pos = new List<string>() {
                null, null, null, null, null, "N", "AJ"
            };

            for (var i = 0; i < inputs.Count; i++)
            {
                input = inputs[i];
                expected = expecteds[i];
                p = pos[i];
                if (p == null)
                    actual = lemmatizer.Lemmatize(input);
                else
                    actual = lemmatizer.Lemmatize(input, p);
                Assert.AreEqual(expected, actual, "Failed to lematize of '" + input + "' word");
            }
        }
Exemplo n.º 8
0
        /// <summary>
        /// Counts occurrences of each word, lemmatizing words of five or more
        /// characters so inflected forms are tallied under a single key.
        /// </summary>
        /// <param name="words">Tokens to count.</param>
        /// <returns>Map from (lemmatized) word to its occurrence count.</returns>
        public Dictionary <string, int> CalcWordCount(string[] words)
        {
            Dictionary <string, int> wordCount = new Dictionary <string, int>();
            Lemmatizer lemmatize = new Lemmatizer();

            foreach (string word in words)
            {
                // Short words are counted verbatim; lemmatization is only
                // attempted from length 5 up (original heuristic, preserved).
                string lemmWord = word.Length >= 5 ? lemmatize.Lemmatize(word) : word;

                // Single hash lookup instead of ContainsKey + indexer
                // (TryGetValue leaves count = 0 for a new key).
                int count;
                wordCount.TryGetValue(lemmWord, out count);
                wordCount[lemmWord] = count + 1;
            }

            return wordCount;
        }
Exemplo n.º 9
0
        /// <summary>
        /// Static initializer: loads the Russian lemmatizer model from the
        /// "full7z-mlteast-ru.lem" file exactly once.
        /// </summary>
        static Lemmatizator()
        {
            using (var stream = new FileStream(@"full7z-mlteast-ru.lem", FileMode.Open))
            {
                _lemmatizer = new Lemmatizer(stream);
            }
        }
Exemplo n.º 10
0
 /// <summary>
 /// Registers every (word, lemma) pair with the lemmatizer, falling back to
 /// an explicit exception entry when a plain example is not enough.
 /// </summary>
 private static void EnrichLemmatizer(Lemmatizer lemmatizer, IEnumerable <Tuple <string, string, int> > wordsAndLemmaToAdd)
 {
     // Item1 = word form, Item2 = lemma (Item3 is unused here).
     foreach (var entry in wordsAndLemmaToAdd)
     {
         AddExampleOrException(lemmatizer, entry.Item1, entry.Item2);
     }
 }
 /// <summary>
 /// Wires up the NLP services (parser, lemmatizer, tokenizer, POS tagger)
 /// used for parse-based phrasal verb detection.
 /// </summary>
 public ParseBasedPhrasalVerbDetector(EnglishTreebankParser parser, Lemmatizer lemmatizer,
                                      EnglishMaximumEntropyTokenizer tokenizer, EnglishMaximumEntropyPosTagger tagger)
 {
     this.parser     = parser;
     this.lemmatizer = lemmatizer;
     this.tokenizer  = tokenizer;
     this.tagger     = tagger;
 }
Exemplo n.º 12
0
        /// <summary>
        /// Verifies that Conjugations() generates the full conjugation set for
        /// two Persian verbs given as "past-stem#present-stem". Checks both the
        /// total count and set membership; commented-out entries mark forms that
        /// coincide with ones already listed, so they are counted only once.
        /// </summary>
        public void ConjugationsTest()
        {
            Lemmatizer lemmatizer = new Lemmatizer();

            string input;

            string[] expected, actual;

            // Regular verb "to eat": خوردن.
            input    = "خورد#خور";
            expected = new string[] {
                "خوردم", "خوردی", "خورد", "خوردیم", "خوردید", "خوردند",
                "نخوردم", "نخوردی", "نخورد", "نخوردیم", "نخوردید", "نخوردند",
                "خورم", "خوری", /*"خورد",*/ "خوریم", "خورید", "خورند",
                "نخورم", "نخوری", /*"نخورد",*/ "نخوریم", "نخورید", "نخورند",
                "می‌خوردم", "می‌خوردی", /*"می‌خورد",*/ "می‌خوردیم", "می‌خوردید", "می‌خوردند",
                "نمی‌خوردم", "نمی‌خوردی", "نمی‌خورد", "نمی‌خوردیم", "نمی‌خوردید", "نمی‌خوردند",
                "خورده‌ام", "خورده‌ای", "خورده", "خورده‌ایم", "خورده‌اید", "خورده‌اند",
                "نخورده‌ام", "نخورده‌ای", "نخورده", "نخورده‌ایم", "نخورده‌اید", "نخورده‌اند",
                "می‌خورم", "می‌خوری", "می‌خورد", "می‌خوریم", "می‌خورید", "می‌خورند",
                "نمی‌خورم", "نمی‌خوری", /*"نمی‌خورد",*/ "نمی‌خوریم", "نمی‌خورید", "نمی‌خورند",
                "بخورم", "بخوری", "بخورد", "بخوریم", "بخورید", "بخورند",
                "بخور", "نخور"
            };
            actual = lemmatizer.Conjugations(input).ToArray();
            // Same length and full containment => same set of conjugations.
            Assert.AreEqual(expected.Length, actual.Length, "Failed to generate conjugations of '" + input + "' verb");
            for (int i = 0; i < expected.Length; i++)
            {
                // AreEqual only runs on a missing form, so it always fails here;
                // it is used to produce a readable diff in the failure message.
                if (!actual.Contains(expected[i]))
                {
                    Assert.AreEqual(expected[i], actual[i], "Failed to generate conjugations of '" + input + "' verb");
                }
            }

            // Vowel-initial verb "to bring": آوردن (negation/imperative use نیا/بیا).
            input    = "آورد#آور";
            expected = new string[] {
                "آوردم", "آوردی", "آورد", "آوردیم", "آوردید", "آوردند",
                "نیاوردم", "نیاوردی", "نیاورد", "نیاوردیم", "نیاوردید", "نیاوردند",
                "آورم", "آوری", /*"آورد",*/ "آوریم", "آورید", "آورند",
                "نیاورم", "نیاوری", /*"نیاورد",*/ "نیاوریم", "نیاورید", "نیاورند",
                "می‌آوردم", "می‌آوردی", /*"می‌آورد",*/ "می‌آوردیم", "می‌آوردید", "می‌آوردند",
                "نمی‌آوردم", "نمی‌آوردی", "نمی‌آورد", "نمی‌آوردیم", "نمی‌آوردید", "نمی‌آوردند",
                "آورده‌ام", "آورده‌ای", "آورده", "آورده‌ایم", "آورده‌اید", "آورده‌اند",
                "نیاورده‌ام", "نیاورده‌ای", "نیاورده", "نیاورده‌ایم", "نیاورده‌اید", "نیاورده‌اند",
                "می‌آورم", "می‌آوری", "می‌آورد", "می‌آوریم", "می‌آورید", "می‌آورند",
                "نمی‌آورم", "نمی‌آوری", /*"نمی‌آورد",*/ "نمی‌آوریم", "نمی‌آورید", "نمی‌آورند",
                "بیاورم", "بیاوری", "بیاورد", "بیاوریم", "بیاورید", "بیاورند",
                "بیاور", "نیاور"
            };
            actual = lemmatizer.Conjugations(input).ToArray();
            Assert.AreEqual(expected.Length, actual.Length, "Failed to generate conjugations of '" + input + "' verb");
            for (int i = 0; i < expected.Length; i++)
            {
                if (!actual.Contains(expected[i]))
                {
                    Assert.AreEqual(expected[i], actual[i], "Failed to generate conjugations of '" + input + "' verb");
                }
            }
        }
Exemplo n.º 13
0
        /// <summary>
        /// Initializes the English stemmer and a lemmatizer backed by the
        /// "lemmatizer-en.lem" data file located in the application directory.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the model stream is never disposed here — presumably
        /// the Lemmatizer takes ownership or reads lazily; confirm before
        /// wrapping it in a using block.
        /// </remarks>
        public WordNormalizer()
        {
            stemmer = new EnglishStemmer();

            var path   = Path.Combine(ApplicationPath.BaseDirectory, "lemmatizer-en.lem");
            var stream = File.OpenRead(path);

            lemmatizer = new Lemmatizer(stream);
        }
Exemplo n.º 14
0
        /// <summary>
        /// Tokenizes, strips, stop-word-filters and lemmatizes each document,
        /// returning the vocabulary of lemmas whose total frequency reaches
        /// <paramref name="vocabularyThreshold"/>.
        /// </summary>
        /// <param name="sentences">Input documents, one string per document.</param>
        /// <param name="lemmalizeWords">Out: per-document lists of lemmatized tokens.</param>
        /// <param name="vocabularyThreshold">Minimum occurrence count for a lemma to enter the vocabulary.</param>
        /// <returns>The vocabulary lemmas.</returns>
        private static List <string> GetVocabulary(string[] sentences, out List <string> > lemmalizeWords, int vocabularyThreshold)
        {
            List <string> vocabulary = new List <string>();

            lemmalizeWords = new List <List <string> >();
            Dictionary <string, int> tFrequency = new Dictionary <string, int>();

            // Fixed resource leak: the original never disposed the model stream.
            // (The Lemmatizer ctor deserializes eagerly — the sibling
            // CreateLemmatizerFromFile example disposes the same way.)
            Lemmatizer lemmatizer;
            using (var stream = File.OpenRead(path))
            {
                lemmatizer = new Lemmatizer(stream);
            }

            foreach (var doc in sentences)
            {
                List <string> tokenizedWords = Tokenize(doc);

                List <string> lemmalizeWord = new List <string>();
                foreach (string part in tokenizedWords)
                {
                    // Strip non-alphanumeric characters.
                    string stripped = Regex.Replace(part, "[^a-zA-Z0-9]", "");
                    string filteredLine = StopwordTool.RemoveStopwords(stripped);
                    string stem = lemmatizer.Lemmatize(filteredLine);
                    lemmalizeWord.Add(stem);

                    if (stem.Length > 0)
                    {
                        // Fixed off-by-one: the original seeded new entries with 0,
                        // so every lemma was undercounted by one against the threshold.
                        int count;
                        tFrequency.TryGetValue(stem, out count);
                        tFrequency[stem] = count + 1;
                    }
                }
                lemmalizeWords.Add(lemmalizeWord);
            }

            // (Removed unused locals from the original: filterLine, stemmedDoc, docIndex.)
            foreach (var item in tFrequency.Where(w => w.Value >= vocabularyThreshold))
            {
                vocabulary.Add(item.Key);
            }

            return vocabulary;
        }
Exemplo n.º 15
0
        /// <summary>
        /// Creates a lemmatizer from the bundled English data file
        /// ("../../Data/Custom/english.lem" relative to the working directory).
        /// </summary>
        private static Lemmatizer CreateLemmatizerFromFile()
        {
            var dataFilePath = string.Format("{0}/{1}/{2}",
                                             Directory.GetCurrentDirectory(),
                                             "../../Data/Custom",
                                             "english.lem");

            using (var stream = File.OpenRead(dataFilePath))
            {
                return new Lemmatizer(stream);
            }
        }
Exemplo n.º 16
0
 /// <summary>
 /// Swaps the global lemmatizer implementation when the checkbox toggles:
 /// a word-list-backed lemmatizer when checked, a no-op one otherwise.
 /// </summary>
 private void lemmatizeWordsChkbox_CheckedChanged(object sender, EventArgs e)
 {
     if (!this.lemmatizeWordsChkbox.Checked)
     {
         Lemmatizer.setInstance(new NullLemmatizer());
         return;
     }

     Lemmatizer.setInstance(new DefaultLemmatizer(Conf.LEMMATIZATION_WORDS_PATH));
 }
Exemplo n.º 17
0
        /// <summary>
        /// Loads the English lemmatizer model from the executing assembly's
        /// directory, temporarily silencing Console.Out to suppress a message
        /// LemmaSharp prints during deserialization.
        /// </summary>
        private static void LoadLemmatizer()
        {
            string path   = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location);
            var    stream = File.OpenRead(path + @"\full7z-mlteast-en.lem");
            // terrible fudge to suppress unhandled deserialization exception message from LemmaSharp.
            // Empty try..catch didn't work.
            TextWriter output = Console.Out;

            try
            {
                Console.SetOut(new StreamWriter(Stream.Null));
                _lemma = new Lemmatizer(stream);
            }
            finally
            {
                // Fixed: always restore stdout — the original left Console.Out
                // redirected to Stream.Null if deserialization threw.
                Console.SetOut(output);
            }
            // NOTE(review): 'stream' is intentionally left open, matching the
            // original — confirm whether the Lemmatizer still needs it before
            // adding a using block.
        }
Exemplo n.º 18
0
        /// <summary>
        /// Verifies that Conjugations() generates the full conjugation set for
        /// two Persian verbs given as "past-stem#present-stem". Checks both the
        /// total count and set membership; commented-out entries mark forms that
        /// coincide with ones already listed, so they are counted only once.
        /// </summary>
        public void ConjugationsTest()
        {
            Lemmatizer lemmatizer = new Lemmatizer();

            string input;
            string[] expected, actual;

            // Regular verb "to eat": خوردن.
            input = "خورد#خور";
            expected = new string[] {
                "خوردم", "خوردی", "خورد", "خوردیم", "خوردید", "خوردند",
                "نخوردم", "نخوردی", "نخورد", "نخوردیم", "نخوردید", "نخوردند",
                "خورم", "خوری", /*"خورد",*/ "خوریم", "خورید", "خورند",
                "نخورم", "نخوری", /*"نخورد",*/ "نخوریم", "نخورید", "نخورند",
                "می‌خوردم", "می‌خوردی", /*"می‌خورد",*/ "می‌خوردیم", "می‌خوردید", "می‌خوردند",
                "نمی‌خوردم", "نمی‌خوردی", "نمی‌خورد", "نمی‌خوردیم", "نمی‌خوردید", "نمی‌خوردند",
                "خورده‌ام", "خورده‌ای", "خورده", "خورده‌ایم", "خورده‌اید", "خورده‌اند",
                "نخورده‌ام", "نخورده‌ای", "نخورده", "نخورده‌ایم", "نخورده‌اید", "نخورده‌اند",
                "می‌خورم", "می‌خوری", "می‌خورد", "می‌خوریم", "می‌خورید", "می‌خورند",
                "نمی‌خورم", "نمی‌خوری", /*"نمی‌خورد",*/ "نمی‌خوریم", "نمی‌خورید", "نمی‌خورند",
                "بخورم", "بخوری", "بخورد", "بخوریم", "بخورید", "بخورند",
                "بخور", "نخور"
            };
            actual = lemmatizer.Conjugations(input).ToArray();
            // Same length and full containment => same set of conjugations.
            Assert.AreEqual(expected.Length, actual.Length, "Failed to generate conjugations of '" + input + "' verb");
            for (int i = 0; i < expected.Length; i++)
            {
                // AreEqual only runs on a missing form, so it always fails here;
                // it is used to produce a readable diff in the failure message.
                if (!actual.Contains(expected[i]))
                    Assert.AreEqual(expected[i], actual[i], "Failed to generate conjugations of '" + input + "' verb");
            }

            // Vowel-initial verb "to bring": آوردن (negation/imperative use نیا/بیا).
            input = "آورد#آور";
            expected = new string[] {
                "آوردم", "آوردی", "آورد", "آوردیم", "آوردید", "آوردند",
                "نیاوردم", "نیاوردی", "نیاورد", "نیاوردیم", "نیاوردید", "نیاوردند",
                "آورم", "آوری", /*"آورد",*/ "آوریم", "آورید", "آورند",
                "نیاورم", "نیاوری", /*"نیاورد",*/ "نیاوریم", "نیاورید", "نیاورند",
                "می‌آوردم", "می‌آوردی", /*"می‌آورد",*/ "می‌آوردیم", "می‌آوردید", "می‌آوردند",
                "نمی‌آوردم", "نمی‌آوردی", "نمی‌آورد", "نمی‌آوردیم", "نمی‌آوردید", "نمی‌آوردند",
                "آورده‌ام", "آورده‌ای", "آورده", "آورده‌ایم", "آورده‌اید", "آورده‌اند",
                "نیاورده‌ام", "نیاورده‌ای", "نیاورده", "نیاورده‌ایم", "نیاورده‌اید", "نیاورده‌اند",
                "می‌آورم", "می‌آوری", "می‌آورد", "می‌آوریم", "می‌آورید", "می‌آورند",
                "نمی‌آورم", "نمی‌آوری", /*"نمی‌آورد",*/ "نمی‌آوریم", "نمی‌آورید", "نمی‌آورند",
                "بیاورم", "بیاوری", "بیاورد", "بیاوریم", "بیاورید", "بیاورند",
                "بیاور", "نیاور"
            };
            actual = lemmatizer.Conjugations(input).ToArray();
            Assert.AreEqual(expected.Length, actual.Length, "Failed to generate conjugations of '" + input + "' verb");
            for (int i = 0; i < expected.Length; i++)
            {
                if (!actual.Contains(expected[i]))
                    Assert.AreEqual(expected[i], actual[i], "Failed to generate conjugations of '" + input + "' verb");
            }
        }
Exemplo n.º 19
0
 /// <summary>
 /// Loads the tagger and lemmatizer models from their serializers. The
 /// tagger-loading code is currently commented out; only the lemmatizer
 /// model is read (a bool flag followed by the serialized lemmatizer).
 /// </summary>
 /// <param name="taggerModelSer">Serializer holding the tagger model (currently unused).</param>
 /// <param name="lemmatizerModelSer">Serializer holding the lemmatizer model; null skips loading.</param>
 public void LoadModels(BinarySerializer taggerModelSer, BinarySerializer lemmatizerModelSer)
 {
     /*Utils.ThrowException(taggerModelSer == null ? new ArgumentNullException("taggerModelSer") : null);
      * mLogger.Debug("Load", "Nalagam model za označevanje ...");
      * mSuffixTree = new PatriciaTree(taggerModelSer);
      * mFeatureSpace = Utils.LoadDictionary<string, int>(taggerModelSer);
      * mModel = new MaximumEntropyClassifierFast<string>(taggerModelSer);*/
     if (lemmatizerModelSer != null)
     {
         // Log message is Slovenian: "Loading lemmatization model ..."
         mLogger.Debug("Load", "Nalagam model za lematizacijo ...");
         // Read order matters: flag first, then the serialized lemmatizer.
         mConsiderTags = lemmatizerModelSer.ReadBool();
         mLemmatizer   = new Lemmatizer(lemmatizerModelSer);
     }
 }
Exemplo n.º 20
0
        /// <summary>
        /// Classifies the overall tone of <paramref name="text"/> using the
        /// sentiment lexicons: 1 = positive, -1 = negative, 0 = neutral.
        /// </summary>
        /// <param name="text">Raw text; it is lemmatized before lookup.</param>
        /// <returns>1, -1 or 0 depending on the average tone vs. a ±0.3 cutoff.</returns>
        public int Classify(string text)
        {
            Lemmatizer.Stopwords = _stopwords;
            var vector = Lemmatizer.LemmatizeCurrentText(text);
            int sign = 1, tone = 0, toneCount = 0;

            for (int i = 0; i < vector.Count; i++)
            {
                if (_sentilex.ContainsKey(vector[i]) || _specificLexicon.ContainsKey(vector[i]))
                {
                    // The specific lexicon overrides the general one.
                    int basictone = (_specificLexicon.ContainsKey(vector[i])) ? _specificLexicon[vector[i]] : _sentilex[vector[i]];
                    // Fixed off-by-one: the original used "i - 1 > 0" / "i - 2 > 0",
                    // which ignored a negation or gain word at index 0.
                    if (i - 1 >= 0)
                    {
                        if (_negations.Contains(vector[i - 1]))
                        {
                            sign = -1;
                        }
                        if (_gains.Contains(vector[i - 1]))
                        {
                            // A gain word is negated if preceded by a negation.
                            if (i - 2 >= 0 && _negations.Contains(vector[i - 2]))
                            {
                                sign = -1;
                            }
                        }
                    }
                    tone += basictone * sign;
                    toneCount++;
                    // NOTE(review): 'sign' is never reset to 1 after use, so one
                    // negation flips every subsequent term — confirm whether that
                    // accumulation is intended before changing it.
                }
            }

            if (toneCount == 0)
            {
                return 0;
            }

            // Average tone, compared against a fixed ±0.3 neutrality band.
            var doubletone = tone / (toneCount + 0.0);
            if (doubletone > 0.3)
            {
                return 1;
            }
            if (doubletone < -0.3)
            {
                return -1;
            }
            return 0;
        }
Exemplo n.º 21
0
        /// <summary>
        /// Lemmatizes the Cyrillic text of every crawled .html page and writes
        /// the result to a parallel .txt file in the lemmatization result folder.
        /// </summary>
        static async Task Main(string[] args)
        {
            var documentsFolderPath = PathConstants.CrawlerResultPath;

            var resultFolderPath = PathConstants.LemmatizationResultPath;

            if (!Directory.Exists(resultFolderPath))
            {
                Directory.CreateDirectory(resultFolderPath);
            }

            // Matches runs of Cyrillic characters only.
            var regex = new Regex(@"[\p{IsCyrillic}]+");

            // Hoisted out of the loop: the original constructed a new
            // mystem-backed lemmatizer (external process wrapper) per file.
            var lemmatizer = new Lemmatizer("./mystem/mystem.exe");

            foreach (var file in Directory.EnumerateFiles(documentsFolderPath, "*.html"))
            {
                using (var fs = new FileStream(file, FileMode.Open))
                {
                    var document = new HtmlDocument();
                    document.Load(fs);

                    var rootNode = document.DocumentNode;

                    if (rootNode == null)
                    {
                        continue;
                    }

                    // IsNullOrWhiteSpace subsumes the original IsNullOrEmpty check.
                    var tokens = regex.Matches(rootNode.InnerText)
                                 .Select(x => x.ToString())
                                 .Where(x => !string.IsNullOrWhiteSpace(x));

                    var tokenized = string.Join(" ", tokens);

                    var lemmas = lemmatizer.LemmatizeText(tokenized)
                                 .Trim()
                                 .Replace("   ", " ");

                    var outputPath = Path.Combine(resultFolderPath, file
                                                  .Replace(documentsFolderPath + "\\", "")
                                                  .Replace(".html", ".txt"));

                    // Fixed: FileMode.Create truncates an existing output file;
                    // OpenOrCreate left stale trailing bytes when the new
                    // content was shorter than the old.
                    using (var stream = new FileStream(outputPath, FileMode.Create))
                    {
                        var buffer = Encoding.UTF8.GetBytes(lemmas);
                        await stream.WriteAsync(buffer, 0, buffer.Length);
                    }
                }
            }
        }
Exemplo n.º 22
0
        /// <summary>
        /// Runs the IWNLP lemmatizer over every sentence of the corpus and
        /// serializes the annotated corpus to <paramref name="exportPath"/>.
        /// </summary>
        static void LemmatizeIWNLP(List<CoNLLSentence> corpus, String exportPath)
        {
            Lemmatizer IWNLP = new Lemmatizer();
            // Dictionary path comes from application settings.
            IWNLP.Load(AppSettingsWrapper.IWNLPPath);

            // Sentences are annotated in place; the list itself is not modified.
            foreach (CoNLLSentence sentence in corpus)
            {
                IWNLPSentenceProcessor.ProcessSentence(sentence, IWNLP);
            }

            XMLSerializer.Serialize<List<CoNLLSentence>>(corpus, exportPath);
        }
Exemplo n.º 23
0
        /// <summary>
        /// Runs the IWNLP lemmatizer over every sentence of the corpus and
        /// serializes the annotated corpus to <paramref name="exportPath"/>.
        /// </summary>
        static void LemmatizeIWNLP(List <CoNLLSentence> corpus, string exportPath)
        {
            Lemmatizer IWNLP = new Lemmatizer();

            // Dictionary path comes from application settings.
            IWNLP.Load(AppSettingsWrapper.IWNLPPath);

            int count = corpus.Count;

            // Sentences are annotated in place; the list itself is not modified.
            for (int i = 0; i < count; i++)
            {
                CoNLLSentence sentence = corpus[i];
                IWNLPSentenceProcessor.ProcessSentence(sentence, IWNLP);
                //Console.WriteLine(i);
            }
            XMLSerializer.Serialize <List <CoNLLSentence> >(corpus, exportPath);
        }
        /// <summary>
        /// Loads the dictionary word lists used by the query-independent
        /// features and initializes the supporting services.
        /// </summary>
        public QueryIndependentFeatures()
        {
            Console.WriteLine("Initializing QueryIndependentFeatures");

            // Each dictionary file holds one term per line; every list is
            // normalized to trimmed lower-case the same way.
            Func<string, List<string>> readWords =
                filePath => File.ReadAllLines(filePath).Select(line => line.Trim().ToLower()).ToList();

            buhWords           = readWords("Dictionaries/BuhWords.txt");
            taxWords           = readWords("Dictionaries/TaxWords.txt");
            controlSystemWords = readWords("Dictionaries/ControlSystemWords.txt");
            formWords          = readWords("Dictionaries/FormWords.txt");

            buhonlineDataProvider = new BuhOnlineDataProvider();
            lemmatizer            = new Lemmatizer();
        }
Exemplo n.º 25
0
        /// <summary>
        /// Teaches the lemmatizer the (word, lemma) pair: if the current model
        /// already produces the expected lemma nothing is done; otherwise the
        /// pair is added as a training example, and if that is still not enough
        /// it is registered as a hard exception.
        /// </summary>
        private static void AddExampleOrException(Lemmatizer lemmatizer, string word, string lemma)
        {
            var computedLemma = lemmatizer.Lemmatize(word);

            if (computedLemma != lemma)
            {
                // add example
                lemmatizer.AddExample(word, lemma);
                // if still doesn't work --> add exception
                var computedLemma2 = lemmatizer.Lemmatize(word);
                if (computedLemma2 != lemma)
                {
                    Console.WriteLine("Added lemma exception: {0} -> {1}", word, lemma);
                    lemmatizer.AddException(word, lemma);
                }
            }
        }
Exemplo n.º 26
0
        /// <summary>
        /// Initializes a new instance of the <see cref="MorphologyModel"/> class.
        /// </summary>
        /// <param name="nGramm">N-gram model.</param>
        /// <param name="entModel">Ambiguity-class model.</param>
        /// <param name="folder">Folder used to store model files (recreated from scratch).</param>
        /// <param name="lemmaFile">Lemmatizer data file (copied into <paramref name="folder"/>).</param>
        public MorphologyModel(TagNGramm nGramm, IEntropyClassModel entModel,
                               string folder, string lemmaFile)
        {
            this.nGramm   = nGramm;
            this.entClass = entModel;
            this.folder   = folder;
            // Recreate the model folder from scratch, discarding any previous contents.
            if (Directory.Exists(folder))
            {
                Directory.Delete(folder, true);
            }
            Directory.CreateDirectory(folder);
            this.lemmaFile = folder + "\\" + lemmaFile;
            File.Copy(lemmaFile, this.lemmaFile);
            // NOTE(review): this stream is never disposed — presumably the
            // Lemmatizer keeps reading from it after construction; confirm
            // before wrapping it in a using block.
            FileStream fs = File.OpenRead(this.lemmaFile);

            lemmatizer = new Lemmatizer(fs);
            Initialize();
        }
Exemplo n.º 27
0
        /// <summary>
        /// Removes preposition/verb entries from the word-count map. Words of
        /// five or more characters are lemmatized first, mirroring the counting
        /// heuristic used elsewhere in this class.
        /// </summary>
        /// <param name="prepVerbList">Words to remove.</param>
        /// <param name="wordCount">Word-count map to prune (mutated in place).</param>
        /// <returns>The same dictionary instance, for chaining.</returns>
        public Dictionary <string, int> deletePrepVerb(List <string> prepVerbList, Dictionary <string, int> wordCount)
        {
            Lemmatizer lemmatize = new Lemmatizer();

            foreach (string element in prepVerbList)
            {
                string lemElement = element.Length >= 5 ? lemmatize.Lemmatize(element) : element;

                // Remove is a no-op when the key is absent, so the former
                // ContainsKey pre-check was a redundant second hash lookup.
                wordCount.Remove(lemElement);
            }

            return wordCount;
        }
Exemplo n.º 28
0
        /// <summary>
        /// Lemmatizes the input text and records unigram and bigram counts
        /// for the given emotion label.
        /// </summary>
        /// <param name="inputString">Raw text to process.</param>
        /// <param name="emotion">Emotion label stored on this instance.</param>
        public void parse(string inputString, int emotion)
        {
            this.emotion = emotion;

            // Removed two unused locals from the original: a Lemmatizer
            // instance (LemmatizeCurrentText is invoked statically here) and
            // an undisposed, never-read StringReader.
            var    vector   = Lemmatizer.LemmatizeCurrentText(inputString);
            string prevWord = null;

            foreach (var str in vector)
            {
                addGramm(str);
                if (prevWord != null)
                {
                    // Bigram: previous word + current word.
                    addGramm(prevWord + " " + str);
                }
                prevWord = str;
            }
        }
Exemplo n.º 29
0
        /// <summary>
        /// Returns the most frequent terms among the articles carrying
        /// <paramref name="label"/>, after special-character replacement,
        /// stop-word filtering and lemmatization.
        /// </summary>
        /// <param name="articles">Labeled articles to scan.</param>
        /// <param name="label">Only articles with this label are counted.</param>
        /// <param name="termCount">Number of top terms to return.</param>
        /// <param name="stopList">Optional stop-word list; null uses the processor's default.</param>
        /// <returns>Term -> frequency map, ordered by descending frequency.</returns>
        private static Dictionary <string, int> GetMostFrequentTermsForLabel(List <LabeledArticle> articles, string label, int termCount = 20, string[] stopList = null)
        {
            List <TokenizedArticle> tokenizedArticles = new List <TokenizedArticle>();

            foreach (var article in articles)
            {
                // Guard clause instead of the original nested if.
                if (article.Label != label)
                {
                    continue;
                }

                var body           = TextUtility.ReplaceSpecialCharacters(article.Article.Body);
                var processedWords = StopWordsFilterProcessor.Process(Tokenizer.TokenizeWords(body), stopList);
                processedWords = Lemmatizer.Process(processedWords);
                tokenizedArticles.Add(new TokenizedArticle(article, processedWords));
            }

            Dictionary <string, int> countDictionary = new Dictionary <string, int>();

            foreach (var tokenizedArticle in tokenizedArticles)
            {
                foreach (var token in tokenizedArticle.Tokens)
                {
                    // Single hash lookup instead of ContainsKey + indexer
                    // (TryGetValue leaves count = 0 for a new key).
                    int count;
                    countDictionary.TryGetValue(token.Word, out count);
                    countDictionary[token.Word] = count + 1;
                }
            }

            return countDictionary
                   .OrderByDescending(pair => pair.Value)
                   .Take(termCount)
                   .ToDictionary(pair => pair.Key, pair => pair.Value);
        }
Exemplo n.º 30
0
        /// <summary>
        /// Teaches the lemmatizer a (word, lemma) pair: first as a training example,
        /// and — if that is still not enough — as a hard exception.
        /// </summary>
        /// <param name="lemmatizer">Lemmatizer to enrich.</param>
        /// <param name="word">Surface form.</param>
        /// <param name="lemma">Expected lemma for <paramref name="word"/>.</param>
        private static void AddExampleOrException(Lemmatizer lemmatizer, string word, string lemma)
        {
            // Nothing to do when the lemmatizer already produces the expected lemma.
            if (lemmatizer.Lemmatize(word) == lemma)
            {
                return;
            }

            // Feed the pair as an example; the lemmatizer may deduce a new rule from it.
            lemmatizer.AddExample(word, lemma);

            // If the deduced rules still fail on this word, register a hard exception.
            if (lemmatizer.Lemmatize(word) != lemma)
            {
                Console.WriteLine("Added lemma exception: {0} -> {1}", word, lemma);
                lemmatizer.AddException(word, lemma);
            }
        }
Exemplo n.º 31
0
        /// <summary>
        /// Loads a serialized lemmatizer, enriches it with every enricher file,
        /// and persists the result (LZMA-compressed) to the output path.
        /// </summary>
        /// <param name="lemmatizerFilePath">Path of the serialized input lemmatizer.</param>
        /// <param name="outputFilePath">Path the enriched lemmatizer is written to.</param>
        /// <param name="enricherFilePaths">Enricher files applied in order.</param>
        private static void EnrichLemmatizerFile(string lemmatizerFilePath, string outputFilePath,
                                                 IEnumerable <string> enricherFilePaths)
        {
            using (var stream = File.OpenRead(lemmatizerFilePath))
            {
                var lemmatizer = new Lemmatizer(stream);
                // enrich lemmatizer with every other file
                foreach (var filePath in enricherFilePaths)
                {
                    EnrichLemmatizer(lemmatizer, filePath);
                }

                // persist lemmatizer in output file
                Console.WriteLine("Writing output file...");
                using (var oStream = File.Create(outputFilePath))
                {
                    lemmatizer.Serialize(oStream, true, Lemmatizer.Compression.Lzma, true);
                }
                // Fixed user-facing typo: "Outuput" -> "Output".
                Console.WriteLine("Output file written at {0}", outputFilePath);
            }
        }
Exemplo n.º 32
0
        /// <summary>
        /// Demo endpoint: lemmatizes a hard-coded Russian text word by word and
        /// returns the concatenated lemmas as a single string.
        /// </summary>
        /// <returns>A one-element collection holding the space-joined lemmas.</returns>
        public ActionResult <IEnumerable <string> > Get()
        {
            var sb = new StringBuilder();

            // NOTE(review): hard-coded, user-specific model path — should come from
            // configuration rather than being baked into the action.
            using (var stream = new FileStream(@"C:\Users\D.Pugach\Downloads\full7z-mlteast-ru.lem", FileMode.Open))
            {
                var allText    = "Вазомоторный ринит что это такое Среди многочисленных видов ринита (насморка) эта патология занимает особое место, поскольку этиология ее возникновения до сих пор до конца не изучена. Вазомоторный ринит, чаще всего поражающий людей старше 20 лет, является заболеванием, которое может протекать в виде";
                var lemmatizer = new Lemmatizer(stream);

                // The original also called lemmatizer.Lemmatize(allText.ToLower())
                // and discarded the result; that dead work was removed.
                foreach (var word in allText.Split(' '))
                {
                    sb.Append(lemmatizer.Lemmatize(word)).Append(" ");
                }
                Console.WriteLine(sb.ToString());
            }

            return(new string[] { sb.ToString() });
        }
Exemplo n.º 33
0
        /// <summary>
        /// Builds the transaction records for association-rule mining: every sentence
        /// becomes one transaction whose items are the lemmatized words.
        /// </summary>
        /// <param name="inpReader">Source text reader, split into sentences.</param>
        /// <param name="RelationName">Name of the generated relation.</param>
        public TextGenerator(IFileReader inpReader, string RelationName)
        {
            this.RelationName = RelationName;
            Sentences         = new StringParser(inpReader).GetSentances;

            // Lemmatize every word in place. The model stream is disposed once the
            // lemmatizer is no longer needed (the original leaked the FileStream).
            WordsAll = new List <string>();
            using (var stream = File.OpenRead(AppDomain.CurrentDomain.BaseDirectory + @"\\full7z-mlteast-ru.lem"))
            {
                var lemmatizer = new Lemmatizer(stream);
                foreach (var Sentence in Sentences)
                {
                    for (int i = 0; i < Sentence.Count; i++)
                    {
                        Sentence[i] = lemmatizer.Lemmatize(Sentence[i]);
                        WordsAll.Add(Sentence[i]);
                    }
                }
            }
            WordsAll = WordsAll.Distinct(new PartialComparer()).ToList();

            // One transaction per sentence; ItemId is the word's index in the
            // distinct-word vocabulary.
            // NOTE(review): FindIndex inside the loop is O(n^2) over the corpus, but
            // replacing it with a map could change semantics because the vocabulary
            // was deduplicated with PartialComparer — left as-is.
            var SentenceNum = 0;
            foreach (var Sentence in Sentences)
            {
                foreach (var Word in Sentence)
                {
                    recList.Add(new Record()
                    {
                        TransactId = SentenceNum, ItemId = WordsAll.FindIndex(a => a == Word), Item = Word
                    });
                }
                SentenceNum++;
            }
        }
Exemplo n.º 34
0
        /// <summary>
        /// Initializes a new instance of the <see cref="MorphologyModel"/> class
        /// from serialization data (custom ISerializable-style deserialization).
        /// </summary>
        /// <param name="si">Serialization info holding the persisted model state.</param>
        /// <param name="context">Streaming context.</param>
        protected MorphologyModel(SerializationInfo si, StreamingContext context)
        {
            this.folder      = si.GetString("folder");
            this.groups      = (List <TagGroup>)si.GetValue("groups", typeof(List <TagGroup>));
            this.punctuation = (List <string>)si.GetValue("punctuation", typeof(List <string>));
            // The n-gram and entropy-class sub-models live in fixed-named files under
            // the persisted folder, not inside the serialization payload itself.
            using (FileStream fs1 = File.Open(string.Format("{0}/{1}", folder, "nGramm.mdl"), FileMode.Open))
                using (FileStream fs2 = File.Open(string.Format("{0}/{1}", folder, "entClass.mdl"), FileMode.Open))
                {
                    this.nGramm   = Serializer.Deserialize <TagNGramm>(fs1);
                    this.entClass = new DawgEntropyClassModel();
                    entClass.Load(fs2);
                }
            this.sentenceDelimiters = (List <string>)si.GetValue("delimiters", typeof(List <string>));
            this.lemmaFile          = si.GetString("lemmaFile");
            // NOTE(review): fs is intentionally not wrapped in using here — presumably
            // Lemmatizer keeps reading from the open stream after construction
            // (the same pattern appears elsewhere in this codebase); confirm before
            // adding disposal, as closing it could break the lemmatizer.
            FileStream fs = File.Open(this.lemmaFile, FileMode.Open);

            this.lemmatizer      = new Lemmatizer(fs);
            this.sentencePattern = (Regex)si.GetValue("sentencePattern", typeof(Regex));
            this.lexemPattern    = (Regex)si.GetValue("lexemPattern", typeof(Regex));
            this.minLengh        = si.GetInt32("minLenght");
            serviceTags          = (Dictionary <string, Tag>)si.GetValue("serviceTags",
                                                                         typeof(Dictionary <string, Tag>));
        }
Exemplo n.º 35
0
 /// <summary>
 /// Resolves the stop-word list and the stemming component for a language.
 /// </summary>
 /// <param name="language">Language to resolve tools for.</param>
 /// <param name="stopWords">Stop-word set (null for Estonian, which has none).</param>
 /// <param name="stemmer">A Stemmer or, for some languages, a Lemmatizer used as the stemmer.</param>
 /// <exception cref="ArgumentNotSupportedException">Thrown for unsupported languages.</exception>
 public static void GetLanguageTools(Language language, out Set<string>.ReadOnly stopWords, out IStemmer stemmer)
 {
     // Resolve the stop-word list first; unsupported languages throw here.
     switch (language)
     {
         case Language.Bulgarian:  stopWords = StopWords.BulgarianStopWords;    break;
         case Language.Czech:      stopWords = StopWords.CzechStopWords;        break;
         case Language.Danish:     stopWords = StopWords.DanishStopWords;       break;
         case Language.Dutch:      stopWords = StopWords.DutchStopWords;        break;
         case Language.English:    stopWords = StopWords.EnglishStopWords;      break;
         case Language.Estonian:   stopWords = null;                            break; // *** stop words are missing
         case Language.Finnish:    stopWords = StopWords.FinnishStopWords;      break;
         case Language.French:     stopWords = StopWords.FrenchStopWords;       break;
         case Language.German:     stopWords = StopWords.GermanStopWords;       break;
         case Language.Hungarian:  stopWords = StopWords.HungarianStopWords;    break;
         case Language.Italian:    stopWords = StopWords.ItalianStopWords;      break;
         case Language.Norwegian:  stopWords = StopWords.NorwegianStopWords;    break;
         case Language.Portuguese: stopWords = StopWords.PortugueseStopWords;   break;
         case Language.Romanian:   stopWords = StopWords.RomanianStopWords;     break;
         case Language.Russian:    stopWords = StopWords.RussianStopWords;      break;
         case Language.Serbian:    stopWords = StopWords.SerbianStopWordsLatin; break;
         case Language.Slovene:    stopWords = StopWords.SloveneStopWords;      break;
         case Language.Spanish:    stopWords = StopWords.SpanishStopWords;      break;
         case Language.Swedish:    stopWords = StopWords.SwedishStopWords;      break;
         default:
             throw new ArgumentNotSupportedException("language");
     }

     // These languages use the lemmatizer as their stemming component
     // (same mapping as the original per-language switch); all others
     // get the regular stemmer.
     switch (language)
     {
         case Language.Bulgarian:
         case Language.Czech:
         case Language.Estonian:
         case Language.Hungarian:
         case Language.Romanian:
         case Language.Serbian:
         case Language.Slovene:
             stemmer = new Lemmatizer(language);
             break;
         default:
             stemmer = new Stemmer(language);
             break;
     }
 }
 /// <summary>
 /// Predicts lemmas for each token of a CoNLL sentence using the IWNLP lemmatizer,
 /// dispatching on the token's POS tag (presumably STTS tags for German:
 /// NN = noun, ADJA/ADJD = adjective, V* = verb — TODO confirm tagset).
 /// Tokens whose POS matches no branch, or that miss every lookup, keep
 /// PredictedLemmas unset.
 /// </summary>
 /// <param name="sentence">Sentence whose tokens get PredictedLemmas filled in.</param>
 /// <param name="iwnlp">Lemmatizer used for the dictionary lookups.</param>
 public static void ProcessSentence(CoNLLSentence sentence, Lemmatizer iwnlp)
 {
     string[] tokenArray = sentence.Tokens.Select(x => x.Form).ToArray();
     //is2.data.SentenceData09 sentenceMateTools = mateToolsWrapper.TagSentenceLemmatizerAndPOS(tokenArray, true);
     for (int i = 0; i < sentence.Tokens.Count; i++)
     {
         CoNLLToken token = sentence.Tokens[i];
         if (token.POS == "NN")
         {
             // Noun fallback chain — order matters: exact Noun entry, then X,
             // then adjectival declension, then a relaxed Noun/X lookup
             // (the final bool argument presumably relaxes matching — TODO confirm).
             List<POS> pos = new List<POS>() { POS.Noun, POS.X };
             if (iwnlp.ContainsEntry(token.Form, POS.Noun))
             {
                 token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.Noun);
             }
             else if (iwnlp.ContainsEntry(token.Form, POS.X))
             {
                 token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.X);
             }
             else if (iwnlp.ContainsEntry(token.Form, POS.AdjectivalDeclension))
             {
                 token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.AdjectivalDeclension);
             }
             else if(iwnlp.ContainsEntry(token.Form, pos, true))
             {
                 token.PredictedLemmas = iwnlp.GetLemmas(token.Form, pos, true);
             }
         }
         else
         {
             if (token.POS == "ADJA" || token.POS == "ADJD")
             {
                 // Adjective fallback chain: strict Adjective entry first, then
                 // progressively relaxed lookups across Adjective, Noun, X, Verb.
                 if (iwnlp.ContainsEntry(token.Form, POS.Adjective))
                 {
                     token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.Adjective);
                 }
                 else if (iwnlp.ContainsEntry(token.Form, POS.Adjective, true))
                 {
                     token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.Adjective, true);
                 }
                 else if (iwnlp.ContainsEntry(token.Form, POS.Noun, true))
                 {
                     token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.Noun, true);
                 }
                 else if (iwnlp.ContainsEntry(token.Form, POS.X, true))
                 {
                     token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.X, true);
                 }
                 else if (iwnlp.ContainsEntry(token.Form, POS.Verb, true))
                 {
                     token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.Verb, true);
                 }
                 //else if (iwnlp.ContainsEntry(token.Form,true)) 
                 //{
                 //    token.PredictedLemmas = iwnlp.GetLemmas(token.Form, true);
                 //}
             }
             else if (token.POS.StartsWith("V"))
             {
                 // Verbs: single relaxed Verb lookup.
                 if (iwnlp.ContainsEntry(token.Form, POS.Verb, true))
                 {
                     token.PredictedLemmas = iwnlp.GetLemmas(token.Form, POS.Verb, true);
                 }
                     // test
                 //else if (iwnlp.ContainsEntry(token.Form, true))
                 //{
                 //    token.PredictedLemmas = iwnlp.GetLemmas(token.Form, true);
                 //}
             }
         }
     }
 }