Code example #1
File: LemmatizerTest.cs Project: nvdnkpr/NHazm
        public void LemmatizeTest()
        {
            Lemmatizer lemmatizer = new Lemmatizer();

            string input, expected, actual;

            input = "کتاب‌ها";
            expected = "کتاب";
            actual = lemmatizer.Lemmatize(input);
            Assert.AreEqual(expected, actual, "Failed to lemmatize '" + input + "'");

            input = "آتشفشان";
            expected = "آتشفشان";
            actual = lemmatizer.Lemmatize(input);
            Assert.AreEqual(expected, actual, "Failed to lemmatize '" + input + "'");

            input = "می‌روم";
            expected = "رفت#رو";
            actual = lemmatizer.Lemmatize(input);
            Assert.AreEqual(expected, actual, "Failed to lemmatize '" + input + "'");

            input = "گفته شده است";
            expected = "گفت#گو";
            actual = lemmatizer.Lemmatize(input);
            Assert.AreEqual(expected, actual, "Failed to lemmatize '" + input + "'");

            input = "مردم";
            expected = "مردم";
            actual = lemmatizer.Lemmatize(input, "N");
            Assert.AreEqual(expected, actual, "Failed to lemmatize '" + input + "'");
        }
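For reference, a minimal self-contained console sketch of the same NHazm API as used above; the NHazm namespace and the availability of its default resource files are assumptions:

    using System;
    using NHazm; // assumed namespace for the NHazm Lemmatizer

    class LemmatizerDemo
    {
        static void Main()
        {
            // lemmatize with and without an explicit POS tag
            var lemmatizer = new Lemmatizer();
            Console.WriteLine(lemmatizer.Lemmatize("کتاب‌ها"));    // -> کتاب
            Console.WriteLine(lemmatizer.Lemmatize("مردم", "N")); // POS hint: noun
        }
    }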
Code example #2
        public void LemmatizeTest()
        {
            Lemmatizer lemmatizer = new Lemmatizer();

            string input, expected, actual, p;

            List <string> inputs = new List <string>()
            {
                "کتاب‌ها", "آتشفشان", "می‌روم", "گفته شده است", "نچشیده است", "مردم", "اجتماعی"
            };
            List <string> expecteds = new List <string>()
            {
                "کتاب", "آتشفشان", "رفت#رو", "گفت#گو", "چشید#چش", "مردم", "اجتماعی"
            };
            List <string> pos = new List <string>()
            {
                null, null, null, null, null, "N", "AJ"
            };

            for (var i = 0; i < inputs.Count; i++)
            {
                input    = inputs[i];
                expected = expecteds[i];
                p        = pos[i];
                if (p == null)
                {
                    actual = lemmatizer.Lemmatize(input);
                }
                else
                {
                    actual = lemmatizer.Lemmatize(input, p);
                }
                Assert.AreEqual(expected, actual, "Failed to lemmatize '" + input + "'");
            }
        }
Code example #3
File: wordmap.cs Project: qa1/wordmap
        public Dictionary <string, int> CalcWordCount(string[] words)
        {
            Dictionary <string, int> wordCount = new Dictionary <string, int>();
            Lemmatizer lemmatize = new Lemmatizer();

            foreach (string word in words)
            {
                string lemmWord = word;
                if (word.Length >= 5)
                {
                    lemmWord = lemmatize.Lemmatize(word);
                }

                if (wordCount.ContainsKey(lemmWord))
                {
                    wordCount[lemmWord]++;
                }
                else
                {
                    wordCount.Add(lemmWord, 1);
                }
            }

            return(wordCount);
        }
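A hypothetical call site for CalcWordCount, printing the five most frequent lemmas; the input variable text, the whitespace tokenization, and the System.Linq usage are assumptions:

    string[] words = text.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
    Dictionary<string, int> wordCount = CalcWordCount(words);
    // print the five most frequent lemmas
    foreach (var pair in wordCount.OrderByDescending(p => p.Value).Take(5))
    {
        Console.WriteLine("{0}: {1}", pair.Key, pair.Value);
    }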
Code example #4
File: Program.cs Project: quangfox/LemmaGenerator
        private static void AddExampleOrException(Lemmatizer lemmatizer, string word, string lemma)
        {
            var computedLemma = lemmatizer.Lemmatize(word);

            if (computedLemma != lemma)
            {
                // add example
                lemmatizer.AddExample(word, lemma);
                // if still doesn't work --> add exception
                var computedLemma2 = lemmatizer.Lemmatize(word);
                if (computedLemma2 != lemma)
                {
                    Console.WriteLine("Added lemma exception: {0} -> {1}", word, lemma);
                    lemmatizer.AddException(word, lemma);
                }
            }
        }
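A plausible driver for this helper, looping over (word, lemma) training pairs; the pair list below is illustrative (it reuses pairs that appear, commented out, in a later example on this page):

    var pairs = new List<Tuple<string, string>>
    {
        Tuple.Create("brought", "bring"),
        Tuple.Create("felt", "feel"),
        Tuple.Create("swung", "swing"),
    };
    foreach (var pair in pairs)
    {
        AddExampleOrException(lemmatizer, pair.Item1, pair.Item2);
    }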
Code example #5
        private static List <string> GetVocabulary(string[] sentences, out List <List <string> > lemmalizeWords, int vocabularyThreshold)
        {
            string        filteredLine;
            List <string> filterLine     = new List <string>();
            List <string> tokenizedWords = new List <string>();
            List <string> vocabulary     = new List <string>();

            lemmalizeWords = new List <List <string> >();
            Dictionary <string, int> tFrequency = new Dictionary <string, int>();
            var stream     = File.OpenRead(path);
            var lemmatizer = new Lemmatizer(stream);

            int docIndex = 0;

            foreach (var doc in sentences)
            {
                List <string> stemmedDoc = new List <string>();
                docIndex++;

                tokenizedWords = Tokenize(doc);

                List <string> lemmalizeWord = new List <string>();
                foreach (string part in tokenizedWords)
                {
                    // Strip non-alphanumeric characters.
                    string stripped = Regex.Replace(part, "[^a-zA-Z0-9]", "");
                    filteredLine = StopwordTool.RemoveStopwords(stripped);
                    string stem = lemmatizer.Lemmatize(filteredLine);
                    lemmalizeWord.Add(stem);

                    if (stem.Length > 0)
                    {
                        if (tFrequency.ContainsKey(stem))
                        {
                            tFrequency[stem]++;
                        }
                        else
                        {
                            tFrequency.Add(stem, 1); // count the first occurrence as 1, not 0
                        }

                        stemmedDoc.Add(stem);
                    }
                }
                lemmalizeWords.Add(lemmalizeWord);
            }

            var vocabList = tFrequency.Where(w => w.Value >= vocabularyThreshold);

            foreach (var item in vocabList)
            {
                vocabulary.Add(item.Key);
            }

            return(vocabulary);
        }
Code example #6
File: Program.cs Project: stuartd/LemmaGenerator
        private static void AddExampleOrException(Lemmatizer lemmatizer, string word, string lemma)
        {
            // compute the lemma of this example
            var computedLemma = lemmatizer.Lemmatize(word);

            if (computedLemma != lemma)
            {
                // if the computed lemma is different from what we expect,
                // add this example to lemmatizer (lemmatizer can then deduce a new rule and succeed, or still fail)
                lemmatizer.AddExample(word, lemma);

                // if still doesn't work --> add exception
                var computedLemma2 = lemmatizer.Lemmatize(word);
                if (computedLemma2 != lemma)
                {
                    Console.WriteLine("Added lemma exception: {0} -> {1}", word, lemma);
                    lemmatizer.AddException(word, lemma);
                }
            }
        }
Code example #7
        public ActionResult <IEnumerable <string> > Get()
        {
            //var LemFilePath = @"C:\Users\D.Pugach\Downloads\full7z-mlteast-ru.lem";
            //var filePath = @"C:\Users\D.Pugach\Downloads\test.txt";
            //var stream = File.OpenRead(LemFilePath);
            var sb     = new StringBuilder();
            var stream = new FileStream(@"C:\Users\D.Pugach\Downloads\full7z-mlteast-ru.lem", FileMode.Open);

            using (stream) {
                var allText    = "Вазомоторный ринит что это такое Среди многочисленных видов ринита (насморка) эта патология занимает особое место, поскольку этиология ее возникновения до сих пор до конца не изучена. Вазомоторный ринит, чаще всего поражающий людей старше 20 лет, является заболеванием, которое может протекать в виде";
                var lemmatizer = new Lemmatizer(stream);

                lemmatizer.Lemmatize(allText.ToLower());
                foreach (var word in allText.Split(' '))
                {
                    sb.Append(lemmatizer.Lemmatize(word)).Append(" ");
                }
                Console.WriteLine(sb.ToString());
            }

            return(new string[] { sb.ToString() });
        }
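The same load-from-stream pattern in compact form; the file path is illustrative, and the .lem data file comes from LemmaGenerator's prebuilt language packs:

    using (var stream = File.OpenRead(@"C:\models\full7z-mlteast-ru.lem")) // illustrative path
    {
        var lemmatizer = new Lemmatizer(stream);
        foreach (var word in "Вазомоторный ринит".ToLower().Split(' '))
        {
            Console.Write(lemmatizer.Lemmatize(word) + " ");
        }
    }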
Code example #8
        private void countWords(string scrubbedChat, Dictionary <string, int> dict)
        {
            var words = wordFinder.Matches(scrubbedChat)
                        .Cast <Match>()
                        .Select(m => lemmatizer.Lemmatize(m.Value));

            foreach (var word in words)
            {
                var tword = word.ToLower().Trim();
                int count = 0;
                dict.TryGetValue(tword, out count);
                ++count;
                dict[tword] = count;
            }
        }
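A hypothetical use of countWords, accumulating counts across several chat lines; chatLines is an assumed IEnumerable&lt;string&gt;, while wordFinder and lemmatizer are fields of the surrounding class:

    var dict = new Dictionary<string, int>();
    foreach (var line in chatLines)
    {
        countWords(line, dict);
    }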
Code example #9
File: wordmap.cs Project: qa1/wordmap
        public Dictionary <string, int> deletePrepVerb(List <string> prepVerbList, Dictionary <string, int> wordCount)
        {
            Lemmatizer lemmatize = new Lemmatizer();

            foreach (string element in prepVerbList)
            {
                string lemElement = element;
                if (element.Length >= 5)
                {
                    lemElement = lemmatize.Lemmatize(element);
                }
                if (wordCount.ContainsKey(lemElement))
                {
                    wordCount.Remove(lemElement);
                }
            }

            return(wordCount);
        }
Code example #10
        public bool IsMatch(string sentence, PhrasalVerb phrasalVerb)
        {
            var tokens          = tokenizer.Tokenize(sentence);
            var matchRoot       = false;
            var particleToMatch = 0;

            for (var i = 0; i < tokens.Length; i++)
            {
                var token = tokens[i];
                if (!matchRoot)
                {
                    // try to match the root first
                    matchRoot = string.Equals(token, phrasalVerb.Root, StringComparison.InvariantCultureIgnoreCase)
                                ||
                                string.Equals(lemmatizer.Lemmatize(token), phrasalVerb.Root,
                                              StringComparison.InvariantCultureIgnoreCase);
                }
                else
                {
                    // match all particles
                    if (phrasalVerb.Particles.Count > particleToMatch)
                    {
                        var particle = phrasalVerb.Particles[particleToMatch];
                        var isMatch  = string.Equals(token, particle, StringComparison.InvariantCultureIgnoreCase);
                        if (isMatch)
                        {
                            particleToMatch++;
                            if (particleToMatch >= phrasalVerb.Particles.Count)
                            {
                                // we matched all particles
                                return(true);
                            }
                        }
                    }
                }
            }
            // if we get here, matching failed
            return(false);
        }
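A sketch of calling IsMatch, assuming PhrasalVerb exposes settable Root and Particles members (the initializer syntax below is an assumption about that type):

    var verb = new PhrasalVerb { Root = "give", Particles = new List<string> { "up" } };
    bool matched = IsMatch("He gave up smoking last year.", verb);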
Code example #11
        public TextGenerator(IFileReader inpReader, string RelationName)
        {
            this.RelationName = RelationName;
            Sentences         = new StringParser(inpReader).GetSentances;
            var stream     = File.OpenRead(AppDomain.CurrentDomain.BaseDirectory + @"\\full7z-mlteast-ru.lem");
            var lemmatizer = new Lemmatizer(stream);

            WordsAll = new List <string>();
            foreach (var Sentence in Sentences)
            {
                for (int i = 0; i < Sentence.Count; i++)
                {
                    Sentence[i] = lemmatizer.Lemmatize(Sentence[i]);
                    WordsAll.Add(Sentence[i]);
                }
            }
            WordsAll = WordsAll.Distinct(new PartialComparer()).ToList();
            var SentenceNum = 0;

            foreach (var Sentence in Sentences)
            {
                foreach (var Word in Sentence)
                {
                    recList.Add(new Record()
                    {
                        TransactId = SentenceNum, ItemId = WordsAll.FindIndex(a => a == Word), Item = Word
                    });
                }
                SentenceNum++;
            }
            //for (int i = 0; i < Sentences.Count; i++)
            //{
            //    for (int j = 0; j < Sentences[i].Count; j++)
            //    {
            //        recList.Add(new Record() { TransactId = i, ItemId = j, Item = Sentences[i][j] });
            //    }
            //}
        }
Code example #12
        public Payload RunPlugin(Payload Input)
        {
            Payload pData = new Payload();

            pData.FileID    = Input.FileID;
            pData.SegmentID = Input.SegmentID;


            for (int counter = 0; counter < Input.StringArrayList.Count; counter++)
            {
                string[] TextToLemmatize = Input.StringArrayList[counter];
                //make sure everything is lowercase
                TextToLemmatize = TextToLemmatize.Select(s => s.ToLowerInvariant()).ToArray();
                for (int i = 0; i < TextToLemmatize.Length; i++)
                {
                    TextToLemmatize[i] = Lemmatizer.Lemmatize(TextToLemmatize[i]);
                }
                pData.StringArrayList.Add(TextToLemmatize);
                pData.SegmentNumber.Add(Input.SegmentNumber[counter]);
            }

            return(pData);
        }
Code example #13
        public static string Lemmatize(string word)
        {
            // try the Russian lemmatizer first, then fall back to English;
            // the streams backing the lemmatizers are shared across calls,
            // so they are not disposed here
            try
            {
                return(ruLemmatizer.Lemmatize(word));
            }
            catch (Exception)
            {
                try
                {
                    return(enLemmatizer.Lemmatize(word));
                }
                catch
                {
                    Console.WriteLine("Something went wrong");
                }
            }

            return(string.Empty);
        }
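The method above references ruLemmatizer and enLemmatizer fields that it does not define; one way they might be initialized, reusing the model file names that appear in other examples on this page (a sketch, not the original code):

    private static readonly Lemmatizer ruLemmatizer =
        new Lemmatizer(File.OpenRead("full7z-mlteast-ru.lem"));
    private static readonly Lemmatizer enLemmatizer =
        new Lemmatizer(File.OpenRead("full7z-mlteast-en.lem"));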
Code example #14
File: wordmap.cs Project: qa1/wordmap
        public string lemmatize(string word)
        {
            Lemmatizer lemmatizer = new Lemmatizer();

            return(word.Length >= 5 ? lemmatizer.Lemmatize(word) : word);
        }
Code example #15
        /// <summary>
        /// Assigns a tag to each word.
        /// </summary>
        /// <param name="sentence">The source sentence.</param>
        /// <returns>Returns the sentence with a tag assigned to each
        /// word.</returns>
        /// <remarks>The algorithm first tries the simplest cases (a number
        /// or a punctuation mark), then checks the tag from the dictionary.
        /// If the tag is ambiguous, the ambiguity is resolved using trained
        /// classifiers.</remarks>
        public List <Lexem> Predict(IEnumerable <string> sentence)
        {
            List <WordForm> result = new List <WordForm>();
            int             i      = 0;

            foreach (var word in sentence)
            {
                string wordLower = word.ToLower();
                if (punctuation.Contains(wordLower))
                {
                    var n = new Lexem(i, wordLower, Tag.Punctuation);
                    n.Lemma = n.Word;
                    result.Add(n);
                }
                else if (serviceTags.ContainsKey(wordLower))
                {
                    var n = new Lexem(i, wordLower, serviceTags[wordLower]);
                    n.Lemma = n.Word;
                    result.Add(n);
                }
                else
                {
                    var n = new Lexem(i, wordLower, entClass.GetEntropyClass(wordLower));
                    n.Lemma = lemmatizer.Lemmatize(n.Word);
                    result.Add(n);
                }
                i++;
            }
            i = 0;
            foreach (var window in result.BuildAllWindows(7))
            {
                double v;
                if (double.TryParse(result[i].Word, out v)) //if it is a number
                {
                    result[i].Tag = Tag.Number;
                }
                else if (punctuation.Contains(result[i].Word)) //if it is punctuation
                {
                    result[i].Tag = Tag.Punctuation;
                }
                else //otherwise we need to use the model
                {
                    //start with the entropy class for this word
                    Tag resultingTag = result[i].Tag;
                    //for each attribute group, try to narrow the entropy class
                    foreach (var group in groups)
                    {
                        //if the current tag contains one or more elements of the group
                        if (group.ContainsGroup(resultingTag))
                        {
                            //if the current tag contains more than one element of the group
                            if (!group.ContainsOneFromGroup(resultingTag))
                            {
                                //convert the window into a vector and predict using the model
                                Tag predictedTag = group.Predict(window.ToVector(nGramm, entClass));
                                resultingTag &= ~group.FullGroup;
                                resultingTag |= predictedTag;
                            }
                        }
                    }
                    result[i].Tag = resultingTag;
                }
                i++; //move on to the next word
            }
            var res = new List <Lexem>();

            for (int j = 0; j < result.Count; j++)
            {
                res.Add(new Lexem(j, result[j].Word, result[j].Tag));
                res[j].Lemma = result[j].Lemma;
            }
            return(res);
        }
Code example #16
        // *** End of Dec-2011 ***

        public void Tag(Corpus corpus, out int lemmaCorrect, out int lemmaCorrectLowercase, out int lemmaWords, bool xmlMode)
        {
            DateTime startTime = DateTime.Now;

            mLogger.Debug("Tag", "Označujem besedilo ...");
            lemmaCorrect          = 0;
            lemmaCorrectLowercase = 0;
            lemmaWords            = 0;
            for (int i = 0; i < corpus.TaggedWords.Count; i++)
            {
                //mLogger.ProgressFast(Logger.Level.Info, /*sender=*/this, "Tag", "{0} / {1}", i + 1, corpus.TaggedWords.Count);
                //BinaryVector featureVector = corpus.GenerateFeatureVector(i, mFeatureSpace, /*extendFeatureSpace=*/false, mSuffixTree);
                //Prediction<string> result = mModel.Predict(featureVector);
                if ((corpus.TaggedWords[i].MoreInfo != null && corpus.TaggedWords[i].MoreInfo.Punctuation) ||
                    (corpus.TaggedWords[i].MoreInfo == null && mNonWordRegex.Match(corpus.TaggedWords[i].WordLower).Success)) // non-word
                {
                    /*bool flag = false;
                     * foreach (KeyDat<double, string> item in result)
                     * {
                     *  if (corpus.TaggedWords[i].Word == item.Dat || corpus.TaggedWords[i].Word + "<eos>" == item.Dat)
                     *  {
                     *      corpus.TaggedWords[i].Tag = item.Dat;
                     *      flag = true;
                     *      break;
                     *  }
                     * }
                     * if (!flag)
                     * {
                     *  corpus.TaggedWords[i].Tag = corpus.TaggedWords[i].Word;
                     * }*/
                }
                else // word
                {
                    string wordLower = corpus.TaggedWords[i].WordLower;
                    //Set<string> filter = mSuffixTree.Contains(wordLower) ? mSuffixTree.GetTags(wordLower) : null;
                    //result = ProcessResult(result, filter);//???!!!
                    string goldTag = corpus.TaggedWords[i].Tag;
                    string word    = corpus.TaggedWords[i].Word;
                    string rule;

                    /*if (filter == null)
                     * {
                     *  filter = Rules.ApplyTaggerRules(CreateFilterFromResult(result), word, out rule);
                     * }
                     * else
                     * {
                     *  filter = Rules.ApplyTaggerRules(filter, word, out rule);
                     *  if (filter.Count == 0) { filter = Rules.ApplyTaggerRules(CreateFilterFromResult(result), word, out rule); }
                     * }
                     * result = ProcessResult(result, filter);//???!!!
                     * string predictedTag;
                     * if (result.Count == 0)
                     * {
                     *  predictedTag = Rules.GetMostFrequentTag(wordLower, filter);
                     * }
                     * else
                     * {
                     *  predictedTag = result.BestClassLabel;
                     * }
                     * corpus.TaggedWords[i].Tag = predictedTag;*/
                    if (mLemmatizer != null)
                    {
                        string lemma;
                        lemma = /*mConsiderTags ? mLemmatizer.Lemmatize(wordLower, predictedTag) : */ mLemmatizer.Lemmatize(wordLower);
                        //lemma = Rules.FixLemma(lemma, corpus.TaggedWords[i].Word, predictedTag);
                        if (string.IsNullOrEmpty(lemma))
                        {
                            lemma = wordLower;
                        }
                        if (xmlMode)
                        {
                            lemmaWords++;
                            if (lemma == corpus.TaggedWords[i].Lemma)
                            {
                                lemmaCorrect++;
                            }
                            if (corpus.TaggedWords[i].Lemma != null && lemma.ToLower() == corpus.TaggedWords[i].Lemma.ToLower())
                            {
                                lemmaCorrectLowercase++;
                            }
                        }
                        corpus.TaggedWords[i].Lemma = lemma;
                    }
                }
            }
            TimeSpan span = DateTime.Now - startTime;

            mLogger.Debug("Tag", "Trajanje označevanja: {0:00}:{1:00}:{2:00}.{3:000}.", span.Hours, span.Minutes, span.Seconds, span.Milliseconds);
        }
Code example #17
        partial void FindWordsButton_TouchUpInside(UIButton sender)
        {
            string mytext = UserInput.Text;

            List <string> stopword = new List <string>();

            foreach (var line in File.ReadLines("stopword.txt"))
            {
                stopword.Add(line);
            }



            //string[] stopword = new string[] { "user", "a", "she", "should", "few", "what", "their", "on", "this", "is" };
            int count_it = 0;
            //foreach (string r_t in raw_text_string)
            //string rt = mytext;
            List <double> predicted_labels = new List <double>();
            string        hate_speech_var  = "NOT OFFENSIVE";
            double        prob             = 0.0;

            {
                count_it += 1;
                Console.WriteLine(count_it.ToString());
                string raw_text   = mytext;
                string input_text = "a " + raw_text;

                //NOTE:
                // 1. If the input_text starts with @ as its first character, the Xamarin Natural Language namespace based Tokenizer breaks
                // ERROR: SecTaskLoadEntitlements failed error=22 cs_flags=200, pid=30250
                // ERROR: SecTaskCopyDebugDescription: MarsHabitatPrice[30250]/0#-1 LF=0
                // To tackle this, add a character 'a' in front of the string which will become a separate token and will be removed in stopwords (or remove explicitly)

                //causes the escape sequence to be ignored


                //STAGE 1: Tokenization + Lowercase + Remove Punctuation
                // TODO: Make sure that apostrophes remaining inside a word due to weird tokenization are removed in the stopword stage
                // TODO: Check Apple's documentation/ github repo on this issue
                //    def tokenize(self):
                //      from nltk import word_tokenize
                //      for i, tweet in tqdm(enumerate(self.data), 'Tokenization'):
                //      self.data[i] = word_tokenize(tweet.lower())
                //      return self.data
                //NSValue[] tokens;
                Console.WriteLine("**START**");

                string[] tokens_str = null;

                if (!String.IsNullOrWhiteSpace(input_text))
                {
                    var tokenizer = new NLTokenizer(NLTokenUnit.Word); // or could be NLTokenUnit.Sentence for sentence tokenization

                    tokenizer.String = input_text;
                    var range = new NSRange(0, input_text.Length);
                    //[P]Console.WriteLine("RANGE:");
                    //[P]Console.WriteLine(range);



                    NSValue[] tokens = tokenizer.GetTokens(range); // Returns Array of NSValue Objects, each wrapping an NSRange value
                                                                   //Attempting to print the string from an individual NSValue Object

                    //Attempt 1: Try to Directly convert using ToString method- FAILED
                    //Console.WriteLine("1) Tokens: ");
                    //Console.WriteLine(tokens[0].ToString()); // Output: NSRange: {0, 3}


                    //Attempt 2: Since Previous attempt gives an NSRange value, try to convert that again using ToString()- FAILED
                    //Console.WriteLine("2) Tokens: ");
                    //Console.WriteLine(tokens[0].ToString().ToString()); // Output: NSRange: {0, 3}


                    //Attempt 3: Since, NSRange = structure used to describe a portion of a series, such as characters in a string, we
                    // try to extract the RangeValue from NSValue Object and use its location and length to get a substring out of joined array of tokens - PASSED
                    //Console.WriteLine("3) Tokens: ");

                    //NSRange rr = tokens[0].RangeValue; // Extract Range Value
                    //string s = input_text.Substring((int)rr.Location, (int)rr.Length);  // Error: Object reference not set to an instance of an object
                    //Console.WriteLine(s);

                    tokens_str = new string[tokens.Length];

                    for (int i = 0; i < tokens.Length; i++)
                    {
                        NSRange rr = tokens[i].RangeValue; // Extract Range Value
                        string  s  = input_text.Substring((int)rr.Location, (int)rr.Length);
                        tokens_str[i] = s.ToLower();
                        //[P]Console.WriteLine(s);
                    }


                    //NSRange rr = tokens[0].RangeValue; // Extract Range Value
                    //string s = input_text.Substring((int)rr.Location, (int)rr.Length);  // Error: Object reference not set to an instance of an object
                    //Console.WriteLine(s);



                    //NSRange rr = tokens[0].RangeValue;
                    //string s = Text.Substring((int)rr.Location, (int)rr.Length);

                    //string temp11;

                    //[P]for (int i = 0; i < tokens_str.Length; i++)
                    //[P]{
                    //[P]    temp11 = tokens_str[i] + " ";
                    //[P]Console.Write(temp11);
                    //[P]}
                }
                Console.WriteLine("**END**");



                //EXPECTED OUTPUT:
                // ['@', 'user', 'she', 'should', 'ask', 'a', 'few', 'native', 'americans', 'what', 'their', 'take', 'on', 'this', 'is', '.']
                // ['@', 'user', '@', 'user', 'go', 'home', 'you', '’', 're', 'drunk', '!', '!', '!', '@', 'user', '#', 'maga', '#', 'trump2020', '👊🇺🇸👊', 'url']
                // ['amazon', 'is', 'investigating', 'chinese', 'employees', 'who', 'are', 'selling', 'internal', 'data', 'to', 'third-party', 'sellers', 'looking', 'for', 'an', 'edge', 'in', 'the', 'competitive', 'marketplace', '.', 'url', '#', 'amazon', '#', 'maga', '#', 'kag', '#', 'china', '#', 'tcot']


                //ACTUAL OUTPUT:
                // a user she should ask a few native americans what their take on this is
                // a user user go home you’re drunk user maga trump2020 👊 🇺🇸 👊 url
                // a amazon is investigating chinese employees who are selling internal data to third party sellers looking for an edge in the competitive marketplace url amazon maga kag china tcot


                //TODO: TEST THE TOKENIZER EXTENSIVELY AND GET BUGS


                //STAGE 2: Stop-Word Removal

                //def remove_stopwords(self):
                //  from nltk.corpus import stopwords
                //  import re
                //  stop = set(stopwords.words("english"))
                //  noise = ['user']
                //  for i, tweet in tqdm(enumerate(self.data), 'Stopwords Removal'):
                //      self.data[i] = [w for w in tweet if w not in stop and not re.match(r"[^a-zA-Z\d\s]+", w) and w not in noise]
                //  return self.data



                var stopword_set = new HashSet <string>(stopword);  //Hashset of Stopwords

                // Let t_tokens_str be the tokenized version of the string

                //string[] t_tokens_str = new string[] { "amazon","are","investigating","who","chinese","is",".","who"};

                List <string> stop_tokens_str = new List <string>();

                foreach (string word in tokens_str)
                {
                    if (stopword_set.Contains(word))
                    {
                        continue;
                    }
                    else
                    {
                        stop_tokens_str.Add(word);
                    }
                }
                Console.WriteLine("AFTER STOPWORD REMOVAL");

                //[P]foreach (string word in stop_tokens_str)
                //[P]Console.WriteLine(word);


                //StringBuilder input = new StringBuilder("Did you try this yourself before asking");
                //foreach (string word in tokens_str)
                //{
                //    if word in stopword
                //}
                //Console.WriteLine(input);

                //EXPECTED OUTPUT:
                // ['ask', 'native', 'americans', 'take'],
                // ['go', 'home', 'drunk', 'maga', 'trump2020', 'url'],
                // ['amazon', 'investigating', 'chinese', 'employees', 'selling', 'internal', 'data', 'third-party', 'sellers', 'looking', 'edge', 'competitive', 'marketplace', 'url', 'amazon', 'maga', 'kag', 'china', 'tcot']

                //ACTUAL OUTPUT:
                // ['ask', 'native', 'americans', 'take']

                //STAGE 3: Lemmatization

                //def lemmatize(self):
                //    from nltk.stem import WordNetLemmatizer
                //    wnl = WordNetLemmatizer()
                //    for i, tweet in tqdm(enumerate(self.data), 'Lemmatization'):
                //        for j, word in enumerate(tweet):
                //            self.data[i][j] = wnl.lemmatize(word, pos = self.get_pos(word))
                //    return self.data

                var currentDirectory = Directory.GetCurrentDirectory();


                //string[] fileArray = Directory.GetFiles(currentDirectory);


                //foreach (string f in fileArray)
                //{
                //    Console.WriteLine();
                //}

                var dataFilePath = string.Format("{0}/{1}", currentDirectory, "full7z-mlteast-en.lem"); // maybe add @



                //Console.WriteLine(File.Exists(dataFilePath) ? "File exists." : "File does not exist.");

                //var path = "Resources/full7z-mlteast-en.lem";
                var stream     = File.OpenRead(dataFilePath);
                var lemmatizer = new Lemmatizer(stream); //Load Lemmatizer with the given dataFilePath



                List <string> lemma_tokens_str = new List <string>();

                foreach (string word in stop_tokens_str)
                {
                    var result2 = lemmatizer.Lemmatize(word);
                    lemma_tokens_str.Add(result2);
                }
                Console.WriteLine("AFTER LEMMATIZATION");

                //[P]foreach (string word in lemma_tokens_str)
                //[P]Console.WriteLine(word);



                //EXPECTED OUTPUT:
                // ['ask', 'native', 'american', 'take']
                // ['go', 'home', 'drunk', 'maga', 'trump2020', 'url']
                // ['amazon', 'investigate', 'chinese', 'employee', 'sell', 'internal', 'data', 'third-party', 'seller', 'look', 'edge', 'competitive', 'marketplace', 'url', 'amazon', 'maga', 'kag', 'china', 'tcot']


                //ACTUAL OUTPUT:
                // ['ask','native','american','take']


                // TODO: EDGE CASES- OOV WORDS
                // STAGE 4: Count-Vectorization
                //FileStream meta_stream = File.Open("metadata_length_tweet_size_vocab.txt", FileMode.Open);


                //INPUT: lemma_tokens_str which is List<string>

                int len_tweet  = 0;
                int size_vocab = 0;


                // Tweet Length and Size of Vocab
                foreach (var line in File.ReadLines("metadata_length_tweet_size_vocab.txt"))
                {
                    string[] temp = line.Split(" ");
                    len_tweet  = Convert.ToInt32(temp[0]);
                    size_vocab = Convert.ToInt32(temp[1]);
                }

                // Vocab Mapping

                var word_code = new Dictionary <string, int>();

                string temp_word = null;
                int    temp_code = 0;

                foreach (var line in File.ReadLines("vocab_mapping.txt"))
                {
                    string[] temp2 = line.Split(" ");
                    temp_word = temp2[0];
                    temp_code = Convert.ToInt32(temp2[1]);
                    if (word_code.ContainsKey(temp_word))
                    {
                        continue;
                    }
                    else
                    {
                        word_code.Add(temp_word, temp_code);
                    }
                }

                //[P]foreach (string key in word_code.Keys)
                //[P]{
                //[P]Console.WriteLine(String.Format("{0}: {1}", key, word_code[key]));
                //[P]}

                // Vocab Mapping has been loaded to a dictionary


                // Now create vectors corresponding to each tweet


                //Convert list of string to mapped double array

                double[] example      = new double[len_tweet];//Final Array automatically assigned to 0
                int      k            = 0;
                int      m            = 0;
                string   possible_key = null;
                while (k < lemma_tokens_str.Count)
                {
                    //[P]Console.Write("lemma_tokens_str.Count: ");
                    //[P]Console.WriteLine(lemma_tokens_str.Count); //21

                    //[P]Console.Write("k: ");
                    //[P]Console.WriteLine(k); //3

                    //[P]foreach (var ss2 in lemma_tokens_str)
                    //[P]{
                    //[P]Console.Write(ss2);
                    //[P]}

                    //Console.WriteLine("\n Posiala a");
                    possible_key = lemma_tokens_str[k];
                    //[P]Console.WriteLine("\n Possible Key:");
                    //[P]Console.WriteLine(possible_key);
                    if (word_code.ContainsKey(possible_key))
                    {
                        example[m] = word_code[lemma_tokens_str[k]];
                        m         += 1;
                        k         += 1;
                    }
                    else
                    {
                        k += 1;
                        continue;
                    }
                }

                //[P]foreach(var vv in example)
                //[P]{
                //[P]Console.WriteLine(vv);
                //[P]}



                //*****PREPROCESSING ENDS HERE*****//


                // Initialize MLMultiArray
                //Swift syntax doesn't work
                //MLMultiArray temp1 = MLMultiArray(shape:[1, 44], MLMultiArrayDataType: MLMultiArrayDataType.double);

                //double[] example = {6620,  1912,  9987,  4577, 10130, 13048,  5191, 10897,   208, 13091,  9104,    0,
                //     0,     0,     0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
                //     0,     0,     0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
                //     0,     0,     0,     0,     0,     0,     0,     0 };

                //TODO: Figure out how to convert 'example' into a MLMultiArray called 'temp1'


                //******START TODO*******//

                nint[] ns_sum = new nint[] { 44, 1, 1 }; // Try to exchange dimensions

                //MLMultiArray temp1 = new MLMultiArray(ns_sum, MLMultiArrayDataType.Double, out NSError error3);
                //for (int i = 0; i < 44; i++)
                //{
                // Convert each example[i] to NSNumber
                //temp1.SetObject(new NSNumber(example[i]),i);

                //Console.WriteLine(i);
                //}
                //var example = new double[] {6620,  1912,  9987,  4577, 10130, 13048,  5191, 10897,   208, 13091,  9104,    0,
                //     0,     0,     0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
                //     0,     0,     0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
                //     0,     0,     0,     0,     0,     0,     0,     0 };

                MLMultiArray temp1 = new MLMultiArray(ns_sum, MLMultiArrayDataType.Double, out NSError error3);

                var narray = new NSNumber[example.Length];
                for (int i = 0; i < narray.Length; i++)
                {
                    narray[i] = NSNumber.FromDouble(example[i]);
                }

                for (int i = 0; i < narray.Length; i++)
                {
                    temp1.SetObject(narray[i], i);
                }


                //MLMultiArray temp1 = new MLMultiArray(narray, MLMultiArrayDataType.Double, out NSError error3);


                //******END TODO*******//
                Console.WriteLine("EXAMPLE-OUTPUT");



                var hate_coremlOutput = this.hate_model.GetPrediction(temp1, out NSError error2);


                if (error3 != null)
                {
                    throw new Exception("Unexpected runtime error.");
                }

                if (error2 != null)
                {
                    throw new Exception("Error with Hate Model during Runtime.\n");
                }

                var hate_prob = hate_coremlOutput.Output1;


                Console.WriteLine("Hate Probability Score: ");


                Console.WriteLine(hate_prob);

                //            List<double> predicted_labels = new List<double>();
                //              string hate_speech_var = "NOT HATE SPEECH";

                predicted_labels.Add(hate_prob[0].DoubleValue);
                predicted_labels.Add(hate_prob[1].DoubleValue);

                if (hate_prob[0].DoubleValue > hate_prob[1].DoubleValue)
                {
                    hate_speech_var = "OFFENSIVE";
                    prob            = hate_prob[0].DoubleValue;
                }
                else
                {
                    prob = hate_prob[1].DoubleValue;
                }
            }



            string title_text        = "TEXT IS " + hate_speech_var;
            string body_text         = "Probability of being " + hate_speech_var + " is " + prob.ToString();
            var    okAlertController = UIAlertController.Create(title_text, body_text, UIAlertControllerStyle.Alert);

            //Add Action
            okAlertController.AddAction(UIAlertAction.Create("OK", UIAlertActionStyle.Default, null));

            // Present Alert
            PresentViewController(okAlertController, true, null);
        }
Code example #18
        static void Main(string[] args)
        {
            // Create readable file
            var currentDirectory = Directory.GetCurrentDirectory();
            var dataFilePath     = $"{currentDirectory}/Data/Custom/full7z-mlteast-en-modified.lem";

            using (var fstream = File.OpenRead(dataFilePath))
            {
                var lemmatizer = new Lemmatizer(fstream);

                // add examples
                var examples = new List <Tuple <string, string> >()
                {
                    /*new Tuple<string,string>("acting","act"),
                    *  new Tuple<string,string>("balled","ball"),
                    *  new Tuple<string,string>("balled","ball"),
                    *  new Tuple<string,string>("ballsed","balls"),
                    *  new Tuple<string,string>("bogged","bog"),
                    *  new Tuple<string,string>("bottomed","bottom"),
                    *  new Tuple<string,string>("bounced","bounce"),
                    *  new Tuple<string,string>("boxed","box"),
                    *  new Tuple<string,string>("brought","bring"),
                    *  new Tuple<string,string>("cashed","cash"),
                    *  new Tuple<string,string>("clouded","cloud"),
                    *  new Tuple<string,string>("cozied","cozy"),
                    *  new Tuple<string,string>("divided","divide"),
                    *  new Tuple<string,string>("felt","feel"),
                    *  new Tuple<string,string>("fiddling","fiddle"),
                    *  new Tuple<string,string>("fishing","fish"),
                    *  new Tuple<string,string>("fleshed","flesh"),
                    *  new Tuple<string,string>("fobbed","fob"),
                    *  new Tuple<string,string>("following","follow"),
                    *  new Tuple<string,string>("homing","home"),
                    *  new Tuple<string,string>("hunkered","hunker"),
                    *  new Tuple<string,string>("leveled","level"),
                    *  new Tuple<string,string>("laid","lay"),
                    *  new Tuple<string,string>("limbered","limber"),
                    *  new Tuple<string,string>("livened","liven"),
                    *  new Tuple<string,string>("livened","liven"),
                    *  new Tuple<string,string>("loaded","load"),
                    *  new Tuple<string,string>("magicked","magic"),
                    *  new Tuple<string,string>("messing","mess"),
                    *  new Tuple<string,string>("meted","mete"),
                    *  new Tuple<string,string>("mouthing","mouth"),
                    *  new Tuple<string,string>("perked","perk"),
                    *  new Tuple<string,string>("pootling","pootle"),
                    *  new Tuple<string,string>("sacked","sack"),
                    *  new Tuple<string,string>("screwing","screw"),
                    *  new Tuple<string,string>("sexed","sex"),
                    *  new Tuple<string,string>("shacked","shack"),
                    *  new Tuple<string,string>("speeded","speed"),
                    *  new Tuple<string,string>("spirited","spirit"),
                    *  new Tuple<string,string>("started","start"),
                    *  new Tuple<string,string>("stove","stave"),
                    *  new Tuple<string,string>("swung","swing"),
                    *  new Tuple<string,string>("teed","tee"),
                    *  new Tuple<string,string>("tired","tire"),
                    *  new Tuple<string,string>("used","use"),
                    *  new Tuple<string,string>("vacuumed","vacuum"),
                    *  new Tuple<string,string>("whiled","while"),
                    *  new Tuple<string,string>("wigged","wig"),
                    *  new Tuple<string,string>("zoned","zone"),*/
                    new Tuple <string, string>("don't", "do"),
                    new Tuple <string, string>("doesn't", "do"),
                    new Tuple <string, string>("didn't", "did"),
                    new Tuple <string, string>("won't", "will"),
                    new Tuple <string, string>("shan't", "shall"),
                    new Tuple <string, string>("can't", "can"),
                    new Tuple <string, string>("couldn't", "could"),
                    new Tuple <string, string>("wouldn't", "would"),
                    new Tuple <string, string>("shouldn't", "should"),
                    new Tuple <string, string>("mustn't", "must"),
                    new Tuple <string, string>("mightn't", "might"),
                    new Tuple <string, string>("oughtn't", "ought"),
                    new Tuple <string, string>("needn't", "need"),
                    new Tuple <string, string>("aren't", "are"),
                    new Tuple <string, string>("isn't", "be"),
                    new Tuple <string, string>("wasn't", "be"),
                    new Tuple <string, string>("weren't", "be"),
                    new Tuple <string, string>("haven't", "have"),
                    new Tuple <string, string>("hasn't", "have"),
                    new Tuple <string, string>("hadn't", "have"),
                    new Tuple <string, string>("'s", "'s"),
                    new Tuple <string, string>("'ve", "have"),
                    new Tuple <string, string>("'m", "be"),
                    new Tuple <string, string>("'re", "be"),
                    new Tuple <string, string>("'ll", "will"),
                };
                foreach (var example in examples)
                {
                    var lemma = lemmatizer.Lemmatize(example.Item1);
                    Console.WriteLine("{0} --> {1} {2}", example.Item1, lemma, lemma != example.Item2 ? ("!= " + example.Item2):"");
                }
            }


            Console.WriteLine("==========");
            Console.WriteLine("OK");
            Console.ReadLine();
        }
Code example #19
        /*public bool IsMatch(string sentence, PhrasalVerb phrasalVerb)
         * {
         *  var tokens = tokenizer.Tokenize(sentence);
         *  var pv = MatchingPhrasalVerbs(sentence, new List<PhrasalVerb>() {phrasalVerb});
         *  return pv.Any();
         * }*/

        /*public List<PhrasalVerb> MatchingPhrasalVerbs(string sentence, List<PhrasalVerb> phrasalVerbs)
         * {
         *  // tokenize sentence
         *  var tokens = tokenizer.Tokenize(sentence);
         *  var taggedWords = tagger.Tag(tokens)/*.Where(t => Regex.IsMatch(t, "[A-Z]+")).ToList()#1#;
         *  // create parse tree
         *  var parse = parser.DoParse(tokens);
         *  // retrieve dependencies
         *  var dependencies = ComputeDependencies(parse).ToList();
         *
         *  // compute matching phrasal verbs
         *  var matchingPhrasalVerbs = new List<PhrasalVerb>();
         *  foreach (var phrasalVerb in phrasalVerbs)
         *  {
         *      // get relevant dependencies found
         *      var parts = phrasalVerb.Name.Split(' ');
         *      var root = parts.First();
         *      // find dependencies for this root
         *      var relevantDepedencies = dependencies
         *          .Where(
         *              d =>
         *                  ((string.Equals(root, lemmatizer.Lemmatize(d.Gov().GetWord()),
         *                      StringComparison.InvariantCultureIgnoreCase) && d.Gov().Index() < d.Dep().Index())
         *                  ||
         *                  (string.Equals(root, lemmatizer.Lemmatize(d.Dep().GetWord()),
         *                      StringComparison.InvariantCultureIgnoreCase) && d.Dep().Index() < d.Gov().Index()))
         *                 && (!phrasalVerb.Inseparable || Math.Abs(d.Dep().Index() - d.Gov().Index()) == 1)
         *                     // for non separable verbs
         *                 && (!phrasalVerb.SeparableMandatory || Math.Abs(d.Dep().Index() - d.Gov().Index()) > 1)
         *         // for separable mandatory verbs
         *         //&& d.Gov().Index() >= 1 && IsVerb(taggedWords[d.Gov().Index() - 1])
         *         )
         *         .ToList();
         *
         *     // We take only the 2nd part
         *     // For phrasal verbs with several particles, that's a good approximation for now
         *     // (we could check that all the particles are also linked)
         *     if (relevantDepedencies.Any() && parts.Count() > 1)
         *     {
         *         var particle1 = parts[1];
         *         var prtDependencies = relevantDepedencies.Where(d => d.Reln().GetShortName() == "prt").ToList();
         *         if (prtDependencies.Any())
         *         {
         *             // if root has a prt dependency, don't look at other relations
         *             if (prtDependencies
         *                 .Any(d => string.Equals(particle1, d.Dep().GetWord(), StringComparison.InvariantCultureIgnoreCase)
         *                           || string.Equals(particle1, d.Gov().GetWord(), StringComparison.InvariantCultureIgnoreCase)))
         *             {
         *                 matchingPhrasalVerbs.Add(phrasalVerb);
         *             }
         *         }
         *         else
         *         {
         *             // otherwise, look at all the other relations
         *             var relevantRelationships = relevantDepedencies
         *                 .Where(d => string.Equals(particle1, d.Dep().GetWord(), StringComparison.InvariantCultureIgnoreCase)
         *                             || string.Equals(particle1, d.Gov().GetWord(), StringComparison.InvariantCultureIgnoreCase))
         *                 .ToList();
         *             if (relevantRelationships.Any())
         *             {
         *                 matchingPhrasalVerbs.Add(phrasalVerb);
         *             }
         *         }
         *     }
         * }
         * return matchingPhrasalVerbs;
         *}
         *
         *private IEnumerable<TypedDependency> ComputeDependencies(Parse parse)
         *{
         * // Extract dependencies from lexical tree
         * var tlp = new PennTreebankLanguagePack();
         * var gsf = tlp.GrammaticalStructureFactory();
         * var tree = new ParseTree(parse);
         * try
         * {
         *     var gs = gsf.NewGrammaticalStructure(tree);
         *     return gs.TypedDependencies();
         * }
         * catch (Exception)
         * {
         *     Console.WriteLine("Exception when computing deps for {0}", parse);
         *     return new List<TypedDependency>();
         * }
         *}*/

        public List <PhrasalVerb> MatchingPhrasalVerbs(string sentence, List <PhrasalVerb> phrasalVerbs)
        {
            // tokenize sentence
            var tokens = tokenizer.Tokenize(sentence);
            // create parse tree
            var parse = parser.DoParse(tokens);
            // retrieve dependencies
            var dependencies = ComputeDependencies(parse).ToList();

            var matchingPhrasalVerbs = new List <PhrasalVerb>();

            foreach (var phrasalVerb in phrasalVerbs)
            {
                // get relevant dependencies found
                var parts = phrasalVerb.Name.Split(' ').ToList();
                var root  = parts.First();
                // find dependencies for this root
                var rootRelatedDependencies = dependencies
                                              .Where(d => // the (lemmatized) token must be equal to the gov/dep of the dependency
                                                     ((string.Equals(root, lemmatizer.Lemmatize(d.Gov().GetWord()), StringComparison.InvariantCultureIgnoreCase) &&
                                                       d.Gov().Index() < d.Dep().Index()) ||
                                                      (string.Equals(root, lemmatizer.Lemmatize(d.Dep().GetWord()), StringComparison.InvariantCultureIgnoreCase) &&
                                                       d.Dep().Index() < d.Gov().Index()))
                                                     // if the phrasal verb is inseparable, no word must be between the root and the particle
                                                     && (!phrasalVerb.Inseparable.HasValue || (!phrasalVerb.Inseparable.Value || Math.Abs(d.Dep().Index() - d.Gov().Index()) == 1))
                                                     // if the phrasal verb is mandatory-separable, at least one word must be between the root and the particle
                                                     && (!phrasalVerb.SeparableMandatory.HasValue || (!phrasalVerb.SeparableMandatory.Value || Math.Abs(d.Dep().Index() - d.Gov().Index()) > 1))
                                                     )
                                              .ToList();

                // We take only the 2nd part
                // For phrasal verbs with several particles, that's a good approximation for now
                // (we could check that all the particles are also linked)
                if (rootRelatedDependencies.Any() && parts.Count() > 1)
                {
                    var particle1            = parts[1];
                    var relevantDependencies = rootRelatedDependencies.Where(d => d.Reln().GetShortName() == "prt").ToList();
                    if (!relevantDependencies.Any())
                    {
                        // if no "prt" relation, take all relations whatsoever.
                        relevantDependencies = rootRelatedDependencies;
                    }

                    // if one of the relevant dependencies has the particle as gov/dep, it's good!
                    var rootParticle1Dependency = relevantDependencies
                                                  .FirstOrDefault(d => string.Equals(particle1, d.Dep().GetWord(), StringComparison.InvariantCultureIgnoreCase) ||
                                                                  string.Equals(particle1, d.Gov().GetWord(), StringComparison.InvariantCultureIgnoreCase));
                    if (rootParticle1Dependency != null && !AreWordSeparatedInSentence(rootParticle1Dependency, dependencies))
                    {
                        var remainingParts = parts.Skip(2).ToList();
                        var lastTokenIndex = Math.Max(rootParticle1Dependency.Gov().Index(), rootParticle1Dependency.Dep().Index()) - 1;

                        var endOfSentenceTokens = tokens.Skip(lastTokenIndex + 1).ToList();
                        if (endOfSentenceTokens.Any())
                        {
                            for (var i = 0; i < endOfSentenceTokens.Count; i++)
                            {
                                if (i < remainingParts.Count)
                                {
                                    if (!string.Equals(remainingParts[i], endOfSentenceTokens[i],
                                                       StringComparison.InvariantCultureIgnoreCase))
                                    {
                                        // no match, get out of the loop
                                        break;
                                    }
                                }
                                else
                                {
                                    // all the remaining parts were included in the remaining tokens --> OK
                                    matchingPhrasalVerbs.Add(phrasalVerb);
                                }
                            }
                        }
                        else
                        {
                            // if there is no remaining parts, the phrasal verb matches
                            if (!remainingParts.Any())
                            {
                                matchingPhrasalVerbs.Add(phrasalVerb);
                            }
                        }


                        /*if (parts.Count <= 2)
                         * {
                         *  // phrasal verb has 1 particle only; we're done
                         *  matchingPhrasalVerbs.Add(phrasalVerb);
                         * }
                         * else
                         * {
                         *  // otherwise, check that the other particles are in the sentence (approximation)
                         *  var lastTokenIndex = Math.Max(rootParticle1Dependency.Gov().Index(), rootParticle1Dependency.Dep().Index()) - 1;
                         *  var endOfSentenceTokens = tokens.Skip(lastTokenIndex).ToList();
                         *  if (parts.Skip(2).All(endOfSentenceTokens.Contains))
                         *  {
                         *      matchingPhrasalVerbs.Add(phrasalVerb);
                         *  }
                         * }*/
                    }
                }
            }

            return(matchingPhrasalVerbs);
        }
Code example #20
 private string[] GetWords(string[] requirementsTokens)
 {
     return(_filter.Filter(requirementsTokens).Select(word => _lemmatizer.Lemmatize(word)).ToArray());
 }
Code example #21
 public string Lemmatize(string word)
 {
     return(lemmatizer.Lemmatize(word));
 }
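This wrapper assumes a lemmatizer field initialized elsewhere; a minimal hosting class might look like the following sketch (the constructor taking a model stream is an assumption, in line with the other examples on this page):

    public class LemmaService
    {
        private readonly Lemmatizer lemmatizer;

        public LemmaService(Stream modelStream)
        {
            // load the lemmatizer once and reuse it for every word
            lemmatizer = new Lemmatizer(modelStream);
        }

        public string Lemmatize(string word)
        {
            return lemmatizer.Lemmatize(word);
        }
    }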