public void LemmatizeTest()
{
    // Exercises the Persian lemmatizer on a fixed set of words.
    Lemmatizer lemmatizer = new Lemmatizer();

    // Pairs of { surface form, expected lemma }, lemmatized without a POS hint.
    var plainCases = new[]
    {
        new[] { "کتابها", "کتاب" },
        new[] { "آتشفشان", "آتشفشان" },
        new[] { "میروم", "رفت#رو" },
        new[] { "گفته شده است", "گفت#گو" },
    };

    foreach (var pair in plainCases)
    {
        string actual = lemmatizer.Lemmatize(pair[0]);
        Assert.AreEqual(pair[1], actual, "Failed to lematize of '" + pair[0] + "' word");
    }

    // Ambiguous word: lemmatize with an explicit noun ("N") POS tag.
    string nounWord = "مردم";
    string nounActual = lemmatizer.Lemmatize(nounWord, "N");
    Assert.AreEqual("مردم", nounActual, "Failed to lematize of '" + nounWord + "' word");
}
public void LemmatizeTest()
{
    // Data-driven check of the Persian lemmatizer; a null POS entry means
    // the single-argument Lemmatize overload is used.
    Lemmatizer lemmatizer = new Lemmatizer();

    string[] inputs = { "کتابها", "آتشفشان", "میروم", "گفته شده است", "نچشیده است", "مردم", "اجتماعی" };
    string[] expecteds = { "کتاب", "آتشفشان", "رفت#رو", "گفت#گو", "چشید#چش", "مردم", "اجتماعی" };
    string[] posTags = { null, null, null, null, null, "N", "AJ" };

    for (int i = 0; i < inputs.Length; i++)
    {
        string word = inputs[i];
        string tag = posTags[i];

        string actual = tag == null
            ? lemmatizer.Lemmatize(word)
            : lemmatizer.Lemmatize(word, tag);

        Assert.AreEqual(expecteds[i], actual, "Failed to lematize of '" + word + "' word");
    }
}
public Dictionary<string, int> CalcWordCount(string[] words)
{
    // Counts occurrences of each word. Words of length >= 5 are reduced to
    // their lemma first, so inflected variants share one counter.
    Dictionary<string, int> wordCount = new Dictionary<string, int>();
    Lemmatizer lemmatize = new Lemmatizer();

    foreach (string word in words)
    {
        // Short words are counted verbatim; longer ones are lemmatized.
        string key = word.Length >= 5 ? lemmatize.Lemmatize(word) : word;

        // Single-lookup increment: TryGetValue leaves count at 0 for a new
        // key, avoiding the previous ContainsKey + indexer double lookup.
        int count;
        wordCount.TryGetValue(key, out count);
        wordCount[key] = count + 1;
    }

    return wordCount;
}
private static void AddExampleOrException(Lemmatizer lemmatizer, string word, string lemma)
{
    // Nothing to do when the lemmatizer already produces the expected lemma.
    if (lemmatizer.Lemmatize(word) == lemma)
    {
        return;
    }

    // Teach the lemmatizer this (word, lemma) pair; it may deduce a rule.
    lemmatizer.AddExample(word, lemma);

    // If the example alone was not enough, register a hard exception.
    if (lemmatizer.Lemmatize(word) != lemma)
    {
        Console.WriteLine("Added lemma exception: {0} -> {1}", word, lemma);
        lemmatizer.AddException(word, lemma);
    }
}
private static List<string> GetVocabulary(string[] sentences, out List<List<string>> lemmalizeWords, int vocabularyThreshold)
{
    // Builds the vocabulary: every lemmatized term occurring at least
    // vocabularyThreshold times across all sentences. Also returns, per
    // sentence, the list of lemmas (including empty ones for stripped tokens).
    List<string> vocabulary = new List<string>();
    lemmalizeWords = new List<List<string>>();
    Dictionary<string, int> tFrequency = new Dictionary<string, int>();

    // BUGFIX: the model stream used to be leaked; dispose it when done.
    using (var stream = File.OpenRead(path))
    {
        var lemmatizer = new Lemmatizer(stream);

        foreach (var doc in sentences)
        {
            List<string> tokenizedWords = Tokenize(doc);
            List<string> lemmalizeWord = new List<string>();

            foreach (string part in tokenizedWords)
            {
                // Strip non-alphanumeric characters.
                string stripped = Regex.Replace(part, "[^a-zA-Z0-9]", "");
                string filteredLine = StopwordTool.RemoveStopwords(stripped);
                string stem = lemmatizer.Lemmatize(filteredLine);
                lemmalizeWord.Add(stem);

                if (stem.Length > 0)
                {
                    // BUGFIX: counts used to start at 0 (Add(stem, 0)), so every
                    // term's frequency was one less than its real occurrence
                    // count and the threshold filter below was off by one.
                    int count;
                    tFrequency.TryGetValue(stem, out count);
                    tFrequency[stem] = count + 1;
                }
            }

            lemmalizeWords.Add(lemmalizeWord);
        }
    }

    // Keep only the terms that reach the frequency threshold.
    foreach (var item in tFrequency.Where(w => w.Value >= vocabularyThreshold))
    {
        vocabulary.Add(item.Key);
    }

    return vocabulary;
}
private static void AddExampleOrException(Lemmatizer lemmatizer, string word, string lemma)
{
    // See what the lemmatizer currently produces for this word.
    var firstAttempt = lemmatizer.Lemmatize(word);
    if (firstAttempt == lemma)
    {
        return; // already correct, nothing to teach
    }

    // Feed the pair as a training example; the lemmatizer may derive a new
    // rule from it and succeed, or it may still fail.
    lemmatizer.AddExample(word, lemma);

    // Re-check: if the deduced rule still fails, fall back to a hard exception.
    var secondAttempt = lemmatizer.Lemmatize(word);
    if (secondAttempt != lemma)
    {
        Console.WriteLine("Added lemma exception: {0} -> {1}", word, lemma);
        lemmatizer.AddException(word, lemma);
    }
}
public ActionResult<IEnumerable<string>> Get()
{
    // Lemmatizes a hard-coded Russian sample text word by word and returns
    // the lemmatized text as a single-element array.
    // NOTE(review): the model path is a hard-coded user directory — move it
    // to configuration.
    var sb = new StringBuilder();
    var stream = new FileStream(@"C:\Users\D.Pugach\Downloads\full7z-mlteast-ru.lem", FileMode.Open);

    using (stream)
    {
        var allText = "Вазомоторный ринит что это такое Среди многочисленных видов ринита (насморка) эта патология занимает особое место, поскольку этиология ее возникновения до сих пор до конца не изучена. Вазомоторный ринит, чаще всего поражающий людей старше 20 лет, является заболеванием, которое может протекать в виде";
        var lemmatizer = new Lemmatizer(stream);

        // BUGFIX: a whole-text Lemmatize(allText.ToLower()) call whose result
        // was discarded has been removed; it did nothing useful.
        foreach (var word in allText.Split(' '))
        {
            sb.Append(lemmatizer.Lemmatize(word)).Append(" ");
        }

        Console.WriteLine(sb.ToString());
    }

    return new string[] { sb.ToString() };
}
private void countWords(string scrubbedChat, Dictionary<string, int> dict)
{
    // Lemmatize every word-token found in the scrubbed chat text.
    var lemmas = wordFinder.Matches(scrubbedChat)
        .Cast<Match>()
        .Select(m => lemmatizer.Lemmatize(m.Value));

    foreach (var lemma in lemmas)
    {
        // Normalize before counting so casing/whitespace variants collapse.
        var key = lemma.ToLower().Trim();
        int existing = 0;
        dict.TryGetValue(key, out existing);
        dict[key] = existing + 1;
    }
}
public Dictionary<string, int> deletePrepVerb(List<string> prepVerbList, Dictionary<string, int> wordCount)
{
    // Removes prepositional-verb entries from the word-count map. Words of
    // length >= 5 are lemmatized first, mirroring how the counts were built,
    // so the same key form is removed.
    Lemmatizer lemmatize = new Lemmatizer();

    foreach (string element in prepVerbList)
    {
        string key = element.Length >= 5 ? lemmatize.Lemmatize(element) : element;

        // Dictionary.Remove is a no-op when the key is absent, so the
        // previous ContainsKey pre-check was a redundant second lookup.
        wordCount.Remove(key);
    }

    return wordCount;
}
public bool IsMatch(string sentence, PhrasalVerb phrasalVerb)
{
    // Scan the sentence left to right: first locate the verb root (on its
    // surface form or its lemma), then match every particle of the phrasal
    // verb, in order, somewhere to the right of the root.
    var tokens = tokenizer.Tokenize(sentence);
    var rootFound = false;
    var nextParticle = 0;

    foreach (var token in tokens)
    {
        if (!rootFound)
        {
            // The root matches either on the raw token or on its lemma.
            rootFound =
                string.Equals(token, phrasalVerb.Root, StringComparison.InvariantCultureIgnoreCase) ||
                string.Equals(lemmatizer.Lemmatize(token), phrasalVerb.Root, StringComparison.InvariantCultureIgnoreCase);
            continue; // the root token itself is never checked as a particle
        }

        if (nextParticle >= phrasalVerb.Particles.Count)
        {
            continue; // no particles left to match (incl. the zero-particle case)
        }

        if (string.Equals(token, phrasalVerb.Particles[nextParticle], StringComparison.InvariantCultureIgnoreCase))
        {
            nextParticle++;
            if (nextParticle >= phrasalVerb.Particles.Count)
            {
                // All particles matched after the root.
                return true;
            }
        }
    }

    // Root or at least one particle was not found.
    return false;
}
public TextGenerator(IFileReader inpReader, string RelationName)
{
    // Lemmatizes every word of every sentence in place, builds the distinct
    // word list, then emits one Record per (sentence, word) occurrence.
    this.RelationName = RelationName;
    Sentences = new StringParser(inpReader).GetSentances;

    WordsAll = new List<string>();

    // BUGFIX: the model stream used to be leaked; dispose it after loading.
    using (var stream = File.OpenRead(AppDomain.CurrentDomain.BaseDirectory + @"\\full7z-mlteast-ru.lem"))
    {
        var lemmatizer = new Lemmatizer(stream);
        foreach (var Sentence in Sentences)
        {
            for (int i = 0; i < Sentence.Count; i++)
            {
                Sentence[i] = lemmatizer.Lemmatize(Sentence[i]);
                WordsAll.Add(Sentence[i]);
            }
        }
    }

    WordsAll = WordsAll.Distinct(new PartialComparer()).ToList();

    // PERF: precompute word -> first index so the lookup below is O(1)
    // instead of an O(n) FindIndex per word (previously O(n^2) overall).
    var wordIndex = new Dictionary<string, int>();
    for (int i = 0; i < WordsAll.Count; i++)
    {
        if (!wordIndex.ContainsKey(WordsAll[i]))
        {
            wordIndex.Add(WordsAll[i], i);
        }
    }

    var SentenceNum = 0;
    foreach (var Sentence in Sentences)
    {
        foreach (var Word in Sentence)
        {
            int itemId;
            if (!wordIndex.TryGetValue(Word, out itemId))
            {
                // Word collapsed away by the PartialComparer-based Distinct;
                // FindIndex used to return -1 here, so keep that value.
                itemId = -1;
            }
            recList.Add(new Record() { TransactId = SentenceNum, ItemId = itemId, Item = Word });
        }
        SentenceNum++;
    }
}
public Payload RunPlugin(Payload Input)
{
    // Produces a new payload whose string arrays are lowercased and
    // lemmatized copies of the input arrays; file/segment metadata is
    // carried over unchanged. The input arrays are not mutated.
    Payload pData = new Payload();
    pData.FileID = Input.FileID;
    pData.SegmentID = Input.SegmentID;

    for (int counter = 0; counter < Input.StringArrayList.Count; counter++)
    {
        // Lowercase first, then lemmatize each token.
        string[] lemmatized = Input.StringArrayList[counter]
            .Select(s => Lemmatizer.Lemmatize(s.ToLowerInvariant()))
            .ToArray();

        pData.StringArrayList.Add(lemmatized);
        pData.SegmentNumber.Add(Input.SegmentNumber[counter]);
    }

    return pData;
}
public static string Lemmatize(string word)
{
    // Tries the Russian lemmatizer first, falling back to the English one.
    // BUGFIX: the original wrapped the shared static streams (ruStream /
    // enStream) in using-blocks, disposing them on the first call and
    // breaking every subsequent call. The streams are owned by the class, so
    // they must not be disposed here. Additionally, an exception thrown by
    // the English fallback inside the catch block was never caught by the
    // trailing catch; both paths are now handled.
    try
    {
        return ruLemmatizer.Lemmatize(word);
    }
    catch (Exception)
    {
        // Russian lemmatization failed; try English below.
    }

    try
    {
        return enLemmatizer.Lemmatize(word);
    }
    catch (Exception)
    {
        Console.WriteLine("Something went wrong");
    }

    return string.Empty;
}
public string lemmatize(string word)
{
    // Words shorter than five characters are returned unchanged; longer
    // words are reduced to their lemma.
    // NOTE: a fresh Lemmatizer is built per call, matching the original code.
    Lemmatizer engine = new Lemmatizer();

    if (word.Length < 5)
    {
        return word;
    }

    return engine.Lemmatize(word);
}
/// <summary>
/// Maps each word of the sentence to its tag.
/// </summary>
/// <param name="sentence">The source sentence.</param>
/// <returns>The sentence with a tag assigned to every word.</returns>
/// <remarks>The algorithm first tries the trivial cases (a number or a
/// punctuation mark), then checks the tag from the dictionary. If the tag
/// is ambiguous, the ambiguity is resolved with trained
/// classifiers.</remarks>
public List <Lexem> Predict(IEnumerable <string> sentence)
{
    // First pass: build one entry per word with its initial tag and lemma.
    // NOTE(review): Lexem instances are stored in a List<WordForm>;
    // presumably Lexem derives from WordForm — confirm.
    List <WordForm> result = new List <WordForm>();
    int i = 0;
    foreach (var word in sentence)
    {
        string wordLower = word.ToLower();
        if (punctuation.Contains(wordLower))
        {
            // Punctuation: the lemma is the token itself.
            var n = new Lexem(i, wordLower, Tag.Punctuation);
            n.Lemma = n.Word;
            result.Add(n);
        }
        else if (serviceTags.ContainsKey(wordLower))
        {
            // Service word with a fixed tag from the dictionary.
            var n = new Lexem(i, wordLower, serviceTags[wordLower]);
            n.Lemma = n.Word;
            result.Add(n);
        }
        else
        {
            // Ordinary word: start from its entropy class and lemmatize it.
            var n = new Lexem(i, wordLower, entClass.GetEntropyClass(wordLower));
            n.Lemma = lemmatizer.Lemmatize(n.Word);
            result.Add(n);
        }
        i++;
    }

    // Second pass: disambiguate each word using a sliding window of 7 tokens.
    i = 0;
    foreach (var window in result.BuildAllWindows(7))
    {
        double v;
        if (double.TryParse(result[i].Word, out v)) // if it is a number
        {
            result[i].Tag = Tag.Number;
        }
        else if (punctuation.Contains(result[i].Word)) // if it is punctuation
        {
            result[i].Tag = Tag.Punctuation;
        }
        else // otherwise the model must be used
        {
            // Start with the entropy class of this word.
            Tag resultingTag = result[i].Tag;
            // For each attribute group, try to narrow the entropy class down.
            foreach (var group in groups)
            {
                // If the current tag contains one or more elements of the group...
                if (group.ContainsGroup(resultingTag))
                {
                    // ...and more than one element, it is still ambiguous:
                    if (!group.ContainsOneFromGroup(resultingTag))
                    {
                        // convert the window to a vector and predict with the
                        // model; then replace this group's bits in the tag.
                        Tag predictedTag = group.Predict(window.ToVector(nGramm, entClass));
                        resultingTag &= ~group.FullGroup;
                        resultingTag |= predictedTag;
                    }
                }
            }
            result[i].Tag = resultingTag;
        }
        i++; // move on to the next word
    }

    // Re-index the disambiguated words into the final lexeme list.
    var res = new List <Lexem>();
    for (int j = 0; j < result.Count; j++)
    {
        res.Add(new Lexem(j, result[j].Word, result[j].Tag));
        res[j].Lemma = result[j].Lemma;
    }
    return(res);
}
// *** End of Dec-2011 ***
/// <summary>
/// Iterates over the corpus and, when a lemmatizer is configured, writes the
/// lemma of every word token back into the corpus. The POS-tagging pipeline
/// itself is currently commented out. When <paramref name="xmlMode"/> is
/// true, lemma-accuracy counters are updated against the gold lemmas already
/// stored in the corpus.
/// </summary>
/// <param name="corpus">Corpus to process; lemmas are written back in place.</param>
/// <param name="lemmaCorrect">Out: lemmas exactly equal to the gold lemma (xmlMode only).</param>
/// <param name="lemmaCorrectLowercase">Out: lemmas equal to the gold lemma ignoring case (xmlMode only).</param>
/// <param name="lemmaWords">Out: number of word tokens counted (xmlMode only).</param>
/// <param name="xmlMode">When true, the accuracy counters above are maintained.</param>
public void Tag(Corpus corpus, out int lemmaCorrect, out int lemmaCorrectLowercase, out int lemmaWords, bool xmlMode)
{
    DateTime startTime = DateTime.Now;
    mLogger.Debug("Tag", "Označujem besedilo ..."); // Slovenian: "Tagging text ..."
    lemmaCorrect = 0;
    lemmaCorrectLowercase = 0;
    lemmaWords = 0;
    for (int i = 0; i < corpus.TaggedWords.Count; i++)
    {
        // Disabled model-prediction pipeline (kept for reference):
        //mLogger.ProgressFast(Logger.Level.Info, /*sender=*/this, "Tag", "{0} / {1}", i + 1, corpus.TaggedWords.Count);
        //BinaryVector featureVector = corpus.GenerateFeatureVector(i, mFeatureSpace, /*extendFeatureSpace=*/false, mSuffixTree);
        //Prediction<string> result = mModel.Predict(featureVector);

        // Non-word: explicitly flagged punctuation, or (when no extra info is
        // available) anything matching the non-word regex.
        if ((corpus.TaggedWords[i].MoreInfo != null && corpus.TaggedWords[i].MoreInfo.Punctuation)
            || (corpus.TaggedWords[i].MoreInfo == null && mNonWordRegex.Match(corpus.TaggedWords[i].WordLower).Success)) // non-word
        {
            // Disabled tag assignment for non-words (kept for reference):
            /*bool flag = false;
             * foreach (KeyDat<double, string> item in result)
             * {
             *     if (corpus.TaggedWords[i].Word == item.Dat || corpus.TaggedWords[i].Word + "<eos>" == item.Dat)
             *     {
             *         corpus.TaggedWords[i].Tag = item.Dat;
             *         flag = true;
             *         break;
             *     }
             * }
             * if (!flag)
             * {
             *     corpus.TaggedWords[i].Tag = corpus.TaggedWords[i].Word;
             * }*/
        }
        else // word
        {
            string wordLower = corpus.TaggedWords[i].WordLower;
            //Set<string> filter = mSuffixTree.Contains(wordLower) ? mSuffixTree.GetTags(wordLower) : null;
            //result = ProcessResult(result, filter);//???!!!
            string goldTag = corpus.TaggedWords[i].Tag; // retained for the disabled tagging logic
            string word = corpus.TaggedWords[i].Word;   // retained for the disabled tagging logic
            string rule;                                // retained for the disabled tagging logic
            // Disabled rule-based filtering and tag prediction (kept for reference):
            /*if (filter == null)
             * {
             *     filter = Rules.ApplyTaggerRules(CreateFilterFromResult(result), word, out rule);
             * }
             * else
             * {
             *     filter = Rules.ApplyTaggerRules(filter, word, out rule);
             *     if (filter.Count == 0) { filter = Rules.ApplyTaggerRules(CreateFilterFromResult(result), word, out rule); }
             * }
             * result = ProcessResult(result, filter);//???!!!
             * string predictedTag;
             * if (result.Count == 0)
             * {
             *     predictedTag = Rules.GetMostFrequentTag(wordLower, filter);
             * }
             * else
             * {
             *     predictedTag = result.BestClassLabel;
             * }
             * corpus.TaggedWords[i].Tag = predictedTag;*/
            if (mLemmatizer != null)
            {
                string lemma;
                // POS tags are currently ignored when lemmatizing.
                lemma = /*mConsiderTags ? mLemmatizer.Lemmatize(wordLower, predictedTag) : */ mLemmatizer.Lemmatize(wordLower);
                //lemma = Rules.FixLemma(lemma, corpus.TaggedWords[i].Word, predictedTag);
                if (string.IsNullOrEmpty(lemma))
                {
                    // Fall back to the lowercased surface form.
                    lemma = wordLower;
                }
                if (xmlMode)
                {
                    lemmaWords++;
                    if (lemma == corpus.TaggedWords[i].Lemma)
                    {
                        lemmaCorrect++;
                    }
                    if (corpus.TaggedWords[i].Lemma != null && lemma.ToLower() == corpus.TaggedWords[i].Lemma.ToLower())
                    {
                        lemmaCorrectLowercase++;
                    }
                }
                corpus.TaggedWords[i].Lemma = lemma;
            }
        }
    }
    TimeSpan span = DateTime.Now - startTime;
    // Slovenian: "Tagging duration: hh:mm:ss.mmm"
    mLogger.Debug("Tag", "Trajanje označevanja: {0:00}:{1:00}:{2:00}.{3:000}.", span.Hours, span.Minutes, span.Seconds, span.Milliseconds);
}
/// <summary>
/// Handler for the "Find Words" button: runs the offensive-speech pipeline on
/// the user's text (tokenize -> stopword removal -> lemmatize ->
/// count-vectorize -> CoreML prediction) and shows the verdict in an alert.
/// </summary>
partial void FindWordsButton_TouchUpInside(UIButton sender)
{
    string mytext = UserInput.Text;

    // Load the stopword list (one word per line).
    List <string> stopword = new List <string>();
    foreach (var line in File.ReadLines("stopword.txt"))
    {
        stopword.Add(line);
    }

    int count_it = 0;
    List <double> predicted_labels = new List <double>();
    string hate_speech_var = "NOT OFFENSIVE";
    double prob = 0.0;
    {
        count_it += 1;
        Console.WriteLine(count_it.ToString());
        string raw_text = mytext;
        // A leading "a " works around an NLTokenizer failure when the input
        // starts with '@'; the extra token is dropped later as a stopword.
        string input_text = "a " + raw_text;

        // STAGE 1: tokenization + lowercasing via the Natural Language API.
        Console.WriteLine("**START**");
        string[] tokens_str = null;
        if (!String.IsNullOrWhiteSpace(input_text))
        {
            var tokenizer = new NLTokenizer(NLTokenUnit.Word); // word-level tokens
            tokenizer.String = input_text;
            var range = new NSRange(0, input_text.Length);
            // Each returned NSValue wraps an NSRange into input_text.
            NSValue[] tokens = tokenizer.GetTokens(range);
            tokens_str = new string[tokens.Length];
            for (int i = 0; i < tokens.Length; i++)
            {
                // Materialize each token by slicing the input with its range.
                NSRange rr = tokens[i].RangeValue;
                string s = input_text.Substring((int)rr.Location, (int)rr.Length);
                tokens_str[i] = s.ToLower();
            }
        }
        Console.WriteLine("**END**");

        // STAGE 2: stop-word removal.
        var stopword_set = new HashSet <string>(stopword);
        List <string> stop_tokens_str = new List <string>();
        foreach (string word in tokens_str)
        {
            if (stopword_set.Contains(word))
            {
                continue;
            }
            else
            {
                stop_tokens_str.Add(word);
            }
        }
        Console.WriteLine("AFTER STOPWORD REMOVAL");

        // STAGE 3: lemmatization with the bundled English model.
        var currentDirectory = Directory.GetCurrentDirectory();
        var dataFilePath = string.Format("{0}/{1}", currentDirectory, "full7z-mlteast-en.lem");
        var stream = File.OpenRead(dataFilePath);
        var lemmatizer = new Lemmatizer(stream);
        List <string> lemma_tokens_str = new List <string>();
        foreach (string word in stop_tokens_str)
        {
            var result2 = lemmatizer.Lemmatize(word);
            lemma_tokens_str.Add(result2);
        }
        Console.WriteLine("AFTER LEMMATIZATION");

        // STAGE 4: count-vectorization. The metadata file holds the fixed
        // tweet length and the vocabulary size on each line.
        int len_tweet = 0;
        int size_vocab = 0;
        foreach (var line in File.ReadLines("metadata_length_tweet_size_vocab.txt"))
        {
            string[] temp = line.Split(" ");
            len_tweet = Convert.ToInt32(temp[0]);
            size_vocab = Convert.ToInt32(temp[1]);
        }

        // Load the word -> code vocabulary mapping (first occurrence wins).
        var word_code = new Dictionary <string, int>();
        string temp_word = null;
        int temp_code = 0;
        foreach (var line in File.ReadLines("vocab_mapping.txt"))
        {
            string[] temp2 = line.Split(" ");
            temp_word = temp2[0];
            temp_code = Convert.ToInt32(temp2[1]);
            if (word_code.ContainsKey(temp_word))
            {
                continue;
            }
            else
            {
                word_code.Add(temp_word, temp_code);
            }
        }

        // Encode the lemmas as vocabulary codes; out-of-vocabulary lemmas are
        // skipped and the tail of the vector stays zero.
        double[] example = new double[len_tweet];
        int k = 0;
        int m = 0;
        string possible_key = null;
        while (k < lemma_tokens_str.Count)
        {
            possible_key = lemma_tokens_str[k];
            if (word_code.ContainsKey(possible_key))
            {
                example[m] = word_code[lemma_tokens_str[k]];
                m += 1;
                k += 1;
            }
            else
            {
                k += 1;
                continue;
            }
        }

        // Copy the encoded vector into a 44x1x1 MLMultiArray for CoreML.
        // NOTE(review): the shape is hard-coded to 44 while the vector length
        // comes from the metadata file — confirm they always agree.
        nint[] ns_sum = new nint[] { 44, 1, 1 };
        MLMultiArray temp1 = new MLMultiArray(ns_sum, MLMultiArrayDataType.Double, out NSError error3);
        var narray = new NSNumber[example.Length];
        for (int i = 0; i < narray.Length; i++)
        {
            narray[i] = NSNumber.FromDouble(example[i]);
        }
        for (int i = 0; i < narray.Length; i++)
        {
            temp1.SetObject(narray[i], i);
        }

        // Run the CoreML model and read the two-class probability output.
        Console.WriteLine("EXAMPLE-OUTPUT");
        var hate_coremlOutput = this.hate_model.GetPrediction(temp1, out NSError error2);
        if (error3 != null)
        {
            throw new Exception("Unexpected runtime error.");
        }
        if (error2 != null)
        {
            throw new Exception("Error with Hate Model during Runtime.\n");
        }
        var hate_prob = hate_coremlOutput.Output1;
        Console.WriteLine("Hate Probability Score: ");
        Console.WriteLine(hate_prob);
        predicted_labels.Add(hate_prob[0].DoubleValue);
        predicted_labels.Add(hate_prob[1].DoubleValue);
        // Index 0 scores the "OFFENSIVE" class; pick the higher-scoring label.
        if (hate_prob[0].DoubleValue > hate_prob[1].DoubleValue)
        {
            hate_speech_var = "OFFENSIVE";
            prob = hate_prob[0].DoubleValue;
        }
        else
        {
            prob = hate_prob[1].DoubleValue;
        }
    }

    // Show the verdict and its probability in a modal alert.
    string title_text = "TEXT IS " + hate_speech_var;
    string body_text = "Probability of being " + hate_speech_var + " is " + prob.ToString();
    var okAlertController = UIAlertController.Create(title_text, body_text, UIAlertControllerStyle.Alert);
    okAlertController.AddAction(UIAlertAction.Create("OK", UIAlertActionStyle.Default, null));
    PresentViewController(okAlertController, true, null);
}
static void Main(string[] args)
{
    // Load a customized lemmatization model and report, for a list of
    // contracted/negated forms, whether the lemmatizer produces the expected
    // lemma (a trailing "!= expected" marks mismatches).
    var currentDirectory = Directory.GetCurrentDirectory();
    var dataFilePath = $"{currentDirectory}/Data/Custom/full7z-mlteast-en-modified.lem";

    using (var fstream = File.OpenRead(dataFilePath))
    {
        var lemmatizer = new Lemmatizer(fstream);

        // (word, expected lemma) pairs to verify.
        var examples = new List <Tuple <string, string> >()
        {
            new Tuple <string, string>("don't", "do"),
            new Tuple <string, string>("doesn't", "do"),
            new Tuple <string, string>("didn't", "did"),
            new Tuple <string, string>("won't", "will"),
            new Tuple <string, string>("shan't", "shall"),
            new Tuple <string, string>("can't", "can"),
            new Tuple <string, string>("couldn't", "could"),
            new Tuple <string, string>("wouldn't", "would"),
            new Tuple <string, string>("shouldn't", "should"),
            new Tuple <string, string>("mustn't", "must"),
            new Tuple <string, string>("mightn't", "might"),
            new Tuple <string, string>("oughtn't", "ought"),
            new Tuple <string, string>("needn't", "need"),
            new Tuple <string, string>("aren't", "are"),
            new Tuple <string, string>("isn't", "be"),
            new Tuple <string, string>("wasn't", "be"),
            new Tuple <string, string>("weren't", "be"),
            new Tuple <string, string>("haven't", "have"),
            new Tuple <string, string>("hasn't", "have"),
            new Tuple <string, string>("hadn't", "have"),
            new Tuple <string, string>("'s", "'s"),
            new Tuple <string, string>("'ve", "have"),
            new Tuple <string, string>("'m", "be"),
            new Tuple <string, string>("'re", "be"),
            new Tuple <string, string>("'ll", "will"),
        };

        foreach (var example in examples)
        {
            var lemma = lemmatizer.Lemmatize(example.Item1);
            var mismatch = lemma != example.Item2 ? ("!= " + example.Item2) : "";
            Console.WriteLine("{0} --> {1} {2}", example.Item1, lemma, mismatch);
        }
    }

    Console.WriteLine("==========");
    Console.WriteLine("OK");
    Console.ReadLine();
}
// (A large block of garbled commented-out legacy matching code was removed here.)

/// <summary>
/// Returns the phrasal verbs from <paramref name="phrasalVerbs"/> that occur
/// in the sentence, based on the dependency parse: the (lemmatized) root must
/// govern/depend on the first particle, and any remaining particles must
/// immediately follow that particle in the sentence.
/// </summary>
public List<PhrasalVerb> MatchingPhrasalVerbs(string sentence, List<PhrasalVerb> phrasalVerbs)
{
    // tokenize sentence, build the parse tree and extract dependencies
    var tokens = tokenizer.Tokenize(sentence);
    var parse = parser.DoParse(tokens);
    var dependencies = ComputeDependencies(parse).ToList();

    var matchingPhrasalVerbs = new List<PhrasalVerb>();
    foreach (var phrasalVerb in phrasalVerbs)
    {
        var parts = phrasalVerb.Name.Split(' ').ToList();
        var root = parts.First();

        // Dependencies whose (lemmatized) gov/dep equals the root with the
        // root on the left, honoring the (in)separability constraints.
        var rootRelatedDependencies = dependencies
            .Where(d =>
                ((string.Equals(root, lemmatizer.Lemmatize(d.Gov().GetWord()), StringComparison.InvariantCultureIgnoreCase)
                  && d.Gov().Index() < d.Dep().Index())
                 || (string.Equals(root, lemmatizer.Lemmatize(d.Dep().GetWord()), StringComparison.InvariantCultureIgnoreCase)
                     && d.Dep().Index() < d.Gov().Index()))
                // inseparable verb: no word between root and particle
                && (!phrasalVerb.Inseparable.HasValue || !phrasalVerb.Inseparable.Value
                    || Math.Abs(d.Dep().Index() - d.Gov().Index()) == 1)
                // mandatorily separable verb: at least one word between them
                && (!phrasalVerb.SeparableMandatory.HasValue || !phrasalVerb.SeparableMandatory.Value
                    || Math.Abs(d.Dep().Index() - d.Gov().Index()) > 1))
            .ToList();

        // We only inspect the 2nd part directly; for phrasal verbs with
        // several particles that's a good approximation for now.
        if (!rootRelatedDependencies.Any() || parts.Count <= 1)
        {
            continue;
        }

        var particle1 = parts[1];

        // Prefer "prt" (particle) relations; fall back to all relations.
        var relevantDependencies = rootRelatedDependencies
            .Where(d => d.Reln().GetShortName() == "prt")
            .ToList();
        if (!relevantDependencies.Any())
        {
            relevantDependencies = rootRelatedDependencies;
        }

        // A relevant dependency must carry the particle as gov or dep.
        var rootParticle1Dependency = relevantDependencies
            .FirstOrDefault(d =>
                string.Equals(particle1, d.Dep().GetWord(), StringComparison.InvariantCultureIgnoreCase) ||
                string.Equals(particle1, d.Gov().GetWord(), StringComparison.InvariantCultureIgnoreCase));
        if (rootParticle1Dependency == null || AreWordSeparatedInSentence(rootParticle1Dependency, dependencies))
        {
            continue;
        }

        var remainingParts = parts.Skip(2).ToList();
        var lastTokenIndex = Math.Max(rootParticle1Dependency.Gov().Index(), rootParticle1Dependency.Dep().Index()) - 1;
        var endOfSentenceTokens = tokens.Skip(lastTokenIndex + 1).ToList();

        // BUGFIX: the verb matches iff the remaining particles are a prefix of
        // the tokens following the first particle. The previous loop both
        // added the verb once per trailing token (duplicates in the result)
        // and missed the case where the remaining tokens were exactly the
        // remaining particles.
        bool remainingMatch = remainingParts.Count <= endOfSentenceTokens.Count;
        for (var i = 0; remainingMatch && i < remainingParts.Count; i++)
        {
            remainingMatch = string.Equals(remainingParts[i], endOfSentenceTokens[i], StringComparison.InvariantCultureIgnoreCase);
        }
        if (remainingMatch)
        {
            matchingPhrasalVerbs.Add(phrasalVerb);
        }
    }

    return matchingPhrasalVerbs;
}
/// <summary>
/// Verifies Lemmatizer.Lemmatize for plain nouns, inflected verbs, compound verb
/// forms and POS-disambiguated inputs (Persian). Each case is a single
/// (input, expected lemma, POS-or-null) triple so the three values can never
/// drift out of sync, unlike the previous parallel-list layout.
/// </summary>
public void LemmatizeTest()
{
    Lemmatizer lemmatizer = new Lemmatizer();

    // Item1 = input, Item2 = expected lemma, Item3 = POS tag (null = use the default overload)
    var cases = new List<Tuple<string, string, string>>()
    {
        Tuple.Create("کتابها", "کتاب", (string)null),
        Tuple.Create("آتشفشان", "آتشفشان", (string)null),
        Tuple.Create("میروم", "رفت#رو", (string)null),
        Tuple.Create("گفته شده است", "گفت#گو", (string)null),
        Tuple.Create("نچشیده است", "چشید#چش", (string)null),
        Tuple.Create("مردم", "مردم", "N"),
        Tuple.Create("اجتماعی", "اجتماعی", "AJ"),
    };

    foreach (var c in cases)
    {
        string input = c.Item1;
        string expected = c.Item2;
        string p = c.Item3;
        string actual = p == null
            ? lemmatizer.Lemmatize(input)
            : lemmatizer.Lemmatize(input, p);
        // message typo fixed: "Failed to lematize of '...' word"
        Assert.AreEqual(expected, actual, "Failed to lemmatize '" + input + "' word");
    }
}
/// <summary>
/// Runs the requirement tokens through the filter, then lemmatizes each
/// surviving token.
/// </summary>
/// <param name="requirementsTokens">Raw tokens of the requirements text.</param>
/// <returns>The lemmatized forms of the filtered tokens.</returns>
private string[] GetWords(string[] requirementsTokens)
{
    var lemmas = new List<string>();
    foreach (var token in _filter.Filter(requirementsTokens))
    {
        lemmas.Add(_lemmatizer.Lemmatize(token));
    }
    return lemmas.ToArray();
}
/// <summary>
/// Delegates to the underlying lemmatizer and returns the lemma of
/// <paramref name="word"/>.
/// </summary>
public string Lemmatize(string word) => lemmatizer.Lemmatize(word);