/// <summary>
/// Builds a document from a file path and its content: the file name without
/// its extension becomes the title, and the lower-cased content is tokenized,
/// stop-word filtered (a no-op here: the stop-word list is a single empty
/// string), sorted and stemmed into ListWorld.
/// </summary>
/// <param name="title">File path; e.g. @"dir\name.txt" yields Title "name".</param>
/// <param name="content">Raw document text.</param>
/// <param name="rg">Optional token regex; null selects the default letter/hyphen pattern.</param>
/// <param name="c">Unused; distinguishes this constructor overload.</param>
public document(string title, string content, string rg, int c)
{
    // NOTE(review): assumes the path has at least a name and an extension,
    // otherwise line[line.Length - 2] throws - TODO confirm callers.
    var line = title.Split(new[] { @"\", "." }, StringSplitOptions.None);
    Title = line[line.Length - 2];
    Content = content;
    StopWords = new [] { "" };
    IStemmer stemmer = new EnglishStemmer();

    // Fall back to the default word pattern when no regex is supplied.
    var regex = rg ?? @"[A-Za-z\-]+";

    var valueEnumerable = Regex.Matches(content.ToLower(), regex);
    var lt = valueEnumerable.Cast<Match>().Select(match => match.Value).ToList();
    count = lt.Count;

    // FIX: reuse the already-materialized token list instead of casting and
    // projecting the MatchCollection a second time.
    ListWorld = lt.Except(StopWords).OrderBy(a => a).ToList();
    ListWorld = ListWorld.ToList().ConvertAll(d => stemmer.Stem(d.ToLower()));
}
/// <summary>
/// Builds a document with an explicit stop-word list: tokens are extracted
/// from the content, lower-cased, stop-word filtered, sorted and stemmed.
/// </summary>
/// <param name="title">Document title (used verbatim).</param>
/// <param name="content">Raw document text; stored lower-cased.</param>
/// <param name="st">Stop-word provider; its Lst becomes the StopWords array.</param>
/// <param name="rg">Optional token regex; null selects the default letter/hyphen pattern.</param>
/// <param name="s">Unused; distinguishes this constructor overload.</param>
public document(string title, string content, Stopword st, string rg, int s)
{
    Title = title;
    Content = content.ToLower();
    IStemmer stemmer = new EnglishStemmer();
    var regex = rg ?? @"[A-Za-z\-]+";
    // NOTE: matching runs against the original-case content here (unlike the
    // path-based overload); tokens are lowered before the stop-word check.
    var valueEnumerable = Regex.Matches(content, regex);
    var lt = valueEnumerable.Cast<Match>().Select(match => match.Value).ToList();
    count = lt.Count;
    StopWords = st.Lst.ToArray();
    // FIX: reuse the already-materialized token list instead of casting and
    // projecting the MatchCollection a second time.
    ListWorld = lt.ConvertAll(a => a.ToLower()).Except(StopWords).OrderBy(a => a);
    ListWorld = ListWorld.ToList().ConvertAll(d => stemmer.Stem(d.ToLower()));
}
// Inverse Document Frequency: log((N + 1) / df), where df is the number of
// documents containing the term, seeded at 1 so the divisor is never zero.
public double IDF(string palavra)
{
    IStemmer stemmer = new EnglishStemmer();
    Tokenizer TK = new Tokenizer();
    int count = 1;
    foreach (string s in documentos)
    {
        // BUG FIX: when stemming is enabled the original computed
        // stemmer.Stem(s) into a local but then tokenized the raw string
        // anyway; tokenize the stemmed text so it matches the stemmed term.
        string text = stemm ? stemmer.Stem(s) : s;
        if (TK.Tokenize(text).Contains(palavra))
        {
            count++;
        }
    }
    // BUG FIX: the original wrote documentos.Count + 1 / Convert.ToDouble(count),
    // which by operator precedence is N + (1/df), i.e. ~log(N) regardless of df.
    // The intended IDF formula divides (N + 1) by df.
    return Math.Log((documentos.Count + 1) / Convert.ToDouble(count));
}
/// <summary>
/// Computes the shingles of a text: the text is split on delimiters, each
/// token is stemmed, and every run of <paramref name="shingleLength"/>
/// consecutive stems is concatenated into one shingle string.
/// </summary>
/// <param name="source">Input text (by ref to avoid a copy; not modified).</param>
/// <param name="shingleLength">Number of consecutive tokens per shingle.</param>
/// <returns>Distinct shingles; empty when there are fewer tokens than shingleLength.</returns>
private static HashSet <string> GetShingles(ref string source, int shingleLength)
{
    var stemmer = new EnglishStemmer();
    var split = source.Split(Delims.ToCharArray(), StringSplitOptions.RemoveEmptyEntries);
    // Normalize every token to its stem so near-identical word forms collide.
    for (var index = 0; index < split.Length; index++)
    {
        split[index] = stemmer.Stem(split[index]);
    }
    var shingles = new HashSet <string>();
    var tmp = "";
    if (split.Length < shingleLength)
    {
        return(shingles);
    }
    // Seed the sliding window with the first shingleLength stems.
    for (var j = 0; j < shingleLength; j++)
    {
        tmp += split[j];
    }
    // BUG FIX: the original never added the first window to the result set -
    // with exactly shingleLength tokens it returned an empty set even though
    // one complete shingle exists.
    shingles.Add(tmp);
    for (var i = shingleLength; i < split.Length; i++)
    {
        // Slide the window: rebuild from tmp minus its leading stem, then
        // append the next stem (the 4-arg StringBuilder ctor takes the
        // substring of tmp that drops the token leaving the window).
        var sb = new StringBuilder(tmp, split[i - shingleLength].Length, tmp.Length - split[i - shingleLength].Length, 100 * shingleLength);
        sb.Append(split[i]);
        shingles.Add(tmp = sb.ToString());
    }
    return(shingles);
}
/// <summary>
/// Stems a fixed set of related English words, demonstrating how different
/// word forms reduce toward a common stem.
/// </summary>
public List <string> StemmerTest()
{
    var stemmer = new EnglishStemmer();
    var samples = new string[] { "computing", "computer", "compute", "computation" };
    return samples.Select(word => stemmer.Stem(word)).ToList();
}
/// <summary>
/// Trims and stems each term, keeps stems longer than one character, then
/// groups them and returns the <paramref name="num"/> most frequent stems
/// mapped to (occurrence count * 1000); null when input is empty or num &lt;= 0.
/// </summary>
/// <param name="terms">Raw terms; may be null.</param>
/// <param name="num">How many top stems to return; non-positive yields null.</param>
private Dictionary <string, int> refineAndGroupTerms(List <string> terms, int num)
{
    // BUG FIX: the original tested terms.Length, but List<T> has no Length
    // property (that does not compile); List<T> exposes Count.
    if (terms == null || terms.Count == 0 || num <= 0)
    {
        return null;
    }
    // Hoisted: one stemmer for all terms instead of one allocation per term.
    var stemmer = new EnglishStemmer();
    var refined = new List<string>();
    foreach (string t in terms)
    {
        string refinedTerm = stemmer.Stem(t.Trim());
        if (refinedTerm.Length > 1)
        {
            refined.Add(refinedTerm);
        }
    }
    // Most frequent stems first; the redundant .Select(grp => grp) is gone.
    return refined.GroupBy(o => o)
                  .OrderByDescending(grp => grp.Count())
                  .Take(num)
                  .ToDictionary(r => r.Key, r => r.Count() * 1000);
}
/// <summary>
/// Copies the stemmer state (the Y-found flag and the p1/p2 region markers)
/// from another instance, then copies the base-class state.
/// </summary>
private void copy_from(EnglishStemmer other)
{
    B_Y_found = other.B_Y_found;
    I_p2 = other.I_p2;
    I_p1 = other.I_p1;
    // BUG FIX: the original called copy_from(other) again, recursing into
    // itself until stack overflow; the base implementation must be invoked
    // (matching the base.copy_from pattern used by the other variants).
    base.copy_from(other);
}
/// <summary>Stems every word of the input array with the English Snowball stemmer.</summary>
private string[] StemmWords(string[] input)
{
    IStemmer englishStemmer = new EnglishStemmer();
    return englishStemmer.GetSteamWords(input);
}
/// <summary>
/// Computes TF-IDF for the cleaned word list and shows the terms in the list
/// box, sorted by descending weight. When the stemm flag is set, the IDF
/// lookup uses the stemmed form of each term.
/// </summary>
private void calculoTFIDF(List <string> wordsLimpa)
{
    IStemmer stemmer = new EnglishStemmer();
    Dictionary<string, double> termFrequencies = TermFreq(wordsLimpa);
    var tfIdf = new Dictionary<string, double>();
    // Weight each term: TF * IDF (IDF of the stem when stemming is enabled).
    foreach (var entry in termFrequencies)
    {
        double idf = stemm ? IDF(stemmer.Stem(entry.Key)) : IDF(entry.Key);
        tfIdf.Add(entry.Key, entry.Value * idf);
    }
    // Refresh the UI list with "term : weight", heaviest first.
    listBox1.Items.Clear();
    foreach (var item in tfIdf.OrderByDescending(r => r.Value))
    {
        listBox1.Items.Add(item.Key + " : " + item.Value);
    }
}
/// <summary>
/// Wires up the repositories and the stemmer this comment service depends on.
/// </summary>
public CommentService(CommentRepository cr, EnglishStemmer es, WordRepository wr, ResourceRepository rr)
{
    // Plain field assignments; order is irrelevant.
    _rRepo = rr;
    _wRepo = wr;
    _stemmer = es;
    _commentRepo = cr;
}
/// <summary>
/// Copies the stemmer state (the Y-found flag and the p1/p2 region markers)
/// from another instance, then copies the base-class state.
/// </summary>
private void copy_from(EnglishStemmer other)
{
    _bYFound = other._bYFound;
    _p2 = other._p2;
    _p1 = other._p1;
    // BUG FIX: the original re-invoked copy_from(other), recursing forever
    // (stack overflow); delegate to the base class instead, as the sibling
    // implementations of this pattern do.
    base.copy_from(other);
}
/// <summary>
/// Stems every word of every document, returning one list of distinct stems
/// per document (in first-encounter order).
/// </summary>
/// <param name="doc_words">Documents, each a list of raw words.</param>
public static List <List <string> > EnglishStemming(List <List <string> > doc_words)
{
    EnglishStemmer stemmer = new EnglishStemmer();
    var stemDocs = new List<List<string>>();
    foreach (List<string> doc in doc_words)
    {
        // Fresh list per document (the original projected an always-empty
        // helper list through Select/Distinct to the same effect).
        var stemmedWords = new List<string>();
        stemDocs.Add(stemmedWords);
        foreach (string word in doc)
        {
            if (word.Length > 0)
            {
                var stemWord = stemmer.Stem(word);
                // BUG FIX: the original de-duplicated on the raw word while
                // storing the stem, so two different words sharing a stem
                // produced duplicate entries (and a repeated word could be
                // skipped incorrectly); compare against the stored stems.
                if (!stemmedWords.Contains(stemWord))
                {
                    stemmedWords.Add(stemWord);
                }
            }
        }
    }
    return stemDocs;
}//Stem text in English--------------------------------------
/// <summary>
/// Parses and tokenizes a document, returning its vocabulary of stemmed words
/// (stop words and non-alphanumeric characters removed).
/// </summary>
/// <param name="doc">Document text.</param>
/// <returns>Vocabulary (list of distinct stems).</returns>
public List <string> GetVocabulary(string doc)
{
    List<string> vocabulary = new List<string>();
    Dictionary<string, int> wordCountList = new Dictionary<string, int>();
    // Hoisted: one stemmer for the whole document instead of one per token.
    // (The original's docIndex progress message was dead code: with a single
    // document docIndex only ever reached 1, so 1 % 100 == 0 never held.)
    EnglishStemmer stemmer = new EnglishStemmer();
    string[] parts2 = Tokenize(doc);
    foreach (string part in parts2)
    {
        // Strip non-alphanumeric characters.
        string stripped = Regex.Replace(part, "[^a-zA-Z0-9]", "");
        if (StopWords.stopWordsList.Contains(stripped.ToLower()))
        {
            continue;
        }
        try
        {
            string stem = stemmer.GetSteamWord(stripped);
            if (stem.Length > 0)
            {
                // BUG FIX: the first occurrence used to be recorded as 0,
                // undercounting every stem by one.
                if (wordCountList.ContainsKey(stem))
                {
                    wordCountList[stem]++;
                }
                else
                {
                    wordCountList.Add(stem, 1);
                }
            }
        }
        catch
        {
            // Best-effort: skip tokens the stemmer cannot process.
        }
    }
    // Every counted stem belongs to the vocabulary (no frequency threshold here).
    foreach (var item in wordCountList)
    {
        vocabulary.Add(item.Key);
    }
    return vocabulary;
}
/// <summary>
/// Spot-checks the English Snowball stemmer against known word/stem pairs
/// covering plurals, -ed/-ing inflections and derivational suffixes.
/// </summary>
public void English_BaseTest()
{
    EnglishStemmer stemmer = new EnglishStemmer();
    // Each case is { expected stem, input word }.
    var cases = new[]
    {
        new { Stem = "do", Word = "doing" },
        new { Stem = "andes", Word = "andes" },
        new { Stem = "coincidenti", Word = "coincidential" },
        new { Stem = "ration", Word = "rationalism" },
        new { Stem = "caress", Word = "caresses" },
        new { Stem = "fli", Word = "flies" },
        new { Stem = "die", Word = "dies" },
        new { Stem = "mule", Word = "mules" },
        new { Stem = "deni", Word = "denied" },
        new { Stem = "die", Word = "died" },
        new { Stem = "agre", Word = "agreed" },
        new { Stem = "own", Word = "owned" },
        new { Stem = "humbl", Word = "humbled" },
        new { Stem = "size", Word = "sized" },
        new { Stem = "meet", Word = "meeting" },
        new { Stem = "state", Word = "stating" },
        new { Stem = "siez", Word = "siezing" },
        new { Stem = "item", Word = "itemization" },
        new { Stem = "sensat", Word = "sensational" },
        new { Stem = "tradit", Word = "traditional" },
        new { Stem = "refer", Word = "reference" },
        new { Stem = "colon", Word = "colonizer" },
        new { Stem = "plot", Word = "plotted" },
    };
    foreach (var c in cases)
    {
        Assert.AreEqual(c.Stem, stemmer.Stem(c.Word));
    }
}
// Stems a single word with the English Snowball stemmer.
public static string Stemming(string word)
{
    StemmerBase englishStemmer = new EnglishStemmer();
    return englishStemmer.Stem(word);
}
/// <summary>
/// Copies the mutable English-stemmer state (the Y-found flag and the p1/p2
/// region boundary markers) from another instance, then lets the base class
/// copy its own state.
/// </summary>
/// <param name="other">Instance to copy state from.</param>
protected internal virtual void copy_from(EnglishStemmer other)
{
    B_Y_found = other.B_Y_found;
    I_p2 = other.I_p2;
    I_p1 = other.I_p1;
    base.copy_from(other);
}
/// <summary>
/// Demo entry point: loads the Sentiment column from the training workbook,
/// tokenizes it, and prints word/stem pairs for the first 10 rows.
/// </summary>
static void Main(string[] args)
{
    var reader = new ExcelReader("TrainingData.xls");
    var sheet = reader.GetWorksheet("Training").ToVector <string>("Sentiment");
    var tokeinzed = sheet.Tokenize();
    var stemmer = new EnglishStemmer();
    Console.WriteLine("Loading...");
    var rowsPrinted = 0;
    foreach (string[] row in tokeinzed)
    {
        foreach (string word in row)
        {
            Console.WriteLine($"{word} - {stemmer.Stem(word)} ");
        }
        // Blank separator between rows.
        Console.WriteLine();
        Console.WriteLine();
        // Stop after ten rows - this is only a smoke-test printout.
        if (++rowsPrinted == 10)
        {
            break;
        }
    }
    Console.Read();
}
/// <summary>Runs the Snowball English stemmer over a single word and returns the stem.</summary>
public static string Stem(string word)
{
    var snowball = new EnglishStemmer { Current = word };
    snowball.Stem();
    return snowball.Current;
}
// Applies English Snowball stemming to the given text and returns the result.
private static string AplicarStemming(string texto)
{
    var snowball = new EnglishStemmer();
    snowball.SetCurrent(texto);
    snowball.Stem();
    var resultado = snowball.GetCurrent();
    return resultado;
}
/// <summary>Extension method: returns the English-stemmed form of the query string.</summary>
public static string GetStemmedQuery(this String query)
{
    var snowball = new EnglishStemmer();
    snowball.SetCurrent(query);
    snowball.Stem();
    var stemmed = snowball.GetCurrent();
    return stemmed;
}
// Stems each query token with the English Snowball stemmer and returns the
// stemmed tokens in the same order.
public static string[] QueryStemmer(string[] input)
{
    Stemmer snowball = new EnglishStemmer();
    return snowball.GetSteamWords(input);
}
/// <summary>
/// Builds the vocabulary of a document: keywords are stripped to
/// alphanumerics, stemmed and counted; stems occurring at least
/// <paramref name="vocabularyThreshold"/> times are returned.
/// </summary>
/// <param name="doc">Document text.</param>
/// <param name="stemmedDoc">Receives every non-empty stem in document order.</param>
/// <param name="vocabularyThreshold">Minimum occurrence count for inclusion.</param>
public static List <string> GetVocabulary(string doc, out List <string> stemmedDoc, int vocabularyThreshold)
{
    List<string> vocabulary = new List<string>();
    Dictionary<string, int> wordCountList = new Dictionary<string, int>();
    stemmedDoc = new List<string>();
    // Hoisted: one stemmer for the whole document rather than one per token.
    var stemmer = new EnglishStemmer();
    string[] parts2 = GenerateKeywordsList(doc);
    foreach (string part in parts2)
    {
        // Strip non-alphanumeric characters.
        string stripped = Regex.Replace(part, "[^a-zA-Z0-9]", "");
        try
        {
            var stem = stemmer.Stem(stripped);
            if (stem.Length > 0)
            {
                // BUG FIX: the first occurrence used to be stored as 0, so
                // every stem was undercounted by one against the threshold.
                if (wordCountList.ContainsKey(stem))
                {
                    wordCountList[stem]++;
                }
                else
                {
                    wordCountList.Add(stem, 1);
                }
                stemmedDoc.Add(stem);
            }
        }
        catch
        {
            // Best-effort: skip tokens the stemmer rejects.
        }
    }
    // Keep only stems that meet the frequency threshold.
    foreach (var item in wordCountList.Where(w => w.Value >= vocabularyThreshold))
    {
        vocabulary.Add(item.Key);
    }
    return vocabulary;
}
// This Method receives an array of strings (sentences) and gets all the words
// from it, counting occurrences per unique stem and recording which sentences
// each word appears in.
private OutputWords GetWordsFromSentences(string[] sentences)
{
    // For this case, we initialize an English Stemmer, but there are other languages stemmers too.
    EnglishStemmer stemmer = new EnglishStemmer();
    OutputWords SampleOutput = new OutputWords();
    SampleOutput.Results = new List <WordItem>();
    // BUG FIX: iterate with an explicit index. The original used
    // Array.IndexOf(sentences, sentence) inside the loop, which returns the
    // index of the FIRST equal sentence - wrong when the input contains
    // duplicate sentences, and an O(n) scan per sentence besides.
    for (int index = 0; index < sentences.Length; index++)
    {
        string sentence = sentences[index];
        // Get all the words from the sentence, deleting punctuation marks.
        var punctuation = sentence.Where(Char.IsPunctuation).Distinct().ToArray();
        var words = sentence.Split().Select(x => x.Trim(punctuation));
        foreach (string word in words)
        {
            string stemmedWord = stemmer.Stem(word);
            // Skip stop words and blank tokens.
            if (!StopWordsHelper.isStopword(word) && !String.IsNullOrWhiteSpace(word))
            {
                // Single scan for items whose word shares this stem (the
                // original scanned once for Any and again for Where).
                var existing = SampleOutput.Results.Where(x => stemmer.Stem(x.Word) == stemmedWord).ToList();
                if (existing.Count > 0)
                {
                    // Increment TotalOcurrences and record this sentence's
                    // index if it was not just recorded.
                    existing.ForEach(x =>
                    {
                        x.TotalOcurrences++;
                        if (x.SentenceIndexes.LastOrDefault() != index)
                        {
                            x.SentenceIndexes.Add(index);
                        }
                    });
                }
                else
                {
                    // First sighting: create a new word item.
                    SampleOutput.Results.Add(new WordItem(word, 1, index));
                }
            }
        }
    }
    // Orders alphabetically the results list word items.
    SampleOutput.Results = SampleOutput.Results.OrderBy(x => x.Word.ToLower()).ToList();
    return SampleOutput;
}
// Stems the text with both the Russian and the English stemmer and keeps
// whichever result removed more characters (i.e. the shorter stem; ties go
// to the Russian result).
string StemByRuEn(string text)
{
    var russian = new RussianStemmer().Stem(text);
    var english = new EnglishStemmer().Stem(text);
    return english.Length < russian.Length ? english : russian;
}
/*****************************************************/
/*************** PRIVATE METHODS ***************/
/*****************************************************/
/// <summary>
/// Trims punctuation and spacing off of the supplied word and if specified,
/// also stems the word
/// </summary>
/// <param name="word">The word to be simplified</param>
/// <param name="stemWord">If true, stems the word</param>
/// <returns>The trimmed word</returns>
private string SimplifyWord(string word, bool stemWord)
{
    // Lower-case, then drop trailing punctuation and a trailing plural 's'.
    var simplified = word.Trim().ToLower().TrimEnd(',', ':', ';', '.', '!', '?', 's');
    return stemWord ? new EnglishStemmer().Stem(simplified) : simplified;
}
// stem all words - produces a base string in an attempt to represent related words
public static List <string> StemmWords(List <string> words)
{
    StemmerBase englishStemmer = new EnglishStemmer();
    // ConvertAll applies the stemmer to every word, preserving input order.
    return words.ConvertAll(word => englishStemmer.Stem(word));
}
/// <summary>
/// Builds a document from a file path and its content: the file name without
/// its extension becomes the title, and the content is tokenized, stop-word
/// filtered, sorted and stemmed into ListWorld.
/// </summary>
/// <param name="title">File path; e.g. @"dir\name.txt" yields Title "name".</param>
/// <param name="content">Raw document text.</param>
public document(string title, string content)
{
    // Split the path on backslashes and dots; the second-to-last segment is
    // the bare file name.
    var pathParts = title.Split(new string[] { @"\", "." }, StringSplitOptions.None);
    Title = pathParts[pathParts.Length - 2];
    Content = content;
    IStemmer stemmer = new EnglishStemmer();
    const string regex = @"[A-Za-z\-]+";
    var tokens = Regex.Matches(content, regex)
                      .Cast <Match>()
                      .Select(match => match.Value)
                      .ToList();
    ListWorld = tokens.Except(stop_words).OrderBy(a => a);
    ListWorld = ListWorld.ToList().ConvertAll(d => stemmer.Stem(d.ToLower()));
}
/// <summary>
/// Initializes the static NLP pipeline: the aggregate analyzer (sentence
/// splitter, tokenizer, POS tagger, chunker), the English stemmer, the
/// triplet service, the POS-tag value table and the low-value noun list.
/// </summary>
/// <param name="sent">Sentence-detection resource fed to the analyzer.</param>
/// <param name="token">Tokenizer resource fed to the analyzer.</param>
/// <param name="pos">POS-tagger resource fed to the analyzer.</param>
/// <param name="chunker">Chunker resource fed to the analyzer.</param>
/// <param name="tags">Path to a file whose first line is a JSON object of POS-tag values.</param>
/// <param name="nouns">Path to a file with one low-value noun per line.</param>
public static void LoadAnalyzationData(string sent, string token, string pos, string chunker, string tags, string nouns)
{
    // NOTE(review): the four analyzer arguments look like model/resource
    // paths consumed by AggregateAnalyzer's collection initializer - confirm.
    analyzer = new AggregateAnalyzer { sent, token, pos, chunker };
    wordStemmer = new EnglishStemmer();
    tripletService = new TripletService(new ReplyTripletService(), new QuestionTripletService());
    // Only the first line of the tags file is parsed as JSON.
    string[] lines = File.ReadAllLines(tags);
    POSTagValues = JObject.Parse(lines[0]);
    LowValueNouns = new List <string>(File.ReadAllLines(nouns));
}
/// <summary>
/// Appends a keyword-position score to every article's characteristic values
/// (100 for a match at the first word, decreasing with position, 0 for a
/// match at index >= 100) and then min-max-normalizes that newest value
/// across all articles.
/// </summary>
/// <param name="keyword">Keyword matched exactly against article words.</param>
/// <param name="articles">Articles to score; returned with the new value appended.</param>
public List <Article> CheckKeywordPosition(string keyword, List <Article> articles)
{
    // FIX: removed dead code - the original built an EnglishStemmer and
    // computed stemmer.Stem(keyword) but never used the result; matching
    // below is exact. TODO(review): if stemmed matching was intended, the
    // article words must be stemmed for the comparison as well.
    int max = 100;
    int min = 1000;
    foreach (Article article in articles)
    {
        // NOTE(review): an article with no match keeps the default 100 - the
        // same score as a match at index 0; confirm that is intended.
        int position = 100;
        int i = 0;
        while (i < article.Words.Count)
        {
            if (keyword == article.Words[i])
            {
                position = (i >= 100) ? 0 : position - i;
                break;
            }
            i++;
        }
        article.AllCharacteristicValues.Add(position);
        if (min > position)
        {
            min = position;
        }
    }
    // Normalize the value just appended. Assumes every article has the same
    // number of characteristic values as articles[0].
    int elements = articles[0].AllCharacteristicValues.Count;
    foreach (Article article in articles)
    {
        double oldValue = article.AllCharacteristicValues.Last();
        double newValue = CalcMinMaxNormalization(oldValue, max, min);
        article.AllCharacteristicValues[elements - 1] = newValue;
    }
    return articles;
}
// Snowball Stemmer. Stems an input string array according to the English language.
// Words are replaced in place; the same array is returned.
private string[] StringStemmer(string[] stringArray)
{
    var snowball = new EnglishStemmer();
    for (int idx = 0; idx < stringArray.Length; idx++)
    {
        snowball.SetCurrent(stringArray[idx]);
        // Only replace the word when stemming actually succeeds.
        if (snowball.Stem())
        {
            stringArray[idx] = snowball.GetCurrent();
        }
    }
    return stringArray;
}
/// <summary>
/// Copies the English-stemmer state (the Y-found flag and the p1/p2 region
/// boundary markers) from another instance, then copies the base-class state.
/// </summary>
/// <param name="other">Instance to copy state from.</param>
private void copy_from(EnglishStemmer other)
{
    B_Y_found = other.B_Y_found;
    I_p2 = other.I_p2;
    I_p1 = other.I_p1;
    base.copy_from(other);
}