예제 #1
0
        /// <summary>
        /// Creates a document from a title (typically a file path) and its text. The lower-cased
        /// content is tokenized, the total number of tokens is stored in <c>count</c>, and the
        /// distinct tokens are sorted and then stemmed into <c>ListWorld</c>.
        /// </summary>
        /// <param name="title">
        /// Title or path. It is split on '\' and '.', and the second-to-last segment becomes
        /// <c>Title</c>; when there are fewer than two segments the whole string is used.
        /// </param>
        /// <param name="content">Raw text of the document.</param>
        /// <param name="rg">Regular expression matching one token, or null for letters and hyphens.</param>
        /// <param name="c">Not used by this constructor.</param>
        public document(string title, string content, string rg, int c)
        {
            var segments = title.Split(new[] { @"\", "." }, StringSplitOptions.None);

            // A title without any '\' or '.' yields a single segment; indexing Length - 2
            // would then be -1, so fall back to the title itself.
            Title     = segments.Length >= 2 ? segments[segments.Length - 2] : title;
            Content   = content;
            StopWords = new [] { "" };
            IStemmer stemmer = new EnglishStemmer();
            var      pattern = rg ?? @"[A-Za-z\-]+";

            // Materialize the matches once; both the count and the word list use them.
            var tokens = Regex.Matches(content.ToLower(), pattern)
                         .Cast <Match>()
                         .Select(match => match.Value)
                         .ToList();

            count     = tokens.Count;
            // Except() also removes duplicates; sorting happens before stemming.
            ListWorld = tokens.Except(StopWords)
                        .OrderBy(a => a)
                        .ToList()
                        .ConvertAll(d => stemmer.Stem(d.ToLower()));
        }
예제 #2
0
        /// <summary>
        /// Creates a document with an explicit stop-word list. The content is tokenized, the
        /// total number of tokens is stored in <c>count</c>, and the distinct lower-cased tokens
        /// that are not stop words are sorted and then stemmed into <c>ListWorld</c>.
        /// </summary>
        /// <param name="title">Stored unchanged in <c>Title</c>.</param>
        /// <param name="content">Raw text; <c>Content</c> keeps a lower-cased copy.</param>
        /// <param name="st">Source of the stop words (its <c>Lst</c> member is copied to an array).</param>
        /// <param name="rg">Regular expression matching one token, or null for letters and hyphens.</param>
        /// <param name="s">Not used by this constructor.</param>
        public document(string title, string content, Stopword st, string rg, int s)
        {
            Title   = title;
            Content = content.ToLower();

            IStemmer stemmer = new EnglishStemmer();
            string   pattern = rg ?? @"[A-Za-z\-]+";

            // Matching runs on the original-case text; tokens are lower-cased afterwards.
            MatchCollection found = Regex.Matches(content, pattern);

            count     = found.Count;
            StopWords = st.Lst.ToArray();

            var lowered = from Match m in found select m.Value.ToLower();
            var kept    = lowered.Except(StopWords).OrderBy(w => w);

            ListWorld = kept.Select(w => stemmer.Stem(w.ToLower())).ToList();
        }
        /// <summary>
        /// Inverse document frequency of a term over <c>documentos</c>.
        /// </summary>
        /// <param name="palavra">
        /// Term to look for. When <c>stemm</c> is set it is compared against the stemmed tokens
        /// of each document, otherwise against the raw tokens.
        /// </param>
        /// <returns>
        /// log((N + 1) / (df + 1)), where N is <c>documentos.Count</c> and df is the number of
        /// documents containing the term.
        /// </returns>
        public double IDF(string palavra)
        {
            IStemmer  stemmer = new EnglishStemmer();
            Tokenizer TK      = new Tokenizer();
            // Starts at 1 so a term that occurs nowhere never divides by zero.
            int       count   = 1;

            foreach (string s in documentos)
            {
                bool found;

                if (stemm)
                {
                    // The previous code stemmed the whole document string into an unused
                    // local and then searched the raw tokens; stem each token instead.
                    found = false;
                    foreach (var token in TK.Tokenize(s))
                    {
                        if (stemmer.Stem(token) == palavra)
                        {
                            found = true;
                            break;
                        }
                    }
                }
                else
                {
                    found = TK.Tokenize(s).Contains(palavra);
                }

                if (found)
                {
                    count++;
                }
            }

            // Parenthesized: without the parentheses the division bound tighter than the
            // addition and the result was log(N + 1/count).
            return(Math.Log((documentos.Count + 1) / Convert.ToDouble(count)));
        }
예제 #4
0
        /// <summary>
        /// Builds the set of word shingles of a text: the text is split on <c>Delims</c>, every
        /// word is stemmed, and each run of <paramref name="shingleLength"/> consecutive stems
        /// is concatenated (without a separator) into one shingle.
        /// </summary>
        /// <param name="source">Text to shingle; it is only read, never reassigned.</param>
        /// <param name="shingleLength">Number of consecutive words per shingle.</param>
        /// <returns>
        /// The distinct shingles. Empty when the text has fewer words than
        /// <paramref name="shingleLength"/> or when <paramref name="shingleLength"/> is not positive.
        /// </returns>
        private static HashSet <string> GetShingles(ref string source, int shingleLength)
        {
            var shingles = new HashSet <string>();

            // A non-positive length has no meaningful shingles (and used to index out of range).
            if (shingleLength <= 0)
            {
                return(shingles);
            }

            var stemmer = new EnglishStemmer();
            var words   = source.Split(Delims.ToCharArray(), StringSplitOptions.RemoveEmptyEntries);

            for (var index = 0; index < words.Length; index++)
            {
                words[index] = stemmer.Stem(words[index]);
            }

            // N words give N - k + 1 shingles. The window starting at offset 0 is included;
            // the previous sliding-window code built it but never added it to the set.
            for (var start = 0; start + shingleLength <= words.Length; start++)
            {
                shingles.Add(string.Join(string.Empty, words, start, shingleLength));
            }
            return(shingles);
        }
        /// <summary>
        /// Stems a fixed set of four related English words and returns the stems in the same order.
        /// </summary>
        /// <returns>The stems of "computing", "computer", "compute" and "computation".</returns>
        public List <string> StemmerTest()
        {
            var stemmer = new EnglishStemmer();
            var stems   = new List <string>();

            foreach (var sample in new string[] { "computing", "computer", "compute", "computation" })
            {
                stems.Add(stemmer.Stem(sample));
            }
            return(stems);
        }
예제 #6
0
        /// <summary>
        /// Stems the given terms, drops stems of one character or less, and returns the
        /// <paramref name="num"/> most frequent stems.
        /// </summary>
        /// <param name="terms">Raw terms; each is trimmed before stemming. Null entries are skipped.</param>
        /// <param name="num">Maximum number of stems to return.</param>
        /// <returns>
        /// A map from stem to its occurrence count multiplied by 1000, or null when
        /// <paramref name="terms"/> is null or empty or <paramref name="num"/> is not positive.
        /// </returns>
        private Dictionary <string, int> refineAndGroupTerms(List <string> terms, int num)
        {
            // List<T> exposes Count, not Length.
            if (terms == null || terms.Count == 0 || num <= 0)
            {
                return(null);
            }

            // One stemmer is enough; it was previously allocated once per term.
            var           stemmer = new EnglishStemmer();
            List <String> retList = new List <string>();

            foreach (string t in terms)
            {
                if (t == null)
                {
                    continue;
                }

                String refinedTerm = stemmer.Stem(t.Trim());
                if (refinedTerm.Length > 1)
                {
                    retList.Add(refinedTerm);
                }
            }

            // GroupBy keeps first-occurrence order and OrderByDescending is stable, so ties
            // keep the order in which the stems first appeared.
            return(retList.GroupBy(o => o)
                   .OrderByDescending(grp => grp.Count())
                   .Take(num)
                   .ToDictionary(r => r.Key, r => r.Count() * 1000));
        }
예제 #7
0
 /// <summary>
 /// Copies the stemmer state of <paramref name="other"/> into this instance.
 /// </summary>
 /// <param name="other">Stemmer whose state is copied.</param>
 private void copy_from(EnglishStemmer other)
 {
     B_Y_found = other.B_Y_found;
     I_p2      = other.I_p2;
     I_p1      = other.I_p1;
     // Delegate to the base class; an unqualified call resolves to this very overload
     // and recurses until the stack overflows.
     base.copy_from(other);
 }
예제 #8
0
        /// <summary>
        /// Passes the words to a new English stemmer's <c>GetSteamWords</c> and returns its result.
        /// </summary>
        /// <param name="input">Words to stem.</param>
        /// <returns>The array produced by the stemmer.</returns>
        private string[] StemmWords(string[] input)
        {
            IStemmer englishStemmer = new EnglishStemmer();

            return(englishStemmer.GetSteamWords(input));
        }
        /// <summary>
        /// Computes TF * IDF for every term of the given word list and shows the terms in
        /// <c>listBox1</c>, highest weight first.
        /// </summary>
        /// <param name="wordsLimpa">Words passed to <c>TermFreq</c> to obtain term frequencies.</param>
        private void calculoTFIDF(List <string> wordsLimpa)
        {
            IStemmer stemmer = new EnglishStemmer();
            Dictionary <string, double> frequencias = TermFreq(wordsLimpa);
            var pesos = new Dictionary <string, double>();

            foreach (var par in frequencias)
            {
                // With stemming enabled the IDF lookup uses the stem; the result is still
                // keyed by the original term.
                string termo = stemm ? stemmer.Stem(par.Key) : par.Key;
                pesos.Add(par.Key, par.Value * IDF(termo));
            }

            listBox1.Items.Clear();

            var ordenados = pesos.OrderByDescending(r => r.Value);
            foreach (var item in ordenados)
            {
                listBox1.Items.Add(item.Key + " : " + item.Value);
            }
        }
 /// <summary>
 /// Creates the service and stores the supplied collaborators in its private fields.
 /// </summary>
 /// <param name="cr">Comment repository.</param>
 /// <param name="es">English stemmer.</param>
 /// <param name="wr">Word repository.</param>
 /// <param name="rr">Resource repository.</param>
 public CommentService(CommentRepository cr, EnglishStemmer es, WordRepository wr, ResourceRepository rr)
 {
     // Repositories
     _commentRepo = cr;
     _wRepo       = wr;
     _rRepo       = rr;

     // Text processing
     _stemmer     = es;
 }
예제 #11
0
 /// <summary>
 /// Copies the stemmer state of <paramref name="other"/> into this instance.
 /// </summary>
 /// <param name="other">Stemmer whose state is copied.</param>
 private void copy_from(EnglishStemmer other)
 {
     _bYFound = other._bYFound;
     _p2      = other._p2;
     _p1      = other._p1;
     // Delegate to the base class; an unqualified call resolves to this very overload
     // and recurses until the stack overflows.
     base.copy_from(other);
 }
예제 #12
0
        /// <summary>
        /// Stems the words of every document and returns, per document, the distinct stems in
        /// order of first appearance.
        /// </summary>
        /// <param name="doc_words">One word list per document; null or empty words are skipped.</param>
        /// <returns>One list of distinct stems per input document, in the same order.</returns>
        public static List <List <string> > EnglishStemming(List <List <string> > doc_words)
        {
            EnglishStemmer        stemmer   = new EnglishStemmer();
            List <List <string> > stem_docs = new List <List <string> >();

            foreach (List <string> doc in doc_words)
            {
                List <string>    stemmed_words = new List <string>();
                HashSet <string> seen          = new HashSet <string>();

                foreach (string word in doc)
                {
                    if (string.IsNullOrEmpty(word))
                    {
                        continue;
                    }

                    var stem_word = stemmer.Stem(word);

                    // De-duplicate on the stem. The previous check looked up the raw word in
                    // a list of stems, so different words with the same stem were added twice.
                    if (seen.Add(stem_word))
                    {
                        stemmed_words.Add(stem_word);
                    }
                }

                stem_docs.Add(stemmed_words);
            }
            return(stem_docs);
        }
예제 #13
0
        /// <summary>
        /// Tokenizes a single document and returns its vocabulary: the distinct, non-empty
        /// stems of all tokens that are not stop words, in order of first appearance.
        /// </summary>
        /// <param name="doc">Text of the document.</param>
        /// <returns>Vocabulary (list of distinct stems).</returns>
        public List <string> GetVocabulary(string doc)
        {
            List <string>    vocabulary = new List <string>();
            HashSet <string> seen       = new HashSet <string>();
            EnglishStemmer   stemmer    = new EnglishStemmer();

            foreach (string part in Tokenize(doc))
            {
                // Strip non-alphanumeric characters.
                string stripped = Regex.Replace(part, "[^a-zA-Z0-9]", "");

                // Invariant lower-casing: the token is ASCII-only at this point, and the
                // lookup should not depend on the current culture.
                if (StopWords.stopWordsList.Contains(stripped.ToLowerInvariant()))
                {
                    continue;
                }

                try
                {
                    string stem = stemmer.GetSteamWord(stripped);

                    if (stem.Length > 0 && seen.Add(stem))
                    {
                        vocabulary.Add(stem);
                    }
                }
                catch
                {
                    // Best effort, as before: a token the stemmer cannot handle is skipped.
                }
            }
            return(vocabulary);
        }
예제 #14
0
        /// <summary>
        /// Checks the English stemmer against a fixed table of (expected stem, input word) pairs.
        /// </summary>
        public void English_BaseTest()
        {
            EnglishStemmer stemmer = new EnglishStemmer();

            // Column 0: expected stem, column 1: word handed to the stemmer.
            string[,] cases =
            {
                { "do", "doing" },
                { "andes", "andes" },
                { "coincidenti", "coincidential" },
                { "ration", "rationalism" },
                { "caress", "caresses" },
                { "fli", "flies" },
                { "die", "dies" },
                { "mule", "mules" },
                { "deni", "denied" },
                { "die", "died" },
                { "agre", "agreed" },
                { "own", "owned" },
                { "humbl", "humbled" },
                { "size", "sized" },
                { "meet", "meeting" },
                { "state", "stating" },
                { "siez", "siezing" },
                { "item", "itemization" },
                { "sensat", "sensational" },
                { "tradit", "traditional" },
                { "refer", "reference" },
                { "colon", "colonizer" },
                { "plot", "plotted" }
            };

            for (int row = 0; row < cases.GetLength(0); row++)
            {
                Assert.AreEqual(cases[row, 0], stemmer.Stem(cases[row, 1]));
            }
        }
        /// <summary>
        /// Stems a single word with a new English stemmer.
        /// </summary>
        /// <param name="word">Word to stem.</param>
        /// <returns>The stem returned by the stemmer.</returns>
        public static string Stemming(string word)
        {
            StemmerBase stemmer = new EnglishStemmer();

            return(stemmer.Stem(word));
        }
예제 #16
0
 /// <summary>
 /// Copies the state of <paramref name="other"/> into this instance: first this class's
 /// own fields, then whatever the base implementation copies.
 /// </summary>
 /// <param name="other">Stemmer whose state is copied.</param>
 protected internal virtual void copy_from(EnglishStemmer other)
 {
     // Region markers
     I_p1 = other.I_p1;
     I_p2 = other.I_p2;

     // Flag
     B_Y_found = other.B_Y_found;

     base.copy_from(other);
 }
예제 #17
0
        /// <summary>
        /// Reads the "Sentiment" column of the "Training" sheet in TrainingData.xls, tokenizes it,
        /// and prints each word next to its stem for the first ten rows, then waits for input.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            const int rowLimit = 10;

            var workbook  = new ExcelReader("TrainingData.xls");
            var sentiment = workbook.GetWorksheet("Training").ToVector <string>("Sentiment");
            var tokenized = sentiment.Tokenize();
            var stemmer   = new EnglishStemmer();

            Console.WriteLine("Loading...");

            var printed = 0;
            foreach (string[] row in tokenized)
            {
                foreach (string word in row)
                {
                    Console.WriteLine($"{word} - {stemmer.Stem(word)}  ");
                }

                // Two blank lines separate consecutive rows.
                Console.WriteLine();
                Console.WriteLine();

                if (++printed == rowLimit)
                {
                    break;
                }
            }
            Console.Read();
        }
예제 #18
0
        /// <summary>
        /// Stems a word by loading it into a new English stemmer, running it, and reading the
        /// stemmer's current value back.
        /// </summary>
        /// <param name="word">Word to stem.</param>
        /// <returns>The stemmer's <c>Current</c> value after <c>Stem()</c> has run.</returns>
        public static string Stem(string word)
        {
            var english = new EnglishStemmer {
                Current = word
            };

            english.Stem();
            return(english.Current);
        }
예제 #19
0
        /// <summary>
        /// Runs the English stemmer over the given text and returns the stemmer's resulting value.
        /// </summary>
        /// <param name="texto">Text handed to the stemmer via <c>SetCurrent</c>.</param>
        /// <returns>The value of <c>GetCurrent()</c> after <c>Stem()</c> has run.</returns>
        private static string AplicarStemming(string texto)
        {
            var english = new EnglishStemmer();

            english.SetCurrent(texto);
            english.Stem();

            string resultado = english.GetCurrent();
            return(resultado);
        }
예제 #20
0
        /// <summary>
        /// Extension method that runs the English stemmer over a query string.
        /// </summary>
        /// <param name="query">Query handed to the stemmer via <c>SetCurrent</c>.</param>
        /// <returns>The value of <c>GetCurrent()</c> after <c>Stem()</c> has run.</returns>
        public static string GetStemmedQuery(this String query)
        {
            var english = new EnglishStemmer();

            english.SetCurrent(query);
            english.Stem();

            string stemmedQuery = english.GetCurrent();
            return(stemmedQuery);
        }
예제 #21
0
        /// <summary>
        /// Passes the query words to a new English stemmer's <c>GetSteamWords</c> and returns its result.
        /// </summary>
        /// <param name="input">Query words to stem.</param>
        /// <returns>The array produced by the stemmer.</returns>
        public static string[] QueryStemmer(string[] input)
        {
            Stemmer english = new EnglishStemmer();

            return(english.GetSteamWords(input));
        }
예제 #22
0
        /// <summary>
        /// Splits a document into keywords, stems them, and returns the stems that occur at
        /// least <paramref name="vocabularyThreshold"/> times.
        /// </summary>
        /// <param name="doc">Text of the document.</param>
        /// <param name="stemmedDoc">Receives every non-empty stem of the document, in order, duplicates included.</param>
        /// <param name="vocabularyThreshold">Minimum number of occurrences for a stem to enter the vocabulary.</param>
        /// <returns>The stems whose occurrence count reaches the threshold.</returns>
        public static List <string> GetVocabulary(string doc, out List <string> stemmedDoc, int vocabularyThreshold)
        {
            Dictionary <string, int> wordCountList = new Dictionary <string, int>();

            stemmedDoc = new List <string>();

            var stemmer = new EnglishStemmer();

            foreach (string part in GenerateKeywordsList(doc))
            {
                // Strip non-alphanumeric characters.
                string stripped = Regex.Replace(part, "[^a-zA-Z0-9]", "");

                try
                {
                    var stem = stemmer.Stem(stripped);

                    if (stem.Length > 0)
                    {
                        // Count the first occurrence as 1. It used to be stored as 0, which
                        // made every count one too low for the threshold comparison below.
                        int occurrences;
                        wordCountList.TryGetValue(stem, out occurrences);
                        wordCountList[stem] = occurrences + 1;

                        stemmedDoc.Add(stem);
                    }
                }
                catch
                {
                    // Best effort, as before: a keyword the stemmer cannot handle is skipped.
                }
            }

            // Keep the words that occur often enough.
            List <string> vocabulary = new List <string>();

            foreach (var item in wordCountList)
            {
                if (item.Value >= vocabularyThreshold)
                {
                    vocabulary.Add(item.Key);
                }
            }

            return(vocabulary);
        }
예제 #23
0
        /// <summary>
        /// Collects the words of the given sentences, grouping words that share the same English
        /// stem into one <c>WordItem</c>.
        /// </summary>
        /// <param name="sentences">Sentences to analyze; a word's sentence index is its position in this array.</param>
        /// <returns>The word items, ordered alphabetically by lower-cased word.</returns>
        private OutputWords GetWordsFromSentences(string[] sentences)
        {
            //For this case, we initialize an English Stemmer, but there are other languages stemmers too.
            EnglishStemmer stemmer = new EnglishStemmer();

            OutputWords SampleOutput = new OutputWords();

            SampleOutput.Results = new List <WordItem>();

            // Stem of each stored item's word -> item, so the result list is not re-stemmed
            // and re-scanned for every token.
            var itemsByStem = new Dictionary <string, WordItem>();

            // Index loop instead of Array.IndexOf: IndexOf returns the first match, so a
            // repeated sentence was attributed to its first occurrence.
            for (int index = 0; index < sentences.Length; index++)
            {
                string sentence = sentences[index];

                //Get all the words from the sentence, deleting punctuation marks.
                var punctuation = sentence.Where(Char.IsPunctuation).Distinct().ToArray();
                var words       = sentence.Split().Select(x => x.Trim(punctuation));

                foreach (string word in words)
                {
                    //Skip blanks and the words we don't want in the analysis or the result set.
                    if (String.IsNullOrWhiteSpace(word) || StopWordsHelper.isStopword(word))
                    {
                        continue;
                    }

                    string   stemmedWord = stemmer.Stem(word);
                    WordItem existing;

                    if (itemsByStem.TryGetValue(stemmedWord, out existing))
                    {
                        //Count the occurrence and record the sentence index once per sentence.
                        existing.TotalOcurrences++;
                        if (existing.SentenceIndexes.LastOrDefault() != index)
                        {
                            existing.SentenceIndexes.Add(index);
                        }
                    }
                    else
                    {
                        //Create a new word Item and add it to the results list.
                        WordItem item = new WordItem(word, 1, index);

                        SampleOutput.Results.Add(item);

                        // Keyed by the stem of the stored word, which is what the previous
                        // lookup compared against.
                        itemsByStem[stemmer.Stem(item.Word)] = item;
                    }
                }
            }

            //Orders alphabetically the results list word items.
            SampleOutput.Results = SampleOutput.Results.OrderBy(x => x.Word.ToLower()).ToList();

            return(SampleOutput);
        }
예제 #24
0
파일: Bot.cs 프로젝트: ayazzali/PodCastBot
        /// <summary>
        /// Stems the text with both the Russian and the English stemmer and returns the shorter
        /// result, i.e. the one from which more was removed. On equal length the Russian result wins.
        /// </summary>
        /// <param name="text">Text to stem.</param>
        /// <returns>The shorter of the two stems.</returns>
        string StemByRuEn(string text)
        {
            var russian = new RussianStemmer().Stem(text);
            var english = new EnglishStemmer().Stem(text);

            return(russian.Length > english.Length ? english : russian);
        }
        /*****************************************************/
        /***************    PRIVATE METHODS    ***************/
        /*****************************************************/

        /// <summary>
        /// Normalizes a word: trims surrounding whitespace, lower-cases it, and removes every
        /// trailing character that is one of , : ; . ! ? or the letter 's'. Optionally stems
        /// the result with the English stemmer.
        /// </summary>
        /// <param name="word">The word to be simplified</param>
        /// <param name="stemWord">If true, the trimmed word is additionally stemmed</param>
        /// <returns>The trimmed (and, if requested, stemmed) word</returns>
        private string SimplifyWord(string word, bool stemWord)
        {
            // TrimEnd removes all trailing characters from this set, so several trailing
            // 's' characters (e.g. a double 's') are all removed.
            char[] trailing = { ',', ':', ';', '.', '!', '?', 's' };

            string simplified = word.Trim().ToLower().TrimEnd(trailing);

            return(stemWord ? new EnglishStemmer().Stem(simplified) : simplified);
        }
        /// <summary>
        /// Stems every word of the list with one English stemmer.
        /// </summary>
        /// <param name="words">Words to stem.</param>
        /// <returns>A new list holding the stem of each input word, in the same order.</returns>
        public static List <string> StemmWords(List <string> words)
        {
            StemmerBase englishStemmer = new EnglishStemmer();

            return(words.ConvertAll(word => englishStemmer.Stem(word)));
        }
        /// <summary>
        /// Creates a document from a title (typically a file path) and its text. Runs of letters
        /// and hyphens are extracted from the content, entries of <c>stop_words</c> are removed,
        /// and the remaining distinct tokens are sorted, lower-cased and stemmed into <c>ListWorld</c>.
        /// </summary>
        /// <param name="title">
        /// Title or path. It is split on '\' and '.', and the second-to-last segment becomes
        /// <c>Title</c>; when there are fewer than two segments the whole string is used.
        /// </param>
        /// <param name="content">Raw text of the document.</param>
        public document(string title, string content)
        {
            var line = title.Split(new string[] { @"\", "." }, StringSplitOptions.None);

            // A title without any '\' or '.' yields a single segment; indexing Length - 2
            // would then be -1, so fall back to the title itself.
            Title   = line.Length >= 2 ? line[line.Length - 2] : title;
            Content = content;
            IStemmer     stemmer         = new EnglishStemmer();
            const string regex           = @"[A-Za-z\-]+";
            var          valueEnumerable = Regex.Matches(content, regex);

            // Stop words are compared against the tokens in their original case.
            ListWorld = valueEnumerable.Cast <Match>().Select(match => match.Value).ToList().Except(stop_words).OrderBy(a => a);
            ListWorld = ListWorld.ToList().ConvertAll(d => stemmer.Stem(d.ToLower()));
        }
예제 #28
0
        /// <summary>
        /// Initializes the static analysis state: the aggregate analyzer, the word stemmer, the
        /// triplet service, the POS tag values and the list of low-value nouns.
        /// </summary>
        /// <param name="sent">First entry added to the aggregate analyzer.</param>
        /// <param name="token">Second entry added to the aggregate analyzer.</param>
        /// <param name="pos">Third entry added to the aggregate analyzer.</param>
        /// <param name="chunker">Fourth entry added to the aggregate analyzer.</param>
        /// <param name="tags">Path of a file whose first line is parsed as a JSON object.</param>
        /// <param name="nouns">Path of a file whose lines become <c>LowValueNouns</c>.</param>
        public static void LoadAnalyzationData(string sent, string token, string pos, string chunker, string tags, string nouns)
        {
            // Fill a local first so the field is only assigned once all entries are added.
            var aggregate = new AggregateAnalyzer();
            aggregate.Add(sent);
            aggregate.Add(token);
            aggregate.Add(pos);
            aggregate.Add(chunker);
            analyzer = aggregate;

            wordStemmer    = new EnglishStemmer();
            tripletService = new TripletService(new ReplyTripletService(), new QuestionTripletService());

            // Only the first line of the tags file is parsed.
            string[] tagLines = File.ReadAllLines(tags);
            POSTagValues = JObject.Parse(tagLines[0]);

            string[] nounLines = File.ReadAllLines(nouns);
            LowValueNouns = new List <string>(nounLines);
        }
        /// <summary>
        /// Appends a keyword-position characteristic to every article and then normalizes it
        /// with <c>CalcMinMaxNormalization</c>. The raw score is 100 minus the index of the first
        /// exact occurrence of the keyword in <c>article.Words</c>, and 0 when that index is 100 or more.
        /// </summary>
        /// <param name="keyword">Keyword compared for equality with each entry of <c>article.Words</c>.</param>
        /// <param name="articles">Articles to score; returned unchanged when null or empty.</param>
        /// <returns>The same list instance, with one more characteristic value per article.</returns>
        public List <Article> CheckKeywordPosition(string keyword, List <Article> articles)
        {
            // Nothing to score (indexing articles[0] used to throw on an empty list).
            if (articles == null || articles.Count == 0)
            {
                return(articles);
            }

            // NOTE(review): the previous code stemmed the keyword into a local that was never
            // used. If article.Words holds stemmed tokens the comparison below should use the
            // stem - confirm against the caller.
            int max = 100;
            int min = 1000;

            foreach (Article article in articles)
            {
                // NOTE(review): an article that does not contain the keyword keeps this initial
                // value, i.e. the same score as a keyword at index 0 - confirm this is intended.
                int position = 100;

                for (int i = 0; i < article.Words.Count; i++)
                {
                    if (keyword == article.Words[i])
                    {
                        position = i >= 100 ? 0 : 100 - i;
                        break;
                    }
                }

                article.AllCharacteristicValues.Add(position);

                if (min > position)
                {
                    min = position;
                }
            }

            foreach (Article article in articles)
            {
                // Use each article's own last index rather than the first article's count.
                int    last     = article.AllCharacteristicValues.Count - 1;
                double oldValue = article.AllCharacteristicValues[last];

                article.AllCharacteristicValues[last] = CalcMinMaxNormalization(oldValue, max, min);
            }

            return(articles);
        }
예제 #30
0
파일: Tokenizer.cs 프로젝트: Lyngse/P7
        /// <summary>
        /// Stems the entries of a string array in place with the English Snowball stemmer.
        /// An entry is replaced only when the stemmer's <c>Stem()</c> call returns true.
        /// </summary>
        /// <param name="stringArray">Words to stem; the array itself is modified.</param>
        /// <returns>The same array instance that was passed in.</returns>
        private string[] StringStemmer(string[] stringArray)
        {
            var snowball = new EnglishStemmer();
            int position = 0;

            while (position < stringArray.Length)
            {
                snowball.SetCurrent(stringArray[position]);

                bool succeeded = snowball.Stem();
                if (succeeded)
                {
                    stringArray[position] = snowball.GetCurrent();
                }

                position++;
            }

            return(stringArray);
        }
예제 #31
0
 /// <summary>
 /// Copies the state of <paramref name="other"/> into this instance: first this class's
 /// own fields, then whatever the base implementation copies.
 /// </summary>
 /// <param name="other">Stemmer whose state is copied.</param>
 private void copy_from(EnglishStemmer other)
 {
     // Region markers
     I_p1 = other.I_p1;
     I_p2 = other.I_p2;

     // Flag
     B_Y_found = other.B_Y_found;

     base.copy_from(other);
 }