Example #1
0
        /// <summary>
        /// Counts how often each distinct stemmed, lower-cased word occurs in a string.
        /// </summary>
        /// <remarks>
        /// Every character outside <c>[a-zA-Z0-9]</c> is replaced by a space before the text is tokenised.
        /// Each token is lower-cased with the invariant culture and passed through the Porter stemmer; the
        /// resulting words are then dropped when they are keys of <c>StopWords</c> or parse as a number.
        /// </remarks>
        /// <param name="str">The text to analyse.</param>
        /// <returns>
        /// A dictionary mapping each remaining stemmed word to its number of occurrences; an empty
        /// dictionary when <paramref name="str"/> is null or empty.
        /// </returns>
        public static Dictionary <string, double> GetWordCount(this string str)
        {
            if (String.IsNullOrEmpty(str))
            {
                return(new Dictionary <string, double>());
            }

            var stemmer = new PorterStemmerAlgorithm.PorterStemmer();
            // Receives the parsed value from Double.TryParse; only the success flag is used.
            Double num;

            // The static Regex helpers reuse the framework's pattern cache instead of building
            // two new Regex instances on every call.
            str = Regex.Replace(str, "[^a-zA-Z0-9]", " ");

            return(Regex.Matches(str, @"\w(?<!\d)[\w'-]*")
                   .Cast <Match>()
                   // Invariant lower-casing keeps the stemmer input and stop-word lookup independent
                   // of the current culture (e.g. 'I' must become 'i' under a Turkish locale too).
                   .Select(m => stemmer.stemTerm(m.Value.ToLowerInvariant()))
                   .GroupBy(p => p)
                   .Select(g => new { Word = g.Key, Count = g.Count() })
                   // Filter before sorting so discarded words are never ordered.
                   .Where(p => !StopWords.ContainsKey(p.Word) && !Double.TryParse(p.Word, out num))
                   .OrderBy(p => p.Word)
                   .ToDictionary(p => p.Word, p => Convert.ToDouble(p.Count)));
        }
Example #2
0
        /// <summary>
        /// Downloads the page at <paramref name="gelenUrl"/>, runs the word / frequency / weighting /
        /// keyword steps over it and then redirects to the <c>Index</c> action of the <c>Asama3</c>
        /// controller, passing the same URL along.
        /// </summary>
        /// <param name="gelenUrl">Absolute URL of the page to analyse; <c>new Uri</c> throws if it is null or malformed.</param>
        /// <returns>A redirect result. The view model built here is not rendered by this action.</returns>
        public ActionResult Index(string gelenUrl)
        {
            string url       = gelenUrl;
            Uri    urlDomain = new Uri(url);

            string downloadString;

            // WebClient is IDisposable; release it as soon as the download has finished.
            using (WebClient client = new WebClient())
            {
                // Decode the response as UTF-8 directly. Re-encoding an already decoded string with
                // Encoding.Default and reading those bytes back as UTF-8 corrupts non-ASCII text
                // whenever the first decoding step did not use the ANSI code page.
                client.Encoding = Encoding.UTF8;
                downloadString  = client.DownloadString(url);
            }

            var htmlDoc = new HtmlDocument();

            htmlDoc.LoadHtml(downloadString);

            // NOTE(review): the English stop-word list is fetched but never used in this action; the
            // call is kept in case GetStopWords has side effects -- confirm and remove if it has none.
            var stopWords = StopWords.GetStopWords("en");

            // Extract the words and the sentence count from the parsed document.
            HtmlIsleyici htmlIsleyici1 = new HtmlIsleyici();

            htmlIsleyici1.htmlIsle(htmlDoc);
            List <string> kelimeler   = htmlIsleyici1.kelimeler;
            int           cumleSayisi = htmlIsleyici1.cumleSayisi;

            KelimeDuzeltici kelimeDuzeltici1 = new KelimeDuzeltici();

            kelimeler = kelimeDuzeltici1.kelimeDuzelt(kelimeler, urlDomain);

            KelimeFrekansYapici kelimeFrekansYapici1 = new KelimeFrekansYapici();
            List <WordAndFreq>  kelimeFrekans        = kelimeFrekansYapici1.KelimeFrekansYap(kelimeler);

            // NOTE(review): constructed but never used below -- confirm whether it can be dropped.
            TfIdfCalculator agirlikHesap = new TfIdfCalculator();

            AgirlikliKelimeListesi agirlikliKelimeListesi1 = new AgirlikliKelimeListesi();
            List <WordAndWeight>   weihtedKelimeler        = agirlikliKelimeListesi1.AgirlikliListeYap(kelimeFrekans, kelimeler.Count, cumleSayisi);

            AnahtarKelimeBelirleyici anahtarKelimeBelirleyici1 = new AnahtarKelimeBelirleyici();
            List <WordAndFreq>       anahtarKelimeler          = anahtarKelimeBelirleyici1.AnahtarKelimeBelirle(weihtedKelimeler, kelimeFrekans);

            Asama2ViewModel asama2ViewModel = new Asama2ViewModel();

            asama2ViewModel.KeywordListesi = anahtarKelimeler;

            //return View(asama2ViewModel);
            return(RedirectToAction("Index", "Asama3", new { gelenUrl = url }));
        }
Example #3
0
        /// <summary>
        /// Computes a weight for every retained term of <paramref name="text"/>: its share of the
        /// retained term count multiplied by its IDF value.
        /// </summary>
        /// <param name="text">The text to segment.</param>
        /// <param name="allowPos">
        /// When non-empty, the text is cut with <c>FilterCutByPos</c> using these values; otherwise
        /// <c>Segmenter.Cut</c> is used.
        /// </param>
        /// <returns>
        /// A dictionary from term to weight. Terms missing from <c>IdfFreq</c> are weighted with
        /// <c>MedianIdf</c>.
        /// </returns>
        private IDictionary <string, double> GetWordIfidf(string text, IEnumerable <string> allowPos)
        {
            IEnumerable <string> terms;

            if (allowPos.IsNotEmpty())
            {
                terms = FilterCutByPos(text, allowPos);
            }
            else
            {
                terms = Segmenter.Cut(text);
            }

            // Term frequency: count every term that is at least two characters long (after trimming)
            // and is not a stop word.
            var weights = new Dictionary <string, double>();

            foreach (var term in terms)
            {
                bool tooShort = string.IsNullOrEmpty(term) || term.Trim().Length < 2;

                if (tooShort || StopWords.Contains(term.ToLower()))
                {
                    continue;
                }
                weights[term] = weights.GetDefault(term, 0.0) + 1.0;
            }

            var totalCount = weights.Values.Sum();

            // Snapshot the keys because the dictionary values are rewritten inside the loop.
            foreach (var term in weights.Keys.ToList())
            {
                weights[term] *= IdfFreq.GetDefault(term, MedianIdf) / totalCount;
            }

            return(weights);
        }
Example #4
0
        /// <summary>
        /// Entry point: processes the first command-line argument (or an empty string when none is
        /// given), splits it, removes black-listed words and prints the total word count, the unique
        /// word count and an average figure.
        /// </summary>
        /// <param name="args">Command-line arguments; only the first one is read.</param>
        static void Main(string[] args)
        {
            var stopList = StopWords.ReadBlackListWords();

            // Fall back to an empty argument when the program is started without one.
            string firstArgument = args.Length == 0 ? "" : args[0];

            var input      = UIInput.ProcessInput(firstArgument);
            var rawWords   = Splitter.SplitString(input);
            var keptWords  = StopWords.ReturnCleansedString(stopList, rawWords);
            var totalCount = CountStrings.StringsCountFromList(keptWords);
            var uniqueCount = CountStrings.UniqueStringsCountFromList(keptWords);
            var average    = CountStrings.AverageNumberOfDigitsPerWord(keptWords, totalCount);

            // Only for test purposes: exercises the delegate-based overload; the result is not printed.
            var delegateAverage = CountStrings.ReturnAverage(CountStrings.FuncPointer, keptWords, totalCount);

            // The average is always formatted with the en-GB culture, whatever the current culture is.
            string averageText = average.ToString(CultureInfo.GetCultureInfo("en-GB"));
            string summary     = "number of words: " + totalCount + ", unique: " + uniqueCount + ", average word length: " + averageText;

            Console.WriteLine(summary);

#if DEBUG
            Console.Read();
#endif
        }
        /// <summary>
        /// Counts the occurrences of every word in <paramref name="text"/> that is not a stop word.
        /// </summary>
        /// <param name="text">Raw input; it is passed through <c>GetPlainText</c> before matching.</param>
        /// <returns>A dictionary from lower-cased word to the number of times it was matched.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> is null or empty.</exception>
        public Dictionary <string, int> GetWordOccurancesFromText(string text)
        {
            if (string.IsNullOrEmpty(text))
            {
                throw new ArgumentNullException(nameof(text));
            }

            text = GetPlainText(text);

            var result = new Dictionary <string, int>();

            foreach (Match word in Regex.Matches(text, WordRegexPattern))
            {
                // Invariant lower-casing keeps the dictionary keys independent of the current culture.
                var key = word.Value.ToLowerInvariant();

                if (StopWords.Contains(key, StringComparer.OrdinalIgnoreCase))
                {
                    continue;
                }

                // One lookup instead of ContainsKey + indexer; count stays 0 for a new key.
                int count;
                result.TryGetValue(key, out count);
                result[key] = count + 1;
            }
            return(result);
        }
Example #6
0
        /// <summary>
        /// Normalises the raw analysis words and stores the result in <c>StrippedWords</c>.
        /// </summary>
        /// <remarks>
        /// Null or empty entries are skipped. For every other entry the matches of <c>Rgx</c> are
        /// removed and the word is lower-cased; it is kept only when it is not contained in
        /// <c>StopWords</c> and is not blank. Matches of <c>Apos</c> are removed from kept words.
        /// </remarks>
        /// <param name="rawAnalysisWords">The words to clean up.</param>
        private void StripAnalysisText(List <string> rawAnalysisWords)
        {
            var kept = new List <string>();

            foreach (string raw in rawAnalysisWords)
            {
                if (string.IsNullOrEmpty(raw))
                {
                    continue;
                }

                string word = (Rgx.IsMatch(raw) ? Rgx.Replace(raw, "") : raw).ToLower();

                if (StopWords.Contains(word.ToLower()) || string.IsNullOrWhiteSpace(word))
                {
                    continue;
                }

                kept.Add(Apos.IsMatch(word) ? Apos.Replace(word, "").ToLower() : word);
            }

            StrippedWords = kept;
        }
Example #7
0
        /// <summary>
        /// Tokenizes a sentence and, depending on <c>RemoveStopWords</c> and <c>StemWords</c>,
        /// drops stop words and/or stems the remaining tokens.
        /// </summary>
        /// <param name="sentence">The sentence to tokenize.</param>
        /// <returns>
        /// The processed tokens. When neither option is enabled the tokens are returned exactly as
        /// produced by <c>Tokenize</c>.
        /// </returns>
        public string[] Process(string sentence)
        {
            var tokens = sentence.Tokenize();

            if (!(RemoveStopWords || StemWords))
            {
                return(tokens);
            }

            var kept = new List <string>();

            foreach (string token in tokens)
            {
                // Stop words are matched case-insensitively using the invariant culture.
                if (RemoveStopWords &&
                    StopWords.Any(stopWord => string.Equals(token, stopWord, StringComparison.InvariantCultureIgnoreCase)))
                {
                    continue;
                }

                kept.Add(StemWords ? _stemmer.Stem(token) : token);
            }

            return(kept.ToArray());
        }
        /// <summary>
        /// Entry point: loads the stop words and the test collection, builds the index and prints
        /// each index word followed by its first five document entries, then waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Number of document columns printed per index word.
            const int columnasMostradas = 5;

            StopWords palabrasVacias = new StopWords("StopWords.txt");

            Documentos documentos = new Documentos("TestCollection.txt");
            Indice     indice     = new Indice(StopWords.Palabras, documentos.TotalDocumentos, "TestCollection.txt");

            for (int fila = 0; fila < indice.PalabrasIndice.Length; fila++)
            {
                Console.Write(indice.PalabrasIndice[fila] + " = \t");
                for (int columna = 0; columna < columnasMostradas; columna++)
                {
                    Console.Write("|" + indice.DocsIndice[fila, columna] + "|");
                }
                Console.WriteLine();
            }

            // NOTE(review): the query thread is created but never started in this method.
            Thread hiloConsulta = new Thread(new ParameterizedThreadStart(SRI.Consultar));

            Console.ReadKey();
        }
Example #9
0
 /// <summary>
 /// Determines whether the given token is a stop word in the given language.
 /// </summary>
 /// <param name="token">The token to check.</param>
 /// <param name="language">The language whose stop-word list is consulted.</param>
 /// <returns>
 /// <c>true</c> if the list registered for <paramref name="language"/> reports the token as a stop
 /// word; <c>false</c> if the token is null or empty, no list is registered, or the list rejects it.
 /// </returns>
 public bool IsStopWord(string token, StopWordsLanguage language)
 {
     if (string.IsNullOrEmpty(token))
     {
         return(false);
     }

     return(StopWordsLanguages.TryGetValue(language, out var languageStopWords) &&
            languageStopWords.IsStopWord(token));
 }
 /// <summary>
 /// Initializes the configuration with its defaults: lower-case text, '-' as separator, a maximum
 /// length of 100, an empty character-replacement set, stop-word stripping disabled and a new
 /// stop-word list.
 /// </summary>
 protected DefaultSlugityConfig()
 {
     this.TextCase = TextCase.LowerCase;
     this.StringSeparator = '-';
     this.MaxLength = 100;
     this.ReplacementCharacters = new CharacterReplacement();
     this.StripStopWords = false;
     this.StopWords = new StopWords();
 }
 /// <summary>
 /// Passes ".java" to the base constructor and adds the Java keywords listed below to
 /// <c>StopWords</c>, in declaration order.
 /// </summary>
 public JavaStopWord() : base(".java")
 {
     string[] javaKeywords =
     {
         "abstract", "assert", "boolean", "break", "byte",
         "case", "catch", "char", "class", "const",
         "continue", "default", "do", "double", "else",
         "enum", "extends", "final", "finally", "float",
         "for", "goto", "if", "implements", "import",
         "instanceof", "int", "interface", "long", "native",
         "new", "package", "private", "protected", "public",
         "return", "short", "static", "strictfp", "super",
         "switch", "synchronized", "this", "throw", "throws",
         "transient", "try", "void", "volatile", "while"
     };

     foreach (string keyword in javaKeywords)
     {
         StopWords.Add(keyword);
     }
 }
Example #12
0
 /// <summary>
 /// Marks the stop words among the given tokens using the list registered for a language.
 /// </summary>
 /// <param name="tokens">The tokens to process.</param>
 /// <param name="language">The language whose stop-word list is consulted.</param>
 /// <returns>
 /// An empty array when <paramref name="tokens"/> is null or empty; the unchanged input when no list
 /// is registered for <paramref name="language"/>; otherwise the result of that list's
 /// <c>MarkStopWords</c>.
 /// </returns>
 public Token[] MarkStopWords(Token[] tokens, StopWordsLanguage language)
 {
     if (tokens == null || tokens.Length < 1)
     {
         return(Array.Empty <Token>());
     }

     return(StopWordsLanguages.TryGetValue(language, out var languageStopWords)
            ? languageStopWords.MarkStopWords(tokens)
            : tokens);
 }
Example #13
0
        /// <summary>
        /// Downloads the page at <paramref name="gelenUrl"/>, extracts and corrects its words, builds
        /// the word-frequency list and renders it through an <c>Asama1ViewModel</c>.
        /// </summary>
        /// <param name="gelenUrl">Absolute URL of the page to analyse; <c>new Uri</c> throws if it is null or malformed.</param>
        /// <returns>The view populated with the frequency list.</returns>
        public ActionResult Index(string gelenUrl)
        {
            string url       = gelenUrl;
            Uri    urlDomain = new Uri(url);

            Console.WriteLine("Domain part : " + urlDomain.Host); // prints the host part of the URL

            string downloadString;

            // WebClient is IDisposable; release it as soon as the download has finished.
            using (WebClient client = new WebClient())
            {
                // Decode the response as UTF-8 directly. Re-encoding an already decoded string with
                // Encoding.Default and reading those bytes back as UTF-8 corrupts non-ASCII text
                // whenever the first decoding step did not use the ANSI code page.
                client.Encoding = Encoding.UTF8;
                downloadString  = client.DownloadString(url);
            }

            var htmlDoc = new HtmlDocument();

            htmlDoc.LoadHtml(downloadString);

            // NOTE(review): the English stop-word list is fetched but never used in this action; the
            // call is kept in case GetStopWords has side effects -- confirm and remove if it has none.
            var stopWords = StopWords.GetStopWords("en");

            // Extract the words and the sentence count from the parsed document.
            HtmlIsleyici htmlIsleyici1 = new HtmlIsleyici();

            htmlIsleyici1.htmlIsle(htmlDoc);
            List <string> kelimeler   = htmlIsleyici1.kelimeler;
            // NOTE(review): the sentence count is read but not used further in this action.
            int           cumleSayisi = htmlIsleyici1.cumleSayisi;

            KelimeDuzeltici kelimeDuzeltici1 = new KelimeDuzeltici();

            kelimeler = kelimeDuzeltici1.kelimeDuzelt(kelimeler, urlDomain);

            KelimeFrekansYapici kelimeFrekansYapici1 = new KelimeFrekansYapici();
            List <WordAndFreq>  kelimeFrekans        = kelimeFrekansYapici1.KelimeFrekansYap(kelimeler);

            Asama1ViewModel asama1ViewModel = new Asama1ViewModel();

            asama1ViewModel.FrekansListesi = kelimeFrekans;

            return(View(asama1ViewModel));
        }
Example #14
0
        /// <summary>
        /// Creates an extractor with a span of 5, a Jieba segmenter with a matching POS segmenter, and
        /// the stop words from <c>ConfigManager.StopWordsFile</c>. If that leaves <c>StopWords</c>
        /// empty, <c>DefaultStopWords</c> is merged in.
        /// </summary>
        public TextRankExtractor()
        {
            Span = 5;

            Segmenter = new JiebaSegmenter();
            PosSegmenter = new PosSegmenter(Segmenter);

            SetStopWords(ConfigManager.StopWordsFile);

            if (!StopWords.IsEmpty())
            {
                return;
            }

            // Nothing was loaded from the configured file: fall back to the built-in list.
            StopWords.UnionWith(DefaultStopWords);
        }
        /// <summary>
        /// Entry point: loads the C++ and Java training data, trains the model, runs a prediction for
        /// the test file and prints the predicted language together with the winning value.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Load and pre-process both training files before the model is created.
            Dictionary <string, int> cppData  = loadAndProcessData("files/training_cpp.txt");
            Dictionary <string, int> javaData = loadAndProcessData("files/training_java.txt");

            ModelTrainer trainer = new ModelTrainer();

            foreach (KeyValuePair <string, int> entry in cppData)
            {
                trainer.loadData(LANG_CPP, entry.Value, entry.Key);
            }
            foreach (KeyValuePair <string, int> entry in javaData)
            {
                trainer.loadData(LANG_JAVA, entry.Value, entry.Key);
            }

            trainer.computeRemovingThresholdValue();
            trainer.trainData();

            // Read the test file, tokenize it and strip the stop words.
            string        testFile = "files/test_file.txt";
            string        rawText  = new FileReader(testFile).getAllTxt(true);
            List <string> tokens   = new Token().wordsOnlyTokenizer(rawText);

            tokens = new StopWords().removeWordsBasedOnStopWordsList(tokens);

            Predictor predictor    = new Predictor();
            var       trainedModel = trainer.getDataModelTrained();

            predictor.makePrediction(trainedModel, tokens, LANG_CPP);
            var best = predictor.argmax();

            // Map the winning label to a display name; it stays empty for any other label.
            string languageName = "";

            switch (best.Key)
            {
            case LANG_CPP: languageName = "C++"; break;

            case LANG_JAVA: languageName = "Java"; break;
                // add here more languages
            }

            Console.WriteLine("\nThe model predicts: " + languageName + " as the language of the file(" + testFile + ").\nPrediction procent: " + best.Value);
        }
        /// <summary>
        /// Reads "StopWords.txt" as UTF-8, line by line, and adds every line to <c>StopWords</c>.
        /// </summary>
        /// <returns>A task that completes once the whole file has been read.</returns>
        private async Task LoadWordListAsync()
        {
            const string path = "StopWords.txt";

            using (var reader = new StreamReader(path, System.Text.Encoding.UTF8))
            {
                // ReadLineAsync returns null once the end of the file is reached.
                for (string line = await reader.ReadLineAsync(); line != null; line = await reader.ReadLineAsync())
                {
                    StopWords.Add(line);
                }
            }
        }
Example #17
0
        /// <summary>
        /// Removing "iN" from the stop-word list must let "IN" pass the filter (as "in") while the
        /// other stop words in the input are still dropped.
        /// </summary>
        public void DeleteStopWord_WorksCorrectly()
        {
            // arrange
            var stopWordList = new StopWords();
            var sut          = new StopWordsFilter(stopWordList);
            var words        = new List <string> {
                "Abc", "of", "cba", "IN", "the", "car"
            };
            var expected = new List <string> {
                "abc", "in", "cba", "car"
            };

            // act
            stopWordList.Remove("iN");
            var filtered = sut.Filter(words);

            // assert
            filtered.Should().BeEquivalentTo(expected);
        }
        /// <summary>
        /// Runs the URL stop-word filter pipeline for <paramref name="value"/> and checks that the
        /// populated stop-word result is not null.
        /// </summary>
        /// <param name="value">The source handed to <c>GetDataFromSource</c>.</param>
        public void FilterStopWord_URL(string value)
        {
            // arrange
            StopWords  stopWordsModel = new StopWords();
            ISeoFilter filter         = new StopWordUrlFilterService();

            // act
            string rawResponse = filter.GetDataFromSource(value);
            var    body        = filter.GetResponseBody(rawResponse);
            var    stopCounts  = filter.CalculateStopCount(body);
            var    populated   = stopWordsModel.PopulateStopWords(stopCounts);

            // assert
            Assert.IsNotNull(populated, "Stop Words is available");
        }
Example #19
0
        /// <summary>
        /// Checks that <c>StopWords.RemoveStopwords</c> reduces "This is a test string" to
        /// "test string".
        /// </summary>
        public void TestRemoveStopWords()
        {
            // arrange
            // NOTE(review): the instance is not used afterwards; it is kept in case the constructor
            // initializes state the static RemoveStopwords call relies on.
            StopWords stopWords = new StopWords();

            string input    = "This is a test string";
            string expected = "test string";

            // act
            string actual = StopWords.RemoveStopwords(input);

            Console.WriteLine("Expected: " + expected);
            Console.WriteLine("Result: " + actual);

            // assert
            Assert.AreEqual(expected, actual, "Stop words were successfully removed.");
        }
Example #20
0
        /// <summary>
        /// Builds one posting per distinct lower-cased token of this document, recording its frequency
        /// and the comma-separated 1-based positions at which it occurs.
        /// </summary>
        /// <param name="dbContext">
        /// Optional database context. When not null, unknown words are added to <c>IndexWord</c> and
        /// all postings are added to <c>Posting</c>; this method does not save the context.
        /// </param>
        /// <param name="words">Optional whitelist; when not null only words contained in it are indexed.</param>
        /// <returns>The postings that were built.</returns>
        public IEnumerable <Posting> Index(sqliteContext dbContext, List <string> words)
        {
            var postingsByWord = new Dictionary <string, Posting>();
            int position       = 0;

            foreach (var token in Tokens)
            {
                var word = token.ToString().ToLower();

                // Positions count every token, including the ones skipped below.
                position++;

                bool excluded = StopWords.Contains(word) || (words != null && !words.Contains(word));
                if (excluded)
                {
                    continue;
                }

                if (dbContext?.IndexWord.Find(word) == null)
                {
                    dbContext?.IndexWord.Add(new IndexWord { Word = word });
                }

                Posting posting;
                if (!postingsByWord.TryGetValue(word, out posting))
                {
                    posting = new Posting { Word = word, DocumentName = name, Indexes = "" };
                    postingsByWord.Add(word, posting);
                }

                posting.Frequency++;
                if (posting.Indexes != "")
                {
                    posting.Indexes += ",";
                }
                posting.Indexes += position;
            }

            if (dbContext != null)
            {
                foreach (Posting posting in postingsByWord.Values)
                {
                    dbContext.Posting.Add(posting);
                }
            }
            return(postingsByWord.Values);
        }
Example #21
0
        /// <summary>
        /// Decides whether a commit looks like a bug fix: its message must match
        /// <c>MessageRegExp</c> (case-insensitively) and must not contain any entry of
        /// <c>StopWords</c>.
        /// </summary>
        /// <param name="commit">The commit whose message is inspected.</param>
        /// <returns><c>true</c> if the message matches and contains no stop word; otherwise <c>false</c>.</returns>
        public bool IsBugFix(Commit commit)
        {
            string message = commit.Message;

            if (!Regex.IsMatch(message, MessageRegExp, RegexOptions.IgnoreCase))
            {
                return(false);
            }

            // ">= 0" so that a stop word at the very start of the message (index 0) is detected too.
            // The ordinal, case-insensitive search avoids culture-dependent matching, and empty
            // entries are skipped because IndexOf reports index 0 for an empty search string.
            bool containsStopWord = StopWords.Any(
                x => !string.IsNullOrEmpty(x) && message.IndexOf(x, StringComparison.OrdinalIgnoreCase) >= 0);

            return(!containsStopWord);
        }
Example #22
0
        /// <summary>
        /// Decides whether a term should be left out of "more like" comparisons.
        /// </summary>
        /// <remarks>
        /// A term is noise when it is shorter than <c>MinWordLen</c> or longer than <c>MaxWordLen</c>
        /// (each limit applies only when it is greater than zero), or when <c>StopWords</c> is set and
        /// contains it.
        /// </remarks>
        /// <param name="term">The word being considered.</param>
        /// <returns><c>true</c> if the term should be ignored; <c>false</c> if it should be analysed further.</returns>
        private bool IsNoiseWord(string term)
        {
            int length = term.Length;

            return((MinWordLen > 0 && length < MinWordLen) ||
                   (MaxWordLen > 0 && length > MaxWordLen) ||
                   (StopWords != null && StopWords.Contains(term)));
        }
Example #23
0
        /// <summary>
        /// Creates a TextRank extractor with a span of 5, a new segmenter with a matching POS
        /// segmenter, and the stop words taken from <c>Dict.StopWords</c>. If that set is empty,
        /// <c>DefaultStopWords</c> is merged in.
        /// </summary>
        public TextRankExtractor()
        {
            Span = 5;

            Segmenter = new Segmenter();
            PosSegmenter = new PosSegmenter(Segmenter);

            StopWords = Dict.StopWords;

            if (!StopWords.IsEmpty())
            {
                return;
            }

            // NOTE(review): unless the StopWords setter copies its value, this merges the defaults
            // into the instance obtained from Dict.StopWords -- confirm that is intended.
            StopWords.UnionWith(DefaultStopWords);
        }
 /// <summary>
 /// Passes ".cshtml" to the base constructor and adds the words listed below to <c>StopWords</c>,
 /// in declaration order. Both casings are listed explicitly where they appear.
 /// </summary>
 public CSHTMLStopWord()
     : base(".cshtml")
 {
     string[] markupWords =
     {
         "xml", "Xml",
         "name", "Name",
         "version", "Version",
         "value", "Value",
         "LocaleResource"
     };

     foreach (string word in markupWords)
     {
         StopWords.Add(word);
     }
 }
Example #25
0
        /// <summary>
        /// Creates a TF-IDF extractor: sets up the Jieba segmenter with a matching POS segmenter,
        /// loads the stop words from <c>ConfigManager.StopWordsFile</c> (merging in
        /// <c>DefaultStopWords</c> when that leaves the set empty) and reads the IDF table and median
        /// IDF from an <c>IdfLoader</c> over <c>DefaultIdfFile</c>.
        /// </summary>
        public TfidfExtractor()
        {
            Segmenter = new JiebaSegmenter();
            PosSegmenter = new PosSegmenter(Segmenter);

            SetStopWords(ConfigManager.StopWordsFile);

            bool nothingLoaded = StopWords.IsEmpty();
            if (nothingLoaded)
            {
                StopWords.UnionWith(DefaultStopWords);
            }

            Loader = new IdfLoader(DefaultIdfFile);
            IdfFreq = Loader.IdfFreq;
            MedianIdf = Loader.MedianIdf;
        }
        /// <summary>
        /// Runs the text stop-word filter pipeline for <paramref name="value"/> and checks that the
        /// word "and" is reported with a count of 2.
        /// </summary>
        /// <param name="value">The source handed to <c>GetDataFromSource</c>.</param>
        public void FilterStopWord_Text(string value)
        {
            // arrange
            StopWords  stopWordsModel = new StopWords();
            ISeoFilter filter         = new StopWordTextFilterService();

            // act
            string rawResponse = filter.GetDataFromSource(value);
            var    parsedBody  = filter.GetResponseBody(rawResponse);
            var    stopCounts  = filter.CalculateStopCount(parsedBody);
            var    populated   = stopWordsModel.PopulateStopWords(stopCounts);

            // Reading Body of the first entry fails the test early when the result is empty.
            var firstBody = populated.FirstOrDefault().Body;

            // assert
            Assert.IsTrue(populated.Any(c => c.Word == "and" && c.Count == 2), "Word 'and' is 2 count");
        }
Example #27
0
        /// <summary>
        /// Combines the base hash code with the hash codes of the query's fields using the usual
        /// multiply-by-31 scheme. Null reference fields contribute 0.
        /// </summary>
        /// <returns>The combined hash code.</returns>
        public override int GetHashCode()
        {
            const int prime = 31;

            // Hash mixing is expected to wrap around. The explicit unchecked context keeps this
            // method from throwing OverflowException when the assembly is compiled with arithmetic
            // overflow checking enabled.
            unchecked
            {
                int result = base.GetHashCode();

                result = prime * result + ((Analyzer == null) ? 0 : Analyzer.GetHashCode());
                result = prime * result + ((fieldName == null) ? 0 : fieldName.GetHashCode());
                result = prime * result + ((LikeText == null) ? 0 : LikeText.GetHashCode());
                result = prime * result + MaxQueryTerms;
                result = prime * result + MinDocFreq;
                result = prime * result + MinTermFrequency;
                result = prime * result + Arrays.GetHashCode(MoreLikeFields);
                result = prime * result + Number.FloatToIntBits(PercentTermsToMatch);
                result = prime * result + ((StopWords == null) ? 0 : StopWords.GetHashCode());
                return(result);
            }
        }
Example #28
0
        /// <summary>
        /// Checks that <c>StopWords.ReturnCleansedString</c> removes every black-listed entry from the
        /// input list and keeps the remaining entries.
        /// </summary>
        public void StopWords_ReturnCleansedString()
        {
            // arrange
            var blackList  = new string[] { "a", "b" };
            var in_RawList = new List <string> {
                "a", "b", "c"
            };
            var cleansedList = new List <string> {
                "c"
            };
            // act
            List <string> resultList = StopWords.ReturnCleansedString(blackList, in_RawList);

            // assert
            // AreEqual performs the element-wise comparison (expected first, actual second).
            // CollectionAssert.Equals is not an assertion over the elements, so a wrong result
            // would never have been reported as a comparison failure.
            CollectionAssert.AreEqual(cleansedList, resultList);
        }
        /// <summary>
        /// Decides whether a chat message is a stop request.
        /// </summary>
        /// <remarks>
        /// Only messages from moderators other than "streamelements" are considered. Such a message is
        /// a stop request when any of its space-separated words is contained in <c>StopWords</c> (after
        /// lower-casing) or equals <c>TwitchClientManager.Name.Value</c>, ignoring case.
        /// </remarks>
        /// <param name="chatMessage">The message to inspect.</param>
        /// <returns><c>true</c> if the message is a stop request; otherwise <c>false</c>.</returns>
        public Boolean ShouldStop(ChatMessage chatMessage)
        {
            bool senderMayStop = chatMessage.IsModerator &&
                                 !chatMessage.Username.Equals("streamelements", StringComparison.OrdinalIgnoreCase);

            if (!senderMayStop)
            {
                return(false);
            }

            foreach (String word in chatMessage.Message.Split(' '))
            {
                if (StopWords.Contains(word.ToLower()))
                {
                    return(true);
                }
                if (word.Equals(TwitchClientManager.Name.Value, StringComparison.OrdinalIgnoreCase))
                {
                    return(true);
                }
            }

            return(false);
        }
        /// <summary>
        /// Loads the embedded MED document collection and returns every document as a list of
        /// tokens with stop words removed and the remaining tokens stemmed.
        /// </summary>
        /// <remarks>
        /// The resource text is split into documents on ".I"; each document is split into tokens on
        /// ".W", line breaks, spaces and the listed punctuation. After stop-word removal and stemming
        /// the first token of every document is removed.
        /// </remarks>
        /// <returns>One token list per document, in the order the documents appear in the resource.</returns>
        public static List <List <string> > GetDocuments()
        {
            const string resourceName = "Information_Retrieval_System.Resources.MEDDocuments.txt";

            // Read the whole embedded resource into a single string.
            string allDocuments;

            using (Stream stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resourceName))
            using (StreamReader reader = new StreamReader(stream))
            {
                allDocuments = reader.ReadToEnd();
            }

            string[] documentMarker  = { ".I" };
            string[] tokenDelimiters = { ".W", "\r", "\n", " ", ".", ",", "?", "!", "-", "/", "'", "(", ")" };

            var processedDocuments = new List <List <string> >();

            foreach (string rawDocument in allDocuments.Split(documentMarker, StringSplitOptions.RemoveEmptyEntries))
            {
                string[] tokens           = rawDocument.Split(tokenDelimiters, StringSplitOptions.RemoveEmptyEntries);
                string[] withoutStopWords = StopWords.RemoveStopWords(tokens);
                string[] stemmed          = WordStemmer.QueryStemmer(withoutStopWords);

                // Drop the first token, which is the document number at the start of each document.
                var document = new List <string>(stemmed);
                document.RemoveAt(0);
                processedDocuments.Add(document);
            }

            return(processedDocuments);
        }
        /// <summary>
        /// Checks that a <c>StopWords</c> built for the current culture with an empty word list is a
        /// <c>LanguageWordList</c>.
        /// </summary>
        public void TestCanConstruct()
        {
            // arrange
            var cultureData = new CultureData(CultureInfo.CurrentCulture);
            var noWords     = new List<string>();

            // act
            var sut = new StopWords(cultureData, noWords);

            // assert
            Assert.IsInstanceOfType(sut, typeof(LanguageWordList));
        }
Example #32
0
 /// <summary>
 /// Creates a cleaner with a newly constructed <c>StopWords</c> instance.
 /// </summary>
 public Cleaner()
 {
     m_StopWords = new StopWords();
 }