/// <summary>
/// Gets a distinct word count for each word in a string. Words are lower-cased
/// and stemmed (Porter stemmer) before counting; stop words and numeric tokens
/// are excluded.
/// </summary>
/// <param name="str">The String that will be broken into a distinct word count</param>
/// <returns>A distinct word count in the form of a dictionary(word, count); empty when <paramref name="str"/> is null or empty.</returns>
public static Dictionary <string, double> GetWordCount(this string str)
{
    // Nothing to count when the caller passed a null or empty string.
    if (String.IsNullOrEmpty(str))
    {
        return(new Dictionary <string, double>());
    }

    // Porter stemmer: maps inflected forms (e.g. "lovely") onto a common stem
    // ("love") so variants of the same word are grouped and counted together.
    var stemmer = new PorterStemmerAlgorithm.PorterStemmer();

    // Output slot for Double.TryParse in the Where clause below; any token that
    // still parses as a double is dropped from the result.
    Double num;

    // Replace every non-alphanumeric character with a space before tokenizing.
    // NOTE(review): this also strips apostrophes and hyphens, so the [\w'-]
    // part of the match pattern below can never actually see those characters.
    Regex rgx = new Regex("[^a-zA-Z0-9]");
    str = rgx.Replace(str, " ");

    // Match words whose first character is a word character but not a digit.
    return((new Regex(@"\w(?<!\d)[\w'-]*")).Matches(str)
           // Cast the MatchCollection to an enumerable of matches.
           .Cast <Match>()
           // Lower-case and stem each token so grouping is consistent.
           .Select(m => stemmer.stemTerm(m.Value.ToLower()))
           // Group identical stems together.
           .GroupBy(p => p)
           // Project each group to its word and its occurrence count.
           .Select(g => new { Word = g.Key, Count = g.Count() })
           // Order by word (not required for correctness, just deterministic output).
           .OrderBy(p => p.Word)
           // Drop stop words, and anything that still parses as a number
           // (e.g. "Infinity" can pass the leading-digit regex but parse as double).
           .Where(p => !StopWords.ContainsKey(p.Word) && !Double.TryParse(p.Word, out num))
           // Key = word, value = number of occurrences, stored as a double.
           .ToDictionary(p => p.Word, p => Convert.ToDouble(p.Count)));
}
/// <summary>
/// Downloads the HTML at <paramref name="gelenUrl"/>, extracts its words,
/// builds word-frequency and TF-IDF-weighted lists, selects keyword candidates,
/// then redirects to the Asama3 stage with the same URL.
/// </summary>
/// <param name="gelenUrl">Absolute URL of the page to analyze.</param>
/// <returns>A redirect to the Asama3/Index action carrying the URL.</returns>
public ActionResult Index(string gelenUrl)
{
    string url = gelenUrl;
    Uri urlDomain = new Uri(url);

    // FIX: WebClient is IDisposable — dispose it so the underlying connection
    // resources are released (the original leaked it).
    string downloadString;
    using (WebClient client = new WebClient())
    {
        // Download the page content as raw HTML.
        downloadString = client.DownloadString(url);
    }

    // Re-decode the payload as UTF-8 (mostly defensive; the processed content
    // is English).
    byte[] bytes = Encoding.Default.GetBytes(downloadString);
    downloadString = Encoding.UTF8.GetString(bytes);

    // Load the downloaded HTML into an HtmlDocument for processing.
    var htmlDoc = new HtmlDocument();
    htmlDoc.LoadHtml(downloadString);

    // Sentence count, needed later for the TF-IDF calculations.
    int cumleSayisi = 0;

    // English stop words: words that carry no value when processing the text.
    var stopWords = StopWords.GetStopWords("en");

    // Extract the word list and sentence count from the HTML document.
    List <string> kelimeler = new List <string>();
    HtmlIsleyici htmlIsleyici1 = new HtmlIsleyici();
    htmlIsleyici1.htmlIsle(htmlDoc);
    kelimeler = htmlIsleyici1.kelimeler;
    cumleSayisi = htmlIsleyici1.cumleSayisi;

    // Normalize/correct the words relative to the source domain.
    KelimeDuzeltici kelimeDuzeltici1 = new KelimeDuzeltici();
    kelimeler = kelimeDuzeltici1.kelimeDuzelt(kelimeler, urlDomain);

    // Build the word-frequency list.
    List <WordAndFreq> kelimeFrekans = new List <WordAndFreq>();
    KelimeFrekansYapici kelimeFrekansYapici1 = new KelimeFrekansYapici();
    kelimeFrekans = kelimeFrekansYapici1.KelimeFrekansYap(kelimeler);

    // Weight the words with TF-IDF.
    TfIdfCalculator agirlikHesap = new TfIdfCalculator();
    List <WordAndWeight> weihtedKelimeler = new List <WordAndWeight>();
    AgirlikliKelimeListesi agirlikliKelimeListesi1 = new AgirlikliKelimeListesi();
    weihtedKelimeler = agirlikliKelimeListesi1.AgirlikliListeYap(kelimeFrekans, kelimeler.Count, cumleSayisi);

    // Pick the keyword candidates from the weighted list.
    AnahtarKelimeBelirleyici anahtarKelimeBelirleyici1 = new AnahtarKelimeBelirleyici();
    List <WordAndFreq> anahtarKelimeler = new List <WordAndFreq>();
    anahtarKelimeler = anahtarKelimeBelirleyici1.AnahtarKelimeBelirle(weihtedKelimeler, kelimeFrekans);

    Asama2ViewModel asama2ViewModel = new Asama2ViewModel();
    asama2ViewModel.KeywordListesi = anahtarKelimeler;

    return(RedirectToAction("Index", "Asama3", new { gelenUrl = url }));
}
/// <summary>
/// Computes a TF-IDF weight for every significant word in <paramref name="text"/>.
/// </summary>
/// <param name="text">The text to analyze.</param>
/// <param name="allowPos">Optional POS filter; when non-empty only words with these parts of speech are kept.</param>
/// <returns>A dictionary mapping each retained word to its TF-IDF weight.</returns>
private IDictionary <string, double> GetWordIfidf(string text, IEnumerable <string> allowPos)
{
    // Tokenize, optionally restricted to the allowed parts of speech.
    IEnumerable <string> tokens;
    if (allowPos.IsNotEmpty())
    {
        tokens = FilterCutByPos(text, allowPos);
    }
    else
    {
        tokens = Segmenter.Cut(text);
    }

    // Term frequency: count each token, skipping blanks, very short tokens,
    // and stop words.
    var freq = new Dictionary <string, double>();
    foreach (var token in tokens)
    {
        if (string.IsNullOrEmpty(token) || token.Trim().Length < 2 || StopWords.Contains(token.ToLower()))
        {
            continue;
        }
        freq[token] = freq.GetDefault(token, 0.0) + 1.0;
    }

    // Scale each count by the word's IDF (median IDF for unseen words) and
    // normalize by the total token count.
    var total = freq.Values.Sum();
    foreach (var key in freq.Keys.ToList())
    {
        freq[key] *= IdfFreq.GetDefault(key, MedianIdf) / total;
    }

    return(freq);
}
/// <summary>
/// Entry point: reads input text, removes black-listed (stop) words, and
/// prints word-count statistics to the console.
/// </summary>
static void Main(string[] args)
{
    var blackList = StopWords.ReadBlackListWords();

    // Guarantee at least one argument so ProcessInput always has something to read.
    if (args.Length == 0)
    {
        args = new string[] { "" };
    }

    var rawInput = UIInput.ProcessInput(args[0]);
    var tokenList = Splitter.SplitString(rawInput);
    var filteredWords = StopWords.ReturnCleansedString(blackList, tokenList);

    // Aggregate statistics over the cleansed word list.
    var totalCount = CountStrings.StringsCountFromList(filteredWords);
    var uniqueCount = CountStrings.UniqueStringsCountFromList(filteredWords);
    var avgWordLength = CountStrings.AverageNumberOfDigitsPerWord(filteredWords, totalCount);
    // only for test purposes
    var testFuncPointer = CountStrings.ReturnAverage(CountStrings.FuncPointer, filteredWords, totalCount);

    // Kapseln
    Console.WriteLine("number of words: " + totalCount + ", unique: " + uniqueCount + ", average word length: " + avgWordLength.ToString(CultureInfo.GetCultureInfo("en-GB")));
#if DEBUG
    // Keep the console window open while debugging.
    Console.Read();
#endif
}
/// <summary>
/// Counts how often each non-stop-word occurs in the given text.
/// </summary>
/// <param name="text">The (possibly marked-up) text to analyze.</param>
/// <returns>A dictionary mapping each lower-cased word to its occurrence count.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="text"/> is null or empty.</exception>
public Dictionary <string, int> GetWordOccurancesFromText(string text)
{
    if (string.IsNullOrEmpty(text))
    {
        throw new ArgumentNullException(nameof(text));
    }

    // Strip markup so only plain text is tokenized.
    text = GetPlainText(text);

    var result = new Dictionary <string, int>();
    var matches = Regex.Matches(text, WordRegexPattern);
    foreach (Match word in matches)
    {
        var key = word.Value.ToLower();
        // Skip stop words; keys are already lower-cased, but compare
        // case-insensitively to match the stop-word list's casing.
        if (!StopWords.Contains(key, StringComparer.OrdinalIgnoreCase))
        {
            // FIX: single TryGetValue lookup instead of ContainsKey + indexer
            // (the original did two hash lookups per word).
            int count;
            result.TryGetValue(key, out count);
            result[key] = count + 1;
        }
    }

    return(result);
}
/// <summary>
/// Normalizes the raw analysis words: strips characters matched by Rgx,
/// lower-cases each word, drops stop words and blank entries, removes
/// apostrophe patterns (Apos), and stores the result in StrippedWords.
/// </summary>
/// <param name="rawAnalysisWords">The raw word list to clean.</param>
private void StripAnalysisText(List <string> rawAnalysisWords)
{
    List <string> strippedList = new List <string>();
    foreach (string s in rawAnalysisWords)
    {
        if (string.IsNullOrEmpty(s))
        {
            continue;
        }

        // Strip unwanted characters (when present) and lower-case in one step.
        string p = Rgx.IsMatch(s) ? Rgx.Replace(s, "").ToLower() : s.ToLower();

        // FIX: removed the redundant ToLower() calls the original made here —
        // p is already lower-case at this point, so they were no-ops.
        if (!StopWords.Contains(p) && !string.IsNullOrWhiteSpace(p))
        {
            strippedList.Add(Apos.IsMatch(p) ? Apos.Replace(p, "") : p);
        }
    }

    StrippedWords = strippedList;
}
/// <summary>
/// Gets a cleaned string with stop words removed, stemmed, and tokenized.
/// The sentence is tokenized and, depending on configuration, stop words are
/// removed and/or the remaining tokens are stemmed.
/// </summary>
/// <param name="sentence">The sentence to process.</param>
/// <returns>The processed tokens; the raw tokens when neither option is enabled.</returns>
public string[] Process(string sentence)
{
    var tokens = sentence.Tokenize();
    // Fast path: nothing to do when both options are off.
    if (!RemoveStopWords && !StemWords)
    {
        return(tokens);
    }

    var result = new List <string>();
    foreach (string token in tokens)
    {
        // FIX: the original duplicated this stop-word predicate in two
        // branches; one guard covers both configurations.
        if (RemoveStopWords &&
            StopWords.Any(stopWord => string.Equals(token, stopWord, StringComparison.InvariantCultureIgnoreCase)))
        {
            continue;
        }

        // Stem when enabled; otherwise keep the token as-is.
        result.Add(StemWords ? _stemmer.Stem(token) : token);
    }

    return(result.ToArray());
}
/// <summary>
/// Entry point: builds the word index from the test collection, prints each
/// indexed word with the first 5 columns of its document row, and waits for a key.
/// </summary>
static void Main(string[] args)
{
    // Load the stop-word list and the document collection from disk.
    StopWords stop = new StopWords("StopWords.txt");
    Documentos doc = new Documentos("TestCollection.txt");

    // Build the index over the collection, excluding the loaded stop words.
    Indice Index = new Indice(StopWords.Palabras, doc.TotalDocumentos, "TestCollection.txt");

    // Dump the index: one row per word.
    // NOTE(review): the inner bound of 5 is hard-coded — confirm DocsIndice
    // always has at least 5 columns per word.
    for (int i = 0; i < Index.PalabrasIndice.Length; i++)
    {
        Console.Write(Index.PalabrasIndice[i] + " = \t");
        for (int j = 0; j < 5; j++)
        {
            Console.Write("|" + Index.DocsIndice[i, j] + "|");
        }
        Console.WriteLine();
    }

    // NOTE(review): this thread is created but never started — looks like
    // leftover work-in-progress; confirm whether T1.Start(...) is missing
    // intentionally.
    Thread T1 = new Thread(new ParameterizedThreadStart(SRI.Consultar));
    //SRI buscar = new SRI("hola mundo, este is mi primera frase for you",stop);
    //for (int i = 0; i < doc.TotalDocumentos; i++)
    //Console.WriteLine(doc.TotalDocumentos[i]);
    //Indice Ind = new Indice();

    // Keep the console window open until a key is pressed.
    Console.ReadKey();
}
/// <summary>
/// Determines whether [the specified token] [is a stop word].
/// </summary>
/// <param name="token">The token.</param>
/// <param name="language">The language.</param>
/// <returns><c>true</c> if [the specified token] [is a stop word]; otherwise, <c>false</c>.</returns>
public bool IsStopWord(string token, StopWordsLanguage language)
{
    // An empty token is never a stop word.
    if (string.IsNullOrEmpty(token))
    {
        return(false);
    }

    // A language without a registered stop-word list can match nothing.
    return(StopWordsLanguages.TryGetValue(language, out var stopWords) && stopWords.IsStopWord(token));
}
/// <summary>
/// Initializes the default slug configuration: lower-case output, '-' as the
/// separator, a 100-character limit, default character replacements, and
/// stop-word stripping disabled.
/// </summary>
protected DefaultSlugityConfig()
{
    StringSeparator = '-';
    TextCase = TextCase.LowerCase;
    MaxLength = 100;
    StripStopWords = false;
    StopWords = new StopWords();
    ReplacementCharacters = new CharacterReplacement();
}
/// <summary>
/// Stop-word list for Java source files (".java"): all Java reserved keywords,
/// which should never be treated as meaningful terms.
/// </summary>
public JavaStopWord() : base(".java")
{
    // The full set of Java language keywords.
    string[] keywords =
    {
        "abstract",   "assert",       "boolean",   "break",      "byte",
        "case",       "catch",        "char",      "class",      "const",
        "continue",   "default",      "do",        "double",     "else",
        "enum",       "extends",      "final",     "finally",    "float",
        "for",        "goto",         "if",        "implements", "import",
        "instanceof", "int",          "interface", "long",       "native",
        "new",        "package",      "private",   "protected",  "public",
        "return",     "short",        "static",    "strictfp",   "super",
        "switch",     "synchronized", "this",      "throw",      "throws",
        "transient",  "try",          "void",      "volatile",   "while"
    };

    foreach (string keyword in keywords)
    {
        StopWords.Add(keyword);
    }
}
/// <summary>
/// Marks the stop words.
/// </summary>
/// <param name="tokens">The tokens.</param>
/// <param name="language">The language.</param>
/// <returns>The tokens.</returns>
public Token[] MarkStopWords(Token[] tokens, StopWordsLanguage language)
{
    // Nothing to mark when there are no tokens at all.
    if (tokens is null || tokens.Length == 0)
    {
        return(Array.Empty <Token>());
    }

    // Without a stop-word list for this language, return the tokens untouched;
    // otherwise delegate the marking to the list.
    return(StopWordsLanguages.TryGetValue(language, out var stopWords)
           ? stopWords.MarkStopWords(tokens)
           : tokens);
}
/// <summary>
/// Downloads the HTML at <paramref name="gelenUrl"/>, extracts its words, and
/// renders the Asama1 view with the resulting word-frequency list.
/// </summary>
/// <param name="gelenUrl">Absolute URL of the page to analyze.</param>
/// <returns>The Asama1 view populated with the frequency list.</returns>
public ActionResult Index(string gelenUrl)
{
    string url = gelenUrl;
    Uri urlDomain = new Uri(url);
    Console.WriteLine("Domain part : " + urlDomain.Host); // Extracts/logs the domain.

    // FIX: WebClient is IDisposable — dispose it so the underlying connection
    // resources are released (the original leaked it).
    string downloadString;
    using (WebClient client = new WebClient())
    {
        // Download the page content as raw HTML.
        downloadString = client.DownloadString(url);
    }

    // Re-decode the payload as UTF-8 (mostly defensive; the processed content
    // is English).
    byte[] bytes = Encoding.Default.GetBytes(downloadString);
    downloadString = Encoding.UTF8.GetString(bytes);

    // Load the downloaded HTML into an HtmlDocument for processing.
    var htmlDoc = new HtmlDocument();
    htmlDoc.LoadHtml(downloadString);

    // Sentence count, needed for the TF-IDF calculations.
    int cumleSayisi = 0;

    // English stop words: words that carry no value when processing the text.
    var stopWords = StopWords.GetStopWords("en");

    // Extract the word list and sentence count from the HTML document.
    List <string> kelimeler = new List <string>();
    HtmlIsleyici htmlIsleyici1 = new HtmlIsleyici();
    htmlIsleyici1.htmlIsle(htmlDoc);
    kelimeler = htmlIsleyici1.kelimeler;
    cumleSayisi = htmlIsleyici1.cumleSayisi;

    // Normalize/correct the words relative to the source domain.
    KelimeDuzeltici kelimeDuzeltici1 = new KelimeDuzeltici();
    kelimeler = kelimeDuzeltici1.kelimeDuzelt(kelimeler, urlDomain);

    // Build the word-frequency list and hand it to the view model.
    List <WordAndFreq> kelimeFrekans = new List <WordAndFreq>();
    KelimeFrekansYapici kelimeFrekansYapici1 = new KelimeFrekansYapici();
    kelimeFrekans = kelimeFrekansYapici1.KelimeFrekansYap(kelimeler);

    Asama1ViewModel asama1ViewModel = new Asama1ViewModel();
    asama1ViewModel.FrekansListesi = kelimeFrekans;
    return(View(asama1ViewModel));
}
/// <summary>
/// Initializes the TextRank extractor: a co-occurrence window span of 5, a
/// fresh Jieba segmenter/POS-segmenter pair, and the configured stop-word list.
/// </summary>
public TextRankExtractor()
{
    Segmenter = new JiebaSegmenter();
    PosSegmenter = new PosSegmenter(Segmenter);
    Span = 5;

    SetStopWords(ConfigManager.StopWordsFile);
    // Fall back to the built-in stop words when the configured file yields none.
    if (StopWords.IsEmpty())
    {
        StopWords.UnionWith(DefaultStopWords);
    }
}
/// <summary>
/// Entry point: trains a naive language-classification model on the C++ and
/// Java training files, then predicts the language of the test file and prints
/// the result with its probability.
/// </summary>
static void Main(string[] args)
{
    // Load and process training data from the files.
    Dictionary <string, int> dataCpp = loadAndProcessData("files/training_cpp.txt");
    Dictionary <string, int> dataJava = loadAndProcessData("files/training_java.txt");

    // Create the model and feed it the processed training data.
    ModelTrainer mt = new ModelTrainer();
    foreach (var item in dataCpp)
    {
        mt.loadData(LANG_CPP, item.Value, item.Key);
    }
    foreach (var item in dataJava)
    {
        mt.loadData(LANG_JAVA, item.Value, item.Key);
    }

    mt.computeRemovingThresholdValue();
    mt.trainData();

    // Load and preprocess the test file the same way as the training data:
    // tokenize to words only, then strip stop words.
    string testFile = "files/test_file.txt";
    string pred = new FileReader(testFile).getAllTxt(true);
    List <string> predProcessed = new Token().wordsOnlyTokenizer(pred);
    predProcessed = new StopWords().removeWordsBasedOnStopWordsList(predProcessed);

    // Compute label probabilities and take the most likely language.
    Predictor predictor = new Predictor();
    var mymodel = mt.getDataModelTrained();
    predictor.makePrediction(mymodel, predProcessed, LANG_CPP);
    var max = predictor.argmax();

    // Build the output message for the prediction.
    string msg = "\nThe model predicts: ";
    switch (max.Key)
    {
    case LANG_CPP:
        msg += "C++";
        break;

    case LANG_JAVA:
        msg += "Java";
        break;
        // Add more languages here.
    }

    // FIX: corrected the "procent" typo in the user-facing message.
    msg += " as the language of the file(" + testFile + ").\nPrediction percent: " + max.Value;
    Console.WriteLine(msg);
}
/// <summary>
/// Asynchronously loads stop words into StopWords from a UTF-8 text file,
/// one word per line.
/// </summary>
/// <param name="path">
/// Path of the word-list file. Defaults to "StopWords.txt", so existing
/// callers are unaffected; the parameter generalizes the previously
/// hard-coded path.
/// </param>
private async Task LoadWordListAsync(string path = "StopWords.txt")
{
    using (StreamReader sr = new StreamReader(path, System.Text.Encoding.UTF8))
    {
        string line;
        // Read until ReadLineAsync signals end-of-file with null.
        while ((line = await sr.ReadLineAsync()) != null)
        {
            StopWords.Add(line);
        }
    }
}
/// <summary>
/// Removing a stop word (case-insensitively) means the filter no longer
/// strips it, while the remaining stop words are still removed.
/// </summary>
public void DeleteStopWord_WorksCorrectly()
{
    // Arrange: default stop words, with "in" deleted via a mixed-case key.
    var stopwords = new StopWords();
    var filter = new StopWordsFilter(stopwords);
    var input = new List <string>
    {
        "Abc", "of", "cba", "IN", "the", "car"
    };
    stopwords.Remove("iN");

    // Act + Assert: "in" survives filtering; "of" and "the" are dropped.
    filter.Filter(input).Should().BeEquivalentTo(new List <string>
    {
        "abc", "in", "cba", "car"
    });
}
/// <summary>
/// The URL stop-word filter should produce a non-null stop-word result for
/// the given source.
/// </summary>
/// <param name="value">The URL to fetch and filter.</param>
public void FilterStopWord_URL(string value)
{
    // FIX: removed the unused local `resultLink` the original declared.
    StopWords model = new StopWords();
    ISeoFilter seoFilter = new StopWordUrlFilterService();

    // Fetch the source, extract the body, and count stop words in it.
    string response = seoFilter.GetDataFromSource(value);
    var res = seoFilter.GetResponseBody(response);
    var result = seoFilter.CalculateStopCount(res);
    var resultBody = model.PopulateStopWords(result);

    Assert.IsNotNull(resultBody, "Stop Words is available");
}
/// <summary>
/// RemoveStopwords should strip the stop words from a simple sentence.
/// </summary>
public void TestRemoveStopWords()
{
    // NOTE(review): the instance is unused by the static call below, but the
    // ctor may load the word list as a side effect — kept deliberately.
    StopWords stop = new StopWords();
    string test = "This is a test string";
    string exceptedResult = "test string";

    string actualResult = StopWords.RemoveStopwords(test);

    Console.WriteLine("Expected: " + exceptedResult);
    Console.WriteLine("Result: " + actualResult);
    // FIX: the assert message is shown only when the assertion FAILS, so it
    // must describe the failure — the original claimed success on failure.
    Assert.AreEqual(exceptedResult, actualResult, "Stop words were not removed correctly.");
}
/// <summary>
/// Builds positional postings for this document's tokens, skipping stop words
/// and (optionally) any word not in <paramref name="words"/>; new words are
/// also registered in the IndexWord table when a db context is supplied.
/// </summary>
/// <param name="dbContext">Optional database context; words and postings are added to it when non-null.</param>
/// <param name="words">Optional whitelist; when non-null, only these words are indexed.</param>
/// <returns>The postings built for this document.</returns>
public IEnumerable <Posting> Index(sqliteContext dbContext, List <string> words)
{
    Dictionary <string, Posting> postings = new Dictionary <string, Posting>();
    int index = 0;
    foreach (var token in Tokens)
    {
        var word = token.ToString().ToLower();
        // Positions advance for every token, including the ones skipped below.
        index++;
        if (StopWords.Contains(word))
        {
            continue;
        }
        if (words != null && !words.Contains(word))
        {
            continue;
        }

        // Register the word in the index table the first time it is seen.
        if (dbContext?.IndexWord.Find(word) == null)
        {
            dbContext?.IndexWord.Add(new IndexWord()
            {
                Word = word
            });
        }

        // FIX: single TryGetValue lookup instead of ContainsKey + Add +
        // indexer (the original did three hash lookups per token).
        Posting p;
        if (!postings.TryGetValue(word, out p))
        {
            p = new Posting()
            {
                Word = word, DocumentName = name, Indexes = ""
            };
            postings.Add(word, p);
        }

        p.Frequency++;
        // Indexes is a comma-separated position list.
        if (p.Indexes != "")
        {
            p.Indexes += ",";
        }
        p.Indexes += index;
    }

    // Persist all postings built for this document.
    foreach (var posting in postings)
    {
        dbContext?.Posting.Add(posting.Value);
    }
    return(postings.Values);
}
/// <summary>
/// Decides whether a commit message describes a bug fix: it must match
/// MessageRegExp (case-insensitively) and must not contain any of the
/// configured stop words.
/// </summary>
/// <param name="commit">The commit whose message is inspected.</param>
/// <returns><c>true</c> when the commit looks like a bug fix.</returns>
public bool IsBugFix(Commit commit)
{
    if (!Regex.IsMatch(commit.Message, MessageRegExp, RegexOptions.IgnoreCase))
    {
        return(false);
    }

    string messageToLower = commit.Message.ToLower();
    // FIX: the original used IndexOf(x) > 0, which missed a stop word at the
    // very start of the message (IndexOf returns 0 there); >= 0 tests true
    // containment.
    if (StopWords.Any(x => messageToLower.IndexOf(x) >= 0))
    {
        return(false);
    }

    return(true);
}
/// <summary>
/// determines if the passed term is likely to be of interest in "more like" comparisons
/// </summary>
/// <param name="term"> The word being considered </param>
/// <returns> <c>true</c> if should be ignored, <c>false</c> if should be used in further analysis </returns>
private bool IsNoiseWord(string term)
{
    int length = term.Length;

    // Too short or too long (when the corresponding limit is enabled) is noise.
    bool tooShort = MinWordLen > 0 && length < MinWordLen;
    bool tooLong = MaxWordLen > 0 && length > MaxWordLen;
    if (tooShort || tooLong)
    {
        return(true);
    }

    // Otherwise, it is noise only when it appears in the stop-word set.
    return(StopWords != null && StopWords.Contains(term));
}
/// <summary>
/// Text ranking: initializes a co-occurrence window span of 5, a fresh
/// segmenter/POS-segmenter pair, and the dictionary's stop words.
/// </summary>
public TextRankExtractor()
{
    Segmenter = new Segmenter();
    PosSegmenter = new PosSegmenter(Segmenter);
    Span = 5;

    StopWords = Dict.StopWords;
    // Fall back to the built-in stop words when the dictionary provides none.
    if (StopWords.IsEmpty())
    {
        StopWords.UnionWith(DefaultStopWords);
    }
}
/// <summary>
/// Stop-word list for Razor views (".cshtml"): markup/metadata terms that
/// carry no meaning for analysis.
/// </summary>
public CSHTMLStopWord() : base(".cshtml")
{
    // Common markup attribute names (in both casings) plus LocaleResource.
    string[] markupTerms =
    {
        "xml", "Xml",
        "name", "Name",
        "version", "Version",
        "value", "Value",
        "LocaleResource"
    };

    foreach (string term in markupTerms)
    {
        StopWords.Add(term);
    }
}
/// <summary>
/// Initializes the TF-IDF extractor: a Jieba segmenter/POS-segmenter pair,
/// the configured stop words (with a default fallback), and the IDF table.
/// </summary>
public TfidfExtractor()
{
    Segmenter = new JiebaSegmenter();
    PosSegmenter = new PosSegmenter(Segmenter);

    SetStopWords(ConfigManager.StopWordsFile);
    // Fall back to the built-in stop words when the configured file yields none.
    if (StopWords.IsEmpty())
    {
        StopWords.UnionWith(DefaultStopWords);
    }

    // IDF frequency table and its median, used as the IDF for unseen words.
    Loader = new IdfLoader(DefaultIdfFile);
    IdfFreq = Loader.IdfFreq;
    MedianIdf = Loader.MedianIdf;
}
/// <summary>
/// The text stop-word filter should count the word "and" exactly twice in
/// the given source.
/// </summary>
/// <param name="value">The text source to fetch and filter.</param>
public void FilterStopWord_Text(string value)
{
    // FIX: removed the unused locals `resultLink` and `body` — the latter
    // called FirstOrDefault().Body, a latent NullReferenceException on an
    // empty result, for a value that was never used.
    StopWords model = new StopWords();
    ISeoFilter seoFilter = new StopWordTextFilterService();

    // Fetch the source, extract the body, and count stop words in it.
    string response = seoFilter.GetDataFromSource(value);
    var res = seoFilter.GetResponseBody(response);
    var result = seoFilter.CalculateStopCount(res);
    var resultBody = model.PopulateStopWords(result);

    Assert.IsTrue(resultBody.Any(c => c.Word == "and" && c.Count == 2), "Word 'and' is 2 count");
}
/// <summary>
/// Computes the hash code from the fields used by this query's equality:
/// analyzer, field name, like-text, term limits/frequencies, the
/// more-like fields, percent-terms-to-match, and the stop-word set.
/// </summary>
/// <returns>The combined hash code.</returns>
public override int GetHashCode()
{
    // Standard 31-multiplier hash accumulation; null members contribute 0.
    const int prime = 31;
    int result = base.GetHashCode();
    result = prime * result + ((Analyzer == null) ? 0 : Analyzer.GetHashCode());
    result = prime * result + ((fieldName == null) ? 0 : fieldName.GetHashCode());
    result = prime * result + ((LikeText == null) ? 0 : LikeText.GetHashCode());
    result = prime * result + MaxQueryTerms;
    result = prime * result + MinDocFreq;
    result = prime * result + MinTermFrequency;
    result = prime * result + Arrays.GetHashCode(MoreLikeFields);
    // Hash the float via its raw bit pattern so equal floats hash equally.
    result = prime * result + Number.FloatToIntBits(PercentTermsToMatch);
    result = prime * result + ((StopWords == null) ? 0 : StopWords.GetHashCode());
    return(result);
}
/// <summary>
/// ReturnCleansedString should remove every black-listed word from the raw list.
/// </summary>
public void StopWords_ReturnCleansedString()
{
    // arrange
    var blackList = new string[] { "a", "b" };
    var in_RawList = new List <string>
    {
        "a", "b", "c"
    };
    var cleansedList = new List <string>
    {
        "c"
    };

    // act
    List <string> resultList = StopWords.ReturnCleansedString(blackList, in_RawList);

    // assert
    // FIX: the original called CollectionAssert.Equals, which is just the
    // inherited object.Equals and asserts nothing — its bool result was
    // silently discarded. AreEqual actually compares the collections
    // element by element (expected first, actual second).
    CollectionAssert.AreEqual(cleansedList, resultList);
}
/// <summary>
/// A moderator's message stops the bot when any of its words is a stop word
/// or matches the bot's own name; non-moderators and the "streamelements"
/// bot never trigger a stop.
/// </summary>
/// <param name="chatMessage">The chat message to inspect.</param>
/// <returns><c>true</c> when the bot should stop.</returns>
public Boolean ShouldStop(ChatMessage chatMessage)
{
    // Only moderators may stop the bot, and StreamElements is always ignored.
    if (!chatMessage.IsModerator ||
        chatMessage.Username.Equals("streamelements", StringComparison.OrdinalIgnoreCase))
    {
        return(false);
    }

    // Stop on the first word that is a stop word or the bot's own name.
    foreach (String word in chatMessage.Message.Split(' '))
    {
        if (StopWords.Contains(word.ToLower()) ||
            word.Equals(TwitchClientManager.Name.Value, StringComparison.OrdinalIgnoreCase))
        {
            return(true);
        }
    }

    return(false);
}
/// <summary>
/// Loads the embedded MED document collection and returns each document as a
/// list of stemmed words with stop words removed.
/// </summary>
/// <returns>One processed word list per document.</returns>
public static List <List <string> > GetDocuments()
{
    // Read the entire embedded resource (all documents) into a single string.
    Assembly assembly = Assembly.GetExecutingAssembly();
    string resourceName = "Information_Retrieval_System.Resources.MEDDocuments.txt";
    string allDocumentsText;
    using (Stream stream = assembly.GetManifestResourceStream(resourceName))
    using (StreamReader reader = new StreamReader(stream))
    {
        allDocumentsText = reader.ReadToEnd();
    }

    // Documents are separated by ".I" markers.
    string[] docSeperator = { ".I" };
    string[] splitDocuments = allDocumentsText.Split(docSeperator, StringSplitOptions.RemoveEmptyEntries);

    // Tokenize each document on the ".W" marker, whitespace and punctuation.
    string[] docDelimiters = { ".W", "\r", "\n", " ", ".", ",", "?", "!", "-", "/", "'", "(", ")" };
    List <string[]> tokenizedDocuments = new List <string[]>();
    foreach (string document in splitDocuments)
    {
        tokenizedDocuments.Add(document.Split(docDelimiters, StringSplitOptions.RemoveEmptyEntries));
    }

    // Remove stop words, stem, and drop the leading document-number token.
    List <List <string> > processedDocuments = new List <List <string> >();
    foreach (string[] tokens in tokenizedDocuments)
    {
        string[] withoutStopWords = StopWords.RemoveStopWords(tokens);
        string[] stemmed = WordStemmer.QueryStemmer(withoutStopWords);

        List <string> documentWords = new List <string>(stemmed);
        // The first token is the document number, not document content.
        documentWords.RemoveAt(0);
        processedDocuments.Add(documentWords);
    }

    return(processedDocuments);
}
/// <summary>
/// A StopWords instance built from the current culture and an empty word
/// list should be a LanguageWordList.
/// </summary>
public void TestCanConstruct()
{
    var cultureData = new CultureData(CultureInfo.CurrentCulture);
    var stopWords = new StopWords(cultureData, new List<string>());

    Assert.IsInstanceOfType(stopWords, typeof(LanguageWordList));
}
/// <summary>
/// Creates a cleaner backed by a fresh stop-word list.
/// </summary>
public Cleaner()
{
    m_StopWords = new StopWords();
}