/// <summary>
/// Checks that stemming the token "Soars" with <c>PorterStemmer.stemTerm</c> yields "Soar".
/// </summary>
public void Test_Stemmer4()
{
    // Arrange
    string token = "Soars";
    string expectedStem = "Soar";
    PorterStemmer stemmer = new PorterStemmer();

    // Act
    string actualStem = stemmer.stemTerm(token);

    // Assert: the expected value is passed first so that a failure message
    // labels the expected and actual values correctly.
    Assert.AreEqual(expectedStem, actualStem, "Porter Stemmer returned incorrect stem");
}
/// <summary>
/// Converts a query string into an array of terms: punctuation is stripped, the text is
/// lower-cased and split on spaces, each word is stemmed, and the resulting terms are sorted
/// by their stemmed form and de-duplicated with <c>Term.EqComparer</c>.
/// </summary>
/// <param name="zapytanie">The raw query text.</param>
/// <returns>The sorted, de-duplicated terms of the query.</returns>
private static Term[] PrzerobNaTermy(string zapytanie)
{
    string normalized = Regex.Replace(zapytanie, "(\\p{P})", string.Empty).ToLower();
    string[] words = normalized.Split(new string[] { " " }, StringSplitOptions.RemoveEmptyEntries);

    PorterStemmer stemmer = new PorterStemmer();
    List<Term> terms = words
        .Select(word => new Term { TermOryginal = word, TermStemming = stemmer.stemTerm(word) })
        .ToList();

    // Sort by the terms that have ALREADY been stemmed.
    terms.Sort((left, right) => left.TermStemming.CompareTo(right.TermStemming));

    // Drop duplicates.
    return terms.Distinct(Term.EqComparer).ToArray();
}
/// <summary>
/// Stems every space-separated token of the input with a <c>PorterStemmer</c> and returns the
/// stems joined by single spaces, with leading and trailing whitespace trimmed.
/// </summary>
/// <param name="textToProcess">The text whose tokens are to be stemmed.</param>
/// <returns>The stemmed text.</returns>
public static string Process(string textToProcess)
{
    StringBuilder output = new StringBuilder();
    string[] words = textToProcess.Split(new char[] { ' ' });
    StemmerInterface stemmer = new PorterStemmer();

    for (int i = 0; i < words.Length; i++)
    {
        string stem = stemmer.stemTerm(words[i]);
        output.Append(stem).Append(' ');
    }

    // Trim removes the separator appended after the last stem.
    return output.ToString().Trim();
}
/// <summary>
/// Loads documents from a text file into <c>Dokumenty</c>. Documents are separated by an empty
/// line ("\n\n"); within a document the first line is the header and the remainder is the body.
/// Header and body are stripped of punctuation, lower-cased and stemmed, and every stem is also
/// added to the document's term list.
/// </summary>
/// <param name="path">Path of the file to read.</param>
/// <returns>
/// <c>true</c> when the file was read and contained at least one non-empty chunk;
/// <c>false</c> when the path is null/empty, the file does not exist, the file holds no
/// chunks, or any exception occurs while reading or parsing.
/// </returns>
public bool WczytajDokumenty(string path)
{
    if (string.IsNullOrEmpty(path) || !File.Exists(path))
    {
        return false;
    }

    try
    {
        string documents;
        // Open read-only: File.Open(path, FileMode.Open) would request read/write access
        // with no sharing, which fails on read-only files or files open elsewhere.
        using (TextReader tr = new StreamReader(File.OpenRead(path)))
        {
            documents = tr.ReadToEnd();
        }

        string[] tmpDocs = documents.Split(new string[] { "\n\n" }, StringSplitOptions.RemoveEmptyEntries);
        if (tmpDocs.Length == 0)
        {
            return false;
        }

        this.Dokumenty = new List<Dokument>();
        PorterStemmer ps = new PorterStemmer();
        foreach (string item in tmpDocs)
        {
            // A chunk needs a non-empty first line followed by a newline to be a document.
            int index = item.IndexOf('\n');
            if (index <= 0)
            {
                continue;
            }

            Dokument newDoc = new Dokument
            {
                NaglowekOryginal = item.Substring(0, index),
                TrescOryginal = item.Substring(index + 1)
            };

            // Remove punctuation and lower-case; body line breaks become spaces.
            string naglNorm = Regex.Replace(newDoc.NaglowekOryginal, "(\\p{P})", string.Empty).ToLower();
            string trescNorm = Regex.Replace(newDoc.TrescOryginal, "(\\p{P})", string.Empty).ToLower().Replace('\n', ' ');

            // Apply the Porter algorithm to the header first, then to the body,
            // so header terms precede body terms in the document's term list.
            newDoc.NaglowekStemming = StemAndCollectTerms(ps, naglNorm, newDoc);
            newDoc.TrescStemming = StemAndCollectTerms(ps, trescNorm, newDoc);

            this.Dokumenty.Add(newDoc);
        }

        return true;
    }
    catch (Exception)
    {
        // Best-effort loader: any I/O or parsing failure is reported as false.
        return false;
    }
}

/// <summary>
/// Stems each space-separated word of <paramref name="normalizedText"/>, appends every stem to
/// <c>target.Termy</c>, and returns the stems joined by single spaces. Returns an empty string
/// when the text contains no words (the previous trailing-character trim threw in that case).
/// </summary>
private static string StemAndCollectTerms(PorterStemmer stemmer, string normalizedText, Dokument target)
{
    string[] words = normalizedText.Split(new string[] { " " }, StringSplitOptions.RemoveEmptyEntries);
    List<string> stems = new List<string>(words.Length);
    foreach (string word in words)
    {
        string stem = stemmer.stemTerm(word.Trim());
        stems.Add(stem);
        target.Termy.Add(new Term { TermStemming = stem });
    }

    return string.Join(" ", stems.ToArray());
}
/// <summary>
/// Loads terms from a text file (one term per line) into <c>Termy</c>. Each line is kept as the
/// original term; its stemmed form is computed after removing punctuation, lower-casing and
/// trimming. The list is then sorted by original term and de-duplicated with
/// <c>Term.EqComparer</c>.
/// </summary>
/// <param name="path">Path of the file to read.</param>
/// <returns>
/// <c>true</c> when the file was read and contained at least one non-empty line;
/// <c>false</c> when the path is null/empty, the file does not exist, the file holds no
/// lines, or any exception occurs while reading or parsing.
/// </returns>
public bool WczytajTermy(string path)
{
    if (string.IsNullOrEmpty(path) || !File.Exists(path))
    {
        return false;
    }

    try
    {
        string terms;
        // Open read-only: File.Open(path, FileMode.Open) would request read/write access
        // with no sharing, which fails on read-only files or files open elsewhere.
        using (TextReader tr = new StreamReader(File.OpenRead(path)))
        {
            terms = tr.ReadToEnd();
        }

        string[] tmpTerms = terms.Split(new string[] { "\n" }, StringSplitOptions.RemoveEmptyEntries);
        if (tmpTerms.Length == 0)
        {
            return false;
        }

        this.Termy = new List<Term>();
        PorterStemmer ps = new PorterStemmer();
        foreach (string item in tmpTerms)
        {
            Term newTerm = new Term { TermOryginal = item };

            // Remove punctuation before stemming.
            string termStem = Regex.Replace(newTerm.TermOryginal, "(\\p{P})", string.Empty).ToLower().Trim();

            // Apply the Porter algorithm.
            newTerm.TermStemming = ps.stemTerm(termStem);
            this.Termy.Add(newTerm);
        }

        this.Termy.Sort((x, y) => x.TermOryginal.CompareTo(y.TermOryginal));
        this.Termy = this.Termy.Distinct(Term.EqComparer).ToList();
        return true;
    }
    catch
    {
        // Best-effort loader: any I/O or parsing failure is reported as false.
        return false;
    }
}
/// <summary>
/// Tokenizes and stems the text of every page in <c>wikiPages</c>, filling each page's
/// <c>tf_IDF_Vec</c> and the <c>inverseTokens</c> per-stem page counts, then computes a
/// TF-IDF weight for every token and divides each page's weights by the Euclidean length
/// of that page's weight vector.
/// </summary>
public void ExtractTokens()
{
    // Every character in the range 0..255 that is not a letter is a token delimiter.
    string str = "";
    for (int i = 0; i < 256; ++i)
    {
        char ch = (char)i;
        if (!char.IsLetter(ch))
        {
            str += ch;
        }
    }

    char[] delimiterChars = str.ToCharArray();
    PorterStemmer stemmer = new PorterStemmer();

    foreach (WikiPage page in wikiPages)
    {
        string temp = page.text;

        // Blank out non-ASCII characters and leftover markup before tokenizing.
        temp = Regex.Replace(temp, @"[^\u0000-\u007F]", " ");
        temp = Regex.Replace(temp, @"<span.*> ", " ");
        temp = Regex.Replace(temp, @"<math>.*</math>", " ");
        temp = temp.Replace("</span>", " ");
        temp = temp.Replace("<math", " ");
        temp = temp.Replace("</math>", " ");
        temp = temp.Replace("\'\'", " ");
        temp = temp.Replace(" \'", " ");
        temp = temp.Replace("\' ", " ");

        string[] tokenStrings = temp.Split(delimiterChars, StringSplitOptions.RemoveEmptyEntries);
        foreach (string tokenString in tokenStrings)
        {
            string stem = stemmer.stemTerm(tokenString).ToLower();

            // Skip very short stems and stems that come from markup rather than content.
            if (stem.Length <= 2
                || stem == "ref"
                || stem == "sub"
                || stem.StartsWith("disambig")
                || stem == "sup"
                || stem == "stub"
                || stem == "frac"
                || stem == "nbsp")
            {
                continue;
            }

            if (page.tf_IDF_Vec.ContainsKey(stem))
            {
                ++page.tf_IDF_Vec[stem].TF;
            }
            else
            {
                // First occurrence of this stem on this page: register the token and
                // count the page once towards the stem's page count.
                // NOTE(review): presumably the WikiToken constructor starts TF at 1 - confirm.
                page.tf_IDF_Vec[stem] = new WikiToken(tokenString, stem);
                if (inverseTokens.ContainsKey(stem))
                {
                    ++inverseTokens[stem];
                }
                else
                {
                    inverseTokens[stem] = 1;
                }
            }
        }
    }

    foreach (WikiPage page in wikiPages)
    {
        float squaredSummed = 0;
        foreach (string token in page.tf_IDF_Vec.Keys)
        {
            WikiToken wikiToken = page.tf_IDF_Vec[token];
            wikiToken.DF = inverseTokens[wikiToken.Stemmed];
            wikiToken.TF_IDF = (float)(1 + Math.Log((float)wikiToken.TF, 2))
                * (float)Math.Log((float)wikiPages.Count / wikiToken.DF, 2);
            squaredSummed += wikiToken.TF_IDF * wikiToken.TF_IDF;
        }

        float magnitude = (float)Math.Sqrt(squaredSummed);

        // When every weight on the page is 0 (e.g. each stem occurs on all pages, so the
        // IDF factor is log2(1) = 0) the magnitude is 0; dividing would turn every weight
        // into NaN, so the all-zero vector is left untouched.
        if (magnitude <= 0)
        {
            continue;
        }

        foreach (string token in page.tf_IDF_Vec.Keys)
        {
            WikiToken wikiToken = page.tf_IDF_Vec[token];
            wikiToken.TF_IDF /= magnitude;
        }
    }
}