// Checks that PorterStemmer.stemTerm("Soars") returns "Soar".
        public void Test_Stemmer4()
        {
            // Arrange
            string token = "Soars";
            string expectedStem = "Soar";

            // Act
            PorterStemmer stemmer = new PorterStemmer();
            string actualStem = stemmer.stemTerm(token);

            // Assert: the expected value is passed first so that a failure report
            // labels "expected" and "actual" correctly.
            Assert.AreEqual(expectedStem, actualStem, "Porter Stemmer returned incorrect stem");
        }
Example #2
0
        /// <summary>
        /// Turns a query string into an array of terms: punctuation is removed, the text is
        /// lower-cased and split on spaces, every word is stemmed, and the result is sorted by
        /// stem and de-duplicated with <c>Term.EqComparer</c>.
        /// </summary>
        /// <param name="zapytanie">Raw query text.</param>
        /// <returns>The distinct terms, each holding the cleaned word and its stem.</returns>
        private static Term[] PrzerobNaTermy(string zapytanie)
        {
            string normalizedQuery = Regex.Replace(zapytanie, "(\\p{P})", string.Empty).ToLower();
            string[] words = normalizedQuery.Split(new string[] { " " }, StringSplitOptions.RemoveEmptyEntries);

            PorterStemmer stemmer = new PorterStemmer();
            List<Term> terms = words
                .Select(word => new Term
                {
                    TermOryginal = word,
                    TermStemming = stemmer.stemTerm(word)
                })
                .ToList();

            // Order by the ALREADY STEMMED form of each term.
            terms.Sort((left, right) => left.TermStemming.CompareTo(right.TermStemming));

            // Drop duplicates and hand the result back as an array.
            return terms.Distinct(Term.EqComparer).ToArray();
        }
        /// <summary>
        /// Stems every space-separated token of the input and returns the stems joined by
        /// single spaces, with leading and trailing whitespace trimmed.
        /// </summary>
        /// <param name="textToProcess">Text whose tokens are separated by the space character.</param>
        /// <returns>The stemmed tokens joined with spaces.</returns>
        public static string Process(string textToProcess)
        {
            string[] words = textToProcess.Split(new[] { ' ' });
            StemmerInterface stemmer = new PorterStemmer();

            // Stem in input order; the same stemmer instance is reused for all tokens.
            string[] stems = new string[words.Length];
            for (int i = 0; i < words.Length; i++)
            {
                stems[i] = stemmer.stemTerm(words[i]);
            }

            return string.Join(" ", stems).Trim();
        }
Example #4
0
        /// <summary>
        /// Loads documents from a text file into <c>Dokumenty</c> and stems their headers and
        /// contents with the Porter algorithm.
        /// </summary>
        /// <remarks>
        /// Documents are separated by a blank line (LF or CRLF line endings). Within a document
        /// the first line is the header and the remainder is the content; chunks without a line
        /// break or with an empty header line are skipped.
        /// </remarks>
        /// <param name="path">Path of the file to read; null, empty or non-existent paths yield <c>false</c>.</param>
        /// <returns>
        /// <c>true</c> when the file was read and contained at least one non-empty chunk;
        /// <c>false</c> otherwise, including when any exception occurs while loading.
        /// </returns>
        public bool WczytajDokumenty(string path)
        {
            if (string.IsNullOrEmpty(path) || !File.Exists(path))
            {
                return false;
            }

            try
            {
                // ReadAllText opens the file for reading only, so read-only files can be loaded.
                string documents = File.ReadAllText(path);

                string[] tmpDocs = documents.Split(new string[] { "\r\n\r\n", "\n\n" }, StringSplitOptions.RemoveEmptyEntries);
                if (tmpDocs.Length == 0)
                {
                    return false;
                }

                this.Dokumenty = new List<Dokument>();
                PorterStemmer ps = new PorterStemmer();
                foreach (string item in tmpDocs)
                {
                    int index = item.IndexOf('\n');
                    if (index <= 0)
                    {
                        // No line break, or the chunk starts with one: there is no header line.
                        continue;
                    }

                    // With CRLF line endings the header still carries the '\r' of its line break.
                    string header = item.Substring(0, index).TrimEnd('\r');
                    if (header.Length == 0)
                    {
                        continue;
                    }

                    Dokument newDoc = new Dokument
                    {
                        NaglowekOryginal = header,
                        TrescOryginal = item.Substring(index + 1)
                    };

                    // Remove punctuation and lower-case; line breaks in the content become spaces.
                    string naglClean = Regex.Replace(newDoc.NaglowekOryginal, "(\\p{P})", string.Empty).ToLower();
                    string trescClean = Regex.Replace(newDoc.TrescOryginal, "(\\p{P})", string.Empty).ToLower().Replace('\n', ' ');

                    // Apply the Porter algorithm to the header first, then to the content,
                    // so the document's terms keep that order.
                    newDoc.NaglowekStemming = StemujTekst(ps, naglClean, newDoc);
                    newDoc.TrescStemming = StemujTekst(ps, trescClean, newDoc);
                    this.Dokumenty.Add(newDoc);
                }

                return true;
            }
            catch (Exception)
            {
                // Best-effort load: any failure is reported to the caller as false.
                return false;
            }
        }

        /// <summary>
        /// Stems every space-separated word of <paramref name="text"/>, appends each stem as a
        /// term of <paramref name="doc"/> and returns the stems joined by single spaces.
        /// </summary>
        private static string StemujTekst(PorterStemmer ps, string text, Dokument doc)
        {
            string[] words = text.Split(new string[] { " " }, StringSplitOptions.RemoveEmptyEntries);
            string[] stems = new string[words.Length];
            for (int i = 0; i < words.Length; i++)
            {
                string st = ps.stemTerm(words[i].Trim());
                stems[i] = st;
                doc.Termy.Add(new Term
                {
                    TermStemming = st
                });
            }

            // Joining (instead of cutting off a trailing separator) also works for zero words.
            return string.Join(" ", stems);
        }
Example #5
0
        /// <summary>
        /// Loads terms from a text file (one term per line) into <c>Termy</c>, stems each of
        /// them with the Porter algorithm, sorts them by original text and removes duplicates
        /// using <c>Term.EqComparer</c>.
        /// </summary>
        /// <param name="path">Path of the file to read; null, empty or non-existent paths yield <c>false</c>.</param>
        /// <returns>
        /// <c>true</c> when the file was read and contained at least one non-empty line;
        /// <c>false</c> otherwise, including when any exception occurs while loading.
        /// </returns>
        public bool WczytajTermy(string path)
        {
            if (string.IsNullOrEmpty(path) || !File.Exists(path))
            {
                return false;
            }

            try
            {
                // ReadAllText opens the file for reading only, so read-only files can be loaded.
                string terms = File.ReadAllText(path);

                // Accept both CRLF and LF so that no '\r' is left at the end of a term.
                string[] tmpTerms = terms.Split(new string[] { "\r\n", "\n" }, StringSplitOptions.RemoveEmptyEntries);
                if (tmpTerms.Length == 0)
                {
                    return false;
                }

                this.Termy = new List<Term>();
                PorterStemmer ps = new PorterStemmer();
                foreach (string item in tmpTerms)
                {
                    Term newTerm = new Term
                    {
                        TermOryginal = item
                    };

                    // Remove punctuation, lower-case and trim before stemming.
                    string termStem = Regex.Replace(newTerm.TermOryginal, "(\\p{P})", string.Empty).ToLower().Trim();

                    // Apply the Porter algorithm.
                    newTerm.TermStemming = ps.stemTerm(termStem);
                    this.Termy.Add(newTerm);
                }

                this.Termy.Sort((x, y) => x.TermOryginal.CompareTo(y.TermOryginal));
                this.Termy = this.Termy.Distinct(Term.EqComparer).ToList();
                return true;
            }
            catch
            {
                // Best-effort load: any failure is reported to the caller as false.
                return false;
            }
        }
        /// <summary>
        /// Tokenizes and stems the text of every page in <c>wikiPages</c> and fills each page's
        /// <c>tf_IDF_Vec</c> with length-normalized TF-IDF weights.
        /// </summary>
        /// <remarks>
        /// Pass 1 strips markup remnants, splits the text on non-letter characters, stems each
        /// token and counts occurrences per page; <c>inverseTokens</c> counts, per stem, the
        /// pages it occurs on. Pass 2 computes (1 + log2(TF)) * log2(pageCount / DF) for every
        /// token and divides by the Euclidean norm of the page's vector. Pages whose norm is not
        /// positive are left un-normalized instead of being filled with NaN.
        /// </remarks>
        public void ExtractTokens()
        {
            // Every character in the range 0-255 that is not a letter is a token delimiter.
            char[] delimiterChars = new char[256];
            int delimiterCount = 0;
            for (int i = 0; i < 256; ++i)
            {
                char ch = (char)i;
                if (!char.IsLetter(ch))
                {
                    delimiterChars[delimiterCount++] = ch;
                }
            }
            Array.Resize(ref delimiterChars, delimiterCount);

            PorterStemmer stemmer = new PorterStemmer();

            // Pass 1: term frequency per page and page count per stem.
            foreach (WikiPage page in wikiPages)
            {
                string temp = page.text;

                // Replace non-ASCII characters and markup remnants with spaces.
                // NOTE(review): ".*" is greedy, so on a line with several <math> elements
                // everything between the first opener and the last closer is dropped - confirm
                // that this is intended.
                temp = Regex.Replace(temp, @"[^\u0000-\u007F]", " ");
                temp = Regex.Replace(temp, @"<span.*>&nbsp;", " ");
                temp = Regex.Replace(temp, @"<math>.*</math>", " ");
                temp = temp.Replace("</span>", " ");
                temp = temp.Replace("<math", " ");
                temp = temp.Replace("</math>", " ");
                temp = temp.Replace("\'\'", " ");
                temp = temp.Replace(" \'", " ");
                temp = temp.Replace("\' ", " ");

                string[] tokenStrings = temp.Split(delimiterChars, StringSplitOptions.RemoveEmptyEntries);
                foreach (string tokenString in tokenStrings)
                {
                    // Tokens are ASCII at this point; invariant casing keeps the stems
                    // independent of the current culture.
                    string stem = stemmer.stemTerm(tokenString).ToLowerInvariant();
                    if (IsIgnoredStem(stem))
                    {
                        continue;
                    }

                    WikiToken existing;
                    if (page.tf_IDF_Vec.TryGetValue(stem, out existing))
                    {
                        ++existing.TF;
                    }
                    else
                    {
                        page.tf_IDF_Vec[stem] = new WikiToken(tokenString, stem);

                        // First occurrence on this page: one more page contains the stem.
                        if (inverseTokens.ContainsKey(stem))
                        {
                            ++inverseTokens[stem];
                        }
                        else
                        {
                            inverseTokens[stem] = 1;
                        }
                    }
                }
            }

            // Pass 2: TF-IDF weights, normalized to unit length per page.
            foreach (WikiPage page in wikiPages)
            {
                float squaredSummed = 0;
                foreach (WikiToken wikiToken in page.tf_IDF_Vec.Values)
                {
                    wikiToken.DF = inverseTokens[wikiToken.Stemmed];
                    wikiToken.TF_IDF = (float)(1 + Math.Log((float)wikiToken.TF, 2)) * (float)Math.Log((float)wikiPages.Count / wikiToken.DF, 2);
                    squaredSummed += wikiToken.TF_IDF * wikiToken.TF_IDF;
                }

                float magnitude = (float)Math.Sqrt(squaredSummed);

                // A zero norm (e.g. every stem occurs on every page, so each IDF is 0) would
                // turn all weights into NaN; leave such vectors as they are.
                if (magnitude > 0)
                {
                    foreach (WikiToken wikiToken in page.tf_IDF_Vec.Values)
                    {
                        wikiToken.TF_IDF /= magnitude;
                    }
                }
            }
        }

        /// <summary>
        /// Tells whether a stem is excluded from the index: stems of at most two characters,
        /// stems starting with "disambig", and a fixed list of markup leftovers.
        /// </summary>
        private static bool IsIgnoredStem(string stem)
        {
            if (stem.Length <= 2 || stem.StartsWith("disambig", StringComparison.Ordinal))
            {
                return true;
            }

            switch (stem)
            {
                case "ref":
                case "sub":
                case "sup":
                case "stub":
                case "frac":
                case "nbsp":
                    return true;
                default:
                    return false;
            }
        }