PorterStemmer.StemWord C# (CSharp) Exemples de code

Exemple #1

0

Afficher le fichier

        /// <summary>
        /// Pre-processes a piece of text into a form that is easier to analyse.
        /// </summary>
        /// <remarks>
        /// Steps, as described by the original author:
        /// split the text into words,
        /// exclude stop words,
        /// stem each word,
        /// and join the words back together separated by spaces.
        /// </remarks>
        /// <param name="text">The text to clean; it is split on the space character.</param>
        /// <returns>The stemmed words joined by single spaces.</returns>
        private string CleanText(string text)
        {
            // Filtering is delegated to EnumerableValidWords; every surviving word is stemmed.
            var stems = from candidate in EnumerableValidWords(text.Split(' '))
                        select _stemmer.StemWord(candidate);

            return string.Join(" ", stems.ToArray());
        }

Exemple #2

0

Afficher le fichier

        /// <summary>
        /// Builds the record for one crawled page: stores its address and outgoing links,
        /// tokenizes the contents, computes shingle hashes, stems the tokens and counts
        /// how often each stem occurs.
        /// </summary>
        /// <param name="address">Address of the page; stored in <c>Address</c>.</param>
        /// <param name="contents">Page text; split into tokens on the space character.</param>
        /// <param name="links">Links found on the page; stored de-duplicated in <c>Links</c>.</param>
        public CrawlerLink(string address, string contents, IEnumerable <string> links)
        {
            Console.WriteLine("Processing contents...");

            this.Address = address;

            // Store links
            this.Links = new HashSet<string>(links);

            // Tokenize
            Console.Write("Tokenizing... ");
            var tokens = new List<string>(contents.Split(' '));

            Console.WriteLine("Done!");
            Console.Write("Removing short and stop word tokens... ");
            tokens.RemoveAll(token => token.Length <= 1 || StopWords.StopWordsList.Contains(token));
            Console.WriteLine("Done!");

            // Generate shingle hashes (computed on the unstemmed tokens)
            Console.Write("Generating shingle hashes... ");
            var jaccard = new Jaccard();

            this.ShingleHashes = new LinkedList<ulong>(jaccard.HashedShinglifyDocument(tokens.ToArray()));
            Console.WriteLine("Done!");

            // Apply stemming
            Console.Write("Stemming tokens... ");
            var stemmer       = new PorterStemmer();
            var stemmedTokens = new List<string>(tokens.Select(token => stemmer.StemWord(token)));

            this.Tokens = new HashSet<string>(stemmedTokens);
            Console.WriteLine("Done!");

            // Sort elements (kept so keywords are still inserted in sorted order)
            Console.Write("Sorting stemmed tokens... ");
            stemmedTokens.Sort();
            Console.WriteLine("Done!");

            // Get keyword count
            Console.Write("Adding stemmed tokens to dictionary... ");
            var keywords = new Dictionary<string, int>();

            foreach (var stemmedToken in stemmedTokens)
            {
                // Count by lookup instead of relying on equal tokens being adjacent:
                // Sort() orders with a culture-sensitive comparer while dictionary keys
                // are compared ordinally, so "previous token" tracking can reset a count
                // (and throws when the first token is the empty string).
                int occurrences;
                keywords.TryGetValue(stemmedToken, out occurrences); // leaves 0 when the key is absent
                keywords[stemmedToken] = occurrences + 1;
            }

            this.Keywords = keywords;
            Console.WriteLine("Done!");
        }

Exemple #3

0

Afficher le fichier

        /// <summary>
        /// Runs a free-text query: lower-cases and splits it on spaces, drops tokens of
        /// one character or less and stop words, stems what is left, and forwards the
        /// distinct stems to the token-list overload of <c>Execute</c>.
        /// </summary>
        /// <param name="query">The raw query text.</param>
        /// <param name="registry">Passed through unchanged to the token-list overload.</param>
        /// <param name="maxResults">Passed through unchanged to the token-list overload.</param>
        /// <param name="usePageRank">Passed through unchanged to the token-list overload.</param>
        /// <returns>Whatever the token-list overload returns for the stemmed query.</returns>
        public LinkedList <IndexEntry> Execute(string query, CrawlerRegistry registry, int maxResults = 25, bool usePageRank = false)
        {
            string[] words = query.ToLower().Split(' ');

            var stemmer     = new PorterStemmer();
            var uniqueStems = new HashSet<string>();

            foreach (var word in words)
            {
                bool tooShort = word.Length <= 1;

                if (tooShort || StopWords.StopWordsList.Contains(word))
                {
                    continue;
                }

                uniqueStems.Add(stemmer.StemWord(word.ToLower()));
            }

            return this.Execute(uniqueStems.ToList(), registry, maxResults, usePageRank);
        }

Exemple #4

0

Afficher le fichier

Fichier : Indexing.cs Projet : krestpel15/BIT694_TMA3

        /// <summary>
        /// Builds the inverted index for a folder: for every stemmed word, a map from
        /// file to the number of times that word occurs in the file.
        /// </summary>
        /// <param name="folder">Folder handed to <c>IndexingFolders</c> to enumerate the files to index.</param>
        /// <returns>
        /// The freshly built index (also stored in <c>internalIndex</c>): stemmed word -> (file -> occurrence count).
        /// </returns>
        public Dictionary <string, Dictionary <string, double> > InvertedIndex(string folder)
        {
            converter = new Converter();

            // Empty the previous index before replacing it.
            if (internalIndex != null)
            {
                internalIndex.Clear();
            }

            internalIndex = new Dictionary<string, Dictionary<string, double>>(); // the inverted index to be returned
            indexCount    = 0;                                                    // number of distinct stemmed words in the index

            PorterStemmer stemmer = new PorterStemmer();

            foreach (string file in IndexingFolders(folder))
            {
                foreach (string word in ScanFiles.scanFiles(file))
                {
                    string stemmedWord = stemmer.StemWord(word);

                    // Fetch the per-file counts for this stem, creating them on first sight.
                    Dictionary<string, double> fileList;
                    if (!internalIndex.TryGetValue(stemmedWord, out fileList))
                    {
                        fileList = new Dictionary<string, double>();
                        internalIndex.Add(stemmedWord, fileList);
                        indexCount++;
                    }

                    // Increment arithmetically; the previous ToString()/double.Parse round trip
                    // was culture-dependent and did nothing a plain addition does not.
                    double occurrences;
                    fileList.TryGetValue(file, out occurrences); // leaves 0 when the file is new for this stem
                    fileList[file] = occurrences + 1;
                }
            }

            return internalIndex;
        }

Exemple #5

0

Afficher le fichier

Fichier : TestPorterStemmer.cs Projet : MartyAddante/WordCountApp

        /// <summary>
        /// Stems every line of <c>StemmerTestFiles/RawWords.txt</c> and checks that the result
        /// matches, line for line, the expected stems in <c>StemmerTestFiles/OutputWords.txt</c>.
        /// </summary>
        /// <remarks>
        /// The fixture folder is looked up three directory levels above the application base directory.
        /// I/O failures are deliberately not caught: if a fixture cannot be read the test must fail,
        /// otherwise two empty lists would compare equal and the test would pass without testing anything.
        /// </remarks>
        public void Test_StemWordOutPut_Matches_StaticOutput()
        {
            string filepath = Path.GetDirectoryName(System.AppDomain.CurrentDomain.BaseDirectory);

            filepath = Directory.GetParent(Directory.GetParent(Directory.GetParent(filepath).FullName).FullName).FullName;

            // Path.Combine keeps the paths valid regardless of the platform's directory separator.
            string fixtureDirectory = Path.Combine(filepath, "StemmerTestFiles");

            List<string> expectedStems = new List<string>(File.ReadAllLines(Path.Combine(fixtureDirectory, "OutputWords.txt")));

            PorterStemmer ps          = new PorterStemmer();
            List<string>  actualStems = new List<string>();

            foreach (string rawWord in File.ReadAllLines(Path.Combine(fixtureDirectory, "RawWords.txt")))
            {
                actualStems.Add(ps.StemWord(rawWord));
            }

            Assert.IsTrue(expectedStems.SequenceEqual(actualStems));
        }

Exemple #6

0

Afficher le fichier

Fichier : Functions.cs Projet : Talidyn/FrequencyCalculator

        /// <summary>
        /// Reduces every term in the list to its root form using the Porter stemming algorithm.
        /// </summary>
        /// <param name="wordList">Terms to stem; the list itself is not modified.</param>
        /// <returns>A new list holding the stem of each term, in the same order as the input.</returns>
        public static List <string> StemWords(List <string> wordList)
        {
            var porter = new PorterStemmer();

            // One stemmer instance is shared by all terms.
            return wordList.ConvertAll(term => porter.StemWord(term));
        }

Exemple #7

0

Afficher le fichier

Fichier : AnswerService.cs Projet : rimever/NLP100Knocks

        /// <summary>
        /// 52. Stemming.
        /// Take the output of exercise 51 as input, apply Porter's stemming algorithm, and print
        /// each word and its stem in tab-separated form. (The exercise notes that in Python the
        /// "stemming" module is a convenient implementation of Porter's algorithm.)
        /// </summary>
        public void Answer52()
        {
            var stemmer = new PorterStemmer();

            foreach (var sentence in SplitSentence())
            {
                foreach (var word in SplitWords(sentence))
                {
                    // One "word<TAB>stem" line per word.
                    Console.WriteLine("{0}\t{1}", word, stemmer.StemWord(word));
                }

                // Blank line between sentences.
                Console.WriteLine();
            }
        }

Exemple #8

0

Afficher le fichier

        /// <summary>
        /// Handles the search button: stems the search word, looks it up in the index and shows
        /// the number of matching files and their names, or a "no results" message.
        /// </summary>
        /// <param name="sender">Event source; not used.</param>
        /// <param name="e">Event data; not used.</param>
        private void searchButton_Click(object sender, EventArgs e)
        {
            // The query is stemmed so it can be compared with the stemmed index keys.
            var    queryStemmer = new PorterStemmer();
            string target       = queryStemmer.StemWord(searchWord.Text);

            // Reset both output fields before doing anything else.
            fileList.Text   = " ";
            filesFound.Text = " ";

            // While the index thread is still running, only tell the user to retry.
            if (thread.IsAlive)
            {
                MessageBox.Show("The index is currently busy. Please try again later");
                return;
            }

            var  matchedFiles = new List<string>();
            bool anyMatch     = false;

            foreach (var entry in index.internalIndex)
            {
                string candidate = StopWords.RemoveStopwords(entry.Key);

                if (candidate != target)
                {
                    continue;
                }

                anyMatch        = true;
                filesFound.Text = entry.Value.Count.ToString();

                foreach (var folderName in entry.Value.Keys)
                {
                    matchedFiles.Add(folderName);
                }
            }

            // One file per line in the result box.
            foreach (var matchedFile in matchedFiles)
            {
                fileList.Text += matchedFile + "\r\n";
            }

            if (anyMatch)
            {
                return;
            }

            filesFound.Text = "0";
            fileList.Text   = "No results found";
        }

Exemple #9

0

Afficher le fichier

        /// <summary>
        /// Converts every .docx, .pdf and .rtf file in the current directory into a list of stemmed
        /// tokens (written to a .txt file next to the document) and writes the overall stem
        /// frequencies to <c>dictionary.txt</c>.
        /// </summary>
        /// <param name="args">
        /// Optional; <c>"verbose"</c> as the first argument enables additional output, any other
        /// first argument prints the usage text and processing continues.
        /// </param>
        static void Main(string[] args)
        {
            debug = false; // If set increases output of info
            string usage = "Usage options: \"verbose\" for additional output ";

            if (args.Length != 0)
            {
                switch (args[0])
                {
                case "verbose":
                    debug = true;
                    break;

                default:
                    Console.WriteLine(usage);
                    break;
                }
            }

            // The using block guarantees dictionary.txt is flushed and closed on every path.
            // Previously it was only closed when the last document happened to open successfully.
            using (StreamWriter outfile = new StreamWriter("dictionary.txt"))
            {
                Console.WriteLine("Getting list of files for processing");
                //*************************************
                // Get files from the current directory
                //*************************************
                string path = Directory.GetCurrentDirectory();

                string[] docx     = Directory.GetFiles(path, "*.docx");
                string[] pdf      = Directory.GetFiles(path, "*.pdf");
                string[] rtf      = Directory.GetFiles(path, "*.rtf");
                string[] docArray = docx.Concat(pdf).Concat(rtf).ToArray();
                Array.Sort(docArray);   // Ensure sort order is maintained across the processing apps

                // list all docs found
                if (debug == true)
                {
                    Console.WriteLine("We found the following list of files: ");
                    foreach (var file in docArray)
                    {
                        Console.WriteLine(file);
                    }
                }
                Console.WriteLine("Total Files found:{0}", docArray.Length);

                Dictionary<string, int> dictionary = new Dictionary<string, int>();

                // Characters that separate tokens in the extracted text.
                char[] separators = { '_', ' ', ',', '.', '-', ':', ';', '{', '}', '|', '\n', '\t', '\u2029', '\r' };
                var    stemmer    = new PorterStemmer();

                foreach (var file in docArray)
                {
                    string output = processdocument(file);
                    if (fileopen == false)
                    {
                        // The document could not be opened; skip it.
                        continue;
                    }

                    // Swap only the real extension. A textual Replace is case-sensitive and matches
                    // anywhere in the path, so e.g. "A.DOCX" used to keep its name and the tokens
                    // were appended to the source document itself.
                    string fname = Path.ChangeExtension(file, ".txt");

                    Console.WriteLine($"Writing file {fname} output...");

                    //*************
                    // tokenization
                    //*************
                    string[] tokens = output.Split(separators, StringSplitOptions.RemoveEmptyEntries);

                    // Output file has the same name with a .txt extension; opened in append mode.
                    using (StreamWriter writefile = new StreamWriter(fname, true))
                    {
                        foreach (var token in tokens)
                        {
                            string stem = stemmer.StemWord(token);
                            writefile.WriteLine(stem);

                            // Count the stem without using an exception as control flow.
                            int count;
                            dictionary.TryGetValue(stem, out count); // leaves 0 for a new stem
                            dictionary[stem] = count + 1;
                        }
                    }
                }

                // Now write out the dictionary to a text file. This no longer depends on whether
                // the last document opened, so counts from earlier documents are never lost.
                foreach (var entry in dictionary)
                {
                    outfile.WriteLine("{0}, {1}", entry.Key, entry.Value);
                }
            }

            Console.WriteLine("{0} Errors found", errorcount);
        }

Exemple #10

0

Afficher le fichier

Fichier : Program.cs Projet : ahmedfathy74/Search-Engine

        /// <summary>
        /// Builds a positional inverted index from the crawled pages stored in the database.
        /// </summary>
        /// <remarks>
        /// For each row of <c>crawler_Table</c>, until 1600 pages have been indexed, the page is processed as follows:
        /// 1. the HTML is parsed and reduced to the inner text of its body;
        /// 2. the text is split into tokens;
        /// 3. each token is stripped of punctuation and digits, spell-checked, lower-cased,
        ///    filtered against a stop-word list (terms of two characters or fewer are dropped too) and stemmed;
        /// 4. every pre-stemming term is inserted into <c>TermsBStemming_Table</c> and every stem, with its
        ///    frequency and positions, into <c>Inverted_Index</c>.
        /// Pages that raise <see cref="NullReferenceException"/> or <see cref="ArgumentOutOfRangeException"/>
        /// are reported on the console and skipped without being counted. Rows already inserted for such a
        /// page are not rolled back.
        /// </remarks>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // Stop once this many pages have been indexed (the original notes ask for at least 1500).
            const int maxPages = 1600;

            // Built once and reused for every page; a hash set gives constant-time membership tests.
            HashSet<string> stopwords = new HashSet<string>
            {
                "a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", "around", "as", "at", "back", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", "six",
                "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thick", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves", "the"
            };

            // Token separators used in step 2.
            char[] separators = { ',', ' ', '.', ':', '\t', '\n', '\r' };

            // One stemmer serves every page.
            var stemmer = new PorterStemmer();

            // In-memory copy of the index: term -> ([doc id, frequency], positions).
            List<KeyValuePair<string, KeyValuePair<int[], List<int>>>> indexmap = new List<KeyValuePair<string, KeyValuePair<int[], List<int>>>>();

            int counterofopages = 0; // number of pages indexed so far

            // The using blocks close the reader and the connection even when an unexpected exception escapes.
            // NOTE(review): the connection string is hard-coded; consider moving it to configuration.
            using (SqlConnection sqlConnection = new SqlConnection("Data Source=AHMEDFATHY-PC;Initial Catalog=newDB;Integrated Security=True; MultipleActiveResultSets=true"))
            {
                sqlConnection.Open();

                // Retrieve every crawled page.
                string queryString = "SELECT * FROM crawler_Table";

                using (SqlCommand cmd = new SqlCommand(queryString, sqlConnection))
                using (SqlDataReader rdr = cmd.ExecuteReader())
                {
                    while (rdr.Read())
                    {
                        if (counterofopages == maxPages)
                        {
                            break;
                        }

                        try
                        {
                            // Document id, link and HTML content of the current row.
                            int    doc_id  = (int)rdr["doc_id"];
                            string url     = (string)rdr["URL"];
                            string content = (string)rdr["Page_Content"];

                            // Step 1: parse the HTML page and take the inner text of its body.
                            IHTMLDocument2 myDoc = new HTMLDocumentClass();
                            myDoc.write(content);
                            string elements = myDoc.body.innerText;

                            // Step 2: split into tokens, dropping empty entries.
                            string[] tokens = elements.Split(separators, StringSplitOptions.RemoveEmptyEntries);

                            // Step 3 (req 1 and 2): clean each token and record the positions of every term.
                            // A token's position is its 1-based index among the non-empty tokens, counted
                            // before any filtering.
                            int position = 0;
                            Dictionary<string, List<int>> termsandpos = new Dictionary<string, List<int>>();
                            foreach (var rawToken in tokens)
                            {
                                position++;

                                // Remove punctuation characters, then digits.
                                string word = Regex.Replace(rawToken, @"[^\w\d\s]", "");
                                word = Regex.Replace(word, @"\d", "");
                                if (word == "")
                                {
                                    continue;
                                }

                                // Spell check with the dialog window disabled. Per the original author's note,
                                // a true result means the word is not in the dictionary, so it is skipped.
                                // NOTE(review): a new Spelling object is created per word, as before — confirm
                                // whether a single shared instance would be safe and faster.
                                Spelling ss = new Spelling();
                                ss.ShowDialog = false;
                                if (ss.SpellCheck(word))
                                {
                                    continue;
                                }

                                word = word.ToLower(); // case folding

                                List<int> positions;
                                if (!termsandpos.TryGetValue(word, out positions))
                                {
                                    positions = new List<int>();
                                    termsandpos.Add(word, positions);
                                }
                                positions.Add(position);
                            }

                            // Step 3 (req 3): drop stop words and terms of two characters or fewer.
                            Dictionary<string, List<int>> temp = new Dictionary<string, List<int>>();
                            foreach (KeyValuePair<string, List<int>> term in termsandpos)
                            {
                                if (stopwords.Contains(term.Key) || term.Key.Length <= 2)
                                {
                                    continue;
                                }

                                temp.Add(term.Key, term.Value);
                            }

                            // Step 3 (req 4): stem the terms, merging the positions of terms that share a stem.
                            Dictionary<string, List<int>> finalterm = new Dictionary<string, List<int>>();
                            foreach (KeyValuePair<string, List<int>> iter1 in temp)
                            {
                                // Record the term and its document id as it was before stemming.
                                string insertString3 = "INSERT INTO TermsBStemming_Table (termBstemming,docID) VALUES (@termBstemming,@docID)";
                                using (SqlCommand cmd3 = new SqlCommand(insertString3, sqlConnection))
                                {
                                    cmd3.Parameters.Add(new SqlParameter("@termBstemming", iter1.Key));
                                    cmd3.Parameters.Add(new SqlParameter("@docID", doc_id));
                                    cmd3.ExecuteNonQuery();
                                }

                                string stem = stemmer.StemWord(iter1.Key);

                                List<int> mergedPositions;
                                if (finalterm.TryGetValue(stem, out mergedPositions))
                                {
                                    // The stem was already produced by an earlier term: append the new positions.
                                    mergedPositions.AddRange(iter1.Value);
                                }
                                else
                                {
                                    finalterm.Add(stem, iter1.Value);
                                }
                            }

                            // Step 4: inverted index.
                            foreach (KeyValuePair<string, List<int>> iter in finalterm)
                            {
                                int   freq = iter.Value.Count;    // frequency = number of recorded positions
                                int[] arr  = { doc_id, freq };    // arr[0] = doc id, arr[1] = frequency

                                // Positions are stored as a comma-separated string.
                                var resultofpositions = string.Join(", ", iter.Value);

                                string insertString2 = "INSERT INTO Inverted_Index (Term,DocID,Frequency,position) VALUES (@Term,@DocID,@Frequency,@position)";
                                using (SqlCommand cmd2 = new SqlCommand(insertString2, sqlConnection))
                                {
                                    cmd2.Parameters.Add(new SqlParameter("@Term", iter.Key));
                                    cmd2.Parameters.Add(new SqlParameter("@DocID", doc_id));
                                    cmd2.Parameters.Add(new SqlParameter("@Frequency", freq));
                                    cmd2.Parameters.Add(new SqlParameter("@position", resultofpositions));
                                    cmd2.ExecuteNonQuery();
                                }

                                // Keep the entry in memory as well (e.g. for later ranking).
                                indexmap.Add(new KeyValuePair<string, KeyValuePair<int[], List<int>>>(iter.Key, new KeyValuePair<int[], List<int>>(arr, iter.Value)));
                            }
                        }
                        catch (NullReferenceException ex)
                        {
                            // For example a page without a body or inner text: report it and move on, uncounted.
                            Console.WriteLine(ex.Message);
                            continue;
                        }
                        catch (ArgumentOutOfRangeException exx)
                        {
                            Console.WriteLine(exx.Message);
                            continue;
                        }

                        // Only pages processed without one of the errors above are counted.
                        counterofopages++;
                    }
                }
            }
        }

Exemple #11

0

Afficher le fichier

Fichier : BooleanQuery.cs Projet : AngelOD/AAU-WI

        /// <summary>
        /// Parses a boolean query given as text: lower-cases and splits it on spaces, drops tokens
        /// of one character or less and boolean-query stop words, upper-cases the boolean operator
        /// words, stems every other token, and hands the result to the token-list overload.
        /// </summary>
        /// <param name="input">The raw query text.</param>
        /// <returns>Whatever the token-list overload of <c>ParseQuery</c> returns.</returns>
        public QueryPart ParseQuery(string input)
        {
            string[] words = input.ToLower().Split(' ');

            var stemmer      = new PorterStemmer();
            var parsedTokens = new List<string>();

            foreach (var word in words)
            {
                if (word.Length <= 1 || StopWords.BooleanStopWordsList.Contains(word))
                {
                    continue;
                }

                if (StopWords.BooleanWords.Contains(word.ToLower()))
                {
                    // Operator words are kept as-is, in upper case.
                    parsedTokens.Add(word.ToUpper());
                }
                else
                {
                    parsedTokens.Add(stemmer.StemWord(word.ToLower()));
                }
            }

            return this.ParseQuery(parsedTokens);
        }

Exemple #12

0

Afficher le fichier

Fichier : Summarizer.cs Projet : JohnSell620/FileSystemHelper

        /// <summary>
        /// Produces an extractive summary of <paramref name="textFile"/> by building a
        /// concept-by-sentence TF-IDF matrix, taking its singular value decomposition and
        /// picking, for each leading right-singular vector, the sentence with the largest
        /// above-average weight.
        /// </summary>
        /// <remarks>
        /// Side effects on <paramref name="textFile"/>: <c>DocumentConcepts</c> is set to the
        /// most frequent stemmed, non-stop-word terms (at most <c>DesiredSummaryLength</c> of
        /// them) and <c>DocumentLength</c> is set to the number of sentences found by the
        /// punctuation-followed-by-whitespace split of the raw text.
        /// Stop words are read from <c>Resources/stopwords.txt</c> on every call.
        /// </remarks>
        /// <param name="textFile">The document to summarize; its <c>RawText</c> is analyzed.</param>
        /// <returns>
        /// The selected source sentences, each followed by a single space, with line breaks,
        /// tabs and the Unicode line/paragraph separators removed. An empty string is returned
        /// when there are no sentences or no concepts to work with.
        /// </returns>
        public static string SummarizeByLSA(TextFile textFile)
        {
            string input = textFile.RawText;

            // Sentences used for the analysis (split on a broad delimiter set).
            string[] rawSentences = input.Split(new char[] { '.', '!', '?', ':', '…', '\r', '\n' },
                                                StringSplitOptions.RemoveEmptyEntries);

            // Sentences used to assemble the output (split after terminal punctuation only).
            // NOTE(review): the two splits use different delimiters, so an index into the
            // analysis sentences does not always address the same sentence here; indices
            // that fall outside this array are skipped when the summary is assembled.
            string[] sourceSentences = Regex.Split(input, @"(?<=[\.!\?])\s+");
            textFile.DocumentLength = sourceSentences.Length;

            // Stop words--e.g., the, and, a, etc. A set gives constant-time membership tests.
            var stopwords = new HashSet <string>(File.ReadAllLines(@"Resources/stopwords.txt"));
            var stemmer   = new PorterStemmer();

            // Normalize every sentence into its list of stemmed, non-stop-word terms:
            // strip punctuation, lower-case, split into words, filter, stem each word.
            string[][] tokenizedSentences = new string[rawSentences.Length][];
            for (int s = 0; s < rawSentences.Length; ++s)
            {
                var sb = new StringBuilder();
                foreach (char c in rawSentences[s].Trim())
                {
                    if (!char.IsPunctuation(c))
                    {
                        sb.Append(c);
                    }
                }

                tokenizedSentences[s] = sb.ToString()
                                        .ToLowerInvariant()
                                        .Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries)
                                        .Where(word => !stopwords.Contains(word))
                                        .Select(word => stemmer.StemWord(word))
                                        .ToArray();
            }

            // Count how often each term occurs in the whole document.
            var wordFrequencies = new Dictionary <string, int>();
            foreach (string[] words in tokenizedSentences)
            {
                foreach (string w in words)
                {
                    int seen;
                    wordFrequencies.TryGetValue(w, out seen);
                    wordFrequencies[w] = seen + 1;
                }
            }

            // Top N words with highest frequencies will serve as document concepts.
            // Take directly from the ordered sequence: a Dictionary does not promise to
            // preserve the order in which entries were added.
            int N = textFile.DesiredSummaryLength;

            string[] concepts = wordFrequencies
                                .OrderByDescending(kvp => kvp.Value)
                                .Take(N)
                                .Select(kvp => kvp.Key)
                                .ToArray();

            // Add concepts to TextFile instance properties.
            textFile.DocumentConcepts = concepts;

            int documentLength = tokenizedSentences.Length;
            if (concepts.Length == 0 || documentLength == 0)
            {
                // Nothing to decompose; an empty matrix cannot be analyzed.
                return string.Empty;
            }

            // One row per concept actually found (may be fewer than N), one column per sentence.
            var X = DenseMatrix.Create(concepts.Length, documentLength, (row, col) => 0.0);

            for (int i = 0; i < X.RowCount; ++i)
            {
                int    sentencesWithConcept = 0;
                string concept = concepts[i];
                for (int j = 0; j < X.ColumnCount; ++j)
                {
                    string[] sentenceWords = tokenizedSentences[j];
                    int      wordCount     = sentenceWords.Count(word => word == concept);
                    if (wordCount > 0)
                    {
                        sentencesWithConcept += 1;
                    }

                    // Term frequency as a real-valued ratio; a sentence left without any
                    // terms contributes zero instead of dividing by zero.
                    X[i, j] = sentenceWords.Length == 0 ? 0.0 : (double)wordCount / sentenceWords.Length;
                }
                if (sentencesWithConcept == 0)
                {
                    Console.WriteLine("No sentences with concept " + concepts[i]);
                }
                // The small constant keeps the denominator non-zero.
                double inverseDocumentFreq = Math.Log(documentLength / (sentencesWithConcept + 0.0001), 2.0);
                for (int k = 0; k < X.ColumnCount; ++k)
                {
                    X[i, k] = X[i, k] * inverseDocumentFreq;
                }
            }

            // Compute SVD of the topic representation matrix, X.
            var svd = X.Svd();

            // Cross method to select summary sentences.
            // VT cannot supply more rows than it has, so clamp the requested row count.
            int             columnCount = svd.VT.ColumnCount;
            int             topicCount  = Math.Min(concepts.Length, svd.VT.RowCount);
            Matrix <double> Vh          = svd.VT.SubMatrix(0, topicCount, 0, columnCount).PointwiseAbs();

            // Zero out every entry that does not exceed its row average.
            for (int i = 0; i < Vh.RowCount; ++i)
            {
                double averageSentenceScore = Vh.Row(i).Average();
                for (int j = 0; j < Vh.ColumnCount; ++j)
                {
                    if (Vh[i, j] <= averageSentenceScore)
                    {
                        Vh[i, j] = 0;
                    }
                }
            }

            // For each retained row pick the column (sentence) with the largest remaining
            // weight; a row with no positive entry keeps the default index 0.
            int[] summaryIndices = new int[Vh.RowCount];
            Console.Write("Vh.RowCnt = {0}", Vh.RowCount);
            Console.Write("concepts.Length = {0}", concepts.Length);
            for (int i = 0; i < Vh.RowCount; ++i)
            {
                double max = 0;
                for (int j = 0; j < Vh.ColumnCount; ++j)
                {
                    if (Vh[i, j] > max)
                    {
                        max = Vh[i, j];
                        summaryIndices[i] = j;
                    }
                }
            }

            var summary = new StringBuilder();
            foreach (int index in summaryIndices)
            {
                // Skip indices the output split cannot address (see the note above the split).
                if (index < sourceSentences.Length)
                {
                    summary.Append(sourceSentences[index]).Append(' ');
                }
            }

            /* From https://bit.ly/3ogjy2l */
            return summary.ToString()
                   .Replace("\r\n", string.Empty)
                   .Replace("\n", string.Empty)
                   .Replace("\r", string.Empty)
                   .Replace("\t", string.Empty)
                   .Replace(((char)0x2028).ToString(), string.Empty)
                   .Replace(((char)0x2029).ToString(), string.Empty);
        }

C# (CSharp) PorterStemmer.StemWord Exemples