コード例 #1
0
        public static List <List <string> > GetDocuments() //returns a list of string arrays that will be each document stemmed and have stop word removed
        {
            //Save all documents to single string
            Assembly assembly     = Assembly.GetExecutingAssembly();
            string   resourceName = "Information_Retrieval_System.Resources.MEDDocuments.txt";

            string result;

            using (Stream stream = assembly.GetManifestResourceStream(resourceName))
            {
                using (StreamReader reader = new StreamReader(stream))
                {
                    //single string that contains all documents
                    result = reader.ReadToEnd();
                }
            }
            string[] docSeperator = { ".I" };
            //split document string into separate strings for each document
            string[] splitDocuments = result.Split(docSeperator, StringSplitOptions.RemoveEmptyEntries);

            //save each document to list
            List <string[]> initialDocumentsList = new List <string[]>();

            string[] docDelimiters = { ".W", "\r", "\n", " ", ".", ",", "?", "!", "-", "/", "'", "(", ")" };
            foreach (string doc in splitDocuments)
            {
                string[] d = doc.Split(docDelimiters, StringSplitOptions.RemoveEmptyEntries);

                initialDocumentsList.Add(d);
            }

            //take stop words and stem all documents
            List <List <string> > finalDocumentsList = new List <List <string> >();

            foreach (string[] dt in initialDocumentsList)
            {
                string[] removedStopWords = StopWords.RemoveStopWords(dt);
                string[] finalDoc         = WordStemmer.QueryStemmer(removedStopWords);
                //remove doc number from beginning of document content
                List <string> temp = new List <string>(finalDoc);
                temp.RemoveAt(0);
                finalDocumentsList.Add(temp);
            }
            //return processed list of documents
            return(finalDocumentsList);
        }