/// <summary>
/// Exports every n-gram dictionary to its own CSV file by calling <c>ToCSV</c> on each one.
/// The character, uni-gram, bi-gram and tri-gram dictionaries are written, in that order,
/// to "Char.csv", "Uni.csv", "Bi.csv" and "Tri.csv".
/// </summary>
public void WriteToFile()
{
    // Single characters
    Char_gram.ToCSV("Char.csv");

    // Single terms
    Uni_gram.ToCSV("Uni.csv");

    // Pairs of adjacent terms
    Bi_gram.ToCSV("Bi.csv");

    // Triples of adjacent terms
    Tri_gram.ToCSV("Tri.csv");
}
/// <summary>
/// Indexes a text file. The file is read line by line; for every line the characters
/// returned by <c>Token_Char</c> are added to <c>Char_gram</c>, and the stemmed terms of the
/// line are added to <c>Uni_gram</c> (single terms), <c>Bi_gram</c> (adjacent pairs) and
/// <c>Tri_gram</c> (adjacent triples). N-grams never span more than one line.
/// </summary>
/// <param name="path">The path to the file that is being currently worked</param>
public void IndexFile(string path)
{
    // The using block guarantees the reader (and its file handle) is released,
    // even when tokenizing or stemming throws part-way through the file.
    using (StreamReader sr = new StreamReader(path))
    {
        // Loop through the file line by line
        while (!sr.EndOfStream)
        {
            string orgline = sr.ReadLine();

            // Character grams come from the raw line, split on single spaces,
            // with each piece passed through Token_Char.
            foreach (string word in orgline.Split(' '))
            {
                foreach (char c in Token_Char(word))
                {
                    Char_gram.Add(c.ToString());
                }
            }

            // Term grams come from the line after Token_String has processed it.
            // Each non-blank term is stemmed exactly once and the stem is reused for
            // the bi- and tri-grams below (assumes Stemmer.Stem is deterministic for
            // a given input - TODO confirm).
            List<string> stems = new List<string>();
            foreach (string piece in Token_String(orgline).Split(' '))
            {
                // Drop pieces that are empty or whitespace-only after trimming
                string term = piece.Trim();
                if (term.Length > 0)
                {
                    string stem = Stemmer.Stem(term);
                    stems.Add(stem);
                    Uni_gram.Add(stem);
                }
            }

            // Adjacent pairs; the loop bound makes this a no-op for fewer than two terms
            for (int i = 0; i < stems.Count - 1; i++)
            {
                Bi_gram.Add(stems[i] + " " + stems[i + 1]);
            }

            // Adjacent triples; the loop bound makes this a no-op for fewer than three terms
            for (int i = 0; i < stems.Count - 2; i++)
            {
                Tri_gram.Add(stems[i] + " " + stems[i + 1] + " " + stems[i + 2]);
            }
        }
    }
}