//the bigram of a word table insertion
/// <summary>
/// Breaks a term into character bigrams, using "$" as the boundary marker
/// before the first character and after the last one, and hands every bigram
/// to <c>SQL_DB.insert_into_DB_BIGRAMTable</c>.
/// </summary>
/// <param name="text">The term to split. Null or empty terms are ignored.</param>
public static void AddBiGram(string text)
{
    // Nothing to index: an empty term would otherwise produce the bare "$$"
    // pair and a null term would throw. AddSoundex skips empty terms as well.
    if (string.IsNullOrEmpty(text))
    {
        return;
    }

    // Pad once so that every bigram is simply a two-character window.
    string padded = "$" + text + "$";

    // Repeated bigrams inside one term are not filtered here; every window is
    // passed on. NOTE(review): presumably the insert call is responsible for
    // skipping bigrams that are already stored - confirm.
    for (int start = 0; start + 1 < padded.Length; start++)
    {
        SQL_DB.insert_into_DB_BIGRAMTable(padded.Substring(start, 2));
    }
}
//the soundex code table insertion
/// <summary>
/// Builds a soundex-style code for a term and hands it to
/// <c>SQL_DB.insert_into_DB_SoundexTable</c>.
/// </summary>
/// <remarks>
/// The code starts with the term's first character, unchanged. Every later
/// character is replaced by the value of the first entry in
/// <c>Constants.DictionarysoundexList</c> whose key contains it; characters
/// with no matching entry contribute nothing. Adjacent duplicates are then
/// collapsed, every '0' is removed, and the result is right-padded with '0'
/// up to four characters.
/// NOTE(review): codes longer than four characters are not truncated -
/// confirm whether the lookup side expects that.
/// </remarks>
/// <param name="single_term">The term to encode. Null or empty terms are ignored.</param>
public static void AddSoundex(string single_term)
{
    // A null term used to throw on the indexer below; treat it like "".
    if (string.IsNullOrEmpty(single_term))
    {
        return;
    }

    // The first character is kept as-is; only the rest is encoded.
    string codeterm = single_term[0].ToString();

    for (int i = 1; i < single_term.Length; i++)
    {
        // First dictionary entry whose key contains the character wins.
        foreach (var c in Constants.DictionarysoundexList)
        {
            if (c.Key.Contains(single_term[i]))
            {
                codeterm += c.Value;
                break;
            }
        }
    }

    // Collapse runs of equal adjacent characters. Walking from the end means
    // a removal never shifts an index that has not been visited yet.
    for (int j = codeterm.Length - 1; j >= 1; j--)
    {
        if (codeterm[j] == codeterm[j - 1])
        {
            codeterm = codeterm.Remove(j, 1);
        }
    }

    // Drop every '0'. A single Replace does not rely on the duplicates having
    // been collapsed first, unlike removing characters while indexing forward.
    codeterm = codeterm.Replace("0", "");

    // Pad short codes with trailing zeros up to four characters.
    codeterm = codeterm.PadRight(4, '0');

    SQL_DB.insert_into_DB_SoundexTable(codeterm);
}
//the whole indexing process
/// <summary>
/// Runs the full indexing pass over every record returned by
/// <c>SQL_DB.GetAllUrlData()</c>.
/// </summary>
/// <remarks>
/// Steps, in order:
/// 1. Tokenize each record's content, run the linguistic pass, and collect
///    every non-stopword term with its token positions into <c>terms</c>.
/// 2. Store the unstemmed (term, posting) pairs for alphabetic terms.
/// 3. Store the bigrams of every term, then the soundex code of every term.
/// 4. Assign the result of <c>stemming()</c> to <c>inverted_index_table</c>
///    and store its alphabetic terms together with their postings.
/// </remarks>
public void indexing()
{
    // Select the contents of all URLs.
    List<URL_Data> contents = SQL_DB.GetAllUrlData();

    foreach (URL_Data page in contents)
    {
        // HTML parsing (ParseHTMLContent) is deliberately skipped to reduce
        // run time, because the stored content is already managed data.
        List<string> splittedContentList = tokenize(page.content);
        List<string> doc_Tokens = linguistics_Algo(splittedContentList);

        // term -> positions of that term in the token list. Positions are
        // counted before stopwords are dropped.
        Dictionary<string, List<int>> positionsByTerm = new Dictionary<string, List<int>>();
        for (int pos = 0; pos < doc_Tokens.Count; pos++)
        {
            string token = doc_Tokens[pos];
            if (Constants.StopWords.Contains(token))
            {
                continue;
            }

            List<int> positions;
            if (!positionsByTerm.TryGetValue(token, out positions))
            {
                positions = new List<int>();
                positionsByTerm.Add(token, positions);
            }
            positions.Add(pos);
        }

        // Merge this document's terms into the collection-wide dictionary:
        // one posting (doc id, frequency, comma-separated positions) per
        // term per document.
        foreach (KeyValuePair<string, List<int>> entry in positionsByTerm)
        {
            inverted_index posting = new inverted_index
            {
                DocId = page.ID,
                Frequency = entry.Value.Count,
                Position = string.Join<int>(",", entry.Value)
            };

            if (!terms.ContainsKey(entry.Key))
            {
                // First time this term is seen: new row (key, value).
                terms.Add(entry.Key, new List<inverted_index> { posting });
            }
            else
            {
                // Known term: append the new posting only.
                terms[entry.Key].Add(posting);
            }
        }
    }

    // Save each term with its postings before stemming, for the next module.
    // The filter depends only on the term, so it is checked once per term.
    foreach (var term in terms)
    {
        if (!IsAlphabeticTerm(term.Key))
        {
            continue;
        }

        foreach (inverted_index posting in term.Value)
        {
            SQL_DB.insert_into_DB_TermDocTable(term.Key, posting);
        }
    }

    // Bigrams of every term (no alphabetic filter is applied here).
    foreach (string bigramSource in terms.Keys)
    {
        AddBiGram(bigramSource);
    }

    // Soundex code of every term (no alphabetic filter is applied here).
    foreach (string soundexSource in terms.Keys)
    {
        AddSoundex(soundexSource);
    }

    // Stemming algorithm.
    inverted_index_table = stemming();

    // Build the inverted index from the stemmed table.
    foreach (var rec in inverted_index_table)
    {
        if (!IsAlphabeticTerm(rec.Key))
        {
            continue;
        }

        int term_id = SQL_DB.insert_into_DB_termsTable(rec.Key);
        var postings = rec.Value;
        for (int p = 0; p < postings.Count; p++)
        {
            // Tag the posting with the id returned for its term, then store it.
            postings[p].Term_id = term_id;
            SQL_DB.insert_into_DB_invertedindexTable(rec.Key, postings[p]);
        }
    }
}

// True when the term matches the letters-only pattern and is not a single space.
private static bool IsAlphabeticTerm(string term)
{
    return Regex.IsMatch(term, @"^[a-zA-Z]+$") && term != " ";
}