// Merges two inverted_index lists on DocId: entries for the same document are
// combined (frequencies summed, comma-separated position strings concatenated);
// entries that appear in only one list are carried through unchanged.
// Side effect (preserved from the original): matched entries are REMOVED from
// both list1 and list2, so the caller's lists end up holding only the
// unmatched leftovers.
private List <inverted_index> MergeIndexValues(List <inverted_index> list1, List <inverted_index> list2)
{
    List <inverted_index> inverted_index_val = new List <inverted_index> { };
    int j = 0;
    while (j < list1.Count)
    {
        // First entry in list2 referring to the same document, if any.
        int k = list2.FindIndex(entry => entry.DocId == list1[j].DocId);
        if (k < 0)
        {
            j++; // no match for this doc — keep scanning list1
            continue;
        }
        inverted_index inverted_index_obj = new inverted_index
        {
            DocId = list1[j].DocId,
            // Frequency must equal the number of merged positions, i.e. f1 + f2:
            // the positions string below holds exactly f1 + f2 comma-separated
            // entries. The previous "+ 1" over-counted every merged document.
            Frequency = list1[j].Frequency + list2[k].Frequency,
            Position = list1[j].Position + "," + list2[k].Position
        };
        inverted_index_val.Add(inverted_index_obj);
        // Remove the consumed entries; do NOT advance j — the next element
        // of list1 has shifted into slot j.
        list1.RemoveAt(j);
        list2.RemoveAt(k);
    }
    // Whatever is left appeared in only one of the two lists.
    inverted_index_val.AddRange(list1);
    inverted_index_val.AddRange(list2);
    return (inverted_index_val);
}
// Inserts one inverted-index record (term id, term text, doc id, frequency,
// comma-separated positions) into the invertedIndex table.
// The query is parameterized, so term text cannot inject SQL.
public static void insert_into_DB_invertedindexTable(string term, inverted_index item)
{
    //SQLQuery
    string insertStr = @" insert into invertedIndex(Term_id, Term, DocId, Frequency, Positions)values(@Term_id, @Term, @DocId, @Frequency, @Positions)";
    // using guarantees the connection and command are disposed (and the
    // connection returned to the pool) even if ExecuteNonQuery throws —
    // the original leaked the connection on failure.
    using (SqlConnection con = new SqlConnection(connectionString))
    using (SqlCommand cmd = new SqlCommand(insertStr, con))
    {
        cmd.Parameters.Add(new SqlParameter("@Term_id", item.Term_id));
        cmd.Parameters.Add(new SqlParameter("@Term", term));
        cmd.Parameters.Add(new SqlParameter("@DocId", item.DocId));
        cmd.Parameters.Add(new SqlParameter("@Frequency", item.Frequency));
        cmd.Parameters.Add(new SqlParameter("@Positions", item.Position));
        con.Open();
        cmd.ExecuteNonQuery();
    }
}
// Runs the whole indexing pipeline over every crawled page:
// tokenize -> linguistic normalization -> stopword removal -> per-document
// term/position map -> global term dictionary -> persist pre-stemming
// term/doc pairs, bigrams and soundex codes -> stem -> persist the final
// inverted index. Populates the instance fields `terms` and
// `inverted_index_table` as it goes.
public void indexing()
{
    // Fetch the stored content of every crawled URL from the database.
    List <URL_Data> contents = SQL_DB.GetAllUrlData();
    // Build the in-memory term dictionary, one document at a time.
    for (int i = 0; i < contents.Count; i++)
    {
        // 1. HTML parsing is intentionally skipped: the stored content is
        //    already managed/clean text, and skipping the parse saves time.
        // string textOfContent = ParseHTMLContent(contents[i].content);
        // 2. Tokenize the document content.
        string textOfContent = contents[i].content;
        List <string> splittedContentList = tokenize(textOfContent);
        // 3. Linguistic normalization (case folding etc. — done by the
        //    linguistics_Algo helper; exact steps live there).
        List <string> doc_Tokens = linguistics_Algo(splittedContentList);
        // 4. Per-document map: term -> comma-separated token positions.
        //    Positions are indices in doc_Tokens BEFORE stopwords are
        //    dropped, so position gaps mark where stopwords stood.
        Dictionary <string, string> doc_Terms = new Dictionary <string, string>();
        for (int j = 0; j < doc_Tokens.Count; j++)
        {
            if (!Constants.StopWords.Contains(doc_Tokens[j]))
            {
                string term = doc_Tokens[j];
                if (doc_Terms.Keys.Contains(term))
                {
                    // Seen before in this doc: append this position.
                    doc_Terms[term] = doc_Terms[term] + "," + j.ToString();
                }
                else
                {
                    // First occurrence in this doc.
                    doc_Terms.Add(term, j.ToString());
                }
            }
        }
        // Fold this document's terms into the global dictionary
        // `terms`: term -> list of per-document postings
        // (DocId, Frequency, Positions). Frequency is derived from the
        // positions string: comma count + 1 == number of occurrences.
        foreach (var dTerm in doc_Terms)
        {
            inverted_index doc_term_data = new inverted_index
            {
                DocId = contents[i].ID,
                Frequency = dTerm.Value.Count(c => c == ',') + 1,
                Position = dTerm.Value
            };
            if (terms.Keys.Contains(dTerm.Key))
            {
                // Term already known: add this document's posting.
                (terms[dTerm.Key]).Add(doc_term_data);
            }
            else
            {
                // New term: start its postings list.
                terms.Add(dTerm.Key, new List <inverted_index> { doc_term_data });
            }
        }
    }
    // Persist (term, doc) pairs BEFORE stemming — the next module is built
    // on the unstemmed vocabulary. Only purely alphabetic terms are kept;
    // everything else (numbers, punctuation, blanks) is filtered out.
    // NOTE(review): the extra `!= " "` check is redundant — a single space
    // can never match ^[a-zA-Z]+$ — but it is preserved as-is here.
    foreach (var term in terms)
    {
        for (int i = 0; i < term.Value.Count; i++)
        {
            //if ((Regex.IsMatch(term.Key, "^[a-zA-Z0-9]*$")) && (term.Key != " "))
            if ((Regex.IsMatch(term.Key, @"^[a-zA-Z]+$")) && (term.Key != " "))
            {
                SQL_DB.insert_into_DB_TermDocTable(term.Key, term.Value[i]);
            }
        }
    }
    // Store each term's bigrams in the DB (used for wildcard queries —
    // presumably; AddBiGram's behavior lives outside this view).
    foreach (var term in terms)
    {
        string text = term.Key;
        AddBiGram(text);
    }
    // Store each term's soundex code in the DB (spell-correction support —
    // presumably; AddSoundex's behavior lives outside this view).
    foreach (var term in terms)
    {
        string single_term = term.Key;
        AddSoundex(single_term);
    }
    // Stemming collapses term variants; stemming() also merges their
    // postings into inverted_index_table (stem -> postings list).
    inverted_index_table = stemming();
    // Persist the final inverted index: one row in the terms table per
    // alphabetic stem, then one invertedIndex row per posting, each tagged
    // with the generated term id.
    foreach (var rec in inverted_index_table)
    {
        if ((Regex.IsMatch(rec.Key, @"^[a-zA-Z]+$")) && (rec.Key != " "))
        {
            int term_id = SQL_DB.insert_into_DB_termsTable(rec.Key);
            for (int i = 0; i < rec.Value.Count; i++)
            {
                rec.Value[i].Term_id = term_id;
                SQL_DB.insert_into_DB_invertedindexTable(rec.Key, rec.Value[i]);
            }
        }
    }
}