/// <summary>
/// Scores every candidate document against the current query with a BM25-style formula.
/// </summary>
/// <param name="dic">
/// Maps a document name to the query terms found in it. Each entry's <c>LOCATION</c> is read
/// as the term's weight inside that document (getTheDocument stores the posting's appearance
/// count there). The lists are only read, never modified.
/// </param>
/// <returns>Document name mapped to its accumulated score.</returns>
/// <remarks>
/// Reads <c>m_doc_dic</c>, <c>m_terms_dictionary</c>, <c>m_words_in_query</c> and
/// <c>m_average_lenght_doc</c>. A document or term missing from those dictionaries still
/// raises <see cref="KeyNotFoundException"/>, as before.
/// </remarks>
private Dictionary<string, double> calculatingDocumentsBM25(Dictionary<string, LinkedList<WordsConnected>> dic)
{
    // Tuning constants. With b == 0 the document-length normalisation is switched off,
    // so K collapses to k1 for every document.
    double k1 = 1.1;
    double b = 0;
    double k2 = 1000;

    Dictionary<string, double> array_doc = new Dictionary<string, double>();
    int numberOfDoc = m_doc_dic.Count;

    // m_average_lenght_doc is divided by the number of documents to get the mean length.
    // NOTE(review): presumably the field holds the summed length of all documents - confirm.
    double averageDocLength = numberOfDoc > 0 ? (double)m_average_lenght_doc / numberOfDoc : 0;

    foreach (KeyValuePair<string, LinkedList<WordsConnected>> entry in dic)
    {
        double docLength = m_doc_dic[entry.Key].DOC_LENGTH;

        // Standard BM25 normalises by dl / avgdl. The previous code multiplied b by the raw
        // length and left the computed average unused; the result is identical while b == 0.
        double lengthRatio = averageDocLength > 0 ? docLength / averageDocLength : 1;
        double K = k1 * ((1 - b) + b * lengthRatio);

        double score = 0;
        foreach (WordsConnected term in entry.Value)
        {
            double df = m_terms_dictionary[term.NAME].DF;
            double termWeight = term.LOCATION;
            double queryWeight = m_words_in_query[term.NAME];

            // Inverse document frequency: log((N - df + 0.5) / (df + 0.5)).
            double idf = Math.Log(1.0 / ((df + 0.5) / (numberOfDoc - df + 0.5)));
            double documentPart = ((k1 + 1) * termWeight) / (K + termWeight);
            double queryPart = ((k2 + 1) * queryWeight) / (k2 + queryWeight);

            score = score + (idf * documentPart * queryPart);
        }

        array_doc.Add(entry.Key, score);
    }

    return array_doc;
}
/// <summary>
/// Scores every document in <c>m_query_in_RelevantDoc</c> with a simplified cosine-style
/// similarity: the sum of the per-term weights (N / DF) of the query terms found in the
/// document, divided by (number of distinct query terms * document length).
/// </summary>
/// <returns>Document name mapped to its similarity score.</returns>
/// <remarks>
/// Reads <c>m_doc_dic</c>, <c>m_terms_dictionary</c>, <c>m_words_in_query</c> and
/// <c>m_query_in_RelevantDoc</c>. The per-document term lists are only read.
/// The denominator is a plain product, not the square-root form of the textbook cosine.
/// </remarks>
private Dictionary<string, double> calculatingDocumentsCosSim()
{
    Dictionary<string, double> array_doc = new Dictionary<string, double>();
    Dictionary<string, double> idf_words = new Dictionary<string, double>();
    int number_doc = m_doc_dic.Count;

    // Weight table for the query terms that exist in the index.
    foreach (string word in m_words_in_query.Keys)
    {
        if (m_terms_dictionary.ContainsKey(word))
        {
            // Divide in floating point: if DF is an integral type, number_doc / DF would
            // truncate the weight. A term with no documents contributes nothing.
            double df = m_terms_dictionary[word].DF;
            idf_words[word] = df > 0 ? number_doc / df : 0;
        }
    }

    foreach (string doc in m_query_in_RelevantDoc.Keys)
    {
        double weightSum = 0;
        foreach (WordsConnected term in m_query_in_RelevantDoc[doc])
        {
            // Terms that were skipped above have no weight; ignore them instead of throwing.
            double weight;
            if (idf_words.TryGetValue(term.NAME, out weight))
            {
                weightSum = weightSum + weight;
            }
        }

        double normaliser = (m_words_in_query.Count) * (m_doc_dic[doc].DOC_LENGTH);

        // An empty document (or empty query) would otherwise yield NaN / Infinity scores.
        double cosSimD = normaliser > 0 ? weightSum / normaliser : 0;
        array_doc.Add(doc, cosSimD);
    }

    return array_doc;
}
/// <summary>
/// Builds, for every document that contains at least one query word, the list of matching
/// query words together with the count stored in the posting file.
/// </summary>
/// <param name="query_list">Query number mapped to its parsed words. Each array is sorted in place.</param>
/// <param name="Dostemming">Selects the stemmed dictionary/posting files when true.</param>
/// <param name="path_posting">Directory that holds the dictionary and posting files.</param>
/// <returns>
/// Document name mapped to a linked list of (word, count) entries. A document is only
/// admitted if its language is in <c>m_list_languages_pressed</c> or that list is empty.
/// </returns>
/// <remarks>
/// Side effect: <c>m_words_in_query</c> is replaced with the word counts of the query.
/// NOTE(review): the count dictionary is shared across all entries of <paramref name="query_list"/>,
/// so with more than one query the counts accumulate - confirm this is intended.
/// The files are opened only when there is a word to look up and are always closed,
/// even if reading or parsing throws.
/// </remarks>
public Dictionary<string, LinkedList<WordsConnected>> getTheDocument(Dictionary<int, string[]> query_list, bool Dostemming, string path_posting)
{
    Dictionary<string, int> words_in_query = new Dictionary<string, int>();
    Dictionary<string, LinkedList<WordsConnected>> doc_to_calculate = new Dictionary<string, LinkedList<WordsConnected>>();

    string dictionaryPath = path_posting + (Dostemming ? "/DictionaryStemming" : "/Dictionary");
    string postingPath = path_posting + (Dostemming ? "/PostingFileStemming" : "/PostingFile");

    foreach (int query_number in query_list.Keys)
    {
        string[] wordsQuery = query_list[query_number];
        Array.Sort(wordsQuery);

        // Count how many times each word appears in the query.
        for (int i = 0; i < wordsQuery.Length; i++)
        {
            if (!words_in_query.ContainsKey(wordsQuery[i]))
            {
                words_in_query[wordsQuery[i]] = 1;
            }
            else
            {
                words_in_query[wordsQuery[i]]++;
            }
        }
        m_words_in_query = words_in_query;

        foreach (string word in words_in_query.Keys)
        {
            string posting = findPostingLine(word, dictionaryPath, postingPath);
            if (posting != null)
            {
                addPostings(word, posting, doc_to_calculate);
            }
        }
    }

    return doc_to_calculate;
}

// Scans the dictionary and posting files in lockstep from the start and returns the posting
// line that sits on the same row as the dictionary entry for `word`, or null if not found.
private static string findPostingLine(string word, string dictionaryPath, string postingPath)
{
    using (StreamReader dictionaryReader = new StreamReader(new FileStream(dictionaryPath, FileMode.Open, FileAccess.Read)))
    using (StreamReader postingReader = new StreamReader(new FileStream(postingPath, FileMode.Open, FileAccess.Read)))
    {
        string dictionaryLine;
        string postingLine;
        while ((dictionaryLine = dictionaryReader.ReadLine()) != null
               && (postingLine = postingReader.ReadLine()) != null)
        {
            // A dictionary row is "<term>/<rest>"; rows without the separator cannot match.
            int separator = dictionaryLine.IndexOf('/');
            if (separator >= 0 && dictionaryLine.Substring(0, separator) == word)
            {
                return postingLine;
            }
        }
    }

    return null;
}

// Parses a posting line of the form "doc,count;doc,count;..." and records `word` under each
// admitted document. Parsing stops at the first malformed entry instead of throwing.
private void addPostings(string word, string posting, Dictionary<string, LinkedList<WordsConnected>> doc_to_calculate)
{
    while (posting.Contains(";"))
    {
        int comma = posting.IndexOf(',');
        if (comma < 0)
        {
            break;
        }
        string doc_number = posting.Substring(0, comma);
        string rest = posting.Substring(comma + 1);

        int semicolon = rest.IndexOf(';');
        if (semicolon < 0)
        {
            break;
        }

        // An unparsable count is treated as 0.
        int appear;
        if (!Int32.TryParse(rest.Substring(0, semicolon), out appear))
        {
            appear = 0;
        }
        posting = rest.Substring(semicolon + 1);

        WordsConnected to_add = new WordsConnected(word, appear);

        LinkedList<WordsConnected> termsOfDoc;
        if (doc_to_calculate.TryGetValue(doc_number, out termsOfDoc))
        {
            // The document already passed the language filter when it was first added.
            termsOfDoc.AddFirst(to_add);
        }
        else if (m_list_languages_pressed.Contains(m_doc_dic[doc_number].LANGUAGE.ToLower()) || m_list_languages_pressed.Count == 0)
        {
            termsOfDoc = new LinkedList<WordsConnected>();
            termsOfDoc.AddFirst(to_add);
            doc_to_calculate.Add(doc_number, termsOfDoc);
        }
    }
}
/// <summary>
/// Tokenises and normalises the text of every entry in <paramref name="d"/> (numbers, dates,
/// percentages, dollar amounts, slash-separated words, stop-word removal, optional stemming).
/// In indexing mode the parsed documents and the adjacent-word ("connected words") table are
/// handed to the <c>Indexer</c>; in query mode the parsed query is handed to the <c>Ranker</c>.
/// </summary>
/// <param name="d">
/// Entry number mapped to a string array. Element [1] is the raw text to parse; element [2] is
/// used as the document name; elements [0] and [3] are forwarded unchanged to <c>Document</c>.
/// In query mode the processed entry is overwritten with the parsed-term buffer.
/// </param>
/// <param name="path_stopWords">Passed to <c>StopWords</c> to load the stop-word set.</param>
/// <param name="path_newFile">Forwarded to <c>Indexer</c> / <c>Ranker</c>.</param>
/// <param name="path_docs">Forwarded to <c>Indexer</c>.</param>
/// <param name="Dostemming">When true every word is stemmed and "dollar" (not "dollars") is matched.</param>
/// <param name="Doindexer">True for indexing mode, false for query mode.</param>
/// <param name="path_wordsConnected">File that receives the connected-words table (indexing mode).</param>
/// <param name="total_lenght_doc">Forwarded to <c>Ranker</c>.</param>
/// <param name="doc_dic">Forwarded to <c>Ranker</c>.</param>
/// <param name="terms_dictionary">Forwarded to <c>Ranker</c>.</param>
/// <param name="list_languages_pressed">Forwarded to <c>Ranker</c>.</param>
/// <param name="path_queries_ranked">Forwarded to <c>Ranker</c>.</param>
/// <remarks>
/// Query mode processes only the first entry of <paramref name="d"/>: it sets <c>m_rel_doc</c>
/// and <c>m_query_num</c> from the ranker and returns immediately.
/// Fixes relative to the previous version: "9TH" now yields "9" (was "9t"); the day in
/// "MONTH DD" is consumed instead of being emitted a second time; "MONTH DD, YYYY" keeps the
/// full day; "100bn dollars" is stored instead of being overwritten; look-ahead / look-behind
/// accesses are bounds-checked; a new document is recorded for an already-known word pair;
/// leftover pair candidates no longer leak into the next document; the output file is closed
/// even on failure.
/// </remarks>
public void parse(Dictionary<int, string[]> d, string path_stopWords, string path_newFile, string path_docs, bool Dostemming, bool Doindexer, string path_wordsConnected, int total_lenght_doc, Dictionary<string, Document> doc_dic, Dictionary<string, Term> terms_dictionary, ArrayList list_languages_pressed, string path_queries_ranked)
{
    // Punctuation sets; each site deliberately strips a slightly different set.
    const string firstPassTrailing = "-!?\";.,:)*";
    const string currentTrailing = "]{(,'|`\".;*?):";
    const string nextTrailing = "],'|`\".;*?):";
    const string leadingChars = "(\\})@;:!%=]+|'.`,?&[/_\"-*";
    const string afterSlashLeading = "(})[/|];:_@=+!%'.`,?&\"-*";

    // The stemmer reduces "dollars" to "dollar", so the keyword depends on the mode.
    string dollarWord = Dostemming ? "dollar" : "dollars";

    Dictionary<string, ArrayConnectedwords> final_connectWords = new Dictionary<string, ArrayConnectedwords>();
    LinkedList<WordsConnected> list_wordsCOnnected = new LinkedList<WordsConnected>();
    ArrayList documents = new ArrayList();
    char[] delimiters = { ' ' };
    HashSet<string> stopWords = StopWords(path_stopWords);
    Dictionary<string, string> MonthDic = new Dictionary<string, string>();
    MonthDic = CreateDic(MonthDic);

    foreach (int NumOfFile in d.Keys)
    {
        string[] words = d[NumOfFile][1].Split(delimiters);
        string[] AfterParse = new String[words.Length * 2];

        // First pass: drop a single trailing punctuation mark from every word.
        for (int i = 0; i < words.Length; i++)
        {
            words[i] = trimTrailingPunctuation(words[i], firstPassTrailing, false);
        }

        int counterWords = 0;
        int counterAfterParse = 0;
        double i1 = 0;
        double i2 = 0;
        bool CheckIFEnter = false; // true once the current word was consumed by a special rule

        while (counterWords < words.Length)
        {
            // Clean the current word and the trailing side of the next one.
            words[counterWords] = trimTrailingPunctuation(words[counterWords], currentTrailing, true);
            words[counterWords] = trimLeadingPunctuation(words[counterWords], leadingChars);
            if (counterWords + 1 < words.Length)
            {
                words[counterWords + 1] = trimTrailingPunctuation(words[counterWords + 1], nextTrailing, true);
            }

            if (Dostemming == true)
            {
                words[counterWords] = stemWord(words[counterWords]);
            }

            // Ordinal day with two digits, e.g. "16TH" -> "16".
            if ((words[counterWords] != "") && (words[counterWords].Length == 4 && words[counterWords].Substring(2).ToLower() == "th") && (Double.TryParse(words[counterWords].Substring(0, 2), out i1)))
            {
                words[counterWords] = words[counterWords].Substring(0, 2);
            }

            // Ordinal day with one digit, e.g. "9TH" -> "9".
            if ((words[counterWords] != "") && (words[counterWords].Length == 3 && words[counterWords].Substring(1).ToLower() == "th") && (Double.TryParse(words[counterWords].Substring(0, 1), out i1)))
            {
                words[counterWords] = words[counterWords].Substring(0, 1);
            }

            // ---- numbers -------------------------------------------------------------
            bool isNumber = (words[counterWords] != "") && Double.TryParse(words[counterWords], out i2);
            if (isNumber && (words.Length == counterWords + 1))
            {
                // A number that is the very last word is stored as is.
                AfterParse[counterAfterParse] = words[counterWords];
                CheckIFEnter = true;
                counterAfterParse++;
            }
            else if (isNumber && !MonthDic.ContainsKey(words[counterWords + 1]))
            {
                // A following word is guaranteed here: the last-word case was handled above.
                if (i2 > 999999)
                {
                    AfterParse[counterAfterParse] = IfNumber(i2, words[counterWords + 1]);
                    counterAfterParse++;
                    CheckIFEnter = true;
                    if (Dostemming == true)
                    {
                        words[counterWords + 1] = stemWord(words[counterWords + 1]);
                    }
                    if (isMagnitudeWord(words[counterWords + 1]))
                    {
                        counterWords++;
                    }
                }
                else if (words[counterWords + 1].Contains("/"))
                {
                    // Mixed fraction such as "3 1/2": add the fraction to the whole part.
                    string fraction = words[counterWords + 1];
                    int slash = fraction.IndexOf("/");
                    double numerator, denominator;
                    if (Double.TryParse(fraction.Substring(0, slash), out numerator) && Double.TryParse(fraction.Substring(slash + 1), out denominator))
                    {
                        i2 = i2 + (numerator / denominator);
                        AfterParse[counterAfterParse] = i2.ToString();
                        counterWords++;
                        counterAfterParse++;
                        CheckIFEnter = true;
                    }
                }
                else if (MonthDic.ContainsKey(words[counterWords + 1].ToLower()))
                {
                    string month = MonthDic[words[counterWords + 1].ToLower()];
                    bool hasYear = (counterWords + 2 < words.Length) && Double.TryParse(words[counterWords + 2], out i1);
                    if (hasYear)
                    {
                        // DD MONTH YY / DD MONTH YYYY -> YYYY-MM-DD (two-digit years get "19").
                        string year = words[counterWords + 2];
                        if (year.Length == 2 || year.Length == 4)
                        {
                            if (i2 < 10)
                            {
                                words[counterWords] = "0" + words[counterWords];
                            }
                            AfterParse[counterAfterParse] = (year.Length == 2 ? "19" + year : year) + "-" + month + "-" + words[counterWords];
                            CheckIFEnter = true;
                            counterAfterParse++;
                            counterWords = counterWords + 2;
                        }
                    }
                    else
                    {
                        // DD MONTH -> MM-DD.
                        if (i2 < 10)
                        {
                            words[counterWords] = "0" + words[counterWords];
                        }
                        AfterParse[counterAfterParse] = month + "-" + words[counterWords];
                        CheckIFEnter = true;
                        counterAfterParse++;
                        counterWords++;
                    }
                }
                else
                {
                    // "<number> million|billion|trillion".
                    if (Dostemming == true)
                    {
                        words[counterWords + 1] = stemWord(words[counterWords + 1]);
                    }
                    if (isMagnitudeWord(words[counterWords + 1]))
                    {
                        AfterParse[counterAfterParse] = IfNumber(i2, words[counterWords + 1]);
                        counterAfterParse++;
                        counterWords++;
                    }
                }
            }

            // ---- "between X and Y" ---------------------------------------------------
            if ((words[counterWords] != "") && words[counterWords].ToLower() == "between")
            {
                if (counterWords + 3 < words.Length)
                {
                    // NOTE(review): either bound being numeric is enough here ("||") - confirm intent.
                    if (Double.TryParse(words[counterWords + 1], out i1) || double.TryParse(words[counterWords + 3], out i1))
                    {
                        AfterParse[counterAfterParse] = words[counterWords] + " " + words[counterWords + 1] + " " + words[counterWords + 2] + " " + words[counterWords + 3];
                        CheckIFEnter = true;
                        counterAfterParse++;
                        counterWords = counterWords + 3;
                    }
                    else
                    {
                        AfterParse[counterAfterParse] = words[counterWords];
                        CheckIFEnter = true;
                        counterAfterParse++;
                    }
                }
            }

            // ---- "<number> percent" -> "<number>%" -----------------------------------
            if ((words[counterWords] != "") && (words[counterWords].ToLower() == "percent" || words[counterWords].ToLower() == "percentage"))
            {
                if (counterWords != 0 && counterAfterParse > 0)
                {
                    if ((words[counterWords - 1] != "") && (Double.TryParse((AfterParse[counterAfterParse - 1]), out i1)))
                    {
                        AfterParse[counterAfterParse - 1] = AfterParse[counterAfterParse - 1] + "%";
                        CheckIFEnter = true;
                    }
                }
            }

            // ---- "<number> dollars" -> "<number> Dollars" ----------------------------
            if (words[counterWords] != "" && (words[counterWords].ToLower() == dollarWord))
            {
                // Needs a previous word and a previously stored term to attach to.
                if (counterWords > 0 && counterAfterParse > 0
                    && (Double.TryParse(words[counterWords - 1], out i2) || words[counterWords - 1].Contains("/")))
                {
                    AfterParse[counterAfterParse - 1] = AfterParse[counterAfterParse - 1] + " Dollars";
                    CheckIFEnter = true;
                }
            }

            // ---- "$<number>" ---------------------------------------------------------
            if (words[counterWords] != "" && words[counterWords].Substring(0, 1) == ("$"))
            {
                words[counterWords] = words[counterWords].Substring(1);
                if (Double.TryParse(words[counterWords], out i1))
                {
                    if (i1 > 999999)
                    {
                        AfterParse[counterAfterParse] = IfNumber(i1, "") + " Dollars";
                        CheckIFEnter = true;
                        counterAfterParse++;
                    }
                    else
                    {
                        bool hasNext = counterWords + 1 < words.Length;
                        if (hasNext && Dostemming == true)
                        {
                            words[counterWords + 1] = stemWord(words[counterWords + 1]);
                        }
                        if (hasNext && isMagnitudeWord(words[counterWords + 1]))
                        {
                            AfterParse[counterAfterParse] = IfNumber(i1, words[counterWords + 1]) + " Dollars";
                            CheckIFEnter = true;
                            counterWords++;
                            counterAfterParse++;
                        }
                        else
                        {
                            AfterParse[counterAfterParse] = words[counterWords] + " Dollars";
                            CheckIFEnter = true;
                            counterAfterParse++;
                        }
                    }
                }
            }

            // ---- dates that start with a month name ----------------------------------
            if (words[counterWords] != "" && MonthDic.ContainsKey(words[counterWords].ToLower()))
            {
                string month = MonthDic[words[counterWords].ToLower()];
                bool hasNext = counterWords + 1 < words.Length;
                bool hasSecond = counterWords + 2 < words.Length;

                if (hasSecond && Double.TryParse(words[counterWords + 2], out i2) && Double.TryParse(words[counterWords + 1], out i1))
                {
                    // MONTH DD, YYYY -> YYYY-MM-DD. The comma was already stripped above,
                    // so the day is used whole.
                    if (i1 < 10)
                    {
                        words[counterWords + 1] = "0" + words[counterWords + 1];
                    }
                    AfterParse[counterAfterParse] = words[counterWords + 2] + "-" + month + "-" + words[counterWords + 1];
                    CheckIFEnter = true;
                    counterAfterParse++;
                    counterWords = counterWords + 2;
                }
                else if (hasNext)
                {
                    if (Double.TryParse(words[counterWords + 1], out i2))
                    {
                        if (i2 < 32)
                        {
                            // MONTH DD -> MM-DD.
                            if (i2 < 10)
                            {
                                words[counterWords + 1] = "0" + words[counterWords + 1];
                            }
                            AfterParse[counterAfterParse] = month + "-" + words[counterWords + 1];
                        }
                        else
                        {
                            // MONTH YYYY -> YYYY-MM.
                            AfterParse[counterAfterParse] = words[counterWords + 1] + "-" + month;
                        }
                        CheckIFEnter = true;
                        counterAfterParse++;
                        counterWords++; // consume the day / year
                    }
                }
                else
                {
                    // A month name as the very last word is kept as a plain term.
                    AfterParse[counterAfterParse] = words[counterWords];
                    CheckIFEnter = true;
                    counterAfterParse++;
                }
            }

            // ---- "100bn dollars" -> "100000 M Dollars" -------------------------------
            if ((words[counterWords] != "" && words[counterWords].Length > 1))
            {
                if (words[counterWords].Substring(words[counterWords].Length - 2, 2).ToLower() == "bn"
                    && counterWords + 1 < words.Length
                    && Double.TryParse(words[counterWords].Substring(0, words[counterWords].Length - 2), out i1))
                {
                    double num = i1 * 1000;
                    if (Dostemming == true)
                    {
                        words[counterWords + 1] = stemWord(words[counterWords + 1]);
                    }
                    if (words[counterWords + 1].ToLower() == dollarWord)
                    {
                        AfterParse[counterAfterParse] = num.ToString() + " M Dollars";
                        CheckIFEnter = true;
                        // Advance like the "20.6m dollars" rule so the term is not overwritten.
                        counterAfterParse++;
                        counterWords++;
                    }
                }
            }

            // ---- "100 billion U.S. dollars" ------------------------------------------
            if ((words[counterWords] != "" && counterWords + 2 < words.Length))
            {
                if (Dostemming == true)
                {
                    words[counterWords + 1] = stemWord(words[counterWords + 1]);
                }
                if (words[counterWords].ToLower() == "u.s." && words[counterWords + 1].ToLower() == dollarWord && counterAfterParse > 0)
                {
                    AfterParse[counterAfterParse - 1] = AfterParse[counterAfterParse - 1] + " Dollars";
                    CheckIFEnter = true;
                }
            }

            // ---- "20.6m dollars" -> "20.6 M Dollars" ---------------------------------
            if ((words[counterWords] != "" && counterWords + 1 < words.Length))
            {
                if (Dostemming == true)
                {
                    words[counterWords + 1] = stemWord(words[counterWords + 1]);
                }
                if (words[counterWords].Substring(words[counterWords].Length - 1, 1).ToLower() == "m" && Double.TryParse(words[counterWords].Substring(0, words[counterWords].Length - 1), out i1) && words[counterWords + 1].ToLower() == dollarWord)
                {
                    AfterParse[counterAfterParse] = i1.ToString() + " M Dollars";
                    CheckIFEnter = true;
                    counterAfterParse++;
                    counterWords++;
                }
            }

            // ---- plain words ---------------------------------------------------------
            if (words[counterWords] != "" && CheckIFEnter == false)
            {
                if (words[counterWords].Contains("/"))
                {
                    int index = words[counterWords].IndexOf("/");
                    if (!Double.TryParse(words[counterWords].Substring(0, index), out i1))
                    {
                        // "a/b/c" is split into its parts; the parts can outnumber the
                        // pre-sized buffer, so grow it on demand.
                        ensureCapacity(ref AfterParse, counterAfterParse);
                        AfterParse[counterAfterParse] = words[counterWords].Substring(0, index);
                        counterAfterParse++;

                        string temp_string = words[counterWords].Substring(index + 1);
                        temp_string = trimLeadingPunctuation(temp_string, afterSlashLeading);
                        while (temp_string.Contains("/"))
                        {
                            index = temp_string.IndexOf("/");
                            ensureCapacity(ref AfterParse, counterAfterParse);
                            AfterParse[counterAfterParse] = temp_string.Substring(0, index);
                            counterAfterParse++;
                            temp_string = temp_string.Substring(index + 1);
                        }
                        ensureCapacity(ref AfterParse, counterAfterParse);
                        AfterParse[counterAfterParse] = temp_string;
                        counterAfterParse++;
                    }
                }
                else if (!stopWords.Contains(words[counterWords].ToLower()))
                {
                    if (words[counterWords].Substring(0, 1) != "-")
                    {
                        AfterParse[counterAfterParse] = words[counterWords];
                        counterAfterParse++;
                        // Remember the word and its position for the connected-words table.
                        list_wordsCOnnected.AddLast(new WordsConnected(words[counterWords], counterAfterParse - 1));
                    }
                }
            }

            counterWords++;
            CheckIFEnter = false;
        }

        if (Doindexer == true)
        {
            string docName = d[NumOfFile][2];
            Document doc = new Document(docName, AfterParse, counterAfterParse - 1, d[NumOfFile][0], d[NumOfFile][3]);
            documents.Add(doc);

            // Slide over neighbouring plain words; two words whose positions are consecutive
            // form a pair that is counted per document.
            while (list_wordsCOnnected.Count >= 2)
            {
                WordsConnected obj1 = list_wordsCOnnected.First.Value;
                list_wordsCOnnected.RemoveFirst();
                WordsConnected obj2 = list_wordsCOnnected.First.Value;

                if (obj1.LOCATION + 1 == obj2.LOCATION)
                {
                    string key = obj1.NAME + " " + obj2.NAME;
                    if (final_connectWords.ContainsKey(key))
                    {
                        var occurrences = final_connectWords[key].DICTIONARY;
                        if (occurrences.ContainsKey(docName))
                        {
                            occurrences[docName]++;
                        }
                        else
                        {
                            // Known pair, first appearance in this document.
                            occurrences.Add(docName, 1);
                        }
                    }
                    else
                    {
                        final_connectWords.Add(key, new ArrayConnectedwords(docName));
                        final_connectWords[key].DICTIONARY.Add(docName, 1);
                    }
                }
            }

            // Do not let the last word of this document pair with the next document.
            list_wordsCOnnected.Clear();
        }
        else
        {
            // Query mode: publish the parsed terms and rank. Only this first entry is handled.
            d[NumOfFile] = AfterParse;

            List<string> parsedTerms = new List<string>();
            for (int i = 0; i < AfterParse.Length; i++)
            {
                if (AfterParse[i] != null)
                {
                    parsedTerms.Add(AfterParse[i]);
                }
            }
            string[] final_after_parsing = parsedTerms.ToArray();

            // Every query number in d is mapped to the same parsed term array.
            Dictionary<int, string[]> final_dic_query = new Dictionary<int, string[]>();
            foreach (int num_query in d.Keys)
            {
                final_dic_query.Add(num_query, final_after_parsing);
            }

            Ranker r = new Ranker(terms_dictionary, doc_dic, final_dic_query, total_lenght_doc, Dostemming, path_newFile, list_languages_pressed, path_queries_ranked);

            ArrayList list_rel = new ArrayList();
            m_rel_doc = new ArrayList();
            Dictionary<string, double> temp_dic = r.TOP50DOCS;
            foreach (string doc in temp_dic.Keys)
            {
                list_rel.Add(doc);
            }
            m_rel_doc = list_rel;
            m_query_num = r.QUERY_NUM;
            return;
        }
    }

    if (Doindexer == true)
    {
        // Write the connected-words table sorted by pair: "<pair>/<doc>,<count>;...".
        using (FileStream fs = new FileStream(path_wordsConnected, FileMode.Create))
        using (StreamWriter sw = new StreamWriter(fs))
        {
            foreach (var entry in final_connectWords.OrderBy(pair => pair.Key))
            {
                sw.Write(entry.Key + "/");
                foreach (string nameDoc in entry.Value.DICTIONARY.Keys)
                {
                    sw.Write(nameDoc + "," + entry.Value.DICTIONARY[nameDoc] + ";");
                }
                sw.WriteLine();
            }
        }

        // The indexer is driven by its constructor.
        new Indexer(documents, path_newFile, path_docs);
    }
}

// Removes trailing characters contained in `punctuation` from `word` (one character, or all of
// them when `repeat` is true). The literal "u.s." (any casing) is never shortened.
private static string trimTrailingPunctuation(string word, string punctuation, bool repeat)
{
    while (word != "" && word.ToLower() != "u.s." && punctuation.IndexOf(word[word.Length - 1]) >= 0)
    {
        word = word.Substring(0, word.Length - 1);
        if (!repeat)
        {
            break;
        }
    }
    return word;
}

// Removes every leading character of `word` that is contained in `punctuation`.
private static string trimLeadingPunctuation(string word, string punctuation)
{
    int start = 0;
    while (start < word.Length && punctuation.IndexOf(word[start]) >= 0)
    {
        start++;
    }
    return word.Substring(start);
}

// Stems one word with a fresh Stemmer instance.
private static string stemWord(string word)
{
    Stemmer s = new Stemmer();
    return s.stemTerm(word);
}

// True for "million", "billion" or "trillion" in any casing.
private static bool isMagnitudeWord(string word)
{
    string lower = word.ToLower();
    return lower == "million" || lower == "billion" || lower == "trillion";
}

// Doubles `buffer` when `index` would fall outside it.
private static void ensureCapacity(ref string[] buffer, int index)
{
    if (index >= buffer.Length)
    {
        Array.Resize(ref buffer, Math.Max(buffer.Length * 2, index + 1));
    }
}