/// <summary>
/// Loads the term dictionary from a fixed posting directory into <c>d</c>, while
/// <c>loadDocs</c> runs on a second thread for the same directory.
/// </summary>
/// <remarks>
/// The dictionary file is consumed as pairs of lines. The first line of a pair is a
/// comma-separated record: term, two integers, a type character and an integer handed
/// to <c>setPointer</c>. The second line is a comma-separated list appended to
/// <c>nextString</c>. A trailing line that has no partner line is ignored.
/// </remarks>
private void loadDictionaryButton_Click()
{
    string dictPath = @"C:\Users\ירדן\Desktop\לימודים\שנה ד\סמס1\אחזור מידע\data\posting";
    bool stemm = false;
    d = new Dictionary<string, TermInfo>();
    docs = new Dictionary<string, DocumentInfo>();

    string p = dictPath + (stemm ? @"\DictionaryStemm.txt" : @"\Dictionary.txt");
    if (!File.Exists(p))
    {
        System.Windows.Forms.MessageBox.Show("Dictionary not found !");
        return;
    }

    // The documents are loaded in parallel with the dictionary parsing below.
    Thread tDocs = new Thread(() => loadDocs(dictPath, stemm));
    tDocs.Start();
    try
    {
        string[] tempDic = File.ReadAllLines(p);

        // Each record spans two lines; requiring i + 1 to be in range keeps a file
        // with an odd number of lines from indexing past the end of the array.
        for (int i = 0; i + 1 < tempDic.Length; i += 2)
        {
            string[] t = tempDic[i].Split(',');
            string term = t[0];
            TermInfo ti = new TermInfo(t[0], int.Parse(t[1]), int.Parse(t[2]), t[3][0]);
            ti.setPointer(int.Parse(t[4]));

            foreach (string next in tempDic[i + 1].Split(','))
            {
                ti.nextString.Add(next);
            }

            d[term] = ti;
        }
    }
    finally
    {
        // Always wait for the loader thread, even when parsing fails, so it does not
        // keep running after this method has been left.
        tDocs.Join();
    }

    System.Windows.Forms.MessageBox.Show("load data succeeded");
}
/// <summary>
/// Writes a temporary posting file for the given batch of terms and folds the
/// batch statistics into the full dictionary.
/// </summary>
/// <param name="terms">
/// Batch terms keyed by term text. The dictionary is cleared after the posting file
/// has been written.
/// </param>
public void buildInvertedIndex(Dictionary<string, TermInfo> terms)
{
    List<string> orderedTerms = terms.Keys.ToList();
    orderedTerms.Sort();
    string fullPath = path + @"\" + idx + ".txt";

    using (FileStream fs = File.Create(fullPath))
    using (StreamWriter sw = new StreamWriter(fs))
    {
        foreach (string termText in orderedTerms)
        {
            TermInfo batchInfo = terms[termText];

            // Two lines per term: the term itself, then its document list.
            string postingLine = batchInfo.getStringOfDocs();
            sw.WriteLine(termText);
            sw.WriteLine(postingLine);

            // Dictionary: merge the batch counters into the full dictionary.
            int docCount = batchInfo.locations.Count;
            int batchFrequency = batchInfo.corpusF;

            if (!fullD.ContainsKey(termText))
            {
                TermInfo created = new TermInfo(termText, docCount, batchFrequency, batchInfo.type);
                created.nextStringFull = batchInfo.nextStringFull;
                fullD[termText] = created;
                continue;
            }

            TermInfo known = fullD[termText];
            known.docF += docCount;
            known.corpusF += batchFrequency;
            known.addNextString(batchInfo.nextStringFull);
        }

        terms.Clear();
    }

    idx++;
}
/// <summary>
/// Reads up to <paramref name="amount"/> files starting at the current file index,
/// splits each file into documents, parses every document and builds a mini
/// dictionary for the batch.
/// </summary>
/// <param name="amount">Maximum number of files to read in this batch.</param>
/// <returns>
/// The terms found in the batch keyed by term text, or <c>null</c> when the file list
/// is empty. When fewer than <paramref name="amount"/> files remain, only the
/// remaining files are read.
/// </returns>
public Dictionary<string, TermInfo> readBatch(int amount)
{
    if (files.Length == 0)
    {
        return null;
    }

    Dictionary<string, TermInfo> termsInFiles = new Dictionary<string, TermInfo>();

    // Also stop at the end of the file list, so a last batch that is larger than the
    // number of remaining files does not index past the array.
    for (int i = 0; i < amount && idxFile < files.Length; i++)
    {
        string text = System.IO.File.ReadAllText(files[idxFile]);
        List<string> docsInFile = getDocs(text);

        foreach (string doc in docsInFile)
        {
            parser = new Parse(stopWords, stemm);
            Dictionary<string, int> termsInCurrDoc = parser.parseDoc(doc); // term -> frequency in this document
            DocumentInfo docInfo = parser.getDoc();                        // document details
            docInfo.numTermsInDoc = parser.countTerms;

            // Add the terms of this document to the batch dictionary.
            foreach (KeyValuePair<string, int> curr in termsInCurrDoc)
            {
                TermInfo info;
                bool isNewTerm = !termsInFiles.TryGetValue(curr.Key, out info);

                if (isNewTerm)
                {
                    info = new TermInfo();
                    info.term = curr.Key;
                    info.type = parser.termType[curr.Key];
                    info.corpusF = curr.Value;
                    info.docF = 1;
                }
                else
                {
                    info.corpusF += curr.Value;
                    info.docF++;
                }

                // Per-document data is recorded the same way for new and known terms.
                info.locations[docInfo.docID] = curr.Value;
                if (parser.nextTerm.ContainsKey(curr.Key))
                {
                    info.addNextString(parser.nextTerm[curr.Key]);
                }
                info.inFirstThird[docInfo.docID] = parser.termsInFirstThirdOfText[curr.Key];
                info.inLast10[docInfo.docID] = parser.termsInLast10OfText[curr.Key];

                if (isNewTerm)
                {
                    termsInFiles[curr.Key] = info;
                }
            }

            docsInDataBase.Add(docInfo);
            countDocL += docInfo.numTermsInDoc;
        }

        idxFile++;
    }

    return termsInFiles;
}
/// <summary>
/// Loads the term dictionary from the directory typed into the posting/dictionary
/// path box into <c>d</c>, while <c>loadDocs</c> runs on a second thread for the
/// same directory.
/// </summary>
/// <remarks>
/// The stemmed dictionary file is used when the stemming check box has content and is
/// checked. The file is consumed as pairs of lines. The first line of a pair is a
/// comma-separated record: term, two integers, a type character and an integer handed
/// to <c>setPointer</c>. The second line is a comma-separated list appended to
/// <c>nextString</c>. A trailing line that has no partner line is ignored.
/// </remarks>
/// <param name="sender">Event source; not used.</param>
/// <param name="e">Event data; not used.</param>
private void loadDictionaryButton_Click(object sender, RoutedEventArgs e)
{
    string dictPath = PathPostingDictionary.Text;
    bool stemm = stemmingCheck.HasContent && stemmingCheck.IsChecked == true;
    d = new Dictionary<string, TermInfo>();
    docs = new Dictionary<string, DocumentInfo>();

    string p = dictPath + (stemm ? @"\DictionaryStemm.txt" : @"\Dictionary.txt");
    if (!File.Exists(p))
    {
        System.Windows.Forms.MessageBox.Show("Dictionary not found !");
        return;
    }

    // The documents are loaded in parallel with the dictionary parsing below.
    Thread tDocs = new Thread(() => loadDocs(dictPath, stemm));
    tDocs.Start();
    try
    {
        string[] tempDic = File.ReadAllLines(p);

        // Each record spans two lines; requiring i + 1 to be in range keeps a file
        // with an odd number of lines from indexing past the end of the array.
        for (int i = 0; i + 1 < tempDic.Length; i += 2)
        {
            string[] t = tempDic[i].Split(',');
            string term = t[0];
            TermInfo ti = new TermInfo(t[0], int.Parse(t[1]), int.Parse(t[2]), t[3][0]);
            ti.setPointer(int.Parse(t[4]));

            foreach (string next in tempDic[i + 1].Split(','))
            {
                ti.nextString.Add(next);
            }

            d[term] = ti;
        }
    }
    finally
    {
        // Always wait for the loader thread, even when parsing fails, so it does not
        // keep running after this handler has been left.
        tDocs.Join();
    }

    System.Windows.Forms.MessageBox.Show("load data succeeded");
}