/// <summary>
/// Writes the given terms to the posting files.
/// When <paramref name="hasWritten"/> is <c>false</c>, the terms are ordered by posting number and then by
/// line, written through the writer selected by <c>SwitchWriterForPosting(postNum, "Posting")</c>, and an
/// <c>IndexTerm</c> is added to <c>dictionaries</c> for every term. When it is <c>true</c>, the work is
/// delegated to <c>UpdatePosting</c> followed by <c>DeleteOldFiles</c>.
/// In both cases a progress line with <c>m_indexCounter</c> is printed to the console and the counter
/// is incremented.
/// </summary>
/// <param name="documentTermsDic">Terms to write, keyed by string; stored in <c>tempDic</c>.</param>
/// <param name="hasWritten"><c>false</c> for the first write, <c>true</c> to update existing posting files.</param>
public void WriteToPostingFile(Dictionary<string, DocumentsTerm> documentTermsDic, bool hasWritten)
{
    tempDic = documentTermsDic;
    if (!hasWritten)
    {
        // Materialize the sort once and iterate the list itself: Dictionary enumeration order is
        // documented as undefined, so enumerating the re-built dictionary must not be relied upon.
        List<KeyValuePair<string, DocumentsTerm>> orderedTerms = tempDic
            .OrderBy(i => i.Value.postNum)
            .ThenBy(i => i.Value.line)
            .ToList();

        // Keep replacing tempDic with a copy so that the Clear() below does not empty the caller's dictionary.
        this.tempDic = orderedTerms.ToDictionary(p => p.Key, p => p.Value);

        StringBuilder data = new StringBuilder();
        int postNum = -1;
        foreach (KeyValuePair<string, DocumentsTerm> pair in orderedTerms)
        {
            if (StreamShouldBeChanged(pair.Value.postNum, postNum))
            {
                // Flush everything buffered for the previous posting file before switching writers.
                if (Writer != null)
                {
                    Writer.Write(data);
                    data.Clear();
                    Writer.Flush();
                    Writer.Close();
                }
                SwitchWriterForPosting(pair.Value.postNum, "Posting");
            }

            postNum = pair.Value.postNum;

            // Register the term at the next free line of its posting file.
            IndexTerm currentTerm = new IndexTerm(pair.Value.m_valueOfTerm, postNum, currentLine[postNum]);
            currentTerm.IncreaseTfc(pair.Value.countTotalFrequency());
            currentTerm.IncreaseDf(pair.Value.m_Terms.Count);
            dictionaries[postNum].Add(pair.Value.m_valueOfTerm, currentTerm);

            data.AppendLine(pair.Value.WriteToPostingFileDocDocTerm(false).ToString());
            currentLine[postNum]++;
        }

        // Writer stays null when there were no terms at all; only finalize a writer that was opened.
        if (Writer != null)
        {
            Writer.Write(data);
            data.Clear();
            Writer.Flush();
            Writer.Close();
            Writer = null;
        }
        tempDic.Clear();
    }
    else
    {
        UpdatePosting();
        DeleteOldFiles();
    }
    Console.WriteLine("-----------" + m_indexCounter++);
}
/// <summary>
/// Loads the term dictionary from <c>Dictionary.txt</c> into memory.
/// The file is read from the "WithStem" folder when <c>m_doStemming</c> is set and from the
/// "WithOutStem" folder otherwise; if the relevant file does not exist the method returns without
/// changing anything. Each line is split on "(#)": the first field is the term, and the next four
/// fields carry df, tfc, posting number and line number as the text after their first ':'.
/// The result (27 dictionaries indexed by posting number) is assigned to <c>_dictionaries</c>.
/// </summary>
public void LoadDictionary()
{
    string stemmedFile = Path.Combine(outPutPath, "WithStem\\Dictionary.txt");
    string unstemmedFile = Path.Combine(outPutPath, "WithOutStem\\Dictionary.txt");
    bool stemmedFileExists = File.Exists(stemmedFile);
    bool unstemmedFileExists = File.Exists(unstemmedFile);

    // Pick the file matching the current stemming mode, or bail out when it is missing.
    string sourceFile;
    if (m_doStemming && stemmedFileExists)
    {
        sourceFile = stemmedFile;
    }
    else if (!m_doStemming && unstemmedFileExists)
    {
        sourceFile = unstemmedFile;
    }
    else
    {
        return;
    }

    string[] fileLines = File.ReadAllLines(sourceFile);

    Dictionary<string, IndexTerm>[] loaded = new Dictionary<string, IndexTerm>[27];
    for (int slot = 0; slot < loaded.Length; slot++)
    {
        loaded[slot] = new Dictionary<string, IndexTerm>();
    }

    string[] fieldSeparator = { "(#)" };
    foreach (string fileLine in fileLines)
    {
        string[] fields = fileLine.Split(fieldSeparator, StringSplitOptions.None);

        // Pull out every raw field first, then parse, so malformed lines fail in the same order as before.
        string termText = fields[0];
        string dfText = fields[1].Split(':')[1];
        string tfcText = fields[2].Split(':')[1];
        string postingText = fields[3].Split(':')[1];
        string lineText = fields[4].Split(':')[1];

        int postingIndex = Int32.Parse(postingText);
        IndexTerm entry = new IndexTerm(termText, postingIndex, Int32.Parse(lineText));
        entry.IncreaseDf(Int32.Parse(dfText));
        entry.IncreaseTfc(Int32.Parse(tfcText));
        loaded[postingIndex].Add(termText, entry);
    }

    _dictionaries = loaded;
}
/// <summary>
/// Merges the terms in <c>tempDic</c> into the existing posting files.
/// For every posting number the current "Posting{n}.txt" is read, and its lines are copied to the writer
/// selected by <c>SwitchWriterForPosting(n, "NewPosting")</c>: lines of terms that already exist get the
/// new document data appended, and terms whose line is <c>Int32.MaxValue</c> are appended as new lines
/// after all old lines. The in-memory <c>dictionaries</c> are updated accordingly, and <c>tempDic</c> is
/// cleared at the end.
/// </summary>
/// <remarks>
/// NOTE(review): the merge assumes <c>tempDic</c> enumerates by posting number and then by ascending line
/// (new terms last) — presumably arranged by <c>InitTerms</c>; confirm.
/// </remarks>
public void UpdatePosting()
{
    InitTerms();
    CreateNewPostingFiles();

    // True until the tail of the current old posting file has been copied ahead of the first new term.
    bool finishedUpdatePrevTerms = true;
    int PostNumber = -1, LineNumberOfTerm, currentLineNumber = 0, indexLine = 0;
    StringBuilder writeData = new StringBuilder();
    string[] lines = null;

    foreach (KeyValuePair<string, DocumentsTerm> pair in tempDic)
    {
        if (StreamShouldBeChanged(pair.Value.postNum, PostNumber))
        {
            if (Writer != null)
            {
                // Old lines after the last updated term must be carried over even when this posting
                // file received no new terms; otherwise they would be missing from the new file.
                if (lines != null)
                {
                    AppendRemainingLines(writeData, lines, ref indexLine);
                }
                Writer.Write(writeData);
                writeData.Clear();
                Writer.Flush();
                Writer.Close();
            }
            SwitchWriterForPosting(pair.Value.postNum, "NewPosting");
        }

        if (StreamHasChanged)
        {
            // Start merging against the old posting file that belongs to the new stream.
            lines = File.ReadAllLines(Path.Combine(m_outPutPath, "Posting" + pair.Value.postNum + ".txt"));
            currentLineNumber = 0;
            indexLine = 0;
            StreamHasChanged = false;
            finishedUpdatePrevTerms = true;
        }

        PostNumber = pair.Value.postNum;
        LineNumberOfTerm = pair.Value.line;

        if (LineNumberOfTerm != Int32.MaxValue) // exist in the posting file
        {
            // Copy untouched lines up to the term's line, then extend that line with the new data.
            while (LineNumberOfTerm > currentLineNumber)
            {
                writeData.AppendLine(lines[indexLine++]);
                currentLineNumber++;
            }
            writeData.AppendLine(lines[indexLine++] + pair.Value.WriteToPostingFileDocDocTerm(true));
            dictionaries[PostNumber][pair.Value.m_valueOfTerm].IncreaseTfc(pair.Value.countTotalFrequency());
            dictionaries[PostNumber][pair.Value.m_valueOfTerm].IncreaseDf(pair.Value.m_Terms.Count);
            currentLineNumber++;
        }
        else
        {
            if (finishedUpdatePrevTerms)
            {
                // First new term for this file: emit the rest of the old lines before appending.
                currentLineNumber += AppendRemainingLines(writeData, lines, ref indexLine);
                finishedUpdatePrevTerms = false;
            }

            // New term: it takes the next free line of its posting file.
            IndexTerm currentTerm = new IndexTerm(pair.Value.m_valueOfTerm, PostNumber, currentLine[PostNumber]);
            currentLine[PostNumber]++;
            currentTerm.IncreaseTfc(pair.Value.countTotalFrequency());
            currentTerm.IncreaseDf(pair.Value.m_Terms.Count);
            dictionaries[PostNumber].Add(pair.Value.m_valueOfTerm, currentTerm);
            writeData.AppendLine(pair.Value.WriteToPostingFileDocDocTerm(false).ToString());
        }
    }

    // Writer is still null when tempDic was empty; only finalize a writer that was actually opened.
    if (Writer != null)
    {
        if (lines != null)
        {
            AppendRemainingLines(writeData, lines, ref indexLine);
        }
        Writer.Write(writeData);
        writeData.Clear();
        Writer.Flush();
        Writer.Close();
        Writer = null;
    }
    tempDic.Clear();
}

/// <summary>
/// Appends every line of <paramref name="source"/> from <paramref name="nextIndex"/> onwards to
/// <paramref name="target"/>, advancing <paramref name="nextIndex"/> to the end of the array.
/// </summary>
/// <returns>The number of lines appended.</returns>
private static int AppendRemainingLines(StringBuilder target, string[] source, ref int nextIndex)
{
    int appended = 0;
    while (nextIndex < source.Length)
    {
        target.AppendLine(source[nextIndex++]);
        appended++;
    }
    return appended;
}