/// <summary>
/// Helper for CreateIndex: reads and indexes the given files in parallel.
/// For each file name the method:
///   1. reads the file into a string,
///   2. turns the text into an IRDocument via GetNewDoc,
///   3. adds the document to the index through <c>writer</c>.
/// Files for which GetNewDoc returns null are reported on the console and skipped.
/// </summary>
/// <param name="fileNames">Paths of the files to read and index.</param>
/// <returns>
/// A collection holding every successfully created document. As a side effect,
/// <c>maxResults</c> is set to the number of documents collected.
/// </returns>
private IRCollection ReadAndProcessFiles(List<string> fileNames)
{
    IRCollection collection = new IRCollection();

    // List<T> is not safe for concurrent adds, so documents are gathered in a
    // ConcurrentBag and copied into a list once the parallel loop has finished.
    var conDocs = new ConcurrentBag<IRDocument>();

    Parallel.ForEach(fileNames, fn =>
    {
        string docText = FileHandling.ReadTextFile(fn);
        IRDocument doc = GetNewDoc(docText);

        if (doc != null)
        {
            conDocs.Add(doc);
            // NOTE(review): called from several threads at once; assumes the
            // shared writer tolerates concurrent use - confirm.
            doc.AddToIndex(writer);
        }
        else
        {
            Console.WriteLine("Error with file: " + fn);
        }
    });

    // add documents to collection object and set maxResults
    collection.AddDocs(conDocs.ToList());
    maxResults = conDocs.Count;

    return collection;
}
/// <summary>
/// Builds the index from every file found under <paramref name="collectionPath"/>
/// and records how long the build took in <c>indexTime</c> (in seconds).
/// </summary>
/// <param name="collectionPath">Directory that holds the document collection.</param>
/// <param name="indexPath">Location passed to InitIndex for the index being built.</param>
/// <returns>The number of documents in the resulting collection.</returns>
public int CreateIndex(string collectionPath, string indexPath)
{
    // Stopwatch measures elapsed time independently of wall-clock adjustments.
    Stopwatch timer = Stopwatch.StartNew();

    // get all of the file names in the collection path
    List<string> filenames = FileHandling.GetFileNames(collectionPath, false);

    // initialise the index
    InitIndex(indexPath);

    // build the index (reads and indexes the files in parallel)
    myCollection = ReadAndProcessFiles(filenames);

    // close the index
    CleanUpIndex();

    timer.Stop();

    // TotalSeconds covers the whole duration. The Seconds and Milliseconds
    // components alone wrap at one minute, so longer runs were under-reported.
    indexTime = (float)timer.Elapsed.TotalSeconds;

    return myCollection.Length();
}
/// <summary>
/// Parses a standard information needs file into a dictionary.
/// The file text is split on the ".I" and ".D" delimiters; the resulting
/// fragments are consumed in pairs, the first of each pair becoming the key
/// and the second the value (both trimmed).
/// </summary>
/// <param name="fileName">Path of the information needs file to read.</param>
/// <returns>
/// The parsed key/value pairs, or null when the file yields two or fewer
/// fragments. A trailing fragment without a partner is ignored.
/// </returns>
public static Dictionary<string, string> GetInfoNeeds(string fileName)
{
    // open file and dump into a string
    string document = FileHandling.ReadTextFile(fileName);

    // split string based on ".I" and ".D" delimiters
    string[] delims = { ".I", ".D" };
    string[] docParts = document.Split(delims, StringSplitOptions.RemoveEmptyEntries);

    if (docParts.Length <= 2)
    {
        return null;
    }

    Dictionary<string, string> iNeeds = new Dictionary<string, string>();

    // Step two fragments at a time and require a complete pair, so an odd
    // number of fragments can no longer index past the end of the array.
    for (int i = 0; i + 1 < docParts.Length; i += 2)
    {
        iNeeds.Add(docParts[i].Trim(), docParts[i + 1].Trim());
    }

    return iNeeds;
}
/// <summary>
/// Writes a TREC evaluation file from the current search results.
/// If the target file already exists the user is asked whether to append;
/// if they decline, they are asked to confirm an overwrite. The file is only
/// overwritten when that second prompt is answered with Yes - in every other
/// case the results are appended.
/// </summary>
/// <param name="fileName">Path of the evaluation file to write.</param>
/// <param name="topicID">
/// Topic identifier written at the start of every line ('000' is used by
/// convention when the query is not a standard one).
/// </param>
/// <returns>Always 0.</returns>
public int WriteEvalFile(string fileName, string topicID)
{
    // Appending is the default; only a confirmed overwrite turns it off.
    bool shouldAppend = true;

    if (File.Exists(fileName))
    {
        DialogResult appendAnswer = MessageBox.Show("Do you want to append to the existing file?", "Confirm", MessageBoxButtons.YesNo);

        if (appendAnswer != DialogResult.Yes)
        {
            DialogResult overwriteAnswer = MessageBox.Show("Are you sure you want to overwrite the file?", "Confirm", MessageBoxButtons.YesNo);
            shouldAppend = overwriteAnswer != DialogResult.Yes;
        }
    }

    // this is fixed
    const string groupName = "09648500_NathanOnly";

    // line structure: TopicID Q0 DocID rank score group
    List<string> lines = new List<string>();
    int index = 0;
    while (index < resultsCollection.Length())
    {
        IRDocument hit = resultsCollection.GetIRDocument(index);
        lines.Add(string.Format("{0}\tQ0\t{1}\t{2}\t{3}\t{4}\n", topicID, hit.GetDocID(), hit.Rank, hit.Score, groupName));
        index++;
    }

    // write file
    FileHandling.WriteTextFile(lines, fileName, shouldAppend);

    return 0;
}
/// <summary>
/// Testing aid: runs every query, writes all results to a single TREC
/// evaluation file, moves that file into the results directory and runs
/// trec_eval on it, echoing the tool's output to the console.
/// </summary>
/// <param name="filename">
/// Path of the evaluation file to create. It is overwritten by the first
/// query's results and appended to by every later query.
/// </param>
/// <param name="queries">Queries to run, keyed by topic ID.</param>
/// <param name="preproc">Passed through to RunQuery for every query.</param>
public void AutoResults(string filename, Dictionary<string, string> queries, bool preproc)
{
    const string groupName = "09648500_NathanOnly";
    const string trecpath = "../../../../results/";

    string dontcare = "";       // RunQuery's out value is not needed here
    bool appendFlag = false;    // first write replaces the file, later writes append

    foreach (KeyValuePair<string, string> q in queries)
    {
        // execute query
        string topicID = q.Key;
        RunQuery(q.Value, preproc, out dontcare);

        // get results
        int numResults = BuildResults();

        // line structure: TopicID Q0 DocID rank score group
        List<string> evalList = new List<string>();
        for (int i = 0; i < numResults; i++)
        {
            IRDocument doc = resultsCollection.GetIRDocument(i);
            evalList.Add(topicID + "\tQ0\t"
                         + doc.GetDocID() + "\t"
                         + doc.Rank + "\t"
                         + doc.Score + "\t"
                         + groupName + "\n");
        }

        // write file
        FileHandling.WriteTextFile(evalList, filename, appendFlag);
        appendFlag = true;
    }

    // Move the results file into the results directory, replacing any
    // previous file of the same name.
    string movedResults = trecpath + Path.GetFileName(filename);
    if (File.Exists(movedResults))
    {
        File.Delete(movedResults);
    }
    File.Move(filename, movedResults);

    // Process holds OS handles, so dispose it once the output has been read.
    using (Process p = new Process())
    {
        p.StartInfo.UseShellExecute = false;
        p.StartInfo.RedirectStandardOutput = true;
        p.StartInfo.FileName = trecpath + "trec_eval";
        // Evaluate the file that was just moved, not a fixed file name, so
        // the call also works when filename is not "autoquery_results.txt".
        p.StartInfo.Arguments = "-q " + trecpath + "cranqrel.txt " + movedResults;
        p.Start();

        // Read the redirected output before waiting, so a full output
        // buffer cannot block the child process.
        string output = p.StandardOutput.ReadToEnd();
        p.WaitForExit();

        Console.WriteLine(output);
    }
}