/// <summary> /// Return a document for a query with the given id. /// </summary> static Document getQueryById(String sid, DocumentCollection queryCollection) { int queryId = int.Parse(sid); var doc = queryCollection.documentById(queryId); if (doc == null) { Console.WriteLine("Error: Query with ID {0} doesn't exist!", queryId); return null; } return doc; }
/// <summary> /// Update the IR system by building a search index using the /// given document collection. /// </summary> public void update(DocumentCollection documents) { this.documents = documents; foreach (var doc in documents.getDocuments()) { // Create a local (document specific) term collection. Dictionary<String, int> documentTerms = new Dictionary<String, int>(); foreach (var rawWord in doc.getTerms()) { var word = stemmer.stem(rawWord); if (stopWords.isStopWord(word)) { continue; } int frequency; if (documentTerms.TryGetValue(word, out frequency)) { documentTerms[word] = frequency + 1; } else { documentTerms[word] = 1; } } // Merge the local term collection with the global term document matrix. foreach (var termOccurence in documentTerms) { var term = terms.getTerm(termOccurence.Key); term.addOccurenceForDocument(termOccurence.Value, doc.id); } } }
static void Main(string[] args) { Console.WriteLine("Welcome to ir-project command line terminal interface."); Console.WriteLine("----"); String input; var loadDocuments = new Command("load documents", "filepath"); var loadQueries = new Command("load queries", "filepath"); var loadRelevance = new Command("load relevance", "filepath"); var runNewQuery = new Command("run text query", "text"); var runQuery = new Command("run query", "query id"); var showTerm = new Command("show term", "term"); var showDocument = new Command("show document", "document id"); var showQuery = new Command("show query", "query id"); var setScheme = new Command("set scheme", "bm25|pivoted"); Command[] commands = { loadDocuments, loadQueries, loadRelevance, runNewQuery, runQuery, showTerm, showDocument, showQuery, setScheme }; var documentCollection = new DocumentCollection(); var queryCollection = new DocumentCollection(); List<RelevanceJudgement> relevance = null; var engine = new InformationRetriever(); // Weighting schemes. var bm25Scheme = new BM25Scheme(/*k1=*/2.0f, /*b=*/0.75f); var pivotedScheme = new PivotedNormalisationScheme(/*s=*/0.5f); WeightingScheme scheme = bm25Scheme; do { Console.Write("> "); input = Console.ReadLine(); if (input == null) { input = ""; } if (input == "exit" || input == "quit") { break; } if (input == "help") { Console.WriteLine("Terminal UI for the ir-project IR system."); Console.WriteLine("Available commands:"); Console.WriteLine(" exit"); Console.WriteLine(" quit"); Console.WriteLine(" help"); foreach (var command in commands) { Console.WriteLine(" {0}: <{1}>", command.command, command.argumentName); } continue; } String matchedCommand = ""; String matchedArgument = null; foreach (var command in commands) { var match = command.matches(input); if (match.Item1) { matchedCommand = command.command; matchedArgument = match.Item2.Trim(); break; } } switch (matchedCommand) { case "load documents": { var data = readFile(matchedArgument); if (data != null) { Console.WriteLine("Loading documents... "); var documents = DataImporter.parseDocuments(data); Console.WriteLine("Loaded {0} documents!", documents.Count); Console.Write("Constructing the term-document indexing datastructures... "); Stopwatch sw = new Stopwatch(); sw.Start(); documentCollection = new DocumentCollection(documents); engine.update(documentCollection); sw.Stop(); Console.WriteLine("Done ({0} ms)!", sw.ElapsedMilliseconds); } break; } case "load queries": { var data = readFile(matchedArgument); if (data != null) { Console.WriteLine("Loading queries"); var queries = DataImporter.parseDocuments(data); queryCollection = new DocumentCollection(queries); Console.WriteLine("Loaded {0} queries!", queries.Count); } break; } case "load relevance": { var data = readFile(matchedArgument); if (data != null) { Console.Write("Loading relevance judgements... "); relevance = DataImporter.parseRelevance(data); Console.WriteLine("Loaded {0} relevance judgements!", relevance.Count); } break; } case "run text query": { var query = engine.createQuery(new Document(matchedArgument, 0)); var results = engine.executeQuery(query, scheme); Console.WriteLine("Found {0} results:", results.Count); foreach (var result in results) { Console.WriteLine(" document id: {0}, similarity score: {1}", result.documentId, result.similarity); } break; } case "run query": { var doc = getQueryById(matchedArgument, queryCollection); if (doc == null) { break; } Console.WriteLine("Query with id {0}, text: '{1}'", doc.id, doc.value); Stopwatch sw = new Stopwatch(); sw.Start(); var query = engine.createQuery(doc); var results = engine.executeQuery(query, scheme); sw.Stop(); Console.WriteLine("Found {0} results in {1} ms:", results.Count, sw.ElapsedMilliseconds); foreach (var result in results) { Console.WriteLine(" document id: {0}, similarity score: {1}", result.documentId, result.similarity); } // Compute precision/recall if possible. if (relevance != null) { var relevantDocuments = relevance.Where(x => x.queryId == doc.id).Select(x => x.documentId).ToArray(); var relevantSet = new HashSet<int>(relevantDocuments); List<double> precision = new List<double>(); List<double> recall = new List<double>(); //add initial precision and recall values precision.Add(100); recall.Add(0); double noOfRelevantDocs = relevantDocuments.Count(); double relevantDocCount = 1; double resultCount = 1; foreach (var result in results) { if (relevantSet.Contains(result.documentId)) { recall.Add((relevantDocCount / noOfRelevantDocs)*100); precision.Add((relevantDocCount / resultCount)*100); relevantDocCount++; } resultCount++; } Console.WriteLine("Precision | Recall (graph is shown in the browser):"); for(int i = 0; i < precision.Count; i++) { Console.WriteLine(" {0} | {1}", precision[i], recall[i]); } PrecisionRecallGraph.show(doc.id, precision, recall); } break; } case "show term": { var term = engine.terms.findTerm(matchedArgument); if (term == null) { Console.WriteLine("Error: Invalid term '{0}'", matchedArgument); break; } Console.WriteLine("Term '{0}', global frequency: {1}, occurences:", matchedArgument, term.getGlobalFrequency()); foreach (var occurence in term.getOccurences()) { Console.WriteLine(" document id: {0}, frequency: {1}", occurence.documentId, occurence.frequency); } break; } case "show document": { int docId = int.Parse(matchedArgument); var doc = documentCollection.documentById(docId); if (doc == null) { Console.WriteLine("Error: Document with ID {0} doesn't exist!", docId); break; } Console.WriteLine("Document with id '{0}', terms:", doc.id); foreach (var term in engine.terms.termList) { var occurences = term.getOccurences().Where(x => x.documentId == doc.id); // Usually this just one occurence. foreach (var occurence in occurences) { Console.WriteLine(" term '{0}', frequency: {1}", term.value, occurence.frequency); } } break; } case "show query": { var doc = getQueryById(matchedArgument, queryCollection); if (doc == null) { break; } Console.WriteLine("Query with id {0}, text: '{1}', terms:", doc.id, doc.value); var query = engine.createQuery(doc); foreach (var term in query.terms) { Console.WriteLine(" term '{0}', frequency: {1}", term.term.value, term.frequency); } break; } case "set scheme": { if (matchedArgument == "bm25") { scheme = bm25Scheme; Console.WriteLine("Using BM25."); } else if (matchedArgument == "pivoted") { scheme = pivotedScheme; Console.WriteLine("Using Pivoted Normalisation."); } else { Console.WriteLine(" unknown weighting scheme '{0}'", matchedArgument); } break; } default: Console.WriteLine("Error: Unknown command '{0}'!", input); break; } } while (input != "exit" && input != "quit"); Console.WriteLine("---"); }