/// <summary>
/// Lazily builds term-to-document-id dictionaries ("blocks") from the given documents.
/// A block is yielded as soon as it holds at least <c>TermsInBlock</c> distinct terms,
/// after which a fresh dictionary is started; whatever remains is yielded at the end.
/// </summary>
/// <param name="documents">Documents whose <c>Text</c> is tokenized into terms.</param>
/// <param name="idTracker">Supplies the integer id recorded for each document via <c>TrackDocumentId</c>.</param>
/// <returns>
/// A deferred sequence of dictionaries mapping each term to the set of ids of the documents
/// in which it was seen while that block was being filled. The same term may appear in more
/// than one block.
/// </returns>
private static IEnumerable<Dictionary<string, HashSet<int>>> GetTermBlocks(IEnumerable<Document> documents, DocumentIdTracker idTracker)
{
    // key: term, value: set of ids of the documents containing that term
    var termDictionary = new Dictionary<string, HashSet<int>>();
    var tokenizer = new WordDocumentTokenizer();

    foreach (var doc in documents)
    {
        int id = idTracker.TrackDocumentId(doc);

        foreach (var term in tokenizer.Tokenize(doc.Text))
        {
            // Single lookup instead of ContainsKey followed by two indexer accesses.
            HashSet<int> documentIds;
            if (!termDictionary.TryGetValue(term, out documentIds))
            {
                documentIds = new HashSet<int>();
                termDictionary[term] = documentIds;
            }

            documentIds.Add(id);

            // The size check runs after every term, so a block can be cut in the middle of a document.
            if (termDictionary.Count >= TermsInBlock)
            {
                yield return termDictionary;
                termDictionary = new Dictionary<string, HashSet<int>>();
            }
        }
    }

    // The trailing block is always yielded; it is empty when no terms were produced
    // or when the last term processed completed a full block.
    yield return termDictionary;
}
/// <summary>
/// Evaluates a boolean query token by token, from left to right.
/// The tokens "and" and "or" select the operator used to combine the next term's documents
/// with the running result, and "not" marks the next term as negated. Every other token is
/// looked up in the index and merged into the running result.
/// </summary>
/// <param name="query">Query text; it is split into tokens by <c>WordDocumentTokenizer</c>.</param>
/// <returns>The accumulated document set, or an empty set when no result set was ever produced.</returns>
public HashSet<IDocument> ProcessQuery(string query)
{
    HashSet<IDocument> result = null;
    var pendingOperator = Operator.AND; // operator applied until the query names another one
    var negateNext = false;

    foreach (var token in new WordDocumentTokenizer().Tokenize(query))
    {
        if (token == "not")
        {
            negateNext = true;
            continue;
        }

        if (token == "and" || token == "or")
        {
            // The keyword is upper-cased and resolved by name against the Operator enum.
            pendingOperator = (Operator) Enum.Parse(typeof (Operator), token.ToUpper());
            // An operator keyword discards a pending "not".
            negateNext = false;
            continue;
        }

        // Ordinary term: fetch its documents (negated if requested) and combine them.
        var matches = _inverceIndex.GetDocumentSet(token, negateNext);
        HashSetHelper.AddToSet(matches, pendingOperator, ref result);
        negateNext = false;
    }

    if (result == null)
    {
        return new HashSet<IDocument>();
    }

    return result;
}
/// <summary>
/// Indexes the documents found under <c>_inputDirectory</c> and then runs an interactive
/// loop: each non-blank line read from standard input is processed as a boolean query and
/// the <c>FilePath</c> of every resulting document is printed, one per line.
/// The method returns once standard input reaches end of input.
/// </summary>
public void Launch()
{
    var documents = _documentProvider.GetDocuments(_inputDirectory);
    var tokenizer = new WordDocumentTokenizer();
    var inverseIndex = new InverceIndex<string, IDocument>(tokenizer);
    inverseIndex.AddDocuments(documents);

    var searcher = new InformationRetrieval.Common.BooleanSearcher(inverseIndex);

    // Console.ReadLine returns null once input is exhausted (closed or redirected stdin).
    // Looping unconditionally would then spin forever, because a null line also counts as blank.
    string input;
    while ((input = Console.ReadLine()) != null)
    {
        if (string.IsNullOrWhiteSpace(input))
        {
            continue; // blank lines are ignored
        }

        var resultDocs = searcher.ProcessQuery(input);
        Console.WriteLine(string.Join("\n", resultDocs.Select(d => d.FilePath)));
    }
}
/// <summary>
/// Entry point: builds a positional index over the documents in the directory given as the
/// first command-line argument, then reads queries from standard input and prints the
/// <c>FilePath</c> of each matching document, one per line, until end of input.
/// </summary>
/// <param name="args">Command-line arguments; <c>args[0]</c> is the input directory.</param>
private static void Main(string[] args)
{
    // Report a missing argument explicitly instead of failing with an index-out-of-range error.
    if (args == null || args.Length == 0)
    {
        Console.Error.WriteLine("Expected the input directory as the first argument.");
        Environment.ExitCode = 1;
        return;
    }

    var inputDirectory = args[0];
    var documents = DocumentProvider.GetDocuments(inputDirectory);
    var tokenizer = new WordDocumentTokenizer();
    var positionalIndex = new PositionalIndex(tokenizer);
    positionalIndex.AddDocuments(documents);

    // Console.ReadLine returns null once input is exhausted (closed or redirected stdin).
    // Looping unconditionally would then spin forever, because a null line also counts as blank.
    string input;
    while ((input = Console.ReadLine()) != null)
    {
        if (string.IsNullOrWhiteSpace(input))
        {
            continue; // blank lines are ignored
        }

        var resultDocs = positionalIndex.FindDocuments(input);
        Console.WriteLine(string.Join("\n", resultDocs.Select(d => d.FilePath)));
    }
}
/// <summary>
/// Entry point: indexes the documents that <c>Fb2ZoneDocumentProvider</c> returns for the
/// directory given as the first command-line argument, then reads lines from standard input
/// until end of input. Only the first token of each line is used: the documents returned for
/// it are printed one per line, together with their score.
/// </summary>
/// <param name="args">Command-line arguments; <c>args[0]</c> is the directory to index.</param>
private static void Main(string[] args)
{
    // Report a missing argument explicitly instead of failing with an index-out-of-range error.
    if (args == null || args.Length == 0)
    {
        Console.Error.WriteLine("Expected the input directory as the first argument.");
        Environment.ExitCode = 1;
        return;
    }

    var directory = args[0];
    var documents = new Fb2ZoneDocumentProvider().GetDocuments(directory);
    var tokenizer = new WordDocumentTokenizer();
    var index = new InverceIndex<string, IDocument>(tokenizer);
    index.AddDocuments(documents);

    // Console.ReadLine returns null once input is exhausted; stop there rather than
    // handing a null string to the tokenizer on every iteration.
    string input;
    while ((input = Console.ReadLine()) != null)
    {
        // Lines that yield no token (e.g. blank lines) are skipped.
        var token = tokenizer.Tokenize(input).FirstOrDefault();
        if (token == null)
        {
            continue;
        }

        var lines = index.GetDocumentsWithScore(token)
                         .Select(t => $"{t.Item1.FilePath}\tScore:{t.Item2}");
        Console.WriteLine(string.Join("\n", lines));
    }
}