// Computes TF-IDF weights for each term of the query and for the corpus
// documents returned for that term.
//
// query.Item1 is used as the query size (the denominator of the query-side
// term frequency); query.Item2 is the list of query terms.
//
// Results are written to instance state:
//   _queryRank[term]    - TF-IDF weight of the term within the query.
//   _documentRank[term] - map of document id -> TF-IDF weight of the term.
//   _documents          - receives the id of every document seen for a term.
//
// Corpus calls are consumed through .Result, i.e. synchronously.
private void CompTF_IDF(Tuple<int, List<ITermDocument>> query)
{
    // Document lengths already fetched; shared across all terms so each
    // document's length is requested from the corpus at most once.
    IDictionary<String, long> lengthByDocId = new Dictionary<String, long>();

    // Total number of documents in the corpus (numerator of the IDF ratio).
    long corpusSize = _corpus.GetNoDocuments().Result;

    // Size of the query, used to normalise the query-side term frequency.
    var totalQueryTerms = query.Item1;

    foreach (var queryTerm in query.Item2)
    {
        // Document id -> number of recorded positions of the term in that document.
        IDictionary<String, int> occurrencesByDocId = new Dictionary<String, int>();

        foreach (var posting in _corpus.GetDocuments(queryTerm.Term).Result)
        {
            // Only the first entry for a given document id is counted.
            if (!occurrencesByDocId.ContainsKey(posting.DocID))
            {
                occurrencesByDocId.Add(posting.DocID, posting.Pos.Count);
                _documents.Add(posting.DocID);
            }
        }

        // A term that matches no document gets no rank entries at all.
        if (occurrencesByDocId.Count == 0)
        {
            continue;
        }

        double idf = 1 + Math.Log(1.0 * corpusSize / occurrencesByDocId.Count);
        var weightByDocId = new Dictionary<String, double>();

        // Query-side weight: positions of the term in its first Docs entry,
        // normalised by the query size, scaled by the IDF.
        _queryRank[queryTerm.Term] =
            (1.0 * queryTerm.Docs.First().Pos.Count / totalQueryTerms) * idf;

        foreach (var entry in occurrencesByDocId)
        {
            long docLength;
            if (!lengthByDocId.TryGetValue(entry.Key, out docLength))
            {
                docLength = _corpus.GetDocumentLength(entry.Key).Result;
                lengthByDocId[entry.Key] = docLength;
            }

            // Zero-length documents are left out to avoid dividing by zero.
            if (docLength != 0)
            {
                weightByDocId[entry.Key] = (1.0 * entry.Value / docLength) * idf;
            }
        }

        _documentRank[queryTerm.Term] = weightByDocId;
    }
}
// Brings the corpus in line with the repository directory and then starts
// watching that directory for changes.
//
// Steps:
//   1. Create the indexer and read the repository root from the corpus.
//   2. Crawl the repository; for every valid file, add it when the corpus has
//      no id for it, or re-index it when its last-write time is newer than the
//      corpus' last-crawled value.
//   3. Remove from the corpus every known document whose path is no longer
//      among the crawled files.
//   4. Install a FileSystemWatcher on the repository root that forwards
//      changed / created / deleted / renamed events to the matching handler.
//
// The repository root is read once and reused for the whole run.
//
// NOTE(review): the watcher callbacks are async lambdas bound to void-returning
// event delegates, so an exception thrown inside editFile/addFile/removeFile/
// renameFile cannot be observed by the caller of init - confirm this is acceptable.
public async Task init()
{
    Console.WriteLine("Initializing...");
    _indexer = new Indexer(_corpus);

    var repo = await _corpus.GetRepository();
    Console.WriteLine(repo);

    var files = crawl(repo).ToList(); // All files in the repository
    var repoPrefix = repo + "\\";

    foreach (var file in files)
    {
        Console.WriteLine(file);
        if (!isValidFile(file))
        {
            continue;
        }

        // Diagnostic traces kept so the console output is unchanged.
        Console.WriteLine(repo);
        // Path relative to the repository root, as stored in the corpus.
        var sfile = file.Replace(repoPrefix, "");
        Console.WriteLine(repo);

        // For each file in the repository, try to get its id from the corpus.
        Console.WriteLine(sfile);
        string id = await _corpus.GetDocumentID(sfile);
        if (id == null)
        {
            // File is not in the corpus: index it and add it.
            Console.WriteLine("deji2");
            await addFile(file);
        }
        else if (File.GetLastWriteTime(file).Ticks > await _corpus.GetLastCrawled())
        {
            // File has been edited since it was last crawled: re-index it.
            // GetLastCrawled is intentionally queried per file, since editFile
            // may update it.
            await editFile(file);
        }
    }

    // Hash-based lookup of the crawled paths; replaces a linear List.Contains
    // per known document (same default string equality).
    var crawledPaths = new HashSet<string>(files);

    var kFiles = await _corpus.GetDocuments(); // Documents known to the corpus
    foreach (var id in kFiles)
    {
        var file = await _corpus.GetDocumentPath(id);
        if (!crawledPaths.Contains(Path.Combine(repo, file)))
        {
            // File no longer exists in the repository: delete it from the corpus.
            await removeFile(file);
        }
    }

    //------------Watching for changes--------------------
    watcher = new FileSystemWatcher(repo);
    Console.WriteLine("watch");
    watcher.NotifyFilter = NotifyFilters.LastWrite
                           | NotifyFilters.FileName
                           | NotifyFilters.DirectoryName
                           | NotifyFilters.Attributes;

    watcher.Changed += async (source, e) =>
    {
        Console.WriteLine("changed");
        if (!isValidFile(e.FullPath))
        {
            return;
        }
        await editFile(e.FullPath);
    };

    watcher.Created += async (source, e) =>
    {
        Console.WriteLine("created");
        if (!isValidFile(e.FullPath))
        {
            return;
        }
        await addFile(e.FullPath);
    };

    watcher.Deleted += async (source, e) =>
    {
        // No validity check here: the handler runs for every deleted path.
        Console.WriteLine("remove");
        await removeFile(e.FullPath);
    };

    watcher.Renamed += async (source, e) =>
    {
        if (!isValidFile(e.FullPath))
        {
            return;
        }
        // NOTE(review): only the new path is forwarded; confirm renameFile
        // does not also need e.OldFullPath.
        await renameFile(e.FullPath);
    };

    watcher.EnableRaisingEvents = true;
    Console.WriteLine("watcher: " + watcher.Path);
}