コード例 #1
0
        /// <summary>
        /// Computes TF-IDF weights for every term of the query and for the
        /// documents the corpus returns for each of those terms.
        /// </summary>
        /// <remarks>
        /// For each term that has at least one matching document this writes the
        /// query weight to <c>_queryRank[term]</c>, a per-document weight map to
        /// <c>_documentRank[term]</c>, and adds each matching document id to
        /// <c>_documents</c>. Terms with no matching documents are skipped and
        /// leave no entry in either rank table. Corpus calls are resolved with
        /// <c>.Result</c>, so this method blocks the calling thread.
        /// </remarks>
        /// <param name="query">
        /// <c>Item1</c> is used as the query size (the denominator of the query
        /// term frequency); <c>Item2</c> is the list of query terms.
        /// </param>
        private void CompTF_IDF(Tuple<int, List<ITermDocument>> query)
        {
            // Document lengths are cached across terms so each document's length
            // is requested from the corpus at most once per call.
            var lengthCache = new Dictionary<string, long>();

            // Number of documents in the corpus (numerator of the IDF ratio).
            long noDocs = _corpus.GetNoDocuments().Result;

            // Size of the query, used to normalise the query-side term frequency.
            int querySize = query.Item1;

            foreach (var term in query.Item2)
            {
                // Occurrence count per document id for this term. Only the first
                // entry seen for a given document id is kept.
                var occurrences = new Dictionary<string, int>();
                foreach (var doc in _corpus.GetDocuments(term.Term).Result)
                {
                    if (occurrences.ContainsKey(doc.DocID))
                    {
                        continue;
                    }
                    occurrences.Add(doc.DocID, doc.Pos.Count);
                    this._documents.Add(doc.DocID);
                }

                // A term that matches nothing contributes no rank at all.
                if (occurrences.Count < 1)
                {
                    continue;
                }

                // Smoothed inverse document frequency: 1 + ln(N / df).
                double idf = 1 + Math.Log(1.0 * noDocs / occurrences.Count);

                // Query weight: positions recorded on the term's first Docs entry
                // (presumably its occurrences inside the query itself - confirm
                // against the caller), normalised by the query size.
                this._queryRank[term.Term] = (1.0 * term.Docs.First().Pos.Count / querySize) * idf;

                var weights = new Dictionary<string, double>();
                foreach (var item in occurrences)
                {
                    long length;
                    if (!lengthCache.TryGetValue(item.Key, out length))
                    {
                        length = _corpus.GetDocumentLength(item.Key).Result;
                        lengthCache[item.Key] = length;
                    }

                    // A zero-length document would divide by zero; it gets no weight.
                    if (length == 0)
                    {
                        continue;
                    }

                    double tf = 1.0 * item.Value / length;
                    weights[item.Key] = tf * idf;
                }
                _documentRank[term.Term] = weights;
            }
        }
コード例 #2
0
ファイル: Crawler.cs プロジェクト: Kooldeji/WebSpy
        /// <summary>
        /// Brings the corpus in line with the repository on disk and then starts
        /// a <see cref="FileSystemWatcher"/> on the repository.
        /// </summary>
        /// <remarks>
        /// Three phases:
        /// 1. Every crawled file that passes <c>isValidFile</c> is added when the
        ///    corpus has no id for it, or re-indexed when its last write time is
        ///    newer than the corpus' last-crawled value.
        /// 2. Every document the corpus knows about whose path is not among the
        ///    crawled files is removed.
        /// 3. A watcher is attached whose Changed/Created/Deleted/Renamed events
        ///    call <c>editFile</c>/<c>addFile</c>/<c>removeFile</c>/<c>renameFile</c>.
        /// The repository root is read once at the start and reused, on the
        /// assumption that it does not change while this method runs.
        /// </remarks>
        /// <returns>A task that completes once the watcher has been enabled.</returns>
        public async Task init()
        {
            Console.WriteLine("Initializing...");
            _indexer = new Indexer(_corpus);

            // Fetched once; every later use in this method reuses this value.
            var repo = await _corpus.GetRepository();

            Console.WriteLine(repo);
            var files = crawl(repo).ToList();//All files in my repository

            foreach (var file in files)
            {
                Console.WriteLine(file);
                if (!isValidFile(file))
                {
                    continue;
                }
                Console.WriteLine(repo);
                // Path as stored in the corpus: the repository prefix (with a
                // Windows separator) is removed wherever it occurs in the path.
                var sfile = file.Replace(repo + "\\", "");
                Console.WriteLine(repo);
                //For each file in the repository, try to get their id from the corpus
                Console.WriteLine(sfile);
                string id = await _corpus.GetDocumentID(sfile);

                if (id == null)
                {
                    //File is not in the Corpus
                    //Index it and add it to the corpus.
                    Console.WriteLine("deji2");
                    await addFile(file);
                }
                else if (File.GetLastWriteTime(file).Ticks > await _corpus.GetLastCrawled())
                {
                    //File exists in the corpus and has been edited since last
                    //checked: re-index it and update it into the corpus.
                    // NOTE(review): the last-crawled value is deliberately re-read
                    // per file because addFile/editFile may update it - confirm.
                    await editFile(file);
                }
                //Console.WriteLine("done");
            }

            var kFiles = await _corpus.GetDocuments(); //Files that exist in my database

            // Set lookup keeps the stale-document sweep linear instead of
            // scanning the whole file list for every known document.
            var filesOnDisk = new System.Collections.Generic.HashSet<string>(files);

            foreach (var id in kFiles)
            {
                var file = await _corpus.GetDocumentPath(id);

                if (!filesOnDisk.Contains(Path.Combine(repo, file)))
                {
                    //File does not exist again in repo, delete it from database
                    await removeFile(file);
                }
            }

            //------------Watching for changes--------------------
            // Each handler below is an async void callback, so an exception that
            // escapes it cannot be observed by any caller; every handler therefore
            // catches and logs its own failures.
            watcher = new FileSystemWatcher(repo);
            Console.WriteLine("watch");
            watcher.NotifyFilter = NotifyFilters.LastWrite | NotifyFilters.FileName | NotifyFilters.DirectoryName | NotifyFilters.Attributes;
            watcher.Changed += new FileSystemEventHandler(async (source, e) =>
            {
                try
                {
                    Console.WriteLine("changed");
                    if (!isValidFile(e.FullPath))
                    {
                        return;
                    }
                    await editFile(e.FullPath);
                }
                catch (Exception ex)
                {
                    Console.Error.WriteLine("Failed to handle change of " + e.FullPath + ": " + ex);
                }
            });
            watcher.Created += new FileSystemEventHandler(async (source, e) =>
            {
                try
                {
                    Console.WriteLine("created");
                    if (!isValidFile(e.FullPath))
                    {
                        return;
                    }
                    await addFile(e.FullPath);
                }
                catch (Exception ex)
                {
                    Console.Error.WriteLine("Failed to handle creation of " + e.FullPath + ": " + ex);
                }
            });
            watcher.Deleted += new FileSystemEventHandler(async (source, e) =>
            {
                try
                {
                    Console.WriteLine("remove");
                    await removeFile(e.FullPath);
                }
                catch (Exception ex)
                {
                    Console.Error.WriteLine("Failed to handle deletion of " + e.FullPath + ": " + ex);
                }
            });
            watcher.Renamed += new RenamedEventHandler(async (source, e) =>
            {
                try
                {
                    if (!isValidFile(e.FullPath))
                    {
                        return;
                    }
                    await renameFile(e.FullPath);
                }
                catch (Exception ex)
                {
                    Console.Error.WriteLine("Failed to handle rename to " + e.FullPath + ": " + ex);
                }
            });

            watcher.EnableRaisingEvents = true;
            Console.WriteLine("watcher: " + watcher.Path);
        }