예제 #1
0
        /// <summary>
        /// Builds an index over <c>this.Documents</c> and checks that the document
        /// frequency reported for the term "whatever" is 2.
        /// </summary>
        public void GettingDocFrequency_ProvidingATerm_ShouldReturnCorrectResult5()
        {
            var index = new InvertedIndex(this.Documents);

            var documentFrequency = index.GetDocumentFrequency("whatever");

            Assert.AreEqual(2, documentFrequency);
        }
예제 #2
0
        /// <summary>
        /// Adds three postings under one field (one for the first value, two for the
        /// second), marks the middle document as deleted and checks that the deleted
        /// document no longer appears in the postings of either value.
        /// </summary>
        public void ShouldMarkAsDeleted()
        {
            var sut = new InvertedIndex();

            var field       = Utils.RandomString();
            var firstValue  = Utils.RandomString();
            var secondValue = Utils.RandomString();
            var keptDocA    = Utils.RandomInteger();
            var deletedDoc  = Utils.RandomInteger();
            var keptDocB    = Utils.RandomInteger();

            sut.AddPosting(field, firstValue, keptDocA);
            sut.AddPosting(field, secondValue, deletedDoc);
            sut.AddPosting(field, secondValue, keptDocB);
            sut.MarkAsDeleted(deletedDoc);

            // The first value was only ever posted for keptDocA.
            var firstPostings = sut.GetPostings(field, firstValue).ToList();

            Assert.AreEqual(1, firstPostings.Count);
            Assert.AreEqual(keptDocA, firstPostings[0]);

            // The second value was posted for deletedDoc and keptDocB; only keptDocB is expected back.
            var secondPostings = sut.GetPostings(field, secondValue).ToList();

            Assert.AreEqual(1, secondPostings.Count);
            Assert.AreEqual(keptDocB, secondPostings[0]);
        }
예제 #3
0
        /// <summary>
        /// Builds an index over <c>this.Documents</c> and checks that the document
        /// frequency reported for the term "navigator" is 0.
        /// </summary>
        public void GettingDocFrequency_ProvidingUnseenTerm_ShouldReturnZero()
        {
            var index = new InvertedIndex(this.Documents);

            var documentFrequency = index.GetDocumentFrequency("navigator");

            Assert.AreEqual(0, documentFrequency);
        }
예제 #4
0
        /// <summary>
        /// Handles the index command: derives a path from <paramref name="tokens"/>,
        /// builds an inverted index for it, stores the index in the in-memory cache,
        /// serializes it to "{pathHash}.txt" and reports progress on the console.
        /// </summary>
        /// <param name="tokens">
        /// Command tokens. Element 0 is skipped; the remaining elements are joined with
        /// single spaces and stripped of single and double quotes to form the path.
        /// </param>
        /// <exception cref="ArgumentException">Thrown when fewer than two tokens are supplied.</exception>
        private void handleIndexCommand(string[] tokens)
        {
            // Timing starts before argument validation.
            var watch = System.Diagnostics.Stopwatch.StartNew();

            if (tokens.Length < 2)
            {
                throw new ArgumentException("No path provided for indexing.");
            }

            // Re-join everything after the command word so paths containing spaces survive tokenization.
            var path = String.Join(" ", tokens, 1, tokens.Length - 1)
                       .Replace("'", String.Empty).Replace("\"", String.Empty);
            var pathHash = HashUtilities.CreateMD5Hash(path);

            Console.WriteLine($"pathHash = {pathHash}");
            Console.WriteLine($"Indexing path '{path}' started at {DateTime.Now}");

            var invertedIndex = new InvertedIndex(path, pathHash);
            var filepaths     = this.fileUtilities.GetIndexableFilePaths(path, this.configuration.AllowedExtensions);

            // The build result was never read, so it is no longer kept in a local.
            invertedIndex.BuildIndex(filepaths, this.fileUtilities, stopWords: this.configuration.StopWords);

            this.invertedIndexCache.set(invertedIndex.MD5, invertedIndex);  // save to memory, force overwrite
            this.lastIndexedPath = invertedIndex.Path;

            var savePath = $"{pathHash}.txt";

            this.fileUtilities.SerializeInvertedIndex(savePath, invertedIndex);
            Console.WriteLine($"Index written to '{savePath}'");

            watch.Stop();
            var elapsedMs = watch.ElapsedMilliseconds;

            Console.WriteLine($"Indexed Path '{path}' with {filepaths.Count} files in {elapsedMs} ms");
        }
예제 #5
0
        /// <summary>
        /// Builds an index over <c>this.Documents</c> and checks that looking up the
        /// term "bear" yields no document ids.
        /// </summary>
        public void GettingDocuments_ProvidingUnseenTerm_ShouldReturnEmptyCollection()
        {
            var index = new InvertedIndex(this.Documents);

            var matchingIds = index.GetDocumentsContainingTerm("bear")
                                   .Select(document => document.Id)
                                   .ToList();

            CollectionAssert.AreEquivalent(new string[0], matchingIds);
        }
예제 #6
0
        /// <summary>
        /// Adds 1000 postings from a parallel loop, each with a unique document id and a
        /// randomly chosen field/value pair, then checks that the postings retrievable
        /// across every field/value combination add up to 1000.
        /// </summary>
        public void ShouldAddPostingsInParallel()
        {
            const int count = 1000;

            // Incremented atomically inside the loop, so ids run from 0 to count - 1.
            long lastDocId = -1;

            var fields      = Enumerable.Repeat(0, count).Select(unused => Utils.RandomString()).ToList();
            var fieldValues = Enumerable.Repeat(0, count).Select(unused => Utils.RandomString()).ToList();
            var sut         = new InvertedIndex(cleanUpPeriod: 1);

            Parallel.For(0, count, iteration =>
            {
                var pickedField = Utils.RandomElement(fields);
                var pickedValue = Utils.RandomElement(fieldValues);
                sut.AddPosting(pickedField, pickedValue, Interlocked.Increment(ref lastDocId));
            });

            // Visit every (field, value) pair in the same order as nested loops would.
            var totalPostings = fields.Sum(
                field => fieldValues.Sum(value => sut.GetPostings(field, value).Count()));

            Assert.AreEqual(count, totalPostings);
        }
예제 #7
0
 /// <summary>
 /// Replaces this instance's index state with the values carried by
 /// <paramref name="searchExport"/>.
 /// </summary>
 /// <param name="searchExport">Export whose inverted index, per-document term counts and totals are copied over.</param>
 public void Import(SearchExport searchExport)
 {
     // Index structures first, then the aggregate counters.
     _invertedIndex = searchExport.InvertedIndex;
     _documentNumberOfTerms = searchExport.DocumentNumberOfTerms;

     NumberOfDocuments = searchExport.NumberOfDocuments;
     NumberOfTerms = searchExport.NumberOfTerms;
 }
        /// <summary>
        /// Console entry point: asks for a stemmer name, builds the inverted index,
        /// hands it to the document for XML output, asks for a message, runs it through
        /// <c>And</c> and finally runs latent semantic indexing.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        private static void Main(string[] args)
        {
            // The interactive URL prompt is disabled; _url is used as-is.
            // NOTE(review): _url is presumably initialized elsewhere - confirm.
            var document = new DocumentData(_url);

            // XML document creation (document.CreateXMLDoc) is likewise disabled.

            Console.WriteLine("Choose stemmer: porter or mystem");
            _stemmer = Console.ReadLine();

            var index = new InvertedIndex();
            index.CreateInvertedIndex(_stemmer);
            document.XMLForInvertedIndex(index.GetInvertedIndex());

            Console.WriteLine("enter messege: ");
            _mess = Console.ReadLine();
            index.And(_mess);

            var lsi = new LSIClass(index.FillMatrix(), index.q, _mess, index.n);
            lsi.LatentSemanticIndexing();
        }
        /// <summary>
        /// Adds a single document to a fresh index. The only assertion is a constant
        /// <c>true</c>, so the test fails only if <c>AddDocument</c> throws.
        /// </summary>
        public void AddDocumentTest()
        {
            var index    = new InvertedIndex();
            var document = new Document("doc");

            index.AddDocument(document, "this is content in doc");

            Assert.True(true);
        }
        /// <summary>
        /// Creates an <c>InvertedIndex</c> on top of an <c>ElasticIndex</c> built from
        /// <c>elasticServerUrl</c>, <c>elasticServerPort</c> and <c>indexName</c>.
        /// </summary>
        /// <returns>The newly constructed inverted index.</returns>
        private static InvertedIndex InitializeInvertedIndex()
        {
            var backingIndex = new ElasticIndex(elasticServerUrl, elasticServerPort, indexName);

            return new InvertedIndex(backingIndex);
        }
예제 #11
0
 /// <summary>
 /// Creates a fresh index and fills it with six test documents of four words each.
 /// </summary>
 public void Init()
 {
     // Note: filling test data into the inverted index is still in progress.
     index = new InvertedIndex();

     index.Add(doc1, new List<string> { "word1", "word2", "word3", "word4" });
     index.Add(doc2, new List<string> { "word2", "word4", "word6", "word8" });
     index.Add(doc3, new List<string> { "word3", "word6", "word9", "word12" });
     index.Add(doc4, new List<string> { "word4", "word8", "word12", "word16" });
     index.Add(doc5, new List<string> { "word5", "word10", "word15", "word20" });
     index.Add(doc6, new List<string> { "word6", "word12", "word18", "word24" });
 }
예제 #12
0
        /// <summary>
        /// Executes every sub-query against <paramref name="index"/> and returns the ids
        /// that are present in all of their results.
        /// </summary>
        /// <param name="index">The inverted index each sub-query is executed against.</param>
        /// <returns>
        /// The ids common to all sub-query results. Empty when any sub-query yields no
        /// results or when there are no sub-queries at all.
        /// </returns>
        public override IEnumerable <int> Execute(InvertedIndex index)
        {
            var postings = new List<List<int>>();

            foreach (var query in _queries)
            {
                var matches = query.Execute(index).ToList();

                // One empty operand makes the whole intersection empty; stop early.
                if (matches.Count == 0)
                {
                    return Enumerable.Empty<int>();
                }

                postings.Add(matches);
            }

            // With no sub-queries there is nothing to intersect; avoid indexing into an empty list.
            if (postings.Count == 0)
            {
                return Enumerable.Empty<int>();
            }

            // Start from the shortest list so intermediate results stay as small as possible.
            postings.Sort((l1, l2) => l1.Count.CompareTo(l2.Count));

            var results = postings[0];

            for (var i = 1; i < postings.Count; i++)
            {
                // Intersect with the i-th list; the previous code always used postings[1],
                // so any list beyond the second was ignored.
                results = results.Intersect(postings[i]).ToList();
            }

            return results;
        }
예제 #13
0
        /// <summary>
        /// Builds an index over <c>this.Documents</c> and checks that the documents
        /// containing the term "you" are exactly those with ids "2", "5" and "6"
        /// (order is not significant).
        /// </summary>
        public void GettingDocuments_ContainingATerm_ShouldReturnCorrectResult5()
        {
            var index       = new InvertedIndex(this.Documents);
            var expectedIds = new string[] { "2", "5", "6" };

            var matchingIds = index.GetDocumentsContainingTerm("you")
                                   .Select(document => document.Id)
                                   .ToList();

            CollectionAssert.AreEquivalent(expectedIds, matchingIds);
        }
예제 #14
0
        /// <summary>
        /// Builds an index over <c>this.Documents</c> and checks that the inverse
        /// document frequency of "you", rounded to two decimals, equals
        /// <c>Math.Log(6 / 3)</c> rounded the same way.
        /// </summary>
        public void GettingTheInverseDocFrequency_ProvidingATerm_ShouldReturnCorrectResult5()
        {
            var index = new InvertedIndex(this.Documents);

            // 6 / 3 is integer division of two int literals and evaluates to 2.
            var expectedIdf = Math.Round(Math.Log(6 / 3), 2);

            var rawIdf     = index.GetInverseDocumentFrequency("you");
            var roundedIdf = Math.Round(rawIdf, 2);

            Assert.AreEqual(expectedIdf, roundedIdf);
        }
예제 #15
0
        /// <summary>
        /// Builds an index over <c>this.Documents</c> and checks that the inverse
        /// document frequency reported for the term "tree" is 0.
        /// </summary>
        public void GettingTheInverseDocFrequency_ProvidingUnseenTerm_ShouldReturnZero()
        {
            var index       = new InvertedIndex(this.Documents);
            var expectedIdf = 0;

            var idf = index.GetInverseDocumentFrequency("tree");

            Assert.AreEqual(expectedIdf, idf);
        }
예제 #16
0
        /// <summary>
        /// Builds an index from the sample string list and checks that the resulting
        /// <c>Index</c> equals the sample index produced by <c>SampleCreator</c>.
        /// </summary>
        public void CreateIndexTest()
        {
            var invertedIndex = new InvertedIndex();
            var docs          = SampleCreator.CreateStringList();

            invertedIndex.CreateIndex(docs);

            // Assert.Equal takes (expected, actual); the reference index is the expected value,
            // so failure messages label the two sides correctly.
            Assert.Equal(SampleCreator.CreateIndex(), invertedIndex.Index);
        }
예제 #17
0
        /// <summary>
        /// Creates a fresh index and adds one entry named "test1" whose content is the
        /// lines of <c>_data1</c> joined with CRLF.
        /// </summary>
        public void Initialize()
        {
            _index = new InvertedIndex();

            var content   = string.Join("\r\n", _data1);
            var byteCount = Encoding.Default.GetByteCount(content);

            Add(
                "test1",
                string.Empty,
                byteCount,
                _timestamp,
                TokenizeString("test1", string.Empty, content));
        }
예제 #18
0
        /// <summary>
        /// Executes every sub-query against <paramref name="index"/> and returns the
        /// set union of their results.
        /// </summary>
        /// <param name="index">The inverted index each sub-query is executed against.</param>
        /// <returns>The distinct ids produced by any sub-query; empty when there are no sub-queries.</returns>
        public override IEnumerable <int> Execute(InvertedIndex index)
        {
            // Fold the sub-query results together, starting from the empty sequence.
            return(_queries.Aggregate(
                       seed: Enumerable.Empty <int>(),
                       func: (current, query) => current.Union(query.Execute(index))
                       ));
        }
        /// <summary>
        /// Constructs an index from a one-entry document-to-content dictionary. The only
        /// assertion is a constant <c>true</c>, so the test fails only if the
        /// constructor throws.
        /// </summary>
        public void ConstructorDictionaryTest()
        {
            var contentByDocument = new Dictionary<Document, string>();
            contentByDocument.Add(new Document("doc"), "this is content of the file");

            _ = new InvertedIndex(contentByDocument);

            Assert.True(true);
        }
        /// <summary>
        /// Indexes <c>sampleData</c> and checks that searching for the token "Mohammad"
        /// yields an empty document set.
        /// </summary>
        public void QueryEmptyResultTest()
        {
            var index = new InvertedIndex();
            index.AddDocuments(sampleData);

            var emptySet = new DocumentSet(new HashSet<Document>());

            var found = index.SearchTokenInDocuments(new Token("Mohammad"));

            Assert.Equal(emptySet, found);
        }
        /// <summary>
        /// Feeds one sentence into an index backed by <c>mapOfTheWords</c> and checks
        /// the resulting entry counts for "sky", "sunset", "tree" and "flower".
        /// </summary>
        public void UpdateTheMapTest()
        {
            string        str           = "The sky , at sunset , looked like a carnivorous flower .";
            InvertedIndex invertedIndex = new InvertedIndex(mapOfTheWords);

            invertedIndex.UpdateTheMap(str, 1);

            // Assert.Equal takes (expected, actual); the literal goes first so failure
            // messages label the two sides correctly.
            Assert.Equal(2, mapOfTheWords["sky"].Count);
            Assert.Equal(2, mapOfTheWords["sunset"].Count);
            Assert.Equal(1, mapOfTheWords["tree"].Count);
            Assert.Equal(1, mapOfTheWords["flower"].Count);
        }
        /// <summary>
        /// Test setup: creates a context on the in-memory database named "Test",
        /// deletes and re-creates that database, and builds the index under test on it.
        /// </summary>
        public InvertedIndexTest()
        {
            var builder = new DbContextOptionsBuilder<InvertedIndexDbContext>();
            var options = builder.UseInMemoryDatabase("Test").Options;

            invertedIndexContext = new InvertedIndexDbContext(options);

            // Drop whatever an earlier run left behind, then create the database again.
            invertedIndexContext.Database.EnsureDeleted();
            invertedIndexContext.Database.EnsureCreated();

            invertedIndex = new InvertedIndex(invertedIndexContext);
        }
예제 #23
0
        /// <summary>
        /// Creates the repository: an empty in-memory index, a DynamoDB client for the
        /// EU West 1 region using the supplied credentials, a context and throttler on
        /// top of it, and finally starts asynchronous initialisation.
        /// </summary>
        /// <param name="options">Options wrapper providing the AWS access key and secret key.</param>
        public IndexRepository(IOptions <AWSOptions> options)
        {
            index = new InvertedIndex<string>();

            var aws = options.Value;
            client = new AmazonDynamoDBClient(aws.AwsAccessKey, aws.AwsSecretKey, RegionEndpoint.EUWest1);

            context = new DynamoDBContext(client);

            // NOTE(review): the meaning of 60 and 100 is defined by DBThrottler - confirm against its constructor.
            throttler = new DBThrottler<IndexItem>(context, 60, 100);

            // The task returned by AsyncInitialise is stored rather than awaited here.
            Initialisation = AsyncInitialise();
        }
예제 #24
0
 /// <summary>
 /// DoWork handler: creates a fresh index, the expression logic bound to it and the
 /// log entries, then builds the log entries.
 /// </summary>
 /// <param name="sender">Event source; not used.</param>
 /// <param name="e">Event arguments; not used.</param>
 private void bw_DoWork(object sender, DoWorkEventArgs e)
 {
     // The sender was previously cast to BackgroundWorker into a local that was never read.
     index = new InvertedIndex();
     logic = new Expressions(index, persistentFileName);
     logEntries = new LogEntries(logFilename, index, ProgressUpdate);
     if (logEntries!=null)
     {
         logEntries.Build();
     }
 }
        /// <summary>
        /// Creates an InvertedIndex pre-filled with four fixed (string, string) pairs
        /// for use in tests.
        /// </summary>
        /// <returns>
        /// The populated InvertedIndex.
        /// </returns>
        public static InvertedIndex InitialInvertedIndex()
        {
            var seedData = new List<Tuple<string, string>>();
            seedData.Add(new Tuple<string, string>("file1", "test"));
            seedData.Add(new Tuple<string, string>("file2", "test"));
            seedData.Add(new Tuple<string, string>("file1", "test2"));
            seedData.Add(new Tuple<string, string>("file3", "test2"));

            var result = new InvertedIndex();
            result.InsertDatas(seedData);

            return result;
        }
예제 #26
0
        /// <summary>
        /// Demo driver: pre-computes the static inverted index over a fixed word list
        /// and prints the resulting dictionary.
        /// </summary>
        public static void Run()
        {
            // The instance itself is never used afterwards.
            // NOTE(review): kept in case the constructor sets up state the static calls rely on - confirm.
            var instance = new InvertedIndex();

            string[] sampleWords =
            {
                "Kanishka", "Kanishk", "ken", "Kanishka", "Ken", "ken",
                "Kanishka", "Kanishk", "ken", "Kanishka", "Ken", "ken"
            };

            InvertedIndex.PreCompute(sampleWords);
            InvertedIndex.PrintDictionary();
        }
예제 #27
0
        /// <summary>
        /// Adds the pair ("chocolate", 3) to a fresh index and checks that its
        /// <c>Index</c> equals a dictionary mapping "chocolate" to the set { 3 }.
        /// </summary>
        public void AddToIndexTest()
        {
            var expectedIndex = new Dictionary<string, HashSet<int>>
            {
                { "chocolate", new HashSet<int> { 3 } }
            };
            var sut = new InvertedIndex();

            sut.AddToIndex("chocolate", 3);

            Assert.Equal(expectedIndex, sut.Index);
        }
예제 #28
0
        /// <summary>
        /// Builds an index from <c>repository</c> and checks that searching for the
        /// token "content" returns the set containing <c>doc1</c> and <c>doc2</c>.
        /// </summary>
        public void QuerySimpleWithConsturctorDataTest()
        {
            var index = new InvertedIndex(repository);

            var expectedDocuments = new HashSet<Document> { doc1, doc2 };
            var expectedSet       = new DocumentSet(expectedDocuments);

            var found = index.SearchTokenInDocuments(new Token("content"));

            Assert.Equal(expectedSet, found);
        }
예제 #29
0
        /// <summary>
        /// Checks that <c>GetPostings</c> returns an empty collection both for a random
        /// field/value on an empty index and for a random value under a field that
        /// already has a posting.
        /// </summary>
        public void ShouldReturnEmptyIfNotFound()
        {
            var sut = new InvertedIndex();

            // Case 1: nothing has been indexed yet.
            var postingsOnEmptyIndex = sut.GetPostings(Utils.RandomString(), Utils.RandomString());
            CollectionAssert.IsEmpty(postingsOnEmptyIndex);

            // Case 2: the field has a posting, but a freshly generated random value is queried.
            var knownField = Utils.RandomString();
            sut.AddPosting(knownField, Utils.RandomString(), Utils.RandomInteger());

            var postingsForOtherValue = sut.GetPostings(knownField, Utils.RandomString());
            CollectionAssert.IsEmpty(postingsForOtherValue);
        }
예제 #30
0
        /// <summary>
        /// Splits <paramref name="content"/> on single spaces and records the URL of
        /// <paramref name="web"/> under the processed key of every resulting piece.
        /// </summary>
        /// <param name="index">Index that receives the key-to-URL entries.</param>
        /// <param name="content">Text to split into words.</param>
        /// <param name="web">Website whose <c>Url</c> is added under each key.</param>
        public void PopulateIndex(InvertedIndex <string> index, string content, Website web)
        {
            var pieces = content.Split(' ');

            foreach (var piece in pieces)
            {
                string key = ProcessKey(piece);

                // First time this key is seen: start with an empty URL set.
                var alreadyKnown = index.ContainsKey(key);
                if (!alreadyKnown)
                {
                    index.Add(key, new HashSet<string>());
                }

                index[key].Add(web.Url);
            }
        }
예제 #31
0
        /// <summary>
        /// Builds a <c>SequenceInvertedIndex</c> from <paramref name="invindex"/> by
        /// writing, for every object id found in list <c>s</c>, the value <c>s</c> at
        /// that id's position in an array and passing the array to <paramref name="builder"/>.
        /// </summary>
        /// <param name="invindex">Source index; provides <c>NumberOfItems</c>, <c>Count</c> and one list per symbol.</param>
        /// <param name="builder">Builder invoked with the array and <c>invindex.Count</c>.</param>
        /// <returns>The new index whose <c>seq</c> holds the builder's result.</returns>
        public static SequenceInvertedIndex Build(InvertedIndex invindex, SequenceBuilder builder)
        {
            var result = new SequenceInvertedIndex ();

            // Positions not present in any list keep the array default of 0.
            var symbolOfObject = new int[invindex.NumberOfItems];
            for (int symbol = 0; symbol < invindex.Count; symbol++) {
                foreach (var objectId in invindex [symbol]) {
                    symbolOfObject [objectId] = symbol;
                }
            }

            result.seq = builder.Invoke (symbolOfObject, invindex.Count);
            return result;
        }
예제 #32
0
 /// <summary>
 /// Stores the document count and creates the inverted index from the stored value.
 /// </summary>
 /// <param name="numDocuments">Value stored in <c>_numDocuments</c> and passed on to the index constructor.</param>
 public ElasticSearch(int numDocuments)
 {
     _numDocuments = numDocuments;

     // The index is constructed from the stored field, not from the parameter.
     _invertedIndex = new InvertedIndex(_numDocuments);
 }
예제 #33
0
 /// <summary>
 /// Builds a <c>Plain64InvertedIndex</c> by adding every list of
 /// <paramref name="invindex"/> to it, in index order.
 /// </summary>
 /// <param name="invindex">Source index whose lists are copied over.</param>
 /// <returns>The newly populated index.</returns>
 public static Plain64InvertedIndex Build(InvertedIndex invindex)
 {
     var result = new Plain64InvertedIndex ();
     for (int listId = 0; listId < invindex.Count; listId++) {
         var list = invindex [listId];
         result.Add (list);
     }
     return result;
 }
예제 #34
0
파일: LSH.cs 프로젝트: sadit/natix
 /// <summary>
 /// Loads this instance from <paramref name="Input"/>: the base state first, then
 /// the width as a 32-bit integer, then the inverted index.
 /// </summary>
 /// <param name="Input">Reader positioned at the start of the serialized data.</param>
 public override void Load(BinaryReader Input)
 {
     base.Load (Input);

     // Read order matters: width first, then the index.
     this.Width = Input.ReadInt32 ();
     this.invindex = GenericIO<InvertedIndex>.Load (Input);
 }
예제 #35
0
파일: LSH.cs 프로젝트: sadit/natix
        /// <summary>
        /// Builds the hash table for <paramref name="db"/>: hashes every object, adds
        /// its id to the list of its hash value, trims the table to the largest hash
        /// seen and stores the table (optionally transformed) as the inverted index.
        /// </summary>
        /// <param name="db">Database to index; stored in <c>DB</c>.</param>
        /// <param name="width">Width stored in <c>Width</c>; capped at 32 when sizing the table.</param>
        /// <param name="rand">Random source passed to <c>PreBuild</c>.</param>
        /// <param name="create_invertedindex">Optional transform applied to the finished table; when null the table is stored as-is.</param>
        /// <param name="get_item">Optional accessor for the object to hash at a given id; when null <c>DB[objID]</c> is hashed.</param>
        public virtual void Build(MetricDB db, int width, Random rand, Func<InvertedIndex,InvertedIndex> create_invertedindex = null, Func<int,object> get_item = null)
        {
            this.DB = db;
            this.Width = width;

            int total = this.DB.Count;
            int reportEvery = total / 100 + 1;
            int numbits = width;
            if (numbits > 32) {
                numbits = 32;
            }

            var table = new Plain64InvertedIndex ();
            // NOTE(review): for an int operand C# uses only the low 5 bits of the shift count,
            // so numbits == 32 yields 1 and numbits == 31 yields a negative value - confirm intent.
            table.Initialize (1 << numbits);
            int largestHash = 0;

            // NOTE(review): DB[0] is read even when the database is empty - confirm callers never pass one.
            this.PreBuild (rand, this.DB [0]);
            for (int objID = 0; objID < total; ++objID) {
                // Progress line roughly once per percent.
                if (objID % reportEvery == 0) {
                    Console.WriteLine ("Advance: {0:0.00}%, docid: {1}, total: {2}", objID * 100.0 / total, objID, total);
                }

                int hash;
                if (get_item != null) {
                    hash = this.ComputeHash (get_item (objID));
                } else {
                    hash = this.ComputeHash (this.DB [objID]);
                }

                table.AddItem (hash, objID);
                if (largestHash < hash) {
                    largestHash = hash;
                }
            }

            // Drop the lists beyond the largest hash value that occurred.
            table.Trim (largestHash + 1);

            if (create_invertedindex != null) {
                this.invindex = create_invertedindex (table);
            } else {
                this.invindex = table;
            }
        }
예제 #36
0
 /// <summary>
 /// Builds a <c>CompressedInvertedIndex</c> by adding every list of
 /// <paramref name="invindex"/> to it, in index order.
 /// </summary>
 /// <param name="invindex">Source index whose lists are copied over.</param>
 /// <returns>The newly populated index.</returns>
 public static CompressedInvertedIndex Build(InvertedIndex invindex)
 {
     var result = new CompressedInvertedIndex ();
     for (int listId = 0; listId < invindex.Count; listId++) {
         var list = invindex [listId];
         result.Add (list);
     }
     return result;
 }