Example #1
0
 /// <summary>
 /// Checks whether <paramref name="segment"/> has reached the flush threshold
 /// selected by <c>_config.SEGMENT_FLUSH_STRATEGY</c>.
 /// </summary>
 /// <param name="segment">
 /// Segment whose <c>SizeBytes</c> or <c>DocumentCount</c> is compared against the configured limit.
 /// </param>
 /// <returns>
 /// <c>true</c> when the value inspected for the active strategy is greater than or equal to
 /// its configured maximum; otherwise <c>false</c>.
 /// </returns>
 /// <exception cref="InvalidOperationException">
 /// The configured strategy is not one of the values handled by the switch.
 /// </exception>
 public bool IsSegmentSizeReached(IndexSegment segment)
 {
     return(_config.SEGMENT_FLUSH_STRATEGY switch {
         // Memory-based flushing: compare the segment's recorded byte size with the byte limit.
         SegmentFlushStrategy.AllocatedMemory => segment.SizeBytes >= _config.MAX_SIZE_BYTES_PER_SEGMENT,
         // NOTE(review): this strategy is named PostingsCount but compares DocumentCount
         // against MAX_POSTING_COUNT_PER_SEGMENT - confirm that is intended.
         SegmentFlushStrategy.PostingsCount => segment.DocumentCount >= _config.MAX_POSTING_COUNT_PER_SEGMENT,
         // Any other value is treated as a configuration error.
         _ => throw new InvalidOperationException("Unknown or unspecified SEGMENT_FLUSH_STRATEGY.")
     });
Example #2
0
        /// <summary>
        /// Indexes the text content of one document into <paramref name="segment"/>:
        /// - reads the whole stream as text,
        /// - tokenizes the text with <c>_tokenizer</c>,
        /// - appends <paramref name="docId"/> to the posting list of every lower-cased token,
        /// - increments the segment's document count.
        /// Exceptions raised while updating the index are written to the console and not rethrown;
        /// exceptions raised while reading or tokenizing the stream propagate to the caller.
        /// </summary>
        /// <param name="stream">Source of the document text. It is read to the end and closed together with the reader.</param>
        /// <param name="segment">Segment whose <c>Index</c> and <c>DocumentCount</c> are updated.</param>
        /// <param name="docId">Identifier stored in the posting lists for this document.</param>
        private async Task BuildSegment(Stream stream, IndexSegment segment, long docId)
        {
            using var reader = new StreamReader(stream);
            var documentText = await reader.ReadToEndAsync();

            var tokens = _tokenizer.Tokenize(documentText);

            try {
                foreach (var token in tokens)
                {
                    var loweredToken = token.ToLowerInvariant();

                    // One lookup for known terms instead of ContainsKey followed by an indexer read.
                    // NOTE(review): a token that occurs several times in the same document adds
                    // docId once per occurrence - confirm duplicates in Postings are intended.
                    if (segment.Index.TryGetValue(loweredToken, out var postingList))
                    {
                        postingList.Postings.Add(docId);
                    }
                    else
                    {
                        segment.Index[loweredToken] = new PostingList {
                            Postings = new List <long> {
                                docId
                            }
                        };
                    }
                }

                segment.DocumentCount++;

                // TODO: How to measure efficiently at runtime
                // segment.SizeBytes += await EstimateMemSize(segment);
            } catch (Exception e) {
                // Best effort: report the failure and leave the segment as far as it got.
                Console.WriteLine("Error building index");
                Console.WriteLine(e);
            }
        }
 /// <summary>
 /// Reads the index stored at <c>_pathToIndex</c> through <c>_serializer</c>
 /// and keeps the result in <c>_inMemoryIndex</c>.
 /// Any exception is written to the console and swallowed; in that case
 /// <c>_inMemoryIndex</c> is left as it was.
 /// </summary>
 public async Task LoadIndexFromDisk()
 {
     try
     {
         var loadedIndex = await _serializer.ReadFromFile(_pathToIndex);
         _inMemoryIndex = loadedIndex;
     }
     catch (Exception readFailure)
     {
         Console.WriteLine("Error reading index from disk.");
         Console.WriteLine(readFailure);
     }
 }
Example #4
0
 /// <summary>
 /// Reads an index segment from <paramref name="fileName"/> through <c>_serializer</c>
 /// and keeps the result in <c>_indexSegment</c>.
 /// Any exception is written to the console and swallowed; in that case
 /// <c>_indexSegment</c> is left as it was.
 /// </summary>
 /// <param name="fileName">File handed to the serializer's <c>ReadFromFile</c>.</param>
 public async Task LoadIndexFromDisk(string fileName)
 {
     try
     {
         var loadedSegment = await _serializer.ReadFromFile(fileName);
         _indexSegment = loadedSegment;
     }
     catch (Exception readFailure)
     {
         Console.WriteLine("Error reading index from disk.");
         Console.WriteLine(readFailure);
     }
 }
Example #5
0
        /// <summary>
        /// Persists <paramref name="segment"/> through <c>_serializer</c>.
        /// The target path is the configured segment directory joined with the configured
        /// prefix, followed by the segment id formatted with the <c>X</c> format specifier.
        /// Write failures are reported on the console and not rethrown.
        /// </summary>
        /// <param name="segment">Segment to write; its <c>Id</c> becomes part of the file name.</param>
        private async Task FlushIndexSegment(IndexSegment segment)
        {
            var pathPrefix  = Path.Join(_config.SEGMENT_DIRECTORY, _config.SEGMENT_PREFIX);
            var segmentPath = pathPrefix + $"{segment.Id:X}";

            try
            {
                await _serializer.WriteToFile(segmentPath, segment);
            }
            catch (Exception writeFailure)
            {
                Console.WriteLine("Error writing index to disk.");
                Console.WriteLine(writeFailure);
            }
        }
Example #6
0
        /// <summary>
        /// Executes the index-segment and index command texts and builds the lookup
        /// structures <c>m_Indices</c> (keyed by index name) and <c>m_IndicesByRelation</c>
        /// (grouped by relation name).
        /// </summary>
        public override void Initialize()
        {
            // Segments are materialized first because every index receives the lookup on creation.
            var segmentRows         = Execute(IndexSegmentCommandText);
            var segmentsByIndexName = segmentRows
                                      .Select(row => IndexSegment.CreateFrom(SqlHelper, row))
                                      .ToLookup(seg => seg.IndexName);

            var indexRows = Execute(IndexCommandText);

            m_Indices = indexRows
                        .Select(row => Index.CreateFrom(SqlHelper, row, segmentsByIndexName))
                        .ToDictionary(idx => idx.IndexName);

            m_IndicesByRelation = m_Indices.Values.ToMultiDictionary(idx => idx.RelationName);
        }
Example #7
0
        /// <summary>
        /// Builds an in-memory index from the zip archive at <paramref name="filePath"/>
        /// and writes it to disk under <paramref name="indexName"/>.
        /// Every entry of the archive is treated as one document; the document id is the
        /// zero-based position of the entry inside the archive.
        /// </summary>
        /// <param name="filePath">Path of the zip archive to index.</param>
        /// <param name="indexName">Name passed to <c>WriteIndexToDisk</c> once all entries are indexed.</param>
        public void BuildIndexForArchive(string filePath, string indexName)
        {
            _indexSegment = new IndexSegment {
                Index = new SortedDictionary <string, PostingList>()
            };

            using var file = File.OpenRead(filePath);
            using var zip  = new ZipArchive(file, ZipArchiveMode.Read);

            var entries = zip.Entries;

            // Entries are zero-based: starting at 1 silently skipped the first document.
            // Existing ids are unchanged (entry i is still document i); entry 0 is now indexed too.
            for (var docId = 0; docId < entries.Count; docId++)
            {
                using var stream = entries[docId].Open();
                IndexStream(stream, docId);
            }

            WriteIndexToDisk(indexName);
        }
Example #8
0
        /// <summary>
        /// Indexes the zip archive at <paramref name="filePath"/> and writes segmented indices to disk.
        /// Entries are consumed in archive order; documents are added to the current segment until
        /// <c>_segMerge.IsSegmentSizeReached</c> reports it as full, then the segment is flushed and
        /// a new one (with the next id, starting at 1) is started. After the last entry the
        /// flushed segments are merged via <c>_segMerge.MergeSegments</c>.
        /// An archive without entries produces no segments and goes straight to the merge step.
        /// </summary>
        /// <param name="filePath">Path of the zip archive to index.</param>
        public async Task BuildIndexForArchive(string filePath)
        {
            await using var file = File.OpenRead(filePath);
            using var zip        = new ZipArchive(file, ZipArchiveMode.Read);
            var entries   = zip.Entries;
            var fileCount = entries.Count;
            var docId     = 0;
            var indexId   = 1;

            // TODO: Stream / use cancellation token to exit while loop in async method
            // Both loops are bounded by fileCount, so an empty archive no longer spins forever
            // and the entry indexer can never be called with an out-of-range docId.
            // NOTE(review): if a brand-new segment already counts as full (e.g. a non-positive
            // threshold), no document is consumed and this loop does not end - confirm that the
            // configuration is validated elsewhere.
            while (docId < fileCount)
            {
                var segment = new IndexSegment(indexId);

                while (docId < fileCount && !_segMerge.IsSegmentSizeReached(segment))
                {
                    await using var stream = entries[docId].Open();
                    await BuildSegment(stream, segment, docId);

                    docId++;
                }

                await FlushIndexSegment(segment);

                indexId++;
            }

            await _segMerge.MergeSegments();
        }
Example #9
0
            /// <summary>
            /// Moves the traversal cursor one level down for <paramref name="segment"/>.
            /// The matching child of the current node is reused when it exists; otherwise a new
            /// node is created, attached to the current node and remembered as the last created node.
            /// In every case the node the cursor was on becomes <c>parentNode</c>.
            /// </summary>
            /// <param name="segment">Segment to descend into; also stored as <c>lastSegment</c>.</param>
            private void processIndexSegment(IndexSegment segment)
            {
                lastSegment = segment;

                // The node we are standing on before descending.
                var origin = currentNode;

                if (segment.IsAny)
                {
                    if (origin.AnyChildNode == null)
                    {
                        // No "any" child yet: create one, never marked as must.
                        var anyNode = new MemoryIndexCollectorNode(null) { IsMust = false };
                        origin.addAnyChild(anyNode);

                        parentNode      = origin;
                        currentNode     = anyNode;
                        lastCreatedNode = anyNode;
                        return;
                    }

                    parentNode  = origin;
                    currentNode = origin.AnyChildNode;
                    return;
                }

                if (origin.NamedChildNodes == null || !origin.NamedChildNodes.ContainsKey(segment.Name))
                {
                    // No child registered under this name: create one carrying the current isMust flag.
                    var namedNode = new MemoryIndexCollectorNode(null) { IsMust = isMust };
                    origin.addChild(namedNode, segment.Name);

                    parentNode      = origin;
                    currentNode     = namedNode;
                    lastCreatedNode = namedNode;
                    return;
                }

                parentNode  = origin;
                currentNode = origin.NamedChildNodes[segment.Name];
            }
Example #10
0
        /// <summary>
        /// Loads a three-term index into the search engine and verifies that querying
        /// "blue" returns exactly the posting list stored for that term.
        /// </summary>
        public void Test_GetDocumentsContainingTerm_Returns_ExpectedDocuments()
        {
            // Arrange
            var termIndex = new SortedDictionary <string, PostingList>
            {
                ["red"]   = new PostingList { Postings = new List <long> { 1, 2, 3, 4, 5 } },
                ["blue"]  = new PostingList { Postings = new List <long> { 2, 3, 4, 7 } },
                ["green"] = new PostingList { Postings = new List <long> { 6, 7 } },
            };
            var segment = new IndexSegment { Index = termIndex };
            var engine  = new SimpleBooleanSearchEngine();

            engine.LoadIndexFromMemory(segment);

            // Act
            var actual = engine.GetDocumentsContainingTerm("blue");

            // Assert
            var expectedPostings = new PostingList { Postings = new List <long> { 2, 3, 4, 7 } };

            actual.Postings.Count.Should().Be(4);
            actual.Should().BeEquivalentTo(expectedPostings);
        }
Example #11
0
        /// <summary>
        /// Loads a three-term index into the search engine and verifies that the intersection
        /// query for <paramref name="t1"/> and <paramref name="t2"/> yields
        /// <paramref name="expectedCount"/> results.
        /// </summary>
        /// <param name="t1">First search term.</param>
        /// <param name="t2">Second search term.</param>
        /// <param name="expectedCount">Number of results the intersection is expected to contain.</param>
        public void Test_Intersection_Returns_ExpectedDocuments(string t1, string t2, long expectedCount)
        {
            // Arrange
            var termIndex = new SortedDictionary <string, PostingList>
            {
                ["red"]   = new PostingList { Postings = new List <long> { 1, 2, 3, 4, 5 } },
                ["blue"]  = new PostingList { Postings = new List <long> { 2, 3, 4, 7 } },
                ["green"] = new PostingList { Postings = new List <long> { 6, 7 } },
            };
            var segment = new IndexSegment { Index = termIndex };
            var engine  = new SimpleBooleanSearchEngine();

            engine.LoadIndexFromMemory(segment);

            // Act
            var matches = engine.IntersectionQuery(new List <string> { t1, t2 });

            // Assert
            matches.Count.Should().Be((int)expectedCount);
        }
 /// <summary>
 /// Replaces the engine's in-memory index with <paramref name="index"/>.
 /// </summary>
 /// <param name="index">Segment stored in <c>_inMemoryIndex</c>; kept by reference, not copied.</param>
 public void LoadIndexFromMemory(IndexSegment index) => _inMemoryIndex = index;
Example #13
0
        /// <summary>
        /// Verifies that an <c>IndexSegment</c> built for "column" / IEEEDouble with the
        /// flags (true, false) renders as "+column(IEEEDouble)".
        /// </summary>
        public void AscendingIndexSegmentToString()
        {
            const string expectedText = "+column(IEEEDouble)";
            var ascendingSegment = new IndexSegment("column", JET_coltyp.IEEEDouble, true, false);

            string actualText = ascendingSegment.ToString();

            Assert.AreEqual(expectedText, actualText);
        }
Example #14
0
        /// <summary>
        /// Passes an <c>IndexSegment</c> built for "column" / Text with the flags (false, true)
        /// to <c>SerializeAndCompare</c>.
        /// </summary>
        public void VerifyIndexSegmentCanBeSerialized()
        {
            var original = new IndexSegment("column", JET_coltyp.Text, false, true);

            // The helper performs the round trip and the comparison.
            SerializeAndCompare(original);
        }
Example #15
0
        /// <summary>
        /// Verifies that an <c>IndexSegment</c> built for "othercolumn" / Bit with the
        /// flags (false, false) renders as "-othercolumn(Bit)".
        /// </summary>
        public void DescendingIndexSegmentToString()
        {
            const string expectedText = "-othercolumn(Bit)";
            var descendingSegment = new IndexSegment("othercolumn", JET_coltyp.Bit, false, false);

            string actualText = descendingSegment.ToString();

            Assert.AreEqual(expectedText, actualText);
        }
Example #16
0
 /// <summary>
 /// Estimates the size of <paramref name="segment"/> by serializing its <c>Index</c>
 /// into an in-memory stream with <c>_formatter</c> and measuring the stream length.
 ///
 /// This is a temporary stand-in and far too slow for use during indexing.
 /// TODO: How to monitor the size in MB of the segment during indexing?
 /// Fairly non-trivial to implement properly. See Lucene Estimator for an example.
 /// https://github.com/apache/lucenenet/blob/master/src/Lucene.Net/Util/RamUsageEstimator.cs
 /// </summary>
 /// <param name="segment">Segment whose <c>Index</c> is serialized.</param>
 /// <returns>Length of the serialized index, as reported by the stream.</returns>
 private async Task <long> EstimateMemSize(IndexSegment segment)
 {
     await using Stream buffer = new MemoryStream();

     _formatter.Serialize(buffer, segment.Index);

     long serializedLength = buffer.Length;
     return serializedLength;
 }
Example #17
0
 /// <summary>
 /// Returns the number of distinct terms (keys) stored in the index of <paramref name="segment"/>.
 /// </summary>
 /// <param name="segment">Segment whose <c>Index</c> keys are counted.</param>
 /// <returns>The key count of the segment's index.</returns>
 public static long GetSegmentVocabularySize(IndexSegment segment)
 {
     var terms = segment.Index.Keys;
     return terms.Count;
 }
Example #18
0
            /// <summary>
            /// Builds the error reported when <paramref name="indexSegment"/>'s index value
            /// cannot be used to index <paramref name="pyObj"/>. The message contains the index
            /// value and its type name, followed by the target's type name and value.
            /// </summary>
            /// <param name="pyObj">Object that was being indexed.</param>
            /// <param name="indexSegment">Segment carrying the rejected index value.</param>
            /// <returns>A new <c>MyError</c> holding the formatted message.</returns>
            public static MyError BadIndexError(PyObj pyObj, IndexSegment indexSegment)
            {
                var indexValue = indexSegment.Index;

                // Pieces are gathered in the same order in which they appear in the message.
                var indexText      = indexValue.MyToString();
                var indexTypeName  = TypeConstants.GetMyTypeName(indexValue.GetMyType());
                var targetTypeName = TypeConstants.GetMyTypeName(pyObj.GetMyType());
                var targetText     = pyObj.MyToString();

                var message = "El indice: " + indexText + " (" + indexTypeName + ") no es un indice valido para el tipo: " + targetTypeName + " valor: " + targetText;

                return new MyError(message);
            }