/// <summary>
/// Returns true if the given segment has reached its configured flush
/// threshold, otherwise false.
/// </summary>
/// <param name="segment">The segment whose size/count is checked.</param>
/// <returns>True when the segment should be flushed; otherwise false.</returns>
/// <exception cref="InvalidOperationException">
/// Thrown when SEGMENT_FLUSH_STRATEGY is not a recognized value.
/// </exception>
public bool IsSegmentSizeReached(IndexSegment segment)
{
    // The threshold is interpreted according to the configured strategy:
    // either accumulated bytes or number of indexed documents.
    return _config.SEGMENT_FLUSH_STRATEGY switch
    {
        SegmentFlushStrategy.AllocatedMemory => segment.SizeBytes >= _config.MAX_SIZE_BYTES_PER_SEGMENT,
        SegmentFlushStrategy.PostingsCount => segment.DocumentCount >= _config.MAX_POSTING_COUNT_PER_SEGMENT,
        _ => throw new InvalidOperationException("Unknown or unspecified SEGMENT_FLUSH_STRATEGY.")
    };
}
/// <summary>
/// Indexes text data from the stream into the given segment:
/// - Reads the stream to the end
/// - Tokenizes the text
/// - Updates the segment's inverted index and document count
/// </summary>
/// <param name="stream">Stream containing the document's text.</param>
/// <param name="segment">Segment whose inverted index is updated.</param>
/// <param name="docId">Identifier recorded in each term's posting list.</param>
private async Task BuildSegment(Stream stream, IndexSegment segment, long docId)
{
    using var reader = new StreamReader(stream);
    var documentText = await reader.ReadToEndAsync();
    var tokens = _tokenizer.Tokenize(documentText);
    try
    {
        foreach (var token in tokens)
        {
            var loweredToken = token.ToLowerInvariant();
            // Single dictionary lookup (TryGetValue) instead of the previous
            // ContainsKey + indexer double lookup.
            if (segment.Index.TryGetValue(loweredToken, out var postingList))
            {
                postingList.Postings.Add(docId);
            }
            else
            {
                segment.Index[loweredToken] = new PostingList { Postings = new List<long> { docId } };
            }
        }
        segment.DocumentCount++;
        // TODO: How to measure efficiently at runtime
        // segment.SizeBytes += await EstimateMemSize(segment);
    }
    catch (Exception e)
    {
        // NOTE(review): exceptions are deliberately swallowed (best-effort
        // indexing); consider routing through a real logger instead of the console.
        Console.WriteLine("Error building index");
        Console.WriteLine(e);
    }
}
/// <summary>
/// Loads an inverted index from disk into memory via the configured
/// _serializer, using the default index path. Read failures are reported
/// to the console and leave the current in-memory index untouched.
/// </summary>
public async Task LoadIndexFromDisk()
{
    try
    {
        _inMemoryIndex = await _serializer.ReadFromFile(_pathToIndex);
    }
    catch (Exception e)
    {
        Console.WriteLine("Error reading index from disk.");
        Console.WriteLine(e);
    }
}
/// <summary>
/// Loads an index segment from the given file into memory via the
/// configured _serializer. Read failures are reported to the console and
/// leave the current segment untouched.
/// </summary>
/// <param name="fileName">Path of the serialized segment file to read.</param>
public async Task LoadIndexFromDisk(string fileName)
{
    try
    {
        _indexSegment = await _serializer.ReadFromFile(fileName);
    }
    catch (Exception e)
    {
        Console.WriteLine("Error reading index from disk.");
        Console.WriteLine(e);
    }
}
/// <summary>
/// Writes the IndexSegment instance to disk. The target file is the
/// configured segment directory + prefix, suffixed with the segment id
/// rendered in upper-case hex.
/// </summary>
/// <param name="segment">The segment to persist.</param>
private async Task FlushIndexSegment(IndexSegment segment)
{
    var fileName = Path.Join(_config.SEGMENT_DIRECTORY, _config.SEGMENT_PREFIX) + $"{segment.Id:X}";
    try
    {
        await _serializer.WriteToFile(fileName, segment);
    }
    catch (Exception e)
    {
        Console.WriteLine("Error writing index to disk.");
        Console.WriteLine(e);
    }
}
/// <summary>
/// Loads index metadata from the database and builds the lookup maps.
/// </summary>
public override void Initialize()
{
    // Group the raw segment rows by the index they belong to.
    var segmentsByIndexName = Execute(IndexSegmentCommandText)
        .Select(row => IndexSegment.CreateFrom(SqlHelper, row))
        .ToLookup(segment => segment.IndexName);

    // Materialize each index with its segments attached, keyed by name.
    m_Indices = Execute(IndexCommandText)
        .Select(row => Index.CreateFrom(SqlHelper, row, segmentsByIndexName))
        .ToDictionary(index => index.IndexName);

    // Secondary lookup: every index that lives on a given relation.
    m_IndicesByRelation = m_Indices.Values.ToMultiDictionary(index => index.RelationName);
}
/// <summary>
/// Builds an in-memory index using a provided filePath to a zip file.
/// The provided filePath should be a zip archive containing text files
/// for index. Each file is considered a document for the posting list.
/// Document ids are 1-based.
/// </summary>
/// <param name="filePath">Path to the zip archive to index.</param>
/// <param name="indexName">Name used when writing the index to disk.</param>
public void BuildIndexForArchive(string filePath, string indexName)
{
    _indexSegment = new IndexSegment { Index = new SortedDictionary<string, PostingList>() };
    using var file = File.OpenRead(filePath);
    using var zip = new ZipArchive(file, ZipArchiveMode.Read);
    // BUG FIX: the previous loop started at entry index 1 and used the loop
    // variable as both entry index and docId, silently skipping the first
    // archive entry. Iterate every entry and derive the 1-based docId.
    for (var i = 0; i < zip.Entries.Count; i++)
    {
        var docId = i + 1;
        using var stream = zip.Entries[i].Open();
        IndexStream(stream, docId);
    }
    WriteIndexToDisk(indexName);
}
/// <summary>
/// Indexes the provided zip archive at `filePath` and writes segmented
/// indices to disk, starting a new segment whenever the current one
/// reaches the configured flush threshold, then merges the segments.
/// </summary>
/// <param name="filePath">Path to the zip archive to index.</param>
public async Task BuildIndexForArchive(string filePath)
{
    await using var file = File.OpenRead(filePath);
    using var zip = new ZipArchive(file, ZipArchiveMode.Read);

    var indexId = 1;
    var segment = new IndexSegment(indexId);

    // BUG FIX: the previous nested while-loops caught ArgumentOutOfRangeException
    // as control flow and spun forever on an empty archive (docId was never
    // advanced). A bounded for-loop removes both problems.
    // Document ids are 0-based and match the archive entry index.
    for (var docId = 0; docId < zip.Entries.Count; docId++)
    {
        // Roll over to a fresh segment once the current one is full.
        if (_segMerge.IsSegmentSizeReached(segment))
        {
            await FlushIndexSegment(segment);
            indexId++;
            segment = new IndexSegment(indexId);
        }

        await using var stream = zip.Entries[docId].Open();
        await BuildSegment(stream, segment, docId);
    }

    // Flush the final (possibly partial) segment; skip if nothing was indexed.
    if (segment.DocumentCount > 0)
    {
        await FlushIndexSegment(segment);
    }

    await _segMerge.MergeSegments();
}
/// <summary>
/// Advances the collector by one path segment: moves currentNode to the
/// child matching <paramref name="segment"/>, creating the child node when it
/// does not exist yet. Updates parentNode, lastSegment and (on creation)
/// lastCreatedNode as side effects.
/// </summary>
/// <param name="segment">The path segment to process.</param>
private void processIndexSegment(IndexSegment segment)
{
    lastSegment = segment;
    if (segment.IsAny)
    {
        // Wildcard segment: descend into (or create) the "any" child.
        if (currentNode.AnyChildNode != null)
        {
            parentNode = currentNode;
            currentNode = currentNode.AnyChildNode;
        }
        else
        {
            MemoryIndexCollectorNode newNode = new MemoryIndexCollectorNode(null);
            // NOTE(review): wildcard children are always created with
            // IsMust = false, unlike named children below — presumably an
            // any-match cannot be a required path; confirm with the caller.
            newNode.IsMust = false;
            currentNode.addAnyChild(newNode);
            parentNode = currentNode;
            currentNode = newNode;
            lastCreatedNode = newNode;
        }
    }
    else
    {
        // Named segment: descend into (or create) the child keyed by segment name.
        if (currentNode.NamedChildNodes != null && currentNode.NamedChildNodes.ContainsKey(segment.Name))
        {
            parentNode = currentNode;
            currentNode = currentNode.NamedChildNodes[segment.Name];
        }
        else
        {
            MemoryIndexCollectorNode newNode = new MemoryIndexCollectorNode(null);
            // Newly created named children inherit the collector's current must-flag.
            newNode.IsMust = isMust;
            currentNode.addChild(newNode, segment.Name);
            parentNode = currentNode;
            currentNode = newNode;
            lastCreatedNode = newNode;
        }
    }
}
public void Test_GetDocumentsContainingTerm_Returns_ExpectedDocuments()
{
    // Arrange: three terms with known posting lists.
    var index = new SortedDictionary<string, PostingList>
    {
        ["red"] = new PostingList { Postings = new List<long> { 1, 2, 3, 4, 5 } },
        ["blue"] = new PostingList { Postings = new List<long> { 2, 3, 4, 7 } },
        ["green"] = new PostingList { Postings = new List<long> { 6, 7 } },
    };
    var sut = new SimpleBooleanSearchEngine();
    sut.LoadIndexFromMemory(new IndexSegment { Index = index });

    // Act
    var result = sut.GetDocumentsContainingTerm("blue");

    // Assert: exactly the postings registered under "blue".
    var expectedDocs = new PostingList { Postings = new List<long> { 2, 3, 4, 7 } };
    result.Postings.Count.Should().Be(4);
    result.Should().BeEquivalentTo(expectedDocs);
}
public void Test_Intersection_Returns_ExpectedDocuments(string t1, string t2, long expectedCount)
{
    // Arrange: three terms with known posting lists.
    var index = new SortedDictionary<string, PostingList>
    {
        ["red"] = new PostingList { Postings = new List<long> { 1, 2, 3, 4, 5 } },
        ["blue"] = new PostingList { Postings = new List<long> { 2, 3, 4, 7 } },
        ["green"] = new PostingList { Postings = new List<long> { 6, 7 } },
    };
    var sut = new SimpleBooleanSearchEngine();
    sut.LoadIndexFromMemory(new IndexSegment { Index = index });

    // Act: intersect the posting lists of the two provided terms.
    var result = sut.IntersectionQuery(new List<string> { t1, t2 });

    // Assert
    result.Count.Should().Be((int)expectedCount);
}
/// <summary>
/// Replaces the in-memory index with the provided segment.
/// </summary>
/// <param name="index">The segment to use as the in-memory index.</param>
public void LoadIndexFromMemory(IndexSegment index) => _inMemoryIndex = index;
public void AscendingIndexSegmentToString()
{
    // Arrange: an ascending (third arg true), non-ASCII-flagged segment.
    var ascendingSegment = new IndexSegment("column", JET_coltyp.IEEEDouble, true, false);

    // Act / Assert: ascending segments render with a '+' prefix.
    Assert.AreEqual("+column(IEEEDouble)", ascendingSegment.ToString());
}
public void VerifyIndexSegmentCanBeSerialized()
{
    // Round-trip a descending Text segment through serialization and
    // verify it compares equal to the original.
    var original = new IndexSegment("column", JET_coltyp.Text, false, true);
    SerializeAndCompare(original);
}
public void DescendingIndexSegmentToString()
{
    // Arrange: a descending (third arg false) Bit-typed segment.
    var descendingSegment = new IndexSegment("othercolumn", JET_coltyp.Bit, false, false);

    // Act / Assert: descending segments render with a '-' prefix.
    Assert.AreEqual("-othercolumn(Bit)", descendingSegment.ToString());
}
/// <summary>
/// Estimates the memory required to store the given `segment` by serializing
/// its index into an in-memory stream and returning the stream's length.
/// This is not the way to do this, but trying to get something simple
/// working temporarily — it is way too slow to use efficiently.
///
/// TODO: How to monitor the size in MB of the segment during indexing?
/// Fairly non-trivial to implement properly. See Lucene Estimator for an example.
/// https://github.com/apache/lucenenet/blob/master/src/Lucene.Net/Util/RamUsageEstimator.cs
/// </summary>
/// <param name="segment">Segment whose serialized size is estimated.</param>
/// <returns>The serialized length of the segment's index, in bytes.</returns>
private async Task<long> EstimateMemSize(IndexSegment segment)
{
    // NOTE(review): if _formatter is BinaryFormatter, it is obsolete and
    // insecure (removed in .NET 9) — confirm, and prefer System.Text.Json.
    await using Stream stream = new MemoryStream();
    _formatter.Serialize(stream, segment.Index);
    return stream.Length;
}
/// <summary>
/// Gets the lexicon (distinct term) count of the given segment's index.
/// </summary>
/// <param name="segment">Segment whose vocabulary size is measured.</param>
/// <returns>The number of distinct terms in the segment's index.</returns>
public static long GetSegmentVocabularySize(IndexSegment segment) =>
    // Count the dictionary directly rather than going through the Keys
    // collection — same value, one less indirection.
    segment.Index.Count;
/// <summary>
/// Builds a MyError stating that the segment's index value is not a valid
/// index for the given object's type (message text is in Spanish).
/// </summary>
/// <param name="pyObj">The object being indexed.</param>
/// <param name="indexSegment">The segment carrying the offending index value.</param>
/// <returns>A MyError describing the invalid-index condition.</returns>
public static MyError BadIndexError(PyObj pyObj, IndexSegment indexSegment)
{
    var indexValue = indexSegment.Index;
    var message =
        $"El indice: {indexValue.MyToString()} " +
        $"({TypeConstants.GetMyTypeName(indexValue.GetMyType())}) " +
        $"no es un indice valido para el tipo: {TypeConstants.GetMyTypeName(pyObj.GetMyType())} " +
        $"valor: {pyObj.MyToString()}";
    return new MyError(message);
}