public long Write()
{
    if (_committed)
    {
        return _ix.VersionId;
    }

    var trieBuilder = new TrieBuilder();
    var docTimer = Stopwatch.StartNew();
    var upsert = new DocumentUpsertCommand(_writeSession, _analyzer, trieBuilder);

    foreach (var doc in _documents.ReadSource())
    {
        doc.Id = _count++;
        upsert.Write(doc);
    }

    Log.InfoFormat("stored {0} documents in {1}", _count, docTimer.Elapsed);

    var posTimer = Stopwatch.StartNew();
    var tries = trieBuilder.GetTries();

    foreach (var trie in tries)
    {
        // decode into a list of words and set postings address
        foreach (var node in trie.Value.EndOfWordNodes())
        {
            node.PostingsAddress = _postingsWriter.Write(node.Postings);
        }

        if (Log.IsDebugEnabled)
        {
            foreach (var word in trie.Value.Words())
            {
                Log.DebugFormat("{0}\t{1}", word.Value, word.Postings.Count);
            }
        }
    }

    Log.InfoFormat("stored postings refs in trees in {0}", posTimer.Elapsed);

    var treeTimer = Stopwatch.StartNew();

    _ix.FieldOffsets = SerializeTries(tries, _compoundFile);

    Log.InfoFormat("serialized trees in {0}", treeTimer.Elapsed);

    _ix.PostingsOffset = _compoundFile.Position;

    _postingsWriter.Stream.Flush();
    _postingsWriter.Stream.Position = 0;
    _postingsWriter.Stream.CopyTo(_compoundFile);

    _ix.DocumentCount = _count;

    return _ix.VersionId;
}
public void when_()
{
    var trieBuilder = new TrieBuilder<string>(
        rootNodeDataFactory: () => "",
        nodeDataMerger: nodeDatas => nodeDatas.First(),
        nodeDataComparor: (first, second) => first.CompareTo(second),
        nodeDataHasher: nodeData => nodeData.GetHashCode(),
        canParentAcceptChildChecker: (parent, child) => child.StartsWith(parent),
        canNodeHaveChildrenChecker: nodeData => true);

    var datas = new[]
    {
        "abi",
        "efg*",
        "a",
        "abc",
        "abc*",
        "abcd",
    };

    var expectedTree = new TestNode
    {
        Key = "",
        Children = new[]
        {
            new TestNode
            {
                Key = "a",
                Children = new[]
                {
                    new TestNode
                    {
                        Key = "abc",
                        Children = new[]
                        {
                            new TestNode { Key = "abc*" },
                            new TestNode { Key = "abcd" }
                        }
                    },
                    new TestNode { Key = "abi" }
                }
            },
            new TestNode { Key = "efg*" }
        },
    };

    var trie = trieBuilder.CreateTrie(datas);

    AssertTree(expectedTree, trie);
}
public void Write(
    Document document,
    IDocumentStoreWriter storeWriter,
    IAnalyzer analyzer,
    TrieBuilder trieBuilder)
{
    var analyzed = analyzer.AnalyzeDocument(document);

    foreach (var word in analyzed.Words)
    {
        var field = word.Term.Field;
        var token = word.Term.Word.Value;
        var posting = word.Posting;

        trieBuilder.Add(new WordInfo(field, token, posting));
    }

    storeWriter.Write(document);
}
public void Puzzle3Test()
{
    //Arrange
    IDictionaryRepository dictionaryRepository = new DictionaryRepository();
    ITrieBuilder trieBuilder = new TrieBuilder(dictionaryRepository);
    BoggleWordSearchService boggleWordSearchService = new BoggleWordSearchService(trieBuilder);
    var board = "FVKOWQIMEZTLRPBE";
    var expectedResults = Solution3Results();

    //Act
    HashSet<string> result = boggleWordSearchService.FindWordsInBoggle(board, 4, 4);

    // only care that the sets of found words look the same.
    var resultString = Newtonsoft.Json.JsonConvert.SerializeObject(result.OrderBy(keys => keys));
    var expectedString = Newtonsoft.Json.JsonConvert.SerializeObject(expectedResults);

    //Assert
    Assert.AreEqual(expectedString, resultString);
}
public void Puzzle5By5Test()
{
    //Arrange
    Mock<IDictionaryRepository> dictionaryRepositoryMock = new Mock<IDictionaryRepository>();
    dictionaryRepositoryMock.Setup(x => x.GetWords()).Returns(Solution5By5Results().ToList);

    ITrieBuilder trieBuilder = new TrieBuilder(dictionaryRepositoryMock.Object);
    BoggleWordSearchService boggleWordSearchService = new BoggleWordSearchService(trieBuilder);
    var board = "sabtebralnuetoenrkeseqnss";
    var expectedResults = Solution5By5Results();

    //Act
    HashSet<string> result = boggleWordSearchService.FindWordsInBoggle(board, 5, 5);

    // only care that the sets of found words look the same.
    var resultString = Newtonsoft.Json.JsonConvert.SerializeObject(result.OrderBy(keys => keys));
    var expectedString = Newtonsoft.Json.JsonConvert.SerializeObject(expectedResults.OrderBy(keys => keys));

    //Assert
    Assert.AreEqual(expectedString, resultString);
}
public long Commit()
{
    var ts = new List<Task>();
    var trieBuilder = new TrieBuilder();

    using (var documentsToAnalyze = new BlockingCollection<Document>())
    {
        ts.Add(Task.Run(() =>
        {
            Log.Info("serializing documents");

            var count = 0;
            var docFileName = Path.Combine(_directory, _indexVersionId + ".rdoc");
            var docAddressFn = Path.Combine(_directory, _indexVersionId + ".da");
            var readTimer = new Stopwatch();
            readTimer.Start();

            using (var docAddressWriter = new DocumentAddressWriter(new FileStream(docAddressFn, FileMode.Create, FileAccess.Write)))
            using (var docWriter = new DocumentWriter(new FileStream(docFileName, FileMode.Create, FileAccess.Write), _compression))
            {
                foreach (var doc in ReadSourceAndAssignIdentifiers())
                {
                    documentsToAnalyze.Add(doc);

                    var adr = docWriter.Write(doc);

                    docAddressWriter.Write(adr);

                    count++;
                }
            }

            documentsToAnalyze.CompleteAdding();

            Log.InfoFormat("serialized {0} documents in {1}", count, readTimer.Elapsed);
        }));

        ts.Add(Task.Run(() =>
        {
            var analyzeTimer = new Stopwatch();
            analyzeTimer.Start();

            Log.Info("analyzing");

            var count = 0;

            try
            {
                while (true)
                {
                    var doc = documentsToAnalyze.Take();
                    var analyzed = _analyzer.AnalyzeDocument(doc);

                    foreach (var term in analyzed.Words.GroupBy(t => t.Term.Field))
                    {
                        trieBuilder.Add(term.Key, term.Select(t =>
                        {
                            var field = t.Term.Field;
                            var token = t.Term.Word.Value;
                            var posting = t.Posting;

                            return new WordInfo(field, token, posting);
                        }).ToList());
                    }

                    count++;
                }
            }
            catch (InvalidOperationException)
            {
                // Done: Take() throws once the producer has called CompleteAdding
                // and the collection is empty.
                trieBuilder.CompleteAdding();
            }

            Log.InfoFormat("analyzed {0} documents in {1}", count, analyzeTimer.Elapsed);
        }));

        Task.WaitAll(ts.ToArray());
    }

    var tries = trieBuilder.GetTries();

    var tasks = new List<Task>
    {
        Task.Run(() =>
        {
            var postingsTimer = new Stopwatch();
            postingsTimer.Start();

            Log.Info("serializing postings");

            var posFileName = Path.Combine(_directory, string.Format("{0}.{1}", _indexVersionId, "pos"));

            using (var postingsWriter = new PostingsWriter(new FileStream(posFileName, FileMode.Create, FileAccess.Write, FileShare.None)))
            {
                foreach (var trie in tries)
                {
                    foreach (var node in trie.Value.EndOfWordNodes())
                    {
                        node.PostingsAddress = postingsWriter.Write(node.Postings);
                    }

                    if (Log.IsDebugEnabled)
                    {
                        foreach (var word in trie.Value.Words())
                        {
                            Log.Debug(word);
                        }
                    }
                }
            }

            Log.InfoFormat("serialized postings in {0}", postingsTimer.Elapsed);
        }),
        Task.Run(() =>
        {
            var trieTimer = new Stopwatch();
            trieTimer.Start();

            Log.Info("serializing tries");

            SerializeTries(tries);

            Log.InfoFormat("serialized tries in {0}", trieTimer.Elapsed);
        }),
        Task.Run(() =>
        {
            var docHasTimer = new Stopwatch();
            docHasTimer.Start();

            Log.Info("serializing doc hashes");

            var docHashesFileName = Path.Combine(_directory, string.Format("{0}.{1}", _indexVersionId, "pk"));

            _primaryKeys.Keys.Select(h => new DocHash(h)).Serialize(docHashesFileName);

            Log.InfoFormat("serialized doc hashes in {0}", docHasTimer.Elapsed);
        })
    };

    Task.WaitAll(tasks.ToArray());

    CreateIxInfo().Serialize(Path.Combine(_directory, _indexVersionId + ".ix"));

    if (_compression > 0)
    {
        Log.Info("compression: true");
    }
    else
    {
        Log.Info("compression: false");
    }

    return _indexVersionId;
}
public long Write()
{
    if (_committed)
    {
        return _indexVersionId;
    }

    var trieBuilder = new TrieBuilder();
    var posFileName = Path.Combine(
        _directory,
        string.Format("{0}.{1}", _indexVersionId, "pos"));

    var docTimer = Stopwatch.StartNew();

    foreach (var doc in _documents.ReadSource())
    {
        doc.Id = _count++;

        new DocumentUpsertOperation().Write(
            doc,
            _storeWriter,
            _analyzer,
            trieBuilder);
    }

    // _count already equals the number of documents written (post-increment above)
    Log.InfoFormat("stored {0} documents in {1}", _count, docTimer.Elapsed);

    var posTimer = Stopwatch.StartNew();
    var tries = trieBuilder.GetTries();

    using (var postingsWriter = new PostingsWriter(
        new FileStream(posFileName, FileMode.Append, FileAccess.Write, FileShare.ReadWrite)))
    {
        foreach (var trie in tries)
        {
            foreach (var node in trie.Value.EndOfWordNodes())
            {
                node.PostingsAddress = postingsWriter.Write(node.Postings);
            }

            if (Log.IsDebugEnabled)
            {
                foreach (var word in trie.Value.Words())
                {
                    Log.DebugFormat("{0}\t{1}", word.Value, word.Count);
                }
            }
        }
    }

    Log.InfoFormat(
        "stored postings refs in trees and wrote postings file in {0}",
        posTimer.Elapsed);

    var treeTimer = new Stopwatch();
    treeTimer.Start();

    SerializeTries(tries);

    Log.InfoFormat("serialized trees in {0}", treeTimer.Elapsed);

    return _indexVersionId;
}
public long Commit()
{
    var trieBuilder = new TrieBuilder();
    var docAddresses = new List<BlockInfo>();
    var analyzed = _analyzer.AnalyzeDocument(_document);

    foreach (var term in analyzed.Words.GroupBy(t => t.Term.Field))
    {
        trieBuilder.Add(term.Key, term.Select(t =>
        {
            var field = t.Term.Field;
            var token = t.Term.Word.Value;
            var posting = t.Posting;

            return new WordInfo(field, token, posting);
        }).ToList());
    }

    trieBuilder.CompleteAdding();

    var indexVersionId = Util.GetChronologicalFileId();
    var docFileName = Path.Combine(_directory, indexVersionId + ".rdoc");
    var docAddressesFn = Path.Combine(_directory, indexVersionId + ".da");

    BlockInfo adr;

    using (var docWriter = new DocumentWriter(
        new FileStream(docFileName, FileMode.Create, FileAccess.Write, FileShare.None, 4096, true),
        _compression))
    {
        adr = docWriter.Write(_document);
    }

    using (var docAddressWriter = new DocumentAddressWriter(
        new FileStream(docAddressesFn, FileMode.Create, FileAccess.Write, FileShare.None)))
    {
        docAddressWriter.Write(adr);
    }

    var tries = trieBuilder.GetTries();

    var tasks = new List<Task>
    {
        Task.Run(() =>
        {
            var posFileName = Path.Combine(_directory, string.Format("{0}.{1}", indexVersionId, "pos"));

            using (var postingsWriter = new PostingsWriter(new FileStream(posFileName, FileMode.Create, FileAccess.Write, FileShare.None)))
            {
                foreach (var trie in tries)
                {
                    foreach (var node in trie.Value.EndOfWordNodes())
                    {
                        node.PostingsAddress = postingsWriter.Write(node.Postings);
                    }

                    if (Log.IsDebugEnabled)
                    {
                        foreach (var word in trie.Value.Words())
                        {
                            Log.Debug(word);
                        }
                    }
                }
            }
        }),
        Task.Run(() =>
        {
            SerializeTries(tries, indexVersionId);
        }),
        Task.Run(() =>
        {
            var docHashesFileName = Path.Combine(_directory, string.Format("{0}.{1}", indexVersionId, "pk"));

            new DocHash[] { new DocHash(_primaryKeyHash) }.Serialize(docHashesFileName);
        })
    };

    Task.WaitAll(tasks.ToArray());

    new IxInfo
    {
        VersionId = indexVersionId,
        DocumentCount = 1,
        Compression = _compression
    }.Serialize(Path.Combine(_directory, indexVersionId + ".ix"));

    if (_compression > 0)
    {
        Log.Info("compression: true");
    }
    else
    {
        Log.Info("compression: false");
    }

    return indexVersionId;
}
public DocumentUpsertCommand(IWriteSession writeSession, IAnalyzer analyzer, TrieBuilder treeBuilder)
{
    _writeSession = writeSession;
    _analyzer = analyzer;
    _treeBuilder = treeBuilder;
}
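For context, this constructor is driven by the Write() method in the first snippet; the following is a minimal sketch of that call pattern, assuming the _writeSession, _analyzer, _documents and _count fields shown there.

// Sketch only: field names are taken from the Write() snippet above.
var trieBuilder = new TrieBuilder();
var upsert = new DocumentUpsertCommand(_writeSession, _analyzer, trieBuilder);

foreach (var doc in _documents.ReadSource())
{
    doc.Id = _count++;   // assign a sequential document id
    upsert.Write(doc);   // analyze the document and feed its terms into the trie builder
}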
public void BuildSmallTrieIsValid()
{
    //Arrange
    var smallWordList = new List<string> { "Apple", "Apples", "Ape", "Cat" };

    Mock<IDictionaryRepository> dictionaryRepo = new Mock<IDictionaryRepository>();
    dictionaryRepo.Setup(x => x.GetWords()).Returns(smallWordList);

    TrieBuilder trieBuilder = new TrieBuilder(dictionaryRepo.Object);

    var expectedResults = Factory.CreateTrieList();

    expectedResults.Add(new Trie
    {
        Value = 'A',
        ValidWord = false,
        Children = new List<ITrie>
        {
            new Trie
            {
                Value = 'P',
                ValidWord = false,
                Children = new List<ITrie>
                {
                    new Trie
                    {
                        Value = 'P',
                        ValidWord = false,
                        Children = new List<ITrie>
                        {
                            new Trie
                            {
                                Value = 'L',
                                ValidWord = false,
                                Children = new List<ITrie>
                                {
                                    new Trie
                                    {
                                        Value = 'E',
                                        ValidWord = true,
                                        Children = new List<ITrie>
                                        {
                                            new Trie
                                            {
                                                Value = 'S',
                                                ValidWord = true,
                                                Children = new List<ITrie>()
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    },
                    new Trie
                    {
                        Value = 'E',
                        ValidWord = true,
                        Children = new List<ITrie>()
                    }
                }
            }
        }
    });

    expectedResults.Add(new Trie
    {
        Value = 'C',
        ValidWord = false,
        Children = new List<ITrie>
        {
            new Trie
            {
                Value = 'A',
                ValidWord = false,
                Children = new List<ITrie>
                {
                    new Trie
                    {
                        Value = 'T',
                        ValidWord = true,
                        Children = new List<ITrie>()
                    }
                }
            }
        }
    });

    //Act
    var result = trieBuilder.BuildTrie();

    // only care that tries look the same.
    var resultString = Newtonsoft.Json.JsonConvert.SerializeObject(result);
    var expectedString = Newtonsoft.Json.JsonConvert.SerializeObject(expectedResults);

    //Assert
    Assert.AreEqual(expectedString, resultString);
}