Beispiel #1
0
        /// <summary>
        /// Stores every document from the source, writes the postings of each
        /// trie word, serializes the tries into the compound file and finally
        /// appends the postings stream to that same file.
        /// </summary>
        /// <returns>The version id held by <c>_ix</c>.</returns>
        public long Write()
        {
            // Nothing to do when this writer is already flagged as committed.
            if (_committed)
            {
                return _ix.VersionId;
            }

            var builder        = new TrieBuilder();
            var documentsWatch = Stopwatch.StartNew();
            var command        = new DocumentUpsertCommand(_writeSession, _analyzer, builder);

            foreach (var document in _documents.ReadSource())
            {
                // Ids are handed out sequentially from the running counter.
                var nextId = _count++;
                document.Id = nextId;

                command.Write(document);
            }

            Log.InfoFormat("stored {0} documents in {1}", _count, documentsWatch.Elapsed);

            var postingsWatch = Stopwatch.StartNew();
            var tries         = builder.GetTries();

            foreach (var entry in tries)
            {
                // Write the postings of each end-of-word node and remember where
                // they landed, so the address is available on the node afterwards.
                foreach (var endNode in entry.Value.EndOfWordNodes())
                {
                    var address = _postingsWriter.Write(endNode.Postings);
                    endNode.PostingsAddress = address;
                }

                if (!Log.IsDebugEnabled)
                {
                    continue;
                }

                foreach (var word in entry.Value.Words())
                {
                    Log.DebugFormat("{0}\t{1}", word.Value, word.Postings.Count);
                }
            }

            Log.InfoFormat(
                "stored postings refs in trees in {0}",
                postingsWatch.Elapsed);

            var triesWatch = Stopwatch.StartNew();

            _ix.FieldOffsets = SerializeTries(tries, _compoundFile);

            Log.InfoFormat("serialized trees in {0}", triesWatch.Elapsed);

            // The postings start where the compound file currently ends; rewind
            // the postings stream and copy all of it behind the tries.
            _ix.PostingsOffset = _compoundFile.Position;

            var postingsStream = _postingsWriter.Stream;
            postingsStream.Flush();
            postingsStream.Position = 0;
            postingsStream.CopyTo(_compoundFile);

            _ix.DocumentCount = _count;

            return _ix.VersionId;
        }
Beispiel #2
0
        /// <summary>
        /// Builds a trie from an unordered set of strings, where a node may be a
        /// child of another when it starts with the parent's value, and asserts
        /// the resulting tree against a hand-built expectation.
        /// </summary>
        public void when_()
        {
            var builder = new TrieBuilder<string>(
                rootNodeDataFactory: () => "",
                nodeDataMerger: candidates => candidates.First(),
                nodeDataComparor: (left, right) => left.CompareTo(right),
                nodeDataHasher: value => value.GetHashCode(),
                canParentAcceptChildChecker: (parent, child) => child.StartsWith(parent),
                canNodeHaveChildrenChecker: value => true);

            // Deliberately not in tree order.
            var input = new[] { "abi", "efg*", "a", "abc", "abc*", "abcd" };

            // Expected tree, assembled from the leaves upwards:
            //   "" -> "a" -> "abc" -> "abc*", "abcd"
            //             -> "abi"
            //      -> "efg*"
            var abcStar = new TestNode { Key = "abc*" };
            var abcd    = new TestNode { Key = "abcd" };
            var abi     = new TestNode { Key = "abi" };
            var efgStar = new TestNode { Key = "efg*" };

            var abc = new TestNode
            {
                Key      = "abc",
                Children = new[] { abcStar, abcd }
            };

            var a = new TestNode
            {
                Key      = "a",
                Children = new[] { abc, abi }
            };

            var expectedRoot = new TestNode
            {
                Key      = "",
                Children = new[] { a, efgStar }
            };

            var actual = builder.CreateTrie(input);

            AssertTree(expectedRoot, actual);
        }
Beispiel #3
0
        /// <summary>
        /// Analyzes a document, adds each analyzed word to the trie builder and
        /// then writes the document through the store writer.
        /// </summary>
        /// <param name="document">The document to analyze and store.</param>
        /// <param name="storeWriter">Writer the document is passed to after analysis.</param>
        /// <param name="analyzer">Analyzer that produces the document's words.</param>
        /// <param name="trieBuilder">Builder that receives one <c>WordInfo</c> per analyzed word.</param>
        public void Write(
            Document document,
            IDocumentStoreWriter storeWriter,
            IAnalyzer analyzer,
            TrieBuilder trieBuilder)
        {
            var analysis = analyzer.AnalyzeDocument(document);

            foreach (var entry in analysis.Words)
            {
                // field, token text and posting, in that order
                trieBuilder.Add(
                    new WordInfo(entry.Term.Field, entry.Term.Word.Value, entry.Posting));
            }

            storeWriter.Write(document);
        }
Beispiel #4
0
        /// <summary>
        /// Solves a 4x4 board using the real dictionary repository and compares
        /// the words found with the expected solution.
        /// </summary>
        public void Puzzle3Test()
        {
            // Arrange
            IDictionaryRepository repository = new DictionaryRepository();
            ITrieBuilder builder = new TrieBuilder(repository);
            var service = new BoggleWordSearchService(builder);

            var letters       = "FVKOWQIMEZTLRPBE";
            var expectedWords = Solution3Results();

            // Act
            HashSet<string> foundWords = service.FindWordsInBoggle(letters, 4, 4);

            // The word sets are compared through their JSON form. Only the found
            // words are sorted here; the expected words are serialized in the
            // order Solution3Results() returns them.
            var sortedFound  = foundWords.OrderBy(word => word);
            var actualJson   = Newtonsoft.Json.JsonConvert.SerializeObject(sortedFound);
            var expectedJson = Newtonsoft.Json.JsonConvert.SerializeObject(expectedWords);

            // Assert
            Assert.AreEqual(expectedJson, actualJson);
        }
Beispiel #5
0
        /// <summary>
        /// Solves a 5x5 board against a mocked dictionary that contains exactly
        /// the expected solution words, and checks that all of them are found.
        /// </summary>
        public void Puzzle5By5Test()
        {
            // Arrange
            var repositoryMock = new Mock<IDictionaryRepository>();

            // The mocked dictionary hands out the solution words as a list.
            repositoryMock
                .Setup(repo => repo.GetWords())
                .Returns(Solution5By5Results().ToList);

            ITrieBuilder builder = new TrieBuilder(repositoryMock.Object);
            var service = new BoggleWordSearchService(builder);

            var letters       = "sabtebralnuetoenrkeseqnss";
            var expectedWords = Solution5By5Results();

            // Act
            HashSet<string> foundWords = service.FindWordsInBoggle(letters, 5, 5);

            // Both sides are sorted and compared through their JSON form, so
            // only the set of words matters, not the order they were produced in.
            var sortedFound    = foundWords.OrderBy(word => word);
            var sortedExpected = expectedWords.OrderBy(word => word);
            var actualJson     = Newtonsoft.Json.JsonConvert.SerializeObject(sortedFound);
            var expectedJson   = Newtonsoft.Json.JsonConvert.SerializeObject(sortedExpected);

            // Assert
            Assert.AreEqual(expectedJson, actualJson);
        }
Beispiel #6
0
        /// <summary>
        /// Writes a complete index version to <c>_directory</c>: documents and
        /// their addresses (.rdoc/.da), postings (.pos), tries, primary key
        /// hashes (.pk) and the index info (.ix).
        /// </summary>
        /// <remarks>
        /// Document serialization and analysis run as a producer/consumer pair
        /// connected by a <see cref="BlockingCollection{T}"/>. Postings, tries
        /// and doc hashes are then serialized by three concurrent tasks.
        /// </remarks>
        /// <returns>The value of <c>_indexVersionId</c>.</returns>
        /// <exception cref="AggregateException">
        /// Thrown by <see cref="Task.WaitAll(Task[])"/> when any of the tasks fails.
        /// </exception>
        public long Commit()
        {
            var trieBuilder = new TrieBuilder();

            using (var documentsToAnalyze = new BlockingCollection<Document>())
            {
                // Producer: serializes each document and queues it for analysis.
                var serializeTask = Task.Run(() =>
                {
                    Log.Info("serializing documents");

                    var count        = 0;
                    var docFileName  = Path.Combine(_directory, _indexVersionId + ".rdoc");
                    var docAddressFn = Path.Combine(_directory, _indexVersionId + ".da");
                    var readTimer    = Stopwatch.StartNew();

                    try
                    {
                        using (var docAddressWriter = new DocumentAddressWriter(new FileStream(docAddressFn, FileMode.Create, FileAccess.Write)))
                        using (var docWriter = new DocumentWriter(new FileStream(docFileName, FileMode.Create, FileAccess.Write), _compression))
                        {
                            foreach (var doc in ReadSourceAndAssignIdentifiers())
                            {
                                documentsToAnalyze.Add(doc);

                                var adr = docWriter.Write(doc);

                                docAddressWriter.Write(adr);

                                count++;
                            }
                        }
                    }
                    finally
                    {
                        // Must run even when reading or writing fails: without it
                        // the analyzing task would wait for more documents forever
                        // and Task.WaitAll below would never return.
                        documentsToAnalyze.CompleteAdding();
                    }

                    Log.InfoFormat("serialized {0} documents in {1}", count, readTimer.Elapsed);
                });

                // Consumer: analyzes queued documents and feeds the trie builder.
                var analyzeTask = Task.Run(() =>
                {
                    var analyzeTimer = Stopwatch.StartNew();

                    Log.Info("analyzing");

                    var count = 0;

                    // Blocks while the queue is empty and ends once the producer has
                    // called CompleteAdding and the queue is drained. Unlike catching
                    // InvalidOperationException from Take(), this cannot mistake an
                    // InvalidOperationException raised by the analyzer or the trie
                    // builder for "no more documents".
                    foreach (var doc in documentsToAnalyze.GetConsumingEnumerable())
                    {
                        var analyzed = _analyzer.AnalyzeDocument(doc);

                        foreach (var term in analyzed.Words.GroupBy(t => t.Term.Field))
                        {
                            trieBuilder.Add(term.Key, term.Select(t =>
                            {
                                var field   = t.Term.Field;
                                var token   = t.Term.Word.Value;
                                var posting = t.Posting;
                                return new WordInfo(field, token, posting);
                            }).ToList());
                        }

                        count++;
                    }

                    // Done
                    trieBuilder.CompleteAdding();

                    Log.InfoFormat("analyzed {0} documents in {1}", count, analyzeTimer.Elapsed);
                });

                Task.WaitAll(serializeTask, analyzeTask);
            }

            var tries = trieBuilder.GetTries();

            // NOTE(review): the first task assigns node.PostingsAddress while the
            // second one serializes the same tries concurrently. If SerializeTries
            // persists PostingsAddress, these two need to run one after the other
            // - confirm against SerializeTries.
            var tasks = new List<Task>
            {
                Task.Run(() =>
                {
                    var postingsTimer = Stopwatch.StartNew();

                    Log.Info("serializing postings");

                    var posFileName = Path.Combine(_directory, string.Format("{0}.{1}", _indexVersionId, "pos"));
                    using (var postingsWriter = new PostingsWriter(new FileStream(posFileName, FileMode.Create, FileAccess.Write, FileShare.None)))
                    {
                        foreach (var trie in tries)
                        {
                            foreach (var node in trie.Value.EndOfWordNodes())
                            {
                                node.PostingsAddress = postingsWriter.Write(node.Postings);
                            }

                            if (Log.IsDebugEnabled)
                            {
                                foreach (var word in trie.Value.Words())
                                {
                                    Log.Debug(word);
                                }
                            }
                        }
                    }

                    Log.InfoFormat("serialized postings in {0}", postingsTimer.Elapsed);
                }),
                Task.Run(() =>
                {
                    var trieTimer = Stopwatch.StartNew();

                    Log.Info("serializing tries");

                    SerializeTries(tries);

                    Log.InfoFormat("serialized tries in {0}", trieTimer.Elapsed);
                }),
                Task.Run(() =>
                {
                    var docHasTimer = Stopwatch.StartNew();

                    Log.Info("serializing doc hashes");

                    var docHashesFileName = Path.Combine(_directory, string.Format("{0}.{1}", _indexVersionId, "pk"));

                    _primaryKeys.Keys.Select(h => new DocHash(h)).Serialize(docHashesFileName);

                    Log.InfoFormat("serialized doc hashes in {0}", docHasTimer.Elapsed);
                })
            };

            Task.WaitAll(tasks.ToArray());

            CreateIxInfo().Serialize(Path.Combine(_directory, _indexVersionId + ".ix"));

            if (_compression > 0)
            {
                Log.Info("compression: true");
            }
            else
            {
                Log.Info("compression: false");
            }

            return _indexVersionId;
        }
Beispiel #7
0
        /// <summary>
        /// Stores every document from the source, appends the postings of each
        /// trie word to the version's ".pos" file and then serializes the tries.
        /// </summary>
        /// <returns>The value of <c>_indexVersionId</c>.</returns>
        public long Write()
        {
            // Nothing to do when this writer is already flagged as committed.
            if (_committed)
            {
                return _indexVersionId;
            }

            var trieBuilder = new TrieBuilder();
            var posFileName = Path.Combine(
                _directory, string.Format("{0}.{1}", _indexVersionId, "pos"));

            var docTimer = Stopwatch.StartNew();

            foreach (var doc in _documents.ReadSource())
            {
                doc.Id = _count++;

                new DocumentUpsertOperation().Write(
                    doc,
                    _storeWriter,
                    _analyzer,
                    trieBuilder);
            }

            // _count is post-incremented once per stored document, so it is already
            // one past the last assigned id; adding 1 here over-reported by one.
            Log.InfoFormat("stored {0} documents in {1}", _count, docTimer.Elapsed);

            var posTimer = Stopwatch.StartNew();

            var tries = trieBuilder.GetTries();

            using (var postingsWriter = new PostingsWriter(
                       new FileStream(posFileName, FileMode.Append, FileAccess.Write, FileShare.ReadWrite)))
            {
                foreach (var trie in tries)
                {
                    // Write each word's postings and keep the returned address on
                    // the node before the tries are serialized below.
                    foreach (var node in trie.Value.EndOfWordNodes())
                    {
                        node.PostingsAddress = postingsWriter.Write(node.Postings);
                    }

                    if (Log.IsDebugEnabled)
                    {
                        foreach (var word in trie.Value.Words())
                        {
                            Log.DebugFormat("{0}\t{1}", word.Value, word.Count);
                        }
                    }
                }
            }

            Log.InfoFormat(
                "stored postings refs in trees and wrote postings file in {0}",
                posTimer.Elapsed);

            var treeTimer = Stopwatch.StartNew();

            SerializeTries(tries);

            Log.InfoFormat("serialized trees in {0}", treeTimer.Elapsed);

            return _indexVersionId;
        }
Beispiel #8
0
        /// <summary>
        /// Writes a new index version that contains only <c>_document</c>: the
        /// document and its address (.rdoc/.da), postings (.pos), tries, the
        /// primary key hash (.pk) and the index info (.ix).
        /// </summary>
        /// <returns>The id generated for the new index version.</returns>
        /// <exception cref="AggregateException">
        /// Thrown by <see cref="Task.WaitAll(Task[])"/> when any of the
        /// serialization tasks fails.
        /// </exception>
        public long Commit()
        {
            var trieBuilder = new TrieBuilder();

            var analyzed = _analyzer.AnalyzeDocument(_document);

            // One Add call per field, carrying all of that field's words.
            foreach (var term in analyzed.Words.GroupBy(t => t.Term.Field))
            {
                trieBuilder.Add(term.Key, term.Select(t =>
                {
                    var field   = t.Term.Field;
                    var token   = t.Term.Word.Value;
                    var posting = t.Posting;
                    return new WordInfo(field, token, posting);
                }).ToList());
            }

            trieBuilder.CompleteAdding();

            var indexVersionId = Util.GetChronologicalFileId();
            var docFileName    = Path.Combine(_directory, indexVersionId + ".rdoc");
            var docAddressesFn = Path.Combine(_directory, indexVersionId + ".da");

            BlockInfo adr;

            using (var docWriter = new DocumentWriter(
                       new FileStream(docFileName, FileMode.Create, FileAccess.Write, FileShare.None, 4096, true), _compression))
            {
                adr = docWriter.Write(_document);
            }

            using (var docAddressWriter = new DocumentAddressWriter(
                       new FileStream(docAddressesFn, FileMode.Create, FileAccess.Write, FileShare.None)))
            {
                docAddressWriter.Write(adr);
            }

            var tries = trieBuilder.GetTries();

            // NOTE(review): the first task assigns node.PostingsAddress while the
            // second one serializes the same tries concurrently. If SerializeTries
            // persists PostingsAddress, these two need to run one after the other
            // - confirm against SerializeTries.
            var tasks = new List<Task>
            {
                Task.Run(() =>
                {
                    var posFileName = Path.Combine(_directory, string.Format("{0}.{1}", indexVersionId, "pos"));
                    using (var postingsWriter = new PostingsWriter(new FileStream(posFileName, FileMode.Create, FileAccess.Write, FileShare.None)))
                    {
                        foreach (var trie in tries)
                        {
                            foreach (var node in trie.Value.EndOfWordNodes())
                            {
                                node.PostingsAddress = postingsWriter.Write(node.Postings);
                            }

                            if (Log.IsDebugEnabled)
                            {
                                foreach (var word in trie.Value.Words())
                                {
                                    Log.Debug(word);
                                }
                            }
                        }
                    }
                }),
                Task.Run(() =>
                {
                    SerializeTries(tries, indexVersionId);
                }),
                Task.Run(() =>
                {
                    var docHashesFileName = Path.Combine(_directory, string.Format("{0}.{1}", indexVersionId, "pk"));

                    new DocHash[] { new DocHash(_primaryKeyHash) }.Serialize(docHashesFileName);
                })
            };

            Task.WaitAll(tasks.ToArray());

            new IxInfo
            {
                VersionId     = indexVersionId,
                DocumentCount = 1,
                Compression   = _compression
            }.Serialize(Path.Combine(_directory, indexVersionId + ".ix"));

            Log.Info(_compression > 0 ? "compression: true" : "compression: false");

            return indexVersionId;
        }
Beispiel #9
0
 /// <summary>
 /// Creates an upsert command that keeps references to the given write
 /// session, analyzer and trie builder.
 /// </summary>
 /// <param name="writeSession">Stored in <c>_writeSession</c>.</param>
 /// <param name="analyzer">Stored in <c>_analyzer</c>.</param>
 /// <param name="treeBuilder">Stored in <c>_treeBuilder</c>.</param>
 public DocumentUpsertCommand(IWriteSession writeSession, IAnalyzer analyzer, TrieBuilder treeBuilder)
 {
     this._treeBuilder  = treeBuilder;
     this._analyzer     = analyzer;
     this._writeSession = writeSession;
 }
Beispiel #10
0
        /// <summary>
        /// Builds a trie from the words "Apple", "Apples", "Ape" and "Cat"
        /// (served by a mocked dictionary) and compares its JSON form with a
        /// hand-built expectation.
        /// </summary>
        public void BuildSmallTrieIsValid()
        {
            // Arrange
            var words = new List<string> { "Apple", "Apples", "Ape", "Cat" };

            var repositoryMock = new Mock<IDictionaryRepository>();
            repositoryMock.Setup(repo => repo.GetWords()).Returns(words);

            var builder  = new TrieBuilder(repositoryMock.Object);
            var expected = Factory.CreateTrieList();

            // Expected "A" branch, assembled from the leaves upwards:
            //   A - P - P - L - E(word) - S(word)
            //         \ E(word)
            var applesS = new Trie
            {
                Value     = 'S',
                ValidWord = true,
                Children  = new List<ITrie>()
            };
            var appleE = new Trie
            {
                Value     = 'E',
                ValidWord = true,
                Children  = new List<ITrie> { applesS }
            };
            var applL = new Trie
            {
                Value     = 'L',
                ValidWord = false,
                Children  = new List<ITrie> { appleE }
            };
            var appP = new Trie
            {
                Value     = 'P',
                ValidWord = false,
                Children  = new List<ITrie> { applL }
            };
            var apeE = new Trie
            {
                Value     = 'E',
                ValidWord = true,
                Children  = new List<ITrie>()
            };
            var apP = new Trie
            {
                Value     = 'P',
                ValidWord = false,
                Children  = new List<ITrie> { appP, apeE }
            };
            var rootA = new Trie
            {
                Value     = 'A',
                ValidWord = false,
                Children  = new List<ITrie> { apP }
            };

            // Expected "C" branch: C - A - T(word)
            var catT = new Trie
            {
                Value     = 'T',
                ValidWord = true,
                Children  = new List<ITrie>()
            };
            var caA = new Trie
            {
                Value     = 'A',
                ValidWord = false,
                Children  = new List<ITrie> { catT }
            };
            var rootC = new Trie
            {
                Value     = 'C',
                ValidWord = false,
                Children  = new List<ITrie> { caA }
            };

            expected.Add(rootA);
            expected.Add(rootC);

            // Act
            var actual = builder.BuildTrie();

            // The tries are compared through their JSON form, so only the
            // serialized shape has to match.
            var actualJson   = Newtonsoft.Json.JsonConvert.SerializeObject(actual);
            var expectedJson = Newtonsoft.Json.JsonConvert.SerializeObject(expected);

            // Assert
            Assert.AreEqual(expectedJson, actualJson);
        }