Example #1
0
            private readonly PostingsWriter postingsWriter; // LUCENENET: marked readonly

            /// <summary>
            /// Creates a terms writer for one field, caching the two per-field flags
            /// (docs-only indexing and payload storage) read from <paramref name="fieldInfo"/>.
            /// </summary>
            /// <param name="outerInstance">The enclosing fields writer.</param>
            /// <param name="fieldInfo">Metadata of the field whose terms are written.</param>
            public PreFlexTermsWriter(PreFlexRWFieldsWriter outerInstance, FieldInfo fieldInfo)
            {
                this.outerInstance = outerInstance;

                // The postings writer is handed a reference to this instance, so it is created
                // at the same point as before: after outerInstance, before the per-field state.
                this.postingsWriter = new PostingsWriter(this);

                this.fieldInfo = fieldInfo;

                bool hasPayloads = fieldInfo.HasPayloads;
                bool docsOnly = (fieldInfo.IndexOptions == IndexOptions.DOCS_ONLY);

                this.storePayloads = hasPayloads;
                this.omitTF = docsOnly;
            }
            /// <summary>
            /// Creates a terms writer for <paramref name="field"/> that writes to
            /// <paramref name="out"/> and accumulates terms in a byte-keyed FST builder.
            /// </summary>
            /// <param name="out">Destination output.</param>
            /// <param name="field">Field whose terms are written.</param>
            /// <param name="doPackFST">Forwarded to the FST builder.</param>
            /// <param name="acceptableOverheadRatio">Forwarded to the FST builder.</param>
            public TermsWriter(IndexOutput @out, FieldInfo field, bool doPackFST, float acceptableOverheadRatio)
            {
                // Created first, exactly as before: it receives this instance while the
                // remaining fields are still unset.
                this.postingsWriter = new PostingsWriter(this);

                this.@out = @out;
                this.field = field;
                this.doPackFST = doPackFST;
                this.acceptableOverheadRatio = acceptableOverheadRatio;

                this.builder = new Builder<BytesRef>(
                    FST.INPUT_TYPE.BYTE1,
                    0,
                    0,
                    true,
                    true,
                    int.MaxValue,
                    outputs,
                    null,
                    doPackFST,
                    acceptableOverheadRatio,
                    true,
                    15);
            }
Example #3
0
        /// <summary>
        /// Creates a write session that also owns a postings writer backed by a scratch file.
        /// </summary>
        /// <param name="directory">Passed through to the base session.</param>
        /// <param name="compression">Passed through to the base session.</param>
        /// <param name="treeBuilder">Tree builder kept for later use by this session.</param>
        public FullTextWriteSession(string directory, Compression compression, TreeBuilder treeBuilder)
            : base(directory, compression)
        {
            _treeBuilder = treeBuilder;

            // Scratch postings file: random name with a ".pos" suffix, located in the process's
            // current working directory (not in `directory`).
            var scratchPath = Path.Combine(
                Directory.GetCurrentDirectory(),
                Path.GetRandomFileName() + ".pos");

            // CreateNew fails if the name already exists; DeleteOnClose removes the file
            // once the stream is closed.
            var scratchStream = new FileStream(
                scratchPath,
                FileMode.CreateNew,
                FileAccess.ReadWrite,
                FileShare.None,
                4096,
                FileOptions.DeleteOnClose);

            _postingsWriter = new PostingsWriter(scratchStream);
        }
Example #4
0
 /// <summary>
 /// Creates the postings writer for this instance, passing it a reference back to this object.
 /// </summary>
 internal void InitializeInstanceFields()
 {
     this.postingsWriter = new PostingsWriter(this);
 }
Example #5
0
        /// <summary>
        /// Reads all documents from the source, stores and analyzes them through a three-stage
        /// producer/consumer pipeline, then serializes postings, tries, document addresses,
        /// primary-key hashes and the index info file.
        /// </summary>
        /// <returns>
        /// 0 when no document was accepted from the source; otherwise the index version id.
        /// </returns>
        /// <remarks>
        /// Each pipeline stage marks its output collection as complete in a <c>finally</c> block.
        /// Without that, an exception in an upstream stage would leave the downstream stage
        /// blocked forever and <see cref="Task.WaitAll(Task[])"/> would never return.
        /// </remarks>
        public long Commit()
        {
            var docAddresses = new List<BlockInfo>();
            var pks          = new Dictionary<UInt64, object>();
            var ts           = new List<Task>();

            using (var words = new BlockingCollection<WordInfo>())
            using (var documents = new BlockingCollection<Document>())
            {
                // Stage 3: consume analyzed words and add them to the per-field tries.
                ts.Add(Task.Run(() =>
                {
                    var trieTimer = new Stopwatch();
                    trieTimer.Start();

                    // Ends normally once `words` is marked complete and drained, so a real
                    // InvalidOperationException from the trie code is no longer mistaken for
                    // "end of input".
                    foreach (var word in words.GetConsumingEnumerable())
                    {
                        GetTrie(word.Field)
                        .Add(word.Token, word.Posting);
                    }

                    Log.InfoFormat("Built tries in {0}", trieTimer.Elapsed);
                }));

                // Stage 2: analyze stored documents and forward each term as a WordInfo.
                ts.Add(Task.Run(() =>
                {
                    var analyzeTimer = new Stopwatch();
                    analyzeTimer.Start();

                    try
                    {
                        foreach (var doc in documents.GetConsumingEnumerable())
                        {
                            var analyzed = _analyzer.AnalyzeDocument(doc);

                            foreach (var term in analyzed.Words)
                            {
                                var field   = term.Term.Field;
                                var token   = term.Term.Word.Value;
                                var posting = term.Posting;

                                words.Add(new WordInfo(field, token, posting));
                            }
                        }
                    }
                    finally
                    {
                        // Always release stage 3, even when analysis fails.
                        words.CompleteAdding();
                    }

                    Log.InfoFormat("Analyzed {0} documents in {1}", pks.Count, analyzeTimer.Elapsed);
                }));

                // Stage 1: read the source, drop duplicate primary keys, assign ids and
                // write each accepted document to the .doc file.
                ts.Add(Task.Run(() =>
                {
                    var docWriterTimer = new Stopwatch();
                    docWriterTimer.Start();

                    var docFileName = Path.Combine(_directory, _indexVersionId + ".doc");

                    try
                    {
                        using (var docWriter = new DocumentWriter(
                                   new FileStream(docFileName, FileMode.Create, FileAccess.Write, FileShare.None), _compression))
                        {
                            foreach (var doc in ReadSource())
                            {
                                string pkVal;

                                if (_autoGeneratePk)
                                {
                                    pkVal = Guid.NewGuid().ToString();
                                }
                                else
                                {
                                    pkVal = doc.Fields.First(f => f.Key == _primaryKey).Value;
                                }

                                var hash = pkVal.ToHash();

                                if (pks.ContainsKey(hash))
                                {
                                    Log.WarnFormat("Found multiple occurrences of documents with pk value of {0} (id:{1}). Only first occurrence will be stored.",
                                                   pkVal, _docId);
                                }
                                else
                                {
                                    pks.Add(hash, null);

                                    doc.Id = _docId++;

                                    documents.Add(doc);

                                    var adr = docWriter.Write(doc);

                                    docAddresses.Add(adr);
                                }
                            }
                        }
                    }
                    finally
                    {
                        // Always release stage 2, even when reading or writing fails.
                        documents.CompleteAdding();
                    }

                    Log.InfoFormat("Serialized {0} documents in {1}", pks.Count, docWriterTimer.Elapsed);
                }));

                Task.WaitAll(ts.ToArray());
            }

            if (pks.Count == 0)
            {
                Log.Info("Aborted write (source is empty).");

                return 0;
            }

            var tasks = new List<Task>
            {
                // Postings first, then tries: writing postings assigns PostingsAddress on the
                // trie nodes, so trie serialization runs afterwards in the same task.
                Task.Run(() =>
                {
                    var postingsTimer = new Stopwatch();
                    postingsTimer.Start();

                    var posFileName = Path.Combine(_directory, string.Format("{0}.{1}", _indexVersionId, "pos"));
                    using (var postingsWriter = new PostingsWriter(new FileStream(posFileName, FileMode.Create, FileAccess.Write, FileShare.None)))
                    {
                        foreach (var trie in _tries)
                        {
                            foreach (var node in trie.Value.EndOfWordNodes())
                            {
                                node.PostingsAddress = postingsWriter.Write(node.Postings);
                            }

                            if (Log.IsDebugEnabled)
                            {
                                foreach (var word in trie.Value.Words())
                                {
                                    Log.Debug(word);
                                }
                            }
                        }
                    }
                    Log.InfoFormat("Serialized postings in {0}", postingsTimer.Elapsed);

                    var trieTimer = new Stopwatch();
                    trieTimer.Start();

                    SerializeTries();

                    Log.InfoFormat("Serialized tries in {0}", trieTimer.Elapsed);
                }),
                // Document addresses (.da), in the order the documents were written.
                Task.Run(() =>
                {
                    var docAdrTimer = new Stopwatch();
                    docAdrTimer.Start();

                    using (var docAddressWriter = new DocumentAddressWriter(new FileStream(Path.Combine(_directory, _indexVersionId + ".da"), FileMode.Create, FileAccess.Write, FileShare.None)))
                    {
                        foreach (var address in docAddresses)
                        {
                            docAddressWriter.Write(address);
                        }
                    }

                    Log.InfoFormat("Serialized doc addresses in {0}", docAdrTimer.Elapsed);
                }),
                // Primary-key hashes (.pk).
                Task.Run(() =>
                {
                    var docHasTimer = new Stopwatch();
                    docHasTimer.Start();

                    var docHashesFileName = Path.Combine(_directory, string.Format("{0}.{1}", _indexVersionId, "pk"));

                    pks.Keys.Select(h => new DocHash(h)).Serialize(docHashesFileName);

                    Log.InfoFormat("Serialized doc hashes in {0}", docHasTimer.Elapsed);
                })
            };

            Task.WaitAll(tasks.ToArray());

            CreateIxInfo().Serialize(Path.Combine(_directory, _indexVersionId + ".ix"));

            if (_compression > 0)
            {
                Log.Info("compression: true");
            }

            return _indexVersionId;
        }
Example #6
0
        /// <summary>
        /// Reads and stores all source documents, analyzes them into tries via a
        /// producer/consumer pair, then serializes postings, tries, primary-key hashes
        /// and the index info file.
        /// </summary>
        /// <returns>The index version id.</returns>
        /// <remarks>
        /// Completion of the document queue and of the trie builder is signalled from
        /// <c>finally</c> blocks so a failing stage cannot leave its consumer blocked.
        /// Tries are serialized only after the postings have been written, because writing
        /// postings assigns <c>PostingsAddress</c> on the same trie nodes.
        /// </remarks>
        public long Commit()
        {
            var ts          = new List<Task>();
            var trieBuilder = new TrieBuilder();

            using (var documentsToAnalyze = new BlockingCollection<Document>())
            {
                // Producer: store each document and its address, and queue it for analysis.
                ts.Add(Task.Run(() =>
                {
                    Log.Info("serializing documents");

                    var count        = 0;
                    var docFileName  = Path.Combine(_directory, _indexVersionId + ".rdoc");
                    var docAddressFn = Path.Combine(_directory, _indexVersionId + ".da");
                    var readTimer    = new Stopwatch();

                    readTimer.Start();

                    try
                    {
                        using (var docAddressWriter = new DocumentAddressWriter(new FileStream(docAddressFn, FileMode.Create, FileAccess.Write)))
                        using (var docWriter = new DocumentWriter(new FileStream(docFileName, FileMode.Create, FileAccess.Write), _compression))
                        {
                            foreach (var doc in ReadSourceAndAssignIdentifiers())
                            {
                                documentsToAnalyze.Add(doc);

                                var adr = docWriter.Write(doc);

                                docAddressWriter.Write(adr);

                                count++;
                            }
                        }
                    }
                    finally
                    {
                        // Always release the analyzer, even when reading or writing fails.
                        documentsToAnalyze.CompleteAdding();
                    }

                    Log.InfoFormat("serialized {0} documents in {1}", count, readTimer.Elapsed);
                }));

                // Consumer: analyze queued documents and feed the words, grouped by field,
                // to the trie builder.
                ts.Add(Task.Run(() =>
                {
                    var analyzeTimer = new Stopwatch();
                    analyzeTimer.Start();

                    Log.Info("analyzing");

                    var count = 0;

                    try
                    {
                        // Ends normally once the queue is complete and drained, so a real
                        // InvalidOperationException from analysis is no longer mistaken for
                        // "end of input".
                        foreach (var doc in documentsToAnalyze.GetConsumingEnumerable())
                        {
                            var analyzed = _analyzer.AnalyzeDocument(doc);

                            foreach (var term in analyzed.Words.GroupBy(t => t.Term.Field))
                            {
                                trieBuilder.Add(term.Key, term.Select(t =>
                                {
                                    var field   = t.Term.Field;
                                    var token   = t.Term.Word.Value;
                                    var posting = t.Posting;
                                    return new WordInfo(field, token, posting);
                                }).ToList());
                            }

                            count++;
                        }
                    }
                    finally
                    {
                        trieBuilder.CompleteAdding();
                    }

                    Log.InfoFormat("analyzed {0} documents in {1}", count, analyzeTimer.Elapsed);
                }));

                Task.WaitAll(ts.ToArray());
            }

            var tries = trieBuilder.GetTries();

            var tasks = new List<Task>
            {
                // Postings, then tries, sequentially in one task: the postings loop mutates
                // the trie nodes, so it must not overlap with trie serialization.
                Task.Run(() =>
                {
                    var postingsTimer = new Stopwatch();
                    postingsTimer.Start();

                    Log.Info("serializing postings");

                    var posFileName = Path.Combine(_directory, string.Format("{0}.{1}", _indexVersionId, "pos"));
                    using (var postingsWriter = new PostingsWriter(new FileStream(posFileName, FileMode.Create, FileAccess.Write, FileShare.None)))
                    {
                        foreach (var trie in tries)
                        {
                            foreach (var node in trie.Value.EndOfWordNodes())
                            {
                                node.PostingsAddress = postingsWriter.Write(node.Postings);
                            }

                            if (Log.IsDebugEnabled)
                            {
                                foreach (var word in trie.Value.Words())
                                {
                                    Log.Debug(word);
                                }
                            }
                        }
                    }

                    Log.InfoFormat("serialized postings in {0}", postingsTimer.Elapsed);

                    var trieTimer = new Stopwatch();
                    trieTimer.Start();

                    Log.Info("serializing tries");

                    SerializeTries(tries);

                    Log.InfoFormat("serialized tries in {0}", trieTimer.Elapsed);
                }),
                // Primary-key hashes (.pk); independent of the tries.
                Task.Run(() =>
                {
                    var docHasTimer = new Stopwatch();
                    docHasTimer.Start();

                    Log.Info("serializing doc hashes");

                    var docHashesFileName = Path.Combine(_directory, string.Format("{0}.{1}", _indexVersionId, "pk"));

                    _primaryKeys.Keys.Select(h => new DocHash(h)).Serialize(docHashesFileName);

                    Log.InfoFormat("serialized doc hashes in {0}", docHasTimer.Elapsed);
                })
            };

            Task.WaitAll(tasks.ToArray());

            CreateIxInfo().Serialize(Path.Combine(_directory, _indexVersionId + ".ix"));

            if (_compression > 0)
            {
                Log.Info("compression: true");
            }
            else
            {
                Log.Info("compression: false");
            }

            return _indexVersionId;
        }
Example #7
0
        /// <summary>
        /// Stores every document from the document stream, appends the resulting postings to
        /// the ".pos" file and serializes the tries.
        /// </summary>
        /// <returns>The index version id.</returns>
        /// <remarks>
        /// Returns immediately when <c>_committed</c> is already set. This method does not set
        /// that flag itself.
        /// </remarks>
        public long Write()
        {
            if (_committed)
            {
                return _indexVersionId;
            }

            var trieBuilder = new TrieBuilder();
            var posFileName = Path.Combine(
                _directory, string.Format("{0}.{1}", _indexVersionId, "pos"));

            var docTimer = Stopwatch.StartNew();

            // Counts the documents handled by this call. The previous log value, `_count + 1`,
            // over-reported by one because `_count` is already incremented past the last id
            // assigned (an empty source was logged as one stored document).
            var stored = 0;

            foreach (var doc in _documents.ReadSource())
            {
                doc.Id = _count++;

                new DocumentUpsertOperation().Write(
                    doc,
                    _storeWriter,
                    _analyzer,
                    trieBuilder);

                stored++;
            }

            Log.InfoFormat("stored {0} documents in {1}", stored, docTimer.Elapsed);

            var posTimer = Stopwatch.StartNew();

            var tries = trieBuilder.GetTries();

            // Postings are appended; each end-of-word node records where its postings landed.
            using (var postingsWriter = new PostingsWriter(
                       new FileStream(posFileName, FileMode.Append, FileAccess.Write, FileShare.ReadWrite)))
            {
                foreach (var trie in tries)
                {
                    foreach (var node in trie.Value.EndOfWordNodes())
                    {
                        node.PostingsAddress = postingsWriter.Write(node.Postings);
                    }

                    if (Log.IsDebugEnabled)
                    {
                        foreach (var word in trie.Value.Words())
                        {
                            Log.DebugFormat("{0}\t{1}", word.Value, word.Count);
                        }
                    }
                }
            }

            Log.InfoFormat(
                "stored postings refs in trees and wrote postings file in {0}",
                posTimer.Elapsed);

            var treeTimer = Stopwatch.StartNew();

            // Runs after the postings loop so the nodes already carry their PostingsAddress.
            SerializeTries(tries);

            Log.InfoFormat("serialized trees in {0}", treeTimer.Elapsed);

            return _indexVersionId;
        }
Example #8
0
        /// <summary>
        /// Opens the files needed for an upsert batch: the compound ".rdb" data file, a write
        /// session on top of it, and a delete-on-close ".pos" postings file.
        /// </summary>
        /// <param name="directory">Index directory.</param>
        /// <param name="analyzer">Analyzer kept for use by this transaction.</param>
        /// <param name="compression">Compression setting recorded in the batch info.</param>
        /// <param name="documents">Source documents; also supplies the primary-key field name.</param>
        /// <param name="storeWriterFactory">
        /// Optional write-session factory; a <c>WriteSessionFactory</c> is created when null.
        /// </param>
        /// <remarks>
        /// If any step fails, the streams opened so far (postings file, compound file, lock
        /// file) are closed before the exception is rethrown, so a failed construction does
        /// not keep file handles, and in particular the write lock file, open.
        /// </remarks>
        public UpsertTransaction(
            string directory,
            IAnalyzer analyzer,
            Compression compression,
            DocumentStream documents,
            IWriteSessionFactory storeWriterFactory = null)
        {
            long version = Util.GetNextChronologicalFileId();

            Log.InfoFormat("begin writing {0}", version);

            FileStream lockFile       = null;
            FileStream postingsStream = null;

            try
            {
                if (!Util.TryAquireWriteLock(directory, out lockFile))
                {
                    // Lock not acquired: write into a brand-new compound file named after
                    // this batch's version.
                    var compoundFileName = Path.Combine(directory, version + ".rdb");

                    _compoundFile = new FileStream(
                        compoundFileName,
                        FileMode.CreateNew,
                        FileAccess.Write,
                        FileShare.ReadWrite,
                        4096
                        );
                }
                else
                {
                    // Lock acquired: append to the compound file of the first index file
                    // returned, or to this version's file when there is none.
                    var  ixFileName = Util.GetIndexFileNamesInChronologicalOrder(directory).FirstOrDefault();
                    long dataFileVersion;

                    if (ixFileName == null)
                    {
                        dataFileVersion = version;
                    }
                    else
                    {
                        dataFileVersion = long.Parse(Path.GetFileNameWithoutExtension(ixFileName));
                    }

                    var compoundFileName = Path.Combine(directory, dataFileVersion + ".rdb");

                    _compoundFile = new FileStream(
                        compoundFileName,
                        FileMode.Append,
                        FileAccess.Write,
                        FileShare.ReadWrite,
                        4096
                        );

                    _lockFile = lockFile;
                }

                _directory = directory;
                _analyzer  = analyzer;
                _documents = documents;

                _ix = new BatchInfo
                {
                    VersionId           = version,
                    Compression         = compression,
                    PrimaryKeyFieldName = documents.PrimaryKeyFieldName
                };

                var posFileName = Path.Combine(
                    _directory, string.Format("{0}.{1}", _ix.VersionId, "pos"));

                var factory = storeWriterFactory ?? new WriteSessionFactory(directory, _ix);

                _writeSession = factory.OpenWriteSession(_compoundFile);

                postingsStream = new FileStream(
                    posFileName,
                    FileMode.CreateNew,
                    FileAccess.ReadWrite,
                    FileShare.None,
                    4096,
                    FileOptions.DeleteOnClose
                    );

                _postingsWriter = new PostingsWriter(postingsStream);
            }
            catch
            {
                // Construction failed: close whatever was opened, then let the original
                // exception propagate unchanged.
                // NOTE(review): _writeSession is not cleaned up here because its type is not
                // visible; if it is disposable, dispose it here as well. Likewise, if releasing
                // the write lock needs more than closing the handle, do that here.
                if (postingsStream != null)
                {
                    postingsStream.Dispose();
                }

                if (_compoundFile != null)
                {
                    _compoundFile.Dispose();
                }

                if (lockFile != null)
                {
                    lockFile.Dispose();
                }

                throw;
            }
        }
Example #9
0
        /// <summary>
        /// Writes a single-document index: analyzes the document into tries, stores the
        /// document and its address, then serializes postings, tries, the primary-key hash
        /// and the index info file under a newly generated version id.
        /// </summary>
        /// <returns>The version id generated for this index.</returns>
        /// <remarks>
        /// Tries are serialized only after the postings have been written, because writing
        /// postings assigns <c>PostingsAddress</c> on the same trie nodes; running both in
        /// parallel would race on those nodes.
        /// </remarks>
        public long Commit()
        {
            var trieBuilder = new TrieBuilder();

            var analyzed = _analyzer.AnalyzeDocument(_document);

            // Feed the analyzed words to the trie builder, one batch per field.
            foreach (var term in analyzed.Words.GroupBy(t => t.Term.Field))
            {
                trieBuilder.Add(term.Key, term.Select(t =>
                {
                    var field   = t.Term.Field;
                    var token   = t.Term.Word.Value;
                    var posting = t.Posting;
                    return new WordInfo(field, token, posting);
                }).ToList());
            }

            trieBuilder.CompleteAdding();

            var indexVersionId = Util.GetChronologicalFileId();
            var docFileName    = Path.Combine(_directory, indexVersionId + ".rdoc");
            var docAddressesFn = Path.Combine(_directory, indexVersionId + ".da");

            BlockInfo adr;

            using (var docWriter = new DocumentWriter(
                       new FileStream(docFileName, FileMode.Create, FileAccess.Write, FileShare.None, 4096, true), _compression))
            {
                adr = docWriter.Write(_document);
            }

            using (var docAddressWriter = new DocumentAddressWriter(
                       new FileStream(docAddressesFn, FileMode.Create, FileAccess.Write, FileShare.None)))
            {
                docAddressWriter.Write(adr);
            }

            var tries = trieBuilder.GetTries();

            var tasks = new List<Task>
            {
                // Postings, then tries, sequentially in one task (see remarks).
                Task.Run(() =>
                {
                    var posFileName = Path.Combine(_directory, string.Format("{0}.{1}", indexVersionId, "pos"));
                    using (var postingsWriter = new PostingsWriter(new FileStream(posFileName, FileMode.Create, FileAccess.Write, FileShare.None)))
                    {
                        foreach (var trie in tries)
                        {
                            foreach (var node in trie.Value.EndOfWordNodes())
                            {
                                node.PostingsAddress = postingsWriter.Write(node.Postings);
                            }

                            if (Log.IsDebugEnabled)
                            {
                                foreach (var word in trie.Value.Words())
                                {
                                    Log.Debug(word);
                                }
                            }
                        }
                    }

                    SerializeTries(tries, indexVersionId);
                }),
                // Primary-key hash (.pk); independent of the tries.
                Task.Run(() =>
                {
                    var docHashesFileName = Path.Combine(_directory, string.Format("{0}.{1}", indexVersionId, "pk"));

                    new DocHash[] { new DocHash(_primaryKeyHash) }.Serialize(docHashesFileName);
                })
            };

            Task.WaitAll(tasks.ToArray());

            new IxInfo
            {
                VersionId     = indexVersionId,
                DocumentCount = 1,
                Compression   = _compression
            }.Serialize(Path.Combine(_directory, indexVersionId + ".ix"));

            if (_compression > 0)
            {
                Log.Info("compression: true");
            }
            else
            {
                Log.Info("compression: false");
            }

            return indexVersionId;
        }
 /// <summary>
 /// Creates the postings writer for this instance, passing it a reference back to this object.
 /// Declared virtual, so a derived type may replace this initialization.
 /// </summary>
 internal virtual void InitializeInstanceFields()
 {
     this.postingsWriter = new PostingsWriter(this);
 }