private readonly PostingsWriter postingsWriter; // LUCENENET: marked readonly

public PreFlexTermsWriter(PreFlexRWFieldsWriter outerInstance, FieldInfo fieldInfo)
{
    this.outerInstance = outerInstance;
    postingsWriter = new PostingsWriter(this);
    this.fieldInfo = fieldInfo;
    omitTF = fieldInfo.IndexOptions == IndexOptions.DOCS_ONLY;
    storePayloads = fieldInfo.HasPayloads;
}
public TermsWriter(IndexOutput @out, FieldInfo field, bool doPackFST, float acceptableOverheadRatio)
{
    postingsWriter = new PostingsWriter(this);
    this.@out = @out;
    this.field = field;
    this.doPackFST = doPackFST;
    this.acceptableOverheadRatio = acceptableOverheadRatio;
    builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, int.MaxValue,
        outputs, null, doPackFST, acceptableOverheadRatio, true, 15);
}
public FullTextWriteSession(string directory, Compression compression, TreeBuilder treeBuilder)
    : base(directory, compression)
{
    _treeBuilder = treeBuilder;

    _postingsWriter = new PostingsWriter(
        new FileStream(
            Path.Combine(Directory.GetCurrentDirectory(), Path.GetRandomFileName() + ".pos"),
            FileMode.CreateNew,
            FileAccess.ReadWrite,
            FileShare.None,
            4096,
            FileOptions.DeleteOnClose));
}
internal void InitializeInstanceFields()
{
    postingsWriter = new PostingsWriter(this);
}
public long Commit()
{
    var docAddresses = new List<BlockInfo>();
    var pks = new Dictionary<UInt64, object>();
    var ts = new List<Task>();

    using (var words = new BlockingCollection<WordInfo>())
    using (var documents = new BlockingCollection<Document>())
    {
        // Consumer: drains analyzed tokens and adds them to the per-field tries.
        ts.Add(Task.Run(() =>
        {
            var trieTimer = new Stopwatch();
            trieTimer.Start();

            try
            {
                while (true)
                {
                    var word = words.Take();

                    GetTrie(word.Field).Add(word.Token, word.Posting);
                }
            }
            catch (InvalidOperationException)
            {
                // Done
            }

            Log.InfoFormat("Built tries in {0}", trieTimer.Elapsed);
        }));

        // Consumer/producer: analyzes queued documents and feeds their tokens to the trie task.
        ts.Add(Task.Run(() =>
        {
            var analyzeTimer = new Stopwatch();
            analyzeTimer.Start();

            try
            {
                while (true)
                {
                    var doc = documents.Take();
                    var analyzed = _analyzer.AnalyzeDocument(doc);

                    foreach (var term in analyzed.Words)
                    {
                        var field = term.Term.Field;
                        var token = term.Term.Word.Value;
                        var posting = term.Posting;

                        words.Add(new WordInfo(field, token, posting));
                    }
                }
            }
            catch (InvalidOperationException)
            {
                // Done
                words.CompleteAdding();
            }

            Log.InfoFormat("Analyzed {0} documents in {1}", pks.Count, analyzeTimer.Elapsed);
        }));

        // Producer: reads source documents, de-duplicates on primary key hash and serializes them.
        ts.Add(Task.Run(() =>
        {
            var docWriterTimer = new Stopwatch();
            docWriterTimer.Start();

            var docFileName = Path.Combine(_directory, _indexVersionId + ".doc");

            using (var docWriter = new DocumentWriter(
                new FileStream(docFileName, FileMode.Create, FileAccess.Write, FileShare.None),
                _compression))
            {
                foreach (var doc in ReadSource())
                {
                    string pkVal;

                    if (_autoGeneratePk)
                    {
                        pkVal = Guid.NewGuid().ToString();
                    }
                    else
                    {
                        pkVal = doc.Fields.First(f => f.Key == _primaryKey).Value;
                    }

                    var hash = pkVal.ToHash();

                    if (pks.ContainsKey(hash))
                    {
                        Log.WarnFormat("Found multiple occurrences of documents with pk value of {0} (id:{1}). Only first occurrence will be stored.", pkVal, _docId);
                    }
                    else
                    {
                        pks.Add(hash, null);

                        doc.Id = _docId++;
                        documents.Add(doc);

                        var adr = docWriter.Write(doc);

                        docAddresses.Add(adr);
                    }
                }
            }

            documents.CompleteAdding();

            Log.InfoFormat("Serialized {0} documents in {1}", pks.Count, docWriterTimer.Elapsed);
        }));

        Task.WaitAll(ts.ToArray());
    }

    if (pks.Count == 0)
    {
        Log.Info("Aborted write (source is empty).");
        return 0;
    }

    // Serialize postings, tries, doc addresses and doc hashes in parallel.
    var tasks = new List<Task>
    {
        Task.Run(() =>
        {
            var postingsTimer = new Stopwatch();
            postingsTimer.Start();

            var posFileName = Path.Combine(_directory, string.Format("{0}.{1}", _indexVersionId, "pos"));

            using (var postingsWriter = new PostingsWriter(new FileStream(posFileName, FileMode.Create, FileAccess.Write, FileShare.None)))
            {
                foreach (var trie in _tries)
                {
                    foreach (var node in trie.Value.EndOfWordNodes())
                    {
                        node.PostingsAddress = postingsWriter.Write(node.Postings);
                    }

                    if (Log.IsDebugEnabled)
                    {
                        foreach (var word in trie.Value.Words())
                        {
                            Log.Debug(word);
                        }
                    }
                }
            }

            Log.InfoFormat("Serialized postings in {0}", postingsTimer.Elapsed);

            var trieTimer = new Stopwatch();
            trieTimer.Start();

            SerializeTries();

            Log.InfoFormat("Serialized tries in {0}", trieTimer.Elapsed);
        }),
        Task.Run(() =>
        {
            var docAdrTimer = new Stopwatch();
            docAdrTimer.Start();

            using (var docAddressWriter = new DocumentAddressWriter(new FileStream(Path.Combine(_directory, _indexVersionId + ".da"), FileMode.Create, FileAccess.Write, FileShare.None)))
            {
                foreach (var address in docAddresses)
                {
                    docAddressWriter.Write(address);
                }
            }

            Log.InfoFormat("Serialized doc addresses in {0}", docAdrTimer.Elapsed);
        }),
        Task.Run(() =>
        {
            var docHasTimer = new Stopwatch();
            docHasTimer.Start();

            var docHashesFileName = Path.Combine(_directory, string.Format("{0}.{1}", _indexVersionId, "pk"));

            pks.Keys.Select(h => new DocHash(h)).Serialize(docHashesFileName);

            Log.InfoFormat("Serialized doc hashes in {0}", docHasTimer.Elapsed);
        })
    };

    Task.WaitAll(tasks.ToArray());

    CreateIxInfo().Serialize(Path.Combine(_directory, _indexVersionId + ".ix"));

    if (_compression > 0)
    {
        Log.Info("compression: true");
    }

    return _indexVersionId;
}
public long Commit()
{
    var ts = new List<Task>();
    var trieBuilder = new TrieBuilder();

    using (var documentsToAnalyze = new BlockingCollection<Document>())
    {
        ts.Add(Task.Run(() =>
        {
            Log.Info("serializing documents");

            var count = 0;
            var docFileName = Path.Combine(_directory, _indexVersionId + ".rdoc");
            var docAddressFn = Path.Combine(_directory, _indexVersionId + ".da");
            var readTimer = new Stopwatch();
            readTimer.Start();

            using (var docAddressWriter = new DocumentAddressWriter(new FileStream(docAddressFn, FileMode.Create, FileAccess.Write)))
            using (var docWriter = new DocumentWriter(new FileStream(docFileName, FileMode.Create, FileAccess.Write), _compression))
            {
                foreach (var doc in ReadSourceAndAssignIdentifiers())
                {
                    documentsToAnalyze.Add(doc);

                    var adr = docWriter.Write(doc);

                    docAddressWriter.Write(adr);

                    count++;
                }
            }

            documentsToAnalyze.CompleteAdding();

            Log.InfoFormat("serialized {0} documents in {1}", count, readTimer.Elapsed);
        }));

        ts.Add(Task.Run(() =>
        {
            var analyzeTimer = new Stopwatch();
            analyzeTimer.Start();

            Log.Info("analyzing");

            var count = 0;

            try
            {
                while (true)
                {
                    var doc = documentsToAnalyze.Take();
                    var analyzed = _analyzer.AnalyzeDocument(doc);

                    foreach (var term in analyzed.Words.GroupBy(t => t.Term.Field))
                    {
                        trieBuilder.Add(term.Key, term.Select(t =>
                        {
                            var field = t.Term.Field;
                            var token = t.Term.Word.Value;
                            var posting = t.Posting;

                            return new WordInfo(field, token, posting);
                        }).ToList());
                    }

                    count++;
                }
            }
            catch (InvalidOperationException)
            {
                // Done
                trieBuilder.CompleteAdding();
            }

            Log.InfoFormat("analyzed {0} documents in {1}", count, analyzeTimer.Elapsed);
        }));

        Task.WaitAll(ts.ToArray());
    }

    var tries = trieBuilder.GetTries();

    var tasks = new List<Task>
    {
        Task.Run(() =>
        {
            var postingsTimer = new Stopwatch();
            postingsTimer.Start();

            Log.Info("serializing postings");

            var posFileName = Path.Combine(_directory, string.Format("{0}.{1}", _indexVersionId, "pos"));

            using (var postingsWriter = new PostingsWriter(new FileStream(posFileName, FileMode.Create, FileAccess.Write, FileShare.None)))
            {
                foreach (var trie in tries)
                {
                    foreach (var node in trie.Value.EndOfWordNodes())
                    {
                        node.PostingsAddress = postingsWriter.Write(node.Postings);
                    }

                    if (Log.IsDebugEnabled)
                    {
                        foreach (var word in trie.Value.Words())
                        {
                            Log.Debug(word);
                        }
                    }
                }
            }

            Log.InfoFormat("serialized postings in {0}", postingsTimer.Elapsed);
        }),
        Task.Run(() =>
        {
            var trieTimer = new Stopwatch();
            trieTimer.Start();

            Log.Info("serializing tries");

            SerializeTries(tries);

            Log.InfoFormat("serialized tries in {0}", trieTimer.Elapsed);
        }),
        Task.Run(() =>
        {
            var docHasTimer = new Stopwatch();
            docHasTimer.Start();

            Log.Info("serializing doc hashes");

            var docHashesFileName = Path.Combine(_directory, string.Format("{0}.{1}", _indexVersionId, "pk"));

            _primaryKeys.Keys.Select(h => new DocHash(h)).Serialize(docHashesFileName);

            Log.InfoFormat("serialized doc hashes in {0}", docHasTimer.Elapsed);
        })
    };

    Task.WaitAll(tasks.ToArray());

    CreateIxInfo().Serialize(Path.Combine(_directory, _indexVersionId + ".ix"));

    if (_compression > 0)
    {
        Log.Info("compression: true");
    }
    else
    {
        Log.Info("compression: false");
    }

    return _indexVersionId;
}
public long Write()
{
    if (_committed)
    {
        return _indexVersionId;
    }

    var trieBuilder = new TrieBuilder();
    var posFileName = Path.Combine(
        _directory, string.Format("{0}.{1}", _indexVersionId, "pos"));
    var docTimer = Stopwatch.StartNew();

    foreach (var doc in _documents.ReadSource())
    {
        doc.Id = _count++;

        new DocumentUpsertOperation().Write(
            doc,
            _storeWriter,
            _analyzer,
            trieBuilder);
    }

    Log.InfoFormat("stored {0} documents in {1}", _count + 1, docTimer.Elapsed);

    var posTimer = Stopwatch.StartNew();
    var tries = trieBuilder.GetTries();

    using (var postingsWriter = new PostingsWriter(
        new FileStream(posFileName, FileMode.Append, FileAccess.Write, FileShare.ReadWrite)))
    {
        foreach (var trie in tries)
        {
            foreach (var node in trie.Value.EndOfWordNodes())
            {
                node.PostingsAddress = postingsWriter.Write(node.Postings);
            }

            if (Log.IsDebugEnabled)
            {
                foreach (var word in trie.Value.Words())
                {
                    Log.DebugFormat("{0}\t{1}", word.Value, word.Count);
                }
            }
        }
    }

    Log.InfoFormat(
        "stored postings refs in trees and wrote postings file in {0}", posTimer.Elapsed);

    var treeTimer = new Stopwatch();
    treeTimer.Start();

    SerializeTries(tries);

    Log.InfoFormat("serialized trees in {0}", treeTimer.Elapsed);

    return _indexVersionId;
}
public UpsertTransaction(
    string directory,
    IAnalyzer analyzer,
    Compression compression,
    DocumentStream documents,
    IWriteSessionFactory storeWriterFactory = null)
{
    long version = Util.GetNextChronologicalFileId();

    Log.InfoFormat("begin writing {0}", version);

    FileStream lockFile;

    if (!Util.TryAquireWriteLock(directory, out lockFile))
    {
        var compoundFileName = Path.Combine(directory, version + ".rdb");

        _compoundFile = new FileStream(
            compoundFileName,
            FileMode.CreateNew,
            FileAccess.Write,
            FileShare.ReadWrite,
            4096);
    }
    else
    {
        var ixFileName = Util.GetIndexFileNamesInChronologicalOrder(directory).FirstOrDefault();

        long dataFileVersion;

        if (ixFileName == null)
        {
            dataFileVersion = version;
        }
        else
        {
            dataFileVersion = long.Parse(Path.GetFileNameWithoutExtension(ixFileName));
        }

        var compoundFileName = Path.Combine(directory, dataFileVersion + ".rdb");

        _compoundFile = new FileStream(
            compoundFileName,
            FileMode.Append,
            FileAccess.Write,
            FileShare.ReadWrite,
            4096);

        _lockFile = lockFile;
    }

    _directory = directory;
    _analyzer = analyzer;
    _documents = documents;

    _ix = new BatchInfo
    {
        VersionId = version,
        Compression = compression,
        PrimaryKeyFieldName = documents.PrimaryKeyFieldName
    };

    var posFileName = Path.Combine(
        _directory, string.Format("{0}.{1}", _ix.VersionId, "pos"));

    var factory = storeWriterFactory ?? new WriteSessionFactory(directory, _ix);

    _writeSession = factory.OpenWriteSession(_compoundFile);

    _postingsWriter = new PostingsWriter(
        new FileStream(
            posFileName,
            FileMode.CreateNew,
            FileAccess.ReadWrite,
            FileShare.None,
            4096,
            FileOptions.DeleteOnClose));
}
public long Commit()
{
    var trieBuilder = new TrieBuilder();
    var docAddresses = new List<BlockInfo>();
    var analyzed = _analyzer.AnalyzeDocument(_document);

    foreach (var term in analyzed.Words.GroupBy(t => t.Term.Field))
    {
        trieBuilder.Add(term.Key, term.Select(t =>
        {
            var field = t.Term.Field;
            var token = t.Term.Word.Value;
            var posting = t.Posting;

            return new WordInfo(field, token, posting);
        }).ToList());
    }

    trieBuilder.CompleteAdding();

    var indexVersionId = Util.GetChronologicalFileId();
    var docFileName = Path.Combine(_directory, indexVersionId + ".rdoc");
    var docAddressesFn = Path.Combine(_directory, indexVersionId + ".da");

    BlockInfo adr;

    using (var docWriter = new DocumentWriter(
        new FileStream(docFileName, FileMode.Create, FileAccess.Write, FileShare.None, 4096, true),
        _compression))
    {
        adr = docWriter.Write(_document);
    }

    using (var docAddressWriter = new DocumentAddressWriter(
        new FileStream(docAddressesFn, FileMode.Create, FileAccess.Write, FileShare.None)))
    {
        docAddressWriter.Write(adr);
    }

    var tries = trieBuilder.GetTries();

    var tasks = new List<Task>
    {
        Task.Run(() =>
        {
            var posFileName = Path.Combine(_directory, string.Format("{0}.{1}", indexVersionId, "pos"));

            using (var postingsWriter = new PostingsWriter(new FileStream(posFileName, FileMode.Create, FileAccess.Write, FileShare.None)))
            {
                foreach (var trie in tries)
                {
                    foreach (var node in trie.Value.EndOfWordNodes())
                    {
                        node.PostingsAddress = postingsWriter.Write(node.Postings);
                    }

                    if (Log.IsDebugEnabled)
                    {
                        foreach (var word in trie.Value.Words())
                        {
                            Log.Debug(word);
                        }
                    }
                }
            }
        }),
        Task.Run(() =>
        {
            SerializeTries(tries, indexVersionId);
        }),
        Task.Run(() =>
        {
            var docHashesFileName = Path.Combine(_directory, string.Format("{0}.{1}", indexVersionId, "pk"));

            new DocHash[] { new DocHash(_primaryKeyHash) }.Serialize(docHashesFileName);
        })
    };

    Task.WaitAll(tasks.ToArray());

    new IxInfo
    {
        VersionId = indexVersionId,
        DocumentCount = 1,
        Compression = _compression
    }.Serialize(Path.Combine(_directory, indexVersionId + ".ix"));

    if (_compression > 0)
    {
        Log.Info("compression: true");
    }
    else
    {
        Log.Info("compression: false");
    }

    return indexVersionId;
}
internal virtual void InitializeInstanceFields()
{
    postingsWriter = new PostingsWriter(this);
}
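Across these examples the PostingsWriter lifecycle is the same: wrap a FileStream in a PostingsWriter, call Write once per postings collection, keep the returned address (the snippets above store it on the trie node's PostingsAddress), and dispose the writer before the structures holding those addresses are serialized. Below is a minimal sketch of that lifecycle; the postingsByTerm dictionary, the file name, and the use of object for the address type are hypothetical stand-ins, since only the constructor and the Write call are shown above.

// Minimal sketch, not taken from any single example above: write each postings
// collection and remember the address that Write returns. "postingsByTerm" and
// the address type are assumptions; only new PostingsWriter(Stream) and
// Write(...) mirror the usage shown in this listing.
var postingsAddresses = new Dictionary<string, object>();

using (var postingsWriter = new PostingsWriter(
    new FileStream("1.pos", FileMode.Create, FileAccess.Write, FileShare.None)))
{
    foreach (var pair in postingsByTerm) // hypothetical: term -> postings collection
    {
        // Write serializes one postings collection and returns its location in the
        // .pos file; the examples above assign this to node.PostingsAddress.
        postingsAddresses[pair.Key] = postingsWriter.Write(pair.Value);
    }
}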