public long Write()
{
    if (_committed)
    {
        return _ix.VersionId;
    }

    var trieBuilder = new TrieBuilder();
    var docTimer = Stopwatch.StartNew();
    var upsert = new DocumentUpsertCommand(_writeSession, _analyzer, trieBuilder);

    foreach (var doc in _documents.ReadSource())
    {
        doc.Id = _count++;

        upsert.Write(doc);
    }

    Log.InfoFormat("stored {0} documents in {1}", _count, docTimer.Elapsed);

    var posTimer = Stopwatch.StartNew();
    var tries = trieBuilder.GetTries();

    foreach (var trie in tries)
    {
        // decode into a list of words and set postings address
        foreach (var node in trie.Value.EndOfWordNodes())
        {
            node.PostingsAddress = _postingsWriter.Write(node.Postings);
        }

        if (Log.IsDebugEnabled)
        {
            foreach (var word in trie.Value.Words())
            {
                Log.DebugFormat("{0}\t{1}", word.Value, word.Postings.Count);
            }
        }
    }

    Log.InfoFormat("stored postings refs in trees in {0}", posTimer.Elapsed);

    var treeTimer = Stopwatch.StartNew();

    _ix.FieldOffsets = SerializeTries(tries, _compoundFile);

    Log.InfoFormat("serialized trees in {0}", treeTimer.Elapsed);

    _ix.PostingsOffset = _compoundFile.Position;

    _postingsWriter.Stream.Flush();
    _postingsWriter.Stream.Position = 0;
    _postingsWriter.Stream.CopyTo(_compoundFile);

    _ix.DocumentCount = _count;

    return _ix.VersionId;
}
protected override void DoFlush(Stream dataFile)
{
    var posTimer = Stopwatch.StartNew();
    var tries = _treeBuilder.GetTrees();

    foreach (var trie in tries)
    {
        // write each word's postings and record the address on the end-of-word node
        var nodes = trie.Value.EndOfWordNodes();

        foreach (var node in nodes)
        {
            node.PostingsAddress = _postingsWriter.Write(node.PostingsStream);
        }

        //if (Log.IsDebugEnabled)
        //{
        //    foreach (var word in trie.Value.Words())
        //    {
        //        Log.Debug(word.Value);
        //    }
        //}
    }

    Log.InfoFormat("stored postings in {0}", posTimer.Elapsed);

    var treeTimer = Stopwatch.StartNew();

    Version.FieldOffsets = SerializeTries(tries, dataFile);

    Log.InfoFormat("serialized trees in {0}", treeTimer.Elapsed);

    var postingsTimer = Stopwatch.StartNew();

    // record where the postings block starts, then append the buffered postings
    Version.PostingsOffset = dataFile.Position;

    _postingsWriter.Stream.Flush();
    _postingsWriter.Stream.Position = 0;
    _postingsWriter.Stream.CopyTo(dataFile);

    Log.InfoFormat("copied postings to data file in {0}", postingsTimer.Elapsed);

    base.DoFlush(dataFile);
}
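Both methods above finish with the same two-step hand-off: record the destination stream's current position as the postings offset, then rewind the buffered postings stream and append it wholesale. A minimal, self-contained sketch of that pattern (the MemoryStream buffer and file name are stand-ins for illustration, not types or names from this codebase):

using System;
using System.IO;

class OffsetThenCopyDemo
{
    static void Main()
    {
        // stand-in for _postingsWriter.Stream: postings are buffered here first
        using (var postings = new MemoryStream())
        using (var dataFile = File.Create("example.dat"))
        {
            postings.Write(new byte[] { 1, 2, 3, 4 }, 0, 4);

            // ... trees would be serialized into dataFile here ...

            // 1) remember where the postings block will start in the data file
            long postingsOffset = dataFile.Position;

            // 2) rewind the buffer and append its entire content
            postings.Flush();
            postings.Position = 0;
            postings.CopyTo(dataFile);

            // postingsOffset is what would be persisted in the index header
            Console.WriteLine("postings start at byte {0}", postingsOffset);
        }
    }
}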
public long Commit()
{
    var docAddresses = new List<BlockInfo>();
    var pks = new Dictionary<UInt64, object>();
    var ts = new List<Task>();

    using (var words = new BlockingCollection<WordInfo>())
    using (var documents = new BlockingCollection<Document>())
    {
        // consumer: drains analyzed words into the tries
        ts.Add(Task.Run(() =>
        {
            var trieTimer = Stopwatch.StartNew();

            try
            {
                while (true)
                {
                    var word = words.Take();

                    GetTrie(word.Field).Add(word.Token, word.Posting);
                }
            }
            catch (InvalidOperationException)
            {
                // Done: Take() throws once the collection is empty and marked complete
            }

            Log.InfoFormat("Built tries in {0}", trieTimer.Elapsed);
        }));

        // consumer/producer: analyzes documents and feeds the word queue
        ts.Add(Task.Run(() =>
        {
            var analyzeTimer = Stopwatch.StartNew();

            try
            {
                while (true)
                {
                    var doc = documents.Take();
                    var analyzed = _analyzer.AnalyzeDocument(doc);

                    foreach (var term in analyzed.Words)
                    {
                        var field = term.Term.Field;
                        var token = term.Term.Word.Value;
                        var posting = term.Posting;

                        words.Add(new WordInfo(field, token, posting));
                    }
                }
            }
            catch (InvalidOperationException)
            {
                // Done
                words.CompleteAdding();
            }

            Log.InfoFormat("Analyzed {0} documents in {1}", pks.Count, analyzeTimer.Elapsed);
        }));

        // producer: reads the source, de-duplicates on primary key, serializes documents
        ts.Add(Task.Run(() =>
        {
            var docWriterTimer = Stopwatch.StartNew();
            var docFileName = Path.Combine(_directory, _indexVersionId + ".doc");

            using (var docWriter = new DocumentWriter(
                new FileStream(docFileName, FileMode.Create, FileAccess.Write, FileShare.None),
                _compression))
            {
                foreach (var doc in ReadSource())
                {
                    string pkVal;

                    if (_autoGeneratePk)
                    {
                        pkVal = Guid.NewGuid().ToString();
                    }
                    else
                    {
                        pkVal = doc.Fields.First(f => f.Key == _primaryKey).Value;
                    }

                    var hash = pkVal.ToHash();

                    if (pks.ContainsKey(hash))
                    {
                        Log.WarnFormat("Found multiple occurrences of documents with pk value of {0} (id:{1}). Only first occurrence will be stored.", pkVal, _docId);
                    }
                    else
                    {
                        pks.Add(hash, null);

                        doc.Id = _docId++;
                        documents.Add(doc);

                        var adr = docWriter.Write(doc);

                        docAddresses.Add(adr);
                    }
                }
            }

            documents.CompleteAdding();

            Log.InfoFormat("Serialized {0} documents in {1}", pks.Count, docWriterTimer.Elapsed);
        }));

        Task.WaitAll(ts.ToArray());
    }

    if (pks.Count == 0)
    {
        Log.Info("Aborted write (source is empty).");
        return 0;
    }

    var tasks = new List<Task>
    {
        Task.Run(() =>
        {
            var postingsTimer = Stopwatch.StartNew();
            var posFileName = Path.Combine(_directory, string.Format("{0}.{1}", _indexVersionId, "pos"));

            using (var postingsWriter = new PostingsWriter(
                new FileStream(posFileName, FileMode.Create, FileAccess.Write, FileShare.None)))
            {
                foreach (var trie in _tries)
                {
                    foreach (var node in trie.Value.EndOfWordNodes())
                    {
                        node.PostingsAddress = postingsWriter.Write(node.Postings);
                    }

                    if (Log.IsDebugEnabled)
                    {
                        foreach (var word in trie.Value.Words())
                        {
                            Log.Debug(word);
                        }
                    }
                }
            }

            Log.InfoFormat("Serialized postings in {0}", postingsTimer.Elapsed);

            var trieTimer = Stopwatch.StartNew();

            SerializeTries();

            Log.InfoFormat("Serialized tries in {0}", trieTimer.Elapsed);
        }),
        Task.Run(() =>
        {
            var docAdrTimer = Stopwatch.StartNew();

            using (var docAddressWriter = new DocumentAddressWriter(
                new FileStream(Path.Combine(_directory, _indexVersionId + ".da"), FileMode.Create, FileAccess.Write, FileShare.None)))
            {
                foreach (var address in docAddresses)
                {
                    docAddressWriter.Write(address);
                }
            }

            Log.InfoFormat("Serialized doc addresses in {0}", docAdrTimer.Elapsed);
        }),
        Task.Run(() =>
        {
            var docHashTimer = Stopwatch.StartNew();
            var docHashesFileName = Path.Combine(_directory, string.Format("{0}.{1}", _indexVersionId, "pk"));

            pks.Keys.Select(h => new DocHash(h)).Serialize(docHashesFileName);

            Log.InfoFormat("Serialized doc hashes in {0}", docHashTimer.Elapsed);
        })
    };

    Task.WaitAll(tasks.ToArray());

    CreateIxInfo().Serialize(Path.Combine(_directory, _indexVersionId + ".ix"));

    if (_compression > 0)
    {
        Log.Info("compression: true");
    }

    return _indexVersionId;
}
public long Commit()
{
    var ts = new List<Task>();
    var trieBuilder = new TrieBuilder();

    using (var documentsToAnalyze = new BlockingCollection<Document>())
    {
        // producer: serializes documents and their addresses while queueing them for analysis
        ts.Add(Task.Run(() =>
        {
            Log.Info("serializing documents");

            var count = 0;
            var docFileName = Path.Combine(_directory, _indexVersionId + ".rdoc");
            var docAddressFn = Path.Combine(_directory, _indexVersionId + ".da");
            var readTimer = Stopwatch.StartNew();

            using (var docAddressWriter = new DocumentAddressWriter(new FileStream(docAddressFn, FileMode.Create, FileAccess.Write)))
            using (var docWriter = new DocumentWriter(new FileStream(docFileName, FileMode.Create, FileAccess.Write), _compression))
            {
                foreach (var doc in ReadSourceAndAssignIdentifiers())
                {
                    documentsToAnalyze.Add(doc);

                    var adr = docWriter.Write(doc);

                    docAddressWriter.Write(adr);

                    count++;
                }
            }

            documentsToAnalyze.CompleteAdding();

            Log.InfoFormat("serialized {0} documents in {1}", count, readTimer.Elapsed);
        }));

        // consumer: analyzes documents and feeds the trie builder, field by field
        ts.Add(Task.Run(() =>
        {
            var analyzeTimer = Stopwatch.StartNew();

            Log.Info("analyzing");

            var count = 0;

            try
            {
                while (true)
                {
                    var doc = documentsToAnalyze.Take();
                    var analyzed = _analyzer.AnalyzeDocument(doc);

                    foreach (var term in analyzed.Words.GroupBy(t => t.Term.Field))
                    {
                        trieBuilder.Add(term.Key, term.Select(t =>
                        {
                            var field = t.Term.Field;
                            var token = t.Term.Word.Value;
                            var posting = t.Posting;

                            return new WordInfo(field, token, posting);
                        }).ToList());
                    }

                    count++;
                }
            }
            catch (InvalidOperationException)
            {
                // Done: Take() throws once the queue is empty and marked complete
                trieBuilder.CompleteAdding();
            }

            Log.InfoFormat("analyzed {0} documents in {1}", count, analyzeTimer.Elapsed);
        }));

        Task.WaitAll(ts.ToArray());
    }

    var tries = trieBuilder.GetTries();

    var tasks = new List<Task>
    {
        Task.Run(() =>
        {
            var postingsTimer = Stopwatch.StartNew();

            Log.Info("serializing postings");

            var posFileName = Path.Combine(_directory, string.Format("{0}.{1}", _indexVersionId, "pos"));

            using (var postingsWriter = new PostingsWriter(new FileStream(posFileName, FileMode.Create, FileAccess.Write, FileShare.None)))
            {
                foreach (var trie in tries)
                {
                    foreach (var node in trie.Value.EndOfWordNodes())
                    {
                        node.PostingsAddress = postingsWriter.Write(node.Postings);
                    }

                    if (Log.IsDebugEnabled)
                    {
                        foreach (var word in trie.Value.Words())
                        {
                            Log.Debug(word);
                        }
                    }
                }
            }

            Log.InfoFormat("serialized postings in {0}", postingsTimer.Elapsed);
        }),
        Task.Run(() =>
        {
            var trieTimer = Stopwatch.StartNew();

            Log.Info("serializing tries");

            SerializeTries(tries);

            Log.InfoFormat("serialized tries in {0}", trieTimer.Elapsed);
        }),
        Task.Run(() =>
        {
            var docHashTimer = Stopwatch.StartNew();

            Log.Info("serializing doc hashes");

            var docHashesFileName = Path.Combine(_directory, string.Format("{0}.{1}", _indexVersionId, "pk"));

            _primaryKeys.Keys.Select(h => new DocHash(h)).Serialize(docHashesFileName);

            Log.InfoFormat("serialized doc hashes in {0}", docHashTimer.Elapsed);
        })
    };

    Task.WaitAll(tasks.ToArray());

    CreateIxInfo().Serialize(Path.Combine(_directory, _indexVersionId + ".ix"));

    if (_compression > 0)
    {
        Log.Info("compression: true");
    }
    else
    {
        Log.Info("compression: false");
    }

    return _indexVersionId;
}
public long Write()
{
    if (_committed)
    {
        return _indexVersionId;
    }

    var trieBuilder = new TrieBuilder();
    var posFileName = Path.Combine(
        _directory, string.Format("{0}.{1}", _indexVersionId, "pos"));
    var docTimer = Stopwatch.StartNew();

    foreach (var doc in _documents.ReadSource())
    {
        doc.Id = _count++;

        new DocumentUpsertOperation().Write(
            doc,
            _storeWriter,
            _analyzer,
            trieBuilder);
    }

    // _count has already been incremented once per document
    Log.InfoFormat("stored {0} documents in {1}", _count, docTimer.Elapsed);

    var posTimer = Stopwatch.StartNew();
    var tries = trieBuilder.GetTries();

    using (var postingsWriter = new PostingsWriter(
        new FileStream(posFileName, FileMode.Append, FileAccess.Write, FileShare.ReadWrite)))
    {
        foreach (var trie in tries)
        {
            foreach (var node in trie.Value.EndOfWordNodes())
            {
                node.PostingsAddress = postingsWriter.Write(node.Postings);
            }

            if (Log.IsDebugEnabled)
            {
                foreach (var word in trie.Value.Words())
                {
                    Log.DebugFormat("{0}\t{1}", word.Value, word.Count);
                }
            }
        }
    }

    Log.InfoFormat(
        "stored postings refs in trees and wrote postings file in {0}",
        posTimer.Elapsed);

    var treeTimer = Stopwatch.StartNew();

    SerializeTries(tries);

    Log.InfoFormat("serialized trees in {0}", treeTimer.Elapsed);

    return _indexVersionId;
}
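Unlike the Commit variants, which create the postings file with FileMode.Create, this Write opens it with FileMode.Append, so successive batches extend a single postings file and each batch's addresses point past the previous batch's data. A minimal sketch of that append behavior (the file name is invented for illustration):

using System;
using System.IO;

class AppendDemo
{
    static void Main()
    {
        // FileMode.Append creates the file if missing, otherwise positions
        // the stream at the end, so each batch lands after the previous one
        for (var batch = 0; batch < 2; batch++)
        {
            using (var stream = new FileStream(
                "1.pos", FileMode.Append, FileAccess.Write, FileShare.ReadWrite))
            {
                var offset = stream.Position; // the address recorded on a trie node

                stream.Write(new byte[] { 42 }, 0, 1);

                Console.WriteLine("batch {0} postings start at byte {1}", batch, offset);
            }
        }
    }
}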
public long Commit()
{
    var trieBuilder = new TrieBuilder();
    var docAddresses = new List<BlockInfo>();
    var analyzed = _analyzer.AnalyzeDocument(_document);

    foreach (var term in analyzed.Words.GroupBy(t => t.Term.Field))
    {
        trieBuilder.Add(term.Key, term.Select(t =>
        {
            var field = t.Term.Field;
            var token = t.Term.Word.Value;
            var posting = t.Posting;

            return new WordInfo(field, token, posting);
        }).ToList());
    }

    trieBuilder.CompleteAdding();

    var indexVersionId = Util.GetChronologicalFileId();
    var docFileName = Path.Combine(_directory, indexVersionId + ".rdoc");
    var docAddressesFn = Path.Combine(_directory, indexVersionId + ".da");

    BlockInfo adr;

    using (var docWriter = new DocumentWriter(
        new FileStream(docFileName, FileMode.Create, FileAccess.Write, FileShare.None, 4096, true),
        _compression))
    {
        adr = docWriter.Write(_document);
    }

    using (var docAddressWriter = new DocumentAddressWriter(
        new FileStream(docAddressesFn, FileMode.Create, FileAccess.Write, FileShare.None)))
    {
        docAddressWriter.Write(adr);
    }

    var tries = trieBuilder.GetTries();

    var tasks = new List<Task>
    {
        Task.Run(() =>
        {
            var posFileName = Path.Combine(_directory, string.Format("{0}.{1}", indexVersionId, "pos"));

            using (var postingsWriter = new PostingsWriter(
                new FileStream(posFileName, FileMode.Create, FileAccess.Write, FileShare.None)))
            {
                foreach (var trie in tries)
                {
                    foreach (var node in trie.Value.EndOfWordNodes())
                    {
                        node.PostingsAddress = postingsWriter.Write(node.Postings);
                    }

                    if (Log.IsDebugEnabled)
                    {
                        foreach (var word in trie.Value.Words())
                        {
                            Log.Debug(word);
                        }
                    }
                }
            }
        }),
        Task.Run(() =>
        {
            SerializeTries(tries, indexVersionId);
        }),
        Task.Run(() =>
        {
            var docHashesFileName = Path.Combine(_directory, string.Format("{0}.{1}", indexVersionId, "pk"));

            new DocHash[] { new DocHash(_primaryKeyHash) }.Serialize(docHashesFileName);
        })
    };

    Task.WaitAll(tasks.ToArray());

    new IxInfo
    {
        VersionId = indexVersionId,
        DocumentCount = 1,
        Compression = _compression
    }.Serialize(Path.Combine(_directory, indexVersionId + ".ix"));

    if (_compression > 0)
    {
        Log.Info("compression: true");
    }
    else
    {
        Log.Info("compression: false");
    }

    return indexVersionId;
}
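Each Commit variant fans the final serialization steps out as parallel tasks and blocks on Task.WaitAll, which is safe here because every task writes to its own file. A minimal sketch of that fan-out (file names invented for illustration); note that a failure in any task surfaces from Task.WaitAll wrapped in an AggregateException:

using System;
using System.IO;
using System.Threading.Tasks;

class ParallelSerializationDemo
{
    static void Main()
    {
        var tasks = new[]
        {
            // each task owns its own output file, so no locking is needed
            Task.Run(() => File.WriteAllText("1.pos", "postings")),
            Task.Run(() => File.WriteAllText("1.da", "doc addresses")),
            Task.Run(() => File.WriteAllText("1.pk", "doc hashes"))
        };

        try
        {
            Task.WaitAll(tasks);
        }
        catch (AggregateException ex)
        {
            // a failure in any writer surfaces here, wrapped
            foreach (var inner in ex.InnerExceptions)
            {
                Console.Error.WriteLine(inner.Message);
            }
        }
    }
}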