public long Commit()
{
    var ts = new List<Task>();
    var trieBuilder = new TrieBuilder();

    using (var documentsToAnalyze = new BlockingCollection<Document>())
    {
        // Producer: serialize documents to disk and queue them for analysis.
        ts.Add(Task.Run(() =>
        {
            Log.Info("serializing documents");

            var count = 0;
            var docFileName = Path.Combine(_directory, _indexVersionId + ".rdoc");
            var docAddressFn = Path.Combine(_directory, _indexVersionId + ".da");
            var readTimer = new Stopwatch();
            readTimer.Start();

            using (var docAddressWriter = new DocumentAddressWriter(new FileStream(docAddressFn, FileMode.Create, FileAccess.Write)))
            using (var docWriter = new DocumentWriter(new FileStream(docFileName, FileMode.Create, FileAccess.Write), _compression))
            {
                foreach (var doc in ReadSourceAndAssignIdentifiers())
                {
                    documentsToAnalyze.Add(doc);

                    var adr = docWriter.Write(doc);

                    docAddressWriter.Write(adr);

                    count++;
                }
            }

            documentsToAnalyze.CompleteAdding();

            Log.InfoFormat("serialized {0} documents in {1}", count, readTimer.Elapsed);
        }));

        // Consumer: analyze queued documents and feed their words to the trie builder.
        ts.Add(Task.Run(() =>
        {
            var analyzeTimer = new Stopwatch();
            analyzeTimer.Start();

            Log.Info("analyzing");

            var count = 0;
            try
            {
                while (true)
                {
                    var doc = documentsToAnalyze.Take();
                    var analyzed = _analyzer.AnalyzeDocument(doc);

                    foreach (var term in analyzed.Words.GroupBy(t => t.Term.Field))
                    {
                        trieBuilder.Add(term.Key, term.Select(t =>
                        {
                            var field = t.Term.Field;
                            var token = t.Term.Word.Value;
                            var posting = t.Posting;

                            return new WordInfo(field, token, posting);
                        }).ToList());
                    }

                    count++;
                }
            }
            catch (InvalidOperationException)
            {
                // Done: Take() throws once the collection is marked complete and drained.
                trieBuilder.CompleteAdding();
            }

            Log.InfoFormat("analyzed {0} documents in {1}", count, analyzeTimer.Elapsed);
        }));

        Task.WaitAll(ts.ToArray());
    }

    var tries = trieBuilder.GetTries();

    var tasks = new List<Task>
    {
        Task.Run(() =>
        {
            var postingsTimer = new Stopwatch();
            postingsTimer.Start();

            Log.Info("serializing postings");

            var posFileName = Path.Combine(_directory, string.Format("{0}.{1}", _indexVersionId, "pos"));
            using (var postingsWriter = new PostingsWriter(new FileStream(posFileName, FileMode.Create, FileAccess.Write, FileShare.None)))
            {
                foreach (var trie in tries)
                {
                    foreach (var node in trie.Value.EndOfWordNodes())
                    {
                        node.PostingsAddress = postingsWriter.Write(node.Postings);
                    }

                    if (Log.IsDebugEnabled)
                    {
                        foreach (var word in trie.Value.Words())
                        {
                            Log.Debug(word);
                        }
                    }
                }
            }

            Log.InfoFormat("serialized postings in {0}", postingsTimer.Elapsed);
        }),
        Task.Run(() =>
        {
            var trieTimer = new Stopwatch();
            trieTimer.Start();

            Log.Info("serializing tries");

            SerializeTries(tries);

            Log.InfoFormat("serialized tries in {0}", trieTimer.Elapsed);
        }),
        Task.Run(() =>
        {
            var docHasTimer = new Stopwatch();
            docHasTimer.Start();

            Log.Info("serializing doc hashes");

            var docHashesFileName = Path.Combine(_directory, string.Format("{0}.{1}", _indexVersionId, "pk"));

            _primaryKeys.Keys.Select(h => new DocHash(h)).Serialize(docHashesFileName);

            Log.InfoFormat("serialized doc hashes in {0}", docHasTimer.Elapsed);
        })
    };

    Task.WaitAll(tasks.ToArray());

    CreateIxInfo().Serialize(Path.Combine(_directory, _indexVersionId + ".ix"));

    if (_compression > 0)
    {
        Log.Info("compression: true");
    }
    else
    {
        Log.Info("compression: false");
    }

    return _indexVersionId;
}
public long Commit()
{
    var docAddresses = new List<BlockInfo>();
    var pks = new Dictionary<UInt64, object>();
    var ts = new List<Task>();

    using (var words = new BlockingCollection<WordInfo>())
    using (var documents = new BlockingCollection<Document>())
    {
        // Consumer: drain analyzed words into the per-field tries.
        ts.Add(Task.Run(() =>
        {
            var trieTimer = new Stopwatch();
            trieTimer.Start();

            try
            {
                while (true)
                {
                    var word = words.Take();

                    GetTrie(word.Field)
                        .Add(word.Token, word.Posting);
                }
            }
            catch (InvalidOperationException)
            {
                // Done: Take() throws once the collection is marked complete and drained.
            }

            Log.InfoFormat("Built tries in {0}", trieTimer.Elapsed);
        }));

        // Consumer/producer: analyze queued documents and queue their words.
        ts.Add(Task.Run(() =>
        {
            var analyzeTimer = new Stopwatch();
            analyzeTimer.Start();

            try
            {
                while (true)
                {
                    var doc = documents.Take();
                    var analyzed = _analyzer.AnalyzeDocument(doc);

                    foreach (var term in analyzed.Words)
                    {
                        var field = term.Term.Field;
                        var token = term.Term.Word.Value;
                        var posting = term.Posting;

                        words.Add(new WordInfo(field, token, posting));
                    }
                }
            }
            catch (InvalidOperationException)
            {
                // Done analyzing; signal the trie-building task.
                words.CompleteAdding();
            }

            Log.InfoFormat("Analyzed {0} documents in {1}", pks.Count, analyzeTimer.Elapsed);
        }));

        // Producer: read the source, de-duplicate on primary key, serialize documents.
        ts.Add(Task.Run(() =>
        {
            var docWriterTimer = new Stopwatch();
            docWriterTimer.Start();

            var docFileName = Path.Combine(_directory, _indexVersionId + ".doc");

            using (var docWriter = new DocumentWriter(
                new FileStream(docFileName, FileMode.Create, FileAccess.Write, FileShare.None), _compression))
            {
                foreach (var doc in ReadSource())
                {
                    string pkVal;

                    if (_autoGeneratePk)
                    {
                        pkVal = Guid.NewGuid().ToString();
                    }
                    else
                    {
                        pkVal = doc.Fields.First(f => f.Key == _primaryKey).Value;
                    }

                    var hash = pkVal.ToHash();

                    if (pks.ContainsKey(hash))
                    {
                        Log.WarnFormat("Found multiple occurrences of documents with pk value of {0} (id:{1}). Only first occurrence will be stored.", pkVal, _docId);
                    }
                    else
                    {
                        pks.Add(hash, null);

                        doc.Id = _docId++;

                        documents.Add(doc);

                        var adr = docWriter.Write(doc);

                        docAddresses.Add(adr);
                    }
                }
            }

            documents.CompleteAdding();

            Log.InfoFormat("Serialized {0} documents in {1}", pks.Count, docWriterTimer.Elapsed);
        }));

        Task.WaitAll(ts.ToArray());
    }

    if (pks.Count == 0)
    {
        Log.Info("Aborted write (source is empty).");

        return 0;
    }

    var tasks = new List<Task>
    {
        Task.Run(() =>
        {
            var postingsTimer = new Stopwatch();
            postingsTimer.Start();

            var posFileName = Path.Combine(_directory, string.Format("{0}.{1}", _indexVersionId, "pos"));
            using (var postingsWriter = new PostingsWriter(new FileStream(posFileName, FileMode.Create, FileAccess.Write, FileShare.None)))
            {
                foreach (var trie in _tries)
                {
                    foreach (var node in trie.Value.EndOfWordNodes())
                    {
                        node.PostingsAddress = postingsWriter.Write(node.Postings);
                    }

                    if (Log.IsDebugEnabled)
                    {
                        foreach (var word in trie.Value.Words())
                        {
                            Log.Debug(word);
                        }
                    }
                }
            }

            Log.InfoFormat("Serialized postings in {0}", postingsTimer.Elapsed);

            var trieTimer = new Stopwatch();
            trieTimer.Start();

            SerializeTries();

            Log.InfoFormat("Serialized tries in {0}", trieTimer.Elapsed);
        }),
        Task.Run(() =>
        {
            var docAdrTimer = new Stopwatch();
            docAdrTimer.Start();

            using (var docAddressWriter = new DocumentAddressWriter(new FileStream(Path.Combine(_directory, _indexVersionId + ".da"), FileMode.Create, FileAccess.Write, FileShare.None)))
            {
                foreach (var address in docAddresses)
                {
                    docAddressWriter.Write(address);
                }
            }

            Log.InfoFormat("Serialized doc addresses in {0}", docAdrTimer.Elapsed);
        }),
        Task.Run(() =>
        {
            var docHasTimer = new Stopwatch();
            docHasTimer.Start();

            var docHashesFileName = Path.Combine(_directory, string.Format("{0}.{1}", _indexVersionId, "pk"));

            pks.Keys.Select(h => new DocHash(h)).Serialize(docHashesFileName);

            Log.InfoFormat("Serialized doc hashes in {0}", docHasTimer.Elapsed);
        })
    };

    Task.WaitAll(tasks.ToArray());

    CreateIxInfo().Serialize(Path.Combine(_directory, _indexVersionId + ".ix"));

    if (_compression > 0)
    {
        Log.Info("compression: true");
    }

    return _indexVersionId;
}
public long Commit()
{
    var trieBuilder = new TrieBuilder();
    var docAddresses = new List<BlockInfo>();

    // Analyze the single document and feed its words, grouped by field, to the trie builder.
    var analyzed = _analyzer.AnalyzeDocument(_document);

    foreach (var term in analyzed.Words.GroupBy(t => t.Term.Field))
    {
        trieBuilder.Add(term.Key, term.Select(t =>
        {
            var field = t.Term.Field;
            var token = t.Term.Word.Value;
            var posting = t.Posting;

            return new WordInfo(field, token, posting);
        }).ToList());
    }

    trieBuilder.CompleteAdding();

    var indexVersionId = Util.GetChronologicalFileId();
    var docFileName = Path.Combine(_directory, indexVersionId + ".rdoc");
    var docAddressesFn = Path.Combine(_directory, indexVersionId + ".da");

    BlockInfo adr;

    using (var docWriter = new DocumentWriter(
        new FileStream(docFileName, FileMode.Create, FileAccess.Write, FileShare.None, 4096, true), _compression))
    {
        adr = docWriter.Write(_document);
    }

    using (var docAddressWriter = new DocumentAddressWriter(
        new FileStream(docAddressesFn, FileMode.Create, FileAccess.Write, FileShare.None)))
    {
        docAddressWriter.Write(adr);
    }

    var tries = trieBuilder.GetTries();

    var tasks = new List<Task>
    {
        Task.Run(() =>
        {
            var posFileName = Path.Combine(_directory, string.Format("{0}.{1}", indexVersionId, "pos"));
            using (var postingsWriter = new PostingsWriter(new FileStream(posFileName, FileMode.Create, FileAccess.Write, FileShare.None)))
            {
                foreach (var trie in tries)
                {
                    foreach (var node in trie.Value.EndOfWordNodes())
                    {
                        node.PostingsAddress = postingsWriter.Write(node.Postings);
                    }

                    if (Log.IsDebugEnabled)
                    {
                        foreach (var word in trie.Value.Words())
                        {
                            Log.Debug(word);
                        }
                    }
                }
            }
        }),
        Task.Run(() =>
        {
            SerializeTries(tries, indexVersionId);
        }),
        Task.Run(() =>
        {
            var docHashesFileName = Path.Combine(_directory, string.Format("{0}.{1}", indexVersionId, "pk"));

            new DocHash[] { new DocHash(_primaryKeyHash) }.Serialize(docHashesFileName);
        })
    };

    Task.WaitAll(tasks.ToArray());

    new IxInfo
    {
        VersionId = indexVersionId,
        DocumentCount = 1,
        Compression = _compression
    }.Serialize(Path.Combine(_directory, indexVersionId + ".ix"));

    if (_compression > 0)
    {
        Log.Info("compression: true");
    }
    else
    {
        Log.Info("compression: false");
    }

    return indexVersionId;
}