Пример #1
0
        /// <summary>
        /// Writes the documents read from the source, and the index structures derived
        /// from them, to <c>_directory</c> under the current <c>_indexVersionId</c>.
        /// </summary>
        /// <remarks>
        /// Phase one runs two tasks: one streams documents from
        /// <c>ReadSourceAndAssignIdentifiers</c> into the <c>.rdoc</c> and <c>.da</c> files,
        /// the other analyzes each document and feeds its terms to a <c>TrieBuilder</c>.
        /// Phase two writes the postings (<c>.pos</c>), the tries and the primary key
        /// hashes (<c>.pk</c>), followed by the <c>.ix</c> info file.
        /// </remarks>
        /// <returns>The value of <c>_indexVersionId</c>.</returns>
        /// <exception cref="AggregateException">
        /// Thrown by <c>Task.WaitAll</c> when any background task fails.
        /// </exception>
        public long Commit()
        {
            var trieBuilder = new TrieBuilder();

            using (var documentsToAnalyze = new BlockingCollection<Document>())
            {
                var serializeDocuments = Task.Run(() =>
                {
                    Log.Info("serializing documents");

                    var count        = 0;
                    var docFileName  = Path.Combine(_directory, _indexVersionId + ".rdoc");
                    var docAddressFn = Path.Combine(_directory, _indexVersionId + ".da");
                    var readTimer    = Stopwatch.StartNew();

                    try
                    {
                        using (var docAddressWriter = new DocumentAddressWriter(new FileStream(docAddressFn, FileMode.Create, FileAccess.Write)))
                        using (var docWriter = new DocumentWriter(new FileStream(docFileName, FileMode.Create, FileAccess.Write), _compression))
                        {
                            foreach (var doc in ReadSourceAndAssignIdentifiers())
                            {
                                documentsToAnalyze.Add(doc);

                                var adr = docWriter.Write(doc);

                                docAddressWriter.Write(adr);

                                count++;
                            }
                        }
                    }
                    finally
                    {
                        // Must run even when reading or writing fails: otherwise the
                        // analyzer task waits for more documents forever and
                        // Task.WaitAll below never returns.
                        documentsToAnalyze.CompleteAdding();
                    }

                    Log.InfoFormat("serialized {0} documents in {1}", count, readTimer.Elapsed);
                });

                var analyzeDocuments = Task.Run(() =>
                {
                    var analyzeTimer = Stopwatch.StartNew();

                    Log.Info("analyzing");

                    var count = 0;

                    // GetConsumingEnumerable ends once adding is completed and the
                    // collection is drained. Unlike catching InvalidOperationException
                    // around Take(), it does not hide the same exception type when it
                    // is thrown by the analyzer or the trie builder.
                    foreach (var doc in documentsToAnalyze.GetConsumingEnumerable())
                    {
                        var analyzed = _analyzer.AnalyzeDocument(doc);

                        foreach (var term in analyzed.Words.GroupBy(t => t.Term.Field))
                        {
                            var wordsOfField = term
                                .Select(t => new WordInfo(t.Term.Field, t.Term.Word.Value, t.Posting))
                                .ToList();

                            trieBuilder.Add(term.Key, wordsOfField);
                        }

                        count++;
                    }

                    trieBuilder.CompleteAdding();

                    Log.InfoFormat("analyzed {0} documents in {1}", count, analyzeTimer.Elapsed);
                });

                Task.WaitAll(serializeDocuments, analyzeDocuments);
            }

            var tries = trieBuilder.GetTries();

            var tasks = new[]
            {
                Task.Run(() =>
                {
                    var postingsTimer = Stopwatch.StartNew();

                    Log.Info("serializing postings");

                    var posFileName = Path.Combine(_directory, string.Format("{0}.{1}", _indexVersionId, "pos"));
                    using (var postingsWriter = new PostingsWriter(new FileStream(posFileName, FileMode.Create, FileAccess.Write, FileShare.None)))
                    {
                        foreach (var trie in tries)
                        {
                            foreach (var node in trie.Value.EndOfWordNodes())
                            {
                                node.PostingsAddress = postingsWriter.Write(node.Postings);
                            }

                            if (Log.IsDebugEnabled)
                            {
                                foreach (var word in trie.Value.Words())
                                {
                                    Log.Debug(word);
                                }
                            }
                        }
                    }

                    Log.InfoFormat("serialized postings in {0}", postingsTimer.Elapsed);

                    // The tries are serialized only after every node has received its
                    // PostingsAddress above; running both concurrently would read the
                    // nodes while they are still being mutated.
                    // NOTE(review): presumably SerializeTries persists PostingsAddress - confirm.
                    var trieTimer = Stopwatch.StartNew();

                    Log.Info("serializing tries");

                    SerializeTries(tries);

                    Log.InfoFormat("serialized tries in {0}", trieTimer.Elapsed);
                }),
                Task.Run(() =>
                {
                    var docHasTimer = Stopwatch.StartNew();

                    Log.Info("serializing doc hashes");

                    var docHashesFileName = Path.Combine(_directory, string.Format("{0}.{1}", _indexVersionId, "pk"));

                    _primaryKeys.Keys.Select(h => new DocHash(h)).Serialize(docHashesFileName);

                    Log.InfoFormat("serialized doc hashes in {0}", docHasTimer.Elapsed);
                })
            };

            Task.WaitAll(tasks);

            CreateIxInfo().Serialize(Path.Combine(_directory, _indexVersionId + ".ix"));

            Log.Info(_compression > 0 ? "compression: true" : "compression: false");

            return _indexVersionId;
        }
Пример #2
0
        /// <summary>
        /// Reads the source, stores each document whose primary key has not been seen
        /// before, builds the tries and writes all index files for
        /// <c>_indexVersionId</c> into <c>_directory</c>.
        /// </summary>
        /// <remarks>
        /// Phase one is a three-stage pipeline: the document writer feeds
        /// <c>documents</c>, the analyzer turns documents into <c>words</c>, and the trie
        /// task adds each word to the trie of its field. Phase two writes postings and
        /// tries, document addresses (<c>.da</c>) and primary key hashes (<c>.pk</c>),
        /// followed by the <c>.ix</c> info file.
        /// </remarks>
        /// <returns>
        /// <c>0</c> when the source produced no documents, otherwise <c>_indexVersionId</c>.
        /// </returns>
        /// <exception cref="AggregateException">
        /// Thrown by <c>Task.WaitAll</c> when any background task fails.
        /// </exception>
        public long Commit()
        {
            var docAddresses = new List<BlockInfo>();
            var pks          = new Dictionary<UInt64, object>();

            using (var words = new BlockingCollection<WordInfo>())
            using (var documents = new BlockingCollection<Document>())
            {
                var buildTries = Task.Run(() =>
                {
                    var trieTimer = Stopwatch.StartNew();

                    // Ends when the analyzer has completed `words` and it is drained.
                    // Errors from GetTrie/Add propagate instead of being mistaken for
                    // the end-of-stream signal.
                    foreach (var word in words.GetConsumingEnumerable())
                    {
                        GetTrie(word.Field).Add(word.Token, word.Posting);
                    }

                    Log.InfoFormat("Built tries in {0}", trieTimer.Elapsed);
                });

                var analyzeDocuments = Task.Run(() =>
                {
                    var analyzeTimer = Stopwatch.StartNew();

                    try
                    {
                        foreach (var doc in documents.GetConsumingEnumerable())
                        {
                            var analyzed = _analyzer.AnalyzeDocument(doc);

                            foreach (var term in analyzed.Words)
                            {
                                words.Add(new WordInfo(term.Term.Field, term.Term.Word.Value, term.Posting));
                            }
                        }
                    }
                    finally
                    {
                        // Must run even when analysis fails, otherwise the trie task
                        // waits on `words` forever and Task.WaitAll never returns.
                        words.CompleteAdding();
                    }

                    Log.InfoFormat("Analyzed {0} documents in {1}", pks.Count, analyzeTimer.Elapsed);
                });

                var writeDocuments = Task.Run(() =>
                {
                    var docWriterTimer = Stopwatch.StartNew();
                    var docFileName    = Path.Combine(_directory, _indexVersionId + ".doc");

                    try
                    {
                        using (var docWriter = new DocumentWriter(
                                   new FileStream(docFileName, FileMode.Create, FileAccess.Write, FileShare.None), _compression))
                        {
                            foreach (var doc in ReadSource())
                            {
                                string pkVal;

                                if (_autoGeneratePk)
                                {
                                    pkVal = Guid.NewGuid().ToString();
                                }
                                else
                                {
                                    pkVal = doc.Fields.First(f => f.Key == _primaryKey).Value;
                                }

                                var hash = pkVal.ToHash();

                                if (pks.ContainsKey(hash))
                                {
                                    Log.WarnFormat("Found multiple occurrences of documents with pk value of {0} (id:{1}). Only first occurrence will be stored.",
                                                   pkVal, _docId);
                                    continue;
                                }

                                pks.Add(hash, null);

                                doc.Id = _docId++;

                                documents.Add(doc);

                                docAddresses.Add(docWriter.Write(doc));
                            }
                        }
                    }
                    finally
                    {
                        // Must run even when reading or writing fails, otherwise the
                        // analyzer (and through it the trie task) would block forever.
                        documents.CompleteAdding();
                    }

                    Log.InfoFormat("Serialized {0} documents in {1}", pks.Count, docWriterTimer.Elapsed);
                });

                Task.WaitAll(buildTries, analyzeDocuments, writeDocuments);
            }

            if (pks.Count == 0)
            {
                Log.Info("Aborted write (source is empty).");

                return 0;
            }

            var tasks = new[]
            {
                Task.Run(() =>
                {
                    var postingsTimer = Stopwatch.StartNew();

                    var posFileName = Path.Combine(_directory, string.Format("{0}.{1}", _indexVersionId, "pos"));
                    using (var postingsWriter = new PostingsWriter(new FileStream(posFileName, FileMode.Create, FileAccess.Write, FileShare.None)))
                    {
                        foreach (var trie in _tries)
                        {
                            foreach (var node in trie.Value.EndOfWordNodes())
                            {
                                node.PostingsAddress = postingsWriter.Write(node.Postings);
                            }

                            if (Log.IsDebugEnabled)
                            {
                                foreach (var word in trie.Value.Words())
                                {
                                    Log.Debug(word);
                                }
                            }
                        }
                    }
                    Log.InfoFormat("Serialized postings in {0}", postingsTimer.Elapsed);

                    // Tries are serialized in the same task, after every node has been
                    // given its PostingsAddress above.
                    var trieTimer = Stopwatch.StartNew();

                    SerializeTries();

                    Log.InfoFormat("Serialized tries in {0}", trieTimer.Elapsed);
                }),
                Task.Run(() =>
                {
                    var docAdrTimer = Stopwatch.StartNew();

                    using (var docAddressWriter = new DocumentAddressWriter(new FileStream(Path.Combine(_directory, _indexVersionId + ".da"), FileMode.Create, FileAccess.Write, FileShare.None)))
                    {
                        foreach (var address in docAddresses)
                        {
                            docAddressWriter.Write(address);
                        }
                    }

                    Log.InfoFormat("Serialized doc addresses in {0}", docAdrTimer.Elapsed);
                }),
                Task.Run(() =>
                {
                    var docHasTimer = Stopwatch.StartNew();

                    var docHashesFileName = Path.Combine(_directory, string.Format("{0}.{1}", _indexVersionId, "pk"));

                    pks.Keys.Select(h => new DocHash(h)).Serialize(docHashesFileName);

                    Log.InfoFormat("Serialized doc hashes in {0}", docHasTimer.Elapsed);
                })
            };

            Task.WaitAll(tasks);

            CreateIxInfo().Serialize(Path.Combine(_directory, _indexVersionId + ".ix"));

            Log.Info(_compression > 0 ? "compression: true" : "compression: false");

            return _indexVersionId;
        }
Пример #3
0
        /// <summary>
        /// Analyzes the single <c>_document</c> and writes it, together with its tries,
        /// postings and primary key hash, as a new index version in <c>_directory</c>.
        /// </summary>
        /// <remarks>
        /// The version id is obtained from <c>Util.GetChronologicalFileId</c> and used
        /// as the file name stem for the <c>.rdoc</c>, <c>.da</c>, <c>.pos</c>,
        /// <c>.pk</c> and <c>.ix</c> files. The <c>.ix</c> info records a document
        /// count of 1.
        /// </remarks>
        /// <returns>The id of the index version that was written.</returns>
        /// <exception cref="AggregateException">
        /// Thrown by <c>Task.WaitAll</c> when any background task fails.
        /// </exception>
        public long Commit()
        {
            var trieBuilder = new TrieBuilder();
            var analyzed    = _analyzer.AnalyzeDocument(_document);

            foreach (var term in analyzed.Words.GroupBy(t => t.Term.Field))
            {
                var wordsOfField = term
                    .Select(t => new WordInfo(t.Term.Field, t.Term.Word.Value, t.Posting))
                    .ToList();

                trieBuilder.Add(term.Key, wordsOfField);
            }

            trieBuilder.CompleteAdding();

            var indexVersionId = Util.GetChronologicalFileId();
            var docFileName    = Path.Combine(_directory, indexVersionId + ".rdoc");
            var docAddressesFn = Path.Combine(_directory, indexVersionId + ".da");

            BlockInfo adr;

            using (var docWriter = new DocumentWriter(
                       new FileStream(docFileName, FileMode.Create, FileAccess.Write, FileShare.None, 4096, true), _compression))
            {
                adr = docWriter.Write(_document);
            }

            using (var docAddressWriter = new DocumentAddressWriter(
                       new FileStream(docAddressesFn, FileMode.Create, FileAccess.Write, FileShare.None)))
            {
                docAddressWriter.Write(adr);
            }

            var tries = trieBuilder.GetTries();

            var tasks = new[]
            {
                Task.Run(() =>
                {
                    var posFileName = Path.Combine(_directory, string.Format("{0}.{1}", indexVersionId, "pos"));
                    using (var postingsWriter = new PostingsWriter(new FileStream(posFileName, FileMode.Create, FileAccess.Write, FileShare.None)))
                    {
                        foreach (var trie in tries)
                        {
                            foreach (var node in trie.Value.EndOfWordNodes())
                            {
                                node.PostingsAddress = postingsWriter.Write(node.Postings);
                            }

                            if (Log.IsDebugEnabled)
                            {
                                foreach (var word in trie.Value.Words())
                                {
                                    Log.Debug(word);
                                }
                            }
                        }
                    }

                    // The tries are serialized only after every node has received its
                    // PostingsAddress above; running both concurrently would read the
                    // nodes while they are still being mutated.
                    // NOTE(review): presumably SerializeTries persists PostingsAddress - confirm.
                    SerializeTries(tries, indexVersionId);
                }),
                Task.Run(() =>
                {
                    var docHashesFileName = Path.Combine(_directory, string.Format("{0}.{1}", indexVersionId, "pk"));

                    new DocHash[] { new DocHash(_primaryKeyHash) }.Serialize(docHashesFileName);
                })
            };

            Task.WaitAll(tasks);

            new IxInfo
            {
                VersionId     = indexVersionId,
                DocumentCount = 1,
                Compression   = _compression
            }.Serialize(Path.Combine(_directory, indexVersionId + ".ix"));

            Log.Info(_compression > 0 ? "compression: true" : "compression: false");

            return indexVersionId;
        }