示例#1
0
        /// <summary>
        /// Brings the text-search index of every supplied table up to date with its changed documents.
        /// The transaction and the per-table indexing state are both supplied by the caller, so the
        /// method can also be driven from outside.
        /// </summary>
        /// <remarks>
        /// For each table the method: diffs the current (key suffix 0) and the new (key suffix 1)
        /// searchables of every changed document via <c>WordsDiff</c>; replaces the current searchables
        /// with the new ones; stores the block id / number-in-block definition of every word that had
        /// none yet; applies the collected additions and removals to the per-word WABI data kept in
        /// blocks; and finally stores the block counters (keys 11 and 12) and the indexing start time
        /// (key 4) through the transaction.
        /// </remarks>
        /// <param name="itran">Transaction through which all index data is read and written.</param>
        /// <param name="xitbls">Indexing state per table, keyed by table name. Each entry supplies
        /// <c>ChangedDocIds</c>; its <c>blocks</c>, <c>words</c> and counters are (re)initialised here.</param>
        internal void DoIndexing(Transaction itran, Dictionary <string, ITS> xitbls)
        {
            //Upper bound of new word definitions buffered in memory before they are written to its.words
            const int maxBufferedWords = 100000;

            //Taken once, before any work is done, and stored as last indexing time of every table
            byte[] btUdtStart = DateTime.UtcNow.Ticks.To_8_bytes_array_BigEndian();

            var searchConfig = itran._transactionUnit.TransactionsCoordinator._engine.Configuration.TextSearchConfig;

            ITS its = null;

            //State of the block that is currently loaded while WABI blocks are being updated
            uint iterBlockId  = 0;
            int  iterBlockLen = 0;
            Dictionary<uint, byte[]> block = new Dictionary<uint, byte[]>();

            //Serializes the loaded block and stores it under iterBlockId. Does nothing for an empty block.
            Action saveBlock = () =>
            {
                if (block.Count == 0)
                {
                    return;
                }

                byte[] encoded = block.Encode_DICT_PROTO_UINT_BYTEARRAY(Compression.eCompressionMethod.Gzip);
                int    needed  = encoded.Length + 4;    //4 bytes of length prefix + payload
                byte[] stored;

                if (needed < searchConfig.MinimalBlockReservInBytes)
                {
                    //Minimal reserve
                    stored = new byte[searchConfig.MinimalBlockReservInBytes];
                }
                else if (needed > iterBlockLen)
                {
                    //Does not fit into the value that was read for this block: doubling reserve
                    stored = new byte[encoded.Length * 2];
                }
                else
                {
                    //Filling existing space
                    stored = new byte[needed];
                }

                stored.CopyInside(0, encoded.Length.To_4_bytes_array_BigEndian());
                stored.CopyInside(4, encoded);

                //Saving into DB
                its.blocks.Insert<uint, byte[]>(iterBlockId, stored);
            };

            foreach (var tbl in xitbls)
            {
                its = tbl.Value;
                if (its.srch == null)   //Can be instantiated in insert procedure, depending on how the indexer is used
                {
                    its.srch = itran.InsertTable<byte>(tbl.Key, 3, 0);
                    its.srch.ValuesLazyLoadingIsOn = false;
                }

                //Are instantiated only here
                its.blocks        = itran.InsertTable<byte>(tbl.Key, 10, 0);
                its.words         = itran.InsertTable<byte>(tbl.Key, 20, 0);
                its.currentBlock  = itran.Select<int, uint>(tbl.Key, 11).Value;
                its.numberInBlock = itran.Select<int, uint>(tbl.Key, 12).Value;

                its.blocks.ValuesLazyLoadingIsOn = false;
                its.words.ValuesLazyLoadingIsOn  = false;

                if (its.currentBlock == 0)
                {
                    //Nothing stored yet: numbering starts with block 1
                    its.numberInBlock = 0;
                    its.currentBlock  = 1;
                }

                //Key is word, Value.Item1 is documents list from which this word must be removed,
                //Value.Item2 is documents list where word must be added, Value.Item3 locates the word's WABI
                var ds = new Dictionary<string, Tuple<HashSet<int>, HashSet<int>, WordInDocs>>();
                Tuple<HashSet<int>, HashSet<int>, WordInDocs> tpl = null;

                //New word definitions, kept sorted (ordinal) until they are written to its.words
                var tmpWrds = new SortedDictionary<string, byte[]>(StringComparer.Ordinal);

                //Writes all buffered word definitions to its.words and empties the buffer
                Action flushWords = () =>
                {
                    foreach (var pending in tmpWrds)
                    {
                        its.words.Insert<string, byte[]>(pending.Key, pending.Value);
                    }
                    tmpWrds.Clear();
                };

                //Registers a word in ds (result is left in tpl). Called only for words missing in ds.
                Action<string> createNew = (word) =>
                {
                    if (tmpWrds.ContainsKey(word))
                    {
                        return;
                    }

                    var wordRow  = its.words.Select<string, byte[]>(word, true);
                    var wordInfo = new WordInDocs();

                    if (wordRow.Exists)
                    {
                        wordInfo.BlockId       = wordRow.Value.Substring(0, 4).To_UInt32_BigEndian();
                        wordInfo.NumberInBlock = wordRow.Value.Substring(4, 4).To_UInt32_BigEndian();
                    }
                    else
                    {
                        its.numberInBlock++;

                        if (its.numberInBlock > searchConfig.QuantityOfWordsInBlock)  //Quantity of words (WAHs) in block
                        {
                            its.currentBlock++;
                            its.numberInBlock = 1;
                        }

                        wordInfo.BlockId       = its.currentBlock;
                        wordInfo.NumberInBlock = its.numberInBlock;

                        //Buffer is full: write it out first. The current word is buffered in either case,
                        //so every new definition reaches its.words.
                        if (tmpWrds.Count >= maxBufferedWords)
                        {
                            flushWords();
                        }

                        tmpWrds[word] = wordInfo.BlockId.To_4_bytes_array_BigEndian().Concat(wordInfo.NumberInBlock.To_4_bytes_array_BigEndian());
                    }

                    tpl      = new Tuple<HashSet<int>, HashSet<int>, WordInDocs>(new HashSet<int>(), new HashSet<int>(), wordInfo);
                    ds[word] = tpl;
                };

                //Document key (4 bytes) -> new searchables, applied to its.srch after all documents are diffed
                var docs2Change = new Dictionary<byte[], byte[]>();

                foreach (var docId in its.ChangedDocIds.OrderBy(r => r))
                {
                    byte[] docKey = docId.To_4_bytes_array_BigEndian();

                    byte[] oldSrch = its.srch.Select<byte[], byte[]>(docKey.Concat(new byte[] { 0 })).Value;  //Current searchables
                    byte[] newSrch = its.srch.Select<byte[], byte[]>(docKey.Concat(new byte[] { 1 })).Value;  //New searchables

                    //diff will return list of words to be removed and list of words to be added
                    var diff = WordsDiff(oldSrch, newSrch);

                    //New searchables will become current searchables
                    docs2Change.Add(docKey, newSrch);

                    //To be removed
                    foreach (var word in diff.Item1)
                    {
                        if (!ds.TryGetValue(word, out tpl))
                        {
                            createNew(word);
                        }

                        tpl.Item1.Add(docId);
                    }

                    //To be added
                    foreach (var word in diff.Item2)
                    {
                        if (!ds.TryGetValue(word, out tpl))
                        {
                            createNew(word);
                        }

                        tpl.Item2.Add(docId);
                    }
                }//eo foreach changed document

                //Copying new searchables to current searchables
                foreach (var d2c in docs2Change.OrderBy(r => r.Key.ToBytesString()))
                {
                    its.srch.RemoveKey<byte[]>(d2c.Key.Concat(new byte[] { 1 }));
                    its.srch.Insert<byte[], byte[]>(d2c.Key.Concat(new byte[] { 0 }), d2c.Value);
                }

                //Writing remaining new word definitions
                flushWords();

                #region "S1"
                //Inserting WAH blocks
                //Going through the list of collected words ordered by BlockId, so every block is loaded and saved once
                block.Clear();
                iterBlockId = 0;

                foreach (var wd1 in ds.OrderBy(r => r.Value.Item3.BlockId))
                {
                    WordInDocs location = wd1.Value.Item3;

                    //Switching block if the word lives in another one than the loaded
                    if (location.BlockId != iterBlockId)
                    {
                        if (iterBlockId > 0)
                        {
                            //We must save current datablock
                            saveBlock();
                        }

                        byte[] val = its.blocks.Select<uint, byte[]>(location.BlockId).Value;
                        iterBlockId  = location.BlockId;
                        iterBlockLen = val == null ? 0 : val.Length;

                        block.Clear();

                        if (val != null)
                        {
                            //First 4 bytes hold the length of the encoded payload that follows
                            int blockSize = val.Substring(0, 4).To_Int32_BigEndian();
                            if (blockSize > 0)
                            {
                                val.Substring(4, blockSize).Decode_DICT_PROTO_UINT_BYTEARRAY(block, Compression.eCompressionMethod.Gzip);
                            }
                        }
                    }

                    //Getting from Block
                    byte[] btWah;
                    WABI   wah;
                    if (block.TryGetValue((uint)location.NumberInBlock, out btWah))
                    {
                        wah = new WABI(btWah);
                    }
                    else
                    {
                        wah = new WABI(null);
                    }

                    //Adding documents
                    foreach (var dId in wd1.Value.Item2)
                    {
                        wah.Add(dId, true);
                    }

                    //Removing documents
                    foreach (var dId in wd1.Value.Item1)
                    {
                        wah.Add(dId, false);
                    }

                    block[location.NumberInBlock] = wah.GetCompressedByteArray();
                }//eo foreach collected word

                //Saving last loaded block
                saveBlock();
                block.Clear();
                #endregion

                itran.Insert<int, uint>(tbl.Key, 11, its.currentBlock);
                itran.Insert<int, uint>(tbl.Key, 12, its.numberInBlock);

                //Setting last indexing time
                itran.Insert<byte, byte[]>(tbl.Key, 4, btUdtStart);
            }//eo foreach tablesToIndex
        }
示例#2
0
        /// <summary>
        /// Brings the text-search index of every supplied table up to date with its changed documents.
        /// The transaction and the per-table indexing state are both supplied by the caller, so the
        /// method can also be driven from outside.
        /// </summary>
        /// <remarks>
        /// For each table the method: diffs the current (key suffix 0) and the new (key suffix 1)
        /// searchables of every changed document via <c>WordsDiff</c> and re-keys the new searchables
        /// to the current key; stores the block id / number-in-block definition of every word that had
        /// none yet; applies the collected additions and removals to the per-word WABI data kept in
        /// blocks; and finally stores the block counters (keys 11 and 12) and the indexing start time
        /// (key 4) through the transaction.
        /// </remarks>
        /// <param name="itran">Transaction through which all index data is read and written.</param>
        /// <param name="xitbls">Indexing state per table, keyed by table name. Each entry supplies
        /// <c>ChangedDocIds</c>; its <c>blocks</c>, <c>words</c> and counters are (re)initialised here.</param>
        internal void DoIndexing(Transaction itran, Dictionary<string, ITS> xitbls)
        {
            //Upper bound of new word definitions buffered in memory before they are written to its.words
            const int maxBufferedWords = 100000;

            //Taken once, before any work is done, and stored as last indexing time of every table
            byte[] btUdtStart = DateTime.UtcNow.Ticks.To_8_bytes_array_BigEndian();

            var searchConfig = itran._transactionUnit.TransactionsCoordinator._engine.Configuration.TextSearchConfig;

            ITS its = null;

            //State of the block that is currently loaded while WABI blocks are being updated
            uint iterBlockId = 0;
            int iterBlockLen = 0;
            Dictionary<uint, byte[]> block = new Dictionary<uint, byte[]>();

            //Serializes the loaded block and stores it under iterBlockId. Does nothing for an empty block.
            Action saveBlock = () =>
            {
                if (block.Count == 0)
                {
                    return;
                }

                byte[] encoded = block.Encode_DICT_PROTO_UINT_BYTEARRAY(Compression.eCompressionMethod.Gzip);
                int needed = encoded.Length + 4;    //4 bytes of length prefix + payload
                byte[] stored;

                if (needed < searchConfig.MinimalBlockReservInBytes)
                {
                    //Minimal reserve
                    stored = new byte[searchConfig.MinimalBlockReservInBytes];
                }
                else if (needed > iterBlockLen)
                {
                    //Does not fit into the value that was read for this block: doubling reserve
                    stored = new byte[encoded.Length * 2];
                }
                else
                {
                    //Filling existing space
                    stored = new byte[needed];
                }

                stored.CopyInside(0, encoded.Length.To_4_bytes_array_BigEndian());
                stored.CopyInside(4, encoded);

                //Saving into DB
                its.blocks.Insert<uint, byte[]>(iterBlockId, stored);
            };

            foreach (var tbl in xitbls)
            {
                its = tbl.Value;
                if (its.srch == null)   //Can be instantiated in insert procedure, depending on how the indexer is used
                {
                    its.srch = itran.InsertTable<byte>(tbl.Key, 3, 0);
                    its.srch.ValuesLazyLoadingIsOn = false;
                }

                //Are instantiated only here
                its.blocks = itran.InsertTable<byte>(tbl.Key, 10, 0);
                its.words = itran.InsertTable<byte>(tbl.Key, 20, 0);
                its.currentBlock = itran.Select<int, uint>(tbl.Key, 11).Value;
                its.numberInBlock = itran.Select<int, uint>(tbl.Key, 12).Value;

                its.blocks.ValuesLazyLoadingIsOn = false;
                its.words.ValuesLazyLoadingIsOn = false;

                if (its.currentBlock == 0)
                {
                    //Nothing stored yet: numbering starts with block 1
                    its.numberInBlock = 0;
                    its.currentBlock = 1;
                }

                //Key is word, Value.Item1 is documents list from which this word must be removed,
                //Value.Item2 is documents list where word must be added, Value.Item3 locates the word's WABI
                var ds = new Dictionary<string, Tuple<HashSet<int>, HashSet<int>, WordInDocs>>();
                Tuple<HashSet<int>, HashSet<int>, WordInDocs> tpl = null;

                //New word definitions, kept sorted (ordinal) until they are written to its.words
                var tmpWrds = new SortedDictionary<string, byte[]>(StringComparer.Ordinal);

                //Writes all buffered word definitions to its.words and empties the buffer
                Action flushWords = () =>
                {
                    foreach (var pending in tmpWrds)
                    {
                        its.words.Insert<string, byte[]>(pending.Key, pending.Value);
                    }
                    tmpWrds.Clear();
                };

                //Registers a word in ds (result is left in tpl). Called only for words missing in ds.
                //Created once per table; it does not depend on the document being processed.
                Action<string> createNew = (word) =>
                {
                    if (tmpWrds.ContainsKey(word))
                    {
                        return;
                    }

                    var wordRow = its.words.Select<string, byte[]>(word, true);
                    var wordInfo = new WordInDocs();

                    if (wordRow.Exists)
                    {
                        wordInfo.BlockId = wordRow.Value.Substring(0, 4).To_UInt32_BigEndian();
                        wordInfo.NumberInBlock = wordRow.Value.Substring(4, 4).To_UInt32_BigEndian();
                    }
                    else
                    {
                        its.numberInBlock++;

                        if (its.numberInBlock > searchConfig.QuantityOfWordsInBlock)  //Quantity of words (WAHs) in block
                        {
                            its.currentBlock++;
                            its.numberInBlock = 1;
                        }

                        wordInfo.BlockId = its.currentBlock;
                        wordInfo.NumberInBlock = its.numberInBlock;

                        //Buffer is full: write it out first. The current word is buffered in either case,
                        //so every new definition reaches its.words.
                        if (tmpWrds.Count >= maxBufferedWords)
                        {
                            flushWords();
                        }

                        tmpWrds[word] = wordInfo.BlockId.To_4_bytes_array_BigEndian().Concat(wordInfo.NumberInBlock.To_4_bytes_array_BigEndian());
                    }

                    tpl = new Tuple<HashSet<int>, HashSet<int>, WordInDocs>(new HashSet<int>(), new HashSet<int>(), wordInfo);
                    ds[word] = tpl;
                };

                foreach (var docId in its.ChangedDocIds)
                {
                    byte[] currentKey = docId.To_4_bytes_array_BigEndian().Concat(new byte[] { 0 });
                    byte[] pendingKey = docId.To_4_bytes_array_BigEndian().Concat(new byte[] { 1 });

                    byte[] newSrch = its.srch.Select<byte[], byte[]>(pendingKey).Value;

                    //diff will return list of words to be removed and list of words to be added
                    var diff = WordsDiff(
                        its.srch.Select<byte[], byte[]>(currentKey, true).Value, //Current searchables
                        newSrch //new
                        );

                    //Copying new searchables to current searchables
                    its.srch.ChangeKey<byte[]>(pendingKey, currentKey);

                    //To be removed
                    foreach (var word in diff.Item1)
                    {
                        if (!ds.TryGetValue(word, out tpl))
                        {
                            createNew(word);
                        }

                        tpl.Item1.Add(docId);
                    }

                    //To be added
                    foreach (var word in diff.Item2)
                    {
                        if (!ds.TryGetValue(word, out tpl))
                        {
                            createNew(word);
                        }

                        tpl.Item2.Add(docId);
                    }
                }//eo foreach changed document

                //Writing remaining new word definitions
                flushWords();

                #region "S1"
                //Inserting WAH blocks
                //Going through the list of collected words ordered by BlockId, so every block is loaded and saved once
                block.Clear();
                iterBlockId = 0;

                foreach (var wd1 in ds.OrderBy(r => r.Value.Item3.BlockId))
                {
                    WordInDocs location = wd1.Value.Item3;

                    //Switching block if the word lives in another one than the loaded
                    if (location.BlockId != iterBlockId)
                    {
                        if (iterBlockId > 0)
                        {
                            //We must save current datablock
                            saveBlock();
                        }

                        byte[] val = its.blocks.Select<uint, byte[]>(location.BlockId).Value;
                        iterBlockId = location.BlockId;
                        iterBlockLen = val == null ? 0 : val.Length;

                        block.Clear();

                        if (val != null)
                        {
                            //First 4 bytes hold the length of the encoded payload that follows
                            int blockSize = val.Substring(0, 4).To_Int32_BigEndian();
                            if (blockSize > 0)
                            {
                                val.Substring(4, blockSize).Decode_DICT_PROTO_UINT_BYTEARRAY(block, Compression.eCompressionMethod.Gzip);
                            }
                        }
                    }

                    //Getting from Block
                    byte[] btWah;
                    WABI wah;
                    if (block.TryGetValue((uint)location.NumberInBlock, out btWah))
                    {
                        wah = new WABI(btWah);
                    }
                    else
                    {
                        wah = new WABI(null);
                    }

                    //Adding documents
                    foreach (var dId in wd1.Value.Item2)
                    {
                        wah.Add(dId, true);
                    }

                    //Removing documents
                    foreach (var dId in wd1.Value.Item1)
                    {
                        wah.Add(dId, false);
                    }

                    block[location.NumberInBlock] = wah.GetCompressedByteArray();
                }//eo foreach collected word

                //Saving last loaded block
                saveBlock();
                block.Clear();
                #endregion

                itran.Insert<int, uint>(tbl.Key, 11, its.currentBlock);
                itran.Insert<int, uint>(tbl.Key, 12, its.numberInBlock);

                //Setting last indexing time
                itran.Insert<byte, byte[]>(tbl.Key, 4, btUdtStart);
            }//eo foreach tablesToIndex
        }