예제 #1
0
        /// <summary>
        /// Brings the text-search index of every supplied table up to date with its changed documents.
        /// For each table the method diffs the current and the new searchables of every changed document,
        /// assigns a (block, number-in-block) slot to words seen for the first time, applies the collected
        /// additions and removals to the per-word WAH bitmaps block by block and finally stores the block
        /// counters together with the indexing start time.
        /// The transaction and the tables must be supplied by the caller, so the method can be driven from outside.
        /// </summary>
        /// <param name="itran">Transaction through which all reads and writes are performed.</param>
        /// <param name="xitbls">Tables to index: key is the table name, value is the indexing state of that table.</param>
        internal void DoIndexing(Transaction itran, Dictionary<string, ITS> xitbls)
        {
            //Taken once, before any work, and stored as the indexing time of every processed table
            byte[] btUdtStart = DateTime.UtcNow.Ticks.To_8_bytes_array_BigEndian();

            //Content of the block currently being edited: key is the number of the word inside the block, value is its compressed WAH
            Dictionary<uint, byte[]> block = new Dictionary<uint, byte[]>();

            foreach (var tbl in xitbls)
            {
                ITS its = tbl.Value;

                if (its.srch == null)   //Can be instantiated in insert procedure, depending on how the indexer is used
                {
                    its.srch = itran.InsertTable<byte>(tbl.Key, 3, 0);
                    its.srch.ValuesLazyLoadingIsOn = false;
                }

                //These are instantiated only here
                its.blocks        = itran.InsertTable<byte>(tbl.Key, 10, 0);
                its.words         = itran.InsertTable<byte>(tbl.Key, 20, 0);
                its.currentBlock  = itran.Select<int, uint>(tbl.Key, 11).Value;
                its.numberInBlock = itran.Select<int, uint>(tbl.Key, 12).Value;

                its.blocks.ValuesLazyLoadingIsOn = false;
                its.words.ValuesLazyLoadingIsOn  = false;

                if (its.currentBlock == 0)
                {
                    //Nothing was stored yet, block numbering starts from 1
                    its.numberInBlock = 0;
                    its.currentBlock  = 1;
                }

                //Key is word, Value.Item1 is the set of documents from which this word must be removed,
                //Value.Item2 is the set of documents to which the word must be added, Value.Item3 is the slot of the word
                var ds = new Dictionary<string, Tuple<HashSet<int>, HashSet<int>, WordInDocs>>();

                //Returns the entry of the word, creating it (and, for a never seen word, its slot definition) on first use
                Func<string, Tuple<HashSet<int>, HashSet<int>, WordInDocs>> resolveWord = (word) =>
                {
                    Tuple<HashSet<int>, HashSet<int>, WordInDocs> entry;
                    if (ds.TryGetValue(word, out entry))
                    {
                        return entry;
                    }

                    Row<string, byte[]> rWord = its.words.Select<string, byte[]>(word, true);
                    WordInDocs wd = new WordInDocs();

                    if (rWord.Exists)
                    {
                        //Stored definition: 4 bytes block id followed by 4 bytes number in block
                        wd.BlockId       = rWord.Value.Substring(0, 4).To_UInt32_BigEndian();
                        wd.NumberInBlock = rWord.Value.Substring(4, 4).To_UInt32_BigEndian();
                    }
                    else
                    {
                        its.numberInBlock++;

                        if (its.numberInBlock > itran._transactionUnit.TransactionsCoordinator._engine.Configuration.TextSearchConfig.QuantityOfWordsInBlock)  //Quantity of words (WAHs) in block
                        {
                            //Current block is full, the word goes to the first position of the next one
                            its.currentBlock++;
                            its.numberInBlock = 1;
                        }

                        wd.BlockId       = its.currentBlock;
                        wd.NumberInBlock = its.numberInBlock;

                        //Inserting new definition
                        its.words.Insert<string, byte[]>(word, wd.BlockId.To_4_bytes_array_BigEndian().Concat(wd.NumberInBlock.To_4_bytes_array_BigEndian()));
                    }

                    entry    = new Tuple<HashSet<int>, HashSet<int>, WordInDocs>(new HashSet<int>(), new HashSet<int>(), wd);
                    ds[word] = entry;
                    return entry;
                };

                foreach (var docId in its.ChangedDocIds)
                {
                    //Suffix 0 addresses the current (already indexed) searchables of the document, suffix 1 the new ones
                    byte[] currentKey = docId.To_4_bytes_array_BigEndian().Concat(new byte[] { 0 });
                    byte[] newKey     = docId.To_4_bytes_array_BigEndian().Concat(new byte[] { 1 });

                    byte[] newSrch = its.srch.Select<byte[], byte[]>(newKey).Value;

                    //diff.Item1 is the list of words to be removed, diff.Item2 is the list of words to be added
                    var diff = WordsDiff(
                        its.srch.Select<byte[], byte[]>(currentKey).Value,
                        newSrch
                        );

                    //Copying new searchables to current searchables
                    its.srch.Insert<byte[], byte[]>(currentKey, newSrch);

                    //To be removed
                    foreach (var word in diff.Item1)
                    {
                        resolveWord(word).Item1.Add(docId);
                    }

                    //To be added
                    foreach (var word in diff.Item2)
                    {
                        resolveWord(word).Item2.Add(docId);
                    }
                }//eo foreach changed document

                #region "S1"
                //Inserting WAH blocks
                //Going through the collected words ordered by block id, so every block is loaded and saved only once
                block.Clear();
                uint iterBlockId  = 0;  //Id of the block currently held in "block", 0 while nothing is loaded
                int  iterBlockLen = 0;  //Length of the value currently stored in DB for that block

                foreach (var wd1 in ds.OrderBy(r => r.Value.Item3.BlockId))
                {
                    WordInDocs slot = wd1.Value.Item3;

                    //Switching to another block if the word doesn't live in the loaded one
                    if (slot.BlockId != iterBlockId)
                    {
                        if (iterBlockId > 0)
                        {
                            //Previous block is complete and must be saved
                            SaveIndexBlock(itran, its, iterBlockId, block, iterBlockLen);
                        }

                        block.Clear();

                        byte[] val = its.blocks.Select<uint, byte[]>(slot.BlockId).Value;
                        iterBlockId  = slot.BlockId;
                        iterBlockLen = val == null ? 0 : val.Length;

                        if (val != null)
                        {
                            //First 4 bytes hold the length of the encoded block, the rest of the value may be a reserve
                            int blockSize = val.Substring(0, 4).To_Int32_BigEndian();
                            if (blockSize > 0)
                            {
                                val.Substring(4, blockSize).Decode_DICT_PROTO_UINT_BYTEARRAY(block, Compression.eCompressionMethod.Gzip);
                            }
                        }
                    }

                    //Getting the WAH of the word from the block or starting an empty one
                    byte[] btWah;
                    WAH2 wah;
                    if (block.TryGetValue((uint)slot.NumberInBlock, out btWah))
                    {
                        wah = new WAH2(btWah);
                    }
                    else
                    {
                        wah = new WAH2(null);
                    }

                    //Adding documents
                    foreach (var dId in wd1.Value.Item2)
                    {
                        wah.Add(dId, true);
                    }

                    //Removing documents
                    foreach (var dId in wd1.Value.Item1)
                    {
                        wah.Add(dId, false);
                    }

                    block[slot.NumberInBlock] = wah.GetCompressedByteArray();
                }//eo foreach collected word

                //Saving the last edited block
                SaveIndexBlock(itran, its, iterBlockId, block, iterBlockLen);
                block.Clear();
                #endregion

                itran.Insert<int, uint>(tbl.Key, 11, its.currentBlock);
                itran.Insert<int, uint>(tbl.Key, 12, its.numberInBlock);

                //Setting last indexing time
                itran.Insert<byte, byte[]>(tbl.Key, 4, btUdtStart);
            }//eo foreach table to index
        }

        /// <summary>
        /// Serializes a non-empty in-memory block and stores it in the blocks table of <paramref name="its"/>.
        /// Layout of the stored value: 4 bytes with the big-endian length of the encoded block, the encoded block itself,
        /// then zero bytes up to the reserved size.
        /// </summary>
        /// <param name="itran">Transaction, used to reach the text-search configuration.</param>
        /// <param name="its">Indexing state of the table whose blocks table receives the value.</param>
        /// <param name="blockId">Id under which the block is stored.</param>
        /// <param name="block">Block content; nothing is stored when it is empty.</param>
        /// <param name="storedLength">Length of the value currently stored for this block, 0 if there is none.</param>
        private static void SaveIndexBlock(Transaction itran, ITS its, uint blockId, Dictionary<uint, byte[]> block, int storedLength)
        {
            if (block.Count == 0)
            {
                return;
            }

            byte[] encoded  = block.Encode_DICT_PROTO_UINT_BYTEARRAY(Compression.eCompressionMethod.Gzip);
            int    required = encoded.Length + 4;   //Length prefix + payload
            var    minimalReserve = itran._transactionUnit.TransactionsCoordinator._engine.Configuration.TextSearchConfig.MinimalBlockReservInBytes;

            byte[] stored;
            if (required < minimalReserve)
            {
                //Minimal reserve
                stored = new byte[minimalReserve];
            }
            else if (required > storedLength)
            {
                //Block outgrew the stored value: doubling the reserve, but never below what prefix and payload need
                stored = new byte[Math.Max(encoded.Length * 2, required)];
            }
            else
            {
                //Block fits into the length of the stored value
                stored = new byte[required];
            }

            stored.CopyInside(0, encoded.Length.To_4_bytes_array_BigEndian());
            stored.CopyInside(4, encoded);

            //Saving into DB
            its.blocks.Insert<uint, byte[]>(blockId, stored);
        }
예제 #2
0
        /// <summary>
        /// Searches the text index of a table and collects the IDs of the matching documents.
        /// Every search word is matched as a prefix against the indexed words; the document bitmaps (WAH) of all
        /// indexed words found for one search word are merged, and the per-search-word results are then combined
        /// with AND or OR logic, depending on the request.
        /// </summary>
        /// <param name="tableName">Name of the table whose text index is searched.</param>
        /// <param name="req">Search words, logic type (AND / OR) and quantity limits of the search.</param>
        /// <returns>
        /// Response whose FoundDocumentIDs holds the values stored for the found internal document ids;
        /// it stays empty when nothing matched.
        /// </returns>
        public TextSearchResponse SearchTextInDocuments(string tableName, TextSearchRequest req)
        {
            TextSearchResponse resp = new TextSearchResponse();

            //[string,byte[]] where value is BlockId[uint] + NumberInBlock[uint]
            NestedTable tbWords = tran.SelectTable<byte>(tableName, 20, 0);
            tbWords.ValuesLazyLoadingIsOn = false;

            var searchWords = this.PrepareSearchKeyWords(req.SearchWords);
            bool singleSearchWord = searchWords.Count() == 1;

            #region "Multiple Words"

            //All indexed words selected for the search. A list (not a dictionary keyed by word) is used on purpose:
            //two search words can prefix-match the same indexed word and each hit must stay under its own foundOrigin.
            List<WordInDocs> words = new List<WordInDocs>();

            //Ordinal number (1-based) of the search word that produced a hit
            int foundOrigin = 1;

            Dictionary<string, WordInDocs> firstHighOccuranceWord = new Dictionary<string, WordInDocs>();

            //Search words matching too many indexed words. They are only collected here, nothing filters the result by them.
            List<string> highOccuranceWordParts = new List<string>();

            foreach (var word in searchWords.Take(tran._transactionUnit.TransactionsCoordinator._engine.Configuration.TextSearchConfig.MaxQuantityOfWordsToBeSearched)) //Limited quantity of words for search
            {
                bool anyWordFound    = false;
                int  totalFoundWords = 0;
                Dictionary<string, WordInDocs> perWord = new Dictionary<string, WordInDocs>();

                foreach (var row1 in tbWords.SelectForwardStartsWith<string, byte[]>(word))
                {
                    anyWordFound = true;
                    totalFoundWords++;

                    if (singleSearchWord && totalFoundWords > req.Quantity)
                    {
                        //With only one search word there is nothing to compare with, enough words are collected
                        break;
                    }
                    else if (totalFoundWords >= req.NoisyQuantity)  //Found lots of words with such mask inside
                    {
                        //Too many indexed words start with this word part, its hits are dropped
                        if (firstHighOccuranceWord.Count == 0)
                        {
                            //Only the first high-occurance word part is remembered. It is used later
                            //if all search words turn out to be of high occurance.
                            firstHighOccuranceWord = perWord.ToDictionary(r => r.Key, r => r.Value);
                        }

                        perWord.Clear();
                        highOccuranceWordParts.Add(word);
                        break;
                    }

                    perWord.Add(row1.Key, new WordInDocs()
                    {
                        BlockId       = row1.Value.Substring(0, 4).To_UInt32_BigEndian(),
                        NumberInBlock = row1.Value.Substring(4, 4).To_UInt32_BigEndian(),
                        foundOrigin   = foundOrigin
                    });
                }

                //Repacking occurances
                words.AddRange(perWord.Values);

                foundOrigin++;

                if (req.SearchLogicType == TextSearchRequest.eSearchLogicType.AND && !anyWordFound)
                {
                    //One of the search words matched nothing, so AND logic cannot be satisfied
                    return resp;
                }
            }

            if (words.Count == 0)
            {
                //Every search word was of high occurance (or nothing was found at all).
                //The result is formed only from the first high-occurance list.
                foreach (var pw in firstHighOccuranceWord.Take(req.Quantity))
                {
                    words.Add(pw.Value);
                }

                //In this case highOccuranceWordParts must be cleared, because the returned result is very approximate
                highOccuranceWordParts.Clear();
            }

            //Loading WAHs from blocks
            //Nested table with blocks: [uint,byte[]] where key is BlockId
            NestedTable tbBlocks = tran.SelectTable<byte>(tableName, 10, 0);
            tbBlocks.ValuesLazyLoadingIsOn = false;

            Dictionary<uint, byte[]> block = new Dictionary<uint, byte[]>();
            uint currentBlockId = 0;

            //Ordered by block id, so every block is read and decoded only once
            foreach (var wrd in words.OrderBy(r => r.BlockId))
            {
                if (currentBlockId != wrd.BlockId)
                {
                    currentBlockId = wrd.BlockId;
                    block          = new Dictionary<uint, byte[]>();

                    byte[] btBlock = tbBlocks.Select<uint, byte[]>(wrd.BlockId).Value;
                    //First 4 bytes hold the length of the encoded block that follows them
                    btBlock = btBlock.Substring(4, btBlock.Substring(0, 4).To_Int32_BigEndian());
                    btBlock.Decode_DICT_PROTO_UINT_BYTEARRAY(block, Compression.eCompressionMethod.Gzip);
                }

                wrd.wah = new WAH2(block[wrd.NumberInBlock]);
            }

            //Merging WAHs of all indexed words found for one search word into one array.
            //OrderBy is stable, hits of one search word come as one consecutive run.
            List<byte[]> foundArrays        = new List<byte[]>();
            List<byte[]> oneWordFoundArrays = new List<byte[]>();

            foundOrigin = 0;

            foreach (var wrd in words.OrderBy(r => r.foundOrigin))
            {
                if (foundOrigin != wrd.foundOrigin)
                {
                    //Hits of the next search word start here, flushing what was collected for the previous one
                    if (oneWordFoundArrays.Count > 0)
                    {
                        foundArrays.Add(WAH2.MergeAllUncompressedIntoOne(oneWordFoundArrays));
                        oneWordFoundArrays = new List<byte[]>();
                    }

                    foundOrigin = wrd.foundOrigin;
                }

                oneWordFoundArrays.Add(wrd.wah.GetUncompressedByteArray());
            }

            //The last search word
            if (oneWordFoundArrays.Count > 0)
            {
                foundArrays.Add(WAH2.MergeAllUncompressedIntoOne(oneWordFoundArrays));
            }

            //////////  final results

            if (foundArrays.Count > 0)
            {
                var q = WAH2.TextSearch_OR_logic(foundArrays, req.Quantity);

                if (req.SearchLogicType == TextSearchRequest.eSearchLogicType.AND)
                {
                    q = WAH2.TextSearch_AND_logic(foundArrays).Take(req.Quantity);
                }

                //Key int, Value byte[]
                NestedTable i2e = tran.SelectTable<byte>(tableName, 2, 0);
                i2e.ValuesLazyLoadingIsOn = false;

                int qOutput = 0;
                foreach (var el in q)
                {
                    //Getting document external ID
                    DBreeze.DataTypes.Row<int, byte[]> docRow = i2e.Select<int, byte[]>((int)el);
                    if (docRow.Exists)
                    {
                        resp.FoundDocumentIDs.Add(docRow.Value);
                    }

                    qOutput++;

                    //Never handling more than the requested quantity of found ids
                    if (qOutput >= req.Quantity)
                    {
                        break;
                    }
                }
            }
            #endregion

            return resp;
        }