/// <summary>
/// Re-indexes all documents whose searchable text changed, for every supplied table.
/// itbls and transaction must be supplied, to make it work from outside (the caller owns
/// the transaction; nothing is committed here).
/// Per table: computes per-document word diffs, assigns each new word a (BlockId, NumberInBlock)
/// slot, then rewrites the affected bitmap blocks (table 10) and word definitions (table 20).
/// </summary>
/// <param name="itran">Open transaction all reads/writes go through; also the source of engine configuration.</param>
/// <param name="xitbls">Map of table name -> ITS state holder (changed doc ids, nested-table handles, block counters).</param>
internal void DoIndexing(Transaction itran, Dictionary<string, ITS> xitbls)
{
    // Timestamp taken once, before indexing starts; stored per table at the end as "last indexed".
    byte[] btUdtStart = DateTime.UtcNow.Ticks.To_8_bytes_array_BigEndian();
    ITS its = null;
    byte[] kA = null;
    byte[] kZ = null;
    byte[] newSrch = null;
    Row<string, byte[]> rWord = null;
    //Dictionary<string, WordInDocs> wds = new Dictionary<string, WordInDocs>();
    WordInDocs wd = null;
    uint iterBlockId = 0;   // block id currently materialized in "block"; 0 means none yet
    int iterBlockLen = 0;   // stored byte length of that block (used to decide whether to grow the reserve)
    int blockSize = 0;
    byte[] btBlock = null;
    // In-memory image of one stored block: NumberInBlock -> compressed WAH bitmap.
    Dictionary<uint, byte[]> block = new Dictionary<uint, byte[]>();
    byte[] btWah = null;
    byte[] tmp = null;
    byte[] val = null;
    WAH2 wah = null;        // compressed bitmap of document ids (presumably word-aligned-hybrid style — confirm in WAH2)

    foreach (var tbl in xitbls)
    {
        its = tbl.Value;
        if (its.srch == null)    // Can be instantiated in insert procedure, depending on how we use the indexer
        {
            its.srch = itran.InsertTable<byte>(tbl.Key, 3, 0);
            its.srch.ValuesLazyLoadingIsOn = false;
        }

        // These are instantiated only here:
        // table 10 = bitmap blocks, table 20 = word -> (BlockId, NumberInBlock) definitions,
        // keys 11/12 = persisted block counters.
        its.blocks = itran.InsertTable<byte>(tbl.Key, 10, 0);
        its.words = itran.InsertTable<byte>(tbl.Key, 20, 0);
        its.currentBlock = itran.Select<int, uint>(tbl.Key, 11).Value;
        its.numberInBlock = itran.Select<int, uint>(tbl.Key, 12).Value;

        its.blocks.ValuesLazyLoadingIsOn = false;
        its.words.ValuesLazyLoadingIsOn = false;

        if (its.currentBlock == 0)
        {
            // First indexing run for this table: block ids start at 1 (0 is the "nothing loaded" sentinel).
            its.numberInBlock = 0;
            its.currentBlock = 1;
        }

        // Getting latest indexing time for that table (key 4); defaults to DateTime.MinValue if never indexed.
        var litRow = itran.Select<byte, byte[]>(tbl.Key, 4);
        byte[] lastIndexed = DateTime.MinValue.Ticks.To_8_bytes_array_BigEndian();
        if (litRow.Exists)
        {
            lastIndexed = litRow.Value;
        }

        // NOTE(review): kA/kZ are computed but never used anywhere in this method —
        // looks like a leftover of a range-scan approach; confirm before removing.
        kA = lastIndexed.Concat(int.MinValue.To_4_bytes_array_BigEndian());
        kZ = DateTime.MaxValue.Ticks.To_8_bytes_array_BigEndian().Concat(int.MaxValue.To_4_bytes_array_BigEndian());

        // Key is word; Value.Item1 is the set of documents from which this word must be removed,
        // Value.Item2 is the set of documents where the word must be added, Item3 is the word's block slot.
        Dictionary<string, Tuple<HashSet<int>, HashSet<int>, WordInDocs>> ds = new Dictionary<string, Tuple<HashSet<int>, HashSet<int>, WordInDocs>>();
        Tuple<HashSet<int>, HashSet<int>, WordInDocs> tpl = null;

        foreach (var docId in its.ChangedDocIds)
        {
            // WordsDiff returns the list of words to be removed (Item1) and the list to be added (Item2).
            // srch keys: docId + 0x00 = current searchables, docId + 0x01 = new (pending) searchables.
            newSrch = its.srch.Select<byte[], byte[]>(docId.To_4_bytes_array_BigEndian().Concat(new byte[] { 1 })).Value;

            var diff = WordsDiff(
                its.srch.Select<byte[], byte[]>(docId.To_4_bytes_array_BigEndian().Concat(new byte[] { 0 })).Value, // current searchables
                newSrch // new searchables
                );

            // Copying new searchables over current searchables (the doc is now considered indexed).
            its.srch.Insert<byte[], byte[]>(docId.To_4_bytes_array_BigEndian().Concat(new byte[] { 0 }), newSrch);

            // Ensures "word" has an entry in ds: reuses the stored (BlockId, NumberInBlock) slot if the
            // word already exists in table 20, otherwise allocates the next slot (starting a new block
            // when QuantityOfWordsInBlock is exceeded) and persists the new definition.
            // NOTE: deliberately mutates the outer locals rWord/wd/tpl.
            Action<string> createNew = (word) =>
            {
                rWord = its.words.Select<string, byte[]>(word, true);
                wd = new WordInDocs();

                if (rWord.Exists)
                {
                    // Existing word: value layout is BlockId (4 bytes BE) + NumberInBlock (4 bytes BE).
                    wd.BlockId = rWord.Value.Substring(0, 4).To_UInt32_BigEndian();
                    wd.NumberInBlock = rWord.Value.Substring(4, 4).To_UInt32_BigEndian();
                }
                else
                {
                    its.numberInBlock++;

                    if (its.numberInBlock > itran._transactionUnit.TransactionsCoordinator._engine.Configuration.TextSearchConfig.QuantityOfWordsInBlock) // quantity of words (WAHs) per block
                    {
                        its.currentBlock++;
                        its.numberInBlock = 1;
                    }

                    wd.BlockId = its.currentBlock;
                    wd.NumberInBlock = its.numberInBlock;

                    // Inserting new word definition
                    its.words.Insert<string, byte[]>(word, wd.BlockId.To_4_bytes_array_BigEndian().Concat(wd.NumberInBlock.To_4_bytes_array_BigEndian()));
                }

                tpl = new Tuple<HashSet<int>, HashSet<int>, WordInDocs>(new HashSet<int>(), new HashSet<int>(), wd);
                ds[word] = tpl;
            };

            // Words to be removed from this document
            foreach (var word in diff.Item1)
            {
                if (!ds.TryGetValue(word, out tpl))
                {
                    createNew(word);    // side effect: sets tpl
                }
                tpl.Item1.Add(docId);
            }

            // Words to be added to this document
            foreach (var word in diff.Item2)
            {
                if (!ds.TryGetValue(word, out tpl))
                {
                    createNew(word);    // side effect: sets tpl
                }
                tpl.Item2.Add(docId);
            }
        }//eo foreach new searchables, end of document iteration

        #region "S1"
        // Inserting WAH blocks.
        // Going through the list of collected words ordered by BlockId, so each stored block is
        // loaded, patched for all its words, and saved exactly once.
        block.Clear();
        iterBlockId = 0;

        foreach (var wd1 in ds.OrderBy(r => r.Value.Item3.BlockId))
        {
            // Load the word's block if it is not the one currently in memory.
            if (wd1.Value.Item3.BlockId != iterBlockId)
            {
                if (iterBlockId > 0)
                {
                    // We must save the previously loaded data block first.
                    if (block.Count() > 0)
                    {
                        btBlock = block.Encode_DICT_PROTO_UINT_BYTEARRAY(Compression.eCompressionMethod.Gzip);

                        // Stored layout: payload length (4 bytes BE) + gzip'ed proto payload, padded
                        // to a reserve so in-place growth is possible without relocating every time.
                        if ((btBlock.Length + 4) < itran._transactionUnit.TransactionsCoordinator._engine.Configuration.TextSearchConfig.MinimalBlockReservInBytes) // minimal reserve
                        {
                            tmp = new byte[itran._transactionUnit.TransactionsCoordinator._engine.Configuration.TextSearchConfig.MinimalBlockReservInBytes];
                            tmp.CopyInside(0, btBlock.Length.To_4_bytes_array_BigEndian());
                            tmp.CopyInside(4, btBlock);
                        }
                        else if ((btBlock.Length + 4) > iterBlockLen)
                        {
                            // Outgrew the stored reserve: double it.
                            tmp = new byte[btBlock.Length * 2];
                            tmp.CopyInside(0, btBlock.Length.To_4_bytes_array_BigEndian());
                            tmp.CopyInside(4, btBlock);
                        }
                        else
                        {
                            // Fits into the existing space.
                            // NOTE(review): this allocates exactly btBlock.Length + 4 (shrinking the
                            // stored reserve) rather than keeping iterBlockLen — confirm intended.
                            tmp = new byte[btBlock.Length + 4];
                            tmp.CopyInside(0, btBlock.Length.To_4_bytes_array_BigEndian());
                            tmp.CopyInside(4, btBlock);
                        }

                        // Saving into DB
                        its.blocks.Insert<uint, byte[]>(iterBlockId, tmp);
                    }
                    block.Clear();
                }

                val = its.blocks.Select<uint, byte[]>(wd1.Value.Item3.BlockId).Value;
                iterBlockId = wd1.Value.Item3.BlockId;
                iterBlockLen = val == null ? 0 : val.Length;

                if (val != null)
                {
                    blockSize = val.Substring(0, 4).To_Int32_BigEndian();
                    if (blockSize > 0)
                    {
                        btBlock = val.Substring(4, blockSize);
                        block.Clear();
                        btBlock.Decode_DICT_PROTO_UINT_BYTEARRAY(block, Compression.eCompressionMethod.Gzip);
                    }
                    else
                    {
                        block.Clear();
                    }
                }
                else
                {
                    block.Clear();
                }
            }

            // Getting this word's bitmap from the loaded block (missing slot = brand-new bitmap).
            if (block.TryGetValue((uint)wd1.Value.Item3.NumberInBlock, out btWah))
            {
                wah = new WAH2(btWah);
            }
            else
            {
                wah = new WAH2(null);
            }

            // Setting the document bits: true = word present in doc, false = word removed from doc.
            foreach (var dId in wd1.Value.Item2)
            {
                wah.Add(dId, true);
            }
            foreach (var dId in wd1.Value.Item1)
            {
                wah.Add(dId, false);
            }

            block[wd1.Value.Item3.NumberInBlock] = wah.GetCompressedByteArray();
        }//eo foreach wds

        // Saving the last loaded block (the loop above only flushes on block-id change).
        if (block.Count() > 0)
        {
            //!!!!!!!!!!! Remake it for smoothing storage
            btBlock = block.Encode_DICT_PROTO_UINT_BYTEARRAY(Compression.eCompressionMethod.Gzip);

            // Same length-prefix + reserve layout as inside the loop above.
            if ((btBlock.Length + 4) < itran._transactionUnit.TransactionsCoordinator._engine.Configuration.TextSearchConfig.MinimalBlockReservInBytes) // minimal reserve
            {
                tmp = new byte[itran._transactionUnit.TransactionsCoordinator._engine.Configuration.TextSearchConfig.MinimalBlockReservInBytes];
                tmp.CopyInside(0, btBlock.Length.To_4_bytes_array_BigEndian());
                tmp.CopyInside(4, btBlock);
            }
            else if ((btBlock.Length + 4) > iterBlockLen)
            {
                // Doubling reserve
                tmp = new byte[btBlock.Length * 2];
                tmp.CopyInside(0, btBlock.Length.To_4_bytes_array_BigEndian());
                tmp.CopyInside(4, btBlock);
            }
            else
            {
                // Filling existing space
                tmp = new byte[btBlock.Length + 4];
                tmp.CopyInside(0, btBlock.Length.To_4_bytes_array_BigEndian());
                tmp.CopyInside(4, btBlock);
            }

            // Saving into DB
            its.blocks.Insert<uint, byte[]>(iterBlockId, tmp);
        }
        block.Clear();
        #endregion

        // Persisting the block counters so the next indexing run continues from the same slot.
        itran.Insert<int, uint>(tbl.Key, 11, its.currentBlock);
        itran.Insert<int, uint>(tbl.Key, 12, its.numberInBlock);

        // Setting last indexing time (taken before this run started, so overlapping changes are re-indexed next time).
        itran.Insert<byte, byte[]>(tbl.Key, 4, btUdtStart);
    }//eo foreach tablesToIndex
}
/// <summary>
/// Searches documents of a table by word-prefixes taken from the request.
/// Each search word is expanded via prefix scan over the words table (20), the matching
/// WAH bitmaps are loaded from the blocks table (10), merged per original search word,
/// and combined with OR or AND logic. Found internal ids are mapped back to external
/// document IDs via table 2.
/// </summary>
/// <param name="tableName">Name of the indexed table.</param>
/// <param name="req">Search words, logic type (AND/OR), result quantity and noise threshold.</param>
/// <returns>Response whose FoundDocumentIDs holds the external ids of matching documents.</returns>
public TextSearchResponse SearchTextInDocuments(string tableName, TextSearchRequest req)
{
    TextSearchResponse resp = new TextSearchResponse();

    // Words table: [string, byte[]] where value = BlockId (4 bytes BE) + NumberInBlock (4 bytes BE).
    NestedTable tbWords = tran.SelectTable<byte>(tableName, 20, 0);
    tbWords.ValuesLazyLoadingIsOn = false;

    var Words = this.PrepareSearchKeyWords(req.SearchWords);

    #region "Multiple Words"

    int j = -1;                                                 // index of the last merged per-search-word array; -1 = nothing found
    List<byte[]> foundArrays = new List<byte[]>();              // one merged uncompressed bitmap per original search word
    List<byte[]> oneWordFoundArrays = new List<byte[]>();       // bitmaps of all stored words matching the current search word
    bool anyWordFound = false;
    int totalFoundWords = 0;
    Dictionary<string, WordInDocs> words = new Dictionary<string, WordInDocs>();   // all accepted stored words across all search words
    int foundOrigin = 1;                                        // 1-based index of the originating search word, used to group bitmaps later
    Dictionary<string, WordInDocs> perWord = new Dictionary<string, WordInDocs>();
    Dictionary<string, WordInDocs> firstHighOccuranceWord = new Dictionary<string, WordInDocs>();

    // Currently we ignore these word-parts and do nothing with them (too noisy to use directly).
    List<string> highOccuranceWordParts = new List<string>();

    foreach (var word in Words.Take(tran._transactionUnit.TransactionsCoordinator._engine.Configuration.TextSearchConfig.MaxQuantityOfWordsToBeSearched)) // capped quantity of words for one search
    {
        anyWordFound = false;
        totalFoundWords = 0;
        perWord = new Dictionary<string, WordInDocs>();

        // Prefix scan: every stored word starting with the search word is a candidate.
        foreach (var row1 in tbWords.SelectForwardStartsWith<string, byte[]>(word))
        {
            anyWordFound = true;
            totalFoundWords++;

            if (Words.Count() == 1 && totalFoundWords > req.Quantity)
            {
                // Only one search word: no cross-word comparison needed, enough candidates collected.
                break;
            }
            else if (totalFoundWords >= req.NoisyQuantity) // found lots of stored words with this mask inside
            {
                // Too many stored words share this word-part; treat it as high-occurrence noise.
                if (firstHighOccuranceWord.Count() == 0)
                {
                    // Only the first high-occurrence word-part's candidates are kept. They can be used
                    // later in case ALL search words turn out to be high-occurrence (then we will
                    // build the result only from this one).
                    firstHighOccuranceWord = perWord.ToDictionary(r => r.Key, r => r.Value);
                }

                // Clearing the repack element
                perWord.Clear();
                // Remember the noisy word-part (kept, but currently unused in result filtering).
                highOccuranceWordParts.Add(word);
                break;
            }

            perWord.Add(row1.Key,
                        new WordInDocs()
                        {
                            BlockId = row1.Value.Substring(0, 4).To_UInt32_BigEndian(),
                            NumberInBlock = row1.Value.Substring(4, 4).To_UInt32_BigEndian(),
                            foundOrigin = foundOrigin
                        });
        }

        // Repacking occurrences into the global accumulator.
        // NOTE(review): Dictionary.Add throws on duplicate keys — if two search words' prefix
        // expansions contain the same stored word, this throws. Confirm whether
        // PrepareSearchKeyWords guarantees disjoint expansions.
        foreach (var pw in perWord)
        {
            words.Add(pw.Key, pw.Value);
        }

        foundOrigin++;

        if (req.SearchLogicType == TextSearchRequest.eSearchLogicType.AND && !anyWordFound)
        {
            // None of the stored words matched this search word — AND logic can never succeed.
            return (resp);
        }
    }

    if (words.Count() == 0)
    {
        // Multiple search words and every one of them was high-occurrence:
        // form the (approximate) result only from the first high-occurrence candidate list.
        foreach (var pw in firstHighOccuranceWord.Take(req.Quantity))
        {
            words.Add(pw.Key, pw.Value);
        }

        // highOccuranceWordParts must be cleared here, because the returned result is very approximate.
        highOccuranceWordParts.Clear();
    }

    // Now fetch the bitmap data from blocks.
    // Blocks nested table: [uint, byte[]] where key is BlockId.
    NestedTable tbBlocks = tran.SelectTable<byte>(tableName, 10, 0);
    tbBlocks.ValuesLazyLoadingIsOn = false;

    Dictionary<uint, byte[]> block = new Dictionary<uint, byte[]>();
    byte[] btBlock = null;
    uint currentBlockId = 0;

    //DBreeze.Diagnostic.SpeedStatistic.StartCounter("LoadBlocks");

    // Ordered by BlockId so each stored block is loaded and decoded only once.
    foreach (var wrd in words.OrderBy(r => r.Value.BlockId))
    {
        if (currentBlockId != wrd.Value.BlockId)
        {
            currentBlockId = wrd.Value.BlockId;
            block = new Dictionary<uint, byte[]>();

            //DBreeze.Diagnostic.SpeedStatistic.StartCounter("SelectBlocks");
            btBlock = tbBlocks.Select<uint, byte[]>(wrd.Value.BlockId).Value;
            //DBreeze.Diagnostic.SpeedStatistic.StopCounter("SelectBlocks");

            // Stored block layout: payload length (4 bytes BE) + gzip'ed proto dictionary payload.
            btBlock = btBlock.Substring(4, btBlock.Substring(0, 4).To_Int32_BigEndian());

            //DBreeze.Diagnostic.SpeedStatistic.StartCounter("DecomDeserBlocks");
            btBlock.Decode_DICT_PROTO_UINT_BYTEARRAY(block, Compression.eCompressionMethod.Gzip);
            //DBreeze.Diagnostic.SpeedStatistic.StopCounter("DecomDeserBlocks");
        }

        wrd.Value.wah = new WAH2(block[wrd.Value.NumberInBlock]);
    }

    //DBreeze.Diagnostic.SpeedStatistic.PrintOut("LoadBlocks", true);
    //DBreeze.Diagnostic.SpeedStatistic.PrintOut("SelectBlocks", true);
    //DBreeze.Diagnostic.SpeedStatistic.PrintOut("DecomDeserBlocks", true);

    // Group the loaded bitmaps by their originating search word (foundOrigin) and OR-merge
    // each group into one uncompressed array. Relies on OrderBy being a stable sort.
    foundOrigin = 0;    // reused as "previous group id"; starts at 0 = no group yet
    foreach (var wrd in words.OrderBy(r => r.Value.foundOrigin))
    {
        //Console.WriteLine(wrd.Value.foundOrigin);
        if (foundOrigin != wrd.Value.foundOrigin)
        {
            // Group boundary: flush the previous group's bitmaps.
            if (oneWordFoundArrays.Count() > 0)
            {
                j++;
                foundArrays.Add(WAH2.MergeAllUncompressedIntoOne(oneWordFoundArrays));
                oneWordFoundArrays = new List<byte[]>();
            }

            foundOrigin = wrd.Value.foundOrigin;
        }
        else
        {
            // Same group — keep accumulating.
        }

        oneWordFoundArrays.Add(wrd.Value.wah.GetUncompressedByteArray());
    }

    // Flush the last group.
    if (oneWordFoundArrays.Count() > 0)
    {
        j++;
        foundArrays.Add(WAH2.MergeAllUncompressedIntoOne(oneWordFoundArrays));
        oneWordFoundArrays = new List<byte[]>();
    }

    ////////// final results
    if (j >= 0)
    {
        // OR logic by default; AND intersects all per-search-word bitmaps.
        var q = WAH2.TextSearch_OR_logic(foundArrays, req.Quantity);

        if (req.SearchLogicType == TextSearchRequest.eSearchLogicType.AND)
        {
            q = WAH2.TextSearch_AND_logic(foundArrays).Take(req.Quantity);
        }

        // Internal id -> external document id map: [int, byte[]].
        NestedTable i2e = tran.SelectTable<byte>(tableName, 2, 0);
        i2e.ValuesLazyLoadingIsOn = false;

        int qOutput = 0;
        DBreeze.DataTypes.Row<int, byte[]> docRow = null;

        foreach (var el in q)
        {
            // Getting the document's external ID; silently skipped if the doc was
            // deleted while the search was in progress.
            docRow = i2e.Select<int, byte[]>((int)el);
            if (docRow.Exists)
            {
                resp.FoundDocumentIDs.Add(docRow.Value);
            }

            // (Historical dead code removed here: a previous implementation retrieved full
            // documents, re-validated high-occurrence word-parts against doc.Searchables for
            // AND logic, and optionally embedded document content/searchables in the response.)

            // NOTE(review): qOutput counts iterations, not existing docs — deleted docs still
            // consume quota. Confirm intended.
            qOutput++;
            if (qOutput > req.Quantity)
            {
                break;
            }
        }
    }

    #endregion

    return (resp);
}