/// <summary>
/// Constructor (automatically starts the unfinished-documents indexing job).
/// </summary>
/// <param name="DBreezeEngine">DBreeze engine; must be already initialized by the caller</param>
/// <exception cref="Exception">thrown via ThrowException when <paramref name="DBreezeEngine"/> is null</exception>
public Storage(DBreezeEngine DBreezeEngine)
{
    //Guard clause first: validate the argument before wiring any state or event handlers
    if (DBreezeEngine == null)
        throw ThrowException("Storage", "DBreezeEngine must be instantiated");

    this.OnProcessingStarted += Storage_OnProcessingStarted;
    this.OnProcessingStopped += Storage_OnProcessingStopped;

    this.DBreezeEngine = DBreezeEngine;

    //Preparing Protobuf: pre-generate serializers so the first real (de)serialization
    //does not pay the model-compilation cost
    ProtoBuf.Serializer.PrepareSerializer<Document>();
    ProtoBuf.Serializer.PrepareSerializer<SearchRequest>();
    ProtoBuf.Serializer.PrepareSerializer<SearchResponse>();

    //Warm-up round-trips through the serialization helpers
    Document o1 = new Document();
    o1.SerializeProtobuf();
    SearchRequest o2 = new SearchRequest();
    o2.SerializeProtobuf();
    SearchResponse o3 = new SearchResponse();
    o3.SerializeProtobuf();

    //Automatic indexing of documents whose indexing was not finished on a previous run
    StartDocumentsIndexing();
}
/// <summary>
/// Searches a document space for documents matching the words in <paramref name="req"/>.
/// Word-parts are matched by prefix (SelectForwardStartsWith); hit sets are WAH-compressed
/// bitmaps that are merged per search word and combined via AND/OR logic.
/// </summary>
/// <param name="req">search request; DocumentSpace and SearchWords must be non-empty</param>
/// <returns>a SearchResponse; empty (defaults) when the request is invalid or the space is unknown</returns>
public SearchResponse SearchDocumentSpace(SearchRequest req)
{
    SearchResponse resp = new SearchResponse();

    try
    {
        //Invalid request: return an empty response rather than throwing
        if (req == null || String.IsNullOrEmpty(req.DocumentSpace) || String.IsNullOrEmpty(req.SearchWords))
            return resp;

        resp.DocumentSpace = req.DocumentSpace;

        //Accumulated results keyed by internal document id; value is null when
        //req.IncludeDocuments is false (only ids are returned then)
        Dictionary<int, Document> dmnts = new Dictionary<int, Document>();

        Action repack = () =>
        {
            //Repacking dmnts into resp
            if (req.IncludeDocuments)
            {
                foreach (var el in dmnts)
                {
                    resp.Documents.Add(el.Value);
                }
            }
            else
            {
                foreach (var el in dmnts)
                {
                    resp.DocumentsInternalIds.Add(el.Key);
                }
            }
        };

        System.Diagnostics.Stopwatch sw = new System.Diagnostics.Stopwatch();
        sw.Start();

        using (var tran = DBreezeEngine.GetTransaction())
        {
            //Master table "m" maps document-space name -> numeric space id
            var mt = tran.SelectTable<int>(DocumentsStorageTablesPrefix + "m", 1, 0);
            var docSpaceId = mt.Select<string, long>(req.DocumentSpace).Value;
            if (docSpaceId == 0)
                return resp; //Not found document space

            var Words = this.PrepareSearchKeyWords(req.SearchWords);

            string docTable = DocumentsStorageTablesPrefix + "d" + docSpaceId.ToString();
            var vt = tran.SelectTable<int>(docTable, 3, 0); //Version table Key
            var dt = tran.SelectTable<int>(docTable, 1, 0); //Document table Key
            //Lazy value loading only when the caller doesn't need document bodies
            dt.ValuesLazyLoadingIsOn = !req.IncludeDocuments;

            DBreeze.DataTypes.Row<int, byte[]> docRow = null;
            Document doc = null;
            //byte[] btDoc = null;
            int qOutput = 0;

            //----------------------------------------------------------------- ONE/MULTIPLE WORDS SEARCH then one word is supplied, using AND/OR LOGIC
            #region "Multiple Words"

            //j counts merged per-word bitmaps; -1 means "nothing found yet"
            int j = -1;
            List<byte[]> foundArrays = new List<byte[]>();
            List<byte[]> oneWordFoundArrays = new List<byte[]>();
            //WAH2 wh = null;

            //Sub-table 2 of "s{spaceId}": word -> 8 bytes (block id + number-in-block), both big-endian int32
            var tbOneWordWAH = tran.SelectTable<int>(DocumentsStorageTablesPrefix + "s" + docSpaceId.ToString(), 2, 0);
            tbOneWordWAH.ValuesLazyLoadingIsOn = false;

            resp.UniqueWordsInDataSpace = (int)tbOneWordWAH.Count();

            bool anyWordFound = false;
            int totalFoundWords = 0;

            //All accepted word-part hits across all search words
            Dictionary<string, WordInDoc> words = new Dictionary<string, WordInDoc>();
            //foundOrigin tags each hit with the index of the search word it came from,
            //so hits can later be re-grouped per original search word
            int foundOrigin = 1;

            Dictionary<string, WordInDoc> perWord = new Dictionary<string, WordInDoc>();
            Dictionary<string, WordInDoc> firstHighOccuranceWord = new Dictionary<string, WordInDoc>();

            //Currently we ignore these words and do nothing with them
            List<string> highOccuranceWordParts = new List<string>();

            foreach (var word in Words.Take(10)) //Maximum 10 words for search
            {
                anyWordFound = false;
                totalFoundWords = 0;
                perWord = new Dictionary<string, WordInDoc>();

                //Prefix scan: every stored word starting with the search word is a candidate
                foreach (var row1 in tbOneWordWAH.SelectForwardStartsWith<string, byte[]>(word))
                {
                    anyWordFound = true;
                    totalFoundWords++;

                    if (Words.Count() == 1 && totalFoundWords > req.Quantity)
                    {
                        //In case if only one search word, then we don't need to make any comparation
                        break;
                    }
                    else if (totalFoundWords >= req.MaximalExcludingOccuranceOfTheSearchPattern) //Found lots of words with such mask inside
                    {
                        //Too much found docs have this word-part inside, better to enhance search
                        if (firstHighOccuranceWord.Count() == 0)
                        {
                            //Only first HighOccurance word part come to the list. It can be used later in case if all search words are of HighOccurance (then we will visualize only this one)
                            firstHighOccuranceWord = perWord.ToDictionary(r => r.Key, r => r.Value);
                        }

                        //Clearing repack element
                        perWord.Clear();

                        //Adding word into List of High-Occurance word-part
                        highOccuranceWordParts.Add(word);
                        break;
                    }

                    //Value layout: bytes [0..3] = block id, bytes [4..7] = index inside block (big-endian)
                    perWord.Add(row1.Key, new WordInDoc()
                    {
                        BlockId = row1.Value.Substring(0, 4).To_Int32_BigEndian(),
                        NumberInBlock = row1.Value.Substring(4, 4).To_Int32_BigEndian(),
                        foundOrigin = foundOrigin
                    });
                }

                //Repacking occurances
                foreach (var pw in perWord)
                    words.Add(pw.Key, pw.Value);

                foundOrigin++;

                if (
                    req.SearchLogicType == SearchRequest.eSearchLogicType.AND
                    &&
                    !anyWordFound
                    )
                {
                    //Non of words found corresponding to AND logic
                    sw.Stop();
                    resp.SearchDurationMs = sw.ElapsedMilliseconds;
                    return resp;
                }
            }

            if (words.Count() == 0)
            {
                //In case of multiple search words and each of them of HighOccurance.
                //We will form result only from the first HighOccurance list

                //Repacking occurances
                foreach (var pw in firstHighOccuranceWord.Take(req.Quantity))
                    words.Add(pw.Key, pw.Value);

                //In this case highOccuranceWordParts must be cleared, because the returning result is very approximate
                highOccuranceWordParts.Clear();
            }

            //Here we must start get data from blocks
            //Nested table with blocks
            var tbBlocks = tran.SelectTable<int>(DocumentsStorageTablesPrefix + "s" + docSpaceId.ToString(), 10, 0);
            tbBlocks.ValuesLazyLoadingIsOn = false;

            Dictionary<int, byte[]> block = null;
            byte[] btBlock = null;
            int currentBlockId = 0;

            //DBreeze.Diagnostic.SpeedStatistic.StartCounter("LoadBlocks");
            //Iterate in BlockId order so each block is fetched/decompressed only once
            foreach (var wrd in words.OrderBy(r => r.Value.BlockId))
            {
                if (currentBlockId != wrd.Value.BlockId)
                {
                    currentBlockId = wrd.Value.BlockId;

                    //DBreeze.Diagnostic.SpeedStatistic.StartCounter("SelectBlocks");
                    btBlock = tbBlocks.Select<int, byte[]>(wrd.Value.BlockId).Value;
                    //DBreeze.Diagnostic.SpeedStatistic.StopCounter("SelectBlocks");

                    //First 4 bytes hold the payload length (big-endian); payload is GZip-compressed protobuf
                    btBlock = btBlock.Substring(4, btBlock.Substring(0, 4).To_Int32_BigEndian());

                    //DBreeze.Diagnostic.SpeedStatistic.StartCounter("DecomDeserBlocks");
                    btBlock = btBlock.DecompressGZip();
                    block = btBlock.DeserializeProtobuf<Dictionary<int, byte[]>>();
                    //DBreeze.Diagnostic.SpeedStatistic.StopCounter("DecomDeserBlocks");
                }

                //Attach the word's WAH bitmap from the cached block
                wrd.Value.wah = new WAH2(block[wrd.Value.NumberInBlock]);
            }

            //DBreeze.Diagnostic.SpeedStatistic.PrintOut("LoadBlocks", true);
            //DBreeze.Diagnostic.SpeedStatistic.PrintOut("SelectBlocks", true);
            //DBreeze.Diagnostic.SpeedStatistic.PrintOut("DecomDeserBlocks", true);

            //Group hits back by originating search word (OrderBy is stable) and OR-merge
            //all bitmaps of the same search word into one array per word
            foundOrigin = 0;
            foreach (var wrd in words.OrderBy(r => r.Value.foundOrigin))
            {
                //Console.WriteLine(wrd.Value.foundOrigin);
                if (foundOrigin != wrd.Value.foundOrigin)
                {
                    //Origin changed: flush the previous word's accumulated bitmaps
                    if (oneWordFoundArrays.Count() > 0)
                    {
                        j++;
                        foundArrays.Add(WAH2.MergeAllUncompressedIntoOne(oneWordFoundArrays));
                        oneWordFoundArrays = new List<byte[]>();
                    }

                    foundOrigin = wrd.Value.foundOrigin;
                }
                else
                {
                }

                oneWordFoundArrays.Add(wrd.Value.wah.GetUncompressedByteArray());
            }

            //The last
            if (oneWordFoundArrays.Count() > 0)
            {
                j++;
                foundArrays.Add(WAH2.MergeAllUncompressedIntoOne(oneWordFoundArrays));
                oneWordFoundArrays = new List<byte[]>();
            }

            ////////// final results

            if (j >= 0)
            {
                //One merged bitmap per search word: combine them with the requested logic
                var q = WAH2.TextSearch_OR_logic(foundArrays, req.Quantity);

                if (req.SearchLogicType == SearchRequest.eSearchLogicType.AND)
                    q = WAH2.TextSearch_AND_logic(foundArrays).Take(req.Quantity);

                foreach (var el in q)
                {
                    //Getting document
                    docRow = dt.Select<int, byte[]>((int)el);
                    if (docRow.Exists)
                    {
                        if (!dmnts.ContainsKey((int)el))
                        {
                            if (highOccuranceWordParts.Count() > 0)
                            {
                                //We got some noisy word-parts of high occurance together with strongly found words.
                                //We must be sure that these word parts are also inside of returned docs
                                doc = this.RetrieveDocument(req.IncludeDocumentsContent, true, dt, docRow);
                                if (doc != null)
                                {
                                    //Checking doc.Searchables must have all word parts from the occurance in case of AND
                                    if (req.SearchLogicType == SearchRequest.eSearchLogicType.AND)
                                    {
                                        if (String.IsNullOrEmpty(doc.Searchables))
                                            continue;
                                        if (!highOccuranceWordParts.All(doc.Searchables.ToLower().Contains))
                                            continue;
                                    }

                                    if (req.IncludeDocuments)
                                    {
                                        if (!req.IncludeDocumentsSearchanbles)
                                            doc.Searchables = String.Empty;

                                        dmnts.Add((int)el, doc);
                                    }
                                    else
                                    {
                                        dmnts.Add((int)el, null);
                                    }
                                }
                                else
                                    continue;
                            }
                            else
                            {
                                if (req.IncludeDocuments)
                                {
                                    doc = this.RetrieveDocument(req.IncludeDocumentsContent, req.IncludeDocumentsSearchanbles, dt, docRow);
                                    if (doc == null) //If doc is deleted, while search was in progress and we received its id in the list
                                        continue;

                                    dmnts.Add((int)el, doc);
                                }
                                else
                                {
                                    dmnts.Add((int)el, null);
                                }
                            }

                            qOutput++;
                        }
                    }

                    //NOTE(review): the break fires only after qOutput exceeds req.Quantity,
                    //so up to Quantity+1 documents can be collected — confirm intended
                    if (qOutput > req.Quantity)
                        break;
                }
            }

            #endregion

        }//eo using

        //Repacking dmnts into resp
        repack();

        sw.Stop();
        resp.SearchDurationMs = sw.ElapsedMilliseconds;
    }
    catch (Exception ex)
    {
        throw ThrowException("SearchDocumentSpace", ex.ToString());
    }

    return resp;
}