/// <summary>
/// Runs a two-tag boolean query using the hand-written (non-LINQ) query helpers.
/// The operator in <paramref name="info"/> selects AndQuery, AndNotQuery, OrQuery or OrNotQuery;
/// the elapsed time is pushed to Results and the counters of the run are logged.
/// </summary>
/// <param name="info">Query type, both tags, the operator and the paging values (PageSize, Skip).</param>
/// <param name="tagsToExclude">Optional set of tags handed through to the query helper; may be null.</param>
/// <returns>
/// The questions produced by the helper plus the "BaseQueryCounter", "ItemsSkipped" and
/// "ExcludedCounter" values it reported.
/// </returns>
/// <exception cref="InvalidOperationException">The operator is not AND, AND-NOT, OR or OR-NOT.</exception>
internal QueryResult QueryNoLINQ(QueryInfo info, CLR.HashSet<string> tagsToExclude = null)
{
    Stopwatch stopwatch = Stopwatch.StartNew();

    TagByQueryLookup lookup = GetTagByQueryLookup(info.Type);
    ThrowIfInvalidParameters(info.Tag, info.PageSize, lookup);
    ThrowIfInvalidParameters(info.OtherTag, info.PageSize, lookup);

    // The lookups are deliberately indexed inside each branch, so nothing is
    // touched when the operator turns out to be invalid.
    ComplexQueryResult outcome;
    string op = info.Operator;
    if (op == "AND")
    {
        outcome = AndQuery(lookup[info.Tag], lookup[info.OtherTag], info.PageSize, info.Skip, tagsToExclude);
    }
    else if (op == "AND-NOT")
    {
        outcome = AndNotQuery(lookup[info.Tag], lookup[info.OtherTag], info.PageSize, info.Skip, tagsToExclude);
    }
    else if (op == "OR")
    {
        outcome = OrQuery(lookup[info.Tag], lookup[info.OtherTag], info.PageSize, info.Skip, tagsToExclude);
    }
    else if (op == "OR-NOT")
    {
        // e.g. ".net+or+jquery-"
        outcome = OrNotQuery(lookup[info.Tag], lookup[info.OtherTag], lookup[TagServer.ALL_TAGS_KEY],
                             info.PageSize, info.Skip, tagsToExclude);
    }
    else
    {
        // TODO Work out what a "NOT" query really means, at the moment it's the same as "AND-NOT"?!
        //   outcome = NotQuery(lookup[info.Tag], lookup[info.OtherTag], info.PageSize, info.Skip, tagsToExclude);
        throw new InvalidOperationException(string.Format("Invalid operator specified: {0}", op ?? "<NULL>"));
    }

    stopwatch.Stop();
    TimeSpan elapsed = stopwatch.Elapsed;
    Results.AddData(elapsed.TotalMilliseconds.ToString("#.##"));

    Logger.Log("NO LINQ Boolean Query: \"{0}\" {1} \"{2}\", pageSize = {3:N0}, skip = {4:N0}, took {5} ({6:N2} ms) NO LINQ",
               info.Tag, op, info.OtherTag, info.PageSize, info.Skip, elapsed, elapsed.TotalMilliseconds);

    string exclusionSummary = (tagsToExclude == null) ? "NO" : tagsToExclude.Count.ToString("N0");
    Logger.Log("Got {0:N0} results in total, baseQueryCounter = {1:N0}, itemsSkipped = {2:N0}, excludedCounter = {3:N0} ({4} tags to be excluded)",
               outcome.Results.Count(), outcome.BaseQueryCounter, outcome.ItemsSkipped, outcome.ExcludedCounter, exclusionSummary);

    var response = new QueryResult();
    response.Questions = outcome.Results;

    var counters = new Dictionary<string, int>();
    counters.Add("BaseQueryCounter", outcome.BaseQueryCounter);
    counters.Add("ItemsSkipped", outcome.ItemsSkipped);
    counters.Add("ExcludedCounter", outcome.ExcludedCounter);
    response.Counters = counters;

    return response;
}
/// <summary>
/// Collects every item of every excluded tag into a cached hash set, then walks the base query
/// for <paramref name="tag"/> and keeps the items that are not in that set, stopping as soon as
/// skip + pageSize items have been kept.
/// Expensive when there are LOTS of exclusions, but cheaper when the base query is large because
/// the walk stops once enough items were found.
/// </summary>
/// <param name="type">Selects the lookup (and field selector) the query runs against.</param>
/// <param name="tag">Tag whose items form the base query.</param>
/// <param name="excludedTags">Tags whose items must not appear in the result.</param>
/// <param name="pageSize">Maximum number of questions to return.</param>
/// <param name="skip">Number of matching items to skip before the page starts.</param>
/// <returns>The requested page of questions.</returns>
internal List<Question> BooleanQueryWithExclusionsFastAlternativeVersion(QueryType type, string tag, IList<string> excludedTags, int pageSize, int skip)
{
    var gcInfo = new GCCollectionInfo();
    Stopwatch stopwatch = Stopwatch.StartNew();

    TagByQueryLookup lookup = GetTagByQueryLookup(type);
    // Only referenced by the commented-out result dump at the end of the method.
    Func<Question, string> fieldSelector = GetFieldSelector(type);
    ThrowIfInvalidParameters(tag, pageSize, lookup);

    // Step 1: gather everything that has to be filtered out.
    var exclusions = cache.Value.GetCachedHashSet();
    foreach (string excluded in excludedTags)
    {
        foreach (var item in lookup[excluded])
        {
            exclusions.Add(item);
        }
    }

    // Step 2: scan the base query until enough survivors are collected.
    int wanted = skip + pageSize;
    var allResults = new List<int>(wanted);
    foreach (var item in lookup[tag])
    {
        if (!exclusions.Contains(item))
        {
            allResults.Add(item);
        }

        if (allResults.Count >= wanted)
        {
            break;
        }
    }

    // Step 3: cut the requested page out of the survivors and map it to questions.
    IEnumerable<int> page = allResults.Skip(skip).Take(pageSize);
    List<Question> results = page.Select(index => questions[index]).ToList();

    stopwatch.Stop();
    gcInfo.UpdateCollectionInfo();
    TimeSpan elapsed = stopwatch.Elapsed;

    Logger.Log("Base Query: {0}, there are {1:N0} Excluded Tags", tag, excludedTags.Count);
    Results.AddData(elapsed.TotalMilliseconds.ToString("#.##"));

    using (Utils.SetConsoleColour(Utils.GetColorForTimespan(elapsed)))
    {
        Logger.Log("Boolean Query {0} against tag \"{1}\", pageSize = {2}, skip = {3}, took {4} ({5:N2} ms) - FAST ALT",
                   type, tag, pageSize, skip, elapsed, elapsed.TotalMilliseconds);
    }

    Logger.Log("Got {0} results ({1} in allResults), {2:N0} items in exclusions",
               results.Count, allResults.Count, exclusions.Count);
    Logger.Log(gcInfo.ToString());
    //var formattedResults = results.Select(r => string.Format("Id: {0,8}, {1}: {2,4}, Tags: {3}, ", r.Id, type, fieldSelector(r), string.Join(",", r.Tags)));
    //Log(" {0}", string.Join("\n ", formattedResults));
    Logger.Log("");

    return results;
}
/// <summary>
/// Fills a cached hash set with the items of the base query for <paramref name="tag"/>, then
/// removes every item that belongs to one of the excluded tags and pages over what is left.
/// Expensive when there are LOTS of exclusions and also when the base query is large, because
/// the whole base query is always processed.
/// </summary>
/// <param name="type">Selects the lookup (and field selector) the query runs against.</param>
/// <param name="tag">Tag whose items form the base query.</param>
/// <param name="excludedTags">Tags whose items must not appear in the result.</param>
/// <param name="pageSize">Maximum number of questions to return.</param>
/// <param name="skip">Number of remaining items to skip before the page starts.</param>
/// <returns>The requested page of questions.</returns>
internal List<Question> BooleanQueryWithExclusionsFastVersion(QueryType type, string tag, IList<string> excludedTags, int pageSize, int skip)
{
    var gcInfo = new GCCollectionInfo();
    Stopwatch stopwatch = Stopwatch.StartNew();

    TagByQueryLookup lookup = GetTagByQueryLookup(type);
    // Only referenced by the commented-out result dump at the end of the method.
    Func<Question, string> fieldSelector = GetFieldSelector(type);
    ThrowIfInvalidParameters(tag, pageSize, lookup);

    var baseHashSet = cache.Value.GetCachedHashSet(lookup[tag]);
    foreach (string excluded in excludedTags)
    {
        foreach (var item in lookup[excluded])
        {
            // Whether or not the item was present does not matter; afterwards it is gone.
            baseHashSet.Remove(item);
        }
    }

    var page = baseHashSet.Skip(skip).Take(pageSize);
    List<Question> results = page.Select(index => questions[index]).ToList();

    stopwatch.Stop();
    gcInfo.UpdateCollectionInfo();
    TimeSpan elapsed = stopwatch.Elapsed;

    Logger.Log("Base Query: {0}, there are {1:N0} Excluded Tags", tag, excludedTags.Count);
    Results.AddData(elapsed.TotalMilliseconds.ToString("#.##"));

    using (Utils.SetConsoleColour(Utils.GetColorForTimespan(elapsed)))
    {
        Logger.Log("Boolean Query {0} against tag \"{1}\", pageSize = {2}, skip = {3}, took {4} ({5:N2} ms) - FAST",
                   type, tag, pageSize, skip, elapsed, elapsed.TotalMilliseconds);
    }

    Logger.Log("Got {0} results, {1:N0} items left in baseHashSet", results.Count, baseHashSet.Count);
    Logger.Log(gcInfo.ToString());
    //var formattedResults = results.Select(r => string.Format("Id: {0,8}, {1}: {2,4}, Tags: {3}, ", r.Id, type, fieldSelector(r), string.Join(",", r.Tags)));
    //Log(" {0}", string.Join("\n ", formattedResults));
    Logger.Log("");

    return results;
}
/// <summary>
/// Pure LINQ variant of the exclusion query: starts from the items of <paramref name="tag"/>
/// and chains one <c>Except</c> per excluded tag before paging. The chain is lazy, so the work
/// happens when the page is materialised with <c>ToList</c>.
/// </summary>
/// <param name="type">Selects the lookup (and field selector) the query runs against.</param>
/// <param name="tag">Tag whose items form the base query.</param>
/// <param name="excludedTags">Tags whose items must not appear in the result.</param>
/// <param name="pageSize">Maximum number of questions to return.</param>
/// <param name="skip">Number of matching items to skip before the page starts.</param>
/// <returns>The requested page of questions.</returns>
internal List<Question> BooleanQueryWithExclusionsLINQVersion(QueryType type, string tag, IList<string> excludedTags, int pageSize, int skip)
{
    var gcInfo = new GCCollectionInfo();
    Stopwatch stopwatch = Stopwatch.StartNew();

    TagByQueryLookup lookup = GetTagByQueryLookup(type);
    // Only referenced by the commented-out result dump at the end of the method.
    Func<Question, string> fieldSelector = GetFieldSelector(type);
    ThrowIfInvalidParameters(tag, pageSize, lookup);

    IEnumerable<int> remaining = lookup[tag];
    foreach (string excluded in excludedTags)
    {
        remaining = remaining.Except(lookup[excluded]);
    }

    IEnumerable<int> page = remaining.Skip(skip).Take(pageSize);
    List<Question> results = page.Select(index => questions[index]).ToList();

    stopwatch.Stop();
    gcInfo.UpdateCollectionInfo();
    TimeSpan elapsed = stopwatch.Elapsed;

    Results.AddData(elapsed.TotalMilliseconds.ToString("#.##"));
    Logger.Log("Base Query: {0}, there are {1:N0} Excluded Tags", tag, excludedTags.Count);

    using (Utils.SetConsoleColour(Utils.GetColorForTimespan(elapsed)))
    {
        Logger.Log("Boolean Query {0} against tag \"{1}\", pageSize = {2}, skip = {3}, took {4} ({5:N2} ms) - SLOW",
                   type, tag, pageSize, skip, elapsed, elapsed.TotalMilliseconds);
    }

    Logger.Log("Got {0} results", results.Count);
    Logger.Log(gcInfo.ToString());
    //var formattedResults = results.Select(r => string.Format("Id: {0,8}, {1}: {2,4}, Tags: {3}, ", r.Id, type, fieldSelector(r), string.Join(",", r.Tags)));
    //Log(" {0}", string.Join("\n ", formattedResults));
    Logger.Log("");

    return results;
}
/// <summary>
/// LINQ implementation of the two-tag boolean query. Builds a lazy query for the requested
/// operator, optionally appends the tag exclusions, then materialises one page of questions.
/// </summary>
/// <param name="info">Query type, both tags, the operator and the paging values (PageSize, Skip).</param>
/// <param name="tagsToExclude">Optional set of tags to exclude; when null no exclusion step is added.</param>
/// <returns>
/// The page of questions plus the counters "TagCounter" / "OtherTagCounter" (how many items were
/// pulled from each tag's list while producing the page) and "ExclusionCounter".
/// </returns>
/// <exception cref="InvalidOperationException">The operator is not AND, AND-NOT, OR or OR-NOT.</exception>
internal QueryResult Query(QueryInfo info, CLR.HashSet<string> tagsToExclude = null)
{
    var timer = Stopwatch.StartNew();
    TagByQueryLookup queryInfo = GetTagByQueryLookup(info.Type);
    ThrowIfInvalidParameters(info.Tag, info.PageSize, queryInfo);
    ThrowIfInvalidParameters(info.OtherTag, info.PageSize, queryInfo);

    var tagCounter = 0;
    var otherTagCounter = 0;
    var exclusionCounter = new CounterWrapper(initialValue: 0);

    // The pass-through Select calls only count how many items the lazy query really pulls.
    IEnumerable<int> tag1Query = queryInfo[info.Tag].Select(t => { tagCounter++; return t; });
    IEnumerable<int> tag2Query = queryInfo[info.OtherTag].Select(t => { otherTagCounter++; return t; });

    IEnumerable<int> query;
    switch (info.Operator)
    {
        case "AND":
            query = tag1Query.Intersect(tag2Query);
            break;

        case "AND-NOT":
            query = tag1Query.Except(tag2Query);
            break;

        case "OR":
            // TODO this has a small bug, we can get items out of order as we pull them thru in pairs
            // if t2 has several items that are larger than t1, t1 will still come out first!!
            // So algorithm needs to be:
            //  1) pull the LARGEST value (from t1 or t2)
            //  2) process this item
            //  3) repeat 1) again
            // NOTE(review): Zip stops at the end of the SHORTER sequence, so the tail of the longer
            // tag list never reaches the result - fix together with the ordering TODO above.
            query = tag1Query.Zip(tag2Query, (t1, t2) => new[] { t1, t2 })
                             .SelectMany(item => item)
                             .Distinct();
            break;

        case "OR-NOT": // i.e. ".net+or+jquery-"
            // Same ordering TODO and Zip truncation caveat as the "OR" case: here only as many
            // ALL_TAGS items as there are tag1 items are ever considered.
            query = tag1Query.Zip(queryInfo[TagServer.ALL_TAGS_KEY], (t1, t2) => new[] { t1, t2 })
                             .SelectMany(item => item)
                             .Except(tag2Query)
                             .Distinct();
            break;

        // TODO Work out what a "NOT" query really means, at the moment it's the same as "AND-NOT"?!
        //case "NOT":
        //    query = tag1Query.Except(tag2Query);
        //    break;

        default:
            throw new InvalidOperationException(string.Format("Invalid operator specified: {0}", info.Operator ?? "<NULL>"));
    }

    // Every supported operator applies the exclusions in the same way, so do it once here.
    if (tagsToExclude != null)
    {
        query = AddExclusionsToQuery(query, tagsToExclude, exclusionCounter);
    }

    var results = query.Skip(info.Skip)
                       .Take(info.PageSize)
                       .Select(i => questions[i])
                       .ToList();
    timer.Stop();

    Results.AddData(timer.Elapsed.TotalMilliseconds.ToString("#.##"));

    Logger.Log("REGULAR Boolean Query: \"{0}\" {1} \"{2}\", pageSize = {3:N0}, skip = {4:N0}, took {5} ({6:N2} ms) REGULAR",
               info.Tag, info.Operator, info.OtherTag, info.PageSize, info.Skip, timer.Elapsed, timer.Elapsed.TotalMilliseconds);
    // The tag2 placeholder used to be {1}, which printed the tag1 counter twice and never showed otherTagCounter.
    Logger.Log("Got {0:N0} results in total, tag1 QueryCounter = {1:N0}, tag2 QueryCounter = {2:N0}",
               results.Count, tagCounter, otherTagCounter);
    //PrintResults(results, string.Format("{0} {1} {2}", info.Tag, info.Operator, info.OtherTag), info.Type);

    return new QueryResult
    {
        Questions = results,
        Counters = new Dictionary<string, int>
        {
            { "TagCounter", tagCounter },
            { "OtherTagCounter", otherTagCounter },
            { "ExclusionCounter", exclusionCounter.Counter }
        }
    };
}
/// <summary>
/// Bitmap implementation of the two-tag boolean query. Combines the compressed bitmaps of the
/// two tags with the requested operator, optionally removes the excluded questions, then maps
/// one page of set bits back to questions through the ALL_TAGS lookup for the query type.
/// </summary>
/// <param name="info">Query type, both tags, the operator and the paging values (PageSize, Skip).</param>
/// <param name="exclusionBitMap">
/// Optional bitmap that is AND-NOTed onto the result, i.e. a set bit removes that position
/// from the result. May be null.
/// </param>
/// <param name="printLoggingMessages">When true, timing and cardinality details are logged.</param>
/// <returns>The page of questions; no counters are populated by this overload.</returns>
/// <exception cref="InvalidOperationException">The operator is not AND, AND-NOT, OR or OR-NOT.</exception>
internal QueryResult Query(QueryInfo info, EwahCompressedBitArray exclusionBitMap = null, bool printLoggingMessages = false)
{
    var bitMap = GetTagByQueryBitMapLookup(info.Type);
    var questionLookup = GetTagByQueryLookup(info.Type)[TagServer.ALL_TAGS_KEY];

    // Calculating the Cardinality can be (is?) expensive, we don't want to do it in Queries unless we really need to!?
    bool calculateCardinality = true; // false

    if (printLoggingMessages)
    {
        Logger.Log("Tag \"{0}\" is in {1:N0} Questions, Tag \"{2}\" is in {3:N0} Questions",
                   info.Tag, allTags[info.Tag], info.OtherTag, allTags[info.OtherTag]);
    }
    //PrintResults(Enumerable.Range(0, questionLookup.Length), qu => questionLookup[qu], TagServer.ALL_TAGS_KEY, info.Type);
    //PrintResults(bitMap[info.Tag], qu => questionLookup[qu], info.Tag, info.Type);
    //PrintResults(bitMap[info.OtherTag], qu => questionLookup[qu], info.OtherTag, info.Type);

    var timer = Stopwatch.StartNew();
    var tag1BitMap = bitMap[info.Tag];
    var tag2BitMap = bitMap[info.OtherTag];

    // Every path below either assigns the result or throws, so no placeholder instance is needed.
    EwahCompressedBitArray bitMapResult;
    switch (info.Operator)
    {
        case "AND":
            bitMapResult = tag1BitMap.And(tag2BitMap);
            break;

        case "AND-NOT":
            bitMapResult = tag1BitMap.AndNot(tag2BitMap);
            break;

        case "OR":
            bitMapResult = tag1BitMap.Or(tag2BitMap);
            break;

        case "OR-NOT": // i.e. ".net+or+jquery-"
            bitMapResult = tag1BitMap.OrNot(tag2BitMap);
            break;

        // TODO Work out what a "NOT" query really means, the LINQ version was "result = tag1Query.Except(tag2Query)" (which is the same as AND-NOT?!)
        //case "NOT":
        //    var bitMapResult = (EwahCompressedBitArray)tag2BitMap.Clone();
        //    bitMapResult.Not();
        //    break;

        default:
            throw new InvalidOperationException(string.Format("Invalid operator specified: {0}", info.Operator ?? "<NULL>"));
    }

    if (exclusionBitMap != null)
    {
        // The "before" cardinality is only ever reported by the detailed log line below, so
        // only pay for it when that line will actually be written.
        ulong cardinalityBeforeExclusions = 0;
        if (printLoggingMessages && calculateCardinality)
        {
            cardinalityBeforeExclusions = bitMapResult.GetCardinality();
        }

        // The exclusion BitMap is applied with AndNot, so it is treated as being set (i.e. 1) in the
        // places where the question must NOT be used; those bits are cleared from the previous results.
        // NOTE(review): an earlier comment here described the opposite encoding (set = usable,
        // applied with And) - confirm that callers build the bitmap to match AndNot.
        var exclusionTimer = Stopwatch.StartNew();
        bitMapResult = bitMapResult.AndNot(exclusionBitMap);
        exclusionTimer.Stop();

        if (printLoggingMessages)
        {
            if (calculateCardinality)
            {
                Logger.Log("Took {0,5:N2} ms to apply exclusion BitMap (Cardinality={1:N0}), Results Cardinality: Before={2:N0}, After={3:N0}",
                           exclusionTimer.Elapsed.TotalMilliseconds, exclusionBitMap.GetCardinality(),
                           cardinalityBeforeExclusions, bitMapResult.GetCardinality());
            }
            else
            {
                Logger.Log("Took {0,5:N2} ms to apply exclusion BitMap", exclusionTimer.Elapsed.TotalMilliseconds);
            }
        }
    }

    // Each set bit is a position in questionLookup, which in turn holds the index into questions.
    var resultCollectionTimer = Stopwatch.StartNew();
    var result = bitMapResult.Skip(info.Skip)
                             .Take(info.PageSize)
                             .Select(i => questions[questionLookup[i]])
                             .ToList();
    resultCollectionTimer.Stop();
    if (printLoggingMessages)
    {
        Logger.Log("Took {0,5:N2} ms to collect the results", resultCollectionTimer.Elapsed.TotalMilliseconds);
    }

    timer.Stop();

    Results.AddData(timer.Elapsed.TotalMilliseconds.ToString("#.##"));

    if (printLoggingMessages)
    {
        using (Utils.SetConsoleColour(ConsoleColor.DarkYellow))
        {
            if (calculateCardinality)
            {
                Logger.Log("Took {0,5:N2} ms in TOTAL to calculate \"{1} {2} {3}\", Got {4} results, (Result Cardinality={5:N0})",
                           timer.Elapsed.TotalMilliseconds, info.Tag, info.Operator, info.OtherTag, result.Count, bitMapResult.GetCardinality());
            }
            else
            {
                Logger.Log("Took {0,5:N2} ms in TOTAL to calculate \"{1} {2} {3}\", Got {4} results",
                           timer.Elapsed.TotalMilliseconds, info.Tag, info.Operator, info.OtherTag, result.Count);
            }
        }
        //PrintResults(bitMapResult, qu => questionLookup[qu], string.Format("{0} {1} {2}", info.Tag, info.Operator, info.OtherTag), info.Type);
        Logger.Log();
    }

    return new QueryResult
    {
        Questions = result
        // TODO see if we can get meaningful numbers here, WITHOUT calling GetCardinality() (because it's expensive)
        //Counters = new Dictionary<string, int>
        //{
        //    { "TagCounter", tagCounter },
        //    { "OtherTagCounter", otherTagCounter },
        //    { "ExclusionCounter", exclusionCounter.Counter }
        //}
    };
}
/// <summary>
/// Same approach as <seealso cref="BooleanQueryWithExclusionsFastAlternativeVersion"/>, but the
/// exclusions are stored in a bloom filter instead of a hash set: every item of every excluded
/// tag is added to the filter, then the base query is filtered lazily and paged, so it is only
/// walked until skip + pageSize items passed the filter.
/// Expensive when there are LOTS of exclusions, but cheaper when the base query is large.
/// In DEBUG builds a hash set is filled alongside the filter to detect and log false positives.
/// </summary>
/// <param name="type">Selects the lookup (and field selector) the query runs against.</param>
/// <param name="tag">Tag whose items form the base query.</param>
/// <param name="excludedTags">Tags whose items must not appear in the result.</param>
/// <param name="pageSize">Maximum number of questions to return.</param>
/// <param name="skip">Number of matching items to skip before the page starts.</param>
/// <returns>The requested page of questions.</returns>
internal List<Question> BooleanQueryWithExclusionsBloomFilterVersion(QueryType type, string tag, IList<string> excludedTags, int pageSize, int skip)
{
    var gcInfo = new GCCollectionInfo();
    Stopwatch stopwatch = Stopwatch.StartNew();

    TagByQueryLookup lookup = GetTagByQueryLookup(type);
    // In release builds this is only referenced by the commented-out result dump below.
    Func<Question, string> fieldSelector = GetFieldSelector(type);
    ThrowIfInvalidParameters(tag, pageSize, lookup);

    //int bloomFilterSize = 40 * 1000 * 1000; // million's, 40mil produces several False +ve's
    int bloomFilterSize = 100 * 1000 * 1000; // million's

#if DEBUG
    var creationTimer = Stopwatch.StartNew();
#endif
    var bloomFilter = new SimpleBloomFilter(bloomFilterSize);
#if DEBUG
    creationTimer.Stop();
    Logger.Log("Took {0} ({1:N2} ms) to create the bloom filter with {2:N0} bits ({3:N2} bytes)",
               creationTimer.Elapsed, creationTimer.Elapsed.TotalMilliseconds, bloomFilterSize, bloomFilterSize / 8);

    //var tests = new[] { 1066589, 2793150, 364114, 910374 }; // These are the Question Id's NOT the array index ([]) values!!
    var tests = new[] { 192257, 616585, 53029, 158368 }; // These ARE the array index ([]) values
    var debugging = cache.Value.GetCachedHashSet();
#endif

    foreach (string excluded in excludedTags)
    {
        foreach (var item in lookup[excluded])
        {
            bloomFilter.Add(item);
#if DEBUG
            debugging.Add(item);
            if (tests.Contains(item))
            {
                // If it's false, it's DEFINITELY false
                // If it's true, it could really be false (false +ve)
                var probe = bloomFilter.PossiblyExists(item, debugInfo: true);
                Logger.Log("Bloom Filter.PossiblyExists - {0,8} = {1} ****", item, probe);
                Logger.Log(" DebuggingHashSet.Contains - {0,8} = {1} ****", item, debugging.Contains(item));
            }
#endif
        }
    }

    var baseQuery = lookup[tag];

    // Predicate that decides whether an item of the base query may be used.
#if DEBUG
    Func<int, bool> isUsable = candidate =>
    {
        var possiblyExists = bloomFilter.PossiblyExists(candidate);
        if (!possiblyExists)
        {
            return true; // we can use it
        }

        var reallyExcluded = debugging.Contains(candidate);
        if (!reallyExcluded)
        {
            // The filter claims the item is excluded although it never was added: a false positive.
            var question = questions[candidate];
            Logger.Log("FALSE +VE: {0,8}, PossiblyExists = {1}, debugging.Contains() = {2}, Id = {3,8}, Tags = {4}",
                       candidate, possiblyExists, reallyExcluded, question.Id, string.Join(",", question.Tags));
        }

        return false; // we can't use it
    };
#else
    Func<int, bool> isUsable = candidate => !bloomFilter.PossiblyExists(candidate);
#endif

    List<Question> result = baseQuery.Where(isUsable)
                                     .Skip(skip)
                                     .Take(pageSize)
                                     .Select(index => questions[index])
                                     .ToList();

    stopwatch.Stop();
    gcInfo.UpdateCollectionInfo();
    TimeSpan elapsed = stopwatch.Elapsed;

    Logger.Log("Base Query: {0}, there are {1:N0} Excluded Tags", tag, excludedTags.Count);
    Results.AddData(elapsed.TotalMilliseconds.ToString("#.##"));

    using (Utils.SetConsoleColour(Utils.GetColorForTimespan(elapsed)))
    {
        Logger.Log("Boolean Query {0} against tag \"{1}\", pageSize = {2}, skip = {3}, took {4} ({5:N2} ms) - BLOOM",
                   type, tag, pageSize, skip, elapsed, elapsed.TotalMilliseconds);
    }

    //Log("Got {0} results, Bloom Filter contains {1:N0} items (some could be dupes), Truthiness {2:N2}",
    //    result.Count(), bloomFilter.NumberOfItems, bloomFilter.Truthiness);
    Logger.Log("Got {0} results, Bloom Filter contains {1:N0} items (some could be dupes)",
               result.Count, bloomFilter.NumberOfItems);
    Logger.Log(gcInfo.ToString());
    //var formattedResults = result.Select(r => string.Format("Id: {0,8}, {1}: {2,4}, Tags: {3}, ", r.Id, type, fieldSelector(r), string.Join(",", r.Tags)));
    //Log(" {0}", string.Join("\n ", formattedResults));
    Logger.Log("");

#if DEBUG
    foreach (var probeIndex in tests)
    {
        var possiblyExists = bloomFilter.PossiblyExists(probeIndex, debugInfo: true);
        Logger.Log("Bloom Filter.PossiblyExists - {0,8} = {1}", probeIndex, possiblyExists);
        Logger.Log(" DebuggingHashSet.Contains - {0,8} = {1}", probeIndex, debugging.Contains(probeIndex));
        Logger.Log("");
    }

    // When the values in "tests" represent Question Id
    //var testResults = tests.Select(t => questions.First(qu => qu.Id == t))
    //                       .Select(r => string.Format("Id: {0,8}, {1}: {2,4}, Tags: {3}, ", r.Id, type, fieldSelector(r), string.Join(",", r.Tags)));
    // When the values in "tests" represent array indexes, i.e. questions[x]
    var testResults = tests.Select(t => questions[t])
                           .Select(r => string.Format("Id: {0,8}, {1}: {2,4}, Tags: {3}, ", r.Id, type, fieldSelector(r), string.Join(",", r.Tags)));
    Logger.Log(" {0}", string.Join("\n ", testResults));
#endif

    return result;
}