// Sanity-checks an exclusion bitmap against the expanded tag n-grams, in both
// directions, logging any mismatch in red.
// NOTE(review): the comment below says a SET bit marks a question that CAN be
// used, yet pass 2 expects SET positions to contain excluded tags — the
// polarity reads as inverted somewhere; confirm which convention is correct.
internal void ValidateExclusionBitMap(EwahCompressedBitArray bitMapIndex, CLR.HashSet<string> expandedTagsNGrams, QueryType queryType)
{
    // Exclusion BitMap is Set (i.e. 1) in places where you CAN use the question, i.e. it's NOT excluded
    var questionLookup = GetTagByQueryLookup(queryType)[TagServer.ALL_TAGS_KEY];
    var invalidQuestions = new List<Tuple<Question, string>>();

    // Invert a clone: Not() mutates in place and the caller still owns bitMapIndex.
    var NOTbitMapIndex = ((EwahCompressedBitArray)bitMapIndex.Clone());
    NOTbitMapIndex.Not();
    var positions = NOTbitMapIndex.GetPositions();

    // Pass 1: questions at inverted-bitmap positions must NOT carry any of the
    // excluded tags; every offending (question, tag) pair is collected.
    foreach (var position in positions)
    {
        var question = questions[questionLookup[position]];
        foreach (var tag in question.Tags)
        {
            if (expandedTagsNGrams.Contains(tag))
            {
                invalidQuestions.Add(Tuple.Create(question, tag));
            }
        }

        // Sometimes the validitation locks up my laptop, this *seems* to make a difference?!
        Thread.Yield();
    }

    using (Utils.SetConsoleColour(ConsoleColor.Blue))
        Logger.Log("Validating Exclusion Bit Map, checked {0:N0} positions for INVALID tags", positions.Count);

    if (invalidQuestions.Any())
    {
        // A question is added once per offending tag, so de-dupe by Id for the count.
        using (Utils.SetConsoleColour(ConsoleColor.Red))
            Logger.Log("ERROR Validating Exclusion Bit Map, {0:N0} questions should have been excluded",
                       invalidQuestions.Select(i => i.Item1.Id).Distinct().Count());
        foreach (var error in invalidQuestions)
        {
            Logger.Log("  {0,8}: {1} -> {2}", error.Item1.Id, String.Join(", ", error.Item1.Tags), error.Item2);
        }
    }

    // Pass 2: questions at SET positions are expected to contain at least one
    // excluded tag; log each one that doesn't.
    var expectedPositions = bitMapIndex.GetPositions();
    foreach (var position in expectedPositions)
    {
        var question = questions[questionLookup[position]];
        if (question.Tags.Any(t => expandedTagsNGrams.Contains(t)) == false)
        {
            using (Utils.SetConsoleColour(ConsoleColor.Red))
                Logger.Log("ERROR {0,8}: {1} -> didn't contain ANY excluded tags",
                           question.Id, String.Join(", ", question.Tags));
        }
    }

    using (Utils.SetConsoleColour(ConsoleColor.Blue))
        Logger.Log("Validating Exclusion Bit Map, checked {0:N0} positions for EXPECTED tags", expectedPositions.Count);

    Logger.Log();
}
// For each result question, collects any of its tags that appear in the
// exclusion set. Returns one (question, offending-tags) pair per question that
// carries at least one excluded tag, or an empty list when no exclusion set
// was supplied. NOTE: queryInfo is currently unused.
internal List<Tuple<Question, List<string>>> GetShouldHaveBeenExcludedResults(List<Question> results, QueryInfo queryInfo, CLR.HashSet<string> tagsToExclude)
{
    var offenders = new List<Tuple<Question, List<string>>>();
    if (tagsToExclude == null)
    {
        return offenders;
    }

    foreach (var question in results)
    {
        var excludedTagsPresent = question.Tags.Where(tag => tagsToExclude.Contains(tag)).ToList();
        if (excludedTagsPresent.Count > 0)
        {
            offenders.Add(Tuple.Create(question, excludedTagsPresent));
        }
    }
    return offenders;
}
// Lazily filters a stream of question ids, dropping every question that
// carries at least one excluded tag. Each dropped item bumps exclusionCounter;
// since the filter is deferred, the counter only advances as the returned
// sequence is actually enumerated.
private IEnumerable<int> AddExclusionsToQuery(IEnumerable<int> query, CLR.HashSet<string> tagsToExclude, CounterWrapper exclusionCounter)
{
    return query.Where(questionId =>
    {
        bool hasExcludedTag = questions[questionId].Tags.Any(tag => tagsToExclude.Contains(tag));
        if (hasExcludedTag)
        {
            exclusionCounter.Counter++;
            return false;
        }
        return true;
    });
}
// "tag1 AND NOT tag2": walks tag1's question ids, emitting those that are not
// in tag2's id set (Except semantics — see the EduLinq link below) and that
// don't carry an excluded tag, honouring skip/pageSize paging.
ComplexQueryResult AndNotQuery(int[] tag1Ids, int[] tag2Ids, int pageSize, int skip, CLR.HashSet<string> tagsToExclude = null)
{
    var result = new ComplexQueryResult
    {
        Results = new List<Question>(pageSize),
        BaseQueryCounter = 0,
        ItemsSkipped = 0,
        ExcludedCounter = 0
    };

    // https://github.com/ungood/EduLinq/blob/master/Edulinq/Except.cs#L26-L40
    // Seeded with tag2's ids, so Add() returning true means "not in tag2 AND
    // not seen before" in a single call.
    var rejectedIds = cache.Value.GetCachedHashSet(tag2Ids);

    foreach (var questionId in tag1Ids)
    {
        if (result.Results.Count >= pageSize)
        {
            break;
        }

        result.BaseQueryCounter++;

        if (tagsToExclude != null && questions[questionId].Tags.Any(t => tagsToExclude.Contains(t)))
        {
            result.ExcludedCounter++;
            continue;
        }

        if (rejectedIds.Add(questionId) == false)
        {
            continue;
        }

        if (result.ItemsSkipped >= skip)
        {
            result.Results.Add(questions[questionId]);
        }
        else
        {
            result.ItemsSkipped++;
        }
    }

    return result;
}
// Logs summary statistics about Leppie's tag-exclusion list: how many raw tags
// and wildcards it contains, how many tags remain after removing the expanded
// exclusion set, and how many questions each side of the split covers.
private static void GetLeppieTagInfo(List<Question> rawQuestions, TagLookup allTags, List<string> leppieTags, HashSet leppieExpandedTags)
{
    var totalQuestions = rawQuestions.Count;
    var totalTags = allTags.Count;
    var wildcardCount = leppieTags.Count(tag => tag.Contains('*'));

    Logger.Log("\nThere are {0:N0} questions and {1:N0} tags in total", totalQuestions, totalTags);
    Logger.Log("Leppie list of {0:N0} tags contains {1:N0} that are wildcards", leppieTags.Count, wildcardCount);
    Logger.Log("Leppie {0:N0} tags with wildcards expand to {1:N0} tags in total", leppieTags.Count, leppieExpandedTags.Count);

    // Everything not covered by the expanded exclusion list is "remaining".
    var remainingTags = new CLR.HashSet<string>(allTags.Keys);
    remainingTags.ExceptWith(leppieExpandedTags);

    Logger.LogStartupMessage("There are {0:N0} tags remaining, {0:N0} + {1:N0} = {2:N0} (Expected: {3:N0})",
                             remainingTags.Count, leppieExpandedTags.Count, remainingTags.Count + leppieExpandedTags.Count, totalTags);
    Logger.LogStartupMessage("Sanity checking excluded/included tags and questions...");

    // A question counts as EXCLUDED if ANY of its tags is in the expanded
    // exclusion list; as included only if ALL its tags survived the exclusion.
    var excludedQuestions = rawQuestions.Count(question => question.Tags.Any(tag => leppieExpandedTags.Contains(tag)));
    var includedQuestions = rawQuestions.Count(question => question.Tags.All(tag => remainingTags.Contains(tag)));

    Logger.Log("{0:N0} EXCLUDED tags cover {1:N0} questions (out of {2:N0})", leppieExpandedTags.Count, excludedQuestions, totalQuestions);
    Logger.Log(
        "{0:N0} remaining tags cover {1:N0} questions, {2:N0} + {3:N0} = {4:N0} (Expected: {5:N0})",
        remainingTags.Count, includedQuestions,
        includedQuestions, excludedQuestions, includedQuestions + excludedQuestions, totalQuestions);
    Logger.Log();
}
// "tag1 OR NOT tag2": interleaves question ids from tag1 with ids from the
// all-tags stream that are NOT in tag2, de-duping across both streams via
// seenBefore and honouring skip/pageSize paging. When tagsToExclude is
// supplied, questions carrying an excluded tag are dropped and counted in
// ExcludedCounter instead.
ComplexQueryResult OrNotQuery(int[] tag1Ids, int[] tag2Ids, int[] allTagIds, int pageSize, int skip, CLR.HashSet<string> tagsToExclude = null)
{
    var queryResult = new ComplexQueryResult
    {
        Results = new List<Question>(pageSize),
        BaseQueryCounter = 0,
        ItemsSkipped = 0,
        ExcludedCounter = 0
    };

    // TODO this has a small bug, we can get items out of order as we pull them thru in pairs
    // if t2 has several items that are larger than t1, t1 will still come out first!!
    // So algorithm needs to be:
    // 1) pull the LARGEST value (from t1 or t2)
    // 2) process this item
    // 3) repeat 1) again

    // orNotHashSet holds tag2's ids (the ids the NOT side must reject);
    // seenBefore de-dupes ids across the two interleaved streams.
    var orNotHashSet = cache.Value.GetCachedHashSet(tag2Ids);
    var seenBefore = secondCache.Value.GetCachedHashSet();

    // Advance both streams in lock-step, taking at most one item from each per
    // iteration; the loop ends when EITHER stream is exhausted or the page is
    // full (see the TODO above — this pairing is the source of the ordering bug).
    using (IEnumerator<int> e1 = tag1Ids.AsEnumerable().GetEnumerator())
    using (IEnumerator<int> e2 = allTagIds.AsEnumerable().GetEnumerator())
    {
        while (e1.MoveNext() && e2.MoveNext())
        {
            if (queryResult.Results.Count >= pageSize)
            {
                break;
            }

            queryResult.BaseQueryCounter++;

            // OR side: any id from tag1 qualifies, unless it carries an
            // excluded tag, is in tag2's set, or was already emitted.
            if (tagsToExclude != null && questions[e1.Current].Tags.Any(t => tagsToExclude.Contains(t)))
            {
                queryResult.ExcludedCounter++;
            }
            else if (orNotHashSet.Contains(e1.Current) == false && seenBefore.Add(e1.Current))
            {
                if (queryResult.ItemsSkipped >= skip)
                {
                    queryResult.Results.Add(questions[e1.Current]);
                }
                else
                {
                    queryResult.ItemsSkipped++;
                }
            }

            if (queryResult.Results.Count >= pageSize)
            {
                break;
            }

            // TODO should we be doing this here as well!!?!?!
            //baseQueryCounter++;

            // NOT side: any id from the all-tags stream that is NOT in tag2's
            // set qualifies, subject to the same exclusion/de-dupe rules.
            if (tagsToExclude != null && questions[e2.Current].Tags.Any(t => tagsToExclude.Contains(t)))
            {
                queryResult.ExcludedCounter++;
            }
            else if (orNotHashSet.Contains(e2.Current) == false && seenBefore.Add(e2.Current))
            {
                if (queryResult.ItemsSkipped >= skip)
                {
                    queryResult.Results.Add(questions[e2.Current]);
                }
                else
                {
                    queryResult.ItemsSkipped++;
                }
            }
        }
    }

    return queryResult;
}