/// <summary>
/// Make sure we find exactly the known data we're after
/// </summary>
static void CanLocateKnownData(TestContext context)
{
    var index = new Dictionary<string, byte[]>();
    var encoder = new Base128Encoder();

    // The term "cat" matches documents 2,4,6,8
    var docsWithCat = new List<uint>() { 2, 4, 6, 8 };
    index["cat"] = encoder.EncodeList(docsWithCat);

    // The term "dog" matches documents 6,8,10,12
    var docsWithDog = new List<uint>() { 6, 8, 10, 12 };
    index["dog"] = encoder.EncodeList(docsWithDog);

    // Query for docs that have both "cat" and "dog"
    var results = new HashSet<uint>();
    var firstPass = true;
    foreach (var pair in index)
    {
        var idList = encoder.DecodeList(pair.Value);
        if (firstPass)
        {
            firstPass = false;
            results.UnionWith(idList);
        }
        else
        {
            results.IntersectWith(idList);
        }
    }

    // Should have found only 6 and 8.
    // HashSet enumeration order is not guaranteed, so assert membership
    // rather than relying on element positions.
    Assert.AreEqual(2, results.Count);
    Assert.IsTrue(results.Contains(6u));
    Assert.IsTrue(results.Contains(8u));
}
/// <summary>
/// Helper method to add index entries
/// </summary>
static void addIndexValue(string termName, string termValue, uint docId, uint position,
    Dictionary<uint, TermValuePair> termHashMap, Dictionary<uint, DocPositionPair> docHashMap,
    Dictionary<uint, byte[]> index, Base128Encoder encoder)
{
    // Track the (term, value) pair by its hash key
    var termPair = new TermValuePair(termName, termValue);
    var termPairHash = termPair.ComputeHashValue();
    if (!termHashMap.ContainsKey(termPairHash))
    {
        termHashMap[termPairHash] = termPair;
    }

    // Track the (docId, position) pair by its hash key
    var docPair = new DocPositionPair(docId, position);
    var docPairHash = docPair.ComputeHashValue();
    if (!docHashMap.ContainsKey(docPairHash))
    {
        docHashMap[docPairHash] = docPair;
    }

    // Append the doc/position key to the compressed posting list for this term
    if (!index.ContainsKey(termPairHash))
    {
        index[termPairHash] = encoder.EncodeList(new List<uint>() { docPairHash });
    }
    else
    {
        var currentList = encoder.DecodeList(index[termPairHash]);
        currentList.Add(docPairHash);
        index[termPairHash] = encoder.EncodeList(currentList);
    }
}
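// TermValuePair and DocPositionPair are not defined in this file. The types
// below are hypothetical sketches written only to match the surface that
// addIndexValue uses (a two-argument constructor plus a ComputeHashValue()
// method returning a uint key); the real types and their hashing may differ.
class TermValuePair
{
    public string TermName { get; }
    public string TermValue { get; }

    public TermValuePair(string termName, string termValue)
    {
        TermName = termName;
        TermValue = termValue;
    }

    // Illustrative only: fold the two string hash codes into a single uint key.
    public uint ComputeHashValue()
    {
        unchecked
        {
            return (uint)((TermName.GetHashCode() * 397) ^ TermValue.GetHashCode());
        }
    }
}

class DocPositionPair
{
    public uint DocId { get; }
    public uint Position { get; }

    public DocPositionPair(uint docId, uint position)
    {
        DocId = docId;
        Position = position;
    }

    // Illustrative only: fold docId and position into a single uint key.
    public uint ComputeHashValue()
    {
        unchecked
        {
            return (DocId * 397) ^ Position;
        }
    }
}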
/// <summary>
/// Tests the varint decoding performance.
/// </summary>
static void LoadTestVarintDecoding(TestContext context)
{
    // Test config
    var TERMS = 50000;
    var DOCS_PER_TERM = 10000;
    var TOTAL_DOCS = 100000;

    // Test support
    var sw = new Stopwatch();
    var rng = new Random();

    // Our simple inverted index and an encoder.
    // Note the index does not map term to list-of-docid...
    // Instead, it maps term to a compressed range.
    var index = new Dictionary<string, byte[]>();
    var encoder = new Base128Encoder();

    ////////////////////////////////////////////////////////
    //
    // Create index data
    //
    for (var j = 0; j < TERMS; j++)
    {
        var docLinks = new List<uint>();

        // Add document links
        for (var m = 0; m < DOCS_PER_TERM; m++)
        {
            docLinks.Add((uint)rng.Next(1, TOTAL_DOCS));
        }

        // Compress the docID list.
        // We'll just use 'j' in string form as our term.
        index[j.ToString()] = encoder.EncodeList(docLinks);
    }

    ////////////////////////////////////////////////////////
    //
    // Search index data
    //
    var decodeCount = 0;
    sw.Start();

    // How about just a simple linear scan for now
    foreach (var termPair in index)
    {
        var decodedList = encoder.DecodeList(termPair.Value);
        decodeCount += decodedList.Count;
    }

    ////////////////////////////////////////////////////////
    //
    // Results summary
    //
    sw.Stop();
    context.WriteLine(string.Format("Decoded {0} keys in {1} ms", decodeCount, sw.ElapsedMilliseconds));
}
/// <summary>
/// Tests a decode / set-operation scenario
/// </summary>
static void LoadTestScenario(TestContext context)
{
    // Test config
    var TERMS = 50000;
    var DOCS_PER_TERM = 10000;
    var TOTAL_DOCS = 100000;

    // Test support
    var sw = new Stopwatch();
    var rng = new Random();

    // Our simple inverted index and an encoder.
    // Note the index does not map term to list-of-docid...
    // Instead, it maps term to a compressed range.
    var index = new Dictionary<string, byte[]>();
    var encoder = new Base128Encoder();

    ////////////////////////////////////////////////////////
    //
    // Create index data
    //
    for (var j = 0; j < TERMS; j++)
    {
        var docLinks = new List<uint>();

        // Add document links
        for (var m = 0; m < DOCS_PER_TERM; m++)
        {
            docLinks.Add((uint)rng.Next(1, TOTAL_DOCS));
        }

        // Compress the docID list.
        // We'll just use 'j' in string form as our term.
        index[j.ToString()] = encoder.EncodeList(docLinks);
    }

    ////////////////////////////////////////////////////////
    //
    // Search index data
    //
    // Let's search for 'terms' that begin with '5000'.
    // We then want the docs that appear in every matching term.
    var query = new Predicate<string>((s) => s.StartsWith("5000"));
    var outSet = new HashSet<uint>();
    var firstPass = true;
    sw.Start();

    // How about just a simple linear scan for now
    foreach (var termPair in index)
    {
        if (query(termPair.Key))
        {
            // Decode the posting list for this term
            var decodedList = encoder.DecodeList(termPair.Value);

            // Intersecting with an empty set would always yield an empty
            // result, so seed the result set with a union on the first pass.
            if (firstPass)
            {
                firstPass = false;
                outSet.UnionWith(decodedList);
            }
            else
            {
                outSet.IntersectWith(decodedList);
            }
        }
    }

    ////////////////////////////////////////////////////////
    //
    // Results summary
    //
    sw.Stop();
    context.WriteLine(string.Format("Found {0} results", outSet.Count));
    context.WriteLine(string.Format("Query time: {0}ms", sw.ElapsedMilliseconds));
}
/// <summary>
/// Make sure we can get a list back in the same form.
/// </summary>
static void ListEncodesAndDecodes(TestContext context)
{
    var rawList = new List<uint>();
    for (var j = 0; j < 100; j++)
    {
        rawList.Add((uint)j);
    }

    var encoder = new Base128Encoder();
    var decodedList = encoder.DecodeList(encoder.EncodeList(rawList));

    rawList.Sort();
    decodedList.Sort();

    Assert.AreEqual(rawList.Count, decodedList.Count);
    for (var m = 0; m < rawList.Count; m++)
    {
        Assert.AreEqual(rawList[m], decodedList[m]);
    }
}
/// <summary>
/// Just a check to see how much space we're saving
/// </summary>
static void CompressionYieldsLessStorage(TestContext context)
{
    // Create a list of 2,000,000 uints.
    // At 4 bytes per uint, that's 8,000,000 bytes (~7.6 MiB) uncompressed.
    var TEST_SIZE = 2000000;
    var docIds = new List<UInt32>(TEST_SIZE);
    var rng = new Random();
    for (var j = 0; j < TEST_SIZE; j++)
    {
        docIds.Add((uint)rng.Next(1, Int32.MaxValue));
    }

    // Now compress that list
    var encoder = new Base128Encoder();
    var packedData = encoder.EncodeList(docIds);

    // How much space did we save?
    context.WriteLine(String.Format("Raw size: {0} bytes", TEST_SIZE * 4));
    context.WriteLine(String.Format("Packed size: {0} bytes", packedData.Length));
}
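// Base128Encoder itself is not shown in this file. The sketch below is a
// hypothetical implementation, assuming a plain base-128 varint (LEB128)
// scheme, written only to match the EncodeList/DecodeList surface the tests
// above exercise; the real encoder may differ (for example, it might
// delta-encode sorted doc IDs before writing varints). It relies on
// System.IO for MemoryStream.
class Base128Encoder
{
    // Encode each uint as 1-5 bytes: seven payload bits per byte, with the
    // high bit set on every byte except the last byte of a value.
    public byte[] EncodeList(List<uint> values)
    {
        using (var stream = new MemoryStream())
        {
            foreach (var value in values)
            {
                var v = value;
                while (v >= 0x80)
                {
                    stream.WriteByte((byte)(v | 0x80));
                    v >>= 7;
                }
                stream.WriteByte((byte)v);
            }
            return stream.ToArray();
        }
    }

    // Reverse of EncodeList: accumulate seven bits per byte until a byte
    // with a clear high bit terminates the current value.
    public List<uint> DecodeList(byte[] data)
    {
        var values = new List<uint>();
        uint current = 0;
        var shift = 0;
        foreach (var b in data)
        {
            current |= (uint)(b & 0x7F) << shift;
            if ((b & 0x80) == 0)
            {
                values.Add(current);
                current = 0;
                shift = 0;
            }
            else
            {
                shift += 7;
            }
        }
        return values;
    }
}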