/// <summary>
/// Command-line utility: reads a dictionary file line by line, adds every line to an
/// <c>FSTCompletionBuilder</c> (assigning buckets round-robin by line number), builds the
/// completion automaton and saves it to <c>completion.fst</c>.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
public static void Main(string[] args)
{
    FileInfo input = new FileInfo("/home/dweiss/tmp/shuffled.dict");
    int buckets = 20;
    int shareMaxTail = 10;

    ExternalRefSorter sorter = new ExternalRefSorter(new OfflineSorter());
    try
    {
        FSTCompletionBuilder builder = new FSTCompletionBuilder(buckets, sorter, shareMaxTail);

        // The input is only read, so ask for read access explicitly: the two-argument
        // FileStream constructor requests read/write access and fails on read-only files.
        // The using block releases the reader (and its underlying stream) even on failure.
        using (TextReader reader = new StreamReader(
            new FileStream(input.FullName, FileMode.Open, FileAccess.Read), Encoding.UTF8))
        {
            BytesRef scratch = new BytesRef();
            string line;
            int count = 0;
            while ((line = reader.ReadLine()) != null)
            {
                scratch.CopyChars(line);
                builder.Add(scratch, count % buckets);

                // Progress report; count is already incremented when it is printed.
                if ((count++ % 100000) == 0)
                {
                    Console.WriteLine("Line: " + count);
                }
            }
        }

        Console.WriteLine("Building FSTCompletion.");
        FSTCompletion completion = builder.Build();

        FileInfo fstFile = new FileInfo("completion.fst");
        Console.WriteLine("Done. Writing automaton: " + fstFile.FullName);
        completion.FST.Save(fstFile);
    }
    finally
    {
        // Dispose the sorter on every exit path, not only after a successful save.
        sorter.Dispose();
    }
}
/// <summary>
/// Test fixture setup: feeds every entry returned by <c>EvalKeys()</c> into a fresh
/// <c>FSTCompletionBuilder</c>, stores the built completion in <c>completion</c>, and
/// creates <c>completionAlphabetical</c> as a second <c>FSTCompletion</c> over the same FST.
/// </summary>
public override void SetUp()
{
    base.SetUp();

    FSTCompletionBuilder completionBuilder = new FSTCompletionBuilder();
    foreach (Input entry in EvalKeys())
    {
        // The entry's value is narrowed to int and passed as the bucket argument.
        int bucket = (int)entry.v;
        completionBuilder.Add(entry.term, bucket);
    }

    completion = completionBuilder.Build();

    // Reuses the automaton built above, constructed with the flags (false, true).
    completionAlphabetical = new FSTCompletion(completion.FST, false, true);
}
/// <summary>
/// Adds a single key decoded from the UTF-8 byte sequence F0 A4 AD A2 and verifies that
/// looking the same key up returns exactly one completion.
/// </summary>
public void TestThreeByte()
{
    byte[] utf8Bytes = { 0xF0, 0xA4, 0xAD, 0xA2 };
    string key = Encoding.UTF8.GetString(utf8Bytes);

    FSTCompletionBuilder completionBuilder = new FSTCompletionBuilder();
    completionBuilder.Add(new BytesRef(key), 0);
    FSTCompletion fstCompletion = completionBuilder.Build();

    // Query with the full key and ask for at most one result.
    string query = StringToCharSequence(key).ToString();
    IEnumerable<FSTCompletion.Completion> matches = fstCompletion.DoLookup(query, 1);

    assertEquals(1, matches.Count());
}
// LUCENENET specific - renaming from Main() because we must only have 1 entry point.
// Not sure why this utility is in a test project anyway - this seems like something that should
// be in Lucene.Net.Suggest so we can put it into the lucene-cli tool.
/// <summary>
/// Utility entry point: reads a dictionary file line by line, adds every line to an
/// <c>FSTCompletionBuilder</c> (assigning buckets round-robin by line number), builds the
/// completion automaton and saves it to <c>completion.fst</c>.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
public static void Main2(string[] args)
{
    FileInfo input = new FileInfo("/home/dweiss/tmp/shuffled.dict");
    int buckets = 20;
    int shareMaxTail = 10;

    ExternalRefSorter sorter = new ExternalRefSorter(new OfflineSorter());
    try
    {
        FSTCompletionBuilder builder = new FSTCompletionBuilder(buckets, sorter, shareMaxTail);

        // Open the input read-only: the two-argument FileStream constructor requests
        // read/write access, which fails for files the process may only read.
        // The using block closes the reader and its stream on every exit path.
        using (TextReader reader = new StreamReader(
            new FileStream(input.FullName, FileMode.Open, FileAccess.Read), Encoding.UTF8))
        {
            BytesRef scratch = new BytesRef();
            string line;
            int count = 0;
            while ((line = reader.ReadLine()) != null)
            {
                scratch.CopyChars(line);
                builder.Add(scratch, count % buckets);

                // Progress report; count is already incremented when it is printed.
                if ((count++ % 100000) == 0)
                {
                    Console.WriteLine("Line: " + count);
                }
            }
        }

        Console.WriteLine("Building FSTCompletion.");
        FSTCompletion completion = builder.Build();

        FileInfo fstFile = new FileInfo("completion.fst");
        Console.WriteLine("Done. Writing automaton: " + fstFile.FullName);
        completion.FST.Save(fstFile);
    }
    finally
    {
        // Release the sorter even when reading, building or saving throws.
        sorter.Dispose();
    }
}
/// <summary>
/// Builds the completion automaton from <paramref name="iterator"/>. Each term is written to
/// a temporary file prefixed with its encoded weight, the file is sorted offline, and the
/// sorted entries are distributed over <c>buckets</c> buckets before being added to an
/// <c>FSTCompletionBuilder</c>. Both temporary files are deleted before returning.
/// </summary>
/// <param name="iterator">Source of terms and weights; must not expose payloads or contexts.</param>
/// <exception cref="System.ArgumentException">
/// The iterator reports payloads or contexts, which this suggester does not support.
/// </exception>
public override void Build(IInputIterator iterator)
{
    if (iterator.HasPayloads)
    {
        throw new System.ArgumentException("this suggester doesn't support payloads");
    }
    if (iterator.HasContexts)
    {
        throw new System.ArgumentException("this suggester doesn't support contexts");
    }

    string tempPrefix = typeof(FSTCompletionLookup).Name;
    FileInfo tempInput = FileSupport.CreateTempFile(tempPrefix, ".input", OfflineSorter.DefaultTempDir());
    FileInfo tempSorted = FileSupport.CreateTempFile(tempPrefix, ".sorted", OfflineSorter.DefaultTempDir());

    OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    OfflineSorter.ByteSequencesReader reader = null;
    ExternalRefSorter sorter = null;

    // Weights are written ahead of the term bytes so that the offline sort orders by them.
    // They are assumed to be non-negative for now; negative values would need extra work
    // to get a correct byte order.
    bool success = false;
    count = 0;
    try
    {
        byte[] scratch = new byte[0];
        ByteArrayDataOutput scratchOut = new ByteArrayDataOutput(scratch);

        for (BytesRef term = iterator.Next(); term != null; term = iterator.Next())
        {
            // Four extra bytes hold the encoded weight in front of the term.
            int needed = term.Length + 4;
            if (needed >= scratch.Length)
            {
                scratch = ArrayUtil.Grow(scratch, needed);
            }

            scratchOut.Reset(scratch);
            scratchOut.WriteInt32(EncodeWeight(iterator.Weight));
            scratchOut.WriteBytes(term.Bytes, term.Offset, term.Length);
            writer.Write(scratch, 0, scratchOut.Position);
        }
        writer.Dispose();

        // The score distribution is unknown, so sort everything and then slice the sorted
        // sequence into equally sized buckets.
        OfflineSorter.SortInfo sortInfo = (new OfflineSorter()).Sort(tempInput, tempSorted);
        tempInput.Delete();

        sorter = new ExternalRefSorter(new OfflineSorter());
        FSTCompletionBuilder fstBuilder = new FSTCompletionBuilder(buckets, sorter, sharedTailLength);

        int totalLines = sortInfo.Lines;
        reader = new OfflineSorter.ByteSequencesReader(tempSorted);

        long lineNo = 0;
        int lastBucket = 0;
        int lastScore = 0;
        ByteArrayDataInput entryIn = new ByteArrayDataInput();
        BytesRef entry = new BytesRef();
        BytesRef termOnly = new BytesRef();

        while (reader.Read(entry))
        {
            entryIn.Reset(entry.Bytes);
            int score = entryIn.ReadInt32();

            // Entries with the same score as the previous entry stay in its bucket;
            // otherwise the bucket follows from the position in the sorted input.
            int bucket = (line_IsRepeat(lineNo, score, lastScore))
                ? lastBucket
                : (int)(lineNo * buckets / totalLines);

            lastScore = score;
            lastBucket = bucket;

            // Add only the term: skip the weight prefix that was just read.
            termOnly.Bytes = entry.Bytes;
            termOnly.Offset = entryIn.Position;
            termOnly.Length = entry.Length - entryIn.Position;
            fstBuilder.Add(termOnly, bucket);

            lineNo++;
            count++;
        }

        // Both FSTCompletion instances share the same automaton.
        this.higherWeightsCompletion = fstBuilder.Build();
        this.normalCompletion = new FSTCompletion(higherWeightsCompletion.FST, false, exactMatchFirst);
        success = true;
    }
    finally
    {
        if (!success)
        {
            IOUtils.CloseWhileHandlingException(reader, writer, sorter);
        }
        else
        {
            IOUtils.Close(reader, writer, sorter);
        }

        tempInput.Delete();
        tempSorted.Delete();
    }

    // True when this is not the first entry and its score equals the previous entry's score.
    bool line_IsRepeat(long lineNo, int score, int lastScore)
    {
        return lineNo > 0 && score == lastScore;
    }
}
/// <summary>
/// Builds the completion automaton from <paramref name="iterator"/>: terms are spilled to a
/// temporary file with their encoded weight in front, sorted offline, split across
/// <c>buckets</c> buckets and added to an <c>FSTCompletionBuilder</c>. The temporary files
/// are removed before the method returns.
/// </summary>
/// <param name="iterator">Source of terms and weights; must not expose payloads or contexts.</param>
/// <exception cref="System.ArgumentException">
/// The iterator reports payloads or contexts, which this suggester does not support.
/// </exception>
public override void Build(IInputIterator iterator)
{
    if (iterator.HasPayloads)
    {
        throw new System.ArgumentException("this suggester doesn't support payloads");
    }
    if (iterator.HasContexts)
    {
        throw new System.ArgumentException("this suggester doesn't support contexts");
    }

    string tempPrefix = typeof(FSTCompletionLookup).Name;
    FileInfo tempInput = FileSupport.CreateTempFile(tempPrefix, ".input", OfflineSorter.DefaultTempDir());
    FileInfo tempSorted = FileSupport.CreateTempFile(tempPrefix, ".sorted", OfflineSorter.DefaultTempDir());

    OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    OfflineSorter.ByteSequencesReader reader = null;
    ExternalRefSorter sorter = null;

    // The weight goes in front of each term so the offline sort orders by it. Weights are
    // assumed to be non-negative for now; negative values would need special handling to
    // sort correctly by bytes.
    bool success = false;
    count = 0;
    try
    {
        byte[] scratch = new byte[0];
        ByteArrayDataOutput scratchOut = new ByteArrayDataOutput(scratch);

        BytesRef term = iterator.Next();
        while (term != null)
        {
            // Reserve four additional bytes for the encoded weight prefix.
            int needed = term.Length + 4;
            if (needed >= scratch.Length)
            {
                scratch = ArrayUtil.Grow(scratch, needed);
            }

            scratchOut.Reset(scratch);
            scratchOut.WriteInt(EncodeWeight(iterator.Weight));
            scratchOut.WriteBytes(term.Bytes, term.Offset, term.Length);
            writer.Write(scratch, 0, scratchOut.Position);

            term = iterator.Next();
        }
        writer.Dispose();

        // Score distribution is unknown: sort first, then divide the sorted entries into
        // equal buckets.
        OfflineSorter.SortInfo sortInfo = (new OfflineSorter()).Sort(tempInput, tempSorted);
        tempInput.Delete();

        sorter = new ExternalRefSorter(new OfflineSorter());
        FSTCompletionBuilder fstBuilder = new FSTCompletionBuilder(buckets, sorter, sharedTailLength);

        int totalLines = sortInfo.Lines;
        reader = new OfflineSorter.ByteSequencesReader(tempSorted);

        long lineNo = 0;
        int lastBucket = 0;
        int lastScore = 0;
        ByteArrayDataInput entryIn = new ByteArrayDataInput();
        BytesRef entry = new BytesRef();
        BytesRef termOnly = new BytesRef();

        while (reader.Read(entry))
        {
            entryIn.Reset(entry.Bytes);
            int score = entryIn.ReadInt();

            // Keep equal consecutive scores in one bucket; otherwise derive the bucket
            // from the entry's position in the sorted input.
            bool sameAsPrevious = lineNo > 0 && score == lastScore;
            int bucket = sameAsPrevious
                ? lastBucket
                : (int)(lineNo * buckets / totalLines);

            lastScore = score;
            lastBucket = bucket;

            // Pass on the term bytes only, skipping the weight that was just consumed.
            termOnly.Bytes = entry.Bytes;
            termOnly.Offset = entryIn.Position;
            termOnly.Length = entry.Length - entryIn.Position;
            fstBuilder.Add(termOnly, bucket);

            lineNo++;
            count++;
        }

        // The two FSTCompletion instances are backed by the same automaton.
        this.higherWeightsCompletion = fstBuilder.Build();
        this.normalCompletion = new FSTCompletion(higherWeightsCompletion.FST, false, exactMatchFirst);
        success = true;
    }
    finally
    {
        if (!success)
        {
            IOUtils.CloseWhileHandlingException(reader, writer, sorter);
        }
        else
        {
            IOUtils.Close(reader, writer, sorter);
        }

        tempInput.Delete();
        tempSorted.Delete();
    }
}
/// <summary>
/// Builds a completion containing one key, decoded from the UTF-8 bytes F0 A4 AD A2, and
/// checks that a lookup of that key yields exactly one result.
/// </summary>
public void TestThreeByte()
{
    byte[] encodedKey = { 0xF0, 0xA4, 0xAD, 0xA2 };
    string key = Encoding.UTF8.GetString(encodedKey);

    FSTCompletionBuilder completionBuilder = new FSTCompletionBuilder();
    completionBuilder.Add(new BytesRef(key), 0);
    FSTCompletion fstCompletion = completionBuilder.Build();

    // Look the whole key up, requesting at most one completion.
    string query = StringToCharSequence(key).ToString();
    IEnumerable<FSTCompletion.Completion> found = fstCompletion.DoLookup(query, 1);

    assertEquals(1, found.Count());
}