public FSTFieldWriter(VariableGapTermsIndexWriter outerInstance, FieldInfo fieldInfo, long termsFilePointer)
{
    this.outerInstance = outerInstance;
    this.fieldInfo = fieldInfo;
    fstOutputs = PositiveInt32Outputs.Singleton;
    fstBuilder = new Builder<long?>(FST.INPUT_TYPE.BYTE1, fstOutputs);
    indexStart = outerInstance.m_output.Position; // LUCENENET specific: Renamed from getFilePointer() to match FileStream
    ////System.out.println("VGW: field=" + fieldInfo.name);

    // Always put empty string in
    fstBuilder.Add(new Int32sRef(), termsFilePointer);
    startTermsFilePointer = termsFilePointer;
}
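// ---------------------------------------------------------------------------
// A minimal, self-contained sketch (not from this codebase) of the FST-building
// pattern shared by the methods in this listing: a PositiveInt32Outputs
// singleton, a Builder over BYTE1 input labels, Add() calls in sorted input
// order, then Finish() to freeze the FST. The inputs "cat"/"dog" and their
// output values are illustrative assumptions only.
// ---------------------------------------------------------------------------
// using Lucene.Net.Util;
// using Lucene.Net.Util.Fst;
private static FST<long?> BuildExampleFst()
{
    PositiveInt32Outputs outputs = PositiveInt32Outputs.Singleton;
    var builder = new Builder<long?>(FST.INPUT_TYPE.BYTE1, outputs);
    var scratchInts = new Int32sRef();

    // Inputs must be added in sorted (unsigned byte) order.
    builder.Add(Util.ToInt32sRef(new BytesRef("cat"), scratchInts), 5L);
    builder.Add(Util.ToInt32sRef(new BytesRef("dog"), scratchInts), 7L);

    FST<long?> fst = builder.Finish();
    long? value = Util.Get(fst, new BytesRef("dog")); // 7
    return fst;
}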
internal TermsWriter(FSTOrdTermsWriter outerInstance, FieldInfo fieldInfo)
{
    _outerInstance = outerInstance;
    _numTerms = 0;
    _fieldInfo = fieldInfo;
    _longsSize = outerInstance.postingsWriter.SetField(fieldInfo);
    _outputs = PositiveInt32Outputs.Singleton;
    _builder = new Builder<Int64>(FST.INPUT_TYPE.BYTE1, _outputs);

    _lastBlockStatsFp = 0;
    _lastBlockMetaLongsFp = 0;
    _lastBlockMetaBytesFp = 0;
    _lastBlockLongs = new long[_longsSize];

    _lastLongs = new long[_longsSize];
    _lastMetaBytesFp = 0;
}
private void LoadTermsIndex()
{
    if (fst == null)
    {
        using (IndexInput clone = (IndexInput)outerInstance.input.Clone())
        {
            clone.Seek(indexStart);
            fst = new FST<long?>(clone, outerInstance.fstOutputs);
        } // clone.Dispose();

        /*
         * final String dotFileName = segment + "_" + fieldInfo.name + ".dot";
         * Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName));
         * Util.toDot(fst, w, false, false);
         * System.out.println("FST INDEX: SAVED to " + dotFileName);
         * w.close();
         */

        if (outerInstance.indexDivisor > 1)
        {
            // subsample
            Int32sRef scratchIntsRef = new Int32sRef();
            PositiveInt32Outputs outputs = PositiveInt32Outputs.Singleton;
            Builder<long?> builder = new Builder<long?>(FST.INPUT_TYPE.BYTE1, outputs);
            BytesRefFSTEnum<long?> fstEnum = new BytesRefFSTEnum<long?>(fst);
            BytesRefFSTEnum.InputOutput<long?> result;
            int count = outerInstance.indexDivisor;
            while (fstEnum.MoveNext())
            {
                result = fstEnum.Current;
                if (count == outerInstance.indexDivisor)
                {
                    builder.Add(Util.Fst.Util.ToInt32sRef(result.Input, scratchIntsRef), result.Output);
                    count = 0;
                }
                count++;
            }
            fst = builder.Finish();
        }
    }
}
private void WriteFST(FieldInfo field, IEnumerable<BytesRef> values)
{
    meta.WriteVInt32(field.Number);
    meta.WriteByte(MemoryDocValuesProducer.FST);
    meta.WriteInt64(data.GetFilePointer());
    PositiveInt32Outputs outputs = PositiveInt32Outputs.Singleton;
    var builder = new Builder<long?>(INPUT_TYPE.BYTE1, outputs);
    var scratch = new Int32sRef();
    long ord = 0;
    foreach (BytesRef v in values)
    {
        builder.Add(Util.ToInt32sRef(v, scratch), ord);
        ord++;
    }

    FST<long?> fst = builder.Finish();
    if (fst != null)
    {
        fst.Save(data);
    }
    meta.WriteVInt64(ord);
}
private void WriteFST(FieldInfo field, IEnumerable<BytesRef> values)
{
    meta.WriteVInt32(field.Number);
    meta.WriteByte((byte)Lucene42DocValuesProducer.FST);
    meta.WriteInt64(data.Position); // LUCENENET specific: Renamed from getFilePointer() to match FileStream
    PositiveInt32Outputs outputs = PositiveInt32Outputs.Singleton;
    Builder<Int64> builder = new Builder<Int64>(INPUT_TYPE.BYTE1, outputs);
    Int32sRef scratch = new Int32sRef();
    long ord = 0;
    foreach (BytesRef v in values)
    {
        builder.Add(Util.ToInt32sRef(v, scratch), ord);
        ord++;
    }

    var fst = builder.Finish();
    if (fst != null)
    {
        fst.Save(data);
    }
    meta.WriteVInt64(ord);
}
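// ---------------------------------------------------------------------------
// A hedged sketch (not from this codebase) of reading back the ord FSTs that
// the two WriteFST methods above produce: Util.Get for term -> ord and, since
// the ords increase monotonically with the sorted terms, Util.GetByOutput for
// ord -> term. The generic parameter (long? vs J2N.Numerics.Int64) and the
// exact signatures vary across Lucene.NET 4.8 betas and are assumptions here,
// as are the fst/term names.
// ---------------------------------------------------------------------------
// using Int64 = J2N.Numerics.Int64;
private static void OrdFstLookupExample(FST<Int64> fst)
{
    // term -> ord (null when the term is absent)
    Int64 ord = Util.Get(fst, new BytesRef("example")); // "example" is illustrative

    // ord -> term; valid only because outputs are monotonic in the input order
    Int32sRef path = Util.GetByOutput(fst, 42);
    if (path != null)
    {
        BytesRef term = Util.ToBytesRef(path, new BytesRef());
    }
}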
public UserDictionary(TextReader reader)
{
    string line = null;
    int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
    JCG.List<string[]> featureEntries = new JCG.List<string[]>();

    // text, segmentation, readings, POS
    while ((line = reader.ReadLine()) != null)
    {
        // Remove comments
        line = specialChars.Replace(line, "");

        // Skip empty lines or comment lines
        if (line.Trim().Length == 0)
        {
            continue;
        }
        string[] values = CSVUtil.Parse(line);
        featureEntries.Add(values);
    }

    // TODO: should we allow multiple segmentations per input 'phrase'?
    // the old treemap didn't support this either, and I'm not sure if it's needed/useful?
    featureEntries.Sort(Comparer<string[]>.Create((left, right) => left[0].CompareToOrdinal(right[0])));

    JCG.List<string> data = new JCG.List<string>(featureEntries.Count);
    JCG.List<int[]> segmentations = new JCG.List<int[]>(featureEntries.Count);

    PositiveInt32Outputs fstOutput = PositiveInt32Outputs.Singleton;
    Builder<Int64> fstBuilder = new Builder<Int64>(Lucene.Net.Util.Fst.FST.INPUT_TYPE.BYTE2, fstOutput);
    Int32sRef scratch = new Int32sRef();
    long ord = 0;

    foreach (string[] values in featureEntries)
    {
        string[] segmentation = commentLine.Replace(values[1], " ").Split(' ').TrimEnd();
        string[] readings = commentLine.Replace(values[2], " ").Split(' ').TrimEnd();
        string pos = values[3];

        if (segmentation.Length != readings.Length)
        {
            throw RuntimeException.Create("Illegal user dictionary entry " + values[0] +
                " - the number of segmentations (" + segmentation.Length + ")" +
                " does not match the number of readings (" + readings.Length + ")");
        }

        int[] wordIdAndLength = new int[segmentation.Length + 1]; // wordId offset, length, length....
        wordIdAndLength[0] = wordId;
        for (int i = 0; i < segmentation.Length; i++)
        {
            wordIdAndLength[i + 1] = segmentation[i].Length;
            data.Add(readings[i] + Dictionary.INTERNAL_SEPARATOR + pos);
            wordId++;
        }

        // add mapping to FST
        string token = values[0];
        scratch.Grow(token.Length);
        scratch.Length = token.Length;
        for (int i = 0; i < token.Length; i++)
        {
            scratch.Int32s[i] = (int)token[i];
        }
        fstBuilder.Add(scratch, ord);
        segmentations.Add(wordIdAndLength);
        ord++;
    }

    this.fst = new TokenInfoFST(fstBuilder.Finish(), false);
    this.data = data.ToArray(/*new string[data.Count]*/);
    this.segmentations = segmentations.ToArray(/*new int[segmentations.Count][]*/);
}
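// ---------------------------------------------------------------------------
// A hedged helper sketch (not from this codebase): with INPUT_TYPE.BYTE2 the
// FST's input labels are UTF-16 code units, so a C# string maps onto an
// Int32sRef one char per slot, exactly as the loops above and below do. The
// helper name is illustrative only.
// ---------------------------------------------------------------------------
private static Int32sRef ToUtf16Int32sRef(string token, Int32sRef scratch)
{
    scratch.Grow(token.Length);
    scratch.Length = token.Length;
    for (int i = 0; i < token.Length; i++)
    {
        scratch.Int32s[i] = token[i]; // each UTF-16 code unit becomes one label
    }
    return scratch;
}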
public virtual TokenInfoDictionaryWriter BuildDictionary(IList<string> csvFiles)
{
    TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024);

    // all lines in the file
    Console.WriteLine(" parse...");
    List<string[]> lines = new List<string[]>(400000);
    foreach (string file in csvFiles)
    {
        using (Stream inputStream = new FileStream(file, FileMode.Open, FileAccess.Read))
        {
            Encoding decoder = Encoding.GetEncoding(encoding);
            TextReader reader = new StreamReader(inputStream, decoder);

            string line = null;
            while ((line = reader.ReadLine()) != null)
            {
                string[] entry = CSVUtil.Parse(line);

                if (entry.Length < 13)
                {
                    Console.WriteLine("Entry in CSV is not valid: " + line);
                    continue;
                }

                string[] formatted = FormatEntry(entry);
                lines.Add(formatted);

                // NFKC normalize dictionary entry
                if (normalizeEntries)
                {
                    //if (normalizer.isNormalized(entry[0])){
                    if (entry[0].IsNormalized(NormalizationForm.FormKC))
                    {
                        continue;
                    }
                    string[] normalizedEntry = new string[entry.Length];
                    for (int i = 0; i < entry.Length; i++)
                    {
                        //normalizedEntry[i] = normalizer.normalize(entry[i]);
                        normalizedEntry[i] = entry[i].Normalize(NormalizationForm.FormKC);
                    }

                    formatted = FormatEntry(normalizedEntry);
                    lines.Add(formatted);
                }
            }
        }
    }

    Console.WriteLine(" sort...");

    // sort by term: we sorted the files already and use a stable sort.
    lines.Sort(new ComparerAnonymousHelper());

    Console.WriteLine(" encode...");

    PositiveInt32Outputs fstOutput = PositiveInt32Outputs.Singleton;
    Builder<long?> fstBuilder = new Builder<long?>(
        Lucene.Net.Util.Fst.FST.INPUT_TYPE.BYTE2,
        0, 0, true, true, int.MaxValue,  // minSuffixCount1, minSuffixCount2, doShareSuffix, doShareNonSingletonNodes, shareMaxTailLength
        fstOutput, null, true,           // outputs, freezeTail, doPackFST
        PackedInt32s.DEFAULT, true, 15); // acceptableOverheadRatio, allowArrayArcs, bytesPageBits
    Int32sRef scratch = new Int32sRef();
    long ord = -1; // first ord will be 0
    string lastValue = null;

    // build tokeninfo dictionary
    foreach (string[] entry in lines)
    {
        int next = dictionary.Put(entry);

        if (next == offset)
        {
            Console.WriteLine("Failed to process line: " + Collections.ToString(entry));
            continue;
        }

        string token = entry[0];
        if (!token.Equals(lastValue, StringComparison.Ordinal))
        {
            // new word to add to fst
            ord++;
            lastValue = token;
            scratch.Grow(token.Length);
            scratch.Length = token.Length;
            for (int i = 0; i < token.Length; i++)
            {
                scratch.Int32s[i] = (int)token[i];
            }
            fstBuilder.Add(scratch, ord);
        }

        dictionary.AddMapping((int)ord, offset);
        offset = next;
    }

    FST<long?> fst = fstBuilder.Finish();

    Console.WriteLine(" " + fst.NodeCount + " nodes, " + fst.ArcCount + " arcs, " + fst.GetSizeInBytes() + " bytes... ");
    dictionary.SetFST(fst);
    Console.WriteLine(" done");

    return dictionary;
}
private void LoadTerms()
{
    PositiveInt32Outputs posIntOutputs = PositiveInt32Outputs.Singleton;
    var outputsInner = new PairOutputs<Int64, Int64>(posIntOutputs, posIntOutputs);
    var outputs = new PairOutputs<Int64, PairOutputs<Int64, Int64>.Pair>(posIntOutputs, outputsInner);
    var b = new Builder<PairOutputs<Int64, PairOutputs<Int64, Int64>.Pair>.Pair>(FST.INPUT_TYPE.BYTE1, outputs);
    IndexInput @in = (IndexInput)outerInstance.input.Clone();
    @in.Seek(termsStart);
    BytesRef lastTerm = new BytesRef(10);
    long lastDocsStart = -1;
    int docFreq = 0;
    long totalTermFreq = 0;
    FixedBitSet visitedDocs = new FixedBitSet(maxDoc);
    Int32sRef scratchIntsRef = new Int32sRef();
    while (true)
    {
        SimpleTextUtil.ReadLine(@in, scratch);
        if (scratch.Equals(SimpleTextFieldsWriter.END) || StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.FIELD))
        {
            if (lastDocsStart != -1)
            {
                b.Add(Util.ToInt32sRef(lastTerm, scratchIntsRef),
                      outputs.NewPair(lastDocsStart, outputsInner.NewPair((long)docFreq, totalTermFreq)));
                sumTotalTermFreq += totalTermFreq;
            }
            break;
        }
        else if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.DOC))
        {
            docFreq++;
            sumDocFreq++;
            UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + SimpleTextFieldsWriter.DOC.Length,
                scratch.Length - SimpleTextFieldsWriter.DOC.Length, scratchUTF16);
            int docID = ArrayUtil.ParseInt32(scratchUTF16.Chars, 0, scratchUTF16.Length);
            visitedDocs.Set(docID);
        }
        else if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.FREQ))
        {
            UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + SimpleTextFieldsWriter.FREQ.Length,
                scratch.Length - SimpleTextFieldsWriter.FREQ.Length, scratchUTF16);
            totalTermFreq += ArrayUtil.ParseInt32(scratchUTF16.Chars, 0, scratchUTF16.Length);
        }
        else if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.TERM))
        {
            if (lastDocsStart != -1)
            {
                b.Add(Util.ToInt32sRef(lastTerm, scratchIntsRef),
                      outputs.NewPair(lastDocsStart, outputsInner.NewPair((long)docFreq, totalTermFreq)));
            }
            lastDocsStart = @in.Position; // LUCENENET specific: Renamed from getFilePointer() to match FileStream
            int len = scratch.Length - SimpleTextFieldsWriter.TERM.Length;
            if (len > lastTerm.Length)
            {
                lastTerm.Grow(len);
            }
            System.Array.Copy(scratch.Bytes, SimpleTextFieldsWriter.TERM.Length, lastTerm.Bytes, 0, len);
            lastTerm.Length = len;
            docFreq = 0;
            sumTotalTermFreq += totalTermFreq;
            totalTermFreq = 0;
            termCount++;
        }
    }
    docCount = visitedDocs.Cardinality;
    fst = b.Finish();
    /*
     * PrintStream ps = new PrintStream("out.dot");
     * fst.toDot(ps);
     * ps.close();
     * System.out.println("SAVED out.dot");
     */
    //System.out.println("FST " + fst.sizeInBytes());
}
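// ---------------------------------------------------------------------------
// A hedged sketch (not from this codebase) of unpacking the nested pair
// outputs built in LoadTerms() above: Output1 carries the docs-start file
// pointer, Output2 the inner (docFreq, totalTermFreq) pair. The term used for
// the lookup and the method name are illustrative assumptions.
// ---------------------------------------------------------------------------
private static void ReadTermStats(FST<PairOutputs<Int64, PairOutputs<Int64, Int64>.Pair>.Pair> fst)
{
    var result = Util.Get(fst, new BytesRef("someterm")); // hypothetical term
    if (result != null)
    {
        long docsStart = result.Output1;             // file pointer to the term's docs
        long docFreq = result.Output2.Output1;       // number of documents with the term
        long totalTermFreq = result.Output2.Output2; // total occurrences of the term
    }
}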