/// <summary>
/// Decodes the Unicode code points of the provided <see cref="string"/> and stores them,
/// one per slot, in the provided scratch <see cref="Int32sRef"/> (which must not be
/// <c>null</c>), then returns that same scratch instance.
/// </summary>
/// <param name="s">The UTF-16 text to decode.</param>
/// <param name="scratch">Destination buffer; it is grown as needed and its <c>Length</c> is set to the number of code points written.</param>
/// <returns><paramref name="scratch"/>.</returns>
public static Int32sRef ToUTF32(string s, Int32sRef scratch)
{
    int written = 0;
    int end = s.Length;

    // 'pos' advances by one or two chars per iteration, depending on the code point just read.
    for (int pos = 0; pos < end; written++)
    {
        scratch.Grow(written + 1);
        int codePoint = Character.CodePointAt(s, pos);
        scratch.Int32s[written] = codePoint;
        pos += Character.CharCount(codePoint);
    }

    scratch.Length = written;
    return scratch;
}
/// <summary>
/// Decodes the Unicode code points found in a slice of the provided <see cref="T:char[]"/>
/// and stores them, one per slot, in the provided scratch <see cref="Int32sRef"/> (which must
/// not be <c>null</c>), then returns that same scratch instance.
/// </summary>
/// <param name="s">The UTF-16 characters to decode.</param>
/// <param name="offset">Index of the first char of the slice.</param>
/// <param name="length">Number of chars in the slice.</param>
/// <param name="scratch">Destination buffer; it is grown as needed and its <c>Length</c> is set to the number of code points written.</param>
/// <returns><paramref name="scratch"/>.</returns>
public static Int32sRef ToUTF32(char[] s, int offset, int length, Int32sRef scratch)
{
    int end = offset + length;
    int written = 0;

    // 'pos' advances by one or two chars per iteration, depending on the code point just read.
    for (int pos = offset; pos < end; written++)
    {
        scratch.Grow(written + 1);
        // The limit keeps the decoder from reading past the end of the slice.
        int codePoint = Character.CodePointAt(s, pos, end);
        scratch.Int32s[written] = codePoint;
        pos += Character.CharCount(codePoint);
    }

    scratch.Length = written;
    return scratch;
}
/// <summary>
/// Walks <paramref name="fst"/> from its first arc, at each step collecting all arcs leaving
/// the current target and following one picked with <c>Random.Next</c>, until an arc labelled
/// <c>FST.END_LABEL</c> is chosen. The labels followed are written into <paramref name="in"/>
/// (which is reset first) and the outputs along the path are combined and returned.
/// </summary>
/// <param name="fst">The FST to walk.</param>
/// <param name="in">Receives the labels of the chosen path; its <c>Length</c> and <c>Offset</c> are reset to 0 before the walk.</param>
/// <returns>The accumulated output of the chosen path.</returns>
private T RandomAcceptedWord(FST <T> fst, Int32sRef @in)
{
    FST.Arc<T> current = fst.GetFirstArc(new FST.Arc<T>());
    IList<FST.Arc<T>> candidates = new List<FST.Arc<T>>();

    @in.Length = 0;
    @in.Offset = 0;

    T accumulated = fst.Outputs.NoOutput;
    FST.BytesReader reader = fst.GetBytesReader();

    for (;;)
    {
        // Gather a private copy of every arc leaving the current target,
        // because 'current' is reused as the read buffer.
        fst.ReadFirstTargetArc(current, current, reader);
        for (;;)
        {
            candidates.Add(new FST.Arc<T>().CopyFrom(current));
            if (current.IsLast)
            {
                break;
            }
            fst.ReadNextArc(current, reader);
        }

        // Follow one of them.
        current = candidates[Random.Next(candidates.Count)];
        candidates.Clear();

        accumulated = fst.Outputs.Add(accumulated, current.Output);

        // The end label terminates the word and is not appended to the input.
        if (current.Label == FST.END_LABEL)
        {
            return accumulated;
        }

        if (@in.Int32s.Length == @in.Length)
        {
            @in.Grow(1 + @in.Length);
        }
        @in.Int32s[@in.Length++] = current.Label;
    }
}
/// <summary>
/// Decodes the Unicode code points of <paramref name="s"/> into <paramref name="ir"/>,
/// growing it only when its backing array is full, and returns <paramref name="ir"/>.
/// </summary>
/// <param name="s">The UTF-16 text to decode.</param>
/// <param name="ir">Destination buffer; its <c>Length</c> is set to the number of code points written.</param>
/// <returns><paramref name="ir"/>.</returns>
internal static Int32sRef ToInt32sRefUTF32(string s, Int32sRef ir)
{
    int end = s.Length;
    int written = 0;

    // 'pos' advances by one or two chars per iteration, depending on the code point just read.
    for (int pos = 0; pos < end; written++)
    {
        if (written == ir.Int32s.Length)
        {
            ir.Grow(written + 1);
        }

        int codePoint = Character.CodePointAt(s, pos);
        ir.Int32s[written] = codePoint;
        pos += Character.CharCount(codePoint);
    }

    ir.Length = written;
    return ir;
}
/// <summary>
/// Expert: like <see cref="Util.GetByOutput(FST{long?}, long)"/> except reusing
/// <see cref="FST.BytesReader"/>, initial and scratch Arc, and result.
/// <para/>
/// Starting from <paramref name="arc"/>, follows arcs while summing their outputs and
/// recording each followed label in <paramref name="result"/>, looking for a final arc
/// whose accumulated output equals <paramref name="targetOutput"/>.
/// </summary>
/// <param name="fst">The FST to search.</param>
/// <param name="targetOutput">The accumulated output to look for.</param>
/// <param name="in">Reader used to read arcs of <paramref name="fst"/>.</param>
/// <param name="arc">The arc to start from; it is overwritten as the search descends.</param>
/// <param name="scratchArc">Reusable arc used to remember the previously scanned arc during the linear scan.</param>
/// <param name="result">Receives the labels of the matching path; its <c>Length</c> is set only when a match is found.</param>
/// <returns><paramref name="result"/> when a matching path is found; otherwise <c>null</c>.</returns>
public static Int32sRef GetByOutput(FST <long?> fst, long targetOutput, FST.BytesReader @in, FST.Arc <long?> arc, FST.Arc <long?> scratchArc, Int32sRef result)
{
    // Running sum of the outputs on the path followed so far.
    long output = arc.Output.Value;
    // Number of labels written into result so far.
    int upto = 0;
    while (true)
    {
        if (arc.IsFinal)
        {
            long finalOutput = output + arc.NextFinalOutput.Value;
            if (finalOutput == targetOutput)
            {
                // Found: publish how many labels make up the path.
                result.Length = upto;
                return(result);
            }
            else if (finalOutput > targetOutput)
            {
                // Already past the target here; treated as not found.
                return(null);
            }
        }
        if (FST <long?> .TargetHasArcs(arc))
        {
            // Make room for the one label appended in this iteration.
            if (result.Int32s.Length == upto)
            {
                result.Grow(1 + upto);
            }
            fst.ReadFirstRealTargetArc(arc.Target, arc, @in);
            if (arc.BytesPerArc != 0)
            {
                // Arcs of this node are addressed as PosArcsStart + BytesPerArc * index,
                // so binary-search them by output.
                // NOTE(review): the search assumes arc outputs within a node are in
                // non-decreasing order -- confirm against how the FST is built.
                int low = 0;
                int high = arc.NumArcs - 1;
                int mid = 0;
                bool exact = false;
                while (low <= high)
                {
                    // Unsigned shift keeps the midpoint correct even if low + high wraps negative.
                    mid = (int)((uint)(low + high) >> 1);
                    @in.Position = arc.PosArcsStart;
                    @in.SkipBytes(arc.BytesPerArc * mid);
                    var flags = (sbyte)@in.ReadByte();
                    // The label is read only to advance the reader past it.
                    fst.ReadLabel(@in);
                    // Smallest accumulated output reachable by taking arc 'mid'.
                    long minArcOutput;
                    if ((flags & FST.BIT_ARC_HAS_OUTPUT) != 0)
                    {
                        long arcOutput = fst.Outputs.Read(@in).Value;
                        minArcOutput = output + arcOutput;
                    }
                    else
                    {
                        minArcOutput = output;
                    }
                    if (minArcOutput == targetOutput)
                    {
                        exact = true;
                        break;
                    }
                    else if (minArcOutput < targetOutput)
                    {
                        low = mid + 1;
                    }
                    else
                    {
                        high = mid - 1;
                    }
                }
                if (high == -1)
                {
                    // Even the first arc's output is above the target.
                    return(null);
                }
                else if (exact)
                {
                    // Position one before 'mid' so the ReadNextRealArc call below lands on it.
                    // NOTE(review): presumes ReadNextRealArc advances ArcIdx before reading -- confirm.
                    arc.ArcIdx = mid - 1;
                }
                else
                {
                    // Position so the next read lands on index low - 1, the last arc
                    // whose output did not exceed the target (same presumption as above).
                    arc.ArcIdx = low - 2;
                }
                fst.ReadNextRealArc(arc, @in);
                result.Int32s[upto++] = arc.Label;
                output += arc.Output.Value;
            }
            else
            {
                // No fixed per-arc size: scan the node's arcs one by one.
                FST.Arc <long?> prevArc = null;
                while (true)
                {
                    // this is the min output we'd hit if we follow
                    // this arc:
                    long minArcOutput = output + arc.Output.Value;
                    if (minArcOutput == targetOutput)
                    {
                        // Recurse on this arc:
                        output = minArcOutput;
                        result.Int32s[upto++] = arc.Label;
                        break;
                    }
                    else if (minArcOutput > targetOutput)
                    {
                        if (prevArc == null)
                        {
                            // Output doesn't exist
                            return(null);
                        }
                        else
                        {
                            // Recurse on previous arc:
                            arc.CopyFrom(prevArc);
                            result.Int32s[upto++] = arc.Label;
                            output += arc.Output.Value;
                            break;
                        }
                    }
                    else if (arc.IsLast)
                    {
                        // Recurse on this arc:
                        output = minArcOutput;
                        result.Int32s[upto++] = arc.Label;
                        break;
                    }
                    else
                    {
                        // Read next arc in this node, remembering the current one
                        // in the caller-supplied scratch arc:
                        prevArc = scratchArc;
                        prevArc.CopyFrom(arc);
                        fst.ReadNextRealArc(arc, @in);
                    }
                }
            }
        }
        else
        {
            // Nothing further to follow and no match so far: not found.
            return(null);
        }
    }
}
/// <summary>
/// For each index field in <paramref name="byField"/>: adds every facet field's category to
/// the taxonomy, collects the resulting ordinals (plus parent ordinals where the dimension
/// configuration asks for them), adds one drill-down <see cref="StringField"/> per path
/// prefix to <paramref name="doc"/>, and finally adds a single
/// <see cref="BinaryDocValuesField"/> holding the de-duplicated, encoded ordinals.
/// </summary>
/// <param name="taxoWriter">Taxonomy writer used to add categories and look up parents.</param>
/// <param name="byField">Facet fields grouped by the index field name they are written under.</param>
/// <param name="doc">The document that receives the generated fields.</param>
/// <exception cref="ArgumentException">A facet field has more than one path component but its dimension is not hierarchical.</exception>
private void ProcessFacetFields(ITaxonomyWriter taxoWriter, IDictionary <string, IList <FacetField> > byField, Document doc)
{
    foreach (KeyValuePair<string, IList<FacetField>> entry in byField)
    {
        string indexFieldName = entry.Key;
        Int32sRef ords = new Int32sRef(32);

        foreach (FacetField field in entry.Value)
        {
            FacetsConfig.DimConfig dimConfig = GetDimConfig(field.Dim);
            if (field.Path.Length > 1 && !dimConfig.IsHierarchical)
            {
                throw new ArgumentException("dimension \"" + field.Dim + "\" is not hierarchical yet has " + field.Path.Length + " components");
            }

            FacetLabel label = new FacetLabel(field.Dim, field.Path);

            CheckTaxoWriter(taxoWriter);
            int ordinal = taxoWriter.AddCategory(label);

            if (ords.Length == ords.Int32s.Length)
            {
                ords.Grow(ords.Length + 1);
            }
            ords.Int32s[ords.Length++] = ordinal;

            if (dimConfig.IsMultiValued && (dimConfig.IsHierarchical || dimConfig.RequireDimCount))
            {
                // Add all parents too:
                for (int parent = taxoWriter.GetParent(ordinal); parent > 0; parent = taxoWriter.GetParent(parent))
                {
                    if (ords.Int32s.Length == ords.Length)
                    {
                        ords.Grow(ords.Length + 1);
                    }
                    ords.Int32s[ords.Length++] = parent;
                }

                if (!dimConfig.RequireDimCount)
                {
                    // Remove last (dimension) ord:
                    ords.Length--;
                }
            }

            // Drill down: one term per path prefix, from the dimension alone up to the full path.
            for (int prefixLength = 1; prefixLength <= label.Length; prefixLength++)
            {
                string term = PathToString(label.Components, prefixLength);
                doc.Add(new StringField(indexFieldName, term, Field.Store.NO));
            }
        }

        // Facet counts:
        // DocValues are considered stored fields:
        doc.Add(new BinaryDocValuesField(indexFieldName, DedupAndEncode(ords)));
    }
}
/// <summary>
/// Builds a user dictionary from the lines of <paramref name="reader"/>. Each usable line
/// is parsed as CSV with the columns: text, segmentation, readings, POS. Entries are sorted
/// by text (ordinal comparison) and each text is added to an FST mapped to its entry ordinal.
/// </summary>
/// <param name="reader">Source of the dictionary lines; it is read to the end but not closed here.</param>
public UserDictionary(TextReader reader)
{
    int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
    JCG.List<string[]> featureEntries = new JCG.List<string[]>();

    // text, segmentation, readings, POS
    for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
    {
        // Remove comments
        string stripped = specialChars.Replace(line, "");

        // Keep only lines that still have content once comments are gone
        if (stripped.Trim().Length != 0)
        {
            featureEntries.Add(CSVUtil.Parse(stripped));
        }
    }

    // TODO: should we allow multiple segmentations per input 'phrase'?
    // the old treemap didn't support this either, and i'm not sure if its needed/useful?
    Comparer<string[]> byText = Comparer<string[]>.Create((left, right) => left[0].CompareToOrdinal(right[0]));
    featureEntries.Sort(byText);

    JCG.List<string> dataList = new JCG.List<string>(featureEntries.Count);
    JCG.List<int[]> segmentationList = new JCG.List<int[]>(featureEntries.Count);

    PositiveInt32Outputs fstOutput = PositiveInt32Outputs.Singleton;
    Builder<Int64> fstBuilder = new Builder<Int64>(Lucene.Net.Util.Fst.FST.INPUT_TYPE.BYTE2, fstOutput);
    Int32sRef scratch = new Int32sRef();
    long ord = 0;

    foreach (string[] fields in featureEntries)
    {
        // NOTE(review): despite its name, commentLine is applied to the segmentation and
        // readings columns before splitting on ' ' -- confirm what the pattern matches.
        string[] segmentation = commentLine.Replace(fields[1], " ").Split(' ').TrimEnd();
        string[] readings = commentLine.Replace(fields[2], " ").Split(' ').TrimEnd();
        string pos = fields[3];

        if (segmentation.Length != readings.Length)
        {
            throw RuntimeException.Create("Illegal user dictionary entry " + fields[0] + " - the number of segmentations (" + segmentation.Length + ")" + " does not the match number of readings (" + readings.Length + ")");
        }

        // Layout: [wordId of first segment, length of segment 0, length of segment 1, ...]
        int segmentCount = segmentation.Length;
        int[] wordIdAndLength = new int[segmentCount + 1];
        wordIdAndLength[0] = wordId;
        for (int slot = 1; slot <= segmentCount; slot++)
        {
            wordIdAndLength[slot] = segmentation[slot - 1].Length;
            dataList.Add(readings[slot - 1] + Dictionary.INTERNAL_SEPARATOR + pos);
        }
        // Every segment consumes one word id.
        wordId += segmentCount;

        // add mapping to FST: one UTF-16 code unit per FST input label
        string token = fields[0];
        int tokenLength = token.Length;
        scratch.Grow(tokenLength);
        scratch.Length = tokenLength;
        for (int c = 0; c < tokenLength; c++)
        {
            scratch.Int32s[c] = (int)token[c];
        }
        fstBuilder.Add(scratch, ord);
        segmentationList.Add(wordIdAndLength);
        ord++;
    }

    this.fst = new TokenInfoFST(fstBuilder.Finish(), false);
    this.data = dataList.ToArray();
    this.segmentations = segmentationList.ToArray();
}
/// <summary>
/// Parses the given CSV files into token-info entries, sorts the entries by term, writes them
/// into a new <see cref="TokenInfoDictionaryWriter"/> and builds the FST that maps each
/// distinct term to an ordinal.
/// </summary>
/// <remarks>
/// Progress messages are written to <see cref="Console"/>. Lines with fewer than 13 CSV
/// columns are reported and skipped. When <c>normalizeEntries</c> is set, an additional
/// NFKC-normalized copy is added for every entry whose term is not already normalized.
/// The method reads and updates the <c>offset</c> member declared outside this method.
/// </remarks>
/// <param name="csvFiles">Paths of the CSV files to read, processed in list order.</param>
/// <returns>The populated dictionary writer, with its FST already set.</returns>
public virtual TokenInfoDictionaryWriter BuildDictionary(IList <string> csvFiles)
{
    TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024);

    // all lines in the file
    Console.WriteLine(" parse...");
    List<string[]> lines = new List<string[]>(400000);
    foreach (string file in csvFiles)
    {
        // Both the stream and the reader are disposed; the original left the reader undisposed.
        using (Stream inputStream = new FileStream(file, FileMode.Open, FileAccess.Read))
        using (TextReader reader = new StreamReader(inputStream, Encoding.GetEncoding(encoding)))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                string[] entry = CSVUtil.Parse(line);

                if (entry.Length < 13)
                {
                    Console.WriteLine("Entry in CSV is not valid: " + line);
                    continue;
                }

                string[] formatted = FormatEntry(entry);
                lines.Add(formatted);

                // NFKC normalize dictionary entry
                if (normalizeEntries)
                {
                    if (entry[0].IsNormalized(NormalizationForm.FormKC))
                    {
                        continue;
                    }

                    string[] normalizedEntry = new string[entry.Length];
                    for (int i = 0; i < entry.Length; i++)
                    {
                        normalizedEntry[i] = entry[i].Normalize(NormalizationForm.FormKC);
                    }

                    formatted = FormatEntry(normalizedEntry);
                    lines.Add(formatted);
                }
            }
        }
    }

    Console.WriteLine(" sort...");

    // sort by term: we sorted the files already and use a stable sort.
    // List<T>.Sort is NOT stable, so entries sharing a term could be reordered relative to
    // their file order. Enumerable.OrderBy is documented as stable, so use it instead
    // (fully qualified so no additional using directive is required).
    IEnumerable<string[]> stablySorted =
        System.Linq.Enumerable.OrderBy<string[], string[]>(lines, e => e, new ComparerAnonymousHelper());
    lines = new List<string[]>(stablySorted);

    Console.WriteLine(" encode...");

    PositiveInt32Outputs fstOutput = PositiveInt32Outputs.Singleton;
    Builder<long?> fstBuilder = new Builder<long?>(Lucene.Net.Util.Fst.FST.INPUT_TYPE.BYTE2, 0, 0, true, true, int.MaxValue, fstOutput, null, true, PackedInt32s.DEFAULT, true, 15);
    Int32sRef scratch = new Int32sRef();
    long ord = -1; // first ord will be 0
    string lastValue = null;

    // build tokeninfo dictionary
    foreach (string[] entry in lines)
    {
        int next = dictionary.Put(entry);

        // Put returning the unchanged offset means nothing was written for this entry.
        if (next == offset)
        {
            Console.WriteLine("Failed to process line: " + Collections.ToString(entry));
            continue;
        }

        string token = entry[0];
        if (!token.Equals(lastValue, StringComparison.Ordinal))
        {
            // new word to add to fst
            ord++;
            lastValue = token;
            scratch.Grow(token.Length);
            scratch.Length = token.Length;
            // One UTF-16 code unit per FST input label.
            for (int i = 0; i < token.Length; i++)
            {
                scratch.Int32s[i] = (int)token[i];
            }
            fstBuilder.Add(scratch, ord);
        }

        // Every entry for the same term maps to that term's ordinal.
        dictionary.AddMapping((int)ord, offset);
        offset = next;
    }

    FST<long?> fst = fstBuilder.Finish();

    Console.WriteLine(" " + fst.NodeCount + " nodes, " + fst.ArcCount + " arcs, " + fst.GetSizeInBytes() + " bytes... ");
    dictionary.SetFST(fst);
    Console.WriteLine(" done");

    return dictionary;
}