Пример #1
0
        /// <summary>
        /// Decodes the Unicode codepoints from the provided
        /// <see cref="ICharSequence"/> and places them in the provided scratch
        /// <see cref="Int32sRef"/>, which must not be <c>null</c>, returning it.
        /// </summary>
        public static Int32sRef ToUTF32(string s, Int32sRef scratch)
        {
            int charIdx   = 0;
            int intIdx    = 0;
            int charLimit = s.Length;

            while (charIdx < charLimit)
            {
                scratch.Grow(intIdx + 1);
                int utf32 = Character.CodePointAt(s, charIdx);
                scratch.Int32s[intIdx] = utf32;
                charIdx += Character.CharCount(utf32);
                intIdx++;
            }
            scratch.Length = intIdx;
            return(scratch);
        }
Пример #2
0
        /// <summary>
        /// Decodes the Unicode codepoints from the provided
        /// <see cref="T:char[]"/> and places them in the provided scratch
        /// <see cref="Int32sRef"/>, which must not be <c>null</c>, returning it.
        /// </summary>
        public static Int32sRef ToUTF32(char[] s, int offset, int length, Int32sRef scratch)
        {
            int charIdx   = offset;
            int intIdx    = 0;
            int charLimit = offset + length;

            while (charIdx < charLimit)
            {
                scratch.Grow(intIdx + 1);
                int utf32 = Character.CodePointAt(s, charIdx, charLimit);
                scratch.Int32s[intIdx] = utf32;
                charIdx += Character.CharCount(utf32);
                intIdx++;
            }
            scratch.Length = intIdx;
            return(scratch);
        }
Пример #3
0
        private T RandomAcceptedWord(FST <T> fst, Int32sRef @in)
        {
            FST.Arc <T> arc = fst.GetFirstArc(new FST.Arc <T>());

            IList <FST.Arc <T> > arcs = new List <FST.Arc <T> >();

            @in.Length = 0;
            @in.Offset = 0;
            T NO_OUTPUT = fst.Outputs.NoOutput;
            T output    = NO_OUTPUT;

            FST.BytesReader fstReader = fst.GetBytesReader();

            while (true)
            {
                // read all arcs:
                fst.ReadFirstTargetArc(arc, arc, fstReader);
                arcs.Add((new FST.Arc <T>()).CopyFrom(arc));
                while (!arc.IsLast)
                {
                    fst.ReadNextArc(arc, fstReader);
                    arcs.Add((new FST.Arc <T>()).CopyFrom(arc));
                }

                // pick one
                arc = arcs[Random.Next(arcs.Count)];
                arcs.Clear();

                // accumulate output
                output = fst.Outputs.Add(output, arc.Output);

                // append label
                if (arc.Label == FST.END_LABEL)
                {
                    break;
                }

                if (@in.Int32s.Length == @in.Length)
                {
                    @in.Grow(1 + @in.Length);
                }
                @in.Int32s[@in.Length++] = arc.Label;
            }

            return(output);
        }
Пример #4
0
        internal static Int32sRef ToInt32sRefUTF32(string s, Int32sRef ir)
        {
            int charLength = s.Length;
            int charIdx    = 0;
            int intIdx     = 0;

            while (charIdx < charLength)
            {
                if (intIdx == ir.Int32s.Length)
                {
                    ir.Grow(intIdx + 1);
                }
                int utf32 = Character.CodePointAt(s, charIdx);
                ir.Int32s[intIdx] = utf32;
                charIdx          += Character.CharCount(utf32);
                intIdx++;
            }
            ir.Length = intIdx;
            return(ir);
        }
Пример #5
0
        /// <summary>
        /// Expert: like <see cref="Util.GetByOutput(FST{long?}, long)"/> except reusing
        /// <see cref="FST.BytesReader"/>, initial and scratch Arc, and result.
        /// </summary>
        public static Int32sRef GetByOutput(FST <long?> fst, long targetOutput, FST.BytesReader @in, FST.Arc <long?> arc, FST.Arc <long?> scratchArc, Int32sRef result)
        {
            long output = arc.Output.Value;
            int  upto   = 0;

            //System.out.println("reverseLookup output=" + targetOutput);

            while (true)
            {
                //System.out.println("loop: output=" + output + " upto=" + upto + " arc=" + arc);
                if (arc.IsFinal)
                {
                    long finalOutput = output + arc.NextFinalOutput.Value;
                    //System.out.println("  isFinal finalOutput=" + finalOutput);
                    if (finalOutput == targetOutput)
                    {
                        result.Length = upto;
                        //System.out.println("    found!");
                        return(result);
                    }
                    else if (finalOutput > targetOutput)
                    {
                        //System.out.println("    not found!");
                        return(null);
                    }
                }

                if (FST <long?> .TargetHasArcs(arc))
                {
                    //System.out.println("  targetHasArcs");
                    if (result.Int32s.Length == upto)
                    {
                        result.Grow(1 + upto);
                    }

                    fst.ReadFirstRealTargetArc(arc.Target, arc, @in);

                    if (arc.BytesPerArc != 0)
                    {
                        int low  = 0;
                        int high = arc.NumArcs - 1;
                        int mid  = 0;
                        //System.out.println("bsearch: numArcs=" + arc.numArcs + " target=" + targetOutput + " output=" + output);
                        bool exact = false;
                        while (low <= high)
                        {
                            mid          = (int)((uint)(low + high) >> 1);
                            @in.Position = arc.PosArcsStart;
                            @in.SkipBytes(arc.BytesPerArc * mid);
                            var flags = (sbyte)@in.ReadByte();
                            fst.ReadLabel(@in);
                            long minArcOutput;
                            if ((flags & FST.BIT_ARC_HAS_OUTPUT) != 0)
                            {
                                long arcOutput = fst.Outputs.Read(@in).Value;
                                minArcOutput = output + arcOutput;
                            }
                            else
                            {
                                minArcOutput = output;
                            }
                            if (minArcOutput == targetOutput)
                            {
                                exact = true;
                                break;
                            }
                            else if (minArcOutput < targetOutput)
                            {
                                low = mid + 1;
                            }
                            else
                            {
                                high = mid - 1;
                            }
                        }

                        if (high == -1)
                        {
                            return(null);
                        }
                        else if (exact)
                        {
                            arc.ArcIdx = mid - 1;
                        }
                        else
                        {
                            arc.ArcIdx = low - 2;
                        }

                        fst.ReadNextRealArc(arc, @in);
                        result.Int32s[upto++] = arc.Label;
                        output += arc.Output.Value;
                    }
                    else
                    {
                        FST.Arc <long?> prevArc = null;

                        while (true)
                        {
                            //System.out.println("    cycle label=" + arc.label + " output=" + arc.output);

                            // this is the min output we'd hit if we follow
                            // this arc:
                            long minArcOutput = output + arc.Output.Value;

                            if (minArcOutput == targetOutput)
                            {
                                // Recurse on this arc:
                                //System.out.println("  match!  break");
                                output = minArcOutput;
                                result.Int32s[upto++] = arc.Label;
                                break;
                            }
                            else if (minArcOutput > targetOutput)
                            {
                                if (prevArc == null)
                                {
                                    // Output doesn't exist
                                    return(null);
                                }
                                else
                                {
                                    // Recurse on previous arc:
                                    arc.CopyFrom(prevArc);
                                    result.Int32s[upto++] = arc.Label;
                                    output += arc.Output.Value;
                                    //System.out.println("    recurse prev label=" + (char) arc.label + " output=" + output);
                                    break;
                                }
                            }
                            else if (arc.IsLast)
                            {
                                // Recurse on this arc:
                                output = minArcOutput;
                                //System.out.println("    recurse last label=" + (char) arc.label + " output=" + output);
                                result.Int32s[upto++] = arc.Label;
                                break;
                            }
                            else
                            {
                                // Read next arc in this node:
                                prevArc = scratchArc;
                                prevArc.CopyFrom(arc);
                                //System.out.println("      after copy label=" + (char) prevArc.label + " vs " + (char) arc.label);
                                fst.ReadNextRealArc(arc, @in);
                            }
                        }
                    }
                }
                else
                {
                    //System.out.println("  no target arcs; not found!");
                    return(null);
                }
            }
        }
Пример #6
0
        private void ProcessFacetFields(ITaxonomyWriter taxoWriter, IDictionary <string, IList <FacetField> > byField, Document doc)
        {
            foreach (KeyValuePair <string, IList <FacetField> > ent in byField)
            {
                string indexFieldName = ent.Key;
                //System.out.println("  indexFieldName=" + indexFieldName + " fields=" + ent.getValue());

                Int32sRef ordinals = new Int32sRef(32);
                foreach (FacetField facetField in ent.Value)
                {
                    FacetsConfig.DimConfig ft = GetDimConfig(facetField.Dim);
                    if (facetField.Path.Length > 1 && ft.IsHierarchical == false)
                    {
                        throw new ArgumentException("dimension \"" + facetField.Dim + "\" is not hierarchical yet has " + facetField.Path.Length + " components");
                    }

                    FacetLabel cp = new FacetLabel(facetField.Dim, facetField.Path);

                    CheckTaxoWriter(taxoWriter);
                    int ordinal = taxoWriter.AddCategory(cp);
                    if (ordinals.Length == ordinals.Int32s.Length)
                    {
                        ordinals.Grow(ordinals.Length + 1);
                    }
                    ordinals.Int32s[ordinals.Length++] = ordinal;
                    //System.out.println("ords[" + (ordinals.length-1) + "]=" + ordinal);
                    //System.out.println("  add cp=" + cp);

                    if (ft.IsMultiValued && (ft.IsHierarchical || ft.RequireDimCount))
                    {
                        //System.out.println("  add parents");
                        // Add all parents too:
                        int parent = taxoWriter.GetParent(ordinal);
                        while (parent > 0)
                        {
                            if (ordinals.Int32s.Length == ordinals.Length)
                            {
                                ordinals.Grow(ordinals.Length + 1);
                            }
                            ordinals.Int32s[ordinals.Length++] = parent;
                            parent = taxoWriter.GetParent(parent);
                        }

                        if (ft.RequireDimCount == false)
                        {
                            // Remove last (dimension) ord:
                            ordinals.Length--;
                        }
                    }

                    // Drill down:
                    for (int i = 1; i <= cp.Length; i++)
                    {
                        doc.Add(new StringField(indexFieldName, PathToString(cp.Components, i), Field.Store.NO));
                    }
                }

                // Facet counts:
                // DocValues are considered stored fields:
                doc.Add(new BinaryDocValuesField(indexFieldName, DedupAndEncode(ordinals)));
            }
        }
Пример #7
0
        public UserDictionary(TextReader reader)
        {
            string line   = null;
            int    wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;

            JCG.List <string[]> featureEntries = new JCG.List <string[]>();

            // text, segmentation, readings, POS
            while ((line = reader.ReadLine()) != null)
            {
                // Remove comments
                line = specialChars.Replace(line, "");

                // Skip empty lines or comment lines
                if (line.Trim().Length == 0)
                {
                    continue;
                }
                string[] values = CSVUtil.Parse(line);
                featureEntries.Add(values);
            }

            // TODO: should we allow multiple segmentations per input 'phrase'?
            // the old treemap didn't support this either, and i'm not sure if its needed/useful?
            featureEntries.Sort(Comparer <string[]> .Create((left, right) => left[0].CompareToOrdinal(right[0])));

            JCG.List <string> data          = new JCG.List <string>(featureEntries.Count);
            JCG.List <int[]>  segmentations = new JCG.List <int[]>(featureEntries.Count);

            PositiveInt32Outputs fstOutput  = PositiveInt32Outputs.Singleton;
            Builder <Int64>      fstBuilder = new Builder <Int64>(Lucene.Net.Util.Fst.FST.INPUT_TYPE.BYTE2, fstOutput);
            Int32sRef            scratch    = new Int32sRef();
            long ord = 0;

            foreach (string[] values in featureEntries)
            {
                string[] segmentation = commentLine.Replace(values[1], " ").Split(' ').TrimEnd();
                string[] readings     = commentLine.Replace(values[2], " ").Split(' ').TrimEnd();
                string   pos          = values[3];

                if (segmentation.Length != readings.Length)
                {
                    throw RuntimeException.Create("Illegal user dictionary entry " + values[0] +
                                                  " - the number of segmentations (" + segmentation.Length + ")" +
                                                  " does not the match number of readings (" + readings.Length + ")");
                }

                int[] wordIdAndLength = new int[segmentation.Length + 1]; // wordId offset, length, length....
                wordIdAndLength[0] = wordId;
                for (int i = 0; i < segmentation.Length; i++)
                {
                    wordIdAndLength[i + 1] = segmentation[i].Length;
                    data.Add(readings[i] + Dictionary.INTERNAL_SEPARATOR + pos);
                    wordId++;
                }
                // add mapping to FST
                string token = values[0];
                scratch.Grow(token.Length);
                scratch.Length = token.Length;
                for (int i = 0; i < token.Length; i++)
                {
                    scratch.Int32s[i] = (int)token[i];
                }
                fstBuilder.Add(scratch, ord);
                segmentations.Add(wordIdAndLength);
                ord++;
            }
            this.fst           = new TokenInfoFST(fstBuilder.Finish(), false);
            this.data          = data.ToArray(/*new string[data.Count]*/);
            this.segmentations = segmentations.ToArray(/*new int[segmentations.Count][]*/);
        }
Пример #8
0
        public virtual TokenInfoDictionaryWriter BuildDictionary(IList <string> csvFiles)
        {
            TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024);

            // all lines in the file
            Console.WriteLine("  parse...");
            List <string[]> lines = new List <string[]>(400000);

            foreach (string file in csvFiles)
            {
                using (Stream inputStream = new FileStream(file, FileMode.Open, FileAccess.Read))
                {
                    Encoding   decoder = Encoding.GetEncoding(encoding);
                    TextReader reader  = new StreamReader(inputStream, decoder);

                    string line = null;
                    while ((line = reader.ReadLine()) != null)
                    {
                        string[] entry = CSVUtil.Parse(line);

                        if (entry.Length < 13)
                        {
                            Console.WriteLine("Entry in CSV is not valid: " + line);
                            continue;
                        }

                        string[] formatted = FormatEntry(entry);
                        lines.Add(formatted);

                        // NFKC normalize dictionary entry
                        if (normalizeEntries)
                        {
                            //if (normalizer.isNormalized(entry[0])){
                            if (entry[0].IsNormalized(NormalizationForm.FormKC))
                            {
                                continue;
                            }
                            string[] normalizedEntry = new string[entry.Length];
                            for (int i = 0; i < entry.Length; i++)
                            {
                                //normalizedEntry[i] = normalizer.normalize(entry[i]);
                                normalizedEntry[i] = entry[i].Normalize(NormalizationForm.FormKC);
                            }

                            formatted = FormatEntry(normalizedEntry);
                            lines.Add(formatted);
                        }
                    }
                }
            }

            Console.WriteLine("  sort...");

            // sort by term: we sorted the files already and use a stable sort.
            lines.Sort(new ComparerAnonymousHelper());

            Console.WriteLine("  encode...");

            PositiveInt32Outputs fstOutput  = PositiveInt32Outputs.Singleton;
            Builder <long?>      fstBuilder = new Builder <long?>(Lucene.Net.Util.Fst.FST.INPUT_TYPE.BYTE2, 0, 0, true, true, int.MaxValue, fstOutput, null, true, PackedInt32s.DEFAULT, true, 15);
            Int32sRef            scratch    = new Int32sRef();
            long   ord       = -1; // first ord will be 0
            string lastValue = null;

            // build tokeninfo dictionary
            foreach (string[] entry in lines)
            {
                int next = dictionary.Put(entry);

                if (next == offset)
                {
                    Console.WriteLine("Failed to process line: " + Collections.ToString(entry));
                    continue;
                }

                string token = entry[0];
                if (!token.Equals(lastValue, StringComparison.Ordinal))
                {
                    // new word to add to fst
                    ord++;
                    lastValue = token;
                    scratch.Grow(token.Length);
                    scratch.Length = token.Length;
                    for (int i = 0; i < token.Length; i++)
                    {
                        scratch.Int32s[i] = (int)token[i];
                    }
                    fstBuilder.Add(scratch, ord);
                }
                dictionary.AddMapping((int)ord, offset);
                offset = next;
            }

            FST <long?> fst = fstBuilder.Finish();

            Console.WriteLine("  " + fst.NodeCount + " nodes, " + fst.ArcCount + " arcs, " + fst.GetSizeInBytes() + " bytes...  ");
            dictionary.SetFST(fst);
            Console.WriteLine(" done");

            return(dictionary);
        }