Exemplo n.º 1
0
        /// <summary>
        /// Trims leading and trailing bytes flagged by the strip lookup selected through
        /// <paramref name="Options"/> from every input segment.
        /// </summary>
        /// <param name="Input">Segments to trim; the underlying arrays are not copied or modified.</param>
        /// <param name="Options">
        /// Selects the lookup table: whitespace and punctuation, whitespace only, or punctuation only.
        /// When none of the strip flags is set the input is passed through unchanged.
        /// </param>
        /// <returns>
        /// At most one trimmed segment per input segment. Segments that are empty or a single byte
        /// long after trimming are dropped.
        /// </returns>
        /// <exception cref="InvalidOperationException">
        /// The selected lookup table is null or has fewer than 256 entries (raised on enumeration).
        /// </exception>
        private static IEnumerable <ArraySegment <byte> > CalculateSegmentStrip(IEnumerable <ArraySegment <byte> > Input, InputOptions Options)
        {
            bool[] lookup;
            if (Options.HasFlag(InputOptions.StripWhitespaceAndPunctuation))
            {
                lookup = WhitespacePunctuationCharLookup;
            }
            else if (Options.HasFlag(InputOptions.StripWhitespace))
            {
                lookup = WhitespaceCharLookup;
            }
            else if (Options.HasFlag(InputOptions.StripPunctuation))
            {
                lookup = PunctuationCharLookup;
            }
            else
            {
                // Nothing to strip: forward the segments untouched.
                foreach (var segment in Input)
                {
                    yield return segment;
                }
                yield break;
            }

            // The table is indexed by raw byte value, so it must cover 0..255.
            if (lookup == null || lookup.Length < 256)
            {
                throw new InvalidOperationException("Lookup should never be null");
            }

            foreach (var segment in Input)
            {
                var buffer = segment.Array;
                var first  = segment.Offset;
                var last   = first + segment.Count - 1; // inclusive index of the final byte

                // Trim start: skip leading strip bytes. The bounds check short-circuits before
                // the buffer is touched, so an empty segment is safe here.
                while (first <= last && lookup[buffer[first]])
                {
                    first++;
                }

                // Trim end: skip trailing strip bytes, never crossing the first kept byte.
                while (last > first && lookup[buffer[last]])
                {
                    last--;
                }

                // Emit exactly one segment, and only when at least two bytes remain.
                // 'last' is inclusive, hence the + 1 in the count.
                if (last > first)
                {
                    yield return new ArraySegment <byte>(buffer, first, last - first + 1);
                }
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Runs the optional split and strip stages over <paramref name="Input"/> as requested by
        /// <paramref name="Options"/>, then delegates to the overload that takes no options.
        /// </summary>
        /// <param name="Input">Raw byte segments to build the model from.</param>
        /// <param name="Options">Flags that enable the split stage and/or the strip stage.</param>
        /// <param name="MaximumLeadingBits">Forwarded unchanged to the delegated overload.</param>
        /// <param name="MaxSuccessorBits">Forwarded unchanged to the delegated overload.</param>
        /// <param name="EncodingTypes">Forwarded unchanged to the delegated overload.</param>
        /// <param name="OptimizeEncoding">Forwarded unchanged to the delegated overload.</param>
        /// <returns>The model produced by the delegated overload.</returns>
        private static ShocoModel GenerateModelFromSegments(IEnumerable <ArraySegment <byte> > Input, InputOptions Options, int MaximumLeadingBits, int MaxSuccessorBits, int EncodingTypes, bool OptimizeEncoding)
        {
            bool splitRequested = Options.HasFlag(InputOptions.SplitNewLine)
                                  || Options.HasFlag(InputOptions.SplitWhitespaceAndNewLine);
            bool stripRequested = Options.HasFlag(InputOptions.StripPunctuation)
                                  || Options.HasFlag(InputOptions.StripWhitespace);

            // Splitting always happens before stripping so each piece is trimmed individually.
            var segments = splitRequested ? CalculateSegmentSplit(Input, Options) : Input;

            if (stripRequested)
            {
                segments = CalculateSegmentStrip(segments, Options);
            }

            return GenerateModelFromSegments(segments, MaximumLeadingBits, MaxSuccessorBits, EncodingTypes, OptimizeEncoding);
        }
Exemplo n.º 3
0
        /// <summary>
        /// Splits and trims the input strings as requested by <paramref name="Options"/>, encodes
        /// each resulting piece with <paramref name="InputEncoding"/> and yields the encoded bytes.
        /// </summary>
        /// <param name="Input">Strings to convert into byte segments.</param>
        /// <param name="InputEncoding">Encoding used to turn each piece into bytes.</param>
        /// <param name="Options">Split and strip flags applied before encoding.</param>
        /// <returns>
        /// One segment per piece that encodes to more than one byte. Every yielded segment is a view
        /// over the same scratch array (replaced only when a piece needs more room), so its content
        /// is overwritten once enumeration advances.
        /// </returns>
        private static IEnumerable <ArraySegment <byte> > CalculateSegments(IEnumerable <string> Input, Encoding InputEncoding, InputOptions Options)
        {
            var pieces = Input;

            // Stage 1: optional splitting; the whitespace variant takes precedence.
            if (Options.HasFlag(InputOptions.SplitWhitespaceAndNewLine))
            {
                pieces = pieces.SelectMany(text => text.Split(WhitespaceChars, StringSplitOptions.RemoveEmptyEntries));
            }
            else if (Options.HasFlag(InputOptions.SplitNewLine))
            {
                pieces = pieces.SelectMany(text => text.Split(NewLineChars, StringSplitOptions.RemoveEmptyEntries));
            }

            // Stage 2: optional trimming; the combined flag is checked first.
            if (Options.HasFlag(InputOptions.StripWhitespaceAndPunctuation))
            {
                pieces = pieces.Select(text => text.Trim(WhitespacePunctuationChars));
            }
            else if (Options.HasFlag(InputOptions.StripWhitespace))
            {
                pieces = pieces.Select(text => text.Trim(WhitespaceChars));
            }
            else if (Options.HasFlag(InputOptions.StripPunctuation))
            {
                pieces = pieces.Select(text => text.Trim(PunctuationChars));
            }

            // Stage 3: encode into a scratch array that grows on demand.
            var scratch = new byte[2048];

            foreach (var piece in pieces)
            {
                var required = InputEncoding.GetMaxByteCount(piece.Length);
                if (scratch.Length < required)
                {
                    scratch = new byte[required];
                }

                var byteCount = InputEncoding.GetBytes(piece, 0, piece.Length, scratch, 0);

                // Pieces that encode to zero or one byte are skipped.
                if (byteCount > 1)
                {
                    yield return new ArraySegment <byte>(scratch, 0, byteCount);
                }
            }
        }
Exemplo n.º 4
0
        /// <summary>
        /// Splits every input segment at delimiter bytes and yields the non-empty pieces between them.
        /// </summary>
        /// <param name="Input">Segments to split; the pieces are views over the same arrays.</param>
        /// <param name="Options">
        /// With <c>SplitWhitespaceAndNewLine</c> the delimiters are the bytes flagged in
        /// <c>WhitespaceCharLookup</c>; otherwise with <c>SplitNewLine</c> they are byte 10 ('\n')
        /// and byte 13 ('\r'). With neither flag the input is passed through unchanged.
        /// </param>
        /// <returns>The pieces in input order; delimiter bytes are never part of a piece.</returns>
        private static IEnumerable <ArraySegment <byte> > CalculateSegmentSplit(IEnumerable <ArraySegment <byte> > Input, InputOptions Options)
        {
            // The whitespace variant takes precedence when both flags are present.
            bool splitOnWhitespace = Options.HasFlag(InputOptions.SplitWhitespaceAndNewLine);
            bool splitOnNewLine    = Options.HasFlag(InputOptions.SplitNewLine);

            if (!splitOnWhitespace && !splitOnNewLine)
            {
                foreach (var segment in Input)
                {
                    yield return segment;
                }
                yield break;
            }

            foreach (var segment in Input)
            {
                var buffer = segment.Array;
                var start  = segment.Offset;        // first byte of the piece being collected
                var end    = start + segment.Count; // exclusive bound; honours a non-zero Offset

                for (var index = start; index < end; index++)
                {
                    var value = buffer[index];
                    bool isDelimiter = splitOnWhitespace
                        ? WhitespaceCharLookup[value]
                        : (value == 10 || value == 13); // '\n' or '\r'

                    if (!isDelimiter)
                    {
                        continue;
                    }

                    if (start < index)
                    {
                        yield return new ArraySegment <byte>(buffer, start, index - start);
                    }

                    // Always step past the delimiter, even when no piece was emitted, so that
                    // leading or consecutive delimiters never leak into the next piece.
                    start = index + 1;
                }

                // Trailing piece after the last delimiter (or the whole segment if none was found).
                if (start < end)
                {
                    yield return new ArraySegment <byte>(buffer, start, end - start);
                }
            }
        }
Exemplo n.º 5
0
        /// <summary>
        /// Reads every stream to its end and yields byte segments, splitting at delimiter bytes and
        /// trimming strip bytes from both ends as requested by <paramref name="Options"/>.
        /// </summary>
        /// <param name="Input">Streams to read; they are neither closed nor disposed here.</param>
        /// <param name="Options">
        /// Split flags select the delimiter lookup and strip flags select the trim lookup; when a
        /// group has no flag set, <c>FalseCharLookup</c> is used for it.
        /// </param>
        /// <returns>
        /// Segments longer than one byte after trimming. Every yielded segment is a view over a
        /// shared scratch array that is overwritten as enumeration continues, so consume or copy a
        /// segment before advancing. A segment never spans two streams.
        /// </returns>
        private static IEnumerable <ArraySegment <byte> > CalculateSegments(IEnumerable <Stream> Input, InputOptions Options)
        {
            var segment     = new byte[128];
            var buffer      = new byte[2048];
            var splitLookup = FalseCharLookup;
            var stripLookup = FalseCharLookup;

            if (Options.HasFlag(InputOptions.SplitWhitespaceAndNewLine))
            {
                splitLookup = WhitespaceCharLookup;
            }
            else if (Options.HasFlag(InputOptions.SplitNewLine))
            {
                splitLookup = NewLineCharLookup;
            }

            if (Options.HasFlag(InputOptions.StripWhitespaceAndPunctuation))
            {
                stripLookup = WhitespacePunctuationCharLookup;
            }
            else if (Options.HasFlag(InputOptions.StripWhitespace))
            {
                stripLookup = WhitespaceCharLookup;
            }
            else if (Options.HasFlag(InputOptions.StripPunctuation))
            {
                stripLookup = PunctuationCharLookup;
            }

            foreach (var stream in Input)
            {
                int segmentLength = 0; // number of bytes collected for the current segment
                int bytesRead;

                while ((bytesRead = stream.Read(buffer, 0, buffer.Length)) > 0)
                {
                    for (int index = 0; index < bytesRead; index++)
                    {
                        var c = buffer[index];

                        if (splitLookup[c])
                        {
                            // Delimiter: finish the current segment and start a new one.
                            var trimmedLength = GetTrimmedSegmentLength(segment, segmentLength, stripLookup);
                            segmentLength = 0;

                            if (trimmedLength > 1)
                            {
                                yield return new ArraySegment <byte>(segment, 0, trimmedLength);
                            }
                            continue;
                        }

                        if (segmentLength == 0 && stripLookup[c])
                        {
                            continue; // trim start
                        }

                        if (segment.Length <= segmentLength)
                        {
                            Array.Resize(ref segment, segment.Length * 2);
                        }

                        segment[segmentLength++] = c;
                    }
                }

                // End of stream: flush the pending segment with the same trimming and minimum
                // length rule as at a delimiter. segmentLength is a count, so it must not be
                // incremented again before being used as the segment size.
                var finalLength = GetTrimmedSegmentLength(segment, segmentLength, stripLookup);
                if (finalLength > 1)
                {
                    yield return new ArraySegment <byte>(segment, 0, finalLength);
                }
            }
        }

        /// <summary>
        /// Returns the length of the collected segment once trailing strip bytes are removed.
        /// The first byte is never examined, so the result is at least 1 for a non-empty segment.
        /// </summary>
        private static int GetTrimmedSegmentLength(byte[] Segment, int Length, bool[] StripLookup)
        {
            var last = Length - 1;
            while (last > 0 && StripLookup[Segment[last]])
            {
                last--;
            }
            return last + 1;
        }