/// <summary>
/// Trims leading and trailing bytes from every segment, using the strip table selected by
/// <paramref name="Options"/>.
/// </summary>
/// <param name="Input">Segments to trim. Each one is read through its backing array, offset and count.</param>
/// <param name="Options">
/// Chooses the strip table: whitespace and punctuation, whitespace only, or punctuation only
/// (checked in that order). When none of these flags is set the input is passed through unchanged.
/// </param>
/// <returns>
/// A lazily evaluated sequence of trimmed segments that point into the same backing arrays as the input.
/// Segments that consist only of strippable bytes, or that shrink to a single byte, are not emitted.
/// </returns>
/// <exception cref="InvalidOperationException">
/// The selected lookup table is null or has fewer than 256 entries. Because this method is an iterator,
/// the exception surfaces when the result is first enumerated.
/// </exception>
private static IEnumerable<ArraySegment<byte>> CalculateSegmentStrip(IEnumerable<ArraySegment<byte>> Input, InputOptions Options)
{
    bool[] lookup;
    if (Options.HasFlag(InputOptions.StripWhitespaceAndPunctuation))
    {
        lookup = WhitespacePunctuationCharLookup;
    }
    else if (Options.HasFlag(InputOptions.StripWhitespace))
    {
        lookup = WhitespaceCharLookup;
    }
    else if (Options.HasFlag(InputOptions.StripPunctuation))
    {
        lookup = PunctuationCharLookup;
    }
    else
    {
        // No strip flag requested: hand the segments through untouched.
        foreach (var segment in Input)
        {
            yield return segment;
        }

        yield break;
    }

    // The table is indexed by raw byte value, so it must cover all 256 values.
    if (lookup == null || lookup.Length < 256)
    {
        throw new InvalidOperationException("Lookup should never be null");
    }

    foreach (var segment in Input)
    {
        var buffer = segment.Array;
        var first = segment.Offset;
        var last = first + segment.Count - 1;

        // trim start
        while (first <= last && lookup[buffer[first]])
        {
            first++;
        }

        if (first > last)
        {
            // Empty segment, or every byte was strippable.
            continue;
        }

        // trim end: buffer[first] is known to be kept, so the scan stops at 'first' at the latest.
        while (last > first && lookup[buffer[last]])
        {
            last--;
        }

        // 'last' is an inclusive index, hence the + 1. Exactly one segment is produced per input
        // segment. A lone surviving byte is skipped, as the original loop bound (end > offset) did
        // and as CalculateSegments does for encoded strings of length <= 1.
        var count = last - first + 1;
        if (count > 1)
        {
            yield return new ArraySegment<byte>(buffer, first, count);
        }
    }
}
/// <summary>
/// Applies the split and strip steps requested by <paramref name="Options"/> to the raw segments and
/// then forwards the result to the overload that takes no options.
/// </summary>
/// <param name="Input">Raw byte segments.</param>
/// <param name="Options">Flags deciding whether segments are split and/or stripped first.</param>
/// <param name="MaximumLeadingBits">Forwarded unchanged to the model generator.</param>
/// <param name="MaxSuccessorBits">Forwarded unchanged to the model generator.</param>
/// <param name="EncodingTypes">Forwarded unchanged to the model generator.</param>
/// <param name="OptimizeEncoding">Forwarded unchanged to the model generator.</param>
/// <returns>The model produced by the options-free overload.</returns>
private static ShocoModel GenerateModelFromSegments(IEnumerable<ArraySegment<byte>> Input, InputOptions Options, int MaximumLeadingBits, int MaxSuccessorBits, int EncodingTypes, bool OptimizeEncoding)
{
    bool wantsSplit = Options.HasFlag(InputOptions.SplitNewLine)
        || Options.HasFlag(InputOptions.SplitWhitespaceAndNewLine);
    bool wantsStrip = Options.HasFlag(InputOptions.StripPunctuation)
        || Options.HasFlag(InputOptions.StripWhitespace);

    IEnumerable<ArraySegment<byte>> segments = Input;

    // Splitting runs before stripping so that each split piece is trimmed on its own.
    if (wantsSplit)
    {
        segments = CalculateSegmentSplit(segments, Options);
    }

    if (wantsStrip)
    {
        segments = CalculateSegmentStrip(segments, Options);
    }

    return GenerateModelFromSegments(segments, MaximumLeadingBits, MaxSuccessorBits, EncodingTypes, OptimizeEncoding);
}
/// <summary>
/// Converts strings into byte segments: optionally splits them, optionally trims them, then encodes
/// each remaining piece with <paramref name="InputEncoding"/>.
/// </summary>
/// <param name="Input">Source strings.</param>
/// <param name="InputEncoding">Encoding used to turn each piece into bytes.</param>
/// <param name="Options">Split and strip flags; the combined whitespace flags are checked before the narrower ones.</param>
/// <returns>
/// A lazily evaluated sequence of segments. Pieces that encode to one byte or fewer are skipped.
/// Every segment wraps the current scratch array, which is overwritten by the next piece, so a
/// segment must be consumed before the enumerator advances.
/// </returns>
private static IEnumerable<ArraySegment<byte>> CalculateSegments(IEnumerable<string> Input, Encoding InputEncoding, InputOptions Options)
{
    IEnumerable<string> pieces = Input;

    // Step 1: split.
    if (Options.HasFlag(InputOptions.SplitWhitespaceAndNewLine))
    {
        pieces = pieces.SelectMany(text => text.Split(WhitespaceChars, StringSplitOptions.RemoveEmptyEntries));
    }
    else if (Options.HasFlag(InputOptions.SplitNewLine))
    {
        pieces = pieces.SelectMany(text => text.Split(NewLineChars, StringSplitOptions.RemoveEmptyEntries));
    }

    // Step 2: trim.
    if (Options.HasFlag(InputOptions.StripWhitespaceAndPunctuation))
    {
        pieces = pieces.Select(text => text.Trim(WhitespacePunctuationChars));
    }
    else if (Options.HasFlag(InputOptions.StripWhitespace))
    {
        pieces = pieces.Select(text => text.Trim(WhitespaceChars));
    }
    else if (Options.HasFlag(InputOptions.StripPunctuation))
    {
        pieces = pieces.Select(text => text.Trim(PunctuationChars));
    }

    // Step 3: encode into a scratch array that is only replaced when a piece might not fit.
    byte[] scratch = new byte[2048];
    foreach (string piece in pieces)
    {
        int worstCase = InputEncoding.GetMaxByteCount(piece.Length);
        if (scratch.Length < worstCase)
        {
            scratch = new byte[worstCase];
        }

        int written = InputEncoding.GetBytes(piece, 0, piece.Length, scratch, 0);
        if (written > 1)
        {
            yield return new ArraySegment<byte>(scratch, 0, written);
        }
    }
}
/// <summary>
/// Splits every input segment at separator bytes and yields the non-empty pieces in between.
/// </summary>
/// <param name="Input">Segments to split. Each one is read through its backing array, offset and count.</param>
/// <param name="Options">
/// With <c>SplitWhitespaceAndNewLine</c> set, bytes flagged in <c>WhitespaceCharLookup</c> are separators.
/// Otherwise, with <c>SplitNewLine</c> set, only bytes 10 ('\n') and 13 ('\r') are separators.
/// With neither flag set the input is passed through unchanged.
/// </param>
/// <returns>
/// A lazily evaluated sequence of pieces that point into the same backing arrays as the input.
/// Separator bytes are never part of a piece, and empty pieces are not emitted.
/// </returns>
private static IEnumerable<ArraySegment<byte>> CalculateSegmentSplit(IEnumerable<ArraySegment<byte>> Input, InputOptions Options)
{
    // The whitespace flag is checked first and wins when both are set.
    bool splitOnWhitespace = Options.HasFlag(InputOptions.SplitWhitespaceAndNewLine);
    bool splitOnNewLine = Options.HasFlag(InputOptions.SplitNewLine);

    if (!splitOnWhitespace && !splitOnNewLine)
    {
        foreach (var segment in Input)
        {
            yield return segment;
        }

        yield break;
    }

    foreach (var segment in Input)
    {
        var buffer = segment.Array;
        var start = segment.Offset;

        // The segment is a window into 'buffer', so its end is relative to Offset, not to 0.
        var end = segment.Offset + segment.Count;

        for (var index = start; index < end; index++)
        {
            var value = buffer[index];
            var isSeparator = splitOnWhitespace
                ? WhitespaceCharLookup[value]
                : (value == 10 || value == 13); // '\n' or '\r'

            if (!isSeparator)
            {
                continue;
            }

            if (start < index)
            {
                yield return new ArraySegment<byte>(buffer, start, index - start);
            }

            // Always step past the separator, so leading and consecutive separators
            // do not end up inside the next piece.
            start = index + 1;
        }

        // Trailing piece after the last separator (or the whole segment when none was found).
        if (start < end)
        {
            yield return new ArraySegment<byte>(buffer, start, end - start);
        }
    }
}
/// <summary>
/// Reads every stream to its end and cuts the bytes into segments: split bytes end a segment, and
/// strip bytes are trimmed from both ends of each segment.
/// </summary>
/// <param name="Input">Streams to read. They are read sequentially and are not disposed here.</param>
/// <param name="Options">
/// Selects the split table (whitespace and newline, or newline only) and the strip table (whitespace and
/// punctuation, whitespace only, or punctuation only). A table left unselected defaults to
/// <c>FalseCharLookup</c>.
/// </param>
/// <returns>
/// A lazily evaluated sequence of segments. Segments that trim down to one byte or fewer are skipped.
/// Every segment wraps the shared scratch array, which is refilled for the next segment, so a segment
/// must be consumed before the enumerator advances.
/// </returns>
private static IEnumerable<ArraySegment<byte>> CalculateSegments(IEnumerable<Stream> Input, InputOptions Options)
{
    var segment = new byte[128];
    var buffer = new byte[2048];
    var splitLookup = FalseCharLookup;
    var stripLookup = FalseCharLookup;

    if (Options.HasFlag(InputOptions.SplitWhitespaceAndNewLine))
    {
        splitLookup = WhitespaceCharLookup;
    }
    else if (Options.HasFlag(InputOptions.SplitNewLine))
    {
        splitLookup = NewLineCharLookup;
    }

    if (Options.HasFlag(InputOptions.StripWhitespaceAndPunctuation))
    {
        stripLookup = WhitespacePunctuationCharLookup;
    }
    else if (Options.HasFlag(InputOptions.StripWhitespace))
    {
        stripLookup = WhitespaceCharLookup;
    }
    else if (Options.HasFlag(InputOptions.StripPunctuation))
    {
        stripLookup = PunctuationCharLookup;
    }

    foreach (var stream in Input)
    {
        // Number of bytes collected for the segment currently being built.
        int segmentIndex = 0;
        var bufferLength = stream.Read(buffer, 0, buffer.Length);
        while (bufferLength > 0)
        {
            for (int index = 0; index < bufferLength; index++)
            {
                var c = buffer[index];
                if (splitLookup[c])
                {
                    // A split byte closes the current segment; the split byte itself is dropped.
                    var length = GetTrimmedSegmentLength(segment, segmentIndex, stripLookup);
                    segmentIndex = 0;
                    if (length > 1)
                    {
                        yield return new ArraySegment<byte>(segment, 0, length);
                    }

                    continue;
                }

                if (segmentIndex == 0 && stripLookup[c])
                {
                    continue; // trim start
                }

                if (segment.Length <= segmentIndex)
                {
                    Array.Resize(ref segment, segment.Length * 2);
                }

                segment[segmentIndex++] = c;
            }

            bufferLength = stream.Read(buffer, 0, buffer.Length);
        }

        // End of stream: flush the pending segment with the same trim and length rules used at a
        // split byte. 'segmentIndex' is a count here, so it must not be incremented before use.
        var remaining = GetTrimmedSegmentLength(segment, segmentIndex, stripLookup);
        if (remaining > 1)
        {
            yield return new ArraySegment<byte>(segment, 0, remaining);
        }
    }
}

/// <summary>
/// Returns the length of the collected segment after dropping trailing strip bytes. The byte at
/// index 0 is never examined because it was already accepted by the leading trim.
/// </summary>
private static int GetTrimmedSegmentLength(byte[] segment, int length, bool[] stripLookup)
{
    var last = length - 1;
    while (last > 0 && stripLookup[segment[last]])
    {
        last--;
    }

    return last + 1;
}