/// <summary>
/// Sends the pattern, options and <paramref name="text"/> to the external client as a JSON
/// request ("m" command) and converts the JSON reply into match objects.
/// </summary>
/// <param name="text">The text to search.</param>
/// <param name="cnc">Cancellation handle forwarded to <c>ProcessUtilities.InvokeExe</c>.</param>
/// <returns>
/// The matches reported by the client, or <c>RegexMatches.Empty</c> when the client
/// invocation did not complete.
/// </returns>
/// <exception cref="Exception">The client wrote something to its standard error stream.</exception>
public RegexMatches Matches( string text, ICancellable cnc )
{
    var data = new { cmd = "m", text = text, pattern = Pattern, options = Options };
    string json = JsonSerializer.Serialize( data );

    string stdout_contents;
    string stderr_contents;

    if( !ProcessUtilities.InvokeExe( cnc, GetClientExePath( ), null, json, out stdout_contents, out stderr_contents, EncodingEnum.UTF8 ) )
    {
        // The invocation did not complete (e.g. it was cancelled), so there is no output to parse.
        return RegexMatches.Empty;
    }

    if( !string.IsNullOrWhiteSpace( stderr_contents ) )
    {
        throw new Exception( stderr_contents );
    }

    ClientMatch[] client_matches = JsonSerializer.Deserialize<ClientMatch[]>( stdout_contents );
    SimpleMatch[] matches = new SimpleMatch[client_matches.Length];

    // One text getter is shared by all matches.
    SimpleTextGetter text_getter = new SimpleTextGetter( text );

    for( int i = 0; i < client_matches.Length; i++ )
    {
        ClientMatch m = client_matches[i];
        SimpleMatch sm = SimpleMatch.Create( m.index, m.length, text_getter );

        foreach( var g in m.groups )
        {
            // Unnamed groups arrive with a null name; store them with an empty name.
            var sg = sm.AddGroup( g.index, g.length, g.success, g.name ?? string.Empty );

            foreach( var c in g.captures )
            {
                sg.AddCapture( c.index, c.length );
            }
        }

        matches[i] = sm;
    }

    return new RegexMatches( matches.Length, matches );
}
/// <summary>
/// Finds matches by sending a binary request to the ICU client process and decoding
/// its binary reply.
/// </summary>
/// <param name="text">The text to search.</param>
/// <param name="cnc">Cancellation handle forwarded to <c>ProcessUtilities.InvokeExe</c>.</param>
/// <returns>
/// The decoded matches, or <c>RegexMatches.Empty</c> when <c>InvokeExe</c> returns false.
/// </returns>
/// <exception cref="Exception">The client wrote something to its standard error stream.</exception>
public RegexMatches Matches(string text, ICancellable cnc)
{
    // A limit that cannot be parsed is sent as 0.
    int limit;
    if (!int.TryParse(Options.Limit, out limit))
    {
        limit = 0;
    }

    // Each option occupies one bit of the flag word, in this order (bit 0 first).
    bool[] flag_bits =
    {
        Options.UREGEX_CANON_EQ,
        Options.UREGEX_CASE_INSENSITIVE,
        Options.UREGEX_COMMENTS,
        Options.UREGEX_DOTALL,
        Options.UREGEX_LITERAL,
        Options.UREGEX_MULTILINE,
        Options.UREGEX_UNIX_LINES,
        Options.UREGEX_UWORD,
        Options.UREGEX_ERROR_ON_UNKNOWN_ESCAPES,
    };

    uint flags = 0;
    for (int bit = 0; bit < flag_bits.Length; ++bit)
    {
        if (flag_bits[bit])
        {
            flags |= 1u << bit;
        }
    }

    // The request: command, pattern, text, flags, limit.
    Action<BinaryWriter> write_request = writer =>
    {
        writer.Write("m");
        writer.Write(Pattern);
        writer.Write(text);
        writer.Write(flags);
        writer.Write(limit);
    };

#if DEBUG
    // For debugging: keep a copy of the request on disk.
    using (var fs = File.Create("debug-icu.dat"))
    using (var bw = new BinaryWriter(fs, Encoding.Unicode))
    {
        write_request(bw);
    }
#endif

    Action<Stream> stdinWriter = s =>
    {
        using (var bw = new BinaryWriter(s, Encoding.Unicode, leaveOpen: false))
        {
            write_request(bw);
        }
    };

    MemoryStream stdout_contents;
    string stderr_contents;

    if (!ProcessUtilities.InvokeExe(cnc, GetIcuClientExePath(), null, stdinWriter, out stdout_contents, out stderr_contents, EncodingEnum.Unicode))
    {
        return RegexMatches.Empty;
    }

    if (!string.IsNullOrWhiteSpace(stderr_contents))
    {
        throw new Exception(stderr_contents);
    }

    using (var br = new BinaryReader(stdout_contents, Encoding.Unicode))
    {
        // Group names: (number, name) pairs, terminated by a number that is not positive.
        var group_names = new Dictionary<int, string>();
        int group_number;
        while ((group_number = br.ReadInt32()) > 0)
        {
            group_names.Add(group_number, br.ReadString());
        }

        // Matches: each starts with its group count; a negative count ends the list.
        var matches = new List<IMatch>();
        ISimpleTextGetter stg = null;
        int group_count;
        while ((group_count = br.ReadInt32()) >= 0)
        {
            SimpleMatch match = null;

            for (int i = 0; i <= group_count; ++i)
            {
                int start = br.ReadInt32();
                bool success = start >= 0;

                // The end offset is only present in the stream for successful groups.
                int length = success ? br.ReadInt32() - start : 0;

                if (i == 0)
                {
                    Debug.Assert(success);
                    Debug.Assert(match == null);

                    stg = stg ?? new SimpleTextGetter(text);

                    match = SimpleMatch.Create(start, length, stg);
                    match.AddGroup(start, length, success, "0");
                    continue;
                }

                // Groups without a transmitted name are labelled with their number.
                string name;
                if (!group_names.TryGetValue(i, out name))
                {
                    name = i.ToString(CultureInfo.InvariantCulture);
                }

                match.AddGroup(start, length, success, name);
            }

            Debug.Assert(match != null);
            matches.Add(match);
        }

        return new RegexMatches(matches.Count, matches);
    }
}
/// <summary>
/// Finds matches by sending a JSON request to the Rust client and converting the
/// byte offsets in its reply into character offsets of <paramref name="text"/>.
/// </summary>
/// <param name="text">The text to search.</param>
/// <param name="cnc">Cancellation handle forwarded to <c>InvokeRustClient</c>.</param>
/// <returns>
/// The converted matches, or <c>RegexMatches.Empty</c> when <c>InvokeRustClient</c> returns false.
/// </returns>
/// <exception cref="Exception">The client wrote something to its standard error stream.</exception>
public RegexMatches Matches(string text, ICancellable cnc)
{
    // The reply offsets are converted by counting characters in this UTF-8 encoding of the text.
    byte[] text_utf8_bytes = Encoding.UTF8.GetBytes(text);

    string flag_letters = string.Concat(
        Options.case_insensitive ? "i" : "",
        Options.multi_line ? "m" : "",
        Options.dot_matches_new_line ? "s" : "",
        Options.swap_greed ? "U" : "",
        Options.ignore_whitespace ? "x" : "",
        Options.unicode ? "u" : "",
        Options.octal ? "O" : "");

    var request = new
    {
        s = Options.@struct,
        p = Pattern,
        t = text,
        o = flag_letters,
        sl = Options.size_limit?.Trim() ?? "",
        dsl = Options.dfa_size_limit?.Trim() ?? "",
        nl = Options.nest_limit?.Trim() ?? "",
    };

    string json = JsonSerializer.Serialize(request);

    string stdout_contents;
    string stderr_contents;

    if (!InvokeRustClient(cnc, json, out stdout_contents, out stderr_contents))
    {
        return RegexMatches.Empty;
    }

    if (!string.IsNullOrWhiteSpace(stderr_contents))
    {
        throw new Exception(stderr_contents);
    }

    var response = JsonSerializer.Deserialize<RustClientMatchesResponse>(stdout_contents);

    var matches = new List<IMatch>();
    ISimpleTextGetter stg = null;

    foreach (var m in response.matches)
    {
        SimpleMatch match = null;

        for (int group_index = 0; group_index < m.Length; group_index++)
        {
            // A two-element entry is [start, end] in bytes; anything else is a failed group.
            int[] span = m[group_index];
            bool success = span.Length == 2;

            int byte_start = 0;
            int byte_end = 0;
            if (success)
            {
                byte_start = span[0];
                byte_end = span[1];
            }

            int char_start = Encoding.UTF8.GetCharCount(text_utf8_bytes, 0, byte_start);
            int char_end = Encoding.UTF8.GetCharCount(text_utf8_bytes, 0, byte_end);
            int char_length = char_end - char_start;

            if (group_index == 0)
            {
                Debug.Assert(match == null);
                Debug.Assert(success);

                stg = stg ?? new SimpleTextGetter(text);

                match = SimpleMatch.Create(char_start, char_length, stg);
            }

            Debug.Assert(match != null);

            // Groups with a blank name are labelled with their index.
            string name = response.names[group_index];
            if (string.IsNullOrWhiteSpace(name))
            {
                name = group_index.ToString(CultureInfo.InvariantCulture);
            }

            if (!success)
            {
                match.AddGroup(0, 0, false, name);
            }
            else
            {
                match.AddGroup(char_start, char_length, true, name);
            }
        }

        Debug.Assert(match != null);
        matches.Add(match);
    }

    return new RegexMatches(matches.Count, matches);
}
/// <summary>
/// Finds matches by writing an "m" command (pattern, flags, text) to the JavaScript
/// client's standard input and converting its JSON reply into match objects.
/// </summary>
/// <param name="text">The text to search.</param>
/// <param name="cnc">Cancellation handle forwarded to <c>ProcessUtilities.InvokeExe</c>.</param>
/// <returns>
/// The converted matches, or <c>RegexMatches.Empty</c> when <c>InvokeExe</c> returns false.
/// </returns>
/// <exception cref="Exception">
/// The client wrote to standard error, its reply deserialized to null, or the reply
/// carries an error text.
/// </exception>
public RegexMatches Matches(string text, ICancellable cnc)
{
    string flags =
        (Options.i ? "i" : "") +
        (Options.m ? "m" : "") +
        (Options.s ? "s" : "") +
        (Options.u ? "u" : "");

    // Request layout: m "<pattern>" "<flags>" "<text>"
    Action<StreamWriter> stdin_writer = sw =>
    {
        sw.Write("m \"");
        WriteJavaScriptString(sw, Pattern);
        sw.Write("\" \"");
        sw.Write(flags);
        sw.Write("\" \"");
        WriteJavaScriptString(sw, text);
        sw.Write("\"");
    };

    string stdout_contents;
    string stderr_contents;

    bool completed = ProcessUtilities.InvokeExe(cnc, GetClientExePath(), "i", stdin_writer, out stdout_contents, out stderr_contents, EncodingEnum.UTF8);
    if (!completed)
    {
        // (cancelled)
        return RegexMatches.Empty;
    }

    if (!string.IsNullOrWhiteSpace(stderr_contents))
    {
        throw new Exception(stderr_contents);
    }

    ResponseMatches client_response = JsonSerializer.Deserialize<ResponseMatches>(stdout_contents);

    if (client_response == null)
    {
        throw new Exception("JavaScript failed.");
    }

    if (!string.IsNullOrWhiteSpace(client_response.Error))
    {
        throw new Exception(client_response.Error);
    }

    string[] distributed_names = FigureOutGroupNames(client_response);
    Debug.Assert(distributed_names[0] == null);

    SimpleTextGetter stg = new SimpleTextGetter(text);
    List<IMatch> matches = new List<IMatch>();

    foreach (var cm in client_response.Matches)
    {
        // Entries without any index pair are skipped.
        if (!cm.Indices.Any())
        {
            continue;
        }

        var start = cm.Indices[0][0];
        var end = cm.Indices[0][1];

        var sm = SimpleMatch.Create(start, end - start, stg);

        // (default group)
        sm.AddGroup(sm.Index, sm.Length, true, "0");

        for (int i = 1; i < cm.Indices.Count; ++i)
        {
            // Use the deduced name when one exists, otherwise the group number.
            bool has_name = i < distributed_names.Length && distributed_names[i] != null;
            string name = has_name ? distributed_names[i] : i.ToString(CultureInfo.InvariantCulture);

            var g = cm.Indices[i];

            if (g == null)
            {
                // A null entry is a group that did not participate in the match.
                sm.AddGroup(-1, 0, false, name);
                continue;
            }

            start = g[0];
            end = g[1];

            sm.AddGroup(start, end - start, true, name);
        }

        matches.Add(sm);
    }

    return new RegexMatches(matches.Count, matches);
}
/// <summary>
/// Finds matches by sending a JSON request to the D client and converting the byte
/// positions in its reply into character positions of <paramref name="text"/>.
/// </summary>
/// <param name="text">The text to search.</param>
/// <param name="cnc">Cancellation handle forwarded to <c>InvokeDClient</c>.</param>
/// <returns>
/// The converted matches, or <c>RegexMatches.Empty</c> when <c>InvokeDClient</c> returns false.
/// </returns>
/// <exception cref="Exception">The client wrote something to its standard error stream.</exception>
public RegexMatches Matches(string text, ICancellable cnc)
{
    // The reply positions are converted by counting characters in this UTF-8 encoding of the text.
    byte[] text_utf8_bytes = Encoding.UTF8.GetBytes(text);

    var flags = new StringBuilder();

    if (Options.i) { flags.Append("i"); }
    if (Options.m) { flags.Append("m"); }
    if (Options.s) { flags.Append("s"); }
    if (Options.x) { flags.Append("x"); }

    var obj = new
    {
        p = Pattern,
        t = text,
        f = flags.ToString(),
    };

    string json = JsonSerializer.Serialize(obj);

    string stdout_contents;
    string stderr_contents;

    if (!InvokeDClient(cnc, json, out stdout_contents, out stderr_contents))
    {
        return RegexMatches.Empty;
    }

    if (!string.IsNullOrWhiteSpace(stderr_contents))
    {
        throw new Exception(stderr_contents);
    }

    var response = JsonSerializer.Deserialize<DClientMatchesResponse>(stdout_contents);

    var matches = new List<IMatch>();

    // Created lazily on the first match and then shared by every match.
    ISimpleTextGetter stg = null;

    foreach (var m in response.matches)
    {
        SimpleMatch match = null;

        for (int group_index = 0; group_index < m.groups.Length; group_index++)
        {
            // A two-element entry is [index, length] in bytes; anything else is a failed group.
            int[] g = m.groups[group_index];
            bool success = g.Length == 2;

            if (group_index == 0 && !success)
            {
                // if pattern is "()", which matches any position, 'std.regex' does not return captures,
                // even the main one (all are null); however the match object contains the valid index;
                // this is a workaround:

                success = true;
                g = new[] { m.index, 0 };
            }

            int byte_start = success ? g[0] : 0;
            int byte_end = byte_start + (success ? g[1] : 0);
            int byte_length = byte_end - byte_start;

            int char_start = Encoding.UTF8.GetCharCount(text_utf8_bytes, 0, byte_start);
            int char_end = Encoding.UTF8.GetCharCount(text_utf8_bytes, 0, byte_end);
            int char_length = char_end - char_start;

            if (group_index == 0)
            {
                Debug.Assert(match == null);
                Debug.Assert(success);

                if (stg == null)
                {
                    stg = new SimpleTextGetter(text);
                }

                match = SimpleMatch.Create(char_start, char_length, stg);
            }

            Debug.Assert(match != null);

            // try to identify the named group by index and length;
            // cannot be done univocally in situations like "(?P<name1>(?P<name2>(.))", because index and length are the same

            string name = null;

            // The main group (index 0) is never looked up among the named groups.
            if (group_index != 0)
            {
                // First named group with the same byte position and length whose name
                // has not already been given to another group of this match.
                var np = m.named_groups
                    .Select((ng, j) => new { ng, j })
                    .Where(p => p.ng[0] >= 0)
                    .FirstOrDefault(z =>
                        z.ng[0] == byte_start &&
                        z.ng[1] == byte_length &&
                        !match.Groups.Any(q => q.Name == response.names[z.j]));

                if (np != null)
                {
                    name = response.names[np.j];
                }
            }

            // Groups that could not be identified by name are labelled with their index.
            if (string.IsNullOrWhiteSpace(name))
            {
                name = group_index.ToString(CultureInfo.InvariantCulture);
            }

            if (success)
            {
                match.AddGroup(char_start, char_length, true, name);
            }
            else
            {
                match.AddGroup(0, 0, false, name);
            }
        }

        Debug.Assert(match != null);
        matches.Add(match);
    }

    return new RegexMatches(matches.Count, matches);
}
/// <summary>
/// Finds matches by running an inline Perl script (passed with <c>-e</c>) through the
/// <c>perl.exe</c> located under <c>Perl-min\perl\bin</c> next to the executing assembly.
/// </summary>
/// <param name="text">The text to search; written to the script's standard input after the pattern, both passed through <c>PrepareString</c>.</param>
/// <param name="cnc">Cancellation handle forwarded to <c>ProcessUtilities.InvokeExe</c>.</param>
/// <returns>The collected matches, or <c>RegexMatches.Empty</c> when <c>InvokeExe</c> returns false.</returns>
/// <exception cref="Exception">The script reported an error between its ERR markers on standard error.</exception>
//
// Protocol, as visible in the script and the parsing code below:
//  - The selected modifiers and the optional "use re 'strict';" line are substituted into the
//    script text through the [*MODIFIERS*] and [*USE RE STRICT*] placeholders.
//  - For every match the script emits one "success|index|length" record per group, each followed
//    by 'G', and terminates the match with 'M'; the whole output is wrapped in RESULTS markers.
//    The C# side splits on 'M', then 'G', then '|'.
//  - Group names are recovered from the "use re qw(Debug PARSE)" output captured on standard
//    error between the DEBUG-PARSE markers, by matching its CLOSE<n> '<name>' lines.
//  - Indices and lengths returned by the script are converted with
//    SurrogatePairsHelper.ToTextIndexAndLength before the groups are added.
//
// NOTE(review): SelectedOptions is null-checked when building selected_modifiers, but is
// dereferenced unconditionally in SelectedOptions.Contains("strict"); a null value would throw there.
// NOTE(review): 'match' is only created in the successful-group branch; if the first record of a
// match were unsuccessful, or a match had no records, match.AddGroup / matches.Add would see null.
// Presumably group 0 always succeeds - confirm.
public RegexMatches Matches(string text, ICancellable cnc) { // TODO: optimise, redesign var all_modifiers = ModifierInfoList.Select(oi => oi.Modifier); string selected_modifiers = SelectedOptions == null ? "" : string.Concat(SelectedOptions.Where(o => all_modifiers.Contains(o))); var matches = new List <IMatch>( ); ISimpleTextGetter stg = null; string assembly_location = Assembly.GetExecutingAssembly( ).Location; string assembly_dir = Path.GetDirectoryName(assembly_location); string perl_dir = Path.Combine(assembly_dir, @"Perl-min\perl"); string perl_exe = Path.Combine(perl_dir, @"bin\perl.exe"); string arguments = @"-CS -e "" my $pattern; eval { use strict; use feature 'unicode_strings'; use utf8; #use re 'eval'; no warnings 'experimental::re_strict'; [*USE RE STRICT*] chomp( $pattern = <STDIN> ); chomp( my $text = <STDIN> ); #print q('), $pattern, q(' ), length $pattern, qq(\n); $pattern = substr $pattern, 1, length($pattern) - 2; $text = substr $text, 1, length($text) - 2; #print q('), $pattern, q(' ), length $pattern, qq(\n); utf8::decode( $pattern ); utf8::decode( $text ); $pattern =~ s/\\\\/\x1F/g; $pattern =~ s/\\n/\n/g; $pattern =~ s/\\r/\r/g; $pattern =~ s/\x1F/\\/g; $text =~ s/\\\\/\x1F/g; $text =~ s/\\n/\n/g; $text =~ s/\\r/\r/g; $text =~ s/\x1F/\\/g; #print 'pattern: ', q('), $pattern, q(' ), length $pattern, qq(\r\n); #print 'text: ', q('), $text, ' ', q(' ), length $text, qq(\r\n); my $re; do { use re qw(Debug PARSE); print STDERR qq(<DEBUG-PARSE\x1F>\n); $re = qr/$pattern/[*MODIFIERS*]; print STDERR qq(</DEBUG-PARSE\x1F>\n); }; my $results = qq(<RESULTS\x1F>); while( $text =~ /$re/g ) { for( my $i = 0; $i < scalar @+; ++$i) { my $success = defined @-[$i]; if( ! 
$success ) { $results .= '0|0|0'; } else { my $index = @-[$i]; my $length = @+[$i] - @-[$i]; #my $val = @{^CAPTURE}[$i]; $results .= qq(1|$index|$length); } $results .= 'G'; } $results .= 'M'; } $results .= qq(</RESULTS\x1F>); print $results; }; if( $@ ) { print STDERR qq(<ERR\x1F>), $@, qq(</ERR\x1F>\n); } print STDERR qq(<END-ERR\x1F/>\n); """ .Replace("[*MODIFIERS*]", selected_modifiers) .Replace("[*USE RE STRICT*]", SelectedOptions.Contains("strict") ? "use re 'strict';" : ""); string stdout_contents; string stderr_contents; if (!ProcessUtilities.InvokeExe(cnc, perl_exe, arguments, sw => { sw.WriteLine(PrepareString(Pattern)); sw.WriteLine(PrepareString(text)); }, out stdout_contents, out stderr_contents, EncodingEnum.UTF8)) { return(RegexMatches.Empty); } string debug_parse = Regex.Match(stderr_contents, @"<DEBUG-PARSE\x1F>(.*?)</DEBUG-PARSE\x1F>", RegexOptions.Singleline | RegexOptions.Compiled).Groups[1].Value.Trim( ); string error_text = Regex.Match(stderr_contents, @"<ERR\x1F>(.*?)</ERR\x1F>", RegexOptions.Singleline | RegexOptions.Compiled).Groups[1].Value.Trim( ); if (!string.IsNullOrWhiteSpace(error_text)) { string error_message = Regex.Replace(error_text, @"\s+at -e line \d+, <STDIN> line \d+(?=\.\s*$)", "", RegexOptions.Singleline | RegexOptions.Compiled); throw new Exception(error_message); } // try figuring out the names and their numbers var numbered_names = new List <string>( ); foreach (Match m in Regex.Matches(debug_parse, @"(?:\r|\n) +\| +\| +~ CLOSE(\d+) '(.*?)' \(\d+\)(?: -> \w+)?(?:\r|\n)", RegexOptions.Compiled)) { string name = m.Groups[2].Value; int number = int.Parse(m.Groups[1].Value, CultureInfo.InvariantCulture); for (int i = numbered_names.Count; i <= number; ++i) { numbered_names.Add(null); } Debug.Assert(numbered_names[number] == null || numbered_names[number] == name); numbered_names[number] = name; } string results = Regex.Match(stdout_contents, @"<RESULTS\x1F>(.*?)</RESULTS\x1F>", RegexOptions.Singleline | 
RegexOptions.Compiled).Groups[1].Value.Trim( ); var sph = new SurrogatePairsHelper(text, processSurrogatePairs: true); var split_m = results.Split(new[] { 'M' }, StringSplitOptions.RemoveEmptyEntries); foreach (var m in split_m) { SimpleMatch match = null; var split_g = m.Split(new[] { 'G' }, StringSplitOptions.RemoveEmptyEntries); for (int i = 0; i < split_g.Length; i++) { string g = split_g[i]; var split = g.Split('|'); Debug.Assert(split.Length == 3); bool success = split[0] == "1"; string deduced_name = i < numbered_names.Count ? numbered_names[i] : null; if (deduced_name == null) { deduced_name = i.ToString(CultureInfo.InvariantCulture); } if (!success) { match.AddGroup(0, 0, false, deduced_name); } else { int index = int.Parse(split[1], CultureInfo.InvariantCulture); int length = int.Parse(split[2], CultureInfo.InvariantCulture); var(text_index, text_length) = sph.ToTextIndexAndLength(index, length); if (stg == null) { stg = new SimpleTextGetter(text); } if (match == null) { match = SimpleMatch.Create(index, length, text_index, text_length, stg); } match.AddGroup(index, length, text_index, text_length, true, deduced_name); } } matches.Add(match); } return(new RegexMatches(matches.Count, matches)); }
/// <summary>
/// Finds matches by running an inline Python script (passed with <c>-c</c>) through the
/// interpreter returned by <c>GetPythonExePath</c>, and parsing the lines it prints.
/// </summary>
/// <param name="text">The text to search; written to the script's standard input after the pattern, both passed through <c>PrepareString</c>.</param>
/// <param name="cnc">Cancellation handle forwarded to <c>ProcessUtilities.InvokeExe</c>.</param>
/// <returns>The collected matches, or <c>RegexMatches.Empty</c> when <c>InvokeExe</c> returns false.</returns>
/// <exception cref="Exception">
/// The script wrote something to standard error, or printed a line that <c>RegexMG</c> does not
/// recognise ("Internal error in Python engine.").
/// </exception>
//
// Protocol, as visible in the script and the parsing code below:
//  - The selected flags are substituted into the script through the [*FLAGS*] placeholder as
//    "re.X|re.Y", or "0" when no flag is selected.
//  - The script prints "N <number> <name>" for every named group, then for every match one
//    "M <start>, <end>" line followed by one "G <start>, <end>" line per group (group 0 included).
//  - Empty lines and lines starting with "#" are skipped. Every other line is parsed with RegexMG
//    (defined elsewhere), using its named groups t, i, n, s and e.
//  - An "M" line starts a new match and resets the group counter; each "G" line adds the next
//    group to the current match, named from the "N" lines or by its number.
//  - Indices and lengths are converted with SurrogatePairsHelper.ToTextIndexAndLength.
//
// NOTE(review): in the "G" case ToTextIndexAndLength is called before 'success' is consulted, so
// it also receives the negative index of a group that did not participate - confirm the helper
// tolerates that.
public RegexMatches Matches(string text, ICancellable cnc) { // TODO: optimise, redesign var all_flags = FlagInfoList.Select(oi => oi.Flag); var selected_flags = SelectedOptions?.Where(o => all_flags.Contains(o)) ?? Enumerable.Empty <string>( ); var matches = new List <IMatch>( ); ISimpleTextGetter stg = null; string arguments = @"-I -E -s -S -X utf8 -c "" import sys import re pattern = input().strip(' \r\n') text = input().strip(' \r\n') pattern = pattern.replace('\\\\', '\x1F').replace('\\r', '\r').replace('\\n', '\n').replace('\x1F', '\\') text = text.replace('\\\\', '\x1F').replace('\\r', '\r').replace('\\n', '\n').replace('\x1F', '\\') pattern = pattern[1:-1] text = text[1:-1] #print( f'# pattern=[{pattern}], len={len(pattern)}'); #print( f'# text=[{text}], len={len(text)}'); try: regex = re.compile( pattern, [*FLAGS*]) #print( f'# {regex.groups}') #print( f'# {regex.groupindex}') for key, value in regex.groupindex.items(): print( f'N {value} <{key}>') matches = regex.finditer( text ) for match in matches : print( f'M {match.start()}, {match.end()}') for g in range(0, regex.groups + 1): print( f'G {match.start(g)}, {match.end(g)}' ) except: ex_type, ex, tb = sys.exc_info() print( ex, file = sys.stderr ) "" "; arguments = arguments.Replace("[*FLAGS*]", selected_flags.Any( ) ? string.Join("|", selected_flags.Select(f => "re." 
+ f)) : "0"); string stdout_contents; string stderr_contents; if (!ProcessUtilities.InvokeExe(cnc, GetPythonExePath( ), arguments, sw => { sw.WriteLine(PrepareString(Pattern)); sw.WriteLine(PrepareString(text)); }, out stdout_contents, out stderr_contents, EncodingEnum.UTF8)) { return(RegexMatches.Empty); } if (!string.IsNullOrWhiteSpace(stderr_contents)) { string error_message = stderr_contents; throw new Exception(error_message); } SimpleMatch match = null; int group_i = 0; var names = new Dictionary <int, string>( ); var sph = new SurrogatePairsHelper(text, processSurrogatePairs: true); using (var sr = new StringReader(stdout_contents)) { string line; while ((line = sr.ReadLine( )) != null) { if (line.Length == 0 || line.StartsWith("#")) { continue; } var m = RegexMG.Match(line); if (!m.Success) { if (Debugger.IsAttached) { Debugger.Break( ); } throw new Exception("Internal error in Python engine."); } else { switch (m.Groups["t"].Value) { case "N": { int index = int.Parse(m.Groups["i"].Value, CultureInfo.InvariantCulture); string name = m.Groups["n"].Value; Debug.Assert(!names.ContainsKey(index)); names[index] = name; } break; case "M": { int index = int.Parse(m.Groups["s"].Value, CultureInfo.InvariantCulture); int end = int.Parse(m.Groups["e"].Value, CultureInfo.InvariantCulture); int length = end - index; Debug.Assert(index >= 0 && end >= 0); var(text_index, text_length) = sph.ToTextIndexAndLength(index, length); if (stg == null) { stg = new SimpleTextGetter(text); } match = SimpleMatch.Create(index, length, text_index, text_length, stg); matches.Add(match); group_i = 0; } break; case "G": { int index = int.Parse(m.Groups["s"].Value, CultureInfo.InvariantCulture); int end = int.Parse(m.Groups["e"].Value, CultureInfo.InvariantCulture); int length = end - index; bool success = index >= 0; Debug.Assert(match != null); var(text_index, text_length) = sph.ToTextIndexAndLength(index, length); string name; if (!names.TryGetValue(group_i, out name)) { name = 
group_i.ToString(CultureInfo.InvariantCulture); } match.AddGroup(index, length, text_index, text_length, success, name); ++group_i; } break; default: if (Debugger.IsAttached) { Debugger.Break( ); } throw new Exception("Internal error in Python engine."); } } } } return(new RegexMatches(matches.Count, matches)); }