/// <summary>
/// Finds every match of <c>Pattern</c> in <paramref name="text"/> by running a Perl
/// script in a child process and parsing what it prints.
/// </summary>
/// <remarks>
/// The pattern and the text are sent to Perl on standard input, one line each, after
/// being passed through <c>PrepareString</c>. The script removes the first and last
/// character of each line and un-escapes <c>\\</c>, <c>\n</c> and <c>\r</c>, so
/// <c>PrepareString</c> presumably wraps and escapes accordingly (helper not visible here).
/// Standard output carries the match offsets between <c>&lt;RESULTS\x1F&gt;</c> markers;
/// standard error carries the regex parse dump (used to recover group names) and any error text.
/// </remarks>
/// <param name="text">The subject text to search.</param>
/// <param name="cnc">Passed through to <c>ProcessUtilities.InvokeExe</c>.</param>
/// <returns>
/// The collected matches, or <c>RegexMatches.Empty</c> when <c>ProcessUtilities.InvokeExe</c>
/// returns <c>false</c>.
/// </returns>
/// <exception cref="Exception">
/// Perl reported an error (the message is Perl's text without the trailing
/// "at -e line N, &lt;STDIN&gt; line N" part), or the script output was malformed.
/// </exception>
public RegexMatches Matches(string text, ICancellable cnc)
{
    // TODO: optimise, redesign

    // Materialised once; the original re-ran the projection for every selected option.
    var all_modifiers = ModifierInfoList.Select(oi => oi.Modifier).ToList();

    string selected_modifiers = SelectedOptions == null
        ? ""
        : string.Concat(SelectedOptions.Where(o => all_modifiers.Contains(o)));

    // SelectedOptions may be null (see above), so guard before asking for "strict".
    bool use_strict = SelectedOptions != null && SelectedOptions.Contains("strict");

    string assembly_location = Assembly.GetExecutingAssembly().Location;
    string assembly_dir = Path.GetDirectoryName(assembly_location);
    string perl_dir = Path.Combine(assembly_dir, @"Perl-min\perl");
    string perl_exe = Path.Combine(perl_dir, @"bin\perl.exe");

    // The script contains '#' comments, so it must reach Perl with real line breaks.
    string script = string.Join("\n", PerlScriptLines)
        .Replace("[*MODIFIERS*]", selected_modifiers)
        .Replace("[*USE RE STRICT*]", use_strict ? "use re 'strict';" : "");

    // The script itself contains no double quotes, so it can be quoted as a single argument.
    string arguments = "-CS -e \"\n" + script + "\n\"";

    if (!ProcessUtilities.InvokeExe(cnc, perl_exe, arguments,
            sw =>
            {
                sw.WriteLine(PrepareString(Pattern));
                sw.WriteLine(PrepareString(text));
            },
            out string stdout_contents, out string stderr_contents, EncodingEnum.UTF8))
    {
        return RegexMatches.Empty;
    }

    string debug_parse = Regex.Match(stderr_contents, @"<DEBUG-PARSE\x1F>(.*?)</DEBUG-PARSE\x1F>", RegexOptions.Singleline | RegexOptions.Compiled).Groups[1].Value.Trim();
    string error_text = Regex.Match(stderr_contents, @"<ERR\x1F>(.*?)</ERR\x1F>", RegexOptions.Singleline | RegexOptions.Compiled).Groups[1].Value.Trim();

    if (!string.IsNullOrWhiteSpace(error_text))
    {
        // Drop the "at -e line N, <STDIN> line N" suffix; it refers to the internal script.
        string error_message = Regex.Replace(error_text, @"\s+at -e line \d+, <STDIN> line \d+(?=\.\s*$)", "", RegexOptions.Singleline | RegexOptions.Compiled);

        throw new Exception(error_message);
    }

    // Try figuring out the names and their numbers from the "CLOSEn 'name'" lines of the parse dump.
    var numbered_names = new List<string>();

    foreach (Match m in Regex.Matches(debug_parse, @"(?:\r|\n) +\| +\| +~ CLOSE(\d+) '(.*?)' \(\d+\)(?: -> \w+)?(?:\r|\n)", RegexOptions.Compiled))
    {
        string name = m.Groups[2].Value;
        int number = int.Parse(m.Groups[1].Value, CultureInfo.InvariantCulture);

        // Grow the list so that 'number' is a valid index.
        while (numbered_names.Count <= number)
        {
            numbered_names.Add(null);
        }

        Debug.Assert(numbered_names[number] == null || numbered_names[number] == name);

        numbered_names[number] = name;
    }

    string results = Regex.Match(stdout_contents, @"<RESULTS\x1F>(.*?)</RESULTS\x1F>", RegexOptions.Singleline | RegexOptions.Compiled).Groups[1].Value.Trim();

    var matches = new List<IMatch>();
    ISimpleTextGetter stg = null;
    var sph = new SurrogatePairsHelper(text, processSurrogatePairs: true);

    // Output format: every group is "success|index|length" followed by 'G';
    // every match (its groups, group 0 first) is followed by 'M'.
    foreach (var match_text in results.Split(new[] { 'M' }, StringSplitOptions.RemoveEmptyEntries))
    {
        SimpleMatch match = null;
        var group_texts = match_text.Split(new[] { 'G' }, StringSplitOptions.RemoveEmptyEntries);

        for (int i = 0; i < group_texts.Length; i++)
        {
            var fields = group_texts[i].Split('|');

            if (fields.Length != 3)
            {
                throw new Exception("Internal error in Perl engine.");
            }

            bool success = fields[0] == "1";

            string deduced_name = i < numbered_names.Count ? numbered_names[i] : null;

            if (deduced_name == null)
            {
                deduced_name = i.ToString(CultureInfo.InvariantCulture);
            }

            if (!success)
            {
                // The match object is created by the first successful group (group 0),
                // so a failed group can only be attached to an existing match.
                if (match == null)
                {
                    throw new Exception("Internal error in Perl engine.");
                }

                match.AddGroup(0, 0, false, deduced_name);
            }
            else
            {
                int index = int.Parse(fields[1], CultureInfo.InvariantCulture);
                int length = int.Parse(fields[2], CultureInfo.InvariantCulture);
                var (text_index, text_length) = sph.ToTextIndexAndLength(index, length);

                if (stg == null)
                {
                    stg = new SimpleTextGetter(text);
                }

                if (match == null)
                {
                    match = SimpleMatch.Create(index, length, text_index, text_length, stg);
                }

                match.AddGroup(index, length, text_index, text_length, true, deduced_name);
            }
        }

        if (match == null)
        {
            // A match record without any group data; never add a null entry to the result.
            throw new Exception("Internal error in Perl engine.");
        }

        matches.Add(match);
    }

    return new RegexMatches(matches.Count, matches);
}

// The Perl script passed via "-e", one element per script line. Kept as explicit lines
// because the '#' comments make the script line-sensitive. It must not contain double
// quotes, since the whole script is passed as one quoted command-line argument.
// "[*USE RE STRICT*]" and "[*MODIFIERS*]" are placeholders substituted by Matches.
private static readonly string[] PerlScriptLines =
{
    @"my $pattern;",
    @"",
    @"eval",
    @"{",
    @"    use strict;",
    @"    use feature 'unicode_strings';",
    @"    use utf8;",
    @"    #use re 'eval';",
    @"    no warnings 'experimental::re_strict';",
    @"    [*USE RE STRICT*]",
    @"",
    @"    chomp( $pattern = <STDIN> );",
    @"    chomp( my $text = <STDIN> );",
    @"",
    @"    #print q('), $pattern, q(' ), length $pattern, qq(\n);",
    @"",
    @"    $pattern = substr $pattern, 1, length($pattern) - 2;",
    @"    $text = substr $text, 1, length($text) - 2;",
    @"",
    @"    #print q('), $pattern, q(' ), length $pattern, qq(\n);",
    @"",
    @"    utf8::decode( $pattern );",
    @"    utf8::decode( $text );",
    @"",
    @"    $pattern =~ s/\\\\/\x1F/g;",
    @"    $pattern =~ s/\\n/\n/g;",
    @"    $pattern =~ s/\\r/\r/g;",
    @"    $pattern =~ s/\x1F/\\/g;",
    @"",
    @"    $text =~ s/\\\\/\x1F/g;",
    @"    $text =~ s/\\n/\n/g;",
    @"    $text =~ s/\\r/\r/g;",
    @"    $text =~ s/\x1F/\\/g;",
    @"",
    @"    #print 'pattern: ', q('), $pattern, q(' ), length $pattern, qq(\r\n);",
    @"    #print 'text: ', q('), $text, ' ', q(' ), length $text, qq(\r\n);",
    @"",
    @"    my $re;",
    @"    do",
    @"    {",
    @"        use re qw(Debug PARSE);",
    @"",
    @"        print STDERR qq(<DEBUG-PARSE\x1F>\n);",
    @"",
    @"        $re = qr/$pattern/[*MODIFIERS*];",
    @"",
    @"        print STDERR qq(</DEBUG-PARSE\x1F>\n);",
    @"    };",
    @"",
    @"    my $results = qq(<RESULTS\x1F>);",
    @"",
    @"    while( $text =~ /$re/g )",
    @"    {",
    @"        for( my $i = 0; $i < scalar @+; ++$i)",
    @"        {",
    @"            my $success = defined @-[$i];",
    @"            if( ! $success )",
    @"            {",
    @"                $results .= '0|0|0';",
    @"            }",
    @"            else",
    @"            {",
    @"                my $index = @-[$i];",
    @"                my $length = @+[$i] - @-[$i];",
    @"                #my $val = @{^CAPTURE}[$i];",
    @"",
    @"                $results .= qq(1|$index|$length);",
    @"            }",
    @"",
    @"            $results .= 'G';",
    @"        }",
    @"",
    @"        $results .= 'M';",
    @"    }",
    @"",
    @"    $results .= qq(</RESULTS\x1F>);",
    @"",
    @"    print $results;",
    @"};",
    @"",
    @"if( $@ )",
    @"{",
    @"    print STDERR qq(<ERR\x1F>), $@, qq(</ERR\x1F>\n);",
    @"}",
    @"",
    @"print STDERR qq(<END-ERR\x1F/>\n);",
};
/// <summary>
/// Finds every match of <c>Pattern</c> in <paramref name="text"/> by running a Python
/// script (module <c>re</c>) in a child process and parsing what it prints.
/// </summary>
/// <remarks>
/// The pattern and the text are sent to Python on standard input, one line each, after
/// being passed through <c>PrepareString</c>. The script un-escapes <c>\\</c>, <c>\r</c>
/// and <c>\n</c> and removes the first and last character of each, so <c>PrepareString</c>
/// presumably wraps and escapes accordingly (helper not visible here).
/// The script prints one record per line:
/// <c>N index &lt;name&gt;</c> for each named group,
/// <c>M start, end</c> for each match, followed by
/// <c>G start, end</c> for each of its groups, group 0 first.
/// Lines are recognised with <c>RegexMG</c> (defined elsewhere), from which the groups
/// <c>t</c>, <c>i</c>, <c>n</c>, <c>s</c> and <c>e</c> are read.
/// </remarks>
/// <param name="text">The subject text to search.</param>
/// <param name="cnc">Passed through to <c>ProcessUtilities.InvokeExe</c>.</param>
/// <returns>
/// The collected matches, or <c>RegexMatches.Empty</c> when <c>ProcessUtilities.InvokeExe</c>
/// returns <c>false</c>.
/// </returns>
/// <exception cref="Exception">
/// Python wrote anything non-blank to standard error (that text becomes the message),
/// or the script output could not be interpreted.
/// </exception>
public RegexMatches Matches(string text, ICancellable cnc)
{
    // TODO: optimise, redesign

    // Both sequences are materialised: the original deferred queries were enumerated
    // repeatedly (the flag list once per selected option, the selection twice).
    var all_flags = FlagInfoList.Select(oi => oi.Flag).ToList();
    var selected_flags = SelectedOptions?.Where(o => all_flags.Contains(o)).ToList() ?? new List<string>();

    string flags_expression = selected_flags.Count > 0
        ? string.Join("|", selected_flags.Select(f => "re." + f))
        : "0";

    // Python is newline- and indentation-sensitive, so the script is joined from explicit lines.
    // The script itself contains no double quotes, so it can be quoted as a single argument.
    string script = string.Join("\n", PythonScriptLines).Replace("[*FLAGS*]", flags_expression);
    string arguments = "-I -E -s -S -X utf8 -c \"\n" + script + "\n\"";

    if (!ProcessUtilities.InvokeExe(cnc, GetPythonExePath(), arguments,
            sw =>
            {
                sw.WriteLine(PrepareString(Pattern));
                sw.WriteLine(PrepareString(text));
            },
            out string stdout_contents, out string stderr_contents, EncodingEnum.UTF8))
    {
        return RegexMatches.Empty;
    }

    if (!string.IsNullOrWhiteSpace(stderr_contents))
    {
        // The script prints the caught exception to stderr; surface it unchanged.
        throw new Exception(stderr_contents);
    }

    var matches = new List<IMatch>();
    ISimpleTextGetter stg = null;
    SimpleMatch match = null;       // the match that subsequent "G" lines belong to
    int group_i = 0;                // number of the next group of the current match
    var names = new Dictionary<int, string>();
    var sph = new SurrogatePairsHelper(text, processSurrogatePairs: true);

    using (var sr = new StringReader(stdout_contents))
    {
        string line;

        while ((line = sr.ReadLine()) != null)
        {
            // Skip blank lines and '#' diagnostic lines.
            if (line.Length == 0 || line[0] == '#')
            {
                continue;
            }

            var m = RegexMG.Match(line);

            if (!m.Success)
            {
                if (Debugger.IsAttached)
                {
                    Debugger.Break();
                }

                throw new Exception("Internal error in Python engine.");
            }

            switch (m.Groups["t"].Value)
            {
            case "N":
            {
                // Named group: remember which group number the name belongs to.
                int index = int.Parse(m.Groups["i"].Value, CultureInfo.InvariantCulture);
                string name = m.Groups["n"].Value;

                Debug.Assert(!names.ContainsKey(index));

                names[index] = name;
            }
            break;

            case "M":
            {
                // Start of a new match; its groups follow on "G" lines.
                int index = int.Parse(m.Groups["s"].Value, CultureInfo.InvariantCulture);
                int end = int.Parse(m.Groups["e"].Value, CultureInfo.InvariantCulture);
                int length = end - index;

                Debug.Assert(index >= 0 && end >= 0);

                var (text_index, text_length) = sph.ToTextIndexAndLength(index, length);

                if (stg == null)
                {
                    stg = new SimpleTextGetter(text);
                }

                match = SimpleMatch.Create(index, length, text_index, text_length, stg);
                matches.Add(match);
                group_i = 0;
            }
            break;

            case "G":
            {
                // A group line is only meaningful after a match line.
                if (match == null)
                {
                    if (Debugger.IsAttached)
                    {
                        Debugger.Break();
                    }

                    throw new Exception("Internal error in Python engine.");
                }

                int index = int.Parse(m.Groups["s"].Value, CultureInfo.InvariantCulture);
                int end = int.Parse(m.Groups["e"].Value, CultureInfo.InvariantCulture);
                int length = end - index;

                // Python reports -1 as the start of a group that did not participate in the match.
                bool success = index >= 0;

                // NOTE(review): for a non-participating group the helper receives index -1 and
                // length 0, exactly as before; confirm that ToTextIndexAndLength tolerates this.
                var (text_index, text_length) = sph.ToTextIndexAndLength(index, length);

                string name;

                if (!names.TryGetValue(group_i, out name))
                {
                    // Unnamed groups are labelled with their number.
                    name = group_i.ToString(CultureInfo.InvariantCulture);
                }

                match.AddGroup(index, length, text_index, text_length, success, name);

                ++group_i;
            }
            break;

            default:
                if (Debugger.IsAttached)
                {
                    Debugger.Break();
                }

                throw new Exception("Internal error in Python engine.");
            }
        }
    }

    return new RegexMatches(matches.Count, matches);
}

// The Python script passed via "-c", one element per script line; the leading spaces
// inside the strings are Python indentation and are significant. The script must not
// contain double quotes, since it is passed as one quoted command-line argument.
// "[*FLAGS*]" is a placeholder substituted by Matches.
private static readonly string[] PythonScriptLines =
{
    @"import sys",
    @"import re",
    @"",
    @"pattern = input().strip(' \r\n')",
    @"text = input().strip(' \r\n')",
    @"",
    @"pattern = pattern.replace('\\\\', '\x1F').replace('\\r', '\r').replace('\\n', '\n').replace('\x1F', '\\')",
    @"text = text.replace('\\\\', '\x1F').replace('\\r', '\r').replace('\\n', '\n').replace('\x1F', '\\')",
    @"",
    @"pattern = pattern[1:-1]",
    @"text = text[1:-1]",
    @"",
    @"#print( f'# pattern=[{pattern}], len={len(pattern)}');",
    @"#print( f'# text=[{text}], len={len(text)}');",
    @"",
    @"try:",
    @"    regex = re.compile( pattern, [*FLAGS*])",
    @"",
    @"    #print( f'# {regex.groups}')",
    @"    #print( f'# {regex.groupindex}')",
    @"",
    @"    for key, value in regex.groupindex.items():",
    @"        print( f'N {value} <{key}>')",
    @"",
    @"    matches = regex.finditer( text )",
    @"",
    @"    for match in matches :",
    @"        print( f'M {match.start()}, {match.end()}')",
    @"        for g in range(0, regex.groups + 1):",
    @"            print( f'G {match.start(g)}, {match.end(g)}' )",
    @"except:",
    @"    ex_type, ex, tb = sys.exc_info()",
    @"    print( ex, file = sys.stderr )",
};