/// <summary>
        /// Parses a single rule string, appends it to the matching rule file (if that file
        /// exists) and adds it to the in-memory tries, then re-initializes the visitors.
        /// </summary>
        /// <remarks>
        /// The rule string is split on the characters of <c>Options.TermAdditionSeparator</c>
        /// into up to three fields; only the first three are used. A rule whose first field
        /// starts with <c>r\</c> is treated as a regular expression rule, anything else as an
        /// exact match rule. Rule strings that do not contain the separator are ignored.
        /// Unicode escape sequences of the form <c>\uXXXX</c> are decoded before the fields are
        /// added to the tries, but the rule file receives the fields as they were typed.
        /// </remarks>
        /// <param name="ruleString">The rule to add. Null or empty strings are ignored.</param>
        public void addStringToTries(string ruleString)
        {
            //Nothing to parse (also avoids a NullReferenceException on Contains below)
            if (string.IsNullOrEmpty(ruleString))
            {
                return;
            }

            if (!ruleString.Contains(this.Options.TermAdditionSeparator))
            {
                return;
            }

            List<string> newTerm = ruleString.Split(this.Options.TermAdditionSeparator.ToCharArray()).ToList();

            //Pad to three fields so that indexes 0..2 are always valid
            while (newTerm.Count < 3)
            {
                newTerm.Add("");
            }

            //Check that either first or the third field of the new term are non-empty
            if ((newTerm[0].Length > 0) || (newTerm[2].Length > 0))
            {
                char fileDelimiter = delimiterToChar(this.Options.Delimiter);

                //Regex for converting unicode escape sequences (backslash, u or U, four hex digits)
                //to characters. The backslash must be doubled in the verbatim string, otherwise
                //the pattern matches a literal "[uU]" instead of the escape sequence.
                Regex rx = new Regex(@"\\[uU]([0-9a-fA-F]{4})");
                MatchEvaluator decodeEscape = delegate(Match match)
                {
                    return ((char)Int32.Parse(match.Groups[1].Value, NumberStyles.HexNumber, CultureInfo.InvariantCulture)).ToString();
                };

                //Check whether this is a regex term or a normal term. Testing the first field
                //(rather than indexing ruleString) keeps Substring(2) below within bounds.
                if (newTerm[0].StartsWith("r\\", StringComparison.Ordinal))
                {
                    //Remove the regex marker
                    newTerm[0] = newTerm[0].Substring(2);

                    //Convert the unicode escape sequences in the new term
                    List<string> unicodeParsedNewTerm = newTerm.Select(field => rx.Replace(field, decodeEscape)).ToList();

                    //Validate source and replaces fields
                    foreach (int fieldIndex in new[] { 0, 2 })
                    {
                        try
                        {
                            int validationResult = this.regexTrieFactory.validateRegex(unicodeParsedNewTerm[fieldIndex]);
                            if (validationResult != 0)
                            {
                                List<KeyValuePair<string, string>> results = new List<KeyValuePair<string, string>>();
                                results.Add(
                                    new KeyValuePair<string, string>(
                                        unicodeParsedNewTerm[fieldIndex],
                                        this.regexTrieFactory.errorMessages[validationResult]));

                                //NOTE(review): the form is only constructed here; presumably its
                                //constructor displays it - confirm against ValidationErrorForm.
                                ValidationErrorForm errorForm = new ValidationErrorForm(results);

                                //An invalid rule is neither written to file nor added to the tries
                                return;
                            }
                        }
                        catch (Exception)
                        {
                            //Validation is best effort: a failure while validating does not
                            //prevent the rule from being added (behaviour kept from the original)
                        }
                    }

                    if (File.Exists(this.Options.RegexFileName))
                    {
                        //Append the rule on a new line; using guarantees the file handle is
                        //released even if a write fails
                        using (TextWriter tw = new StreamWriter(this.Options.RegexFileName, true))
                        {
                            tw.WriteLine();
                            tw.Write(newTerm[0] + fileDelimiter + newTerm[1] + fileDelimiter + newTerm[2]);
                        }
                    }
                    else if (this.Options.RegexFileName != "")
                    {
                        //Reached for any non-empty name, and also for a null name
                        MessageBox.Show("Regular expression rule file does not exist", "TermInjector");
                        this.Options.RegexFileName = "";
                    }

                    //Add the rule exactly once and determinise the trie selected by the result:
                    //true -> source trie, false -> replaces trie. Calling addFieldsToRegexTrie a
                    //second time to test for false would insert the rule twice.
                    var addedToSourceTrie = this.trieLoader.addFieldsToRegexTrie(unicodeParsedNewTerm, this.regexTrieSource, this.regexTrieReplaces);
                    if (addedToSourceTrie == true)
                    {
                        this.regexTrieSource = this.determiniser.determiniseNFA(this.regexTrieSource);
                    }
                    else if (addedToSourceTrie == false)
                    {
                        this.regexTrieReplaces = this.determiniser.determiniseNFA(this.regexTrieReplaces);
                    }
                }
                else
                {
                    if (File.Exists(this.Options.GlossaryFileName))
                    {
                        //Append the rule on a new line; using guarantees the file handle is
                        //released even if a write fails
                        using (TextWriter tw = new StreamWriter(this.Options.GlossaryFileName, true))
                        {
                            tw.WriteLine();
                            tw.Write(newTerm[0] + fileDelimiter + newTerm[1] + fileDelimiter + newTerm[2]);
                        }
                    }
                    else if (this.Options.GlossaryFileName != "")
                    {
                        //Reached for any non-empty name, and also for a null name
                        MessageBox.Show("Exact match rule file does not exist", "TermInjector");
                    }

                    //The file keeps the original casing; only the in-memory term is lowercased
                    if (!matchCaseToBool(this.Options.MatchCase))
                    {
                        newTerm[0] = newTerm[0].ToLower();
                        newTerm[2] = newTerm[2].ToLower();
                    }

                    //Convert the unicode escape sequences in the new term
                    List<string> unicodeParsedNewTerm = newTerm.Select(field => rx.Replace(field, decodeEscape)).ToList();

                    //Add term to the exact match tries
                    this.trieLoader.addFieldsToTrie(unicodeParsedNewTerm, this.exactMatchTrieSource, this.exactMatchTrieReplaces);
                }
            }

            //Update the possible new tries to visitors
            initializeVisitors();
        }
Esempio n. 2
0
        /// <summary>
        /// Loads regular expression rules from a delimited text file into the two regex tries
        /// given as parameters, then determinises each trie that received rules.
        /// </summary>
        /// <remarks>
        /// Each line is split on <paramref name="delimiter"/> before unicode escape sequences
        /// (<c>\uXXXX</c>) are decoded, so a decoded character can never act as a separator.
        /// Lines with fewer than two fields, or with both the first and the third field empty,
        /// are skipped. Loading stops with a message once the accumulated length of the stored
        /// path fields exceeds a fixed limit. A warning is shown when more than half of the
        /// counted lines produced no rule, which may indicate a wrong delimiter.
        /// </remarks>
        /// <param name="fileName">Path of the rule file. If the file does not exist the method
        /// returns without loading anything; a message is shown unless the name is the empty string.</param>
        /// <param name="delimiter">Character that separates the fields on each line.</param>
        /// <param name="regexTrieSource">Trie used for visiting the source segment; replaced by
        /// its determinised version when rules were added to it.</param>
        /// <param name="regexTrieReplaces">Trie used for visiting the fuzzy match target segment;
        /// replaced by its determinised version when rules were added to it.</param>
        public void loadRegexTrieFromFile(
            string fileName,
            char delimiter,
            ref RegexTrie <TranslationAndReplacement> regexTrieSource,
            ref RegexTrie <TranslationAndReplacement> regexTrieReplaces)
        {
            //Check if file exists, exit method and show a message if it doesn't
            if (!File.Exists(fileName))
            {
                //If the file name is not empty, display alert
                if (fileName != "")
                {
                    MessageBox.Show("Regular expression rule file does not exist", "TermInjector");
                }
                return;
            }

            //Upper bound for the accumulated length of stored path fields (proxy for memory usage)
            const int maxStringMemoryUsage = 2500000;

            //Regex for converting unicode escape sequences to characters
            Regex rx = new Regex(@"\\[uU]([0-9a-fA-F]{4})");
            MatchEvaluator decodeEscape = delegate(Match match)
            {
                return ((char)Int32.Parse(match.Groups[1].Value, NumberStyles.HexNumber, CultureInfo.InvariantCulture)).ToString();
            };

            //Counter for restricting glossary size
            int stringMemoryUsage = 0;

            //Counters for checking whether terms are being added
            int  lineCount          = 0;
            int  termCount          = 0;
            bool addedNormalRegexes = false;
            bool addedFuzzyRegexes  = false;

            using (StreamReader sourceFile = File.OpenText(fileName))
            {
                while (!sourceFile.EndOfStream)
                {
                    //Check if memory usage is within bounds
                    if (stringMemoryUsage > maxStringMemoryUsage)
                    {
                        MessageBox.Show("Regular expression rule file loading stopped due to excessive size: Only part of the regular expression rule file has been loaded.", "TermInjector");
                        break;
                    }

                    //Split the line before Unicode conversion (so as not to accidentally add separators)
                    string[] unicodeEscapedSplitTerm = sourceFile.ReadLine().Split(delimiter);

                    //Check whether the line was valid (at least two fields)
                    if (unicodeEscapedSplitTerm.Length < 2)
                    {
                        lineCount++;
                        continue;
                    }

                    //Convert the unicode escape sequences in the fields
                    List<string> newTerm = unicodeEscapedSplitTerm.Select(field => rx.Replace(field, decodeEscape)).ToList();

                    //If length of list is two, add empty third field so index 2 is always valid
                    if (newTerm.Count == 2)
                    {
                        newTerm.Add("");
                    }

                    //If both first and third fields are empty, skip to next iteration.
                    //These are the two fields tallied as trie paths below.
                    if (newTerm[0].Length == 0 && newTerm[2].Length == 0)
                    {
                        lineCount++;
                        continue;
                    }

                    //Add the rule exactly once; calling addFieldsToRegexTrie a second time to
                    //test for false would insert the same rule twice.
                    var addedToSourceTrie = this.addFieldsToRegexTrie(newTerm, regexTrieSource, regexTrieReplaces);

                    //Tally the proxy for memory usage, depending on whether source or replaces
                    //field was used as path
                    if (addedToSourceTrie == true)
                    {
                        stringMemoryUsage += newTerm[0].Length;
                        addedNormalRegexes = true;
                        termCount++;
                        lineCount++;
                    }
                    else if (addedToSourceTrie == false)
                    {
                        stringMemoryUsage += newTerm[2].Length;
                        addedFuzzyRegexes = true;
                        termCount++;
                        lineCount++;
                    }
                    //NOTE(review): a result that is neither true nor false (possible only if the
                    //return type is nullable - confirm) is counted neither as a line nor as a term.
                }

                //Determinise the regex tries. determiniseNFA hands back its result as a return
                //value, so it is assigned through the ref parameters to reach the caller.
                if (addedNormalRegexes)
                {
                    regexTrieSource = this.determiniser.determiniseNFA(regexTrieSource);
                }
                if (addedFuzzyRegexes)
                {
                    regexTrieReplaces = this.determiniser.determiniseNFA(regexTrieReplaces);
                }

                //If the proportion of terms stored and lines read is skewed, the wrong delimiter may have been used.
                //NOTE(review): no minimum file size is enforced here, so very small files can
                //trigger the message too.
                if (lineCount - termCount > (lineCount / 2))
                {
                    string delimiterUsed = (delimiter == '\t') ? "Tab" : delimiter.ToString();
                    MessageBox.Show((string.Format("The amount of regular expression rules stored is small compared to the amount of lines read: {0} lines read, but only {1} regular expression rules found. Are you sure the delimiter character {2} is correct?"
                                                   , lineCount.ToString(), termCount.ToString(), delimiterUsed)), "TermInjector");
                }
            }

            if (this.validationErrors.Count > 0)
            {
                //NOTE(review): the form is only constructed here; presumably its constructor
                //displays it - confirm against ValidationErrorForm.
                ValidationErrorForm errorForm = new ValidationErrorForm(this.validationErrors);
            }

            this.validationErrors.Clear();
        }