/// <summary>
/// Parses a single rule string, appends it to the matching rule file (regular expression
/// or exact match) when that file exists, and adds the rule to the in-memory tries.
/// </summary>
/// <remarks>
/// The string is split on the characters of <c>Options.TermAdditionSeparator</c>. A string that
/// starts with <c>r\</c> is handled as a regular expression rule, anything else as an exact
/// match rule. Strings that are null, empty or lack the separator are ignored. If a regular
/// expression field fails validation, a <c>ValidationErrorForm</c> is created and the method
/// returns without storing anything.
/// </remarks>
/// <param name="ruleString">Raw rule text whose fields are separated by the term addition separator.</param>
public void addStringToTries(string ruleString)
{
    //Nothing to do for a missing string or a string without the field separator
    if (string.IsNullOrEmpty(ruleString) || !ruleString.Contains(this.Options.TermAdditionSeparator))
    {
        return;
    }

    List<string> newTerm = ruleString.Split(this.Options.TermAdditionSeparator.ToCharArray()).ToList();

    //If there's only two fields, add an empty third field
    if (newTerm.Count == 2)
    {
        newTerm.Add("");
    }

    //Only store the term if either the first or the third field is non-empty
    if (newTerm[0].Length > 0 || newTerm[2].Length > 0)
    {
        char fileDelimiter = delimiterToChar(this.Options.Delimiter);

        //Regex for converting unicode escape sequences (\uXXXX or \UXXXX) to characters.
        //The backslash must be escaped in the pattern, otherwise "\[" is read as a literal bracket.
        Regex rx = new Regex(@"\\[uU]([0-9a-fA-F]{4})");

        //Skips the two-character "\u" prefix and parses the remaining four hex digits
        MatchEvaluator unescape = match =>
            ((char)Int32.Parse(match.Value.Substring(2), NumberStyles.HexNumber)).ToString();

        //Check whether this is a regex term or a normal term
        //(StartsWith avoids indexing past the end of a one-character string)
        if (ruleString.StartsWith("r\\", StringComparison.Ordinal))
        {
            //Remove the regex marker
            newTerm[0] = newTerm[0].Substring(2);

            //Convert the unicode escape sequences in the new term
            List<string> unicodeParsedNewTerm = newTerm.Select(field => rx.Replace(field, unescape)).ToList();

            //Validate source and replaces fields
            foreach (int fieldIndex in new[] { 0, 2 })
            {
                try
                {
                    int validationResult = this.regexTrieFactory.validateRegex(unicodeParsedNewTerm[fieldIndex]);
                    if (validationResult != 0)
                    {
                        List<KeyValuePair<string, string>> results = new List<KeyValuePair<string, string>>();
                        results.Add(new KeyValuePair<string, string>(
                            unicodeParsedNewTerm[fieldIndex],
                            this.regexTrieFactory.errorMessages[validationResult]));

                        //NOTE(review): the form is only constructed here; presumably its
                        //constructor displays it - confirm.
                        new ValidationErrorForm(results);
                        return;
                    }
                }
                catch
                {
                    //Deliberate best effort kept from the original: a failure while validating
                    //a field (originally documented as "the field does not exist") does not
                    //prevent the term from being added.
                    //NOTE(review): consider narrowing this to the exceptions validateRegex can throw.
                }
            }

            if (File.Exists(this.Options.RegexFileName))
            {
                //Append the rule on a new line; using guarantees the file handle is released
                using (TextWriter tw = new StreamWriter(this.Options.RegexFileName, true))
                {
                    tw.WriteLine();
                    tw.Write(newTerm[0] + fileDelimiter + newTerm[1] + fileDelimiter + newTerm[2]);
                }
            }
            else if (this.Options.RegexFileName != "" || this.Options.RegexFileName == null)
            {
                MessageBox.Show("Regular expression rule file does not exist", "TermInjector");
                this.Options.RegexFileName = "";
            }

            //Add the rule exactly once. The original code called addFieldsToRegexTrie a second
            //time in the else-if condition, which inserted replaces rules twice.
            var addedToSourceTrie = this.trieLoader.addFieldsToRegexTrie(
                unicodeParsedNewTerm, this.regexTrieSource, this.regexTrieReplaces);

            //Determinise only the trie that received the new rule
            if (addedToSourceTrie == true)
            {
                this.regexTrieSource = this.determiniser.determiniseNFA(this.regexTrieSource);
            }
            else if (addedToSourceTrie == false)
            {
                this.regexTrieReplaces = this.determiniser.determiniseNFA(this.regexTrieReplaces);
            }
        }
        else
        {
            if (File.Exists(this.Options.GlossaryFileName))
            {
                //The rule is written to the file before any lower-casing is applied
                using (TextWriter tw = new StreamWriter(this.Options.GlossaryFileName, true))
                {
                    tw.WriteLine();
                    tw.Write(newTerm[0] + fileDelimiter + newTerm[1] + fileDelimiter + newTerm[2]);
                }
            }
            else if (this.Options.GlossaryFileName != "" || this.Options.GlossaryFileName == null)
            {
                MessageBox.Show("Exact match rule file does not exist", "TermInjector");
            }

            //When case is not matched, the first and third fields are stored lower-cased
            if (!matchCaseToBool(this.Options.MatchCase))
            {
                newTerm[0] = newTerm[0].ToLower();
                newTerm[2] = newTerm[2].ToLower();
            }

            //Convert the unicode escape sequences in the new term
            List<string> unicodeParsedNewTerm = newTerm.Select(field => rx.Replace(field, unescape)).ToList();

            //Add term to normal or fuzzy trie
            this.trieLoader.addFieldsToTrie(unicodeParsedNewTerm, this.exactMatchTrieSource, this.exactMatchTrieReplaces);
        }
    }

    //Update the possible new tries to visitors
    initializeVisitors();
}
/// <summary>
/// Loads regular expression rules from a delimited text file into the two regex tries
/// given as parameters.
/// </summary>
/// <remarks>
/// Each line is split on <paramref name="delimiter"/> and unicode escape sequences
/// (\uXXXX or \UXXXX) are then converted to characters field by field. Lines with fewer
/// than two fields, or with both the first and the third field empty, are skipped. Loading stops
/// with a message once the accumulated length of the stored fields exceeds a fixed limit.
/// Any entries in <c>validationErrors</c> are passed to a <c>ValidationErrorForm</c> and the
/// list is cleared afterwards.
/// </remarks>
/// <param name="fileName">Path of the rule file. If it does not exist the method returns; a message is shown unless the name is empty.</param>
/// <param name="delimiter">Character separating the fields of a line.</param>
/// <param name="regexTrieSource">Regex trie used for visiting the source segment; replaced by its determinised form when rules were added to it.</param>
/// <param name="regexTrieReplaces">Fuzzy regex trie used for visiting the fuzzy match target segment; replaced by its determinised form when rules were added to it.</param>
public void loadRegexTrieFromFile(
    string fileName,
    char delimiter,
    ref RegexTrie<TranslationAndReplacement> regexTrieSource,
    ref RegexTrie<TranslationAndReplacement> regexTrieReplaces)
{
    //Check if file exists, exit method and show a message if it doesn't
    if (!File.Exists(fileName))
    {
        //If the file name is not empty, display alert
        if (fileName != "")
        {
            MessageBox.Show("Regular expression rule file does not exist", "TermInjector");
        }
        return;
    }

    //Upper bound for the summed length of stored fields, used as a proxy for memory usage
    const int maxStringMemoryUsage = 2500000;

    //Regex for converting unicode escape sequences to characters
    Regex rx = new Regex(@"\\[uU]([0-9a-fA-F]{4})");

    //Skips the two-character "\u" prefix and parses the remaining four hex digits
    MatchEvaluator unescape = match =>
        ((char)Int32.Parse(match.Value.Substring(2), NumberStyles.HexNumber)).ToString();

    //Counter for restricting glossary size
    int stringMemoryUsage = 0;

    //Counters for checking whether terms are being added
    int lineCount = 0;
    int termCount = 0;

    bool addedNormalRegexes = false;
    bool addedFuzzyRegexes = false;

    using (StreamReader sourceFile = File.OpenText(fileName))
    {
        while (!sourceFile.EndOfStream)
        {
            //Check if memory usage is within bounds
            if (stringMemoryUsage > maxStringMemoryUsage)
            {
                MessageBox.Show("Regular expression rule file loading stopped due to excessive size: Only part of the regular expression rule file has been loaded.", "TermInjector");
                break;
            }

            //Split the line before Unicode conversion (so as not to accidentally add separators),
            //then convert the unicode escape sequences in each field
            List<string> newTerm = sourceFile.ReadLine()
                .Split(delimiter)
                .Select(field => rx.Replace(field, unescape))
                .ToList();

            //Check whether the line was valid (at least two fields)
            if (newTerm.Count < 2)
            {
                lineCount++;
                continue;
            }

            //If length of list is two, add empty third field. This is done before the
            //emptiness check below so that the third field always exists.
            if (newTerm.Count == 2)
            {
                newTerm.Add("");
            }

            //If both first and third fields are empty, skip to next iteration.
            //(The original code tested the second field here, contradicting its own comment.)
            if (newTerm[0].Length == 0 && newTerm[2].Length == 0)
            {
                lineCount++;
                continue;
            }

            //Add the rule exactly once. The original code called addFieldsToRegexTrie a second
            //time in the else-if condition, which inserted replaces rules twice.
            var addedToSourceTrie = this.addFieldsToRegexTrie(newTerm, regexTrieSource, regexTrieReplaces);

            //Tally the proxy for memory usage, depending on whether source or replaces
            //field was used as path. Lines for which neither result is reported are not counted.
            if (addedToSourceTrie == true)
            {
                stringMemoryUsage += newTerm[0].Length;
                addedNormalRegexes = true;
                termCount++;
                lineCount++;
            }
            else if (addedToSourceTrie == false)
            {
                stringMemoryUsage += newTerm[2].Length;
                addedFuzzyRegexes = true;
                termCount++;
                lineCount++;
            }
        }
    }

    //Determinise the regex tries. determiniseNFA returns a new trie, which is why the
    //tries are passed with the ref keyword.
    if (addedNormalRegexes)
    {
        regexTrieSource = this.determiniser.determiniseNFA(regexTrieSource);
    }
    if (addedFuzzyRegexes)
    {
        regexTrieReplaces = this.determiniser.determiniseNFA(regexTrieReplaces);
    }

    //If the proportion of terms stored and lines read is skewed, the wrong delimiter may have been used.
    //NOTE(review): no minimum size is enforced, so a very small file with an empty line or two
    //can still trigger the message.
    if (lineCount - termCount > (lineCount / 2))
    {
        string delimiterUsed = delimiter == '\t' ? "Tab" : delimiter.ToString();
        MessageBox.Show(
            string.Format(
                "The amount of regular expression rules stored is small compared to the amount of lines read: {0} lines read, but only {1} regular expression rules found. Are you sure the delimiter character {2} is correct?",
                lineCount.ToString(),
                termCount.ToString(),
                delimiterUsed),
            "TermInjector");
    }

    if (this.validationErrors.Count > 0)
    {
        //NOTE(review): the form is only constructed here; presumably its constructor
        //displays it before the list is cleared - confirm.
        new ValidationErrorForm(this.validationErrors);
    }
    this.validationErrors.Clear();
}