/// <summary>
/// Writes a time-stamped log message to the console when the message level passes the active log level.
/// </summary>
/// <remarks>
/// The whole line is assembled first and written with a single console call, so that lines
/// produced by concurrently running threads cannot be interleaved fragment by fragment.
/// The time stamp is formatted with the invariant culture to keep the log layout stable
/// regardless of the regional settings of the machine.
/// </remarks>
/// <param name="message">Text of the log message.</param>
/// <param name="level">Level of the message; ERROR messages are written to standard error, all other levels to standard output.</param>
/// <param name="conf">Optional configuration whose <c>logLevel</c> is used for filtering. When <c>null</c>, the static <c>confLogLevel</c> is used instead.</param>
public static void Write(string message, LogLevelType level, MPAlignerConfiguration conf = null)
{
    bool enabled;
    if (conf == null)
    {
        enabled = level >= confLogLevel;
    }
    else
    {
        // A configuration whose level is NONE suppresses every message.
        enabled = level >= conf.logLevel && conf.logLevel != LogLevelType.NONE;
    }

    if (!enabled)
    {
        return;
    }

    string dateStr = DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss", System.Globalization.CultureInfo.InvariantCulture);
    string line = "[MPAligner] [" + level.ToString() + "] " + dateStr + " " + message;

    if (level != LogLevelType.ERROR)
    {
        Console.WriteLine(line);
    }
    else
    {
        Console.Error.WriteLine(line);
    }
}
/// <summary>
/// Writes a time-stamped log message to the console when the message level passes the active log level.
/// </summary>
/// <remarks>
/// The whole line is assembled first and written with a single console call, so that lines
/// produced by concurrently running threads cannot be interleaved fragment by fragment.
/// The time stamp is formatted with the invariant culture to keep the log layout stable
/// regardless of the regional settings of the machine.
/// </remarks>
/// <param name="message">Text of the log message.</param>
/// <param name="level">Level of the message; ERROR messages are written to standard error, all other levels to standard output.</param>
/// <param name="conf">Optional configuration whose <c>logLevel</c> is used for filtering. When <c>null</c>, the static <c>confLogLevel</c> is used instead.</param>
public static void Write(string message, LogLevelType level, MPAlignerConfiguration conf = null)
{
    bool enabled;
    if (conf == null)
    {
        enabled = level >= confLogLevel;
    }
    else
    {
        // A configuration whose level is NONE suppresses every message.
        enabled = level >= conf.logLevel && conf.logLevel != LogLevelType.NONE;
    }

    if (!enabled)
    {
        return;
    }

    string dateStr = DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss", System.Globalization.CultureInfo.InvariantCulture);
    string line = "[MPAligner] [" + level.ToString() + "] " + dateStr + " " + message;

    if (level != LogLevelType.ERROR)
    {
        Console.WriteLine(line);
    }
    else
    {
        Console.Error.WriteLine(line);
    }
}
/// <summary>
/// Returns the keys of the stopword list section of a configuration.
/// </summary>
/// <param name="configuration">Configuration whose <c>stopWordListEntryDict</c> keys are read.</param>
/// <returns>A new list that holds every key of <c>stopWordListEntryDict</c>.</returns>
public static List<string> GetLangsFromConf(MPAlignerConfiguration configuration)
{
    // The copy constructor enumerates the key collection exactly like the explicit loop would.
    return new List<string>(configuration.stopWordListEntryDict.Keys);
}
/// <summary>
/// Loads the configuration from a specified <c>inputFile</c>.
/// </summary>
/// <remarks>
/// The file is read as UTF-8 text and deserialized into a temporary
/// <c>MPAlignerConfiguration</c>; its settings are then copied into this instance.
/// </remarks>
/// <param name='inputFile'>
/// Path of the file that contains the serialized configuration.
/// </param>
public void Load(string inputFile)
{
    string serialized = File.ReadAllText(inputFile, Encoding.UTF8);
    MPAlignerConfiguration loaded = MPFrameworkFunctions.DeserializeString<MPAlignerConfiguration>(serialized);

    // Copy every setting of the deserialized object into this instance.
    this.dictConfEntryDict = loaded.dictConfEntryDict;
    this.mosesPath = loaded.mosesPath;
    this.translConfEntryDict = loaded.translConfEntryDict;
    this.keepTrackOfFiles = loaded.keepTrackOfFiles;
    this.logLevel = loaded.logLevel;
    this.forceEnDictInterlingua = loaded.forceEnDictInterlingua;
    this.forceEnTranslitInterlingua = loaded.forceEnTranslitInterlingua;
    this.outputFormat = loaded.outputFormat;
    this.excDictEntryDict = loaded.excDictEntryDict;
    this.allowTrimmedAlignments = loaded.allowTrimmedAlignments;
    this.stopWordListEntryDict = loaded.stopWordListEntryDict;
    this.langPairEntryDict = loaded.langPairEntryDict;
    this.alignmentThreads = loaded.alignmentThreads;
    this.useMultiThreadedExecution = loaded.useMultiThreadedExecution;
    this.printTopTrgForSrc = loaded.printTopTrgForSrc;
    this.concLen = loaded.concLen;
}
/// <summary>
/// Selects the transliteration configurations that will be used for a language pair.
/// </summary>
/// <remarks>
/// The lookup in <c>translConfEntryDict</c> proceeds in the following order:
/// 1) If <c>forceEnTranslitInterlingua</c> is set and both the "src_en" and the "trg_en" entries
///    exist and are enabled, these two entries are selected.
/// 2) Otherwise the direct entries "src_trg" and "trg_src" are looked up independently of each other.
/// 3) If neither direct entry is usable, the "src_en" and "trg_en" entries are tried again,
///    each one independently.
/// A warning is always logged when no configuration at all could be selected.
/// </remarks>
/// <param name="configuration">Configuration that holds <c>translConfEntryDict</c> and <c>forceEnTranslitInterlingua</c>.</param>
/// <param name="srcLang">Source language.</param>
/// <param name="trgLang">Target language.</param>
/// <param name="srcTranslitConf">Receives the enabled "src_en" entry, or <c>null</c>.</param>
/// <param name="trgTranslitConf">Receives the enabled "trg_en" entry, or <c>null</c>.</param>
/// <param name="srcToTrgTranslitConf">Receives the enabled "src_trg" entry, or <c>null</c>.</param>
/// <param name="trgToSrcTranslitConf">Receives the enabled "trg_src" entry, or <c>null</c>.</param>
/// <returns>
/// <c>true</c> if at least one EN interlingua entry was selected; <c>false</c> if direct entries
/// were selected or nothing was found.
/// </returns>
public static bool GetTranslitConfig(MPAlignerConfiguration configuration, string srcLang, string trgLang, out MPAlignerConfigurationTranslEntry srcTranslitConf, out MPAlignerConfigurationTranslEntry trgTranslitConf, out MPAlignerConfigurationTranslEntry srcToTrgTranslitConf, out MPAlignerConfigurationTranslEntry trgToSrcTranslitConf)
{
    Log.Write("Searching for transliteration configurations.", LogLevelType.LIMITED_OUTPUT, configuration);
    srcTranslitConf = null;
    trgTranslitConf = null;
    srcToTrgTranslitConf = null;
    trgToSrcTranslitConf = null;

    string srcLangKey = srcLang + "_en";
    string trgLangKey = trgLang + "_en";
    string langKey = srcLang + "_" + trgLang;
    string langKey2 = trgLang + "_" + srcLang;

    // Forced EN interlingua: both interlingua entries have to exist and be enabled.
    if (configuration.forceEnTranslitInterlingua)
    {
        if (configuration.translConfEntryDict.ContainsKey(srcLangKey)
            && configuration.translConfEntryDict.ContainsKey(trgLangKey)
            && configuration.translConfEntryDict[srcLangKey].use
            && configuration.translConfEntryDict[trgLangKey].use)
        {
            srcTranslitConf = configuration.translConfEntryDict[srcLangKey];
            trgTranslitConf = configuration.translConfEntryDict[trgLangKey];
            Log.Write("EN interlingua transliteration loaded for language " + srcLang + ": " + srcTranslitConf.mosesIniPath, LogLevelType.LIMITED_OUTPUT, configuration);
            Log.Write("EN interlingua transliteration loaded for language " + trgLang + ": " + trgTranslitConf.mosesIniPath, LogLevelType.LIMITED_OUTPUT, configuration);
            return true;
        }

        Log.Write("Cannot force EN interlingua transliteration for the pair " + langKey + " as at least one of the interlingua transliteration configurations is disabled or missing!", LogLevelType.WARNING, configuration);
        Log.Write("Will try falling back to direct transliteration without the EN interlingua.", LogLevelType.WARNING, configuration);
    }

    // Direct transliteration: the two directions are looked up independently.
    bool foundAtLeastOne = false;
    if (configuration.translConfEntryDict.ContainsKey(langKey) && configuration.translConfEntryDict[langKey].use)
    {
        srcToTrgTranslitConf = configuration.translConfEntryDict[langKey];
        Log.Write("Transliteration loaded for language " + srcLang + " into language " + trgLang + ": " + srcToTrgTranslitConf.mosesIniPath, LogLevelType.LIMITED_OUTPUT, configuration);
        foundAtLeastOne = true;
    }
    else
    {
        // A missing direct configuration is not fatal; log a warning and continue.
        Log.Write("Direct transliteration for the pair " + langKey + " was not found.", LogLevelType.WARNING, configuration);
    }

    if (configuration.translConfEntryDict.ContainsKey(langKey2) && configuration.translConfEntryDict[langKey2].use)
    {
        trgToSrcTranslitConf = configuration.translConfEntryDict[langKey2];
        Log.Write("Transliteration loaded for language " + trgLang + " into language " + srcLang + ": " + trgToSrcTranslitConf.mosesIniPath, LogLevelType.LIMITED_OUTPUT, configuration);
        foundAtLeastOne = true;
    }
    else
    {
        // A missing reverse configuration is not fatal either.
        Log.Write("Direct transliteration for the pair " + langKey2 + " was not found.", LogLevelType.WARNING, configuration);
    }

    if (foundAtLeastOne)
    {
        return false;
    }

    // Last resort: use whichever EN interlingua configurations are available.
    Log.Write("Direct transliteration for the pairs " + langKey + " nor " + langKey2 + " were not found. Will try falling back to interlingua transliteration.", LogLevelType.WARNING, configuration);
    bool interlinguaTranslitLoaded = false;
    if (configuration.translConfEntryDict.ContainsKey(srcLangKey) && configuration.translConfEntryDict[srcLangKey].use)
    {
        srcTranslitConf = configuration.translConfEntryDict[srcLangKey];
        interlinguaTranslitLoaded = true;
        Log.Write("EN interlingua transliteration loaded for language " + srcLang + ": " + srcTranslitConf.mosesIniPath, LogLevelType.LIMITED_OUTPUT, configuration);
    }
    if (configuration.translConfEntryDict.ContainsKey(trgLangKey) && configuration.translConfEntryDict[trgLangKey].use)
    {
        trgTranslitConf = configuration.translConfEntryDict[trgLangKey];
        interlinguaTranslitLoaded = true;
        Log.Write("EN interlingua transliteration loaded for language " + trgLang + ": " + trgTranslitConf.mosesIniPath, LogLevelType.LIMITED_OUTPUT, configuration);
    }

    if (interlinguaTranslitLoaded)
    {
        return true;
    }

    if (configuration.forceEnTranslitInterlingua)
    {
        Log.Write("Cannot force EN interlingua transliteration for the pair " + langKey + " as at least one of the interlingua transliteration configurations is disabled or missing!", LogLevelType.WARNING, configuration);
    }

    // Nothing could be selected: report it regardless of whether the interlingua was forced,
    // so that running without transliteration never happens silently.
    Log.Write("The system will be executed without transliteration.", LogLevelType.WARNING, configuration);
    return false;
}
/// <summary>
/// Reads the language pair specific configuration - term alignment thresholds.
/// </summary>
/// <remarks>
/// If the configuration has no entry for the pair, a default entry is created, stored in
/// <c>langPairEntryDict</c> of the passed configuration and returned.
/// </remarks>
/// <returns>The language pair configuration.</returns>
/// <param name="srcLang">Source language.</param>
/// <param name="trgLang">Target language.</param>
/// <param name="configuration">Configuration.</param>
static MPAlignerConfigurationLangPairEntry ReadLangPairConfig(string srcLang, string trgLang, MPAlignerConfiguration configuration)
{
    // 0.6 is usually the lowest value that is still reasonable for the cognate-based overlaps,
    // therefore it is used as the default. For different applications the threshold could be raised.
    const double defaultThreshold = 0.6;

    string langKey = srcLang + "_" + trgLang;
    if (configuration.langPairEntryDict.ContainsKey(langKey))
    {
        // A configured entry exists; no default entry needs to be constructed.
        return configuration.langPairEntryDict[langKey];
    }

    MPAlignerConfigurationLangPairEntry lpeConf = new MPAlignerConfigurationLangPairEntry();
    lpeConf.srcLang = srcLang;
    lpeConf.trgLang = trgLang;
    lpeConf.finalAlignmentThr = defaultThreshold;
    lpeConf.printThr = defaultThreshold;

    // Remember the default so that later lookups of the same pair return the same entry.
    configuration.langPairEntryDict.Add(langKey, lpeConf);
    return lpeConf;
}
/// <summary>
/// Loads the stopword list that is configured for a language.
/// </summary>
/// <param name="configuration">Configuration whose <c>stopWordListEntryDict</c> is consulted.</param>
/// <param name="lang">Key of the stopword list entry to load.</param>
/// <param name="stopwordDict">
/// Receives the parsed stopword list. It is an empty dictionary when no entry exists for
/// <paramref name="lang"/>, the entry is disabled, or parsing throws.
/// </param>
public static void ReadStopwordList(MPAlignerConfiguration configuration, string lang, out Dictionary<string, bool> stopwordDict)
{
    Log.Write("Searching for a stopword list for laguage " + lang + ".", LogLevelType.LIMITED_OUTPUT, configuration);
    stopwordDict = new Dictionary<string, bool>();

    bool isEnabled = configuration.stopWordListEntryDict.ContainsKey(lang)
        && configuration.stopWordListEntryDict[lang].use;
    if (!isEnabled)
    {
        Log.Write("Stopword list for laguage " + lang + " was not found or is disabled.", LogLevelType.WARNING, configuration);
        return;
    }

    try
    {
        stopwordDict = StopwordListParser.ParseStopwordList(configuration.stopWordListEntryDict[lang]);
        Log.Write("Stopword list for language " + lang + " loaded: " + configuration.stopWordListEntryDict[lang].path, LogLevelType.LIMITED_OUTPUT, configuration);
    }
    catch
    {
        // Any failure while parsing is reported as a warning; the caller keeps whatever was assigned so far.
        Log.Write("Stopword list for laguage " + lang + " was not found or is corrupted.", LogLevelType.WARNING, configuration);
    }
}
/// <summary>
/// Loads the exception dictionary that is configured for a language pair.
/// </summary>
/// <param name="configuration">Configuration whose <c>excDictEntryDict</c> is consulted.</param>
/// <param name="srcLang">Source language.</param>
/// <param name="trgLang">Target language.</param>
/// <param name="srcToTrgExcDict">
/// Receives the parsed exception dictionary. It is an empty dictionary when no entry exists for
/// the "src_trg" key, the entry is disabled, or parsing throws.
/// </param>
public static void ReadExceptionDictionary(MPAlignerConfiguration configuration, string srcLang, string trgLang, out Dictionary<string, Dictionary<string, bool>> srcToTrgExcDict)
{
    srcToTrgExcDict = new Dictionary<string, Dictionary<string, bool>>();
    string pairKey = srcLang + "_" + trgLang;
    Log.Write("Searching for an exception dictionary for the laguage pair " + pairKey + ".", LogLevelType.LIMITED_OUTPUT, configuration);

    bool isEnabled = configuration.excDictEntryDict.ContainsKey(pairKey)
        && configuration.excDictEntryDict[pairKey].use;
    if (!isEnabled)
    {
        Log.Write("The exception dictionary for the laguage pair " + pairKey + " was not found or is disabled.", LogLevelType.WARNING, configuration);
        return;
    }

    try
    {
        srcToTrgExcDict = ExceptionDictionaryParser.ParseExceptionDictionary(configuration.excDictEntryDict[pairKey]);
        Log.Write("Exception dictionary for the laguage pair " + pairKey + " loaded: " + configuration.excDictEntryDict[pairKey].path, LogLevelType.LIMITED_OUTPUT, configuration);
    }
    catch
    {
        // Any failure while parsing is reported as a warning; the caller keeps whatever was assigned so far.
        Log.Write("The exception dictionary for the laguage pair " + pairKey + " was not found or is corrupted.", LogLevelType.WARNING, configuration);
    }
}
/// <summary>
/// Searches for and reads the probabilistic dictionaries for a language pair.
/// </summary>
/// <remarks>
/// The entries of <c>dictConfEntryDict</c> are tried in the following order; the first step that
/// succeeds determines the result:
/// 1) Forced EN interlingua ("src_en" and "trg_en"), only if <c>forceEnDictInterlingua</c> is set
///    and both entries exist and are enabled.
/// 2) The direct "src_trg" dictionary. The reverse direction comes from the "trg_src" dictionary
///    if it is enabled and readable, otherwise from the inverted "src_trg" dictionary.
/// 3) The "trg_src" dictionary alone; the forward direction is its inverse.
/// 4) Whichever of the EN interlingua dictionaries can be loaded, each one independently.
/// Every failure is logged as a warning and the next step is tried.
/// </remarks>
/// <param name="configuration">Configuration that holds <c>dictConfEntryDict</c> and <c>forceEnDictInterlingua</c>.</param>
/// <param name="srcLang">Source language.</param>
/// <param name="trgLang">Target language.</param>
/// <param name="srcDict">Receives the "src_en" dictionary, or <c>null</c>.</param>
/// <param name="trgDict">Receives the "trg_en" dictionary, or <c>null</c>.</param>
/// <param name="srcToTrgDict">Receives the "src_trg" dictionary (loaded or inverted), or <c>null</c>.</param>
/// <param name="trgToSrcDict">Receives the "trg_src" dictionary (loaded or inverted), or <c>null</c>.</param>
/// <returns>
/// <c>true</c> if EN interlingua dictionaries are used; <c>false</c> if direct dictionaries are
/// used or no dictionary could be loaded.
/// </returns>
public static bool ReadDictionaries(MPAlignerConfiguration configuration, string srcLang, string trgLang, out Dictionary<string, Dictionary<string, double>> srcDict, out Dictionary<string, Dictionary<string, double>> trgDict, out Dictionary<string, Dictionary<string, double>> srcToTrgDict, out Dictionary<string, Dictionary<string, double>> trgToSrcDict)
{
    Log.Write("Searching for and reading dictionaries.", LogLevelType.LIMITED_OUTPUT, configuration);
    srcDict = null;
    trgDict = null;
    srcToTrgDict = null;
    trgToSrcDict = null;

    string srcLangKey = srcLang + "_en";
    string trgLangKey = trgLang + "_en";
    string langKey = srcLang + "_" + trgLang;
    string langKey2 = trgLang + "_" + srcLang;

    // Step 1: forced EN interlingua. Both dictionaries have to load, otherwise neither is kept.
    if (configuration.forceEnDictInterlingua
        && configuration.dictConfEntryDict.ContainsKey(srcLangKey)
        && configuration.dictConfEntryDict.ContainsKey(trgLangKey))
    {
        if (configuration.dictConfEntryDict[srcLangKey].use && configuration.dictConfEntryDict[trgLangKey].use)
        {
            try
            {
                srcDict = ProbabilisticDictionaryParser.ParseDictionary(configuration.dictConfEntryDict[srcLangKey]);
                trgDict = ProbabilisticDictionaryParser.ParseDictionary(configuration.dictConfEntryDict[trgLangKey]);
                ProbabilisticDictionaryParser.FilterTopEquivalents(configuration.dictConfEntryDict[srcLangKey], srcDict);
                ProbabilisticDictionaryParser.FilterTopEquivalents(configuration.dictConfEntryDict[trgLangKey], trgDict);
                Log.Write(srcLangKey + " dictionary with " + srcDict.Count.ToString() + " " + srcLang + " entries loaded: " + configuration.dictConfEntryDict[srcLangKey].path, LogLevelType.LIMITED_OUTPUT, configuration);
                Log.Write(trgLangKey + " dictionary with " + trgDict.Count.ToString() + " " + trgLang + " entries loaded: " + configuration.dictConfEntryDict[trgLangKey].path, LogLevelType.LIMITED_OUTPUT, configuration);
                return true; // Interlingua dictionary used.
            }
            catch
            {
                srcDict = null;
                trgDict = null;
                Log.Write("Cannot force EN interlingua dictionary usage for the pair " + langKey + " as one of the interlingua dictionaries may be missing or corrupt!", LogLevelType.WARNING, configuration);
                Log.Write("Will try fallback to direct dictionary without the EN interlingua.", LogLevelType.WARNING, configuration);
            }
        }
        else
        {
            Log.Write("Cannot force EN interlingua dictionary usage for the pair " + langKey + " as at least one of the interlingua dictionaries is disabled!", LogLevelType.WARNING, configuration);
            Log.Write("Will try fallback to direct dictionary without the EN interlingua.", LogLevelType.WARNING, configuration);
        }
    }

    // Step 2: direct dictionary. srcDict and trgDict are still null at this point.
    if (configuration.dictConfEntryDict.ContainsKey(langKey) && configuration.dictConfEntryDict[langKey].use)
    {
        try
        {
            srcToTrgDict = ProbabilisticDictionaryParser.ParseDictionary(configuration.dictConfEntryDict[langKey]);
            ProbabilisticDictionaryParser.FilterTopEquivalents(configuration.dictConfEntryDict[langKey], srcToTrgDict);

            // Prefer a real reverse dictionary; fall back to inverting the forward one.
            bool reverseLoaded = false;
            if (configuration.dictConfEntryDict.ContainsKey(langKey2) && configuration.dictConfEntryDict[langKey2].use)
            {
                try
                {
                    trgToSrcDict = ProbabilisticDictionaryParser.ParseDictionary(configuration.dictConfEntryDict[langKey2]);
                    ProbabilisticDictionaryParser.FilterTopEquivalents(configuration.dictConfEntryDict[langKey2], trgToSrcDict);
                    reverseLoaded = true;
                }
                catch
                {
                    Log.Write("Cannot read the dictionary for the pair " + langKey2 + "! The dictionary may be missing or corrupt! The " + langKey + " dictionary will be inverted.", LogLevelType.WARNING, configuration);
                }
            }
            else
            {
                Log.Write("For the pair " + langKey2 + " the inverted " + langKey + " dictionary will be used.", LogLevelType.WARNING, configuration);
            }

            if (!reverseLoaded)
            {
                trgToSrcDict = GetInverseDictionary(srcToTrgDict, configuration.dictConfEntryDict[langKey]);
                ProbabilisticDictionaryParser.FilterTopEquivalents(configuration.dictConfEntryDict[langKey], trgToSrcDict);
            }

            Log.Write(langKey + " dictionary with " + srcToTrgDict.Count.ToString() + " " + srcLang + " entries loaded: " + configuration.dictConfEntryDict[langKey].path, LogLevelType.LIMITED_OUTPUT, configuration);
            if (reverseLoaded)
            {
                Log.Write(langKey2 + " dictionary with " + trgToSrcDict.Count.ToString() + " " + trgLang + " entries loaded: " + configuration.dictConfEntryDict[langKey2].path, LogLevelType.LIMITED_OUTPUT, configuration);
            }
            else
            {
                Log.Write(langKey2 + " dictionary with " + trgToSrcDict.Count.ToString() + " " + trgLang + " entries loaded (inverse of): " + configuration.dictConfEntryDict[langKey].path, LogLevelType.LIMITED_OUTPUT, configuration);
            }
            return false;
        }
        catch
        {
            // The forward dictionary (or its inversion) failed; discard partial results.
            srcToTrgDict = null;
            trgToSrcDict = null;
            Log.Write("Cannot read the dictionary for the pair " + langKey + "! The dictionary may be missing or corrupt! The system will try to fall back to the inverse dictionary!", LogLevelType.WARNING, configuration);
        }
    }

    // Step 3: only the reverse dictionary is available; the forward direction is its inverse.
    if (configuration.dictConfEntryDict.ContainsKey(langKey2) && configuration.dictConfEntryDict[langKey2].use)
    {
        Log.Write("Direct dictionary for " + langKey + " missing or disabled. The " + langKey2 + " will be used instead.", LogLevelType.WARNING, configuration);
        try
        {
            trgToSrcDict = ProbabilisticDictionaryParser.ParseDictionary(configuration.dictConfEntryDict[langKey2]);
            srcToTrgDict = GetInverseDictionary(trgToSrcDict, configuration.dictConfEntryDict[langKey2]);
            ProbabilisticDictionaryParser.FilterTopEquivalents(configuration.dictConfEntryDict[langKey2], srcToTrgDict);
            ProbabilisticDictionaryParser.FilterTopEquivalents(configuration.dictConfEntryDict[langKey2], trgToSrcDict);
            // Here the forward dictionary is the derived one, so it carries the "(inverse of)" label.
            Log.Write(langKey + " dictionary with " + srcToTrgDict.Count.ToString() + " " + srcLang + " entries loaded (inverse of): " + configuration.dictConfEntryDict[langKey2].path, LogLevelType.LIMITED_OUTPUT, configuration);
            Log.Write(langKey2 + " dictionary with " + trgToSrcDict.Count.ToString() + " " + trgLang + " entries loaded: " + configuration.dictConfEntryDict[langKey2].path, LogLevelType.LIMITED_OUTPUT, configuration);
            return false;
        }
        catch
        {
            // Do not leak partially built direct dictionaries into the interlingua fallback.
            srcToTrgDict = null;
            trgToSrcDict = null;
            Log.Write("Cannot read the dictionary for the pair " + langKey2 + "! The dictionary may be disabled, missing or corrupt. The system will try to fall back to interlingua dictionaries!", LogLevelType.WARNING, configuration);
        }
    }

    // Step 4: non-forced EN interlingua; each side is loaded independently.
    bool usingInterlingua = false;
    if (configuration.dictConfEntryDict.ContainsKey(srcLangKey) && configuration.dictConfEntryDict[srcLangKey].use)
    {
        try
        {
            srcDict = ProbabilisticDictionaryParser.ParseDictionary(configuration.dictConfEntryDict[srcLangKey]);
            ProbabilisticDictionaryParser.FilterTopEquivalents(configuration.dictConfEntryDict[srcLangKey], srcDict);
            Log.Write(srcLangKey + " dictionary with " + srcDict.Count.ToString() + " " + srcLang + " entries loaded: " + configuration.dictConfEntryDict[srcLangKey].path, LogLevelType.LIMITED_OUTPUT, configuration);
            usingInterlingua = true;
        }
        catch
        {
            srcDict = null;
            Log.Write("Source-to-EN dictionary is missing or corrupt!", LogLevelType.WARNING, configuration);
        }
    }
    if (configuration.dictConfEntryDict.ContainsKey(trgLangKey) && configuration.dictConfEntryDict[trgLangKey].use)
    {
        try
        {
            trgDict = ProbabilisticDictionaryParser.ParseDictionary(configuration.dictConfEntryDict[trgLangKey]);
            ProbabilisticDictionaryParser.FilterTopEquivalents(configuration.dictConfEntryDict[trgLangKey], trgDict);
            Log.Write(trgLangKey + " dictionary with " + trgDict.Count.ToString() + " " + trgLang + " entries loaded: " + configuration.dictConfEntryDict[trgLangKey].path, LogLevelType.LIMITED_OUTPUT, configuration);
            usingInterlingua = true;
        }
        catch
        {
            trgDict = null;
            Log.Write("Target-to-EN dictionary is missing or corrupt!", LogLevelType.WARNING, configuration);
        }
    }

    if (usingInterlingua)
    {
        return true;
    }

    // Nothing could be loaded: the caller will run without any dictionary.
    Log.Write("At least one of the EN interlingua dictionaries is missing!", LogLevelType.WARNING, configuration);
    Log.Write("Dictionaries for the pair " + langKey + " were not found or loaded. The system will be executed without a dictionary!", LogLevelType.WARNING, configuration);
    srcDict = null;
    trgDict = null;
    srcToTrgDict = null;
    trgToSrcDict = null;
    return false; // Interlingua dictionary not used.
}
public static void Main(string[] args) { string configFile = null; string method = null; string inputFile = null; string inputFormat = "tagged_plaintext";//Allowed values: tagged_plaintext, preprocessed_terms, term_list string srcInputFile = null; string trgInputFile = null; string srcLang = null; string trgLang = null; string outputFile = null; string consolidatedOutputFile = null; string outputFormat = "";//"tabsep";//Allowed values: ref_tabsep, tabsep, xml string preProcessedTermOutputFile = null;//"/home/marcis/Dropbox/MonoProjects/MPAligner/MPAligner/bin/Debug/testTermData.xml";//null; string tempTranslitFile = null; bool consolidateResults = false; double consolidationThreshold = 0; //bool logPrepData = false; string domainId = ""; string collectionId = ""; //The skipping parameters are just for debugging. Use them only manually! string skipSrc = ""; string skipTrg = ""; MPAlignerConfiguration configuration = null; //Read all configuration parameters from the command line. for (int i=0; i<args.Length; i++) { if ((args [i] == "-c" || args [i] == "--configuration") && args.Length > i + 1) { configFile = args [i + 1]; configuration = new MPAlignerConfiguration (); configuration.Load (configFile); } else if ((args [i] == "-m" || args [i] == "--method") && args.Length > i + 1) { method = args [i + 1]; } else if ((args [i] == "-i" || args [i] == "--input-file") && args.Length > i + 1) { inputFile = args [i + 1]; //} else if (args [i] == "-lp" || args [i] == "--log-pre-processed") { // logPrepData = true; } else if ((args [i] == "-if" || args [i] == "--input-format") && args.Length > i + 1) { inputFormat = args [i + 1]; } else if ((args [i] == "-si" || args [i] == "--source-input") && args.Length > i + 1) { srcInputFile = args [i + 1]; } else if ((args [i] == "-ti" || args [i] == "--target-input") && args.Length > i + 1) { trgInputFile = args [i + 1]; } else if ((args [i] == "-sl" || args [i] == "--source-language") && args.Length > i + 1) { srcLang = 
MPFramework.MPFrameworkFunctions.GetValidLangString (args [i + 1]); } else if ((args [i] == "-tl" || args [i] == "--target-language") && args.Length > i + 1) { trgLang = MPFramework.MPFrameworkFunctions.GetValidLangString (args [i + 1]); } else if ((args [i] == "-o" || args [i] == "--output-file") && args.Length > i + 1) { outputFile = args [i + 1]; } else if ((args [i] == "-of" || args [i] == "--output-format") && args.Length > i + 1) { outputFormat = args [i + 1]; } else if ((args [i] == "-pto" || args [i] == "--pre-processed-term-output-file") && args.Length > i + 1) { preProcessedTermOutputFile = args [i + 1]; } else if ((args [i] == "-ttf" || args [i] == "--temp-translit-file") && args.Length > i + 1) { tempTranslitFile = args [i + 1]; } else if ((args [i] == "-ss" || args [i] == "--skip-source-file") && args.Length > i + 1) { skipSrc = args [i + 1]; } else if ((args [i] == "-st" || args [i] == "--skip-target-file") && args.Length > i + 1) { skipTrg = args [i + 1]; } else if ((args [i] == "-d_id" || args [i] == "--domain-id") && args.Length > i + 1) { domainId = args [i + 1]; } else if ((args [i] == "-c_id" || args [i] == "--collection-id") && args.Length > i + 1) { collectionId = args [i + 1]; } else if ((args [i] == "-ct" || args [i] == "--consolidation-threshold") && args.Length > i + 1) { //Consolidation works only if the ref_tabsep output format is specified! NumberFormatInfo nfi = new NumberFormatInfo (); nfi.CurrencyDecimalSeparator = "."; nfi.NumberDecimalSeparator = "."; nfi.PercentDecimalSeparator = "."; consolidationThreshold = Convert.ToDouble (args [i + 1], nfi); consolidateResults = true; } } //Break if a method is not defined. if (string.IsNullOrWhiteSpace (method)) { Log.Write ("Method not specified!",LogLevelType.ERROR,configuration); PrintUsage (); return; } //Write a configuration file to the output file if the config method is specified. 
if (method.ToLower () == "config") { if (string.IsNullOrWhiteSpace (outputFile)) { Log.Write("Output file not specified!",LogLevelType.ERROR,configuration); PrintUsage (); return; } MPAlignerConfiguration conf = new MPAlignerConfiguration (); MPAlignerConfigurationDictEntry cde = new MPAlignerConfigurationDictEntry (); cde.srcLang = "lv"; cde.trgLang = "en"; cde.path = "/home/marcis/TILDE/RESOURCES/DICT/lv_en_noisy"; conf.dictConfEntryDict.Add ("lv_en", cde); cde = new MPAlignerConfigurationDictEntry (); cde.srcLang = "lt"; cde.trgLang = "en"; cde.path = "/home/marcis/TILDE/RESOURCES/DICT/lt_en"; conf.dictConfEntryDict.Add ("lt_en", cde); MPAlignerConfigurationTranslEntry cte = new MPAlignerConfigurationTranslEntry (); cte.mosesIniPath = "/home/marcis/TILDE/RESOURCES/TRANSLIT_WORKING_DIR/LV-EN/lv-en-binarised-model.moses.ini"; cte.srcLang = "lv"; cte.trgLang = "en"; conf.translConfEntryDict.Add ("lv_en", cte); cte = new MPAlignerConfigurationTranslEntry (); cte.mosesIniPath = "/home/marcis/TILDE/RESOURCES/TRANSLIT_WORKING_DIR/LV-EN/lt-en-binarised-model.moses.ini"; cte.srcLang = "lt"; cte.trgLang = "en"; conf.translConfEntryDict.Add ("lt_en", cte); MPAlignerConfigurationLangPairEntry lpe = new MPAlignerConfigurationLangPairEntry (); lpe.srcLang = "lv"; lpe.trgLang = "en"; conf.langPairEntryDict.Add ("lv_en", lpe); lpe = new MPAlignerConfigurationLangPairEntry (); lpe.srcLang = "lt"; lpe.trgLang = "en"; conf.langPairEntryDict.Add ("lt_en", lpe); MPAlignerConfigurationExceptionEntry cee = new MPAlignerConfigurationExceptionEntry (); cee.srcLang = "lv"; cee.trgLang = "en"; cee.path = "/home/marcis/TILDE/RESOURCES/EXC_DICT/lv_en_exc"; conf.excDictEntryDict.Add ("lv_en", cee); cee = new MPAlignerConfigurationExceptionEntry (); cee.srcLang = "lt"; cee.trgLang = "en"; cee.path = "/home/marcis/TILDE/RESOURCES/EXC_DICT/lt_en_exc"; conf.excDictEntryDict.Add ("lt_en", cee); MPAlignerConfigurationStopWordListEntry cswle = new MPAlignerConfigurationStopWordListEntry (); 
cswle.lang = "lv"; cswle.path = "/home/marcis/TILDE/RESOURCES/STOP_WORD/lv_stop"; conf.stopWordListEntryDict.Add ("lv", cswle); cswle = new MPAlignerConfigurationStopWordListEntry (); cswle.lang = "lt"; cswle.path = "/home/marcis/TILDE/RESOURCES/STOP_WORD/lt_stop"; conf.stopWordListEntryDict.Add ("lt", cswle); cswle = new MPAlignerConfigurationStopWordListEntry (); cswle.lang = "en"; cswle.path = "/home/marcis/TILDE/RESOURCES/STOP_WORD/en_stop"; conf.stopWordListEntryDict.Add ("en", cswle); conf.Save (outputFile); return; } //Try reading the default configuration if none is passed, but if the default configuration can not be found, break. if (string.IsNullOrWhiteSpace (configFile) && File.Exists ("MPAlignerConfig.xml")) { configuration = new MPAlignerConfiguration (); configuration.Load (configFile); } else if (string.IsNullOrWhiteSpace (configFile)) { Log.Write("Configuration file missing in application directory and a substitution runtime configuration file is not specified!",LogLevelType.ERROR,configuration); PrintUsage (); return; } //In the case if an output format is not defined in the command line, read it from the configuration file. if (string.IsNullOrWhiteSpace (outputFormat)) outputFormat = configuration.outputFormat; //In the case if the configuration does not specify an output format, use the default output format. 
if (string.IsNullOrWhiteSpace (outputFormat)) { outputFormat = "ref_tabsep"; } Log.confLogLevel = configuration.logLevel; if (string.IsNullOrWhiteSpace (tempTranslitFile)) { tempTranslitFile = outputFile+".tmp"; } Log.Write ("configFile: "+(configFile!=null?configFile:""),LogLevelType.LIMITED_OUTPUT,configuration); Log.Write ("method: "+(method!=null?method:""),LogLevelType.LIMITED_OUTPUT,configuration); Log.Write ("inputFile: "+(inputFile!=null?inputFile:""),LogLevelType.LIMITED_OUTPUT,configuration); Log.Write ("inputFormat: "+(inputFormat!=null?inputFormat:""),LogLevelType.LIMITED_OUTPUT,configuration); Log.Write ("srcInputFile: "+(srcInputFile!=null?srcInputFile:""),LogLevelType.LIMITED_OUTPUT,configuration); Log.Write ("trgInputFile: "+(trgInputFile!=null?trgInputFile:""),LogLevelType.LIMITED_OUTPUT,configuration); Log.Write ("srcLang: "+(srcLang!=null?srcLang:""),LogLevelType.LIMITED_OUTPUT,configuration); Log.Write ("trgLang: "+(trgLang!=null?trgLang:""),LogLevelType.LIMITED_OUTPUT,configuration); Log.Write ("outputFile: "+(outputFile!=null?outputFile:""),LogLevelType.LIMITED_OUTPUT,configuration); Log.Write ("outputFormat: "+(outputFormat!=null?outputFormat:""),LogLevelType.LIMITED_OUTPUT,configuration); Log.Write ("preProcessedTermOutputFile: "+(preProcessedTermOutputFile!=null?preProcessedTermOutputFile:""),LogLevelType.LIMITED_OUTPUT,configuration); Log.Write ("tempTranslitFile: "+(tempTranslitFile!=null?tempTranslitFile:""),LogLevelType.LIMITED_OUTPUT,configuration); Log.Write ("consolidation threshold: "+(consolidateResults?consolidationThreshold.ToString():""),LogLevelType.LIMITED_OUTPUT,configuration); if (outputFormat == "ref_tabsep" && consolidateResults) { consolidatedOutputFile = outputFile; outputFile += ".raw"; } //For document pair-based alignment. 
if (method.ToLower () == "taggedfilepairs") { char[] sep = {'\t'}; if (string.IsNullOrWhiteSpace(inputFile)||!File.Exists(inputFile)) { Log.Write("Input file list file not specified or cannot be found!",LogLevelType.ERROR,configuration); PrintUsage(); return; } if (string.IsNullOrWhiteSpace(srcLang)||string.IsNullOrWhiteSpace(trgLang)) { Log.Write("Source and/or target languages not specified!",LogLevelType.ERROR,configuration); PrintUsage(); return; } //Read the alignment thresholds and other language pair specific numerical/single-value parameters. MPAlignerConfigurationLangPairEntry lpeConf = ReadLangPairConfig (srcLang, trgLang, configuration); //The size of the cache may affect the performance of the alignment! Dictionary<string, ProcessedTermEntry> srcTermCache = new Dictionary<string, ProcessedTermEntry>(); Dictionary<string, ProcessedTermEntry> trgTermCache = new Dictionary<string, ProcessedTermEntry>(); bool interlinguaDictUsed = false; bool interlinguaTranslitUsed = false; //Define dictionaries for pre-processing. Dictionary<string, Dictionary<string, double>> srcDict = null; Dictionary<string, Dictionary<string, double>> trgDict = null; Dictionary<string, Dictionary<string, double>> srcToTrgDict = null; Dictionary<string, Dictionary<string, double>> trgToSrcDict = null; //Define transliteration configurations for pre-processing. MPAlignerConfigurationTranslEntry srcTranslitConf = null; MPAlignerConfigurationTranslEntry trgTranslitConf = null; MPAlignerConfigurationTranslEntry srcToTrgTranslitConf = null; MPAlignerConfigurationTranslEntry trgToSrcTranslitConf = null; //Read dictionaries and transliterations. 
interlinguaDictUsed = ReadDictionaries(configuration,srcLang,trgLang, out srcDict, out trgDict, out srcToTrgDict, out trgToSrcDict); interlinguaTranslitUsed = GetTranslitConfig(configuration,srcLang,trgLang,out srcTranslitConf,out trgTranslitConf,out srcToTrgTranslitConf, out trgToSrcTranslitConf); //Define the alignments (the variable holding alignment results) Dictionary<string,Dictionary<string, AlignmentInfoElement>> alignments = new Dictionary<string, Dictionary<string, AlignmentInfoElement>>(); //Define and read exception dictionaries. Dictionary<string, Dictionary<string, bool>> excDict = null; ReadExceptionDictionary(configuration,srcLang, trgLang,out excDict); //Define and read stopword lists. Dictionary<string,bool> srcStopWords = null; ReadStopwordList(configuration,srcLang,out srcStopWords); Dictionary<string,bool> trgStopWords = null; ReadStopwordList(configuration,trgLang,out trgStopWords); StreamReader sr = new StreamReader(inputFile,Encoding.UTF8); int pairCounter = 0; bool skip = !string.IsNullOrWhiteSpace(skipSrc)&&!string.IsNullOrWhiteSpace(skipTrg)?true:false; //Read input document alignment file and process file pairs. while(!sr.EndOfStream) { pairCounter++; string line = sr.ReadLine().Trim(); if (string.IsNullOrWhiteSpace(line)) continue; string[] arr = line.Split(sep, StringSplitOptions.RemoveEmptyEntries); if (arr.Length<2) { continue; //If the alignment line does not contain at least two entries, the document alignment is not valid. } string srcFile = arr[0]; string trgFile = arr[1]; if (!File.Exists(srcFile)) { Log.Write("Input file \""+srcFile+"\" cannot be found!",LogLevelType.WARNING,configuration); continue; } if (!File.Exists(trgFile)) { Log.Write("Input file \""+trgFile+"\" cannot be found!",LogLevelType.WARNING,configuration); continue; } string srcFileName = Path.GetFileName(srcFile); string trgFileName = Path.GetFileName(trgFile); //The skipping condition is for debugging - if the system crashes due to insufficient memory... 
if (skip) { if (srcFileName==skipSrc&&trgFileName == skipTrg) { skip = false; } else { Log.Write("Skipping file pair "+srcFileName+" and " + trgFileName+".",LogLevelType.WARNING,configuration); continue; } } Log.Write("Processing file pair "+srcFileName+" and " + trgFileName+".",LogLevelType.LIMITED_OUTPUT,configuration); //Define term entry data variables (used for sotring terms in pre-pre-processed and pre-processed states). Dictionary<string,SimpleTermEntry> srcInitialList = new Dictionary<string, SimpleTermEntry>(); Dictionary<string,SimpleTermEntry> trgInitialList = new Dictionary<string, SimpleTermEntry>(); Dictionary<string,SimpleTermEntry> srcInitialTempList = new Dictionary<string, SimpleTermEntry>(); Dictionary<string,SimpleTermEntry> trgInitialTempList = new Dictionary<string, SimpleTermEntry>(); Dictionary<string, ProcessedTermEntry> srcTermList = new Dictionary<string, ProcessedTermEntry>(); Dictionary<string, ProcessedTermEntry> trgTermList = new Dictionary<string, ProcessedTermEntry>(); Dictionary<string, ProcessedTermEntry> srcTermTempList = new Dictionary<string, ProcessedTermEntry>(); Dictionary<string, ProcessedTermEntry> trgTermTempList = new Dictionary<string, ProcessedTermEntry>(); //Two input formats are currently supported - term-tagged plaintext files and term list (one term per line) files. if (inputFormat=="tagged_plaintext") { //Read terms from the term-tagged documents. srcInitialTempList = TermTaggedFileParser.ParseTermTaggedFile(srcFile,Encoding.UTF8, configuration.concLen); trgInitialTempList = TermTaggedFileParser.ParseTermTaggedFile(trgFile,Encoding.UTF8, configuration.concLen); } else { //Read terms from the term list files. srcInitialTempList = ListFileParser.Parse(srcFile,Encoding.UTF8); trgInitialTempList = ListFileParser.Parse(trgFile,Encoding.UTF8); } //Search for already pre-processed source terms in the cache. 
foreach(string term in srcInitialTempList.Keys) { string lower = term.ToLower(); if (srcTermCache.ContainsKey(lower)) { if (!srcTermList.ContainsKey(lower)) srcTermList.Add(lower, srcTermCache[lower]); } else { srcInitialList.Add(term, srcInitialTempList[term]); } } //Search for already pre-processed target terms in the cache. foreach(string term in trgInitialTempList.Keys) { string lower = term.ToLower(); if (trgTermCache.ContainsKey(lower)) { if (!trgTermList.ContainsKey(lower)) trgTermList.Add(lower, trgTermCache[lower]); } else { trgInitialList.Add(term, trgInitialTempList[term]); } } //Now pre-process terms that have not been pre-processed again. if (srcDict!=null||trgDict!=null) { if (srcTranslitConf!=null && trgTranslitConf!=null) { srcTermTempList = ProcessedTermEntry.ProcessTerms(srcInitialList,srcDict,srcLang,srcTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); trgTermTempList = ProcessedTermEntry.ProcessTerms(trgInitialList,trgDict,trgLang,trgTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); } else { srcTermTempList = ProcessedTermEntry.ProcessTerms(srcInitialList,srcDict,srcLang,srcToTrgTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); trgTermTempList = ProcessedTermEntry.ProcessTerms(trgInitialList,trgDict,trgLang,trgToSrcTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); } } else { if (srcTranslitConf!=null && trgTranslitConf!=null) { srcTermTempList = ProcessedTermEntry.ProcessTerms(srcInitialList,srcToTrgDict,srcLang,srcTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); trgTermTempList = ProcessedTermEntry.ProcessTerms(trgInitialList,trgToSrcDict,trgLang,trgTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); } else { srcTermTempList = ProcessedTermEntry.ProcessTerms(srcInitialList,srcToTrgDict,srcLang,srcToTrgTranslitConf, 
configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); trgTermTempList = ProcessedTermEntry.ProcessTerms(trgInitialList,trgToSrcDict,trgLang,trgToSrcTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); } } //Update the pre-processed term list for alignment. foreach(string srcTerm in srcTermTempList.Keys) { if (!srcTermList.ContainsKey(srcTerm)) srcTermList.Add(srcTerm,srcTermTempList[srcTerm]); if (!srcTermCache.ContainsKey(srcTerm)) srcTermCache.Add(srcTerm, srcTermTempList[srcTerm]); } foreach(string trgTerm in trgTermTempList.Keys) { if (!trgTermList.ContainsKey(trgTerm)) trgTermList.Add(trgTerm,trgTermTempList[trgTerm]); if (!trgTermCache.ContainsKey(trgTerm)) trgTermCache.Add(trgTerm, trgTermTempList[trgTerm]); } //Execute alignment for one file pair. List<AlignmentInfoElement> alignment = new List<AlignmentInfoElement>(); //The execution may be multi-threaded or single-threaded. The multi-threaded execution may be instable. Therefore, be careful when using multi-threading. 
if (configuration.useMultiThreadedExecution) { alignment = Alignment.AlignPairsMultiThreaded(configuration,srcTermList,trgTermList,interlinguaDictUsed,interlinguaTranslitUsed,srcLang,trgLang, srcFile, trgFile, excDict, srcStopWords, trgStopWords); } else { alignment = Alignment.AlignPairs(configuration,srcTermList,trgTermList,interlinguaDictUsed,interlinguaTranslitUsed,srcLang,trgLang, srcFile, trgFile, excDict, srcStopWords, trgStopWords); } if (alignment!=null) { foreach(AlignmentInfoElement aie in alignment) { if (!alignments.ContainsKey(aie.srcEntry.lowercaceForm)) { alignments.Add(aie.srcEntry.lowercaceForm, new Dictionary<string, AlignmentInfoElement>()); } if (!alignments[aie.srcEntry.lowercaceForm].ContainsKey(aie.trgEntry.lowercaceForm)) { alignments[aie.srcEntry.lowercaceForm].Add(aie.trgEntry.lowercaceForm, aie); } } } //If pre-processed term cache is full, empty it (this maybe can be imrpoved with the help of some sort of a flowing cache (always circulating). if (srcTermCache.Count>50000) { srcTermCache.Clear(); srcTermCache = new Dictionary<string, ProcessedTermEntry>(); GC.Collect(); GC.WaitForPendingFinalizers(); } if (trgTermCache.Count>50000) { trgTermCache.Clear(); trgTermCache = new Dictionary<string, ProcessedTermEntry>(); GC.Collect(); GC.WaitForPendingFinalizers(); } //After each 50 pairs, print rsults. 
if (pairCounter%50==0||alignments.Count>50000) { Log.Write("Printing intermediate results after "+pairCounter.ToString()+" file pairs",LogLevelType.LIMITED_OUTPUT,configuration); List<AlignmentInfoElement> resAlignment = new List<AlignmentInfoElement>(); foreach(string src in alignments.Keys) { foreach(string trg in alignments[src].Keys) { resAlignment.Add(alignments[src][trg]); } } AlignmentInfoElement.AppendList(outputFormat,outputFile,resAlignment,lpeConf,srcLang,trgLang,collectionId,domainId); alignments.Clear(); alignments = new Dictionary<string, Dictionary<string, AlignmentInfoElement>>(); GC.Collect(); GC.WaitForPendingFinalizers(); } } sr.Close(); //If there are alignments left, write them to the output file. if (!string.IsNullOrWhiteSpace(outputFile)) { Log.Write("Printing final results after "+pairCounter.ToString()+" file pairs",LogLevelType.LIMITED_OUTPUT,configuration); List<AlignmentInfoElement> resAlignment = new List<AlignmentInfoElement>(); foreach(string src in alignments.Keys) { foreach(string trg in alignments[src].Keys) { resAlignment.Add(alignments[src][trg]); } } AlignmentInfoElement.AppendList(outputFormat,outputFile,resAlignment,lpeConf,srcLang,trgLang,collectionId,domainId); } } else if (method.ToLower () == "singletaggedpair") //TODO: REFACTOR (the file pair list processing could be handled (wisely) through a single file pair processing method!!! { //Define the instances of source and target processed term lists. 
Dictionary<string, ProcessedTermEntry> srcTermList = new Dictionary<string, ProcessedTermEntry>(); Dictionary<string, ProcessedTermEntry> trgTermList = new Dictionary<string, ProcessedTermEntry>(); bool interlinguaDictUsed = false; bool interlinguaTranslitUsed = false; if (inputFormat=="preprocessed_terms") { if (string.IsNullOrWhiteSpace(inputFile)||!File.Exists(inputFile)) { Log.Write("Pre-processed term input file not specified or cannot be found!",LogLevelType.ERROR,configuration); PrintUsage(); return; } if (string.IsNullOrWhiteSpace(srcLang)||string.IsNullOrWhiteSpace(trgLang)) { Log.Write("Source and/or target languages not specified!",LogLevelType.ERROR,configuration); PrintUsage(); return; } PreprocessedTermData ptd = PreprocessedTermData.ReadFromFile(inputFile); foreach(ProcessedTermEntry pte in ptd.srcTerms) { if(!srcTermList.ContainsKey(pte.lowercaceForm)) { srcTermList.Add(pte.lowercaceForm,pte); } } foreach(ProcessedTermEntry pte in ptd.trgTerms) { if(!trgTermList.ContainsKey(pte.lowercaceForm)) { trgTermList.Add(pte.lowercaceForm,pte); } } srcLang = ptd.srcLang; trgLang = ptd.trgLang; interlinguaDictUsed = ptd.interlinguaDictUsed; interlinguaTranslitUsed = ptd.interlinguaTranslitUsed; Dictionary<string, Dictionary<string, bool>> excDict = null; ReadExceptionDictionary(configuration,srcLang, trgLang,out excDict); Dictionary<string,bool> srcStopWords = null; ReadStopwordList(configuration,srcLang,out srcStopWords); Dictionary<string,bool> trgStopWords = null; ReadStopwordList(configuration,trgLang,out trgStopWords); if (!string.IsNullOrWhiteSpace(outputFile)) { List<AlignmentInfoElement> alignment = new List<AlignmentInfoElement>(); if (configuration.useMultiThreadedExecution) { alignment = Alignment.AlignPairsMultiThreaded(configuration,srcTermList,trgTermList,interlinguaDictUsed,interlinguaTranslitUsed,srcLang,trgLang, srcInputFile, trgInputFile, excDict, srcStopWords, trgStopWords); } else { alignment = 
Alignment.AlignPairs(configuration,srcTermList,trgTermList,interlinguaDictUsed,interlinguaTranslitUsed,srcLang,trgLang, srcInputFile, trgInputFile, excDict, srcStopWords, trgStopWords); } AlignmentInfoElement.PrintList(outputFormat,outputFile,alignment, configuration.printTopTrgForSrc,null,srcLang,trgLang,collectionId,domainId); } } else if (inputFormat=="term_list"||inputFormat=="tagged_plaintext") { if (string.IsNullOrWhiteSpace(srcInputFile)||!File.Exists(srcInputFile)||string.IsNullOrWhiteSpace(trgInputFile)||!File.Exists(trgInputFile)) { Log.Write("Source and/or target files not specified or cannot be found!",LogLevelType.ERROR,configuration); PrintUsage(); return; } if (string.IsNullOrWhiteSpace(srcLang)||string.IsNullOrWhiteSpace(trgLang)) { Log.Write("Source and/or target languages not specified!",LogLevelType.ERROR,configuration); PrintUsage(); return; } Dictionary<string,SimpleTermEntry> srcInitialList = new Dictionary<string, SimpleTermEntry>(); Dictionary<string,SimpleTermEntry> trgInitialList = new Dictionary<string, SimpleTermEntry>(); if (inputFormat=="tagged_plaintext") { srcInitialList = TermTaggedFileParser.ParseTermTaggedFile(srcInputFile,Encoding.UTF8, configuration.concLen); trgInitialList = TermTaggedFileParser.ParseTermTaggedFile(trgInputFile,Encoding.UTF8, configuration.concLen); } else { srcInitialList = ListFileParser.Parse(srcInputFile,Encoding.UTF8); trgInitialList = ListFileParser.Parse(trgInputFile,Encoding.UTF8); } Log.Write ("Unprocessed source terms: "+srcInitialList.Count.ToString(),LogLevelType.LIMITED_OUTPUT,configuration); Log.Write ("Unprocessed target terms: "+trgInitialList.Count.ToString(),LogLevelType.LIMITED_OUTPUT,configuration); Dictionary<string, Dictionary<string, double>> srcDict = null; Dictionary<string, Dictionary<string, double>> trgDict = null; Dictionary<string, Dictionary<string, double>> srcToTrgDict = null; Dictionary<string, Dictionary<string, double>> trgToSrcDict = null; MPAlignerConfigurationTranslEntry 
srcTranslitConf = null; MPAlignerConfigurationTranslEntry trgTranslitConf = null; MPAlignerConfigurationTranslEntry srcToTrgTranslitConf = null; MPAlignerConfigurationTranslEntry trgToSrcTranslitConf = null; interlinguaDictUsed = ReadDictionaries(configuration,srcLang,trgLang, out srcDict, out trgDict, out srcToTrgDict, out trgToSrcDict); interlinguaTranslitUsed = GetTranslitConfig(configuration,srcLang,trgLang,out srcTranslitConf,out trgTranslitConf,out srcToTrgTranslitConf, out trgToSrcTranslitConf); if (srcDict!=null||trgDict!=null) { if (srcTranslitConf!=null && trgTranslitConf!=null) { srcTermList = ProcessedTermEntry.ProcessTerms(srcInitialList,srcDict,srcLang,srcTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); trgTermList = ProcessedTermEntry.ProcessTerms(trgInitialList,trgDict,trgLang,trgTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); } else { srcTermList = ProcessedTermEntry.ProcessTerms(srcInitialList,srcDict,srcLang,srcToTrgTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); trgTermList = ProcessedTermEntry.ProcessTerms(trgInitialList,trgDict,trgLang,trgToSrcTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); } } else { if (srcTranslitConf!=null && trgTranslitConf!=null) { srcTermList = ProcessedTermEntry.ProcessTerms(srcInitialList,srcToTrgDict,srcLang,srcTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); trgTermList = ProcessedTermEntry.ProcessTerms(trgInitialList,trgToSrcDict,trgLang,trgTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); } else { srcTermList = ProcessedTermEntry.ProcessTerms(srcInitialList,srcToTrgDict,srcLang,srcToTrgTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); trgTermList = ProcessedTermEntry.ProcessTerms(trgInitialList,trgToSrcDict,trgLang,trgToSrcTranslitConf, 
configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); } } Log.Write ("Pre-processed source terms: "+srcTermList.Count.ToString(),LogLevelType.LIMITED_OUTPUT,configuration); Log.Write ("Pre-processed target terms: "+trgTermList.Count.ToString(),LogLevelType.LIMITED_OUTPUT,configuration); ///If pre-processed terms should be saved for future use an output format is created. /// This functionality is not available for the file pair list-based processing. if (!string.IsNullOrWhiteSpace(preProcessedTermOutputFile)) { List<ProcessedTermEntry> srcTerms = new List<ProcessedTermEntry>(srcTermList.Values); List<ProcessedTermEntry> trgTerms = new List<ProcessedTermEntry>(trgTermList.Values); PreprocessedTermData ptd = new PreprocessedTermData(); ptd.interlinguaDictUsed = interlinguaDictUsed; ptd.interlinguaTranslitUsed = interlinguaTranslitUsed; ptd.srcTerms = srcTerms.ToArray(); ptd.trgTerms = trgTerms.ToArray(); ptd.srcLang = srcLang; ptd.trgLang = trgLang; string outStr = MPFramework.MPFrameworkFunctions.SerializeObjectInstance<PreprocessedTermData>(ptd); File.WriteAllText(preProcessedTermOutputFile,outStr); } Dictionary<string, Dictionary<string, bool>> excDict = null; ReadExceptionDictionary(configuration,srcLang, trgLang,out excDict); Dictionary<string,bool> srcStopWords = null; ReadStopwordList(configuration,srcLang,out srcStopWords); Dictionary<string,bool> trgStopWords = null; ReadStopwordList(configuration,trgLang,out trgStopWords); if (!string.IsNullOrWhiteSpace(outputFile)) { List<AlignmentInfoElement> alignment = new List<AlignmentInfoElement>(); if (configuration.useMultiThreadedExecution) { alignment = Alignment.AlignPairsMultiThreaded(configuration,srcTermList,trgTermList,interlinguaDictUsed,interlinguaTranslitUsed,srcLang,trgLang, srcInputFile, trgInputFile, excDict, srcStopWords, trgStopWords); } else { alignment = Alignment.AlignPairs(configuration,srcTermList,trgTermList,interlinguaDictUsed,interlinguaTranslitUsed,srcLang,trgLang, 
srcInputFile, trgInputFile, excDict, srcStopWords, trgStopWords); } AlignmentInfoElement.PrintList(outputFormat,outputFile,alignment, configuration.printTopTrgForSrc,null,srcLang,trgLang,collectionId,domainId); } } else { Log.Write ("Input format UNKNOWN or UNDEFINED.",LogLevelType.ERROR,configuration); return; } } else if (method.ToLower () == "singletermpairlist") //Use this method only if filtering of term pairs or some sort of evaluation is necessary! { //Define the instances of source and target processed term lists. List<ProcessedTermEntry> srcTermList = new List<ProcessedTermEntry>(); List<ProcessedTermEntry> trgTermList = new List<ProcessedTermEntry>(); bool interlinguaDictUsed = false; bool interlinguaTranslitUsed = false; if (inputFormat=="preprocessed_terms") { if (string.IsNullOrWhiteSpace(inputFile)||!File.Exists(inputFile)) { Log.Write("Pre-processed term input file not specified or cannot be found!",LogLevelType.ERROR,configuration); PrintUsage(); return; } if (string.IsNullOrWhiteSpace(srcLang)||string.IsNullOrWhiteSpace(trgLang)) { Log.Write("Source and/or target languages not specified!",LogLevelType.ERROR,configuration); PrintUsage(); return; } PreprocessedTermData ptd = PreprocessedTermData.ReadFromFile(inputFile); srcTermList.AddRange(ptd.srcTerms); trgTermList.AddRange(ptd.trgTerms); Log.Write ("Pre-processed source terms: "+srcTermList.Count.ToString(),LogLevelType.LIMITED_OUTPUT,configuration); Log.Write ("Pre-processed target terms: "+trgTermList.Count.ToString(),LogLevelType.LIMITED_OUTPUT,configuration); srcLang = ptd.srcLang; trgLang = ptd.trgLang; interlinguaDictUsed = ptd.interlinguaDictUsed; interlinguaTranslitUsed = ptd.interlinguaTranslitUsed; Dictionary<string, Dictionary<string, bool>> excDict = null; ReadExceptionDictionary(configuration,srcLang, trgLang,out excDict); Log.Write ("Exception dictionary entries: "+excDict.Count.ToString(),LogLevelType.LIMITED_OUTPUT,configuration); Dictionary<string,bool> srcStopWords = null; 
ReadStopwordList(configuration,srcLang,out srcStopWords); Log.Write ("Source language stopwords: "+srcStopWords.Count.ToString(),LogLevelType.LIMITED_OUTPUT,configuration); Dictionary<string,bool> trgStopWords = null; ReadStopwordList(configuration,trgLang,out trgStopWords); Log.Write ("Target language stopwords: "+trgStopWords.Count.ToString(),LogLevelType.LIMITED_OUTPUT,configuration); if (!string.IsNullOrWhiteSpace(outputFile)) { List<AlignmentInfoElement> alignment = Alignment.AlignListPairs(configuration,srcTermList,trgTermList,interlinguaDictUsed,interlinguaTranslitUsed,srcLang,trgLang, srcInputFile, trgInputFile, excDict, srcStopWords, trgStopWords); Log.Write ("Alignment elements after alignment: "+alignment.Count.ToString(),LogLevelType.LIMITED_OUTPUT,configuration); AlignmentInfoElement.PrintList(outputFormat,outputFile,alignment, configuration.printTopTrgForSrc,null,srcLang,trgLang,collectionId,domainId); } } else { if (string.IsNullOrWhiteSpace(srcInputFile)||!File.Exists(srcInputFile)||string.IsNullOrWhiteSpace(trgInputFile)||!File.Exists(trgInputFile)) { Log.Write("Source and/or target files not specified or cannot be found!",LogLevelType.ERROR,configuration); PrintUsage(); return; } if (string.IsNullOrWhiteSpace(srcLang)||string.IsNullOrWhiteSpace(trgLang)) { Log.Write("Source and/or target languages not specified!",LogLevelType.ERROR,configuration); PrintUsage(); return; } List<string> srcInitialList = new List<string>(); List<string> trgInitialList = new List<string>(); srcInitialList = ListFileParser.ParseList(srcInputFile,Encoding.UTF8); trgInitialList = ListFileParser.ParseList(trgInputFile,Encoding.UTF8); if (srcInitialList.Count!=trgInitialList.Count) { Log.Write("Source and target term lists are with different lengths",LogLevelType.ERROR,configuration); throw new ArgumentException("Source and target term lists are with different lengths"); } Log.Write ("Unprocessed source terms: 
"+srcInitialList.Count.ToString(),LogLevelType.LIMITED_OUTPUT,configuration); Log.Write ("Unprocessed target terms: "+trgInitialList.Count.ToString(),LogLevelType.LIMITED_OUTPUT,configuration); Dictionary<string, Dictionary<string, double>> srcDict = null; Dictionary<string, Dictionary<string, double>> trgDict = null; Dictionary<string, Dictionary<string, double>> srcToTrgDict = null; Dictionary<string, Dictionary<string, double>> trgToSrcDict = null; MPAlignerConfigurationTranslEntry srcTranslitConf = null; MPAlignerConfigurationTranslEntry trgTranslitConf = null; MPAlignerConfigurationTranslEntry srcToTrgTranslitConf = null; MPAlignerConfigurationTranslEntry trgToSrcTranslitConf = null; interlinguaDictUsed = ReadDictionaries(configuration,srcLang,trgLang, out srcDict, out trgDict, out srcToTrgDict, out trgToSrcDict); interlinguaTranslitUsed = GetTranslitConfig(configuration,srcLang,trgLang,out srcTranslitConf,out trgTranslitConf,out srcToTrgTranslitConf, out trgToSrcTranslitConf); if (srcDict!=null||trgDict!=null) { if (srcTranslitConf!=null && trgTranslitConf!=null) { srcTermList = ProcessedTermEntry.ProcessTermsList(srcInitialList,srcDict,srcLang,srcTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); trgTermList = ProcessedTermEntry.ProcessTermsList(trgInitialList,trgDict,trgLang,trgTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); } else { srcTermList = ProcessedTermEntry.ProcessTermsList(srcInitialList,srcDict,srcLang,srcToTrgTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); trgTermList = ProcessedTermEntry.ProcessTermsList(trgInitialList,trgDict,trgLang,trgToSrcTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); } } else { if (srcTranslitConf!=null && trgTranslitConf!=null) { srcTermList = ProcessedTermEntry.ProcessTermsList(srcInitialList,srcToTrgDict,srcLang,srcTranslitConf, configuration.mosesPath, 
tempTranslitFile,configuration.alignmentThreads); trgTermList = ProcessedTermEntry.ProcessTermsList(trgInitialList,trgToSrcDict,trgLang,trgTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); } else { srcTermList = ProcessedTermEntry.ProcessTermsList(srcInitialList,srcToTrgDict,srcLang,srcToTrgTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); trgTermList = ProcessedTermEntry.ProcessTermsList(trgInitialList,trgToSrcDict,trgLang,trgToSrcTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); } } Log.Write ("Pre-processed source terms: "+srcTermList.Count.ToString(),LogLevelType.LIMITED_OUTPUT,configuration); Log.Write ("Pre-processed target terms: "+trgTermList.Count.ToString(),LogLevelType.LIMITED_OUTPUT,configuration); ///If pre-processed terms should be saved for future use an output format is created. if (!string.IsNullOrWhiteSpace(preProcessedTermOutputFile)) { PreprocessedTermData ptd = new PreprocessedTermData(); ptd.interlinguaDictUsed = interlinguaDictUsed; ptd.interlinguaTranslitUsed = interlinguaTranslitUsed; ptd.srcTerms = srcTermList.ToArray(); ptd.trgTerms = trgTermList.ToArray(); ptd.srcLang = srcLang; ptd.trgLang = trgLang; string outStr = MPFramework.MPFrameworkFunctions.SerializeObjectInstance<PreprocessedTermData>(ptd); File.WriteAllText(preProcessedTermOutputFile,outStr); } Dictionary<string, Dictionary<string, bool>> excDict = null; ReadExceptionDictionary(configuration,srcLang, trgLang,out excDict); Dictionary<string,bool> srcStopWords = null; ReadStopwordList(configuration,srcLang,out srcStopWords); Dictionary<string,bool> trgStopWords = null; ReadStopwordList(configuration,trgLang,out trgStopWords); if (!string.IsNullOrWhiteSpace(outputFile)) { List<AlignmentInfoElement> alignment = Alignment.AlignListPairs(configuration,srcTermList,trgTermList,interlinguaDictUsed,interlinguaTranslitUsed,srcLang,trgLang, srcInputFile, trgInputFile, 
excDict, srcStopWords, trgStopWords); AlignmentInfoElement.PrintList(outputFormat,outputFile,alignment, configuration.printTopTrgForSrc,null,srcLang,trgLang,collectionId,domainId); } } } else if (method.ToLower () == "eurovoceval") { if (string.IsNullOrWhiteSpace(inputFile)||!File.Exists(inputFile)) { Log.Write("Eurovoc input file not specified or cannot be found!",LogLevelType.ERROR,configuration); PrintUsage(); return; } if (string.IsNullOrWhiteSpace(srcLang)||string.IsNullOrWhiteSpace(trgLang)) { Log.Write("Source or target language not specified!",LogLevelType.ERROR,configuration); PrintUsage(); return; } configuration.allowTrimmedAlignments = false; //configuration.useMultiThreadedExecution = false; configuration.printTopTrgForSrc = true; string logFile = outputFile+".res.log"; StreamWriter sw = new StreamWriter(logFile, true, Encoding.UTF8); Dictionary<string,List<string>> eurovocDict = ReadEurovocDict(inputFile);; //List<string> langList = GetLangsFromConf(configuration); //for(int i = 0;i<langList.Count;i++) //{ //for(int j = 0;j<langList.Count;j++) //{ //if (i==j) continue; //srcLang = langList[i]; //trgLang = langList[j]; Log.Write("Processing pair "+srcLang+"_"+trgLang,LogLevelType.LIMITED_OUTPUT,configuration); if (Char.IsDigit(outputFile[outputFile.Length-1])) outputFile = outputFile.Substring(0,outputFile.Length-1); string alignmentOutputFile = outputFile+"."+srcLang+"_"+trgLang+".align.txt"; if (File.Exists(alignmentOutputFile)) { Log.Write("Pair "+srcLang+"_"+trgLang+" already processed! 
Evaluating...",LogLevelType.LIMITED_OUTPUT,configuration); List<StringComparisonElement> terms = new List<StringComparisonElement>(); StreamReader sr = new StreamReader(alignmentOutputFile,Encoding.UTF8); char[] sep = {'\t'}; NumberFormatInfo nfi = new NumberFormatInfo(); nfi.CurrencyDecimalSeparator="."; nfi.NumberDecimalSeparator="."; nfi.PercentDecimalSeparator="."; while(!sr.EndOfStream) { string line = sr.ReadLine().Trim(); string[] arr = line.Split(sep,StringSplitOptions.None); if (arr.Length>=3) { StringComparisonElement sce = new StringComparisonElement(); sce.src = arr[0]; sce.trg = arr[1]; sce.similarity = Convert.ToDouble(arr[2],nfi); terms.Add(sce); } } sr.Close(); terms.Sort(); List<double> scores = new List<double>(); double tmp = 0; while (tmp<=1) { scores.Add(tmp); tmp+=0.01; } List<double> correct = new List<double>(); for(int t=0;t<scores.Count;t++) { correct.Add(0); } List<double> total = new List<double>(); for(int t=0;t<scores.Count;t++) { total.Add(0); } int totalForRec = 0; Dictionary<string,Dictionary<string,bool>> goldList = new Dictionary<string, Dictionary<string, bool>>(); for (int s = 0;s<eurovocDict[srcLang].Count;s++) { if (!eurovocDict[srcLang][s].Contains("(under translation)")&&!eurovocDict[trgLang][s].Contains("(under translation)")) { totalForRec++; if (!goldList.ContainsKey(eurovocDict[srcLang][s].ToLower())) goldList.Add(eurovocDict[srcLang][s].ToLower(), new Dictionary<string,bool>()); if (!goldList[eurovocDict[srcLang][s].ToLower()].ContainsKey(eurovocDict[trgLang][s].ToLower())) goldList[eurovocDict[srcLang][s].ToLower()].Add(eurovocDict[trgLang][s].ToLower(),true); } } string previousSrc = null; foreach(StringComparisonElement sce in terms) { string currSrc = sce.src; if (previousSrc!=currSrc.ToLower()) { string src = sce.src.ToLower(); string trg = sce.trg.ToLower(); double alignScore = sce.similarity; bool corr = false; if (goldList.ContainsKey(src)&&goldList[src].ContainsKey(trg)) corr = true; for (int s 
=0;s<scores.Count;s++) { if (scores[s]<=alignScore) { if (corr) correct[s]++; total[s]++; } } previousSrc = currSrc.ToLower(); } } for(int s=0;s<scores.Count;s++) { double corr = correct[s]; double tot = total[s]; double totCorr = totalForRec; double prec = corr/tot*100; double rec = corr/totCorr*100; double f1 = prec*rec*2/(prec+rec); sw.WriteLine(srcLang+"\t"+trgLang+"\t"+scores[s].ToString()+"\t"+corr.ToString()+"\t"+tot.ToString()+"\t"+totCorr.ToString()+"\t"+prec.ToString()+"\t"+rec.ToString()+"\t"+f1.ToString()); } sw.Flush(); //} //} sw.Close(); //continue; return; } string preprocessedOutputFile = outputFile+"."+srcLang+"_"+trgLang+".prep.txt"; Dictionary<string,SimpleTermEntry> srcInitialList = StringListToDict(eurovocDict[srcLang]); Dictionary<string,SimpleTermEntry> trgInitialList = StringListToDict(eurovocDict[trgLang]); Log.Write ("Unprocessed source terms: "+srcInitialList.Count.ToString(),LogLevelType.LIMITED_OUTPUT,configuration); Log.Write ("Unprocessed target terms: "+trgInitialList.Count.ToString(),LogLevelType.LIMITED_OUTPUT,configuration); Dictionary<string, Dictionary<string, double>> srcDict = null; Dictionary<string, Dictionary<string, double>> trgDict = null; Dictionary<string, Dictionary<string, double>> srcToTrgDict = null; Dictionary<string, Dictionary<string, double>> trgToSrcDict = null; MPAlignerConfigurationTranslEntry srcTranslitConf = null; MPAlignerConfigurationTranslEntry trgTranslitConf = null; MPAlignerConfigurationTranslEntry srcToTrgTranslitConf = null; MPAlignerConfigurationTranslEntry trgToSrcTranslitConf = null; bool interlinguaDictUsed = ReadDictionaries(configuration,srcLang,trgLang, out srcDict, out trgDict, out srcToTrgDict, out trgToSrcDict); bool interlinguaTranslitUsed = GetTranslitConfig(configuration,srcLang,trgLang,out srcTranslitConf,out trgTranslitConf,out srcToTrgTranslitConf, out trgToSrcTranslitConf); Dictionary<string,ProcessedTermEntry> srcTermList = new Dictionary<string,ProcessedTermEntry>(); 
Dictionary<string,ProcessedTermEntry> trgTermList = new Dictionary<string,ProcessedTermEntry>(); if (File.Exists(preprocessedOutputFile)) { Log.Write("Preprocessed term data found! Reading pre-processed data to save time!", LogLevelType.WARNING,configuration); PreprocessedTermData ptd1 = PreprocessedTermData.ReadFromFile(preprocessedOutputFile); interlinguaDictUsed = ptd1.interlinguaDictUsed; interlinguaTranslitUsed = ptd1.interlinguaTranslitUsed; foreach(ProcessedTermEntry pte in ptd1.srcTerms) { if (!srcTermList.ContainsKey(pte.lowercaceForm)) { srcTermList.Add(pte.lowercaceForm,pte); } } foreach(ProcessedTermEntry pte in ptd1.trgTerms) { if (!trgTermList.ContainsKey(pte.lowercaceForm)) { trgTermList.Add(pte.lowercaceForm,pte); } } } else if (interlinguaDictUsed&&interlinguaTranslitUsed) { string dir = Path.GetDirectoryName(preprocessedOutputFile); if (!dir.EndsWith(Path.DirectorySeparatorChar.ToString())) dir+=Path.DirectorySeparatorChar.ToString(); string prepSrcToTrgFile = dir+"eurovoc_preprocessed_"+srcLang+"_en.xml"; string prepTrgToSrcFile = dir+"eurovoc_preprocessed_"+trgLang+"_en.xml"; if (File.Exists(prepSrcToTrgFile)) { Log.Write ("Reading processed term list: eurovoc_preprocessed_"+srcLang+"_en.xml",LogLevelType.LIMITED_OUTPUT,configuration); srcTermList = ProcessedTermEntry.ReadFromFile(prepSrcToTrgFile); } if (File.Exists(prepTrgToSrcFile)) { Log.Write ("Reading processed term list: eurovoc_preprocessed_"+trgLang+"_en.xml",LogLevelType.LIMITED_OUTPUT,configuration); trgTermList = ProcessedTermEntry.ReadFromFile(prepTrgToSrcFile); } } else if (!interlinguaDictUsed&&!interlinguaTranslitUsed) { string dir = Path.GetDirectoryName(preprocessedOutputFile); if (!dir.EndsWith(Path.DirectorySeparatorChar.ToString())) dir+=Path.DirectorySeparatorChar.ToString(); string prepSrcToTrgFile = dir+"eurovoc_preprocessed_"+srcLang+"_"+trgLang+".xml"; string prepTrgToSrcFile = dir+"eurovoc_preprocessed_"+trgLang+"_"+srcLang+".xml"; if (File.Exists(prepSrcToTrgFile)) { 
Log.Write ("Reading processed term list: eurovoc_preprocessed_"+srcLang+"_"+trgLang+".xml",LogLevelType.LIMITED_OUTPUT,configuration); srcTermList = ProcessedTermEntry.ReadFromFile(prepSrcToTrgFile); } if (File.Exists(prepTrgToSrcFile)) { Log.Write ("Reading processed term list: eurovoc_preprocessed_"+trgLang+"_"+srcLang+".xml",LogLevelType.LIMITED_OUTPUT,configuration); trgTermList = ProcessedTermEntry.ReadFromFile(prepTrgToSrcFile); } } if (srcDict!=null||trgDict!=null) { if (srcTranslitConf!=null && trgTranslitConf!=null) { if (srcTermList.Count<1) srcTermList = ProcessedTermEntry.ProcessTerms(srcInitialList,srcDict,srcLang,srcTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); if (trgTermList.Count<1) trgTermList = ProcessedTermEntry.ProcessTerms(trgInitialList,trgDict,trgLang,trgTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); } else { if (srcTermList.Count<1) srcTermList = ProcessedTermEntry.ProcessTerms(srcInitialList,srcDict,srcLang,srcToTrgTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); if (trgTermList.Count<1) trgTermList = ProcessedTermEntry.ProcessTerms(trgInitialList,trgDict,trgLang,trgToSrcTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); } } else { if (srcTranslitConf!=null && trgTranslitConf!=null) { if (srcTermList.Count<1) srcTermList = ProcessedTermEntry.ProcessTerms(srcInitialList,srcToTrgDict,srcLang,srcTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); if (trgTermList.Count<1) trgTermList = ProcessedTermEntry.ProcessTerms(trgInitialList,trgToSrcDict,trgLang,trgTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); } else { if (srcTermList.Count<1) srcTermList = ProcessedTermEntry.ProcessTerms(srcInitialList,srcToTrgDict,srcLang,srcToTrgTranslitConf, configuration.mosesPath, 
tempTranslitFile,configuration.alignmentThreads); if (trgTermList.Count<1) trgTermList = ProcessedTermEntry.ProcessTerms(trgInitialList,trgToSrcDict,trgLang,trgToSrcTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); } } Log.Write ("Pre-processed source terms: "+srcTermList.Count.ToString(),LogLevelType.LIMITED_OUTPUT,configuration); Log.Write ("Pre-processed target terms: "+trgTermList.Count.ToString(),LogLevelType.LIMITED_OUTPUT,configuration); ///If pre-processed terms should be saved for future use an output format is created. List<ProcessedTermEntry> srcTerms = new List<ProcessedTermEntry>(srcTermList.Values); List<ProcessedTermEntry> trgTerms = new List<ProcessedTermEntry>(trgTermList.Values); PreprocessedTermData ptd = new PreprocessedTermData(); ptd.interlinguaDictUsed = interlinguaDictUsed; ptd.interlinguaTranslitUsed = interlinguaTranslitUsed; ptd.srcTerms = srcTerms.ToArray(); ptd.trgTerms = trgTerms.ToArray(); ptd.srcLang = srcLang; ptd.trgLang = trgLang; string outStr = MPFramework.MPFrameworkFunctions.SerializeObjectInstance<PreprocessedTermData>(ptd); File.WriteAllText(preprocessedOutputFile,outStr); Dictionary<string, Dictionary<string, bool>> excDict = null; ReadExceptionDictionary(configuration,srcLang, trgLang,out excDict); Dictionary<string,bool> srcStopWords = null; ReadStopwordList(configuration,srcLang,out srcStopWords); Dictionary<string,bool> trgStopWords = null; ReadStopwordList(configuration,trgLang,out trgStopWords); //Need to pre-set the alignment thresholds, otherwise these will be overriden by defaults. 
MPAlignerConfigurationLangPairEntry lpeConf = ReadLangPairConfig (srcLang, trgLang, configuration); List<AlignmentInfoElement> alignment = new List<AlignmentInfoElement>(); if (configuration.useMultiThreadedExecution) { alignment = Alignment.AlignPairsMultiThreaded(configuration,srcTermList,trgTermList,interlinguaDictUsed,interlinguaTranslitUsed,srcLang,trgLang, srcInputFile, trgInputFile, excDict, srcStopWords, trgStopWords); } else { alignment = Alignment.AlignPairs(configuration,srcTermList,trgTermList,interlinguaDictUsed,interlinguaTranslitUsed,srcLang,trgLang, srcInputFile, trgInputFile, excDict, srcStopWords, trgStopWords); } //Multi-threaded execution is not stable at the moment... //List<AlignmentInfoElement> alignment = Alignment.AlignPairsMultiThreaded(configuration,srcTermList,trgTermList,interlinguaDictUsed,interlinguaTranslitUsed,srcLang,trgLang, srcInputFile, trgInputFile, excDict, srcStopWords, trgStopWords); AlignmentInfoElement.PrintList(outputFormat, alignmentOutputFile, alignment, configuration.printTopTrgForSrc, lpeConf,srcLang,trgLang,collectionId,domainId); { List<double> scores = new List<double>(); double tmp = 0; while (tmp<=1) { scores.Add(tmp); tmp+=0.01; } List<double> correct = new List<double>(); for(int t=0;t<scores.Count;t++) { correct.Add(0); } List<double> total = new List<double>(); for(int t=0;t<scores.Count;t++) { total.Add(0); } int totalForRec = 0; Dictionary<string,Dictionary<string,bool>> goldList = new Dictionary<string, Dictionary<string, bool>>(); for (int s = 0;s<eurovocDict[srcLang].Count;s++) { if (!eurovocDict[srcLang][s].ToLower().Contains("(under translation)")&&!eurovocDict[trgLang][s].ToLower().Contains("(under translation)")) { totalForRec++; if (!goldList.ContainsKey(eurovocDict[srcLang][s].ToLower())) goldList.Add(eurovocDict[srcLang][s].ToLower(), new Dictionary<string,bool>()); if (!goldList[eurovocDict[srcLang][s].ToLower()].ContainsKey(eurovocDict[trgLang][s].ToLower())) 
goldList[eurovocDict[srcLang][s].ToLower()].Add(eurovocDict[trgLang][s].ToLower(),true); } } string previousSrc = null; alignment.Sort(); foreach(AlignmentInfoElement aie in alignment) { string currSrc = AlignmentInfoElement.GetStrFromEntry(aie.srcEntry.surfaceFormWords, aie.minSrcId, aie.maxSrcId); if (previousSrc!=currSrc.ToLower()) { string src = aie.srcEntry.surfaceForm.ToLower(); string trg = aie.trgEntry.surfaceForm.ToLower(); double alignScore = aie.alignmentScore; bool corr = false; if (goldList.ContainsKey(src)&&goldList[src].ContainsKey(trg)) corr = true; for (int s =0;s<scores.Count;s++) { if (scores[s]<=alignScore) { if (corr) correct[s]++; total[s]++; } } previousSrc = currSrc.ToLower(); } } for(int s=0;s<scores.Count;s++) { double corr = correct[s]; double tot = total[s]; double totCorr = totalForRec; double prec = corr/tot*100; double rec = corr/totCorr*100; double f1 = prec*rec*2/(prec+rec); sw.WriteLine(srcLang+"\t"+trgLang+"\t"+scores[s].ToString()+"\t"+corr.ToString()+"\t"+tot.ToString()+"\t"+totCorr.ToString()+"\t"+prec.ToString()+"\t"+rec.ToString()+"\t"+f1.ToString()); } sw.Flush(); //} //} sw.Close(); } } if (File.Exists(tempTranslitFile)) File.Delete(tempTranslitFile); if (consolidateResults) { Log.Write ("Consolidating aligned term pairs with a threshold of: "+consolidationThreshold.ToString(),LogLevelType.LIMITED_OUTPUT,configuration); //In the case if -ct (consolidation threshold) was defined and the output format has been ref_tabsep, the consolidation of results is perfomed. ConsolidationElement.ConsolidateRefTabsep(outputFile, consolidatedOutputFile,consolidationThreshold); } }
/// <summary>
/// Aligns two parallel term lists position by position: the term at index i of
/// <paramref name="srcTermList"/> is aligned only with the term at index i of
/// <paramref name="trgTermList"/>. Every non-null pair produces one result; no score
/// threshold is applied here.
/// </summary>
/// <param name="configuration">Aligner configuration; its <c>langPairEntryDict</c> is searched for the key "srcLang_trgLang".</param>
/// <param name="srcTermList">Pre-processed source terms.</param>
/// <param name="trgTermList">Pre-processed target terms, parallel to <paramref name="srcTermList"/>.</param>
/// <param name="interlinguaDictUsed">Selects, together with <paramref name="interlinguaTranslitUsed"/>, which set of word-level comparisons is run.</param>
/// <param name="interlinguaTranslitUsed">Selects, together with <paramref name="interlinguaDictUsed"/>, which set of word-level comparisons is run.</param>
/// <param name="srcLang">Source language code (must not be empty).</param>
/// <param name="trgLang">Target language code (must not be empty).</param>
/// <param name="srcFile">Stored in <c>srcFile</c> of every result.</param>
/// <param name="trgFile">Stored in <c>trgFile</c> of every result.</param>
/// <param name="excDict">Exception dictionary handed to <c>ConsolidateOverlaps</c>.</param>
/// <param name="srcStopWords">Source stop words handed to <c>CreateStrListsForEval</c>.</param>
/// <param name="trgStopWords">Target stop words handed to <c>CreateStrListsForEval</c>.</param>
/// <returns>
/// One scored alignment per non-null term pair, or <c>null</c> when the configuration, the
/// language codes or either term list is missing.
/// </returns>
public static List<AlignmentInfoElement> AlignListPairs(
    MPAlignerConfiguration configuration,
    List<ProcessedTermEntry> srcTermList,
    List<ProcessedTermEntry> trgTermList,
    bool interlinguaDictUsed,
    bool interlinguaTranslitUsed,
    string srcLang,
    string trgLang,
    string srcFile,
    string trgFile,
    Dictionary<string, Dictionary<string, bool>> excDict,
    Dictionary<string, bool> srcStopWords,
    Dictionary<string, bool> trgStopWords)
{
    if (configuration == null || configuration.langPairEntryDict == null || string.IsNullOrWhiteSpace(srcLang) || string.IsNullOrWhiteSpace(trgLang))
    {
        return null;
    }
    // Missing term lists are treated like any other invalid input instead of failing with a NullReferenceException.
    if (srcTermList == null || trgTermList == null)
    {
        return null;
    }

    // Use the language-pair specific settings when configured, otherwise fall back to defaults.
    string langKey = srcLang + "_" + trgLang;
    MPAlignerConfigurationLangPairEntry lpeConf;
    if (configuration.langPairEntryDict.ContainsKey(langKey))
    {
        lpeConf = configuration.langPairEntryDict[langKey];
    }
    else
    {
        lpeConf = new MPAlignerConfigurationLangPairEntry();
        lpeConf.srcLang = srcLang;
        lpeConf.trgLang = trgLang;
    }

    // The lists are consumed in parallel, so only the common prefix can be paired. Indexing the
    // target list up to the source count would throw when the target list is the shorter one.
    int pairCount = Math.Min(srcTermList.Count, trgTermList.Count);
    if (srcTermList.Count != trgTermList.Count)
    {
        Log.Write("AlignListPairs: source and target term lists differ in length (" + srcTermList.Count.ToString() + " vs " + trgTermList.Count.ToString() + "); only the first " + pairCount.ToString() + " pairs are aligned.", LogLevelType.WARNING);
    }

    List<AlignmentInfoElement> res = new List<AlignmentInfoElement>();
    for (int i = 0; i < pairCount; i++)
    {
        ProcessedTermEntry srcPte = srcTermList[i];
        ProcessedTermEntry trgPte = trgTermList[i];
        if (srcPte == null || trgPte == null)
        {
            continue;
        }

        AlignmentInfoElement aie = new AlignmentInfoElement();
        List<WordAlignmentElement> srcToTrg = new List<WordAlignmentElement>();
        List<WordAlignmentElement> trgToSrc = new List<WordAlignmentElement>();
        maxStrLen = 0;

        // The two trailing integers of every call are the source and target type codes:
        //   0 - dictionary, 1 - simple translit, 2 - target or source, 3 - translit
        if (interlinguaDictUsed && interlinguaTranslitUsed)
        {
            // Translation is in EN language; SOURCE TRANSLATION vs TARGET TRANSLATION
            AlignStringProbabEntryListLists(lpeConf, srcPte.translationList, trgPte.translationList, srcToTrg, trgToSrc, 0, 0);
            // Translation is in EN language; SOURCE TRANSLATION vs TARGET SIMPLE TRANSLITERATION
            AlignStringProbabEntryListToStringList(lpeConf, srcPte.translationList, trgPte.simpleTransliteration, srcToTrg, trgToSrc, 0, 1);
            // Translation is in EN language; SOURCE TRANSLATION vs TARGET TRANSLITERATION
            AlignStringProbabEntryListLists(lpeConf, srcPte.translationList, trgPte.transliterationList, srcToTrg, trgToSrc, 0, 3);
            // Translation is in EN language; SOURCE SIMPLE TRANSLITERATION vs TARGET TRANSLATION
            AlignStringListToStringProbabEntryList(lpeConf, srcPte.simpleTransliteration, trgPte.translationList, srcToTrg, trgToSrc, 1, 0);
            // Translation is in EN language; SOURCE TRANSLITERATION vs TARGET TRANSLATION
            AlignStringProbabEntryListLists(lpeConf, srcPte.transliterationList, trgPte.translationList, srcToTrg, trgToSrc, 3, 0);
            // Transliteration is in EN language; SOURCE TRANSLITERATION vs TARGET SIMPLE TRANSLITERATION
            AlignStringProbabEntryListToStringList(lpeConf, srcPte.transliterationList, trgPte.simpleTransliteration, srcToTrg, trgToSrc, 3, 1);
            // Transliteration is in EN language; SOURCE SIMPLE TRANSLITERATION vs TARGET TRANSLITERATION
            AlignStringListToStringProbabEntryList(lpeConf, srcPte.simpleTransliteration, trgPte.transliterationList, srcToTrg, trgToSrc, 1, 3);
            // Transliteration is in EN language; SOURCE TRANSLITERATION vs TARGET TRANSLITERATION
            AlignStringProbabEntryListLists(lpeConf, srcPte.transliterationList, trgPte.transliterationList, srcToTrg, trgToSrc, 3, 3);
            // Simple translit of both is in EN; SOURCE SIMPLE TRANSLITERATION vs TARGET SIMPLE TRANSLITERATION
            AlignStringLists(lpeConf, srcPte.simpleTransliteration, trgPte.simpleTransliteration, srcToTrg, trgToSrc, 1, 1);
        }
        else if (interlinguaTranslitUsed)
        {
            // Translation is in target language; SOURCE TRANSLATION vs TARGET
            AlignStringProbabEntryListToStringList(lpeConf, srcPte.translationList, trgPte.lowercaseWords, srcToTrg, trgToSrc, 0, 2);
            // Transliteration is in EN language; SOURCE TRANSLITERATION vs TARGET SIMPLE TRANSLITERATION
            AlignStringProbabEntryListToStringList(lpeConf, srcPte.transliterationList, trgPte.simpleTransliteration, srcToTrg, trgToSrc, 3, 1);
            // Transliteration is in EN language; SOURCE SIMPLE TRANSLITERATION vs TARGET TRANSLITERATION
            AlignStringListToStringProbabEntryList(lpeConf, srcPte.simpleTransliteration, trgPte.transliterationList, srcToTrg, trgToSrc, 1, 3);
            // Transliteration is in EN language; SOURCE TRANSLITERATION vs TARGET TRANSLITERATION
            // NOTE(review): the target type code is 2 here although the target list is the
            // transliteration list (type 3 in the branch above) - confirm this is intentional.
            AlignStringProbabEntryListLists(lpeConf, srcPte.transliterationList, trgPte.transliterationList, srcToTrg, trgToSrc, 3, 2);
            // Simple translit of both is in EN; SOURCE SIMPLE TRANSLITERATION vs TARGET SIMPLE TRANSLITERATION
            AlignStringLists(lpeConf, srcPte.simpleTransliteration, trgPte.simpleTransliteration, srcToTrg, trgToSrc, 1, 1);
            // Translation is in target language; SOURCE vs TARGET TRANSLATION
            AlignStringListToStringProbabEntryList(lpeConf, srcPte.lowercaseWords, trgPte.translationList, srcToTrg, trgToSrc, 2, 0);
        }
        else if (interlinguaDictUsed)
        {
            // Translation is in EN language; SOURCE TRANSLATION vs TARGET TRANSLATION
            AlignStringProbabEntryListLists(lpeConf, srcPte.translationList, trgPte.translationList, srcToTrg, trgToSrc, 0, 0);
            // Translation is in EN language; SOURCE TRANSLATION vs TARGET SIMPLE TRANSLITERATION
            AlignStringProbabEntryListToStringList(lpeConf, srcPte.translationList, trgPte.simpleTransliteration, srcToTrg, trgToSrc, 0, 1);
            // Translation is in EN language; SOURCE SIMPLE TRANSLITERATION vs TARGET TRANSLATION
            AlignStringListToStringProbabEntryList(lpeConf, srcPte.simpleTransliteration, trgPte.translationList, srcToTrg, trgToSrc, 1, 0);
            // Transliteration is in target language; SOURCE vs TARGET TRANSLITERATION
            AlignStringListToStringProbabEntryList(lpeConf, srcPte.lowercaseWords, trgPte.transliterationList, srcToTrg, trgToSrc, 2, 3);
            // Transliteration is in target language; SOURCE TRANSLITERATION vs TARGET
            AlignStringProbabEntryListToStringList(lpeConf, srcPte.transliterationList, trgPte.lowercaseWords, srcToTrg, trgToSrc, 3, 2);
            // Simple translit of both is in EN; SOURCE SIMPLE TRANSLITERATION vs TARGET SIMPLE TRANSLITERATION
            AlignStringLists(lpeConf, srcPte.simpleTransliteration, trgPte.simpleTransliteration, srcToTrg, trgToSrc, 1, 1);
        }
        else
        {
            // Translation is in target language; SOURCE TRANSLATION vs TARGET
            AlignStringProbabEntryListToStringList(lpeConf, srcPte.translationList, trgPte.lowercaseWords, srcToTrg, trgToSrc, 0, 2);
            // Transliteration is in target language; SOURCE TRANSLITERATION vs TARGET
            AlignStringProbabEntryListToStringList(lpeConf, srcPte.transliterationList, trgPte.lowercaseWords, srcToTrg, trgToSrc, 3, 2);
            // Translation is in target language; SOURCE vs TARGET TRANSLATION
            AlignStringListToStringProbabEntryList(lpeConf, srcPte.lowercaseWords, trgPte.translationList, srcToTrg, trgToSrc, 2, 0);
            // Transliteration is in target language; SOURCE vs TARGET TRANSLITERATION
            AlignStringListToStringProbabEntryList(lpeConf, srcPte.lowercaseWords, trgPte.transliterationList, srcToTrg, trgToSrc, 2, 3);
            // Simple translit of both is in EN; SOURCE SIMPLE TRANSLITERATION vs TARGET SIMPLE TRANSLITERATION
            AlignStringLists(lpeConf, srcPte.simpleTransliteration, trgPte.simpleTransliteration, srcToTrg, trgToSrc, 1, 1);
        }

        aie.srcToTrgAlignments = srcToTrg;
        aie.trgToSrcAlignments = trgToSrc;
        aie.srcEntry = srcPte;
        aie.trgEntry = trgPte;
        ConsolidateOverlaps(lpeConf, aie, excDict);

        // Called for its effect on aie; the boolean result is deliberately ignored because every
        // pair is reported here. The final argument is stripListsOnError = false.
        CreateStrListsForEval(configuration, aie, srcStopWords, trgStopWords, false);
        aie.alignmentScore = EvaluateAlignmentScore(lpeConf, aie);

        // If you wish to debug the process, comment the lines below that clear the alignments...
        aie.srcToTrgAlignments.Clear();
        aie.trgToSrcAlignments.Clear();
        aie.consolidatedAlignment.Clear();

        aie.srcFile = srcFile;
        aie.trgFile = trgFile;
        res.Add(aie);
    }
    return res;
}
/// <summary>
/// Aligns every source term with every target term by queueing one <c>AlignSingleTermPair</c>
/// work item per pair on a SmartThreadPool. Results are collected in batches and only the
/// first result for each (alignedLowSrcStr, alignedLowTrgStr) combination is kept.
/// </summary>
/// <param name="configuration">Aligner configuration; supplies <c>alignmentThreads</c> and the language pair settings.</param>
/// <param name="srcTerms">Pre-processed source terms.</param>
/// <param name="trgTerms">Pre-processed target terms.</param>
/// <param name="interlinguaDictUsed">Copied to the static <c>_interlinguaDictUsed</c> field before work items are queued.</param>
/// <param name="interlinguaTranslitUsed">Copied to the static <c>_interlinguaTranslitUsed</c> field before work items are queued.</param>
/// <param name="srcLang">Source language code (must not be empty).</param>
/// <param name="trgLang">Target language code (must not be empty).</param>
/// <param name="srcFile">Copied to the static <c>_srcFile</c> field.</param>
/// <param name="trgFile">Copied to the static <c>_trgFile</c> field.</param>
/// <param name="excDict">Copied to the static <c>_excDict</c> field.</param>
/// <param name="srcStopWords">Copied to the static <c>_srcStopWords</c> field.</param>
/// <param name="trgStopWords">Copied to the static <c>_trgStopWords</c> field.</param>
/// <returns>
/// The distinct non-null alignments returned by the work items, or <c>null</c> when the
/// configuration, the language codes or either term dictionary is missing.
/// </returns>
public static List<AlignmentInfoElement> AlignPairsMultiThreaded(
    MPAlignerConfiguration configuration,
    Dictionary<string, ProcessedTermEntry> srcTerms,
    Dictionary<string, ProcessedTermEntry> trgTerms,
    bool interlinguaDictUsed,
    bool interlinguaTranslitUsed,
    string srcLang,
    string trgLang,
    string srcFile,
    string trgFile,
    Dictionary<string, Dictionary<string, bool>> excDict,
    Dictionary<string, bool> srcStopWords,
    Dictionary<string, bool> trgStopWords)
{
    if (configuration == null || configuration.langPairEntryDict == null || string.IsNullOrWhiteSpace(srcLang) || string.IsNullOrWhiteSpace(trgLang))
    {
        return null;
    }
    // Missing term dictionaries are treated like any other invalid input instead of failing
    // with a NullReferenceException in the log statement below.
    if (srcTerms == null || trgTerms == null)
    {
        return null;
    }
    Log.Write ("Starting alignmet of "+ srcTerms.Count.ToString()+" "+srcLang+" and "+ trgTerms.Count.ToString()+" "+trgLang+" terms.",LogLevelType.LIMITED_OUTPUT);

    // Use the language-pair specific settings when configured, otherwise fall back to defaults.
    string langKey = srcLang + "_" + trgLang;
    MPAlignerConfigurationLangPairEntry lpeConf;
    if (configuration.langPairEntryDict.ContainsKey(langKey))
    {
        lpeConf = configuration.langPairEntryDict[langKey];
    }
    else
    {
        lpeConf = new MPAlignerConfigurationLangPairEntry();
        lpeConf.srcLang = srcLang;
        lpeConf.trgLang = trgLang;
    }

    // The work item delegate only receives the two term entries; everything else is published
    // through static fields (presumably read by AlignSingleTermPair - verify against its body).
    _configuration = configuration;
    _interlinguaDictUsed = interlinguaDictUsed;
    _interlinguaTranslitUsed = interlinguaTranslitUsed;
    _srcFile = srcFile;
    _trgFile = trgFile;
    _excDict = excDict;
    _srcStopWords = srcStopWords;
    _trgStopWords = trgStopWords;
    _lpeConf = lpeConf;

    // Results are harvested whenever this many work items are pending, which bounds the
    // number of result handles kept alive at once.
    const int maxPendingWorkItems = 100000;

    // A configured thread count of zero or less would yield a pool with no worker threads.
    int threadCount = Math.Max(1, configuration.alignmentThreads);
    STPStartInfo stpStartInfo = new STPStartInfo();
    stpStartInfo.IdleTimeout = 100*1000;
    stpStartInfo.MaxWorkerThreads = 5*threadCount;
    stpStartInfo.MinWorkerThreads = threadCount;
    stpStartInfo.EnableLocalPerformanceCounters = true;
    SmartThreadPool smartThreadPool = new SmartThreadPool(stpStartInfo);

    int counter = 0;
    int skippedWorkItems = 0;
    List<AlignmentInfoElement> res = new List<AlignmentInfoElement>();
    Dictionary<string, Dictionary<string, bool>> alignedList = new Dictionary<string, Dictionary<string, bool>>();
    List<IWorkItemResult<AlignmentInfoElement>> wirList = new List<IWorkItemResult<AlignmentInfoElement>>(1000);

    try
    {
        foreach (ProcessedTermEntry srcPte in srcTerms.Values)
        {
            // Progress indicator: a dot per 50 source terms, the running total per 1000.
            counter++;
            if (counter%50==0)
            {
                Console.Write(".");
                if (counter%1000==0)
                {
                    Console.WriteLine(" - "+counter.ToString());
                }
            }

            foreach (ProcessedTermEntry trgPte in trgTerms.Values)
            {
                if (wirList.Count >= maxPendingWorkItems)
                {
                    smartThreadPool.WaitForIdle();
                    skippedWorkItems += HarvestThreadedAlignments(wirList, res, alignedList);
                    wirList.Clear();
                }
                try
                {
                    IWorkItemResult<AlignmentInfoElement> wir = smartThreadPool.QueueWorkItem(
                        new Amib.Threading.Func<ProcessedTermEntry, ProcessedTermEntry, AlignmentInfoElement>(AlignSingleTermPair),
                        srcPte,
                        trgPte);
                    if (wir != null)
                    {
                        wirList.Add(wir);
                    }
                }
                catch
                {
                    // Best effort: a pair that cannot be queued is reported and skipped.
                    Log.Write("Thread exception catched - cannot create a new thread within term alignment!", LogLevelType.WARNING);
                }
            }
        }

        // Harvest whatever is still pending after the last pair was queued.
        if (wirList.Count > 0)
        {
            smartThreadPool.WaitForIdle();
            skippedWorkItems += HarvestThreadedAlignments(wirList, res, alignedList);
            wirList.Clear();
        }
    }
    finally
    {
        // Runs on the exception path as well, so worker threads are never left behind.
        ShutdownAlignmentThreadPool(smartThreadPool);
    }

    if (skippedWorkItems > 0)
    {
        Log.Write(skippedWorkItems.ToString() + " term pair alignments failed or did not complete in time and were skipped.", LogLevelType.WARNING);
    }
    Log.Write ("Alignmet finished - "+ res.Count.ToString()+" term pairs aligned over the alignment threshold " +lpeConf.finalAlignmentThr.ToString()+".\n",LogLevelType.LIMITED_OUTPUT);
    return res;
}

/// <summary>
/// Moves the results of finished work items into <paramref name="results"/>, keeping only the
/// first alignment seen for each (alignedLowSrcStr, alignedLowTrgStr) combination.
/// </summary>
/// <param name="workItems">Queued work items to inspect; the list itself is not modified.</param>
/// <param name="results">Receives every new, non-null alignment.</param>
/// <param name="alignedList">Source string to target strings already reported; updated in place.</param>
/// <returns>The number of work items that failed with an exception or never completed.</returns>
private static int HarvestThreadedAlignments(
    List<IWorkItemResult<AlignmentInfoElement>> workItems,
    List<AlignmentInfoElement> results,
    Dictionary<string, Dictionary<string, bool>> alignedList)
{
    int skipped = 0;
    for (int i = 0; i < workItems.Count; i++)
    {
        IWorkItemResult<AlignmentInfoElement> workItem = workItems[i];

        // Give a straggler up to 100 x 100 ms to finish before giving up on it.
        int retries = 100;
        while (!workItem.IsCompleted && retries > 0)
        {
            retries--;
            System.Threading.Thread.Sleep(100);
        }
        if (!workItem.IsCompleted || workItem.Exception != null)
        {
            skipped++;
            continue;
        }

        AlignmentInfoElement aie = workItem.Result;
        if (aie == null)
        {
            continue;
        }

        Dictionary<string, bool> seenTargets;
        if (!alignedList.TryGetValue(aie.alignedLowSrcStr, out seenTargets))
        {
            seenTargets = new Dictionary<string, bool>();
            alignedList.Add(aie.alignedLowSrcStr, seenTargets);
        }
        if (!seenTargets.ContainsKey(aie.alignedLowTrgStr))
        {
            seenTargets.Add(aie.alignedLowTrgStr, true);
            results.Add(aie);
        }
    }
    return skipped;
}

/// <summary>
/// Shuts down and disposes the alignment thread pool. Failures are logged instead of thrown so
/// that clean-up never hides the outcome of the alignment itself.
/// </summary>
/// <param name="pool">The pool to release.</param>
private static void ShutdownAlignmentThreadPool(SmartThreadPool pool)
{
    try
    {
        pool.Shutdown(true,100);
    }
    catch (Exception ex)
    {
        Log.Write("Could not shut down the alignment thread pool: " + ex.Message, LogLevelType.WARNING);
    }

    try
    {
        pool.Dispose();
    }
    catch (Exception ex)
    {
        Log.Write("Could not dispose the alignment thread pool: " + ex.Message, LogLevelType.WARNING);
    }

    // Kept from the original implementation, which forced a collection after releasing the pool.
    GC.Collect();
    GC.WaitForPendingFinalizers();
}
/// <summary>
/// Aligns every source term with every target term on the calling thread and returns the
/// alignments whose score reaches the language pair's <c>finalAlignmentThr</c>. Only the first
/// pair producing a given (alignedLowSrcStr, alignedLowTrgStr) combination is scored.
/// </summary>
/// <param name="configuration">Aligner configuration; its <c>langPairEntryDict</c> is searched for the key "srcLang_trgLang".</param>
/// <param name="srcTerms">Pre-processed source terms.</param>
/// <param name="trgTerms">Pre-processed target terms.</param>
/// <param name="interlinguaDictUsed">Selects, together with <paramref name="interlinguaTranslitUsed"/>, which set of word-level comparisons is run.</param>
/// <param name="interlinguaTranslitUsed">Selects, together with <paramref name="interlinguaDictUsed"/>, which set of word-level comparisons is run.</param>
/// <param name="srcLang">Source language code (must not be empty).</param>
/// <param name="trgLang">Target language code (must not be empty).</param>
/// <param name="srcFile">Stored in <c>srcFile</c> of every returned alignment.</param>
/// <param name="trgFile">Stored in <c>trgFile</c> of every returned alignment.</param>
/// <param name="excDict">Exception dictionary handed to <c>ConsolidateOverlaps</c>.</param>
/// <param name="srcStopWords">Source stop words handed to <c>CreateStrListsForEval</c>.</param>
/// <param name="trgStopWords">Target stop words handed to <c>CreateStrListsForEval</c>.</param>
/// <returns>
/// The accepted alignments, or <c>null</c> when the configuration, the language codes or
/// either term dictionary is missing.
/// </returns>
public static List<AlignmentInfoElement> AlignPairs(
    MPAlignerConfiguration configuration,
    Dictionary<string, ProcessedTermEntry> srcTerms,
    Dictionary<string, ProcessedTermEntry> trgTerms,
    bool interlinguaDictUsed,
    bool interlinguaTranslitUsed,
    string srcLang,
    string trgLang,
    string srcFile,
    string trgFile,
    Dictionary<string, Dictionary<string, bool>> excDict,
    Dictionary<string, bool> srcStopWords,
    Dictionary<string, bool> trgStopWords)
{
    if (configuration == null || configuration.langPairEntryDict == null || string.IsNullOrWhiteSpace(srcLang) || string.IsNullOrWhiteSpace(trgLang))
    {
        return null;
    }
    // Missing term dictionaries are treated like any other invalid input instead of failing
    // with a NullReferenceException in the log statement below.
    if (srcTerms == null || trgTerms == null)
    {
        return null;
    }
    Log.Write ("Starting alignmet of "+ srcTerms.Count.ToString()+" "+srcLang+" and "+ trgTerms.Count.ToString()+" "+trgLang+" terms.",LogLevelType.LIMITED_OUTPUT);

    // Use the language-pair specific settings when configured, otherwise fall back to defaults.
    string langKey = srcLang + "_" + trgLang;
    MPAlignerConfigurationLangPairEntry lpeConf;
    if (configuration.langPairEntryDict.ContainsKey(langKey))
    {
        lpeConf = configuration.langPairEntryDict[langKey];
    }
    else
    {
        lpeConf = new MPAlignerConfigurationLangPairEntry();
        lpeConf.srcLang = srcLang;
        lpeConf.trgLang = trgLang;
    }

    // Ordering of source-to-target word alignments: higher average overlap first, then longer
    // source side, then higher target overlap, then lower source word index.
    Comparison<WordAlignmentElement> srcToTrgOrder = delegate(WordAlignmentElement w1, WordAlignmentElement w2)
    {
        double avgW1Overlap = (w1.fromOverlap+w1.toOverlap)/2;
        double avgW2Overlap = (w2.fromOverlap+w2.toOverlap)/2;
        if (avgW1Overlap != avgW2Overlap)
        {
            return avgW2Overlap.CompareTo(avgW1Overlap);
        }
        if (w2.fromLen == w1.fromLen)
        {
            if (w2.toOverlap == w1.toOverlap)
            {
                return w1.fromId.CompareTo(w2.fromId);
            }
            return w2.toOverlap.CompareTo(w1.toOverlap);
        }
        return w2.fromLen.CompareTo(w1.fromLen);
    };

    // Mirror image for target-to-source word alignments: higher average overlap first, then
    // longer target side, then higher source overlap, then lower target word index.
    Comparison<WordAlignmentElement> trgToSrcOrder = delegate(WordAlignmentElement w1, WordAlignmentElement w2)
    {
        double avgW1Overlap = (w1.fromOverlap+w1.toOverlap)/2;
        double avgW2Overlap = (w2.fromOverlap+w2.toOverlap)/2;
        if (avgW1Overlap != avgW2Overlap)
        {
            return avgW2Overlap.CompareTo(avgW1Overlap);
        }
        if (w2.toLen == w1.toLen)
        {
            if (w2.fromOverlap == w1.fromOverlap)
            {
                return w1.toId.CompareTo(w2.toId);
            }
            return w2.fromOverlap.CompareTo(w1.fromOverlap);
        }
        return w2.toLen.CompareTo(w1.toLen);
    };

    int counter = 0;
    // Source string -> target strings that have already been evaluated.
    Dictionary<string, Dictionary<string, bool>> alignedList = new Dictionary<string, Dictionary<string, bool>>();
    List<AlignmentInfoElement> res = new List<AlignmentInfoElement>();

    foreach (ProcessedTermEntry srcPte in srcTerms.Values)
    {
        // Progress indicator: a dot per 50 source terms, the running total per 1000.
        counter++;
        if (counter%50==0)
        {
            Console.Write(".");
            if (counter%1000==0)
            {
                Console.WriteLine(" - "+counter.ToString());
            }
        }

        // A missing source entry cannot be aligned with anything.
        if (srcPte == null)
        {
            continue;
        }

        foreach (ProcessedTermEntry trgPte in trgTerms.Values)
        {
            if (trgPte == null)
            {
                continue;
            }

            AlignmentInfoElement aie = new AlignmentInfoElement();
            List<WordAlignmentElement> srcToTrg = new List<WordAlignmentElement>();
            List<WordAlignmentElement> trgToSrc = new List<WordAlignmentElement>();
            maxStrLen = 0;

            // The two trailing integers of every call are the source and target type codes:
            //   0 - dictionary, 1 - simple translit, 2 - target, 3 - translit
            if (interlinguaDictUsed && interlinguaTranslitUsed)
            {
                // Translation is in EN language; SOURCE TRANSLATION vs TARGET TRANSLATION
                AlignStringProbabEntryListLists(lpeConf, srcPte.translationList, trgPte.translationList, srcToTrg, trgToSrc, 0, 0);
                // Translation is in EN language; SOURCE TRANSLATION vs TARGET SIMPLE TRANSLITERATION
                AlignStringProbabEntryListToStringList(lpeConf, srcPte.translationList, trgPte.simpleTransliteration, srcToTrg, trgToSrc, 0, 1);
                // Translation is in EN language; SOURCE TRANSLATION vs TARGET TRANSLITERATION
                AlignStringProbabEntryListLists(lpeConf, srcPte.translationList, trgPte.transliterationList, srcToTrg, trgToSrc, 0, 3);
                // Translation is in EN language; SOURCE SIMPLE TRANSLITERATION vs TARGET TRANSLATION
                AlignStringListToStringProbabEntryList(lpeConf, srcPte.simpleTransliteration, trgPte.translationList, srcToTrg, trgToSrc, 1, 0);
                // Translation is in EN language; SOURCE TRANSLITERATION vs TARGET TRANSLATION
                AlignStringProbabEntryListLists(lpeConf, srcPte.transliterationList, trgPte.translationList, srcToTrg, trgToSrc, 3, 0);
                // Transliteration is in EN language; SOURCE TRANSLITERATION vs TARGET SIMPLE TRANSLITERATION
                AlignStringProbabEntryListToStringList(lpeConf, srcPte.transliterationList, trgPte.simpleTransliteration, srcToTrg, trgToSrc, 3, 1);
                // Transliteration is in EN language; SOURCE SIMPLE TRANSLITERATION vs TARGET TRANSLITERATION
                AlignStringListToStringProbabEntryList(lpeConf, srcPte.simpleTransliteration, trgPte.transliterationList, srcToTrg, trgToSrc, 1, 3);
                // Transliteration is in EN language; SOURCE TRANSLITERATION vs TARGET TRANSLITERATION
                AlignStringProbabEntryListLists(lpeConf, srcPte.transliterationList, trgPte.transliterationList, srcToTrg, trgToSrc, 3, 3);
                // Simple translit of both is in EN; SOURCE SIMPLE TRANSLITERATION vs TARGET SIMPLE TRANSLITERATION
                AlignStringLists(lpeConf, srcPte.simpleTransliteration, trgPte.simpleTransliteration, srcToTrg, trgToSrc, 1, 1);
            }
            else if (interlinguaTranslitUsed)
            {
                // Translation is in target language; SOURCE TRANSLATION vs TARGET
                AlignStringProbabEntryListToStringList(lpeConf, srcPte.translationList, trgPte.lowercaseWords, srcToTrg, trgToSrc, 0, 2);
                // Transliteration is in EN language; SOURCE TRANSLITERATION vs TARGET SIMPLE TRANSLITERATION
                AlignStringProbabEntryListToStringList(lpeConf, srcPte.transliterationList, trgPte.simpleTransliteration, srcToTrg, trgToSrc, 3, 1);
                // Transliteration is in EN language; SOURCE SIMPLE TRANSLITERATION vs TARGET TRANSLITERATION
                AlignStringListToStringProbabEntryList(lpeConf, srcPte.simpleTransliteration, trgPte.transliterationList, srcToTrg, trgToSrc, 1, 3);
                // Transliteration is in EN language; SOURCE TRANSLITERATION vs TARGET TRANSLITERATION
                // NOTE(review): the target type code is 2 here although the target list is the
                // transliteration list (type 3 in the branch above) - confirm this is intentional.
                AlignStringProbabEntryListLists(lpeConf, srcPte.transliterationList, trgPte.transliterationList, srcToTrg, trgToSrc, 3, 2);
                // Simple translit of both is in EN; SOURCE SIMPLE TRANSLITERATION vs TARGET SIMPLE TRANSLITERATION
                AlignStringLists(lpeConf, srcPte.simpleTransliteration, trgPte.simpleTransliteration, srcToTrg, trgToSrc, 1, 1);
                // Translation is in target language; SOURCE vs TARGET TRANSLATION
                AlignStringListToStringProbabEntryList(lpeConf, srcPte.lowercaseWords, trgPte.translationList, srcToTrg, trgToSrc, 2, 0);
            }
            else if (interlinguaDictUsed)
            {
                // Translation is in EN language; SOURCE TRANSLATION vs TARGET TRANSLATION
                AlignStringProbabEntryListLists(lpeConf, srcPte.translationList, trgPte.translationList, srcToTrg, trgToSrc, 0, 0);
                // Translation is in EN language; SOURCE TRANSLATION vs TARGET SIMPLE TRANSLITERATION
                AlignStringProbabEntryListToStringList(lpeConf, srcPte.translationList, trgPte.simpleTransliteration, srcToTrg, trgToSrc, 0, 1);
                // Translation is in EN language; SOURCE SIMPLE TRANSLITERATION vs TARGET TRANSLATION
                AlignStringListToStringProbabEntryList(lpeConf, srcPte.simpleTransliteration, trgPte.translationList, srcToTrg, trgToSrc, 1, 0);
                // Transliteration is in target language; SOURCE TRANSLITERATION vs TARGET
                AlignStringProbabEntryListToStringList(lpeConf, srcPte.transliterationList, trgPte.lowercaseWords, srcToTrg, trgToSrc, 3, 2);
                // Simple translit of both is in EN; SOURCE SIMPLE TRANSLITERATION vs TARGET SIMPLE TRANSLITERATION
                AlignStringLists(lpeConf, srcPte.simpleTransliteration, trgPte.simpleTransliteration, srcToTrg, trgToSrc, 1, 1);
                // Transliteration is in target language; SOURCE vs TARGET TRANSLITERATION
                AlignStringListToStringProbabEntryList(lpeConf, srcPte.lowercaseWords, trgPte.transliterationList, srcToTrg, trgToSrc, 2, 3);
            }
            else
            {
                // Translation is in target language; SOURCE TRANSLATION vs TARGET
                AlignStringProbabEntryListToStringList(lpeConf, srcPte.translationList, trgPte.lowercaseWords, srcToTrg, trgToSrc, 0, 2);
                // Transliteration is in target language; SOURCE TRANSLITERATION vs TARGET
                AlignStringProbabEntryListToStringList(lpeConf, srcPte.transliterationList, trgPte.lowercaseWords, srcToTrg, trgToSrc, 3, 2);
                // Translation is in target language; SOURCE vs TARGET TRANSLATION
                AlignStringListToStringProbabEntryList(lpeConf, srcPte.lowercaseWords, trgPte.translationList, srcToTrg, trgToSrc, 2, 0);
                // Transliteration is in target language; SOURCE vs TARGET TRANSLITERATION
                AlignStringListToStringProbabEntryList(lpeConf, srcPte.lowercaseWords, trgPte.transliterationList, srcToTrg, trgToSrc, 2, 3);
                // Simple translit of both is in EN; SOURCE SIMPLE TRANSLITERATION vs TARGET SIMPLE TRANSLITERATION
                AlignStringLists(lpeConf, srcPte.simpleTransliteration, trgPte.simpleTransliteration, srcToTrg, trgToSrc, 1, 1);
            }

            aie.srcToTrgAlignments = srcToTrg;
            aie.trgToSrcAlignments = trgToSrc;
            aie.srcToTrgAlignments.Sort(srcToTrgOrder);
            aie.trgToSrcAlignments.Sort(trgToSrcOrder);
            aie.srcEntry = srcPte;
            aie.trgEntry = trgPte;
            ConsolidateOverlaps(lpeConf, aie, excDict);

            if (!CreateStrListsForEval(configuration, aie, srcStopWords, trgStopWords))
            {
                continue;
            }

            // Each aligned string pair is evaluated once; it is marked as seen before scoring,
            // so later duplicates are skipped even if this one ends up below the threshold.
            Dictionary<string, bool> seenTargets;
            if (!alignedList.TryGetValue(aie.alignedLowSrcStr, out seenTargets))
            {
                seenTargets = new Dictionary<string, bool>();
                alignedList.Add(aie.alignedLowSrcStr, seenTargets);
            }
            if (seenTargets.ContainsKey(aie.alignedLowTrgStr))
            {
                continue;
            }
            seenTargets.Add(aie.alignedLowTrgStr, true);

            aie.alignmentScore = EvaluateAlignmentScore(lpeConf, aie);
            if (aie.alignmentScore >= lpeConf.finalAlignmentThr)
            {
                // If you wish to debug the process, comment the lines below that clear the alignments...
                aie.srcToTrgAlignments.Clear();
                aie.trgToSrcAlignments.Clear();
                aie.consolidatedAlignment.Clear();
                aie.srcFile = srcFile;
                aie.trgFile = trgFile;
                res.Add(aie);
            }
        }
    }

    Console.WriteLine(" - "+counter.ToString());
    Log.Write ("Alignmet finished - "+ res.Count.ToString()+" term pairs aligned over the alignment threshold " +lpeConf.finalAlignmentThr.ToString()+".\n",LogLevelType.LIMITED_OUTPUT);
    return res;
}
/// <summary>
/// Builds the source/target comparison strings for a consolidated alignment and fixes the
/// token span (<c>minSrcId</c>..<c>maxSrcId</c>, <c>minTrgId</c>..<c>maxTrgId</c>) it covers.
/// </summary>
/// <remarks>
/// On every path past the initial null guard the method overwrites the span fields of
/// <paramref name="aie"/> and its <c>srcMultiplier</c>/<c>trgMultiplier</c>.
/// <c>srcStrForAlignment</c>/<c>trgStrForAlignment</c> are only overwritten when the built
/// source string is non-empty. Tokens inside the aligned span that were not emitted from an
/// alignment element are appended afterwards in lowercase form, with the opposite side padded
/// by the same number of spaces.
/// </remarks>
/// <param name='conf'>
/// Configuration; only <c>allowTrimmedAlignments</c> is read here.
/// </param>
/// <param name='aie'>
/// Alignment element to evaluate. Its <c>consolidatedAlignment</c>, <c>srcEntry</c> and
/// <c>trgEntry</c> are read; the fields listed above are written.
/// </param>
/// <param name='srcStopWords'>
/// Source-side stop words; a token counts as a stop word when it is a key of this dictionary.
/// </param>
/// <param name='trgStopWords'>
/// Target-side stop words; same convention as <paramref name="srcStopWords"/>.
/// </param>
/// <param name='stripListsOnError'>
/// When <c>true</c>, rejected alignments get their four string fields set to the empty string.
/// </param>
/// <returns>
/// <c>true</c> when <c>alignedLowSrcStr</c> and <c>alignedLowTrgStr</c> were produced;
/// <c>false</c> when input is missing, the span is unusable, the alignment does not cover the
/// whole terms (trimming disabled), or either side consists only of stop words.
/// </returns>
public static bool CreateStrListsForEval(MPAlignerConfiguration conf, AlignmentInfoElement aie, Dictionary<string,bool> srcStopWords, Dictionary<string,bool> trgStopWords, bool stripListsOnError = true) {
    // Without the alignment and both entries nothing can be built; aie is left untouched.
    if (aie == null || aie.consolidatedAlignment == null || aie.srcEntry == null || aie.trgEntry == null) {
        return false;
    }

    // Span over every aligned token (stop words included); starts "empty".
    aie.minSrcId = int.MaxValue;
    aie.minTrgId = int.MaxValue;
    aie.maxSrcId = int.MinValue;
    aie.maxTrgId = int.MinValue;

    var srcEntry = aie.srcEntry;
    var trgEntry = aie.trgEntry;
    var srcWords = srcEntry.lowercaseWords;
    var trgWords = trgEntry.lowercaseWords;

    // Span over aligned tokens that are not stop words.
    int firstSrcContent = int.MaxValue;
    int lastSrcContent = int.MinValue;
    int firstTrgContent = int.MaxValue;
    int lastTrgContent = int.MinValue;

    int lastFromId = -1;
    int lastToId = -1;
    double srcFactor = 1;
    double trgFactor = 1;
    bool srcAllStop = true;
    bool trgAllStop = true;

    StringBuilder srcText = new StringBuilder();
    StringBuilder trgText = new StringBuilder();
    Dictionary<int,bool> usedSrcIds = new Dictionary<int,bool>();
    Dictionary<int,bool> usedTrgIds = new Dictionary<int,bool>();

    foreach (WordAlignmentElement link in aie.consolidatedAlignment) {
        int fromId = link.fromId;
        int toId = link.toId;
        string srcWord = srcWords[fromId];
        double srcLen = srcWord.Length;
        string trgWord = trgWords[toId];
        double trgLen = trgWord.Length;

        // Widen the overall span and the non-stop-word span on the source side...
        if (fromId < aie.minSrcId) {
            aie.minSrcId = fromId;
        }
        bool srcIsContent = !srcStopWords.ContainsKey(srcWord);
        if (srcIsContent && fromId < firstSrcContent) {
            firstSrcContent = fromId;
        }
        if (fromId > aie.maxSrcId) {
            aie.maxSrcId = fromId;
        }
        if (srcIsContent && fromId > lastSrcContent) {
            lastSrcContent = fromId;
        }

        // ...and on the target side.
        if (toId < aie.minTrgId) {
            aie.minTrgId = toId;
        }
        bool trgIsContent = !trgStopWords.ContainsKey(trgWord);
        if (trgIsContent && toId < firstTrgContent) {
            firstTrgContent = toId;
        }
        if (toId > aie.maxTrgId) {
            aie.maxTrgId = toId;
        }
        if (trgIsContent && toId > lastTrgContent) {
            lastTrgContent = toId;
        }

        // With trimming enabled an element is only emitted when both of its tokens lie inside
        // the non-stop-word span seen SO FAR; the outcome therefore depends on the iteration
        // order of consolidatedAlignment.
        bool insideContentSpan = firstSrcContent <= fromId && lastSrcContent >= fromId
            && firstTrgContent <= toId && lastTrgContent >= toId;
        if (conf.allowTrimmedAlignments && !insideContentSpan) {
            continue;
        }

        //TODO: For the future - there is a limitation that you cannot evaluate alignments where
        //for one token you have acquired overlaps in different languages (f.e., term is in
        //[EN][LV] aligned segments, but the final string can only be in EN - it cannot be
        //split in two parts!).

        // A side is emitted only when its token id differs from the previously emitted one,
        // so a token taking part in consecutive elements is written once.
        bool newSrcToken = fromId != lastFromId;
        bool newTrgToken = toId != lastToId;

        if (newSrcToken) {
            lastFromId = fromId;
            string srcStr = GetCorrectString(srcEntry, fromId, link.fromType, link.fromTypeId);
            if (srcIsContent) {
                srcAllStop = false;
            }
            srcText.Append(srcStr);
        }
        if (newTrgToken) {
            lastToId = toId;
            string trgStr = GetCorrectString(trgEntry, toId, link.toType, link.toTypeId);
            if (trgIsContent) {
                trgAllStop = false;
            }
            trgText.Append(trgStr);
        }

        // Each emitted token scales its side's multiplier: the share of the entry outside the
        // token counts fully, the token's own share is weighted by GetProbab.
        if (newSrcToken) {
            srcFactor *= (((srcEntry.len - srcLen) / srcEntry.len) + (srcLen / srcEntry.len) * GetProbab(srcEntry, fromId, link.fromType, link.fromTypeId));
        }
        if (newTrgToken) {
            trgFactor *= (((trgEntry.len - trgLen) / trgEntry.len) + (trgLen / trgEntry.len) * GetProbab(trgEntry, toId, link.toType, link.toTypeId));
        }

        usedSrcIds[fromId] = true;
        usedTrgIds[toId] = true;
    }

    // Try to find words in the middle of the current alignment that have not been aligned.
    // If such are found, penalise the source and target alignments by the length of the
    // wrong alignment (the word is appended on its own side, blanks on the other side).
    // The range checked is the overall aligned span, not the non-stop-word span.
    for (int i = 0; i < srcWords.Count; i++) {
        if (i < aie.minSrcId || i > aie.maxSrcId || usedSrcIds.ContainsKey(i)) {
            continue;
        }
        string gapWord = srcWords[i];
        if (!srcStopWords.ContainsKey(gapWord)) {
            srcAllStop = false;
        }
        srcText.Append(gapWord);
        trgText.Append(' ', gapWord.Length);
    }
    for (int i = 0; i < trgWords.Count; i++) {
        if (i < aie.minTrgId || i > aie.maxTrgId || usedTrgIds.ContainsKey(i)) {
            continue;
        }
        string gapWord = trgWords[i];
        if (!trgStopWords.ContainsKey(gapWord)) {
            trgAllStop = false;
        }
        trgText.Append(gapWord);
        srcText.Append(' ', gapWord.Length);
    }

    if (srcText.Length > 0) {
        aie.srcStrForAlignment = srcText.ToString();
        aie.trgStrForAlignment = trgText.ToString();
    }
    aie.srcMultiplier = srcFactor;
    aie.trgMultiplier = trgFactor;

    int srcCount = srcWords.Count;
    int trgCount = trgWords.Count;
    bool wasBad = false;
    bool unusableTrimmedSpan = false;

    if (!conf.allowTrimmedAlignments) {
        // Trimming disabled: the alignment must reach both ends of both terms. The span is
        // widened to the full terms either way; a partial alignment is rejected further down.
        wasBad = aie.minSrcId > 0 || aie.minTrgId > 0 || aie.maxSrcId + 1 < srcCount || aie.maxTrgId + 1 < trgCount;
        aie.minSrcId = 0;
        aie.minTrgId = 0;
        aie.maxSrcId = srcCount - 1;
        aie.maxTrgId = trgCount - 1;
    } else {
        // Trimming enabled: shrink the span to the outermost non-stop words, provided such
        // words were seen on both sides (otherwise the bounds still hold their sentinels).
        bool srcSpanValid = firstSrcContent >= 0 && firstSrcContent < srcCount && lastSrcContent >= 0 && lastSrcContent < srcCount;
        bool trgSpanValid = firstTrgContent >= 0 && firstTrgContent < trgCount && lastTrgContent >= 0 && lastTrgContent < trgCount;
        if (srcSpanValid && trgSpanValid) {
            aie.minSrcId = firstSrcContent;
            aie.minTrgId = firstTrgContent;
            aie.maxSrcId = lastSrcContent;
            aie.maxTrgId = lastTrgContent;
        } else if (stripListsOnError) {
            // Rejected right away; the span fields keep whatever the loop left in them.
            unusableTrimmedSpan = true;
        }
    }

    bool reject;
    if (unusableTrimmedSpan) {
        reject = true;
    } else if (aie.minSrcId < 0 || aie.minTrgId < 0 || aie.maxSrcId + 1 > srcCount || aie.maxTrgId + 1 > trgCount) {
        // Just to be on the safe side - the ID's are outside the index boundaries:
        // fall back to the full terms and reject.
        aie.minSrcId = 0;
        aie.minTrgId = 0;
        aie.maxSrcId = srcCount - 1;
        aie.maxTrgId = trgCount - 1;
        reject = true;
    } else {
        // Partial alignment with trimming disabled, or a side made of stop words only.
        reject = wasBad || srcAllStop || trgAllStop;
    }

    if (reject) {
        if (stripListsOnError) {
            aie.srcStrForAlignment = "";
            aie.trgStrForAlignment = "";
            aie.alignedLowSrcStr = "";
            aie.alignedLowTrgStr = "";
        }
        return false;
    }

    aie.alignedLowSrcStr = AlignmentInfoElement.GetStrFromEntry(srcWords, aie.minSrcId, aie.maxSrcId);
    aie.alignedLowTrgStr = AlignmentInfoElement.GetStrFromEntry(trgWords, aie.minTrgId, aie.maxTrgId);
    return true;
}